diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 00000000000..160bda5f0f6
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,39 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+language: "en-US"
+
+# Only comment on Critical/Major bugs. No Minor, Trivial, or style comments.
+tone_instructions: "Only comment on Critical or Major bugs. Never comment on Minor issues, style, refactoring, or suggestions. When in doubt, stay silent."
+
+reviews:
+  # Use chill profile - filters out nitpicks automatically
+  profile: "chill"
+
+  # Disable all summary features
+  high_level_summary: false
+  high_level_summary_in_walkthrough: false
+
+  # Disable walkthrough comment entirely
+  collapse_walkthrough: true
+  changed_files_summary: false
+  sequence_diagrams: false
+
+  # Disable status/effort estimates
+  review_status: false
+  commit_status: false
+  estimate_code_review_effort: false
+
+  # Disable auto-suggestions for labels/reviewers
+  suggested_labels: false
+  suggested_reviewers: false
+
+  # Disable related issues/PRs lookup
+  assess_linked_issues: false
+  related_issues: false
+  related_prs: false
+
+  # Auto-review disabled - only review when explicitly requested via @coderabbitai review
+  auto_review:
+    enabled: false
+
+chat:
+  auto_reply: true
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 42b889606ee..7496893749c 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -35,6 +35,8 @@ megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-tr
 
 megatron/post_training/ @NVIDIA/post-training
 
+megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs
+
 .gitlab/ @NVIDIA/ci
 .github/ @NVIDIA/ci
 .gitlab-ci.yml @NVIDIA/ci
@@ -44,6 +46,13 @@ tests/functional_tests/shell_test_utils/ @NVIDIA/ci
 tests/test_utils/recipes/ @NVIDIA/ci
 tests/unit_tests/run_ci_test.sh @NVIDIA/ci
 
+# API Backwards Compatibility Check
+scripts/check_api_backwards_compatibility.py @NVIDIA/ci
+scripts/README_API_COMPAT.md @NVIDIA/ci
+.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci
+docs/api-backwards-compatibility-check.md @NVIDIA/ci
+tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci
+
 megatron/rl/ @NVIDIA/reinforcement-learning
 examples/rl/ @NVIDIA/reinforcement-learning
 test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 10eef953d5d..9662160da10 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -9,7 +9,8 @@ assignees: ''
 
 **Describe the bug**
 
-A clear and concise description of what the bug is.
+A clear and concise description of what the bug is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
+to get oncall's attention to this issue.
 
 **Steps/Code to reproduce bug**
 
@@ -25,4 +26,4 @@ A clear and concise description of what you expected to happen.
 
 **Additional context**
 
-Add any other context about the problem here.
+Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 7334f687d1b..b0da6789a8e 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -10,6 +10,9 @@ assignees: ''
 **Is your feature request related to a problem? Please describe.**
 A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
 
+Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
+to get oncall's attention to this issue.
+
 **Describe the solution you'd like**
 A clear and concise description of what you want to happen.
 
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
index b3d89a0ac1a..899ff44d6a6 100644
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -9,4 +9,5 @@ assignees: ''
 ---
 
 **Your question**
-Ask a clear and concise question about Megatron-LM.
+Ask a clear and concise question about Megatron-LM. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
+to get oncall's attention to this issue.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/regression.md b/.github/ISSUE_TEMPLATE/regression.md
index 10078d23a6e..180db633cb8 100644
--- a/.github/ISSUE_TEMPLATE/regression.md
+++ b/.github/ISSUE_TEMPLATE/regression.md
@@ -8,7 +8,8 @@ assignees: ''
 ---
 
 **Describe the regression**
-A clear and concise description of what the regression is.
+A clear and concise description of what the regression is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) 
+to get oncall's attention to this issue.
 
 **To Reproduce**
 Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention.
diff --git a/.github/actions/action.yml b/.github/actions/action.yml
index 5c35385b036..088877304a7 100644
--- a/.github/actions/action.yml
+++ b/.github/actions/action.yml
@@ -11,28 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-name: 'Test Template'
-description: 'Template for running NeMo tests in a containerized environment'
+name: "Test Template"
+description: "Template for running NeMo tests in a containerized environment"
 
 inputs:
   container-image:
-    description: 'Container image to use for test'
+    description: "Container image to use for test"
     required: true
   timeout:
-    description: 'Max runtime of test in minutes'
+    description: "Max runtime of test in minutes"
     required: false
-    default: '30'
+    default: "30"
   script:
-    description: 'Test script to execute'
+    description: "Test script to execute"
     required: true
   is-optional:
-    description: 'Pass this job on failure.'
+    description: "Pass this job on failure."
     required: false
-    default: 'false'
+    default: "false"
   is_unit_test:
-    description: 'Upload coverage as unit test'
+    description: "Upload coverage as unit test"
     required: false
-    default: 'false'
+    default: "false"
   tag:
     description: Latest or legacy test suite
     required: true
@@ -43,12 +43,51 @@ inputs:
     description: Model to launch
     required: false
   PAT:
-    description: 'GitHub Personal Access Token'
+    description: "GitHub Personal Access Token"
+    required: true
+  is_ci_workload:
+    description: "Is CI workload"
     required: true
 
 runs:
-  using: 'composite'
+  using: "composite"
   steps:
+    - name: Print node name
+      shell: bash -x -e -u -o pipefail {0}
+      run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"
+
+    - name: GPU Sanity Check
+      shell: bash -x -e -u -o pipefail {0}
+      run: |
+        echo "Starting GPU Sanity Check..."
+
+        # 1. Check for active Compute Processes
+        # query-compute-apps returns a list of PIDs using the GPU. If empty, we are good.
+        OPEN_PROCESSES=$(docker run --rm --gpus all ubuntu nvidia-smi --query-compute-apps=pid,process_name --format=csv,noheader)
+
+        if [ -n "$OPEN_PROCESSES" ]; then
+          echo "::error::❌ GPU is not clean! Found active processes:"
+          echo "$OPEN_PROCESSES"
+        else
+          echo "✅ No active compute processes found."
+        fi
+
+        # 2. Check VRAM Usage (Optional but recommended)
+        # We allow a small buffer (e.g., < 300MiB) for driver overhead/Xorg, 
+        # though on headless K8s nodes this should be very close to 0.
+
+        MEMORY_USAGES=$(docker run --rm --gpus all ubuntu nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)
+
+        # Check each GPU visible to the container
+        for MEMORY in $MEMORY_USAGES; do
+          if [ "$MEMORY" -gt 300 ]; then
+            echo "::error::❌ GPU VRAM usage is suspiciously high: ${MEMORY} MiB"
+          fi
+        done
+
+        echo "✅ GPU Memory is clean (all < 300 MiB)."
+        echo "Ready to start workflow."
+
     - name: Checkout repository
       uses: actions/checkout@v2
 
@@ -77,8 +116,11 @@ runs:
 
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
-        pip install --no-cache-dir uv
-        uv sync --only-group test
+        export NCCL_DEBUG=INFO
+        pip install --no-cache-dir "uv<0.9.29"
+        uv venv .venv
+        uv cache clean
+        uv sync --no-cache --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
           --scope unit-tests \
           --model unit-tests \
@@ -119,9 +161,11 @@ runs:
       id: has-run-functional-tests-label
       env:
         GH_TOKEN: ${{ github.token }}
+        IS_CI_WORKLOAD: ${{ inputs.is_ci_workload }}
       run: |
         PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
-        HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
+        HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "$IS_CI_WORKLOAD"
+        HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
         echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
 
     - name: Create run-script (e2e test)
@@ -139,21 +183,26 @@ runs:
           ARGS=(
             --scope mr-github
             --enable-lightweight-mode
+            --n-repeat 1
           )
         elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then
           ARGS=(
             --scope mr-github
+            --n-repeat 5
           )
         else
           ARGS=(
             --scope mr-github-slim
+            --n-repeat 5
           )
         fi
 
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
-        pip install --no-cache-dir uv
-        uv sync --only-group test
+        pip install --no-cache-dir "uv<0.9.29"
+        uv venv .venv
+        uv cache clean
+        uv sync --no-cache --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
           ${ARGS[@]} \
           --model ${{ inputs.model }} \
@@ -248,5 +297,5 @@ runs:
       if: always()
       with:
         name: ${{ steps.check.outputs.logs_report }}
-        path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }}
+        path: ${{ inputs.is_unit_test == 'true' && 'assets_dir/logs' || 'assets_dir' }}
         include-hidden-files: true
diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index f691460f8c7..6a21bb07ed7 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -69,6 +69,10 @@ test_matrix:
       - tests/unit_tests/transformer/test_transformer_block.py
       - tests/unit_tests/transformer/test_transformer_block_custom_pgs.py
       - tests/unit_tests/dist_checkpointing/test_local.py
+      - tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py
+
+  # TODO(test_layer_wise_optimizer.py): add logic for installing emerging-optimizers repo before execution
+  # ref: https://github.com/NVIDIA-NeMo/Emerging-Optimizers?tab=readme-ov-file#install-from-source
 
   # functional:
   #   train:
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 8e703301ca7..72a5b915ecc 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -1,4 +1,4 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "guyueh1", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json
new file mode 100644
index 00000000000..5fa49e966bc
--- /dev/null
+++ b/.github/oncall_schedule.json
@@ -0,0 +1,50 @@
+[
+    {
+        "user": "dimapihtar",
+        "date": "2026-01-28"
+    },
+    {
+        "user": "gautham-kollu",
+        "date": "2026-02-04"
+    },
+    {
+        "user": "janEbert",
+        "date": "2026-02-11"
+    },
+    {
+        "user": "Phlip79",
+        "date": "2026-02-18"
+    },
+    {
+        "user": "asolergi-nv",
+        "date": "2026-02-25"
+    },
+    {
+        "user": "BoxiangW",
+        "date": "2026-03-04"
+    },
+    {
+        "user": "maanug-nv",
+        "date": "2026-03-11"
+    },
+    {
+        "user": "dimapihtar",
+        "date": "2026-03-18"
+    },
+    {
+        "user": "gautham-kollu",
+        "date": "2026-03-25"
+    },
+    {
+        "user": "janEbert",
+        "date": "2026-04-01"
+    },
+    {
+        "user": "maanug-nv",
+        "date": "2026-04-08"
+    },
+    {
+        "user": "BoxiangW",
+        "date": "2026-04-15"
+    }
+]
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 7f7dedd27ad..5cd5138eb69 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,7 +1,7 @@
 # What does this PR do ?
 <!-- Add a one line overview of what this PR aims to accomplish. -->
 
-:warning: For major changes (either in lines of code or in its impact), please make sure to first share discuss a design-doc with the team.  
+:warning: For major changes (either in lines of code or in its impact), please make sure to first share a design doc with the team. If you're unsure what's the best way to do so, contact the @mcore-oncall.
 
 ## Contribution process
 
@@ -31,6 +31,8 @@ The following process is enforced via the CODEOWNERS file for changes into `mega
 <details>
 <summary>For MRs into `main` branch</summary>
 
+Feel free to message or comment the @mcore-oncall to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
+
 #### (Step 1): Add PR label `Expert Review`
 
 #### (Step 2): Collect the expert reviewers reviews
diff --git a/.github/scripts/oncall_manager.py b/.github/scripts/oncall_manager.py
new file mode 100644
index 00000000000..332fcb1c8cc
--- /dev/null
+++ b/.github/scripts/oncall_manager.py
@@ -0,0 +1,439 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import json
+import requests
+import argparse
+from datetime import datetime, timedelta, timezone
+
+from slack_sdk import WebClient
+from slack_sdk.errors import SlackApiError
+
+# Constants
+GITHUB_API_URL = "https://api.github.com"
+SCHEDULE_FILE = ".github/oncall_schedule.json"
+ROTATION_TEAM_SLUG = "mcore-oncall-rotation"
+ACTIVE_ONCALL_TEAM_SLUG = "mcore-oncall"
+SLACK_USERGROUP_HANDLE = "mcore-oncall"
+TARGET_WEEKS = 12
+
+# Caches for email and Slack lookups
+_email_cache = {}
+_slack_id_cache = {}
+
+def get_headers():
+    token = os.environ.get("GH_TOKEN")
+    if not token:
+        # Fallback to GITHUB_TOKEN if GH_TOKEN not set
+        token = os.environ.get("GITHUB_TOKEN")
+        
+    if not token:
+        print("Error: GH_TOKEN or GITHUB_TOKEN not set")
+        sys.exit(1)
+        
+    return {
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json"
+    }
+
+def get_repo_info():
+    """Returns (owner, repo) from GITHUB_REPOSITORY env var."""
+    repo_env = os.environ.get("GITHUB_REPOSITORY")
+    if not repo_env:
+        print("Error: GITHUB_REPOSITORY environment variable not set")
+        sys.exit(1)
+    parts = repo_env.split("/")
+    return parts[0], parts[1]
+
+def get_team_members(org, team_slug):
+    """Fetches members of the GitHub team."""
+    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members"
+    headers = get_headers()
+    
+    members = set()
+    page = 1
+    while True:
+        resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers)
+        if resp.status_code != 200:
+            print(f"Error fetching team members: {resp.status_code} {resp.text}")
+            sys.exit(1)
+        
+        data = resp.json()
+        if not data:
+            break
+            
+        members.update([m['login'] for m in data])
+        if len(data) < 100:
+            break
+        page += 1
+        
+    return members
+
+def get_user_email(username):
+    """Get user's email from GitHub, prioritizing @nvidia.com emails.
+    
+    Checks in order:
+    1. Public profile email
+    2. Recent commits in the repository
+    """
+    if username in _email_cache:
+        return _email_cache[username]
+    
+    headers = get_headers()
+    public_email = None
+    
+    try:
+        # 1. Try to get user's public profile email first
+        resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers)
+        if resp.status_code == 200:
+            user_data = resp.json()
+            email = user_data.get('email')
+            if email and not email.endswith("@users.noreply.github.com"):
+                if email.endswith("@nvidia.com"):
+                    _email_cache[username] = email
+                    return email
+                # Store non-nvidia email as fallback
+                public_email = email
+        
+        # 2. Check recent commits in the repository for @nvidia.com email
+        repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
+        commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10"
+        resp = requests.get(commits_url, headers=headers)
+        
+        if resp.status_code == 200:
+            commits = resp.json()
+            for commit in commits:
+                # Get email from commit author
+                commit_data = commit.get('commit', {})
+                author_data = commit_data.get('author', {})
+                email = author_data.get('email')
+                
+                if email and not email.endswith("@users.noreply.github.com"):
+                    if email.endswith("@nvidia.com"):
+                        _email_cache[username] = email
+                        print(f"Found @nvidia.com email for {username} from commits: {email}")
+                        return email
+                    elif public_email is None:
+                        public_email = email
+        
+        # 3. Use public email if found, otherwise fallback
+        if public_email:
+            _email_cache[username] = public_email
+            print(f"Using public email for {username}: {public_email}")
+            return public_email
+        
+        # Fallback to noreply email
+        fallback = f"{username}@users.noreply.github.com"
+        _email_cache[username] = fallback
+        print(f"Warning: No email found for {username}, using fallback: {fallback}")
+        return fallback
+        
+    except Exception as e:
+        print(f"Warning: Could not get email for {username}: {e}")
+        fallback = f"{username}@users.noreply.github.com"
+        _email_cache[username] = fallback
+        return fallback
+
+def get_slack_client():
+    """Get Slack WebClient if token is available."""
+    slack_token = os.environ.get("SLACK_TOKEN")
+    if not slack_token:
+        return None
+    
+    return WebClient(token=slack_token)
+
+def get_slack_user_id(slack_client, email):
+    """Get Slack user ID from email."""
+    if not slack_client:
+        return None
+    
+    if email in _slack_id_cache:
+        return _slack_id_cache[email]
+    
+    try:
+        response = slack_client.users_lookupByEmail(email=email)
+        user_id = response["user"]["id"]
+        _slack_id_cache[email] = user_id
+        return user_id
+    except SlackApiError as e:
+        print(f"Warning: Could not find Slack user for {email}: {e.response['error']}")
+        _slack_id_cache[email] = None
+        return None
+
+def get_slack_usergroup_id(slack_client, handle):
+    """Get Slack usergroup ID from handle."""
+    if not slack_client:
+        return None
+    
+    try:
+        response = slack_client.usergroups_list(include_users=True)
+        for usergroup in response.get("usergroups", []):
+            if usergroup.get("handle") == handle:
+                return usergroup.get("id"), usergroup.get("users", [])
+        print(f"Warning: Slack usergroup '{handle}' not found")
+        return None, []
+    except SlackApiError as e:
+        print(f"Warning: Could not list Slack usergroups: {e.response['error']}")
+        return None, []
+
+def update_slack_usergroup(new_oncall_username, old_members_usernames):
+    """
+    Updates the Slack usergroup to contain only the new oncall user.
+    Adds new oncall first, then removes old members (usergroups need at least one member).
+    """
+    slack_client = get_slack_client()
+    if not slack_client:
+        print("Slack token not configured, skipping Slack usergroup update")
+        return
+    
+    # Get the new oncall's email and Slack user ID
+    new_email = get_user_email(new_oncall_username)
+    new_slack_id = get_slack_user_id(slack_client, new_email)
+    
+    if not new_slack_id:
+        print(f"Could not find Slack user ID for {new_oncall_username} ({new_email}), skipping Slack update")
+        return
+    
+    # Get the usergroup ID and current members
+    usergroup_id, current_slack_members = get_slack_usergroup_id(slack_client, SLACK_USERGROUP_HANDLE)
+    
+    if not usergroup_id:
+        print(f"Could not find Slack usergroup '{SLACK_USERGROUP_HANDLE}', skipping Slack update")
+        return
+    
+    try:
+        # Step 1: Add new oncall first (include current members to avoid removing anyone yet)
+        # This ensures usergroup always has at least one member
+        if new_slack_id not in current_slack_members:
+            updated_members = list(set(current_slack_members + [new_slack_id]))
+            slack_client.usergroups_users_update(
+                usergroup=usergroup_id,
+                users=updated_members
+            )
+            print(f"Added {new_oncall_username} to Slack usergroup '{SLACK_USERGROUP_HANDLE}'")
+        
+        # Step 2: Now set the usergroup to contain only the new oncall
+        slack_client.usergroups_users_update(
+            usergroup=usergroup_id,
+            users=[new_slack_id]
+        )
+        print(f"Updated Slack usergroup '{SLACK_USERGROUP_HANDLE}' to contain only {new_oncall_username}")
+        
+    except SlackApiError as e:
+        print(f"Failed to update Slack usergroup: {e.response['error']}")
+
+def load_schedule():
+    if not os.path.exists(SCHEDULE_FILE):
+        return []
+    try:
+        with open(SCHEDULE_FILE, 'r') as f:
+            data = json.load(f)
+            # Normalize to list of dicts if it's a list of strings
+            schedule = []
+            for item in data:
+                if isinstance(item, str):
+                    schedule.append({"user": item, "date": "YYYY-MM-DD"})
+                else:
+                    schedule.append(item)
+            return schedule
+    except (json.JSONDecodeError, FileNotFoundError):
+        return []
+
+def save_schedule(schedule):
+    with open(SCHEDULE_FILE, 'w') as f:
+        json.dump(schedule, f, indent=4)
+        f.write('\n') # trailing newline
+
+def update_active_oncall_team(org, new_oncall):
+    """Updates the active oncall team to contain only the new oncall user."""
+    # 1. Get current members of the active team
+    current_members = get_team_members(org, ACTIVE_ONCALL_TEAM_SLUG)
+    
+    # 2. Add the new oncall if not present
+    if new_oncall not in current_members:
+        url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{new_oncall}"
+        resp = requests.put(url, headers=get_headers())
+        if resp.status_code == 200:
+            print(f"Added {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}")
+        else:
+            print(f"Failed to add {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
+
+    # 3. Remove everyone else
+    old_members = []
+    for member in current_members:
+        if member not in [new_oncall, 'svcnvidia-nemo-ci']:
+            old_members.append(member)
+            url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{member}"
+            resp = requests.delete(url, headers=get_headers())
+            if resp.status_code == 204:
+                print(f"Removed {member} from {ACTIVE_ONCALL_TEAM_SLUG}")
+            else:
+                print(f"Failed to remove {member} from {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
+    
+    # 4. Update Slack usergroup (add new oncall first, then remove old members)
+    update_slack_usergroup(new_oncall, old_members)
+
+def rotate_schedule(repo_owner, dry_run=False):
+    schedule = load_schedule()
+    print(f"Current schedule length: {len(schedule)}")
+    
+    # 1. Rotate (Remove past week)
+    # Only if schedule is not empty.
+    if schedule:
+        # Check date of first entry
+        first_entry = schedule[0]
+        try:
+            # We assume the date is the *start* of the oncall shift (Wednesday).
+            # The shift ends 7 days later.
+            start_date = datetime.strptime(first_entry['date'], "%Y-%m-%d").date()
+            end_date = start_date + timedelta(days=7)
+            
+            today = datetime.now(timezone.utc).date()
+            
+            # If today is >= end_date, the shift is over.
+            # (e.g. Started last Wed, ends today Wed. If today is Wed, we rotate)
+            if today >= end_date:
+                removed = schedule.pop(0)
+                print(f"Rotated out: {removed} (Ended {end_date})")
+            else:
+                print(f"First entry {first_entry} has not ended yet (Ends {end_date}). Not removing.")
+        except ValueError:
+             # Fallback if date is invalid, rotate anyway
+             removed = schedule.pop(0)
+             print(f"Rotated out (invalid date): {removed}")
+    else:
+        print("Schedule empty, nothing to rotate.")
+
+    # 2. Replenish
+    ensure_schedule_filled(schedule, repo_owner)
+    
+    # 3. Update active oncall team
+    if schedule:
+        current_oncall = schedule[0]['user']
+        print(f"New active oncall: {current_oncall}")
+        if not dry_run:
+            update_active_oncall_team(repo_owner, current_oncall)
+        else:
+            print(f"Dry run: Would update {ACTIVE_ONCALL_TEAM_SLUG} to contain only {current_oncall}")
+    
+    if not dry_run:
+        save_schedule(schedule)
+        print("Schedule updated and saved.")
+    else:
+        print("Dry run: Schedule not saved.")
+        print(json.dumps(schedule, indent=4))
+
+def get_last_wednesday():
+    today = datetime.now(timezone.utc).date()
+    # Monday=0, Wednesday=2
+    offset = (today.weekday() - 2) % 7
+    return today - timedelta(days=offset)
+
+def ensure_schedule_filled(schedule, repo_owner):
+    """Appends users to schedule until it reaches TARGET_WEEKS."""
+    members = get_team_members(repo_owner, ROTATION_TEAM_SLUG)
+    if not members:
+        print(f"Warning: No team members found in {ROTATION_TEAM_SLUG}.")
+        return
+    if 'svcnvidia-nemo-ci' in members:
+        members.remove('svcnvidia-nemo-ci')
+    members = list(members)
+
+    members.sort() # Deterministic order
+    
+    while len(schedule) < TARGET_WEEKS:
+        # Determine start date for the new entry
+        if not schedule:
+            # Start with the most recent Wednesday if list is empty
+            next_date = get_last_wednesday()
+            
+            # Start with the first member alphabetically if list is empty
+            next_user = members[0]
+        else:
+            last_entry = schedule[-1]
+            last_user = last_entry['user']
+            
+            # Parse last date and add 7 days
+            try:
+                last_date = datetime.strptime(last_entry['date'], "%Y-%m-%d").date()
+                next_date = last_date + timedelta(days=7)
+            except ValueError:
+                # Fallback if date is invalid/placeholder
+                next_date = get_last_wednesday() + timedelta(days=7 * len(schedule))
+
+            try:
+                # Find index of last scheduled user in the team list
+                if last_user in members:
+                    last_idx = members.index(last_user)
+                    next_idx = (last_idx + 1) % len(members)
+                    next_user = members[next_idx]
+                else:
+                    # Last user not in team, just pick first member
+                    next_user = members[0]
+            except ValueError:
+                next_user = members[0]
+        
+        new_entry = {"user": next_user, "date": next_date.strftime("%Y-%m-%d")}
+        schedule.append(new_entry)
+        print(f"Appended: {new_entry}")
+
+def assign_reviewer(pr_number):
+    """Assigns the mcore-oncall team as the reviewer for the PR."""
+    owner, repo = get_repo_info()
+    url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_number}/requested_reviewers"
+    
+    # Assign the oncall team as reviewer
+    data = {"team_reviewers": [ACTIVE_ONCALL_TEAM_SLUG]}
+    resp = requests.post(url, headers=get_headers(), json=data)
+    
+    if resp.status_code in [201, 200]:
+        print(f"Successfully requested review from team NVIDIA/{ACTIVE_ONCALL_TEAM_SLUG}")
+    else:
+        print(f"Failed to request review: {resp.status_code} {resp.text}")
+        sys.exit(1)
+
+def main():
+    parser = argparse.ArgumentParser(description="Manage Oncall Schedule")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+    
+    # Rotate command
+    parser_rotate = subparsers.add_parser("rotate", help="Rotate the schedule (remove first, append new)")
+    parser_rotate.add_argument("--dry-run", action="store_true", help="Do not save changes")
+
+    # Fill command (just fill up to 12 without rotating - useful for init)
+    parser_fill = subparsers.add_parser("fill", help="Fill the schedule to 12 weeks without rotating")
+    
+    # Assign command
+    parser_assign = subparsers.add_parser("assign", help="Assign current oncall to PR")
+    parser_assign.add_argument("--pr", type=int, required=True, help="PR number")
+
+    args = parser.parse_args()
+    
+    owner, _ = get_repo_info()
+    
+    if args.command == "rotate":
+        rotate_schedule(owner, dry_run=args.dry_run)
+    elif args.command == "fill":
+        schedule = load_schedule()
+        ensure_schedule_filled(schedule, owner)
+        save_schedule(schedule)
+        print("Schedule filled and saved.")
+    elif args.command == "assign":
+        assign_reviewer(args.pr)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/.github/scripts/sync_team_usergroups.py b/.github/scripts/sync_team_usergroups.py
new file mode 100644
index 00000000000..429387fc6de
--- /dev/null
+++ b/.github/scripts/sync_team_usergroups.py
@@ -0,0 +1,527 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Syncs GitHub team membership to Slack user groups.
+
+This script reads members from GitHub teams and updates the corresponding
+Slack user groups to match.
+"""
+
+import os
+import sys
+import argparse
+import requests
+
+from slack_sdk import WebClient
+from slack_sdk.errors import SlackApiError
+
+# Constants
+GITHUB_API_URL = "https://api.github.com"
+PARENT_TEAM_SLUG = "mcore-reviewers"
+
+# Caches for email and Slack lookups
+_email_cache = {}
+_slack_id_cache = {}
+_usergroups_cache = None
+
+
+def get_headers():
+    """Get GitHub API headers with authentication."""
+    token = os.environ.get("GH_TOKEN")
+    if not token:
+        token = os.environ.get("GITHUB_TOKEN")
+
+    if not token:
+        print("Error: GH_TOKEN or GITHUB_TOKEN not set")
+        sys.exit(1)
+
+    return {
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json",
+    }
+
+
+def get_org():
+    """Returns the organization from GITHUB_REPOSITORY env var or default."""
+    repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
+    return repo_env.split("/")[0]
+
+
+def github_team_to_slack_usergroup(team_slug):
+    """Convert a GitHub team slug to a Slack usergroup handle.
+
+    Rules:
+    - Base pattern: "test" -> "mcore-test"
+    - Remove "core-" prefix: "core-test" -> "mcore-test"
+    - Remove "megatron-" prefix: "megatron-test" -> "mcore-test"
+    - Remove "-and-": "test1-and-test2" -> "mcore-test1-test2"
+    - Shorten "mixture-of-experts" to "moe"
+    - Shorten "pipeline-parallelism" to "pp"
+    - Shorten "reinforcement-learning" to "rl"
+    """
+    name = team_slug
+
+    # Apply shortenings first (before removing prefixes)
+    name = name.replace("mixture-of-experts", "moe")
+    name = name.replace("pipeline-parallelism", "pp")
+    name = name.replace("reinforcement-learning", "rl")
+
+    # Remove prefixes
+    if name.startswith("core-"):
+        name = name[5:]  # Remove "core-"
+    elif name.startswith("megatron-"):
+        name = name[9:]  # Remove "megatron-"
+
+    # Remove "-and-"
+    name = name.replace("-and-", "-")
+
+    return f"mcore-{name}"
+
+
+def get_child_teams(org, parent_team_slug):
+    """Fetches child teams of a parent GitHub team."""
+    # First get the team ID
+    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{parent_team_slug}"
+    headers = get_headers()
+
+    resp = requests.get(url, headers=headers)
+    if resp.status_code != 200:
+        print(f"Error fetching parent team '{parent_team_slug}': {resp.status_code} {resp.text}")
+        return []
+
+    parent_team_id = resp.json().get("id")
+    if not parent_team_id:
+        print(f"Error: Could not get ID for team '{parent_team_slug}'")
+        return []
+
+    # Now fetch child teams
+    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{parent_team_slug}/teams"
+    child_teams = []
+    page = 1
+
+    while True:
+        resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers)
+        if resp.status_code != 200:
+            print(f"Error fetching child teams: {resp.status_code} {resp.text}")
+            return child_teams
+
+        data = resp.json()
+        if not data:
+            break
+
+        child_teams.extend([team["slug"] for team in data])
+        if len(data) < 100:
+            break
+        page += 1
+
+    return child_teams
+
+
+def get_team_members(org, team_slug):
+    """Fetches members of the GitHub team."""
+    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members"
+    headers = get_headers()
+
+    members = set()
+    page = 1
+    while True:
+        resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers)
+        if resp.status_code == 404:
+            print(f"Warning: Team '{team_slug}' not found in org '{org}'")
+            return set()
+        if resp.status_code != 200:
+            print(f"Error fetching team members: {resp.status_code} {resp.text}")
+            return set()
+
+        data = resp.json()
+        if not data:
+            break
+
+        members.update([m["login"] for m in data])
+        if len(data) < 100:
+            break
+        page += 1
+
+    return members
+
+
+def get_user_email(username):
+    """Get user's email from GitHub, prioritizing @nvidia.com emails.
+
+    Checks in order:
+    1. Public profile email
+    2. Recent commits in the repository
+    """
+    if username in _email_cache:
+        return _email_cache[username]
+
+    headers = get_headers()
+    public_email = None
+
+    try:
+        # 1. Try to get user's public profile email first
+        resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers)
+        if resp.status_code == 200:
+            user_data = resp.json()
+            email = user_data.get('email')
+            if email and not email.endswith("@users.noreply.github.com"):
+                if email.endswith("@nvidia.com"):
+                    _email_cache[username] = email
+                    return email
+                # Store non-nvidia email as fallback
+                public_email = email
+
+        # 2. Check recent commits in the repository for @nvidia.com email
+        repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
+        commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10"
+        resp = requests.get(commits_url, headers=headers)
+
+        if resp.status_code == 200:
+            commits = resp.json()
+            for commit in commits:
+                # Get email from commit author
+                commit_data = commit.get('commit', {})
+                author_data = commit_data.get('author', {})
+                email = author_data.get('email')
+
+                if email and not email.endswith("@users.noreply.github.com"):
+                    if email.endswith("@nvidia.com"):
+                        _email_cache[username] = email
+                        print(f"Found @nvidia.com email for {username} from commits")
+                        return email
+                    elif public_email is None:
+                        public_email = email
+
+        # 3. Use public email if found, otherwise fallback
+        if public_email:
+            _email_cache[username] = public_email
+            print(f"Using public email for {username}: {public_email}")
+            return public_email
+
+        # Fallback to noreply email
+        fallback = f"{username}@users.noreply.github.com"
+        _email_cache[username] = fallback
+        print(f"Warning: No email found for {username}, using fallback: {fallback}")
+        return fallback
+
+    except Exception as e:
+        print(f"Warning: Could not get email for {username}: {e}")
+        fallback = f"{username}@users.noreply.github.com"
+        _email_cache[username] = fallback
+        return fallback
+
+
+def get_slack_client():
+    """Get Slack WebClient if token is available."""
+    slack_token = os.environ.get("SLACK_TOKEN")
+    if not slack_token:
+        return None
+
+    return WebClient(token=slack_token)
+
+
+def get_slack_user_id(slack_client, email):
+    """Get Slack user ID from email."""
+    if not slack_client:
+        return None
+
+    if email in _slack_id_cache:
+        return _slack_id_cache[email]
+
+    try:
+        response = slack_client.users_lookupByEmail(email=email)
+        user_id = response["user"]["id"]
+        _slack_id_cache[email] = user_id
+        return user_id
+    except SlackApiError as e:
+        print(f"Warning: Could not find Slack user for {email}: {e.response['error']}")
+        _slack_id_cache[email] = None
+        return None
+
+
+def fetch_all_usergroups(slack_client):
+    """Fetch all Slack usergroups once and cache them."""
+    global _usergroups_cache
+
+    if _usergroups_cache is not None:
+        return _usergroups_cache
+
+    if not slack_client:
+        _usergroups_cache = {}
+        return _usergroups_cache
+
+    try:
+        print("Fetching Slack usergroups...")
+        response = slack_client.usergroups_list(include_users=True)
+        _usergroups_cache = {}
+        for usergroup in response.get("usergroups", []):
+            handle = usergroup.get("handle")
+            if handle:
+                _usergroups_cache[handle] = {
+                    "id": usergroup.get("id"),
+                    "users": usergroup.get("users", []),
+                }
+        print(f"Fetched {len(_usergroups_cache)} usergroups")
+        return _usergroups_cache
+    except SlackApiError as e:
+        print(f"Warning: Could not list Slack usergroups: {e.response['error']}")
+        _usergroups_cache = {}
+        return _usergroups_cache
+
+
+def get_slack_usergroup_id(slack_client, handle):
+    """Get Slack usergroup ID from handle."""
+    usergroups = fetch_all_usergroups(slack_client)
+
+    if handle in usergroups:
+        return usergroups[handle]["id"], usergroups[handle]["users"]
+
+    return None, []
+
+
+def github_team_to_usergroup_name(team_slug):
+    """Convert a GitHub team slug to a Slack usergroup display name.
+
+    Example: "test3" -> "Megatron Core Experts: Test3"
+    """
+    # Title case each word separated by hyphens, then join with spaces
+    words = team_slug.split("-")
+    title_cased = " ".join(word.capitalize() for word in words)
+    return f"Megatron Core Experts: {title_cased}"
+
+
+def create_slack_usergroup(slack_client, handle, team_slug):
+    """Create a new Slack usergroup.
+
+    Args:
+        slack_client: Slack WebClient instance
+        handle: The usergroup handle (e.g., "mcore-test")
+        team_slug: The GitHub team slug (used for name and description)
+
+    Returns:
+        The usergroup ID if created successfully, None otherwise
+    """
+    global _usergroups_cache
+
+    name = github_team_to_usergroup_name(team_slug)
+    description = f'Expert review group "{team_slug}"'
+
+    try:
+        print(f"Creating Slack usergroup '@{handle}' with name '{name}'...")
+        response = slack_client.usergroups_create(
+            name=name,
+            handle=handle,
+            description=description,
+        )
+        usergroup = response.get("usergroup", {})
+        usergroup_id = usergroup.get("id")
+
+        if usergroup_id:
+            # Update cache with new usergroup
+            if _usergroups_cache is not None:
+                _usergroups_cache[handle] = {
+                    "id": usergroup_id,
+                    "users": [],
+                }
+            print(f"Successfully created Slack usergroup '@{handle}'")
+            return usergroup_id
+        else:
+            print(f"Error: Usergroup created but no ID returned")
+            return None
+
+    except SlackApiError as e:
+        print(f"Error creating Slack usergroup '@{handle}': {e.response['error']}")
+        return None
+
+
+def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
+    """Sync a GitHub team to a Slack usergroup."""
+    print(f"\n{'='*60}")
+    print(f"Syncing GitHub team '{team_slug}' -> Slack usergroup '@{usergroup_handle}'")
+    print(f"{'='*60}")
+
+    org = get_org()
+    slack_client = get_slack_client()
+
+    if not slack_client:
+        print("Error: Slack token not configured")
+        return False
+
+    # 1. Get GitHub team members
+    members = get_team_members(org, team_slug)
+    if not members:
+        print(f"No members found in GitHub team '{team_slug}'")
+        return False
+
+    # Filter out service accounts
+    members = {m for m in members if not m.startswith("svc")}
+    print(f"GitHub team members ({len(members)}): {sorted(members)}")
+
+    # 2. Get Slack user IDs for each member
+    slack_user_ids = []
+    missing_users = []
+
+    for username in sorted(members):
+        email = get_user_email(username)
+        slack_id = get_slack_user_id(slack_client, email)
+        if slack_id:
+            slack_user_ids.append(slack_id)
+        else:
+            missing_users.append((username, email, "not found in Slack"))
+
+    if missing_users:
+        print(f"\nWarning: Could not resolve {len(missing_users)} users:")
+        for username, email, reason in missing_users:
+            print(f"  - {username}: {reason}" + (f" (tried {email})" if email else ""))
+
+    if not slack_user_ids:
+        print(f"Error: No Slack users found for team '{team_slug}'")
+        return False
+
+    # 3. Get current Slack usergroup membership (or create if it doesn't exist)
+    usergroup_id, current_members = get_slack_usergroup_id(slack_client, usergroup_handle)
+
+    if not usergroup_id:
+        print(f"Slack usergroup '@{usergroup_handle}' not found, creating it...")
+        if dry_run:
+            print(f"Dry run: Would create usergroup '@{usergroup_handle}'")
+            current_members = []
+        else:
+            usergroup_id = create_slack_usergroup(slack_client, usergroup_handle, team_slug)
+            if not usergroup_id:
+                print(f"Error: Failed to create Slack usergroup '@{usergroup_handle}'")
+                return False
+            current_members = []
+
+    # 4. Compare and update
+    current_set = set(current_members)
+    new_set = set(slack_user_ids)
+
+    to_add = new_set - current_set
+    to_remove = current_set - new_set
+
+    print(f"\nCurrent usergroup members: {len(current_members)}")
+    print(f"New members to set: {len(slack_user_ids)}")
+    print(f"  Adding: {len(to_add)} users")
+    print(f"  Removing: {len(to_remove)} users")
+
+    if current_set == new_set:
+        print("No changes needed - usergroup is already in sync")
+        return True
+
+    if dry_run:
+        print(f"\nDry run: Would update '@{usergroup_handle}' with {len(slack_user_ids)} members")
+        return True
+
+    # 5. Update the usergroup
+    try:
+        slack_client.usergroups_users_update(
+            usergroup=usergroup_id, users=slack_user_ids
+        )
+        print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members")
+        return True
+    except SlackApiError as e:
+        print(f"Error updating usergroup: {e.response['error']}")
+        return False
+
+
+def get_team_to_usergroup_mapping():
+    """Fetch child teams of mcore-reviewers and generate the mapping."""
+    org = get_org()
+    child_teams = get_child_teams(org, PARENT_TEAM_SLUG)
+
+    if not child_teams:
+        print(f"Error: No child teams found under '{PARENT_TEAM_SLUG}'")
+        return {}
+
+    mapping = {}
+    for team_slug in child_teams:
+        usergroup_handle = github_team_to_slack_usergroup(team_slug)
+        mapping[team_slug] = usergroup_handle
+
+    return mapping
+
+
+def sync_all_teams(dry_run=False):
+    """Sync all GitHub teams under mcore-reviewers to their Slack usergroups."""
+    print(f"Fetching child teams of '{PARENT_TEAM_SLUG}'...")
+    team_to_usergroup = get_team_to_usergroup_mapping()
+
+    if not team_to_usergroup:
+        return False
+
+    print(f"Found {len(team_to_usergroup)} teams to sync")
+    print("\nTeam to usergroup mapping:")
+    for team, usergroup in sorted(team_to_usergroup.items()):
+        print(f"  {team} -> @{usergroup}")
+
+    results = {"success": [], "failed": []}
+
+    for team_slug, usergroup_handle in team_to_usergroup.items():
+        success = sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=dry_run)
+        if success:
+            results["success"].append(team_slug)
+        else:
+            results["failed"].append(team_slug)
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("SYNC SUMMARY")
+    print(f"{'='*60}")
+    print(f"Successful: {len(results['success'])}")
+    print(f"Failed: {len(results['failed'])}")
+
+    if results["failed"]:
+        print(f"\nFailed teams: {', '.join(results['failed'])}")
+        return False
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Sync GitHub team membership to Slack user groups"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be done without making changes",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List all configured team-to-usergroup mappings",
+    )
+
+    args = parser.parse_args()
+
+    if args.list:
+        print(f"Fetching child teams of '{PARENT_TEAM_SLUG}'...")
+        team_to_usergroup = get_team_to_usergroup_mapping()
+        if not team_to_usergroup:
+            sys.exit(1)
+        print("\nTeam-to-usergroup mappings:")
+        print(f"{'GitHub Team':<35} {'Slack Usergroup':<30}")
+        print("-" * 65)
+        for team, usergroup in sorted(team_to_usergroup.items()):
+            print(f"{team:<35} @{usergroup:<29}")
+        return
+
+    success = sync_all_teams(dry_run=args.dry_run)
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml
index 369a866f1c2..0b71577b587 100644
--- a/.github/workflows/_build_test_publish_wheel.yml
+++ b/.github/workflows/_build_test_publish_wheel.yml
@@ -17,8 +17,6 @@ on:
         type: boolean
         default: true
     secrets:
-      TWINE_USERNAME:
-        required: true
       TWINE_PASSWORD:
         required: true
 
@@ -74,7 +72,7 @@ jobs:
             rm LICENSE || true
             docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
               for python_version in cp310 cp311 cp312 cp313; do \
-                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools>=80.0.0" build; \
+                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \
               done && \
               for python_version in cp310 cp311 cp312 cp313; do \
                 /opt/python/${python_version}-${python_version}/bin/python -m build; \
@@ -140,36 +138,51 @@ jobs:
       - name: Upload wheels
         uses: actions/upload-artifact@v4
         with:
-          name: wheels-${{ matrix.PACKAGE }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
+          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
           path: dist/
 
   publish-wheels:
     needs: [build-and-test-wheels]
     runs-on: ubuntu-latest
     if: inputs.no-publish == false
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
     strategy:
       fail-fast: false
       matrix:
         include:
-          - PACKAGE: megatron_core
-          - PACKAGE: megatron_fsdp
+          - PACKAGE: megatron-core
+            PLATFORM: arm64
+          - PACKAGE: megatron-core
+            PLATFORM: amd64
+          - PACKAGE: megatron-fsdp
+            PLATFORM: amd64
     env:
       PACKAGE: ${{ matrix.PACKAGE }}
     steps:
       - name: Download wheels
         uses: actions/download-artifact@v4
         with:
-          name: wheels-${{ matrix.PACKAGE }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
+          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
           path: dist/
           merge-multiple: true
 
       - name: Publish wheels
         env:
-          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+          TWINE_USERNAME: __token__
           TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
           TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
+          PLATFORM: ${{ matrix.PLATFORM }}
         run: |
-          ls -al dist/$PACKAGE*
+
+          # Delete sdist for arm64 since we already upload it with amd64.
+          if [ "$PLATFORM" == "arm64" ]; then
+            rm dist/*.tar.gz
+          fi
+
+          ls -al dist/
           pip install twine
-          twine upload -r $TWINE_REPOSITORY -u $TWINE_USERNAME -p $TWINE_PASSWORD dist/$PACKAGE*
+          twine upload \
+            --verbose \
+            -r $TWINE_REPOSITORY \
+            -u $TWINE_USERNAME \
+            -p $TWINE_PASSWORD \
+            dist/*
diff --git a/.github/workflows/_release_library.yml b/.github/workflows/_release_library.yml
index c166a58c21e..684dacc27aa 100644
--- a/.github/workflows/_release_library.yml
+++ b/.github/workflows/_release_library.yml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: 'Release'
+name: "Release"
 
 defaults:
   run:
@@ -38,13 +38,24 @@ on:
         description: Create a GitHub release
         type: boolean
         default: true
+      gh-release-use-changelog-builder:
+        required: false
+        description: Use release-changelog-builder-action to dynamically build changelog
+        type: boolean
+        default: true
+      gh-release-changelog-config:
+        required: false
+        description: Path to changelog builder configuration file
+        type: string
+        default: ".github/workflows/config/changelog-config.json"
+      gh-release-from-tag:
+        required: false
+        description: Starting tag for changelog builder (leave empty for auto-detect)
+        type: string
+        default: ""
     secrets:
-      TWINE_USERNAME:
-        required: true
       TWINE_PASSWORD:
         required: true
-      SLACK_WEBHOOK_ADMIN:
-        required: true
       SLACK_WEBHOOK:
         required: true
       PAT:
@@ -60,13 +71,12 @@ jobs:
     with:
       dry-run: true
       ref: ${{ inputs.release-ref }}
+      no-publish: true
     secrets:
-      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
 
   bump-next-version:
     runs-on: ubuntu-latest
-    environment: main # ${{ inputs.dry-run == true && 'public' || 'main' }}
     needs: build-test-publish-wheels-dry-run
     if: |
       (
@@ -74,7 +84,7 @@ jobs:
       )
       && !cancelled()
     outputs:
-      version: ${{ needs.bump-version-mcore.outputs.release-version }}
+      release-version: ${{ steps.bump-version-mcore.outputs.release-version }}
     env:
       IS_DRY_RUN: ${{ inputs.dry-run }}
     steps:
@@ -89,9 +99,10 @@ jobs:
       - name: Bump version MCore
         id: bump-version-mcore
         env:
-          SRC_DIR: ''
-          PYPROJECT_NAME: 'megatron.core'
+          SRC_DIR: ""
+          PYPROJECT_NAME: "megatron.core"
         run: |
+          set +u
           cd ${{ github.run_id }}
 
           PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"
@@ -101,7 +112,7 @@ jobs:
           PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}')
           PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
 
-          echo "release-version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"
+          echo "release-version=$MAJOR.$MINOR.$PATCH$PRERELEASE" | tee -a "$GITHUB_OUTPUT"
 
           if [[ "$PRERELEASE" != "" ]]; then
             if [[ "$PRERELEASE" == *rc* ]]; then
@@ -127,9 +138,11 @@ jobs:
       - name: Bump version MFSDP
         id: bump-version-mfsdp
         env:
-          SRC_DIR: 'megatron/core/distributed/fsdp/src/'
-          PYPROJECT_NAME: 'megatron_fsdp'
+          SRC_DIR: "megatron/core/distributed/fsdp/src/"
+          PYPROJECT_NAME: "megatron_fsdp"
         run: |
+          set +u
+
           cd ${{ github.run_id }}
 
           PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"
@@ -319,7 +332,6 @@ jobs:
   create-gh-release:
     needs: [build-test-publish-wheels, bump-next-version]
     runs-on: ubuntu-latest
-    environment: ${{ inputs.dry-run == true && 'public' || 'main' }}
     if: |
       (
         success() || !failure()
@@ -341,12 +353,51 @@ jobs:
           ref: ${{ inputs.release-ref }}
           token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
 
+      - name: Determine fromTag for changelog
+        id: determine-from-tag
+        if: inputs.gh-release-use-changelog-builder == true
+        run: |
+          cd ${{ github.run_id }}
+
+          # If gh-release-from-tag is provided, use it
+          if [[ -n "${{ inputs.gh-release-from-tag }}" ]]; then
+            FROM_TAG="${{ inputs.gh-release-from-tag }}"
+            echo "Using provided fromTag: $FROM_TAG"
+          else
+            # Get the most recent tag
+            FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
+            if [[ -z "$FROM_TAG" ]]; then
+              echo "No previous tags found, leaving fromTag empty"
+            else
+              echo "Auto-detected most recent tag: $FROM_TAG"
+            fi
+          fi
+
+          echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT
+
+      - name: Build Changelog
+        id: build-changelog
+        if: inputs.gh-release-use-changelog-builder == true
+        uses: mikepenz/release-changelog-builder-action@v6.1.0
+        env:
+          GITHUB_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
+        with:
+          configuration: ${{ github.run_id }}/${{ inputs.gh-release-changelog-config }}
+          owner: ${{ github.repository_owner }}
+          repo: ${{ github.event.repository.name }}
+          ignorePreReleases: "false"
+          failOnError: "false"
+          fromTag: ${{ steps.determine-from-tag.outputs.from-tag }}
+          toTag: ${{ inputs.release-ref }}
+          mode: ${{ inputs.gh-release-changelog-mode }}
+
       - name: Create release
         id: version-number
         env:
           SHA: ${{ inputs.release-ref }}
           GH_TOKEN: ${{ secrets.PAT }}
           IS_DRY_RUN: ${{ inputs.dry-run }}
+          BUILT_CHANGELOG: ${{ steps.build-changelog.outputs.changelog }}
         run: |
           cd ${{ github.run_id }}
 
@@ -355,7 +406,10 @@ jobs:
           IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false")
           NAME="NVIDIA $PROJECT_NAME ${VERSION}"
 
-          if [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then
+          # Use built changelog if available, otherwise fall back to CHANGELOG.md
+          if [[ -n "$BUILT_CHANGELOG" ]]; then
+            CHANGELOG="$BUILT_CHANGELOG"
+          elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then
             DATE=$(date +"%Y-%m-%d")
             CHANGELOG="Prerelease: $NAME ($DATE)"
           else
@@ -398,10 +452,19 @@ jobs:
             eval "$CMD"
           fi
 
+  publish-docs:
+    needs: [bump-next-version, create-gh-release]
+    uses: ./.github/workflows/release-docs.yml
+    with:
+      dry-run: ${{ inputs.dry-run }}
+      publish-as-latest: true
+      docs-version-override: ${{ needs.bump-next-version.outputs.release-version }}
+      build-docs-ref: ${{ inputs.release-ref }}
+    secrets: inherit
+
   notify:
     needs: [build-test-publish-wheels, create-gh-release]
     runs-on: ubuntu-latest
-    environment: ${{ inputs.dry-run == true && 'public' || 'main' }}
     env:
       GH_URL: https://github.com/${{ github.repository }}/releases/tag/v${{ needs.build-test-publish-wheels.outputs.version }}
       PYPI_URL: https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/${{ needs.build-test-publish-wheels.outputs.pypi-name }}/${{ needs.build-test-publish-wheels.outputs.version }}/
diff --git a/.github/workflows/_update_dependencies.yml b/.github/workflows/_update_dependencies.yml
index b769a480a00..0a5fb47605f 100644
--- a/.github/workflows/_update_dependencies.yml
+++ b/.github/workflows/_update_dependencies.yml
@@ -9,12 +9,6 @@ on:
     secrets:
       PAT:
         required: true
-      AZURE_CLIENT_ID:
-        required: true
-      AZURE_TENANT_ID:
-        required: true
-      AZURE_SUBSCRIPTION_ID:
-        required: true
       SSH_KEY:
         required: true
       SSH_PWD:
@@ -32,26 +26,12 @@ jobs:
         run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"
 
   update-lockfile:
-    environment: nemo-ci
     runs-on: linux-amd64-cpu16
     needs: [pre-flight]
     env:
       SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
       TARGET_BRANCH: ${{ inputs.target-branch }}
     steps:
-      - name: Install Azure CLI
-        run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
-
-      - name: Azure Login
-        uses: azure/login@v2
-        with:
-          client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
-
-      - name: Azure ACR Login
-        run: az acr login --name nemoci
-
       - name: Checkout repo
         uses: actions/checkout@v4
         with:
@@ -96,7 +76,6 @@ jobs:
   create-pr:
     needs: [update-lockfile, pre-flight]
     runs-on: ubuntu-latest
-    environment: main
     env:
       SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
       TARGET_BRANCH: ${{ inputs.target-branch }}
@@ -132,7 +111,7 @@ jobs:
           name: lock-file-${{ env.SOURCE_BRANCH }}
 
       - name: Create Bump PR
-        uses: peter-evans/create-pull-request@v6
+        uses: peter-evans/create-pull-request@v8
         id: create-pull-request
         env:
           title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
diff --git a/.github/workflows/auto-assign-milestone.yml b/.github/workflows/auto-assign-milestone.yml
index 4af8548c90f..d5e4941b1e7 100644
--- a/.github/workflows/auto-assign-milestone.yml
+++ b/.github/workflows/auto-assign-milestone.yml
@@ -13,7 +13,6 @@ permissions:
 jobs:
   assign-milestone:
     runs-on: ubuntu-latest
-    environment: nemo-ci
     if: github.repository == 'NVIDIA/Megatron-LM'
     steps:
       - name: Get PR info
diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml
index 7a8377baa2e..8ff072560b9 100644
--- a/.github/workflows/auto-reminder-bot.yml
+++ b/.github/workflows/auto-reminder-bot.yml
@@ -9,7 +9,6 @@ name: Auto Reminder Bot
 
 jobs:
   run-script:
-    environment: main
     name: Run Auto Reminder Bot
     runs-on: ubuntu-latest
     if: github.repository == 'NVIDIA/Megatron-LM'
@@ -28,7 +27,7 @@ jobs:
 
       - name: Run Auto Reminder Bot
         run: |
-          export SLACK_TOKEN=${{ secrets.SLACK_TOKEN }}
-          export SLACK_WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK_URL }}
+          export SLACK_TOKEN=${{ secrets.SLACK_BOT_TOKEN }}
+          export SLACK_WEBHOOK_URL=${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }}
           export GH_TOKEN=${{ secrets.PAT }}
           python tests/test_utils/python_scripts/auto_reminder_github.py
diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml
index d71876e7306..08a4ef68e89 100644
--- a/.github/workflows/auto-update-copy-pr-bot.yml
+++ b/.github/workflows/auto-update-copy-pr-bot.yml
@@ -8,11 +8,13 @@ name: Auto Update Copy PR Bot
 jobs:
   auto-update-copy-pr-bot:
     runs-on: ubuntu-latest
-    environment: nemo-ci
     if: github.repository == 'NVIDIA/Megatron-LM'
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.PAT }}
+          ref: main
 
       - name: Fetch list of members in mcore-reviewers team
         shell: bash -euxo pipefail {0}
diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml
new file mode 100644
index 00000000000..1bc7dfcb6e4
--- /dev/null
+++ b/.github/workflows/build-docs.yml
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Build docs
+
+on:
+  push:
+    branches:
+      - main
+      - "pull-request/[0-9]+"
+      - "deploy-release/*"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+jobs:
+  pre-flight:
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
+
+  build-docs:
+    needs: [pre-flight]
+    if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
+
+  build-docs-summary:
+    needs: [pre-flight, build-docs]
+    if: |
+      (
+        needs.pre-flight.outputs.is_deployment_workflow == 'true'
+        || always()
+      )
+      && !cancelled()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Get workflow result
+        id: result
+        shell: bash -x -e -u -o pipefail {0}
+        env:
+          GH_TOKEN: ${{ github.token }}
+          RUN_ID: ${{ github.run_id }}
+          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+        run: |
+          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
+
+          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+              echo "✅ All previous jobs completed successfully"
+              exit 0
+          else
+              echo "❌ Found $FAILED_JOBS failed job(s)"
+              # Show which jobs failed
+              gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
+              exit 1
+          fi
diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml
index 7aec7ae9ab9..4530f38c6e9 100644
--- a/.github/workflows/build-test-publish-wheel.yml
+++ b/.github/workflows/build-test-publish-wheel.yml
@@ -18,8 +18,8 @@ name: Build, test, and publish a PyPi wheel (to testpypi).
 #   push:
 #     branches:
 #       - main
-#       - 'pull-request/[0-9]+'
-#       - 'deploy-release/*'
+#       - "pull-request/[0-9]+"
+#       - "deploy-release/*"
 #   merge_group:
 #     types: [checks_requested]
 
@@ -42,8 +42,7 @@ jobs:
     with:
       no-publish: true
     secrets:
-      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
 
   build-test-publish-wheel-summary:
     needs: [pre-flight, build-test-publish-wheels]
@@ -65,7 +64,7 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
           GITHUB_RUN_ID: ${{ github.run_id }}
-          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' || github.ref != 'refs/heads/main' }}
+          SKIPPING_IS_ALLOWED: true
         run: |
           FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
 
diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml
index d6d5f9c9b20..44340bdedc5 100644
--- a/.github/workflows/check_api_backwards_compatibility_workflow.yml
+++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml
@@ -1,4 +1,6 @@
-name: API Compatibility Check
+# Temporarily disable this check until we can enforce it on PRs
+#
+# name: API Compatibility Check
 
 # on:
 #   push:
@@ -17,258 +19,258 @@ name: API Compatibility Check
 #         description: 'Baseline git reference (tag/branch/commit)'
 #         required: true
 
-jobs:
-  pre-flight:
-    name: Pre-flight check
-    runs-on: ubuntu-latest
-    outputs:
-      should_skip: ${{ steps.check_files.outputs.should_skip }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      
-      - name: Check if relevant files changed
-        id: check_files
-        run: |
-          # For manual triggers, never skip
-          if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
-            echo "should_skip=false" >> $GITHUB_OUTPUT
-            echo "Manual trigger - will run compatibility check"
-            exit 0
-          fi
+# jobs:
+#   pre-flight:
+#     name: Pre-flight check
+#     runs-on: ubuntu-latest
+#     outputs:
+#       should_skip: ${{ steps.check_files.outputs.should_skip }}
+#     steps:
+#       - name: Checkout code
+#         uses: actions/checkout@v4
+#         with:
+#           fetch-depth: 0
 
-          # Determine base SHA based on event type
-          if [ "${{ github.event_name }}" == "merge_group" ]; then
-            BASE_SHA="${{ github.event.merge_group.base_sha }}"
-            echo "Merge group event - comparing against base: $BASE_SHA"
-          else
-            # For push events, use merge-base to find common ancestor
-            # This ensures we only detect changes actually made in this PR branch,
-            # not changes that happened in main after the branch was created
-            BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "")
-            if [ -z "$BASE_SHA" ]; then
-              # Fallback for pull-request/* branches targeting dev
-              BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "")
-            fi
-            echo "Push event - comparing against merge-base: $BASE_SHA"
-          fi
+#       - name: Check if relevant files changed
+#         id: check_files
+#         run: |
+#           # For manual triggers, never skip
+#           if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
+#             echo "should_skip=false" >> $GITHUB_OUTPUT
+#             echo "Manual trigger - will run compatibility check"
+#             exit 0
+#           fi
 
-          if [ -z "$BASE_SHA" ]; then
-            echo "Could not determine base SHA - will run compatibility check"
-            echo "should_skip=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
+#           # Determine base SHA based on event type
+#           if [ "${{ github.event_name }}" == "merge_group" ]; then
+#             BASE_SHA="${{ github.event.merge_group.base_sha }}"
+#             echo "Merge group event - comparing against base: $BASE_SHA"
+#           else
+#             # For push events, use merge-base to find common ancestor
+#             # This ensures we only detect changes actually made in this PR branch,
+#             # not changes that happened in main after the branch was created
+#             BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "")
+#             if [ -z "$BASE_SHA" ]; then
+#               # Fallback for pull-request/* branches targeting dev
+#               BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "")
+#             fi
+#             echo "Push event - comparing against merge-base: $BASE_SHA"
+#           fi
 
-          # Check for changes in megatron/core Python files (excluding tests and legacy)
-          # Note: Using both *.py and **/*.py to match files at root and in subdirectories
-          CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \
-            'megatron/core/*.py' \
-            'megatron/core/**/*.py' \
-            ':!megatron/core/tests/**' \
-            ':!megatron/legacy/**' 2>/dev/null || echo "")
+#           if [ -z "$BASE_SHA" ]; then
+#             echo "Could not determine base SHA - will run compatibility check"
+#             echo "should_skip=false" >> $GITHUB_OUTPUT
+#             exit 0
+#           fi
 
-          if [ -z "$CHANGED_FILES" ]; then
-            echo "should_skip=true" >> $GITHUB_OUTPUT
-            echo "No relevant megatron/core files changed - will skip compatibility check"
-          else
-            echo "should_skip=false" >> $GITHUB_OUTPUT
-            echo "Relevant files changed:"
-            echo "$CHANGED_FILES"
-          fi
+#           # Check for changes in megatron/core Python files (excluding tests and legacy)
+#           # Note: Using both *.py and **/*.py to match files at root and in subdirectories
+#           CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \
+#             'megatron/core/*.py' \
+#             'megatron/core/**/*.py' \
+#             ':!megatron/core/tests/**' \
+#             ':!megatron/legacy/**' 2>/dev/null || echo "")
 
-  check-compatibility:
-    needs: [pre-flight]
-    if: needs.pre-flight.outputs.should_skip != 'true'
-    name: Check API Backward Compatibility
-    runs-on: ubuntu-latest
-    
-    # ============================================================================
-    # Configuration Parameters (modify here)
-    # ============================================================================
-    env:
-      # Default baseline for automatic PR checks
-      # Can be: branch name (e.g., 'main'), commit hash, or tag
-      # Will be resolved to commit hash during execution
-      DEFAULT_BASELINE: 'f5344166732f45bb0dd825dc875288ea97b15b47'
-      # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*')
-      TAG_PATTERN: 'core_v*'
-      # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only)
-      TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$'
-    # ============================================================================
-    
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0  # Need full history to access baseline ref
-      
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      
-      - name: Install griffe
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install griffe
-          python -c "import griffe; print('Griffe installed successfully')"
-          python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed"
-          python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed"
-      
-      - name: Determine baseline reference
-        id: baseline
-        run: |
-          if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
-            # Use manually specified baseline (branch, tag, or commit hash)
-            BASELINE_REF="${{ github.event.inputs.baseline }}"
-          else
-            # Use the configured default baseline
-            BASELINE_REF="${{ env.DEFAULT_BASELINE }}"
-            
-            # Uncomment below to auto-detect from tags instead:
-            # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1)
-            # if [ -z "$BASELINE_REF" ]; then
-            #   echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2
-            #   BASELINE_REF="${{ env.DEFAULT_BASELINE }}"
-            # fi
-          fi
-          
-          # Resolve baseline to commit hash (works for branches, tags, or commit hashes)
-          BASELINE_HASH=$(git rev-parse "$BASELINE_REF")
-          
-          echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT
-          echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)"
-      
-      - name: Run compatibility check
-        id: compat_check
-        run: |
-          # Save output to file for later display
-          python scripts/check_api_backwards_compatibility.py \
-            --baseline ${{ steps.baseline.outputs.baseline }} \
-            --verbose 2>&1 | tee compat_check_output.txt
-          
-          # Capture exit code
-          EXIT_CODE=${PIPESTATUS[0]}
-          echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT
-          exit $EXIT_CODE
-        continue-on-error: true
-      
-      - name: Fail job if breaking changes detected
-        if: steps.compat_check.outcome == 'failure'
-        run: |
-          echo ""
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo "🔍 WHAT IS THIS CHECK?"
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo ""
-          echo "This check ensures that changes to Megatron Core's public API do not"
-          echo "break backward compatibility for users. It compares your PR against"
-          echo "the latest stable release to detect breaking changes in:"
-          echo ""
-          echo "  • Function signatures (parameters, order, types)"
-          echo "  • Class structures and methods"
-          echo "  • Return types and public interfaces"
-          echo ""
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo "🛠️  HOW TO FIX THIS"
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo ""
-          echo "Choose ONE of these resolution strategies:"
-          echo ""
-          echo "1️⃣  REVERT THE BREAKING CHANGE (Recommended)"
-          echo "   → Modify your code to preserve backward compatibility"
-          echo "   → Add new parameters as optional (with defaults)"
-          echo "   → Keep existing parameters in the same order"
-          echo ""
-          echo "2️⃣  MARK AS INTERNAL API (If this is internal code)"
-          echo "   → Add @internal_api decorator from megatron.core.utils"
-          echo ""
-          echo "   Example (for classes):"
-          echo "      from megatron.core.utils import internal_api"
-          echo ""
-          echo "      @internal_api"
-          echo "      class ExperimentalFeature:"
-          echo "          pass"
-          echo ""
-          echo "   Example (for functions):"
-          echo "      from megatron.core.utils import internal_api"
-          echo ""
-          echo "      @internal_api"
-          echo "      def internal_helper_function():"
-          echo "          pass"
-          echo ""
-          echo "3️⃣  MARK AS EXPERIMENTAL API (If this is experimental code)"
-          echo "   → Add @experimental_api decorator from megatron.core.utils"
-          echo ""
-          echo "   Example:"
-          echo "      from megatron.core.utils import experimental_api"
-          echo ""
-          echo "      @experimental_api"
-          echo "      class ExperimentalFeature:"
-          echo "          pass"
-          echo ""
-          echo "4️⃣  USE DEPRECATION (For gradual API changes)"
-          echo "   → Add @deprecated decorator for transition period"
-          echo "   → Example:"
-          echo "      from megatron.core.utils import deprecated"
-          echo ""
-          echo "      @deprecated(version='1.0', removal_version='2.0',"
-          echo "                  alternative='new_function')"
-          echo "      def old_function():"
-          echo "          pass"
-          echo ""
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo "📋 BREAKING CHANGES DETECTED"
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo ""
-          cat compat_check_output.txt
-          echo ""
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo "📚 MORE INFORMATION"
-          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
-          echo ""
-          echo "📖 Full documentation: docs/api-backwards-compatibility-check.md"
-          echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py"
-          echo "❓ Questions? Check the docs or ask in #megatron-core"
-          echo ""
-          
-          echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy."
-          exit 1
-      
-      - name: Success message
-        if: steps.compat_check.outcome == 'success'
-        run: |
-          echo "::notice::✅ No breaking API changes detected!"
+#           if [ -z "$CHANGED_FILES" ]; then
+#             echo "should_skip=true" >> $GITHUB_OUTPUT
+#             echo "No relevant megatron/core files changed - will skip compatibility check"
+#           else
+#             echo "should_skip=false" >> $GITHUB_OUTPUT
+#             echo "Relevant files changed:"
+#             echo "$CHANGED_FILES"
+#           fi
 
-  api-backward-compatibility-summary:
-    needs: [pre-flight, check-compatibility]
-    runs-on: ubuntu-latest
-    name: API Backward Compatibility Check Summary
-    if: always() && !cancelled()
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+#   check-compatibility:
+#     needs: [pre-flight]
+#     if: needs.pre-flight.outputs.should_skip != 'true'
+#     name: "OPTIONAL: Check API Backward Compatibility"
+#     runs-on: ubuntu-latest
 
-      - name: Validate workflow result
-        shell: bash -x -e -u -o pipefail {0}
-        env:
-          GH_TOKEN: ${{ github.token }}
-          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }}
-        run: |
-          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "API Backward Compatibility Check Summary")] | length') || echo 0
+#     # ============================================================================
+#     # Configuration Parameters (modify here)
+#     # ============================================================================
+#     env:
+#       # Default baseline for automatic PR checks
+#       # Can be: branch name (e.g., 'main'), commit hash, or tag
+#       # Will be resolved to commit hash during execution
+#       DEFAULT_BASELINE: '5ab481cb45efc72add12f8ba0378e849b3d2bc50'
+#       # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*')
+#       TAG_PATTERN: 'core_v*'
+#       # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only)
+#       TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$'
+#     # ============================================================================
 
-          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-              if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-                  echo "✅ Compatibility check was skipped (no relevant files changed)"
-              else
-                  echo "✅ All checks passed successfully"
-              fi
-              exit 0
-          else
-              echo "❌ Found $FAILED_JOBS failed job(s)"
-              gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "API Backward Compatibility Check Summary") | .name'
-              exit 1
-          fi
+#     steps:
+#       - name: Checkout code
+#         uses: actions/checkout@v4
+#         with:
+#           fetch-depth: 0  # Need full history to access baseline ref
+
+#       - name: Set up Python
+#         uses: actions/setup-python@v5
+#         with:
+#           python-version: '3.12'
+
+#       - name: Install griffe
+#         run: |
+#           python -m pip install --upgrade pip
+#           python -m pip install griffe
+#           python -c "import griffe; print('Griffe installed successfully')"
+#           python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed"
+#           python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed"
+
+#       - name: Determine baseline reference
+#         id: baseline
+#         run: |
+#           if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
+#             # Use manually specified baseline (branch, tag, or commit hash)
+#             BASELINE_REF="${{ github.event.inputs.baseline }}"
+#           else
+#             # Use the configured default baseline
+#             BASELINE_REF="${{ env.DEFAULT_BASELINE }}"
+
+#             # Uncomment below to auto-detect from tags instead:
+#             # BASELINE_REF=$(git tag -l '${{ env.TAG_PATTERN }}' | grep -E '${{ env.TAG_REGEX_FILTER }}' | sort -V | tail -1)
+#             # if [ -z "$BASELINE_REF" ]; then
+#             #   echo "Warning: No tags matching pattern found. Using default: ${{ env.DEFAULT_BASELINE }}" >&2
+#             #   BASELINE_REF="${{ env.DEFAULT_BASELINE }}"
+#             # fi
+#           fi
+
+#           # Resolve baseline to commit hash (works for branches, tags, or commit hashes)
+#           BASELINE_HASH=$(git rev-parse "$BASELINE_REF")
+
+#           echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT
+#           echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)"
+
+#       - name: Run compatibility check
+#         id: compat_check
+#         run: |
+#           # Save output to file for later display
+#           python scripts/check_api_backwards_compatibility.py \
+#             --baseline ${{ steps.baseline.outputs.baseline }} \
+#             --verbose 2>&1 | tee compat_check_output.txt
+
+#           # Capture exit code
+#           EXIT_CODE=${PIPESTATUS[0]}
+#           echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT
+#           exit $EXIT_CODE
+#         continue-on-error: true
+
+#       - name: Fail job if breaking changes detected
+#         if: steps.compat_check.outcome == 'failure'
+#         run: |
+#           echo ""
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo "🔍 WHAT IS THIS CHECK?"
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo ""
+#           echo "This check ensures that changes to Megatron Core's public API do not"
+#           echo "break backward compatibility for users. It compares your PR against"
+#           echo "the latest stable release to detect breaking changes in:"
+#           echo ""
+#           echo "  • Function signatures (parameters, order, types)"
+#           echo "  • Class structures and methods"
+#           echo "  • Return types and public interfaces"
+#           echo ""
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo "🛠️  HOW TO FIX THIS"
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo ""
+#           echo "Choose ONE of these resolution strategies:"
+#           echo ""
+#           echo "1️⃣  REVERT THE BREAKING CHANGE (Recommended)"
+#           echo "   → Modify your code to preserve backward compatibility"
+#           echo "   → Add new parameters as optional (with defaults)"
+#           echo "   → Keep existing parameters in the same order"
+#           echo ""
+#           echo "2️⃣  MARK AS INTERNAL API (If this is internal code)"
+#           echo "   → Add @internal_api decorator from megatron.core.utils"
+#           echo ""
+#           echo "   Example (for classes):"
+#           echo "      from megatron.core.utils import internal_api"
+#           echo ""
+#           echo "      @internal_api"
+#           echo "      class ExperimentalFeature:"
+#           echo "          pass"
+#           echo ""
+#           echo "   Example (for functions):"
+#           echo "      from megatron.core.utils import internal_api"
+#           echo ""
+#           echo "      @internal_api"
+#           echo "      def internal_helper_function():"
+#           echo "          pass"
+#           echo ""
+#           echo "3️⃣  MARK AS EXPERIMENTAL API (If this is experimental code)"
+#           echo "   → Add @experimental_api decorator from megatron.core.utils"
+#           echo ""
+#           echo "   Example:"
+#           echo "      from megatron.core.utils import experimental_api"
+#           echo ""
+#           echo "      @experimental_api"
+#           echo "      class ExperimentalFeature:"
+#           echo "          pass"
+#           echo ""
+#           echo "4️⃣  USE DEPRECATION (For gradual API changes)"
+#           echo "   → Add @deprecated decorator for transition period"
+#           echo "   → Example:"
+#           echo "      from megatron.core.utils import deprecated"
+#           echo ""
+#           echo "      @deprecated(version='1.0', removal_version='2.0',"
+#           echo "                  alternative='new_function')"
+#           echo "      def old_function():"
+#           echo "          pass"
+#           echo ""
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo "📋 BREAKING CHANGES DETECTED"
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo ""
+#           cat compat_check_output.txt
+#           echo ""
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo "📚 MORE INFORMATION"
+#           echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+#           echo ""
+#           echo "📖 Full documentation: docs/api-backwards-compatibility-check.md"
+#           echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py"
+#           echo "❓ Questions? Check the docs or ask in #megatron-core"
+#           echo ""
+
+#           echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy."
+#           exit 1
+
+#       - name: Success message
+#         if: steps.compat_check.outcome == 'success'
+#         run: |
+#           echo "::notice::✅ No breaking API changes detected!"
+
+#   api-backward-compatibility-summary:
+#     needs: [pre-flight, check-compatibility]
+#     runs-on: ubuntu-latest
+#     name: "OPTIONAL: API Backward Compatibility Check Summary"
+#     if: always() && !cancelled()
+#     steps:
+#       - name: Checkout
+#         uses: actions/checkout@v4
+
+#       - name: Validate workflow result
+#         shell: bash -x -e -u -o pipefail {0}
+#         env:
+#           GH_TOKEN: ${{ github.token }}
+#           SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }}
+#         run: |
+#           FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary")] | length') || echo 0
+
+#           if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+#               if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+#                   echo "✅ Compatibility check was skipped (no relevant files changed)"
+#               else
+#                   echo "✅ All checks passed successfully"
+#               fi
+#               exit 0
+#           else
+#               echo "❌ Found $FAILED_JOBS failed job(s)"
+#               gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name'
+#               exit 1
+#           fi
 
diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml
index 6a3f4c416a9..8102968043a 100644
--- a/.github/workflows/cherry-pick-release-commit.yml
+++ b/.github/workflows/cherry-pick-release-commit.yml
@@ -26,5 +26,5 @@ jobs:
       target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+'
     secrets:
       PAT: ${{ secrets.PAT }}
-      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml
index 61f991cfadf..de46e29e74b 100644
--- a/.github/workflows/cicd-approve-test-queue.yml
+++ b/.github/workflows/cicd-approve-test-queue.yml
@@ -181,8 +181,8 @@ jobs:
     steps:
       - name: Notify
         env:
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
+          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
+          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
           GITHUB_RUN_ID: ${{ github.run_id }}
           GITHUB_REPOSITORY: ${{ github.repository }}
         run: |
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 85936fb99ed..d72be6ea9c7 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -18,16 +18,14 @@ name: CICD Megatron-LM
 #     - cron: 0 0 * * *
 #   push:
 #     branches:
-#       - dev
-#       - main
-#       - 'pull-request/[0-9]+'
-#       - 'deploy-release/*'
+#       - "pull-request/[0-9]+"
+#       - "deploy-release/*"
 #   merge_group:
 #     types: [checks_requested]
 #   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref || github.event.pull_request.number }}
   cancel-in-progress: true
 
 permissions:
@@ -51,6 +49,7 @@ jobs:
     env:
       GITHUB_TOKEN: ${{ secrets.PAT }}
       REPO: ${{ github.repository }}
+      DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -84,52 +83,53 @@ jobs:
 
           # Use SSO membership check result
           IS_MEMBER="${{ steps.check-sso.outputs.is_member }}"
+
+          # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo
+          if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then
+            PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
+
+            echo "Checking if $PR_AUTHOR is a repo collaborator..."
+            API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR"
+            REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer $GITHUB_TOKEN" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              $API_URL)
+
+            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
+            API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR"
+            ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer $GITHUB_TOKEN" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              $API_URL)
+
+            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
+            API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR"
+            ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer $GITHUB_TOKEN" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              $API_URL)
+
+            if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
+              IS_MEMBER="true"
+            else
+              exit 1
+            fi
+          fi
+
+          # Use SSO membership check result
           if [ "$IS_MEMBER" == "true" ]; then
             echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
           else
             echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT
           fi
 
-      - name: Find Comment
-        uses: peter-evans/find-comment@v4
-        if: startsWith(github.ref, 'refs/heads/pull-request/')
-        id: fc
-        with:
-          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
-          repository: ${{ github.repository }}
-          body-includes: '<!--external-contributor-comment-->'
-
-      - name: Delete comment
-        uses: actions/github-script@v7
-        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.fc.outputs.comment-id != ''
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            await github.rest.issues.deleteComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              comment_id: ${{ steps.fc.outputs.comment-id }}
-            })
-
-      - name: Write pull request comment
-        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.check-membership.outputs.is_maintainer == 'false'
-        uses: peter-evans/create-or-update-comment@v5
-        with:
-          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
-          repository: ${{ github.repository }}
-          body: |
-            <!--external-contributor-comment-->
-
-            Thank you for your contribution!
-
-            NVIDIA Megatron-LM is currently transitioning to development on Github. We will aim to review your PR after we complete our transition and stabilize our Github development process.
-
-            Thank you for your understanding.
-
   pre-flight:
     needs: [is-not-external-contributor]
     if: github.repository == 'NVIDIA/Megatron-LM'
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
 
   linting:
     runs-on: ubuntu-latest
@@ -177,10 +177,11 @@ jobs:
   cicd-wait-in-queue:
     runs-on: ubuntu-latest
     needs: [pre-flight, linting]
-    environment: ${{ needs.pre-flight.outputs.is_merge_group == 'true' && 'merge-gate' || 'test' }}
+    environment: "test"
     if: |
       !(needs.pre-flight.outputs.is_ci_workload == 'true'
       || needs.pre-flight.outputs.is_deployment_workflow == 'true'
+      || needs.pre-flight.outputs.is_merge_group == 'true'
       || needs.pre-flight.outputs.docs_only == 'true')
     steps:
       - name: Running CI tests
@@ -192,16 +193,42 @@ jobs:
     needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue]
     runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
     if: |
-      (
+      needs.is-not-external-contributor.result != 'cancelled'
+      && needs.pre-flight.result != 'cancelled'
+      && needs.cicd-wait-in-queue.result != 'cancelled'
+      && (
         success()
         || needs.pre-flight.outputs.is_ci_workload == 'true'
+        || needs.pre-flight.outputs.is_merge_group == 'true'
         || needs.pre-flight.outputs.force_run_all == 'true'
       )
-      && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     steps:
+      - name: Get PR info
+        id: get-pr-info
+        if: startsWith(github.ref, 'refs/heads/pull-request/')
+        uses: nv-gha-runners/get-pr-info@main
+
+      - name: Get merge commit sha
+        shell: bash -x -e -u -o pipefail {0}
+        id: sha
+        env:
+          IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
+          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
+        run: |
+          if [[ "$IS_PR" == "true" ]]; then
+            SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
+          elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
+            SHA=${{ github.event.merge_group.head_sha }}
+          else
+            SHA=${GITHUB_SHA}
+          fi
+          echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"
+
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.sha.outputs.main }}
 
       - name: Setup python
         uses: actions/setup-python@v5
@@ -214,11 +241,6 @@ jobs:
           apt-get update
           apt-get install -y gh
 
-      - name: Get PR info
-        id: get-pr-info
-        if: startsWith(github.ref, 'refs/heads/pull-request/')
-        uses: nv-gha-runners/get-pr-info@main
-
       - name: Has lts label
         id: has-lts-label
         env:
@@ -284,7 +306,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Build and push
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           file: ./docker/Dockerfile.ci.dev
           push: true
@@ -315,12 +337,15 @@ jobs:
       - cicd-wait-in-queue
       - cicd-container-build
     if: |
-      (
+      needs.pre-flight.result != 'cancelled'
+      && needs.cicd-wait-in-queue.result != 'cancelled'
+      && needs.cicd-container-build.result != 'cancelled'
+      && (
         success()
         || needs.pre-flight.outputs.is_ci_workload == 'true'
         || needs.pre-flight.outputs.force_run_all == 'true'
+        || needs.pre-flight.outputs.is_merge_group == 'true'
       )
-      && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     steps:
       - name: Checkout
@@ -328,7 +353,7 @@ jobs:
       - name: Parse unit tests
         id: parse-unit-tests
         run: |
-          cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
+          cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
           echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT
 
   cicd-unit-tests-latest:
@@ -343,14 +368,20 @@ jobs:
       - cicd-container-build
       - cicd-parse-unit-tests
     runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
-    name: '${{ matrix.bucket }} - latest'
+    timeout-minutes: 60
+    name: "${{ matrix.bucket }} - latest"
     if: |
-      (
+      needs.is-not-external-contributor.result != 'cancelled'
+      && needs.pre-flight.result != 'cancelled'
+      && needs.cicd-wait-in-queue.result != 'cancelled'
+      && needs.cicd-container-build.result != 'cancelled'
+      && needs.cicd-parse-unit-tests.result != 'cancelled'
+      && (
         success()
         || needs.pre-flight.outputs.is_ci_workload == 'true'
         || needs.pre-flight.outputs.force_run_all == 'true'
+        || needs.pre-flight.outputs.is_merge_group == 'true'
       )
-      && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     env:
       PIP_DISABLE_PIP_VERSION_CHECK: 1
@@ -365,9 +396,10 @@ jobs:
           test_case: ${{ matrix.bucket }}
           tag: latest
           timeout: ${{ matrix.timeout || 30 }}
-          is_unit_test: 'true'
+          is_unit_test: "true"
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
+          is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
 
   cicd-parse-integration-tests:
     runs-on: ubuntu-latest
@@ -377,12 +409,16 @@ jobs:
       - cicd-container-build
       - cicd-unit-tests-latest
     if: |
-      (
+      needs.pre-flight.result != 'cancelled'
+      && needs.cicd-wait-in-queue.result != 'cancelled'
+      && needs.cicd-container-build.result != 'cancelled'
+      && needs.cicd-unit-tests-latest.result != 'cancelled'
+      && (
         success()
         || needs.pre-flight.outputs.is_ci_workload == 'true'
         || needs.pre-flight.outputs.force_run_all == 'true'
+        || needs.pre-flight.outputs.is_merge_group == 'true'
       )
-      && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     outputs:
       integration-tests: ${{ steps.main.outputs.integration-tests }}
@@ -408,16 +444,18 @@ jobs:
         id: has-run-functional-tests-label
         env:
           GH_TOKEN: ${{ secrets.PAT }}
+          IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
         run: |
           PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
-          HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
+          HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")')
+          HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
           echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
 
       - name: Parse functional tests
         id: main
         env:
           HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
-          HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main }}
+          HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
         run: |
           export PYTHONPATH=$(pwd)
 
@@ -458,6 +496,7 @@ jobs:
           echo "integration-tests=$(cat integration-tests.json)" | tee -a "$GITHUB_OUTPUT"
 
   cicd-integration-tests-latest:
+    timeout-minutes: 60
     strategy:
       fail-fast: false
       matrix:
@@ -469,18 +508,23 @@ jobs:
       - cicd-parse-integration-tests
       - cicd-unit-tests-latest
     runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
-    name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
+    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
     env:
       PIP_DISABLE_PIP_VERSION_CHECK: 1
       PIP_NO_PYTHON_VERSION_WARNING: 1
       PIP_ROOT_USER_ACTION: ignore
     if: |
-      (
+      needs.is-not-external-contributor.result != 'cancelled'
+      && needs.pre-flight.result != 'cancelled'
+      && needs.cicd-wait-in-queue.result != 'cancelled'
+      && needs.cicd-parse-integration-tests.result != 'cancelled'
+      && needs.cicd-unit-tests-latest.result != 'cancelled'
+      && (
         success()
         || needs.pre-flight.outputs.is_ci_workload == 'true'
         || needs.pre-flight.outputs.force_run_all == 'true'
+        || needs.pre-flight.outputs.is_merge_group == 'true'
       )
-      && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     steps:
       - name: Checkout
@@ -492,9 +536,11 @@ jobs:
           model: ${{ matrix.model }}
           tag: latest
           timeout: ${{ matrix.timeout || 30 }}
-          is_unit_test: 'false'
+          is_unit_test: "false"
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
+          is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
+          is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}
 
   Nemo_CICD_Test:
     needs:
@@ -525,8 +571,8 @@ jobs:
           GITHUB_RUN_ID: ${{ github.run_id }}
           SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
         run: |
-          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure")] | length') || echo 0
-          SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped")] | length') || echo 0
+          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure" and .name != "merge-queue-notification" and .name != "cicd-mbridge-testing")] | length') || echo 0
+          SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped" and .name != "merge-queue-notification" and .name != "cicd-mbridge-testing")] | length') || echo 0
 
           if [ "${FAILED_JOBS:-0}" -eq 0 ] && ([ "${SKIPPED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]); then
               echo "✅ All previous jobs completed successfully"
@@ -618,6 +664,34 @@ jobs:
             .coverage
           include-hidden-files: true
 
+  merge-queue-notification:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'merge_group'
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Extract PR number from merge group
+        id: get-pr-number
+        run: |
+          # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<number>-<sha>)
+          PR_NUMBER=$(echo "${{ github.event.merge_group.head_ref }}" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p')
+          echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
+
+      - name: Comment on PR with action run URL
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.PAT }}
+          script: |
+            const prNumber = ${{ steps.get-pr-number.outputs.pr_number }};
+            const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`;
+
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}`
+            });
+
   cleanup-taint-node:
     runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
     needs:
diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json
new file mode 100644
index 00000000000..e640b90a0f3
--- /dev/null
+++ b/.github/workflows/config/changelog-config.json
@@ -0,0 +1,24 @@
+{
+    "categories": [],
+    "ignore_labels": [
+      "ignore"
+    ],
+    "sort": "ASC",
+    "template": "\n${{CHANGELOG}}\n\n<details><summary>Changelog Details</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
+    "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
+    "commit_template": "- ${{TITLE}} by @${{AUTHOR}}",
+    "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
+    "duplicate_filter": {
+      "pattern": ".+",
+      "on_property": "title",
+      "method": "match"
+    },
+    "transformers": [],
+    "max_tags_to_fetch": 100,
+    "max_pull_requests": 500,
+    "max_back_track_time_days": 365,
+    "exclude_merge_branches": [],
+    "tag_resolver": {
+      "method": "semver"
+    }
+}
diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml
index 46efe4fbb53..c3de445483d 100644
--- a/.github/workflows/dependabot.yml
+++ b/.github/workflows/dependabot.yml
@@ -11,7 +11,6 @@ permissions:
 jobs:
   get-release-branch-names:
     runs-on: ubuntu-latest
-    environment: nemo-ci
     outputs:
       mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
     if: github.repository == 'NVIDIA/Megatron-LM'
@@ -41,9 +40,6 @@ jobs:
       target-branch: ${{ matrix.target-branch }}
     secrets:
       PAT: ${{ secrets.PAT }}
-      AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
-      AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
-      AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
       SSH_KEY: ${{ secrets.SSH_KEY }}
       SSH_PWD: ${{ secrets.SSH_PWD }}
 
@@ -54,8 +50,8 @@ jobs:
     steps:
       - name: Notify
         env:
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
+          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
+          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
           GITHUB_RUN_ID: ${{ github.run_id }}
           GITHUB_REPOSITORY: ${{ github.repository }}
         run: |
diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml
index 6192c5e048a..17ff8e18494 100644
--- a/.github/workflows/install-test.yml
+++ b/.github/workflows/install-test.yml
@@ -22,8 +22,8 @@ name: Installation Test
 #     branches:
 #       - dev
 #       - main
-#       - 'pull-request/[0-9]+'
-#       - 'deploy-release/*'
+#       - "pull-request/[0-9]+"
+#       - "deploy-release/*"
 #   merge_group:
 #     types: [checks_requested]
 
@@ -43,11 +43,10 @@ jobs:
     name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
     container:
       image: nvcr.io/nvidia/pytorch:25.05-py3
-    environment: nemo-ci
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.12']
+        python-version: ["3.12"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -89,11 +88,10 @@ jobs:
     name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
     container:
       image: nvcr.io/nvidia/pytorch:25.05-py3
-    environment: nemo-ci
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.12']
+        python-version: ["3.12"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
diff --git a/.github/workflows/multi-approval-bot.yml b/.github/workflows/multi-approval-bot.yml
index 58df3607e18..eb1d7c86f70 100644
--- a/.github/workflows/multi-approval-bot.yml
+++ b/.github/workflows/multi-approval-bot.yml
@@ -15,7 +15,6 @@ jobs:
   codeowners-approval:
     needs: [pre-flight]
     runs-on: ubuntu-latest
-    environment: nemo-ci
     if: |
       !(needs.pre-flight.outputs.docs_only == 'true'
       || needs.pre-flight.outputs.is_merge_group == 'true'
diff --git a/.github/workflows/oncall-assign.yml b/.github/workflows/oncall-assign.yml
new file mode 100644
index 00000000000..d4cc47d5f9e
--- /dev/null
+++ b/.github/workflows/oncall-assign.yml
@@ -0,0 +1,47 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Oncall Assign
+
+on:
+  pull_request_target:
+    types: [opened, ready_for_review]
+    branches:
+      - main
+
+permissions:
+  pull-requests: write
+  contents: read
+
+jobs:
+  assign-reviewer:
+    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: pip install requests slack-sdk
+
+      - name: Assign Reviewer
+        env:
+          GH_TOKEN: ${{ secrets.PAT }}
+        run: |
+          python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }}
diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml
new file mode 100644
index 00000000000..a621be7f652
--- /dev/null
+++ b/.github/workflows/oncall-rotation.yml
@@ -0,0 +1,60 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Oncall Rotation
+
+on:
+  schedule:
+    # Runs at 09:00 UTC every Wednesday
+    - cron: "0 9 * * 3"
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  rotate-schedule:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.PAT }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Rotate Schedule
+        env:
+          # Token to read org team members. Needs read:org scope.
+          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
+          # Slack token for updating the Slack usergroup
+          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
+        run: |
+          pip install --no-cache-dir "uv<0.9.29"
+          uv venv .venv
+          uv cache clean
+          uv sync --no-cache 
+          uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate
+
+      - name: Commit and Push changes
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git add .github/oncall_schedule.json
+          git commit -m "chore: rotate oncall schedule" || echo "No changes to commit"
+          git pull --rebase
+          git push origin HEAD:main
diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml
new file mode 100644
index 00000000000..35226649fc9
--- /dev/null
+++ b/.github/workflows/release-docs.yml
@@ -0,0 +1,104 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name: Release docs
+on:
+  workflow_dispatch:
+    inputs:
+      dry-run:
+        description: Whether to run the workflow in dry-run mode
+        required: true
+        type: boolean
+        default: true
+      publish-as-latest:
+        description: Publish as Latest stable version.
+        required: false
+        type: boolean
+        default: true
+      docs-version-override:
+        description: Docs version if commit is not tagged
+        required: false
+        type: string
+        default: ""
+      notify-emails:
+        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      dry-run:
+        description: Whether to run the workflow in dry-run mode
+        required: true
+        type: boolean
+        default: true
+      publish-as-latest:
+        description: Publish as Latest stable version.
+        required: false
+        type: boolean
+        default: true
+      docs-version-override:
+        description: Docs version if commit is not tagged
+        required: false
+        type: string
+        default: ""
+      notify-emails:
+        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
+        required: false
+        type: string
+      build-docs-ref:
+        description: Reference to build the docs from
+        required: false
+        type: string
+        default: ${{ github.sha }}
+
+jobs:
+  build-docs:
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
+    with:
+      ref: ${{ inputs.build-docs-ref }}
+
+  publish-docs:
+    runs-on: ubuntu-latest
+    needs: [build-docs]
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: NVIDIA-NeMo/FW-CI-templates
+          ref: v0.74.0
+          path: FW-CI-templates
+
+      - uses: ./FW-CI-templates/.github/actions/publish-docs
+        # This workflow runs either on main, or on a version tag. Any other git ref will lead
+        # to an error.
+        # If its on main, it will publish to "latest" directory in Akamai.
+        # If its on a versioned tag, it will extract the version number from the tag (strip `v` prefix)
+        # and publish to the versioned directory in Akamai.
+        with:
+          dry-run: ${{ inputs.dry-run }}
+          artifacts-name: docs-html
+          artifacts-path: _build/html
+          emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
+          overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
+          docs-version-override: ${{ inputs.docs-version-override }}
+          run-on-version-tag-only: ${{ github.ref_name != 'main' }}
+          request-name: megatron-core-publish-docs-${{ github.run_id }}
+          aws-region: ${{ vars.DOCS_AWS_REGION }}
+          aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          akamai-host: ${{ secrets.AKAMAI_HOST }}
+          akamai-client-token: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
+          akamai-client-secret: ${{ secrets.AKAMAI_CLIENT_SECRET }}
+          akamai-access-token: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
+          s3-target-root: ${{ secrets.S3_BUCKET_NAME }}
+          s3-target-path: megatron-core/developer-guide
diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml
index 9b22997d36a..862790476e2 100644
--- a/.github/workflows/release-freeze.yml
+++ b/.github/workflows/release-freeze.yml
@@ -42,5 +42,5 @@ jobs:
       freeze-commit: ${{ inputs.freeze-commit }}
       dry-run: ${{ inputs.dry-run }}
     secrets:
-      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
-      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_MAIN_CHANNEL_WEBHOOK }}
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 1a75cff9832..13a5615d5e6 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-name: 'Release Megatron-Core'
+name: "Release Megatron-Core"
 
 # on:
 #   workflow_dispatch:
@@ -30,6 +30,16 @@ name: 'Release Megatron-Core'
 #         required: true
 #         default: true
 #         type: boolean
+#       generate-changelog:
+#         description: Generate changelog
+#         required: false
+#         default: true
+#         type: boolean
+#       publish-docs:
+#         description: Publish docs
+#         required: false
+#         default: true
+#         type: boolean
 #       version-bump-branch:
 #         description: Branch for version bump
 #         required: true
@@ -47,9 +57,9 @@ jobs:
       dry-run: ${{ inputs.dry-run || false }}
       version-bump-branch: ${{ inputs.version-bump-branch || github.ref_name }}
       create-gh-release: ${{ inputs.create-gh-release || true }}
+      gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
+      publish-docs: ${{ inputs.publish-docs }}
     secrets:
-      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
-      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
+      SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
       PAT: ${{ secrets.PAT }}
diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml
new file mode 100644
index 00000000000..6db5127d9a0
--- /dev/null
+++ b/.github/workflows/sync-team-usergroups.yml
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Sync GitHub Teams to Slack User Groups
+
+on:
+  workflow_dispatch:
+
+jobs:
+  sync-usergroups:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Sync Teams to User Groups
+        env:
+          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
+          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
+        run: |
+          pip install --no-cache-dir "uv<0.9.29"
+          uv venv .venv
+          uv cache clean
+          uv sync --no-cache 
+          uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py
diff --git a/.gitignore b/.gitignore
index 144a8c2b89d..a9ce4aa0a93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,8 @@ onelogger.err
 .venv
 runs/
 /test_cases/
-**/dist/
\ No newline at end of file
+**/dist/
+
+# Sphinx documentation
+docs/_build
+docs/apidocs
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 53574fdea22..a238f2c9999 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,16 +1,16 @@
 .merge_train_rule: &merge_train_rule
-  UNIT_TEST: "yes"
+  UNIT_TEST: 'yes'
   UNIT_TEST_REPEAT: 1
   UNIT_TEST_TIMEOUT: 30
-  INTEGRATION_TEST: "no"
+  INTEGRATION_TEST: 'no'
   INTEGRATION_TEST_SCOPE: mr
-  FUNCTIONAL_TEST: "yes"
+  FUNCTIONAL_TEST: 'yes'
   FUNCTIONAL_TEST_SCOPE: mr-slim
   FUNCTIONAL_TEST_REPEAT: 1
   FUNCTIONAL_TEST_TIME_LIMIT: 2700
-  CLUSTER_A100: ""
-  CLUSTER_H100: ""
-  PUBLISH: "no"
+  CLUSTER_A100: ''
+  CLUSTER_H100: ''
+  PUBLISH: 'no'
 
 workflow:
   rules:
@@ -35,30 +35,30 @@ workflow:
     # For push to main
     - if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/)
       variables:
-        UNIT_TEST: "no"
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "yes"
+        UNIT_TEST: 'no'
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'yes'
         FUNCTIONAL_TEST_SCOPE: mr
         FUNCTIONAL_TEST_REPEAT: 5
-        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
+        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
         FUNCTIONAL_TEST_TIME_LIMIT: 3600
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
       auto_cancel:
         on_new_commit: interruptible
 
     # For merge-trains that need to be fast-tracked
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "no"
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'no'
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
 
     # For normal merge-trains
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
@@ -67,75 +67,75 @@ workflow:
     # For MRs with integration suite
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "yes"
+        INTEGRATION_TEST: 'yes'
         INTEGRATION_TEST_SCOPE: mr
-        FUNCTIONAL_TEST: "no"
+        FUNCTIONAL_TEST: 'no'
         FUNCTIONAL_TEST_SCOPE: mr-slim
         FUNCTIONAL_TEST_REPEAT: 1
         FUNCTIONAL_TEST_TIME_LIMIT: 2700
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
 
     # For MRs with nightly
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "yes"
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'yes'
         FUNCTIONAL_TEST_SCOPE: nightly
         FUNCTIONAL_TEST_REPEAT: 5
-        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
+        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
         FUNCTIONAL_TEST_TIME_LIMIT: 2700
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
 
     # For MRs with weekly
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "yes"
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'yes'
         FUNCTIONAL_TEST_SCOPE: weekly
         FUNCTIONAL_TEST_REPEAT: 1
-        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
+        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
         FUNCTIONAL_TEST_TIME_LIMIT: 9000
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
 
     # For MRs with heavy suite
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "yes"
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'yes'
         FUNCTIONAL_TEST_SCOPE: mr
         FUNCTIONAL_TEST_REPEAT: 1
         FUNCTIONAL_TEST_TIME_LIMIT: 2700
-        CLUSTER_A100: ""
-        CLUSTER_H100: ""
-        PUBLISH: "no"
+        CLUSTER_A100: ''
+        CLUSTER_H100: ''
+        PUBLISH: 'no'
 
     # Default MRs
     - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
       variables:
-        UNIT_TEST: "yes"
+        UNIT_TEST: 'yes'
         UNIT_TEST_REPEAT: 1
         UNIT_TEST_TIMEOUT: 30
-        INTEGRATION_TEST: "no"
-        FUNCTIONAL_TEST: "no"
-        PUBLISH: "no"
+        INTEGRATION_TEST: 'no'
+        FUNCTIONAL_TEST: 'no'
+        PUBLISH: 'no'
 
     - when: never
 
@@ -157,104 +157,109 @@ default:
 
 variables:
   BUILD:
-    value: "yes"
+    value: 'yes'
   UNIT_TEST:
-    value: "yes"
+    value: 'yes'
     options:
-      - "yes"
-      - "no"
+      - 'yes'
+      - 'no'
     description: To run the funtional test suite
   UNIT_TEST_REPEAT:
-    value: "1"
-    description: "Number of repetitions"
+    value: '1'
+    description: 'Number of repetitions'
   UNIT_TEST_TIMEOUT:
-    value: "30"
+    value: '30'
     description: Timeout (minutes) for Unit tests (all repeats)
   INTEGRATION_TEST:
-    value: "yes"
+    value: 'yes'
     options:
-      - "yes"
-      - "no"
+      - 'yes'
+      - 'no'
     description: To run the integration test suite
   INTEGRATION_TEST_SCOPE:
-    value: "mr"
+    value: 'mr'
     options:
-      - "mr"
-      - "nightly"
-      - "weekly"
-      - "pre-release"
-      - "release"
-    description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
+      - 'mr'
+      - 'nightly'
+      - 'weekly'
+      - 'pre-release'
+      - 'release'
+    description: 'Testsuite to run (only for INTEGRATION_TEST=yes)'
   INTEGRATION_TEST_TIME_LIMIT:
-    value: "900"
-    description: "Timeout in seconds per test"
+    value: '900'
+    description: 'Timeout in seconds per test'
   INTEGRATION_TEST_CASES:
-    value: "all"
+    value: 'all'
     description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
   FUNCTIONAL_TEST:
-    value: "yes"
+    value: 'yes'
     options:
-      - "yes"
-      - "no"
+      - 'yes'
+      - 'no'
     description: To run the funtional test suite
   FUNCTIONAL_TEST_SCOPE:
-    value: "mr"
+    value: 'mr'
     options:
-      - "mr"
-      - "nightly"
-      - "weekly"
-      - "pre-release"
-      - "release"
-    description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
+      - 'mr'
+      - 'nightly'
+      - 'weekly'
+      - 'pre-release'
+      - 'release'
+    description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
   FUNCTIONAL_TEST_REPEAT:
-    value: "5"
-    description: "Number of repetitions per test"
+    value: '5'
+    description: 'Number of repetitions per test'
   FUNCTIONAL_TEST_TIME_LIMIT:
-    value: "2700"
-    description: "Timeout in seconds per test"
+    value: '2700'
+    description: 'Timeout in seconds per test'
   FUNCTIONAL_TEST_CASES:
-    value: "all"
+    value: 'all'
     description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
   FUNCTIONAL_TEST_NAME:
-    description: "Name of functional test run (only for pre-release and release)"
-    value: "$$CI_COMMIT_SHA"
+    description: 'Name of functional test run (only for pre-release and release)'
+    value: '$$CI_COMMIT_SHA'
   FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
-    value: "no"
-    description: "Record golden checkpoints"
+    value: 'no'
+    description: 'Record golden checkpoints'
     options:
-      - "yes"
-      - "no"
+      - 'yes'
+      - 'no'
   CLUSTER_A100:
-    value: "dgxa100_dracooci"
+    value: 'dgxa100_dracooci'
     options:
-      - "dgxa100_dracooci"
-      - "dgxa100_dracooci-ord"
-    description: "Cluster for A100 workloads"
+      - 'dgxa100_dracooci'
+      - 'dgxa100_dracooci-ord'
+    description: 'Cluster for A100 workloads'
   CLUSTER_H100:
-    value: "dgxh100_coreweave"
+    value: 'dgxh100_coreweave'
     options:
-      - "dgxh100_coreweave"
-      - "dgxh100_eos"
-    description: "Cluster for H100 workloads"
+      - 'dgxh100_coreweave'
+      - 'dgxh100_eos'
+    description: 'Cluster for H100 workloads'
+  CLUSTER_GB200:
+    value: 'dgxgb200_oci-hsg'
+    options:
+      - 'dgxgb200_oci-hsg'
+    description: 'Cluster for H100 workloads'
   PUBLISH:
-    value: "no"
+    value: 'no'
     options:
-      - "yes"
-      - "no"
+      - 'yes'
+      - 'no'
     description: Build and publish a wheel to PyPi
   PUBLISH_COMMIT:
-    value: "$$CI_COMMIT_SHA"
+    value: '$$CI_COMMIT_SHA'
     description: Which commit to publish
   PUBLISH_VERSION_BUMP_BRANCH:
-    value: "$$CI_COMMIT_BRANCH"
+    value: '$$CI_COMMIT_BRANCH'
     description: Which branch to target for version bump
   PUBLISH_SCOPE:
-    value: "code-freeze"
+    value: 'code-freeze'
     options:
-      - "code-freeze"
-      - "release"
-      - "review-reminder"
-      - "upgrade-dependencies"
+      - 'code-freeze'
+      - 'release'
+      - 'review-reminder'
+      - 'upgrade-dependencies'
     description: Type of publish (freeze or final release)
 
   # CI wide variables
@@ -262,7 +267,7 @@ variables:
   CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
   CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
   UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
-  TE_GIT_REF: ""
+  TE_GIT_REF: ''
 
 include:
   - .gitlab/stages/00.pre.yml
diff --git a/.gitlab/labeler-config.yml b/.gitlab/labeler-config.yml
index 0e218e4bae7..2c37345c0e6 100644
--- a/.gitlab/labeler-config.yml
+++ b/.gitlab/labeler-config.yml
@@ -14,9 +14,6 @@ BERT:
 GPT:
   - megatron/core/models/gpt/**
 
-RETRO:
-  - megatron/core/models/retro/**
-
 Dist-Ckpt:
   - megatron/core/dist_checkpointing
 
diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh
index e64434e834d..0f34b838384 100644
--- a/.gitlab/scripts/build.sh
+++ b/.gitlab/scripts/build.sh
@@ -20,17 +20,28 @@ docker buildx create --name container --driver=docker-container --use tls-enviro
 
 ADDITIONAL_PARAMS=()
 
+CI_COMMIT_BRANCH="${CI_COMMIT_BRANCH:-$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}"
+
 if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then
     ADDITIONAL_PARAMS+=("--pull")
-    ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main,mode=max")
-    ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}")
-elif [[ -n "$CI_MERGE_REQUEST_IID" ]]; then
-    ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID},mode=max")
-    ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}")
+fi
+
+CI_COMMIT_BRANCH=$(echo "$CI_COMMIT_BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9._-]/-/g')
+ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM},mode=max")
+ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM}")
+ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:main-${PLATFORM}")
+ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:dev-${PLATFORM}")
+
+ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}-${PLATFORM}")
+
+if [[ -n "$CI_MERGE_REQUEST_IID" ]]; then
+    ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM},mode=max")
+    ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM}")
+    ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}-${PLATFORM}")
 fi
 
 if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then
-    ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
+    ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly-${PLATFORM}")
 fi
 
 if [[ -n "$TE_GIT_REF" ]]; then
@@ -46,13 +57,11 @@ DOCKER_BUILDKIT=1 docker build \
     --secret id=LOGGER_INDEX_URL \
     --target $STAGE \
     -f docker/$FILE \
-    -t ${IMAGE}:${CI_PIPELINE_ID} \
+    -t ${IMAGE}:${CI_PIPELINE_ID}-${PLATFORM} \
     --builder=container \
     --build-arg JET_API_VERSION=$JET_API_VERSION \
-    --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
-    --cache-from type=registry,ref=${IMAGE}-buildcache:dev \
-    --cache-from type=registry,ref=${IMAGE}-buildcache:main \
     --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
+    --provenance=false \
     --push \
     --progress plain \
     ${ADDITIONAL_PARAMS[@]} .
diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml
index a22c2cf3ea7..d3ac804e599 100644
--- a/.gitlab/stages/00.pre.yml
+++ b/.gitlab/stages/00.pre.yml
@@ -49,7 +49,7 @@ pre:create_ci_branches:
   stage: .pre
   image: python:3.10
   variables:
-    GIT_STRATEGY: "clone"
+    GIT_STRATEGY: 'clone'
   script:
     - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
     - git switch --force-create $branch
@@ -79,7 +79,7 @@ pre:create_ci_branches_dev:
   stage: .pre
   image: python:3.10
   variables:
-    GIT_STRATEGY: "clone"
+    GIT_STRATEGY: 'clone'
   script:
     - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
     - git switch --force-create $branch
@@ -102,7 +102,7 @@ pre:label_merge_request:
     - cd gitlab-mr-labeler
     - go install .
     - cd ..
-    - go install github.com/itchyny/gojq/cmd/gojq@latest
+    - go install github.com/itchyny/gojq/cmd/gojq@v0.12.17
   script:
     - set -x
     - |
@@ -136,7 +136,7 @@ pre:maybe_cherry_pick_to_main:
   stage: .pre
   image: nentangso/alpine-git-curl-jq
   variables:
-    GIT_STRATEGY: "clone"
+    GIT_STRATEGY: 'clone'
   script:
     - |
       set -x
@@ -201,7 +201,7 @@ pre:maybe_cherry_pick_commit:
   stage: .pre
   image: nentangso/alpine-git-curl-jq
   variables:
-    GIT_STRATEGY: "clone"
+    GIT_STRATEGY: 'clone'
   script:
     - set -x
     - set +e
diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml
index d67225311f6..20252e7d045 100644
--- a/.gitlab/stages/01.build.yml
+++ b/.gitlab/stages/01.build.yml
@@ -9,10 +9,10 @@
   extends: [.build_rules, .dind_rules]
   stage: build
   tags:
-    - arch/amd64
+    - arch/${PLATFORM}
     - origin/jet-fleet
     - env/prod
-    - ${TAG}
+    - purpose/builder-large
   services:
     - name: docker:24.0.5-dind
       variables:
@@ -23,7 +23,6 @@
     DOCKER_TLS_CERTDIR: "/certs"
     DOCKER_TLS_VERIFY: 1
     DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client"
-    TAG: purpose/builder-large
     STAGE: jet
     MCORE_BACKWARDS_REF: core_r0.14.0
     KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi
@@ -48,7 +47,7 @@
     reports:
       dotenv: build.env
 
-test:build_image:
+test:pre_build_image:
   extends: [.build_image]
   parallel:
     matrix:
@@ -56,13 +55,30 @@ test:build_image:
         FILE: Dockerfile.ci.dev
         IMAGE_TYPE: lts
         BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
+        PLATFORM: amd64
+      - IMAGE: CI_MCORE_LTS_IMAGE
+        FILE: Dockerfile.ci.dev
+        IMAGE_TYPE: lts
+        BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
+        PLATFORM: arm64
       - IMAGE: CI_MCORE_DEV_IMAGE
         FILE: Dockerfile.ci.dev
         IMAGE_TYPE: dev
-        BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
+        BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+        PLATFORM: amd64
+      - IMAGE: CI_MCORE_DEV_IMAGE
+        FILE: Dockerfile.ci.dev
+        IMAGE_TYPE: dev
+        BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+        PLATFORM: arm64
       - IMAGE: UTILITY_IMAGE
         FILE: Dockerfile.linting
         BASE_IMAGE: python:3.10
+        PLATFORM: amd64
+      - IMAGE: UTILITY_IMAGE
+        FILE: Dockerfile.linting
+        BASE_IMAGE: python:3.10
+        PLATFORM: arm64
 
 test:build_nemo_image:
   extends: [.build_image]
@@ -70,6 +86,57 @@ test:build_nemo_image:
     IMAGE: CI_NEMO_IMAGE
     FILE: Dockerfile.ci.nemo
     BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
+    PLATFORM: amd64
   rules:
     - if: $FUNCTIONAL_TEST == "yes" || $INTEGRATION_TEST == "yes" || $CI_COMMIT_BRANCH == "ci-rebuild-mcore-nemo-image"
       when: on_success
+
+test:build_image:
+  needs: [test:pre_build_image]
+  extends: [.build_rules, .dind_rules]
+  parallel:
+    matrix:
+      - IMAGE: CI_MCORE_LTS_IMAGE
+      - IMAGE: CI_MCORE_DEV_IMAGE
+      - IMAGE: UTILITY_IMAGE
+  stage: build
+  tags:
+    - arch/amd64
+    - origin/jet-fleet
+    - env/prod
+    - purpose/builder-large
+  services:
+    - name: docker:24.0.5-dind
+      variables:
+        HEALTHCHECK_TCP_PORT: "2376"
+  timeout: 180m
+  variables:
+    DOCKER_HOST: tcp://docker:2376
+    DOCKER_TLS_CERTDIR: "/certs"
+    DOCKER_TLS_VERIFY: 1
+    DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client"
+    STAGE: jet
+    MCORE_BACKWARDS_REF: core_r0.14.0
+    KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi
+    KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi
+    SHARED_PATH: /builds/$CI_PROJECT_PATH/shared
+  script:
+    - |
+      set -x
+
+      env
+      eval "IMAGE=\$$IMAGE"
+
+      docker manifest create ${IMAGE}:${CI_PIPELINE_ID} \
+        ${IMAGE}:${CI_PIPELINE_ID}-amd64 \
+        ${IMAGE}:${CI_PIPELINE_ID}-arm64
+
+      docker manifest push ${IMAGE}:${CI_PIPELINE_ID}
+    - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env
+    - echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env
+    - cat build.env
+  retry:
+    max: 2
+  artifacts:
+    reports:
+      dotenv: build.env
diff --git a/.gitlab/stages/03.integration-tests.yml b/.gitlab/stages/03.integration-tests.yml
index 824721b9fb1..d28ecd8e137 100644
--- a/.gitlab/stages/03.integration-tests.yml
+++ b/.gitlab/stages/03.integration-tests.yml
@@ -43,6 +43,7 @@ integration:configure:
     - |
       A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
       H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
+      GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER)
     - |
       ARGS=(
         "--scope $INTEGRATION_TEST_SCOPE"
@@ -88,12 +89,30 @@ integration:configure:
         --platform dgx_h100 \
         --cluster $H100_CLUSTER \
         --output-path "functional-test-job-lts-H100.yaml"
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+        ${ARGS[@]} \
+        --environment lts \
+        --platform dgx_gb2100 \
+        --cluster $GB200_CLUSTER \
+        --output-path "functional-test-job-lts-GB200.yaml"
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+        ${ARGS[@]} \
+        --environment lts \
+        --platform dgx_gb200 \
+        --cluster $GB200_CLUSTER \
+        --output-path "functional-test-job-lts-GB200.yaml"
   artifacts:
     paths:
       - functional-test-job-lts-A100.yaml
       - functional-test-job-lts-H100.yaml
       - functional-test-job-dev-H100.yaml
       - functional-test-job-dev-A100.yaml
+      - functional-test-job-lts-GB200.yaml
+      - functional-test-job-dev-GB200.yaml
       - tests/test_utils/local_recipes
 
 .integration_run:
@@ -132,6 +151,12 @@ integration:run_lts_dgx_h100:
     ENVIRONMENT: lts
     CLUSTER: H100
 
+integration:run_lts_dgx_gb200:
+  extends: [.integration_run]
+  variables:
+    ENVIRONMENT: lts
+    CLUSTER: GB200
+
 integration:run_dev_dgx_a100:
   extends: [.integration_run]
   variables:
@@ -143,3 +168,9 @@ integration:run_dev_dgx_h100:
   variables:
     ENVIRONMENT: dev
     CLUSTER: H100
+
+integration:run_dev_dgx_gb200:
+  extends: [.integration_run]
+  variables:
+    ENVIRONMENT: dev
+    CLUSTER: GB200
diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml
index e9aab319ab1..77298f200c5 100644
--- a/.gitlab/stages/04.functional-tests.yml
+++ b/.gitlab/stages/04.functional-tests.yml
@@ -50,6 +50,7 @@ functional:configure:
     - |
       A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
       H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
+      GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER)
     - |
       RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false")
     - |
@@ -113,12 +114,32 @@ functional:configure:
         --cluster $H100_CLUSTER \
         --output-path "functional-test-job-lts-H100.yaml" \
         ${RELEASE_ARGS[@]}
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+        ${ARGS[@]} \
+        --environment dev \
+        --platform dgx_gb200 \
+        --cluster $GB200_CLUSTER \
+        --output-path "functional-test-job-dev-GB200.yaml" \
+        ${RELEASE_ARGS[@]}
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+        ${ARGS[@]} \
+        --environment lts \
+        --platform dgx_gb200 \
+        --cluster $GB200_CLUSTER \
+        --output-path "functional-test-job-lts-GB200.yaml" \
+        ${RELEASE_ARGS[@]}
   artifacts:
     paths:
       - functional-test-job-lts-A100.yaml
       - functional-test-job-lts-H100.yaml
       - functional-test-job-dev-A100.yaml
       - functional-test-job-dev-H100.yaml
+      - functional-test-job-lts-GB200.yaml
+      - functional-test-job-dev-GB200.yaml
       - tests/test_utils/local_recipes
 
 .functional_run:
@@ -157,6 +178,12 @@ functional:run_lts_dgx_h100:
     ENVIRONMENT: lts
     CLUSTER: H100
 
+functional:run_lts_dgx_gb200:
+  extends: [.functional_run]
+  variables:
+    ENVIRONMENT: lts
+    CLUSTER: GB200
+
 functional:run_dev_dgx_a100:
   extends: [.functional_run]
   variables:
@@ -169,18 +196,24 @@ functional:run_dev_dgx_h100:
     ENVIRONMENT: dev
     CLUSTER: H100
 
+functional:run_dev_dgx_gb200:
+  extends: [.functional_run]
+  variables:
+    ENVIRONMENT: dev
+    CLUSTER: GB200
+
 functional:run_nemo:
   extends: [.functional_tests_rules]
   trigger:
-    project: 'dl/joc/nemo-ci'
+    project: "dl/joc/nemo-ci"
     branch: main-mirror
     strategy: depend
   inherit:
     variables: true
   variables:
     MCORE_COMMIT: $CI_COMMIT_SHA
-    TEST_NEMO2_MODULE: 'True'
-    ALLOW_FAILURE_DEPENDENCY: 'True'
+    TEST_NEMO2_MODULE: "True"
+    ALLOW_FAILURE_DEPENDENCY: "True"
     TESTS_TO_RUN_ON_THIS_COMMIT: nightly
   rules:
     - if: $FUNCTIONAL_TEST == "yes"
@@ -196,6 +229,8 @@ functional:x_notify:
     - functional:run_dev_dgx_a100
     - functional:run_lts_dgx_h100
     - functional:run_dev_dgx_h100
+    - functional:run_lts_dgx_gb200
+    - functional:run_dev_dgx_gb200
   tags:
     - arch/amd64
     - env/prod
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index babdc18b8a4..00000000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,368 +0,0 @@
-# Changelog
-
-## NVIDIA Megatron Core 0.15.0
-
-* Features  
-  * Performance  
-    * Fused QKV preprocessing with precomputed RoPE caches (3x preprocessing speedup, 10-14% E2E) ([MR \!3912](https://github.com/NVIDIA/Megatron-LM/commit/f0d9fa97fead9825ae3eada36ee2df568bfa415b))  
-    * Use new TE interface for user buffers ([MR \!3886](https://github.com/NVIDIA/Megatron-LM/commit/d47b83807142b6490c7a000e63d25a479b106fd9))  
-    * Add CPU activation offloading via TE ([MR \!4286](https://github.com/NVIDIA/Megatron-LM/commit/310671436c36e6bd198e92c4f30bc84469cc31d8))  
-    * Add configurable double buffering ([MR \!4026](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4026))  
-    * Add Muon optimizer and distributed optimizer support ([MR \!4106](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4106))  
-    * Add setting to support Adam or AdamW optimizer ([MR \!3866](https://github.com/NVIDIA/Megatron-LM/commit/03fd0b41b3840c6f19558161d98373a9242402e5))  
-  * MoE  
-    * Add DTensor support for EP and DSv3 modules ([MR \!3955](https://github.com/NVIDIA/Megatron-LM/commit/268fda08592528b7bc1a21aadaed259980ca8efb))  
-    * Add HybridEP backend to Flex Dispatcher ([MR \!4237](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4237))  
-    * Support FP8 recomputation for MoE components ([MR \!4030](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4030))  
-    * Implement NVFP4 Zero Padding for MoE ([MR \!4225](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4225))  
-    * Compute shared experts before router ([MR \!4068](https://github.com/NVIDIA/Megatron-LM/commit/e8024d716f3036ebcef8c5254c7830ad09aaf41b))  
-    * Enable bias in expert MLP ([MR \!3858](https://github.com/NVIDIA/Megatron-LM/commit/a329dd6da586261a45a8f7d04c1e659ffedd80ae))  
-  * Model support  
-    * Add YaRN support for GPT-OSS ([MR \!4044](https://github.com/NVIDIA/Megatron-LM/commit/2c1b77a9984bfa978e7cf1f58522e5f8e045d017))  
-    * Add support for Qwen3-Next arguments ([MR \!4070](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4070))  
-    * Add FP8 init for MTP ([MR \!3958](https://github.com/NVIDIA/Megatron-LM/commit/d6c6e54ec5eb43d4e196c7ae84e0e88f28613e6b))  
-    * Add fp8\_dpa option for FP8 scaling ([MR \!4053](https://github.com/NVIDIA/Megatron-LM/commit/61047e60e617e71ebe120ec293b62df6b0efc84f))  
-    * Add RADIO-g support to converter and tester ([MR \!4371](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4371))  
-    * Add audio semantic reasoning data for voice chat and speech instructions ([MR \!4397](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4397))  
-  * FSDP  
-    * Enable joint training of parallel modules ([MR \!3850](https://github.com/NVIDIA/Megatron-LM/commit/53008b844f98886a2144c216ecd25952cb2dda58))  
-    * Add support for multimodule communication ([MR \!4235](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4235))  
-  * Inference  
-    * Add CUDA Graph runner lookup table cache (up to 2x E2E speedup) ([MR \!4082](https://github.com/NVIDIA/Megatron-LM/commit/ab43252fdbedcc3662014ae0e110bd3278d844f4))  
-    * Add MoE dropping and padding router for CUDA Graph \+ decode ([MR \!3816](https://github.com/NVIDIA/Megatron-LM/commit/56818f9e5090ff9eb0f13f10bfe408aae4031c5c))  
-    * Dynamic audio shapes with variable sequence lengths (2.5x throughput improvement) ([MR \!4274](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4274))  
-    * Integrate unified memory for dynamic inference context ([MR \!3985](https://github.com/NVIDIA/Megatron-LM/commit/ef4ae4528a0924159069b9f3a2719616156bafa2))  
-  * Post-training  
-    * Add GPT-OSS ModelOpt support with quantization, import/export ([MR \!4169](https://github.com/NVIDIA/Megatron-LM/commit/a2d8c806b35bc708b13e6c069e19e5dfb49b8481))  
-    * Enable KD support with hybrid training loop ([MR \!4021](https://github.com/NVIDIA/Megatron-LM/commit/48d7275062a8307f82bd0fa6c1504032c7f3af96))  
-    * Add ModelOpt pruning example ([MR \!4022](https://github.com/NVIDIA/Megatron-LM/commit/5a58976ebe007064c2ff5e76e815aa5fcf1a8787))  
-  * RL  
-    * Add importance sampling and partial rollouts to Megatron RL ([MR \!4000](https://github.com/NVIDIA/Megatron-LM/commit/8399280ed3b72a183f44820896a67392c0a47e3e))  
-    * Add sequence packing for RL ([MR \!4191](https://github.com/NVIDIA/Megatron-LM/commit/ee8e9307f3ad655e6a46f98a483d8192995b02c2))  
-  * Ease of use  
-    * Handle CUDA absence during import ([MR \!4120](https://github.com/NVIDIA/Megatron-LM/commit/ae44e49271dc45b51a7400ecf6debc598ba90b54))  
-    * Add granary dataloader functionality ([MR \!4291](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4291))  
-    * Enable SWA mixing with attention ([MR \!3855](https://github.com/NVIDIA/Megatron-LM/commit/e5bc9249d7ad34355f5db4c8ff7d7a9080f94dc2))  
-* Bug fixes  
-  * Fix convergence bug in MXFP8 parameter gradient buffer reuse ([MR \!3999](https://github.com/NVIDIA/Megatron-LM/commit/c2c36f77cf7a0476daee5bb2dec604c2764de320))  
-  * Fix loss mask cloning to prevent incorrect updates ([MR \!4164](https://github.com/NVIDIA/Megatron-LM/commit/c94d58f3260aa568588265e07b3c06bb58cbde41))  
-  * Fix metadata loss in checkpoints ([MR \!4182](https://github.com/NVIDIA/Megatron-LM/commit/d8c6aa4c0b5d4c15ec1196802bce292d4580ed4a))  
-  * Fix FSDP grad accum fusion support ([MR \!4018](https://github.com/NVIDIA/Megatron-LM/commit/9f72f4775509668173c75eaab5d58a49f4473748))  
-  * Fix non-TE optimizer checkpoint issue ([MR \!3931](https://github.com/NVIDIA/Megatron-LM/commit/2ebb6ee95af8b547e3c0ac394d494cb189b890bc))  
-  * Fix BERT virtual pipeline parallelism ([MR \!3993](https://github.com/NVIDIA/Megatron-LM/commit/18420b63408101fe5a49d125fb29625f1ad6ab26))  
-  * Fix gc.freeze() slowdown by adding gc.collect() on last layer ([MR \!4003](https://github.com/NVIDIA/Megatron-LM/commit/a3f9e566c9595753553a73d403b2a481ad283fc0))  
-  * Fix full iteration CUDA graph non-tensor handling ([MR \!4019](https://github.com/NVIDIA/Megatron-LM/commit/8479eb35fbca9631acb846c3ad5d868e02214227))  
-  * Fix model\_auto\_sync mis-set and add gradient assertion ([MR \!4062](https://github.com/NVIDIA/Megatron-LM/commit/03045f2d880813695f75707e3262a2bfb4206dfe))  
-  * Fix HF import dtype and checkpoint loading issues ([MR \!4095](https://github.com/NVIDIA/Megatron-LM/commit/435e7e0620ff870d99debd73b3c9113226622dde))  
-  * Fix missing initialization in ProcessGroupCollection ([MR \!4159](https://github.com/NVIDIA/Megatron-LM/commit/5f2becf232a85df8687dc539e604e00a6a875da1))  
-  * Fix sink attention TP ([MR \!4173](https://github.com/NVIDIA/Megatron-LM/commit/3b1b9b267193d72d4f8dc710561c2368de8c114c))  
-  * Fix num\_microbatches calculation ([MR \!4199](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4199))  
-  * Fix 1f1b overlap unit tests for MTP standalone ([MR \!4210](https://github.com/NVIDIA/Megatron-LM/commit/44bc753d69cf509c158bb261434498b141fe5130))  
-  * Fix stale state dict handling ([MR \!4226](https://github.com/NVIDIA/Megatron-LM/commit/0ba847081113a92ce01084f33cd4a0c1f31b327b))  
-  * Fix dataset divergence with tokenizer PAD handling ([MR \!4231](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4231))  
-  * Fix parameter initialization ([MR \!4296](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4296))  
-  * Ensure tensor-parallel attributes set regardless of initialization flag ([MR \!4312](https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/4312))  
-* Known issues
-
-## NVIDIA Megatron Core 0.14.0
-
-* Features  
-  * Inference  
-    * Add async support for DynamicInferenceEngine ([MR \!3187](https://github.com/NVIDIA/Megatron-LM/commit/05079d55a5bfcc7a43f4619e36a40a9e8db3f882))  
-    * Pad input tensors and enable FP8 weights for FP8 inference ([MR \!3341](https://github.com/NVIDIA/Megatron-LM/commit/6a6cd478839d90cf09a837adf8c79cbc844bc920))  
-    * Force inference to always gather logits with tensor parallelism ([MR \!3442](https://github.com/NVIDIA/Megatron-LM/commit/7c9cdcb794089968278c7272e0261a68edf5d369))  
-    * Multi batch size CUDA Graphs for Dynamic Inference ([MR \!3402](https://github.com/NVIDIA/Megatron-LM/commit/30aabe5e3133c6d70aa55aaabad4ea8cb04ce63c))  
-  * Post-training  
-    * ModelOpt updates ([MR \!3268](https://github.com/NVIDIA/Megatron-LM/commit/550ed5243c3a18e39430c15e8918ee63e41d7eaf))  
-      * Add speculative decoding AR validation feature  
-      * Add DeepSeek and Qwen model configs  
-  * Performance  
-    * ModelCommProcessGroup integration ([MR \!3391](https://github.com/NVIDIA/Megatron-LM/commit/26adc2dfde53fbc2b063e2fdd1d9ed26578811a6))  
-    * Add HyperCommGrid: N-Dimensional Communication Grid for Model Parallelism ([MR \!3398](https://github.com/NVIDIA/Megatron-LM/commit/45400df7da7fa23e3aff86804e5ac254d9a8d3c0))  
-      * Flexible creation and management of communication groups  
-    * Add support for Spike No More embedding initializations and weight decay skipping ([MR \!3500](https://github.com/NVIDIA/Megatron-LM/commit/ee74aa66a06b24e511270f285db475941ef63bfd))  
-  * MoE  
-    * We're actively optimizing large-scale fine-grained MoE performance on Blackwell Platform.  
-    * Features:  
-      * Support Expert Parallel A2A Overlapping ([MR \!3470](https://github.com/NVIDIA/Megatron-LM/commit/0c6c1176fb3e3e00534b3591f1ad023d4ecad6fb); [MR \!3074](https://github.com/NVIDIA/Megatron-LM/commit/4b30ec54aba97e16a083eca33d2df1dd48e1b48f))  
-      * Support CP and recompute for MTP ([MR \!3330](https://github.com/NVIDIA/Megatron-LM/commit/650ab87d04105869f197f2ddc441e3b18ca93724))  
-      * Add support for global aux loss ([MR \!3318](https://github.com/NVIDIA/Megatron-LM/commit/e58d9080ea212e005ccba0b6607bfcc86451285d))  
-    * Memory Optimization  
-      * Support recomputation for FP8 layernorm/moe\_act/shared\_experts ([MR \!3465](https://github.com/NVIDIA/Megatron-LM/commit/6850cc6a739d168f8c84db6cdacf4fe2931c0c49))  
-      * Support optimizer offloading for DSV3 FP8 training  ([MR \!3659](https://github.com/NVIDIA/Megatron-LM/commit/abbde02f54b62a5194ebe951218e98feceba6d42))  
-    * Performance Optimization  
-      * Add MoE router fusion ([MR \!3809](https://github.com/NVIDIA/Megatron-LM/commit/d93743a9f11d5d17824b8b49868cc90f2904896f))  
-      * Updates for MoE cudagraph ([MR \!3631](https://github.com/NVIDIA/Megatron-LM/commit/95452706d7aa16dc174813e12639a8c8356fbe87))  
-    * Bug fixes:  
-      * Fix router input jitter dtype ([MR \!3774](https://github.com/NVIDIA/Megatron-LM/commit/20b395424d2e2bbfaab57b2f954294eb57c90c82))
-  * Model support  
-    * Add MiMo video VLM train example ([MR \!3543](https://github.com/NVIDIA/Megatron-LM/commit/786f5629d3462aff2f8855f51db70e882c475116))  
-    * Add AVLM for MIMO ([MR \!3624](https://github.com/NVIDIA/Megatron-LM/commit/db41707430bff743f986b5779712c74242b99caa))  
-  * Ease of use  
-    * Add uv support for source installs ([MR \!3615](https://github.com/NVIDIA/Megatron-LM/commit/164204cd7216e642bdef7299c569d95f02f9a79e))  
-    * Automated weekly prereleases ([MR \!3574](https://github.com/NVIDIA/Megatron-LM/commit/7e59266c70ef34a246438640af690b55c7ecac28))  
-* Bug fixes  
-  * Use mscale\_all\_dim for softmax\_factor ([MR \!2800](https://github.com/NVIDIA/Megatron-LM/commit/e96a358f60c82b8ac8d965d91c3cc4ad0230a4e0))  
-  * Fix FP8 param blockwise scaling unit test ([MR \!3480](https://github.com/NVIDIA/Megatron-LM/commit/57082f946a04c3390fcfc43634dc546ec3ded033))  
-  * Fix unit test blockwise scaling ([MR \!3491](https://github.com/NVIDIA/Megatron-LM/commit/6d95fe63658f967e56a3fda88a9c30a424fcb520))  
-  * Optimize prefill for token-less requests ([MR \!3499](https://github.com/NVIDIA/Megatron-LM/commit/daaa650a9ac4291d4027ca2fdeb4298ce024efd2))  
-  * Add default values for Fp8Padding and Fp8Unpadding ([MR \!3501](https://github.com/NVIDIA/Megatron-LM/commit/42b2b1d10a9cb699b7e5aa40f6bfba9c2a1348aa))  
-  * Fix CUDA graph logic for flexible pp layout ([MR \!3505](https://github.com/NVIDIA/Megatron-LM/commit/020d85e50ddf0f0282802002acb3662129a519c5))  
-  * Load FP8 models with strict=False ([MR \!3508](https://github.com/NVIDIA/Megatron-LM/commit/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2))  
-  * Skip rope check for torch \< 1.4.0 ([MR \!3528](https://github.com/NVIDIA/Megatron-LM/commit/d8180ef8ed0bb6f305dcdedf1b27d91304f361a3))  
-  * Disable Apex tests for stability ([MR \!3539](https://github.com/NVIDIA/Megatron-LM/commit/d1256277fe378add0a2cfd7251f5a350b6d126ec))  
-  * Fix typo in parallel\_state expert parallelism ([MR \!3548](https://github.com/NVIDIA/Megatron-LM/commit/5783ff32af759b8102cf0cb0bb82b30c48b9da26))  
-  * Guard modelopt on macOS ([MR \!3549](https://github.com/NVIDIA/Megatron-LM/commit/76144fe1106e4fb0e69aa75b7a6ab66e71e8f37f))  
-  * Retry on CUDA function failure ([MR \!3554](https://github.com/NVIDIA/Megatron-LM/commit/809aab68307a64c1386d68cc78ef70f8f4e12a80))  
-  * Fix NCCL mem pool creation error ([MR \!3557](https://github.com/NVIDIA/Megatron-LM/commit/b61e21153146a563309b5d44cb5d7f7425806072))  
-  * Fix get\_rotary\_seq\_len return type ([MR \!3559](https://github.com/NVIDIA/Megatron-LM/commit/1fa6bc83c7aeae95abc8e86ff0aac596985a01c3))  
-  * Retry on CUDA function failure ([MR \!3560](https://github.com/NVIDIA/Megatron-LM/commit/7da88d74865c3f1a59894173246f26e7b3bf91b9))  
-  * Fix NCCL allocator attribute error ([MR \!3565](https://github.com/NVIDIA/Megatron-LM/commit/6b656114795d74c3353cb007c59af49b1752f447))  
-  * Ensure multi-prompt inference works ([MR \!3568](https://github.com/NVIDIA/Megatron-LM/commit/0fae48931000c9c7af06f7dcf037b5b7d96e0cd6))  
-  * Fix MD5 on FIPS systems ([MR \!3577](https://github.com/NVIDIA/Megatron-LM/commit/83ee8c2848a3b1d42b40086a64da11e19f4b191f))  
-  * Fixes dynamic context and inference bugs ([MR \!3582](https://github.com/NVIDIA/Megatron-LM/commit/e9c1da60a1ccc85376666d58568ed1d3e5a4f9db))  
-  * Fix TE version for interleaved fused RoPE ([MR \!3586](https://github.com/NVIDIA/Megatron-LM/commit/b72b6cc161f5273b545bca09677382917cf20492))  
-  * Fix MTP with MoE and TP logging ([MR \!3594](https://github.com/NVIDIA/Megatron-LM/commit/9af96623b66693e058f6bfce8d0094dc976792d8))  
-  * Guard TE import fix ([MR \!3596](https://github.com/NVIDIA/Megatron-LM/commit/1bf946b1ec3f11e71459c7c0d06a97edbed96a1a))  
-  * Add assertion for NCCL UB case ([MR \!3599](https://github.com/NVIDIA/Megatron-LM/commit/e11d28592f19c122859be764b7afe7c208d9acc1))  
-  * Remove Encoder PP related Functions ([MR \!3604](https://github.com/NVIDIA/Megatron-LM/commit/9e49aa4446a58cc21c4dc0c5d0806551ad075ca7))  
-  * Fix segfaults in tests ([MR \!3605](https://github.com/NVIDIA/Megatron-LM/commit/f6492fe8164fd5b9ad55007d435ccfc66cb98cc7))  
-  * Fix TE error in distributed optimizer ([MR \!3625](https://github.com/NVIDIA/Megatron-LM/commit/e6c510ff3c1159f8955589b26f7c395bdf0607d9))  
-  * Remove redundant barrier in checkpoint flow ([MR \!3626](https://github.com/NVIDIA/Megatron-LM/commit/26869feb6a3ac7f5616cb7253c37a4244d107d70))  
-  * Support VPP MTP, fix logging ([MR \!3630](https://github.com/NVIDIA/Megatron-LM/commit/c351a473c7eedac2c43eab0815afb9759f4f8187))  
-  * Retry mechanism for free(): invalid pointer errors ([MR \!3632](https://github.com/NVIDIA/Megatron-LM/commit/ec35b41b2df145a7ccb84afc48d94e0786e094da))  
-  * Fix test\_replication.py issues ([MR \!3633](https://github.com/NVIDIA/Megatron-LM/commit/f7b50b271b2e0e396069e02551b21aa6fb374b43))  
-  * Fix typo in parallel\_state ([MR \!3634](https://github.com/NVIDIA/Megatron-LM/commit/3c79a2c330290df58804c33e28e7c197fcc1f0b9))  
-  * Fix CUDA graph logic determination ([MR \!3635](https://github.com/NVIDIA/Megatron-LM/commit/90efa3ef8a3c4f9e0f1db9f67ab9348bfa501387))  
-  * Fix TE installation error ([MR \!3636](https://github.com/NVIDIA/Megatron-LM/commit/7e7322c01c9cb8ec254ecd9042700b22b70fe5c8))  
-  * Ensure correct sharding type in local tests ([MR \!3643](https://github.com/NVIDIA/Megatron-LM/commit/946357f8dd7fdc12424b3a66bc999e6c0a02696c))  
-  * Fix cudagraphed backward buffer reuse for last layer ([MR \!3645](https://github.com/NVIDIA/Megatron-LM/commit/ee61cf450d24760952e8995aab045ab6d55b986e))  
-  * Set default for packed\_seq\_params in get\_rotary\_seq\_len ([MR \!3651](https://github.com/NVIDIA/Megatron-LM/commit/510d58c46664f44c556005ac928c5c531e12f761))  
-  * Fix dynamic example script errors ([MR \!3653](https://github.com/NVIDIA/Megatron-LM/commit/72e290bf1f4bbf0c8047bb10a51da6ea6372e163))  
-  * Guard TE import fix ([MR \!3666](https://github.com/NVIDIA/Megatron-LM/commit/ac198fc0d60a8c748597e01ca4c6887d3a7bcf3d))  
-* Breaking changes:  
-  * `megatron.core.distributed.custom_fsdp` refactored as breaking change to `megatron.core.distributed.fsdp.src.megatron_fsdp`  
-* Known issues
-
-## NVIDIA Megatron Core 0.13.0
-
-* Support bf16 dtype for optimizer states to use precision-aware optimizer in TransformerEngine  
-* MoE
-  * Features:  
-    * Flexible Asymmetric Virtual Pipeline Parallelism with Custom Pipeline Layout (--pipeline-model-parallel-layout)  
-    * Add support to pass custom parallelism groups to MoE modules.  
-    * Add Hybrid Shard Data-Parallel support for MoE models (--num-distributed-optimizer-instances)  
-    * Support EP \+ custom FSDP training for DeepSeek-V3  
-    * FP8 support for Multi-Token-Prediction  
-  * Memory Optimization  
-    * Fine-grained recomputation to reduce activation memory. (--recompute-modules with \--recompute-granularity selective)  
-    * Memory efficient token permutation by moving the probs multiplication from unpermutation to activation function of GroupedMLP.  
-  * Performance Optimization  
-    * MLA RoPE fusion kernel and YARN embedding cache.  
-    * FP8 padding optimization of MoE models by padding the routing map.  
-  * Bug fixes:  
-    * Fix the aux loss calculation when expert\_bias or group limited routing is used. This leads to load\_balancing\_loss values change compared to the previous version.  
-    * Fix packed sequence support for MLA  
-  * Known Issues:  
-    * MTP is not compatible with flexible pipeline layout, will be fixed at \!3594.  
-    * MTP convergence issue with TP2, will be fixed at \!3594.
-
-## NVIDIA Megatron Core 0.12.0
-
-* Add FP8 recipe selection to arguments (--fp8-recipe, --first-last-layers-bf16, --num-layers-at-start-in-bf16, --num-layers-at-end-in-bf16)
-* Context parallel: fix loss scaling when calculate_per_token_loss=True
-* Make the number of data parallel communication buckets configurable (--ddp-num-buckets, --ddp-pad-buckets-for-high-nccl-busbw)
-* Inference
-  * Support in-flight batching and chunked KV cache
-  * Reduce memory usage,
-    * by not materializing full attention mask
-    * by only materializing logits for the last token during decode
-    * by removing an obsolete tensor reference
-* Hybrid Model
-  * Inference
-    * Add CUDA graph support
-    * Change tools/run_mamba_text_generation_server.py to use megatron.core.inference
-    * Fix a shape issue when materializing logits for Mamba model
-  * Improve initialization of Mamba layers
-  * Add configuration switches (--mamba-state-dim, --mamba-head-dim, --mamba-num-groups, --is-hybrid-model)
-  * Make num_floating_point_operations work with hybrid model
-  * Make hybrid_conversion.py work with mixer that uses TE linear
-  * Add FP8 support
-  * Fix Mamba dt_bias tensor parallelism
-  * Support multimodal tokenizer
-  * Improve data parallelism scaling
-* MoE
-  * Features:
-    * DeepEP support, compatible with all the parallelisms and token drop / dropless
-    * Important precision improvement: Enable FP32/FP64 routing and unpermutation using –moe-router-dtype. FP32 is recommended for all fine-grained MoE training
-    * CUDA Graph support for MoE
-    * Multi-Token Prediction (MTP) Support
-    * Fused indices_to_multihot kernel for DeepEP dispatcher
-  * Bug fixes:
-    * Fix Hang Issue with MoE+Dense Hybrid models
-    * Update theoretical memory and tflops estimation for MoE and MLA
-    * Fix MoE Aux loss scaling for per token loss
-    * Fixes for group limited routing and expert bias. We verified these fixes through dsv3 e2e verifications
-  * Known issues:
-    * The ckpt trained with Custom FSDP for MoE may not be compatible with 3D parallel training.
-
-## NVIDIA Megatron Core 0.11.0
-
-* Add multi datacenter training support though N/S connection
-* MoE
-  * Features
-    * Support DeepSeek-V3 fine-tuning
-      * Aux-loss-free load balancing strategy
-      * Node-limited routing and Device-limited routing support.
-      * Tensor Parallelism support for MLA and Sequence Auxiliary Loss
-      * MTP (with TP and PP support) is coming soon.
-    * Permutation / Unpermutation fusion kernel from TransformerEngine.
-    * Uneven virtual pipeline parallel split support in first and last PP stage.
-  * Bug fixes:
-    * Fix the grad scale when TP != expert-TP and average_in_collective is enabled in DDP.
-    * Fix TEGroupedMLP distckpt compatibility issue with FP8 padding/unpadding.
-  * Known Issues:
-    * When training the Dense+MoE hybrid model, the process will hang if any PP rank does not have expert params.
-* Add MX-FP16 support for optimizer and master weights
-* CUDA Graph memory optimizations
-* Enable UCC backend for PP communication
-* Optimizer CPU offload support for memory savings
-* Models
-  * Initial RADIO/CRADIO implementation
-  * llama3.2 support
-* Hybrid Model
-  * Support quantization via TensorRT Model Optimizer
-
-## NVIDIA Megatron Core 0.10.0
-
-* Adding MLA to MCore
-* Enable FP8 for GroupedMLP
-* MoE Parallel Folding
-* Enhance MoE Architecture: Support MoE Layer Frequency Patterns and Configurable MoE FFN Hidden Size
-* Multimodal: NVLM training and evaluation support in MCore
-* Mamba Hybrid
-  * Increase performance and reduce memory footprint of Triton language/compiler distributed caching
-  * Add more unit testing and fix bugs
-
-## NVIDIA Megatron Core 0.9.0
-
-* Uneven pipeline parallelism
-  * Enable pipeline parallelism where first and last ranks have fewer transformer layers than the intermediate ranks
-* Per layer CUDAGraph support for GPT training with Transformer Engine modules
-* Enable different TP sizes for the vision encoder
-* Enable pipeline parallelism for T5 & Llava models
-* Support multi-tile multi-image input in Llava models
-* MoE
-  * FP8 support
-  * Runtime upcycling support
-  * Dispatcher implementation optimizations
-  * Shared expert support with overlapping optimizations
-    * Qwen Model support
-* Known Issues
-  * When using sequence parallel, during the transformer block forward pass, dropout is not using the appropriate rng context.
-* NVRx / Fault tolerance
-  * fault and hang detection in addition to existing straggler detection
-  * graceful exit and auto restart
-
-## NVIDIA Megatron Core 0.8.0
-
-* Multimodal
-  * Added initial support for training vision language models using the LLaVA architecture
-  * Added initial support for inference with multimodal inputs
-  * End-to-end multimodal example from data collection to training to evaluation is provided in examples/multimodal
-* MoE
-  * Context Parallel support.
-  * Distributed checkpoint support for grouped GEMM.
-* Mamba
-
-## NVIDIA Megatron Core 0.7.0
-
-* MoE
-  * Token drop support
-  * Several efficiency optimizations
-  * Improved model parallelism
-  * Memory optimizations
-* Distributed checkpointing
-  * Enabled for Retro
-  * Asynchronous checkpoint saving
-* Several minor bug fixes, speed improvements, and memory optimizations
-
-## NVIDIA Megatron Core 0.6.0
-
-* MoE (Mixture of Experts)
-  * Performance optimization
-    * Communication optimization for multi GPU and Single GPU
-    * 23% improvement (323 TFLOPS/GPU) over MCore 0.5.0 on Mixtral with Hopper BF16
-    * GroupedMLP enhancement for Hopper
-    * DP Overlapping. Support overlapping computation with gradient reduction and parameter gathering.
-  * All-to-All based Token Dispatcher
-  * Layer-wise logging for load balancing loss.
-  * Improved expert parallel support including distributed optimizer.
-* Distributed optimizer
-* RETRO
-  * Data processing
-* BERT
-  * Distributed checkpointing
-* Dist checkpointing
-  * PyTorch native distributed backend
-  * Improved saving/loading speed
-* TensorRT-LLM Export
-  * Integration with TensorRT Model Optimizer Post-training quantization (PTQ)
-  * Text generation driver to perform PTQ in Megatron-LM
-  * Llama2 and Nemotron3-8b examples to use TensorRT-LLM unified build API to build engine after training.
-* Several minor enhancements, bug fixes, and documentation updates
-
-## NVIDIA Megatron Core 0.5.0
-
-### Key Features and Enhancements
-
-Megatron core documentation is now [live!](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/index.html#quick-start)
-
-### Model Features
-
-* MoE (Mixture of Experts)
-  * Support for Z-loss, Load balancing and Sinkhorn
-  * Layer and communications refactor
-  * Richer parallelism mappings and EP can be combined with other model parallel techniques for larger MoE variants, e.g. EP + TP + DP + SP + PP
-  * Token dropless architecture with Top-K routing
-  * Performance optimization with with GroupedGEMM when number of local experts is > 1
-  * Distributed checkpointing
-* Interleaved rotary embedding
-
-### Datasets
-
-* Masked WordPiece datasets for BERT and T5
-* Raw and mock datasets
-
-### Parallelism
-
-### Performance
-
-* Activation offloading to CPU
-* Rope and Swiglu fusion
-* Sliding window attention (via Transformer Engine)
-
-### General Improvements
-
-* Timers
-
-## NVIDIA Megatron Core 0.4.0
-
-### Key Features and Enhancements
-
-#### Models
-
-* BERT
-* RETRO
-* T5
-
-#### Parallelism
-
-* Mixture of Experts support for GPT
-* Model parallel efficient Distributed Data Parallel (DDP)
-* Context Parallel (2D Tensor Parallel) support
-
-#### Datasets
-
-* GPT Dataset
-* Blended Dataset
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 615227600cc..6b128dce590 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,66 +1,3 @@
-# Contributing to Megatron-LM
+# Contributing to Megatron
 
-This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM github repository.
-
-Everyone is welcome to contribute to the project but development of Megatron-LM continues internally at NVIDIA. When contributing it important to ensure that changes are in line with the project direction. Small changes to fix bugs are welcomed and appreciated. If proposing large architectural changes or changes for stylistic reasons open an issue first so we can discuss it.
-
-PRs will first be pulled into NVIDIA's internal Megatron-LM repo and then pushed back out to the open github repo with proper credit given to the committers.
-
-## Issue policy
-
-Please do file any bugs you find, keeping the following in mind:
-
-- If filing a bug, i.e. you have found something that doesn't work as expected, use the BUG template.
-- If you've found a regression in speed or accuracy use the REGRESSION template.
-- If you are requesting a new feature or modification of an existing feature use the ENHANCEMENT template.
-- If opening an issue to ask a question no template is needed but please make your question as clear and concise as possible.
-- One issue per bug. Putting multiple things in the same issue makes both discussion and completion unnecessarily complicated.
-- Your bug is mostly likely to get attention from the development team quickly if we can easily reproduce it.
-- Use proper spelling, grammar, and punctuation.
-- Write in an authoritative and technical tone.
-
-## Code submission policy
-
-Here are some dos & don'ts to try and stick to:
-
-### Do:
-
-- Format new code in a style that is consistent with the file being changed. Megatron-LM doesn't (yet) have a style guide or enforced formatting.
-- Split your changes into separate, atomic commits i.e. A commit per feature or fix.
-- Make sure your commits are rebased on the master branch.
-- Write the commit message subject line in the imperative mood ("Change the default argument for X", not "Changed the default argument for X").
-- Write your commit messages in proper English, with care and punctuation.
-- Check the spelling of your code, comments and commit messages.
-
-### Don't:
-
-- Submit code that's incompatible with the project licence.
-- Touch anything outside the stated scope of the PR. This includes formatting changes to code not relevant to the PR.
-- Iterate excessively on your design across multiple commits.
-- Include commented-out code.
-- Attempt large architectural changes without first opening an issue to discuss.
-
-## Issue and Pull Request Q&A (Updated Jul 2023)
-
-### I've submitted an issue and PR. When can I expect to get some feedback?
-
-Megatron-LM is developed and maintained by a small team of researchers. We will endeavour to read and acknowledge all new issues and PRs within a week. A few rules of thumb:
-- Reproducible bugs/regressions and bug/regression fixes are likely to get the attention of maintainers the quickest.
-- Issues requesting an enhancement may only recieve acknowlegement that they've been read and may be closed with a "wontfix" label if they're not inline with the project direction. If they are acknowledged and remain open you can assume the maintainers agree they're a desirable feature.
-- Support requests, i.e. requests for help running the code, have the lowest priority and will be responded to as maintainer time permits.
-
-### If my issue or PR isn't getting attention, how long should I wait before pinging one of the project maintainers?
-
-One week if there is no acknowledgement of the intial request.
-
-### Who are the project maintainers I should ping?
-
-The corresponding maintainers at this time are @jaredcasper and @jon-barker.
-
-### Is there a policy for issues and PRs that haven't been touched in X days? Should they be closed?
-
-Yes, starting in July 2023 we have a bot that will mark untouched PRs as "stale" after 60 days.
-
-We have a long backlog of issues and PRs dating back 3.5 years. We are trying to triage these now by working backwards. Older issues we believe may still be relevant may recieve a request to re-test them with the latest code. If there's no response they may be closed. Again, if you they should be re-opened then just respond with a comment to that effect.
-
-Thank-you!
\ No newline at end of file
+Visit our [contributing page](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html).
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 8ebf5004971..4570d9283c4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -270,3 +270,58 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
+--------------------------------------------------------------------------------
+LICENSE FOR Thinking Machines Lab 
+
+MIT License
+
+Copyright 2025 Thinking Machines Lab 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+LICENSE FOR
+Meta Platforms, Inc. and affiliates.
+
+BSD License
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Meta nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 5e52a90b625..58417a272db 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@
 </div>
 
 
-Megatron-LM & Megatron Core
-===========================
+Megatron-LM and Megatron Core
+=============================
 
 <h4>GPU-optimized library for training transformer models at scale</h4>
 
@@ -18,7 +18,7 @@ Megatron-LM & Megatron Core
 
 <div align="left">
 
-## ⚡ Quick Start
+## ⚡️Quick Start
 
 ```bash
 # Clone Megatron-LM-FL repository (includes megatron.core and megatron.plugin)
@@ -28,12 +28,42 @@ cd Megatron-LM-FL
 pip install --no-build-isolation .[mlm,dev]
 ```
 
-**→ [Complete Installation Guide](#installation)** - Docker, pip variants (dev,lts,etc.), source installation, and system requirements
+## About
+
+This repository contains two components: **Megatron-LM** and **Megatron Core**.
+
+**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation.
+
+**Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. Best for framework developers and ML engineers building custom training pipelines.
+
+**[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** provides bidirectional Hugging Face ↔ Megatron checkpoint conversion with production-ready recipes.
+
+
+## Quick Start
+
+Install Megatron Core with pip:
+
+1. Install Megatron Core with required dependencies:
+
+    ```bash
+    pip install --no-build-isolation megatron-core[mlm,dev]
+    ```
+
+2. Clone repository for examples:
+
+    ```bash
+    git clone https://github.com/NVIDIA/Megatron-LM.git
+    cd Megatron-LM
+    pip install --no-build-isolation .[mlm,dev]
+    ```
+
 
 # Latest News
 
-- 📣 NEW! **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features.
-- 🔄 **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+- **[2026/01]** **[Dynamic Context Parallelism](https://developer.nvidia.com/blog/speeding-up-variable-length-training-with-dynamic-context-parallelism-and-nvidia-megatron-core/)** - Up to 1.48x speedup for variable-length sequence training with adaptive CP sizing.
+- **[2025/12]** **Megatron Core development has moved to GitHub!** All development and CI now happens in the open. We welcome community contributions.
+- **[2025/10]** **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features.
+- **[2025/10]** **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
 - **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
 - **[2025/08]** **[GPT-OSS Model](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
 - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
@@ -44,63 +74,14 @@ pip install --no-build-isolation .[mlm,dev]
 
 - **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
 - **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
-- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#Megatron Core) for more details.
+- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs.
 
 </details>
 
-<details>
-<summary>Table of Contents</summary>
-
-**Getting Started**
-
-- [Quick Start](#-quick-start)
-- [Latest News](#latest-news)
-- [Megatron Overview](#megatron-overview)
-  - [Project Structure](#project-structure)
-  - [Megatron-LM: Reference Implementation](#megatron-lm-reference-implementation)
-  - [Megatron Core: Production Library](#megatron-core-production-library)
-- [Installation](#installation)
-  - [Docker (Recommended)](#-docker-recommended)
-  - [Pip Installation](#-pip-installation)
-  - [Source Installation](#-source-installation)
-  - [System Requirements](#system-requirements)
-
-**Core Features**
-
-- [Performance Benchmarking](#performance-benchmarking)
-  - [Weak Scaling Results](#weak-scaling-results)
-  - [Strong Scaling Results](#strong-scaling-results)
-- [Ecosystem Libraries](#ecosystem-libraries)
-
-**Training**
-
-- [Training](#training)
-  - [Getting Started](#getting-started)
-  - [Data Preparation](#data-preparation)
-- [Parallelism Strategies](#parallelism-strategies)
-  - [Data Parallelism (DP)](#data-parallelism-dp)
-  - [Tensor Parallelism (TP)](#tensor-parallelism-tp)
-  - [Pipeline Parallelism (PP)](#pipeline-parallelism-pp)
-  - [Context Parallelism (CP)](#context-parallelism-cp)
-  - [Expert Parallelism (EP)](#expert-parallelism-ep)
-  - [Parallelism Selection Guide](#parallelism-selection-guide)
-- [Performance Optimizations](#performance-optimizations)
-
-**Resources**
-
-- [Examples](./examples/) - Training scripts and tutorials
-- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
-- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
-- [Community & Support](#-community--support) - Get help and contribute
-  - [Getting Help](#getting-help)
-  - [Contributing](#contributing)
-  - [Citation](#citation)
 
-</details>
 
-# Megatron Overview
 
-## Project Structure
+# Project Structure
 
 ```
 Megatron-LM/
@@ -125,135 +106,11 @@ Megatron-LM/
 └── docs/                        # Documentation
 ```
 
-### Megatron-LM: Reference Implementation
-
-**Reference implementation** that includes Megatron Core plus everything needed to train models.
-
-**Best for:**
-
-- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
-- **Research teams** exploring new architectures and training techniques
-- **Learning distributed training** concepts and best practices
-- **Quick experimentation** with proven model configurations
-
-**What you get:**
-
-- Pre-configured training scripts for GPT, LLama, DeepSeek, Qwen, and more.
-- End-to-end examples from data prep to evaluation
-- Research-focused tools and utilities
-
-### Megatron Core: Composable Library
-
-**Composable library** with GPU-optimized building blocks for custom training frameworks.
-
-**Best for:**
-
-- **Framework developers** building on top of modular and optimized components
-- **Research teams** needing custom training loops, optimizers, or data pipelines
-- **ML engineers** requiring fault-tolerant training pipelines
-
-**What you get:**
-
-- Composable transformer building blocks (attention, MLP, etc.)
-- Advanced parallelism strategies (TP, PP, DP, EP, CP)
-- Pipeline schedules and distributed optimizers
-- Mixed precision support (FP16, BF16, FP8)
-- GPU-optimized kernels and memory management
-- High-performance dataloaders and dataset utilities
-- Model architectures (LLaMA, Qwen, GPT, Mixtral, Mamba, etc.)
-
-## Ecosystem Libraries
-
-**Libraries used by Megatron Core:**
-
-- **[Megatron Energon](https://github.com/NVIDIA/Megatron-Energon)** 📣 **NEW!** - Multi-modal data loader (text, images, video, audio) with distributed loading and dataset blending
-- **[Transformer Engine](https://github.com/NVIDIA/TransformerEngine)** - Optimized kernels and FP8 mixed precision support
-- **[Resiliency Extension (NVRx)](https://github.com/NVIDIA/nvidia-resiliency-ext)** - Fault tolerant training with failure detection and recovery
-
-**Libraries using Megatron Core:**
-
-- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
-- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
-- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
-- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Checkout end-to-end examples in [examples/post_training/modelopt](./examples/post_training/modelopt/).
-
-**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
-
-# Installation
-
-## 🐳 Docker (Recommended)
-
-We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
-
-**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
 
-This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
-
-- PyTorch (latest stable version)
-- CUDA, cuDNN, NCCL (latest stable versions)
-- Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
-- For best performance, use NVIDIA Turing GPU architecture generations and later
-
-```bash
-# Run container with mounted directories
-docker run --runtime --nvidia --gpus all -it --rm \
-  -v /path/to/megatron:/workspace/megatron \
-  -v /path/to/dataset:/workspace/dataset \
-  -v /path/to/checkpoints:/workspace/checkpoints \
-  -e PIP_CONSTRAINT= \
-  nvcr.io/nvidia/pytorch:25.04-py3
-```
-
-## Pip Installation
-
-Megatron Core offers support for two NGC PyTorch containers:
-
-- `dev`: Moving head that supports the most recent upstream dependencies
-- `lts`: Long-term support of NGC PyTorch 24.01
-
-Both containers can be combined with `mlm` which adds package dependencies for Megatron-LM on top of Megatron Core.
-
-```bash
-# Install the latest release dependencies
-pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
-pip install --no-build-isolation megatron-core[dev]
-# For running an M-LM application:
-pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
-pip install --no-build-isolation megatron-core[mlm,dev]
-```
-
-```bash
-# Install packages for LTS support NGC PyTorch 24.01
-pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
-pip install --no-build-isolation megatron-core[lts]
-# For running an M-LM application:
-pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
-pip install --no-build-isolation megatron-core[mlm,lts]
-```
-
-For a version of Megatron Core with only torch, run:
-
-```bash
-pip install megatron-core
-```
-
-## System Requirements
-
-### Hardware Requirements
-
-- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
-- **Recommended**: NVIDIA Turing architecture or later
-
-### Software Requirements
-
-- **CUDA/cuDNN/NCCL**: Latest stable versions
-- **PyTorch**: Latest stable version
-- **Transformer Engine**: Latest stable version
-- **Python**: 3.12 recommended
 
 # Performance Benchmarking
 
-For our latest performance benchmarking results, please refer to [NVIDIA NeMo Framework Performance Summary](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
+For our latest performance benchmarking results, please refer to [NVIDIA Megatron Bridge Performance Summary](https://docs.nvidia.com/nemo/megatron-bridge/latest/performance-summary.html).
 
 Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters.
 
@@ -286,199 +143,16 @@ We also strong scaled the standard GPT-3 model (our version has slightly more th
 
 ![Strong scaling](images/strong_scaling.png)
 
-# Training
 
-## Getting Started
 
-### Simple Training Example
 
-```bash
-# Distributed training example (2 GPUs, mock data)
-torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
-```
 
-### LLama-3 Training Example
-
-```bash
-# 8 GPUs, FP8 precision, mock data
-./examples/llama/train_llama3_8b_fp8.sh
-```
-
-## Data Preparation
-
-### JSONL Data Format
-
-```json
-{"text": "Your training text here..."}
-{"text": "Another training sample..."}
-```
-
-### Basic Preprocessing
-
-```bash
-python tools/preprocess_data.py \
-    --input data.jsonl \
-    --output-prefix processed_data \
-    --tokenizer-type HuggingFaceTokenizer \
-    --tokenizer-model /path/to/tokenizer.model \
-    --workers 8 \
-    --append-eod
-```
-
-### Key Arguments
-
-- `--input`: Path to input JSON/JSONL file
-- `--output-prefix`: Prefix for output binary files (.bin and .idx)
-- `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.)
-- `--tokenizer-model`: Path to tokenizer model file
-- `--workers`: Number of parallel workers for processing
-- `--append-eod`: Add end-of-document token
-
-<!-- **→ [Complete Data Preparation Guide](./docs/data-preparation.md)** - Comprehensive guide covering advanced preprocessing, dataset collection, deduplication, and optimization strategies -->
-
-# Parallelism Strategies
-
-## Data Parallelism (DP)
-
-### Standard Data Parallel
-
-```bash
-# Standard DDP - replicate model on each GPU
-torchrun --nproc_per_node=8 pretrain_gpt.py \
-    --data-parallel-sharding-strategy no_shard
-```
-
-### Fully Sharded Data Parallel (FSDP)
-
-```bash
-# Megatron's optimized FSDP (~15% faster than PyTorch FSDP2)
---use-custom-fsdp
-
-# PyTorch FSDP2
---use-torch-fsdp2
-
-# Sharding strategies
---data-parallel-sharding-strategy optim              # Shard optimizer states (ZeRO-1)
---data-parallel-sharding-strategy optim_grads        # Shard gradients + optimizer (ZeRO-2)
---data-parallel-sharding-strategy optim_grads_params # Shard parameters + gradients + optimizer (ZeRO-3)
-```
-
-## Tensor Parallelism (TP)
-
-Split individual model layers across GPUs:
-
-```bash
---tensor-model-parallel-size 4  # 4-way tensor parallelism
---sequence-parallel             # Enable sequence parallelism (recommended with TP)
-```
-
-## Pipeline Parallelism (PP)
-
-Split model depth across GPUs:
-
-```bash
---pipeline-model-parallel-size 8     # 8 pipeline stages
---virtual-pipeline-model-parallel-size 4  # Virtual pipeline for better load balancing
-```
-
-## Context Parallelism (CP)
-
-Split long sequences across GPUs for handling long contexts:
-
-```bash
---context-parallel-size 2                    # 2-way context parallelism
---cp-comm-type p2p                          # Communication: p2p, a2a, allgather, a2a+p2p
---hierarchical-context-parallel-sizes 2 4   # Hierarchical context parallelism
-```
-
-## Expert Parallelism (EP)
-
-For Mixture of Experts (MoE) models:
-
-```bash
---expert-model-parallel-size 4  # 4-way expert parallelism
---num-experts 8                 # 8 experts per MoE layer
---moe-grouped-gemm              # Optimize expert computation
-```
-
-## Combining Parallelism Strategies
-
-### Parallelism Selection Guide
-
-Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance/recommended_model_configs):
-
-| Model | Size | GPUs | TP | PP | CP | EP | Notes |
-|-------|------|------|----|----|----|----|-------|
-| **LLama-3** | 8B | 8 | 1 | 1 | 2 | 1 | CP for long seqlen (8K) |
-| **LLama-3** | 70B | 64 | 4 | 4 | 2 | 1 | TP+PP |
-| **LLama-3.1** | 405B | 1024 | 8 | 8 | 2 | 1 | 3D parallelism for scale |
-| **GPT-3** | 175B | 128-512 | 4 | 8 | 1 | 1 | Large model config |
-| **Mixtral** | 8x7B | 64 | 1 | 4 | 1 | 8 | EP for MoE |
-| **Mixtral** | 8x22B | 256 | 4 | 4 | 8 | 8 | Combined TP+EP for large MoE |
-| **DeepSeek-V3** | 671B | 1024 | 2 | 16 | 1 | 64 | Large MoE config |
-
-### MoE-Specific Requirements
-
-**Important**: When combining Expert Parallelism (EP) with Tensor Parallelism (TP), **Sequence Parallelism (SP) must be enabled**.
-
-## Performance Optimizations
-
-| Feature | Flag | Benefit |
-|---------|------|---------|
-| **FlashAttention** | `--attention-backend` | Faster attention and lower memory usage |
-| **FP8 Training** | `--fp8-hybrid` | Faster training |
-| **Activation Checkpointing** | `--recompute-activations` | Reduced memory usage |
-| **Data Parallelism Communication Overlap** | `--overlap-grad-reduce` | Faster distributed training |
-| **Distributed Optimizer** | `--use-distributed-optimizer` | Reduced checkpointing time |
-
-**→ [NVIDIA NeMo Framework Performance Tuning Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#performance-tuning-guide)** - Comprehensive performance optimization guide covering advanced tuning techniques, communication overlaps, memory optimizations, and profiling options.
-
-### FlashAttention
-
-[FlashAttention](https://github.com/Dao-AILab/flash-attention) is a fast and memory-efficient attention algorithm. We recommend the default usage, which uses cuDNN for attention via Transformer Engine and provides up to 50% speedups on forward and 84% on backward propagation with FP8 kernels. The `flash-attn` package is also supported via `--use-flash-attn`.
-
-### Mixed Precision Training
-
-```bash
---fp16                    # Standard FP16
---bf16                    # BFloat16 (recommended for large models)
---fp8-hybrid              # FP8 training (Hopper, Ada, and Blackwell GPUs)
-```
-
-### Activation Checkpointing and Recomputation
-
-```bash
-# For limited memory
---recompute-activations
-
-# For extreme memory constraints
---recompute-granularity full \
---recompute-method uniform
-```
-
-### Data Parallelism Communication Overlap
-
-```bash
---overlap-grad-reduce
---overlap-param-gather
-```
-
-### Distributed Optimizer
-
-```bash
---use-distributed-optimizer
-```
 
 # Roadmaps
 
-Stay up-to-date with our development roadmaps and planned features:
-
-- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
-- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+- **[MoE Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
 
-*More roadmap trackers will be added soon.*
-
-# Community & Support
+# Resources
 
 ## Getting Help
 
@@ -498,6 +172,8 @@ We ❤️ contributions! Ways to contribute:
 
 ## Citation
 
+If you use Megatron in your research or project, we appreciate that you use the following citations:
+
 ```bibtex
 @article{megatron-lm,
   title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
diff --git a/docker/.ngc_version.dev b/docker/.ngc_version.dev
index 6b72812b34f..8e8108b9a9a 100644
--- a/docker/.ngc_version.dev
+++ b/docker/.ngc_version.dev
@@ -1 +1 @@
-nvcr.io/nvidia/pytorch:25.09-py3
\ No newline at end of file
+nvcr.io/nvidia/pytorch:25.11-py3
\ No newline at end of file
diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev
index 7ee4dacd27b..3df1986007c 100644
--- a/docker/Dockerfile.ci.dev
+++ b/docker/Dockerfile.ci.dev
@@ -1,3 +1,4 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 # syntax=docker/dockerfile:1.3-labs
 
 ARG FROM_IMAGE_NAME
@@ -15,10 +16,17 @@ ENV UV_LINK_MODE=copy
 
 RUN bash -ex <<"EOF"
     apt-get update
-    apt-get install -y --no-install-recommends gettext python3-venv psmisc
+    apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime
     apt-get clean
     python -m venv /opt/jet
-    wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_amd64 -O /usr/local/bin/yq
+    ARCH=$(uname -m)
+    case "${ARCH}" in \
+      "x86_64") YQ_ARCH=amd64 ;; \
+      "aarch64") YQ_ARCH=arm64 ;; \
+      "armv7l") YQ_ARCH=arm ;; \
+      *) echo "Unsupported architecture: ${ARCH}" && exit 1 ;; \
+    esac 
+    wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_${YQ_ARCH} -O /usr/local/bin/yq
     chmod a+x /usr/local/bin/yq
     curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
 EOF
@@ -55,14 +63,14 @@ EOF
 COPY docker/patches/deepep.patch /workspace/deepep.patch
 RUN bash -ex <<"EOF"
     cd /workspace
-    uv pip install nvidia-nvshmem-cu13
+    uv pip install nvidia-nvshmem-cu13==3.4.5
     pushd /opt/venv/lib/python3.12/site-packages/nvidia/nvshmem/lib/
         ln -s libnvshmem_host.so.3 libnvshmem_host.so
     popd
 
     git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git
     pushd DeepEP
-        git checkout 1dddd194c26911c35b4f53a148617dd73de0ffc9
+        git checkout eb9cee7de5a24193bf09500668d3a619d3d3f3fb
         patch -p1 < /workspace/deepep.patch
     popd
     TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.
@@ -80,7 +88,8 @@ RUN --mount=type=secret,id=JET_INDEX_URLS bash -ex <<"EOF"
     JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
     python -m venv /opt/jet 
     /opt/jet/bin/pip install --no-cache-dir $JET_INDEX_URLS \
-        jet-api==$JET_API_VERSION
+        jet-api==$JET_API_VERSION \
+        "setuptools<80.0.0,>=77.0.0"
 EOF
 
 RUN --mount=type=secret,id=JET_INDEX_URLS \
@@ -88,7 +97,7 @@ RUN --mount=type=secret,id=JET_INDEX_URLS \
     JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
     LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL)
     uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger"
-    uv pip install --no-cache-dir --upgrade "setuptools<80.0.0"
-    uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=3.0" 
+    uv pip install --no-cache-dir --upgrade "setuptools<80.0.0,>=77.0.0"
+    uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=4.0" 
 EOF
 ###
diff --git a/docker/Dockerfile.ci.nemo b/docker/Dockerfile.ci.nemo
index 2369602f54d..b00349e101a 100644
--- a/docker/Dockerfile.ci.nemo
+++ b/docker/Dockerfile.ci.nemo
@@ -1,3 +1,4 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 # syntax=docker/dockerfile:1.3-labs
 
 ARG FROM_IMAGE_NAME
@@ -14,7 +15,7 @@ FROM main as jet
 ARG JET_API_VERSION
 RUN --mount=type=secret,id=JET_INDEX_URLS \
     JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
-    pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=3.0" --upgrade $JET_INDEX_URLS
+    pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=4.0" --upgrade $JET_INDEX_URLS
 
 ENV PATH="$PATH:/opt/jet/bin"
 ###
diff --git a/docker/common/install.sh b/docker/common/install.sh
index 761244a1068..01003c0e7aa 100644
--- a/docker/common/install.sh
+++ b/docker/common/install.sh
@@ -136,7 +136,7 @@ main() {
         . $UV_PROJECT_ENVIRONMENT/bin/activate
 
         pip install --pre --no-cache-dir --upgrade pip
-        pip install --pre --no-cache-dir torch pybind11 wheel_stub ninja wheel packaging "setuptools>=77.0.0"
+        pip install --pre --no-cache-dir torch pybind11 wheel_stub ninja wheel packaging "setuptools<80.0.0,>=77.0.0"
         pip install --pre --no-cache-dir --no-build-isolation .
     fi
 
diff --git a/docker/common/install_source_wheels.sh b/docker/common/install_source_wheels.sh
index 1308e604822..2f144a6ff0a 100644
--- a/docker/common/install_source_wheels.sh
+++ b/docker/common/install_source_wheels.sh
@@ -54,4 +54,4 @@ uv pip install --no-cache-dir \
     $MAMBA_WHEEL \
     $CAUSALCONV1D_WHEEL \
     $GROUPEDGEMM_WHEEL \
-    "setuptools<80.0.0"
+    "setuptools<80.0.0,>=77.0.0"
diff --git a/docs/advanced/index.md b/docs/advanced/index.md
new file mode 100644
index 00000000000..573cb0ee81a
--- /dev/null
+++ b/docs/advanced/index.md
@@ -0,0 +1,5 @@
+# Discussions
+
+In-depth technical discussions and optimization guides:
+
+- [Optimizing DeepSeek-V3 Training on GB200 NVL72](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md) - Achieving 970 TFLOPS/GPU with MXFP8, kernel optimizations, and HybridEP
diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md
index 0e78eaec669..83d9975c44f 100644
--- a/docs/api-backwards-compatibility-check.md
+++ b/docs/api-backwards-compatibility-check.md
@@ -1,3 +1,16 @@
+---
+orphan: true
+---
+
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
 # API Backward Compatibility Checking
 
 ## Overview
@@ -7,6 +20,7 @@ Megatron Core uses automated API compatibility checking to ensure stable interfa
 ## How It Works
 
 The compatibility checker:
+
 1. Compares the current code against the latest release
 2. Detects breaking changes in function signatures
 3. Fails CI if breaking changes are found (unless explicitly exempted)
@@ -72,6 +86,7 @@ def experimental_feature(x, y):
 ```
 
 **When to use `@internal_api`:**
+
 - Internal APIs not documented for external use
 - Experimental features explicitly marked as unstable
 - Functions in development that haven't been released yet
@@ -90,6 +105,7 @@ def new_experimental_feature(x, y):
 ```
 
 **When to use `@experimental_api`:**
+
 - Experimental features explicitly marked as unstable
 - New APIs under active development
 - Features that haven't been stabilized yet
@@ -113,6 +129,7 @@ def old_function(x):
 ```
 
 **Deprecation Timeline:**
+
 1. **Version N** - Add `@deprecated` decorator, function still works
 2. **Version N+1** - Keep function with deprecation warnings
 3. **Version N+2** - Remove function (users have been warned)
@@ -141,6 +158,7 @@ def train_model(config, dataloader):
 def train_model(config, dataloader, optimizer="adam"):
     pass
 ```
+
 **Result:** ✅ Check passes
 
 ---
@@ -156,6 +174,7 @@ def train_model(config, dataloader, optimizer="adam"):
 def train_model(config, dataloader):
     pass
 ```
+
 **Result:** ❌ Check fails - "Parameter 'optimizer' removed"
 
 ---
@@ -175,6 +194,7 @@ def _internal_compute(x, y):
 def _internal_compute(x, y, z):  # Added parameter
     pass
 ```
+
 **Result:** ✅ Check passes (function is exempt)
 
 ---
@@ -182,7 +202,7 @@ def _internal_compute(x, y, z):  # Added parameter
 ### Example 4: Deprecation Workflow
 
 ```python
-from megatron.core.backwards_compatibility_decorators import deprecated
+from megatron.core.utils import deprecated
 
 # Version 1.0 - Add deprecation
 @deprecated(
@@ -319,13 +339,13 @@ If the checker reports a breaking change that isn't actually breaking, file an i
 - **Script:** `scripts/check_api_backwards_compatibility.py`
 - **Workflow:** `.github/workflows/check_api_backwards_compatibility_workflow.yml`
 - **Decorators:** `megatron/core/backwards_compatibility_decorators.py`
-- **Griffe Documentation:** https://mkdocstrings.github.io/griffe/
+- **Griffe Documentation:** <https://mkdocstrings.github.io/griffe/>
 
 ## Support
 
 For questions or issues:
+
 1. Check this documentation
 2. Review existing PRs with compatibility checks
 3. Ask in the Megatron-LM Slack/Discord
 4. File an issue on GitHub
-
diff --git a/docs/api-guide/core/datasets.md b/docs/api-guide/core/datasets.md
new file mode 100644
index 00000000000..e97e99ae1db
--- /dev/null
+++ b/docs/api-guide/core/datasets.md
@@ -0,0 +1,4 @@
+# datasets package
+
+```{include} ../../../megatron/core/datasets/readme.md
+```
diff --git a/docs/api-guide/core/dist_checkpointing.md b/docs/api-guide/core/dist_checkpointing.md
new file mode 100644
index 00000000000..959aa4b07e0
--- /dev/null
+++ b/docs/api-guide/core/dist_checkpointing.md
@@ -0,0 +1,82 @@
+# dist_checkpointing package
+
+A library for saving and loading the distributed checkpoints.
+A *distributed checkpoint* in Megatron Core uses the ``torch_dist`` format,
+a custom checkpointing mechanism built on top of PyTorch's native
+checkpointing capabilities.
+
+A key property of distributed checkpoints is that a checkpoint saved under one
+parallel configuration (tensor, pipeline, or data parallelism) can be loaded
+under a different parallel configuration. This enables flexible scaling and
+resharding of models across heterogeneous training setups.
+
+Using the library requires defining sharded state_dict dictionaries with functions from  *mapping* and *optimizer* modules.
+Those state dicts can be saved or loaded with a *serialization* module using strategies from *strategies* module.
+
+## Safe Checkpoint Loading
+
+Since **PyTorch 2.6**, the default behavior of `torch.load` is `weights_only=True`.
+This ensures that only tensors and allow-listed classes are loaded, reducing the risk of arbitrary code execution.
+
+If you encounter an error such as:
+
+```bash
+WeightsUnpickler error: Unsupported global: GLOBAL argparse.Namespace was not an allowed global by default.
+```
+
+you can fix it by explicitly allow-listing the missing class in your script:
+
+```python
+import torch, argparse
+
+torch.serialization.add_safe_globals([argparse.Namespace])
+```
+
+Checkpointing Distributed Optimizer
+-----------------------------------
+
+Checkpoint Compatibility and Optimizer State Formats
+####################################################
+
+Beginning with **mcore v0.14**, the ``flattened_range`` attribute was removed from ``dist_checkpointing``. As a result:
+
+- Optimizer states saved with mcore versions < 0.14 are no longer loadable. Loading these legacy optimizer states is not supported because the required sharded metadata is no longer available.
+- Model weights from older checkpoints remain fully compatible. No additional work is required—model weights from checkpoints produced by earlier versions are loaded automatically.
+
+Distributed Optimizer Checkpoint Formats
+########################################
+
+The refactor of the Distributed Optimizer introduces **two checkpoint formats**:
+
+- dp_reshardable (Default)
+   - Fast save/load performance.
+   - Not reshardable — not possible to change model parallelism when using this format.
+   - Recommended for general training when model parallelism changes are not needed.
+- fully_reshardable
+   - Fully reshardable — supports arbitrary changes in model parallelism.
+   - Slower than dp_reshardable.
+   - Enabled via the ``--dist-ckpt-optim-fully-reshardable`` flag.
+
+Workflow for Changing Model Parallelism
+#######################################
+
+You can combine formats to optimize both flexibility and performance:
+
+   1. Train using ``dp_reshardable`` (default) for faster checkpointing.
+   2. When you need to change model parallelism:
+
+      - Stop training.
+      - Change model parallelism for train config.
+      - Resume training with ``--dist-ckpt-optim-fully-reshardable``.
+
+   3. Save at least one checkpoint under the new model parallel configuration.
+   4. (Optional) To continue the training with updated model parallelism and better checkpointing performance, stop training and switch back to ``dp_reshardable`` format by removing ``--dist-ckpt-optim-fully-reshardable``.
+
+## Subpackages
+
+```{toctree}
+:maxdepth: 4
+
+dist_checkpointing.strategies
+```
+
diff --git a/docs/api-guide/core/dist_checkpointing.strategies.md b/docs/api-guide/core/dist_checkpointing.strategies.md
new file mode 100644
index 00000000000..7aab8609504
--- /dev/null
+++ b/docs/api-guide/core/dist_checkpointing.strategies.md
@@ -0,0 +1,7 @@
+# dist_checkpointing.strategies package
+
+Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies).
+
+Strategies can be used for implementing new checkpoint formats or implementing new (more optimal for a given use case) ways of saving/loading of existing formats.
+Strategies are passed to `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure.
+
diff --git a/docs/api-guide/core/distributed.md b/docs/api-guide/core/distributed.md
new file mode 100644
index 00000000000..1921c0bdd57
--- /dev/null
+++ b/docs/api-guide/core/distributed.md
@@ -0,0 +1,10 @@
+# distributed package
+
+This package contains various utilities to finalize model weight gradients
+on each rank before the optimizer step. This includes a distributed data
+parallelism wrapper to all-reduce or reduce-scatter the gradients across
+data-parallel replicas, and a `finalize_model_grads` method to
+synchronize gradients across different parallelism modes (e.g., 'tied'
+layers on different pipeline stages, or gradients for experts in a MoE on
+different ranks due to expert parallelism).
+
diff --git a/docs/api-guide/core/fusions.md b/docs/api-guide/core/fusions.md
new file mode 100644
index 00000000000..396280ad7da
--- /dev/null
+++ b/docs/api-guide/core/fusions.md
@@ -0,0 +1,11 @@
+# fusions package
+
+This package provides modules that provide commonly fused
+operations. Fusing operations improves compute efficiency by
+increasing the amount of work done each time a tensor is read from
+memory. To perform the fusion, modules in this either rely on PyTorch
+functionality for doing just-in-time compilation
+(i.e. `torch.jit.script` in older PyTorch versions of `torch.compile`
+in recent versions), or call into custom kernels in external libraries
+such as Apex or TransformerEngine.
+
diff --git a/docs/api-guide/core/index.md b/docs/api-guide/core/index.md
new file mode 100644
index 00000000000..150fd72cb1e
--- /dev/null
+++ b/docs/api-guide/core/index.md
@@ -0,0 +1,16 @@
+# Core APIs
+
+Low-level API reference for core Megatron components.
+
+```{toctree}
+:maxdepth: 2
+
+transformer
+tensor_parallel
+pipeline_parallel
+fusions
+distributed
+datasets
+dist_checkpointing
+dist_checkpointing.strategies
+```
diff --git a/docs/api-guide/core/pipeline_parallel.md b/docs/api-guide/core/pipeline_parallel.md
new file mode 100644
index 00000000000..42fac8cc449
--- /dev/null
+++ b/docs/api-guide/core/pipeline_parallel.md
@@ -0,0 +1,7 @@
+# pipeline_parallel package
+
+This package contains implementations for two different pipeline parallelism
+schedules (one without interleaving and one with interleaving, see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473)
+for details), and a default no-pipelining schedule. It also contains methods
+for the point-to-point communication that is needed between pipeline stages.
+
diff --git a/docs/api-guide/core/tensor_parallel.md b/docs/api-guide/core/tensor_parallel.md
new file mode 100644
index 00000000000..33a9160c82b
--- /dev/null
+++ b/docs/api-guide/core/tensor_parallel.md
@@ -0,0 +1,8 @@
+# tensor_parallel package
+
+This package contains an implementation for tensor parallelism in transformer
+models (see [Megatron-LM: Training Multi-Billion Parameter Language Models
+Using Model Parallelism](https://arxiv.org/abs/1909.08053) and [Reducing
+Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198)
+for details).
+
diff --git a/docs/api-guide/core/transformer.md b/docs/api-guide/core/transformer.md
new file mode 100644
index 00000000000..c6807dfdfd0
--- /dev/null
+++ b/docs/api-guide/core/transformer.md
@@ -0,0 +1,10 @@
+# transformer package
+
+The `transformer` package provides a customizable and configurable
+implementation of the transformer model architecture. Each component
+of a transformer stack, from entire layers down to individual linear
+layers, can be customized by swapping in different PyTorch modules
+using the "spec" parameters. The
+configuration of the transformer (hidden size, number of layers,
+number of attention heads, etc.) is provided via a `TransformerConfig`
+object.
diff --git a/docs/api-guide/index.md b/docs/api-guide/index.md
new file mode 100644
index 00000000000..293e77f8ce2
--- /dev/null
+++ b/docs/api-guide/index.md
@@ -0,0 +1,12 @@
+# API Guide
+
+API reference documentation for Megatron Core components.
+
+```{toctree}
+:maxdepth: 3
+
+router_replay
+models/index
+core/index
+internal/index
+```
diff --git a/docs/api-guide/internal/index.md b/docs/api-guide/internal/index.md
new file mode 100644
index 00000000000..c216a976c77
--- /dev/null
+++ b/docs/api-guide/internal/index.md
@@ -0,0 +1,10 @@
+# Internal Utilities
+
+Internal utility APIs.
+
+```{toctree}
+:maxdepth: 2
+
+num_microbatches_calculator
+optimizer_param_scheduler
+```
diff --git a/docs/api-guide/internal/num_microbatches_calculator.md b/docs/api-guide/internal/num_microbatches_calculator.md
new file mode 100644
index 00000000000..470c9e49128
--- /dev/null
+++ b/docs/api-guide/internal/num_microbatches_calculator.md
@@ -0,0 +1,4 @@
+# Microbatches Calculator
+
+This api is used to calculate the number of microbatches required to fit a given model on a given batch size.
+
diff --git a/docs/api-guide/internal/optimizer_param_scheduler.md b/docs/api-guide/internal/optimizer_param_scheduler.md
new file mode 100644
index 00000000000..13e1f77ccc0
--- /dev/null
+++ b/docs/api-guide/internal/optimizer_param_scheduler.md
@@ -0,0 +1,4 @@
+# Optimizer Parameters Scheduler
+
+This api is used to calculate the learning rate and weight decay for the optimizer.
+
diff --git a/docs/api-guide/models/index.md b/docs/api-guide/models/index.md
new file mode 100644
index 00000000000..c6279d2409a
--- /dev/null
+++ b/docs/api-guide/models/index.md
@@ -0,0 +1,12 @@
+# Model APIs
+
+API reference for Megatron Core model implementations.
+
+```{toctree}
+:maxdepth: 2
+
+models
+models.gpt
+models.bert
+models.t5
+```
diff --git a/docs/api-guide/models/models.bert.md b/docs/api-guide/models/models.bert.md
new file mode 100644
index 00000000000..3c53027c7c9
--- /dev/null
+++ b/docs/api-guide/models/models.bert.md
@@ -0,0 +1,4 @@
+# models.bert package
+
+Useful package for training bert and bert like encoder only models. It optionally comes with a binary head that can be used for classification tasks .
+
diff --git a/docs/api-guide/models/models.gpt.md b/docs/api-guide/models/models.gpt.md
new file mode 100644
index 00000000000..a7c254d348b
--- /dev/null
+++ b/docs/api-guide/models/models.gpt.md
@@ -0,0 +1,4 @@
+# models.gpt package
+
+This is the implementation of the popular GPT model. It supports several features like model parallelization (Tensor Parallel, Pipeline Parallel, Data Parallel) , mixture of experts, FP8 , Distributed optimizer etc. We are constantly adding new features. So be on the lookout or raise an issue if you want to have something added.
+
diff --git a/docs/api-guide/models/models.md b/docs/api-guide/models/models.md
new file mode 100644
index 00000000000..69dfc80211d
--- /dev/null
+++ b/docs/api-guide/models/models.md
@@ -0,0 +1,14 @@
+# models package
+
+This package contains most of the popular LLMs . Currently we have support for GPT, Bert, and T5 . This is an ever growing list so keep an eye out.
+
+## Subpackages
+
+```{toctree}
+:maxdepth: 4
+
+models.gpt
+models.t5
+models.bert
+```
+
diff --git a/docs/api-guide/models/models.t5.md b/docs/api-guide/models/models.t5.md
new file mode 100644
index 00000000000..90952096b63
--- /dev/null
+++ b/docs/api-guide/models/models.t5.md
@@ -0,0 +1,2 @@
+# models.t5 package
+
diff --git a/docs/api-guide/router_replay.md b/docs/api-guide/router_replay.md
new file mode 100644
index 00000000000..cf479afb4e3
--- /dev/null
+++ b/docs/api-guide/router_replay.md
@@ -0,0 +1,177 @@
+# Design Document: MoE Router Replay Feature
+
+## 1. Overview
+
+This document provides a detailed description of the "Router Replay" feature implemented within the Megatron-LM Core for Mixture-of-Experts (MoE) models.
+
+This feature is designed to enhance determinism and analyzability in MoE model training and inference. It enables the model to load routing decisions from a predefined file and enforce their use during the forward pass, thereby bypassing the real-time routing computation.
+
+## 2. Motivation
+
+*   **Determinism & Reproducibility**: In distributed training, MoE routing decisions can exhibit minor variations due to factors like floating-point precision. By replaying a fixed routing table, the MoE computation path is guaranteed to be identical across runs, which facilitates debugging and reproducing experimental results.
+*   **Performance Profiling**: The router's own computation (e.g., logits calculation, top-k selection) incurs overhead. In replay mode, this part of the computation can be completely skipped, allowing for more precise isolation and profiling of performance bottlenecks within the Expert Layers themselves.
+*   **Debugging Aid**: When issues arise in the model, fixing the routing decisions helps to isolate variables, making it easier to determine whether the problem lies with the routing mechanism or the expert computations.
+
+## 3. Design and Architecture
+
+The design follows the principles of being non-intrusive and on-demand, with the core idea of activating the replay logic only when explicitly requested by the user.
+
+*   **Core Components**:
+    *   `RouterReplay` (located in `megatron/core/transformer/moe/router_replay.py`): A utility class for replaying MoE routing decisions. When enabled via the `moe_enable_routing_replay` flag, a separate instance of `RouterReplay` is created for each MoE layer's router. Each instance is responsible for loading routing data and providing the deterministic routing decisions for its corresponding layer during the forward pass.
+    *   `moe_enable_routing_replay` (located in `megatron/core/transformer/transformer_config.py`): A boolean global configuration flag that serves as the sole entry point for enabling this feature.
+
+*   **Workflow**:
+    The feature supports different modes, such as recording and replaying, controlled by a `RouterReplayAction`.
+
+    1.  **Enabling the Feature**: The user sets `moe_enable_routing_replay` to `True` in the model configuration.
+    2.  **Initialization**: When `moe_enable_routing_replay` is true, each `TopKRouter` creates its own `RouterReplay` instance.
+    3.  **Mode Configuration**: The user must programmatically set the desired router replay action (e.g., `record`, `forward_replay`, `backward_replay`) on the `RouterReplay` instances.
+    4.  **Execution Flow (within a mini-batch)**:
+        *   **Forward Pass**:
+            *   For each micro-batch, the `topk_routing_with_score_function` checks the `router_replay_action`.
+            *   **In `record` mode**: The dynamically computed `top-k` expert indices are captured and stored.
+            *   **In `forward_replay` mode**: The function retrieves pre-loaded expert indices from `target_topk_idx`. These indices are used for the forward computation and are also appended to the `replay_backward_list` to prepare for the backward pass.
+        *   **Backward Pass**:
+            *   For each micro-batch (processed in reverse order in pipeline parallelism), the `router_replay_action` is checked again.
+            *   **In `backward_replay` mode**: The function retrieves the expert indices for the corresponding micro-batch by popping them from the `replay_backward_list`. This mode is intended for training recomputation (e.g., activation checkpointing and pipeline recompute) so the same routing decisions are used during recompute/backward as in forward, ensuring determinism and correctness.
+
+## 4. Implementation Details
+
+The implementation cleanly separates the replay logic from the router's core computation.
+
+*   **`megatron/core/transformer/transformer_config.py`**:
+    *   Adds the configuration option `moe_enable_routing_replay: bool = False`.
+
+*   **`megatron/core/transformer/moe/moe_utils.py`**:
+    *   Introduces the `RouterReplay` class to manage the state for recording and replaying routing decisions for a single MoE layer.
+        *   `target_topk_idx`: An attribute holding the expert indices for the current micro-batch during forward replay mode.
+        *   `recorded_topk_idx`: An attribute for storing the computed expert indices when in record mode.
+        *   `replay_backward_list`: A list that accumulates the `top-k` indices used during the forward passes of a mini-batch. This list is consumed in FIFO order during the backward pass to ensure correctness under pipeline parallelism.
+        *   `set_target_indices()`: A method to load the replay indices into `target_topk_idx` for the forward pass.
+        *   `record_indices()`: A method to save the computed indices.
+    *   The `topk_routing_with_score_function` is modified to contain the core logic. It checks the `router_replay_action` on the `router_replay` instance and accordingly performs one of the following actions: computes and records indices, replays indices from `target_topk_idx` (for forward), replays indices from `replay_backward_list` (for backward), or falls through to the default dynamic routing.
+
+### Training recompute usage
+
+- During forward replay, `set_target_indices()` prepares `replay_backward_list` so each micro-batch’s indices are available for recomputation.
+- During recompute/backward, set action to `REPLAY_BACKWARD` so indices are consumed in FIFO order to mirror the forward sequence.
+
+## 5. Usage Guide
+
+1.  **Enable & Instantiate**
+    - Create one `RouterReplay` instance per MoE router layer when building the model.
+    - Optionally use the global helpers to set/clear actions across all layers.
+2.  **Record Routing Decisions**
+    - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)`.
+    - Run the model; retrieve per-layer indices via `RouterReplay.get_recorded_data()` and persist.
+3.  **Forward Replay**
+    - Load indices and distribute: `RouterReplay.set_replay_data(list_of_tensors)`.
+    - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)`.
+    - Run the model; dynamic top‑k is bypassed and target indices are used.
+4.  **Backward Replay**
+    - For training recomputation (activation checkpointing or pipeline recompute), set action: `REPLAY_BACKWARD` during recomputation.
+    - Per micro‑batch indices are consumed from `replay_backward_list` in FIFO order.
+5.  **Cleanup**
+    - Use `RouterReplay.clear_global_indices()`, `RouterReplay.clear_global_router_replay_action()`, and `RouterReplay.clear_global_router_replay_instances()` to restore default behavior and prevent memory leaks.
+
+### Quick usage with `topk_routing_with_score_function`
+
+```python
+import torch
+from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
+from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function
+
+rr = RouterReplay()
+
+# Record
+RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)
+logits = torch.randn(8, 16)
+probs_rec, routing_map_rec = topk_routing_with_score_function(
+    logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr,
+)
+recorded = rr.get_recorded_indices()
+torch.save(recorded, "/tmp/replay.pt")
+
+# Forward replay
+rr.clear_router_replay_action()
+rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+target = torch.load("/tmp/replay.pt")
+rr.set_target_indices(target)
+probs_rep, routing_map_rep = topk_routing_with_score_function(
+    logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr,
+)
+
+RouterReplay.clear_global_router_replay_action()
+RouterReplay.clear_global_indices()
+RouterReplay.clear_global_router_replay_instances()
+```
+
+## 6. Minimal Demo
+
+Here is a minimal code example showing how to use RouterReplay for recording and replaying:
+
+```python
+import torch
+import torch.distributed as dist
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.moe.router import TopKRouter
+from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
+
+
+# Initialize distributed training
+if not dist.is_initialized():
+    dist.init_process_group(backend="nccl")
+
+# Create a transformer config with RouterReplay enabled
+config = TransformerConfig(
+    num_experts=8,
+    expert_model_parallel_size=1,
+    num_top_k=2,
+    moe_enable_routing_replay=True
+)
+
+# Create a TopKRouter instance
+router = TopKRouter(config)
+
+# Generate sample input (batch_size, sequence_length, hidden_size)
+logits = torch.randn(16, 32, 8).to(torch.cuda.current_device())
+
+# -----------------
+# 1. Recording Mode
+# -----------------
+print("=== Recording Mode ===")
+# Set global router replay action to RECORD
+RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)
+
+# Perform routing
+routing_output = router.forward(logits)
+print(f"Recorded top-k indices shape: {routing_output.top_k_idx.shape}")
+
+# -----------------
+# 2. Forward Replay Mode
+# -----------------
+print("\n=== Forward Replay Mode ===")
+# Save recorded indices to a file
+torch.save(routing_output.top_k_idx, "/tmp/replay.pt")
+
+# Load indices from file and set as target for replay
+replay_indices = torch.load("/tmp/replay.pt")
+for router_instance in RouterReplay.global_router_replay_instances:
+    router_instance.target_topk_idx = replay_indices
+
+# Set global router replay action to REPLAY_FORWARD
+RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+
+# Perform routing again - this will use the replayed indices
+replay_routing_output = router.forward(logits)
+print(f"Replayed top-k indices shape: {replay_routing_output.top_k_idx.shape}")
+print(f"Are indices the same? {torch.equal(routing_output.top_k_idx, replay_routing_output.top_k_idx)}")
+
+
+# Clean up
+RouterReplay.clear_global_router_replay_action()
+RouterReplay.clear_global_indices()
+RouterReplay.clear_global_router_replay_instances()
+if dist.is_initialized():
+    dist.destroy_process_group()
+```
diff --git a/docs/autodoc2_docstrings_parser.py b/docs/autodoc2_docstrings_parser.py
new file mode 100644
index 00000000000..14b722de65b
--- /dev/null
+++ b/docs/autodoc2_docstrings_parser.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from docutils import nodes
+from myst_parser.parsers.sphinx_ import MystParser
+from sphinx.ext.napoleon.docstring import GoogleDocstring
+
+
+class NapoleonParser(MystParser):
+    """Add support for Google style docstrings."""
+
+    def parse(self, input_string: str, document: nodes.document) -> None:
+        """Parse Google style docstrings."""
+
+        # Get the Sphinx configuration
+        config = document.settings.env.config
+
+        # Process with Google style
+        google_parsed = str(GoogleDocstring(input_string, config))
+
+        return super().parse(google_parsed, document)
+
+
+Parser = NapoleonParser
diff --git a/docs/broken_links_false_positives.json b/docs/broken_links_false_positives.json
new file mode 100644
index 00000000000..01377be5804
--- /dev/null
+++ b/docs/broken_links_false_positives.json
@@ -0,0 +1,3 @@
+{
+    "uri": "http://localhost:8080/"
+}
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000000..4f9882907de
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import os
+import sys
+
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = "Megatron Core"
+copyright = "2026, NVIDIA Corporation"
+author = "NVIDIA Corporation"
+release = "0.16.0"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "myst_parser",  # For our markdown docs
+    "sphinx.ext.viewcode",  # For adding a link to view source code in docs
+    "sphinx.ext.doctest",  # Allows testing in docstrings
+    "sphinx.ext.napoleon",  # For google style docstrings
+    "sphinx_copybutton",  # For copy button in code blocks
+]
+
+# Check if we should skip autodoc generation
+# usage: SKIP_AUTODOC=true
+skip_autodoc = os.environ.get("SKIP_AUTODOC", "false").lower() == "true"
+
+if not skip_autodoc:
+    extensions.append("autodoc2")  # Generates API docs
+
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# -- Options for MyST Parser (Markdown) --------------------------------------
+# MyST Parser settings
+myst_enable_extensions = [
+    "dollarmath",  # Enables dollar math for inline math
+    "amsmath",  # Enables LaTeX math for display mode
+    "colon_fence",  # Enables code blocks using ::: delimiters instead of ```
+    "deflist",  # Supports definition lists with term: definition format
+    "fieldlist",  # Enables field lists for metadata like :author: Name
+    "tasklist",  # Adds support for GitHub-style task lists with [ ] and [x]
+    "attrs_block",  # Enables setting attributes on block elements using {#id .class key=val}
+]
+myst_heading_anchors = 5  # Generates anchor links for headings up to level 5
+
+# Suppress "more than one target found for cross-reference" warnings for Python symbols
+# that have the same name across multiple modules (e.g. DistributedDataParallelConfig,
+# ModelType). These are structural ambiguities in the codebase – the cross-reference
+# still resolves; Sphinx just cannot pick the unique target automatically.
+suppress_warnings = ["ref.python"]
+
+# -- Options for Autodoc2 ---------------------------------------------------
+sys.path.insert(0, os.path.abspath(".."))
+
+if not skip_autodoc:
+    autodoc2_packages = [
+        {
+            "path": "../megatron/core",  # Path to your package relative to conf.py
+            "exclude_dirs": ["converters"],  # list of directory names to exclude
+        }
+    ]
+    autodoc2_render_plugin = "myst"  # Use MyST for rendering docstrings
+    autodoc2_output_dir = "apidocs"  # Output directory for autodoc2 (relative to docs/)
+    # This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to
+    # render google style docstrings.
+    # Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33
+    autodoc2_docstring_parser_regexes = [
+        (r".*", "docs.autodoc2_docstrings_parser"),
+    ]
+    # Regex patterns whose values contain raw regex syntax (e.g. \p{L}) that docutils
+    # mis-parses as footnote/reference markup. Exclude them from the generated docs.
+    autodoc2_hidden_regexes = [
+        r".*\._PATTERN_TIKTOKEN.*",
+    ]
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "nvidia_sphinx_theme"
+html_theme_options = {
+    "switcher": {
+        "json_url": "versions1.json",
+        "version_match": release,
+    },
+    "icon_links": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/NVIDIA/Megatron-LM/",
+            "icon": "fa-brands fa-github",
+        }
+    ],
+    "extra_head": {
+        """
+    <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
+    """
+    },
+    "extra_footer": {
+        """
+    <script type="text/javascript">if (typeof _satellite !== "undefined") {_satellite.pageBottom();}</script>
+    """
+    },
+}
+html_extra_path = ["project.json", "versions1.json"]
+
+# Github links are now getting rate limited from the Github Actions
+linkcheck_ignore = [
+    ".*github\\.com.*",
+    ".*githubusercontent\\.com.*",
+]
diff --git a/docs/developer/contribute.md b/docs/developer/contribute.md
new file mode 100644
index 00000000000..859b5562f4b
--- /dev/null
+++ b/docs/developer/contribute.md
@@ -0,0 +1,61 @@
+# Contributing to Megatron-LM
+
+This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM GitHub repository.
+
+Everyone is welcome to contribute to the project! We recently migrated from using an internal repo to doing all development directly from the GitHub repository.
+
+When contributing it is important to ensure that changes are in line with the project direction. Small changes to fix bugs are welcomed and appreciated. If proposing large architectural changes or changes for stylistic reasons open an issue first so we can discuss it.
+
+## Issue policy
+
+Please do file any bugs you find, keeping the following in mind:
+
+- If filing a bug, i.e. you have found something that doesn't work as expected, use the BUG template.
+- If you've found a regression in speed or accuracy use the REGRESSION template.
+- If you are requesting a new feature or modification of an existing feature use the ENHANCEMENT template.
+- If opening an issue to ask a question no template is needed but please make your question as clear and concise as possible.
+- One issue per bug. Putting multiple things in the same issue makes both discussion and completion unnecessarily complicated.
+- Your bug is mostly likely to get attention from the development team quickly if we can easily reproduce it.
+- Use proper spelling, grammar, and punctuation.
+- Write in an authoritative and technical tone.
+
+## Code submission policy
+
+### Do
+
+- Format new code in a style that is consistent with the file being changed. Megatron-LM doesn't (yet) have a style guide or enforced formatting.
+- Split your changes into separate, atomic commits i.e. A commit per feature or fix.
+- Make sure your commits are rebased on the master branch.
+- Write the commit message subject line in the imperative mood ("Change the default argument for X", not "Changed the default argument for X").
+- Write your commit messages in proper English, with care and punctuation.
+- Check the spelling of your code, comments and commit messages.
+
+### Don't
+
+- Submit code that's incompatible with the project licence.
+- Touch anything outside the stated scope of the PR. This includes formatting changes to code not relevant to the PR.
+- Iterate excessively on your design across multiple commits.
+- Include commented-out code.
+- Attempt large architectural changes without first opening an issue to discuss.
+
+## Issue and Pull Request Q&A
+
+### I've submitted an issue and PR. When can I expect to get some feedback?
+
+You should receive a response within 2 business days.
+
+### I need help, who should I ping?
+
+Use [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall).
+
+### If my issue or PR isn't getting attention, what should I do?
+
+After 2 business days, tag the user [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall).
+
+### Is there a policy for issues and PRs that haven't been touched in X days? Should they be closed?
+
+Yes, we have a bot that will mark untouched PRs as "stale" after 60 days.
+
+We have a long backlog of issues and PRs dating back years. We are trying to triage these now by working backwards. Older issues we believe may still be relevant may recieve a request to re-test them with the latest code. If there's no response they may be closed. Again, if you they should be re-opened then just respond with a comment to that effect.
+
+Thank you!
\ No newline at end of file
diff --git a/docs/developer/generate_docs.md b/docs/developer/generate_docs.md
new file mode 100644
index 00000000000..52fa288122d
--- /dev/null
+++ b/docs/developer/generate_docs.md
@@ -0,0 +1,13 @@
+# Generating Docs Locally
+
+To generate docs locally, use the following commands:
+
+```
+cd docs
+uv run --only-group docs sphinx-autobuild . _build/html --port 8080 --host 127.0.0.1
+```
+
+Docs will be generated at <http://localhost:8080/>.
+
+**Recommended:** set the environment variable `SKIP_AUTODOC=true` when generating docs 
+to skip the generation of `apidocs`.
\ No newline at end of file
diff --git a/docs/developer/oncall.md b/docs/developer/oncall.md
new file mode 100644
index 00000000000..b88da7bb6df
--- /dev/null
+++ b/docs/developer/oncall.md
@@ -0,0 +1,48 @@
+# Oncall Overview
+
+During your oncall week, you will be assigned to all PRs marked “Ready for 
+Review”. From a high-level, your responsibilities include:
+
+- Review all new PRs
+- Accelerate the review process
+- Ensure issues and discussion questions are answered
+
+## PR Responsibilities
+
+Below is the checklist that the oncall needs to go through for each PR.
+
+- Should the PR remain a single PR?
+  - Each PR should have at most 1 expert reviewer, although there will be some outlier cases
+- Label PR as “complexity: low”, “complexity: medium”, or “complexity: high” depending on complexity
+  - Expert reviewers have final say, oncall just sets the initial complexity level
+  - Initial complexity level guideline
+    - Low: <100 lines changed
+    - Medium: 100 < lines changed < 500
+    - High: > 500 lines changed
+- Does this PR have proper testing coverage?
+  - If new logic is added, is the new logic tested?
+- Should the PR add documentation for any new features?
+- Does the PR conform to our style guidelines?
+  - Code structure
+  - Cleanliness
+  - Comments
+  - File structure
+- Do all tests pass?
+  - Oncall will need to kick off testing suite for external reviewers
+  - Comment “/ok to test commid_id” to kick off testing suite
+- Add the “Expert Review” label
+  - Select an expert reviewer from each expert group as a reviewer. If you’re unsure who to select, pick a “maintainer” or manager.
+  - **Expert reviewers should review within 1 business day.** Message the assigned reviewer if it is taking longer. The reviewer either needs to review the PR or suggest an alternate reviewer.
+  - If the reviewer is not responding after 2 business days, escalate to the reviewer's manager.
+- Add the “Final Review” label after experts approve
+  - Final reviewers should review within 1 business day. Message the assigned reviewer if it is taking longer.
+  - If the reviewer is not responding after 2 business days, escalate to the reviewer's manager.
+
+## Issues and Discussion Questions
+
+If you do not know the answer to an issue or discussion question: that's ok! **Delegate to someone who does.**
+
+On a daily basis, track the following:
+
+- [new issues](https://github.com/NVIDIA/Megatron-LM/issues): check to see if there are any new issues before they become out of SLA!
+- [out of SLA issues](https://github.com/orgs/NVIDIA-NeMo/projects/20/views/4?sliceBy%5Bvalue%5D=NVIDIA%2FMegatron-LM): useful dashboard that tracks all out of SLA issues
diff --git a/docs/developer/submit.md b/docs/developer/submit.md
new file mode 100644
index 00000000000..a096312d21e
--- /dev/null
+++ b/docs/developer/submit.md
@@ -0,0 +1,16 @@
+# How to Submit a PR
+
+## Step 1: Add PR label `Expert Review`
+
+## Step 2: Collect the expert reviewers reviews
+
+1. Attach the `Expert Review` label when your PR is ready for review.
+2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
+
+:warning: Only proceed to the next step once all reviewers have approved, merge-conflict are resolved and the CI is passing.  
+Final Review might get declined if these requirements are not fulfilled.
+
+## Step 3: Final Review
+
+1. Add `Final Review` label
+2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
diff --git a/docs/discussions/README.md b/docs/discussions/README.md
new file mode 100644
index 00000000000..a2662785a31
--- /dev/null
+++ b/docs/discussions/README.md
@@ -0,0 +1,35 @@
+---
+orphan: true
+---
+
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
+# Megatron Discussions
+
+This directory contains in-depth guides, tutorials, and discussions about optimizing and using Megatron for various use cases.
+
+## Available Guides
+
+### Training Guides
+
+- **[Megatron-FSDP User Guide](megatron-fsdp-user-guide/megatron-fsdp-user-guide.md)**
+
+  A practical guide to enable Megatron-FSDP training, including a quick-start example for DeepSeek-V3, required and recommended configurations, and instructions for checkpoint conversion from torch_dist to fsdp_dtensor.
+
+## Contributing
+
+If you'd like to contribute a guide or tutorial, please follow this structure:
+
+1. Create a new directory: `docs/discussions/your-guide-name/`
+2. Add your main guide: `docs/discussions/your-guide-name/your-guide-name.md`
+3. Create an images directory: `docs/discussions/your-guide-name/images/`
+4. Update this README.md with a link to your guide
+
+Each guide should be self-contained with its own images and supporting files.
diff --git a/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh
new file mode 100644
index 00000000000..9f302c93f8f
--- /dev/null
+++ b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Configuration: Set these paths before running the script
+MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository
+CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url
+OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for SLURM logs
+
+# Checkpoint conversion command
+# Note: Update the checkpoint paths in the command below
+RUN_CMD="
+cd ${MEGATRON_PATH};
+git rev-parse HEAD;
+export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH};
+python3 tools/checkpoint/checkpoint_inspector.py \
+    convert-torch-dist-to-fsdp-dtensor --swiglu \
+    your_own_path_to_input_torch_dist_checkpoint \
+    your_own_path_to_output_fsdp_dtensor_checkpoint \
+    --param-to-param-group-map-json your_own_path_to_param_to_param_group_map.json"
+
+# SLURM settings
+SLURM_LOGS="${OUTPUT_PATH}/slurm_logs"
+mkdir -p ${SLURM_LOGS} || {
+    echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}"
+    exit 1
+}
+
+# Submit SLURM job
+# Note: Update SBATCH parameters below according to your cluster configuration
+set +e
+sbatch <<EOF
+#!/bin/bash
+
+#SBATCH --job-name=your_own_job_name
+#SBATCH --partition=your_own_partition
+#SBATCH --nodes=your_own_num_nodes
+#SBATCH --ntasks-per-node=your_own_tasks_per_node
+#SBATCH --gres=gpu:your_own_gpu_per_node
+#SBATCH --time=your_own_time
+#SBATCH --account=your_own_account
+#SBATCH --exclusive
+#SBATCH --dependency=singleton
+
+srun --mpi=pmix -l \
+    --container-image=${CONTAINER_IMAGE} \
+    --container-mounts=your_own_container_mounts \
+    --container-workdir=${MEGATRON_PATH} \
+    bash -x -c "${RUN_CMD}" 2>&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log
+
+EOF
+set -e
diff --git a/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh
new file mode 100644
index 00000000000..7b93d25d943
--- /dev/null
+++ b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh
@@ -0,0 +1,223 @@
+#!/bin/bash
+
+export NCCL_IB_SL=1
+export NCCL_IB_TIMEOUT=19
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export PYTHONWARNINGS=ignore
+export TRITON_CACHE_DIR=/tmp/triton_cache_$SLURM_NODEID
+
+# Configuration: Set these variables before running the script
+MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository
+CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url
+OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for output logs and checkpoints
+DATA_PATH=${DATA_PATH:-"your_own_data_path"}
+USE_MEGATRON_FSDP=${USE_MEGATRON_FSDP:-1}
+SHARDING_STRATEGY=${SHARDING_STRATEGY:-"optim_grads_params"}
+PROFILE=${PROFILE:-0}
+WANDB=${WANDB:-1}
+
+TP=${TP:-1}
+EP=${EP:-8}
+MBS=${MBS:-4}
+GBS=${GBS:-2048}
+COMMENT=${COMMENT:-"hybridep-selective-recompute"}
+
+PRETRAIN_ARGS=(
+    --distributed-timeout-minutes 60
+    --tensor-model-parallel-size ${TP}
+    --expert-model-parallel-size ${EP}
+    --expert-tensor-parallel-size 1
+    --context-parallel-size 1
+    --use-distributed-optimizer
+    --overlap-grad-reduce
+    --overlap-param-gather
+    --use-mcore-models
+    --sequence-parallel
+    --use-flash-attn
+    --disable-bias-linear
+    --micro-batch-size ${MBS}
+    --global-batch-size ${GBS}
+    --train-samples 585937500
+    --exit-duration-in-mins 220
+    --no-check-for-nan-in-loss-and-grad
+    --manual-gc
+    --manual-gc-interval 10
+    --recompute-granularity selective
+    --recompute-modules mlp moe mla_up_proj layernorm
+    --transformer-impl transformer_engine
+    --seq-length 4096
+    --data-cache-path ${OUTPUT_PATH}/cache
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model deepseek-ai/DeepSeek-V3
+    --data-path ${DATA_PATH}
+    --split 99,1,0
+    --no-mmap-bin-files
+    --no-create-attention-mask-in-dataloader
+    --num-workers 6
+    --num-layers 61
+    --hidden-size 7168
+    --ffn-hidden-size 18432
+    --num-attention-heads 128
+    --kv-channels 128
+    --max-position-embeddings 4096
+    --position-embedding-type rope
+    --rotary-base 10000
+    --make-vocab-size-divisible-by 3232
+    --normalization RMSNorm
+    --norm-epsilon 1e-6
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --multi-latent-attention
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --clip-grad 1.0
+    --weight-decay 0.1
+    --qk-layernorm
+    --lr-decay-samples 584765624
+    --lr-warmup-samples 1536000
+    --lr-warmup-init 3.9e-7
+    --lr 3.9e-6
+    --min-lr 3.9e-7
+    --lr-decay-style cosine
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --num-experts 256
+    --moe-layer-freq [0]*3+[1]*58
+    --moe-ffn-hidden-size 2048
+    --moe-shared-expert-intermediate-size 2048
+    --moe-router-load-balancing-type seq_aux_loss
+    --moe-router-topk 8
+    --moe-token-dispatcher-type flex
+    --moe-flex-dispatcher-backend hybridep
+    --moe-router-pre-softmax
+    --moe-grouped-gemm
+    --moe-aux-loss-coeff 1e-4
+    --moe-router-group-topk 4
+    --moe-router-num-groups 8
+    --moe-router-topk-scaling-factor 2.5
+    --moe-router-score-function sigmoid
+    --moe-router-enable-expert-bias
+    --moe-router-bias-update-rate 1e-3
+    --moe-router-dtype fp32
+    --moe-permute-fusion
+    --moe-router-force-load-balancing
+    --q-lora-rank 1536
+    --kv-lora-rank 512
+    --qk-head-dim 128
+    --qk-pos-emb-head-dim 64
+    --v-head-dim 128
+    --rotary-scaling-factor 40
+    --mscale 1.0
+    --mscale-all-dim 1.0
+    --mtp-num-layers 1
+    --mtp-loss-scaling-factor 0.1
+    --eval-iters 32
+    --eval-interval 100
+    --auto-detect-ckpt-format
+    --load ${OUTPUT_PATH}/checkpoints
+    --save ${OUTPUT_PATH}/checkpoints
+    --save-interval 100
+    --dist-ckpt-strictness log_all
+    --init-method-std 0.02
+    --log-timers-to-tensorboard
+    --log-memory-to-tensorboard
+    --log-num-zeros-in-grad
+    --log-params-norm
+    --log-validation-ppl-to-tensorboard
+    --log-throughput
+    --log-interval 1
+    --logging-level 40
+    --tensorboard-dir ${OUTPUT_PATH}/tensorboard
+    --bf16
+    --enable-experimental
+) 
+
+if [ "${USE_MEGATRON_FSDP}" = 1 ]; then
+    unset CUDA_DEVICE_MAX_CONNECTIONS
+    PRETRAIN_ARGS=(
+        "${PRETRAIN_ARGS[@]}"
+        --use-megatron-fsdp
+        --data-parallel-sharding-strategy ${SHARDING_STRATEGY}
+        --no-gradient-accumulation-fusion
+        --use-distributed-optimizer
+        --calculate-per-token-loss
+        --init-model-with-meta-device
+        --ckpt-format fsdp_dtensor
+        --grad-reduce-in-bf16
+        --fsdp-double-buffer
+        --use-nccl-ub
+    )
+fi
+
+# Profiling command
+if [ "${PROFILE}" = 1 ]; then
+    PROFILE_CMD="nsys profile --sample=none --cpuctxsw=none --trace=cuda,nvtx,cublas,cudnn \
+        --capture-range=cudaProfilerApi \
+        --capture-range-end=stop \
+        --cuda-graph-trace=node \
+        --cuda-memory-usage=true \
+        -f true -x true \
+        -o ${OUTPUT_PATH}/nsys/Megatron-FSDP-Deepseek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT}"
+    PRETRAIN_ARGS=(
+        "${PRETRAIN_ARGS[@]}"
+        --profile
+        --profile-step-start 10
+        --profile-step-end 12
+        --profile-ranks 0
+    )
+    echo "PROFILE_CMD="
+    echo $PROFILE_CMD
+else
+    PROFILE_CMD=""
+fi
+
+if [ "${WANDB}" = 1 ]; then
+    export WANDB_API_KEY=${WANDB_API_KEY:-"your_own_wandb_api_key"}
+    PRETRAIN_ARGS=(
+        "${PRETRAIN_ARGS[@]}"
+        --wandb-project your_own_wandb_project
+        --wandb-exp-name DeepSeek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT}
+    )
+fi
+
+TRAINING_CMD="
+cd ${MEGATRON_PATH};
+git rev-parse HEAD;
+export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH};
+${PROFILE_CMD} python ${MEGATRON_PATH}/pretrain_gpt.py ${PRETRAIN_ARGS[@]}"
+
+# SLURM settings
+SLURM_LOGS="${OUTPUT_PATH}/slurm_logs"
+mkdir -p ${SLURM_LOGS} || {
+    echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}"
+    exit 1
+}
+
+# Submit SLURM job
+# Note: Update SBATCH parameters below according to your cluster configuration
+set +e
+sbatch <<EOF
+#!/bin/bash
+
+#SBATCH --job-name=your_own_job_name
+#SBATCH --partition=your_own_partition
+#SBATCH --nodes=your_own_num_nodes
+#SBATCH --ntasks-per-node=your_own_tasks_per_node
+#SBATCH --gres=gpu:your_own_gpu_per_node
+#SBATCH --time=your_own_time
+#SBATCH --account=your_own_account
+#SBATCH --exclusive
+#SBATCH --dependency=singleton
+
+srun \
+    --mpi=pmix -l \
+    --container-image=${CONTAINER_IMAGE} \
+    --container-mounts=your_own_container_mounts \
+    --container-workdir=${MEGATRON_PATH} \
+    bash -x -c "${TRAINING_CMD}" 2>&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log
+
+EOF
+set -e
diff --git a/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md
new file mode 100644
index 00000000000..59be2bedef5
--- /dev/null
+++ b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md
@@ -0,0 +1,130 @@
+---
+orphan: true
+---
+
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
+# Megatron-FSDP User Guide
+
+## Table of Contents
+
+- [Megatron-FSDP Quick Start](#megatron-fsdp-quick-start)
+- [Checkpoint Conversion from 3D-Parallel to Megatron-FSDP](#checkpoint-conversion-from-3d-parallel-to-megatron-fsdp)
+
+## Megatron-FSDP Quick Start
+
+We recommend using the latest [NVIDIA NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags), which provides a tested software stack and optimized performance.
+
+For your reference, we provide an example launch script for DeepSeek-V3: [`sbatch_mfsdp_deepseek_v3.sh`](./example-scripts/sbatch_mfsdp_deepseek_v3.sh).
+
+### Required Configurations
+
+To enable Megatron-FSDP, add the following required flags to your training script:
+
+```bash
+--use-megatron-fsdp
+--data-parallel-sharding-strategy optim_grads_params
+--no-gradient-accumulation-fusion
+--use-distributed-optimizer
+--ckpt-format fsdp_dtensor
+```
+
+### Recommended Configurations
+
+We also recommend adding the following configurations to further improve performance:
+
+```bash
+unset CUDA_DEVICE_MAX_CONNECTIONS
+```
+
+```bash
+--calculate-per-token-loss
+--init-model-with-meta-device
+--grad-reduce-in-bf16
+--fsdp-double-buffer
+--use-nccl-ub
+```
+
+💡 **Detailed explanations of these configurations are provided below.**
+
+#### 1. Disable `CUDA_DEVICE_MAX_CONNECTIONS`
+
+To ensure full parallelization of FSDP communication and computation, disable the CUDA_DEVICE_MAX_CONNECTIONS environment variable. This step avoids potential bubbles in the CUDA stream. (But it may slow down TP and CP to some extent.)
+
+#### 2. Add `--calculate-per-token-loss`
+
+For gradients sharding mode optimization, include the `--calculate-per-token-loss` flag in your training script. This improves performance by reducing the frequency of gradient scaling, which is also a sizable drain on SM resources.
+
+#### 3. Add `--init-model-with-meta-device`
+
+Allows model initialization using meta device, followed by layer-by-layer initialization of distributed model weight buffers via the `Module.reset_parameters` API, facilitating the initialization of extremely large models.
+
+#### 4. Add `--grad-reduce-in-bf16`
+
+Enables gradient reduction in BF16 precision instead of FP32, reducing communication volume and accelerating the backward pass.
+
+#### 5. Add `--fsdp-double-buffer`
+
+Uses persistently allocated double buffers for temporarily-defined memory needed in `MegatronFSDP` communications. While having persistent double buffers may increase peak VRAM utilization, it is necessary to register NCCL user buffers (`nccl_ub=True`) for `MegatronFSDP`. Currently, this is supported only for simple repetitive model structures such as GPT.
+
+- **Only effective when using Megatron-LM.**
+- Defaults to `False`. Automatically overridden to `True` when `nccl_ub` is enabled.
+
+#### 6. Add `--use-nccl-ub`
+
+Allocates and [registers NCCL user buffers](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#) for param and grad buffers. This option enables an SM-efficient NCCL algorithm that could improve the performance of overlapped computations. This flag will be much more effective when used together with [SHARP](https://docs.nvidia.com/networking/display/sharpv3130) if the FSDP communication includes both NVL and IB domains. Enabling this option will cause additional memory overhead due to the requirement to enable the `fsdp_double_buffer` option.
+
+- **Only effective when using Megatron-LM.**
+- Defaults to `False`.
+- By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registration.
+- **Incompatible with PyTorch's segmentable allocator:** Do not set `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` when using `--use-nccl-ub`, as this will cause a runtime error due to compatibility issues with the `torch.cuda.MemPool` API.
+
+## Checkpoint Conversion from 3D-Parallel to Megatron-FSDP
+
+Megatron-FSDP introduces `fsdp_dtensor`, a DTensor-based distributed checkpoint format that serves as its standard. To help you smoothly transition from 3D-Parallel to Megatron-FSDP, we provide a script for converting checkpoints from the `torch_dist` format to the `fsdp_dtensor` format. Using DeepSeek-V3 as an example, the detailed conversion process is described below.
+
+### Step 1: Generate 3D-Parallel Checkpoint with `param_to_param_group_map`
+
+Run your 3D-parallel + EP training script to generate a `torch_dist` checkpoint along with a directory containing `param_to_param_group_map` files. Add the following flag to your training script:
+
+```bash
+--dump-param-to-param-group-map /path/to/param_to_param_group_map
+```
+
+If you already have a `torch_dist` checkpoint, simply specify the `--dump-param-to-param-group-map /path/to/param_to_param_group_map` flag and run a very short experiment-this will create the `param_to_param_group_map` you need without full pretraining.
+
+### Step 2: Export `param_to_param_group_map` to a JSON File
+
+Convert the `param_to_param_group_map` into a JSON file for easier processing by running:
+
+```bash
+python tools/checkpoint/checkpoint_inspector.py print-torch-dcp-in-json /path/to/param_to_param_group_map
+```
+
+This will create a `param_to_param_group_map.json` file in the `/path/to/param_to_param_group_map` directory.
+
+### Step 3: Convert Checkpoint from `torch_dist` to `fsdp_dtensor`
+
+Convert your `torch_dist` checkpoint to the `fsdp_dtensor` format using the parameter to `param_to_param_group_map` JSON file:
+
+```bash
+torchrun --nproc_per_node=8 --nnodes=1 \
+    tools/checkpoint/checkpoint_inspector.py \
+    convert-torch-dist-to-fsdp-dtensor --swiglu \
+    /path/to/input_torch_dist_checkpoint \
+    /path/to/output_fsdp_dtensor_checkpoint \
+    --param-to-param-group-map-json /path/to/param_to_param_group_map.json
+```
+
+**Note:** For multi-node conversion tasks, please refer to the example script: [`sbatch_checkpoint_convert.sh`](./example-scripts/sbatch_checkpoint_convert.sh).
+
+### Step 4: Launch Megatron-FSDP Training
+
+Start your Megatron-FSDP training job using the converted `fsdp_dtensor` checkpoint.
diff --git a/docs/documentation.md b/docs/documentation.md
new file mode 100644
index 00000000000..d8bc23b2bae
--- /dev/null
+++ b/docs/documentation.md
@@ -0,0 +1,68 @@
+---
+orphan: true
+---
+
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
+# Documentation Development
+
+- [Documentation Development](#documentation-development)
+  - [Build the Documentation](#build-the-documentation)
+  - [Live Building](#live-building)
+  - [Documentation Version](#documentation-version)
+
+## Build the Documentation
+
+The following sections describe how to set up and build the NeMo RL documentation.
+
+Switch to the documentation source folder and generate HTML output.
+
+```sh
+cd docs/
+uv run --group docs sphinx-build . _build/html
+```
+
+- The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder.
+- The generated python API docs are placed in `apidocs` under the `docs/` folder.
+
+## Checking for Broken Links
+
+To check for broken http links in the docs, run this command:
+
+```sh
+cd docs/
+uv run --group docs sphinx-build --builder linkcheck . _build/linkcheck
+```
+
+It will output a JSON file at `_build/linkcheck/output.json` with links it found while building the
+docs. Records will have a status of `broken` if the link is not reachable. The `docs/conf.py` file is
+configured to ignore github links because the CI test will often experience rate limit errors.
+Comment out the `linkcheck_ignore` variable there to check all the links.
+
+## Live Building
+
+When writing documentation, it can be helpful to serve the documentation and have it update live while you edit.
+
+To do so, run:
+
+```sh
+cd docs/
+uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
+```
+
+Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output.
+
+## Documentation Version
+
+The three files below control the version switcher. Before you attempt to publish a new version of the documentation, update these files to match the latest version numbers.
+
+- docs/versions1.json
+- docs/project.json
+- docs/conf.py
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
new file mode 100644
index 00000000000..dd000500f58
--- /dev/null
+++ b/docs/get-started/install.md
@@ -0,0 +1,87 @@
+# Megatron Core Installation
+
+Installation is supported using Docker and pip.
+
+## System Requirements
+
+### Hardware Requirements
+
+- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
+- **Recommended**: NVIDIA Turing architecture or later
+
+### Software Requirements
+
+- **CUDA/cuDNN/NCCL**: Latest stable versions
+- **PyTorch**: Latest stable version
+- **Transformer Engine**: Latest stable version
+- **Python**: 3.12 recommended
+
+
+## Docker Installation (Recommended)
+
+We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing matrix. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
+
+**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
+
+This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
+
+- PyTorch (latest stable version)
+- CUDA, cuDNN, NCCL (latest stable versions)
+- Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
+- For best performance, use NVIDIA Turing GPU architecture generations and later
+
+```bash
+# Run container with mounted directories
+docker run --runtime --nvidia --gpus all -it --rm \
+  -v /path/to/megatron:/workspace/megatron \
+  -v /path/to/dataset:/workspace/dataset \
+  -v /path/to/checkpoints:/workspace/checkpoints \
+  -e PIP_CONSTRAINT= \
+  nvcr.io/nvidia/pytorch:25.04-py3
+```
+
+## Pip Installation
+
+Megatron Core installation offers support for two NGC PyTorch containers:
+
+- `dev`: Moving head that supports the most recent upstream dependencies
+- `lts`: Long-term support of NGC PyTorch 24.01
+
+Both containers can be combined with `mlm`, which adds package dependencies for Megatron-LM on top of Megatron Core.
+
+
+1. Install the latest release dependencies
+
+    ```bash
+    pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
+    pip install --no-build-isolation megatron-core[dev]
+    ```
+
+2. Next choose one of the following options:
+
+* For running an Megatron LM application
+
+        ```bash
+        pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
+        pip install --no-build-isolation megatron-core[mlm,dev]
+        ```
+* Install packages for LTS support NGC PyTorch 24.01
+
+        ```bash
+        pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
+        pip install --no-build-isolation megatron-core[lts]
+        ```
+
+* For running an Megatron LM application
+
+        ```bash
+        pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
+        pip install --no-build-isolation megatron-core[mlm,lts]
+        ```
+
+* For a version of Megatron Core with only Torch, run
+
+        ```bash
+        pip install megatron-core
+        ```
+
diff --git a/docs/get-started/overview.md b/docs/get-started/overview.md
new file mode 100644
index 00000000000..d705b45e349
--- /dev/null
+++ b/docs/get-started/overview.md
@@ -0,0 +1,84 @@
+# Overview
+
+Megatron-Core and Megatron-LM are open-source tools that are typically used together to train LLMs at scale across GPUs. Megatron-Core expands the capability of Megatron-LM. Megatron Bridge connects Megatron-Core and Megatron-LM to other popular training models, such as Hugging Face.
+
+## Megatron Core
+
+NVIDIA Megatron Core is a library of essential building blocks for highly efficient large-scale generative AI training. It can be used to train models with unparalleled speed at scale across thousands of GPUs. It provides an extensive set of tools for multimodal and speech AI. It expands Megatron LM capabilities.
+
+Megatron-Core contains GPU-optimized techniques featuring advanced parallelism strategies, optimizations like FP8 training, and support for the latest LLM, MoE, and multimodal architectures. It abstracts these techniques into composable and modular APIs.
+
+Megatron-Core is compatible with all NVIDIA Tensor Core GPUs and popular LLM architectures such as GPT, BERT, T5, and RETRO.
+
+
+**Composable library** with GPU-optimized building blocks for custom training frameworks.
+
+**Best for:**
+
+- **Framework developers** building on top of modular and optimized components
+- **Research teams** needing custom training loops, optimizers, or data pipelines
+- **ML engineers** requiring fault-tolerant training pipelines
+
+**What you get:**
+
+- Composable transformer building blocks (attention, MLP)
+- Advanced parallelism strategies (TP, PP, DP, EP, CP)
+- Pipeline schedules and distributed optimizers
+- Mixed precision support (FP16, BF16, FP8)
+- GPU-optimized kernels and memory management
+- High-performance dataloaders and dataset utilities
+- Model architectures (LLaMA, Qwen, GPT, Mixtral, Mamba)
+
+## Megatron-LM
+
+Megatron-LM is a reference implementation, with a lightweight large-scale LLM training framework. It offers a customizable native PyTorch training loop with fewer abstraction layers. It was designed for scaling transformer models to the multi-billion and trillion-parameter regimes under realistic memory and compute constraints. **It serves as a straightforward entry point for exploring Megatron-Core.**
+
+It uses advanced parallelization techniques including model parallelism (tensor and pipeline), to allow models with billions of parameters to fit and train across large GPU clusters. It enables breakthroughs in large-scale NLP tasks. It splits model computations across many GPUs, overcoming single-GPU memory limits for training huge models, like GPT-style transformers.  
+
+
+**Reference implementation** that includes Megatron Core plus everything needed to train models.
+
+**Best for:**
+
+- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
+- **Research teams** exploring new architectures and training techniques
+- **Learning distributed training** concepts and best practices
+- **Quick experimentation** with proven model configurations
+
+**What you get:**
+
+- Pre-configured training scripts for GPT, LLaMA, DeepSeek, Qwen, and more.
+- End-to-end examples from data prep to evaluation
+- Research-focused tools and utilities
+
+
+
+## Megatron Bridge
+
+Megatron Bridge provides out-of-the-box bridges and training recipes for models built on top of base model architectures from Megatron Core.  
+
+Megatron Bridge provides a robust, parallelism-aware pathway to convert models and checkpoints. This bidirectional converter performs on-the-fly, model-parallel-aware, per-parameter conversion, and full in-memory loading.
+
+After training or modifying a Megatron model, you can convert it again for deployment or sharing.  
+
+[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)
+
+
+
+## Ecosystem Libraries
+
+**Libraries used by Megatron Core:**
+
+- **[Megatron Energon](https://github.com/NVIDIA/Megatron-Energon)** - Multi-modal data loader (text, images, video, audio) with distributed loading and dataset blending
+- **[Transformer Engine](https://github.com/NVIDIA/TransformerEngine)** - Optimized kernels and FP8 mixed precision support
+- **[Resiliency Extension (NVRx)](https://github.com/NVIDIA/nvidia-resiliency-ext)** - Fault tolerant training with failure detection and recovery
+
+**Libraries using Megatron Core:**
+
+- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
+- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
+- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Checkout end-to-end examples in [examples/post_training/modelopt](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt).
+
+**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+
diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md
new file mode 100644
index 00000000000..61868e7877c
--- /dev/null
+++ b/docs/get-started/quickstart.md
@@ -0,0 +1,73 @@
+# Quick Start
+
+## Quick Installation
+
+Install Megatron Core with pip:
+
+1. Install Megatron Core with required dependencies:
+
+    ```bash
+    pip install --no-build-isolation megatron-core[mlm,dev]
+    ```
+
+2. Clone repository for examples:
+
+    ```bash
+    git clone https://github.com/NVIDIA/Megatron-LM.git
+    cd Megatron-LM
+    pip install --no-build-isolation .[mlm,dev]
+    ```
+
+That's it! You're ready to start training.
+
+## Your First Training Run
+
+### Simple Training Example
+
+```bash
+# Distributed training example (2 GPUs, mock data)
+torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
+```
+
+### LLaMA-3 Training Example
+
+```bash
+# 8 GPUs, FP8 precision, mock data
+./examples/llama/train_llama3_8b_fp8.sh
+```
+
+## Data Preparation
+
+### JSONL Data Format
+
+```json
+{"text": "Your training text here..."}
+{"text": "Another training sample..."}
+```
+
+### Basic Preprocessing
+
+```bash
+python tools/preprocess_data.py \
+    --input data.jsonl \
+    --output-prefix processed_data \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model /path/to/tokenizer.model \
+    --workers 8 \
+    --append-eod
+```
+
+### Key Arguments
+
+- `--input`: Path to input JSON/JSONL file
+- `--output-prefix`: Prefix for output binary files (.bin and .idx)
+- `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.)
+- `--tokenizer-model`: Path to tokenizer model file
+- `--workers`: Number of parallel workers for processing
+- `--append-eod`: Add end-of-document token
+
+## Next Steps
+
+- Explore [Parallelism Strategies](../user-guide/parallelism-guide.md) to scale your training
+- Learn about [Data Preparation](../user-guide/data-preparation.md) best practices
+- Check out [Advanced Features](../user-guide/features/index.md) for advanced capabilities
diff --git a/docs/get-started/releasenotes.md b/docs/get-started/releasenotes.md
new file mode 100644
index 00000000000..e2d77cf0070
--- /dev/null
+++ b/docs/get-started/releasenotes.md
@@ -0,0 +1,10 @@
+# Release Notes
+
+
+## Roadmaps
+
+Stay up-to-date with our development roadmaps and planned features:
+
+- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
diff --git a/docs/source/images/context_parallel/CP_overview.png b/docs/images/context_parallel/CP_overview.png
similarity index 100%
rename from docs/source/images/context_parallel/CP_overview.png
rename to docs/images/context_parallel/CP_overview.png
diff --git a/docs/source/images/context_parallel/CP_results.png b/docs/images/context_parallel/CP_results.png
similarity index 100%
rename from docs/source/images/context_parallel/CP_results.png
rename to docs/images/context_parallel/CP_results.png
diff --git a/docs/source/images/custom_fsdp/FSDP_Allreduce.png b/docs/images/custom_fsdp/FSDP_Allreduce.png
similarity index 100%
rename from docs/source/images/custom_fsdp/FSDP_Allreduce.png
rename to docs/images/custom_fsdp/FSDP_Allreduce.png
diff --git a/docs/source/images/custom_fsdp/FSDP_workflow.png b/docs/images/custom_fsdp/FSDP_workflow.png
similarity index 100%
rename from docs/source/images/custom_fsdp/FSDP_workflow.png
rename to docs/images/custom_fsdp/FSDP_workflow.png
diff --git a/docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png b/docs/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png
similarity index 100%
rename from docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png
rename to docs/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png
diff --git a/docs/source/images/distrib_optimizer/data_flow.png b/docs/images/distrib_optimizer/data_flow.png
similarity index 100%
rename from docs/source/images/distrib_optimizer/data_flow.png
rename to docs/images/distrib_optimizer/data_flow.png
diff --git a/docs/source/images/distrib_optimizer/sharding_scheme.png b/docs/images/distrib_optimizer/sharding_scheme.png
similarity index 100%
rename from docs/source/images/distrib_optimizer/sharding_scheme.png
rename to docs/images/distrib_optimizer/sharding_scheme.png
diff --git a/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png
new file mode 100644
index 00000000000..6c8afa78bb1
Binary files /dev/null and b/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png differ
diff --git a/docs/source/images/moe/token_drop.png b/docs/images/moe/token_drop.png
similarity index 100%
rename from docs/source/images/moe/token_drop.png
rename to docs/images/moe/token_drop.png
diff --git a/docs/source/images/multi_token_prediction/MTP_implementation.png b/docs/images/multi_token_prediction/MTP_implementation.png
similarity index 100%
rename from docs/source/images/multi_token_prediction/MTP_implementation.png
rename to docs/images/multi_token_prediction/MTP_implementation.png
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000000..ee321702528
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,97 @@
+# Megatron Core User Guide
+
+**Megatron Core** is a GPU-optimized library for training large language models at scale. It provides modular, composable building blocks for creating custom training frameworks with state-of-the-art parallelism strategies and performance optimizations.
+
+Megatron Core offers a flexible, reusable foundation for building large-scale transformer training systems. **Megatron-LM** serves as a reference implementation demonstrating how to use Megatron Core components to train models with billions to trillions of parameters across distributed GPU clusters.
+
+## Key Features
+
+* Composable transformer building blocks (attention, MLP, etc.)
+* Advanced parallelism strategies (TP, PP, DP, EP, CP)
+* Pipeline schedules and distributed optimizers
+* Mixed precision support (FP16, BF16, FP8)
+* GPU-optimized kernels and memory management
+* High-performance dataloaders and dataset utilities
+* Model architectures (LLaMA, Qwen, DeepSeek, GPT, Mamba, etc.)
+
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: About Megatron Core
+
+get-started/overview
+get-started/releasenotes
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Get Started
+
+get-started/quickstart
+get-started/install
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Basic Usage
+
+user-guide/data-preparation
+user-guide/training-examples
+user-guide/parallelism-guide
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Supported Models
+
+models/index
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Advanced Features
+
+user-guide/features/moe
+user-guide/features/context_parallel
+user-guide/features/custom_fsdp
+user-guide/features/dist_optimizer
+user-guide/features/optimizer_cpu_offload
+user-guide/features/pipeline_parallel_layout
+user-guide/features/fine_grained_activation_offloading
+user-guide/features/megatron_energon
+user-guide/features/megatron_rl
+user-guide/features/tokenizers
+```
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Developer Guide
+
+developer/contribute
+developer/submit
+developer/oncall
+developer/generate_docs
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Discussions
+
+advanced/index
+```
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: API Reference
+
+api-guide/index
+apidocs/index.rst
+```
\ No newline at end of file
diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md
index 5dd61866e87..8509ce6f47c 100644
--- a/docs/llama_mistral.md
+++ b/docs/llama_mistral.md
@@ -1,8 +1,8 @@
 # Llama, Mistral and other Llama-like model support in Megatron-LM
 
-NOTE: In order to simplify code we now only support converting llama-3.x and mistral checkpoints downloaded from Huggingface.
+NOTE: In order to simplify code we now only support converting llama-3.x and mistral checkpoints downloaded from Hugging Face. For converting other models, see [Megatron Bridge](models/index.md).
 
-The [Llama-2](https://ai.meta.com/llama/) and [Llama-3.x](https://llama.meta.com/) family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see https://arxiv.org/pdf/2307.09288.pdf and https://ai.meta.com/blog/meta-llama-3/).
+The Llama-2 and Llama-3.x family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see <https://arxiv.org/pdf/2307.09288.pdf>).
 
 Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results.
 
@@ -28,20 +28,19 @@ Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatr
     - [MMLU](#mmlu)
 - [Llama-3.x](#llama-3x)
   - [Download Huggingface checkpoints](#download-huggingface-checkpoints)
-  - [Convert checkpoint format](#convert-checkpoint-format-1)
-    - [Huggingface format](#huggingface-format-1)
+  - [Convert checkpoint format](#convert-checkpoint-format)
+    - [Huggingface format](#huggingface-format)
   - [(Optional) Validate checkpoints](#optional-validate-checkpoints)
-  - [Launch model](#launch-model-1)
+  - [Launch model](#launch-model)
 - [Mistral-7b](#mistral-7b)
-  - [Download Huggingface checkpoints](#download-huggingface-checkpoints-2)
-  - [Convert checkpoint format](#convert-checkpoint-format-3)
-  - [(Optional) Validate checkpoints](#optional-validate-checkpoints-2)
-  - [Launch model](#launch-model-3)
+  - [Download Huggingface checkpoints](#download-huggingface-checkpoints)
+  - [Convert checkpoint format](#convert-checkpoint-format)
+  - [(Optional) Validate checkpoints](#optional-validate-checkpoints)
+  - [Launch model](#launch-model)
 - [Other Llama-like model support](#other-llama-like-model-support)
 - [Known numerical differences](#known-numerical-differences)
 - [Using legacy model format](#using-legacy-model-format)
 
-
 # Llama-2
 
 Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps:
@@ -54,7 +53,7 @@ The following sections detail these steps. The final section lists benchmark res
 
 ## Download Meta or Huggingface checkpoints
 
-Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next.
+Users must first apply for access to download the Llama-2 checkpoints either directly [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next.
 
 ## Convert checkpoint format
 
@@ -140,11 +139,11 @@ If loading for either inference or finetuning, use the following arguments:
 
 ### Launch Meta
 
-Meta checkpoints can be launched with: https://github.com/facebookresearch/llama
+Meta checkpoints can be launched with: <https://github.com/facebookresearch/llama>
 
 ### Launch Huggingface
 
-Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+Huggingface checkpoints can be launched with: <https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py>
 
 ## Benchmark results
 
@@ -428,7 +427,7 @@ Many models such as Yi-34B and Qwen2.x use the Llama architecture and may be con
 
 It is not expected that the megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list:
 
-1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: https://github.com/NVIDIA/TransformerEngine/issues/1132
+1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: <https://github.com/NVIDIA/TransformerEngine/issues/1132>
 2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas Megatron core combines them into a single GEMM for efficiency. This leads to small numerical differences.
 
 # Using legacy model format
diff --git a/docs/models/index.md b/docs/models/index.md
new file mode 100644
index 00000000000..6fabd1f582c
--- /dev/null
+++ b/docs/models/index.md
@@ -0,0 +1,17 @@
+# Supported Models
+
+Megatron Core supports a wide range of language and multimodal models with optimized implementations for large-scale training.
+
+## Model Conversion
+
+For converting HuggingFace models to Megatron format, use [Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge), the official standalone converter. Megatron Bridge supports an extensive list of models including LLaMA, Mistral, Mixtral, Qwen, DeepSeek, Gemma, Phi, Nemotron, and many more.
+
+See the [Megatron Bridge supported models list](https://github.com/NVIDIA-NeMo/Megatron-Bridge?tab=readme-ov-file#supported-models) for the complete and up-to-date list of supported models.
+
+```{toctree}
+:maxdepth: 1
+
+llms
+multimodal
+../llama_mistral
+```
diff --git a/docs/models/llms.md b/docs/models/llms.md
new file mode 100644
index 00000000000..6789a4c551c
--- /dev/null
+++ b/docs/models/llms.md
@@ -0,0 +1,50 @@
+# Language Models
+
+Megatron Core supports the following language model architectures for large-scale training.
+
+## Converting HuggingFace Models
+
+Use [**Megatron Bridge**](https://github.com/NVIDIA-NeMo/Megatron-Bridge) to convert HuggingFace models to Megatron format. Megatron Bridge is the official standalone converter with support for an extensive list of models including LLaMA, Mistral, Mixtral, Qwen, DeepSeek, Gemma, Phi, Nemotron, and many more.
+
+See the [Megatron Bridge supported models list](https://github.com/NVIDIA-NeMo/Megatron-Bridge?tab=readme-ov-file#supported-models) for the complete and up-to-date list.
+
+## Decoder-Only Models
+
+| Model | Description | Key Features |
+|-------|-------------|--------------|
+| **GPT** | Generative Pre-trained Transformer | Standard autoregressive LM, foundational architecture |
+| **LLaMA** | Meta's LLaMA family | Efficient architecture with RoPE, SwiGLU, RMSNorm |
+| **Mistral** | Mistral AI models | Sliding window attention, efficient inference |
+| **Mixtral** | Sparse Mixture-of-Experts | 8x7B MoE architecture for efficient scaling |
+| **Qwen** | Alibaba's Qwen series | HuggingFace integration, multilingual support |
+| **Mamba** | State Space Model | Subquadratic sequence length scaling, efficient long context |
+
+## Encoder-Only Models
+
+| Model | Description | Key Features |
+|-------|-------------|--------------|
+| **BERT** | Bidirectional Encoder Representations | Masked language modeling, classification tasks |
+
+## Encoder-Decoder Models
+
+| Model | Description | Key Features |
+|-------|-------------|--------------|
+| **T5** | Text-to-Text Transfer Transformer | Unified text-to-text framework, sequence-to-sequence |
+
+## Example Scripts
+
+Training examples for these models can be found in the `examples/` directory:
+- `examples/gpt3/` - GPT-3 training scripts
+- `examples/llama/` - LLaMA training scripts
+- `examples/mixtral/` - Mixtral MoE training
+- `examples/mamba/` - Mamba training scripts
+- `examples/bert/` - BERT training scripts
+- `examples/t5/` - T5 training scripts
+
+## Model Implementation
+
+All language models are built using Megatron Core's composable transformer blocks, enabling:
+- Flexible parallelism strategies (TP, PP, DP, EP, CP)
+- Mixed precision training (FP16, BF16, FP8)
+- Distributed checkpointing
+- Efficient memory management
diff --git a/docs/models/multimodal.md b/docs/models/multimodal.md
new file mode 100644
index 00000000000..66ed8ccd9cb
--- /dev/null
+++ b/docs/models/multimodal.md
@@ -0,0 +1,61 @@
+# Multimodal Models
+
+Megatron Core supports multimodal models that combine language with vision, audio, and other modalities for comprehensive multimodal understanding.
+
+## MIMO: Multimodal In/Out Framework
+
+**MIMO (Multimodal In/Out Model)** is an experimental framework in Megatron Core that supports arbitrary combinations of modalities including vision, audio, and text. MIMO provides a flexible architecture for building custom multimodal models.
+
+> **Note**: MIMO is experimental and under active development. The API may change in future releases.
+
+**Key Features:**
+- Arbitrary modality combinations (vision, audio, text, etc.)
+- Flexible encoder architecture for different input modalities
+- Unified embedding space across modalities
+- Support for both vision-language and audio-vision-language models
+
+See [examples/mimo](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mimo) for training scripts and examples.
+
+## Vision-Language Models
+
+| Model | Description | Vision Encoder | Language Model |
+|-------|-------------|----------------|----------------|
+| **LLaVA** | Visual instruction tuning | CLIP ViT-L/14 | Mistral-7B / LLaMA |
+| **NVLM** | NVIDIA Vision-Language Model | CLIP / Custom ViT | LLaMA-based |
+| **LLaMA 3.1 Nemotron Nano VL** | Efficient multimodal model | Vision Transformer | LLaMA 3.1 8B |
+
+## Vision Encoders
+
+| Model | Description | Key Features |
+|-------|-------------|--------------|
+| **CLIP ViT** | OpenAI's CLIP Vision Transformer | Image-text alignment, multiple scales (L/14@336px) |
+| **RADIO** | Resolution-Agnostic Dynamic Image Optimization | Flexible resolution handling, efficient vision encoding |
+
+## Diffusion Models
+
+For multimodal diffusion models (image generation, text-to-image, etc.), see [NeMo Diffusion Models](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/diffusion). NeMo provides production-ready implementations of:
+- Stable Diffusion variants
+- Text-to-image generation
+- Image-to-image translation
+- ControlNet and other conditioning mechanisms
+
+## Multimodal Features
+
+- **Image-Text Alignment**: Pre-training on image-caption pairs
+- **Visual Instruction Tuning**: Fine-tuning on instruction-following datasets
+- **Flexible Vision Encoders**: Support for different ViT architectures and resolutions
+- **Combined Checkpointing**: Unified checkpoints combining vision and language models
+- **Efficient Training**: Full parallelism support (TP, PP, DP) for both vision and language components
+
+## Example Scripts
+
+Multimodal training examples can be found in the following directories:
+
+**MIMO Framework:**
+- `examples/mimo/` - Multimodal In/Out training with support for vision-language and audio-vision-language models
+
+**Specific Multimodal Models:**
+- `examples/multimodal/` - LLaVA-style training with Mistral + CLIP
+- `examples/multimodal/nvlm/` - NVLM training scripts
+- `examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/` - Nemotron VL training
+- `examples/multimodal/radio/` - RADIO vision encoder integration
diff --git a/docs/project.json b/docs/project.json
new file mode 100644
index 00000000000..fb24596d03b
--- /dev/null
+++ b/docs/project.json
@@ -0,0 +1,2 @@
+{"name": "megatron-lm", "version": "0.16.0"}
+
diff --git a/docs/source/api-guide/datasets.rst b/docs/source/api-guide/datasets.rst
deleted file mode 100644
index 247a3f07d3f..00000000000
--- a/docs/source/api-guide/datasets.rst
+++ /dev/null
@@ -1,104 +0,0 @@
-datasets package
-================
-
-.. mdinclude :: ../../../megatron/core/datasets/readme.md
-
-Submodules
-----------
-
-datasets.blended\_megatron\_dataset\_config module
----------------------------------------------------
-
-.. automodule:: core.datasets.blended_megatron_dataset_config
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.blended\_megatron\_dataset\_builder module
----------------------------------------------------
-
-.. automodule:: core.datasets.blended_megatron_dataset_builder
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.megatron\_tokenizer module
------------------------------------
-
-.. automodule:: core.datasets.megatron_tokenizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.indexed\_dataset module
---------------------------------
-
-.. automodule:: core.datasets.indexed_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.megatron\_dataset module
----------------------------------
-
-.. automodule:: core.datasets.megatron_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.gpt\_dataset module
-----------------------------
-
-.. automodule:: core.datasets.gpt_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.masked\_dataset module
--------------------------------
-
-.. automodule:: core.datasets.masked_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.bert\_dataset module
------------------------------
-
-.. automodule:: core.datasets.bert_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.t5\_dataset module
----------------------------
-
-.. automodule:: core.datasets.t5_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.blended\_dataset module
-----------------------------------
-
-.. automodule:: core.datasets.blended_dataset
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-datasets.utils module
----------------------
-
-.. automodule:: core.datasets.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.datasets
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
diff --git a/docs/source/api-guide/dist_checkpointing.rst b/docs/source/api-guide/dist_checkpointing.rst
deleted file mode 100644
index 3b683b0b3bd..00000000000
--- a/docs/source/api-guide/dist_checkpointing.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-dist\_checkpointing package
-===========================
-
-A library for saving and loading the distributed checkpoints.
-A "distributed checkpoint" can have various underlying formats (current default format is based on Zarr)
-but has a distinctive property - the checkpoint saved in one parallel configuration (tensor/pipeline/data parallelism)
-can be loaded in a different parallel configuration.
-
-Using the library requires defining sharded state_dict dictionaries with functions from  *mapping* and *optimizer* modules.
-Those state dicts can be saved or loaded with a *serialization* module using strategies from *strategies* module.
-
-Safe Checkpoint Loading
------------------------
-
-Since **PyTorch 2.6**, the default behavior of `torch.load` is `weights_only=True`.
-This ensures that only tensors and allow-listed classes are loaded, reducing the risk of arbitrary code execution.
-
-If you encounter an error such as:
-
-.. code-block:: bash
-
-   WeightsUnpickler error: Unsupported global: GLOBAL argparse.Namespace was not an allowed global by default.
-
-you can fix it by explicitly allow-listing the missing class in your script:
-
-.. code-block:: python
-
-   import torch, argparse
-
-   torch.serialization.add_safe_globals([argparse.Namespace])
-
-
-Subpackages
------------
-
-.. toctree::
-   :maxdepth: 4
-
-   dist_checkpointing.strategies
-
-Submodules
-----------
-
-dist\_checkpointing.serialization module
-----------------------------------------
-
-.. automodule:: core.dist_checkpointing.serialization
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.mapping module
-----------------------------------
-
-.. automodule:: core.dist_checkpointing.mapping
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.optimizer module
-------------------------------------
-
-.. automodule:: core.dist_checkpointing.optimizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.core module
--------------------------------
-
-.. automodule:: core.dist_checkpointing.core
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.dict\_utils module
---------------------------------------
-
-.. automodule:: core.dist_checkpointing.dict_utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-
-dist\_checkpointing.utils module
---------------------------------
-
-.. automodule:: core.dist_checkpointing.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.dist_checkpointing
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/dist_checkpointing.strategies.rst b/docs/source/api-guide/dist_checkpointing.strategies.rst
deleted file mode 100644
index 41e674c761e..00000000000
--- a/docs/source/api-guide/dist_checkpointing.strategies.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-dist\_checkpointing.strategies package
-======================================
-
-Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies).
-
-Strategies can be used for implementing new checkpoint formats or implementing new (more optimal for a given use case) ways of saving/loading of existing formats.
-Strategies are passed to `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure.
-
-Submodules
-----------
-
-dist\_checkpointing.strategies.base module
-------------------------------------------
-
-.. automodule:: core.dist_checkpointing.strategies.base
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.strategies.tensorstore module
--------------------------------------------------
-
-.. automodule:: core.dist_checkpointing.strategies.tensorstore
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.strategies.two\_stage module
-------------------------------------------------
-
-.. automodule:: core.dist_checkpointing.strategies.two_stage
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-dist\_checkpointing.strategies.zarr module
-------------------------------------------
-
-.. automodule:: core.dist_checkpointing.strategies.zarr
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.dist_checkpointing.strategies
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/distributed.rst b/docs/source/api-guide/distributed.rst
deleted file mode 100644
index 737820331c1..00000000000
--- a/docs/source/api-guide/distributed.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-distributed package
-===================
-
-This package contains various utilities to finalize model weight gradients
-on each rank before the optimizer step. This includes a distributed data
-parallelism wrapper to all-reduce or reduce-scatter the gradients across
-data-parallel replicas, and a `finalize\_model\_grads` method to
-synchronize gradients across different parallelism modes (e.g., 'tied'
-layers on different pipeline stages, or gradients for experts in a MoE on
-different ranks due to expert parallelism).
-
-Submodules
-----------
-
-distributed.distributed\_data\_parallel
----------------------------------------
-
-Model wrapper for distributed data parallelism. Stores gradients in a
-contiguous buffer, and supports the option of overlapping communication
-(all-reduce or reduce-scatter) with backprop computation by breaking up
-full model's gradients into smaller buckets and running all-reduce /
-reduce-scatter on each bucket asynchronously. 
-
-.. automodule:: core.distributed.distributed_data_parallel
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-distributed.finalize\_model\_grads
-----------------------------------
-
-Finalize model gradients for optimizer step across all used parallelism modes.
-Synchronizes the all-reduce / reduce-scatter of model gradients across DP replicas,
-all-reduces the layernorm gradients for sequence parallelism, embedding gradients
-across first and last pipeline stages (if not tied), and expert gradients for expert
-parallelism.
-
-.. automodule:: core.distributed.finalize_model_grads
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-
-Module contents
----------------
-
-Contains functionality to synchronize gradients across different ranks before
-optimizer step.
-
-.. automodule:: core.distributed
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst
deleted file mode 100644
index 22782ca84ec..00000000000
--- a/docs/source/api-guide/fusions.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-fusions package
-===============
-
-This package provides modules that provide commonly fused
-operations. Fusing operations improves compute efficiency by
-increasing the amount of work done each time a tensor is read from
-memory. To perform the fusion, modules in this either rely on PyTorch
-functionality for doing just-in-time compilation
-(i.e. `torch.jit.script` in older PyTorch versions of `torch.compile`
-in recent versions), or call into custom kernels in external libraries
-such as Apex or TransformerEngine.
-
-Submodules
-----------
-
-fusions.fused\_bias\_dropout module
------------------------------------
-
-This module uses PyTorch JIT to fuse the bias add and dropout operations. Since dropout is not used during inference, different functions are used when in train mode and when in inference mode.
-
-.. automodule:: core.fusions.fused_bias_dropout
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-fusions.fused\_bias\_gelu module
---------------------------------
-
-This module uses PyTorch JIT to fuse the bias add and GeLU nonlinearity operations.
-
-.. automodule:: core.fusions.fused_bias_gelu
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-fusions.fused\_layer\_norm module
----------------------------------
-
-This module provides a wrapper around various fused LayerNorm implementation in Apex.
-
-.. automodule:: core.fusions.fused_layer_norm
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-fusions.fused\_softmax module
------------------------------
-
-This module provides wrappers around variations of Softmax in Apex.
-
-.. automodule:: core.fusions.fused_softmax
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-fusions.fused\_cross\_entropy\_loss module
-------------------------------------------
-
-This module uses PyTorch JIT to fuse the cross entropy loss calculation and batches communication calls.
-
-.. automodule:: core.fusions.fused_cross_entropy
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst
deleted file mode 100644
index 710a7caf4de..00000000000
--- a/docs/source/api-guide/index.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-API Guide
-=========
-
-.. toctree::
-   :maxdepth: 4
-
-   models
-   tensor_parallel
-   context_parallel
-   pipeline_parallel
-   custom_fsdp
-   fusions
-   transformer
-   moe
-   dist_checkpointing
-   dist_optimizer
-   distributed
-   datasets
-   multi_latent_attention
-   num_microbatches_calculator
-   optimizer_param_scheduler
-   optimizer_cpu_offload
-   multi_token_prediction
-   tokenizers
diff --git a/docs/source/api-guide/models.bert.rst b/docs/source/api-guide/models.bert.rst
deleted file mode 100644
index 1b562ce72c8..00000000000
--- a/docs/source/api-guide/models.bert.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-models.bert package
-===================
-Useful package for training bert and bert like encoder only models. It optionally comes with a binary head that can be used for classification tasks . 
-
-Submodules
-----------
-
-models.bert.bert\_model module
-------------------------------
-
-.. automodule:: core.models.bert.bert_model
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.models.bert
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/models.gpt.rst b/docs/source/api-guide/models.gpt.rst
deleted file mode 100644
index 31c4da6a9c1..00000000000
--- a/docs/source/api-guide/models.gpt.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-models.gpt package
-==================
-This is the implementation of the popular GPT model. It supports several features like model parallelization (Tensor Parallel, Pipeline Parallel, Data Parallel) , mixture of experts, FP8 , Distributed optimizer etc. We are constantly adding new features. So be on the lookout or raise an issue if you want to have something added. 
-
-Submodules
-----------
-
-models.gpt.gpt\_model module
-----------------------------
-
-.. automodule:: core.models.gpt.gpt_model
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.models.gpt
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/models.rst b/docs/source/api-guide/models.rst
deleted file mode 100644
index 12c40e4f350..00000000000
--- a/docs/source/api-guide/models.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-models package
-==============
-This package contains most of the popular LLMs . Currently we have support for GPT, Bert, T5 and Retro . This is an ever growing list so keep an eye out. 
-
-Subpackages
------------
-
-.. toctree::
-   :maxdepth: 4
-
-   models.gpt
-   models.t5
-   models.bert
-
-Module contents
----------------
-
-.. automodule:: core.models
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/models.t5.rst b/docs/source/api-guide/models.t5.rst
deleted file mode 100644
index 1cc33156821..00000000000
--- a/docs/source/api-guide/models.t5.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-models.t5 package
-=================
-
-Submodules
-----------
-
-models.t5.t5\_model module
---------------------------
-
-.. automodule:: core.models.T5.t5_model
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.models.T5
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/moe.rst b/docs/source/api-guide/moe.rst
deleted file mode 100644
index 9afc01e080b..00000000000
--- a/docs/source/api-guide/moe.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Mixture of Experts package
-==========================
-
-.. mdinclude :: ../../../megatron/core/transformer/moe/README.md
diff --git a/docs/source/api-guide/num_microbatches_calculator.rst b/docs/source/api-guide/num_microbatches_calculator.rst
deleted file mode 100644
index 4790b317495..00000000000
--- a/docs/source/api-guide/num_microbatches_calculator.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Microbatches Calculator
-=======================
-This api is used to calculate the number of microbatches required to fit a given model on a given batch size.
-
-
-Module contents
----------------
-
-.. automodule:: core.num_microbatches_calculator
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/optimizer_cpu_offload.rst b/docs/source/api-guide/optimizer_cpu_offload.rst
deleted file mode 100644
index fdbae6654bd..00000000000
--- a/docs/source/api-guide/optimizer_cpu_offload.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Optimizer CPU offload package
-==============================
-
-.. mdinclude :: ../../../megatron/core/optimizer/cpu_offloading/README.md
diff --git a/docs/source/api-guide/optimizer_param_scheduler.rst b/docs/source/api-guide/optimizer_param_scheduler.rst
deleted file mode 100644
index caf5d8abfb4..00000000000
--- a/docs/source/api-guide/optimizer_param_scheduler.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Optimizer Parameters Scheduler
-==============================
-This api is used to calculate the learning rate and weight decay for the optimizer.
-
-
-Module contents
----------------
-
-.. automodule:: core.optimizer_param_scheduler
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/pipeline_parallel.rst b/docs/source/api-guide/pipeline_parallel.rst
deleted file mode 100644
index 0c1909d9031..00000000000
--- a/docs/source/api-guide/pipeline_parallel.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-pipeline\_parallel package
-==========================
-
-This package contains implementations for two different pipeline parallelism
-schedules (one without interleaving and one with interleaving, see `Efficient
-Large-Scale Language Model Training on GPU Clusters Using Megatron-LM <https://arxiv.org/abs/2104.04473>`_
-for details), and a default no-pipelining schedule. It also contains methods
-for the point-to-point communication that is needed between pipeline stages.
-
-Submodules
-----------
-
-.. mdinclude:: pipeline_parallel_layout.md
-
-pipeline\_parallel.p2p\_communication module
---------------------------------------------
-
-Contains implementations for the various point-to-point communication needed
-(e.g., `recv_forward` and `recv_backward`) in the different pipeline parallelism
-schedules.
-
-.. automodule:: core.pipeline_parallel.p2p_communication
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-pipeline\_parallel.schedules module
------------------------------------
-
-Contains implementations for two pipeline parallelism schedules
-(`forward_backward_pipelining_with_interleaving`for pipeline parallelism with
-interleaving, `forward_backward_pipelining_without_interleaving` for pipeline
-parallelism without interleaving) and a default no-pipelining schedule
-(`forward_backward_no_pipelining`). `get_forward_backward_func` returns the right
-scheduling function to use based on the configuration being trained
-(e.g., if pipeline-parallel size is 1, use `forward_backward_no_pipelining`).
-
-.. automodule:: core.pipeline_parallel.schedules
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.pipeline_parallel
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/tensor_parallel.rst b/docs/source/api-guide/tensor_parallel.rst
deleted file mode 100644
index d8ae9dea222..00000000000
--- a/docs/source/api-guide/tensor_parallel.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-tensor\_parallel package
-========================
-
-This package contains an implementation for tensor parallelism in transformer
-models (see `Megatron-LM: Training Multi-Billion Parameter Language Models
-Using Model Parallelism <https://arxiv.org/abs/1909.08053>`_ and `Reducing
-Activation Recomputation in Large Transformer Models <https://arxiv.org/abs/2205.05198>`_
-for details).
-
-Submodules
-----------
-
-tensor\_parallel.cross\_entropy module
---------------------------------------
-
-.. automodule:: core.tensor_parallel.cross_entropy
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-tensor\_parallel.data module
-----------------------------
-
-.. automodule:: core.tensor_parallel.data
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-tensor\_parallel.layers module
-------------------------------
-
-.. automodule:: core.tensor_parallel.layers
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-tensor\_parallel.mappings module
---------------------------------
-
-.. automodule:: core.tensor_parallel.mappings
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-tensor\_parallel.random module
-------------------------------
-
-.. automodule:: core.tensor_parallel.random
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-tensor\_parallel.utils module
------------------------------
-
-.. automodule:: core.tensor_parallel.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.tensor_parallel
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api-guide/tokenizers.md b/docs/source/api-guide/tokenizers.md
deleted file mode 100644
index 5aaf9866f1e..00000000000
--- a/docs/source/api-guide/tokenizers.md
+++ /dev/null
@@ -1,137 +0,0 @@
-# New Tokenizer System
-
-## Key Differences from the Old Tokenizer System
-
-### 1. Hugging Face–style API
-
-We now have a `MegatronTokenizer` class that provides a familiar, simple API similar to Hugging Face’s:
-
-`.from_pretrained()` – Load a tokenizer from a directory or file, automatically detecting the type and settings.
-
-`.write_metadata()` – Save tokenizer configuration (metadata) so that it can be reused without re-specifying parameters.
-
-This eliminates the need for long initialization arguments and hard-coded settings in training scripts.
-
-### 2. Tokenizer Metadata
-
-A metadata file (JSON) now stores all essential tokenizer configuration in one place:
- - Tokenizer library (e.g., HuggingFace, SentencePiece, TikToken, etc.)
- - Chat templates
- - Tokenizer class
-
-Benefits:
- - You only need to set these parameters once.
- - No more passing multiple CLI arguments for tokenizer settings.
- - Easy sharing — just copy the tokenizer directory with its metadata file.
-
-### 3. Library Classes Are Now Internal
-
-In the old system, you had to know which tokenizer library to use (`SentencePieceTokenizer`, `HuggingFaceTokenizer`, etc.) and instantiate it manually.
-
-In the new system:
- - The library is automatically detected from the metadata.
- - The correct tokenizer implementation is chosen under the hood.
- - Users don’t need to manually manage tokenizer classes.
-
-### 3. Support for Model-specific Tokenizer Classes
-
-The system now supports:
- - Built-in LLM-specific tokenizers. 
- - Custom tokenizers: You can create your own tokenizer class by inheriting from `MegatronTokenizerText` and specify it in the `tokenizer_class` field in the metadata file.
- - This allows advanced customization while keeping defaults simple for most users.
-
-### 4. Usage
-
-**Creating and Saving Metadata**
-
-```python
-from megatron.core.tokenizers import MegatronTokenizer
-
-# The metadata will be stored as a file named tokenizer_metadata.json inside the tokenizer’s directory.
-MegatronTokenizer.write_metadata(
-    tokenizer_path="/path/to/tokenizer.model",
-    tokenizer_library="sentencepiece",
-    chat_template="chat template in jinja format",
-)
-
-# To use custom tokenizer class
-from megatron.core.tokenizers.text import MegatronTokenizerText
-
-class CustomTokenizer(MegatronTokenizerText):
-    ...
-
-MegatronTokenizer.write_metadata(
-    tokenizer_path="/path/to/tokenizer.model",
-    tokenizer_library="sentencepiece",
-    chat_template="chat template in jinja format",
-    tokenizer_class=CustomTokenizer,
-)
-
-# To save metadata to another dir
-MegatronTokenizer.write_metadata(
-    tokenizer_path="/path/to/tokenizer.model",
-    tokenizer_library="sentencepiece",
-    metadata_path="/path/to/save/metadata.json",
-)
-
-```
-
-**Restoring the tokenizer**
-
-```python
-from megatron.core.tokenizers import MegatronTokenizer
-
-MegatronTokenizer.from_pretrained(
-    tokenizer_path="/path/to/tokenizer.model",
-)
-
-# If metadata is not in tokenizer’s dir
-MegatronTokenizer.from_pretrained(
-    tokenizer_path="/path/to/tokenizer.model",
-    metadata_path="/path/to/metadata.json",
-)
-
-# Pass metadata as dict
-MegatronTokenizer.from_pretrained(
-    tokenizer_path="GPT2BPETokenizer",
-    metadata_path={"library": "megatron"},
-    vocab_file="/path/to/vocab.txt",
-)
-
-# Pass additional params
-MegatronTokenizer.from_pretrained(
-    tokenizer_path="/path/to/tokenizer/model.json",
-    metadata_path={"library": "tiktoken"},
-    pattern="v2",
-    num_special_tokens=1000,
-)
-
-# Null tokenzier
-MegatronTokenizer.from_pretrained(
-    metadata_path={"library": "null"},
-    vocab_size=131072,
-)
-
-```
-
-### 4. Megatron-LM pretraining compatibility
-
-New tokenizer system is compatible with megatron-lm pretrain script. If `--tokenizer-metadata` is not specified, a default metadata file will be generated automatically.
-
-```bash
-# Null tokenizer
-torchrun --nproc_per_node=1 pretrain_gpt.py \
-    ... \
-    --tokenizer-type NullTokenizer \
-    --vocab-size 131072
-
-# HuggingFace tokenizer with specified metadata
-torchrun --nproc_per_node=1 pretrain_gpt.py \
-    ... \
-    --tokenizer-type HuggingFaceTokenizer \
-    --tokenizer-model meta-llama/Meta-Llama-3-8B \
-    --tokenizer-metadata /path/to/metadata.json
-
-```
-
-The Megatron-LM pretraining script still supports the legacy tokenizer system. To enable it, simply add the `--legacy-tokenizer` flag.
diff --git a/docs/source/api-guide/transformer.rst b/docs/source/api-guide/transformer.rst
deleted file mode 100644
index 6e2e894d549..00000000000
--- a/docs/source/api-guide/transformer.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-transformer package
-===================
-
-The `transformer` package provides a customizable and configurable
-implementation of the transformer model architecture. Each component
-of a transformer stack, from entire layers down to individual linear
-layers, can be customized by swapping in different PyTorch modules
-using the "spec" parameters (see `here
-<https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/nemo_megatron/mcore_customization.html>`_). The
-configuration of the transformer (hidden size, number of layers,
-number of attention heads, etc.) is provided via a `TransformerConfig`
-object.
-
-Submodules
-----------
-
-transformer.attention module
-----------------------------
-
-This is the entire attention portion, either self or cross attention,
-of a transformer layer including the query, key, and value
-projections, a "core" attention calculation (e.g. dot product
-attention), and final output linear projection.
-
-.. automodule:: core.transformer.attention
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.dot\_product\_attention module
-------------------------------------------
-
-This is a PyTorch-only implementation of dot product attention. A more
-efficient implementation, like those provided by FlashAttention or
-CUDNN's FusedAttention, are typically used when training speed is
-important.
-
-.. automodule:: core.transformer.dot_product_attention
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.enums module
-------------------------
-
-.. automodule:: core.transformer.enums
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.identity\_op module
--------------------------------
-
-This provides a pass-through module that can be used in specs to
-indicate that the operation should not be performed. For example, when
-using LayerNorm with the subsequent linear layer, an IdentityOp can be
-passed in as the LayerNorm module to use.
-
-.. automodule:: core.transformer.identity_op
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.mlp module
-----------------------
-
-This is the entire MLP portion of the transformer layer with an input
-projection, non-linearity, and output projection.
-
-.. automodule:: core.transformer.mlp
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.module module
--------------------------
-
-This provides a common base class for all modules used in the
-transformer that contains some common functionality.
-
-.. automodule:: core.transformer.module
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.transformer\_block module
--------------------------------------
-
-A block, or stack, of several transformer layers. The layers can all
-be the same or each can be unique.
-
-.. automodule:: core.transformer.transformer_block
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.transformer\_config module
---------------------------------------
-
-This contains all of the configuration options for the
-transformer. Using a dataclass reduces code bloat by keeping all
-arguments together in a dataclass instead of passing several arguments
-through multiple layers of function calls.
-
-.. automodule:: core.transformer.transformer_config
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.transformer\_layer module
--------------------------------------
-
-A single standard transformer layer including attention and MLP blocks.
-
-.. automodule:: core.transformer.transformer_layer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-transformer.utils module
-------------------------
-
-Various utilities used in the transformer implementation.
-
-.. automodule:: core.transformer.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Module contents
----------------
-
-.. automodule:: core.transformer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index f2a89b8ac77..00000000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. Lumache documentation master file, created by
-   sphinx-quickstart on Tue Aug 15 13:44:10 2023.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Megatron Core User Guide
-===================================
-
-**Megatron Core** is a Python library that has the core components required to build your language models. 
-A reference implementation of Megatron Core can be found in  `NeMo <https://github.com/NVIDIA/NeMo/tree/main>`_ It offers a *simple* and
-*intuitive* API.
-
-.. toctree::
-   :maxdepth: 2
-   :caption: User Guide
-
-   user-guide/index
-
-.. toctree::
-   :maxdepth: 3
-   :caption: API Guide
-   
-   api-guide/index
diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst
deleted file mode 100644
index aab745736b3..00000000000
--- a/docs/source/user-guide/index.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-User Guide 
-============
-
-.. mdinclude:: ../../../megatron/core/QuickStart.md
-.. mdinclude:: ../../../megatron/core/MSC_Integration.md
\ No newline at end of file
diff --git a/docs/user-guide/data-preparation.md b/docs/user-guide/data-preparation.md
new file mode 100644
index 00000000000..3ff5eedba89
--- /dev/null
+++ b/docs/user-guide/data-preparation.md
@@ -0,0 +1,70 @@
+# Data Preparation
+
+Preparing your data correctly is essential for successful training with Megatron Core.
+
+## Data Format
+
+Megatron Core expects training data in JSONL (JSON Lines) format, where each line is a JSON object:
+
+```json
+{"text": "Your training text here..."}
+{"text": "Another training sample..."}
+{"text": "More training data..."}
+```
+
+## Preprocessing Data
+
+Use the `preprocess_data.py` tool to convert your JSONL data into Megatron's binary format:
+
+```bash
+python tools/preprocess_data.py \
+    --input data.jsonl \
+    --output-prefix processed_data \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model /path/to/tokenizer.model \
+    --workers 8 \
+    --append-eod
+```
+
+### Key Arguments
+
+| Argument | Description |
+|----------|-------------|
+| `--input` | Path to input JSON/JSONL file |
+| `--output-prefix` | Prefix for output binary files (.bin and .idx) |
+| `--tokenizer-type` | Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.) |
+| `--tokenizer-model` | Path to tokenizer model file |
+| `--workers` | Number of parallel workers for processing |
+| `--append-eod` | Add end-of-document token |
+
+## Output Files
+
+The preprocessing tool generates two files:
+- `processed_data.bin` - Binary file containing tokenized sequences
+- `processed_data.idx` - Index file for fast random access
+
+## Using Preprocessed Data
+
+Reference your preprocessed data in training scripts:
+
+```bash
+--data-path processed_data \
+--split 949,50,1  # Train/validation/test split
+```
+
+## Common Tokenizers
+
+### HuggingFace Tokenizers
+
+```bash
+--tokenizer-type HuggingFaceTokenizer \
+--tokenizer-model /path/to/tokenizer.model
+```
+
+### GPT-2 BPE Tokenizer
+
+```bash
+--tokenizer-type GPT2BPETokenizer \
+--vocab-file gpt2-vocab.json \
+--merge-file gpt2-merges.txt
+```
diff --git a/docs/source/api-guide/context_parallel.rst b/docs/user-guide/features/context_parallel.md
similarity index 73%
rename from docs/source/api-guide/context_parallel.rst
rename to docs/user-guide/features/context_parallel.md
index c08defd2108..841c16326b3 100644
--- a/docs/source/api-guide/context_parallel.rst
+++ b/docs/user-guide/features/context_parallel.md
@@ -1,35 +1,34 @@
-context\_parallel package
-=========================
+# context_parallel package
 
-Context parallelism overview 
-----------------------------
+## Context parallelism overview
 
-.. figure:: ../images/context_parallel/CP_overview.png
-   :alt: cp_overview
-   :align: center
-   
-   Figure 1: A transformer layer running with TP2CP2. Communications next to Attention are for CP, others are for TP. (AG/RS: all-gather in forward and reduce-scatter in backward, RS/AG: reduce-scatter in forward and all-gather in backward, /AG: no-op in forward and all-gather in backward).
+```{figure} ../../images/context_parallel/CP_overview.png
+:alt: cp_overview
+:align: center
+
+Figure 1: A transformer layer running with TP2CP2. Communications next to Attention are for CP, others are for TP. (AG/RS: all-gather in forward and reduce-scatter in backward, RS/AG: reduce-scatter in forward and all-gather in backward, /AG: no-op in forward and all-gather in backward).
+```
 
 Context Parallelism ("CP") is a parallelization scheme on the dimension of sequence length. Unlike prior SP (sequence parallelism) which only splits the sequence of Dropout and LayerNorm activations, CP partitions the network inputs and all activations along sequence dimension. With CP, all modules except attention (e.g., Linear, LayerNorm, etc.) can work as usual without any changes, because they do not have inter-token operations. As for attention, the Q (query) of each token needs to compute with the KV (key and value) of all tokens in the same sequence. Hence, CP requires additional all-gather across GPUs to collect the full sequence of KV. Correspondingly, reduce-scatter should be applied to the activation gradients of KV in backward propagation. To reduce activation memory footprint, each GPU only stores the KV of a sequence chunk in forward and gathers KV again in backward. KV communication happens between a GPU and its counterparts in other TP groups. The all-gather and reduce-scatter are transformed to point-to-point communications in ring topology under the hood. Exchanging KV also can leverage MQA/GQA to reduce communication volumes, as they only have one or few attention heads for KV.
 
-For example, in Figure 1, assuming sequence length is 8K, each GPU processes 4K tokens. GPU0 and GPU2 compose a CP group, they exchange KV with each other. Same thing also happens between GPU1 and GPU3. CP is similar to `Ring Attention <https://arxiv.org/abs/2310.01889>`_ but provides better performance by (1) leveraging the latest OSS and cuDNN flash attention kernels; (2) removing unnecessary computation resulted from low-triangle causal masking and achieving optimal load balance among GPUs.
+For example, in Figure 1, assuming sequence length is 8K, each GPU processes 4K tokens. GPU0 and GPU2 compose a CP group, they exchange KV with each other. Same thing also happens between GPU1 and GPU3. CP is similar to [Ring Attention](https://arxiv.org/abs/2310.01889) but provides better performance by (1) leveraging the latest OSS and cuDNN flash attention kernels; (2) removing unnecessary computation resulted from low-triangle causal masking and achieving optimal load balance among GPUs.
+
+## Context parallelism benefits
 
-Context parallelism benefits 
-----------------------------
+```{figure} ../../images/context_parallel/CP_results.png
+:alt: cp_results
+:align: center
 
-.. figure:: ../images/context_parallel/CP_results.png
-   :alt: cp_results
-   :align: center
-   
-   Figure 2: Speedup of 175B GPT with various TP+CP combinations vs. full recompute (i.e., TP8CP1).
+Figure 2: Speedup of 175B GPT with various TP+CP combinations vs. full recompute (i.e., TP8CP1).
+```
 
 LLM encounters OOM (out of memory) issue with long context (i.e., long sequence length) because of linearly increasing memory footprint of activations. Recomputing activations in backward can avoid OOM but also introduce significant overheads (~30% with full recompute). Enlarging TP (tensor model parallelism) can fix the OOM issue as well, but it potentially makes compute (e.g., Linear) too short to overlap communication latencies. To be clear, scaling out to more GPUs with bigger TP can hit the overlapping problem no matter if OOM happens.
 
 CP can better address the issues. With CP, each GPU only computes on a part of the sequence, which reduces both computation and communication by CP times. Therefore, there are no concerns about the overlapping between them. The activation memory footprint per GPU is also CP times smaller, hence no OOM issue anymore. As Figure 2 shows, the combinations of TP and CP can achieve optimal performance by eliminating recompute overheads and making the best tradeoff between computation and communications.
 
-Enabling context parallelism
-----------------------------
+## Enabling context parallelism
 
 CP support has been added to GPT. All models that share GPT code path also should be able to benefit from CP, such as Llama. CP can work with TP (tensor model parallelism), PP (pipeline model parallelism), and DP (data parallelism), where the total number of GPUs equals TPxCPxPPxDP. CP also can work with different attention variants, including MHA/MQA/GQA, uni-directional and bi-directional masking.
 
 CP is enabled by simply setting context_parallel_size=<CP_SIZE> in command line. Default context_parallel_size is 1, which means CP is disabled. Running with CP requires Megatron-Core (>=0.5.0) and Transformer Engine (>=1.1).
+
diff --git a/docs/source/api-guide/custom_fsdp.md b/docs/user-guide/features/custom_fsdp.md
similarity index 97%
rename from docs/source/api-guide/custom_fsdp.md
rename to docs/user-guide/features/custom_fsdp.md
index e265de8ae4b..2f81eb0c5ef 100644
--- a/docs/source/api-guide/custom_fsdp.md
+++ b/docs/user-guide/features/custom_fsdp.md
@@ -1,6 +1,6 @@
-**NOTE: In M-Core 0.14, the custom FSDP refactored its checkpoint implementation to use DTensor-based torch distributed checkpointing. The custom FSDP was also renamed Megatron FSDP. The relevant sections of this document are no longer applicable.**
+# Megatron FSDP
 
-# MCore Custom Fully Sharded Data Parallel (FSDP)
+**NOTE: In M-Core 0.14, the custom FSDP refactored its checkpoint implementation to use DTensor-based torch distributed checkpointing. The custom FSDP was also renamed Megatron FSDP. The relevant sections of this document are no longer applicable.**
 
 ## How to use ?
 
@@ -13,6 +13,8 @@ Add these flag to enable MCore custom FSDP.
 --use-distributed-optimizer
 ```
 
+For a practical guide covering required configurations, checkpoint conversion, and example scripts, see the [Megatron-FSDP User Guide](../../discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md).
+
 ## Key Features
 
 - **Sharding Strategy**: Efficiently shards optimizer states, gradients, and parameters to reduce memory consumption.
@@ -52,7 +54,7 @@ The design of Custom FSDP draws inspiration from PyTorch FSDP [Zhao, Yanli, et a
 
 > When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes the training of some very large models feasible by allowing larger models or batch sizes to fit on device. This comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like overlapping communication and computation.
 
-![FSDP workflow](../images/custom_fsdp/FSDP_workflow.png)
+![FSDP workflow](../../images/custom_fsdp/FSDP_workflow.png)
 
 *Notice that the unit processed in workflow here is the “FSDP instance 1: N layers”, where an FSDP instance is the smallest FSDP processing unit (also a PyTorch module), which means that we can safely release this module weights after using it (executing the forward or backward of this module), and there will be no other computations computations relying on these weights. This capability is the foundation of FSDP's layer-by-layer execution and memory-saving strategy. An FSDP instance is also referred to as an **FSDP Unit**.*
 
@@ -76,13 +78,13 @@ In backward path
 
 One way to view FSDP’s sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards.
 
-![FSDP Allreduce](../images/custom_fsdp/FSDP_Allreduce.png)
+![FSDP Allreduce](../../images/custom_fsdp/FSDP_Allreduce.png)
 
 ### 2. Custom FSDP underlying data structure
 
 To implement the FSDP functionality described above, the custom FSDP is designed with the following Python classes and data structure:
 
-![MCore Custom FSDP Class Diagram](../images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png)
+![MCore Custom FSDP Class Diagram](../../images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png)
 
 ### 3. The custom FSDP interface: FullyShardedDataParallel
 
diff --git a/docs/source/api-guide/dist_optimizer.md b/docs/user-guide/features/dist_optimizer.md
similarity index 95%
rename from docs/source/api-guide/dist_optimizer.md
rename to docs/user-guide/features/dist_optimizer.md
index 34f42d5343f..ddb6079885c 100644
--- a/docs/source/api-guide/dist_optimizer.md
+++ b/docs/user-guide/features/dist_optimizer.md
@@ -16,11 +16,11 @@ The figures below illustrate the distributed optimizer's sharding scheme, and th
 
 ## Data flow
 
-![Data flow](../images/distrib_optimizer/data_flow.png)
+![Data flow](../../images/distrib_optimizer/data_flow.png)
 
 ## Sharding scheme
 
-![Sharding scheme](../images/distrib_optimizer/sharding_scheme.png)
+![Sharding scheme](../../images/distrib_optimizer/sharding_scheme.png)
 
 ## Key steps
 
diff --git a/docs/user-guide/features/fine_grained_activation_offloading.md b/docs/user-guide/features/fine_grained_activation_offloading.md
new file mode 100644
index 00000000000..53211d1d06c
--- /dev/null
+++ b/docs/user-guide/features/fine_grained_activation_offloading.md
@@ -0,0 +1,31 @@
+# Fine-grained Activation Offloading (collaborated with rednote)
+
+Memory capacity is more and more important with the rising of extreme sparse MoE models like DeepSeek-V3 and Qwen3-235B. Fine-grained recomputing reduces the memory footprint at the cost of extra recomputation, while offloading could utilize the host-device bandwidth to achieve nearly zero-overhead. Fine-grained Activation Offloading targets at offloading the activation at the granularity of specific modules, so that we can calibrate the amount of offloading activation to maximize the training throughput.
+
+Currently, the supported offloading modules are `"attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"`, which could work with fine-grained recomputation to release almost all activations of a transformer layer.
+
+**Features**
+* Support PP=1/PP/Interleaved PP
+* Compatible with fine-grained recomputation
+* Support FP8
+* Support MTP
+* Support mixed dense & moe layer
+* Support A2A Overlap
+* Support CUDA Graph
+  * (Temporary) cuda graph scope cannot contains the offloading modules
+
+**Usage**
+```bash
+# Enable fine-grained activation offloading
+--fine-grained-activation-offloading
+
+# Specify which modules are going to offload its input
+# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".
+--offload-modules expert_fc1
+```
+**Compatible with Fine-grained Recomputation**
+- For modules with minor perf overhead like layernorm or moe_act, use recomputing to reduce memory footprint;
+- For other modules, use offloading to reduce memory footprint;
+- Make sure the offloading/reloading could be overlapped with computing;
+
+![Fine-grained Activation Offloading and Fine-grained Recomputation](../../images/fine_grained_activation_offloading/offloading_and_recomputing.png)
diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md
new file mode 100644
index 00000000000..9b036ab5f99
--- /dev/null
+++ b/docs/user-guide/features/index.md
@@ -0,0 +1,18 @@
+# Advanced Features
+
+Advanced feature guides for key Megatron Core capabilities.
+
+```{toctree}
+:maxdepth: 2
+
+fine_grained_activation_offloading
+moe
+context_parallel
+custom_fsdp
+dist_optimizer
+optimizer_cpu_offload
+pipeline_parallel_layout
+tokenizers
+megatron_energon
+megatron_rl
+```
diff --git a/docs/user-guide/features/megatron_energon.md b/docs/user-guide/features/megatron_energon.md
new file mode 100644
index 00000000000..d08bde21e38
--- /dev/null
+++ b/docs/user-guide/features/megatron_energon.md
@@ -0,0 +1,132 @@
+# Megatron Energon
+
+Advanced multimodal dataloader for efficient loading of text, images, video, and audio at scale.
+
+## Overview
+
+[**Megatron Energon**](https://github.com/NVIDIA/Megatron-Energon) is purpose-built for large-scale multimodal training with:
+
+- **Multimodal support** - Text, images, video, audio
+- **Distributed loading** - Optimized for multi-node training
+- **Data blending** - Mix datasets with configurable weights
+- **WebDataset format** - Efficient streaming from cloud storage
+- **State management** - Save and restore training position
+
+## Installation
+
+```bash
+pip install megatron-energon
+```
+
+## Key Features
+
+### Data Processing
+
+- **Packing** - Optimize sequence length utilization
+- **Grouping** - Smart batching of similar-length sequences
+- **Joining** - Combine multiple dataset sources
+- **Object storage** - Stream from S3, GCS, Azure Blob Storage
+
+### Production-Ready
+
+- Distributed loading across workers and nodes
+- Checkpoint data loading state
+- Memory-efficient streaming
+- Parallel data loading with prefetching
+
+## Basic Usage
+
+```python
+from megatron.energon import get_train_dataset, get_loader, WorkerConfig
+
+# Create dataset
+ds = get_train_dataset(
+    '/path/to/dataset',
+    batch_size=32,
+    shuffle_buffer_size=1000,
+    worker_config=WorkerConfig.default_worker_config(),
+)
+
+# Create loader and iterate
+for batch in get_loader(ds):
+    # Training step
+    pass
+```
+
+## Multimodal Example
+
+```python
+# Load image-text dataset
+ds = get_train_dataset(
+    '/path/to/multimodal/dataset',
+    batch_size=32,
+    worker_config=WorkerConfig(num_workers=8, prefetch_factor=2),
+)
+
+for batch in get_loader(ds):
+    images = batch['image']  # Image tensors
+    texts = batch['text']    # Text captions
+    # Process batch
+```
+
+## Dataset Blending
+
+Mix multiple datasets with custom weights:
+
+```python
+from megatron.energon import Blender
+
+blended_ds = Blender([
+    ('/path/to/dataset1', 0.6),  # 60%
+    ('/path/to/dataset2', 0.3),  # 30%
+    ('/path/to/dataset3', 0.1),  # 10%
+])
+```
+
+## Configuration
+
+### Worker Configuration
+
+```python
+WorkerConfig(
+    num_workers=8,              # Parallel workers
+    prefetch_factor=2,          # Batches to prefetch per worker
+    persistent_workers=True,    # Keep workers alive between epochs
+)
+```
+
+### Common Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `batch_size` | Samples per batch |
+| `shuffle_buffer_size` | Buffer size for randomization |
+| `max_samples_per_sequence` | Max samples to pack into one sequence |
+| `worker_config` | Worker configuration for parallel loading |
+
+## Integration with Megatron-LM
+
+```python
+from megatron.energon import get_train_dataset, get_loader
+from megatron.training import get_args
+
+args = get_args()
+
+train_ds = get_train_dataset(
+    args.data_path,
+    batch_size=args.micro_batch_size,
+)
+
+for iteration, batch in enumerate(get_loader(train_ds)):
+    loss = train_step(batch)
+```
+
+## Resources
+
+- **[Megatron Energon GitHub](https://github.com/NVIDIA/Megatron-Energon)** - Documentation and examples
+- **[Multimodal Examples](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal)** - Megatron-LM multimodal training
+
+## Next Steps
+
+- Check [Multimodal Models](../../models/multimodal.md) for supported architectures
+- See [Training Examples](../training-examples.md) for integration examples
diff --git a/docs/user-guide/features/megatron_rl.md b/docs/user-guide/features/megatron_rl.md
new file mode 100644
index 00000000000..128b41bdaf5
--- /dev/null
+++ b/docs/user-guide/features/megatron_rl.md
@@ -0,0 +1,46 @@
+# Megatron RL
+
+Reinforcement learning library for post-training large language models at scale.
+
+## Overview
+
+[**Megatron RL**](https://github.com/NVIDIA/Megatron-LM/tree/dev/megatron/rl) adds native reinforcement learning capabilities to Megatron-LM for large-scale RL-based post-training of foundation models.
+
+> **Note**: Megatron RL is under active development and primarily designed for research teams exploring RL post-training on modern NVIDIA hardware. For production deployments, use [**NeMo RL**](https://github.com/NVIDIA-NeMo/RL).
+
+## Key Features
+
+- **Decoupled Design** - Clean separation between agent/environment logic and RL implementation
+- **Flexible Inference** - Support for Megatron, OpenAI, and HuggingFace inference backends
+- **Trainer/Evaluator** - Manages rollout generation and coordinates with inference systems
+- **Megatron Integration** - Native integration with Megatron Core inference system
+
+## Architecture
+
+### Components
+
+**Agents & Environments**
+- Accept inference handles
+- Return experience rollouts with rewards
+- Implement custom RL logic
+
+**Trainer/Evaluator**
+- Controls rollout generation
+- Coordinates with inference systems
+- Manages training loops
+
+**Inference Interface**
+- Provides `.generate(prompt, **generation_args)` endpoint
+- Supports multiple backends (Megatron, OpenAI, HuggingFace)
+
+## Use Cases
+
+- RLHF (Reinforcement Learning from Human Feedback)
+- Custom reward-based fine-tuning
+- Policy optimization for specific tasks
+- Research on RL post-training techniques
+
+## Resources
+
+- **[Megatron RL GitHub](https://github.com/NVIDIA/Megatron-LM/tree/dev/megatron/rl)** - Source code and documentation
+- **[Megatron Core Inference](../../api-guide/core/transformer.md)** - Native inference integration
diff --git a/docs/user-guide/features/moe.md b/docs/user-guide/features/moe.md
new file mode 100644
index 00000000000..56aca8c6999
--- /dev/null
+++ b/docs/user-guide/features/moe.md
@@ -0,0 +1,12 @@
+# Mixture of Experts
+
+```{toctree}
+:maxdepth: 1
+:caption: MoE Features
+
+multi_token_prediction
+multi_latent_attention
+```
+
+```{include} ../../../megatron/core/transformer/moe/README.md
+```
diff --git a/docs/source/api-guide/multi_latent_attention.rst b/docs/user-guide/features/multi_latent_attention.md
similarity index 79%
rename from docs/source/api-guide/multi_latent_attention.rst
rename to docs/user-guide/features/multi_latent_attention.md
index 64e2da07d09..5628b8cfee3 100644
--- a/docs/source/api-guide/multi_latent_attention.rst
+++ b/docs/user-guide/features/multi_latent_attention.md
@@ -1,14 +1,12 @@
-Multi-Latent Attention
-======================
+# Multi-Latent Attention
 
-Multi-Latent Attention overview 
--------------------------------
+## Multi-Latent Attention overview
 
 Multi-Latent Attention ("MLA") is an innovative attention mechanism introduced by Deepseek team that enhances the efficiency of attention computation by leveraging multiple latent spaces. This approach is particularly beneficial for large language models (LLMs), as it reduces the computational burden associated with traditional attention mechanisms. According to Deepseek-V2 technical report, MLA achieves better performance compared to Multi-Head Attention (MHA) and requires smaller KV cache.
 
-Enabling Multi-Latent Attention
--------------------------------
+## Enabling Multi-Latent Attention
 
 To enable MLA in Megatron-LM, set the following flags in command line:
 - `--multi-latent-attention` to enable MLA in MLP.
 - Set `MLATransformerConfig` to configure MLA.
+
diff --git a/docs/source/api-guide/multi_token_prediction.md b/docs/user-guide/features/multi_token_prediction.md
similarity index 57%
rename from docs/source/api-guide/multi_token_prediction.md
rename to docs/user-guide/features/multi_token_prediction.md
index 4059fa5326e..891bf4c93c5 100644
--- a/docs/source/api-guide/multi_token_prediction.md
+++ b/docs/user-guide/features/multi_token_prediction.md
@@ -3,7 +3,7 @@
 Multi-Token Prediction (MTP) extends the prediction scope to multiple future tokens at each position. On the one hand, an MTP objective densifies the training signals and may improve
 data efficiency. On the other hand, MTP may enable the model to pre-plan its representations for better prediction of future tokens. In this implementation of MTP, we sequentially predict additional tokens and keep the complete causal chain at each prediction depth. The following figure illustrates our implementation of MTP in [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3/).
 
-![MTP_implementation](../images/multi_token_prediction/MTP_implementation.png)
+![MTP_implementation](../../images/multi_token_prediction/MTP_implementation.png)
 
 The k-th MTP module consists of a shared embedding layer, a projection matrix, a Transformer block, and a shared output head. For the i-th input token at the (k - 1)-th prediction depth, we first combine the representation of the i-th token and the embedding of the (i + K)-th token with the linear projection. The combined serves as the input of the Transformer block at the k-th depth to produce the output representation.
 
@@ -18,6 +18,31 @@ We can train GPTModel like models with Multi-Token Prediction (MTP) by setting m
 | mtp_num_layers | Number of Multi-Token Prediction (MTP) Layers. MTP extends the prediction scope to multiple future tokens at each position. This MTP implementation sequentially predict additional tokens by using D sequential modules to predict D additional tokens. Default is None. |
 | mtp_loss_scaling_factor | Scaling factor of Multi-Token Prediction (MTP) loss. We compute the average of the MTP losses across all depths, and multiply it the scaling factor to obtain the overall MTP loss, which serves as an additional training objective. Default is 0.1. |
 
+## Pipeline Parallel Layout for MTP
+
+MTP supports flexible placement of MTP layers across pipeline stages using a custom `pipeline_model_parallel_layout`. By default, all MTP layers are placed on the last pipeline stage, but you can customize their placement.
+
+### MTP Standalone Mode
+
+When MTP layers are placed in a separate virtual pipeline (vpp) stage that is not on the last pipeline rank, the `mtp_standalone` flag is automatically set to `True`. This mode enables MTP to run independently in its own pipeline stage.
+
+### Layout Format
+
+Use `m` to represent MTP layers in the pipeline layout string. For example:
+- `"E|t*3|(t|)*5mL"` - MTP in the last stage
+- `"E|t*3|(t|)*4tm|L"` - MTP in the second-to-last stage with a decoder layer
+- `"E|t*3|(t|)*3tt|m|L"` - MTP in a standalone stage (second-to-last) with no other layers
+
+### Constraints
+
+- All MTP layers must be placed in the same one virtual pipeline stage.
+- MTP layers cannot be placed on the first pipeline rank.
+
+## Implementation Notes
+
+- For models with MTP layers, the final layernorm is placed in the stage that contains the last decoder layer, rather than in the post-process stage. This may cause small numerical differences in gradient norm reduction when final layernorm is placed in different pipeline stages in deterministic mode. Bitwise alignment can be achieved by disabling gradient norm clipping.
+- MTP loss is computed in the post-processing stage.
+
 ## Precautions
 
 Please do not use Context Parallel (CP), or arbitrary AttnMaskType, or learned absolute position embedding type with MTP. These use cases are not yet supported.
diff --git a/docs/user-guide/features/optimizer_cpu_offload.md b/docs/user-guide/features/optimizer_cpu_offload.md
new file mode 100644
index 00000000000..408d7f6a788
--- /dev/null
+++ b/docs/user-guide/features/optimizer_cpu_offload.md
@@ -0,0 +1,4 @@
+# Optimizer CPU Offload
+
+```{include} ../../../megatron/core/optimizer/cpu_offloading/README.md
+```
diff --git a/docs/source/api-guide/pipeline_parallel_layout.md b/docs/user-guide/features/pipeline_parallel_layout.md
similarity index 100%
rename from docs/source/api-guide/pipeline_parallel_layout.md
rename to docs/user-guide/features/pipeline_parallel_layout.md
diff --git a/docs/user-guide/features/tokenizers.md b/docs/user-guide/features/tokenizers.md
new file mode 100644
index 00000000000..0aecf8df8a7
--- /dev/null
+++ b/docs/user-guide/features/tokenizers.md
@@ -0,0 +1,230 @@
+# Tokenizers
+
+Megatron Core provides a unified tokenizer system with a HuggingFace-style API for easy tokenizer management and configuration.
+
+## Overview
+
+The `MegatronTokenizer` class offers a simple, familiar API for loading and managing tokenizers:
+
+- **Automatic detection** - Load any tokenizer type without specifying the library
+- **Metadata-based configuration** - Store tokenizer settings in JSON for easy reuse
+- **HuggingFace-compatible API** - Familiar `.from_pretrained()` interface
+- **Custom tokenizer support** - Extend with model-specific tokenization logic
+
+## Key Features
+
+### Unified API
+
+Use the same API regardless of tokenizer backend (SentencePiece, HuggingFace, TikToken, etc.):
+
+```python
+from megatron.core.tokenizers import MegatronTokenizer
+
+tokenizer = MegatronTokenizer.from_pretrained("/path/to/tokenizer")
+```
+
+### Tokenizer Metadata
+
+Configuration is stored in a JSON metadata file containing:
+- Tokenizer library (HuggingFace, SentencePiece, TikToken, etc.)
+- Chat templates
+- Custom tokenizer class
+- Special token configurations
+
+**Benefits:**
+- Set configuration once, reuse everywhere
+- No repeated CLI arguments
+- Easy sharing - just copy the tokenizer directory
+
+### Automatic Library Detection
+
+The correct tokenizer implementation is automatically selected:
+- No need to specify `SentencePieceTokenizer`, `HuggingFaceTokenizer`, etc.
+- Library type detected from metadata
+- Seamless switching between tokenizer backends
+
+## Basic Usage
+
+### Creating Tokenizer Metadata
+
+Save tokenizer configuration for reuse:
+
+```python
+from megatron.core.tokenizers import MegatronTokenizer
+
+# Create metadata for a SentencePiece tokenizer
+MegatronTokenizer.write_metadata(
+    tokenizer_path="/path/to/tokenizer.model",
+    tokenizer_library="sentencepiece",
+    chat_template="{% for message in messages %}{{ message.content }}{% endfor %}",
+)
+```
+
+The metadata is saved as `tokenizer_metadata.json` in the tokenizer directory.
+
+### Loading a Tokenizer
+
+Load from a directory with metadata:
+
+```python
+from megatron.core.tokenizers import MegatronTokenizer
+
+# Load with auto-detected configuration
+tokenizer = MegatronTokenizer.from_pretrained("/path/to/tokenizer.model")
+```
+
+### Loading with Custom Metadata Path
+
+If metadata is stored separately:
+
+```python
+tokenizer = MegatronTokenizer.from_pretrained(
+    tokenizer_path="/path/to/tokenizer.model",
+    metadata_path="/path/to/custom/metadata.json",
+)
+```
+
+### Loading with Inline Metadata
+
+Pass metadata as a dictionary:
+
+```python
+tokenizer = MegatronTokenizer.from_pretrained(
+    tokenizer_path="GPT2BPETokenizer",
+    metadata_path={"library": "megatron"},
+    vocab_file="/path/to/vocab.txt",
+)
+```
+
+## Advanced Usage
+
+### Custom Tokenizer Classes
+
+Create model-specific tokenization logic:
+
+```python
+from megatron.core.tokenizers.text import MegatronTokenizerText
+
+class CustomTokenizer(MegatronTokenizerText):
+    def encode(self, text):
+        # Custom encoding logic
+        return super().encode(text)
+
+    def decode(self, tokens):
+        # Custom decoding logic
+        return super().decode(tokens)
+
+# Save metadata with custom class
+MegatronTokenizer.write_metadata(
+    tokenizer_path="/path/to/tokenizer.model",
+    tokenizer_library="sentencepiece",
+    tokenizer_class=CustomTokenizer,
+)
+```
+
+### TikToken Tokenizers
+
+Configure TikToken-based tokenizers:
+
+```python
+tokenizer = MegatronTokenizer.from_pretrained(
+    tokenizer_path="/path/to/tokenizer/model.json",
+    metadata_path={"library": "tiktoken"},
+    pattern="v2",
+    num_special_tokens=1000,
+)
+```
+
+### Null Tokenizer
+
+Use a null tokenizer for testing or non-text models:
+
+```python
+tokenizer = MegatronTokenizer.from_pretrained(
+    metadata_path={"library": "null"},
+    vocab_size=131072,
+)
+```
+
+## Integration with Megatron-LM
+
+### Using with Training Scripts
+
+The tokenizer system integrates seamlessly with Megatron-LM training:
+
+```bash
+# Null tokenizer for testing
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --tokenizer-type NullTokenizer \
+    --vocab-size 131072 \
+    ...
+```
+
+```bash
+# HuggingFace tokenizer with metadata
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model meta-llama/Meta-Llama-3-8B \
+    --tokenizer-metadata /path/to/metadata.json \
+    ...
+```
+
+### Auto-Generated Metadata
+
+If `--tokenizer-metadata` is not specified, a default metadata file is generated automatically based on the tokenizer type.
+
+### Legacy Tokenizer Support
+
+The old tokenizer system is still supported for backward compatibility:
+
+```bash
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --legacy-tokenizer \
+    ...
+```
+
+## Supported Tokenizer Libraries
+
+| Library | Description | Use Case |
+|---------|-------------|----------|
+| **HuggingFace** | Transformers tokenizers | Most modern LLMs (LLaMA, Mistral, etc.) |
+| **SentencePiece** | Google's tokenizer | GPT-style models, custom vocabularies |
+| **TikToken** | OpenAI's tokenizer | GPT-3.5/GPT-4 style tokenization |
+| **Megatron** | Built-in tokenizers | Legacy GPT-2 BPE |
+| **Null** | No-op tokenizer | Testing, non-text modalities |
+
+## Common Tokenizer Types
+
+### LLaMA / Mistral
+
+```python
+MegatronTokenizer.write_metadata(
+    tokenizer_path="/path/to/llama/tokenizer.model",
+    tokenizer_library="sentencepiece",
+)
+```
+
+### GPT-2
+
+```python
+MegatronTokenizer.write_metadata(
+    tokenizer_path="GPT2BPETokenizer",
+    tokenizer_library="megatron",
+    vocab_file="/path/to/gpt2-vocab.json",
+    merge_file="/path/to/gpt2-merges.txt",
+)
+```
+
+## Best Practices
+
+1. **Always save metadata** - Create metadata once, reuse across training runs
+2. **Use HuggingFace tokenizers** - When possible, for modern LLM compatibility
+3. **Test tokenization** - Verify encode/decode before starting training
+4. **Version control metadata** - Include `tokenizer_metadata.json` in your experiment configs
+5. **Share tokenizer directories** - Include both model files and metadata for reproducibility
+
+## Next Steps
+
+- **Prepare Data**: See [Data Preparation](../data-preparation.md) for preprocessing with tokenizers
+- **Train Models**: Use tokenizers in [Training Examples](../training-examples.md)
+- **Supported Models**: Check [Language Models](../../models/llms.md) for model-specific tokenizers
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md
new file mode 100644
index 00000000000..ae7bd5ea80b
--- /dev/null
+++ b/docs/user-guide/index.md
@@ -0,0 +1,27 @@
+---
+orphan: true
+---
+
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
+# User Guide
+
+Comprehensive guides for using Megatron Core and Megatron-LM.
+
+```{toctree}
+:maxdepth: 2
+
+quickstart
+msc_integration
+data-preparation
+training-examples
+parallelism-guide
+features/index
+```
diff --git a/docs/user-guide/msc_integration.md b/docs/user-guide/msc_integration.md
new file mode 100644
index 00000000000..fd73ac7e8f4
--- /dev/null
+++ b/docs/user-guide/msc_integration.md
@@ -0,0 +1,3 @@
+```{include} ../../megatron/core/MSC_Integration.md
+```
+
diff --git a/docs/user-guide/parallelism-guide.md b/docs/user-guide/parallelism-guide.md
new file mode 100644
index 00000000000..2baf518ae85
--- /dev/null
+++ b/docs/user-guide/parallelism-guide.md
@@ -0,0 +1,211 @@
+# Parallelism Strategies Guide
+
+Megatron Core supports multiple parallelism strategies that can be combined to efficiently train models from billions to trillions of parameters across thousands of GPUs.
+
+## Overview
+
+| Strategy | What it parallelizes | Best for |
+|----------|---------------------|----------|
+| **Data Parallelism (DP)** | Batch dimension | Standard training, most common |
+| **Tensor Parallelism (TP)** | Individual layers | Large layers, GPU memory constraints |
+| **Pipeline Parallelism (PP)** | Model depth | Very deep models |
+| **Context Parallelism (CP)** | Sequence length | Long sequences (8K+ tokens) |
+| **Expert Parallelism (EP)** | MoE experts | Mixture-of-Experts models |
+
+## Data Parallelism (DP)
+
+Replicate the model across GPUs and split the batch.
+
+### Standard Data Parallel (DDP)
+
+```bash
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --data-parallel-sharding-strategy no_shard
+```
+
+Each GPU has a full copy of the model and processes a portion of the batch.
+
+### Fully Sharded Data Parallel (FSDP)
+
+Shard model parameters, gradients, and optimizer states to reduce memory:
+
+```bash
+# Megatron FSDP (~15% faster than PyTorch FSDP2)
+--use-megatron-fsdp \
+--data-parallel-sharding-strategy optim_grads_params
+```
+
+**Sharding strategies:**
+- `optim` - Shard optimizer states only (ZeRO-1)
+- `optim_grads` - Shard gradients + optimizer (ZeRO-2)
+- `optim_grads_params` - Shard parameters + gradients + optimizer (ZeRO-3)
+
+## Tensor Parallelism (TP)
+
+Split individual model layers across GPUs. Recommended for large hidden dimensions.
+
+```bash
+--tensor-model-parallel-size 4  # 4-way tensor parallelism
+--sequence-parallel              # Enable sequence parallelism (recommended)
+```
+
+**When to use:**
+- Model layers don't fit on single GPU
+- Large hidden dimensions (4096+)
+- Usually combined with DP and PP
+
+## Pipeline Parallelism (PP)
+
+Split model layers across GPUs vertically (by depth).
+
+```bash
+--pipeline-model-parallel-size 8              # 8 pipeline stages
+--num-layers-per-virtual-pipeline-stage 4     # Virtual pipeline for load balancing
+```
+
+**When to use:**
+- Very deep models (50+ layers)
+- Combine with TP for large models
+- Helps distribute memory across GPUs
+
+## Context Parallelism (CP)
+
+Split long sequences across GPUs for efficient long-context training.
+
+```bash
+--context-parallel-size 2           # 2-way context parallelism
+--cp-comm-type p2p                  # Communication type
+```
+
+**When to use:**
+- Long sequences (8K+ tokens)
+- Reduces activation memory
+- Can combine with TP, PP, DP
+
+**→ [Context Parallelism Deep Dive](features/context_parallel.md)** - Detailed guide with performance analysis
+
+## Expert Parallelism (EP)
+
+Distribute experts across GPUs in Mixture-of-Experts models.
+
+```bash
+--expert-model-parallel-size 8  # 8-way expert parallelism
+--num-experts 64                # 64 experts per MoE layer
+--moe-grouped-gemm              # Optimize expert computation
+```
+
+**Important:** When combining EP with TP, you **must enable Sequence Parallelism**:
+
+```bash
+--tensor-model-parallel-size 4
+--expert-model-parallel-size 8
+--sequence-parallel  # Required when using TP + EP
+```
+
+## Parallelism Selection Guide
+
+Recommended configurations based on [NVIDIA NeMo production setups](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance/recommended_model_configs):
+
+### Language Models
+
+| Model | Size | GPUs | TP | PP | CP | EP | Configuration Notes |
+|-------|------|------|----|----|----|----|---------------------|
+| **LLaMA-3** | 8B | 8 | 1 | 1 | 2 | 1 | CP=2 for long context (8K seqlen) |
+| **LLaMA-3** | 70B | 64 | 4 | 4 | 2 | 1 | Balanced TP+PP for 70B scale |
+| **LLaMA-3.1** | 405B | 1024 | 8 | 8 | 2 | 1 | 3D parallelism (TP+PP+CP) |
+| **GPT-3** | 175B | 128-512 | 4 | 8 | 1 | 1 | Standard large model config |
+
+### Mixture-of-Experts Models
+
+| Model | Size | GPUs | TP | PP | CP | EP | Configuration Notes |
+|-------|------|------|----|----|----|----|---------------------|
+| **Mixtral** | 8x7B | 64 | 1 | 4 | 1 | 8 | EP=8 for 8 experts |
+| **Mixtral** | 8x22B | 256 | 4 | 4 | 1 | 8 | TP+PP+EP for large MoE |
+| **DeepSeek-V3** | 671B | 1024 | 2 | 16 | 1 | 64 | Massive MoE with 256 experts |
+
+## Combining Strategies
+
+### Total GPU Count
+
+The total number of GPUs is calculated as:
+
+```
+Total GPUs = TP × PP × CP × EP × DP
+```
+
+### Example: LLaMA-3 70B on 64 GPUs
+
+```bash
+# TP=4, PP=4, CP=2, DP=2 => 4 × 4 × 2 × 2 = 64 GPUs
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --tensor-model-parallel-size 4 \
+    --pipeline-model-parallel-size 4 \
+    --context-parallel-size 2 \
+    --num-layers 80 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --seq-length 8192 \
+    --micro-batch-size 1 \
+    --global-batch-size 512 \
+    --bf16
+```
+
+## Performance Optimizations
+
+### Communication Overlap
+
+Enable overlapping of communication with computation:
+
+```bash
+--overlap-grad-reduce      # Overlap gradient reduction with backward pass
+--overlap-param-gather     # Overlap parameter gathering with forward pass
+--tp-comm-overlap          # Overlap TP communication
+```
+
+### Distributed Optimizer
+
+Recommended for all multi-GPU training:
+
+```bash
+--use-distributed-optimizer
+```
+
+Benefits:
+- Faster checkpointing
+- Reduced memory when combined with FSDP
+- Better performance at scale
+
+### Sequence Parallelism
+
+Always enable when using TP:
+
+```bash
+--sequence-parallel
+```
+
+Reduces activation memory by sharding sequence dimension in LayerNorm and Dropout.
+
+## Choosing the Right Strategy
+
+### Start Simple
+1. Begin with **Data Parallelism** (DP) only
+2. Add **Tensor Parallelism** (TP) if model doesn't fit
+3. Add **Pipeline Parallelism** (PP) for very large models
+4. Add **Context Parallelism** (CP) for long sequences
+
+### Memory Constraints
+- Use **FSDP** to reduce memory per GPU
+- Use **TP** to split large layers
+- Use **PP** to split model depth
+- Enable **activation checkpointing** for extreme cases
+
+### Communication Bottlenecks
+- Reduce **TP** degree (increases memory per GPU)
+- Increase **PP** degree (may reduce efficiency)
+- Use **CP** instead of larger TP for long sequences
+
+## Next Steps
+
+- **API Reference**: See [Tensor Parallel](../api-guide/core/tensor_parallel.md) and [Pipeline Parallel](../api-guide/core/pipeline_parallel.md) API documentation
+- **Advanced Features**: Explore [Megatron FSDP](features/custom_fsdp.md) and [Distributed Optimizer](features/dist_optimizer.md)
+- **Performance Tuning**: Check [NVIDIA NeMo Performance Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html)
diff --git a/docs/user-guide/quickstart.md b/docs/user-guide/quickstart.md
new file mode 100644
index 00000000000..7baed06d6be
--- /dev/null
+++ b/docs/user-guide/quickstart.md
@@ -0,0 +1,3 @@
+```{include} ../../megatron/core/QuickStart.md
+```
+
diff --git a/docs/user-guide/training-examples.md b/docs/user-guide/training-examples.md
new file mode 100644
index 00000000000..2824c608c36
--- /dev/null
+++ b/docs/user-guide/training-examples.md
@@ -0,0 +1,146 @@
+# Training Examples
+
+Get started with Megatron Core training using these practical examples.
+
+## Simple Training Example
+
+The simplest way to get started is with the basic training loop using mock data:
+
+```bash
+# Distributed training on 2 GPUs with mock data
+torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
+```
+
+This example:
+- Runs on 2 GPUs
+- Uses generated mock data (no data preparation needed)
+- Demonstrates basic distributed training setup
+- Perfect for testing your installation
+
+## LLaMA-3 Training Examples
+
+### LLaMA-3 8B with FP8
+
+Train LLaMA-3 8B model with FP8 mixed precision on 8 GPUs:
+
+```bash
+./examples/llama/train_llama3_8b_fp8.sh
+```
+
+**Configuration:**
+- 8 GPUs
+- FP8 mixed precision (requires Hopper/Ada/Blackwell GPUs)
+- Mock data for quick testing
+
+### Custom LLaMA Training
+
+For training with your own data:
+
+```bash
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --num-attention-heads 32 \
+    --seq-length 2048 \
+    --max-position-embeddings 2048 \
+    --micro-batch-size 4 \
+    --global-batch-size 32 \
+    --train-iters 100000 \
+    --lr 3.0e-4 \
+    --min-lr 3.0e-5 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --data-path /path/to/your/preprocessed_data \
+    --split 949,50,1 \
+    --save /path/to/checkpoints \
+    --load /path/to/checkpoints \
+    --log-interval 10 \
+    --save-interval 1000 \
+    --eval-interval 1000
+```
+
+## GPT-3 Training Example
+
+Train a GPT-3 style model:
+
+```bash
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    --tensor-model-parallel-size 2 \
+    --pipeline-model-parallel-size 2 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 2 \
+    --global-batch-size 16 \
+    --train-iters 100000 \
+    --lr 1.5e-4 \
+    --min-lr 1.0e-5 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 1000 \
+    --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --data-path /path/to/preprocessed_data \
+    --split 949,50,1 \
+    --save /path/to/checkpoints \
+    --load /path/to/checkpoints
+```
+
+## Key Training Arguments
+
+### Model Architecture
+
+| Argument | Description |
+|----------|-------------|
+| `--num-layers` | Number of transformer layers |
+| `--hidden-size` | Hidden dimension size |
+| `--num-attention-heads` | Number of attention heads |
+| `--seq-length` | Sequence length for training |
+
+### Training Configuration
+
+| Argument | Description |
+|----------|-------------|
+| `--micro-batch-size` | Batch size per GPU |
+| `--global-batch-size` | Total batch size across all GPUs |
+| `--train-iters` | Number of training iterations |
+
+### Learning Rate
+
+| Argument | Description |
+|----------|-------------|
+| `--lr` | Peak learning rate |
+| `--min-lr` | Minimum learning rate |
+| `--lr-decay-style` | LR schedule (cosine, linear, constant) |
+| `--lr-warmup-iters` | Warmup iterations |
+
+### Mixed Precision
+
+| Argument | Description |
+|----------|-------------|
+| `--fp16` | FP16 mixed precision |
+| `--bf16` | BF16 mixed precision (recommended) |
+| `--fp8-hybrid` | FP8 mixed precision (Hopper/Ada/Blackwell) |
+
+### Data and Checkpointing
+
+| Argument | Description |
+|----------|-------------|
+| `--data-path` | Path to preprocessed data |
+| `--split` | Train/validation/test split (e.g., 949,50,1) |
+| `--save` | Checkpoint save directory |
+| `--load` | Checkpoint load directory |
+| `--save-interval` | Save checkpoint every N iterations |
+
+## Next Steps
+
+- **Optimize Performance**: See [Advanced Features](features/index.md) for FSDP, distributed optimizer, and other optimizations
+- **Scale Up**: Learn about [Parallelism Strategies](parallelism-guide.md) to train larger models across more GPUs
+- **Prepare Data**: Follow the [Data Preparation](data-preparation.md) guide to process your own datasets
diff --git a/docs/versions1.json b/docs/versions1.json
new file mode 100644
index 00000000000..c7af98b9f77
--- /dev/null
+++ b/docs/versions1.json
@@ -0,0 +1,17 @@
+[
+    {
+        "name": "nightly",
+        "version": "nightly",
+        "url": "https://docs.nvidia.com/megatron-core/nightly/"
+    },
+    {
+        "name": "0.16.0 (latest)",
+        "version": "0.16.0",
+        "url": "https://docs.nvidia.com/megatron-core/latest/"
+    },
+    {
+        "name": "0.15.0",
+        "version": "0.15.0",
+        "url": "https://docs.nvidia.com/megatron-core/0.15.0/"
+    }
+]
diff --git a/examples/export/README.md b/examples/export/README.md
index bdd07da263d..c9539e8ab21 100644
--- a/examples/export/README.md
+++ b/examples/export/README.md
@@ -1,10 +1,10 @@
 # Megatron Core Export
 
-This module is used to export megatron core models to different inference frameworks. 
-Currently we support TRTLLM export . In the future we will be adding support for VLLM etc. 
+This module is used to export megatron core models to different inference frameworks.
+Currently we support TRTLLM export . In the future we will be adding support for VLLM etc.
 
 ## PTQ AND EXPORT
-Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment. 
+Follow the examples of [Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment.
 
 # TRTLLM EXPORT
 Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone.
diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml
index 2fd40e62143..18d305d9cb1 100644
--- a/examples/gpt3/gpt_config.yaml
+++ b/examples/gpt3/gpt_config.yaml
@@ -257,7 +257,6 @@ vocab_extra_ids: 0
 seq_length: 4096
 encoder_seq_length: null
 decoder_seq_length: null
-retriever_seq_length: 256
 sample_rate: 1.0
 mask_prob: 0.15
 short_seq_prob: 0.1
diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py
index ebcaf8b043f..88b744b3ac0 100644
--- a/examples/inference/gpt/gpt_dynamic_inference.py
+++ b/examples/inference/gpt/gpt_dynamic_inference.py
@@ -26,6 +26,7 @@
     build_dynamic_engine_setup_prefix,
     build_requests,
     get_curr_time,
+    get_global_peak_memory_stats_bytes,
 )
 from megatron.core.inference.contexts.dynamic_context import (
     ContextOverflowError,
@@ -155,7 +156,7 @@ def get_inference_context(
         max_sequence_length = args.inference_max_seq_length
 
     metrics_writer = None
-    if args.inference_wandb_logging_step_interval > 0:
+    if args.inference_logging_step_interval > 0 and args.inference_wandb_logging:
         metrics_writer = get_wandb_writer()
 
     # Inference context.
@@ -174,8 +175,11 @@ def get_inference_context(
         ),
         block_size_tokens=args.inference_dynamic_batching_block_size,
         buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
+        paused_buffer_size_gb=args.inference_dynamic_batching_paused_buffer_size_gb,
+        max_requests=args.inference_dynamic_batching_max_requests,
         max_tokens=args.inference_dynamic_batching_max_tokens,
         tensor_model_parallel_size=args.tensor_model_parallel_size,
+        pipeline_model_parallel_size=args.pipeline_model_parallel_size,
         materialize_only_last_token_logits=not args.return_log_probs,
         mamba_inference_state_config=mamba_inference_state_config,
         cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents,
@@ -187,6 +191,7 @@ def get_inference_context(
         cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens,
         cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count,
         metrics_writer=metrics_writer,
+        offload_kv_cache=args.rl_offload_kv_cache_during_training
     )
 
     return context
@@ -310,7 +315,7 @@ def _add_request():
         # Step inference engine (i.e., generate a token for each active request).
         # Before step, we haven't done the scheduling, so we cannot know the is_decode_only
         try:
-            result = engine.step_modern(verbose=True)
+            result = engine.step_modern()
         except EngineSuspendedError as e:
             result = e
             pass # ignore error in order to call 'engine.resume()' below.
@@ -367,6 +372,7 @@ def _add_request():
                 request.time_end = get_curr_time()
                 request.state = "finished"
                 request.request_id = finished_request.request_id
+                request.events = finished_request.events
 
                 # Update prompt, in case engine has been suspended and resumed.
                 request.prompt_tokens = finished_request.prompt_tokens.tolist()
@@ -397,6 +403,10 @@ def _add_request():
         if not (engine.has_unfinished_requests() or num_requests_added < num_requests_total):
             break
 
+    # Resume engine (NOOP if not suspended).
+    if engine.is_suspended:
+        engine.resume()
+
     return {
         "step_times" : step_times,
         "add_times" : add_times,
@@ -431,6 +441,10 @@ def main():
     else:
         tokenizer = build_tokenizer(args)
 
+    # Reset peak memory stats so functional tests measure this run and not
+    # whatever happened earlier during initialization.
+    torch.cuda.reset_peak_memory_stats()
+
     # Sampling params.
     sampling_params = SamplingParams(
         temperature=args.temperature,
@@ -441,6 +455,7 @@ def main():
         num_tokens_to_generate=args.num_tokens_to_generate,
         termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod,
         top_n_logprobs=args.top_n_logprobs,
+        stop_words=args.stop_words,
     ) 
 
     model = get_model()
@@ -475,7 +490,7 @@ def main():
         random_seed=args.seed,
         track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events,
         enable_chunked_prefill=not args.disable_chunked_prefill,
-        inference_logging_step_interval=args.inference_wandb_logging_step_interval,
+        inference_logging_step_interval=args.inference_logging_step_interval,
     )
 
     setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests)
@@ -486,6 +501,13 @@ def main():
     # Run and time test, optionally `args.inference_repeat_n` times.
     throughputs = []
     for _ in range(args.inference_repeat_n):
+
+        # Reset engine.
+        engine.reset()
+
+        torch.cuda.reset_peak_memory_stats()
+
+        # Trial.
         t = get_curr_time()
         result = run_inference(requests, engine)
         step_times = result["step_times"]
@@ -504,8 +526,9 @@ def main():
             f"request.state == '{request.state}' != 'finished'."
         )
 
-    # Print unique prompts + outputs.
+    peak_mem_stats = get_global_peak_memory_stats_bytes()
 
+    # Print unique prompts + outputs.
     if torch.distributed.get_rank() == 0:
         def escape_str(s):
             return s.replace("\n", "\\n")
@@ -524,7 +547,7 @@ def escape_str(s):
             # ---- Prompt summary line ----
             prompt_len = len(requests[request_idxs[0]].prompt_tokens)
             escaped_prompt_text = escape_str(prompt_text)
-            print(f"{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}")
+            print(f"\n{unique_idx+1}/{len(unique_prompt_map)} [n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}")
 
             # ---- Group all outputs for this prompt ----
             output_map = defaultdict(list)
@@ -534,6 +557,12 @@ def escape_str(s):
 
             # ---- Print each unique output ----
             for output_text, output_request_idxs in output_map.items():
+                evicted = False
+                for idx in output_request_idxs:
+                    for event in requests[idx].events:
+                        if event.type.name == "EVICT":
+                            evicted = True
+                            break
                 if output_text is not None:
                     # Use hash of prompt + generated text in case engine was
                     # suspended and resumed, which misaligns boundary between
@@ -547,7 +576,7 @@ def escape_str(s):
                     o_hash = "--"
                     o_len = 0
                     escaped_output_text = "--"
-                print(f"  >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}")
+                print(f"  >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}{', <evicted>' if evicted else ''}] {escaped_output_text}")
                 text_hashes.append(o_hash)
 
         # Write results to JSON. Primarily used for functional testing.
@@ -575,7 +604,11 @@ def escape_str(s):
                     json_results[req.request_id] = result_dict
 
             # Track system-level throughput as a test / debug metric
-            json_results["throughput"] = throughputs
+            if args.record_throughput:
+                json_results["throughput"] = throughputs
+            # Attach peak memory metrics; the functional test only validates these
+            # if the fields exist in the golden values.
+            json_results.update(peak_mem_stats)
 
             print(f' Saving results to {args.output_path}')
             with open(args.output_path, "w") as fp:
@@ -617,11 +650,11 @@ def escape_str(s):
         )
         print(
             f"{setup_prefix} … "
-            f"throughput: {throughput:.3f} tok/s",
+            f"throughput: {throughput:.3f} tok/s … ",
             f"total time: {total_time:.3f}s … "
             f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
             f"steps: {engine.step_count:d} … "
-            f"capture {capture_str} … "
+            f"capture {capture_str}"
         )
         print("~~~")
 
@@ -631,4 +664,4 @@ def escape_str(s):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/examples/inference/gpt/gpt_dynamic_inference_12b.sh b/examples/inference/gpt/gpt_dynamic_inference_12b.sh
index 20f1a29cb5b..4991d9d5177 100644
--- a/examples/inference/gpt/gpt_dynamic_inference_12b.sh
+++ b/examples/inference/gpt/gpt_dynamic_inference_12b.sh
@@ -97,6 +97,11 @@ if [[ -v PROMPTS ]]; then
         --prompts ${PROMPTS} \
         --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
     "
+elif [[ -v PROMPT_FILE ]]; then
+    ARGS+=" \
+        --prompt-file ${PROMPT_FILE} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+    "
 else
     ARGS+=" \
         --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
diff --git a/examples/inference/gpt/gpt_dynamic_inference_357m.sh b/examples/inference/gpt/gpt_dynamic_inference_357m.sh
index 215cc2bac8f..44abb575c63 100644
--- a/examples/inference/gpt/gpt_dynamic_inference_357m.sh
+++ b/examples/inference/gpt/gpt_dynamic_inference_357m.sh
@@ -83,6 +83,11 @@ if [[ -v PROMPTS ]]; then
         --prompts ${PROMPTS} \
         --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
     "
+elif [[ -v PROMPT_FILE ]]; then
+    ARGS+=" \
+        --prompt-file ${PROMPT_FILE} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+    "
 else
     ARGS+=" \
         --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
index c6bf1e9648a..cbb7a1aa745 100644
--- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
+++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
@@ -42,7 +42,7 @@
 async def main(
     engine: DynamicInferenceEngine,
     requests: List[Request],
-    port: int,
+    port: int | None = None,
     sampling_params: SamplingParams | None = None,
 ):
     if sampling_params is not None:
@@ -51,28 +51,21 @@ async def main(
             "Sampling parameters are specified per request.",
             DeprecationWarning,
         )
+        
     # once you call engine.start_listening_to_data_parallel_coordinator,
     # the engine will start accepting requests from the data parallel coordinator.
     # and processing them in an asyncio coroutine.
+    # leaving inference_coordinator_port as None will find a free port automatically.
     
-    await engine.start_listening_to_data_parallel_coordinator(
+    dp_addr = await engine.start_listening_to_data_parallel_coordinator(
         inference_coordinator_port=port,
         launch_inference_coordinator=True,
-        verbose=True,
     )
 
-    # if you want to use your own inference coordinator -
-    # 1. set launch_inference_coordinator to False
-    # 2. setup a router socket at tcp://MASTER_ADDR:PORT
-    # 3. wait for data parallel groups to establish connection (BasicInferenceCoordinator.__init__)
-    # 4. look at InferenceCoordinator.start() to see how we can route requests from users <-> data parallel groups
-    #   based on headers.
-    # 5. look at InferenceClient to see how we create requests with headers.
-
     args = get_args()
 
     # Test suspend/resume intervals.
-    if args.suspend_resume_interval is not None:
+    if dist.get_rank() == 0 and args.suspend_resume_interval is not None:
         # Since the client doesn't directly call engine.async_step here, we test
         # the suspend-resume system ~4 times.
         suspend_resume_interval = max(1, len(requests) // 4)
@@ -91,7 +84,7 @@ async def main(
 
     # Create client and run example.
     if dist.get_rank() == 0:
-        client = InferenceClient(port)  # submits requests to the inference coordinator
+        client = InferenceClient(dp_addr)  # submits requests to the inference coordinator
         await client.start()
         base_arrival_time = time.time_ns() / 10**9
         for request in requests:
@@ -99,7 +92,8 @@ async def main(
         futures = []
         num_requests_total = len(requests)
         num_requests_added = 0
-        
+        # logging.info("Waiting for 20 seconds before starting to add requests. This is to mimic an RL style setup..")
+        # time.sleep(20)
         while True:
             current_time = time.time_ns() / 10**9
             if args.incoming_requests_per_step is None:
@@ -159,7 +153,7 @@ async def main(
                     "generated_tokens": req.generated_tokens,
                     "latency": req.latency,  # InferenceClient populates this field in the returned future.
                 }
-                if req.sampling_params["return_log_probs"]:
+                if req.sampling_params.return_log_probs:
                     result_dict["logprobs"] = req.prompt_log_probs + req.generated_log_probs
                 throughput = len(req.generated_tokens) / req.latency
                 throughputs.append(throughput)
@@ -185,11 +179,14 @@ async def main(
                 ))
 
         # kill the engines and suspend the client
-        client.stop_engines()
+        # Right now, we can only call stop when all requests are done. 
+        # Todo: Make this explicit in the Client class....
+        await client.stop_engines()
         client.stop()
 
     # once the stop signal eventually makes its way to each GPU, the engines will stop.
     await asyncio.gather(engine.engine_loop_task)
+    logging.info(f"Rank: {dist.get_rank()} stopped their engine instance successfully.")
 
 
 if __name__ == "__main__":
@@ -201,10 +198,6 @@ async def main(
             args_defaults={'no_load_rng': True, 'no_load_optim': True},
         )
 
-        # Start Nsight profiler.
-        if os.environ.get("NSIGHT_PREFIX"):
-            torch.cuda.cudart().cudaProfilerStart()
-
         args = get_args()
         tokenizer = get_tokenizer()
 
@@ -243,6 +236,7 @@ async def main(
             enable_cuda_graph=args.cuda_graph_impl == "local",
             random_seed=args.seed,
             enable_chunked_prefill=not args.disable_chunked_prefill,
+            inference_logging_step_interval=args.inference_logging_step_interval,
         )
 
         if dist.get_rank() == 0:
@@ -251,6 +245,10 @@ async def main(
             print(setup_prefix)
             print("~~~")
 
+        # Start Nsight profiler.
+        if os.environ.get("NSIGHT_PREFIX"):
+            torch.cuda.cudart().cudaProfilerStart()
+
         asyncio.run(
             main(
                 engine,
@@ -258,3 +256,7 @@ async def main(
                 args.inference_coordinator_port,
             )
         )
+
+        # Stop Nsight profiler.
+        if os.environ.get("NSIGHT_PREFIX"):
+            torch.cuda.cudart().cudaProfilerStop()
diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py
index 2dcae5549a6..03a60927ab2 100644
--- a/examples/inference/gpt/gpt_static_inference.py
+++ b/examples/inference/gpt/gpt_static_inference.py
@@ -104,7 +104,13 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInfere
     text_generation_controller = TextGenerationController(
         inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
     )
-    return StaticInferenceEngine(text_generation_controller=text_generation_controller, legacy=args.use_legacy_static_engine)
+    engine_kwargs = {
+        "text_generation_controller" : text_generation_controller,
+        "legacy" : args.use_legacy_static_engine,
+    }
+    if not args.use_legacy_static_engine:
+        engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb
+    return StaticInferenceEngine(**engine_kwargs)
 
 
 async def generate(
diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py
index 41fb0df69c2..a04b856c0a6 100644
--- a/examples/inference/gpt/utils.py
+++ b/examples/inference/gpt/utils.py
@@ -72,7 +72,7 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
         help="Add a deterministic number of requests per step. This arg is "
         "prioritized over `--incoming-requests-per-sec` below (which is non-"
         "deterministic). Note that the number of requests added per step is "
-        "additionally limited by the inference context's `max_active_requests`, "
+        "additionally limited by the inference context's `max_requests`, "
         "`max_tokens`, and KV buffer size.",
     )
     group.add_argument(
@@ -102,6 +102,15 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
         default=False,
         help='Skip prompt log probs.',
     )
+    group.add_argument(
+        "--stop-words",
+        metavar='WORD',
+        type=str,
+        nargs='+',
+        default=None,
+        help='Stop words to terminate generation. Each word should be quoted and '
+        'separated by space. Example: --stop-words "\\n\\n" "END" "###"',
+    )
     group.add_argument(
         "--output-path",
         type=str,
@@ -135,6 +144,13 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
         default=False,
         help='Use flashinfer fused rope implementation.',
     )
+    group.add_argument(
+        "--no-record-throughput",
+        action='store_false',
+        dest="record_throughput",
+        help="Disable throughput recording in --output-file"
+        
+    )
 
     return parser
 
@@ -377,7 +393,7 @@ def build_dynamic_engine_setup_prefix(
 
     Args:
         args (Namespace): Command-line arguments for this run.
-        context (DynamicInferenceContext): Stores limits such as `max_active_requests`,
+        context (DynamicInferenceContext): Stores limits such as `max_requests`,
             `max_tokens`, and `gtd_request_count`.
         requests (List[DynamicInferenceRequest]): List of inference requests.
 
@@ -414,7 +430,7 @@ def build_dynamic_engine_setup_prefix(
     buffer_limits_str = (
         f"bf: {get_mem_size_str(args.inference_dynamic_batching_buffer_size_gb*1024**3)}, "
         f"{context.block_allocator.active_count} chunks "
-        f"[r {context.max_active_requests}, t {context.max_tokens}]"
+        f"[r {context.max_requests}, t {context.max_tokens}]"
     )
 
     parts = [
@@ -427,3 +443,17 @@ def build_dynamic_engine_setup_prefix(
     ]
 
     return " | ".join(parts)
+
+
+def get_global_peak_memory_stats_bytes() -> dict:
+    """Peak allocated CUDA memory aggregated across ranks (MAX), in bytes.
+
+    Uses `torch.cuda.max_memory_allocated()` and assumes peak stats were reset
+    before the benchmark run.
+    """
+    peak_alloc = int(torch.cuda.max_memory_allocated())
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64)
+        torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX)
+        peak_alloc = int(t[0].item())
+    return {"mem-max-allocated-bytes": peak_alloc}
\ No newline at end of file
diff --git a/examples/llama/README.md b/examples/llama/README.md
index 2adb591b52a..9872185ab2f 100644
--- a/examples/llama/README.md
+++ b/examples/llama/README.md
@@ -118,7 +118,7 @@ Legend:
 - EP: Expert Parallel size
 - GA: Gradient Accumulation steps
 
-As NeMo uses Megatron-Core, for the latest performance benchmarks, please refer to the official [NeMo documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
+As NeMo uses Megatron-Core, for the latest performance benchmarks, please refer to the official [NeMo documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-summary.html).
 
 ## 5. Test Datasets
 <a id="test-datasets" name="test-datasets"></a>
diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile
index 7b54091ae63..ccb5741da62 100644
--- a/examples/multimodal/Dockerfile
+++ b/examples/multimodal/Dockerfile
@@ -1,26 +1,59 @@
-FROM nvcr.io/nvidia/pytorch:24.02-py3
+# Base image: NVIDIA PyTorch container with CUDA, cuDNN, NCCL, Python, and uv pre-installed
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
-RUN apt update && \
-    apt -y upgrade && \
-    apt install -y --no-install-recommends \
-        software-properties-common \
-        build-essential \
-        python3-pip \
-        python3-dev \
-        bash \
-        git \
-        vim \
-        tmux \
-        python-is-python3 \
-        default-jre
+# Install JRE for pycocoevalcap
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        default-jre && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-RUN pip install --upgrade pip
-RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging
-RUN pip install transformers datasets accelerate timm
-RUN pip install pytest-cov pytest_mock nltk wrapt
-RUN pip install zarr "tensorstore==0.1.45"
-RUN pip install black isort click==8.0.2
-RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken
-RUN pip install git+https://github.com/openai/CLIP.git
-# Use --no-deps for the following to avoid outdated and unnecessary dependencies.
-RUN pip install open_clip_torch open-flamingo[eval] --no-deps
+# Using --break-system-packages to allow system-wide installation in managed environment
+RUN uv pip install --system --no-cache --break-system-packages \
+    einops \
+    einops-exts \
+    sentencepiece \
+    braceexpand \
+    webdataset \
+    packaging \
+    transformers \
+    datasets \
+    accelerate \
+    timm \
+    pytest-cov \
+    pytest_mock \
+    nltk \
+    wrapt \
+    zarr \
+    tensorstore \
+    black \
+    isort \
+    click \
+    pycocoevalcap \
+    megatron-energon \
+    mistral-common \
+    tiktoken \
+    # Additional dependencies for megatron-core[mlm]
+    flask-restful \
+    wandb
+
+# Install CLIP from GitHub
+RUN uv pip install --system --no-cache --break-system-packages \
+    git+https://github.com/openai/CLIP.git
+
+# Install packages with --no-deps to avoid outdated and unnecessary dependencies
+RUN uv pip install --system --no-cache --break-system-packages --no-deps \
+    open_clip_torch \
+    "open-flamingo[eval]"
+
+# Copy Megatron-LM source and install megatron-core
+# This assumes the build context is the Megatron-LM root directory
+# Build with: docker build -t megatron-multimodal -f examples/multimodal/Dockerfile .
+WORKDIR /workspace/megatron-lm
+COPY . .
+
+# Install megatron-core in editable mode for development
+RUN uv pip install --system --no-cache --break-system-packages --no-build-isolation -e ".[mlm]"
+
+# Set working directory to examples for convenience
+WORKDIR /workspace/megatron-lm
diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md
index a65839f8f15..e7fe2e62b8e 100644
--- a/examples/multimodal/README.md
+++ b/examples/multimodal/README.md
@@ -13,6 +13,10 @@ Multimodal support in megatron is still under active development. This example i
 ### Docker container
 
 You can build a docker container using `examples/multimodal/Dockerfile` to run this example.
+```
+# At the Megatron-LM root directory, execute the following
+docker build -t megatron-multimodal -f examples/multimodal/Dockerfile .
+```
 
 ### Language model
 
diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py
index 4c50ecea10a..56821f2cec6 100644
--- a/examples/multimodal/layer_specs.py
+++ b/examples/multimodal/layer_specs.py
@@ -2,6 +2,10 @@
 import torch
 
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
+from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
+from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
+from megatron.core.ssm.mlp_layer import MLPLayer
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.dot_product_attention import DotProductAttention
@@ -10,10 +14,7 @@
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
-from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
-from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
-from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
-from megatron.core.ssm.mlp_layer import MLPLayer
+from megatron.core.typed_torch import not_none
 
 try:
     from megatron.core.extensions.transformer_engine import (
@@ -26,6 +27,13 @@
 
     HAVE_TE = True
 except ImportError:
+    (
+        TEColumnParallelLinear,
+        TEDotProductAttention,
+        TELayerNormColumnParallelLinear,
+        TENorm,
+        TERowParallelLinear,
+    ) = (None, None, None, None, None)
     HAVE_TE = False
 
 try:
@@ -54,12 +62,8 @@ def get_layer_spec(is_vit, normalization) -> ModuleSpec:
             norm = TENorm
         else:
             version = torch.__version__.split('.')
-            version_geq_2_4 = (
-                int(TORCH_VERSION[0]) > 2
-                or (
-                    int(TORCH_VERSION[0]) == 2
-                    and int(TORCH_VERSION[1]) >= 4
-                )
+            version_geq_2_4 = int(TORCH_VERSION[0]) > 2 or (
+                int(TORCH_VERSION[0]) == 2 and int(TORCH_VERSION[1]) >= 4
             )
             assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
             if HAVE_APEX:
@@ -108,8 +112,8 @@ def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
                 module=SelfAttention,
                 params={"attn_mask_type": attn_mask_type},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,
@@ -122,6 +126,7 @@ def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
         ),
     )
 
+
 def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
     attn_mask_type = AttnMaskType.causal
     # Padding mask is needed for e.g. Context Parallel.
@@ -153,8 +158,8 @@ def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
                         module=SelfAttention,
                         params={"attn_mask_type": attn_mask_type},
                         submodules=SelfAttentionSubmodules(
-                            linear_qkv=TELayerNormColumnParallelLinear,
-                            core_attention=TEDotProductAttention,
+                            linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                            core_attention=not_none(TEDotProductAttention),
                             linear_proj=TERowParallelLinear,
                         ),
                     ),
@@ -170,7 +175,8 @@ def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
                     mlp=ModuleSpec(
                         module=MLP,
                         submodules=MLPSubmodules(
-                            linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
+                            linear_fc1=TELayerNormColumnParallelLinear,
+                            linear_fc2=TERowParallelLinear,
                         ),
                     ),
                     mlp_bda=get_bias_dropout_add,
@@ -179,6 +185,7 @@ def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
         ),
     )
 
+
 def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
     # Dense MLP w/ or w/o TE modules.
     return ModuleSpec(
diff --git a/examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/Dockerfile b/examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/Dockerfile
index 7f30dc6c156..186046ab8c3 100644
--- a/examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/Dockerfile
+++ b/examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/Dockerfile
@@ -36,5 +36,5 @@ RUN pip install fairscale fire blobfile
 # Use --no-deps for the following to avoid outdated and unnecessary dependencies.
 RUN pip install mmf --no-deps
 RUN pip install open_clip_torch open-flamingo[eval] --no-deps
-RUN pip install zarr "tensorstore==0.1.45"
+RUN pip install "tensorstore==0.1.45"
 RUN pip install git+https://github.com/NVIDIA/Megatron-Energon.git#egg=megatron-energon[av_decode]
diff --git a/examples/multimodal/nvlm/internvit.py b/examples/multimodal/nvlm/internvit.py
index 62f3bdccd85..9df9af23f05 100644
--- a/examples/multimodal/nvlm/internvit.py
+++ b/examples/multimodal/nvlm/internvit.py
@@ -14,7 +14,10 @@
 
 import torch
 
-from megatron.core.utils import divide
+from examples.multimodal.layer_scaling import (
+    LayerScalingTransformerLayer,
+    get_bias_dropout_add_layer_scaling,
+)
 from megatron.core.extensions.transformer_engine import (
     TEColumnParallelLinear,
     TEDotProductAttention,
@@ -35,9 +38,7 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
-
-from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
-
+from megatron.core.utils import divide
 
 try:
     import apex
@@ -128,10 +129,14 @@ def _gather_var(self, input_, max_dim):
 
         if rank < valid_ranks:  # Ranks without any dummy attention heads.
             var = input_.sum(-1, keepdim=True)
-        elif rank == valid_ranks:  # The only rank which may contain 'residual_heads' dummy attention heads.
+        elif (
+            rank == valid_ranks
+        ):  # The only rank which may contain 'residual_heads' dummy attention heads.
             var = input_[..., :max_dim].sum(-1, keepdim=True)
         else:
-            var = input_.sum(-1, keepdim=True) * 0.0  # All heads in these ranks are dummy heads: Zero-out.
+            var = (
+                input_.sum(-1, keepdim=True) * 0.0
+            )  # All heads in these ranks are dummy heads: Zero-out.
 
         tensor_list = [torch.empty_like(var) for _ in range(world_size)]
         tensor_list[rank] = var
@@ -175,8 +180,7 @@ def __init__(
         # Need to override linear_qkv, q_layernorm and k_layernorm.
         qkv_bias = False
 
-        self.linear_qkv = build_module(
-            submodules.linear_qkv,
+        self.linear_qkv = submodules.linear_qkv(
             self.config.hidden_size,
             self.query_projection_size + 2 * self.kv_projection_size,
             config=self.config,
@@ -256,6 +260,7 @@ def get_internvit_layer_spec(use_te) -> ModuleSpec:
         ),
     )
 
+
 def get_internvit300M_layer_spec(use_te) -> ModuleSpec:
     mlp = get_mlp_module_spec(use_te)  # no norm
 
diff --git a/examples/multimodal/radio/radio_g.py b/examples/multimodal/radio/radio_g.py
index 3ce793be75d..f139632df86 100644
--- a/examples/multimodal/radio/radio_g.py
+++ b/examples/multimodal/radio/radio_g.py
@@ -3,6 +3,10 @@
 
 import torch
 
+from examples.multimodal.layer_scaling import (
+    LayerScalingTransformerLayer,
+    get_bias_dropout_add_layer_scaling,
+)
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.dot_product_attention import DotProductAttention
@@ -11,7 +15,7 @@
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
-from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
+from megatron.core.typed_torch import not_none
 
 try:
     from megatron.core.extensions.transformer_engine import (
@@ -24,6 +28,13 @@
 
     HAVE_TE = True
 except ImportError:
+    (
+        TEColumnParallelLinear,
+        TEDotProductAttention,
+        TELayerNormColumnParallelLinear,
+        TENorm,
+        TERowParallelLinear,
+    ) = (None, None, None, None, None)
     HAVE_TE = False
 
 try:
@@ -113,8 +124,8 @@ def get_radio_g_layer_spec_te() -> ModuleSpec:
                 module=SelfAttention,
                 params={"attn_mask_type": attn_mask_type},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,
diff --git a/examples/post_training/modelopt/Dockerfile b/examples/post_training/modelopt/Dockerfile
index e127215904d..f44306ef08e 100644
--- a/examples/post_training/modelopt/Dockerfile
+++ b/examples/post_training/modelopt/Dockerfile
@@ -4,7 +4,7 @@ ARG PIP_CONSTRAINT=
 
 WORKDIR /workspace/nmm-sandbox
 
-RUN pip install jsonlines omegaconf
+RUN pip install omegaconf
 RUN pip install flask flask_restful fire nltk
 RUN pip install tiktoken blobfile
 
diff --git a/examples/post_training/modelopt/README.md b/examples/post_training/modelopt/README.md
index 33528c30097..93b5022b2aa 100644
--- a/examples/post_training/modelopt/README.md
+++ b/examples/post_training/modelopt/README.md
@@ -1,18 +1,19 @@
 <div align="center">
 
-# TensorRT Model Optimizer Integrated Examples
+# Model Optimizer Integrated Examples
 
 
-[TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) |
+[Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) |
 [Local Examples](#getting-started-in-a-local-environment) |
 [Configuration](./ADVANCED.md#advanced-configuration) |
 [Slurm Examples](./ADVANCED.md#slurm-examples) |
 [Speculative Decoding](./speculative.md) |
+[Knowledge Distillation](./distillation.md) |
 [Advanced Topics](./ADVANCED.md)
 
 </div>
 
-[TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (**ModelOpt**, `nvidia-modelopt`)
+[Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (**ModelOpt**, `nvidia-modelopt`)
 provides end-to-end model optimization for NVIDIA hardware including quantization (real or simulated),
 knowledge distillation, pruning, speculative decoding, and more.
 
@@ -32,6 +33,7 @@ knowledge distillation, pruning, speculative decoding, and more.
 | `meta-llama/Llama-4-{Scout,Maverick}-17B-{16,128}E-Instruct` | ✅ | ✅ | - | - |
 | `moonshotai/Kimi-K2-Instruct` | ✅ | ✅ | - | - |
 | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | ✅ | - | ✅ | ✅ |
+| `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` | ✅ | - | ✅ | ✅ |
 | `openai/gpt-oss-{20b, 120b}` | ✅ | **Online** | ✅ | ✅ |
 | `Qwen/Qwen3-{0.6B, 8B}` | ✅ | ✅ | ✅ | ✅ |
 | `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | ✅ | ✅ | ✅ |
@@ -42,7 +44,7 @@ Install `nvidia-modelopt` from [PyPI](https://pypi.org/project/nvidia-modelopt/)
 ```sh
 pip install -U nvidia-modelopt
 ```
-Alternatively, you can install from [source](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
+Alternatively, you can install from [source](https://github.com/NVIDIA/Model-Optimizer)
 to try our latest features.
 
 > **❗ IMPORTANT:** The first positional argument (e.g. `meta-llama/Llama-3.2-1B-Instruct`) of each script
@@ -57,18 +59,19 @@ Provide the pretrained checkpoint path through variable `${HF_MODEL_CKPT}` and p
 Hugging Face-Like quantized checkpoint for TensorRT-LLM, vLLM, or SGLang deployement,
 provide `${EXPORT_DIR}` to `export.sh`.
 
-> **📙 NOTE:** ModelOpt supports different quantization formats. By default, we simulate the
-> low-precision numerical behavior (fake-quant) which can be run on GPUs with compute > 80.
+> **📙 NOTE:** ModelOpt supports different quantization formats which are listed in the [ModelOpt quant configs](https://github.com/NVIDIA/Model-Optimizer/blob/7971fff05882da7eae16eae6bc927d1481dcd63f/modelopt/torch/quantization/config.py#L626).
+> The quant config is specified by the full config name in all-caps, e.g. NVFP4_DEFAULT_CFG.
+> By default, we simulate the low-precision numerical behavior (fake-quant) which can be run on GPUs with compute > 80.
 > Real low-precision paramters (e.g. `E4M3` or `E2M1`)
 > and low-precision compute (e.g. `FP8Linear`) are also supported depending on GPU compute capability.
-> **See [Adanvanced Topics](./ADVANCED.md) for details**.
+> **See [Advanced Topics](./ADVANCED.md) for details**.
 
 ```sh
 \
     TP=1 \
     HF_MODEL_CKPT=<pretrained_model_name_or_path> \
     MLM_MODEL_SAVE=/tmp/Llama-3.2-1B-Instruct_quant \
-    ./quantize.sh meta-llama/Llama-3.2-1B-Instruct nvfp4
+    ./quantize.sh meta-llama/Llama-3.2-1B-Instruct NVFP4_DEFAULT_CFG 
 
 \
     PP=1 \
@@ -78,6 +81,8 @@ provide `${EXPORT_DIR}` to `export.sh`.
     ./export.sh meta-llama/Llama-3.2-1B-Instruct
 ```
 
+For KV cache quantization, add a flag like `MLM_EXTRA_ARGS="--export-kv-cache-quant fp8"` while specifying your desired KV cache precision (see `KV_QUANT_CFG_CHOICES` in `quantize.py`).
+
 ### ⭐ Online BF16 EAGLE3 Training
 
 Online EAGLE3 training has both the target (frozen) and draft models in the memory where the `hidden_states`
@@ -100,11 +105,54 @@ deployment.
     ./export.sh meta-llama/Llama-3.2-1B-Instruct
 ```
 
-See [Adanvanced Topics](./ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`.
+See [Advanced Topics](./ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`.
+
+### ⭐ Offline BF16 EAGLE3 Training
+Unlike online EAGLE3 training, offline workflow precomputes target model `hidden_states` and dumps to disk.
+Then only the draft model is called during training. AL is no longer reported during training. After training,
+`export.sh` is used to export EAGLE3 checkpoint.
+
+```sh
+\
+    # Convert to online eagle3 model for base model feature extraction
+    HF_MODEL_CKPT=<pretrained_model_name_or_path> \
+    MLM_MODEL_SAVE=/tmp/Llama-3.2-1B-Eagle3 \
+    MLM_EXTRA_ARGS="--algorithm eagle3" \
+    ./convert.sh meta-llama/Llama-3.2-1B-Instruct
+
+\
+    # Dump base model feature to disk
+    MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Eagle3 \
+    MLM_EXTRA_ARGS="--output-dir /tmp/offline_data" \
+    ./offline_feature_extrach.sh meta-llama/Llama-3.2-1B-Instruct
+
+\
+    # Convert to offline eagle3 model
+    HF_MODEL_CKPT=<pretrained_model_name_or_path> \
+    MLM_MODEL_SAVE=/tmp/Llama-3.2-1B-Eagle3-offline \
+    MLM_EXTRA_ARGS="--algorithm eagle3 --export-offline-model" \
+    ./convert.sh meta-llama/Llama-3.2-1B-Instruct
+
+\
+    # Train the offline eagle3 model using extracted features
+    MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Eagle3-offline \
+    MLM_MODEL_SAVE=/tmp/Llama-3.2-1B-Eagle3-offline \
+    MLM_EXTRA_ARGS="--export-offline-model --offline-distillation-data /tmp/offline_data" \
+    ./finetune.sh meta-llama/Llama-3.2-1B-Instruct
+
+\
+    # Export the trained eagle3 checkpoint
+    PP=1 \
+    HF_MODEL_CKPT=<pretrained_model_name_or_path> \
+    MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Eagle3-offline \
+    EXPORT_DIR=/tmp/Llama-3.2-1B-Eagle3-Export \
+    MLM_EXTRA_ARGS="--export-offline-model" \
+    ./export.sh meta-llama/Llama-3.2-1B-Instruct
+```
 
 ### ⭐ Pruning
 
-Checkout pruning getting started section and guidelines for configuring pruning parameters in the [ModelOpt pruning README](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/pruning).
+Checkout pruning getting started section and guidelines for configuring pruning parameters in the [ModelOpt pruning README](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning).
 
 Pruning is supported for GPT and Mamba models in Pipeline Parallel mode. Available pruning dimensions are:
 
@@ -167,4 +215,8 @@ The saved Megatron-LM distributed checkpoint (output of above scripts) can be re
 ```
 
 ## Advanced Usage
-TBD
+To contribute, please ping [@NVIDIA/post-training](https://github.com/orgs/NVIDIA/teams/post-training) team members. We format the examples with
+```
+uvx black@24.10.0 .
+uvx isort .
+```
diff --git a/examples/post_training/modelopt/conf/arguments.sh b/examples/post_training/modelopt/conf/arguments.sh
index 0193bf8b643..f17b8c9154a 100644
--- a/examples/post_training/modelopt/conf/arguments.sh
+++ b/examples/post_training/modelopt/conf/arguments.sh
@@ -81,7 +81,7 @@ if [ -z ${LAUNCH_SCRIPT} ]; then
     LAUNCH_SCRIPT="torchrun --nproc_per_node=$((ETP * EP * PP * CP * DP))"
 fi
 
-# Install TensorRT Model Optimizer if haven't.
+# Install Model Optimizer if haven't.
 if [ -z ${MLM_SKIP_INSTALL} ]; then
     pip install -r ${SCRIPT_DIR}/requirements.txt
 fi
diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh
new file mode 100644
index 00000000000..c294e03235c
--- /dev/null
+++ b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+    TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+MODEL_ARGS=" \
+    --trust-remote-code \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --moe-token-dispatcher-type allgather \
+    --enable-experimental \
+    --moe-permute-fusion \
+    --use-fused-weighted-squared-relu \
+    --cross-entropy-loss-fusion \
+    --cross-entropy-fusion-impl native \
+    --moe-router-score-function sigmoid \
+    --moe-grouped-gemm \
+    --num-experts 128 \
+    --moe-router-topk 6 \
+    --moe-aux-loss-coeff 1e-4 \
+    --moe-router-topk-scaling-factor 2.5 \
+    --moe-router-enable-expert-bias \
+    --moe-router-dtype fp32 \
+    --moe-router-load-balancing-type seq_aux_loss \
+    --moe-shared-expert-intermediate-size 3712 \
+    \
+    --attention-backend flash \
+    --disable-gloo-process-groups \
+    --is-hybrid-model \
+    --mamba-num-heads 64 \
+    --mamba-head-dim 64 \
+    --hybrid-override-pattern MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME \
+    --use-mcore-models \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.0173 \
+    --position-embedding-type none \
+    --squared-relu \
+    --num-layers 52 \
+    --hidden-size 2688 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --ffn-hidden-size 1856 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    \
+    --tokenizer-type HuggingFaceTokenizer \
+    --bf16 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --export-model-type MambaModel \
+    "
diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh
index d6ba1e1dcc4..a2212483008 100644
--- a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh
+++ b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh
@@ -8,6 +8,7 @@ else
 fi
 
 MODEL_ARGS=" \
+    --trust-remote-code \
     --save-interval 100000 \
     --micro-batch-size 1 \
     --bf16 \
diff --git a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-47B-Reasoning-128K.sh b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-47B-Reasoning-128K.sh
new file mode 100644
index 00000000000..ad07c1061c5
--- /dev/null
+++ b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-47B-Reasoning-128K.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=nvidia/Nemotron-H-47B-Reasoning-128K
+    TOKENIZER_MODEL=nvidia/Nemotron-H-47B-Reasoning-128K
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+MODEL_ARGS=" \
+    --trust-remote-code \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --bf16 \
+    --attention-backend flash \
+    --disable-bias-linear \
+    --untie-embeddings-and-output-weights \
+    --position-embedding-type none \
+    --normalization RMSNorm \
+    --squared-relu \
+    --num-layers 98 \
+    --hidden-size 8192 \
+    --ffn-hidden-size 30720 \
+    --num-attention-heads 64 \
+    --kv-channels 128 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --hybrid-override-pattern M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M- \
+    --is-hybrid-model \
+    --mamba-head-dim 64 \
+    --mamba-num-heads 256 \
+    --mamba-num-groups 8 \
+    --mamba-state-dim 256 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --use-mcore-models \
+    --export-model-type MambaModel \
+"
diff --git a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh
index 4f32fbd63ad..4ba91dbd8c6 100644
--- a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh
+++ b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh
@@ -8,6 +8,7 @@ else
 fi
 
 MODEL_ARGS=" \
+    --trust-remote-code \
     --save-interval 100000 \
     --micro-batch-size 1 \
     --bf16 \
diff --git a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-56B-Base-8K.sh b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-56B-Base-8K.sh
new file mode 100644
index 00000000000..8377f0f11d6
--- /dev/null
+++ b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-56B-Base-8K.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=nvidia/Nemotron-H-56B-Base-8K
+    TOKENIZER_MODEL=nvidia/Nemotron-H-56B-Base-8K
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+MODEL_ARGS=" \
+    --trust-remote-code \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --attention-backend flash \
+    --is-hybrid-model \
+    --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
+    --mamba-state-dim 256 \
+    --tiktoken-pattern v2 \
+    --use-mcore-models \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.0099 \
+    --position-embedding-type none \
+    --squared-relu \
+    --num-layers 118 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --ffn-hidden-size 32768 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --exit-duration-in-mins 230 \
+    --tensor-model-parallel-size 8 \
+    --pipeline-model-parallel-size 1 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --bf16 \
+    --export-model-type MambaModel \
+    "
diff --git a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-8B-Base-8K.sh b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-8B-Base-8K.sh
index bfcb8ee0b02..b04bf76f360 100644
--- a/examples/post_training/modelopt/conf/nvidia/Nemotron-H-8B-Base-8K.sh
+++ b/examples/post_training/modelopt/conf/nvidia/Nemotron-H-8B-Base-8K.sh
@@ -8,6 +8,7 @@ else
 fi
 
 MODEL_ARGS=" \
+    --trust-remote-code \
     --save-interval 100000 \
     --micro-batch-size 1 \
     --bf16 \
diff --git a/examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh b/examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh
index 7ef969b059d..d2c4cda36b2 100644
--- a/examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh
+++ b/examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh
@@ -8,6 +8,7 @@ else
 fi
 
 MODEL_ARGS=" \
+    --trust-remote-code \
     --save-interval 100000 \
     --micro-batch-size 1 \
     --bf16 \
diff --git a/examples/post_training/modelopt/convert.sh b/examples/post_training/modelopt/convert.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/convert_model.py b/examples/post_training/modelopt/convert_model.py
index 20ee59a2fe0..cf5f6e5bbbb 100644
--- a/examples/post_training/modelopt/convert_model.py
+++ b/examples/post_training/modelopt/convert_model.py
@@ -19,7 +19,11 @@
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
 from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
-from megatron.post_training.utils import report_current_memory_info, to_empty_if_meta
+from megatron.post_training.utils import (
+    modelopt_version_at_least,
+    report_current_memory_info,
+    to_empty_if_meta,
+)
 from megatron.training import get_args, get_tokenizer
 from megatron.training.checkpointing import save_checkpoint
 from megatron.training.initialize import initialize_megatron
@@ -50,14 +54,11 @@ def add_convert_args(parser):
         help='Chosing between different speculative decoding algorithms. Default is None.',
     )
     group.add_argument(
-        '--export-num-medusa-heads',
-        type=int,
-        default=0,
-        help='Number of Medusa heads for speculative decoding.',
-    )
-    group.add_argument(
-        "--eagle-config", type=str, default=None, help="EAGLE architecture config. If not given, " \
-        "a default config will be use. If provided, it will overwrite the default config."
+        "--eagle-config",
+        type=str,
+        default=None,
+        help="EAGLE architecture config. If not given, "
+        "a default config will be use. If provided, it will overwrite the default config.",
     )
 
     add_modelopt_args(parser)
@@ -121,7 +122,9 @@ def check_arguments():
             UserWarning,
         )
 
-    model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
+    model = get_model(
+        functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
+    )
     report_current_memory_info()
 
     unwrapped_model = unwrap_model(model)[0]
@@ -130,12 +133,17 @@ def check_arguments():
         import_dtype = torch.float16 if args.fp16 else torch.bfloat16
         unwrapped_model = unwrap_model(model)[0]
         workspace_dir = os.environ.get("MLM_WORK_DIR", "/tmp")
-        print_rank_0("Import model from Hugging Face checkpoint in dtype {}.".format(str(import_dtype)))
+        print_rank_0(
+            "Import model from Hugging Face checkpoint in dtype {}.".format(str(import_dtype))
+        )
+        import_kwargs = {
+            "dtype": import_dtype,
+            "moe_router_dtype": args.moe_router_dtype,
+        }
+        if modelopt_version_at_least("0.41.0"):
+            import_kwargs.update({"trust_remote_code": args.trust_remote_code})
         import_mcore_gpt_from_hf(
-            unwrapped_model,
-            args.pretrained_model_path,
-            workspace_dir,
-            dtype = import_dtype,
+            unwrapped_model, args.pretrained_model_path, workspace_dir, **import_kwargs
         )
     elif args.load is not None:
         _ = load_modelopt_checkpoint(model)
@@ -143,15 +151,10 @@ def check_arguments():
     if args.algorithm in ("eagle1", "eagle3"):
         mtsp_config = ALGO_TO_CONFIG[args.algorithm]
         if args.eagle_config:
-            with open(args.eagle_config)as f:
+            with open(args.eagle_config) as f:
                 eagle_config = json.load(f)
             mtsp_config["config"]["eagle_architecture_config"].update(eagle_config)
-        # Update eagle hidden_size and vocab_size according to the base model
-        mtsp_config["config"]["eagle_architecture_config"]["hidden_size"] = unwrapped_model.config.hidden_size
-        mtsp_config["config"]["eagle_architecture_config"]["vocab_size"] = unwrapped_model.vocab_size
-        if not args.eagle_config or "draft_vocab_size" not in eagle_config:
-            # If draft_vocab_size is not provided, set it to vocab_size
-            mtsp_config["config"]["eagle_architecture_config"]["draft_vocab_size"] = unwrapped_model.vocab_size
+
         if args.export_offline_model:
             mtsp_config["config"]["eagle_offline"] = True
 
@@ -162,12 +165,11 @@ def check_arguments():
             if eagle_module is not None:
                 mcore_eagle_state_dict = torch.load(args.extra_model_path)
                 eagle_module.load_state_dict(mcore_eagle_state_dict, strict=False)
-                
+
     elif args.algorithm == "medusa":
         config = {"medusa_num_heads": args.export_num_medusa_heads, "medusa_num_layers": 1}
         unwrapped_model = mtsp.convert(unwrapped_model, [("medusa", config)])
 
-
     print_rank_0(f"Converted Model:\n {model}")
     torch.distributed.barrier()
 
diff --git a/megatron/post_training/docs/distillation.md b/examples/post_training/modelopt/distillation.md
similarity index 75%
rename from megatron/post_training/docs/distillation.md
rename to examples/post_training/modelopt/distillation.md
index 9f0d5524176..49f73c4edde 100644
--- a/megatron/post_training/docs/distillation.md
+++ b/examples/post_training/modelopt/distillation.md
@@ -1,9 +1,5 @@
 # Megatron-LM ModelOpt Distillation Integration
 
-## Table of Contents
-
-[[_TOC_]]
-
 ## How To
 
 ### Prerequisites
@@ -16,22 +12,22 @@ We require the following pieces of data:
 * Teacher model weights
 * Student model weights (unless starting from scratch)
 * NeMo-format config file for teacher model
-* Distillation run config file
 * Tokenizer
 * Dataset
 
-It also requires the installation of the [NVIDIA Model Optimizer library](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (minimum version 0.15)
+And optionally:
+* Distillation run config file
 
 ### Teacher checkpoint format
 
 We enforce the use of a config yaml in [NeMo](https://github.com/NVIDIA/NeMo) checkpoint-format style to define the arguments to the teacher model.
 The normal command-line arguments go toward constructing the student, thus the values in this file
-override the student arguments before being handed to the teacher constructor. This file must be
-named `model_config.yaml` and be placed in the root of the teacher model checkpoint folder.
-Unlike NeMo-generated checkpoints, Megatron-LM checkpoints do not contain these files by default and must be manually created.
+override the student arguments before being handed to the teacher constructor. This file must be either passed in via
+`--export-kd-teacher-model-config` or be named `model_config.yaml` in the root of the teacher model checkpoint folder.
+Unlike NeMo-generated checkpoints, Megatron-LM checkpoints do not contain this file by default and must be manually created.
 
-> NOTE: Not all keys in the NEMO-style yaml correspond 1:1 to the argument names for Megatron-LM. These
-are converted in `megatron/inference/gpt/model_provider.py`.
+> NOTE: Not all keys in the NeMo-style yaml correspond 1:1 to the argument names for Megatron-LM. These
+are converted in `megatron/post_training/model_builder.py`.
 
 ### Distillation config format
 
@@ -44,29 +40,39 @@ intermediate_layer_pairs:
   - ["decoder.final_layernorm", "decoder.layers.30.input_layernorm"]
 skip_lm_loss: true
 kd_loss_scale: 10.0
+logit_kl_temperature: 1.0
 ```
 
 * `logit_layers` defines the names of the student and teacher submodules, respectively, whose outputs are the logits.
 * `intermediate_layer_pairs` defines the potentially multiple – or zero – pairs of intermediate activation layers to also perform loss on.
-* `skip_lm_loss` decides whether or not to compute and combine the original training LM loss with the KD loss
+* `skip_lm_loss` decides whether or not to compute and combine the original training LM loss with the KD loss.
 * `kd_loss_scale` will scale the KD loss before adding it to the LM loss, if `skip_lm_loss` is `False`.
+* `logit_kl_temperature` is the temperature smoothing factor to multiply the logits by prior to softmax and loss.
+
+Without this configuration file, the default logits-only distillation with scale and temperatures of 1.0 will be performed.
 
 ### Training
 
-Distillation is triggered by calling `pretrain_gpt.py` with the additional following arguments:
+Distillation is triggered by calling `pretrain_gpt.py` or `pretrain_mamba.py` with the following arguments:
 
 ```bash
---kd-teacher-load <path-to-teacher-checkpoint>
---kd-distill-cfg <path-to-distill-config-yaml-file>
+--export-kd-teacher-load <path-to-teacher-checkpoint>
 --export-te-mcore-model
 ```
 
+optionally alongside the additional following arguments:
+
+```bash
+--export-kd-distill-cfg <path-to-distill-config-yaml-file>
+--export-kd-teacher-model-config <path-to-teacher-model-config-file>
+```
+
 > NOTE: If the teacher checkpoint happens to be in a different format from the student's (whose format is specified via `--ckpt-format`), it can
 be distinguished separately using the additional flag `--export-kd-teacher-ckpt-format`.
 
 ## Distillation API and design
 
-Knowledge Distillation is done via the [NVIDIA Model Optimizer library](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+Knowledge Distillation is done via the [NVIDIA Model Optimizer library](https://github.com/NVIDIA/Model-Optimizer).
 
 The model creation step wraps the base model as the student in a
 `modelopt.torch.distill.DistillationModel` wrapper which also contains the teacher model.
@@ -81,8 +87,6 @@ both defined in `modelopt.torch.distill.plugins.megatron`.
 
 * Interleaved Pipeline Parallel is unsupported for Distillation.
 
-* Only Megatron-Core models (not legacy Megatron) are supported for Distillation.
-
 ## Known Issues
 
 * An unknown memory allocation (a few megabytes per microbatch) takes place when the model is converted to a
diff --git a/examples/post_training/modelopt/export.py b/examples/post_training/modelopt/export.py
old mode 100644
new mode 100755
index 8794c4c738c..9dc66eecb6d
--- a/examples/post_training/modelopt/export.py
+++ b/examples/post_training/modelopt/export.py
@@ -5,6 +5,7 @@
 import os
 import sys
 import warnings
+from pathlib import Path
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
 
@@ -14,6 +15,7 @@
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
 from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
+from megatron.post_training.utils import modelopt_version_at_least
 from megatron.training import get_args, get_model
 from megatron.training.initialize import initialize_megatron
 from megatron.training.utils import unwrap_model
@@ -65,27 +67,35 @@ def add_modelopt_export_args(parser):
             UserWarning,
         )
 
-    model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
+    model = get_model(
+        functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
+    )
 
     # Materialize the model from meta device to cpu before loading the checkpoint.
     unwrapped_model = unwrap_model(model)[0]
     unwrapped_model.to_empty(device="cpu")
 
-    if args.load is not None:
+    if args.load is not None and Path(args.load).is_dir():
         _ = load_modelopt_checkpoint(model)
+    else:
+        raise ValueError(f"Invalid load checkpoint directory: {args.load}")
+
 
     # Decide whether we are exporting only the extra_modules (e.g. EAGLE3).
     # Only the last pp stage may have extra_modules, hence broadcast from the last rank.
-    export_extra_modules = hasattr(unwrapped_model, "eagle_module") or hasattr(unwrapped_model, "medusa_heads")
+    export_extra_modules = hasattr(unwrapped_model, "eagle_module") or hasattr(
+        unwrapped_model, "medusa_heads"
+    )
     torch.distributed.broadcast_object_list(
-        [export_extra_modules],
-        src=torch.distributed.get_world_size() - 1,
+        [export_extra_modules], src=torch.distributed.get_world_size() - 1
     )
 
-    mtex.export_mcore_gpt_to_hf(
-        unwrapped_model,
-        args.pretrained_model_name,
-        export_extra_modules=export_extra_modules,
-        dtype=torch.bfloat16,
-        export_dir=args.export_dir,
-    )
+    export_kwargs = {
+        "export_extra_modules": export_extra_modules,
+        "dtype": torch.bfloat16,
+        "export_dir": args.export_dir,
+        "moe_router_dtype": unwrapped_model.config.moe_router_dtype,
+    }
+    if modelopt_version_at_least("0.41.0"):
+        export_kwargs.update({"trust_remote_code": args.trust_remote_code})
+    mtex.export_mcore_gpt_to_hf(unwrapped_model, args.pretrained_model_name, **export_kwargs)
diff --git a/examples/post_training/modelopt/export.sh b/examples/post_training/modelopt/export.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/finetune.py b/examples/post_training/modelopt/finetune.py
index 6489d394392..19ece4ef299 100755
--- a/examples/post_training/modelopt/finetune.py
+++ b/examples/post_training/modelopt/finetune.py
@@ -8,8 +8,6 @@
 from functools import partial
 from typing import Any, Dict, Optional
 
-import jsonlines
-
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
 
 import datasets
@@ -110,13 +108,21 @@ class SFTDataset(torch.utils.data.Dataset):
         "Open-Orca/OpenOrca": "{{ messages['question'] + ' ' + messages['response'] + ' ' }}",
     }
 
+    @classmethod
+    def _wildcard_get(cls, directory: Dict[str, Any], name: str, default_value=None):
+        ret = default_value
+        for key, val in directory.items():
+            if key in name:
+                ret = val
+                break
+        return ret
+
     def __init__(
         self,
         num_packed_samples: int,
-        data_path: Optional[str],
+        hf_dataset: str,
         tokenizer: transformers.PreTrainedTokenizerBase,
         seq_length: int,
-        hf_dataset: Optional[str] = None,
         num_shards: int = 1,
         shard_index: int = 0,
     ):
@@ -129,20 +135,20 @@ def __init__(
         until the packed dataset has sufficient length.
 
         Args:
-            data_path: Path to the json or jsonl file
             num_packed_samples: total number of packed samples (cyclic access)
-            tokenizer: hf tokenizer
+            hf_dataset: Huggingface dataset name or local path
+            tokenizer: Huggingface PreTrainedTokenizer instance
             seq_length: max sequence length
-            hf_dataset: not supported yet
+            num_shards: number of shards for distributed training
+            shard_index: shard index for distributed training
         """
         if not isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
             raise ValueError("SFTDataset only supports transformers.PreTrainedTokenizerBase!")
 
         self.num_packed_samples = num_packed_samples
-        self.data_path = data_path
+        self.hf_dataset = hf_dataset
         self.tokenizer = tokenizer
         self.seq_length = seq_length
-        self.hf_dataset = hf_dataset
         self.data_transformation = lambda data: data
         self.num_shards = num_shards
         self.shard_index = shard_index
@@ -155,42 +161,32 @@ def __init__(
             REMOVE_THINK_CHAT_TEMPLATE, ""
         )
 
-        if data_path is not None:
-            if data_path.endswith(".json"):
-                self._raw_samples = json.load(open(data_path))
-            elif data_path.endswith(".jsonl"):
-                with jsonlines.open(data_path, mode='r') as reader:
-                    self._raw_samples = [obj for obj in reader]
-            else:
-                raise ValueError("data_path must be json or jsonl")
-        elif self.hf_dataset is not None:
-            hf_dataset_kwargs = SFTDataset.hf_dataset_to_kwargs.get(
-                self.hf_dataset, {"split": "train"}
-            )
-            self._raw_samples = datasets.load_dataset(self.hf_dataset, token=os.environ.get("HF_TOKEN", None), **hf_dataset_kwargs)
-            self._raw_samples = self._raw_samples.shard(
-                num_shards=self.num_shards, index=shard_index
-            )
-
-            print(
-                "Rank {:3}/{:3} creates SFT data shard {:3}/{:3} with {:10} raw samples".format(
-                    torch.distributed.get_rank(),
-                    torch.distributed.get_world_size(),
-                    self.shard_index,
-                    self.num_shards,
-                    len(self._raw_samples),
-                ),
-                flush=True,
-            )
+        hf_dataset_kwargs = SFTDataset.hf_dataset_to_kwargs.get(
+            self.hf_dataset, {"split": "train"}
+        )
+        self._raw_samples = datasets.load_dataset(self.hf_dataset, token=os.environ.get("HF_TOKEN", None), **hf_dataset_kwargs)
+        self._raw_samples = self._raw_samples.shard(
+            num_shards=self.num_shards, index=shard_index
+        )
 
-        else:
-            raise ValueError("Either hf_dataset or data_path must be provided!")
+        print(
+            "Rank {:3}/{:3} creates SFT data shard {:3}/{:3} with {:10} raw samples".format(
+                torch.distributed.get_rank(),
+                torch.distributed.get_world_size(),
+                self.shard_index,
+                self.num_shards,
+                len(self._raw_samples),
+            ),
+            flush=True,
+        )
 
         if self.tokenizer.chat_template is None:
             self.tokenizer.chat_template = SFTDataset.hf_dataset_to_prompt_template
         elif self.hf_dataset is not None:
-            self.data_transformation = SFTDataset.hf_dataset_to_conversation.get(
-                self.hf_dataset, lambda data: data
+            self.data_transformation = SFTDataset._wildcard_get(
+                SFTDataset.hf_dataset_to_conversation,
+                self.hf_dataset,
+                default_value=lambda data: data,
             )
 
         if self.tokenizer.chat_template is None:
@@ -361,23 +357,17 @@ def train_valid_test_sft_datasets_provider(train_val_test_num_samples):
         print_rank_0("> finished creating offline SFT datasets ...")
     else:
         kwargs = {
+            "hf_dataset": args.finetune_hf_dataset,
             "tokenizer": tokenizer._tokenizer,
             "seq_length": args.seq_length,
             # Optional kwargs
-            "hf_dataset": args.finetune_hf_dataset,
             "num_shards": mpu.get_expert_data_parallel_world_size(),
             "shard_index": mpu.get_expert_data_parallel_rank(),
         }
 
-        data_path = [
-            args.train_data_path[0] if args.train_data_path else None,
-            args.valid_data_path[0] if args.valid_data_path else None,
-            args.test_data_path[0] if args.test_data_path else None,
-        ]
-
-        train_ds = SFTDataset(train_val_test_num_samples[0], data_path[0], **kwargs)
-        valid_ds = SFTDataset(train_val_test_num_samples[1], data_path[1], **kwargs)
-        test_ds = SFTDataset(train_val_test_num_samples[2], data_path[2], **kwargs)
+        train_ds = SFTDataset(train_val_test_num_samples[0], **kwargs)
+        valid_ds = SFTDataset(train_val_test_num_samples[1], **kwargs)
+        test_ds = SFTDataset(train_val_test_num_samples[2], **kwargs)
 
         print_rank_0("> finished creating SFT datasets ...")
 
diff --git a/examples/post_training/modelopt/finetune.sh b/examples/post_training/modelopt/finetune.sh
index 21493697374..e7ba0f022dc 100755
--- a/examples/post_training/modelopt/finetune.sh
+++ b/examples/post_training/modelopt/finetune.sh
@@ -23,13 +23,18 @@ if [ -z ${MLM_MODEL_SAVE} ]; then
     printf "${MLM_WARNING} Variable ${PURPLE}MLM_MODEL_SAVE${WHITE} is not set (default: ${MLM_MODEL_CKPT})!\n"
 fi
 
+if [ -z ${DATASET} ]; then
+    DATASET="Magpie-Align/Magpie-Llama-3.1-Pro-MT-300K-Filtered"
+    printf "${MLM_WARNING} Variable ${PURPLE}DATASET${WHITE} is not set (default: Magpie-Align/Magpie-Llama-3.1-Pro-MT-300K-Filtered)!\n"
+fi
+
 if [ -z ${MLM_DATA_ARGS} ]; then
     MLM_DATA_ARGS=" \
         --train-samples 128000 \
         --lr-decay-samples 128000 \
         --lr-warmup-samples 0 \
         --split 100,0,0 \
-        --finetune-hf-dataset Magpie-Align/Magpie-Llama-3.1-Pro-MT-300K-Filtered \
+        --finetune-hf-dataset ${DATASET} \
     "
 fi
 
@@ -56,6 +61,7 @@ if [ -z ${MLM_OPTIM_ARGS} ]; then
         --adam-beta1 0.9 \
         --adam-beta2 0.95 \
         --init-method-std 0.010 \
+        --use-distributed-optimizer \
     "
 fi
 
diff --git a/examples/post_training/modelopt/generate.py b/examples/post_training/modelopt/generate.py
index a773ea89f00..63d3f241f59 100644
--- a/examples/post_training/modelopt/generate.py
+++ b/examples/post_training/modelopt/generate.py
@@ -20,6 +20,8 @@
 from megatron.training.utils import print_rank_0, unwrap_model
 from model_provider import model_provider
 
+import modelopt.torch.quantization as mtq
+
 warnings.filterwarnings('once')
 
 
@@ -129,6 +131,12 @@ def get_conversations(example):
     unwrapped_model = unwrap_model(model)[0]
     unwrapped_model.eval()
 
+    # Fold the scalars into weight for speedup.
+    # [TODO]: fold_weight current assumes all weight_quantizer has weight allocated;
+    # however, this is not the case when share_embeddings_and_output_weights is False.
+    if getattr(unwrapped_model, "share_embeddings_and_output_weights", False):
+        mtq.fold_weight(unwrapped_model)
+
     for idx, example in enumerate(dataset):
         if idx > args.fraction * len(dataset):
             break
diff --git a/examples/post_training/modelopt/generate.sh b/examples/post_training/modelopt/generate.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/mmlu.py b/examples/post_training/modelopt/mmlu.py
index 1446afc8392..d475ac9fb30 100644
--- a/examples/post_training/modelopt/mmlu.py
+++ b/examples/post_training/modelopt/mmlu.py
@@ -5,11 +5,14 @@
 import os
 import sys
 import warnings
+import datasets
+import logging
+import torch.distributed as dist
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
 
 import torch
-from datasets import load_dataset
+from diskcache import Cache
 
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
@@ -18,10 +21,13 @@
 from megatron.post_training.utils import report_current_memory_info
 from megatron.training import get_args, get_model, get_tokenizer, initialize_megatron
 from megatron.training.utils import print_rank_0, unwrap_model
+import modelopt.torch.quantization as mtq
 from model_provider import model_provider
 
-warnings.filterwarnings('ignore')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO) # set to debug if you need more logging
 
+warnings.filterwarnings('ignore')
 
 def add_mmlu_args(parser):
     """Add additional arguments for ModelOpt text generation PTQ."""
@@ -30,6 +36,8 @@ def add_mmlu_args(parser):
     group.add_argument("--fraction", type=float, default=1.0, help="Fraction of dataset to use.")
     group.add_argument("--lower-bound", type=float, default=None)
     group.add_argument("--no-subject-prompt", action="store_true", help="Use empty prompt instead of subject-based prompt.")
+    group.add_argument("--mmlu-dataset", type=str, default="cais/mmlu", help="The default dataset to use is cais/mmlu from the HG hub.")
+    group.add_argument("--cache-dir", type=str, default=None)
     add_modelopt_args(parser)
     return parser
 
@@ -134,7 +142,7 @@ def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=F
     )
 
     args = get_args()
-
+    cache = Cache(args.cache_dir)
     # Meta device initialization for ParallelLinear only works if using cpu initialization.
     # Meta device initialization is used such that models can be materialized in low-precision
     # directly when ModelOpt real quant is used. Otherwise, the model is first initialized
@@ -152,6 +160,12 @@ def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=F
     model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
     report_current_memory_info()
 
+    # Materialize the model from meta device to gpu before loading the checkpoint.
+    unwrapped_model = unwrap_model(model)[0]
+    unwrapped_model.eval()
+    unwrapped_model.to_empty(device="cuda")
+    report_current_memory_info()
+
     disable_tqdm = args.disable_tqdm or torch.distributed.get_rank() > 0
 
     tokenizer = get_tokenizer()._tokenizer
@@ -160,29 +174,42 @@ def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=F
         load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
         print_rank_0("Done loading checkpoint")
 
-    unwrapped_model = unwrap_model(model)[0]
-    unwrapped_model.eval()
+    # Fold the scalars into weight for speedup.
+    # [TODO]: fold_weight current assumes all weight_quantizer has weight allocated;
+    # however, this is not the case when share_embeddings_and_output_weights is False.
+    if not getattr(unwrapped_model, "share_embeddings_and_output_weights", False):
+        mtq.fold_weight(unwrapped_model)
 
     all_subjects = get_all_subjects()
 
     all_correct = {}
 
     for subject in all_subjects:
-        test_data = load_dataset("cais/mmlu", subject, split="test")
-        dev_data = load_dataset("cais/mmlu", subject, split="dev")
+        test_data = datasets.load_dataset(args.mmlu_dataset, subject, split="test")
+        dev_data = datasets.load_dataset(args.mmlu_dataset, subject, split="dev")
 
         correct = []
         for idx, test_example in enumerate(test_data):
             if idx > args.fraction * len(test_data):
                 break
-            prompt = generate_prompt(test_example, dev_data, few_shots=0, no_subject_prompt=args.no_subject_prompt)
             label = ["A", "B", "C", "D"][test_example["answer"]]
-            tokens = tokenizer(prompt, return_tensors="pt")
-            with torch.no_grad():
-                generated_ids = simple_generate(
-                    unwrapped_model, tokens.input_ids.cuda(), osl=2, disable_tqdm=disable_tqdm
-                )
-            predict = tokenizer.batch_decode(generated_ids)[0].strip()
+            prompt = generate_prompt(test_example, dev_data, few_shots=0, no_subject_prompt=args.no_subject_prompt)
+            cache_key = f"{args.load}_{subject}_{prompt}" # model name, subject, prompt
+
+            if cache_key in cache:
+                predict = cache[cache_key]
+                if dist.get_rank() == 0:
+                    logger.debug(f"Cache hit for {args.load}_{subject}")
+            else:
+                tokens = tokenizer(prompt, return_tensors="pt")
+                with torch.no_grad():
+                    generated_ids = simple_generate(
+                        unwrapped_model, tokens.input_ids.cuda(), osl=2, disable_tqdm=disable_tqdm
+                    )
+                predict = tokenizer.batch_decode(generated_ids)[0].strip()
+                if torch.distributed.get_rank() == 0:
+                    cache.add(cache_key, predict)
+
             correct += [True] if predict.startswith(label) else [False]
         all_correct[subject] = correct
 
@@ -207,5 +234,5 @@ def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=F
             flush=True,
         )
 
-    if args.lower_bound is not None:
-        assert sum(avg_correct) / len(avg_correct) > args.lower_bound
+        if args.lower_bound is not None:
+            assert sum(avg_correct) / len(avg_correct) > args.lower_bound
diff --git a/examples/post_training/modelopt/mmlu.sh b/examples/post_training/modelopt/mmlu.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/offline_feature_extract.sh b/examples/post_training/modelopt/offline_feature_extract.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/prune.py b/examples/post_training/modelopt/prune.py
index 6a0178a1420..2671b6badd9 100644
--- a/examples/post_training/modelopt/prune.py
+++ b/examples/post_training/modelopt/prune.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
-"""Example script for pruning a GPT / Mamba model using TensorRT Model Optimizer (ModelOpt).
+"""Example script for pruning a GPT / Mamba model using Model Optimizer (ModelOpt).
 
-Read more about ModelOpt pruning at https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/pruning
+Read more about ModelOpt pruning at https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning
 """
 
 import functools
@@ -20,12 +20,18 @@
 from modelopt.torch.export import import_mcore_gpt_from_hf
 from modelopt.torch.prune.plugins.mcore_minitron import SUPPORTED_HPARAMS
 
-from megatron.core.parallel_state import get_pipeline_model_parallel_group, get_tensor_model_parallel_group
+from megatron.core.parallel_state import (
+    get_pipeline_model_parallel_group,
+    get_tensor_model_parallel_group,
+)
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
 from megatron.post_training.generate import simple_generate
 from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
-from megatron.post_training.utils import report_current_memory_info
+from megatron.post_training.utils import (
+    modelopt_version_at_least,
+    report_current_memory_info,
+)
 from megatron.training import get_args, get_model, get_tokenizer, initialize_megatron
 from megatron.training.checkpointing import save_checkpoint
 from megatron.training.utils import print_rank_0, unwrap_model
@@ -38,10 +44,7 @@ def add_prune_args(parser):
     """Add additional arguments for ModelOpt pruning."""
     group = parser.add_argument_group(title="ModelOpt pruning")
     group.add_argument(
-        "--calib-size",
-        type=int,
-        default=1024,
-        help="Samples to use for pruning calibration.",
+        "--calib-size", type=int, default=1024, help="Samples to use for pruning calibration."
     )
     group.add_argument(
         "--prompts",
@@ -56,21 +59,14 @@ def add_prune_args(parser):
         help="Reference texts. Please use | to separate different batches.",
     )
     group.add_argument(
-        "--pretrained-model-path",
-        type=str,
-        default=None,
-        help="HuggingFace pretrained model",
+        "--pretrained-model-path", type=str, default=None, help="HuggingFace pretrained model"
     )
     # Pruning parameters
     group.add_argument(
-        "--target-ffn-hidden-size",
-        type=int,
-        help="Prune MLP FFN hidden size to this value",
+        "--target-ffn-hidden-size", type=int, help="Prune MLP FFN hidden size to this value"
     )
     group.add_argument(
-        "--target-hidden-size",
-        type=int,
-        help="Prune hidden size (embedding dim) to this value",
+        "--target-hidden-size", type=int, help="Prune hidden size (embedding dim) to this value"
     )
     group.add_argument(
         "--target-num-attention-heads",
@@ -93,14 +89,10 @@ def add_prune_args(parser):
         help="Prune dimension of Mamba attention heads to this value",
     )
     group.add_argument(
-        "--target-num-moe-experts",
-        type=int,
-        help="Prune number of MoE experts to this value",
+        "--target-num-moe-experts", type=int, help="Prune number of MoE experts to this value"
     )
     group.add_argument(
-        "--target-moe-ffn-hidden-size",
-        type=int,
-        help="Prune MoE FFN hidden size to this value",
+        "--target-moe-ffn-hidden-size", type=int, help="Prune MoE FFN hidden size to this value"
     )
     group.add_argument(
         "--target-moe-shared-expert-intermediate-size",
@@ -169,7 +161,9 @@ def get_params(model):
     check_arguments(args)
 
     tokenizer = get_tokenizer()._tokenizer
-    model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
+    model = get_model(
+        functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
+    )
     unwrapped_model = unwrap_model(model)[0]
 
     report_current_memory_info()
@@ -181,11 +175,11 @@ def get_params(model):
     if args.pretrained_model_path is not None:
         import_dtype = torch.float16 if args.fp16 else torch.bfloat16
         workspace_dir = os.environ.get("MLM_WORK_DIR", "/tmp")
+        import_kwargs = {"dtype": import_dtype}
+        if modelopt_version_at_least("0.41.0"):
+            import_kwargs.update({"trust_remote_code": args.trust_remote_code})
         import_mcore_gpt_from_hf(
-            unwrapped_model,
-            args.pretrained_model_path,
-            workspace_dir,
-            dtype=import_dtype,
+            unwrapped_model, args.pretrained_model_path, workspace_dir, **import_kwargs
         )
 
     def _custom_prompt_forward_loop_func(model):
@@ -211,7 +205,9 @@ def _hf_dataset_forword_loop_func(model):
             simple_generate(model, tokens.input_ids.cuda(), osl=1)
 
     if args.layers_to_drop:
-        mtp.mcore_minitron.drop_mcore_language_model_layers(model, layers_to_drop=args.layers_to_drop)
+        mtp.mcore_minitron.drop_mcore_language_model_layers(
+            model, layers_to_drop=args.layers_to_drop
+        )
     else:
         print_rank_0("Pruning model...")
         export_config = {
diff --git a/examples/post_training/modelopt/quantize.py b/examples/post_training/modelopt/quantize.py
index 737aed68b6a..ceedce606a5 100644
--- a/examples/post_training/modelopt/quantize.py
+++ b/examples/post_training/modelopt/quantize.py
@@ -2,26 +2,49 @@
 
 """Sample Generate GPT."""
 
+import copy
 import functools
 import os
 import sys
 import warnings
 
 import torch
+import torch.distributed
 from datasets import load_dataset
 from tqdm import tqdm
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
 
 import modelopt.torch.quantization as mtq
+
+try:
+    import modelopt.torch.quantization.plugins.psx_formats as mtq_psx
+except ImportError:
+    mtq_psx = None
+    warnings.warn(
+        "psx_formats is not installed. PSX formats quantization configs will not be available."
+    )
+
+try:
+    import modelopt.torch.quantization.plugins.luts as mtq_luts
+except ImportError:
+    mtq_luts = None
+    warnings.warn("luts is not installed. LUTs quantization configs will not be available.")
+
+
 from modelopt.torch.export import import_mcore_gpt_from_hf
 
+from megatron.core import parallel_state
 from megatron.core.transformer.moe.router import TopKRouter
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
 from megatron.post_training.generate import simple_generate
 from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
-from megatron.post_training.utils import report_current_memory_info
+from megatron.post_training.utils import (
+    modelopt_version_at_least,
+    print_distributed_quant_summary,
+    report_current_memory_info,
+)
 from megatron.training import get_args, get_model, get_tokenizer, initialize_megatron
 from megatron.training.checkpointing import save_checkpoint
 from megatron.training.utils import print_rank_0, unwrap_model
@@ -29,16 +52,27 @@
 
 warnings.filterwarnings("ignore")
 
+QUANT_CFG_CHOICES = {}
 
-QUANT_CFG_CHOICES = {
-    "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
-    "fp8": mtq.FP8_DEFAULT_CFG,
-    "fp8_blockwise": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
-    "int4_awq": mtq.INT4_AWQ_CFG,
-    "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
-    "nvfp4": mtq.NVFP4_DEFAULT_CFG,
+# Auto-load all quant configs by full name
+for k in mtq.config.choices:
+    QUANT_CFG_CHOICES[k] = getattr(mtq, k)
+
+KV_QUANT_CFG_CHOICES = {
+    "none": "none",
+    "fp8": "FP8_KV_CFG",
+    "fp8_affine": "FP8_AFFINE_KV_CFG",
+    "nvfp4": "NVFP4_KV_CFG",
+    "nvfp4_affine": "NVFP4_AFFINE_KV_CFG",
+    "nvfp4_rotate": "NVFP4_KV_ROTATE_CFG",
 }
 
+if mtq_psx is not None:
+    QUANT_CFG_CHOICES.update({k: getattr(mtq_psx, k) for k in mtq_psx.choices})
+
+if mtq_luts is not None:
+    QUANT_CFG_CHOICES.update({k: getattr(mtq_luts, k) for k in mtq_luts.choices})
+
 
 def add_text_generate_ptq_args(parser):
     """Add additional arguments for ModelOpt text generation PTQ."""
@@ -46,6 +80,12 @@ def add_text_generate_ptq_args(parser):
     group.add_argument(
         "--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
     )
+    group.add_argument(
+        "--calib-dataset",
+        type=str,
+        default="abisee/cnn_dailymail",
+        help="The default clibration dataset is cnn_dailymail from HF hub.",
+    )
     group.add_argument(
         "--prompts",
         type=str,
@@ -61,26 +101,30 @@ def add_text_generate_ptq_args(parser):
     group.add_argument(
         "--pretrained-model-path", type=str, default=None, help="HuggingFace pretrained model"
     )
-    group.add_argument(
-        "--compress",
-        action="store_true",
-        help="Enable real low-bit quantization.",
-    )
+    group.add_argument("--compress", action="store_true", help="Enable real low-bit quantization.")
     group.add_argument(
         "--disable-qkv-quant",
         action="store_true",
         help="Disable q, k, v linear from being quantized.",
     )
-    group.add_argument(
-        "--weight-only",
-        action="store_true",
-        help="Disable input quantization.",
-    )
+    group.add_argument("--weight-only", action="store_true", help="Disable input quantization.")
     group.add_argument(
         "--force-all-expert-routing",
         action="store_true",
         help="Forcing all experts to be routed during the calibration.",
     )
+    group.add_argument(
+        "--num-first-layers-to-skip-quant",
+        type=int,
+        default=None,
+        help="Number of first layers to skip quantization.",
+    )
+    group.add_argument(
+        "--num-last-layers-to-skip-quant",
+        type=int,
+        default=None,
+        help="Number of last layers to skip quantization.",
+    )
     add_modelopt_args(parser)
     return parser
 
@@ -97,6 +141,62 @@ def check_arguments():
         args.moe_grouped_gemm = False
 
 
+def _is_first_layers(name: str, num_layers: int = 1, num_layers_to_disable: int = 1) -> bool:
+    if "layers." not in name:
+        return False
+    try:
+        layer_idx = int(name.split("layers.")[-1].split(".")[0])
+    except ValueError:
+        return False
+    return layer_idx < num_layers_to_disable
+
+
+def _is_last_layers(name: str, num_layers: int = 1, num_layers_to_disable: int = 1) -> bool:
+    if "layers." not in name:
+        return False
+    try:
+        layer_idx = int(name.split("layers.")[-1].split(".")[0])
+    except ValueError:
+        return False
+    return layer_idx >= num_layers - num_layers_to_disable
+
+
+def get_first_layers_disabled_config(config, num_layers: int = 1, num_layers_to_disable: int = 1):
+    """Get a config for `mtq.quantize` with first & last `num_layers_to_disable` layers disabled.
+
+    The layers to disable are the first & last `num_layers_to_disable` layers.
+    """
+    config = copy.deepcopy(config)
+    quant_cfg = config.get("quant_cfg", {})
+    quant_cfg.update(
+        {
+            functools.partial(
+                _is_first_layers, num_layers=num_layers, num_layers_to_disable=num_layers_to_disable
+            ): {"enable": False}
+        }
+    )
+    config["quant_cfg"] = quant_cfg
+    return config
+
+
+def get_last_layers_disabled_config(config, num_layers: int = 1, num_layers_to_disable: int = 1):
+    """Get a config for `mtq.quantize` with last `num_layers_to_disable` layers disabled.
+
+    The layers to disable are the last `num_layers_to_disable` layers.
+    """
+    config = copy.deepcopy(config)
+    quant_cfg = config.get("quant_cfg", {})
+    quant_cfg.update(
+        {
+            functools.partial(
+                _is_last_layers, num_layers=num_layers, num_layers_to_disable=num_layers_to_disable
+            ): {"enable": False}
+        }
+    )
+    config["quant_cfg"] = quant_cfg
+    return config
+
+
 def get_modelopt_torch_quantization_config():
     """Return a quantization config."""
     args = get_args()
@@ -108,8 +208,6 @@ def get_modelopt_torch_quantization_config():
         "axis": None,
         "enable": True,
     }
-    # Disable mamba-mixer quantization for now.
-    mtq_config["quant_cfg"]["*mixer.*"] = {"enable": False}
     if args.export_quant_cfg == "fp8":
         # Enable Medusa heads and kv-cache quantization
         mtq_config["quant_cfg"]["*medusa_heads**"] = fp8_config
@@ -125,17 +223,38 @@ def get_modelopt_torch_quantization_config():
     # Customization
     if args.disable_qkv_quant:
         mtq_config["quant_cfg"]["*self_attention*"] = {"enable": False}
-    if args.export_kv_cache_quant and not args.compress:
-        mtq_config["quant_cfg"]["*linear_qkv.output_quantizer"] = fp8_config
+
+    # KV Cache Quantization
+    enable_quant_kv_cache = args.export_kv_cache_quant != "none"
+    if enable_quant_kv_cache and not args.compress:
+        kv_cache_quant_cfg = getattr(mtq, KV_QUANT_CFG_CHOICES[args.export_kv_cache_quant])[
+            "quant_cfg"
+        ]
+        mtq_config = mtq.utils.update_quant_cfg_with_kv_cache_quant(mtq_config, kv_cache_quant_cfg)
+
+    # Weight Only Quantization
     if args.weight_only:
         mtq_config["quant_cfg"]["*input_quantizer"] = {"enable": False}
+    if args.num_first_layers_to_skip_quant is not None:
+        mtq_config = get_first_layers_disabled_config(
+            mtq_config,
+            num_layers=args.num_layers,
+            num_layers_to_disable=args.num_first_layers_to_skip_quant,
+        )
+    if args.num_last_layers_to_skip_quant is not None:
+        mtq_config = get_last_layers_disabled_config(
+            mtq_config,
+            num_layers=args.num_layers,
+            num_layers_to_disable=args.num_last_layers_to_skip_quant,
+        )
 
     return mtq_config
 
 
 def get_calib_dataloader(calib_size=512, max_sequence_length=512):
     """Return a dataloader for calibration."""
-    dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+    args = get_args()
+    dataset = load_dataset(args.calib_dataset, name="3.0.0", split="train")
     text_column = "article"
 
     calib_size = min(len(dataset), calib_size)
@@ -158,7 +277,9 @@ def get_calib_dataloader(calib_size=512, max_sequence_length=512):
     args = get_args()
 
     tokenizer = get_tokenizer()._tokenizer
-    model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
+    model = get_model(
+        functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
+    )
 
     report_current_memory_info()
 
@@ -168,14 +289,15 @@ def get_calib_dataloader(calib_size=512, max_sequence_length=512):
 
     if args.pretrained_model_path is not None:
         from modelopt.torch.export import import_mcore_gpt_from_hf
+
         import_dtype = torch.float16 if args.fp16 else torch.bfloat16
         unwrapped_model = unwrap_model(model)[0]
         workspace_dir = os.environ.get("MLM_WORK_DIR", "/tmp")
+        import_kwargs = {"dtype": import_dtype}
+        if modelopt_version_at_least("0.41.0"):
+            import_kwargs.update({"trust_remote_code": args.trust_remote_code})
         import_mcore_gpt_from_hf(
-            unwrapped_model,
-            args.pretrained_model_path,
-            workspace_dir,
-            dtype=import_dtype,
+            unwrapped_model, args.pretrained_model_path, workspace_dir, **import_kwargs
         )
 
     def _custom_prompt_forward_loop_func(model):
@@ -196,23 +318,20 @@ def _custom_prompt_forward_loop_func(model):
     def _hf_dataset_forword_loop_func(model):
         dataloader = get_calib_dataloader(args.calib_size)
 
-        if args.force_all_expert_routing:
-            for name, module in model.named_modules():
-                if isinstance(module, TopKRouter):
-                    module.topk = module.num_experts
-
         for prompt in tqdm(dataloader, total=args.calib_size, disable=torch.distributed.get_rank()):
             tokens = tokenizer(prompt, return_tensors="pt")
             generated_ids = simple_generate(model, tokens.input_ids.cuda(), osl=1)
 
-            if args.force_all_expert_routing:
-                for name, module in model.named_modules():
-                    if isinstance(module, TopKRouter):
-                        module.topk = module.config.moe_router_topk
-
     unwrapped_model = unwrap_model(model)[0]
 
-    if args.export_quant_cfg in QUANT_CFG_CHOICES:
+    if args.force_all_expert_routing:
+        warnings.warn(
+            "--force-all-expert-routing will be deprecated in the next release and is no longer needed."
+        )
+
+    if args.export_quant_cfg is not None:
+        if args.export_quant_cfg not in QUANT_CFG_CHOICES:
+            raise ValueError(f"Unsupported quantization config {args.export_quant_cfg}.")
         print_rank_0("Quantizing the model...")
         mtq_config = get_modelopt_torch_quantization_config()
         ptq_forward_loop_func = _hf_dataset_forword_loop_func
@@ -230,19 +349,9 @@ def _hf_dataset_forword_loop_func(model):
             mtq.compress(unwrapped_model)
             print_rank_0("Weights are now compressed to low-bit!")
 
-    print_rank_0(f"Fake Quantized Model:\n {unwrapped_model}")
-
-    if torch.distributed.get_rank() == 0:
-        for k, v in unwrapped_model.state_dict().items():
-            if "amax" not in k and "_scale" not in k:
-                continue
-            if isinstance(v, torch.Tensor):
-                v_amax = torch.max(torch.abs(v.clone().detach().to(torch.bfloat16)))
-                print("{:80} {:32} {:32} max {:.4e}".format(k, str(v.dtype), str(v.shape), v_amax))
-            else:
-                print("{:80}".format(k))
+        print_distributed_quant_summary(model, "Quantized Model:")
 
     _custom_prompt_forward_loop_func(unwrapped_model)
 
-    if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
+    if args.save is not None:
         save_checkpoint(1, model, None, None, 0, release=True)
diff --git a/examples/post_training/modelopt/quantize.sh b/examples/post_training/modelopt/quantize.sh
old mode 100644
new mode 100755
diff --git a/examples/post_training/modelopt/requirements.txt b/examples/post_training/modelopt/requirements.txt
index dd1f47ef6c8..3c763e01cc1 100644
--- a/examples/post_training/modelopt/requirements.txt
+++ b/examples/post_training/modelopt/requirements.txt
@@ -1,9 +1,6 @@
+diskcache
 datasets
-jsonlines
 nvidia-modelopt
 omegaconf
-pulp
 tensorstore!=0.1.46,!=0.1.72
-torchprofile
 transformers
-zarr
diff --git a/examples/post_training/modelopt/speculative.md b/examples/post_training/modelopt/speculative.md
index 16a50511b94..6ea9dea9478 100755
--- a/examples/post_training/modelopt/speculative.md
+++ b/examples/post_training/modelopt/speculative.md
@@ -4,7 +4,7 @@
 
 </div>
 
-[Medusa](https://arxiv.org/abs/2401.10774) and [EAGLE](https://arxiv.org/pdf/2401.15077) 
+[Medusa](https://arxiv.org/abs/2401.10774) and [EAGLE](https://arxiv.org/pdf/2401.15077)
 training and model export are supported (fast decoding is supported through TensorRT-LLM).
 
 Medusa head top-1 accuracy is reported per step (**NOTE:** the accuracy here does not
@@ -42,8 +42,8 @@ and acceptance rate (AR).
 
 For simplicity and efficiency, we use `vllm serve --quantization modelopt` to host an quantized
 endpoint and we feed multi-turn conversation data to synthesize the assistant output.
-See ModelOpt's example (https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/speculative_decoding)
-for more details. The final output is stored as jsonlines in an OpenAI chat completion format.
+See ModelOpt's example (https://github.com/NVIDIA/Model-Optimizer/tree/main/speculative_decoding)
+for more details. The final output is stored as `jsonlines` in an OpenAI chat completion format.
 
 
 ### Quantization-Aware Training (QAT)
@@ -66,7 +66,7 @@ python examples/post_training/modelopt/finetune.py \
 
 ### Export Checkpoint
 
-Last, we export the Medusa heads or EAGLE module so that it can be deployed on runtime framework (i.e., TensorRT-LLM). 
+Last, we export the Medusa heads or EAGLE module so that it can be deployed on runtime framework (i.e., TensorRT-LLM).
 
 ```sh
 python examples/post_training/modelopt/export.py \
diff --git a/examples/post_training/modelopt/validate.sh b/examples/post_training/modelopt/validate.sh
old mode 100644
new mode 100755
diff --git a/examples/retro/README.md b/examples/retro/README.md
deleted file mode 100644
index f78bcdeb56b..00000000000
--- a/examples/retro/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# RETRO MODEL
-
-## Table of contents
-- [1. Training Setup](#1-training-setup)
-- [2. Data Preprocessing](#2-data-preprocessing)
-- [3. Configurations](#3-configurations)
-
-## 1. Training setup
-<a id="markdown-training-setup" name="training-setup"></a>
-
-To run the model using a docker container run it as follows
-```
-PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
-CHECKPOINT_PATH="" #<Specify path>
-TENSORBOARD_LOGS_PATH=""#<Specify path>
-
-docker run \
-  --gpus=all \
-  --ipc=host \
-  --workdir /workspace/megatron-lm \
-  -v /path/to/data:/path/to/data \
-  -v /path/to/megatron-lm:/workspace/megatron-lm \
-  megatron-lm nvcr.io/nvidia/pytorch:23.09-py3 \
-  bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH"
-
-```
-NOTE: Depending on the environment you are running it the above command might look slightly different.
-
-NOTE: Due to how Retro preprocess and caches elements of the pretraining dataset before training begins, some arguments are auto-loaded from the Retro preprocessing configuration. These loaded arguments include:
-
-- `--data-path`
-- `--data-cache-path`
-- `--eval-interval`
-- `--eval-iters`
-- `--global-batch-size`
-- `--tokenizer-type`
-- `--tokenizer-model`
-- `--vocab-file`
-- `--merge-file`
-- `--seed`
-- `--seq-length`
-- `--train-samples`
-
-
-## 2. Data Preprocessing
-<a id="markdown-data-preprocessing" name="data-preprocessing"></a>
-
-Retro preprocesses and caches data prior to pretraining, to greatly speed up pretraining. During data preprocessing, the retrieval database is built, and neighbor IDs are queried for each sample within the pretraining dataset. Please see `preprocess_data.sh` for an example script to preprocess data for Retro. The reference documentation for data preprocessing can be found [here](tools/retro/README.md).
-
-
-## 3. Configurations
-<a id="markdown-configurations" name="configurations"></a>
-The example in this folder shows you how to run a 2B model. Below are a few other example configurations.
-
-### 857M
-```
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --seq-length 2048 \
-       --tensor-model-parallel-size 1 \
-       --pipeline-model-parallel-size 1 \
-
-```
-
-### 4B
-```
-       --num-layers 48 \
-       --hidden-size 2560 \
-       --num-attention-heads 32 \
-       --tensor-model-parallel-size 1 \
-       --pipeline-model-parallel-size 1 \
-
-```
diff --git a/examples/retro/preprocess_data.sh b/examples/retro/preprocess_data.sh
deleted file mode 100644
index 5d2e66ba0e7..00000000000
--- a/examples/retro/preprocess_data.sh
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/bin/bash
-
-set -u
-
-unset NCCL_DEBUG
-
-######## Megatron, Retro dirs. ########
-
-REPO_DIR="<path/to/megatron/repo>"
-RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
-
-######## Task (e.g., db, index, query). ########
-
-# This script takes a single argument, which specifies the retro task to be
-# performed. The available tasks are: db-build, index-train, index-add, and
-# query-neighbors.
-
-# ~~ Examples ~~
-# RETRO_TASKS="db-build"          # Build the retrieval database
-# RETRO_TASKS="index-train"       # Train the index
-# RETRO_TASKS="index-add"         # Add data to the index
-# RETRO_TASKS="query-neighbors"   # Perform query pretraining for neighbors
-
-# You can also provide the task as a command-line argument when executing the
-# script. Example: ./preprocess_data.sh index-add
-RETRO_TASKS=$1
-
-######## Data. ########
-DATA_BLEND="<see --data-path in arguments.py>"
-
-######## Index. ########
-
-RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32"
-RETRO_INDEX_NTRAIN=66625331
-RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97
-RETRO_INDEX_ADD_LOAD_FRACTION=0.95
-
-######## GPT. ########
-
-RETRO_GPT_SEED=1234
-RETRO_GPT_SPLIT="98,2,0"
-RETRO_GPT_DATA_PATH=${DATA_BLEND}
-RETRO_GPT_TRAIN_SAMPLES=200000
-RETRO_GPT_EVAL_INTERVAL=2000
-RETRO_GPT_EVAL_ITERS=50
-RETRO_GPT_LR_DECAY_SAMPLES=175000
-RETRO_GPT_LR_WARMUP_SAMPLES=10000
-RETRO_GPT_SEQ_LENGTH=2048
-RETRO_GPT_GLOBAL_BATCH_SIZE=256
-RETRO_GPT_CHUNK_LENGTH=64
-
-######## Query. ########
-
-RETRO_QUERY_NUM_NEIGHBORS_QUERY=200
-RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
-RETRO_QUERY_EF_SEARCH=32
-RETRO_QUERY_NPROBE=4096
-
-######## Args. ########
-
-ARGS=" \
-    --distributed-timeout-minutes 600 \
-    --tensor-model-parallel-size 1 \
-    --pipeline-model-parallel-size 1 \
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --micro-batch-size 1 \
-    --global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --load ${RETRO_PROJECT_DIR}/checkpoints/bert \
-    --exit-on-missing-checkpoint \
-    --no-load-optim \
-    --data-path [null] \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \
-    --split ${RETRO_GPT_SPLIT} \
-    --distributed-backend nccl \
-    --lr 0.0001 \
-    --lr-decay-style linear \
-    --min-lr 1.0e-5 \
-    --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
-    --lr-decay-samples ${RETRO_GPT_LR_DECAY_SAMPLES} \
-    --lr-warmup-samples ${RETRO_GPT_LR_WARMUP_SAMPLES} \
-    --weight-decay 1e-2 \
-    --clip-grad 1.0 \
-    --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
-    --eval-iters ${RETRO_GPT_EVAL_ITERS} \
-    --bf16 \
-    --no-data-sharding \
-    --no-gradient-accumulation-fusion \
-    --no-async-tensor-model-parallel-allreduce \
-    --bert-embedder-type megatron \
-    --output-bert-embeddings \
-    \
-    --retro-project-dir ${RETRO_PROJECT_DIR} \
-    --retro-tasks ${RETRO_TASKS} \
-    --retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \
-    --retro-bert-tokenizer-type BertWordPieceLowerCase \
-    \
-    --retro-gpt-seed ${RETRO_GPT_SEED} \
-    --retro-gpt-tokenizer-type GPTSentencePieceTokenizer \
-    --retro-gpt-tokenizer-model /path/to/tokenizer/model \
-    --retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
-    --retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
-    --retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
-    --retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
-    --retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \
-    --retro-gpt-split ${RETRO_GPT_SPLIT} \
-    --retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \
-    --retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
-    \
-    --retro-index-str ${RETRO_INDEX_STR} \
-    --retro-index-ntrain ${RETRO_INDEX_NTRAIN} \
-    --retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \
-    --retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \
-    --no-retro-index-delete-training-embeddings \
-    --no-retro-index-delete-added-codes \
-    \
-    --retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \
-    --retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \
-    --retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \
-    --retro-query-nprobe ${RETRO_QUERY_NPROBE} \
-"
-
-######## Command. ########
-
-NPROCS=8 # Number of GPUs.
-CMD="\
-    cd ${REPO_DIR} && pwd && \
-    export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \
-    python -m torch.distributed.run \
-    --nproc_per_node ${NPROCS} \
-    --nnodes 1 \
-    --node_rank ${NODE_RANK} \
-    --master_addr ${MASTER_ADDR} \
-    --master_port 6000 \
-    tools/retro/preprocess_data.py ${ARGS} \
-"
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "CMD = '$CMD'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-eval $CMD
diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh
deleted file mode 100644
index c8276b56f43..00000000000
--- a/examples/retro/train_retro_2b_distributed.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-
-# Runs the "307M" parameter Retro model.
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NUM_NODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
-
-CHECKPOINT_PATH=$1 #<Specify path>
-TENSORBOARD_LOGS_PATH=$2 #<Specify path>
-
-DISTRIBUTED_ARGS=(
-    --nproc_per_node $GPUS_PER_NODE 
-    --nnodes $NUM_NODES 
-    --master_addr $MASTER_ADDR 
-    --master_port $MASTER_PORT
-)
-
-######## GPT or Retro? ########
-
-# 0 : GPT.
-# 1 : Retro
-
-ADD_RETRIEVER=1
-
-######## Megatron, Retro dirs. ########
-
-RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
-
-######## Model, training args. ########
-
-# ** Note: --seq-length auto loaded from Retro project dir.
-RETRO_MODEL_ARGS=(
-    --num-layers 32
-    --hidden-size 2048
-    --num-attention-heads 32
-)
-
-# ** Note: --data-path, --tokenizer-type, and --tokenizer-model auto loaded from Retro project dir.
-DATA_ARGS=(
-    --split 98,2,0
-)
-
-MODEL_PARALLEL_ARGS=(
-    --tensor-model-parallel-size 8
-    --pipeline-model-parallel-size 1 
-)
-
-# ** Note: --eval-interval, --eval-iters auto loaded from Retro project dir.
-EVAL_AND_LOGGING_ARGS=(
-    --log-interval 100
-    --save-interval 10000 
-    --eval-interval 1000 
-    --save $CHECKPOINT_PATH 
-    --load $CHECKPOINT_PATH 
-    --eval-iters 10
-    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
-)
-
-TRAINING_ARGS=" \
-    --retro-project-dir ${RETRO_PROJECT_DIR} \
-    --transformer-impl transformer_engine \
-    --num-workers 8 \
-    --micro-batch-size 4 \
-    --lr-decay-samples 166400000 \
-    --lr-warmup-samples 162761 \
-    --lr 6.0e-4 \
-    --min-lr 6.0e-5 \
-    --lr-decay-style cosine \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.023 \
-    --log-params-norm \
-    --log-num-zeros-in-grad \
-    --bf16 \
-    --no-data-sharding \
-"
-
-if [ "$ADD_RETRIEVER" = "1" ]; then
-    TRAINING_ARGS+=" --retro-add-retriever"
-fi
-
-######## Command. ########
-
-torchrun ${DISTRIBUTED_ARGS[@]} pretrain_retro.py \
-    ${RETRO_MODEL_ARGS[@]} \
-    ${TRAINING_ARGS} \
-    ${MODEL_PARALLEL_ARGS[@]} \
-    ${DATA_ARGS[@]} \
-    ${EVAL_AND_LOGGING_ARGS[@]}
diff --git a/examples/rl/environment_configs/gsm8k.yaml b/examples/rl/environment_configs/gsm8k.yaml
index ae0a319d9df..dc0f34dd4ca 100644
--- a/examples/rl/environment_configs/gsm8k.yaml
+++ b/examples/rl/environment_configs/gsm8k.yaml
@@ -1,5 +1,6 @@
 - agent_type: examples.rl.environments.math.gsm8k_agent.GSM8KAgent
   agent_args:
     answer_format: "boxed"
+    format_reward: 0.5
   weight: 1.0
   evaluation_only: false
diff --git a/examples/rl/environment_configs/gsm8k_nanov3.yaml b/examples/rl/environment_configs/gsm8k_nanov3.yaml
new file mode 100644
index 00000000000..30403ed052b
--- /dev/null
+++ b/examples/rl/environment_configs/gsm8k_nanov3.yaml
@@ -0,0 +1,10 @@
+- agent_type: examples.rl.environments.math.gsm8k_agent.GSM8KAgent
+  agent_args:
+    answer_format: "boxed"
+    format_reward: 0.5
+    assistant_suffix: "Assistant: "
+    chat_mode: true
+    negative_reward: 0.0
+    partial_end_reward: 0.75
+  weight: 1.0
+  evaluation_only: false
diff --git a/examples/rl/environment_configs/openmathinstructv2.yaml b/examples/rl/environment_configs/openmathinstructv2.yaml
new file mode 100644
index 00000000000..7685d224575
--- /dev/null
+++ b/examples/rl/environment_configs/openmathinstructv2.yaml
@@ -0,0 +1,3 @@
+- agent_type: examples.rl.environments.math.openmath_agent.OpenMathInstructAgent
+  agent_args: {}
+  weight: 1.0
diff --git a/examples/rl/environments/math/gsm8k_agent.py b/examples/rl/environments/math/gsm8k_agent.py
index 348ba655dbb..3bb39bc09f9 100644
--- a/examples/rl/environments/math/gsm8k_agent.py
+++ b/examples/rl/environments/math/gsm8k_agent.py
@@ -23,8 +23,23 @@
 
 
 class GSM8KAgent(MathAgent):
-    def __init__(self, answer_format: str = "boxed", format_reward: float = 0.0, **kwargs):
-        super().__init__(format_reward=format_reward, answer_format=answer_format, **kwargs)
+    def __init__(self,
+        answer_format: str = "boxed",
+        chat_mode: bool = False,
+        assistant_suffix: str = "Assistant: Let me solve this step by step.\n<think>",
+        format_reward: float = 0.0,
+        negative_reward: float = 0.0,
+        partial_end_reward: float = 0.0,
+        **kwargs):
+        super().__init__(
+            answer_format=answer_format,
+            chat_mode=chat_mode,
+            assistant_suffix=assistant_suffix,
+            format_reward=format_reward,
+            negative_reward=negative_reward,
+            partial_end_reward=partial_end_reward,
+            **kwargs
+        )
         self.env_id: str = "gsm8k"
 
     def reformat_datum(self, datum: dict) -> dict:
diff --git a/examples/rl/environments/math/math_agent.py b/examples/rl/environments/math/math_agent.py
index d63e3f25623..67feb3b4adb 100644
--- a/examples/rl/environments/math/math_agent.py
+++ b/examples/rl/environments/math/math_agent.py
@@ -21,15 +21,38 @@
     MATHVERIFY_AVAILABLE
 ), "math_verify is not installed but now required. Install it using `pip install math-verify` to continue."
 
-NEGATIVE_REWARD = 0.0
-
-
 class MathAgent(RewardOnlyAgent):
-    def __init__(self, format_reward: float = 0.0, answer_format: str = "tagged", **kwargs):
+    def __init__(self,
+        format_reward: float = 0.0,
+        answer_format: str = "tagged",
+        assistant_suffix: str = "Assistant: Let me solve this step by step.\n<think>",
+        chat_mode: bool = False,
+        negative_reward: float = 0.0,
+        partial_end_reward: float = 0.0,
+        **kwargs):
+        """
+        Args:
+            format_reward (float): Reward given when the answer is in the expected format,
+                even if the answer is incorrect or is missing the end-of-text token.
+            answer_format (str): Which answer format is expected: "tagged" for <answer> tags,
+                or "boxed" for \boxed{} LaTeX formatting.
+            assistant_suffix (str): The suffix string included in the assistant's response, typically to
+                guide the assistant's output format and "persona". For example, "Let me solve this step by step."
+            chat_mode (bool): If True, agent operates in a chat (conversational) context.
+            negative_reward (float): Reward assigned for a clearly incorrect or unparseable answer.
+            partial_end_reward (float): Reward when the answer is correct but an expected end token is not matched exactly.
+            **kwargs: Additional arguments for the base RewardOnlyAgent.
+        """
         super().__init__(**kwargs)
+
         assert answer_format in ["tagged", "boxed"], "Invalid answer format"
+
         self.format_reward = format_reward
         self.answer_format = answer_format
+        self.assistant_suffix = assistant_suffix
+        self.chat_mode = chat_mode
+        self.negative_reward = negative_reward
+        self.partial_end_reward = partial_end_reward
 
     def compute_score(self, response: str, golden: dict, golden_key: str = "answer") -> float:
         """Take a response and a golden answer and return a score. Supports tagged or boxed answers.
@@ -37,32 +60,70 @@ def compute_score(self, response: str, golden: dict, golden_key: str = "answer")
         Uses the final answer in the response string to compute the score.
         """
         # Allow <answer> tags or \boxed{} tags (this is a bit of cheating in favor of deepseek distilled models I think)
-        for pattern in [
-            r'<answer>(.*?)</answer>',
-            r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}",
-        ]:
-            match = re.finditer(pattern, response, re.DOTALL)
-            matches = list(match)
-            if matches:
-                final_answer = matches[-1].group(1).strip()
-                break
-        else:
-            # Did not format the answer correctly
-            return NEGATIVE_REWARD
+        matched_format = None
+        end_tokens = ["<|end_of_text|>", "<|endoftext|>", "</s>"]
 
-        try:
-            parsed_answer = parse(final_answer)
-        except ValueError as e:
-            print("Failed to parse the answer.")
-            traceback.print_stack()
-            return NEGATIVE_REWARD
+        # Only an answer immediately followed by a known end token yields 1.0 reward.
+        answer_tag_pattern = r'<answer>(.*?)</answer>'
+        answer_tag_match = list(re.finditer(answer_tag_pattern, response, re.DOTALL))
+        if answer_tag_match:
+            # Only consider the last occurrence
+            last_match = answer_tag_match[-1]
+            final_answer = last_match.group(1).strip()
+            after = response[last_match.end():].lstrip()  # strip whitespace between </answer> and token
 
-        correct_answer = verify(str(golden[golden_key]), parsed_answer)
-        if correct_answer:
-            return 1.0
+            try:
+                parsed_answer = parse(final_answer)
+            except ValueError as e:
+                print("Failed to parse the answer.")
+                traceback.print_stack()
+                return self.negative_reward
+
+            correct_answer = verify(str(golden[golden_key]), parsed_answer)
+            if correct_answer:
+                # Accept either <|end_of_text|> or <|endoftext|> as valid terminators, for flexibility.
+                for token in end_tokens:
+                    if after.startswith(token):
+                        return 1.0
+                # If the end token is present later (extra text before it), give partial credit.
+                for token in end_tokens:
+                    if token in after:
+                        return self.partial_end_reward
+                # If a correct answer but missing immediate end, give format reward (not NEGATIVE_REWARD).
+                return self.format_reward
+            else:
+                # Incorrect answer, regardless of format/end-of-text
+                return self.format_reward
         else:
-            # Formatting is correct but the answer is incorrect
-            return self.format_reward
+            # Fallback: check boxed answer format for diagnostic/format reward as before
+            boxed_pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
+            boxed_match = list(re.finditer(boxed_pattern, response, re.DOTALL))
+            if boxed_match:
+                last_match = boxed_match[-1]
+                final_answer = last_match.group(1).strip()
+                after = response[last_match.end():].lstrip()
+                try:
+                    parsed_answer = parse(final_answer)
+                except ValueError as e:
+                    print("Failed to parse the answer.")
+                    traceback.print_stack()
+                    return self.negative_reward
+
+                correct_answer = verify(str(golden[golden_key]), parsed_answer)
+                if correct_answer:
+                    for token in end_tokens:
+                        if after.startswith(token):
+                            return 1.0
+                    for token in end_tokens:
+                        if token in after:
+                            return self.partial_end_reward
+                    return self.format_reward
+                else:
+                    # Formatting is correct but the answer is incorrect
+                    return self.format_reward
+            else:
+                # Did not format the answer correctly
+                return self.negative_reward
 
     def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
         """Take a string math problem and return the prompt. Supports requesting tagged or boxed answers. Supports chat mode prompts."""
@@ -80,6 +141,5 @@ def make_prefix(self, problem_key: str = "problem", **kwargs) -> str:
     The question will be a word math problem. Show your work in <think> </think> tags. 
     {answer_format}
     User: {kwargs[problem_key]}
-    Assistant: Let me solve this step by step.
-    <think>"""
+    {self.assistant_suffix}"""
         return prefix
diff --git a/examples/rl/model_configs/common.sh b/examples/rl/model_configs/common.sh
new file mode 100644
index 00000000000..4f6ca0e18cf
--- /dev/null
+++ b/examples/rl/model_configs/common.sh
@@ -0,0 +1,43 @@
+echo "Loading common options"
+
+export UB_TIMEOUT=720
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export NCCL_DEBUG=WARN
+
+
+
+COMMON_OPTIONS="\
+    --tensor-model-parallel-size $TP  \
+    --pipeline-model-parallel-size $PP  \
+    --use-mcore-models \
+    --transformer-impl transformer_engine \
+    --${PRECISION:-bf16} \
+    --te-rng-tracker \
+    --rl-offload-optimizer-during-inference \
+    --inference-dynamic-batching-buffer-size-gb 20 \
+    --data-parallel-random-init \
+    --attention-backend flash \
+    --timing-log-level 1 \
+    --log-timers-to-tensorboard \
+    --save-retain-interval 120 \
+    --inference-dynamic-batching-num-cuda-graphs 1 \
+    --inference-dynamic-batching-unified-memory-level 1 \
+    --adam-beta1 0.9 \
+    --adam-beta2 ${ADAM_BETA2:-0.95} \
+    --adam-eps 1e-8 \
+    "
+
+if [ ${LOWER_PRECISION:-false} == true ]; then
+    echo "Lower precision experiments, disabling cuda graphs."
+    ENABLE_CUDA_GRAPH=false
+    COMMON_OPTIONS="${COMMON_OPTIONS} --no-gradient-accumulation-fusion"
+else 
+    COMMON_OPTIONS="${COMMON_OPTIONS}"
+fi
+
+if [ ${ENABLE_CUDA_GRAPH:-true} == true ]; then
+    COMMON_OPTIONS="${COMMON_OPTIONS} --cuda-graph-impl=local"
+fi
diff --git a/examples/rl/model_configs/llama3p1_8b_instruct.sh b/examples/rl/model_configs/llama3p1_8b_instruct.sh
new file mode 100644
index 00000000000..24d285a6cf7
--- /dev/null
+++ b/examples/rl/model_configs/llama3p1_8b_instruct.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+TP=${TP:-8}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-4}
+LLM="llama3p1_8b_instruct"
+EXTRAS=""
+
+echo "Using Llama 3.1 8B Instruct model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  ENTROPY_WEIGHT=${ENTROPY_WEIGHT:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+elif [ "$(basename "$ENV_CONFIG")" = "openmathinstructv2.yaml" ]; then
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-32}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.01"}
+  ENTROPY_WEIGHT=${ENTROPY_WEIGHT:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+  EXTRAS="--lr-warmup-samples 5120"
+else
+  # Some default values if config is missing.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-32}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.01"}
+  ENTROPY_WEIGHT=${ENTROPY_WEIGHT:-"0.0"}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --grpo-entropy-term-weight $ENTROPY_WEIGHT \
+  --langrl-env-config $ENV_CONFIG "
+
+MODEL_OPTIONS="\
+  --disable-bias-linear \
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --untie-embeddings-and-output-weights \
+  --weight-decay 0.1 \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 500000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --num-layers 32  \
+  --hidden-size 4096  \
+  --ffn-hidden-size 14336 \
+  --num-attention-heads 32  \
+  --max-position-embeddings 131072  \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model unsloth/Meta-Llama-3.1-8B-Instruct \
+  --langrl-inference-server-type "inplace_megatron_chat" \
+  --langrl-inference-server-conversation-template "unsloth/Meta-Llama-3.1-8B-Instruct" \
+  --lr 3e-7 \
+  --make-vocab-size-divisible-by 128 \
+  --clip-grad 1.0 \
+  --rl-use-sequence-packing \
+  --rl-sequence-packing-algo fifo \
+  $EXTRAS"
+
diff --git a/examples/rl/model_configs/nemotron5_56b.sh b/examples/rl/model_configs/nemotron5_56b.sh
new file mode 100644
index 00000000000..fd2cc4f7212
--- /dev/null
+++ b/examples/rl/model_configs/nemotron5_56b.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+TP=${TP:-8}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="nemotron5_56b"
+
+echo "Using Nemotron5 56B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+ --fp8-format hybrid \
+    --fp8-amax-history-len 1 \
+    --fp8-amax-compute-algo max \
+    --fp8-interval 1 \
+    --fp8-margin 0 \
+    --first-last-layers-bf16 \
+    \
+    --fp8-recipe tensorwise \
+    --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
+    --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+    --mamba-state-dim 256 \
+    --per-split-data-args-path ${BLEND_PATH} \
+    --tiktoken-pattern v2 \
+    --distributed-timeout-minutes 60 \
+    --use-mcore-models \
+    --no-mmap-bin-files \
+    --sequence-parallel \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.0099 \
+    --position-embedding-type none \
+    --squared-relu \
+    --num-layers 118 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --ffn-hidden-size 32768 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --exit-duration-in-mins 5750 \
+    --seq-length 8191 \
+    --max-position-embeddings 8192 \
+  --tensor-model-parallel-size $TP  \
+  --pipeline-model-parallel-size $PP  \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+   --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --tokenizer-type TikTokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+     --position-embedding-type none \
+     --dist-ckpt-strictness log_unexpected \
+      --ckpt-format torch_dist \
+--ckpt-fully-parallel-save \
+    --ckpt-fully-parallel-load \
+     --async-save \
+    --ckpt-assume-constant-structure \
+    --log-progress  \
+    --timing-log-option minmax \
+    --log-params-norm \
+    --log-num-zeros-in-grad \
+    --log-throughput \
+--use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --no-create-attention-mask-in-dataloader \
+  --lr 1e-6 \
+  --model-temperature 1.2 \
+  --lr-warmup-samples 0 \
+  "
diff --git a/examples/rl/model_configs/nemotron5_8b.sh b/examples/rl/model_configs/nemotron5_8b.sh
new file mode 100644
index 00000000000..7b8947ae763
--- /dev/null
+++ b/examples/rl/model_configs/nemotron5_8b.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+TP=${TP:-8}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="nemotron5_8b"
+
+echo "Using Nemotron5 8B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
+    --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+    --tiktoken-pattern v2 \
+    --distributed-timeout-minutes 60 \
+    --use-mcore-models \
+    --no-mmap-bin-files \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-5 \
+   --init-method-std 0.014 \
+    --position-embedding-type none \
+    --squared-relu \
+    --num-layers 52 \
+    --hidden-size 4096 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --ffn-hidden-size 21504 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --exit-duration-in-mins 5750 \
+    --seq-length 8191 \
+    --max-position-embeddings 8192 \
+  --tensor-model-parallel-size $TP  \
+  --pipeline-model-parallel-size $PP  \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+   --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --tokenizer-type TikTokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --no-use-tokenizer-model-from-checkpoint-args \
+     --position-embedding-type none \
+     --dist-ckpt-strictness log_unexpected \
+      --ckpt-format torch_dist \
+--ckpt-fully-parallel-save \
+    --ckpt-fully-parallel-load \
+--use-distributed-optimizer \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --no-create-attention-mask-in-dataloader \
+  --lr 1e-6 \
+  --lr-warmup-samples 0 \
+  "
diff --git a/examples/rl/model_configs/nemotron5p5_12b_H.sh b/examples/rl/model_configs/nemotron5p5_12b_H.sh
new file mode 100644
index 00000000000..9e97051e087
--- /dev/null
+++ b/examples/rl/model_configs/nemotron5p5_12b_H.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+TP=${TP:-4}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="nemotron5p5_12b_H"
+
+echo "Using Nemotron5p5 12B model checkpoint"
+
+export LOWER_PRECISION=true
+export SEQUENCE_PARALLEL=false
+echo "Lower precision: $LOWER_PRECISION"
+echo "Sequence parallel: $SEQUENCE_PARALLEL"
+
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --fp8-recipe blockwise \
+  --fp8-format e4m3 \
+  --first-last-layers-bf16 \
+  --num-layers-at-start-in-bf16 2 \
+  --num-layers-at-end-in-bf16 2 \
+  --fp8-param-gather \
+  --disable-gloo-process-groups \
+  --is-hybrid-model \
+  --mamba-head-dim 80 \
+  --hybrid-override-pattern M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M- \
+  --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+  --tiktoken-pattern v2 \
+  --distributed-timeout-minutes 10 \
+  --use-mcore-models \
+  --no-mmap-bin-files \
+  --untie-embeddings-and-output-weights \
+  --disable-bias-linear \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-5 \
+   --init-method-std 0.0125 \
+    --position-embedding-type none \
+  --squared-relu \
+    --num-layers 62 \
+    --hidden-size 5120 \
+    --num-attention-heads 40 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --ffn-hidden-size 20480 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --exit-duration-in-mins 5750 \
+    --seq-length 8191 \
+    --max-position-embeddings 8192 \
+    --tensor-model-parallel-size $TP  \
+    --pipeline-model-parallel-size $PP  \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --lr 4.5e-6 \
+    --min-lr 4.5e-7 \
+    --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --tokenizer-type TikTokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --save-interval 2000 \
+    --ckpt-format torch_dist \
+    --ckpt-fully-parallel-save \
+    --ckpt-fully-parallel-load \
+    --ckpt-assume-constant-structure \
+    --log-progress  \
+    --timing-log-option minmax \
+    --log-params-norm \
+    --log-num-zeros-in-grad \
+    --log-throughput \
+    --bf16 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --use-distributed-optimizer \
+    --ddp-num-buckets 5 \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
+    --no-create-attention-mask-in-dataloader \
+    --manual-gc \
+    --num-workers 1 \
+    --log-straggler \
+    --disable-straggler-on-startup \
+    --straggler-minmax-count 16 \
+    --check-weight-hash-across-dp-replicas-interval 20000 \
+    --rerun-mode disabled \
+    --rl-default-temperature 0.9 \
+    --rl-default-top-p 0.95 \
+  "
diff --git a/examples/rl/model_configs/nemotron6_3b_moe.sh b/examples/rl/model_configs/nemotron6_3b_moe.sh
new file mode 100644
index 00000000000..eff4f6cf0b3
--- /dev/null
+++ b/examples/rl/model_configs/nemotron6_3b_moe.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+TP=${TP:-2}
+PP=${PP:-1}
+EP=${EP:-32}
+NODES_REQUIRED=${NODES_REQUIRED:-4}
+LLM="nemotron6_3b_moe"
+
+echo "Using Nemotron6 3B MOE model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-11999}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-20}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-2}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-16}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-32}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-1024}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-20}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+MODEL_OPTIONS="\
+  --rl-skip-bos-token \
+  --no-rl-use-sequence-packing \
+  --rl-partial-rollouts \
+  --rl-offload-optimizer-during-inference \
+  --moe-pad-experts-for-cuda-graph-inference \
+  --inference-dynamic-batching-max-tokens 8192 \
+  --inference-dynamic-batching-max-requests 128 \
+  --inference-dynamic-batching-num-cuda-graphs 2 \
+  --decode-only-cuda-graphs \
+  --cuda-graph-impl local \
+  --cuda-graph-scope full \
+  --use-checkpoint-args \
+  --enable-experimental \
+  --cross-entropy-loss-fusion \
+  --cross-entropy-fusion-impl native \
+  --moe-aux-loss-coeff 0.0 \
+  --moe-router-dtype fp64 \
+  --moe-router-load-balancing-type aux_loss \
+  --moe-router-score-function sigmoid \
+  --moe-token-dispatcher-type alltoall \
+  --moe-router-enable-expert-bias \
+  --moe-router-topk-scaling-factor 2.5 \
+  --disable-gloo-process-groups \
+  --rl-default-top-k -1 \
+  --rl-default-temperature 1.0 \
+  --rl-default-top-p 1.0 \
+  --rl-inference-logprobs-is-correction \
+  --rl-importance-sampling-truncation-coef 10.0 \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --distributed-timeout-minutes 60 \
+  --use-mcore-models \
+  --no-mmap-bin-files \
+  --disable-bias-linear \
+  --norm-epsilon 1e-5 \
+  --init-method-std 0.014 \
+  --exit-duration-in-mins 5750 \
+  --max-position-embeddings $MAX_SEQ_LENGTH \
+  --tensor-model-parallel-size $TP  \
+  --pipeline-model-parallel-size $PP  \
+  --expert-model-parallel-size $EP \
+  --expert-tensor-parallel-size 1 \
+  --weight-decay 0.01 \
+  --clip-grad 1.0 \
+  --tiktoken-pattern v2 \
+  --tokenizer-type TikTokenizer \
+  --tokenizer-model ${TOKENIZER_MODEL} \
+  --dist-ckpt-strictness log_unexpected
+  --ckpt-format torch_dist \
+  --ckpt-fully-parallel-save \
+  --ckpt-fully-parallel-load \
+  --use-distributed-optimizer \
+  --overlap-grad-reduce \
+  --overlap-param-gather \
+  --no-create-attention-mask-in-dataloader \
+  --lr 3e-6 \
+  --min-lr 3e-6 \
+  --lr-decay-style constant \
+  --lr-warmup-samples 640 \
+  --lr-warmup-init 0.3e-7 \
+  --no-load-optim \
+  --no-load-rng \
+  "
diff --git a/examples/rl/model_configs/qwen3_30b_a3b_moe.sh b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh
new file mode 100644
index 00000000000..775a9587ba4
--- /dev/null
+++ b/examples/rl/model_configs/qwen3_30b_a3b_moe.sh
@@ -0,0 +1,97 @@
+#!/bin/bash 
+
+TP=${TP:-4}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-1}
+
+echo "Using Qwen3-30B-A3B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# Default values
+GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-256}
+MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+EXIT_INTERVAL=${EXIT_INTERVAL:-20}
+CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-20}
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+
+MODEL_OPTIONS="
+--seq-length $MAX_SEQ_LENGTH \
+--inference-max-seq-length $MAX_SEQ_LENGTH \
+--inference-max-batch-size $MAX_INFERENCE_BS \
+--pretrained-checkpoint $CHECKPOINT \
+--no-use-tokenizer-model-from-checkpoint-args \
+--seq-length 8192 \
+--inference-max-seq-length 8192 \
+--bf16 \
+--tensor-model-parallel-size $TP  \
+--pipeline-model-parallel-size $PP  \
+--expert-model-parallel-size $EP \
+--attention-backend flash \
+--transformer-impl transformer_engine \
+--te-rng-tracker \
+--tokenizer-type HuggingFaceTokenizer \
+--tokenizer-model Qwen/Qwen3-30B-A3B \
+--untie-embeddings-and-output-weights \
+--num-layers 48 \
+--hidden-size 2048 \
+--ffn-hidden-size 6144 \
+--num-attention-heads 32 \
+--kv-channels 128 \
+--max-position-embeddings 8192 \
+--group-query-attention \
+--num-query-groups 4 \
+--normalization RMSNorm \
+--norm-epsilon 1e-6 \
+--position-embedding-type rope \
+--rotary-percent 1.0 \
+--rotary-base 1000000 \
+--use-rotary-position-embeddings \
+--swiglu \
+--disable-bias-linear \
+--num-experts 128 \
+--moe-router-topk 8 \
+--moe-ffn-hidden-size 768 \
+--moe-aux-loss-coeff 0.001 \
+--moe-router-load-balancing-type aux_loss \
+--attention-dropout 0.0 \
+--hidden-dropout 0.0 \
+--no-masked-softmax-fusion \
+--attention-softmax-in-fp32 \
+--vocab-size 151936 \
+--make-vocab-size-divisible-by 128 \
+--dist-ckpt-strictness log_unexpected \
+--qk-layernorm \
+--moe-token-dispatcher-type alltoall \
+--moe-layer-freq 1 \
+--optimizer adam \
+--adam-beta1 0.9 \
+--adam-beta2 0.999 \
+--adam-eps 1e-8 \
+--lr 1e-6 \
+--min-lr 1e-7 \
+--lr-warmup-samples 0 \
+--clip-grad 1.0 \
+--weight-decay 0.01 \
+--no-load-optim \
+--ckpt-format torch_dist
+"
diff --git a/examples/rl/model_configs/qwen3_32b.sh b/examples/rl/model_configs/qwen3_32b.sh
new file mode 100644
index 00000000000..cd153a04f3c
--- /dev/null
+++ b/examples/rl/model_configs/qwen3_32b.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+TP=${TP:-4}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-1}
+
+echo "Using Qwen3 32B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# Default values
+GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-256}
+MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Model configuration based on MegatronBridge run_config.yaml
+MODEL_OPTIONS="\
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --untie-embeddings-and-output-weights \
+  --num-layers 64 \
+  --hidden-size 5120 \
+  --ffn-hidden-size 25600 \
+  --num-attention-heads 64 \
+  --kv-channels 128 \
+  --max-position-embeddings 40960 \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --qk-layernorm \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 1000000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --disable-bias-linear \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model Qwen/Qwen3-4B \
+  --vocab-size 151936 \
+  --make-vocab-size-divisible-by 128 \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.999 \
+  --adam-eps 1e-8 \
+  --lr 1e-6 \
+  --min-lr 1e-7 \
+  --lr-warmup-samples 0 \
+  --clip-grad 1.0 \
+  --weight-decay 0.01 \
+  --recompute-granularity selective \
+  --recompute-activations \
+  --recompute-modules core_attn \
+  "
+
diff --git a/examples/rl/model_configs/qwen3_4b.sh b/examples/rl/model_configs/qwen3_4b.sh
new file mode 100644
index 00000000000..da238511fd3
--- /dev/null
+++ b/examples/rl/model_configs/qwen3_4b.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+TP=${TP:-1}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-1}
+
+echo "Using Qwen3 4B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# Default values
+GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-256}
+MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-32768}
+EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Model configuration based on MegatronBridge run_config.yaml
+MODEL_OPTIONS="\
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --num-layers 36 \
+  --hidden-size 2560 \
+  --ffn-hidden-size 9728 \
+  --num-attention-heads 32 \
+  --kv-channels 128 \
+  --max-position-embeddings 40960 \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --qk-layernorm \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 1000000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --disable-bias-linear \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model Qwen/Qwen3-4B \
+  --langrl-inference-server-type "inplace_megatron_chat" \
+  --langrl-inference-server-conversation-template "Qwen/Qwen3-4B" \
+  --vocab-size 151936 \
+  --make-vocab-size-divisible-by 128 \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.999 \
+  --adam-eps 1e-8 \
+  --lr 1e-6 \
+  --min-lr 1e-7 \
+  --lr-warmup-samples 0 \
+  --clip-grad 1.0 \
+  --weight-decay 0.01 \
+  --recompute-granularity selective \
+  --recompute-activations \
+  --recompute-modules core_attn \
+  "
+
diff --git a/examples/rl/model_configs/qwen3_8b.sh b/examples/rl/model_configs/qwen3_8b.sh
new file mode 100644
index 00000000000..6758cd84c3d
--- /dev/null
+++ b/examples/rl/model_configs/qwen3_8b.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+TP=${TP:-1}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-1}
+
+echo "Using Qwen3 8B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# Default values
+GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-256}
+MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-32768}
+EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Model configuration based on MegatronBridge run_config.yaml
+MODEL_OPTIONS="\
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --untie-embeddings-and-output-weights \
+  --num-layers 36 \
+  --hidden-size 4096 \
+  --ffn-hidden-size 12288 \
+  --num-attention-heads 32 \
+  --kv-channels 128 \
+  --max-position-embeddings 40960 \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --qk-layernorm \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 1000000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --disable-bias-linear \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model Qwen/Qwen3-8B \
+  --langrl-inference-server-type "inplace_megatron_chat" \
+  --langrl-inference-server-conversation-template "Qwen/Qwen3-8B" \
+  --vocab-size 151936 \
+  --make-vocab-size-divisible-by 128 \
+  --optimizer adam \
+  --adam-beta1 0.9 \
+  --adam-beta2 0.999 \
+  --adam-eps 1e-8 \
+  --lr 1e-6 \
+  --min-lr 1e-7 \
+  --lr-warmup-samples 0 \
+  --clip-grad 1.0 \
+  --weight-decay 0.01 \
+  --recompute-granularity selective \
+  --recompute-activations \
+  --recompute-modules core_attn \
+  "
+
diff --git a/examples/rl/model_configs/qwen_2p5_32b.sh b/examples/rl/model_configs/qwen_2p5_32b.sh
new file mode 100644
index 00000000000..d82972ba477
--- /dev/null
+++ b/examples/rl/model_configs/qwen_2p5_32b.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+TP=${TP:-8}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-8}
+LLM="qwen2p5_32b"
+
+echo "Using Qwen 2.5 32B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.28}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-32}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-12000}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --untie-embeddings-and-output-weights \
+  --disable-bias-linear \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-5 \
+  --group-query-attention \
+  --num-query-groups 8 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --weight-decay 0.0 \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 1000000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --num-layers 64  \
+  --hidden-size 5120  \
+  --ffn-hidden-size 27648 \
+  --num-attention-heads 40  \
+  --max-position-embeddings 131072 \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model unsloth/Qwen2.5-32B \
+  --lr 1e-6 \
+  --lr-warmup-samples 0 \
+  --make-vocab-size-divisible-by 128 \
+  --clip-grad 1.0 \
+  --recompute-granularity selective \
+  --recompute-activations "
diff --git a/examples/rl/model_configs/qwen_2p5_3b.sh b/examples/rl/model_configs/qwen_2p5_3b.sh
new file mode 100644
index 00000000000..246afae6ad2
--- /dev/null
+++ b/examples/rl/model_configs/qwen_2p5_3b.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+TP=${TP:-2}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="qwen2p5_3b"
+
+echo "Using Qwen 2.5 3B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  ENTROPY_WEIGHT=${ENTROPY_WEIGHT:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  ENTROPY_WEIGHT=${ENTROPY_WEIGHT:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --grpo-entropy-term-weight $ENTROPY_WEIGHT \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --ckpt-format torch_dist \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --disable-bias-linear \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --group-query-attention \
+  --num-query-groups 2 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --weight-decay 0.0 \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 1000000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --num-layers 36  \
+  --hidden-size 2048  \
+  --ffn-hidden-size 11008 \
+  --num-attention-heads 16  \
+  --max-position-embeddings 32768  \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model unsloth/Qwen2.5-3B \
+  --lr 0.000001 \
+  --lr-warmup-samples 0 \
+  --make-vocab-size-divisible-by 64 \
+  --clip-grad 1.0 \
+  --rl-use-sequence-packing \
+  --rl-sequence-packing-algo fifo"
diff --git a/examples/rl/model_configs/qwen_2p5_distill_7b.sh b/examples/rl/model_configs/qwen_2p5_distill_7b.sh
new file mode 100644
index 00000000000..149ac77965f
--- /dev/null
+++ b/examples/rl/model_configs/qwen_2p5_distill_7b.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+TP=${TP:-2}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="qwen2p5_distill_7b"
+echo "Using Qwen 2.5 DSR1 7B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if true; then
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-8}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-128}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-8192}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-32}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --ckpt-format torch \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --untie-embeddings-and-output-weights \
+  --disable-bias-linear \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --group-query-attention \
+  --num-query-groups 4 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --weight-decay 0.0 \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 10000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --num-layers 28  \
+  --hidden-size 3584  \
+  --ffn-hidden-size 18944 \
+  --num-attention-heads 28  \
+  --max-position-embeddings 131072  \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model "unsloth/DeepSeek-R1-Distill-Qwen-7B" \
+  --langrl-inference-server-type "inplace_megatron_chat" \
+  --langrl-inference-server-conversation-template "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" \
+  --lr 0.000001 \
+  --lr-warmup-samples 0 \
+  --make-vocab-size-divisible-by 128 \
+  --clip-grad 1.0 "
+
+RUN_REQUEST_ARGS="\
+  --inference-type inplace_megatron_chat \
+  --inference-server-conversation-template "unsloth/DeepSeek-R1-Distill-Qwen-7B" \
+  $RUN_REQUEST_ARGS "
diff --git a/examples/rl/model_configs/qwen_2p5_math_7b.sh b/examples/rl/model_configs/qwen_2p5_math_7b.sh
new file mode 100644
index 00000000000..1d631fa80a5
--- /dev/null
+++ b/examples/rl/model_configs/qwen_2p5_math_7b.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+TP=${TP:-1}
+PP=${PP:-1}
+NODES_REQUIRED=${NODES_REQUIRED:-2}
+LLM="qwen2p5_math_7b"
+echo "Using Qwen 2.5 Math 7B model checkpoint"
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+source $(dirname $SCRIPT_PATH)/common.sh
+
+# In all cases, one can override those values.
+# However, running without envs will give you some
+# good perf out of the box for established envs.
+if [ "$(basename "$ENV_CONFIG")" = "dapo.yaml" ]; then
+  echo "Using DAPO environment config"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-64}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-1024}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+else
+  # Some default values if config is unsupported.
+  echo "Undected environment config, using default values"
+  GRPO_CLAMP_EPS_LOWER=${GRPO_CLAMP_EPS_LOWER:-0.2}
+  GRPO_CLAMP_EPS_UPPER=${GRPO_CLAMP_EPS_UPPER:-0.2}
+  MAX_INFERENCE_BS=${MAX_INFERENCE_BS:-64}
+  GRPO_GROUP_SIZE=${GRPO_GROUP_SIZE:-16}
+  GRPO_PROMPTS_PER_STEP=${GRPO_PROMPTS_PER_STEP:-32}
+  GRPO_ITERATIONS=${GRPO_ITERATIONS:-1}
+  GRPO_KL_BETA=${GRPO_KL_BETA:-"0.0"}
+  TRAINING_BATCH_SIZE=${TRAINING_BATCH_SIZE:-512}
+  MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+  MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}
+  EXIT_INTERVAL=${EXIT_INTERVAL:-16}
+  CHKPT_SAVE_INTERVAL=${CHKPT_SAVE_INTERVAL:-16}
+fi
+
+ENV_DEPENDENT="\
+  --micro-batch-size $MICRO_BATCH_SIZE \
+  --global-batch-size $TRAINING_BATCH_SIZE \
+  --grpo-group-size $GRPO_GROUP_SIZE \
+  --grpo-prompts-per-step $GRPO_PROMPTS_PER_STEP \
+  --grpo-iterations $GRPO_ITERATIONS \
+  --grpo-clamp-eps-lower $GRPO_CLAMP_EPS_LOWER \
+  --grpo-clamp-eps-upper $GRPO_CLAMP_EPS_UPPER \
+  --grpo-kl-beta $GRPO_KL_BETA \
+  --langrl-env-config $ENV_CONFIG "
+
+# Original Qwen model uses a wrong padding_id token. unsloth tokenizer fixes it.
+MODEL_OPTIONS="\
+  --calculate-per-token-loss \
+  --ckpt-format torch \
+  --seq-length $MAX_SEQ_LENGTH \
+  --inference-max-seq-length $MAX_SEQ_LENGTH \
+  --inference-max-batch-size $MAX_INFERENCE_BS \
+  --pretrained-checkpoint $CHECKPOINT \
+  --untie-embeddings-and-output-weights \
+  --disable-bias-linear \
+  --add-qkv-bias \
+  --normalization RMSNorm \
+  --norm-epsilon 1e-6 \
+  --group-query-attention \
+  --num-query-groups 4 \
+  --no-masked-softmax-fusion \
+  --attention-softmax-in-fp32 \
+  --attention-dropout 0.0 \
+  --hidden-dropout 0.0 \
+  --weight-decay 0.0 \
+  --position-embedding-type rope \
+  --rotary-percent 1.0 \
+  --rotary-base 10000 \
+  --use-rotary-position-embeddings \
+  --swiglu \
+  --num-layers 28  \
+  --hidden-size 3584  \
+  --ffn-hidden-size 18944 \
+  --num-attention-heads 28  \
+  --max-position-embeddings 4096  \
+  --tokenizer-type HuggingFaceTokenizer \
+  --tokenizer-model "unsloth/Qwen2.5-Math-7B" \
+  --lr 0.000001 \
+  --lr-warmup-samples 0 \
+  --make-vocab-size-divisible-by 128 \
+  --clip-grad 1.0 "
+
+RUN_REQUEST_ARGS="\
+  --inference-type inplace_megatron_chat \
+  --inference-server-conversation-template "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" \
+  $RUN_REQUEST_ARGS "
diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py
index 311caf6582e..f748d74ecec 100644
--- a/examples/run_simple_mcore_train_loop.py
+++ b/examples/run_simple_mcore_train_loop.py
@@ -42,10 +42,14 @@ def initialize_distributed(
     parallel_state.destroy_model_parallel()
 
     # Torch setup for distributed training
-    rank: int = int(os.environ["LOCAL_RANK"])
-    world_size: int = torch.cuda.device_count()
-    torch.cuda.set_device(rank)
-    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+    rank: int = int(os.environ["RANK"])
+    world_size: int = int(os.environ["WORLD_SIZE"])
+    local_rank: int = int(os.environ["LOCAL_RANK"])
+
+    torch.cuda.set_device(local_rank)
+    torch.distributed.init_process_group(
+        backend="nccl", rank=rank, world_size=world_size
+    )
 
     # Megatron core distributed training initialization
     parallel_state.initialize_model_parallel(
diff --git a/gpt_builders.py b/gpt_builders.py
index 88284460fcf..dfe41f7b88e 100644
--- a/gpt_builders.py
+++ b/gpt_builders.py
@@ -7,6 +7,10 @@
     get_gpt_layer_with_transformer_engine_spec,
     get_gpt_layer_with_inference_spec,
     get_gpt_mtp_block_spec,
+    get_gpt_decoder_layer_specs,
+)
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    get_transformer_block_with_experimental_attention_variant_spec,
 )
 from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import (
     get_gpt_heterogeneous_layer_spec,
@@ -42,7 +46,13 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
         else:
             use_te = args.transformer_impl == "transformer_engine"
 
-            if args.num_experts:
+            if args.experimental_attention_variant is not None:
+                transformer_layer_spec = (
+                    get_transformer_block_with_experimental_attention_variant_spec(
+                        config=config, vp_stage=vp_stage
+                    )
+                )
+            elif args.num_experts:
                 assert not (config.transformer_impl == "inference_optimized")
                 # Define the decoder block spec
                 transformer_layer_spec = get_gpt_decoder_block_spec(
@@ -69,7 +79,12 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
                 # Only happens with block spec (TransformerBlockSubmodules) when using MoE.
                 transformer_layer_spec_for_mtp = _get_transformer_layer_spec(use_te, config)
             else:
-                transformer_layer_spec_for_mtp = transformer_layer_spec
+                # Define the decoder block spec
+                decoder_layer_specs = get_gpt_decoder_layer_specs(
+                    config, use_transformer_engine=use_te, normalization=args.normalization, qk_l2_norm=args.qk_l2_norm, vp_stage=vp_stage
+                )
+                transformer_layer_spec_for_mtp = decoder_layer_specs[-1]
+            # Use spec of the last layer in decoder block as spec of the transformer layer in MTP
             mtp_block_spec = get_gpt_mtp_block_spec(
                 config,
                 transformer_layer_spec_for_mtp,
@@ -101,12 +116,12 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
 
 def _get_transformer_layer_spec(use_te, config):
     """Get transformer layer specification based on configuration.
-
+    
     Args:
         use_te (bool): Whether to use Transformer Engine
         args: Training arguments
         config: Model configuration
-
+        
     Returns:
         transformer_layer_spec: The transformer layer specification
     """
@@ -117,9 +132,12 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             qk_l2_norm=args.qk_l2_norm,
             use_kitchen=config.use_kitchen,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
     elif config.transformer_impl == "inference_optimized":
         return get_gpt_layer_with_inference_spec(
@@ -133,7 +151,10 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             normalization=args.normalization,
             use_kitchen=config.use_kitchen,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
diff --git a/greptile.json b/greptile.json
new file mode 100644
index 00000000000..38013ea8869
--- /dev/null
+++ b/greptile.json
@@ -0,0 +1,40 @@
+{
+    "labels": [],
+    "comment": "Disclaimer: This is AI-generated.",
+    "commentTypes": ["logic", "syntax", "style"],
+    "instructions": "Only comment if the PR description is unchanged from the default template, if a docstring is missing, or if there is a typo.",
+    "ignoreKeywords": "rename\nlinter\nprettier\ngreptile-ignor",
+    "ignorePatterns": "greptile.json\ntesting/**/*.py\n*.md\n*.txt\n*.json",
+    "patternRepositories": ["NVIDIA/Megatron-LM"],
+    "triggerOnUpdates": true,
+    "shouldUpdateDescription": false,
+    "disabledLabels": ["docs"],
+    "includeAuthors": [],
+    "excludeAuthors": ["github-actions"],
+    "strictness": 3,
+    "fixWithAI": false,
+    "includeBranches": ["main"],
+    "statusCheck": false,
+    "skipReview": "AUTOMATIC",
+    "summarySection": {
+      "included": false,
+      "collapsible": false,
+      "defaultOpen": false
+    },
+    "issuesTableSection": {
+      "included": false,
+      "collapsible": false,
+      "defaultOpen": false
+    },
+    "confidenceScoreSection": {
+      "included": false,
+      "collapsible": false,
+      "defaultOpen": false
+    },
+    "sequenceDiagramSection": {
+      "included": false,
+      "collapsible": false,
+      "defaultOpen": false
+    },
+    "statusCommentsEnabled": false
+  }
\ No newline at end of file
diff --git a/mamba_builders.py b/mamba_builders.py
index 53d675bc3cc..6a792ba6ea5 100644
--- a/mamba_builders.py
+++ b/mamba_builders.py
@@ -6,7 +6,7 @@
 from megatron.core.transformer.spec_utils import import_module
 from megatron.training import print_rank_0
 from megatron.training.arguments import core_transformer_config_from_args
-
+from megatron.core.models.mamba.mamba_layer_specs import mamba_inference_stack_spec
 
 def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None):
     print_rank_0('building MAMBA model ...')
@@ -14,7 +14,10 @@ def mamba_builder(args, pre_process, post_process, vp_stage=None, config=None, p
         config = core_transformer_config_from_args(args, TransformerConfig)
     assert args.use_legacy_models is False, "Mamba only supported in Mcore!"
 
-    if args.spec is not None:
+    if config.transformer_impl == "inference_optimized":
+        mamba_stack_spec = mamba_inference_stack_spec 
+        assert not config.inference_fuse_tp_communication, "inference_fuse_tp_communication is not supported for Mamba"
+    elif args.spec is not None:
         mamba_stack_spec = import_module(args.spec)
     else:
         raise ValueError("You must provide a valid Mamba layer spec via --spec")
diff --git a/megatron/core/MSC_Integration.md b/megatron/core/MSC_Integration.md
index d0c93a80129..da8b5c982b8 100644
--- a/megatron/core/MSC_Integration.md
+++ b/megatron/core/MSC_Integration.md
@@ -1,4 +1,4 @@
-## Multi-Storage Client (MSC) Integration
+# Multi-Storage Client (MSC) Integration
 
 The [Multi-Storage Client](https://github.com/NVIDIA/multi-storage-client) (MSC) provides a unified interface for reading datasets and storing checkpoints from both filesystems (e.g., local disk, NFS, Lustre) and object storage providers such as S3, GCS, OCI, Azure, AIStore, and SwiftStack.
 
@@ -8,7 +8,7 @@ This guide will walk you through how to:
 2. How to train models directly using datasets in object storage
 3. How to save and load model checkpoints to/from object storage
 
-### Installation
+## Installation
 
 MSC is vended as `the multi-storage-client` package on PyPI.
 
@@ -25,7 +25,7 @@ pip install "multi-storage-client[boto3]"
 pip install "multi-storage-client[google-cloud-storage]"
 ```
 
-### Configuration File
+## Configuration File
 
 MSC uses a YAML configuration file to define how it connects to object storage systems. This design allows you to specify one or more storage profiles, each representing a different storage backend or bucket. MSC keeps your training scripts clean and portable by centralizing details in a config file. There is no need to hardcode access keys, bucket names, or other provider-specific options directly into your code.
 
@@ -58,7 +58,7 @@ To tell MSC where to find this file, set the following environment variable befo
 export MSC_CONFIG=/path/to/msc_config.yaml
 ```
 
-### MSC URL Format
+## MSC URL Format
 
 MSC uses a custom URL scheme to identify and access files across different object storage providers. This scheme makes it easy to reference data and checkpoints without worrying about the underlying storage implementation. An MSC URL has the following structure:
 
@@ -96,7 +96,7 @@ is interpreted as accessing the object with the key `dataset/train/data.bin` ins
 This abstraction allows training scripts to reference storage resources uniformly—whether they're hosted on AWS, GCP, Oracle, or Azure—just by switching profiles in the config file.
 
 
-### Train from Object Storage
+## Train from Object Storage
 
 To train with datasets stored in object storage, use an MSC URL with the `--data-path` argument. This URL references a dataset stored under a profile defined in your MSC configuration file.
 
@@ -112,7 +112,7 @@ python pretrain_gpt.py                                      \
 
 **NOTE:** All four arguments must be provided when training with datasets in object storage using MSC.
 
-### Save and Load Checkpoints from Object Storage
+## Save and Load Checkpoints from Object Storage
 
 MSC can be used to save and load model checkpoints directly from object storage by specifying MSC URLs for the `--save` and `--load` arguments. This allows you to manage checkpoints in object storage.
 
@@ -125,7 +125,7 @@ python pretrain_gpt.py                \
 
 **Notes:** Only the `torch_dist` checkpoint format is currently supported when saving to or loading from MSC URLs.
 
-### Disable MSC
+## Disable MSC
 
 By default, MSC integration is automatically enabled when the `multi-storage-client` library is installed. MSC is also used for regular filesystem paths (like `/filesystem_mountpoint/path` in `--data-path`, `--save`, or `--load`) even when not using explicit MSC URLs. MSC functions as a very thin abstraction layer with negligible performance impact when used with regular paths, so there's typically no need to disable it. If you need to disable MSC, you can do so using the `--disable-msc` flag:
 
@@ -133,7 +133,7 @@ By default, MSC integration is automatically enabled when the `multi-storage-cli
 python pretrain_gpt.py --disable-msc
 ```
 
-### Performance Considerations
+## Performance Considerations
 
 When using object storage with MSC, there are a few important performance implications to keep in mind:
 
@@ -165,7 +165,7 @@ cache:
 
 For optimal performance, configure the cache directory on a high-speed local storage device such as an NVMe SSD.
 
-### Additional Resources and Advanced Configuration
+## Additional Resources and Advanced Configuration
 
 Refer to the [MSC Configuration Documentation](https://nvidia.github.io/multi-storage-client/references/configuration.html) for complete documentation on MSC configuration options, including detailed information about supported storage providers, credentials management, and advanced caching strategies.
 
diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md
index dedde653db1..e0c670fd240 100644
--- a/megatron/core/QuickStart.md
+++ b/megatron/core/QuickStart.md
@@ -1,4 +1,8 @@
-## Quick Start
+---
+orphan: true
+---
+
+# Quick Start
 
 This guide for Megatron Core walks you through the following tasks:
 
@@ -10,7 +14,7 @@ This guide for Megatron Core walks you through the following tasks:
 
 **NOTE:** The following sample was tested using Megatron Core version 0.8.0 and NGC PyTorch Container version 24.02.
 
-### Set Up Your Environment
+## Set Up Your Environment
 
 1. Run a new Docker container.
 
@@ -21,7 +25,7 @@ This guide for Megatron Core walks you through the following tasks:
 
     git clone https://github.com/NVIDIA/Megatron-LM.git
     cd Megatron-LM
-    pip install -U setuptools packaging
+    pip install -U "setuptools<80.0.0,>=77.0.0" packaging
     pip install --no-build-isolation .[dev]
     ```
 
@@ -29,7 +33,7 @@ This guide for Megatron Core walks you through the following tasks:
 
 For a more comprehensive overview of different installation methods, refer to the [Installation Guide](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/Installation_Guide.md)
 
-### Write Your First Training Loop
+## Write Your First Training Loop
 
 In this task, you create a sample GPT model split across tensors (Tensor model parallel) on two GPUS, and run a forward pass through it using a MockGPT dataset helper class that was created in Megatron Core.
 
@@ -251,7 +255,7 @@ In this task, you create a sample GPT model split across tensors (Tensor model p
 
     <br>
 
-### Review Advanced Examples
+## Review Advanced Examples
 
 To review more advanced examples, explore [pretrain_gpt.py](https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py). ``pretrain_gpt.py`` has more complex training loops and includes the following Megatron Core features:
 
diff --git a/megatron/core/_rank_utils.py b/megatron/core/_rank_utils.py
new file mode 100644
index 00000000000..6b1a35ca798
--- /dev/null
+++ b/megatron/core/_rank_utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Low-level rank utilities with minimal dependencies to avoid circular imports."""
+
+import logging
+import os
+from typing import Any
+
+import torch
+
+
+def safe_get_rank() -> int:
+    """Safely get the rank of the current process.
+
+    Returns the rank from torch.distributed if initialized, otherwise falls back
+    to the RANK environment variable, defaulting to 0.
+
+    Returns:
+        int: The rank of the current process.
+    """
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+
+    # If torch.distributed is not initialized, try to read environment variables.
+    try:
+        return int(os.environ.get("RANK", 0))
+    except (ValueError, TypeError):
+        return 0
+
+
+def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any) -> None:
+    """Log a message only on a single rank.
+
+    If torch distributed is initialized, write log on only one rank.
+
+    Args:
+        logger: The logger to write the logs.
+        *args: All logging.Logger.log positional arguments.
+        rank: The rank to write on. Defaults to 0.
+        **kwargs: All logging.Logger.log keyword arguments.
+    """
+    if safe_get_rank() == rank:
+        logger.log(*args, **kwargs)
diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile
index e745f52399b..16f251bf903 100644
--- a/megatron/core/datasets/Makefile
+++ b/megatron/core/datasets/Makefile
@@ -1,4 +1,4 @@
-CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
+CXXFLAGS += -O3 -Wall -shared -std=c++17 -fPIC -fdiagnostics-color
 CPPFLAGS += $(shell python3 -m pybind11 --includes)
 
 LIBNAME = helpers_cpp
diff --git a/megatron/core/datasets/bert_dataset.py b/megatron/core/datasets/bert_dataset.py
index d36f618349d..aa0ba7501cf 100644
--- a/megatron/core/datasets/bert_dataset.py
+++ b/megatron/core/datasets/bert_dataset.py
@@ -78,7 +78,7 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
             idx (int): The index into the dataset
 
         Returns:
-            Dict[str, Union[int, numpy.ndarray]]: The
+            Dict[str, Union[int, numpy.ndarray]]: The sample information wrapped in a dictionary
         """
 
         idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py
index d38bf25e411..906cf041d6f 100644
--- a/megatron/core/datasets/blended_dataset.py
+++ b/megatron/core/datasets/blended_dataset.py
@@ -88,9 +88,23 @@ def __init__(
         self.dataset_index, self.dataset_sample_index = self._build_indices()
 
     def __len__(self) -> int:
+        if self.config.defer_npy_index_mmap:
+            size = sum(self.weights)
+            if self.size is not None:
+                size = self.size
+            return size
+
         return self.dataset_index.shape[0]
 
     def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
+        if self.dataset_index is None:
+            self.dataset_index = numpy.load(
+                self.path_to_dataset_index, allow_pickle=True, mmap_mode="r"
+            )
+            self.dataset_sample_index = numpy.load(
+                self.path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r"
+            )
+
         dataset_id = self.dataset_index[idx]
         dataset_sample_id = self.dataset_sample_index[idx]
         return {"dataset_id": dataset_id, **self.datasets[dataset_id][dataset_sample_id]}
@@ -105,6 +119,15 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
         Returns:
             Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
         """
+        if self.config.defer_npy_index_mmap:
+            # NOTE(asolergi-nv): Direct path to lazy memmap the indexes
+            get_path_to = lambda suffix: os.path.join(
+                self.config.path_to_cache,
+                f"{self.unique_description_hash}-{type(self).__name__}-{self.split.name}-{suffix}",
+            )
+            self.path_to_dataset_index = get_path_to("dataset_index.npy")
+            self.path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
+            return None, None
 
         path_to_cache = self.config.path_to_cache
 
@@ -116,10 +139,14 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
             path_to_description = get_path_to("description.txt")
             path_to_dataset_index = get_path_to("dataset_index.npy")
             path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
-            cache_hit = all(
-                map(
-                    os.path.isfile,
-                    [path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
+            cache_hit = (
+                True
+                if self.config.fast_cache_load
+                else all(
+                    map(
+                        os.path.isfile,
+                        [path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
+                    )
                 )
             )
         else:
diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py
index 761724bf7fd..fb59df983f8 100644
--- a/megatron/core/datasets/blended_megatron_dataset_builder.py
+++ b/megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import logging
 import math
@@ -10,6 +10,7 @@
 
 from megatron.core.datasets.blended_dataset import BlendedDataset
 from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
+from megatron.core.datasets.gpt_dataset import GPTDatasetConfig
 from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
 from megatron.core.datasets.utils import Split, normalize
 from megatron.core.utils import log_single_rank
@@ -215,7 +216,14 @@ def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]:
                     blended_datasets[i] = self.build_generic_dataset(
                         BlendedDataset,
                         self.is_built_on_rank,
-                        True,  # synchronize_ranks, default behavior to build on rank-0 first
+                        (
+                            False
+                            if (
+                                isinstance(self.config, GPTDatasetConfig)
+                                and self.config.fast_cache_load
+                            )
+                            else True
+                        ),  # synchronize_ranks, default behavior to build on rank-0 first. Set to False if we are using --dataloader-fast-cache-load # pylint: disable=C0301
                         megatron_datasets[i],
                         weights_i,
                         size_i,
@@ -306,7 +314,14 @@ def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]:
                     blended_datasets[i] = self.build_generic_dataset(
                         BlendedDataset,
                         self.is_built_on_rank,
-                        True,  # synchronize_ranks, default behavior to build on rank-0 first
+                        (
+                            False
+                            if (
+                                isinstance(self.config, GPTDatasetConfig)
+                                and self.config.fast_cache_load
+                            )
+                            else True
+                        ),  # synchronize_ranks, default behavior to build on rank-0 first. Set to False if we are using --dataloader-fast-cache-load # pylint: disable=C0301
                         megatron_datasets,
                         weights,
                         size,
@@ -364,7 +379,10 @@ def _threading_helper(
         megatron_datasets = [[] for _ in range(len(Split))]
         num_dataset_builder_threads = self.config.num_dataset_builder_threads
 
-        if torch.distributed.is_initialized():
+        # NOTE(asolergi-nv): Skip rank-0 first dataset building if we are using --dataloader-fast-cache-load # pylint: disable=C0301
+        if torch.distributed.is_initialized() and not (
+            isinstance(self.config, GPTDatasetConfig) and self.config.fast_cache_load
+        ):
             rank = torch.distributed.get_rank()
             # First, build on rank 0
             if is_built_on_zero_rank():
@@ -420,6 +438,14 @@ def _build_megatron_dataset_splits(
         Returns:
             List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
         """
+        synchronize_ranks = (
+            False
+            if (
+                synchronize_ranks
+                and (isinstance(self.cls, GPTDatasetConfig) and self.config.fast_cache_load)
+            )
+            else synchronize_ranks
+        )  # NOTE(asolergi-nv): Set synchronize_ranks to False if we are using --dataloader-fast-cache-load # pylint: disable=C0301
         # short-cut if we are not building on this rank
         if torch.distributed.is_initialized() and not self.is_built_on_rank():
             for i in range(len(Split)):
@@ -432,14 +458,6 @@ def _build_megatron_dataset_splits(
 
         # Build the split indices for the low level dataset
         num_elements = self.cls.numel_low_level_dataset(low_level_dataset)
-        split_indices = []
-        for i, _ in enumerate(Split):
-            if split[i] is not None:
-                beg = int(round(split[i][0] * float(num_elements)))
-                end = int(round(split[i][1] * float(num_elements)))
-                split_indices.append(numpy.arange(start=beg, stop=end, step=1, dtype=numpy.int32))
-            else:
-                split_indices.append(None)
 
         # Build the mid level dataset
         mid_level_datasets = []
@@ -447,6 +465,14 @@ def _build_megatron_dataset_splits(
             if split[i] is None:
                 mid_level_datasets.append(None)
             else:
+                indexed_indices = None
+                if not (
+                    isinstance(self.config, GPTDatasetConfig) and self.config.fast_cache_load
+                ):  # NOTE(asolergi-nv): Skip indexed_indices building if we are using --dataloader-fast-cache-load # pylint: disable=C0301
+                    beg = int(round(split[i][0] * float(num_elements)))
+                    end = int(round(split[i][1] * float(num_elements)))
+                    indexed_indices = numpy.arange(start=beg, stop=end, step=1, dtype=numpy.int32)
+
                 mid_level_datasets.append(
                     self.build_generic_dataset(
                         self.cls,
@@ -454,7 +480,7 @@ def _build_megatron_dataset_splits(
                         synchronize_ranks,
                         low_level_dataset,
                         dataset_path,
-                        split_indices[i],
+                        indexed_indices,
                         sizes[i],
                         _split,
                         self.config,
@@ -480,6 +506,9 @@ def build_generic_dataset(
                 built. In special cases, e.g. when we are building the low level dataset for a
                 RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
 
+            is_built_on_rank (Callable): A callable which returns True if the dataset should be
+                built on the current rank and False otherwise.
+
             synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
                 behavior. Set to False when we enforce this behavior at higher level.
 
diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py
index fd7132acc0f..a86efbe4963 100644
--- a/megatron/core/datasets/blended_megatron_dataset_config.py
+++ b/megatron/core/datasets/blended_megatron_dataset_config.py
@@ -31,7 +31,7 @@ class BlendedMegatronDatasetConfig:
 
     blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] = None
     """A set of blends, as defined above, one for each split distribution. Not to be used with
-       'blend'. Defauls to None.
+       'blend'. Defaults to None.
     """
 
     multiple_validation_sets: Optional[bool] = None
@@ -88,8 +88,29 @@ class BlendedMegatronDatasetConfig:
        incorrect tokenizer - this option may be set to True. This is typically not recommended.
     """
 
+    fast_cache_load: bool = False
+    """Option to use the fast cache loading path. Requires all the dataset caches to be built."""
+
+    defer_npy_index_mmap: bool = False
+    """Option to defer the mmap of the dataset indexes until the first access.
+       Requires all the dataset caches to be built.
+    """
+
     def __post_init__(self) -> None:
         """Do asserts and set fields post init"""
+        if self.fast_cache_load:
+            assert (
+                self.path_to_cache is not None
+            ), "--data-cache-path must be provided when using --dataloader-fast-cache-load."
+            assert (
+                self.blend is None
+            ), f"--dataloader-fast-cache-load and --data-path cannot be used together. \
+            Use --per-split-data-args-path or --train-data-path, --valid-data-path and \
+            --test-data-path instead."
+        if self.defer_npy_index_mmap:
+            assert (
+                self.path_to_cache is not None
+            ), "--data-cache-path must be provided when using --dataloader-defer-npy-index-mmap."
         if self.blend_per_split is not None and any(self.blend_per_split):
             assert self.blend is None, "blend and blend_per_split are incompatible"
             assert self.split is None, "split and blend_per_split are incompatible"
@@ -160,9 +181,6 @@ def convert_split_vector_to_split_matrix(
 
     [0.99, 0.01, 0.0] -> [(0, 0.99), (0.99, 1.0), None]
 
-    Ex. a conversion for Retro when Retro pretraining uses a [0.99, 0.01, 0.0] split and Retro
-    preprocessing used a [0.98, 0.02, 0.0] split:
-
     [0.99, 0.01, 0.0], [0.98, 0.02, 0.0] -> [(0, 0.98), (0.99, 1.0), None]
 
     Args:
diff --git a/megatron/core/datasets/data_schedule.py b/megatron/core/datasets/data_schedule.py
new file mode 100644
index 00000000000..0f016473b6a
--- /dev/null
+++ b/megatron/core/datasets/data_schedule.py
@@ -0,0 +1,301 @@
+# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
+
+from typing import Any, List, Optional
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.pipeline_parallel.hybrid_cp_schedule import BalancedCPScheduler
+from megatron.core.process_groups_config import ProcessGroupCollection
+
+
+class HybridCPDataLoaderWrapper:
+    """
+    A wrapper class that wraps around an existing data_iterator.
+    For every __next__ call,
+    1. Each DP rank pulls a batch of packed samples.
+    2. Extracts the sequence lengths of each sub-sample and all-gathers across the DP group.
+    3. Schedules the sub-samples to the DPxCP ranks using the BalancedCPScheduler.
+    4. Based on the schedule, reroutes the sub-samples to the correct rank using all-to-all.
+    5. Returns the assigned sub-samples to this rank.
+
+    Args:
+        data_iterator: The original data_iterator to wrap around
+        config: The config object containing the max_seqlen_per_dp_cp_rank
+        dp_cp_group: Data parallel context parallel group.
+    """
+
+    def __init__(
+        self, data_iterator, config, pg_collection: Optional[ProcessGroupCollection] = None
+    ):
+        self.data_iterator = data_iterator
+        self.config = config
+        if pg_collection is None:
+            self.dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+            self.dp_group = parallel_state.get_data_parallel_group()
+            self.tp_group = parallel_state.get_tensor_model_parallel_group()
+        else:
+            self.dp_cp_group = pg_collection.dp_cp
+            self.dp_group = pg_collection.dp
+            self.tp_group = pg_collection.tp
+        assert (
+            self.dp_cp_group is not None and self.dp_group is not None and self.tp_group is not None
+        ), "dp_cp_group, dp_group, tp_group must not be None when using hybrid context parallel"
+
+        self.cp_balancing_scheduler = BalancedCPScheduler(
+            max_seq_len_per_rank=self.config.max_seqlen_per_dp_cp_rank, dp_cp_group=self.dp_cp_group
+        )
+
+        self.total_hdp_gpus = self.dp_cp_group.size()
+
+    def __iter__(self):
+        """Return self as an iterator."""
+        return self
+
+    def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[int]:
+        """
+        Gathers the sequence lengths of all subsamples from all DP ranks.
+        Each DP rank loads the same number of microbatches but each microbatch
+        may have a different number of subsamples.
+
+        We find the number of subsamples each rank holds and then gather the
+        sequence lengths of all subsamples from all ranks.
+        """
+        # Collect the number of subsamples from all ranks
+        local_len = torch.tensor([subsample_seqlens.shape[0]], dtype=torch.int32).cuda()
+        dp_subsample_count = [torch.zeros_like(local_len) for _ in range(self.dp_group.size())]
+        torch.distributed.all_gather(dp_subsample_count, local_len, group=self.dp_group)
+
+        # Find the max number of subsamples across all ranks and pad subsample_seqlens to max length
+        dp_subsample_counts = torch.stack(dp_subsample_count, dim=0).cpu().view(-1)
+        max_sub_samples = int(dp_subsample_counts.max().item())
+
+        if local_len.item() < max_sub_samples:
+            subsample_seqlens_padded = torch.cat(
+                [
+                    subsample_seqlens,
+                    torch.zeros(max_sub_samples - local_len.item(), dtype=torch.int32).cuda(),
+                ],
+                dim=0,
+            )
+        else:
+            subsample_seqlens_padded = subsample_seqlens
+
+        # Gather the subsample_seqlens from all ranks
+        seqlens_gathered = [
+            torch.empty_like(subsample_seqlens_padded) for _ in range(self.dp_group.size())
+        ]
+        torch.distributed.all_gather(
+            seqlens_gathered, subsample_seqlens_padded, group=self.dp_group
+        )
+
+        # Trim each seqlens_gathered to the length of the correct sample
+        for dp_rank, seqlen in enumerate(seqlens_gathered):
+            seqlens_gathered[dp_rank] = seqlen[: dp_subsample_counts[dp_rank]]
+
+        seqlens_gathered = torch.cat(seqlens_gathered, dim=0)
+        seqlens_gathered = seqlens_gathered.cpu().tolist()
+
+        # Calculate the offsets to assign unique global ID to each subsample.
+        csum = torch.cumsum(dp_subsample_counts, dim=0, dtype=torch.int32)
+        offsets = torch.cat([torch.zeros(1, dtype=torch.int32), csum[:-1]], dim=0)
+
+        return seqlens_gathered, offsets
+
+    def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens_gathered):
+        """
+        Calculates the global ID for each subsample.
+
+        We assign a unique global ID to each subsample.
+
+        Returns:
+        global_id_seqlens: list of (global_id, seqlen) tuples for scheduling.
+        global_ids_this_rank: list of global IDs locally present on this rank.
+        """
+        dp_rank = self.dp_group.rank()
+        global_ids = torch.arange(len(seqlens_gathered), dtype=torch.int32).cuda()
+        # Create a list of (global_id, seqlen) tuples for scheduling
+        global_id_seqlens = [(i, seqlens_gathered[i]) for i in range(len(global_ids))]
+        # Get the global IDs locally present on this rank
+        global_ids_this_rank = global_ids[
+            offsets[dp_rank] : offsets[dp_rank] + num_local_subsamples
+        ]
+
+        return global_id_seqlens, global_ids_this_rank
+
+    def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int:
+        dp_src_rank = torch.bucketize(gid, offsets[1:] - 1)
+        # Since the torch.distributed.get_process_group_ranks
+        # provides the global rank, we need to consider TP
+        hdp_rank = (
+            torch.distributed.get_process_group_ranks(self.dp_group)[dp_src_rank]
+            // self.tp_group.size()
+        )
+        return hdp_rank
+
+    def reroute_samples_to_hdp_ranks(
+        self, batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets
+    ):
+        """
+        Reroutes the sub-samples to the correct rank after scheduling.
+
+        For each key in the batch dict, we perform an all-to-all communication
+        to transfer the data to the correct ranks.
+        Since all CP ranks within a DP group have the same data, we only need
+        to transfer data between matching CP ranks.
+        """
+        gid2local_id = {int(gid): i for i, gid in enumerate(global_ids_this_rank)}
+        hdp_rank = self.dp_cp_group.rank()
+        dp_ranks = torch.distributed.get_process_group_ranks(self.dp_group)
+        # Here we actually want to get the DP group's rank within the HDP group,
+        # we need to consider TP
+        dp_ranks = [r // self.tp_group.size() for r in dp_ranks]
+
+        data_keys = batch[0].keys()
+
+        # Create the send plan
+        combined_sample_id_groups: List[List[int]] = [[] for _ in range(self.total_hdp_gpus)]
+
+        for d in range(self.total_hdp_gpus):
+            for sample_id_group in sample_id_groups:
+                combined_sample_id_groups[d].extend(sample_id_group[d])
+
+        for dest_rank in range(self.total_hdp_gpus):
+            combined_sample_id_groups[dest_rank].sort()
+
+        # Filter out samples that are not present on this rank
+        send_ids_sorted = [
+            gid
+            for d in dp_ranks
+            for gid in combined_sample_id_groups[d]
+            if gid in global_ids_this_rank
+        ]
+        # send_counts = [len(combined_sample_id_groups[d]) for d in range(self.total_hdp_gpus)]
+
+        send_lens_split = [0] * self.total_hdp_gpus
+        for dest_rank in range(self.total_hdp_gpus):
+            if dest_rank in dp_ranks:
+                send_lens_split[dest_rank] = sum(
+                    [
+                        global_id_seqlens[gid][1]
+                        for gid in combined_sample_id_groups[dest_rank]
+                        if gid in global_ids_this_rank
+                    ]
+                )
+            else:
+                # We only need to share local data with DP ranks that have different data.
+                send_lens_split[dest_rank] = 0
+
+        # Create the recv plan
+        recv_sample_id_groups = [[] for _ in range(self.total_hdp_gpus)]
+        for gid in combined_sample_id_groups[hdp_rank]:
+            src_rank = self._gid_to_src_rank(gid, offsets)
+            recv_sample_id_groups[src_rank].append(gid)
+
+        recv_lens_split = [0] * self.total_hdp_gpus
+        for src_rank in range(self.total_hdp_gpus):
+            recv_lens_split[src_rank] = sum(
+                [global_id_seqlens[gid][1] for gid in recv_sample_id_groups[src_rank]]
+            )
+
+        recv_ids_sorted = [
+            gid for d in range(self.total_hdp_gpus) for gid in recv_sample_id_groups[d]
+        ]
+        recv_counts = [len(recv_sample_id_groups[d]) for d in range(self.total_hdp_gpus)]
+
+        recv_samples = [{k: None for k in data_keys} for _ in range(sum(recv_counts))]
+
+        def _pack_sample_by_key(key: str) -> torch.Tensor:
+            flattened_tensors = []
+            for gid in send_ids_sorted:
+                t = batch[gid2local_id[gid]][key].to(torch.cuda.current_device(), non_blocking=True)
+                flattened_tensors.append(t)
+            return (
+                torch.cat(flattened_tensors, dim=0)
+                if flattened_tensors
+                else torch.empty(0, device=torch.cuda.current_device(), dtype=batch[0][key].dtype)
+            )
+
+        def _unpack_sample_by_key(key: str, recv_tensor: torch.Tensor):
+            cursor = 0
+            for i, gid in enumerate(recv_ids_sorted):
+                sample_len = global_id_seqlens[gid][1]
+                recv_samples[i][key] = recv_tensor[cursor : cursor + sample_len]
+                cursor += sample_len
+
+        for key in data_keys:
+            send_tensor = _pack_sample_by_key(key)
+            recv_tensor = torch.empty(
+                sum(recv_lens_split), device=torch.cuda.current_device(), dtype=send_tensor.dtype
+            )
+            torch.distributed.all_to_all_single(
+                output=recv_tensor,
+                input=send_tensor,
+                output_split_sizes=recv_lens_split,
+                input_split_sizes=send_lens_split,
+                group=self.dp_cp_group,
+            )
+            _unpack_sample_by_key(key, recv_tensor)
+
+        recv_sample_with_id = {
+            recv_id: recv_samples[i] for i, recv_id in enumerate(recv_ids_sorted)
+        }
+        return recv_sample_with_id
+
+    def unpack_batch(self, batch):
+        """
+        Unpacks the packed samples into a list of sub-samples.
+        Since each sub-sample may be routed to different DPxCP ranks,
+        we unpack the sample here to avoid unnecessarily transferring
+        the entire packed sample.
+        """
+        batch_unpacked = []
+        for sample in batch:
+            for sub_sample in range(sample["cu_seqlens"].shape[0] - 1):
+                sub_sample_dict = {}
+                start_idx = sample["cu_seqlens"][sub_sample]
+                end_idx = sample["cu_seqlens"][sub_sample + 1]
+                if end_idx - start_idx == 0:
+                    continue
+                for key in sample.keys():
+                    if key in ["cu_seqlens", "batch_idx", "max_seqlen"]:
+                        continue
+                    sub_sample_dict[key] = sample[key][start_idx:end_idx]
+                batch_unpacked.append(sub_sample_dict)
+        return batch_unpacked
+
+    def __next__(self) -> Any:
+        """
+        Get the next item from the dataset, pull scheduling metadata and return it.
+        """
+        if self.data_iterator is None:
+            # TP0 reads from data_iterator, others receive via broadcast.
+            return None, None
+        else:
+            batch = next(self.data_iterator)
+        subsample_seqlens = []
+        for sample in batch:
+            subsample_seqlens.extend(
+                [
+                    int(sample["cu_seqlens"][i + 1] - sample["cu_seqlens"][i])
+                    for i in range(0, sample["cu_seqlens"].shape[0] - 1)
+                ]
+            )
+        subsample_seqlens = torch.tensor(subsample_seqlens, dtype=torch.int32).cuda()
+        subsample_seqlens = subsample_seqlens[subsample_seqlens != 0]
+
+        seqlens_gathered, offsets = self.get_global_seqlens(subsample_seqlens)
+
+        global_id_seqlens, global_ids_this_rank = self.get_global_id_seqlens(
+            subsample_seqlens.shape[0], offsets, seqlens_gathered
+        )
+
+        groups, sample_id_groups = self.cp_balancing_scheduler.get_groups_and_subsamples(
+            global_id_seqlens, self.config
+        )
+
+        batch = self.unpack_batch(batch)
+        samples_this_rank_with_id = self.reroute_samples_to_hdp_ranks(
+            batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets
+        )
+        return samples_this_rank_with_id, sample_id_groups
diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py
index 5f563882dea..2a5aa0a783a 100644
--- a/megatron/core/datasets/gpt_dataset.py
+++ b/megatron/core/datasets/gpt_dataset.py
@@ -3,7 +3,8 @@
 import logging
 import os
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from math import ceil
 from typing import Dict, Optional, Tuple
 
 import numpy
@@ -51,6 +52,32 @@ class GPTDatasetConfig(BlendedMegatronDatasetConfig):
     object_storage_cache_path: Optional[str] = None
     """Path for caching indices for s3 or msc dataloading."""
 
+    data_parallel_size: int = 1
+    """Option to enable data parallelism"""
+
+    sequence_parallel_size: int = 0
+    """Option to indicate the sequence parallelism size when using TP
+    Set to 0 if sequence parallel is not enabled regardless of TP size.
+    """
+
+    hybrid_context_parallel: bool = False
+    """Option to enable hybrid context parallelism. When setting this to True, 
+    each sample should be divisible by the data parallel size * context parallel size * 2.
+    If sequence parallel is enabled, it should be divisible by the 
+    data parallel size * context parallel size * sequence parallel size * 2.
+    """
+
+    sequences_per_dataset: Optional[Dict[str, int]] = None
+    """If provided, the sequence and document counts for each dataset. 
+       Check --per-dataset-sequences-path
+    """
+
+    token_dtype_code: Optional[int] = field(init=False, default=None)
+    """The dtype code for the token ids. 4 for int32, 8 for uint16."""
+
+    context_parallel_size: Optional[int] = None
+    """The size of the context parallel group. Needed for padding in packed sequences."""
+
     def __post_init__(self) -> None:
         """Do asserts and set fields post init"""
         super().__post_init__()
@@ -61,6 +88,17 @@ def __post_init__(self) -> None:
         assert self.reset_attention_mask is not None
         assert self.eod_mask_loss is not None
 
+        self.token_dtype_code = (
+            None
+            if self.tokenizer.vocab_size is None
+            else (4 if self.tokenizer.vocab_size > numpy.iinfo(numpy.uint16).max + 1 else 8)
+        )
+        if self.sequences_per_dataset is not None:
+            assert (
+                self.token_dtype_code is not None
+            ), "Tokenizer vocab size is not set, deactivate --per-dataset-sequences-path or \
+            fix the tokenizer."
+
 
 class GPTDataset(MegatronDataset):
     """The base GPT dataset
@@ -145,7 +183,17 @@ def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> Inde
                     path_to_idx_cache=config.object_storage_cache_path
                 ),
             )
-        return IndexedDataset(dataset_path, multimodal=False, mmap=config.mmap_bin_files)
+        sequences_per_dataset = None
+        if config.sequences_per_dataset:
+            sequences_per_dataset = config.sequences_per_dataset[dataset_path]
+        return IndexedDataset(
+            dataset_path,
+            multimodal=False,
+            mmap=config.mmap_bin_files,
+            fast_cache_load=config.fast_cache_load,
+            sequences_per_dataset=sequences_per_dataset,
+            dtype_code=config.token_dtype_code,
+        )
 
     def __len__(self) -> int:
         """Abstract method implementation
@@ -153,13 +201,34 @@ def __len__(self) -> int:
         Returns:
             int: The length of the dataset
         """
+        if self.config.defer_npy_index_mmap:
+            # NOTE(asolergi-nv): We need the number of samples of every GPTDataset to build/hit the BlendedDataset cache # pylint: disable=C0301
+            # NOTE(asolergi-nv): Uses logic from megatron/core/datasets/helpers.cpp::build_sample_idx to compute the number of samples # pylint: disable=C0301
+            num_tokens_per_epoch = self._get_num_tokens_per_epoch()
+            num_epochs = self._get_num_epochs(num_tokens_per_epoch)
+
+            drop_last_partial_sequence = True
+            if self.index_split == Split.valid:
+                drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence
+
+            if drop_last_partial_sequence:
+                return (
+                    num_epochs * num_tokens_per_epoch - self.config.add_extra_token_to_sequence
+                ) // self.config.sequence_length
+            else:
+                return ceil(
+                    float(
+                        num_epochs * num_tokens_per_epoch - self.config.add_extra_token_to_sequence
+                    )
+                    / self.config.sequence_length
+                )
         return self.sample_index.shape[0] - 1
 
     def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]:
         """Abstract method implementation
 
         Args:
-            idx (Optioal[int]): The index into the dataset
+            idx (Optional[int]): The index into the dataset
 
         Returns:
             Dict[str, torch.Tensor]: The sample information wrapped in a dictionary
@@ -239,6 +308,18 @@ def _query_document_sample_shuffle_indices(
         Returns:
             Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids
         """
+        if self.shuffle_index is None:
+            # NOTE(asolergi-nv): Lazy memmap the indexes
+            self.shuffle_index = numpy.load(
+                self.path_to_shuffle_index, allow_pickle=True, mmap_mode='r'
+            )
+            self.sample_index = numpy.load(
+                self.path_to_sample_index, allow_pickle=True, mmap_mode='r'
+            )
+            self.document_index = numpy.load(
+                self.path_to_document_index, allow_pickle=True, mmap_mode='r'
+            )
+
         # Do the shuffle mapping
         idx = self.shuffle_index[idx]
 
@@ -258,7 +339,7 @@ def _query_document_sample_shuffle_indices(
             sample_parts.append(
                 self.dataset.get(
                     self.document_index[doc_index_beg],
-                    offset=doc_index_beg_offset,
+                    offset=int(doc_index_beg_offset),
                     length=doc_index_end_offset
                     - doc_index_beg_offset
                     + self.config.add_extra_token_to_sequence,
@@ -279,7 +360,7 @@ def _query_document_sample_shuffle_indices(
                     else doc_index_end_offset + self.config.add_extra_token_to_sequence
                 )
                 sample_parts.append(
-                    self.dataset.get(self.document_index[i], offset=offset, length=length)
+                    self.dataset.get(self.document_index[i], offset=int(offset), length=length)
                 )
         assert len(document_ids) == len(
             sample_parts
@@ -320,6 +401,15 @@ def _build_document_sample_shuffle_indices(
             Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: The document index, the sample
             index, and the shuffle index
         """
+        if self.config.defer_npy_index_mmap:
+            # NOTE(asolergi-nv): Direct path to lazy memmap the indexes
+            base = f"{self.unique_description_hash}-{type(self).__name__}-{self.index_split.name}"
+            get_path_to = lambda affix: os.path.join(self.config.path_to_cache, f"{base}-{affix}")
+            self.path_to_document_index = get_path_to("document_index.npy")
+            self.path_to_sample_index = get_path_to("sample_index.npy")
+            self.path_to_shuffle_index = get_path_to("shuffle_index.npy")
+            return None, None, None
+
         path_to_cache = self.config.path_to_cache
         if path_to_cache is None and not self.config.mock:
             path_to_cache = os.path.join(
@@ -333,15 +423,19 @@ def _build_document_sample_shuffle_indices(
             path_to_document_index = get_path_to("document_index.npy")
             path_to_sample_index = get_path_to("sample_index.npy")
             path_to_shuffle_index = get_path_to("shuffle_index.npy")
-            cache_hit = all(
-                map(
-                    os.path.isfile,
-                    [
-                        path_to_description,
-                        path_to_document_index,
-                        path_to_sample_index,
-                        path_to_shuffle_index,
-                    ],
+            cache_hit = (
+                True
+                if self.config.fast_cache_load
+                else all(
+                    map(
+                        os.path.isfile,
+                        [
+                            path_to_description,
+                            path_to_document_index,
+                            path_to_sample_index,
+                            path_to_shuffle_index,
+                        ],
+                    )
                 )
             )
         else:
@@ -706,10 +800,11 @@ class MockGPTLowLevelDataset:
     """The hard-coded number of samples to generate"""
 
     max_sequence_length: int = 4096
-    """The hard-coded max sequence length to generate"""
+    """The hard-coded max sequence length of the random generated sequences"""
 
     def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
-        self.tokenizer = tokenizer
+        self.vocab_size = tokenizer.vocab_size
+        self.eod_token = tokenizer.eod
         rng = numpy.random.default_rng(seed=self.seed)
         self.sequence_lengths = rng.integers(
             low=1, high=self.max_sequence_length, size=self.size, dtype=numpy.int32
@@ -721,12 +816,12 @@ def __len__(self) -> int:
     def __getitem__(self, idx: int) -> numpy.number:
         length = self.sequence_lengths[idx]
         sample = numpy.int64(
-            numpy.concatenate([numpy.arange(length - 1) + 1, [self.tokenizer.eod]])
+            numpy.concatenate([(numpy.arange(length - 1) + 1) % self.vocab_size, [self.eod_token]])
         )
         return sample
 
     def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray:
-        """This function is n abstraction over __getitem__ with support for slicing
+        """This function is an abstraction over __getitem__ with support for slicing
 
         Args:
             idx (int): The index into the dataset
@@ -747,7 +842,7 @@ class MockGPTDataset(GPTDataset):
     """The mock GPT dataset
 
     Args:
-        indexed_dataset (MockGPTLowLevelDataset): The MockGPTLowLevelDataset around which to build
+        dataset (MockGPTLowLevelDataset): The MockGPTLowLevelDataset around which to build
             the MockGPTDataset
 
         dataset_path (Optional[str]): This argument is of no consequence for the MockGPTDataset
diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp
index bfb2958da51..1f587618d84 100644
--- a/megatron/core/datasets/helpers.cpp
+++ b/megatron/core/datasets/helpers.cpp
@@ -166,7 +166,8 @@ py::array_t<T> build_sample_idx(
   // Remove bound checks.
   auto sizes = sizes_.unchecked<1>();
   auto document_idx = document_idx_.unchecked<1>();
-
+  
+  // NOTE(asolergi-nv): This is the logic used to compute the number of samples in the GPTDataset when leveraging defer_npy_index_mmap
   // Build the sample idx as a contiguous 1-D array of type T.
   int64_t num_samples = 0;
   if (drop_last_partial_sequence == true) {
diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py
index 13c9df341da..76de4cca8d2 100644
--- a/megatron/core/datasets/indexed_dataset.py
+++ b/megatron/core/datasets/indexed_dataset.py
@@ -13,6 +13,7 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
+from datetime import datetime
 from enum import Enum
 from functools import lru_cache
 from itertools import accumulate
@@ -236,26 +237,45 @@ class _IndexReader(object):
         idx_path (str): The path to the index file
 
         multimodal (bool): Whether the dataset is multimodal
+
+        sequences_per_dataset (Optional[Tuple[int, int]]): The sequences per dataset.
+
+        dtype_code (int): The dtype code of the tokenized documents.
     """
 
-    def __init__(self, idx_path: str, multimodal: bool) -> None:
+    def __init__(
+        self,
+        idx_path: str,
+        multimodal: bool,
+        sequences_per_dataset: Optional[Tuple[int, int]] = None,
+        dtype_code: int = None,
+    ) -> None:
         log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")
 
-        with open(idx_path, "rb") as stream:
-            header = stream.read(9)
-            assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}"
+        if sequences_per_dataset:
+            self.dtype = DType.dtype_from_code(dtype_code)
+            self.dtype_size = DType.size(self.dtype)
+            self.sequence_count = sequences_per_dataset[0]
+            self.document_count = sequences_per_dataset[1]
+            offset = 34  # 9 bytes from the header + 8 bytes from the version
+            # + 1 bytes for the dtype code + 8 bytes for the sequence count
+            # + 8 bytes for the document count = 34 bytes
+        else:
+            with open(idx_path, "rb") as stream:
+                header = stream.read(9)
+                assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}"
 
-            version = struct.unpack("<Q", stream.read(8))[0]
-            assert version == 1, f"bad version, cannot read: {idx_path}"
+                version = struct.unpack("<Q", stream.read(8))[0]
+                assert version == 1, f"bad version, cannot read: {idx_path}"
 
-            code = struct.unpack("<B", stream.read(1))[0]
-            self.dtype = DType.dtype_from_code(code)
-            self.dtype_size = DType.size(self.dtype)
+                code = struct.unpack("<B", stream.read(1))[0]
+                self.dtype = DType.dtype_from_code(code)
+                self.dtype_size = DType.size(self.dtype)
 
-            self.sequence_count = struct.unpack("<Q", stream.read(8))[0]
-            self.document_count = struct.unpack("<Q", stream.read(8))[0]
+                self.sequence_count = struct.unpack("<Q", stream.read(8))[0]
+                self.document_count = struct.unpack("<Q", stream.read(8))[0]
 
-            offset = stream.tell()
+                offset = stream.tell()
 
         self.bin_buffer_mmap = numpy.memmap(idx_path, mode="r", order="C")
         self.bin_buffer = memoryview(self.bin_buffer_mmap)
@@ -306,10 +326,6 @@ def __init__(self, idx_path: str, multimodal: bool) -> None:
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
 
-        assert self.sequence_lengths.shape[0] == len(self)
-        assert self.sequence_lengths.shape[0] == self.sequence_count
-        assert self.sequence_lengths.shape[0] == self.document_indices[-1]
-
         log_single_rank(logger, logging.INFO, f"> total number of sequences: {len(self)}")
         log_single_rank(
             logger,
@@ -374,7 +390,7 @@ class _MMapBinReader(_BinReader):
     """A _BinReader that memory maps the data (.bin) file
 
     Args:
-        bin_path (str): bin_path (str): The path to the data (.bin) file.
+        bin_path (str): The path to the data (.bin) file.
     """
 
     def __init__(self, bin_path: str) -> None:
@@ -416,11 +432,17 @@ class _FileBinReader(_BinReader):
     """A _BinReader that reads from the data (.bin) file using a file pointer
 
     Args:
-        bin_path (str): bin_path (str): The path to the data (.bin) file.
+        bin_path (str): The path to the data (.bin) file.
     """
 
-    def __init__(self, bin_path: str) -> None:
+    def __init__(
+        self, bin_path: str, num_max_retries: int = 3, sleep_duration_start: int = 10
+    ) -> None:
         self._bin_path = bin_path
+        # Retry-specific parameters. With default arguments, sleep for 10, 20, 40 seconds
+        # between retries.
+        self.num_max_retries = num_max_retries
+        self.sleep_duration_start = sleep_duration_start
 
     def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray:
         """Read bytes into a numpy array.
@@ -436,24 +458,50 @@ def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndar
             numpy.ndarray: An array with `count` items and data-type `dtype` constructed from
                 reading bytes from the data file starting at `offset`.
         """
-        sequence = numpy.empty(count, dtype=dtype)
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            with msc.open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
-                bin_buffer_file.seek(offset)
-                bin_buffer_file.readinto(sequence)
-        else:
-            with open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
-                bin_buffer_file.seek(offset)
-                bin_buffer_file.readinto(sequence)
-        return sequence
+
+        def _read():
+            """Helper method to read `count` bytes from self._bin_path at provided offset."""
+            sequence = numpy.empty(count, dtype=dtype)
+            if MultiStorageClientFeature.is_enabled():
+                msc = MultiStorageClientFeature.import_package()
+                with msc.open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
+                    bin_buffer_file.seek(offset)
+                    bin_buffer_file.readinto(sequence)
+            else:
+                with open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
+                    bin_buffer_file.seek(offset)
+                    bin_buffer_file.readinto(sequence)
+            return sequence
+
+        sleep_duration = self.sleep_duration_start
+        for i in range(self.num_max_retries + 1):
+            try:
+                return _read()
+            except Exception as e:
+                time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+                if i == self.num_max_retries:
+                    logger.warning(
+                        f"[{time_str}] {self.num_max_retries+1} total tries to read data item "
+                        f"failed; going to abort and re-raise exception \"{e}\"..."
+                    )
+                    # Re-raise exception if in last iteration of for loop.
+                    raise e
+                logger.warning(
+                    f"[{time_str}] Attempt {i+1}/{self.num_max_retries+1} to read data item "
+                    f"failed with exception \"{e}\"; going to sleep for {sleep_duration} "
+                    "seconds and then re-try..."
+                )
+                time.sleep(sleep_duration)
+                sleep_duration = sleep_duration * 2
+
+        raise RuntimeError("Should not reach here!")
 
 
 class _S3BinReader(_BinReader):
     """A _BinReader that reads from the data (.bin) file from S3
 
     Args:
-        bin_path (str): bin_path (str): The path to the data (.bin) file.
+        bin_path (str): The path to the data (.bin) file.
 
         bin_chunk_nbytes (int, optional): If not None, then maintain an in-memory cache to speed
             up calls to the `read` method. Furthermore, on a cache miss, download this number of
@@ -539,7 +587,9 @@ class _MultiStorageClientBinReader(_BinReader):
     """A _BinReader that reads from the data (.bin) file using the multi-storage client.
 
     Args:
-        bin_path (str): bin_path (str): The path to the data (.bin) file.
+        bin_path (str): The path to the data (.bin) file.
+
+        object_storage_config (ObjectStorageConfig): The object storage config.
     """
 
     def __init__(self, bin_path: str, object_storage_config: ObjectStorageConfig) -> None:
@@ -573,6 +623,12 @@ class IndexedDataset(torch.utils.data.Dataset):
             `object_storage_config.path_to_idx_cache` and streams data from the data (.bin) file
             in `object_storage_config.bin_chunk_nbytes` blocks. Note that `mmap` must be disabled
             for S3 data loading. Defaults to None.
+
+        fast_cache_load (bool): Whether to use the fast cache mode.
+
+        sequences_per_dataset (Optional[Tuple[int, int]]): The sequences per dataset.
+
+        dtype_code (int): The dtype code of the tokenized documents.
     """
 
     def __init__(
@@ -582,6 +638,9 @@ def __init__(
         mmap: bool = True,
         object_storage_config: Optional[ObjectStorageConfig] = None,
         s3_config: Optional[S3Config] = None,
+        fast_cache_load: bool = False,
+        sequences_per_dataset: Optional[Tuple[int, int]] = None,
+        dtype_code: int = None,
     ) -> None:
         super().__init__()
         self.path_prefix: str
@@ -601,7 +660,20 @@ def __init__(
             cache_idx_path = get_index_cache_path(idx_path, object_storage_config)
             cache_index_file(idx_path, cache_idx_path)
 
-        self.initialize(path_prefix, multimodal, mmap, object_storage_config)
+        self.initialize(
+            path_prefix,
+            multimodal,
+            mmap,
+            object_storage_config,
+            fast_cache_load,
+            sequences_per_dataset,
+            dtype_code,
+        )
+
+        if not fast_cache_load:
+            assert self.index.sequence_lengths.shape[0] == self.index.document_indices[-1]
+            assert self.index.sequence_lengths.shape[0] == len(self.index)
+            assert self.index.sequence_lengths.shape[0] == self.index.sequence_count
 
     def initialize(
         self,
@@ -609,6 +681,9 @@ def initialize(
         multimodal: bool,
         mmap: bool,
         object_storage_config: Optional[ObjectStorageConfig],
+        fast_cache_load: bool = False,
+        sequences_per_dataset: Optional[Tuple[int, int]] = None,
+        dtype_code: int = None,
     ) -> None:
         """Initialize the dataset
 
@@ -624,18 +699,27 @@ def initialize(
 
             object_storage_config (Optional[ObjectStorageConfig]): See IndexedDataset docstring
                 for details.
+
+            fast_cache_load (bool): Whether to use the fast cache mode.
+
+            sequences_per_dataset (Optional[Tuple[int, int]]): The sequences per dataset.
+
+            dtype_code (int): The dtype code of the tokenized documents.
         """
         idx_path = get_idx_path(path_prefix)
         bin_path = get_bin_path(path_prefix)
-        if object_storage_config is None:
-            assert os.path.exists(idx_path) and os.path.exists(
-                bin_path
-            ), "One or both of the .idx and .bin files cannot be found at the "
-            f"path prefix {path_prefix}"
+        if object_storage_config is None and not fast_cache_load:
+            assert os.path.exists(idx_path) and os.path.exists(bin_path), (
+                "One or both of the .idx and .bin files cannot be found at the "
+                f"path prefix {path_prefix}"
+            )
         self.path_prefix = path_prefix
         self.multimodal = multimodal
         self.mmap = mmap
         self.object_storage_config = object_storage_config
+        self.fast_cache_load = fast_cache_load
+        self.sequences_per_dataset = sequences_per_dataset
+        self.dtype_code = dtype_code
         if mmap:
             assert not object_storage_config
             self.bin_reader = _MMapBinReader(bin_path)
@@ -647,7 +731,7 @@ def initialize(
             idx_path = get_index_cache_path(get_idx_path(path_prefix), object_storage_config)
         else:
             self.bin_reader = _FileBinReader(bin_path)
-        self.index = _IndexReader(idx_path, self.multimodal)
+        self.index = _IndexReader(idx_path, self.multimodal, sequences_per_dataset, dtype_code)
 
     def __getstate__(self) -> Tuple[str, bool, bool, Optional[ObjectStorageConfig]]:
         """Get the state during pickling
@@ -655,7 +739,15 @@ def __getstate__(self) -> Tuple[str, bool, bool, Optional[ObjectStorageConfig]]:
         Returns:
             Tuple[str, bool, bool, Optional[ObjectStorageConfig]]: The state tuple
         """
-        return self.path_prefix, self.multimodal, self.mmap, self.object_storage_config
+        return (
+            self.path_prefix,
+            self.multimodal,
+            self.mmap,
+            self.object_storage_config,
+            self.fast_cache_load,
+            self.sequences_per_dataset,
+            self.dtype_code,
+        )
 
     def __setstate__(self, state: Tuple[str, bool, bool, Optional[ObjectStorageConfig]]) -> None:
         """Set the state during un-pickling
@@ -663,8 +755,24 @@ def __setstate__(self, state: Tuple[str, bool, bool, Optional[ObjectStorageConfi
         Args:
             state (Tuple[str, bool, bool, Optional[ObjectStorageConfig]]): The state tuple
         """
-        path_prefix, multimodal, mmap, object_storage_config = state
-        self.initialize(path_prefix, multimodal, mmap, object_storage_config)
+        (
+            path_prefix,
+            multimodal,
+            mmap,
+            object_storage_config,
+            fast_cache_load,
+            sequences_per_dataset,
+            dtype_code,
+        ) = state
+        self.initialize(
+            path_prefix,
+            multimodal,
+            mmap,
+            object_storage_config,
+            fast_cache_load,
+            sequences_per_dataset,
+            dtype_code,
+        )
 
     def __del__(self) -> None:
         """Clean up the object"""
diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py
index e4caf06035b..9d57ce5bd53 100644
--- a/megatron/core/datasets/masked_dataset.py
+++ b/megatron/core/datasets/masked_dataset.py
@@ -114,12 +114,30 @@ def __init__(
 
     @staticmethod
     def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
+        """Return the number of documents in the underlying low level dataset.
+
+        Args:
+            low_level_dataset (IndexedDataset): The underlying IndexedDataset
+
+        Returns:
+            int: The number of unique elements in the underlying IndexedDataset
+        """
         return low_level_dataset.document_indices.shape[0] - 1
 
     @staticmethod
     def build_low_level_dataset(
         dataset_path: str, config: MaskedWordPieceDatasetConfig
     ) -> IndexedDataset:
+        """Build the low level dataset (IndexedDataset) from the given path.
+
+        Args:
+            dataset_path (str): The real path prefix to the IndexedDataset .bin and .idx files
+
+            config (MaskedWordPieceDatasetConfig): The config
+
+        Returns:
+            IndexedDataset: The underlying IndexedDataset
+        """
         return IndexedDataset(dataset_path)
 
     @staticmethod
@@ -245,7 +263,7 @@ def _create_masked_lm_predictions(
                 2. masked_positions -> The indices for the masked token ids
                 3. masked_labels    -> The original token ids for the masked token ids
                 4. boundaries       -> The sentence and word boundaries for the sequence
-                4. masked_spans     -> The masked positions and labels with N-gram info intact
+                5. masked_spans     -> The masked positions and labels with N-gram info intact
         """
         # Build the token sentence and word boundaries and the masking candidates
         # e.g. [cls, id, ##id, ##id, id, ##id, sep, id, ##id, sep]
diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py
index 08b602c4766..a8c2a431e59 100644
--- a/megatron/core/datasets/megatron_tokenizer.py
+++ b/megatron/core/datasets/megatron_tokenizer.py
@@ -23,11 +23,15 @@ class MegatronLegacyTokenizer(ABC):
     """
 
     def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
+        from megatron.core.utils import log_single_rank
+
         # Deprecation warning
-        logger.warning(
-            "You’re using the legacy tokenizer system, which is deprecated "
-            "and will be removed in a future release. Please migrate to the new tokenizer system "
-            "(`megatron.core.tokenizers.MegatronTokenizer`)."
+        log_single_rank(
+            logger,
+            logging.WARNING,
+            "You're using the legacy tokenizer system, which is deprecated "
+            "and will be removed in a future release. Please migrate to the new "
+            "tokenizer system (`megatron.core.tokenizers.MegatronTokenizer`).",
         )
         self.unique_identifiers = OrderedDict()
         self.unique_identifiers["class"] = type(self).__name__
diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md
index 12ade943b53..452bf24e4a2 100644
--- a/megatron/core/datasets/readme.md
+++ b/megatron/core/datasets/readme.md
@@ -9,11 +9,11 @@ Data preprocessing is built around the following classes:
 
 At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details.
 
-#### IndexedDatasetBuilder
+### IndexedDatasetBuilder
 
 The `IndexedDatasetBuilder` is capable of building and merging `IndexedDataset` instances.
 
-#### IndexedDataset
+### IndexedDataset
 
 The `IndexedDataset` class is the lowest-level data interface in Megatron Core. Internally, an `IndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata.
 
@@ -42,32 +42,32 @@ Building the data loaders is a distributed-aware process built around the follow
 
 See the class docstrings for more details.
 
-#### BlendedMegatronDatasetConfig (extendable)
+### BlendedMegatronDatasetConfig (extendable)
 
 The `BlendedMegatronDatasetConfig` class parameterizes the `BlendedMegatronDatasetBuilder` and in turn the `MegatronDataset` and `BlendedDataset`.
 
 Different training/inference regimes will require different extensions e.g. the `GPTDatasetConfig`
 
-#### BlendedMegatronDatasetBuilder
+### BlendedMegatronDatasetBuilder
 
 The `BlendedMegatronDatasetBuilder` class builds the highest-level data interfaces in Megatron Core.
 
 **NB:** All ranks should attempt to build the dataset via the `BlendedMegatronDatasetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `BlendedMegatronDatasetConfig`.
 
-#### IndexedDataset
+### IndexedDataset
 
 The `IndexedDataset` class is the lowest-level data interface in Megatron Core.
 
 The `IndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces.
 
 
-#### MegatronDataset (extendable)
+### MegatronDataset (extendable)
 
 The `MegatronDataset` abstract class is a high-level data interface in Megatron Core. It is an abstraction built upon the `IndexedDataset`.
 
 Different training/inference regimes will require different extensions e.g. the `GPTDataset`
 
-#### BlendedDataset
+### BlendedDataset
 
 The `BlendedDataset` class is a high-level data interface in Megatron Core. It is an abstraction built upon the `MegatronDataset`.
 
@@ -191,3 +191,13 @@ To query the `BlendedDataset` for the _k_-th sample we do the following
     ```
 
 To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `BlendedDataset.__init__` function.
+
+## Fast DataLoader initialization
+
+Especially for large-scale runs, DataLoader initialization can take several minutes, since it involves opening and memory-mapping multiple files and can significantly stress the filesystem. To speed up this process, we have developed the following three optimizations, controlled by configuration flags":
+
+  - `--dataloader-fast-cache-load`: This option assumes that the dataset cache already exists in the specified `--data-cache-path`. When enabled, it speeds up the creation process by removing synchronization points and file check assertions.
+
+  - `--dataloader-defer-npy-index-mmap`: This option also assumes that the dataset cache already exists in the specified `--data-cache-path`. When enabled, it defers the memory mapping of the dataset indexes (.npy files) until their first access. We recommend using this configuration together with `--num-workers` > 0 so that the DataLoader prefetches the next batches of data, thereby hiding the cost of index memory mapping.
+
+  - `--per-dataset-sequences-path`: With this configuration, we specify the JSON file generated by the `tools/build_sequences_per_dataset.py` script. This script generates a single file containing the required metadata from all the specified file prefixes. This configuration is especially useful when dealing with hundreds to thousands of file prefixes, since it requires only a single `open` operation instead of one per file prefix.
\ No newline at end of file
diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py
deleted file mode 100644
index 7ce970c6e9f..00000000000
--- a/megatron/core/datasets/retro/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-from .config import RetroGPTChunkDatasets
-from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig
-from .query.retro_dataset import get_retro_datasets
diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py
deleted file mode 100644
index 3635bedb3f4..00000000000
--- a/megatron/core/datasets/retro/config/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-Exports:
-
-  - Embedder: Base class for all Bert embedders.
-  - RetroBertEmbedders: Container class for in-memory and on-disk embedders.
-  - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing.
-  - RetroGPTChunkDatasets: Container class for train, valid, and test datasets.
-  - RetroTokenizers: Container class for GPT and Bert tokenizers.
-"""
-
-from .bert_embedders import Embedder, RetroBertEmbedders
-from .config import RetroPreprocessingConfig
-from .gpt_chunk_datasets import RetroGPTChunkDatasets
-from .tokenizers import RetroTokenizers
diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py
deleted file mode 100644
index c34cd3d79dd..00000000000
--- a/megatron/core/datasets/retro/config/bert_embedders.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Container dataclass for holding both in-memory and on-disk Bert embedders."""
-
-import abc
-from dataclasses import dataclass
-
-import numpy as np
-import torch
-
-
-class Embedder(abc.ABC):
-    """Base class for all Bert embedders.
-
-    All embedders should be able to embed either an entire text dataset (to a 2D
-    numpy array), or a single text string (to a 1D numpy array).
-    """
-
-    @abc.abstractmethod
-    def embed_text_dataset(self, text_dataset: torch.utils.data.Dataset) -> np.ndarray:
-        """Embed a text dataset.
-
-        Args:
-            text_dataset (torch.utils.data.Dataset): Text dataset to embed.
-                Each sample of the text dataset should output a dict with a key 'text'
-                and a string value.
-
-        Returns:
-            A 2D ndarray with shape (len(text_dataset), dimension(embedder)).
-        """
-
-    @abc.abstractmethod
-    def embed_text(self, text: str) -> np.ndarray:
-        """Embed a simple string of text.
-
-        Args:
-            text (str): A single text sample.
-
-        Returns:
-            A 1D ndarray with shape (dimensions(embedder),).
-        """
-
-
-@dataclass
-class RetroBertEmbedders:
-    """Container dataclass for in-memory and on-disk Bert embedders."""
-
-    disk: Embedder
-    mem: Embedder
diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py
deleted file mode 100644
index ac9ca841242..00000000000
--- a/megatron/core/datasets/retro/config/config.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Retro preprocessing config."""
-
-from dataclasses import dataclass
-
-from megatron.core.transformer import TransformerConfig
-
-from .bert_embedders import RetroBertEmbedders
-from .gpt_chunk_datasets import RetroGPTChunkDatasets
-from .tokenizers import RetroTokenizers
-
-
-@dataclass
-class RetroPreprocessingConfig(TransformerConfig):
-    """Configuration object for Retro preprocessing.
-
-    *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are
-    included and named as such to more easily handle managing both models
-    running at the same time. Megatron is not optimized to run two models at
-    once, so this naming convention makes it clearer.
-
-    Args:
-
-        retro_project_dir (str): Retro project directory, which contains the preprocessed data for for pretraining. This directory is built during preprocessing (see tools/retro/README.md), and contains subdirectories for the chunk database and pretraining neighbors.
-        retro_tasks (str): Comma-separated list of tasks to run. Run entire preprocesing pipeline by using '--retro-tasks build'. Alternatively, run individual stages with tasks (in this order) 'db-build', 'index-build', or 'query-pretraining-neighbors'. For example, '--retro-tasks db-build,index-build,query-pretraining-neighbors' is equivalent to '--retro-tasks build'; or the argument can contain a subset of these tasks. Stages must always be run in the correct order (listed above).
-        retro_task_validate (float): If defined, validate a randomly sampled subset of the existing results of the given task. Each task implements a 'validate' method that is responsible for sampling a `retro_task_validate` fraction of the existing results, and then checking for bitwise equality with the current code base. (E.g., `--retro-task-validate 0.01`.)
-        retro_block_size (int): Number of chunks to process at a time when generating Bert embeddings and querying the search index. Partial results for each block are generally saved to disk in separate files.
-        retro_doc_block_size (int): Number of documents to processe at time when processing token datasets into chunk databases. The partial chunk database for each block is saved into a separate file.
-        retro_gpt_seed (int): Random seed used for python, numpy, pytorch, and cuda.
-        retro_gpt_data_path (str): Path to the training dataset. Accepted format: 1) a single data path, 2) multiple datasets in the form: dataset1-weight dataset1-path dataset2-weight dataset2-path ... It is used with --split when a single dataset used for all three: train, valid and test. It is exclusive to the other --*-data-path args.
-        retro_gpt_data_cache_path (str): Path to a directory to hold cached index files.
-        retro_gpt_split (str): Comma-separated list of proportions for training, validation, and test split. For example the split `90,5,5` will use 90%% of data for training, 5%% for validation and 5%% for test.
-        retro_gpt_train_samples (int): Total number of samples to train over all training runs.
-        retro_gpt_eval_interval (int): GPT evaluation interval.
-        retro_gpt_eval_iters (int): GPT evaluation iterations.
-        retro_gpt_tokenizer_type (str): GPT tokenizer type.
-        retro_gpt_tokenizer_model (str): GPT tokenizer model file.
-        retro_gpt_vocab_file (str): GPT vocab file.
-        retro_gpt_merge_file (str): GPT merge file.
-        retro_gpt_seq_length (int): GPT sequence length.
-        retro_gpt_global_batch_size (int): GPT global batch size.
-        retro_gpt_chunk_length (int): GPT chunk length.
-        retro_bert_tokenizer_type (str): Bert tokenizer type (for when using '--bert-embedder-type megatron').
-        retro_bert_vocab_file (str): Bert vocab file.
-        retro_bert_batch_size (int): Micro-batch size for processing Bert embeddings.
-        retro_bert_max_chunk_length (int): Maximum sequence length for Bert embeddings. (Named 'chunk' here in reference to these Bert sequences being converted from GPT chunks.)
-        retro_index_type (str): A 'faiss-base' index is a simple, un-optimized wrapper around a Faiss index. A 'faiss-par-add' index optimizes the 'add()' method by making it multi-node and multi-process, but with bit-wise equivalent results.
-        retro_index_str (str): Index string used for calling faiss.index_factory(). For example, 'IVF262144_HNSW32,Flat' or 'OPQ32_256,IVF4194304_HNSW32,PQ32'.
-        retro_index_ntrain (int): Number of database chunks to use for training the index. This value must be less or equal to the total number of chunks in the database.
-        retro_index_train_load_fraction (float): Fraction of sampled chunks to use for training the index. Useful when our total sampled embeddings use too much memory; lowering the load fraction is less costly than re-embedding a new sampled dataset from scratch.
-        retro_index_add_load_fraction (float): Fraction of database chunks to use for adding to the index. Useful when our total index size would use too much memory; lowering the load fraction is less costly than re-designing our token datasets.
-        retro_index_delete_training_embeddings (bool): Delete training embeddings for the search index. Useful for debugging.
-        retro_index_delete_added_codes (bool): Delete added codes for the search index. Useful for debugging.
-        retro_query_ef_search (int): Index ef-search parameter for Hierarchical Navigable Small Worlds (HNSW) during querying.
-        retro_query_nprobe (int): Index nprobe parameter for Inverted File (IVF) during querying.
-        retro_query_num_neighbors_query (int): Number of neighbors to retrieve when calling index.search().
-        retro_query_num_neighbors_save (int): Number of neighbors to save to disk after the index's returned neighbors. If longer than target value, neighbors truncated; and if shorter than target value, neighbors are padded with -1's.
-        retro_bert_embedders (RetroBertEmbedders): Set of Bert embedders used for embedding chunks. Contains entries: 1) 'mem' for an in-memory embedder, and 2) 'disk' for an embedder that saves results in blocks to disk.
-        retro_gpt_chunk_datasets (RetroGPTChunkDatasets): GPT datasets for 'train', 'valid', and 'test'.
-        retro_tokenizers (RetroTokenizers): GPT ('gpt') and Bert ('bert') tokenizers.
-    """
-
-    # Basic.
-    retro_project_dir: str = None
-    retro_tasks: str = 'build'
-    retro_task_validate: float = None
-    retro_block_size: int = 100000
-    retro_doc_block_size: int = 100000
-
-    # GPT.
-    retro_gpt_seed: int = 1234
-    retro_gpt_data_path: list = None  # basic list here, for parsing purposes
-    retro_gpt_data_cache_path: str = None
-    retro_gpt_split: str = '969,30,1'
-    retro_gpt_train_samples: int = None
-    retro_gpt_eval_interval: int = None
-    retro_gpt_eval_iters: int = None
-    retro_gpt_tokenizer_type: str = None
-    retro_gpt_tokenizer_model: str = None
-    retro_gpt_vocab_file: str = None
-    retro_gpt_merge_file: str = None
-    retro_gpt_seq_length: int = None
-    retro_gpt_global_batch_size: int = None
-    retro_gpt_chunk_length: int = 64
-
-    # Bert.
-    retro_bert_tokenizer_type: str = None
-    retro_bert_vocab_file: str = None
-    retro_bert_batch_size: int = 128
-    retro_bert_max_chunk_length: int = 256
-
-    # Index.
-    retro_index_type: str = 'faiss-par-add'
-    retro_index_str: str = None
-    retro_index_ntrain: int = None
-    retro_index_train_load_fraction: float = 1.0
-    retro_index_add_load_fraction: float = 1.0
-    retro_index_delete_training_embeddings: bool = True
-    retro_index_delete_added_codes: bool = True
-
-    # Query.
-    retro_query_ef_search: int = 256
-    retro_query_nprobe: int = 65536
-    retro_query_num_neighbors_query: int = 200
-    retro_query_num_neighbors_save: int = 20
-
-    # Tools.
-    retro_bert_embedders: RetroBertEmbedders = None
-    retro_gpt_chunk_datasets: RetroGPTChunkDatasets = None
-    retro_tokenizers: RetroTokenizers = None
-
-    def __post_init__(self) -> None:
-        """Validate Retro config."""
-
-        # Validate required attributes.
-        assert self.retro_project_dir is not None
-        assert self.retro_tasks is not None
-        assert self.retro_gpt_data_path is not None or self.retro_gpt_data_cache_path is not None
-        assert self.retro_gpt_train_samples is not None
-        assert self.retro_gpt_eval_interval is not None
-        assert self.retro_gpt_eval_iters is not None
-        assert self.retro_gpt_tokenizer_type is not None
-        assert self.retro_gpt_tokenizer_model is not None or (
-            self.retro_gpt_vocab_file is not None and self.retro_gpt_merge_file is not None
-        )
-        assert self.retro_gpt_seq_length is not None
-        assert self.retro_gpt_global_batch_size is not None
-        assert self.retro_bert_tokenizer_type is not None
-        assert self.retro_bert_vocab_file is not None
-        assert self.retro_index_str is not None
-        assert self.retro_index_ntrain is not None
-
-        # Split retro tasks.
-        self.retro_tasks = self.retro_tasks.split(",")
diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py
deleted file mode 100644
index 831b1d812bf..00000000000
--- a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Container dataclass for GPT chunk datasets (train, valid, and test)."""
-
-from dataclasses import dataclass
-
-
-@dataclass
-class RetroGPTChunkDatasets:
-    """Container dataclass for GPT chunk datasets."""
-
-    # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'.
-    train: dict = None
-    valid: dict = None
-    test: dict = None
diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py
deleted file mode 100644
index 69ca94b3cfd..00000000000
--- a/megatron/core/datasets/retro/config/tokenizers.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Container class for GPT and Bert tokenizers."""
-
-from dataclasses import dataclass
-
-from megatron.core.tokenizers import MegatronTokenizerBase
-
-
-@dataclass
-class RetroTokenizers:
-    """Container class for GPT and Bert tokenizers."""
-
-    gpt: MegatronTokenizerBase = None
-    bert: MegatronTokenizerBase = None
diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py
deleted file mode 100644
index f1f460b3b02..00000000000
--- a/megatron/core/datasets/retro/db/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-Exports:
-
-  - build_db: Build a chunk database from a list of indexed datasets.
-"""
-
-from .build import build_db
diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py
deleted file mode 100644
index 0cd94729385..00000000000
--- a/megatron/core/datasets/retro/db/build.py
+++ /dev/null
@@ -1,649 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Build a chunk database from a list of indexed datasets.
-
-Building a chunk database consists of.
-
-  - Breaking each document of each indexed dataset into consecutive
-      retro_gpt_chunk_length chunks.
-  - Re-tokenize each chunk into Bert, and discard any chunks with empty Bert
-      tokens.
-  - Save chunk offsets to disk for each indexed dataset.
-"""
-
-import os
-import types
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from typing import Dict, List, Tuple
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.indexed_dataset import IndexedDataset
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.utils import (
-    extract_data_config,
-    get_blocks_by_rank,
-    log_retro_rank_0,
-    retro_makedir,
-)
-
-from .utils import (
-    get_indexed_dataset_infos,
-    get_indexed_dataset_infos_path,
-    get_individual_chunk_db,
-    get_individual_db_dir,
-    get_individual_db_paths,
-    get_individual_doc_offsets,
-    get_merged_db_path_map,
-    init_indexed_dataset_infos,
-    save_indexed_dataset_infos,
-)
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-
-def build_partial_db(
-    config: types.SimpleNamespace,
-    dataset_idx: int,
-    n_datasets: int,
-    indexed_dataset: IndexedDataset,
-    block_id: int,
-    n_blocks: int,
-    block: dict,
-    proc_id: int,
-    n_procs: int,
-) -> Tuple[int, list, list, dict]:
-    """Process a document index range of the indexed dataset.
-
-    The chunk database is built in parallel blocks, since de-tokenizing &
-    re-tokenizing for Bert-length computation is expensive. This method
-    iterates each document and extracts sequential 'chunk-length' sequences
-    from each document.
-
-    Args:
-        config (types.SimpleNamespace): Subset of Retro config, containing
-            'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
-        dataset_idx (int): Index of this dataset out of all blended datasets.
-        n_datasets (int): Total number of blended datasets.
-        indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
-        block_id (int): Block index out of all blocks to be processed.
-        n_blocks (int):  Total number of blocks to be processed.
-        block (dict): Range information such as start/end points for chunking idnexed dataset.
-        proc_id (int): Process ID for tracking parallel process order.
-        n_procs (int): Total number of parallel processes.
-
-    Returns:
-        A tuple containing:
-
-        - Process ID.
-        - List of valid chunks.
-        - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.).
-        - Dict mapping document ID to number of valid chunks.
-    """
-
-    if not HAVE_TQDM:
-        raise ImportError("tqdm is required to use the RetroDataset. Please install tqdm.")
-
-    # Document start/end indexes.
-    doc_range = block["range"]
-    n_docs = doc_range[1] - doc_range[0]
-    n_docs_per_proc = int(np.ceil(n_docs / n_procs))
-    doc_start_id = doc_range[0] + proc_id * n_docs_per_proc
-    doc_end_id = min(doc_range[1], doc_start_id + n_docs_per_proc)
-
-    # Print progress.
-    progress_proc_ids = set(range(n_procs)) if torch.distributed.get_rank() == 0 else set()
-    if proc_id in progress_proc_ids:
-        log_retro_rank_0(
-            " > building partial chunk db, proc %d / %d, docs %d:%d / %d."
-            % (proc_id, n_procs, doc_start_id, doc_end_id, n_docs)
-        )
-
-    # Progress bars (snapshot of overall progress).
-    doc_id_iter = range(doc_start_id, doc_end_id)
-    pbar = (
-        tqdm(doc_id_iter, "parse doc chunks", miniters=len(doc_id_iter) // 20)
-        if proc_id in progress_proc_ids
-        else doc_id_iter
-    )
-
-    # Iterate documents & parse chunks.
-    chunk_db_valid: List[Tuple] = []
-    chunk_db_invalid: List[Tuple] = []
-    doc_size_map = {}
-    for doc_id in pbar:
-        # Progress description.
-        try:
-            pbar.set_description(
-                "%sds %d / %d, block %d / %d, proc %d / %d."
-                % (
-                    "" if config.task_validate is None else "[validate] ",
-                    dataset_idx,
-                    n_datasets,
-                    block_id,
-                    n_blocks,
-                    proc_id,
-                    n_procs,
-                )
-            )
-        except Exception:
-            pass
-
-        # Remove EOD token.
-        doc = indexed_dataset.get(doc_id)
-        if doc[-1].item() == config.gpt_eod:
-            doc = doc[:-1]
-        doc_len = len(doc)
-
-        # Chunk start/end indexes.
-        chunk_start_idxs = list(range(0, doc_len, config.chunk_length))
-        chunk_end_idxs = [min(doc_len, s + config.chunk_length) for s in chunk_start_idxs]
-
-        # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid').
-        doc_size_map[doc_id] = 0
-        for i, chunk_start_idx in enumerate(chunk_start_idxs):
-            # Re-tokenize.
-            chunk_end_idx = chunk_end_idxs[i]
-            gpt_token_ids = indexed_dataset.get(
-                idx=doc_id, offset=chunk_start_idx, length=chunk_end_idx - chunk_start_idx
-            )
-            text = config.gpt_detokenize(gpt_token_ids.tolist())
-            bert_token_ids = config.bert_tokenize(text)
-
-            # 'Valid' for non-empty Bert chunks; 'invalid' otherwise.
-            if len(bert_token_ids) == 0:
-                _chunk_db = chunk_db_invalid
-            else:
-                _chunk_db = chunk_db_valid
-                doc_size_map[doc_id] += 1
-            _chunk_db.append((doc_id, chunk_start_idx, chunk_end_idx, len(bert_token_ids)))
-
-    return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map
-
-
-def build_block_db(
-    config: RetroPreprocessingConfig,
-    dataset_idx: int,
-    n_datasets: int,
-    indexed_dataset: IndexedDataset,
-    n_procs: int,
-    executor: ProcessPoolExecutor,
-    n_missing_blocks: int,
-    block_idx: int,
-    block: dict,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Split each document within block into consecutive retro_gpt_chunk_length size chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): For DB building, we make use of attributes
-            'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
-        dataset_idx (int): Index of this dataset out of all blended datasets.
-        n_datasets (int): Total number of blended datasets.
-        indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
-        n_procs (int): Total number of parallel processes.
-            executor (ProcessPoolExecutor): Executor for launching parallel processes.
-        n_missing_blocks (int):  Total number of blocks to be processed.
-        block_idx (int): Block index out of all blocks to be processed.
-        block (dict): Range information such as start/end points for chunking idnexed dataset.
-
-    Returns:
-        A tuple containing:
-
-        - List of valid chunks.
-        - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.).
-        - Dict mapping document ID to number of valid chunks.
-    """
-
-    # Build partial dbs.
-    log_retro_rank_0(" > build partial dbs.")
-    futures = []
-    for proc_id in range(n_procs):  # not true process id
-        futures.append(
-            executor.submit(
-                build_partial_db,
-                types.SimpleNamespace(
-                    chunk_length=config.retro_gpt_chunk_length,
-                    gpt_eod=config.retro_tokenizers.gpt.eod,
-                    gpt_detokenize=config.retro_tokenizers.gpt.detokenize,
-                    bert_tokenize=config.retro_tokenizers.bert.tokenize,
-                    task_validate=config.retro_task_validate,
-                ),
-                dataset_idx,
-                n_datasets,
-                indexed_dataset,
-                block_idx,
-                n_missing_blocks,
-                block,
-                proc_id,
-                n_procs,
-            )
-        )
-    partial_chunk_dbs = []
-    for future in as_completed(futures):
-        partial_chunk_dbs.append(future.result())
-
-    # Concatenate chunks.
-    partial_chunk_dbs.sort(key=lambda item: item[0])  # sort by proc_id
-    chunk_db_valid = [
-        item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[1]
-    ]
-    chunk_db_invalid = [
-        item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[2]
-    ]
-
-    # Convert to numpy.
-    log_retro_rank_0(" > converting chunk db to numpy.")
-    chunk_db_valid = np.array(chunk_db_valid, dtype="uint32")
-    chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32")
-
-    # Document offsets.
-    doc_sizes = [
-        (d, s) for partial_chunk_db in partial_chunk_dbs for d, s in partial_chunk_db[3].items()
-    ]
-    doc_sizes.sort(key=lambda item: item[0])
-    doc_offsets = np.cumsum([item[1] for item in doc_sizes]).astype("uint64")
-    doc_offsets = np.stack(
-        (np.array([item[0] for item in doc_sizes], dtype="uint64"), doc_offsets), axis=1
-    )
-
-    return chunk_db_valid, chunk_db_invalid, doc_offsets
-
-
-def save_block_db(
-    block: dict, chunk_db_valid: np.ndarray, chunk_db_invalid: np.ndarray, doc_offsets: np.ndarray
-) -> None:
-    """Save block of chunked tokens to disk. These blocks are later used for
-    training and adding to the vector index.
-
-    Args:
-        block (dict): Range information such as start/end points for chunking idnexed dataset.
-        chunk_db_valid (np.ndarray): Array of valid chunk indexes.
-        chunk_db_invalid (np.ndarray): Array of invalid chunk indexes.
-        doc_offsets (np.ndarray): Array of document offsets by chunks.
-    """
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    log_retro_rank_0(" > saving individual db.")
-    with h5py.File(block["path"], "w") as f:
-        dset = f.create_dataset("chunks_valid", data=chunk_db_valid)
-        dset = f.create_dataset("chunks_invalid", data=chunk_db_invalid)
-        dset = f.create_dataset("doc_offsets", data=doc_offsets)
-
-
-def build_individual_db(
-    config: RetroPreprocessingConfig, dataset_idx: int, n_datasets: int, dataset_info: dict
-) -> None:
-    """Process a single indexed dataset & extract chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        dataset_idx (int): Dataset index within blended dataset.
-        n_datasets (int): Total number of datasets within blended dataset.
-        dataset_info (dict): Metadata for dataset
-            (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
-    """
-
-    # Make directory.
-    db_dir = get_individual_db_dir(config.retro_project_dir, dataset_info["prefix"])
-    retro_makedir(config, db_dir)
-
-    # Indexed dataset.
-    indexed_dataset = dataset_info["dataset"]
-
-    # Missing DB blocks (split by documents).
-    blocks = get_blocks_by_rank(
-        db_dir,
-        len(indexed_dataset),
-        config.retro_doc_block_size,
-        validate=lambda f: f["chunks_valid"].shape == (0,) or f["chunks_valid"].shape[1] == 4,
-        sample=config.retro_task_validate,
-    )
-    if config.retro_task_validate is None:
-        active_blocks = blocks.missing
-    else:
-        assert blocks.n_missing_world == 0
-        active_blocks = blocks.existing
-
-    # Prevent missing-path-write race condition.
-    torch.distributed.barrier()
-
-    # Nothing to do?
-    if config.retro_task_validate is None and not active_blocks:
-        return
-
-    # Num processes.
-    if blocks.n_missing_world == 1:
-        n_procs = 128
-    elif blocks.n_missing_world <= 2:
-        n_procs = 64
-    elif blocks.n_missing_world <= 4:
-        n_procs = 32
-    elif blocks.n_missing_world <= 8:
-        n_procs = 16
-    else:
-        n_procs = 8
-
-    # Process documents in parallel.
-    with ProcessPoolExecutor(max_workers=n_procs) as executor:
-        for block_idx, block in enumerate(active_blocks):
-            if block is not None:
-                # Build block DB.
-                chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db(
-                    config=config,
-                    dataset_idx=dataset_idx,
-                    n_datasets=n_datasets,
-                    indexed_dataset=indexed_dataset,
-                    n_procs=n_procs,
-                    executor=executor,
-                    n_missing_blocks=len(active_blocks),
-                    block_idx=block_idx,
-                    block=block,
-                )
-
-                if config.retro_task_validate is None:
-                    # Save block DB.
-                    save_block_db(
-                        block=block,
-                        chunk_db_valid=chunk_db_valid,
-                        chunk_db_invalid=chunk_db_invalid,
-                        doc_offsets=doc_offsets,
-                    )
-
-                else:
-                    # Load existing block DB.
-                    with h5py.File(block["path"]) as f:
-                        existing_chunks_valid = np.copy(f["chunks_valid"])
-                        existing_chunks_invalid = np.copy(f["chunks_invalid"])
-                        existing_doc_offsets = np.copy(f["doc_offsets"])
-
-                    # Check equality.
-                    log_retro_rank_0(" > validate.")
-                    assert np.array_equal(existing_chunks_valid, chunk_db_valid)
-                    assert np.array_equal(existing_chunks_invalid, chunk_db_invalid)
-                    assert np.array_equal(existing_doc_offsets, doc_offsets)
-
-            # Wait for all ranks to finish block.
-            log_retro_rank_0(" > waiting for all ranks to finish block.")
-            torch.distributed.barrier()
-
-    log_retro_rank_0(" > finished saving individual db.")
-
-
-def build_individual_dbs(
-    config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict]
-) -> None:
-    """Iterate each indexed dataset & process its chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset.
-    """
-
-    # Build individual DBs.
-    log_retro_rank_0(" > build individual chunk dbs.")
-    for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-        # Progress.
-        log_retro_rank_0(
-            " > building individual db, dataset %d / %d ... '%s'."
-            % (ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
-        )
-
-        # Process single dataset.
-        build_individual_db(config, ds_idx, len(indexed_dataset_infos), ds_info)
-
-
-def update_chunk_counts(
-    config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict]
-) -> None:
-    """Set n_chunks_train & n_chunks sampled for each individual DB.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
-            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
-    """
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    # Data ratio sum (for setting index training chunks).
-    data_ratio_sum = sum([d["ratio"] for d in indexed_dataset_infos])
-
-    # Training split size (split at document level).
-    train_fraction = float(extract_data_config(config).split.split(",")[0]) / 100
-    assert train_fraction > 0 and train_fraction <= 1
-
-    # Set n_chunks (including n_chunks_sampled for unambiguity).
-    log_retro_rank_0(" > compute n_chunks.")
-    for ds_index, ds_info in enumerate(indexed_dataset_infos):
-        db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"])
-
-        # Update counts.
-        ds_info["n_docs"] = len(ds_info["dataset"].document_indices) - 1
-        ds_info["n_docs_train"] = int(train_fraction * ds_info["n_docs"])
-        ds_info["n_chunks"] = 0  # previously, 'n_chunks_valid'
-        ds_info["n_chunks_train"] = 0
-        ds_info["n_chunks_invalid"] = 0
-        for db_path in tqdm(
-            db_paths, "%d/%d, %s" % (ds_index, len(indexed_dataset_infos), ds_info["prefix"])
-        ):
-            with h5py.File(db_path, "r") as f:
-                ds_info["n_chunks"] += len(f["chunks_valid"])
-                ds_info["n_chunks_invalid"] += len(f["chunks_invalid"])
-                ds_info["n_chunks_train"] += (
-                    (np.copy(f["chunks_valid"][:, 0]) < ds_info["n_docs_train"]).sum().item()
-                )
-
-        ds_info["n_chunks_sampled"] = int(
-            config.retro_index_ntrain * ds_info["ratio"] / data_ratio_sum
-        )
-
-        # Verify counts.
-        assert ds_info["n_chunks_train"] <= ds_info["n_chunks"], "n_train (%d) > n_total (%d)." % (
-            ds_info["n_chunks_train"],
-            ds_info["n_chunks"],
-        )
-        assert (
-            ds_info["n_chunks_sampled"] <= ds_info["n_chunks_train"]
-        ), "n_sampled (%d) > n_train (%d)." % (
-            ds_info["n_chunks_sampled"],
-            ds_info["n_chunks_train"],
-        )
-
-
-def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str) -> None:
-    """Merge individual DBs into single DB.
-
-    Args:
-        project_dir (str): Retro project dir.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
-            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
-        db_type (str): DB type (e.g., 'sampled', 'train', or 'valid').
-    """
-
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    log_retro_rank_0(" > build %s chunk db." % db_type)
-
-    # Count chunks.
-    if db_type == "sampled":
-        n_chunks_key = "n_chunks_sampled"
-        n_docs_key = None
-    elif db_type == "train":
-        n_chunks_key = "n_chunks_train"
-        n_docs_key = "n_docs_train"
-    elif db_type == "valid":
-        n_docs_key = None
-    else:
-        raise Exception("handle db_type '%s'." % db_type)
-
-    if db_type == "valid":
-        n_chunks = sum(m["n_chunks"] - m["n_chunks_train"] for m in indexed_dataset_infos)
-    else:
-        n_chunks = sum(m[n_chunks_key] for m in indexed_dataset_infos)
-        n_docs = None if n_docs_key is None else sum(m[n_docs_key] for m in indexed_dataset_infos)
-
-    # DB path.
-    db_path = get_merged_db_path_map(project_dir)[db_type]
-
-    # Delete existing chunk db if incorrect size.
-    if os.path.exists(db_path):
-        try:
-            f = h5py.File(db_path)
-            n_alloc = len(f["chunks"])  # total allocated
-            n_written = f["n_written"][0].item()  # total written
-            f.close()
-
-            if n_chunks != n_alloc or n_chunks != n_written:
-                os.remove(db_path)
-
-        except Exception as e:
-            if isinstance(e, OSError):
-                os.remove(db_path)
-            elif isinstance(e, KeyError):
-                f.close()
-                os.remove(db_path)
-            else:
-                raise e
-
-    # Build merged chunk db.
-    if not os.path.exists(db_path):
-        os.makedirs(os.path.dirname(db_path), exist_ok=True)
-        f = h5py.File(db_path, "w")
-
-        # Initialize output arrays.
-        merged_chunk_db: np.ndarray = f.create_dataset("chunks", (n_chunks, 5), dtype="uint32")
-        merged_doc_offsets: np.ndarray = (
-            None
-            if n_docs_key is None
-            else f.create_dataset("doc_offsets", (n_docs, 3), dtype="uint64")
-        )
-        n_written = f.create_dataset("n_written", (1,), dtype="uint64")
-        n_written[0] = 0
-
-        # Iterate indexed datasets & collect chunks.
-        chunk_start_index = 0
-        doc_start_index = 0
-        doc_start_offset = 0
-        for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-            log_retro_rank_0(
-                " > merging dbs; '%s', dataset %d / %d ... '%s'."
-                % (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
-            )
-            individual_chunk_db: np.ndarray = get_individual_chunk_db(project_dir, ds_idx, ds_info)
-            individual_doc_offsets: np.ndarray = (
-                None
-                if n_docs_key is None
-                else get_individual_doc_offsets(project_dir, ds_idx, ds_info)
-            )
-
-            if db_type == "valid":
-                individual_chunk_db = individual_chunk_db[ds_info["n_chunks_train"] :]
-                if n_docs_key is None:
-                    individual_doc_offsets = None
-                else:
-                    train_doc_offset = individual_doc_offsets[ds_info["n_docs_train"] - 1, 2]
-                    individual_doc_offsets = np.copy(
-                        individual_doc_offsets[ds_info["n_docs_train"] :]
-                    )
-                    individual_doc_offsets[:, 2] -= train_doc_offset
-
-                    log_retro_rank_0("~~~")
-                    log_retro_rank_0(individual_doc_offsets)
-                    log_retro_rank_0(train_doc_offset)
-                    raise Exception("test me.")
-            else:
-                individual_chunk_db = individual_chunk_db[: ds_info[n_chunks_key]]
-                individual_doc_offsets = (
-                    None
-                    if n_docs_key is None
-                    else np.copy(individual_doc_offsets[: ds_info[n_docs_key]])
-                )
-
-            merged_chunk_db[chunk_start_index : chunk_start_index + len(individual_chunk_db)] = (
-                individual_chunk_db
-            )
-            chunk_start_index += len(individual_chunk_db)
-            n_written[0] = chunk_start_index
-            if n_docs_key is not None:
-                individual_doc_offsets[:, 2] += doc_start_offset
-                doc_end_index = doc_start_index + individual_doc_offsets.shape[0]
-                merged_doc_offsets[doc_start_index:doc_end_index] = individual_doc_offsets
-                doc_start_index = doc_end_index
-                doc_start_offset = individual_doc_offsets[-1, 2].item()
-
-        f.close()
-
-
-def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> None:
-    """Merge individual dataset components into single database.
-
-    This method merges databases for DB types:
-    - 'sampled': used for training the vector index.
-    - 'train': used for adding to the trained vector index.
-    - 'valid': can be used for validating/testing the vector index.
-
-    Args:
-        project_dir (str): Retro project dir.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
-            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
-    """
-    merge_dbs(project_dir, indexed_dataset_infos, "sampled")
-    merge_dbs(project_dir, indexed_dataset_infos, "train")
-    merge_dbs(project_dir, indexed_dataset_infos, "valid")
-
-
-def build_db(config: RetroPreprocessingConfig) -> None:
-    """Extract token chunks from each indexed dataset.
-
-    Iterate each document of each indexed dataset, extract that document's chunks,
-        and save to a 'DB' (hdf5 file).
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    project_dir = config.retro_project_dir
-
-    # Indexed dataset info.
-    if config.retro_task_validate is None:
-        indexed_dataset_infos = init_indexed_dataset_infos(config)
-    else:
-        indexed_dataset_infos = get_indexed_dataset_infos(config.retro_project_dir)
-    # Build individual dbs.
-    build_individual_dbs(config, indexed_dataset_infos)
-
-    # If validating, return here.
-    if config.retro_task_validate is not None:
-        return
-
-    # Single-process going forward.
-    if torch.distributed.get_rank() != 0:
-        return
-
-    # Update n_chunks & save indexed dataset infos.
-    if not os.path.exists(get_indexed_dataset_infos_path(project_dir)):
-        update_chunk_counts(config, indexed_dataset_infos)
-        save_indexed_dataset_infos(project_dir, indexed_dataset_infos)
-    indexed_dataset_infos = get_indexed_dataset_infos(project_dir)
-
-    # Builded merged dbs.
-    build_merged_dbs(project_dir, indexed_dataset_infos)
diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py
deleted file mode 100644
index 61b62601d8c..00000000000
--- a/megatron/core/datasets/retro/db/dataset.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""A DBDataset is for iterating the chunks of the chunk database.
-
-This dataset is used for both training a vector index, and adding vectors to a
-trained index.
-"""
-
-from typing import List
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.indexed_dataset import IndexedDataset
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-
-class DBDataset(torch.utils.data.Dataset):
-    """Dataset for iterating chunks.
-
-    Args:
-        db_path (str): Path of HDF5-format chunk database.
-        indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
-        chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets.
-            Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
-        chunk_length (int): Max GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-    """
-
-    def __init__(
-        self,
-        db_path: str,
-        indexed_datasets: List[IndexedDataset],
-        chunks: np.ndarray,
-        chunk_length: int,
-        eod_token_id: int,
-    ):
-        assert chunks.shape[1] == 5, (
-            "expected 5 columns (dataset_idx, "
-            "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); "
-            "found %d columns." % chunks.shape[1]
-        )
-
-        self.db_path = db_path
-        self.indexed_datasets = indexed_datasets
-        self.chunks = chunks
-        self.doc_chunk_map = None
-
-        self.max_chunk_length = chunk_length
-        self.eod_token_id = eod_token_id
-
-    def __len__(self) -> int:
-        """Length of DB dataset.
-
-        Returns:
-            Number of chunks contained in the dataset.
-        """
-        return self.chunks.shape[0]
-
-    def __getitem__(self, chunk_id: int) -> dict:
-        """DB dataset sample.
-
-        Args:
-            chunk_id (int): Index of chunk within dataset.
-
-        Returns:
-            A dict containing:
-            - 'doc_id': Document index within indexed dataset.
-            - 'text': GPT token IDs.
-        """
-
-        # Chunk start/end indexes.
-        indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = [
-            value.item() for value in self.chunks[chunk_id]
-        ]
-        chunk_length = token_end_idx - token_start_idx
-        indexed_dataset = self.indexed_datasets[indexed_dataset_id]
-
-        # Chunk token ids.
-        token_ids = indexed_dataset.get(doc_id, offset=token_start_idx, length=chunk_length)
-
-        # Extend chunks to max_chunk_length by padding with EOD tokens.
-        if chunk_length != self.max_chunk_length:
-            assert chunk_length < self.max_chunk_length, "invalid chunk len."
-            token_ids = token_ids.tolist()
-            token_ids += [self.eod_token_id] * (self.max_chunk_length - chunk_length)
-
-        return {"doc_id": doc_id, "text": np.array(token_ids, dtype=np.int64)}
-
-    def load_doc_tuples(self) -> None:
-        """Load the dataset & document ids.
-
-        Load the dataset id & document id of each chunk in the database, to
-        be used for causality filtering during querying.
-        """
-        if not HAVE_TQDM:
-            raise ImportError("tqdm is required to use the DBDataset. Please install tqdm.")
-
-        self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32")
-        block_size = int(1e6)
-        for start_idx in tqdm(
-            range(0, len(self), block_size),
-            "load doc tuples",
-            miniters=(len(self) // block_size) // 10,
-            disable=torch.distributed.get_rank() != 0,
-        ):
-            end_idx = min(len(self), start_idx + block_size)
-            self.doc_tuples[start_idx:end_idx] = self.chunks[start_idx:end_idx, :2]
diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py
deleted file mode 100644
index 7906f4bf9ec..00000000000
--- a/megatron/core/datasets/retro/db/utils.py
+++ /dev/null
@@ -1,398 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Utilities for building a chunk database."""
-
-import glob
-import json
-import os
-from typing import Dict, List, Optional
-
-import numpy as np
-
-from megatron.core.datasets.indexed_dataset import IndexedDataset
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.models.retro.utils import get_gpt_data_dir
-
-from .dataset import DBDataset
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-
-def get_db_dir(project_dir: str) -> str:
-    """Sub-directory for DB data.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-
-    Returns:
-        Path of the DB sub-directory within the project.
-    """
-    return os.path.join(project_dir, "db")
-
-
-def init_indexed_dataset_infos(config: RetroPreprocessingConfig) -> List[Dict]:
-    """Gather meta-info about each indexed dataset.
-
-    The returned info array allows for easy access to the configuration, and
-    helps remove ambiguity.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        List of processing metadata for each dataset, including:
-        - ratio: Data split weight.
-        - prefix: Relative path to dataset under DB sub-directory.
-    """
-
-    data_dir = get_gpt_data_dir(config.retro_project_dir)
-    data_blend: List[str] = config.retro_gpt_data_path
-    assert len(data_blend) % 2 == 0, "currently, only blended dataset is supported."
-
-    # Dataset infos.
-    infos = []
-    for i in range(0, len(data_blend), 2):
-        ratio = float(data_blend[i])
-        prefix = data_blend[i + 1]
-        path = os.path.join(data_dir, prefix + ".bin")
-        assert os.path.exists(path), "couldn't find '%s'." % path
-        infos.append({"ratio": ratio, "prefix": prefix})
-
-    # Load indexed datasets.
-    load_indexed_datasets(config.retro_project_dir, infos)
-
-    return infos
-
-
-def get_indexed_dataset_infos_path(project_dir: str) -> str:
-    """Path to indexed dataset meta-infos.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-
-    Returns:
-        Path to the `indexed_dataset_infos.json` file.
-    """
-    return os.path.join(get_db_dir(project_dir), "indexed_dataset_infos.json")
-
-
-def save_indexed_dataset_infos(project_dir: str, indexed_dataset_infos: List[Dict]) -> None:
-    """Save dataset order & meta-info.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        indexed_dataset_infos (List[Dict]): List of metadata for each dataset,
-            with each entry containing:
-
-        - ratio: Data split weight.
-        - prefix: Relative path to dataset under DB sub-directory.
-        - n_docs: Number of documents.
-        - n_docs_train: Number of documents used for pretraining.
-        - n_chunks: Number of valid chunks.
-        - n_chunks_train: Number of valid chunks used for pretraining.
-        - n_chunks_invalid: Number of invalid chunks.
-        - n_chunks_sampled: Number of valid chunks used for vector index training.
-    """
-
-    # Remove 'dataset' field.
-    clean_infos = []
-    for info in indexed_dataset_infos:
-        info = dict(info)
-        del info["dataset"]
-        clean_infos.append(info)
-
-    # Save.
-    with open(get_indexed_dataset_infos_path(project_dir), "w") as f:
-        json.dump(clean_infos, f, indent=4)
-
-
-def load_indexed_datasets(project_dir: str, indexed_dataset_infos: List[Dict]) -> None:
-    """Loaded indexed datasets into memory-mapped datasets.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        indexed_dataset_infos (List[Dict]): List of metadata for each dataset
-            (see `save_indexed_dataset_infos()` for more details.
-    """
-    data_dir = get_gpt_data_dir(project_dir)
-    for info in indexed_dataset_infos:
-        info["dataset"] = IndexedDataset(os.path.join(data_dir, info["prefix"]), mmap=True)
-
-
-def get_indexed_dataset_infos(project_dir: str) -> List[Dict]:
-    """Load indexed dataset meta-infos.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-
-    Returns:
-        List of metadata for each dataset (see `save_indexed_dataset_infos()` for more details.
-    """
-
-    # Load json.
-    path = get_indexed_dataset_infos_path(project_dir)
-    with open(path) as f:
-        infos = json.load(f)
-
-    # Load indexed datasets.
-    load_indexed_datasets(project_dir, infos)
-
-    return infos
-
-
-def get_individual_db_dir(project_dir: str, prefix: str) -> str:
-    """Individual DB's directory.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        prefix (str): Unique relative path to dataset within project dir.
-
-    Returns:
-        Path to the given datasets's chunk database.
-    """
-    return os.path.join(get_db_dir(project_dir), "individual", prefix)
-
-
-def get_individual_db_paths(project_dir: str, prefix: str) -> List[str]:
-    """Get paths of all database blocks of an individual dataset.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        prefix (str): Unique relative path to dataset within project dir.
-
-    Returns:
-        Paths to each HDF5 chunk database files that comprises this datasets full chunk database.
-    """
-    return sorted(glob.glob(get_individual_db_dir(project_dir, prefix) + "/*hdf5"))
-
-
-def get_individual_chunk_db(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray:
-    """Load individual dataset's chunk DB.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        ds_id (int): Index of dataset within blended dataset.
-        ds_info (dict): Preprocessing metadata for dataset
-            (see `save_indexed_dataset_infos()` for more detail).
-
-    Returns:
-        Array of chunk start/end indexes for this dataset,
-            where the chunk indexes can be used for indexing into
-            the corresponding indexed dataset.
-    """
-
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    paths = get_individual_db_paths(project_dir, ds_info["prefix"])
-    # *Note*: convert to dataset, rather than copying to memory.
-    db = np.zeros((ds_info["n_chunks"], 5), dtype="uint32")
-    db[:, 0] = ds_id
-    start_idx = 0
-    for path in paths:
-        f = h5py.File(path, "r")
-        n_chunks_current = f["chunks_valid"].shape[0]
-        db[start_idx : (start_idx + n_chunks_current), 1:] = f["chunks_valid"]
-        start_idx += n_chunks_current
-        f.close()
-
-    assert start_idx == ds_info["n_chunks"]
-
-    return db
-
-
-def get_individual_doc_offsets(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray:
-    """Load individual dataset's document offsets.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        ds_id (int): Index of dataset within blended dataset.
-        ds_info (dict): Preprocessing metadata for dataset
-            (see `save_indexed_dataset_infos()` for more detail).
-
-    Returns:
-        Array of document offsets by chunk index for this dataset.
-    """
-
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    paths = get_individual_db_paths(project_dir, ds_info["prefix"])
-    # *Note*: convert to dataset, rather than copying to memory.
-    doc_offsets = np.zeros((ds_info["n_docs"], 3), dtype="uint64")
-    doc_offsets[:, 0] = ds_id
-    start_idx = 0
-    start_offset = 0
-    for path in paths:
-        with h5py.File(path) as f:
-            current_doc_offsets = np.copy(f["doc_offsets"])
-            current_doc_offsets[:, 1] += start_offset
-            current_ndocs = current_doc_offsets.shape[0]
-            doc_offsets[start_idx : (start_idx + current_ndocs), 1:] = current_doc_offsets
-            start_idx += current_ndocs
-            start_offset = current_doc_offsets[-1, 1].item()
-
-    return doc_offsets
-
-
-def get_merged_db_path_map(project_dir: str) -> dict:
-    """Paths to merged datasets.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-
-    Returns:
-        A dict of chunk databases, one for each of:
-        - sampled: Chunks used for training the vector index.
-        - train: Chunks used for pretraining 'train' dataset.
-        - valid: Chunks used for pretraining 'valid' dataset.
-    """
-    base_dir = get_db_dir(project_dir)
-    return {
-        "sampled": os.path.join(base_dir, "merged", "sampled.hdf5"),
-        "train": os.path.join(base_dir, "merged", "train.hdf5"),
-        "valid": os.path.join(base_dir, "merged", "valid.hdf5"),
-    }
-
-
-def get_merged_dataset(
-    project_dir: str,
-    chunk_length: int,
-    eod_token_id: int,
-    db_type: str,
-    indexed_dataset_infos: Optional[List[Dict]] = None,
-) -> DBDataset:
-    """Get merged dataset.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        chunk_length (int): GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-        db_type (str): DB type (e.g., 'sampled', 'train', or 'valid').
-        indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list
-            of dataset metadata (see `save_indexed_dataset_infos()` for more detail).
-            If not provided, the indexed dataset infos will be loaded from disk.
-
-    Returns:
-        A DBDataset, which is a dataset that wraps the HDF5 chunk index array.
-    """
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    if not indexed_dataset_infos:
-        indexed_dataset_infos = get_indexed_dataset_infos(project_dir)
-
-    # Load chunks.
-    db_path = get_merged_db_path_map(project_dir)[db_type]
-    f = h5py.File(db_path, "r")
-    chunks = f["chunks"]
-
-    # DB dataset.
-    indexed_datasets = [info["dataset"] for info in indexed_dataset_infos]
-    dataset = DBDataset(
-        db_path=db_path,
-        indexed_datasets=indexed_datasets,
-        chunks=chunks,
-        chunk_length=chunk_length,
-        eod_token_id=eod_token_id,
-    )
-
-    return dataset
-
-
-def get_merged_sampled_dataset(
-    project_dir: str,
-    chunk_length: int,
-    eod_token_id: int,
-    indexed_dataset_infos: Optional[List[Dict]] = None,
-) -> DBDataset:
-    """Get sampled dataset (for training the vector index).
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        chunk_length (int): GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-        indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list
-            of dataset metadata (see `save_indexed_dataset_infos()` for more detail).
-            If not provided, the indexed dataset infos will be loaded from disk.
-
-    Returns:
-        A DBDataset, which is a dataset that wraps the HDF5 chunk index array.
-    """
-    return get_merged_dataset(
-        project_dir, chunk_length, eod_token_id, "sampled", indexed_dataset_infos
-    )
-
-
-def get_merged_train_dataset(
-    project_dir: str,
-    chunk_length: int,
-    eod_token_id: int,
-    indexed_dataset_infos: Optional[List[Dict]] = None,
-) -> DBDataset:
-    """Get training dataset (for adding to the vector index).
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        chunk_length (int): GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-        indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of
-            dataset metadata (see `save_indexed_dataset_infos()` for more detail).
-            If not provided, the indexed dataset infos will be loaded from disk.
-
-    Returns:
-        A DBDataset, which is a dataset that wraps the HDF5 chunk index array.
-    """
-    return get_merged_dataset(
-        project_dir, chunk_length, eod_token_id, "train", indexed_dataset_infos
-    )
-
-
-def get_merged_valid_dataset(
-    project_dir: str,
-    chunk_length: int,
-    eod_token_id: int,
-    indexed_dataset_infos: Optional[List[Dict]] = None,
-) -> DBDataset:
-    """Get validation dataset (for testing the vector index).
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        chunk_length (int): GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-        indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list
-            of dataset metadata (see `save_indexed_dataset_infos()` for more detail).
-            If not provided, the indexed dataset infos will be loaded from disk.
-
-    Returns:
-        A DBDataset, which is a dataset that wraps the HDF5 chunk index array.
-    """
-    return get_merged_dataset(
-        project_dir, chunk_length, eod_token_id, "valid", indexed_dataset_infos
-    )
-
-
-def get_merged_datasets(project_dir: str, chunk_length: int, eod_token_id: int) -> dict:
-    """Get all merged datasets.
-
-    Args:
-        project_dir (str): Path to Retro project dir.
-        chunk_length (int): GPT chunk length (e.g., 64).
-        eod_token_id (int): EOD token ID.
-
-    Returns:
-        A dict mapping DB type ('sampled', 'train', or 'valid') to the corresponding DBDataset,
-            which is a dataset that wraps the HDF5 chunk index array.
-    """
-    fns = {
-        "sampled": get_merged_sampled_dataset,
-        "train": get_merged_train_dataset,
-        "valid": get_merged_valid_dataset,
-    }
-    datasets = {key: fn(project_dir, chunk_length, eod_token_id) for key, fn in fns.items()}
-    return datasets
diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py
deleted file mode 100644
index 3ac29bda2eb..00000000000
--- a/megatron/core/datasets/retro/external_libs.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Required external libraries for Retro preprocessing."""
-
-import importlib
-
-required_libs = ["faiss", "h5py", "transformers"]  # for huggingface bert
-
-for lib in required_libs:
-    try:
-        globals()[lib] = importlib.import_module(lib)
-    except ImportError as e:
-        pass
diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py
deleted file mode 100644
index d069f55f228..00000000000
--- a/megatron/core/datasets/retro/index/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-Exports:
-
-  - train_index: Train an index on representative vectors.
-  - add_to_index: Add vectors to a trained index.
-  - build_index: Wrapper function that calls above two functions.
-"""
-
-from .build import add_to_index, build_index, train_index
diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py
deleted file mode 100644
index f02b4288f9e..00000000000
--- a/megatron/core/datasets/retro/index/build.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Construct an index.
-
-Constructing an index generally happens in two phases:
-
-  - index.train(): Train an index on a representative set of vectors.
-  - index.add(): Add vectors to an index, to be available for retrieval.
-"""
-
-import os
-import shutil
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.db.utils import (
-    get_merged_sampled_dataset,
-    get_merged_train_dataset,
-)
-from megatron.core.datasets.retro.utils import GPTToTextDataset
-
-from .factory import IndexFactory
-from .utils import (
-    get_training_data_block_dir,
-    get_training_data_block_paths,
-    get_training_data_merged_path,
-    get_training_data_root_dir,
-)
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-##################################################
-# Train index.
-##################################################
-
-
-def get_empty_index_path(config: RetroPreprocessingConfig) -> str:
-    """Path of empty index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the empty (trained, but without added samples) vector index.
-    """
-    index = IndexFactory.get_index(config.retro_index_type)
-    empty_index_path = index.get_empty_index_path(config)
-    return empty_index_path
-
-
-def get_block_nload(block_path: str, load_fraction: float) -> int:
-    """Compute number of blocks to load.
-
-    This is computed by multiplying the total number of available blocks with the
-    fraction of blocks to load.
-
-    Args:
-        block_path (str): Path to HDF5 file containing block of data. File must contain key 'data'.
-        load_fraction (float): Fraction (0 < load_fraction <= 1) of block samples to load.
-
-    Returns:
-        Number of block samples to load.
-    """
-    if not HAVE_H5PY:
-        raise ImportError(
-            "h5py is required to use the merge_embedding_blocks function. Please install h5py."
-        )
-
-    with h5py.File(block_path) as fi:
-        return int(load_fraction * fi["data"].shape[0])
-
-
-def merge_embedding_blocks(config: RetroPreprocessingConfig) -> None:
-    """Merge individual embedding blocks into a single binary mmap file.
-
-    The embeddings are initially stored in block-sized (e.g., ~100k embeddings per
-    block) HDF5 files. These individual block files must be merged into a single
-    file before training, to be based as a numpy mmap array to the index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    if not HAVE_TQDM:
-        raise ImportError(
-            "tqdm is required to use the merge_embedding_blocks function. Please install tqdm."
-        )
-
-    if not HAVE_H5PY:
-        raise ImportError(
-            "h5py is required to use the merge_embedding_blocks function. Please install h5py."
-        )
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    # Get block, merged paths.
-    load_fraction = config.retro_index_train_load_fraction
-    block_paths = get_training_data_block_paths(config)
-    bin_path = get_training_data_merged_path(config)
-
-    # Skip, if already built.
-    if os.path.exists(bin_path):
-        return
-
-    # Merge blocks.
-    with open(bin_path, "wb") as fo:
-        byte_offset = 0
-        for block_idx, block_path in enumerate(
-            tqdm(
-                block_paths,
-                "merge train embeddings",
-                miniters=len(block_paths) // 10,
-                disable=torch.distributed.get_rank() != 0,
-            )
-        ):
-            with h5py.File(block_path) as fi:
-                nload = get_block_nload(block_path, load_fraction)
-                block = np.array(fi["data"][:nload], copy=False)
-
-                fo.write(block.tobytes())
-
-                byte_offset += block.size * block.itemsize
-                fo.seek(byte_offset)
-
-
-def get_text_dataset_for_training(config: RetroPreprocessingConfig) -> GPTToTextDataset:
-    """Convert GPT token chunk dataset to a text dataset for passing to the
-    embedder.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        The text dataset consisting of tokens converted from sampled chunk database.
-    """
-    gpt_dataset = get_merged_sampled_dataset(
-        project_dir=config.retro_project_dir,
-        chunk_length=config.retro_gpt_chunk_length,
-        eod_token_id=config.retro_tokenizers.gpt.eod,
-    )
-    text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt)
-    return text_dataset
-
-
-def embed_training_chunks(config: RetroPreprocessingConfig) -> None:
-    """Embed DB chunks.
-
-    Store chunks in blocks on disk. These blocks will later be merged into
-    a single dataset for training the index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    merged_train_data_path = get_training_data_merged_path(config)
-    if os.path.exists(merged_train_data_path):
-        return
-
-    # Get training text dataset.
-    text_dataset = get_text_dataset_for_training(config)
-
-    # Embed dataset.
-    embedder = config.retro_bert_embedders.disk
-    embedder.embed_text_dataset("index", get_training_data_block_dir(config), text_dataset)
-
-    # Merge embeddings.
-    merge_embedding_blocks(config)
-
-
-def train_on_embeddings(config: RetroPreprocessingConfig) -> None:
-    """Train index on embedded DB chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-    index = IndexFactory.get_index(config.retro_index_type)
-    index.train(config)
-
-
-def remove_embeddings(config: RetroPreprocessingConfig) -> None:
-    """Remove embeddings after training.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() != 0:
-        return
-    empty_index_path = get_empty_index_path(config)
-    assert os.path.isfile(empty_index_path)
-    shutil.rmtree(get_training_data_root_dir(config), ignore_errors=True)
-
-
-def _train_index(config: RetroPreprocessingConfig) -> None:
-    """Train index on DB chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Check if trained index already exists.
-    if not os.path.isfile(get_empty_index_path(config)):
-        # Embed training chunks.
-        embed_training_chunks(config)
-
-        # Train index on embeddings.
-        train_on_embeddings(config)
-
-    # Wait for (single-process) training to complete.
-    torch.distributed.barrier()
-
-    # Remove embeddings.
-    if config.retro_index_delete_training_embeddings:
-        remove_embeddings(config)
-
-
-def train_index(config: RetroPreprocessingConfig) -> None:
-    """Entry point for training the index.
-
-    We select whether to train a new index, or validate an existing index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Train new index.
-    if config.retro_task_validate is None:
-        _train_index(config)
-
-    # Validate existing trained index.
-    else:
-        from .validate import validate_training_embeddings
-
-        validate_training_embeddings(config)
-
-
-##################################################
-# Add to index.
-##################################################
-
-
-def get_text_dataset_for_adding(config: RetroPreprocessingConfig) -> GPTToTextDataset:
-    """Convert GPT token chunk dataset to a text dataset for passing to the
-    embedder.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        The text dataset that consists of tokens converted from the 'train' chunk database.
-        These are the chunks used for retrieval by the pretraining 'train' dataset.
-    """
-    gpt_dataset = get_merged_train_dataset(
-        project_dir=config.retro_project_dir,
-        chunk_length=config.retro_gpt_chunk_length,
-        eod_token_id=config.retro_tokenizers.gpt.eod,
-    )
-    text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt)
-    return text_dataset
-
-
-def _add_to_index(config: RetroPreprocessingConfig) -> str:
-    """Add DB chunks to index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the populated index.
-    """
-
-    # Get index.
-    index = IndexFactory.get_index(config.retro_index_type)
-
-    # Get text dataset.
-    text_dataset = get_text_dataset_for_adding(config)
-
-    # Add to index.
-    output_index_path = index.add(config, text_dataset)
-
-    return output_index_path
-
-
-def add_to_index(config: RetroPreprocessingConfig) -> None:
-    """Entry point for adding to the index.
-
-    We select whether to add to a new index, or validate an existing index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Add to new index.
-    if config.retro_task_validate is None:
-        _add_to_index(config)
-
-    # Validate existing encodings.
-    else:
-        from .validate import validate_added_encodings
-
-        validate_added_encodings(config)
-
-
-##################################################
-# Build index (train + add).
-##################################################
-
-
-def build_index(config: RetroPreprocessingConfig) -> None:
-    """Build index.
-
-    Building index involves sequentially running stages above:
-    - Train index (on sampled training chunks).
-    - Add to index (on all training chunks).
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Train index.
-    train_index(config)
-
-    # Add to index.
-    add_to_index(config)
diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py
deleted file mode 100644
index f88084ddb13..00000000000
--- a/megatron/core/datasets/retro/index/factory.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""The IndexFactory constructs an index from an index type string."""
-
-from megatron.core.datasets.retro.index.index import Index
-
-from .indexes import FaissBaseIndex, FaissParallelAddIndex
-
-
-class IndexFactory:
-    """Get index.
-
-    Index type generally read from argument '--retro-index-ty'.
-    """
-
-    @classmethod
-    def get_index_class(cls, index_type: str) -> type:
-        """Get an index class, given a type string.
-
-        Args:
-            index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add().
-
-        Returns:
-            An `Index` sub-type corresponding to the `index_type`.
-        """
-        return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex}[index_type]
-
-    @classmethod
-    def get_index(cls, index_type: str) -> Index:
-        """Construct an index from an index type string.
-
-        Args:
-            index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add().
-
-        Returns:
-            An `Index` instance corresponding to the `index_type`.
-        """
-        index_class = cls.get_index_class(index_type)
-        index = index_class()
-        return index
diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py
deleted file mode 100644
index 129c239de34..00000000000
--- a/megatron/core/datasets/retro/index/index.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Base class for all vector indexes.
-
-A vector index is a type of retrieval database that is queried using vectors,
-and returns vectors that are 'similar' (e.g., by cosine distance) to the query
-vector. The construction and usage of an index generally has the following
-pattern:
-
-  - Train the index on representative vectors.
-  - Add vectors to the index (i.e., vectors available for retrieval)
-  - Query index with new vector, to retrieve similar vector indexes.
-"""
-
-import abc
-import os
-from typing import Tuple
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig
-from megatron.core.datasets.retro.utils import GPTToTextDataset
-
-from .utils import get_index_dir
-
-try:
-    import faiss
-
-    HAVE_FAISS = True
-except ImportError:
-    HAVE_FAISS = False
-
-
-class Index(abc.ABC):
-    """Abstract base class for indexes.
-
-    *Note* : While currently only Faiss-based classes are implemented, in the
-    future, this class will be extended with other types of indexes that have
-    different performance-accuracy trade-offs.
-
-    The primary methods to override are:
-    - train() : Train index on the sampled training chunks.
-    - add() : Add all training chunks to index.
-    """
-
-    @classmethod
-    def make_object_verbose(cls, index: "faiss.Index", verbose: bool) -> None:
-        """Make index object verbose.
-
-        Args:
-            index (faiss.Index): Faiss object to set verbose.
-            verbose (bool): Sets whether index should log status updates during training and adding.
-        """
-        if not HAVE_FAISS:
-            raise ImportError("faiss is required to use the Index class. Please install faiss.")
-
-        assert isinstance(verbose, bool)
-        faiss.ParameterSpace().set_index_parameter(index, "verbose", verbose)
-
-    def get_empty_index_path(self, config: RetroPreprocessingConfig) -> str:
-        """Get file path to empty index (i.e., trained, but unpopulated).
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-
-        Returns:
-            File path to empty index
-                (i.e., this index has had index.train() called, but not yet index.add()).
-        """
-        return os.path.join(
-            get_index_dir(config), "empty_%.3f.faissindex" % config.retro_index_train_load_fraction
-        )
-
-    def get_empty_index(self, config: RetroPreprocessingConfig) -> "faiss.Index":
-        """Get empty index (i.e., trained, but unpopulated).
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-
-        Returns:
-            Empty Faiss index, loaded from storage.
-        """
-        if not HAVE_FAISS:
-            raise ImportError("faiss is required to use the Index class. Please install faiss.")
-        return faiss.read_index(self.get_empty_index_path(config))
-
-    def get_added_index_path(self, config: RetroPreprocessingConfig) -> str:
-        """Get file path to index that has been populated with vectors.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-
-        Returns:
-            File path to added index
-                (i.e., this index has had both index.train() and index.add() called).
-        """
-        return os.path.join(
-            get_index_dir(config),
-            "added_%.3f_%.3f.faissindex"
-            % (config.retro_index_train_load_fraction, config.retro_index_add_load_fraction),
-        )
-
-    def get_added_index(self, config: RetroPreprocessingConfig) -> "faiss.Index":
-        """Get index that has been populated with vectors.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-
-        Returns:
-            'Added' (i.e., populated) Faiss index, loaded from storage.
-        """
-        if not HAVE_FAISS:
-            raise ImportError("faiss is required to use the Index class. Please install faiss.")
-        return faiss.read_index(self.get_added_index_path(config))
-
-    @abc.abstractmethod
-    def train(self, config: RetroPreprocessingConfig) -> None:
-        """Train index on a representative set of vectors.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-        """
-
-    @abc.abstractmethod
-    def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None:
-        """Add vectors to index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            text_dataset (GPTToTextDataset): Text dataset that will be embedded
-                and added to the index.
-        """
-
-    def embed_text_dataset_block(
-        self, embedder: Embedder, text_dataset: GPTToTextDataset, _range: Tuple[int, int]
-    ) -> np.ndarray:
-        """Embed a range of a text dataset.
-
-        Args:
-            embedder (Embedder): Embedder used for embedding a text dataset.
-            text_dataset (GPTToTextDataset): Text dataset that will be embedded.
-            _range (Tuple[int, int]): Start/end sample indices within
-                text dataset used for embedding.
-
-        Returns:
-            An array of embeddings, with shape (len(text_dataset), dimension(embedder)).
-        """
-        sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range))
-        return embedder.embed_text_dataset(sub_dataset)
diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py
deleted file mode 100644
index c445909fea5..00000000000
--- a/megatron/core/datasets/retro/index/indexes/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-Exports:
-- FaissBaseIndex: Unoptimized Faiss index wrapper
-- FaissParallelAddIndex: Optimized index.add() for Faiss index.
-"""
-
-from .faiss_base import FaissBaseIndex
-from .faiss_par_add import FaissParallelAddIndex
diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py
deleted file mode 100644
index 6db0a420dff..00000000000
--- a/megatron/core/datasets/retro/index/indexes/faiss_base.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-This class implements a simple, un-optimized wrapper around a Faiss index, that
-implements the Index interface (see ..index.py). While this class is
-instantiable, it is meant to be extended with optimizations in classes that
-inherit from this class (see FaissParAddIndex, for an example).
-"""
-
-import os
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.index.index import Index
-from megatron.core.datasets.retro.index.utils import (
-    get_training_data_merged_path,
-    num_samples_to_block_ranges,
-)
-from megatron.core.datasets.retro.utils import GPTToTextDataset, log_retro_rank_0
-
-try:
-    import faiss
-
-    HAVE_FAISS = True
-except ImportError:
-    HAVE_FAISS = False
-
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-
-class FaissBaseIndex(Index):
-    """Base class for Faiss-base indexes.
-
-    This class wraps a Faiss index, and adds additional functionality for training
-    and adding codes. This base class performs a naive sequential code adding,
-    while the optimized FaissParallelAddIndex class performs a parallel
-    index.add().
-    """
-
-    def _train(self, config: RetroPreprocessingConfig) -> None:
-        """Train index (rank 0's method).
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-        """
-
-        if not HAVE_FAISS:
-            raise ImportError(
-                "faiss is required to use the FaissBaseIndex class. Please install faiss."
-            )
-
-        assert torch.distributed.get_rank() == 0
-
-        # Set num threads (torch.distributed reset it to 1).
-        faiss.omp_set_num_threads(64)
-
-        empty_index_path = self.get_empty_index_path(config)
-
-        # Index already exists? -> return.
-        if os.path.isfile(empty_index_path):
-            return
-
-        # Load data.
-        merged_path = get_training_data_merged_path(config)
-        inp = np.memmap(merged_path, dtype="f4", mode="r").reshape((-1, config.hidden_size))
-
-        # Init index.
-        index = faiss.index_factory(config.hidden_size, config.retro_index_str)
-
-        # Move to GPU.
-        log_retro_rank_0("> move faiss index to gpu.")
-        index_ivf = faiss.extract_index_ivf(index)
-        clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d))
-        index_ivf.clustering_index = clustering_index
-        log_retro_rank_0("> finished moving to gpu.")
-        self.make_object_verbose(index, True)
-        self.make_object_verbose(index_ivf, True)
-        self.make_object_verbose(index_ivf.quantizer, True)
-        self.make_object_verbose(index_ivf.clustering_index, True)
-
-        # Train index.
-        index.train(inp)
-
-        # Save index.
-        faiss.write_index(index, empty_index_path)
-
-    def train(self, config: RetroPreprocessingConfig) -> None:
-        """Train index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-        """
-
-        # Single process only.
-        if torch.distributed.get_rank() == 0:
-            self._train(config)
-
-        torch.distributed.barrier()
-
-    def _add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None:
-        """Add to index (rank 0's method).
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            text_dataset (GPTToTextDataset): Text dataset that will be embedded
-                and added to the index.
-        """
-
-        if not HAVE_FAISS:
-            raise ImportError(
-                "faiss is required to use the FaissBaseIndex class. Please install faiss."
-            )
-
-        if not HAVE_TQDM:
-            raise ImportError(
-                "tqdm is required to use the FaissBaseIndex class. Please install tqdm."
-            )
-
-        assert torch.distributed.get_rank() == 0
-
-        dataset_sample_ranges = num_samples_to_block_ranges(len(text_dataset))
-
-        # Set num threads (torch.distributed reset it to 1).
-        faiss.omp_set_num_threads(64)
-
-        # Bert embedder.
-        embedder = config.bert_embedders.mem
-
-        # Empty/added index paths.
-        empty_index_path = self.get_empty_index_path()
-        added_index_path = self.get_added_index_path()
-
-        # Skip adding, if index exists.
-        if os.path.isfile(added_index_path):
-            return
-
-        # Read trained index.
-        index = faiss.read_index(empty_index_path)
-
-        # Iterate data blocks & add.
-        for sample_range in tqdm(dataset_sample_ranges, "faiss_base.add"):
-            # Embed text.
-            embeds = self.embed_text_dataset_block(embedder, text_dataset, sample_range)
-
-            # Add to index.
-            index.add(embeds)
-
-        # Write index.
-        faiss.write_index(index, added_index_path)
-
-    def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> str:
-        """Add to index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            text_dataset (GPTToTextDataset): Text dataset that will be embedded
-                and added to the index.
-
-        Returns:
-            File path to the populated index.
-        """
-
-        # Single process only.
-        if torch.distributed.get_rank() == 0:
-            self._add(config, text_dataset)
-
-        # Wait for rank 0.
-        torch.distributed.barrier()
-
-        # Get output index path, for return.
-        return self.get_added_index_path(config)
diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py
deleted file mode 100644
index ccd79f31d4b..00000000000
--- a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Multi-process & multi-node version of Faiss's index.add().
-
-This class inherits from FaissBaseIndex, and optimizes the 'add()' method by
-making it multi-node and multi-process, with bit-wise equivalence to
-FaissBaseIndex. This allows 'add()' to scale out to very large datasets, since
-the vast majority of the computational effort is embarrassingly parallel.
-"""
-
-import os
-import shutil
-from typing import Tuple
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig
-from megatron.core.datasets.retro.index.utils import get_added_code_paths, get_added_codes_dir
-from megatron.core.datasets.retro.utils import (
-    GPTToTextDataset,
-    get_blocks_by_rank,
-    log_retro_rank_0,
-    retro_makedir,
-)
-
-from .faiss_base import FaissBaseIndex
-
-try:
-    import psutil
-
-    HAVE_PSUTIL = True
-except ImportError:
-    HAVE_PSUTIL = False
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-try:
-    import faiss
-
-    HAVE_FAISS = True
-except ImportError:
-    HAVE_FAISS = False
-
-
-class FaissParallelAddIndex(FaissBaseIndex):
-    """
-    This class parallelizes both 1) encoding vectors, and 2) adding codes to the
-    index. This class is more performant than naive use of Faiss, because most
-    of the computational work is in encoding the vectors, which is an
-    embarassingly parallel operation.
-    """
-
-    def encode_block(
-        self, index: "faiss.Index", embedder: Embedder, text_dataset: GPTToTextDataset, block: dict
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        """Encode sub-dataset block, to be later added to index.
-
-        Encode the data subset, generally in blocks of 1M vectors each. For
-        each block, the empty/trained index is loaded, codes are computed
-        via index.sa_encode(), and the resulting codes are saved to disk.
-
-        Args:
-            index (faiss.Index): Faiss index object.
-            embedder (Embedder): Embedder used to embed text dataset.
-            text_dataset (GPTToTextDataset): Text dataset to be embedded and encoded.
-            block (dict): Range information specifying start/end indices within text dataset.
-
-        Returns:
-            A tuple of (embeddings, encodings) for the given block subset of the text dataset.
-        """
-
-        # Embed block.
-        embeddings = self.embed_text_dataset_block(embedder, text_dataset, block["range"])
-
-        # Encode block.
-        log_retro_rank_0("encode.")
-        codes = index.sa_encode(embeddings)
-
-        # Return embeddings for validation purposes.
-        return embeddings, codes
-
-    def save_block(self, config: RetroPreprocessingConfig, block: dict, codes: np.ndarray) -> None:
-        """Save block of codes to disk.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            block (dict): Range information specifying the start/end indices within
-                the encoded text dataset. Here, the 'path' item is used for writing
-                the encodings to storage.
-            codes (np.ndarray): Block of encodings to be saved to storage.
-        """
-        # Save neighbors.
-        log_retro_rank_0("save codes.")
-        retro_makedir(config, os.path.dirname(block["path"]))
-        with h5py.File(block["path"], "w") as f:
-            f.create_dataset("data", data=codes)
-
-    def encode(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None:
-        """Encode text dataset, to be later added to index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            text_dataset (GPTToTextDataset): Text dataset to be encoded by the index.
-        """
-
-        codes_dir = get_added_codes_dir(config)
-        retro_makedir(config, codes_dir)
-
-        # Index.
-        index = self.get_empty_index(config)
-
-        # Bert embedder.
-        embedder = config.retro_bert_embedders.mem
-
-        # Missing code blocks.
-        def validate(f: h5py.File) -> None:
-            """Validation method for validating loaded encodings.
-
-            Args:
-                f (h5py.File): File that contains encodings.
-            """
-            assert len(f["data"].shape) == 2
-
-        blocks = get_blocks_by_rank(
-            codes_dir, len(text_dataset), config.retro_block_size, validate=validate
-        )
-
-        # Encode each block.
-        for block_index, block in enumerate(blocks.missing):
-            if block is not None:
-                # Progress.
-                log_retro_rank_0(
-                    "encode block %d / %d ... %s."
-                    % (block_index, len(blocks.missing), block["path"])
-                )
-
-                # Encode and save.
-                _, codes = self.encode_block(index, embedder, text_dataset, block)
-                self.save_block(config, block, codes)
-
-            # Synchronize progress across all ranks. (for easier observation)
-            log_retro_rank_0(" > waiting for other ranks to finish block.")
-            torch.distributed.barrier()
-
-    def add_codes(self, config: RetroPreprocessingConfig) -> None:
-        """Read codes from disk, and add them to the index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-        """
-
-        if not HAVE_PSUTIL:
-            raise ImportError(
-                "psutil is required to use the FaissParallelAddIndex class. Please install psutil."
-            )
-
-        if not HAVE_TQDM:
-            raise ImportError(
-                "tqdm is required to use the FaissParallelAddIndex class. Please install tqdm."
-            )
-
-        if not HAVE_FAISS:
-            raise ImportError(
-                "faiss is required to use the FaissParallelAddIndex class. Please install faiss."
-            )
-
-        if not HAVE_H5PY:
-            raise ImportError(
-                "h5py is required to use the FaissParallelAddIndex class. Please install h5py."
-            )
-
-        if torch.distributed.get_rank() != 0:
-            return
-
-        added_index_path = self.get_added_index_path(config)
-        if os.path.exists(added_index_path):
-            return
-
-        # Index.
-        log_retro_rank_0("read empty index.")
-        index = self.get_empty_index(config)
-        index_ivf = faiss.extract_index_ivf(index)
-
-        # Add codes.
-        log_retro_rank_0("add codes.")
-        code_paths = get_added_code_paths(config)
-        pbar = tqdm(code_paths)
-        for code_path in pbar:
-            pbar.set_description(
-                "add codes, mem %.3f gb, %.1f%%"
-                % (psutil.virtual_memory()[3] / 1024**3, psutil.virtual_memory()[2])
-            )
-            with h5py.File(code_path) as f:
-                nload = int(config.retro_index_add_load_fraction * f["data"].shape[0])
-                offset = int(os.path.basename(code_path).split("-")[0])
-                xids = np.arange(offset, offset + nload)
-                codes = np.copy(f["data"][:nload])
-                index_ivf.add_sa_codes(codes, xids)
-
-        # Update index's ntotal.
-        index.ntotal = index_ivf.ntotal
-
-        # Write index.
-        log_retro_rank_0("write added index.")
-        faiss.write_index(index, added_index_path)
-
-    def remove_codes(self, config: RetroPreprocessingConfig) -> None:
-        """Remove added codes after adding to index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-        """
-        if torch.distributed.get_rank() != 0:
-            return
-        assert os.path.isfile(self.get_added_index_path(config))
-
-        if config.retro_index_delete_added_codes:
-            raise Exception("remove?")
-            shutil.rmtree(get_added_codes_dir(config), ignore_errors=True)
-
-    def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None:
-        """Add vectors to index.
-
-        Args:
-            config (RetroPreprocessingConfig): Retro preprocessing config.
-            text_dataset (GPTToTextDataset): Text dataset that will be embedded
-                and added to the index.
-        """
-
-        # Encode chunks.
-        self.encode(config, text_dataset)
-
-        # Add codes to index.
-        self.add_codes(config)
-
-        # Wait for (single-process) adding to complete.
-        torch.distributed.barrier()
-
-        # Remove codes.
-        self.remove_codes(config)
diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py
deleted file mode 100644
index 58229439ae6..00000000000
--- a/megatron/core/datasets/retro/index/utils.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Utilities for building an index."""
-
-import glob
-import os
-from typing import List, Tuple
-
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.utils import retro_makedir
-
-
-def get_index_dir(config: RetroPreprocessingConfig) -> str:
-    """Create sub-directory for this index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to index sub-directory within Retro project.
-    """
-
-    # Directory path.
-    index_dir_path = os.path.join(
-        config.retro_project_dir, "index", config.retro_index_type, config.retro_index_str
-    )
-
-    # Make directory.
-    retro_makedir(config, index_dir_path)
-
-    return index_dir_path
-
-
-def num_samples_to_block_ranges(
-    config: RetroPreprocessingConfig, num_samples: int
-) -> List[Tuple[int, int]]:
-    """Split a range (length num_samples) into sequence of block ranges
-    of size block_size.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        num_samples (int): Split `num_samples` into consecutive block ranges, where each block is size `config.retro_block_size`.
-
-    Returns:
-        A list of tuples where each item is the (start, end) index for a given block.
-    """
-    block_size = config.retro_block_size
-    start_idxs = list(range(0, num_samples, block_size))
-    end_idxs = [min(num_samples, s + block_size) for s in start_idxs]
-    ranges = list(zip(start_idxs, end_idxs))
-    return ranges
-
-
-def get_training_data_root_dir(config: RetroPreprocessingConfig) -> str:
-    """Get root directory for embeddings (blocks and merged data).
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the training data directory, which contains both training embedding blocks and the final merged training embeddings.
-    """
-    return os.path.join(config.retro_project_dir, "index", "train_emb")
-
-
-def get_training_data_block_dir(config: RetroPreprocessingConfig) -> str:
-    """Get directory for of saved embedding blocks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the directory containing the training embedding blocks, which will be later merged into a single embedding array.
-    """
-    return os.path.join(get_training_data_root_dir(config), "blocks")
-
-
-def get_training_data_block_paths(config: RetroPreprocessingConfig) -> List[str]:
-    """Get paths to saved embedding blocks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Paths of all training embedding blocks.
-    """
-    return sorted(glob.glob(get_training_data_block_dir(config) + "/*.hdf5"))
-
-
-def get_training_data_merged_path(config: RetroPreprocessingConfig) -> str:
-    """Get path to merged training embeddings.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the merged training embedding binary file.
-    """
-    return os.path.join(
-        get_training_data_root_dir(config),
-        "train_%.3f.bin" % config.retro_index_train_load_fraction,
-    )
-
-
-def get_added_codes_dir(config: RetroPreprocessingConfig) -> str:
-    """Get directory of saved encodings.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Path to the directory containing the vector encodings for adding to the index.
-    """
-    return os.path.join(get_index_dir(config), "add_codes")
-
-
-def get_added_code_paths(config: RetroPreprocessingConfig) -> List[str]:
-    """Get paths to all saved encodings.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        Paths of all vector encoding blocks, for adding to the index.
-    """
-    return sorted(glob.glob(get_added_codes_dir(config) + "/*.hdf5"))
diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py
deleted file mode 100644
index 5f75147a8af..00000000000
--- a/megatron/core/datasets/retro/index/validate.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Validate an index's data.
-
-This module contains functionality for checking for bitwise equality across code
-changes. The training and adding steps of index construction can be validated
-separately. The following high-level checks are supported:
-
-  - Training: Validate that saved training embeddings are bitwise equal with a
-      sample set of freshly computed embeddings. (*Note*:
-      `--no-retro-index-delete-training-embeddings` must be used.)
-  - Adding: Validate that the saved encodings are bitwise equal with a sample of
-      sample set of freshly computed encodings. (*Note*:
-      `--no-retro-index-delete-added-codes` must be used.)
-"""
-
-import numpy as np
-import torch
-from torch.utils.data import Subset
-
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.utils import get_blocks_by_rank, log_retro_rank_0
-
-from .build import get_text_dataset_for_adding, get_text_dataset_for_training
-from .factory import IndexFactory
-from .utils import get_added_codes_dir, get_training_data_block_dir
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-
-##################################################
-# Validate trained index.
-##################################################
-
-
-def validate_training_embeddings(config: RetroPreprocessingConfig) -> None:
-    """Validate training embeddings.
-
-    Steps:
-    - Randomly sample subset of text dataset blocks.
-    - Embed each block.
-    - Compare against saved embeddings.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    if not HAVE_H5PY:
-        raise ImportError(
-            "h5py is required to use the validate_training_embeddings function. "
-            "Please install h5py."
-        )
-
-    # Training text dataset.
-    text_dataset = get_text_dataset_for_training(config)
-
-    # Sample existing blocks.
-    blocks = get_blocks_by_rank(
-        dirname=get_training_data_block_dir(config),
-        n_samples=len(text_dataset),
-        block_size=config.retro_block_size,
-        validate=None,
-        sample=config.retro_task_validate,
-    )
-
-    assert blocks.n_missing_world == 0
-
-    # Embed & validate blocks.
-    embedder = config.retro_bert_embedders.mem
-    for block_idx, block in enumerate(blocks.existing):
-        # Missing block lists are extended with None to have equal-length
-        # lists. Skip the Nones.
-        if block is not None:
-            # Progress. (*note*: move world progress to here.)
-            log_retro_rank_0(
-                "embed training block %d / %d ... %s."
-                % (block_idx, len(blocks.existing), block["path"])
-            )
-
-            # Load existing block embeddings.
-            with h5py.File(block["path"]) as f:
-                existing_embeddings = np.copy(f["data"])
-
-            # Embed block.
-            sub_dataset = Subset(text_dataset, range(*block["range"]))
-            embeddings = embedder.embed_text_dataset(sub_dataset, "train")
-
-            # Check equality.
-            log_retro_rank_0(" > validate.")
-            assert np.array_equal(existing_embeddings, embeddings)
-
-        # Synchronize progress across all ranks. (for easier observation)
-        log_retro_rank_0(" > waiting for other ranks to finish block.")
-        torch.distributed.barrier()
-
-    log_retro_rank_0(" > finished validating training embeddings.")
-
-
-##################################################
-# Validate filled index.
-##################################################
-
-
-def validate_added_encodings(config: RetroPreprocessingConfig) -> None:
-    """Validate added encodings.
-
-    Steps:
-    - Randomly sample subset of text dataset blocks.
-    - Encode each block.
-    - Compare against saved encodings.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Index.
-    index = IndexFactory.get_index(config.retro_index_type)
-    inner_index = index.get_empty_index(config)
-
-    # Text dataset.
-    text_dataset = get_text_dataset_for_adding(config)
-
-    # Sample existing blocks.
-    def validate(f: h5py.File) -> None:
-        """Validation method for validating encoding blocks.
-
-        Args:
-            f (h5py.File): File with block of encodings.
-        """
-        assert len(f["data"].shape) == 2
-
-    blocks = get_blocks_by_rank(
-        dirname=get_added_codes_dir(config),
-        n_samples=len(text_dataset),
-        block_size=config.retro_block_size,
-        validate=validate,
-        sample=config.retro_task_validate,
-    )
-
-    assert blocks.n_missing_world == 0
-
-    # Encode and validate blocks.
-    embedder = config.retro_bert_embedders.mem
-    for block_idx, block in enumerate(blocks.existing):
-        if block is not None:
-            # Progress.
-            log_retro_rank_0(
-                "encode block %d / %d ... %s." % (block_idx, len(blocks.existing), block["path"])
-            )
-
-            # Load existing codes.
-            with h5py.File(block["path"]) as f:
-                existing_codes = np.copy(f["data"])
-
-            # Encode block.
-            embeddings, codes = index.encode_block(inner_index, embedder, text_dataset, block)
-
-            # Check equality.
-            log_retro_rank_0(" > validate.")
-            assert np.array_equal(existing_codes, codes)
-
-        # Synchronize progress across all ranks. (for easier observation)
-        log_retro_rank_0(" > waiting for other ranks to finish block.")
-        torch.distributed.barrier()
-
-    log_retro_rank_0(" > finished validating added encodings.")
-
-
-##################################################
-# Validate index (trained + filled).
-##################################################
-
-
-def validate_index(config: RetroPreprocessingConfig) -> None:
-    """Validate index.
-
-    Validating index involves sequentially running stages above:
-    - Validate trained index.
-    - Validate filled index.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    # Validate training embeddings.
-    validate_training_embeddings(config)
-
-    # Validate added codes.
-    validate_added_encodings(config)
diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py
deleted file mode 100644
index ac9483373c9..00000000000
--- a/megatron/core/datasets/retro/query/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py
deleted file mode 100644
index 6191a30a31f..00000000000
--- a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-A GPTChunkDataset is a wrapper around a regular GPTDataset, that sequentially
-chunks the sample tokens into `retro_chunk_length` sized smaller samples.
-
-For example, if the GPTDataset has 100 samples and a sequence length of 2048, and
-retro_chunk_length is 64, then the GPTChunkDataset will contain 100*(2048/64) =
-3200 samples, each with length 64.
-"""
-
-import torch
-
-from megatron.core.datasets.gpt_dataset import GPTDataset
-from megatron.core.datasets.retro.utils import get_num_chunks_per_sample
-
-from .utils import get_neighbor_dir
-
-
-class GPTChunkDataset(torch.utils.data.Dataset):
-    """Pretraining chunk dataset wraps a standard GPT dataset.
-
-    This dataset conceptually divides each sample (e.g., length 2048)
-    into chunks (e.g., length 64) and restructures them into a list of
-    chunks (e.g., length num_samples * num_chunks_per_sample).
-
-    Args:
-        sample_dataset (GPTDataset): Original GPT dataset, with `sequence_length` size samples.
-        sample_length (int): Alias for `sequence_length`.
-        chunk_length (int): Retro chunk length (e.g., 64).
-    """
-
-    def __init__(self, sample_dataset: GPTDataset, sample_length: int, chunk_length: int):
-
-        super().__init__()
-
-        self.sample_dataset = sample_dataset
-        self.chunk_length = chunk_length
-        self.n_chunks_per_sample = get_num_chunks_per_sample(sample_length, chunk_length)
-        self.n_samples = len(sample_dataset)
-        self.n_chunks = self.n_samples * self.n_chunks_per_sample
-
-    def __len__(self) -> int:
-        """Get dataset length.
-
-        Returns:
-            Dataset length.
-        """
-        return self.n_chunks
-
-    def __getitem__(self, idx: int) -> dict:
-        """Get sample, including represented document IDs.
-
-        Args:
-            idx (int): Sample index.
-
-        Returns:
-            A sample, which contains both the chunk-length token sample ('text') along with all document_ids ('doc_ids') contained withing the full `sequence_length` sample.
-        """
-
-        # Convert global chunk index to global sample index & local chunk index.
-        sample_idx = idx // self.n_chunks_per_sample
-        chunk_idx = idx % self.n_chunks_per_sample
-
-        # Extract sample data.
-        sample = self.sample_dataset[sample_idx]
-        sample_token_ids = sample["text"]
-        sample_doc_ids = sample["document_ids"]
-
-        # Chunk start/end token idxs.
-        token_start_idx = chunk_idx * self.chunk_length
-        token_end_idx = token_start_idx + self.chunk_length
-        chunk_token_ids = sample_token_ids[token_start_idx:token_end_idx]
-
-        # Sample.
-        return {"doc_ids": sample_doc_ids, "text": chunk_token_ids}
-
-
-def build_gpt_chunk_datasets_from_gpt_datasets(
-    project_dir: str, gpt_datasets: dict, sample_length: int, chunk_length: int
-) -> dict:
-    """Get train, valid, test GPT chunk datasets.
-
-    Args:
-        project_dir (str): Retro project dir.
-        gpt_datasets (dict): Mapping of 'train', 'valid', and 'test' GPT datasets (original, unchunked datasets).
-        sample_length (int): Alias of `sequence_length`.
-        chunk_length (int): Retro chunk length (e.g., 64).
-
-    Returns:
-        A <dict> ?
-    """
-
-    # GPT chunk datasets.
-    chunk_datasets = {
-        key: (
-            {
-                "dataset": GPTChunkDataset(sample_ds, sample_length, chunk_length),
-                "neighbor_dir": get_neighbor_dir(project_dir, key, sample_ds),
-                "num_active_chunks": num_active_samples
-                * get_num_chunks_per_sample(sample_length, chunk_length),
-            }
-            if sample_ds
-            else None
-        )
-        for key, (sample_ds, num_active_samples) in gpt_datasets.items()
-    }
-
-    return chunk_datasets
diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py
deleted file mode 100644
index 52b0b6bac4a..00000000000
--- a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""A MultiSplitGPTDataset can handle multiple intersecting split strings, as well
-as returning all of the document IDs of a sample."""
-
-import logging
-from dataclasses import dataclass
-from typing import Dict, List
-
-import numpy
-
-from megatron.core.datasets.blended_megatron_dataset_config import (
-    convert_split_vector_to_split_matrix,
-    parse_and_normalize_split,
-)
-from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
-from megatron.core.datasets.indexed_dataset import IndexedDataset
-from megatron.core.datasets.utils import Split
-from megatron.core.utils import log_single_rank
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class MultiSplitGPTDatasetConfig(GPTDatasetConfig):
-    """Configuration object for Megatron Core blended and Retro datasets.
-
-    Args:
-        return_document_ids (bool): Whether to return the document ids when querying the dataset.
-            Turn this option on during preprocessing.
-        split_preprocessing (str): The Retro preprocessing split string.
-            It follows the same pattern convention as 'split'.
-            Not to be used with 'blend_per_split'.
-    """
-
-    return_document_ids: bool = None
-
-    split_preprocessing: str = None
-
-    def __post_init__(self) -> None:
-        """Validate config attributes."""
-
-        super().__post_init__()
-        assert self.split is not None, "the Retro data pipeline does not support 'blend_per_split'"
-        assert self.return_document_ids is not None, "this attribute must be user defined"
-        assert self.split_preprocessing is not None, "this attribute must be user defined"
-        split_vector = parse_and_normalize_split(self.split)
-        split_preprocessing_vector = parse_and_normalize_split(self.split_preprocessing)
-        if not numpy.allclose(split_vector, split_preprocessing_vector):
-            self.split_matrix = convert_split_vector_to_split_matrix(
-                split_vector, split_preprocessing_vector
-            )
-            log_single_rank(
-                logger,
-                logging.WARNING,
-                f"split =/= split_preprocessing. Let split_matrix = {self.split_matrix}",
-            )
-
-
-class MultiSplitGPTDataset(GPTDataset):
-    """Retro's customized GPT dataset.
-
-    Args:
-        indexed_dataset (IndexedDataset): The IndexedDataset around which
-            to build the MegatronDataset.
-        dataset_path (str): The real path on disk to the dataset, for bookkeeping.
-        indexed_indices (numpy.ndarray): The set of the documents indices to expose.
-        num_samples (int): The number of samples to draw from the indexed dataset.
-        index_split (Split): The indexed_indices Split.
-        config (MultiSplitGPTDatasetConfig): The Retro-specific container for all
-            config sourced parameters.
-    """
-
-    def __init__(
-        self,
-        indexed_dataset: IndexedDataset,
-        dataset_path: str,
-        indexed_indices: numpy.ndarray,
-        num_samples: int,
-        index_split: Split,
-        config: MultiSplitGPTDatasetConfig,
-    ) -> None:
-        super().__init__(
-            indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config
-        )
-
-    def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
-        """Get dataset sample.
-
-        Args:
-            idx (int): The index into the dataset.
-
-        Returns:
-            Dict[str, numpy.ndarray]: The text ids and (optionally)
-                the document ids wrapped in a dictionary.
-        """
-        text, document_ids = self._query_document_sample_shuffle_indices(idx)
-        if self.config.return_document_ids:
-            return {"text": text, "document_ids": document_ids}
-        else:
-            return {"text": text}
-
-    @staticmethod
-    def _key_config_attributes() -> List[str]:
-        """Add custom attributes for building unique dataset hash.
-
-        The preprocessing split used for preprocessing will constrain
-            the samples available for pretraining.
-
-        Returns:
-            List[str]: The key config attributes.
-        """
-        return super(MultiSplitGPTDataset, MultiSplitGPTDataset)._key_config_attributes() + [
-            "split_preprocessing"
-        ]
diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py
deleted file mode 100644
index 42d93d5aafb..00000000000
--- a/megatron/core/datasets/retro/query/query.py
+++ /dev/null
@@ -1,449 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Entry point for querying an index using a GPTChunkDataset.
-
-Querying involves:
-
-  - Iterate all chunks in the GPTChunkDataset.
-  - Query index for neighbor chunk IDs (i.e., chunks from the chunk database).
-  - Save neighbor chunk IDs to disk, for use in building a RetroDataset sample
-      during pretraining.
-"""
-
-import os
-import time
-import typing
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.db.dataset import DBDataset
-from megatron.core.datasets.retro.db.utils import (
-    get_merged_train_dataset as get_db_merged_train_dataset,
-)
-from megatron.core.datasets.retro.index.factory import IndexFactory
-from megatron.core.datasets.retro.index.index import Index
-from megatron.core.datasets.retro.index.utils import get_index_dir
-from megatron.core.datasets.retro.query.gpt_chunk_dataset import GPTChunkDataset
-from megatron.core.datasets.retro.utils import (
-    GPTToTextDataset,
-    get_blocks_by_rank,
-    log_retro_rank_0,
-    retro_makedir,
-)
-
-try:
-    import psutil
-
-    HAVE_PSUTIL = True
-except ImportError:
-    HAVE_PSUTIL = False
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-try:
-    import faiss
-
-    HAVE_FAISS = True
-except ImportError:
-    HAVE_FAISS = False
-
-
-def get_index(config: RetroPreprocessingConfig, ondisk: bool = False) -> "faiss.Index":
-    """Read index from disk.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        ondisk (bool): If `ondisk = True`, memory map the index.
-            (For debugging purposes only; very non-performant.)
-
-    Returns:
-        A Faiss index, loaded from storage.
-    """
-    if not HAVE_FAISS:
-        raise ImportError(
-            "faiss is required to use the query_neighbors function. " "Please install faiss."
-        )
-
-    # Load index.
-    index_wrapper = IndexFactory.get_index(config.retro_index_type)
-    index_dir = get_index_dir(config)
-    added_index_path = index_wrapper.get_added_index_path(config)
-    if ondisk:
-        index = faiss.read_index(added_index_path, faiss.IO_FLAG_MMAP)
-    else:
-        index = faiss.read_index(added_index_path)
-
-    # Search parameters.
-    faiss.ParameterSpace().set_index_parameter(index, "efSearch", config.retro_query_ef_search)
-    faiss.ParameterSpace().set_index_parameter(index, "nprobe", config.retro_query_nprobe)
-
-    return index
-
-
-def embed_block(
-    config: RetroPreprocessingConfig, gpt_dataset: GPTChunkDataset, block: dict
-) -> np.ndarray:
-    """Embed block of chunks.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        gpt_dataset (GPTChunkDataset): Chunk dataset to be embedded.
-        block (dict): Range information containing start/end indices of subset of chunk dataset.
-
-    Returns:
-        Embeddings array, with shape (len(block["range"]), dimension(embedder)).
-    """
-    text_block_dataset = torch.utils.data.Subset(
-        GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt), range(*block["range"])
-    )
-    return config.retro_bert_embedders.mem.embed_text_dataset(text_block_dataset)
-
-
-def query_embeddings(
-    config: RetroPreprocessingConfig,
-    db_dataset: DBDataset,
-    index: Index,
-    embeddings: np.ndarray,
-    chunk_id_range: range,
-    sample_map: dict,
-    n_chunks_per_sample: int,
-    verbose: bool = True,
-) -> typing.Tuple[np.ndarray, np.ndarray]:
-    """Query neighbors of a block of embeddings.
-
-    Querying includes:
-      - Query index for neighbor chunk IDs.
-      - Filter chunk IDs that have the same document ID as the queried embedding.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        db_dataset (DBDataset): Dataset containing chunk database entries.
-        index (Index): Vector index populated with chunk database indices.
-        embeddings (np.ndarray): Embeddings from GPT chunk dataset.
-        chunk_id_range (range): Chunk ID range from GPT chunk dataset.
-        sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids.
-            Used for document filtering.
-        n_chunks_per_sample (int): Number of chunks per sample
-            (e.g., sequence_length / chunk_length).
-        verbose (bool): Log querying progress.
-
-    Returns:
-        A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs.
-    """
-
-    # Query neighbor ids.
-    if verbose:
-        log_retro_rank_0("search.")
-    t = time.time()
-    assert index.ntotal > 0, "check we don't accidentally have an empty index."
-    _, query_neighbor_ids = index.search(embeddings, config.retro_query_num_neighbors_query)
-    if verbose:
-        log_retro_rank_0("  time : %.3f sec." % (time.time() - t))
-
-    # Filter banned neighbor ids.
-    if verbose:
-        log_retro_rank_0("filter banned neighbor ids.")
-    filtered_neighbor_ids = np.full(
-        shape=(len(query_neighbor_ids), config.retro_query_num_neighbors_save),
-        fill_value=-1,
-        dtype="int64",
-    )
-    min_chunk_id, max_chunk_id = chunk_id_range
-    for chunk_id in range(min_chunk_id, max_chunk_id):
-        sample_id = chunk_id // n_chunks_per_sample
-        sample = sample_map[sample_id]
-        sample_dataset_idx = sample["dataset_idx"].item()
-        sample_doc_ids = sample["doc_ids"].tolist()
-        sample_doc_tuples = [(sample_dataset_idx, d) for d in sample_doc_ids]
-
-        # Get valid neighbors (!= -1).
-        query_row = [i for i in query_neighbor_ids[chunk_id - min_chunk_id] if i >= 0]
-
-        # Filter row.
-        filtered_row = [
-            i
-            for i in query_row
-            if tuple(db_dataset.doc_tuples[i].tolist()) not in sample_doc_tuples
-        ]
-        filtered_row = filtered_row[: config.retro_query_num_neighbors_save]
-        filtered_row += [-1] * (config.retro_query_num_neighbors_save - len(filtered_row))
-        filtered_neighbor_ids[chunk_id - min_chunk_id] = filtered_row
-
-    return query_neighbor_ids, filtered_neighbor_ids
-
-
-def query_embedding_block(
-    config: RetroPreprocessingConfig,
-    db_dataset: DBDataset,
-    index: Index,
-    embeddings: np.ndarray,
-    chunk_id_range: range,
-    sample_map: dict,
-    n_chunks_per_sample: int,
-) -> typing.Tuple[np.ndarray, np.ndarray]:
-    """Query a block of embeddings.
-
-    The block is broken into smaller sub-blocks, for easier tracking of progress.
-    Both the raw neighbor IDs and the filtered neighbor IDs (i.e., chunks with the
-    same document ID are removed) are collected.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        db_dataset (DBDataset): Dataset containing chunk database entries.
-        index (Index): Vector index populated with chunk database indices.
-        embeddings (np.ndarray): Embeddings from GPT chunk dataset.
-        chunk_id_range (range): Chunk ID range from GPT chunk dataset.
-        sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids.
-            Used for document filtering.
-        n_chunks_per_sample (int): Number of chunks per sample
-            (e.g., sequence_length / chunk_length).
-
-    Returns:
-        A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs.
-    """
-
-    if not HAVE_TQDM:
-        raise ImportError(
-            "tqdm is required to use the query_embeddings function. Please install tqdm."
-        )
-
-    query_neighbor_ids = []
-    filtered_neighbor_ids = []
-
-    # Query in sub-blocks.
-    partial_block_size = 1000
-    for partial_start_idx in tqdm(
-        range(0, len(embeddings), partial_block_size),
-        "  search",
-        miniters=(len(embeddings) // partial_block_size) // 10,
-        disable=torch.distributed.get_rank() != 0,
-    ):
-        partial_end_idx = min(len(embeddings), partial_start_idx + partial_block_size)
-        partial_embeddings = embeddings[partial_start_idx:partial_end_idx]
-        partial_chunk_id_range = (
-            chunk_id_range[0] + partial_start_idx,
-            chunk_id_range[0] + partial_end_idx,
-        )
-        partial_query_neighbor_ids, partial_filtered_neighbor_ids = query_embeddings(
-            config,
-            db_dataset,
-            index,
-            partial_embeddings,
-            partial_chunk_id_range,
-            sample_map,
-            n_chunks_per_sample,
-            verbose=False,
-        )
-        query_neighbor_ids.append(partial_query_neighbor_ids)
-        filtered_neighbor_ids.append(partial_filtered_neighbor_ids)
-
-    # Concatenate.
-    query_neighbor_ids = np.concatenate(query_neighbor_ids, axis=0)
-    filtered_neighbor_ids = np.concatenate(filtered_neighbor_ids, axis=0)
-
-    return query_neighbor_ids, filtered_neighbor_ids
-
-
-def query_block_neighbors(
-    config: RetroPreprocessingConfig,
-    db_dataset: DBDataset,
-    query_dataset: GPTChunkDataset,
-    index: Index,
-    block: dict,
-) -> None:
-    """Query neighbors of a dataset block (i.e., range).
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        db_dataset (DBDataset): Dataset containing chunk database entries.
-        query_dataset (GPTChunkDataset): GPT chunk dataset to be queried.
-        index (Index): Vector index populated with chunk database indices.
-        block (dict): Range information containing start/end indices
-            for querying GPT chunk dataset.
-    """
-
-    if not HAVE_H5PY:
-        raise ImportError(
-            "h5py is required to use the query_block_neighbors function. Please install h5py."
-        )
-
-    n_chunks_per_sample = query_dataset.n_chunks_per_sample
-
-    # Sample map.
-    sample_ids = sorted(
-        list(set(chunk_id // n_chunks_per_sample for chunk_id in range(*block["range"])))
-    )
-    sample_map = {}
-    for i in sample_ids:
-        sample = query_dataset.sample_dataset[i]
-        sample_map[i] = {"dataset_idx": sample["dataset_id"], "doc_ids": sample["document_ids"]}
-
-    # Embed block.
-    embeddings = embed_block(config, query_dataset, block)
-
-    # Query embeddings.
-    _, filtered_neighbor_ids = query_embedding_block(
-        config, db_dataset, index, embeddings, block["range"], sample_map, n_chunks_per_sample
-    )
-
-    if config.retro_task_validate is None:
-        # Save neighbors.
-        log_retro_rank_0("save neighbors.")
-        retro_makedir(config, os.path.dirname(block["path"]))
-        f = h5py.File(block["path"], "w")
-        f.create_dataset("neighbors", data=filtered_neighbor_ids)
-        f.close()
-
-    else:
-        # Validate neighbors.
-        with h5py.File(block["path"]) as f:
-            existing_neighbor_ids = np.copy(f["neighbors"])
-            assert np.array_equal(existing_neighbor_ids, filtered_neighbor_ids)
-
-
-def query_dataset_neighbors(
-    config: RetroPreprocessingConfig,
-    db_dataset: DBDataset,
-    query_dataset: GPTChunkDataset,
-    num_active_chunks: int,
-    prefix: str,
-    neighbor_dir: str,
-    index: Index,
-) -> None:
-    """Query neighbors of each chunk within a dataset.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        db_dataset (DBDataset): Dataset containing chunk database entries.
-        query_dataset (GPTChunkDataset): GPT chunk dataset to be queried.
-        num_active_chunks (int): The 'active' chunks are the subset of the GPT chunk dataset
-            that aren't being queried. This argument is used when validating the correctness
-            of a subset of the GPT chunk dataset.
-        prefix (str): Extra string for logging progress.
-        neighbor_dir (str): File path to directory for saving neighbor IDs.
-        index (Index): Vector index populated with chunk database indices.
-    """
-    if not HAVE_H5PY:
-        raise ImportError(
-            "h5py is required to use the query_dataset_neighbors function. Please install h5py."
-        )
-
-    def validate(f: h5py.File) -> None:
-        """Validation method for validating saved neighbor IDs.
-
-        Args:
-            f (h5py.File): File containing save neighbor IDs.
-        """
-        assert (
-            f["neighbors"].shape[1] == config.retro_query_num_neighbors_save
-        ), "neighbors.shape == %s; num_neighbors_target == %d." % (
-            str(f["neighbors"].shape),
-            config.retro_num_neighbors_target,
-        )
-
-    if config.retro_task_validate is None:
-        retro_makedir(config, neighbor_dir)
-        blocks = get_blocks_by_rank(
-            neighbor_dir, num_active_chunks, config.retro_block_size, validate=validate
-        )
-        active_blocks = blocks.missing
-    else:
-        blocks = get_blocks_by_rank(
-            neighbor_dir,
-            num_active_chunks,
-            config.retro_block_size,
-            validate=validate,
-            sample=config.retro_task_validate,
-        )
-        assert blocks.n_missing_world == 0
-        active_blocks = blocks.existing
-
-    if not HAVE_PSUTIL:
-        raise ImportError(
-            "psutil is required to use the query_dataset_neighbors function. Please install psutil."
-        )
-
-    # Query each block.
-    for block_index, block in enumerate(active_blocks):
-        if block is not None:
-            # Progress.
-            log_retro_rank_0(
-                "%squery '%s' block %d / %d ... %s ... mem %.3f gb, %.1f%%."
-                % (
-                    "" if config.retro_task_validate is None else "[validate] ",
-                    prefix,
-                    block_index,
-                    len(active_blocks),
-                    os.path.basename(block["path"]),
-                    psutil.virtual_memory()[3] / 1024**3,
-                    psutil.virtual_memory()[2],
-                )
-            )
-
-            # Query block neighbors.
-            query_block_neighbors(config, db_dataset, query_dataset, index, block)
-
-        # Synchronize progress across all ranks. (for easier observation)
-        log_retro_rank_0(" > waiting for other ranks to finish block.")
-        torch.distributed.barrier()
-
-
-def query_neighbors(config: RetroPreprocessingConfig) -> None:
-    """Query pretraining datasets (train & valid).
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-    """
-
-    if not HAVE_FAISS:
-        raise ImportError(
-            "faiss is required to use the query_neighbors function. Please install faiss."
-        )
-
-    # Num threads.
-    faiss.omp_set_num_threads(64)
-
-    # Load chunk db dataset.
-    log_retro_rank_0("load chunk db dataset.")
-    db_dataset = get_db_merged_train_dataset(
-        project_dir=config.retro_project_dir,
-        chunk_length=config.retro_gpt_chunk_length,
-        eod_token_id=config.retro_tokenizers.gpt.eod,
-    )
-    db_dataset.load_doc_tuples()
-
-    # Load index.
-    log_retro_rank_0(" > get index.")
-    index = get_index(config)
-
-    # Query each (i.e., train, valid, test) dataset.
-    log_retro_rank_0(" > query.")
-    for prefix, info in vars(config.retro_gpt_chunk_datasets).items():
-        if info is None:
-            continue
-        log_retro_rank_0(
-            " > query '%s' dataset ... %d samples." % (prefix, info["num_active_chunks"])
-        )
-        query_dataset_neighbors(
-            config,
-            db_dataset,
-            info["dataset"],
-            info["num_active_chunks"],
-            prefix,
-            info["neighbor_dir"],
-            index,
-        )
diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py
deleted file mode 100644
index 3316f8dbbc9..00000000000
--- a/megatron/core/datasets/retro/query/retro_dataset.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-A RetroDataset wraps both:
-
-  - A GPTDataset (which is nested as GPTChunkDataset -> MultiSplitGPTDataset ->
-      GPTDataset).
-  - Neighbor IDs of chunks in the chunk database, that were saved during
-      preprocessing.
-
-Both the GPT sample data and the neighbor IDs are returned within a sample from
-this dataset.
-"""
-
-import os
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-import torch
-
-from megatron.core.datasets.retro.db.dataset import DBDataset
-from megatron.core.datasets.retro.db.utils import get_merged_train_dataset as get_db_dataset
-from megatron.core.datasets.retro.utils import BlockPathMap, log_retro_rank_0
-from megatron.core.models.retro import RetroConfig
-
-from .gpt_chunk_dataset import GPTChunkDataset, build_gpt_chunk_datasets_from_gpt_datasets
-from .utils import get_query_dir
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-
-class RetroDataset(torch.utils.data.Dataset):
-    """Dataset of retro samples.
-
-    Each sample contains the original GPT sample, along with the token IDs
-    of each neighbor of each chunk within the sequence. Neighbor array has
-    shape (num_chunks_per_sample, num_neighbors, num_retrieved_tokens).
-
-    ** Note: chunk dataset wraps original GPT dataset (see gpt_chunk_dataset.py).
-
-    Args:
-        num_queried_samples (int): Total number of queried samples.
-        num_neighbors (int): Total number of saved neighbors.
-        num_retrieved_chunks (int): Number of retrieved chunks
-            (e.g., 2 for neighbor + continuation).
-        block_size (int): Number of neighbor entries per file.
-        db_dataset (DBDataset): Chunk database used for retrieval.
-        chunk_dataset (GPTChunkDataset): GPT chunk dataset, which is a wrapper
-            around a standard GPT dataset that breaks each sample into chunks.
-        neighbor_path_map (BlockPathMap): Mapping of neighbor ID to file path.
-    """
-
-    def __init__(
-        self,
-        num_queried_samples: int,
-        num_neighbors: int,
-        num_retrieved_chunks: int,
-        block_size: int,
-        db_dataset: DBDataset,
-        chunk_dataset: GPTChunkDataset,
-        neighbor_path_map: BlockPathMap,
-    ):
-        super().__init__()
-
-        self.num_queried_samples = num_queried_samples
-        self.num_neighbors = num_neighbors
-        self.num_retrieved_chunks = num_retrieved_chunks
-        self.block_size = block_size
-        self.db_dataset = db_dataset
-        self.chunk_dataset = chunk_dataset
-        self.neighbor_path_map = neighbor_path_map
-
-    def __len__(self) -> int:
-        """Dataset length.
-
-        Returns:
-            Number of samples in dataset.
-        """
-        return len(self.chunk_dataset.sample_dataset)
-
-    def __getitem__(self, sample_idx: int) -> dict:
-        """Get dataset sample.
-
-        Args:
-            sample_idx (int): Index of sample in dataset.
-
-        Returns:
-            A dict consisting of GPT sample (attribute 'text') and corresponding neighbor chunk IDs
-            ('neighbor_chunks', for indexing chunk database) and neighbor token IDs
-            (corresponding chunk database GPT tokens).
-        """
-        if not HAVE_H5PY:
-            raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-        n_chunks_per_sample = self.chunk_dataset.n_chunks_per_sample
-
-        # Wrap sample idx around number of queried samples.
-        sample_idx = sample_idx % self.num_queried_samples
-
-        # Get standard sample.
-        sample = self.chunk_dataset.sample_dataset[sample_idx]
-
-        # Sample idx to chunk idxs.
-        chunk_idxs = list(
-            range(sample_idx * n_chunks_per_sample, (sample_idx + 1) * n_chunks_per_sample)
-        )
-
-        # Collect retrieved tokens.
-        all_retrieved_chunk_ids = []
-        all_retrieved_token_ids = []
-        for chunk_idx in chunk_idxs:
-            # Neighbor chunk ids.
-            neighbor_path = self.neighbor_path_map[chunk_idx]
-            with h5py.File(neighbor_path, "r") as f:
-                neighbor_chunk_ids = f["neighbors"][
-                    chunk_idx % self.block_size, : self.num_neighbors
-                ].tolist()
-
-            # Retrieved (neighbor + continuation) token ids.
-            retrieved_chunk_ids = []
-            retrieved_token_ids = []
-            for neighbor_chunk_id in neighbor_chunk_ids:
-                current_chunk_ids = [
-                    i % len(self.db_dataset)
-                    for i in range(neighbor_chunk_id, neighbor_chunk_id + self.num_retrieved_chunks)
-                ]
-                current_token_ids = [self.db_dataset[ci]["text"] for ci in current_chunk_ids]
-                retrieved_chunk_ids.append(current_chunk_ids)
-                retrieved_token_ids.append(current_token_ids)
-
-            # Collect retrieved tokens.
-            all_retrieved_chunk_ids.append(retrieved_chunk_ids)
-            all_retrieved_token_ids.append(retrieved_token_ids)
-
-        # Reshape retrieved tokens.
-        all_retrieved_chunk_ids = np.array(all_retrieved_chunk_ids).reshape(
-            (n_chunks_per_sample, self.num_neighbors, -1)
-        )
-        all_retrieved_token_ids = np.array(all_retrieved_token_ids).reshape(
-            (n_chunks_per_sample, self.num_neighbors, -1)
-        )
-
-        # Sample.
-        sample: Dict[str, np.ndarray] = {
-            **sample,
-            "neighbor_chunks": all_retrieved_chunk_ids,
-            "neighbor_tokens": all_retrieved_token_ids,
-        }
-
-        return sample
-
-
-def get_retro_datasets(
-    config: RetroConfig, gpt_datasets: dict, sample_length: int, eod_token_id: int
-) -> Tuple[Optional[RetroDataset], Optional[RetroDataset], Optional[RetroDataset]]:
-    """Get train, valid, test retro datasets.
-
-    Args:
-        config (RetroConfig): Retro preprocessing config.
-        gpt_datasets (dict): Mapping of data split key
-            ('train', 'valid', or 'test') to the original sequence-length
-            GPT dataset (i.e., not the chunk dataset).
-        sample_length (int): Alias to `sequence_length`.
-        eod_token_id (int): GPT EOD token ID.
-
-    Returns:
-        A tuple of 'train', 'valid', and 'test' `RetroDataset`s.
-    """
-
-    # DB dataset.
-    db_dataset = get_db_dataset(
-        project_dir=config.retro_project_dir,
-        chunk_length=config.retro_chunk_length,
-        eod_token_id=eod_token_id,
-    )
-
-    # GPT chunk datasets.
-    chunk_ds_info_map = build_gpt_chunk_datasets_from_gpt_datasets(
-        project_dir=config.retro_project_dir,
-        gpt_datasets=gpt_datasets,
-        sample_length=sample_length,
-        chunk_length=config.retro_chunk_length,
-    )
-
-    # Retro datasets.
-    retro_dataset_map: Dict[str, Optional[RetroDataset]] = {}
-    query_dir = get_query_dir(config.retro_project_dir)
-    for data_key, chunk_ds_info in chunk_ds_info_map.items():
-        # Skip unused datasets.
-        if chunk_ds_info is None:
-            retro_dataset_map[data_key] = None
-            continue
-
-        # For consistency with preprocessing, the neighbor_dir is overwritten
-        # (from its setting in `build_gpt_chunk_datasets_from_gpt_datasets()`
-        # above). This is one piece -- along with setting data_path and
-        # train_samples from config.json -- of ensuring consistency between
-        # preprocessing and pretraining.
-        chunk_dataset = chunk_ds_info["dataset"]
-        chunk_ds_info["neighbor_dir"] = os.path.join(
-            query_dir, config.retro_neighbor_dirs[data_key]
-        )
-        neighbor_dir = chunk_ds_info["neighbor_dir"]
-        neighbor_path_map = BlockPathMap.from_dir(
-            dir=neighbor_dir, block_size=config.retro_block_size
-        )
-
-        # Verify num chunks.
-        n_active_chunks = chunk_ds_info["num_active_chunks"]
-        n_neighbor_chunks = neighbor_path_map.max_idx
-
-        if not os.path.isdir(neighbor_dir):
-            if torch.distributed.get_rank() == 0:
-                raise Exception(
-                    "neighbor directory '%s' not found; please "
-                    "compare --train-samples, --seq-length, --seed, "
-                    "--eval-iters, and --eval-interval, with "
-                    "retro preprocessing args." % neighbor_dir
-                )
-            torch.distributed.barrier()
-            exit()
-
-        if config.retro_verify_neighbor_count and n_active_chunks != n_neighbor_chunks:
-            if torch.distributed.get_rank() == 0:
-                log_retro_rank_0("neighbor_dir : %s" % neighbor_dir)
-                log_retro_rank_0("neighbor_path_map : %s" % neighbor_path_map)
-                raise Exception(
-                    "num sampled chunks (%d) != num neighbor chunks "
-                    "(%d); did you complete querying the entire "
-                    "pretraining dataset?" % (n_active_chunks, n_neighbor_chunks)
-                )
-            torch.distributed.barrier()
-            exit()
-
-        # Retro dataset.
-        retro_dataset_map[data_key] = RetroDataset(
-            num_queried_samples=gpt_datasets[data_key][1],
-            num_neighbors=config.retro_num_neighbors,
-            num_retrieved_chunks=config.retro_num_retrieved_chunks,
-            block_size=config.retro_block_size,
-            db_dataset=db_dataset,
-            chunk_dataset=chunk_dataset,
-            neighbor_path_map=neighbor_path_map,
-        )
-
-    return (retro_dataset_map["train"], retro_dataset_map["valid"], retro_dataset_map["test"])
diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py
deleted file mode 100644
index b4e0c67009a..00000000000
--- a/megatron/core/datasets/retro/query/utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Utilities for querying the pretraining dataset."""
-
-import os
-
-from megatron.core.datasets.megatron_dataset import MegatronDataset
-
-
-def get_query_dir(project_dir: str) -> str:
-    """Get root directory of all saved query data.
-
-    Args:
-        project_dir (str): Retro project dir.
-
-    Returns:
-        Path to query sub-directory in Retro project.
-    """
-    return os.path.join(project_dir, "query")
-
-
-def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str:
-    """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test).
-
-    Args:
-        project_dir (str): Retro project dir.
-        key (str): Dataset split key; 'train', 'valid', or 'test'.
-        dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors.
-
-    Returns:
-        Path to directory containing this dataset's neighbors within Retro project.
-    """
-    return os.path.join(
-        get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}")
-    )
diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py
deleted file mode 100644
index 5d9900697fc..00000000000
--- a/megatron/core/datasets/retro/utils.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Utilities for Retro preprocessing."""
-
-import glob
-import logging
-import os
-from types import SimpleNamespace
-from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict
-
-import numpy as np
-import torch
-from torch.distributed import ProcessGroup
-
-from megatron.core import parallel_state
-from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.query.multi_split_gpt_dataset import (
-    MultiSplitGPTDataset,
-    MultiSplitGPTDatasetConfig,
-)
-from megatron.core.utils import log_single_rank
-
-logger = logging.getLogger(__name__)
-
-try:
-    from tqdm import tqdm
-
-    HAVE_TQDM = True
-except ImportError:
-    HAVE_TQDM = False
-
-try:
-    import h5py
-
-    HAVE_H5PY = True
-except ImportError:
-    HAVE_H5PY = False
-
-
-class Block(TypedDict):
-    """Specific block arg type to mute mypy."""
-
-    range: Tuple[int, int]
-    path: str
-
-
-def log_retro_rank_0(message: str) -> None:
-    """Log on rank 0.
-
-    Args:
-        message (str): Message to log.
-    """
-    log_single_rank(logger, logging.INFO, "[RETRO] " + message)
-
-
-def retro_makedir(config: RetroPreprocessingConfig, path: str) -> None:
-    """Make a directory, conditional on not being in validation mode.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-        path (str): Path to directory.
-    """
-    if config.retro_task_validate is None:
-        os.makedirs(path, exist_ok=True)
-
-
-def extract_data_config(config: RetroPreprocessingConfig) -> MultiSplitGPTDatasetConfig:
-    """Extract data config from dataset.
-
-    Args:
-        config (RetroPreprocessingConfig): Retro preprocessing config.
-
-    Returns:
-        The config object used to build the dataset.
-    """
-    return config.retro_gpt_chunk_datasets.train["dataset"].sample_dataset.config
-
-
-def get_num_chunks_per_sample(sample_length: int, chunk_length: int) -> int:
-    """Compute seq_length // chunk_length.
-
-    Args:
-        sample_length (int): Alias of `sequence_length`.
-        chunk_length (int): Retro chunk length (e.g., 64).
-
-    Returns:
-        Number of chunks per sample (i.e., `sequence_length` / `chunk_length`).
-    """
-    assert sample_length % chunk_length == 0
-    return sample_length // chunk_length
-
-
-class GPTToTextDataset(torch.utils.data.Dataset):
-    """Dataset to convert GPT tokens to text.
-
-    Args:
-        gpt_dataset (MultiSplitGPTDataset): GPT dataset, which outputs GPT token samples.
-        gpt_tokenizer (Any): GPT tokenizer.
-    """
-
-    def __init__(self, gpt_dataset: MultiSplitGPTDataset, gpt_tokenizer: Any):
-        super().__init__()
-
-        self.gpt_dataset = gpt_dataset
-        self.gpt_tokenizer = gpt_tokenizer
-
-    def __len__(self) -> int:
-        """Dataset length.
-
-        Returns:
-            Number of samples in the dataset.
-        """
-        return len(self.gpt_dataset)
-
-    def __getitem__(self, idx: int) -> dict:
-        """Get dataset sample.
-
-        Args:
-            idx (int): Index of sample.
-
-        Returns:
-            A dict containing attribute 'text' of type string.
-        """
-        gpt_token_ids = self.gpt_dataset[idx]["text"].tolist()
-        text = self.gpt_tokenizer.detokenize(gpt_token_ids)
-        return {"text": text}
-
-
-def get_blocks(
-    dirname: str, n_samples: int, block_size: int, validate: Optional[Callable] = None
-) -> SimpleNamespace:
-    """Divide range [0, num_samples) to sequence of block ranges.
-
-    This is a core method within the concept of block processing. The idea
-    is to divide a range (size n_samples) into a sequence of blocks. Each
-    block corresponds to a file within 'dirname' with name
-    '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of
-    these files, and returns two lists, one for existing blocks and one for
-    missing blocks.
-
-    Args:
-        dirname (str): Path to directory containing block files.
-        n_samples (int): Ideal number of samples.
-            The total number of saved block data is <=n_samples.
-        block_size (int): Max number of samples per block file (e.g., 100000).
-        validate (Callable): Method for validating each block file during load.
-
-    Returns:
-        A namespace consisting of 2 lists: existing blocks, and missing blocks.
-        The total number of samples between the existing and missing blocks should
-        equal n_samples above.
-    """
-
-    if not HAVE_TQDM:
-        raise ImportError("tqdm is required to use the RetroDataset. Please install tqdm.")
-
-    if not HAVE_H5PY:
-        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
-
-    assert os.path.isdir(dirname), "missing directory '%s.'" % dirname
-
-    # Block ranges.
-    block_start_idxs = list(range(0, n_samples, block_size))
-    block_end_idxs = [min(n_samples, i + block_size) for i in block_start_idxs]
-    block_ranges = list(zip(block_start_idxs, block_end_idxs))
-
-    # All block files (existing + missing).
-    n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1)
-
-    all_blocks: List[Block] = [
-        {
-            "range": r,
-            "path": os.path.join(
-                dirname, "%s-%s.hdf5" % tuple([str(i).zfill(n_digits) for i in r])
-            ),
-        }
-        for r in block_ranges
-    ]
-    all_block_path_set = set(block["path"] for block in all_blocks)
-
-    # Validate function.
-    validate = (lambda f: None) if validate is None else validate
-
-    # Delete corrupt files.
-    if torch.distributed.get_rank() == 0:
-        existing_block_paths = [
-            block["path"] for block in all_blocks if os.path.exists(block["path"])
-        ]
-        for index, path in enumerate(tqdm(existing_block_paths, "validating block.")):
-            assert path in all_block_path_set, "unexpected filename, '%s'." % path
-
-            try:
-                f = h5py.File(path, "r")
-            except Exception:
-                os.remove(path)
-                continue
-
-            try:
-                validate(f)
-            except Exception:
-                os.remove(path)
-            finally:
-                f.close()
-
-    # Wait for files to be deleted.
-    torch.distributed.barrier()
-
-    # Collect blocks.
-    blocks = SimpleNamespace(
-        existing=[b for b in all_blocks if os.path.exists(b["path"])],
-        missing=[b for b in all_blocks if not os.path.exists(b["path"])],
-    )
-
-    return blocks
-
-
-def get_blocks_by_rank(
-    dirname: str,
-    n_samples: int,
-    block_size: int,
-    validate: Optional[Callable] = None,
-    sample: Optional[float] = None,
-    process_group: Optional[ProcessGroup] = None,
-) -> SimpleNamespace:
-    """Divide existing and missing blocks evenly across all ranks.
-
-    See 'get_blocks()' above for description. The returned lists of existing and
-    missing blocks are split evenly across ranks via interleaving. This way,
-    each rank has a roughly equal number of blocks to process for a
-    downstream operation.
-
-    Args:
-        dirname (str): Path to directory containing block files.
-        n_samples (int): Ideal number of samples. The total number of saved block data
-            is <=n_samples.
-        block_size (int): Max number of samples per block file (e.g., 100000).
-        validate (Callable): Method for validating each block file during load.
-        sample (Optional[float]): If provided, sample a random subset of the blocks.
-            Used for validating preprocessing correctness.
-        process_group (Optional[ProcessGroup]): Process group for distributed operations.
-            If None, uses data parallel group.
-
-    Returns:
-        A namespace consisting of 2 lists: existing blocks, and missing blocks.
-        Each of these two lists is potentially a sub-sample of the total set of
-        existing and missing blocks, depending on whether sampling is used.
-        Additionally, the attributes n_existing_world and n_missing_world are the
-        total number of existing and missing blocks, independent of samples.
-        Therefore, (n_existing_world + n_missing_world) * block_size == n_samples.
-    """
-
-    if process_group is None:
-        process_group = parallel_state.get_data_parallel_group()
-
-    # Get world blocks.
-    blocks = get_blocks(dirname, n_samples, block_size, validate)
-
-    # This rank's existing and missing files.
-    rank_existing_blocks = blocks.existing[
-        process_group.rank() : len(blocks.existing) : process_group.size()
-    ]
-    rank_missing_blocks = blocks.missing[
-        process_group.rank() : len(blocks.missing) : process_group.size()
-    ]
-
-    # Extend rank's existing and missing blocks (with None) such that all ranks
-    # have equal length lists. This allows for easier tracking of global progress.
-    def get_world_max(n: int) -> int:
-        """Get max value across ranks.
-
-        Args:
-            n (int): Value on this rank.
-
-        Returns:
-            Max value across all ranks.
-        """
-        n_tensor = torch.cuda.LongTensor([n])
-        torch.distributed.all_reduce(n_tensor, op=torch.distributed.ReduceOp.MAX)
-        return n_tensor.item()
-
-    max_n_existing = get_world_max(len(rank_existing_blocks))
-    max_n_missing = get_world_max(len(rank_missing_blocks))
-
-    rank_existing_blocks += [None] * (max_n_existing - len(rank_existing_blocks))
-    rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks))
-
-    # Collect blocks.
-    blocks = SimpleNamespace(
-        n_existing_world=len(blocks.existing),
-        n_missing_world=len(blocks.missing),
-        existing=rank_existing_blocks,
-        missing=rank_missing_blocks,
-    )
-
-    if sample is not None:
-        # Sample existing and missing blocks evenly across all ranks. The
-        # returned lists of blocks are randomly sampled (without replacement)
-        # to yield `sample * len(blocks)` number of blocks.
-
-        # Randomly sample blocks.
-        def sample_blocks(_blocks: List[Optional[Dict]]) -> List[Optional[Dict]]:
-            """Sample a random subset of all blocks.
-
-            Args:
-                _blocks (List[Optional[Dict]]): List of all blocks.
-
-            Returns:
-                A random subset of the blocks.
-            """
-            n_blocks_sample = int(np.ceil(sample * len(_blocks)))
-            sampled_blocks: List[Optional[Dict]] = [b for b in _blocks if b is not None]
-
-            np.random.seed(None)
-            np.random.shuffle(sampled_blocks)
-
-            sampled_blocks = sampled_blocks[:n_blocks_sample]
-            sampled_blocks += [None] * (n_blocks_sample - len(sampled_blocks))
-
-            return sampled_blocks
-
-        blocks.existing = sample_blocks(blocks.existing)
-        blocks.missing = sample_blocks(blocks.missing)
-
-    return blocks
-
-
-class BlockPathMap:
-    """Map an index to its containing block path.
-
-    The common use for this class is to have a directory of files containing
-    blocks of processed data, of uniform block size (e.g., 100k samples per
-    file). Each file must follow a naming convention of 'startIdx-endIdx.[ext]',
-    where 'endIdx' minus 'startIdx' must equal the block size, with the possible
-    exception of the final block. Given an input index, this class maps the
-    index to the containing block file.
-
-    Args:
-        block_paths (List[str]): List of paths to saved block files.
-        block_size (int): Max number of samples per block file (e.g., 100000).
-    """
-
-    @classmethod
-    def from_dir(cls, dir: str, block_size: int, ext: str = "hdf5") -> Any:
-        """Get list of block files, and create map.
-
-        Args:
-            dir (str): Path to directory containing saved block files.
-            block_size (int): Max number of samples per block file (e.g., 100000).
-            ext (str): Block file extension (e.g., 'hdf5').
-
-        Returns:
-            A mapping of sample index to block file path.
-        """
-        assert os.path.isdir(dir), f"directory not found, '{dir}'."
-        return cls(sorted(glob.glob(dir + f"/*.{ext}")), block_size)
-
-    def __init__(self, block_paths: List[str], block_size: int):
-        self.max_idx = 0
-        self.block_path_map = {}
-        for block_path in block_paths:
-            name = os.path.splitext(os.path.basename(block_path))[0]
-            start_idx, end_idx = [int(i) for i in name.split("-")]
-            self.block_path_map[start_idx] = block_path
-            self.max_idx = max(self.max_idx, end_idx)
-        self.block_size = block_size
-
-    def __str__(self) -> str:
-        """Stringify the mapping.
-
-        Returns:
-            A string representation of this block path map.
-        """
-        return "%d paths" % len(self.block_path_map)
-
-    def __getitem__(self, idx: int) -> str:
-        """Get block path from index.
-
-        Args:
-            idx (int): Index of sample.
-
-        Returns:
-            The path to the block file containing the sample index.
-        """
-        block_start_idx = self.block_size * (idx // self.block_size)
-        block_path = self.block_path_map[block_start_idx]
-        return block_path
diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py
index edcba299005..07d221e0c80 100644
--- a/megatron/core/datasets/t5_dataset.py
+++ b/megatron/core/datasets/t5_dataset.py
@@ -144,6 +144,7 @@ def config_attention_mask(
             decoder_mask (torch.tensor): A 2-D array of tokens (bs, q_len)
             use_local (bool): Whether the current T5 model uses local (vs TE)
                 transformer implmentation
+            test_te_version (str): The Transformer Engine version to test against. Defaults to None.
 
         Returns:
             Configured encoder_mask, decoder_mask, encoder_decoder_mask
@@ -228,7 +229,8 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
             idx (int): The index into the dataset
 
         Returns:
-            Dict[str, Union[int, numpy.ndarray]]: The
+            Dict[str, Union[int, numpy.ndarray]]: The sample data including encoder input, decoder
+                input/output, and masks.
         """
         idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
         sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
@@ -312,7 +314,7 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
         # For padded sequences, ensure the embedding layer can map the token ID
         encoder_input[encoder_input == self._pad_token_id] = 0
         decoder_input[decoder_input == self._pad_token_id] = 0
-        labels[labels == self._pad_token_id] = 0
+        decoder_output[decoder_output == self._pad_token_id] = 0
 
         return {
             "text_enc": encoder_input,
diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py
index 2f791449057..79f906b237a 100644
--- a/megatron/core/dist_checkpointing/exchange_utils.py
+++ b/megatron/core/dist_checkpointing/exchange_utils.py
@@ -11,7 +11,7 @@
 import numpy as np
 import torch
 
-from ..utils import get_pg_rank, get_pg_size
+from ..utils import get_pg_rank, get_pg_size, log_single_rank
 from .core import CheckpointingException
 from .dict_utils import nested_values
 from .mapping import ShardedStateDict, ShardedTensor, is_main_replica
@@ -62,10 +62,7 @@ class ShardDistribution(NamedTuple):
 
 def _shard_size(sh_ten: ShardedTensor):
     """Returns size in bytes of a given sharded tensor."""
-    if sh_ten.flattened_range is None:
-        numel = np.prod(sh_ten.local_shape)
-    else:
-        numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
+    numel = np.prod(sh_ten.local_shape)
     return numel * torch._utils._element_size(sh_ten.dtype)
 
 
@@ -411,11 +408,11 @@ def exchange_loaded_tensors_gather_object(
     # Error checks
     if len(all_loaded_tensors) != sum(map(len, all_loaded_tensors_list)):
         err_msg = "Duplicate shard ids loaded by different ranks"
-        if torch.distributed.get_rank() == 0:
-            logger.error(
-                f"{err_msg}. Shards ids by rank:"
-                f" {[lt.keys() for lt in all_loaded_tensors_list]}"
-            )
+        log_single_rank(
+            logger,
+            logging.ERROR,
+            f"{err_msg}. Shards ids by rank:" f" {[lt.keys() for lt in all_loaded_tensors_list]}",
+        )
         raise CheckpointingException(err_msg)
 
     return all_loaded_tensors
@@ -442,11 +439,11 @@ def exchange_loaded_objects_gather_object(
     # Error checks
     if len(all_loaded_objects) != sum(map(len, all_loaded_objects_list)):
         err_msg = "Duplicate shard ids loaded by different ranks"
-        if torch.distributed.get_rank() == 0:
-            logger.error(
-                f"{err_msg}. Shards ids by rank:"
-                f" {[lt.keys() for lt in all_loaded_objects_list]}"
-            )
+        log_single_rank(
+            logger,
+            logging.ERROR,
+            f"{err_msg}. Shards ids by rank:" f" {[lt.keys() for lt in all_loaded_objects_list]}",
+        )
         raise CheckpointingException(err_msg)
 
     return all_loaded_objects
diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py
index 45a105666ab..dfe7e7df55b 100644
--- a/megatron/core/dist_checkpointing/mapping.py
+++ b/megatron/core/dist_checkpointing/mapping.py
@@ -12,7 +12,6 @@
 from itertools import chain
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import numpy as np
 import torch
 
 from .core import CheckpointingException
@@ -113,20 +112,6 @@ class with `from_rank_offsets` or `from_rank_offsets_flat` constructors.
                 raise CheckpointingException(
                     f"Data shape should match `local_shape` attribute for {self}"
                 )
-            if has_flattened_range:
-                if self.data.ndim != 1:
-                    raise CheckpointingException(f"Data should be 1D for a flattened {self}")
-                real_data = self.data
-                try:
-                    self.data = None
-                    self.init_data(device="meta")
-                    if self.data.shape != real_data.shape:
-                        raise CheckpointingException(
-                            f"Data shape {real_data.shape} doesnt match"
-                            f" expected {self.data.shape} for {self}"
-                        )
-                finally:
-                    self.data = real_data
 
         if len(self.global_shape) != len(self.global_offset):
             raise CheckpointingException(
@@ -145,18 +130,8 @@ class with `from_rank_offsets` or `from_rank_offsets_flat` constructors.
                         f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
                     )
 
-        if has_flattened_range and self.flattened_range.step is not None:
-            raise CheckpointingException(
-                f"`step` argument in the flattened range of a ShardedTensor is not supported."
-            )
-
         if self.flattened_range is not None:
-            if not _logged_deprecations.get("flattened_range", False):
-                logger.warning(
-                    "ShardedTensor.flattened_range is deprecated."
-                    " Use latest DistributedOptimizer formats."
-                )
-                _logged_deprecations["flattened_range"] = True
+            raise CheckpointingException("ShardedTensor.flattened_range is not supported.")
 
     @property
     def has_regular_grid(self):
@@ -181,45 +156,6 @@ def global_slice(self) -> Tuple[Union[int, slice], ...]:
             )
         )
 
-    def global_coordinates(self) -> Tuple[np.ndarray, ...]:
-        """
-        Returns a tuple of np.ndarrays representing the coordinates of the global tensor
-        that this ShardedTensor corresponds to.
-        """
-        if self.flattened_range is None:
-            raise CheckpointingException(
-                f"`global_coordinates` is undefined for"
-                f" {self.__class__.__name__} without `flattened_range`"
-            )
-
-        local_coords = self.local_coordinates()
-        assert len(local_coords) + self.prepend_axis_num == len(self.global_offset), (
-            len(local_coords),
-            self,
-        )
-        global_coords = tuple(
-            c + off
-            for c, off in zip((0,) * self.prepend_axis_num + local_coords, self.global_offset)
-        )
-        return global_coords
-
-    def local_coordinates(self) -> Tuple[np.ndarray, ...]:
-        """
-        Returns a tuple of np.ndarrays representing the coordinates of the local tensor
-        that this ShardedTensor corresponds to.
-        """
-
-        if self.flattened_range is None:
-            raise CheckpointingException(
-                f"`local_coordinates` is undefined for"
-                f" {self.__class__.__name__} without `flattened_range`"
-            )
-
-        # TODO: np.unravel_index?
-        mask = np.zeros(np.prod(self.local_shape), dtype=bool)
-        mask[self.flattened_range] = True
-        return np.nonzero(mask.reshape(self.local_shape))
-
     def local_chunk_offset_in_global(self) -> Tuple[int, ...]:
         """Offset of a local chunk in a global array of chunks.
 
@@ -308,51 +244,6 @@ def from_rank_offsets(
             **init_kwargs,
         )
 
-    @classmethod
-    def from_rank_offsets_flat(
-        cls,
-        key: str,
-        data: torch.Tensor,
-        non_flat_local_shape: Tuple[int, ...],
-        *args,
-        flattened_range: Optional[slice] = None,
-        **kwargs,
-    ):
-        """Allows to construct a *flattened* ShardedTensor given offset specified in process ranks.
-
-        Args:
-            key (str):
-            data (torch.Tensor): this should be a flattened data tensor
-            non_flat_local_shape (Tuple[int, ...]): expected local shape of a non-flat chunk
-            *args: passed unchanged to the `from_rank_offsets` constructor
-            flattened_range (slice): see ShardedTensor. Defaults to None, but must be set to
-                a non-None slice.
-            **kwargs:
-
-        Returns:
-            ShardedTensor: constructed ShardedTensor instance
-        """
-        if flattened_range is None:
-            raise CheckpointingException(
-                "Cannot instantiate a non-flat ShardedTensor with `from_rank_offsets_flat` method."
-                " Use `from_rank_offsets` instead"
-            )
-        if data.ndim != 1:
-            raise CheckpointingException(
-                f"Flattened ShardedTensor requires 1D data, got shape: {data.shape}"
-            )
-        if flattened_range.stop - flattened_range.start != data.numel():
-            raise CheckpointingException(
-                f"Flattened ShardedTensor data length ({data.numel()}) must meet the "
-                f"slice length: {flattened_range.stop - flattened_range.start}"
-            )
-
-        non_flat_data_meta = torch.empty(*non_flat_local_shape, dtype=data.dtype, device="meta")
-        sh_ten = cls.from_rank_offsets(key, non_flat_data_meta, *args, **kwargs)
-        instance = replace(sh_ten, data=data, flattened_range=flattened_range)
-        instance.validate_metadata_integrity()
-        return instance
-
     def init_data(self, device: Union[str, torch.device], init_fn=torch.empty):
         """
         Initialize the tensor data of this ShardedTensor.
@@ -367,8 +258,6 @@ def init_data(self, device: Union[str, torch.device], init_fn=torch.empty):
         if self.data is not None:
             return
         self.data = init_fn(self.local_shape, dtype=self.dtype, device=device)
-        if self.flattened_range is not None:
-            self.data = self.data.flatten()[self.flattened_range.start : self.flattened_range.stop]
 
     def narrow(self, dim: int, start: int, length: int) -> List["ShardedTensor"]:
         """This is an analogue of torch.narrow for ShardedTensors.
@@ -417,85 +306,17 @@ def _safe_div(x, y):
             _safe_div(self.global_offset[prepended_dim] * length, local_length_along_dim),
         )
 
-        if self.flattened_range is None:
-            new_data = self.data.narrow(dim, start, length)
-            # always a single result tensor
-            return [
-                replace(
-                    self,
-                    data=new_data,
-                    local_shape=new_data.shape,
-                    global_shape=global_shape,
-                    global_offset=global_offset,
-                )
-            ]
-        else:
-            if dim != 0:
-                raise CheckpointingException(
-                    f"Narrowing along the first axis is supported for now only, got dim={dim}"
-                )
-
-            # If dim=0, we will always get 0 or 1 resulting tensor.
-            # If dim>1, in general there can be more result tensors (e.g. max 3 for dim=1)
-
-            # For on original flat ShardedTensor of local shape [3, 4] and
-            # flattened_range=slice(5, 10),
-            # the X signs mark the actual (flat) data in `self.data`
-            # notice 12 (3*4) total "virtual" elements, out of which 5 is actual data.
-            # flat original: [.....XXXXX..]
-
-            # If we narrow to start=1, length=1 in the original local shape dimensions,
-            # the overlapping flat slice would be:
-            # narrow to:     [....XXXX....]
-            # flat overlap:  [.....XXX....]
-
-            # Now `data` is flattened and sliced, so we must compute local_shape manually
-
-            local_shape = _update_tuple(self.local_shape, dim, length)
-            other_dims_volume = np.prod(
-                _update_tuple(local_shape, dim, 1)
-            )  # 4 in the example above
-            volume_before_split = other_dims_volume * start  # 4 in the example above
-            volume_of_split = other_dims_volume * length  # 4 in the example above
-
-            flat_slice_start_shifted = (
-                self.flattened_range.start - volume_before_split
-            )  # 5 - 4 = 1 in the example above
-            flat_slice_stop_shifted = (
-                self.flattened_range.stop - volume_before_split
-            )  # 10 - 4 = 6 in the example above
-
-            # Find an intersection of
-            # (flat_slice_start_shifted, flat_slice_stop_shifted) vs (0, volume_of_split)
-
-            if flat_slice_stop_shifted <= 0 or flat_slice_start_shifted >= volume_of_split:
-                return []  # no intersection
-
-            # new_flattened_range = slice(1, 4) in the example above
-            new_flattened_range = slice(
-                max(flat_slice_start_shifted, 0), min(flat_slice_stop_shifted, volume_of_split)
+        new_data = self.data.narrow(dim, start, length)
+        # always a single result tensor
+        return [
+            replace(
+                self,
+                data=new_data,
+                local_shape=new_data.shape,
+                global_shape=global_shape,
+                global_offset=global_offset,
             )
-            # Apply the intersection to the flattened data tensor.
-            # Compute start and slice appropriate length
-            intersection_slice_start = (
-                new_flattened_range.start - flat_slice_start_shifted
-            )  # 0 in the example above
-            new_data = self.data[
-                intersection_slice_start : intersection_slice_start
-                + new_flattened_range.stop
-                - new_flattened_range.start
-            ]
-
-            return [
-                replace(
-                    self,
-                    data=new_data,
-                    local_shape=local_shape,
-                    global_shape=global_shape,
-                    global_offset=global_offset,
-                    flattened_range=new_flattened_range,
-                )
-            ]
+        ]
 
 
 def is_main_replica(replica_id: ReplicaId):
diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py
index fb08ea52b1b..69227f1ab66 100644
--- a/megatron/core/dist_checkpointing/optimizer.py
+++ b/megatron/core/dist_checkpointing/optimizer.py
@@ -13,7 +13,7 @@
 
 import torch
 
-from megatron.core.utils import to_local_if_dtensor
+from megatron.core.utils import log_single_rank, to_local_if_dtensor
 
 from .dict_utils import nested_values
 from .mapping import (
@@ -70,10 +70,12 @@ def get_param_id_to_sharded_param_map(
             logger.debug(f'{ten} is not tracked by the optimizer')
 
     if not id_to_sharded_param_map:
-        logger.warning(
+        log_single_rank(
+            logger,
+            logging.WARNING,
             "Sharded parameters mapping is empty. It means tensors in model state dict"
             " do not correspond to tensors in optimizer parameters map."
-            " Make sure to call state_dict with `keep_vars=True`."
+            " Make sure to call state_dict with `keep_vars=True`.",
         )
     return id_to_sharded_param_map
 
diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py
index 0469949c67d..94c7a6cf663 100644
--- a/megatron/core/dist_checkpointing/serialization.py
+++ b/megatron/core/dist_checkpointing/serialization.py
@@ -15,6 +15,7 @@
 import torch
 
 from megatron.core.msc_utils import MultiStorageClientFeature
+from megatron.core.utils import log_single_rank
 
 from . import ShardedTensor
 from .core import CheckpointingConfig, save_config
@@ -181,9 +182,12 @@ def load_common_state_dict(checkpoint_dir: Union[str, Path]) -> StateDict:
     """
     if isinstance(checkpoint_dir, Path):
         checkpoint_dir = str(checkpoint_dir)
-        logger.warning(
-            "DEPRECATED: Passing 'checkpoint_dir' as a Path object in load_common_state_dict will "
-            "no longer be supported in a future release. Please pass it as a string instead."
+        log_single_rank(
+            logger,
+            logging.WARNING,
+            "DEPRECATED: Passing 'checkpoint_dir' as a Path object in "
+            "load_common_state_dict will no longer be supported in a future release. "
+            "Please pass it as a string instead.",
         )
     sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy(checkpoint_dir)
     return common_strategy.load_common(checkpoint_dir)
diff --git a/megatron/core/dist_checkpointing/state_dict_utils.py b/megatron/core/dist_checkpointing/state_dict_utils.py
index cfb2379a9d3..59ec18e70ed 100644
--- a/megatron/core/dist_checkpointing/state_dict_utils.py
+++ b/megatron/core/dist_checkpointing/state_dict_utils.py
@@ -13,7 +13,7 @@
     StateDict,
     apply_factories,
 )
-from .utils import extract_nonpersistent, extract_sharded_base
+from .utils import _clean_metadata_for_serialization, extract_nonpersistent, extract_sharded_base
 from .validation import determine_global_metadata, validate_sharding_integrity
 
 
@@ -43,6 +43,11 @@ def save_preprocess(
     sharded_part = filter_out_empty_flatten_tensor(sharded_part)
     if validate_access_integrity:
         preprocessed_common_state_dict = common_state_dict
+        if "content_metadata" in preprocessed_common_state_dict:
+            preprocessed_common_state_dict["content_metadata"] = _clean_metadata_for_serialization(
+                preprocessed_common_state_dict["content_metadata"]
+            )
+
         if preprocess_common_before_consistancy_check:
             preprocessed_common_state_dict = preprocess_common_before_consistancy_check(
                 common_state_dict
diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py
index 4c1aab1b1d7..85941fd1ed6 100644
--- a/megatron/core/dist_checkpointing/strategies/async_utils.py
+++ b/megatron/core/dist_checkpointing/strategies/async_utils.py
@@ -16,6 +16,8 @@
 import torch
 from torch import multiprocessing as mp
 
+from megatron.core.utils import log_single_rank
+
 from ..utils import debug_time
 
 logger = logging.getLogger(__name__)
@@ -167,7 +169,7 @@ def sync_all_async_calls(self, is_alive: int) -> bool:
     @abstractmethod
     def close(self, abort=False):
         """Terminate the async caller at exit of an application or some termination conditions"""
-        logger.info(f"AsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller")
+        logger.debug(f"AsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller")
 
     def __del__(self):
         raise NotImplementedError("This should be implemented")
@@ -265,7 +267,11 @@ def close(self, abort=False):
         if self.process:
             logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process")
             if abort:
-                logger.warning(f"Temporal worker aborted in rank {torch.distributed.get_rank()}")
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    f"Temporal worker aborted in rank {torch.distributed.get_rank()}",
+                )
                 self.process.kill()
             else:
                 self.process.join()
@@ -319,7 +325,7 @@ def schedule_async_call(self, async_req: AsyncRequest) -> None:
         self.start_time = time()
         if self.process is None:
             ctx = mp.get_context('spawn')
-            logger.info(
+            logger.debug(
                 f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Starting Async Caller"
             )
             self.process: mp.Process = ctx.Process(
@@ -333,7 +339,7 @@ def schedule_async_call(self, async_req: AsyncRequest) -> None:
                 ),
             )
             self.process.start()
-            logger.info(
+            logger.debug(
                 f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Started Async Caller"
             )
 
@@ -419,12 +425,16 @@ def close(self, abort=False):
             abort (bool, optional): Default to False. Needs to be manually set to true when
                 the checkpoint async process needs to be aborted.
         """
-        logger.info(
+        logger.debug(
             f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller"
         )
         if self.process:
             if abort:
-                logger.warning(f"Persistent worker aborted in rank {torch.distributed.get_rank()}")
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    f"Persistent worker aborted in rank {torch.distributed.get_rank()}",
+                )
                 self.process.kill()
             else:
                 self.queue.put('DONE')
@@ -466,9 +476,18 @@ def async_loop(
                                        to get aligned with the training rank's logging level
 
         """
+        # Set logger.
         logger = logging.getLogger(__name__)
         logger.setLevel(log_level)
-        logger.info(f"PersistentAsyncCaller: persistent ckpt worker for {rank} has started")
+        logger.debug(f"PersistentAsyncCaller: persistent ckpt worker for {rank} has started")
+
+        # Set CUDA device to appropriate local_rank to ensure allocations / CUDA contexts
+        # in this new process are on the right device, and device 0 on the node does not
+        # take on undue memory burden from other devices on node (default behavior without
+        # this line).
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+
+        # Start busy loop waiting for and executing checkpoint saves.
         while True:
             item = queue.get()
             if isinstance(item, str) and item == 'DONE':
@@ -487,7 +506,7 @@ def async_loop(
                 comp_q.put(item.call_idx)
                 queue.task_done()
 
-        logger.info(f"PersistentAsyncCaller: persistent ckpt worker for {rank}  has terminated")
+        logger.debug(f"PersistentAsyncCaller: persistent ckpt worker for {rank}  has terminated")
 
 
 class _ActiveAsyncRequest(NamedTuple):
diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py
index 4ecc0948b18..53422b362f6 100644
--- a/megatron/core/dist_checkpointing/strategies/base.py
+++ b/megatron/core/dist_checkpointing/strategies/base.py
@@ -30,19 +30,10 @@ def get_default_strategy(action: StrategyAction, backend: str, version: int):
     """Retrieves a default strategy for a given action, backend and version."""
     error_hint: str = ""
     try:
-        if backend == 'zarr':
-            error_hint = ' Please install `zarr` and `tensorstore!=0.1.46` packages'
-            from .tensorstore import register_default_tensorstore_strategies
+        error_hint = ' Please use PyTorch version >=2.1'
+        from .torch import register_default_torch_strategies
 
-            register_default_tensorstore_strategies()
-            from .zarr import register_default_zarr_strategies
-
-            register_default_zarr_strategies()
-        elif backend == 'torch_dist':
-            error_hint = ' Please use PyTorch version >=2.1'
-            from .torch import register_default_torch_strategies
-
-            register_default_torch_strategies()
+        register_default_torch_strategies()
     except ImportError as e:
         raise CheckpointingException(
             f'Cannot import a default strategy for: {(action.value, backend, version)}. '
diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py
deleted file mode 100644
index d343d98d949..00000000000
--- a/megatron/core/dist_checkpointing/strategies/resharding.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Performant resharding of flattened tensors.
-
-Tensors that are first sharded (e.g. across TP) and then flattened cause
-very irregular access patterns during loading. The idea for performant save/load
-is to store tensors with global shape [X, Y, Z] and local shape [x, y, z]
-as tensors with global shape [X // x, Y // y, Z // z, x * y * z] and
-local shape [1, 1, 1, x * y * z]. This allows parallel save of tensors along the
-last (flattened) dimension. During loading, some additional resharding is needed.
-"""
-
-import logging
-import math
-from dataclasses import dataclass
-from itertools import product
-from typing import Any, Dict, Tuple, Union
-
-import numpy as np
-import torch
-from torch.distributed.checkpoint import ChunkStorageMetadata
-from torch.distributed.checkpoint.resharding import _shards_get_overlap_region_wrt_saved_tensor
-
-from megatron.core.dist_checkpointing import ShardedTensor
-from megatron.core.dist_checkpointing.core import CheckpointingException
-from megatron.core.dist_checkpointing.dict_utils import (
-    dict_list_map_inplace,
-    extract_matching_values,
-)
-from megatron.core.dist_checkpointing.mapping import (
-    ShardedStateDict,
-    ShardedTensorFactory,
-    StateDict,
-    apply_factories,
-    apply_factory_merges,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class TensorReformulationMetadata:
-    """Metadata needed to restore the original tensor shape.
-
-    Args:
-        ckpt_orig_global_shape (Tuple[int, ...]): original global shape of the tensor
-            saved in the checkpoint. This is the global shape of the application,
-            further reformulated into `ckpt_reform_global_shape` while saving.
-        ckpt_reform_global_shape (Tuple[int, ...]): reformulated global shape of the tensor
-            saved in the checkpoint. This is the actual saved shape.
-    """
-
-    ckpt_orig_global_shape: Tuple[int, ...]
-    ckpt_reform_global_shape: Tuple[int, ...]
-
-    def __post_init__(self):
-        assert self.ckpt_orig_global_shape
-
-
-def nd_flattened_tensor_reformulated_global_shape(sh_ten: ShardedTensor) -> Tuple[int, ...]:
-    """Reformulated global shape of the flattened N-D ShardedTensor.
-
-    N-D tensor global shape [X, Y, Z] and local shape [x, y, z]
-    is reformulated into global shape [X // x, Y // y, Z // z, x * y * z] and
-    local shape [1, 1, 1, x * y * z], to allow parallel save of tensors along the
-    last (flattened) dimension.
-
-    Args:
-        sh_ten (ShardedTensor): flattened N-D ShardedTensor (N > 1)
-
-    Returns:
-        Tuple[int, ...]: reformulated tensor shape
-    """
-
-    assert is_nd_flattened_tensor(sh_ten), sh_ten
-    return sh_ten.axis_fragmentations + (int(np.prod(sh_ten.local_shape)),)
-
-
-def is_nd_flattened_tensor(sh_ten: Any) -> bool:
-    """Checks if ShardedTensor is flattened and more than 1-dimensional
-
-    Args:
-        sh_ten (Any): any object
-
-    Returns:
-        bool: whether the given object is a flattened ShardedTensor and is N-dimensional (N > 1)
-    """
-    return isinstance(sh_ten, ShardedTensor) and sh_ten.flattened_range is not None
-
-
-# information needed to restore. With current implementation, this is a nested state dict
-# with ShardedTensorFactories which is basically a ShardedStateDict type
-ReformulationRestoreMetadata = ShardedStateDict
-
-
-def apply_nd_flattened_tensors_reformulation(
-    sharded_state_dict: ShardedStateDict,
-    reformulation_metadata: Dict[str, TensorReformulationMetadata],
-) -> Tuple[ShardedStateDict, ReformulationRestoreMetadata]:
-    """Applies N-D reformulation to a given sharded state dict.
-
-    After applying the method and loading the reformulated state dict,
-    the `restore_nd_flattened_tensors_formulation` needs to be applied.
-
-    Current implementation uses ShardedTensorFactories for convenience of
-    restoring the original structure, but it's just an implementation detail.
-    Turns N-D ShardedTensors into factories and immediately applies them,
-    keeping the data needed to restore the original structure.
-
-    Args:
-        sharded_state_dict (ShardedStateDict): sharded state dict potentially
-            with tensors to reformulate.
-        reformulation_metadata (Dict[str, TensorReformulationMetadata]): dict
-            containing all metadata needed for reformulating tensors in `sharded_state_dict`.
-            for each N-D flattened tensor `sh_ten` in `sharded_state_dict` there must be an
-            entry with `sh_ten.key`.
-
-    Returns:
-        tuple:
-            ShardedStateDict - reformulated sharded state dict
-            ReformulationRestoreMetadata - data needed to restore the original formulation
-                with `restore_nd_flattened_tensors_formulation`
-    """
-
-    def maybe_reformulate_nd_flattened_tensor(sh_ten: Any):
-        if not isinstance(sh_ten, ShardedTensor) or not is_nd_flattened_tensor(sh_ten):
-            return sh_ten
-        # N-D flattened ShardedTensor
-        try:
-            sh_ten_reformulation_metadata = reformulation_metadata[sh_ten.key]
-        except KeyError as e:
-            # Handle legacy checkpointing where 1-D flatten tensor metadata was not saved
-            if len(sh_ten.global_shape) == 1:
-                return sh_ten
-            raise CheckpointingException(
-                f"Missing reformulation metadata for tensor {sh_ten}. "
-                f"Existing keys: {reformulation_metadata.keys()}"
-            ) from e
-
-        ckpt_actual_saved_shape = sh_ten_reformulation_metadata.ckpt_reform_global_shape
-        app_actual_load_shape = nd_flattened_tensor_reformulated_global_shape(sh_ten)
-        if ckpt_actual_saved_shape == app_actual_load_shape:
-            # Same shape - no need to reshard
-            return sh_ten
-
-        return reformulate_single_nd_flattened_tensor(sh_ten, sh_ten_reformulation_metadata)
-
-    # Turn N-D tensors into factories and immediately apply them
-    dict_list_map_inplace(maybe_reformulate_nd_flattened_tensor, sharded_state_dict)
-    sh_ten_factories, _ = extract_matching_values(
-        sharded_state_dict,
-        lambda x: isinstance(x, ShardedTensorFactory),
-        return_lists_as_dicts=True,
-    )
-    apply_factories(sharded_state_dict)
-
-    # Unlink `data` pointers to free memory
-    def unlink_data(x):
-        x.data = None
-        return x
-
-    dict_list_map_inplace(unlink_data, sh_ten_factories)
-    return sharded_state_dict, sh_ten_factories
-
-
-def restore_nd_flattened_tensors_formulation(
-    state_dict: StateDict, formulation_restore_metadata: ReformulationRestoreMetadata
-) -> StateDict:
-    """Restores the original state dict from a reformulated form.
-
-    Inverse of `apply_nd_flattened_tensors_reformulation`.
-
-    Args:
-        state_dict (StateDict): state dict obtained by loading a reformulated
-            sharded state dict.
-        formulation_restore_metadata (ReformulationRestoreMetadata): metadata returned by
-            `apply_nd_flattened_tensors_reformulation` function
-
-    Returns:
-        StateDict: state dict with the original tensors formulation restored
-    """
-    return apply_factory_merges(state_dict, formulation_restore_metadata)
-
-
-def reformulate_single_nd_flattened_tensor(
-    sh_ten: ShardedTensor, reformulation_metadata: TensorReformulationMetadata
-) -> Union[Any, ShardedTensorFactory]:
-    """Reformulates shapes of a single N-D flattened ShardedTensor.
-
-    We need to define a pair of transformations:
-    - turn N-D ShardedTensor with original formulation into multiple reformulated ShardedTensors
-    - merge multiple reformulated loaded torch.Tensors into a single original tensor
-    Current implementation uses ShardedTensorFactories as a convenient mechanism
-    for specifying and keeping track of those transformations.
-
-    Args:
-        sh_ten (ShardedTensor): sharded tensor to reformulate.
-        reformulation_metadata (TensorReformulationMetadata): metadata needed to
-            perform the reformulation
-
-    Returns:
-        ShardedTensorFactory: factory that keeps information how to reformulate
-            (build) the ShardedTensor and then restore original formulation (merge)
-            after loading.
-    """
-    rmd = reformulation_metadata
-    # Data won't be needed - remove unnecessary tensor references
-    sh_ten = sh_ten.without_data()
-
-    # Based on reformulation_metadata, determine other tensor shapes and metadata
-    ckpt_axis_fragmentation = rmd.ckpt_reform_global_shape[:-1]
-    for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation):
-        assert sh % fragm == 0, (sh_ten, rmd.ckpt_reform_global_shape)
-    ckpt_local_shape_with_prepended_axis = tuple(
-        sh // fragm for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation)
-    )
-    assert (
-        ckpt_local_shape_with_prepended_axis[: sh_ten.prepend_axis_num]
-        == (1,) * sh_ten.prepend_axis_num
-    ), (ckpt_local_shape_with_prepended_axis, sh_ten)
-    ckpt_local_shape = ckpt_local_shape_with_prepended_axis[sh_ten.prepend_axis_num :]
-
-    # Iterate over reformulated shapes needed by the application and from checkpoint,
-    # and generate new ShardedTensors that match the checkpoint sharding.
-    overlap_dim_offsets = []
-    assert len(ckpt_axis_fragmentation) == len(sh_ten.axis_fragmentations), (
-        ckpt_axis_fragmentation,
-        sh_ten,
-    )
-    for dim, (app_chunk_dim_offset, ckpt_fragm, app_fragm) in enumerate(
-        zip(
-            sh_ten.local_chunk_offset_in_global(),
-            ckpt_axis_fragmentation,
-            sh_ten.axis_fragmentations,
-        )
-    ):
-        # without `int`, it's an exact offset of the app shard expressed in ckpt_local_shape units
-        first_overlap_dim_offset = int(ckpt_fragm / app_fragm * app_chunk_dim_offset)
-        # `math.ceil` argument is an exact offset of the app next shard expressed
-        # in ckpt_local_shape units
-        next_overlap_dim_offset = math.ceil(ckpt_fragm / app_fragm * (app_chunk_dim_offset + 1))
-        overlap_dim_offsets.append(range(first_overlap_dim_offset, next_overlap_dim_offset))
-
-    logger.debug(
-        f"Generated the following number of overlap shards for each dimension: "
-        f"{list(map(len, overlap_dim_offsets))} for fragmentation ckpt "
-        f"{ckpt_axis_fragmentation} vs app {sh_ten.axis_fragmentations} "
-        f"and chunk offset {sh_ten.local_chunk_offset_in_global()}"
-    )
-    reformulated_sh_tens = {}
-    for chunk_offset in product(*overlap_dim_offsets):
-        global_offset = tuple(
-            chunk_off * chunk_shape
-            for chunk_off, chunk_shape in zip(chunk_offset, ckpt_local_shape_with_prepended_axis)
-        )
-        reformulated_sh_tens[(global_offset, ckpt_local_shape)] = ShardedTensor(
-            sh_ten.key,
-            None,
-            sh_ten.dtype,
-            ckpt_local_shape,
-            rmd.ckpt_orig_global_shape,
-            global_offset,
-            ckpt_axis_fragmentation,
-            sh_ten.replica_id,
-            sh_ten.prepend_axis_num,
-            sh_ten.allow_shape_mismatch,
-            flattened_range=slice(0, rmd.ckpt_reform_global_shape[-1]),  # whole ckpt shard
-        )
-
-    # Now, we have to define the transformations from application sharding
-    # to checkpoint sharding.
-
-    @torch.no_grad()
-    def sh_ten_build_fn(*args, **kwargs):
-        # Here we simply return the precomputed tensors.
-        return reformulated_sh_tens
-
-    @torch.no_grad()
-    def sh_ten_merge_fn(sub_state_dict):
-        # This is the non-flattened local tensor with original formulation
-        # that we are going to fill with shards loaded from the checkpoint.
-        app_non_flat_ten = torch.empty(
-            sh_ten.local_shape,
-            dtype=sh_ten.dtype,
-            device=sh_ten.data.device if sh_ten.data is not None else None,
-        )
-
-        assert len(sub_state_dict) > 0
-        for (ckpt_global_offset, ckpt_local_shape), ckpt_ten in sub_state_dict.items():
-            # For each ckpt shard, we fill the appropriate application shard part
-            dest_ten = app_non_flat_ten
-            src_ten = ckpt_ten.view(ckpt_local_shape)
-            # We don't need narrowing over `prepend_axis_num` axes so we take
-            # the [sh_ten.prepend_axis_num:] offsets slice
-            for (
-                dim,
-                offset_for_saved_tensor,
-                offset_for_current_tensor,
-                length,
-            ) in _shards_get_overlap_region_wrt_saved_tensor(
-                saved_shard=ChunkStorageMetadata(
-                    ckpt_global_offset[sh_ten.prepend_axis_num :], ckpt_local_shape
-                ),
-                current_shard=ChunkStorageMetadata(
-                    sh_ten.global_offset[sh_ten.prepend_axis_num :], sh_ten.local_shape
-                ),
-            ):
-                src_ten = src_ten.narrow(dim, offset_for_saved_tensor, length)
-                dest_ten = dest_ten.narrow(dim, offset_for_current_tensor, length)
-            dest_ten.copy_(src_ten)
-        return app_non_flat_ten.flatten()[sh_ten.flattened_range]
-
-    return ShardedTensorFactory(
-        sh_ten.key,
-        sh_ten.data,
-        sh_ten_build_fn,
-        sh_ten_merge_fn,
-        sh_ten.replica_id,
-        sh_ten.flattened_range,
-    )
diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py
deleted file mode 100644
index 6472c9d58f9..00000000000
--- a/megatron/core/dist_checkpointing/strategies/tensorstore.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Strategies using TensorStore to load and save Zarr arrays."""
-
-from functools import partial
-from itertools import starmap
-from logging import getLogger
-from pathlib import Path
-from typing import Union
-
-import torch
-
-from ..core import CheckpointingException
-from ..dict_utils import dict_list_map_inplace
-from ..mapping import ShardedStateDict, ShardedTensor
-from .base import LoadShardedStrategy, StrategyAction, register_default_strategy
-from .zarr import load_zarr_based_sharded_metadata, postprocess_numpy_array
-
-try:
-    import tensorstore as ts
-
-    HAVE_TENSORSTORE = True
-except ImportError:
-    from unittest.mock import MagicMock
-
-    ts = MagicMock()
-    HAVE_TENSORSTORE = False
-
-
-logger = getLogger(__name__)
-
-
-def register_default_tensorstore_strategies():
-    """Register default strategies leveraging tensorstore."""
-    register_default_strategy(
-        StrategyAction.LOAD_SHARDED, "zarr", 1, TensorStoreLoadShardedStrategy()
-    )
-
-
-class TensorStoreLoadShardedStrategy(LoadShardedStrategy):
-    """Load strategy for Zarr backend using `tensorstore` for loading."""
-
-    def __init__(self, load_directly_on_device: bool = False):
-        super().__init__()
-        self.load_directly_on_device = load_directly_on_device
-
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
-        if isinstance(checkpoint_dir, str):
-            checkpoint_dir = Path(checkpoint_dir)
-
-        if torch.distributed.get_rank() == 0:
-            print(f"Loading distributed checkpoint with {self.__class__.__name__}")
-            if self.load_directly_on_device:
-                print(f"Loading distributed checkpoint directly on the GPU")
-        load_fn = partial(
-            _load_from_array,
-            checkpoint_dir=checkpoint_dir,
-            load_directly_on_device=self.load_directly_on_device,
-        )
-        dict_list_map_inplace(load_fn, sharded_state_dict)
-        return sharded_state_dict
-
-    def load_tensors_metadata(self, checkpoint_dir: Union[str, Path]):
-        if isinstance(checkpoint_dir, str):
-            checkpoint_dir = Path(checkpoint_dir)
-
-        def get_ts_shape_dtype(path):
-            arr = open_ts_array(path)
-            return arr.shape, arr.dtype.numpy_dtype
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype)
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-
-def merge_global_slice_with_shape(global_slice, actual_shape, key):
-    """Intersects the global slice with the actual shape (prevent overflow)."""
-
-    def _merge_slice(dim_slice, dim_size):
-        if isinstance(dim_slice, slice):
-            assert (
-                dim_slice.start < dim_size
-            ), f"Got empty slice for ShardedTensor {key} ({dim_slice}, {dim_size})"
-            if dim_slice.stop > dim_size:
-                dim_slice = slice(dim_slice.start, dim_size, dim_slice.step)
-        return dim_slice
-
-    assert len(global_slice) == len(actual_shape), (global_slice, actual_shape, key)
-    return tuple(starmap(_merge_slice, zip(global_slice, actual_shape)))
-
-
-def _load_from_array(
-    sharded_tensor: ShardedTensor,
-    checkpoint_dir: Path,
-    load_directly_on_device: bool = False,
-    apply_flattened_range: bool = True,
-):
-    x = _load_regular_chunk(sharded_tensor, checkpoint_dir)
-    ten = postprocess_numpy_array(x, sharded_tensor, apply_flattened_range)
-    if load_directly_on_device:
-        sharded_tensor.data.data.copy_(ten)
-        return sharded_tensor.data
-    else:
-        return ten
-
-
-def _load_regular_chunk(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor)
-    arr = open_ts_array(checkpoint_dir / sharded_tensor.key)
-    if sharded_tensor.global_shape == arr.shape:
-        x = (
-            arr[sharded_tensor.global_slice()].read().result()
-        )  # flattened tensors loading is delayed
-    elif sharded_tensor.allow_shape_mismatch:
-        global_slice = merge_global_slice_with_shape(
-            sharded_tensor.global_slice(), arr.shape, sharded_tensor.key
-        )
-        x = arr[global_slice].read().result()  # flattened tensors loading is delayed
-    else:
-        _msg = (
-            f"Global shape mismatch for loaded ({arr.shape})"
-            f" and expected ({sharded_tensor.global_shape}) tensor"
-            f" for key {sharded_tensor.key}"
-        )
-        raise CheckpointingException(_msg)
-    return x
-
-
-def open_ts_array(arr_path: Path):
-    """Opens a Zarr file array with Tensorstore with basic setting.
-
-    Args:
-        arr_path (Path): path to a Zarr (Tensorstore) array
-    """
-    if not HAVE_TENSORSTORE:
-        raise RuntimeError(
-            "tensorstore is required, please install it with `pip install tensorstore`"
-        )
-    spec = {"driver": "zarr", "metadata_key": ".zarray", "kvstore": {}}
-    spec["kvstore"] = {"driver": "file", "path": str(arr_path)}
-    try:
-        arr = ts.open(ts.Spec(spec), open=True).result()
-    except Exception as e:
-        raise CheckpointingException(f"Array {arr_path} could not be loaded. Error: {e}") from e
-    return arr
diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py
index 97468e93a3d..065005177c6 100644
--- a/megatron/core/dist_checkpointing/strategies/torch.py
+++ b/megatron/core/dist_checkpointing/strategies/torch.py
@@ -1,13 +1,12 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 """ Strategies using PyTorch distributed.checkpoint as an underlying format. """
 import io
 import os
 import pickle
 import warnings
-from collections import ChainMap, defaultdict
+from collections import defaultdict
 from contextlib import contextmanager
-from dataclasses import dataclass
 from itertools import product
 from logging import getLogger
 from pathlib import Path
@@ -59,13 +58,6 @@
 from .cached_metadata_filesystem_reader import CachedMetadataFileSystemReader
 from .checkpointable import CheckpointableShardedTensor, LocalShardsContainer
 from .filesystem_async import FileSystemWriterAsync
-from .resharding import (
-    TensorReformulationMetadata,
-    apply_nd_flattened_tensors_reformulation,
-    is_nd_flattened_tensor,
-    nd_flattened_tensor_reformulated_global_shape,
-    restore_nd_flattened_tensors_formulation,
-)
 from .state_dict_saver import save_state_dict_async_finalize, save_state_dict_async_plan
 
 try:
@@ -91,6 +83,20 @@
 _metadata_fn: str = ".metadata"
 
 
+# dummy class needed to load mcore v0.15 checkpoints with optim. states
+class MCoreMetadata:
+    """ """
+
+    pass
+
+
+# dummy class needed to load old checkpoint weights
+class MCoreSavePlan:
+    """ """
+
+    pass
+
+
 def register_default_torch_strategies():
     """Register default strategies related to PyT Distributed backend."""
     register_default_strategy(
@@ -172,63 +178,29 @@ def sharded_tensor_to_torch_sharded_tensor(
         rank = torch.distributed.get_rank()
 
     some_sh_ten = sh_tens[0]
-    has_flattened_range = some_sh_ten.flattened_range is not None
 
     for sh_ten in sh_tens:
-        assert (sh_ten.flattened_range is not None) == has_flattened_range, sh_tens
         if not sh_ten.data.is_contiguous():
             sh_ten.data = sh_ten.data.contiguous()
 
-    if load_legacy_1d_flatten_tensors and len(some_sh_ten.global_shape) == 1:
-        # Legacy 1-D flattened tensors are loaded as non-flat regular ShardedTensors
-        has_flattened_range = False
-
     local_global_offsets = {}
 
     prepend_axis_num = sh_tens[0].prepend_axis_num
     # Determine local shards according to tensor type (see docs)
-    if has_flattened_range:
-        # Type (3) case: N-D flattened ShardedTensors
-        for sh_ten in sh_tens:
-            local_global_offsets.setdefault(sh_ten.local_chunk_offset_in_global(), []).append(
-                sh_ten
-            )
-            assert sh_ten.data.ndim == 1, sh_ten
-            sh_ten.data = sh_ten.data.view((1,) * len(sh_ten.global_shape) + (-1,))
-
-        # Global shape reformulation:
-        global_shape = nd_flattened_tensor_reformulated_global_shape(some_sh_ten)
-        offsets_shape = (1,) * len(
-            some_sh_ten.global_shape
-        )  # reformulated global shape has shape equal ti number of local chunks
-
-        local_shards = [
-            Shard.from_tensor_and_offsets(
-                sh_ten.data,
-                list(
-                    sh_ten.local_chunk_offset_in_global() + (sh_ten.flattened_range.start,)
-                ),  # additional flattened offset
-                rank,
-            )
-            for sh_ten in sh_tens
-        ]
-    else:
-        # Type (1) case: non-flat regular ShardedTensors
-        for sh_ten in sh_tens:
-            local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten)
-            sh_ten.data = sh_ten.data.view(
-                (1,) * prepend_axis_num + sh_ten.local_shape
-            )  # adjust to prepended_axis_num
+    # Type (1) case: non-flat regular ShardedTensors
+    for sh_ten in sh_tens:
+        local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten)
+        sh_ten.data = sh_ten.data.view(
+            (1,) * prepend_axis_num + sh_ten.local_shape
+        )  # adjust to prepended_axis_num
 
-        global_shape = some_sh_ten.global_shape
-        offsets_shape = some_sh_ten.data.shape  # includes prepended axes
+    global_shape = some_sh_ten.global_shape
+    offsets_shape = some_sh_ten.data.shape  # includes prepended axes
 
-        local_shards = [
-            Shard.from_tensor_and_offsets(
-                sh_ten.data, list(sh_ten.global_offset), rank  # simple case
-            )
-            for sh_ten in sh_tens
-        ]
+    local_shards = [
+        Shard.from_tensor_and_offsets(sh_ten.data, list(sh_ten.global_offset), rank)  # simple case
+        for sh_ten in sh_tens
+    ]
 
     # Create a ShardedTensor without invoking communication. Determine global shards
     world_size = torch.distributed.get_world_size()
@@ -240,20 +212,8 @@ def sharded_tensor_to_torch_sharded_tensor(
             # local shard
             placement = f"rank:{rank}/cuda"
             for sh_ten in local_global_offsets[offset]:
-                if has_flattened_range:
-                    assert offset == sh_ten.local_chunk_offset_in_global(), (
-                        offset,
-                        sh_ten.local_chunk_offset_in_global(),
-                    )
-                    # This is not an actual offset, but an offset of the whole shard
-                    # This is needed for a PyT Dist internal integrity check
-                    _shard_offset = sh_ten.local_chunk_offset_in_global() + (0,)
-                    size = (1,) * len(offsets_shape) + global_shape[-1:]
-                else:
-                    size = sh_ten.data.shape
-                    _shard_offset = offset
-                shard_metadata.append(ShardMetadata(_shard_offset, size, placement))
-
+                size = sh_ten.data.shape
+                shard_metadata.append(ShardMetadata(offset, size, placement))
         else:
             # pylint: disable=line-too-long
             # for shards from other ranks we provide simplistic data - this information will be discarded
@@ -261,11 +221,7 @@ def sharded_tensor_to_torch_sharded_tensor(
             # Due to a bug in PyT 24.05 container we must specify some concrete rank within a world size.
             # The exact rank doesn't matter as long as it's different than my rank - hence (rank + 1) % WS.
             placement = f"rank:{(rank + 1) % world_size}/cuda"
-            if has_flattened_range:
-                offset = offset + (0,)
-                size = (1,) * len(offsets_shape) + global_shape[-1:]
-            else:
-                size = offsets_shape
+            size = offsets_shape
             shard_metadata.append(ShardMetadata(offset, size, placement))
 
     tensor = some_sh_ten.data
@@ -286,9 +242,6 @@ def sharded_tensor_to_torch_sharded_tensor(
     # Store MCore related data as PyTShardedTensor attribute.
     # This won't be stored in the checkpoint, only for runtime purposes
     pyt_sh_ten.mcore_sh_ten = sh_ten.without_data()
-    pyt_sh_ten.mcore_metadata = {}
-    if has_flattened_range:
-        pyt_sh_ten.mcore_metadata['nd_reformulated_orig_global_shape'] = sh_ten.global_shape
     return pyt_sh_ten
 
 
@@ -400,13 +353,9 @@ def _unwrap_pyt_sharded_tensor(
     ret_tensors = []
     for sh in sh_ten.local_shards():
         ten = sh.tensor
-        if mcore_sh_ten.flattened_range is not None:
-            assert ten.shape[:-1] == (1,) * (len(ten.shape) - 1), ten.shape
-            ten = ten.view(-1)
-        else:
-            for _ in range(mcore_sh_ten.prepend_axis_num):
-                assert ten.size(0) == 1
-                ten = ten[0]  # NOTE: ten.squeeze(0) uses more memory for FP8 tensors
+        for _ in range(mcore_sh_ten.prepend_axis_num):
+            assert ten.size(0) == 1
+            ten = ten[0]  # NOTE: ten.squeeze(0) uses more memory for FP8 tensors
         ret_tensors.append(ten)
     return ret_tensors
 
@@ -458,22 +407,6 @@ def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[dict, li
             _restore_dict_types(x_val, templ_val)
 
 
-@dataclass
-class MCoreMetadata(Metadata):
-    """Metadata with mcore specific data."""
-
-    # holds data related to flattened_range
-    # TODO: remove when flattened_range is properly removed
-    mcore_data: Optional[Dict[str, Dict[str, Any]]] = None  # Mcore related data about each tensor
-
-
-@dataclass(frozen=True)
-class MCoreSavePlan(SavePlan):
-    """SavePlan with MCore specific data."""
-
-    mcore_data: Optional[Dict[str, Dict[str, Any]]] = None  # Mcore related data about each tensor
-
-
 class MCoreSavePlanner(DefaultSavePlanner):
     """Differs with the default planner by saving BytesIO objects on all ranks.
 
@@ -489,7 +422,6 @@ def __init__(
         self,
         *args,
         dedup_replicated_tensors: Optional[bool] = None,
-        nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None,
         can_run_decentralized_global_plan: bool = True,
         **kwargs,
     ) -> None:
@@ -498,7 +430,6 @@ def __init__(
         if get_torch_version() <= PkgVersion("2.2"):
             kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors
         super().__init__(*args, **kwargs)
-        self.nd_flattened_global_shapes = nd_flattened_global_shapes or {}
         self.can_run_decentralized_global_plan = can_run_decentralized_global_plan
         if can_run_decentralized_global_plan:
             assert (
@@ -525,26 +456,9 @@ def create_local_plan(self) -> SavePlan:
             # For MCore, these should be already non-duplicates.
             write_items += _create_write_items(fqn, obj)
 
-        self.plan = MCoreSavePlan(
-            items=write_items,
-            planner_data=self.mappings,
-            mcore_data={
-                k: sh_ten.mcore_metadata
-                for k, sh_ten in self.state_dict.items()
-                if isinstance(sh_ten, TorchShardedTensor)
-            },
-        )
+        self.plan = SavePlan(items=write_items, planner_data=self.mappings)
         return self.plan
 
-    def create_global_plan(self, all_plans: List[MCoreSavePlan]) -> Tuple[List[SavePlan], Metadata]:
-        """Merges MCore data for all plans."""
-        global_plan, metadata = super().create_global_plan(all_plans)
-        mcore_data = dict(
-            ChainMap(*(plan.mcore_data for plan in all_plans))  # type: ignore[arg-type]
-        )
-        metadata = MCoreMetadata(mcore_data=mcore_data, **vars(metadata))
-        return global_plan, metadata
-
     def create_decentralized_global_plan(self, local_plan: SavePlan) -> SavePlan:
         """Nothing to do, just some checks.
 
@@ -586,14 +500,6 @@ def __init__(
         self.allow_shape_mismatch_sharded_tensors = allow_shape_mismatch_sharded_tensors
         self._intermediate_read_item_and_target: Optional[Tuple[ReadItem, torch.Tensor]] = None
 
-    @staticmethod
-    def _expected_shape(sh_ten):
-        return (
-            nd_flattened_tensor_reformulated_global_shape(sh_ten)
-            if is_nd_flattened_tensor(sh_ten)
-            else sh_ten.global_shape
-        )
-
     def _validate_global_shapes(self, metadata, sharded_tensors):
         for sh_ten in sharded_tensors:
             if sh_ten.key not in metadata.state_dict_metadata:
@@ -602,14 +508,8 @@ def _validate_global_shapes(self, metadata, sharded_tensors):
                     f" {sorted(metadata.state_dict_metadata.keys())}"
                 )
             loaded_shape = metadata.state_dict_metadata[sh_ten.key].size
-            expected_shape = self._expected_shape(sh_ten)
+            expected_shape = sh_ten.global_shape
             if loaded_shape != expected_shape:
-                if is_nd_flattened_tensor(sh_ten) and len(sh_ten.global_shape) == 1:
-                    # Handle legacy 1-D flattened tensors checkpoint format
-                    # where the global shape is not stored in the metadata
-                    expected_shape = sh_ten.global_shape
-                    if loaded_shape == expected_shape:
-                        continue
                 _msg = (
                     f'Global shape mismatch for loaded ({loaded_shape})'
                     f' and expected ({expected_shape}) tensor'
@@ -635,7 +535,7 @@ def _temporarily_bypass_shape_validation(self):
         try:
             # Temporarily set sizes to expected shapes
             for md, _, sharded_tensor in metadata_with_sizes:
-                md.size = self._expected_shape(sharded_tensor)
+                md.size = sharded_tensor.global_shape
             yield
         finally:
             # Restore original sizes after yield
@@ -860,48 +760,6 @@ def _get_filesystem_reader(
     return FileSystemReader(checkpoint_dir)
 
 
-def get_reformulation_metadata(
-    sharded_state_dict: ShardedStateDict, checkpoint_dir: Path
-) -> Dict[str, TensorReformulationMetadata]:
-    """Reads MCore data for N-D flattened tensors from checkpoint metadata during ckpt load.
-
-    Args:
-        sharded_state_dict (ShardedStateDict): sharded state dict to load
-        checkpoint_dir (Path): checkpoint directory
-
-    Returns:
-        Dict[str, TensorReformulationMetadata] - dictionary that maps keys of every
-            N-D flattened tensor from the sharded_state_dict to its original global shape
-            as stored in `mcore_data` in the checkpoint.
-    """
-    fs_reader = _get_filesystem_reader(checkpoint_dir)
-    ckpt_metadata = fs_reader.read_metadata()
-    reformulation_metadata = {}
-    for sh_ten in nested_values(sharded_state_dict):
-        if not is_nd_flattened_tensor(sh_ten):
-            continue
-        try:
-            ckpt_global_shape = ckpt_metadata.mcore_data[sh_ten.key][
-                'nd_reformulated_orig_global_shape'
-            ]
-        except KeyError as e:
-            if len(sh_ten.global_shape) == 1:
-                warnings.warn(
-                    f'Legacy checkpoint format detected for 1-D flattened tensor {sh_ten}. '
-                    'Skip metadata reformulation.'
-                )
-                continue
-            raise CheckpointingException(
-                f'Cannot find global shape metadata for N-D flattened tensor {sh_ten} '
-                f'in checkpoint metadata: {ckpt_metadata.mcore_data}'
-            ) from e
-
-        reformulation_metadata[sh_ten.key] = TensorReformulationMetadata(
-            ckpt_global_shape, ckpt_metadata.state_dict_metadata[sh_ten.key].size
-        )
-    return reformulation_metadata
-
-
 class TorchDistLoadShardedStrategy(LoadShardedStrategy):
     """Basic load strategy for the PyT Distributed format."""
 
@@ -919,25 +777,14 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
 
         Returns: loaded state dict
         """
+        ######### FlagScale Begin #########
         # Get the value from the environment variable if it exists, otherwise default to True
         single_file_per_tensor_ckpt = os.getenv('FS_SFPT_CKPT_LOAD', 'False').lower() in (
             'true',
             '1',
             't',
         )
-
-        # Apply N-D tensors resharding
-        reformulation_metadata = get_reformulation_metadata(sharded_state_dict, checkpoint_dir)
-        sharded_state_dict, formulation_restore_data = apply_nd_flattened_tensors_reformulation(
-            sharded_state_dict, reformulation_metadata
-        )
-
-        # Check if there are legacy 1-D flattened tensors in the checkpoint
-        has_legacy_1d_flattened_tensors = False
-        for sh_ten in nested_values(sharded_state_dict):
-            if is_nd_flattened_tensor(sh_ten) and sh_ten.key not in reformulation_metadata:
-                has_legacy_1d_flattened_tensors = True
-                break
+        ######### FlagScale End #########
 
         flexible_shape_sharded_tensors = [
             sh_ten
@@ -955,9 +802,7 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
         (sharded_state_dict, flat_mapping, rename_mapping) = (
             _replace_state_dict_keys_with_sharded_keys(sharded_state_dict)
         )
-        pyt_state_dict = mcore_to_pyt_state_dict(
-            sharded_state_dict, True, load_legacy_1d_flatten_tensors=has_legacy_1d_flattened_tensors
-        )
+        pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True)
         # Load PyT Distributed format
         fsr = _get_filesystem_reader(checkpoint_dir, cache_metadata=True)
         if not single_file_per_tensor_ckpt: 
@@ -966,7 +811,7 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
                 fsr,
                 planner=MCoreLoadPlanner(
                     shapes_validation_sharded_tensors=flexible_shape_sharded_tensors,
-                allow_shape_mismatch_sharded_tensors=allow_shape_mismatch_sharded_tensors,
+                    allow_shape_mismatch_sharded_tensors=allow_shape_mismatch_sharded_tensors,
                 ),
             )
         else:
@@ -1000,10 +845,6 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
             mcore_state_dict, flat_mapping, rename_mapping  # type: ignore[arg-type]
         )
         _restore_dict_types(mcore_state_dict, orig_sharded_state_dict)
-        # Apply N-D tensors resharding postprocessing
-        mcore_state_dict = restore_nd_flattened_tensors_formulation(
-            mcore_state_dict, formulation_restore_data
-        )
         return mcore_state_dict
 
     def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metadata = None):
@@ -1018,25 +859,10 @@ def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metadata = None)
             if not isinstance(tp, TensorStorageMetadata):
                 continue  # load only tensors
 
-            nd_orig_global_shape = mcore_data.get(k, {}).get('nd_reformulated_orig_global_shape')
-            if nd_orig_global_shape is None:
-                # Regular tensor
-                sharded_metadata[k] = ShardedTensor.from_rank_offsets(
-                    k, torch.empty(tp.size, **tp.properties.__dict__, device='meta')
-                ).without_data()
-            else:
-                # N-D flattened tensor
-                unflat_ten = torch.empty(
-                    nd_orig_global_shape, **tp.properties.__dict__, device='meta'
-                )
-                flat_ten = unflat_ten.flatten()
-                sharded_metadata[k] = ShardedTensor.from_rank_offsets_flat(
-                    k,
-                    flat_ten,
-                    unflat_ten.shape,
-                    flattened_range=slice(0, unflat_ten.numel()),  # whole slice
-                ).without_data()
-
+            # Regular tensor
+            sharded_metadata[k] = ShardedTensor.from_rank_offsets(
+                k, torch.empty(tp.size, **tp.properties.__dict__, device='meta')
+            ).without_data()
         return sharded_metadata
 
     def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict:
diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py
deleted file mode 100644
index a9500525bf2..00000000000
--- a/megatron/core/dist_checkpointing/strategies/two_stage.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" 2-stage checkpoint loading. """
-import time
-from collections import defaultdict
-from dataclasses import dataclass
-from functools import partial, wraps
-from itertools import chain
-from logging import getLogger
-from operator import attrgetter, itemgetter
-from pathlib import Path
-from typing import List, Optional, Tuple, Union
-
-import torch
-
-from ..dict_utils import dict_list_map_inplace, map_reduce, nested_values
-from ..mapping import ShardedStateDict, ShardedTensor
-from .base import LoadShardedStrategy
-from .tensorstore import _load_from_array, open_ts_array
-from .zarr import flatten_range, load_zarr_based_sharded_metadata
-
-_import_trigger = None
-
-
-timers = defaultdict(list)
-
-logger = getLogger(__name__)
-logger.warning(
-    'megatron.core.dist_checkpointing.two_stage module is deprecated'
-    ' and will be removed in Megatron-Core v0.12. Please use'
-    ' FullyParallelLoadStrategyWrapper to accomplish a parallelized checkpoint load.'
-)
-
-
-def timed(verbose=True):
-    """Timing decorator."""
-
-    def timed_dec(fn):
-        name = fn.__name__
-
-        @wraps(fn)
-        def wrapped(*args, **kwargs):
-            if verbose:
-                logger.debug(f'{name} init')
-            start = time.time()
-            ret = fn(*args, **kwargs)
-            took = time.time() - start
-            if verbose:
-                logger.debug(f'{name} took {took}s')
-            timers[name].append(took)
-            return ret
-
-        return wrapped
-
-    return timed_dec
-
-
-@dataclass
-class _ShardedTensorMetadata:
-    global_rank: int
-    sharded_tensor_no_data: ShardedTensor
-    dist_group_rank: Tuple[int]  # id of distributed group
-    dist_group_ranks: Tuple[int]  # id of distributed group
-    data_size: Optional[int] = None  # bytes
-
-
-def sharded_tensor_chunk_id(sharded_tensor: ShardedTensor):
-    """Id of a sharded tensor."""
-    return (sharded_tensor.key, sharded_tensor.global_offset)
-
-
-class TwoStageDataParallelLoadShardedStrategy(LoadShardedStrategy):
-    """Loads one checkpoint replica from storage and broadcasts to other nodes.
-
-    This strategy loads checkpoint from storage on minimal set of nodes
-    and distributes the checkpoint to other nodes with torch.distributed.
-    Loading is performed with tensorstore.
-
-    Steps:
-    0. (optional) create Gloo distributed groups
-    1. Exchange ShardedTensors metadata between all nodes
-    2. Align needed tensors within DP groups
-    3. For each globally unique tensor:
-    3.a) on one of the ranks load it from storage to CPU and move to CUDA
-    3.b) allocate CUDA tensor on other ranks
-    3.c) broadcast within DP group
-    3.d) copy tensor content to the model param location
-    3.e) free tensor buffers from a) and b)
-
-    Notes:
-    1. Loading and broadcasting is done sequentially to avoid both host and device OOMs
-    2. There is a lot of overlap potential between all three steps done for each tensor:
-    2.a) loading from storage to numpy
-    2.b) moving CPU tensors to CUDA
-    2.c) broadcast
-    """
-
-    def __init__(self, data_parallel_group, cpu_transfer=True):
-        super().__init__()
-
-        self.cpu_transfer = cpu_transfer
-        self.data_parallel_group_orig = data_parallel_group
-        self.data_parallel_group = None if cpu_transfer else data_parallel_group
-        self.dp_group_ranks = tuple(
-            sorted(torch.distributed.get_process_group_ranks(data_parallel_group))
-        )
-        self.dp_group_rank = self.data_parallel_group_orig.rank()
-        self.global_rank = torch.distributed.get_rank()
-
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-        """Main load method."""
-        self.maybe_init_gloo_group()
-        all_tensors_sorted = self._build_load_plan(sharded_state_dict)
-        self._exchange_loaded_tensors(all_tensors_sorted, sharded_state_dict, checkpoint_dir)
-        # TODO: fix hang in summarize_load_times
-        # self.summarize_load_times()
-        return sharded_state_dict
-
-    def summarize_load_times(self):
-        """Summarize load times."""
-        torch.distributed.barrier()
-        logger.info('Checkpoint loading finished. Summary:')
-        # TODO: `timers` keys are not guaranteed to be the same across ranks which causes hangs
-        for key, times in sorted(timers.items()):
-            times_sum = sum(times)
-            max_times = torch.tensor([times_sum], device='cuda')
-            avg_times = torch.tensor([times_sum], device='cuda')
-            torch.distributed.all_reduce(max_times, op=torch.distributed.ReduceOp.MAX)
-            torch.distributed.all_reduce(avg_times, op=torch.distributed.ReduceOp.SUM)
-            avg_times /= torch.distributed.get_world_size()
-            if torch.distributed.get_rank() == 0:
-                logger.info(f'{key}: max {max_times[0]}, avg {avg_times[0]}')
-
-    @timed(verbose=False)
-    def load_tensor_from_storage(self, checkpoint_dir, ten_meta: _ShardedTensorMetadata):
-        """Load tensor from storage."""
-        logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) init')
-        ret = _load_from_array(
-            ten_meta.sharded_tensor_no_data,
-            checkpoint_dir,
-            load_directly_on_device=False,
-            apply_flattened_range=False,
-        )
-        logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) DONE')
-        return ret
-
-    @timed()
-    def maybe_init_gloo_group(self):
-        """Create Gloo groups."""
-        if not self.cpu_transfer:
-            return
-        all_groups = [None] * torch.distributed.get_world_size()
-        torch.distributed.all_gather_object(all_groups, self.dp_group_ranks)
-        all_groups = set(tuple(sorted(gr)) for gr in all_groups)
-        for group_ranks in sorted(all_groups):
-            # "two_stage" module will be deprecated, so not replace new_group()
-            # with ...parallel_state.create_group() func setting group_desc here.
-            gloo_pg = torch.distributed.new_group(ranks=group_ranks, backend='gloo')
-            if self.global_rank in group_ranks:
-                self.data_parallel_group = gloo_pg
-                assert self.dp_group_rank == self.data_parallel_group.rank()
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-    @timed()
-    def _build_load_plan(
-        self, sharded_state_dict: ShardedStateDict
-    ) -> List[_ShardedTensorMetadata]:
-        local_meta = [
-            _ShardedTensorMetadata(
-                self.global_rank,
-                sharded_ten.without_data(),
-                self.dp_group_rank,
-                self.dp_group_ranks,
-            )
-            for sharded_ten in nested_values(sharded_state_dict)
-        ]
-        all_meta = [None] * self.data_parallel_group.size()
-        torch.distributed.all_gather_object(all_meta, local_meta, group=self.data_parallel_group)
-        all_meta = list(chain.from_iterable(all_meta))
-        all_tensors_sorted = self.deduplicate_chunks(all_meta)
-        return all_tensors_sorted
-
-    @timed()
-    def deduplicate_chunks(self, ten_metas: List[_ShardedTensorMetadata]):
-        """Group tensors by chunk and then pick the tensor with the lowest rank.
-
-        NOTE: with proper loading overlap, loading from randomized ranks
-         (instead of the smallest one) could be beneficial here.
-        """
-        ten_metas = map_reduce(
-            ten_metas,
-            key_fn=lambda meta: sharded_tensor_chunk_id(meta.sharded_tensor_no_data),
-            reduce_fn=partial(min, key=attrgetter('dist_group_rank')),
-        )
-        all_metas_sorted = list(map(itemgetter(1), sorted(ten_metas.items())))
-        return all_metas_sorted
-
-    @timed()
-    def _exchange_loaded_tensors(
-        self, ten_metas: List[_ShardedTensorMetadata], sharded_state_dict, checkpoint_dir
-    ):
-        logger.debug(f'_exchange_loaded_tensors, num ten_metas: {len(ten_metas)}')
-        for ten_meta in ten_metas:
-
-            src_rank = torch.distributed.get_global_rank(
-                self.data_parallel_group, ten_meta.dist_group_rank
-            )
-
-            if self.dp_group_rank == ten_meta.dist_group_rank:
-                exchange_tensor = self.load_tensor_from_storage(checkpoint_dir, ten_meta)
-                if not self.cpu_transfer:
-                    exchange_tensor = exchange_tensor.cuda()
-            else:
-                # TODO: for non-flattened ranges we could reuse the buffer from the start here
-                exchange_tensor = torch.empty(
-                    ten_meta.sharded_tensor_no_data.local_shape,
-                    device='cpu' if self.cpu_transfer else 'cuda',
-                    dtype=ten_meta.sharded_tensor_no_data.dtype,
-                )
-
-            logger.debug(
-                f'exchange {ten_meta.sharded_tensor_no_data.key}, {exchange_tensor.shape}\
-({exchange_tensor.numel()}), broadcast({src_rank} -> {self.dp_group_ranks})'
-            )
-            torch.distributed.broadcast(
-                exchange_tensor, group=self.data_parallel_group, src=src_rank
-            )
-            self._distribute_data_to_state_dict(ten_meta, exchange_tensor, sharded_state_dict)
-            logger.debug(f'exchange {ten_meta.sharded_tensor_no_data.key} done')
-
-            # free buffer memory
-            exchange_tensor = None
-
-    @timed(verbose=False)
-    def _distribute_data_to_state_dict(
-        self,
-        ten_meta: _ShardedTensorMetadata,
-        loaded_ten: torch.Tensor,
-        sharded_state_dict: ShardedStateDict,
-    ):
-        tensor_key = sharded_tensor_chunk_id(ten_meta.sharded_tensor_no_data)
-
-        def _fill_in_data(t: Union[ShardedTensor, torch.Tensor]):
-            if not isinstance(t, ShardedTensor) or sharded_tensor_chunk_id(t) != tensor_key:
-                # already filled-in or key not matching
-                return t
-            sharded_tensor: ShardedTensor = t
-            x = loaded_ten
-            if sharded_tensor.flattened_range is not None:
-                x = flatten_range(sharded_tensor, x)
-
-            # Reuse existing buffer
-            sharded_tensor.data.data.copy_(x)
-            return sharded_tensor.data
-
-        dict_list_map_inplace(_fill_in_data, sharded_state_dict)
-
-    def load_tensors_metadata(self, checkpoint_dir: Path):
-        def get_ts_shape_dtype(path):
-            arr = open_ts_array(path)
-            return arr.shape, arr.dtype.numpy_dtype
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype)
diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py
deleted file mode 100644
index cea21ddea16..00000000000
--- a/megatron/core/dist_checkpointing/strategies/zarr.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Strategies using Zarr as an underlying format."""
-
-import logging
-import os
-from functools import partial
-from logging import getLogger
-from pathlib import Path
-from typing import Callable, List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-
-from ..core import CheckpointingException
-from ..dict_utils import dict_list_map_inplace, nested_values
-from ..mapping import ShardedStateDict, ShardedTensor, is_main_replica
-from .base import (
-    LoadShardedStrategy,
-    SaveShardedStrategy,
-    StrategyAction,
-    register_default_strategy,
-)
-
-logger = logging.getLogger(__name__)
-
-try:
-    import zarr
-
-    HAVE_ZARR = True
-except ImportError:
-    from unittest.mock import MagicMock
-
-    zarr = MagicMock()
-    HAVE_ZARR = False
-
-
-numpy_to_torch_dtype_dict = {
-    np.dtype("bool"): torch.bool,
-    np.dtype("uint8"): torch.uint8,
-    np.dtype("int8"): torch.int8,
-    np.dtype("int16"): torch.int16,
-    np.dtype("int32"): torch.int32,
-    np.dtype("int64"): torch.int64,
-    np.dtype("float16"): torch.float16,
-    np.dtype("float32"): torch.float32,
-    np.dtype("float64"): torch.float64,
-    np.dtype("complex64"): torch.complex64,
-    np.dtype("complex128"): torch.complex128,
-}
-
-torch_to_numpy_dtype_dict = {v: k for k, v in numpy_to_torch_dtype_dict.items()}
-
-
-try:
-    # Register a bfloat16 type with this import
-    import tensorstore  # pylint: disable=unused-import
-
-    HAS_BFLOAT16 = True
-    numpy_to_torch_dtype_dict[np.dtype("bfloat16")] = torch.bfloat16
-    torch_to_numpy_dtype_dict[torch.bfloat16] = np.dtype("bfloat16")
-except ImportError:
-    HAS_BFLOAT16 = False
-
-logger = getLogger(__name__)
-
-
-def register_default_zarr_strategies():
-    """Register default strategies related to Zarr backend."""
-    register_default_strategy(
-        StrategyAction.SAVE_SHARDED, "zarr", 1, ZarrSaveShardedStrategy("zarr", 1)
-    )
-
-
-class ZarrSaveShardedStrategy(SaveShardedStrategy):
-    """Save strategy for Zarr backend."""
-
-    def __init__(self, backend: str, version: int):
-        super().__init__(backend, version)
-        raise CheckpointingException(
-            "`zarr` distributed checkpoint backend is no longer supported. "
-            "Please switch to PyTorch Distributed format (`torch_dist`)."
-        )
-
-    def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
-        if isinstance(checkpoint_dir, str):
-            checkpoint_dir = Path(checkpoint_dir)
-
-        sharded_tensors = list(nested_values(sharded_state_dict))
-        arrays = _create_or_open_zarr_arrays(sharded_tensors, checkpoint_dir)
-        for ten, arr in zip(sharded_tensors, arrays):
-            _save_to_existing_array(ten, arr)
-        torch.distributed.barrier()
-
-
-def _create_or_open_zarr_arrays(
-    sharded_tensors: List[ShardedTensor], checkpoint_dir: Path
-) -> List[Optional[zarr.Array]]:
-    """Returns list of zarr arrays corresponding to given tensors.
-
-    For a sharded tensors that:
-    a) is main replica and represents the first chunk (all offsets 0), creates the Zarr array
-    b) is main replica but not the first chunk,
-        opens the arrays created in (a) (possibly by other process)
-    c) otherwise, sets the corresponding array to None since it won't be used
-
-    Args:
-        sharded_tensors (List[ShardedTensor]): sharded tensors from a given rank
-            that will be saved to checkpoint
-        checkpoint_dir (Path): checkpoint in which the arrays will be created
-    """
-    if not HAVE_ZARR:
-        raise RuntimeError("zarr is required, please install it with `pip install zarr`")
-
-    arrays = []
-    for ten in sharded_tensors:
-        arr = _create_zarr_array(ten, checkpoint_dir) if _should_create_array(ten) else None
-        arrays.append(arr)
-
-    torch.distributed.barrier()
-    # Open arrays created above by other processes
-    for arr_idx, ten in enumerate(sharded_tensors):
-        if arrays[arr_idx] is not None:
-            # array created by this process
-            assert _should_create_array(ten), ten
-            continue
-        if not is_main_replica(ten.replica_id):
-            # this array won't be needed for saving and can stay None
-            continue
-        open_kwargs = {}
-        if ten.flattened_range is not None:
-            open_kwargs["synchronizer"] = zarr.ProcessSynchronizer(
-                str(checkpoint_dir / f"{ten.key}.sync")
-            )
-        arrays[arr_idx] = _open_zarr_array_verbose(checkpoint_dir / ten.key, "r+", **open_kwargs)
-    return arrays
-
-
-def _should_create_array(ten: ShardedTensor):
-    return (
-        is_main_replica(ten.replica_id)
-        and set(ten.global_offset) == {0}
-        and (ten.flattened_range is None or ten.flattened_range.start == 0)
-    )
-
-
-def _save_to_existing_array(sharded_tensor: ShardedTensor, arr: Optional[zarr.Array]):
-    if not is_main_replica(sharded_tensor.replica_id):
-        return
-    assert arr is not None
-    x = sharded_tensor.data
-    x = x.detach().cpu()
-    torch.cuda.synchronize()
-    if x.dtype == torch.bfloat16:
-        x = x.float()
-        x = x.numpy()
-        x = x.astype("bfloat16")
-    else:
-        x = x.numpy()
-
-    if sharded_tensor.flattened_range is None:
-        arr[sharded_tensor.global_slice()] = x
-    else:
-        arr.set_coordinate_selection(sharded_tensor.global_coordinates(), x)
-
-
-def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    np_dtype = torch_to_numpy_dtype_dict[sharded_tensor.dtype]
-    try:
-        arr = zarr.create(
-            sharded_tensor.global_shape,
-            dtype=np_dtype,
-            store=checkpoint_dir / sharded_tensor.key,
-            chunks=sharded_tensor.max_allowed_chunks(),
-            compressor=None,
-            fill_value=None,
-            write_empty_chunks=True,
-            synchronizer=(
-                zarr.ProcessSynchronizer(str(checkpoint_dir / f'{sharded_tensor.key}.sync'))
-                if sharded_tensor.flattened_range is not None
-                else None
-            ),
-        )
-        logger.debug(f"Created a new Zarr array at {checkpoint_dir / sharded_tensor.key}")
-    except zarr.errors.ContainsArrayError as e:
-        raise CheckpointingException(
-            f"Array {checkpoint_dir / sharded_tensor.key} already exists"
-        ) from e
-
-    if HAS_BFLOAT16 and np_dtype == np.dtype("bfloat16"):
-        arr._dtype = np_dtype
-        zarray = arr.store[".zarray"]
-        arr.store[".zarray"] = zarray.replace(b"<V2", b"bfloat16")
-    return arr
-
-
-class ZarrLoadShardedStrategy(LoadShardedStrategy):
-    """Load strategy for the Zarr backend."""
-
-    def __init__(self, backend: str, version: int):
-        super().__init__(backend, version)
-        raise CheckpointingException(
-            "`zarr` distributed checkpoint backend is no longer supported. "
-            "Please switch to PyTorch Distributed format (`torch_dist`)."
-        )
-
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
-        if isinstance(checkpoint_dir, str):
-            checkpoint_dir = Path(checkpoint_dir)
-
-        dict_list_map_inplace(
-            partial(_load_from_array, checkpoint_dir=checkpoint_dir), sharded_state_dict
-        )
-        return sharded_state_dict
-
-    def load_tensors_metadata(self, checkpoint_dir: Union[str, Path]):
-        def get_zarr_shape_dtype(path):
-            arr = zarr.open(path, "r")
-            return arr.shape, arr.dtype
-
-        if isinstance(checkpoint_dir, str):
-            checkpoint_dir = Path(checkpoint_dir)
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_zarr_shape_dtype)
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-
-def _load_from_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor)
-    arr = _open_zarr_array_verbose(checkpoint_dir / sharded_tensor.key, "r")
-
-    if not sharded_tensor.allow_shape_mismatch and sharded_tensor.global_shape != arr.shape:
-        _msg = (
-            f"Global shape mismatch for loaded ({arr.shape})"
-            f" and expected ({sharded_tensor.global_shape}) tensor"
-            f" for key {sharded_tensor.key}"
-        )
-        raise CheckpointingException(_msg)
-
-    x = arr[sharded_tensor.global_slice()]  # flattened tensors loading is delayed
-    return postprocess_numpy_array(x, sharded_tensor)
-
-
-def _open_zarr_array_verbose(path: Path, mode: str, **open_kwargs):
-    try:
-        return zarr.open(str(path), mode, **open_kwargs)
-    except zarr.errors.PathNotFoundError as e:
-        ckpt_dir = path.parent
-        err_msg = f"Array {path} not found"
-        if ckpt_dir.exists():
-            ckpt_files = [f.name for f in ckpt_dir.iterdir()]
-            logger.debug(f"{err_msg}. Checkpoint directory {ckpt_dir} content: {ckpt_files}")
-        else:
-            err_msg += f". Checkpoint directory {ckpt_dir} does not exist."
-        raise CheckpointingException(err_msg) from e
-
-
-def postprocess_numpy_array(loaded_array, sharded_tensor, apply_flattened_range=True):
-    """Turn numpy array to torch tensor."""
-    x = loaded_array
-    if HAS_BFLOAT16 and x.dtype == np.dtype("bfloat16"):
-        x = x.astype(np.dtype("float32"))
-        x = torch.from_numpy(x)
-        x = x.bfloat16()
-    else:
-        x = torch.from_numpy(x)
-    # TODO: consider some other consistency checks
-    if x.shape != sharded_tensor.local_shape:
-        if sharded_tensor.allow_shape_mismatch:
-            x = pad_to_expected_shape(x, sharded_tensor)
-        else:
-            _msg = (
-                f"Local shape mismatch for loaded ({x.shape})"
-                f" and expected ({sharded_tensor.local_shape}) tensor"
-                f" for key {sharded_tensor.key}"
-            )
-            raise CheckpointingException(_msg)
-
-    if apply_flattened_range and sharded_tensor.flattened_range is not None:
-        x = flatten_range(sharded_tensor, x)
-
-    # TODO: consider cuda() tensors support
-    return x
-
-
-def flatten_range(sharded_tensor, x):
-    """Apply flattened range to a tensor."""
-    return x.flatten()[sharded_tensor.flattened_range]
-
-
-def pad_to_expected_shape(x: torch.Tensor, expected_sharded_ten: ShardedTensor):
-    """Pad tensor to the expected shape."""
-    pad_args = []
-    assert len(x.shape) == len(expected_sharded_ten.local_shape)
-    # Reversed iteration order because F.pad expects so
-    for x_sh, exp_sh, axis_fragm in reversed(
-        list(
-            zip(x.shape, expected_sharded_ten.local_shape, expected_sharded_ten.axis_fragmentations)
-        )
-    ):
-        if x_sh == exp_sh:
-            pad_args.extend((0, 0))
-        elif x_sh > exp_sh:
-            assert False, (
-                f"Expected shape ({exp_sh}) smaller than actual ({x_sh})"
-                f" for {repr(expected_sharded_ten)}"
-            )
-        else:
-            pad_args.extend((0, exp_sh - x_sh))
-    # TODO: behavior control with envvar is for testing purposes only, remove it
-    if not int(os.environ.get("DIST_CKPT_PAD_REPLICATE", 0)):
-        return torch.nn.functional.pad(x, pad_args)
-
-    # unsqueeze and squeeze to get shapes supported by cudnn
-    logger.info(f"Replicating last row for {expected_sharded_ten.key}")
-    if x.dtype == torch.bfloat16:
-        return (
-            torch.nn.functional.pad(x.float().unsqueeze(0), pad_args, mode="replicate")
-            .squeeze(0)
-            .bfloat16()
-        )
-    return torch.nn.functional.pad(x.unsqueeze(0), pad_args, mode="replicate").squeeze(0)
-
-
-def load_zarr_based_sharded_metadata(
-    checkpoint_dir: Path, get_shape_dtype_fn: Callable[[str], Tuple[Tuple[int], np.dtype]]
-) -> ShardedStateDict:
-    """Load metadata of Zarr arrays.
-
-    Args:
-        checkpoint_dir (str): checkpoint root directory
-        get_shape_dtype_fn (str -> ((int, ...), np.dtype)): a function returning
-            an array shape and dtype for a given Zarr array path
-    """
-
-    sharded_state_dict = {}
-    for subdir in checkpoint_dir.iterdir():
-        if not subdir.is_dir() or not (subdir / ".zarray").exists() or subdir.suffix == ".sync":
-            continue
-        key = subdir.name
-        arr_shape, arr_dtype = get_shape_dtype_fn(str(subdir))
-
-        sharded_state_dict[key] = ShardedTensor(
-            key,
-            None,
-            numpy_to_torch_dtype_dict[arr_dtype],
-            arr_shape,
-            arr_shape,
-            tuple(0 for _ in arr_shape),
-            tuple(1 for _ in arr_shape),
-        )
-    return sharded_state_dict
diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py
index 6dcab0c0dda..161a3477725 100644
--- a/megatron/core/dist_checkpointing/utils.py
+++ b/megatron/core/dist_checkpointing/utils.py
@@ -330,3 +330,20 @@ def debug_msg(msg: str):
     """
     with logger_stack(None, None) as (stacked_name, last_logger):
         last_logger.debug(f"{stacked_name} {msg}")
+
+
+def _clean_metadata_for_serialization(metadata: dict) -> dict:
+    """Create a clean copy of metadata for serialization by removing non-serializable objects.
+
+    Args:
+        metadata: Original metadata dict
+
+    Returns:
+        Clean metadata dict suitable for serialization
+    """
+    if metadata is None:
+        return None
+    clean_metadata = metadata.copy()
+    # Remove dp_cp_group as it's not serializable
+    clean_metadata.pop('dp_cp_group', None)
+    return clean_metadata
diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py
index 9bcb59bdbf4..48f2bda8737 100644
--- a/megatron/core/dist_checkpointing/validation.py
+++ b/megatron/core/dist_checkpointing/validation.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import logging
 import os
 from collections import Counter, defaultdict
@@ -10,12 +11,7 @@
 
 from megatron.core.dist_checkpointing import ShardedTensor
 from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config
-from megatron.core.dist_checkpointing.dict_utils import (
-    diff,
-    extract_matching_values,
-    map_reduce,
-    nested_values,
-)
+from megatron.core.dist_checkpointing.dict_utils import diff, extract_matching_values, nested_values
 from megatron.core.dist_checkpointing.mapping import (
     CommonStateDict,
     ShardedBase,
@@ -468,29 +464,12 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]):
                 local_shape,
                 some_rank_shard,
             )
-        assert (sharding.flattened_range is not None) == has_flattened_range, (
-            (sharding.flattened_range is not None),
-            has_flattened_range,
-            some_rank_shard,
-        )
 
     if not has_regular_sharding_grid:
         # In case of uneven sharding we defer the validation to DCP
         return
 
     shard_access_cnt = _compute_shards_access(rank_sharding)
-    if has_flattened_range:
-        map_reduce(
-            rank_sharding,
-            lambda x: x[1].global_offset,
-            lambda x: x[1],
-            _validate_sharding_for_key_flattened,
-        )
-        # For each shard with at least 1 flattened tensor in it, the above
-        # `_validate_sharding_for_key_flattened` ensure a correct consistent pattern
-        # The only thing that can go wrong at this point is that some shard don't have
-        # *any* representatives which will be checked later by comparing `shard_access_cnt == 1`
-        shard_access_cnt = torch.minimum(shard_access_cnt, torch.tensor([1]))
     if not torch.all(shard_access_cnt == 1):
         raise CheckpointingException(
             f"Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}"
@@ -507,25 +486,6 @@ def _compute_shards_access(rank_sharding):
     return shard_access_cnt
 
 
-def _validate_sharding_for_key_flattened(tensors_by_shard):
-    all_slices = []
-    local_shape = tensors_by_shard[0].local_shape
-    for sharding in tensors_by_shard:
-        assert sharding.local_shape == local_shape
-        sharding: ShardedTensor
-        if not is_main_replica(sharding.replica_id):
-            continue
-
-        all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))
-
-    starts, stops = map(np.asarray, zip(*sorted(all_slices)))
-    expected_size = np.prod(local_shape)
-    if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
-        raise CheckpointingException(
-            f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
-        )
-
-
 def _validate_objects_for_key(sharded_objects: List[ShardedObject]):
     """Ensure uniqueness of saved objects."""
     unique_keys = [
diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py
index df1d7ae94db..55179ff3024 100644
--- a/megatron/core/distributed/distributed_data_parallel.py
+++ b/megatron/core/distributed/distributed_data_parallel.py
@@ -6,7 +6,6 @@
 
 import torch
 
-from .. import parallel_state
 from ..config_logger import has_config_logger_enabled, log_config_to_disk
 from ..fp8_utils import is_float8tensor, post_all_gather_processing
 from ..process_groups_config import ProcessGroupCollection
@@ -55,10 +54,15 @@ def __init__(
         # If using very large dp_sizes, make buckets larger to ensure that chunks used in NCCL
         # ring-reduce implementations are large enough to remain bandwidth-bound rather than
         # latency-bound.
+        # Setup process groups, handling both None and provided pg_collection values.
+        process_group_dict = ProcessGroupCollection.setup_process_groups_for_ddp(
+            pg_collection, config, ddp_config
+        )
+
+        # If bucket_size is not provided as an input, use sane default based on dp_group size.
+        dp_group = process_group_dict['dp_group']
         if ddp_config.bucket_size is None:
-            ddp_config.bucket_size = max(
-                40000000, 1000000 * parallel_state.get_data_parallel_world_size()
-            )
+            ddp_config.bucket_size = max(40000000, 1000000 * dp_group.size())
         # Set bucket_size to infinity if overlap_grad_reduce is False.
         if not ddp_config.overlap_grad_reduce:
             ddp_config.bucket_size = None
@@ -70,51 +74,26 @@ def __init__(
             f'Setting up DistributedDataParallel with config {self.ddp_config}',
         )
 
-        if pg_collection is None:
-            self.dp_group = parallel_state.get_data_parallel_group(
-                with_context_parallel=False, partial_data_parallel=False
-            )
-            self.dp_cp_group = parallel_state.get_data_parallel_group(
-                with_context_parallel=True, partial_data_parallel=False
-            )
-            self.intra_dp_cp_group = parallel_state.get_data_parallel_group(
-                with_context_parallel=True, partial_data_parallel=True
-            )
-            self.expt_dp_group = parallel_state.get_expert_data_parallel_group()
-            self.intra_expt_dp_group = parallel_state.get_expert_data_parallel_group(
-                partial_expert_data_parallel=True
-            )
-            if self.ddp_config.num_distributed_optimizer_instances > 1:
-                self.inter_dist_opt_group = (
-                    parallel_state.get_inter_distributed_optimizer_instance_group()
-                )
-            self.tp_group = parallel_state.get_tensor_model_parallel_group()
-            self.pp_group = parallel_state.get_pipeline_model_parallel_group()
-            self.ep_group = parallel_state.get_expert_model_parallel_group()
-        else:
-            # Setup process groups using DDP-specific helper method
-            process_groups = ProcessGroupCollection.setup_process_groups_for_ddp(
-                pg_collection, config, self.ddp_config
-            )
-
-            self.dp_group = process_groups['dp_group']
-            self.dp_cp_group = process_groups['dp_cp_group']
-            self.intra_dp_cp_group = process_groups['intra_dp_cp_group']
-            self.expt_dp_group = process_groups['expt_dp_group']
-            self.intra_expt_dp_group = process_groups['intra_expt_dp_group']
-            self.tp_group = process_groups['tp_group']
-            self.pp_group = process_groups['pp_group']
-            self.ep_group = process_groups['ep_group']
+        # Assign all required process groups
+        self.dp_group = process_group_dict['dp_group']
+        self.dp_cp_group = process_group_dict['dp_cp_group']
+        self.intra_dp_cp_group = process_group_dict['intra_dp_cp_group']
+        self.expt_dp_group = process_group_dict['expt_dp_group']
+        self.intra_expt_dp_group = process_group_dict['intra_expt_dp_group']
+        self.tp_group = process_group_dict['tp_group']
+        self.pp_group = process_group_dict['pp_group']
+        self.ep_group = process_group_dict['ep_group']
 
-            # Set inter_dist_opt_group if multiple optimizer instances
-            if self.ddp_config.num_distributed_optimizer_instances > 1:
-                self.inter_dist_opt_group = process_groups['inter_dist_opt_group']
+        # Set inter_dist_opt_group if multiple optimizer instances
+        if self.ddp_config.num_distributed_optimizer_instances > 1:
+            self.inter_dist_opt_group = process_group_dict['inter_dist_opt_group']
 
         # Turn off bucketing if we are on a pipeline stage that is not the first (since
         # data-parallel communication on these stages is not on the critical path), or if
         # disable_bucketing is True (e.g., we might not want to break up model parameters
         # into buckets for model chunks after the first in the interleaved schedule).
         self.bucket_size = self.ddp_config.bucket_size
+        self.force_all_reduce = False
         if isinstance(self.pp_group, list):
             pp_rank = self.pp_group[0].rank()
         else:
@@ -462,7 +441,9 @@ def hook(*unused):
                 param.grad = None
 
                 if self.ddp_config.overlap_grad_reduce:
-                    self.param_to_bucket_group[param].register_grad_ready(param)
+                    self.param_to_bucket_group[param].register_grad_ready(
+                        param, self.force_all_reduce
+                    )
 
         return hook
 
@@ -541,7 +522,7 @@ def start_grad_sync(self, *unused):
         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
             bucket_group.start_grad_sync()
 
-    def finish_grad_sync(self):
+    def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
         """
         Finishes grad sync (all-reduce or reduce-scatter) communication operations
         for all model gradients.
@@ -551,7 +532,7 @@ def finish_grad_sync(self):
         communication ops.
         """
         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
-            bucket_group.finish_grad_sync()
+            bucket_group.finish_grad_sync(force_all_reduce=force_all_reduce)
 
     def scale_gradients(self, scaling_factor: float):
         """Scale all gradients inside the buffers by `scaling_factor`."""
@@ -590,3 +571,41 @@ def broadcast_params(self):
                 src=torch.distributed.get_global_rank(data_parallel_group, 0),
                 group=data_parallel_group,
             )
+
+    def offload_grad_buffers(self, synchronize: bool = True, empty_cache: bool = True) -> None:
+        """
+        Free all grad_data tensors to release GPU memory.
+
+        Uses storage().resize_(0) to release memory while keeping tensor views intact.
+        All bucket.grad_data and param.main_grad views remain valid tensor objects
+        (though accessing them during offload is undefined behavior).
+
+        Args:
+            synchronize: Whether to call torch.cuda.synchronize() before freeing.
+            empty_cache: Whether to call torch.cuda.empty_cache() after freeing.
+        """
+        if synchronize:
+            torch.cuda.synchronize()
+
+        for buffer in self.buffers + self.expert_parallel_buffers:
+            buffer.offload_to_cpu(move_params=False, move_grads=True)
+
+        if empty_cache:
+            torch.cuda.empty_cache()
+
+    def restore_grad_buffers(self, synchronize: bool = True) -> None:
+        """
+        Reallocate grad_data tensors on GPU.
+
+        All existing views (bucket.grad_data, param.main_grad) automatically
+        become valid again since they share the same storage. The grad_data
+        is zeroed after reallocation.
+
+        Args:
+            synchronize: Whether to call torch.cuda.synchronize() after allocation.
+        """
+        for buffer in self.buffers + self.expert_parallel_buffers:
+            buffer.reload_from_cpu(move_params=False, move_grads=True)
+
+        if synchronize:
+            torch.cuda.synchronize()
diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py
index 27579cc5256..a37e3aaf1c2 100644
--- a/megatron/core/distributed/distributed_data_parallel_config.py
+++ b/megatron/core/distributed/distributed_data_parallel_config.py
@@ -105,7 +105,6 @@ class DistributedDataParallelConfig:
       The follwoing will be the expected number of SM usage for various cases.
       (Note that this is just a reference number and the number of SM usage could vary 
       on message size, communication domain size and nccl version.)
-      ----------------------------------------------------------
       | Communication domain | use_sharp | SM usage of "AG/RS" |
       |----------------------|-----------|---------------------|
       | NVL                  | N/A       | 4 / 5               |
@@ -113,7 +112,6 @@ class DistributedDataParallelConfig:
       | NVL+IB               | True      | 6 / 6               |
       | IB                   | False     | 1 / 4               |
       | IB                   | True      | 1 / 1               |
-      ----------------------------------------------------------
     """
 
     fsdp_double_buffer: bool = False
@@ -137,6 +135,14 @@ class DistributedDataParallelConfig:
       when nccl_ub is set.
     """
 
+    fsdp_manual_registration: bool = False
+    """If true, manually register the FSDP communication buffers to NCCL user buffer.
+      This option is only effective when use_megatron_fsdp and nccl_ub is set.
+      For symmetric registration with large models, the registration itself can take 
+      a significant amount of time. This option minimizes the number of registration calls
+      to minimize the registration time.
+    """
+
     delay_wgrad_compute: bool = False
     """Delay the weight gradient computation to improve batch-level communication overlapping"""
 
@@ -149,14 +155,6 @@ def __post_init__(self):
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
-            # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN.
-            # TODO: Remove this assertion when the issue is fixed.
-            assert (
-                self.overlap_param_gather
-            ), "--overlap-param-gather is required when using mxfp8 params"
-            assert (
-                self.overlap_grad_reduce
-            ), "--overlap-grad-reduce is required when using mxfp8 params"
 
         if self.nccl_ub:
             if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py
index 2f7507dd083..b9a18b7b13a 100644
--- a/megatron/core/distributed/finalize_model_grads.py
+++ b/megatron/core/distributed/finalize_model_grads.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from functools import partial
 from typing import Callable, List, Optional, Union
@@ -199,7 +199,11 @@ def _allreduce_word_embedding_grads(
             pp_group = parallel_state.get_pipeline_model_parallel_group()
 
     _allreduce_embedding_grad(
-        model, embd_group, pp_group, partial(_get_shared_word_embedding_weight, config=config)
+        model,
+        embd_group,
+        pp_group,
+        partial(_get_shared_word_embedding_weight, config=config),
+        config=config,
     )
 
 
@@ -210,6 +214,7 @@ def _allreduce_embedding_grad(
     pp_group: torch.distributed.ProcessGroup,
     weight_getter: Callable[[torch.nn.Module], Optional[torch.nn.Parameter]],
     skip_if_none: bool = True,
+    config: TransformerConfig = None,
 ):
     """Unified helper to all-reduce embedding parameters across pipeline stages.
 
@@ -236,6 +241,9 @@ def _allreduce_embedding_grad(
             model_module = model[0]
         elif is_pp_last_stage(pp_group):
             model_module = model[-1]
+        elif getattr(config, 'mtp_num_layers', None) is not None and config.mtp_num_layers > 0:
+            # Embedding for MTP layers is in the last virtual pipeline model parallel stage.
+            model_module = model[-1]
         else:  # We do not support an interleaved schedule for models with encoders yet.
             model_module = model[0]
 
@@ -396,6 +404,7 @@ def finalize_model_grads(
     model: List[torch.nn.Module],
     num_tokens: Optional[torch.Tensor] = None,
     pg_collection: Optional[ProcessGroupCollection] = None,
+    force_all_reduce: Optional[bool] = False,
 ):
     """
     All-reduce all model grads across DP replicas, layernorm grads for sequence parallelism,
@@ -438,7 +447,7 @@ def finalize_model_grads(
     if config.timers is not None:
         config.timers('all-grads-sync', log_level=1).start(barrier=config.barrier_with_L1_time)
     for model_chunk in model:
-        model_chunk.finish_grad_sync()
+        model_chunk.finish_grad_sync(force_all_reduce=force_all_reduce)
     if config.timers is not None:
         config.timers('all-grads-sync').stop()
 
diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
index 7432a7f9a36..d6384e70488 100644
--- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
+++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
@@ -111,6 +111,9 @@ def __init__(
                 dist_index=self.megatron_fsdp_dist_index,
                 calculate_per_token_loss=config.calculate_per_token_loss,
                 init_model_with_meta_device=config.init_model_with_meta_device,
+                enable_fine_grained_param_gather_hook=(
+                    config.fp8_recipe == "mxfp8" and ddp_config.fp8_param_gather
+                ),
             ),
         )
         self.param_and_grad_buffer = self.module.param_and_grad_buffer
@@ -123,6 +126,7 @@ def __init__(
         self.broadcast_params = self.module.broadcast_params
         self.module.state_dict_for_save_checkpoint = self.module.state_dict
         self.state_dict_for_save_checkpoint = self.state_dict
+        self.module.config = config
 
         self.sync_rng_states_across_tp_group()
 
diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md
index 9e036f22f67..bc4cdaa078e 100644
--- a/megatron/core/distributed/fsdp/src/README.md
+++ b/megatron/core/distributed/fsdp/src/README.md
@@ -62,20 +62,26 @@ Transform your PyTorch model to use Fully Sharded Data Parallelism with just a f
 
 ```python
 import torch
-from megatron_fsdp import fully_shard
-
-# Your existing model and optimizer
-model = YourModel()
-optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+from megatron_fsdp import (
+    fully_shard_model,
+    fully_shard_optimizer,
+)
 
-# Enable FSDP with Megatron-FSDP.
-# Alternatively, you can use fully_shard_model() followed by fully_shard_optimizer()!
-model, optimizer = fully_shard(
+"""
+Enable FSDP with Megatron-FSDP via the `fully_shard_*` API.
+"""
+# Shard your model.
+model = fully_shard_model(
     model,
-    optimizer,
-    device_mesh=device_mesh, # Your global DeviceMesh.
-    dp_shard_dim="dp_shard_cp", # Sharding across the flattened DP-CP mesh.
-    fsdp_unit_modules=[YourTransformerBlock], # Modules to shard.
+    fsdp_unit_modules=[
+        YourModelLayerClass,
+        "import.path.to.model.class.YourModelLayerClass",
+    ],
+    ...
+)
+# Shard your optimizer.
+optimizer = fully_shard_optimizer(
+    torch.optim.Adam(model.parameters(), lr=1e-3)
 )
 
 # Your model is now ready for distributed training!
@@ -86,7 +92,9 @@ model, optimizer = fully_shard(
 `fully_shard` / `fully_shard_model` / `fully_shard_optimizer` are simple entrypoints into `MegatronFSDP`.
 
 - No need to call `fully_shard` on all the sub-modules, just pass your sub-module classes or import paths to `fully_shard`!
-- One liner for the sharding change, which seamlessly preserves the identity of your training loop.
+- Seamlessly preserves the identity of your training loop with only a few lines of code and multiple options for initialization:
+  - `fully_shard_*` is a two-line change when sharding the model and optimizer separately.
+  - `fully_shard` is a one-line change for previously-initialized models and optimizers.
 
 Compare this with FSDP2:
 
@@ -94,24 +102,43 @@ Compare this with FSDP2:
 import torch
 from torch.distributed.fsdp import fully_shard
 
-# Your existing model and optimizer
+# Your existing model and optimizer.
 model = YourModel()
 optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
 
-# Enable FSDP with FSDP2
+# Enable FSDP with FSDP2.
 for module in model.modules():
-    if isinstance(module, YourTransformerBlock): # Sub-Modules to shard
+    # Sub-Modules to shard.
+    if isinstance(module, YourModelLayerClass):
         fully_shard(module)
 fully_shard(model)
 
 # Your model is now ready for distributed training!
 ```
 
-## `fully_shard` / `MegatronFSDP` API - Advanced Features
+### `torch.compile` Compatibility
+
+Megatron-FSDP is compatible with `torch.compile`, but this feature is still experimental and may introduce performance regressions in some workloads.
+
+## 📖 Megatron-FSDP Comprehensive Walkthrough
+
+### Import `megatron_fsdp`.
 
 ```python
 import torch
-from megatron_fsdp import fully_shard
+from megatron_fsdp import (
+    fully_shard_model,
+    fully_shard_optimizer,
+)
+```
+
+### Set up a distributed environment using `DeviceMesh`.
+
+`DeviceMesh` simplifies the construction of complex arrangements of devices
+to support various parallelisms.
+
+```python
+from torch.distributed.device_mesh import DeviceMesh
 
 # Initialize DeviceMesh.
 device_mesh = torch.distributed.device_mesh.init_device_mesh(
@@ -127,27 +154,35 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp")
 # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group.
 device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
 hsdp_group = device_mesh["hsdp"].get_group()
+
 # Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP.
-expert_device_mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda",
-    mesh_shape=(expt_dp_shard_size, expt_tp_size),
-    mesh_dim_names=("dp_shard", "tp"),
+expt_device_mesh = DeviceMesh.from_group(
+    [expt_dp_group, expt_tp_group],
+    device_type="cuda",
+    mesh=expt_mesh.tolist(),
+    mesh_dim_names=["dp_shard_cp", "tp"],
 )
+```
 
-# Fully-shards your model and distributes your optimizer.
-model, optimizer = fully_shard(
+### Convert models into fully-sharded `MegatronFSDP` models with `fully_shard_model`.
+
+This wraps the model in a MegatronFSDP class that schedules the sharding
+lifecycle of the model parameters and gradients during training and inference.
+
+```python
+model = fully_shard_model(
     # PyTorch (Root) Module
     model,
-    # PyTorch Optimizer
-    optimizer,
+    # Sharded Modules
+    fsdp_unit_modules=[...],
     # Device Mesh
     device_mesh=device_mesh
     # Always required for FSDP or HSDP.
     dp_shard_dim="dp_shard_cp",
     # Set this required argument to use HSDP instead of FSDP. Otherwise, set this to None.
     dp_outer_dim="dp_outer",
-    # Only required for TP-sensitive models (i.e. Megatron-LM / TransformerEngine) or when using DTensor-based TP.
-    # Otherwise, set this to None.
+    # Only required for TP-sensitive models (i.e. Megatron-LM / TransformerEngine)
+    # or when using DTensor-based TP. Otherwise, set this to None.
     tp_dim="tp",
     # Only required when using HSDP. Otherwise, set this to None.
     hybrid_fsdp_group=hsdp_group,
@@ -156,8 +191,6 @@ model, optimizer = fully_shard(
     # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3)
     zero_dp_strategy=3,
     outer_dp_sharding_strategy=1,
-    # Sharded Modules
-    fsdp_unit_modules=[...],
     # Initialize the model on devices in shards to avoid OOM. Requires device("meta")-init for model.
     init_model_with_meta_device=True,
     # Reduce gradients in FP32.
@@ -171,35 +204,97 @@ model, optimizer = fully_shard(
     # Preprocess state dict for DCP checkpointing. Required for Torch Distributed Checkpoint.
     preproc_state_dict_for_dcp_ckpt=True,
 )
+```
+
+The original `torch.nn.Module` can be accessed at `MegatronFSDP.module`.
+
+### Initialize and fully-shard your optimizer on the `MegatronFSDP` model.
+
+Initialize your optimizer on the Megatron-FSDP model distributed `Parameter`(s).
+If your optimizer has already been initialized, either use the `fully_shard`
+entrypoint, or use `optimizer.add_param_group({"params": model.parameters()})`
+after resetting your optimizer state via `optimizer.param_groups.clear()`
+and `optimizer.state.clear()`.
+
+```python
+optimizer = torch.optim.Optimizer(model.parameters())
+```
+
+`fully_shard_optimizer` modifies your `optimizer.step()`, `optimizer.zero_grad()`,
+and distributed optimizer parameters to punctually trigger scheduled FSDP operations
+for Megatron-FSDP.
+
+```python
+fully_shard_optimizer(
+    # PyTorch Optimizer
+    optimizer,
+    # Preprocess state dict for DCP checkpointing.
+    # Required for Torch Distributed Checkpoint.
+    preproc_state_dict_for_dcp_ckpt=True,
+)
+```
+
+Extended arguments to `step()` and `zero_grad()` control these FSDP operations:
+
+```python
+    optimizer.step(
+        ...,
+        # Sync all gradients before the optimizer step. Alternatively enabled using
+        # `sync_model_each_microbatch=True` in MegatronFSDP.
+        sync_grad_before_optimizer_step=True,
+        # After `optimizer.step()`, install optimized weights into MegatronFSDP's buffers.
+        install_optimized_model_weights=True,
+    )
+
+    optimizer.zero_grad(
+        ...,
+        # Also zero out MegatronFSDP's gradient accumulation buffers.
+        zero_grad_buffer=True
+    )
+```
+
+### `MegatronFSDP` Distributed Checkpointing
 
+Distributed checkpoints can be saved and loaded using Torch DCP. Alternatively,
+you can load non-distributed checkpoints before fully-sharding your model with
+any existing checkpoint utility compatible with PyTorch Modules.
+
+```python
 # Save model and optimizer state.
-torch.distributed.checkpoint.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, checkpoint_id=str(CKPT_DIR))
+torch.distributed.checkpoint.save(
+    {"model": model.state_dict(), "optimizer": optimizer.state_dict()},
+    checkpoint_id=str(CKPT_DIR)
+)
 
 # Load model and optimizer state.
 ckpt_state_dict = {"model": model.state_dict(), "optimizer": optimizer.state_dict()}
 torch.distributed.checkpoint.load(state_dict=ckpt_state_dict, checkpoint_id=str(CKPT_DIR))
-# model.load_state_dict(strict=False) is only necessary to ignore TE FP8 extra state
+# `model.load_state_dict(strict=False)` is only necessary to ignore TE FP8 extra state
 # that is missing from the DCP checkpoint but present in TEBaseModule.
 # Megatron-FSDP does not support TE FP8 extra state checkpointing with DCP.
 model.load_state_dict(ckpt_state_dict["model"], strict=False)
 optimizer.load_state_dict(ckpt_state_dict["optimizer"])
 ```
 
+## ⚙️ `fully_shard` / `MegatronFSDP` API - Advanced Features
+
+Megatron-FSDP's `fully_shard_*` API has a comprehensive set of arguments for fine-tuning your model's performance.
+
+- `fsdp_unit_modules` is a list of sub-module classes or `str` import-paths associated with modules that you want `MegatronFSDP` to fully-shard.
+  - Required if `1`, `2`, or `3` are specified as the sharding strategy. Defaults to `None`, in which case Megatron-FSDP will replicate the parameters similar to DDP.
 - `zero_dp_strategy` (and `outer_dp_sharding_strategy`) configure different degrees of zero-redundancy data parallelism as described in [ZeRO (Zero Redundancy Optimizer)](https://arxiv.org/abs/1910.02054). It reduces CUDA memory utilization during model training by distributing model parameters, gradients, and optimizer states across multiple devices in the DP `ProcessGroup`, and collectively communicating subsets of parameters and gradients to specific devices when needed for computation or differentiation. More aggressive sharding strategies will entail more communication overhead, with `no_shard` being the least memory efficient but most communication efficient, and `optim_grads_params` being the most memory efficient but least communication efficient. `outer_dp_sharding_strategy` has the same options, except for the (required) "outer" DP group (`dp_outer_dim` / `hybrid_fsdp_group`) when using [Hybrid-Sharded Data Parallelism (HSDP)](https://arxiv.org/pdf/2304.11277), and only `no_shard` (DP Replication) and `optim` (Optimizer State Hybrid Sharding, requires `zero_dp_strategy='optim_grads_params`) are supported.
   - Default: `optim_grads_params` or `3` for `zero_dp_strategy` and `no_shard` or `0` for `outer_dp_sharding_strategy`
   - `0` or `no_shard` implies that your model is not sharded. Similar memory usage to `DDP`.
   - `1` or `optim` implies that your optimizer state is sharded for distributed optimization. Similar to optimizer state sharding in `ZeRO-DP`.
   - `2` or `optim_grads` implies that your optimizer state and gradients are sharded. Similar to `ZeRO-2`.
   - `3` or `optim_grads_params` implies that your optimizer state, gradients, and training parameters are sharded. Similar to `ZeRO-3`.
-- `fsdp_unit_modules` is a list of sub-module classes or `str` import-paths associated with modules that you want `MegatronFSDP` to fully-shard.
-  - Required if `1`, `2`, or `3` are specified as the sharding strategy. Defaults to `None`.
-- `device_mesh` is a [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) that informs `MegatronFSDP` of your distributed environment for sharding in conjunction with hardware configuration and other parallelisms.
+- `device_mesh` is a [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) that informs `MegatronFSDP` of your distributed environment for sharding in conjunction with hardware configuration and other parallelisms. If not provided, `megatron_fsdp.fully_shard(_model)` will build an FSDP DeviceMesh for you automatically.
   - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding, and is commonly the flattened combination of the data parallel (DP) and context parallel (CP) sub-meshes.
     - When model parameters are replicated across DP-CP during the backward pass, resultant gradients across DP and CP ranks are reduced simultaneously, normalized by the DP-CP world size. For more information about how ring attention shards the sequence dimension through the attention and non-attention layers of the Transformer, refer to: [Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/abs/2310.01889).
   - `dp_outer_dim` is the name of the sub-mesh corresponding to the "outer" DP group, which is required for replication or sharding in HSDP. `fully_shard` will perform HSDP if `dp_outer_dim` is specified.
   - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP.
     - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053).
-  - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded coordinate system for the weight and gradient buffers. Required for HSDP.
+  - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded mesh coordinates for the weight and gradient buffers. Required for HSDP.
 - `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`.
   - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP).
   - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP.
@@ -216,17 +311,62 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"])
     - Both default to `True`.
 - `sync_model_each_microbatch` will trigger a `wait` (`MegatronFSDP.finish_grad_sync()`) on gradient reduction, parameter de-allocation, and optimizer parameter / gradient installation (in preparation for `optimizer.step()`) after every forward-backward pass. When using HSDP, parameters and gradients will be all-gathered and reduced respectively on the "outer" DP group each training step instead of each optimization cycle. This behavior is desirable for a transparent and user-friendly sharded training loop where post-backward transformations on the gradient and a clean compute / memory state are necessary between training iterations, but damages performance in situations where optimization is delayed (e.g. gradient accumulation) where the communications of the previous training iteration can be overlapped with the compute of the next training iteration. Will also override `is_last_microbatch` / `microbatch_count` logic in `MegatronFSDP`.
     - Defaults to `True` for `fully_shard`, but defaults to `False` when using the `MegatronFSDP` class directly.
-- `keep_fp8_transpose_cache_when_using_custom_fsdp` will keep the fp8 transpose cache when using `MegatronFSDP`. This option will cause (number of parameter $\times$ 1 Byte) of memory overhead, but can skip the weight transpose operation in the backward propagation. This feature will not give any benefit from the Blackwell architecture.
-    - **Only effective when using Megatron-LM.**
+- `enable_fine_grained_param_gather` modifies FSDP to all-gather parameters with per-Module granularity instead of collectively unsharding all sub-modules of a unit module in Megatron-FSDP.
+    - Defaults to `False`.
+- `keep_fp8_transpose_cache` will keep the fp8 transpose cache when using `MegatronFSDP`. This option will cause (number of parameter $\times$ 1 Byte) of memory overhead, but can skip the weight transpose operation in the backward propagation. This feature will not give any benefit from the Blackwell architecture.
     - Defaults to `False`.
 - `nccl_ub` will allocate and register the NCCL userbuffer for param and grad buffers. This option enables an SM-efficient NCCL algorithm that could improve the performance of overlapped computations. This flag will be much more effective when used together with SHARP if the FSDP communication includes both NVL and IB domains. Enabling this option will cause additional memory overhead due to the requirement to enable the `fsdp_double_buffer` option.
-    - **Only effective when using Megatron-LM.**
+    - **Only effective when using with Megatron-Core.**
+    - Defaults to `False`.
+    - By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registration.
+- `fsdp_manual_registration` will manually register the FSDP communication buffers with the NCCL user buffer. For symmetric registration with large models, the registration itself can take a significant amount of time. This option minimizes the number of registration calls to reduce the registration time. However, with this option enabled, you need to manually call the `ParamAndGradBuffer.manual_buffer_registration()` function after the first iteration. This is already implemented in the Megatron-LM training loop. In other use cases, users are expected to call this function themselves.
+    - **Only effective when using with Megatron-Core.**
+    - This option is only effective when `nccl_ub` is enabled.
+    - Defaults to `False`.
+- `disable_symmetric_registration` will disable NCCL window (i.e. symmetric) registration when using `nccl_ub`. 
     - Defaults to `False`.
-    - By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registraion.
-- `disable_symmetric_registration` will disable NCCL window (i.e. symmetric) registraion when using `nccl_ub`. 
-    - Dafaults to `False`.
 - `fsdp_double_buffer` will use persistently allocated double buffers for temporarily-defined memory needed in `MegatronFSDP` communications. Having persistent double buffers may increase peak VRAM utilization, but is required to register NCCL user buffers (`nccl_ub=True`) for `MegatronFSDP`. Currently, this is only supported for simple repetitive model structures such as GPT.
-    - **Only effective when using Megatron-LM.**
     - Defaults to `False`. Automatically overridden to `True` when `nccl_ub` is enabled.
 - `preproc_state_dict_for_dcp_ckpt` adds `model.state_dict()` and `optimizer.state_dict()` post-hooks that modify the model and optimizer state in preparation for `torch.distributed.checkpoint.{save,load}` ([Torch DCP](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html)) checkpointing. Specifically, it adds `__create_write_items__` and `__create_chunk_list__` methods to Tensors utilized by Torch DCP to redistribute parameters when saving and loading model and optimizer checkpoints. Can be deactivated should the user need a custom distributed checkpointing strategy.
     - Defaults to `True`.
+
+## 🧮 Using Megatron-FSDP with [`TransformerEngine`](https://github.com/NVIDIA/TransformerEngine)
+
+Megatron-FSDP natively supports mixed-precision activations and parameter sharding in conjunction with [TransformerEngine](https://github.com/NVIDIA/TransformerEngine).
+
+- Within the [`transformer_engine.pytorch.autocast(recipe: transformer_engine.common.recipe.Recipe)`](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html#transformer_engine.pytorch.autocast) context, model activations are converted based on the recipe.
+- Within the [`transformer_engine.pytorch.quantized_model_init(recipe: transformer_engine.common.recipe.Recipe)`](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html#transformer_engine.pytorch.quantized_model_init) context, TransformerEngine native modules (e.g. [`transformer_engine.pytorch.TransformerLayer`](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html#transformer_engine.pytorch.TransformerLayer)) have their parameters converted based on the recipe.
+    - Requires FP8 model activations, i.e. `transformer_engine.pytorch.autocast`.
+
+```python
+# FP8 Recipe
+fp8_recipe = transformer_engine.common.recipe.MXFP8BlockScaling(
+    fp8_format=transformer_engine.common.recipe.Format.HYBRID,
+)
+
+# Construct TransformerEngine model with FP8 parameters.
+with transformer_engine.pytorch.quantized_model_init(
+    recipe=fp8_recipe,
+    # Needed for FP8 parameters with Megatron-FSDP.
+    preserve_high_precision_init_val=True,
+):
+    te_model = transformer_engine.pytorch.TransformerLayer(...)
+
+# Fully-shard the model.
+mfsdp_model = fully_shard_model(
+    module=te_model,
+    fsdp_unit_modules=[te.pytorch.TransformerLayer],
+    # Only FSDP / ZeRO-3 supports FP8 parameters.
+    zero_dp_strategy=3,
+    # Needed for FP8 parameters. (Default is already True.)
+    preserve_fp32_weights=True,
+    # Needed for select FP8 recipes.
+    keep_fp8_transpose_cache=True,
+)
+
+# Evaluate and differentiate the model with FP8 activations.
+with transformer_engine.pytorch.autocast(recipe=fp8_recipe):
+    mfsdp_model(x).sum().backward()
+```
+
+ℹ️ `TransformerEngine` kernels have a fair bit of configuration constraints when using FP8-quantized parameters, such as using fused QKV parameters or defining activations and parameters with shapes compatible to FP8 CuBLAS kernels on supported hardware from NVIDIA. To properly initialize `TransformerLayer`, you can refer to the toy model used in our FP8 unit tests: `Megatron-LM/tests/unit_tests/distributed/fsdp/test_mfsdp_fully_shard.py::TestMegatronFsdpFullyShard::test_fully_shard_te_quantized`.
\ No newline at end of file
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py
index 5151ecabfb5..1380315a2e0 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py
@@ -100,7 +100,6 @@ class DistributedDataParallelConfig:
       The follwoing will be the expected number of SM usage for various cases.
       (Note that this is just a reference number and the number of SM usage could vary 
       on message size, communication domain size and nccl version.)
-      ----------------------------------------------------------
       | Communication domain | use_sharp | SM usage of "AG/RS" |
       |----------------------|-----------|---------------------|
       | NVL                  | N/A       | 4 / 5               |
@@ -108,7 +107,6 @@ class DistributedDataParallelConfig:
       | NVL+IB               | True      | 6 / 6               |
       | IB                   | False     | 1 / 4               |
       | IB                   | True      | 1 / 1               |
-      ----------------------------------------------------------
     """
 
     fsdp_double_buffer: bool = False
@@ -131,20 +129,20 @@ class DistributedDataParallelConfig:
       when nccl_ub is set.
     """
 
+    fsdp_manual_registration: bool = False
+    """If true, manually register the FSDP communication buffers to NCCL user buffer.
+      This option is only effective when use_megatron_fsdp and nccl_ub is set.
+      For symmetric registration with large models, the registration itself can take 
+      a significant amount of time. This option minimizes the number of registration calls
+      to minimize the registration time.
+    """
+
     def __post_init__(self):
         import os
 
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
-            # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN.
-            # TODO: Remove this assertion when the issue is fixed.
-            assert (
-                self.overlap_param_gather
-            ), "--overlap-param-gather is required when using mxfp8 params"
-            assert (
-                self.overlap_grad_reduce
-            ), "--overlap-grad-reduce is required when using mxfp8 params"
 
         if self.nccl_ub:
             if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
index e98362a1a03..df210f15f05 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py
@@ -15,10 +15,11 @@
 import logging
 import types
 from enum import IntEnum
-from typing import Optional, Sequence, Type
+from typing import Callable, Optional, Sequence, Type
 
 import torch
 from torch.distributed import DeviceMesh
+from torch.distributed.device_mesh import init_device_mesh
 
 from .megatron_fsdp import MegatronFSDP
 from .uneven_dtensor import preprocess_state_dict_for_uneven_dtensor
@@ -57,10 +58,22 @@ class ShardingStrategy(IntEnum):
     OPTIM_GRADS_PARAMS = 3
 
 
+def experimental_api(func: Callable) -> Callable:
+    """
+    Mark a function or class as experimental API in Megatron CI/CD.
+
+    TODO(@cspades): Copied from megatron.core.utils to avoid depending on MCore
+    for Megatron-FSDP. Should remove when the API is no longer experimental.
+    """
+    func._experimental_api = True
+    return func
+
+
+@experimental_api
 def fully_shard_model(
     module: torch.nn.Module,
-    device_mesh: DeviceMesh,
-    dp_shard_dim: str,
+    device_mesh: Optional[DeviceMesh] = None,
+    dp_shard_dim: Optional[str] = None,
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
@@ -84,16 +97,171 @@ def fully_shard_model(
     nccl_ub: bool = False,
     fsdp_double_buffer: bool = False,
     disable_symmetric_registration: bool = False,
-):
+    enable_fine_grained_param_gather: bool = False,
+) -> torch.nn.Module:
     """
-    Fully-shard the model for Megatron-FSDP.
+    Fully-shard the model for Megatron-FSDP. This wraps the model in a MegatronFSDP
+    class that schedules the sharding lifecycle of the model parameters and gradients
+    during training and inference.
+
+    The original `torch.nn.Module` can be accessed at `MegatronFSDP.module`.
 
     Args:
-        Subset of the arguments for fully_shard(). Refer to the docstring for fully_shard().
+        module (torch.nn.Module):
+            The PyTorch module fully-sharded and managed by Megatron-FSDP.
+
+        device_mesh (Optional[DeviceMesh]):
+            Device mesh object defining the topology for distributed training. If not provided,
+            Megatron-FSDP will build a default FSDP DeviceMesh.
+
+        dp_shard_dim (Optional[str]):
+            Name of the data parallel sharding sub-mesh in the device_mesh. Supports
+            a flattened DP-CP sub-mesh, in which case parameters, gradients, and
+            optimizer state will be sharded across both DP and CP ranks.
+
+        dp_outer_dim (Optional[str]):
+            Name of the "outer" DP sub-mesh in the device_mesh for hybrid-sharding (HSDP),
+            which supports "DP-Replicate" as well as optimizer state sharding (HFSDP).
+            Defaults to None. Required for HSDP, which is enabled by this argument.
+
+        tp_dim (Optional[str]):
+            Name of the tensor parallel sub-mesh in the device_mesh, which is necessary
+            for strided sharding between TP and FSDP (and fully-sharded HSDP) dimensions.
+            Defaults to None. Required if TP is used in the model, or if TransformerEngine
+            layers are utilized, as TE defaults to "TP=1".
+
+        hybrid_fsdp_group (Optional[torch.distributed.ProcessGroup]):
+            Cumulative data parallel process group for hybrid FSDP that can be manufactured
+            by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups
+            or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None.
+
+        expt_device_mesh (Optional[DeviceMesh]):
+            Expert parallel device mesh object defining the topology for MoE distributed training.
+            Utilizes the mesh dimension names specified by the *_dim arguments.
+
+        fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]):
+            List of (sub-)module classes or (sub-)module class import paths that are "units",
+            which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP.
+            In particular, FSDP unit module parameters can be "safely" deallocated after
+            the forward() or backward() pass without interfering with other computational
+            operations that rely on those parameters in the complete PyTorch model.
+            This information is utilized by Megatron-FSDP to optimally shard, gather, and
+            overlap communications during the forward and backward pass of the module.
+            Defaults to None, which is peak-memory-equivalent to DDP / "no_shard".
+
+        zero_dp_strategy (str | int):
+            Zero-redundancy sharding strategy for sharding data parallel parameters and gradients.
+            - "no_shard" / 0: No optimizer, gradient, or parameter sharding. Similar
+                memory usage to DDP.
+            - "optim" / 1: Shards optimizer states (and main weights for mixed precision training),
+                which is conceptually similar to optimizer state sharding in `ZeRO-DP`.
+            - "optim_grads" / 2: Shards gradients and optimizer states, which is conceptually
+                similar to "ZeRO-2".
+            - "optim_grads_params" / 3: Shards parameters, gradients and optimizer states, which
+                is conceptually similar to "ZeRO-3".
+            Defaults to "optim_grads_params" / 3.
+
+        outer_dp_sharding_strategy (str | int):
+            Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP).
+            Shares the same semantics as zero_dp_strategy, but only 'no_shard' / 0 (DP Replication)
+            and 'optim' / 1 (Optimizer State Hybrid Sharding) are supported, and 'optim' / 1 is only
+            supported when zero_dp_strategy='optim_grads_params'.
+            This option is only effective when HSDP is enabled, i.e. when dp_outer_dim is not None.
+            Defaults to "no_shard" / 0, which replicates model parameters across the dp_outer group.
+
+        device (Optional[torch.device]):
+            Target device for the sharded model. Used to migrate all parameters in the model
+            to an expected device. If init_model_with_meta_device=True, this argument is ignored.
+            Defaults to None.
+
+        init_model_with_meta_device (bool):
+            Utilized to initialize large models that do not fit on a single device, and requires
+            implementing a custom Module.reset_parameters() or Module._reset_parameters() method.
+            Defaults to False.
+
+        grad_reduce_in_fp32 (bool):
+            Whether to perform gradient reduction in FP32. Defaults to False.
+
+        preserve_fp32_weights (bool):
+            Whether to preserve FP32 optimization weights. Defaults to True.
+
+        overlap_grad_reduce (bool):
+            Whether to overlap gradient reduce-scatter (or all-reduce) with backward compute.
+            Defaults to True.
+
+        overlap_param_gather (bool):
+            Whether to overlap parameter all-gather with forward and backward compute.
+            Defaults to True.
+
+        sync_model_each_microbatch (bool): Whether to sync parameters and install gradients on
+            each training step. When disabled, Megatron-FSDP will overlap reduce-scatter with
+            subsequent compute and delay HSDP gather and reduce operations per optimization cycle,
+            which improves performance and throughput when using delayed optimization strategies
+            such as gradient accumulation. Defaults to True, can be modified before the model
+            forward / backward pass via MegatronFSDP.set_model_auto_sync(bool) or controlled
+            with the (no_)sync context managers or microbatch_count and is_last_microbatch.
+
+        preproc_state_dict_for_dcp_ckpt (bool):
+            Whether to preprocess the unevenly-sharded state dict for DCP checkpointing,
+            for both the model and the optimizer.
+            Defaults to True.
+
+        check_for_nan_in_grad (bool):
+            Whether to check for NaN values in gradients. Defaults to True.
+
+        average_in_collective (bool):
+            Whether to average gradients in collective communication. Defaults to False.
+
+        disable_bucketing (bool):
+            Whether to disable gradient bucketing optimization, which permits more granular
+            and precise communication of parameters and gradients. Defaults to False.
+
+        calculate_per_token_loss (bool):
+            Whether to calculate loss per token, which deactivates gradient scaling.
+            Defaults to False.
+
+        keep_fp8_transpose_cache (bool):
+            Whether to keep the FP8 transpose cache when using a Megatron FSDP.
+            Defaults to False.
+
+        nccl_ub (bool):
+            Whether to use NCCL UCC for communication. Defaults to False.
+
+        fsdp_double_buffer (bool):
+            Whether to use double buffer for FSDP. Defaults to False.
+
+        disable_symmetric_registration (bool):
+            Whether to disable symmetric (window) registration for NCCL UB registration.
+            This option forces conventional (local) UB registration when nccl_ub is set.
+            Defaults to False.
+
+        enable_fine_grained_param_gather (bool):
+            Whether to enable "fine-grained" param all-gather, which can improve performance
+            when using MXFP8 parameters with activation recomputation. Specifically, it
+            unshards parameters per-Module instead of unsharding all sub-modules of an FSDP
+            unit module simultaneously. Defaults to False.
 
     Returns:
         model (MegatronFSDP): The wrapped Megatron-FSDP model configured for FSDP.
     """
+    # If no DeviceMesh or FSDP dimension is provided, then build an FSDP DeviceMesh.
+    # Modify arguments into arguments necessary for vanilla FSDP.
+    if device_mesh is None:
+        if dp_shard_dim is None:
+            dp_shard_dim = "fsdp"
+        if tp_dim is None:
+            # Trivial TP dimension to seamlessly support TransformerEngine.
+            tp_dim = "tp"
+        # Deactivate DP-Outer, which needs to be consistent with Expert DeviceMesh.
+        dp_outer_dim = None
+        hybrid_fsdp_group = None
+        outer_dp_sharding_strategy = ShardingStrategy.NO_SHARD
+        device_mesh = init_device_mesh(
+            device_type="cuda",
+            mesh_shape=(torch.distributed.get_world_size(), 1),
+            mesh_dim_names=(dp_shard_dim, tp_dim),
+        )
+
     # Parse zero_dp_strategy and outer_dp_sharding_strategy.
     # TODO(@cspades): Integrate this Enum into MegatronFSDP.
     if zero_dp_strategy == ShardingStrategy.NO_SHARD:
@@ -136,7 +304,7 @@ def fully_shard_model(
     if _outer_fsdp_sharding and zero_dp_strategy != "optim_grads_params":
         # If sharding on outer DP using HSDP, then we must use HSDP buffers and
         # we must be fully-sharding on inner DP. HSDP is an extension of FSDP.
-        # FIXME(@shjwudp, @cspades): This is an unexpected lack of support.
+        # TODO(@shjwudp, @cspades): Requires various modifications to support.
         raise ValueError(
             f"Sharding with Hybrid (Fully) Sharded Data Parallel (HSDP) requires "
             "zero_dp_strategy to use FSDP ('optim_grads_params', 3), because "
@@ -201,6 +369,7 @@ def fully_shard_model(
         calculate_per_token_loss=calculate_per_token_loss,
         init_model_with_meta_device=init_model_with_meta_device,
         sync_model_each_microbatch=sync_model_each_microbatch,
+        enable_fine_grained_param_gather_hook=enable_fine_grained_param_gather,
     )
 
     # Register a state dict post-hook to add Torch DCP metadata for writing checkpoints.
@@ -227,45 +396,60 @@ def preprocess_dcp_and_te_extra_state(state_dict):
     return model
 
 
+@experimental_api
 def fully_shard_optimizer(
-    model: MegatronFSDP,
-    optimizer: torch.optim.Optimizer,
-    preproc_state_dict_for_dcp_ckpt: bool = True,
-):
+    optimizer: torch.optim.Optimizer, preproc_state_dict_for_dcp_ckpt: bool = True
+) -> torch.optim.Optimizer:
     """
     Fully shard the optimizer for Megatron-FSDP. This is an in-place operation on the optimizer
     instance, which modifies the optimizer to call methods exposed by the MegatronFSDP model API.
 
-    Args:
-        model (MegatronFSDP):
-            The Megatron-FSDP model to be fully sharded.
+    The optimizer should be registered on the MegatronFSDP distributed model parameters:
+    ```
+        # Fully-shard the model.
+        mfsdp_model = fully_shard_model(model, ...)
+
+        # Register the fully-sharded parameters with the optimizer.
+        # Use MegatronFSDP._replace_param_with_distributed_if_needed()
+        # to swap to the distributed optimizer state parameters.
+        optimizer = fully_shard_optimizer(Adam(params=mfsdp_model.parameters()))
+    ```
 
+    Args:
         optimizer (torch.optim.Optimizer):
-            The optimizer to be fully sharded.
+            (Distributed) optimizer for training the model, which is extended to automatically
+            execute necessary Megatron-FSDP operations during the training loop.
 
         preproc_state_dict_for_dcp_ckpt (bool):
             Whether to preprocess the state dict for DCP checkpointing. Defaults to True.
-    """
-    # Swap to the model distributed parameters for the optimizer state.
-    # MegatronFSDP.__init__() will call this method upon completion, but
-    # this is to ensure intended operation even if fully_shard_optimizer()
-    # is invoked post-initialization, in which case it is possible the user
-    # has swapped to the MegatronFSDP unsharded compute parameters.
-    model._replace_param_with_distributed_if_needed()
 
-    # Replace the optimizer module parameter references with
-    # Megatron-FSDP-managed parameters.
-    optimizer.param_groups.clear()
-    optimizer.state.clear()
-    optimizer.add_param_group({"params": model.module.parameters()})
+    Returns:
+        optimizer (torch.optim.Optimizer): The in-place modified optimizer for Megatron-FSDP.
+    """
+    # Extract a reference to MegatronFSDP from the first registered Parameter.
+    if not optimizer.param_groups:
+        raise ValueError(
+            f"[MegatronFSDP fully_shard_optimizer()] Provided optimizer doesn't "
+            f"have any registered parameters: {optimizer}"
+        )
+    first_mfsdp_param = optimizer.param_groups[0][next(iter(optimizer.param_groups[0]))][0]
+    if not getattr(first_mfsdp_param, "_megatron_fsdp_model", None):
+        raise ValueError(
+            f"[MegatronFSDP fully_shard_optimizer()] Could not retrieve a reference to "
+            f"MegatronFSDP from the first registered Parameter: {first_mfsdp_param} \n"
+            "Make sure the optimizer is registered to the MegatronFSDP distributed "
+            "parameters via MegatronFSDP._replace_param_with_distributed_if_needed() "
+            "before initializing the optimizer on the MegatronFSDP model. "
+        )
+    mfsdp_model = first_mfsdp_param._megatron_fsdp_model
 
     # Save a reference to the optimizer.step() and optimizer.zero_grad() methods.
     optimizer_step_base_func = type(optimizer).step
     optimizer_zero_grad_base_func = type(optimizer).zero_grad
 
     # Define a new optimizer.step() method that distributes optimizer state and gradients,
-    # waits for asynchronous gradient reduce-scatter work to be completed, and
-    # updates model weights.
+    # waits for asynchronous gradient reduce-scatter work to be completed, and updates
+    # model weights. These options can be turned off via arguments in optimizer.step().
     def megatron_fsdp_optimizer_step(optimizer, *args, **kwargs):
         # Extract extended kwargs.
         sync_grad_before_optimizer_step = kwargs.pop("sync_grad_before_optimizer_step", True)
@@ -278,18 +462,19 @@ def megatron_fsdp_optimizer_step(optimizer, *args, **kwargs):
         # NOTE: Only necessary if MegatronFSDP.model_auto_sync = False, in which case
         # gradient synchronization is not automatically handled by MegatronFSDP during
         # the post-backward hook and we need to synchronize manually.
-        if sync_grad_before_optimizer_step and not model.model_auto_sync:
-            model.finish_grad_sync()
+        if sync_grad_before_optimizer_step and not mfsdp_model.model_auto_sync:
+            mfsdp_model.finish_grad_sync()
 
         # Execute the base optimizer.step() on the model optimizer named parameters.
         optimizer_step_base_func(optimizer, *args, **kwargs)
 
         # Update the raw module training parameters with optimized values.
         if install_optimized_model_weights:
-            model.install_optimized_model_weights()
+            mfsdp_model.install_optimized_model_weights()
 
     # Define a new optimizer.zero_grad() method that zeros the gradient in both
-    # the optimizer as well as the Megatron-FSDP gradient buffer.
+    # the optimizer as well as the Megatron-FSDP gradient buffer. These options
+    # can be turned off via arguments in optimizer.zero_grad().
     def megatron_fsdp_optimizer_zero_grad(optimizer, *args, **kwargs):
         # Extract extended kwargs.
         zero_grad_buffer = kwargs.pop("zero_grad_buffer", True)
@@ -299,7 +484,7 @@ def megatron_fsdp_optimizer_zero_grad(optimizer, *args, **kwargs):
 
         # Zero out the gradient in the Megatron-FSDP gradient buffer.
         if zero_grad_buffer:
-            model.zero_grad_buffer()
+            mfsdp_model.zero_grad_buffer()
 
     # Override the optimizer.step() and optimizer.zero_grad() methods to support
     # Megatron-FSDP operations.
@@ -324,12 +509,15 @@ def megatron_fsdp_optimizer_zero_grad(optimizer, *args, **kwargs):
             lambda *args, **kwargs: preprocess_state_dict_for_uneven_dtensor(optimizer_state_dict)
         )
 
+    # Return the in-place modified optimizer.
+    return optimizer
+
 
 def fully_shard(
     module: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    device_mesh: DeviceMesh,
-    dp_shard_dim: str,
+    device_mesh: Optional[DeviceMesh] = None,
+    dp_shard_dim: Optional[str] = None,
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
@@ -353,6 +541,7 @@ def fully_shard(
     nccl_ub: bool = False,
     fsdp_double_buffer: bool = False,
     disable_symmetric_registration: bool = False,
+    enable_fine_grained_param_gather: bool = False,
 ) -> tuple[MegatronFSDP, torch.optim.Optimizer]:
     """
     Fully shard the model and the optimizer for Megatron-FSDP.
@@ -361,138 +550,7 @@ def fully_shard(
     be compatible with the Megatron-FSDP training strategy.
 
     Args:
-        module (torch.nn.Module):
-            The PyTorch module fully-sharded and managed by Megatron-FSDP.
-
-        optimizer (torch.optim.Optimizer):
-            (Distributed) optimizer for training the model, which is extended to automatically
-            execute necessary Megatron-FSDP operations during the training loop. If not provided,
-            the user is expected to utilize fully_shard_optimizer() or the MegatronFSDP API to
-            manually configure the model for optimization. Defaults to None.
-
-        device_mesh (DeviceMesh):
-            Device mesh object defining the topology for distributed training.
-
-        dp_shard_dim (str):
-            Name of the data parallel sharding sub-mesh in the device_mesh. Supports
-            a flattened DP-CP sub-mesh, in which case parameters, gradients, and
-            optimizer state will be sharded across both DP and CP ranks.
-            Required to enable the core functionality of Megatron-FSDP.
-
-        dp_outer_dim (Optional[str]):
-            Name of the "outer" DP sub-mesh in the device_mesh for hybrid-sharding (HSDP),
-            which supports "DP-Replicate" as well as optimizer state sharding (HFSDP).
-            Defaults to None. Required for HSDP, which is enabled by this argument.
-
-        tp_dim (Optional[str]):
-            Name of the tensor parallel sub-mesh in the device_mesh, which is necessary
-            for strided sharding between TP and FSDP (and fully-sharded HSDP) dimensions.
-            Defaults to None. Required if TP is used in the model, or if TransformerEngine
-            layers are utilized, as TE defaults to "TP=1".
-
-        hybrid_fsdp_group (Optional[torch.distributed.ProcessGroup]):
-            Cumulative data parallel process group for hybrid FSDP that can be manufactured
-            by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups
-            or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None.
-
-        expt_device_mesh (Optional[DeviceMesh]):
-            Expert parallel device mesh object defining the topology for MoE distributed training.
-
-        fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]):
-            List of (sub-)module classes or (sub-)module class import paths that are "units",
-            which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP.
-            In particular, FSDP unit module parameters can be "safely" deallocated after
-            the forward() or backward() pass without interfering with other computational
-            operations that rely on those parameters in the complete PyTorch model.
-            This information is utilized by Megatron-FSDP to optimally shard, gather, and
-            overlap communications during the forward and backward pass of the module.
-            Defaults to None, which is peak-memory-equivalent to DDP / "no_shard".
-
-        zero_dp_strategy (str | int):
-            Zero-redundancy sharding strategy for sharding data parallel parameters and gradients.
-            - "no_shard" / 0: No optimizer, gradient, or parameter sharding. Similar
-                memory usage to DDP.
-            - "optim" / 1: Shards optimizer states (and main weights for mixed precision training),
-                which is conceptually similar to optimizer state sharding in `ZeRO-DP`.
-            - "optim_grads" / 2: Shards gradients and optimizer states, which is conceptually
-                similar to "ZeRO-2".
-            - "optim_grads_params" / 3: Shards parameters, gradients and optimizer states, which
-                is conceptually similar to "ZeRO-3".
-            Defaults to "optim_grads_params" / 3.
-
-        outer_dp_sharding_strategy (str | int):
-            Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP).
-            Shares the same semantics as zero_dp_strategy, but only 'no_shard' / 0 (DP Replication)
-            and 'optim' / 1 (Optimizer State Hybrid Sharding) are supported, and 'optim' / 1 is only
-            supported when zero_dp_strategy='optim_grads_params'.
-            This option is only effective when HSDP is enabled, i.e. when dp_outer_dim is not None.
-            Defaults to "no_shard" / 0, which replicates model parameters across the dp_outer group.
-
-        device (Optional[torch.device]):
-            Target device for the sharded model. Used to migrate all parameters in the model
-            to an expected device. If init_model_with_meta_device=True, this argument is ignored.
-            Defaults to None.
-
-        init_model_with_meta_device (bool):
-            Utilized to initialize large models that do not fit on a single device, and requires
-            implementing a custom Module.reset_parameters() or Module._reset_parameters() method.
-            Defaults to False.
-
-        grad_reduce_in_fp32 (bool):
-            Whether to perform gradient reduction in FP32. Defaults to False.
-
-        preserve_fp32_weights (bool):
-            Whether to preserve FP32 optimization weights. Defaults to True.
-
-        overlap_grad_reduce (bool):
-            Whether to overlap gradient reduce-scatter (or all-reduce) with backward compute.
-            Defaults to True.
-
-        overlap_param_gather (bool):
-            Whether to overlap parameter all-gather with forward and backward compute.
-            Defaults to True.
-
-        sync_model_each_microbatch (bool): Whether to sync parameters and install gradients on
-            each training step. When disabled, Megatron-FSDP will overlap reduce-scatter with
-            subsequent compute and delay HSDP gather and reduce operations per optimization cycle,
-            which improves performance and throughput when using delayed optimization strategies
-            such as gradient accumulation. Defaults to True, can be modified before the model
-            forward / backward pass via MegatronFSDP.set_model_auto_sync(bool) or controlled
-            with the (no_)sync context managers or microbatch_count and is_last_microbatch.
-
-        preproc_state_dict_for_dcp_ckpt (bool):
-            Whether to preprocess the unevenly-sharded state dict for DCP checkpointing,
-            for both the model and the optimizer.
-            Defaults to True.
-
-        check_for_nan_in_grad (bool):
-            Whether to check for NaN values in gradients. Defaults to True.
-
-        average_in_collective (bool):
-            Whether to average gradients in collective communication. Defaults to False.
-            TODO: This is currently NOT supported!
-
-        disable_bucketing (bool):
-            Whether to disable gradient bucketing optimization, which permits more granular
-            and precise communication of parameters and gradients. Defaults to False.
-
-        calculate_per_token_loss (bool):
-            Whether to calculate loss per token, which deactivates gradient scaling.
-            Defaults to False.
-
-        keep_fp8_transpose_cache (bool):
-            Whether to keep the FP8 transpose cache when using a Megatron FSDP.
-            Defaults to False.
-
-        nccl_ub (bool):
-            Whether to use NCCL UCC for communication. Defaults to False.
-
-        fsdp_double_buffer (bool):
-            Whether to use double buffer for FSDP. Defaults to False.
-
-        disable_symmetric_registration (bool):
-            Whether to disable symmetric (window) registration for NCCL UB registration.
-            This option forces conventional (local) UB registration when nccl_ub is set.
+        Union of arguments from fully_shard_model and fully_shard_optimizer.
 
     Returns:
         torch.nn.Module: The wrapped Megatron-FSDP model configured for distributed training.
@@ -530,11 +588,19 @@ def fully_shard(
         nccl_ub=nccl_ub,
         fsdp_double_buffer=fsdp_double_buffer,
         disable_symmetric_registration=disable_symmetric_registration,
+        enable_fine_grained_param_gather=enable_fine_grained_param_gather,
     )
 
     # Extend optimizer methods to support Megatron-FSDP operations.
-    if optimizer is not None:
-        fully_shard_optimizer(model, optimizer, preproc_state_dict_for_dcp_ckpt)
+    # Replace the optimizer module parameter references with
+    # Megatron-FSDP-managed distributed parameters.
+    model._replace_param_with_distributed_if_needed()
+    optimizer.param_groups.clear()
+    optimizer.state.clear()
+    optimizer.add_param_group({"params": model.parameters()})
+    fully_shard_optimizer(
+        optimizer, preproc_state_dict_for_dcp_ckpt=preproc_state_dict_for_dcp_ckpt
+    )
 
     # Return model and optimizer.
     return model, optimizer
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
index 8a63e0f5cf7..bd13e76379e 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
@@ -23,6 +23,20 @@
 import torch.nn as nn
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
+from .mixed_precision import (
+    fp8_create_transpose_cache,
+    fp8_discard_transpose_cache,
+    is_float8tensor,
+)
+from .param_and_grad_buffer import (
+    AllGatherPipeline,
+    BucketingPolicy,
+    GradReducePipeline,
+    ParamAndGradBuffer,
+    PrefetchOrder,
+    override_sharded_param_methods_with_safety_checks,
+    to_local_if_dtensor,
+)
 from .utils import FSDPDistributedIndex
 
 logger = logging.getLogger(__name__)
@@ -34,23 +48,12 @@
     from megatron.core.distributed.distributed_data_parallel_config import (
         DistributedDataParallelConfig,
     )
-    from megatron.core.fp8_utils import is_float8tensor
     from megatron.core.utils import is_submodule
 except ImportError:
     # Megatron-LM is not installed, use Megatron-FSDP as a standalone module.
     logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.")
     from .distributed_data_parallel_config import DistributedDataParallelConfig
-    from .utils import is_float8tensor, is_submodule
-
-from .param_and_grad_buffer import (
-    AllGatherPipeline,
-    BucketingPolicy,
-    GradReducePipeline,
-    ParamAndGradBuffer,
-    PrefetchOrder,
-    override_sharded_param_methods_with_safety_checks,
-    to_local_if_dtensor,
-)
+    from .utils import is_submodule
 
 
 class TrainingState(Enum):
@@ -136,6 +139,9 @@ class MegatronFSDP(torch.nn.Module):
         disable_symmetric_registration (bool): Whether to disable symmetric (window) registration
             for NCCL userbuffer registration. This option will force to use conventional (local)
             userbuffer registration when nccl_ub is set.
+        enable_fine_grained_param_gather (bool): Whether to enable "fine-grained" param all-gather,
+            which can improve performance when using MXFP8 parameters with activation recomputation.
+
     Examples:
         >>> model = GPTModel(config)
         >>> model = MegatronFSDP(
@@ -168,6 +174,7 @@ def __init__(
         nccl_ub: bool = False,
         fsdp_double_buffer: bool = False,
         disable_symmetric_registration: bool = False,
+        enable_fine_grained_param_gather_hook: bool = False,
     ):
         super().__init__()
         # If device is not specified, use the current device.
@@ -217,6 +224,7 @@ def __init__(
 
         self.calculate_per_token_loss = calculate_per_token_loss
         self.init_model_with_meta_device = init_model_with_meta_device
+        self.enable_fine_grained_param_gather_hook = enable_fine_grained_param_gather_hook
 
         # Whether to constantly synchronize the model every training iteration,
         # which defaults to False to overlap communication with computation
@@ -264,8 +272,10 @@ def __init__(
             "optim",
         ]
         if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params":
-            # Default to overlapped NCCL communication when fully-sharding.
+            # Default to overlapped parameter gather when fully-sharding.
             self.ddp_config.overlap_param_gather = True
+        if self.ddp_config.data_parallel_sharding_strategy in ["optim_grads_params", "optim_grads"]:
+            # Default to overlapped gradient reduce-scatter when sharding gradients.
             self.ddp_config.overlap_grad_reduce = True
         if not self.is_delay_grad_reduce:
             # Gradient reduce-scatter must be overlapped when using sharding optimizer
@@ -286,8 +296,14 @@ def __init__(
         self._register_fsdp_hooks(self.module)
         self.microbatch_count = 0
 
+        # Add a reference from the distributed parameters to self for API
+        # accessibility, e.g. when attaching MegatronFSDP scheduled ops
+        # to the distributed optimizer.step() and optimizer.zero_grad().
         self.is_param_fsdp_distributed = False
         self._replace_param_with_distributed_if_needed()
+        for param in self.module.parameters():
+            # Attach MegatronFSDP reference to the parameter.
+            setattr(param, "_megatron_fsdp_model", self)
 
     def _check_module_parameter_types(self):
         """
@@ -400,6 +416,7 @@ def all_gather_and_wait_parameters_ready(
         prefetch=True,
         prefetch_order=PrefetchOrder.FORWARD_PASS_ORDER,
         wait_bucket_ready=True,
+        bwd=False,
     ):
         """
         All-gather parameters across the data parallel group and wait for
@@ -426,11 +443,14 @@ def all_gather_and_wait_parameters_ready(
                 and self.ddp_config.outer_dp_sharding_strategy != "no_shard"
                 and (self.microbatch_count == 0 or self.model_auto_sync)
             ),
+            bwd=bwd,
         )
         if wait_bucket_ready:
             for param in params:
                 bucket_id = self.param_and_grad_buffer.param_to_param_group[param]
-                ag_pipeline.wait_bucket_ready(bucket_id)
+                ag_pipeline.wait_bucket_ready(bucket_id, bwd)
+                if bwd and is_float8tensor(param):
+                    fp8_create_transpose_cache(param)
 
         for param in params:
             # This setting is needed to make FSDP store the weight object when used
@@ -475,6 +495,7 @@ def _register_fsdp_hooks(self, root_module):
         self.forward_pre_hooks = {}
         self.forward_hooks = {}
         self.backward_pre_hooks = {}
+        self.grad_acc_hooks = {}
 
         """
         An FSDP unit is a module designed to manage the lifecycle of model parameters
@@ -489,10 +510,30 @@ def _register_fsdp_hooks(self, root_module):
         """
         fsdp_unit_modules = self.fsdp_unit_modules
 
-        def release_module_parameters(module, *unused):
+        def release_module_parameters(module, bwd, lazy=False, *unused):
+            """
+            Release the parameters of a given module after completing the forward
+            and backward passes.
+
+            Args:
+                module: The module whose parameters should be released.
+                bwd (bool): Indicates if the release is triggered during the backward pass.
+                lazy (bool, optional): Determines when the parameter buffer (bucket) is released.
+                    - If False, the buffer is released immediately.
+                    - If True, the release is deferred until just before the all-gather pipeline
+                    requests a new buffer. The delayed release is performed by invoking
+                    `recycle_unused_buckets`.
+                *unused: Placeholder for any unused arguments.
+
+            Notes:
+                - The function maps each parameter to its corresponding buffer group,
+                then releases the associated bucket through the all-gather pipeline.
+                - If `ddp_config.keep_fp8_transpose_cache` is False, it also clears
+                the FP8 transpose cache associated with the module’s parameters.
+            """
             for param in module.parameters():
                 bucket_id = self.param_and_grad_buffer.param_to_param_group[param]
-                self.all_gather_pipeline.release_bucket(bucket_id)
+                self.all_gather_pipeline.release_bucket(bucket_id, bwd, lazy=lazy)
 
             if not self.ddp_config.keep_fp8_transpose_cache:
                 release_params_fp8_transpose_cache(module.parameters())
@@ -500,8 +541,7 @@ def release_module_parameters(module, *unused):
         def release_params_fp8_transpose_cache(params):
             for param in params:
                 if is_float8tensor(param):
-                    param._transpose_invalid = True
-                    param._transpose = None
+                    fp8_discard_transpose_cache(param)
 
         def _grad_acc(param):
             """
@@ -528,6 +568,7 @@ def _grad_acc(param):
                     param.main_grad = param.get_main_grad()
                     if param.grad is not None:
                         # Copy the gradient into the allocated main gradient bucket.
+                        # It will be reduce-scattered and accumulated into gbuf.
                         param.main_grad.copy_(to_local_if_dtensor(param.grad))
                         del param.grad
                     else:
@@ -537,6 +578,7 @@ def _grad_acc(param):
                 if not param.grad_added_to_main_grad:
                     if param.grad is not None:
                         # Add the gradient into the allocated main gradient bucket.
+                        # For unsharded gradients, this is gradient accumulation.
                         param.main_grad = param.get_main_grad()
                         param.main_grad.add_(to_local_if_dtensor(param.grad))
                         del param.grad
@@ -549,43 +591,78 @@ def _grad_acc(param):
 
         self._params_require_handle_grad = set()
 
-        def _post_backward(module, *unused):
+        def _post_backward_release_module(module, *unused):
             """
-            Deallocate the module parameters after the backward pass,
-            and reduce-scatter the gradients before the optimizer step.
+            Post-backward hook for an FSDP unit to release parameters and process
+            its gradients after the backward pass.
+
+            This hook:
+            - Validates that the module is an FSDP unit and that the data-parallel
+            sharding strategy is ``"optim_grads_params"``.
+            - Releases the module's parameters for the backward phase to free memory.
+            - Marks the module as IDLE in the training state machine.
             """
-            if isinstance(module, tuple(fsdp_unit_modules)):
-                if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params":
-                    # Deallocate the module parameters after the backward pass,
-                    # because we have our data-parallel gradients computed.
-                    release_module_parameters(module)
-                    module._training_state = TrainingState.IDLE
-                param_list = list(module.parameters())
-            else:
-                param_list = list(module.parameters(recurse=False))
+            assert isinstance(module, tuple(fsdp_unit_modules))
+            assert self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params"
 
-            # If the parameter is shared, we do not accumulate gradients
-            # here, as the gradients will be accumulated in the
-            # root post-backward hook.
-            param_list = [p for p in param_list if not getattr(p, "_is_shared", False)]
+            # Release parameters for this module after backward.
+            release_module_parameters(module, bwd=True)
+
+            # Transition this module back to the IDLE training state.
+            module._training_state = TrainingState.IDLE
 
-            # Write computed gradients into the allocated main gradient bucket for reduce-scatter.
+        @torch.compiler.disable
+        def _process_post_backward_gradients(param_list):
+            """
+            Process gradients for a list of parameters after the backward pass.
+
+            This helper accumulates gradients into the main_grad buffer and, when
+            appropriate, launches asynchronous reduce-scatter operations according
+            to the data-parallel sharding strategy and training phase.
+
+            Args:
+                param_list (List[torch.nn.Parameter]): Parameters whose gradients
+                    should be processed.
+
+            Behavior:
+                - Skips processing for shared parameters (those with ``_is_shared=True``),
+                since their gradients are handled by the root post-backward hook.
+                - Determines whether to reduce gradients based on:
+                    * Data-parallel sharding strategy (``"optim_grads"`` or
+                        ``"optim_grads_params"``).
+                    * Whether this is the last microbatch of the iteration.
+                    * Whether ``model_auto_sync`` is enabled.
+                - When reduction conditions are met, performs an asynchronous
+                reduce-scatter of gradients prior to the optimizer step, which
+                requires a subsequent call to ``finish_grad_sync()`` to complete.
+                - Marks parameters as processed by adding them to
+                    ``_params_require_handle_grad``.
+
+            Notes:
+                - With gradient-sharding strategies, gradient reduction occurs on
+                every backward propagation.
+                - Without gradient sharding, gradient reduction is deferred until
+                the last microbatch or when auto-sync is enabled.
+                - In hybrid FSDP configurations, an outer FSDP group gradient reduction
+                may be triggered.
+            """
+            # Filter out shared parameters whose gradients are handled by the root hook.
+            param_list = [p for p in param_list if not getattr(p, "_is_shared", False)]
             for param in param_list:
                 _grad_acc(param)
-                self._params_require_handle_grad.discard(param)
 
+            # Only reduce if gradients are sharded, or on the final microbatch, or when
+            # model_auto_sync is enabled.
             grad_reduce_every_bprop = self.ddp_config.data_parallel_sharding_strategy in [
                 "optim_grads",
                 "optim_grads_params",
             ]
-            # Only reduce if we are sharding gradients, or are on the final microbatch.
-            # If is_last_microbatch is not specified, then we should reduce gradients
-            # if model_auto_sync is enabled, otherwise wait until is_last_microbatch
-            # is actually specified by the user, context manager, or FW before reduction.
             is_last_microbatch = getattr(self, "is_last_microbatch", False)
+
             if grad_reduce_every_bprop or is_last_microbatch or self.model_auto_sync:
-                # Reduce-scatter the gradients asynchronously before the optimizer step.
-                # Requires calling finish_grad_sync() to wait for the reduce-scatter to complete.
+                # Launch asynchronous reduce-scatter of gradients before the optimizer
+                # step. This requires a later call to finish_grad_sync() to wait for
+                # completion.
                 self.grad_reduce_pipeline.reduce_gradients(
                     param_list,
                     suggested_queue_capacity=self.suggested_RS_queue_capacity,
@@ -595,6 +672,11 @@ def _post_backward(module, *unused):
                     ),
                 )
 
+            # Mark parameters as processed.
+            for param in param_list:
+                self._params_require_handle_grad.discard(param)
+
+        @torch.compiler.disable
         def _pre_forward_param_unshard(
             module: nn.Module, args: Tuple[Any, ...], kwargs: Dict[str, Any]
         ):
@@ -615,6 +697,9 @@ def _pre_forward_param_unshard(
                 # to allocate as little memory as possible for this forward pass.
                 param_list = list(module.parameters(recurse=False))
 
+            if self.enable_fine_grained_param_gather_hook:
+                param_list = list(module.parameters(recurse=False))
+
             # All-gather the parameters before the forward pass.
             self.all_gather_and_wait_parameters_ready(
                 params=param_list,
@@ -623,6 +708,7 @@ def _pre_forward_param_unshard(
             )
             return args, kwargs
 
+        @torch.compiler.disable
         def _register_post_backward_hook(
             post_backward_hook: callable,
             module: nn.Module,
@@ -630,12 +716,14 @@ def _register_post_backward_hook(
             kwargs: Dict[str, Any],
         ):
             """
-            Pre-forward hook utilized to attach a gradient reduction post-backward
-            hook to the module.
+            Register a post-backward hook for the given module by inserting an autograd
+            Function in front of it. Note that a post-backward hook implemented in this
+            way is not compatible with in-place modifications of the module's inputs,
+            since such operations can trigger an autograd error that
+            "the output is a view and is being modified in-place".
             """
-            # Register the backward function to reduce gradients after the backward pass.
-            # And for optim_grads_params, we need to release the parameters after the backward pass.
             if not torch.is_grad_enabled():
+                # No gradients / backward pass, don't attach the post-backward hook.
                 return args, kwargs
 
             # Preprocess the input arguments.
@@ -653,11 +741,10 @@ def _register_post_backward_hook(
                 return args, kwargs
 
             """
-            Bootstrapped identity autograd function that attaches a post-backward
-            "hook" to the module to trigger model resharding / deallocation and
-            gradient reduce-scatter immediately after the module backward pass has
-            completed to deallocate this layer's model and gradient memory before
-            the subsequent backward pass.
+            Identity autograd Function that attaches a post-backward "hook" to the
+            module, triggering parameter deallocation immediately after the module's
+            backward pass has completed in order to shard this layer's model memory
+            once the current backward stage is done.
             """
             inp_tensors = RegisterFSDPBackwardFunction.apply(
                 functools.partial(post_backward_hook, module), *inp_tensors
@@ -676,7 +763,10 @@ def _register_post_backward_hook(
 
         def _root_post_backward(*unused):
             # Make sure all the gradients are handled.
-            for param in self._params_require_handle_grad:
+            ordered_params = sorted(
+                list(self._params_require_handle_grad), key=lambda p: self.param_to_name[p]
+            )
+            for param in ordered_params:
                 _grad_acc(param)
 
             # Reduce the remaining gradients.
@@ -691,7 +781,7 @@ def _root_post_backward(*unused):
             is_last_microbatch = getattr(self, "is_last_microbatch", False)
             if grad_reduce_every_bprop or is_last_microbatch or self.model_auto_sync:
                 self.grad_reduce_pipeline.reduce_gradients(
-                    list(self._params_require_handle_grad),
+                    ordered_params,
                     suggested_queue_capacity=self.suggested_RS_queue_capacity,
                     outer_fsdp_group_grad_reduce=(
                         self.dist_index.use_hybrid_fsdp
@@ -714,30 +804,38 @@ def _root_post_backward(*unused):
             if self.model_auto_sync:
                 self.finish_grad_sync()
 
-        def _pre_backward(module: nn.Module, *unused):
+        @torch.compiler.disable
+        def _pre_backward_param_unshard(module: nn.Module, *unused):
             """
             Sub-module pre-backward hook to all-gather the module parameters
             before the backward pass.
             """
-            # Set the module's training state to PRE_BACKWARD to skip resharding
-            # and unsharding operations when performing activation recomputation
-            # / gradient checkpointing.
+            # Set the module's training state to PRE_BACKWARD.
             module._training_state = TrainingState.PRE_BACKWARD
+
             if isinstance(module, tuple(fsdp_unit_modules)):
-                # All-gather / unshard the module parameters before the backward pass.
-                self.all_gather_and_wait_parameters_ready(
-                    list(module.parameters()), prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER
-                )
+                param_list = list(module.parameters())
+            else:
+                param_list = list(module.parameters(recurse=False))
+
+            if self.enable_fine_grained_param_gather_hook:
+                param_list = list(module.parameters(recurse=False))
+
+            # All-gather / unshard the module parameters before the backward pass.
+            self.all_gather_and_wait_parameters_ready(
+                param_list, prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER, bwd=True
+            )
 
         self._root_pre_backward_hook_issued = False
 
         def _root_pre_backward(module: nn.Module, *unused):
-            """Marks the module's training state as 'pre_backward' before the
+            """Marks the module's training state as PRE_BACKWARD before the
             backprop, this function is registered on the root module.
 
-            This marking enables us to determine whether forward pass needs to
-            perform reshard/unshard operations in activation recomputation
-            scenarios.
+            This root pre-backward hook informs all modules to skip forward
+            pre-fetching in the pre-forward hooks (for activation recomputation)
+            and skip weight deallocation / resharding in the post-forward hooks
+            during the backward pass, which are instead performed by backward hooks.
             """
             if self._root_pre_backward_hook_issued:
                 return
@@ -746,7 +844,7 @@ def _root_pre_backward(module: nn.Module, *unused):
             if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params":
                 for module in root_module.modules():
                     if isinstance(module, tuple(fsdp_unit_modules)):
-                        # Set PRE_BACKWARD state to skip resharding and unsharding operations
+                        # Set PRE_BACKWARD state to skip resharding and forward pre-fetching
                         # when performing activation recomputation / gradient checkpointing.
                         module._training_state = TrainingState.PRE_BACKWARD
                 # set all param buckets can be released
@@ -754,7 +852,9 @@ def _root_pre_backward(module: nn.Module, *unused):
                 for bucket_id in range(ag_pipeline.num_buckets):
                     group = self.param_and_grad_buffer.parameter_groups[bucket_id]
                     if group.fsdp_unit_id is not None:
-                        ag_pipeline.bucket_can_be_released[bucket_id] = True
+                        ag_pipeline.bucket_can_be_released[
+                            ag_pipeline.get_bucket_key(bucket_id, bwd=False)
+                        ] = True
             # Track parameters that require gradient reduction and optimization.
             self._params_require_handle_grad = set()
             for param_group in self.param_and_grad_buffer.parameter_groups:
@@ -767,21 +867,31 @@ def _root_pre_backward(module: nn.Module, *unused):
             # the backward pass.
             torch.autograd.Variable._execution_engine.queue_callback(_root_post_backward)
 
+        @torch.compiler.disable
         def _post_forward(module: nn.Module, input: Any, output: Any):
             # When composed with module-hook-based activation recomputation, the
             # post-backward hook is responsible for resharding the module parameters
-            # after the forward pass. Skip resharding the module parameters in this case.
+            # after the forward pass. In this case, the resharding is performed lazily.
             if module._training_state == TrainingState.PRE_BACKWARD:
-                # Skip weight deallocation until the backward pass is complete
-                # during activation recomputation / gradient checkpointing.
-                return output
+                # Delay parameter resharding because this is currently running inside
+                # the activation recomputation forward. The corresponding backward
+                # pass may still need these parameters, and delaying avoids an
+                # unnecessary all-gather.
+                lazy_release = True
+            else:
+                lazy_release = False
+                module._training_state = TrainingState.IDLE
+
+            assert isinstance(
+                module, tuple(fsdp_unit_modules)
+            ), "_post_forward hook should only be registered on FSDP unit modules."
 
             # Release the module parameters after the forward pass to save memory.
-            release_module_parameters(module)
-            module._training_state = TrainingState.IDLE
+            release_module_parameters(module, bwd=False, lazy=lazy_release)
 
             return output
 
+        @torch.compiler.disable
         def _release_module_fp8_transpose_cache(module: nn.Module, *unused):
             release_params_fp8_transpose_cache(module.parameters(recurse=False))
 
@@ -791,6 +901,7 @@ def create_custom_backward_hook(module, custom_backward_handler):
             to the output tensor(s) of a module during a post-forward hook.
             """
 
+            @torch.compiler.disable
             def forward_hook(_module, inputs, output):
                 # Replace the output to avoid the output tensor being the same as
                 # the input tensor, which makes it impossible to identify which
@@ -818,21 +929,41 @@ def forward_hook(_module, inputs, output):
             # on the output tensor(s).
             return module.register_forward_hook(forward_hook)
 
+        def _register_pre_forward_param_unshard_hook(module):
+            """
+            Register the forward pre-hook to unshard parameters before the forward pass.
+            If we are not sharding anything, we do not have a model weight buffer and thus
+            have nothing to all-gather / un-shard.
+            """
+            if self.ddp_config.data_parallel_sharding_strategy != "no_shard":
+                self.forward_pre_hooks[f"{module._get_name()} parameter unshard"] = (
+                    module.register_forward_pre_hook(
+                        _pre_forward_param_unshard, prepend=True, with_kwargs=True
+                    )
+                )
+
+        def _register_pre_backward_param_unshard_hook(module):
+            """
+            Register the backward pre-hook to unshard FSDP unit module parameters
+            immediately before the backward pass via attaching a gradient-triggered
+            hook to the output tensor(s) of a module during a post-forward hook.
+            """
+            self.backward_pre_hooks[f"all-gather {module._get_name()} parameters"] = (
+                create_custom_backward_hook(module, _pre_backward_param_unshard)
+            )
+
         fsdp_modules = []
         for name, module in root_module.named_modules():
+            if self.enable_fine_grained_param_gather_hook:
+                _register_pre_forward_param_unshard_hook(module)
+                _register_pre_backward_param_unshard_hook(module)
+
             # Skip if the module is already registered in fsdp_modules.
             if any(is_submodule(module, fsdp_module) for fsdp_module in fsdp_modules):
                 continue
 
-            # Register the forward pre-hook to unshard parameters before the forward pass.
-            # If we are not sharding anything, we do not have a model weight buffer and thus
-            # have nothing to all-gather / un-shard.
-            if self.ddp_config.data_parallel_sharding_strategy != "no_shard":
-                self.forward_pre_hooks[f"module {name} parameter unshard"] = (
-                    module.register_forward_pre_hook(
-                        _pre_forward_param_unshard, prepend=True, with_kwargs=True
-                    )
-                )
+            if not self.enable_fine_grained_param_gather_hook:
+                _register_pre_forward_param_unshard_hook(module)
 
             if isinstance(module, tuple(fsdp_unit_modules)):
                 fsdp_modules.append(module)
@@ -843,12 +974,8 @@ def forward_hook(_module, inputs, output):
                     module.register_forward_hook(_post_forward, prepend=False)
                 )
 
-                # Register the backward pre-hook to unshard FSDP unit module parameters
-                # immediately before the backward pass via attaching a gradient-triggered
-                # hook to the output tensor(s) of a module during a post-forward hook.
-                self.backward_pre_hooks[f"all-gather module {name} parameters"] = (
-                    create_custom_backward_hook(module, _pre_backward)
-                )
+                if not self.enable_fine_grained_param_gather_hook:
+                    _register_pre_backward_param_unshard_hook(module)
             elif (
                 not self.ddp_config.keep_fp8_transpose_cache
                 and self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params"
@@ -861,15 +988,28 @@ def forward_hook(_module, inputs, output):
                     module.register_forward_hook(_release_module_fp8_transpose_cache, prepend=False)
                 )
 
-            # Register the post-backward hook to deallocate model parameters and
-            # reduce-scatter gradients immediately after the module backward pass
-            # has completed to conserve memory for the subsequent backward pass.
-            self.forward_pre_hooks[f"module {name} register post-backward hook"] = (
-                module.register_forward_pre_hook(
-                    functools.partial(_register_post_backward_hook, _post_backward),
-                    with_kwargs=True,
+            # Register the post-backward hook to deallocate model parameters
+            # and reduce-scatter gradients after the backward pass.
+            if isinstance(module, tuple(fsdp_unit_modules)):
+                if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params":
+                    self.forward_pre_hooks[f"module {name} register post-backward hook"] = (
+                        module.register_forward_pre_hook(
+                            functools.partial(
+                                _register_post_backward_hook, _post_backward_release_module
+                            ),
+                            with_kwargs=True,
+                        )
+                    )
+                grad_acc_param_list = list(module.parameters())
+            else:
+                grad_acc_param_list = list(module.parameters(recurse=False))
+
+            for param in grad_acc_param_list:
+                self.grad_acc_hooks[f"grad_acc and reduce for {self.param_to_name[param]}"] = (
+                    param.register_post_accumulate_grad_hook(
+                        lambda p: _process_post_backward_gradients([p])
+                    )
                 )
-            )
 
         # Register root module pre- and post-backward hooks in cases where the
         # forward function of root module is not called, but rather the forward
@@ -878,10 +1018,7 @@ def forward_hook(_module, inputs, output):
             if len(list(module.parameters())) != len(list(root_module.parameters())):
                 # Only attach to root sub-module.
                 continue
-            # Add a pre-backward hook to reshard / deallocate model parameters prior
-            # to the backward pass.
-            # Furthermore, add a gradient-triggered post-backward hook to reduce-scatter
-            # leftover gradients.
+            # Install the root pre-backward hook.
             self.backward_pre_hooks[f"{name} _root_pre_backward"] = create_custom_backward_hook(
                 module, _root_pre_backward
             )
@@ -986,7 +1123,7 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo
         else:
             self.synchronize_param_gather()
             for bucket_id in range(self.all_gather_pipeline.num_buckets):
-                self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id)
+                self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id, bwd=False)
                 group = self.param_and_grad_buffer.parameter_groups[bucket_id]
                 if group.model_weight_buffer is None:
                     continue
@@ -994,9 +1131,10 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo
                 if group.model_weight_buffer.is_data_distributed:
                     # If model weight is sharded, we wait for the all-gather to complete and
                     # then release the bucket immediately to save memory usage.
-                    self.all_gather_pipeline.wait_bucket_ready(bucket_id)
+                    self.all_gather_pipeline.wait_bucket_ready(bucket_id, False)
+
             for bucket_id in range(self.all_gather_pipeline.num_buckets):
-                self.all_gather_pipeline.wait_bucket_ready(bucket_id)
+                self.all_gather_pipeline.wait_bucket_ready(bucket_id, False)
 
     def start_grad_sync(self, *unused):
         """
@@ -1042,7 +1180,7 @@ def attach_grad_to_optimizer_state(self):
         """
         self.param_and_grad_buffer.update_main_grads()
 
-    def finish_grad_sync(self):
+    def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
         """
         Finishes grad sync (all-reduce or reduce-scatter) communication operations
         for all model gradients. Call prior to the optimization step to resolve
@@ -1051,6 +1189,9 @@ def finish_grad_sync(self):
         When overlap_grad_reduce is set to True, waits for asynchronous communication
         calls to complete. When overlap_grad_reduce is set to False, calls synchronous
         communication ops.
+
+        NOTE: force_all_reduce is included as an argument to maintain API compatibility
+        with DDP.force_grad_sync.
         """
         # Synchronize gradient reduce-scatter operations for all model gradients.
         self.synchronize_gradient_reduce()
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
new file mode 100644
index 00000000000..177e3b1caa2
--- /dev/null
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
@@ -0,0 +1,361 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from contextlib import nullcontext
+from importlib.metadata import version
+from typing import List, Optional, Tuple
+
+import torch
+from packaging.version import Version as PkgVersion
+
+logger = logging.getLogger(__name__)
+
+# Detect if Transformer Engine is installed
+try:
+    import transformer_engine  # pylint: disable=W0611
+    from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
+
+    HAVE_TE = True
+except (ImportError, ModuleNotFoundError):
+    TransformerEngineBaseModule = None
+    HAVE_TE = False
+    logger.info("Using Megatron-FSDP without Transformer Engine.")
+
+# Detect the Transformer Engine version
+try:
+    import transformer_engine as te
+
+    if hasattr(te, "__version__"):
+        TE_VERSION = PkgVersion(str(te.__version__))
+    else:
+        TE_VERSION = PkgVersion(version("transformer-engine"))
+except:
+    TE_VERSION = None
+
+# Detect the quantized_model_init or fp8_model_init context manager.
+if HAVE_TE:
+    try:
+        from transformer_engine.pytorch import quantized_model_init
+
+        QUANTIZED_MODEL_INIT_CLASS = quantized_model_init
+    except:
+        # Fallback to original FP8 model init.
+        from transformer_engine.pytorch import fp8_model_init
+
+        QUANTIZED_MODEL_INIT_CLASS = fp8_model_init
+else:
+    QUANTIZED_MODEL_INIT_CLASS = nullcontext
+
+# Detect the FP8 tensor class
+try:
+    from transformer_engine.pytorch.tensor import QuantizedTensor
+
+    HAVE_TE_FP8_TENSOR_CLASS = True
+    FP8_TENSOR_CLASS = QuantizedTensor
+except:
+    try:
+        from transformer_engine.pytorch.float8_tensor import Float8Tensor
+
+        HAVE_TE_FP8_TENSOR_CLASS = True
+        FP8_TENSOR_CLASS = Float8Tensor
+    except:
+        HAVE_TE_FP8_TENSOR_CLASS = False
+
+# Detect the MXFP8 tensor class
+try:
+    from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor
+
+    HAVE_TE_MXFP8TENSOR = True
+except:
+    HAVE_TE_MXFP8TENSOR = False
+
+# Detect the Blockwise FP8 tensor class
+try:
+    from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockwiseQTensor
+
+    HAVE_TE_BLOCKWISE_FP8TENSOR = True
+except:
+    HAVE_TE_BLOCKWISE_FP8TENSOR = False
+
+# Detect the "cast_master_weights_to_fp8" function of Transformer Engine
+try:
+    from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8
+
+    HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = True
+except:
+    HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = False
+
+    # Try to import multi_tensor_apply, used in the fallback of fp8 quantization.
+    try:
+        from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale
+
+        multi_tensor_scale_impl = multi_tensor_scale
+    except ImportError:
+        try:
+            import amp_C
+            from apex.multi_tensor_apply import multi_tensor_applier
+
+            multi_tensor_scale_impl = amp_C.multi_tensor_scale
+        except ImportError:
+            import warnings
+
+            warnings.warn(
+                "Transformer Engine and Apex are not installed. "
+                "Falling back to local implementations of "
+                "multi_tensor_applier and multi_tensor_scale"
+            )
+
+            def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
+                """Multi tensor op applier"""
+                return op(2048 * 32, noop_flag_buffer, tensor_lists, *args)
+
+            def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
+                """Works as a drop-in replacement for amp_C.multi_tensor_scale."""
+                for src, dst in zip(tensor_lists[0], tensor_lists[1]):
+                    dst.copy_(src * scale)
+
+            multi_tensor_applier = local_multi_tensor_applier
+            multi_tensor_scale_impl = local_multi_tensor_scale
+
+    def _multi_tensor_copy_this_to_that(
+        this: List[torch.Tensor],
+        that: List[torch.Tensor],
+        overflow_buf: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use multi-tensor-applier to copy values from one list to another.
+        We don't have a bfloat16 implementation so for now if the overflow_buf
+        is not provided, we default back to simple loop copy to be compatible
+        with bfloat16.
+        """
+        if overflow_buf is not None:
+            overflow_buf.fill_(0)
+            # Scaling with factor `1.0` is equivalent to copy.
+            multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0)
+        else:
+            for this_, that_ in zip(this, that):
+                that_.copy_(this_)
+
+
+# Detect the "post_all_gather_processing" function of Transformer Engine
+try:
+    from transformer_engine.pytorch.tensor.utils import post_all_gather_processing
+
+    HAVE_TE_POST_ALL_GATHER_PROCESSING = True
+except:
+    HAVE_TE_POST_ALL_GATHER_PROCESSING = False
+
+
+def is_te_min_version(vers, check_equality=True):
+    """Check if minimum version of `transformer-engine` is installed."""
+    if not isinstance(TE_VERSION, PkgVersion):
+        return False
+
+    if check_equality:
+        return TE_VERSION >= PkgVersion(vers)
+    else:
+        return TE_VERSION > PkgVersion(vers)
+
+
+def is_float8tensor(tensor: torch.Tensor) -> bool:
+    """Check if a tensor is a FP8 tensor."""
+    return HAVE_TE and isinstance(tensor, FP8_TENSOR_CLASS)
+
+
+def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool:
+    """Check if a tensor is a Blockwise FP8 tensor."""
+    return HAVE_TE_BLOCKWISE_FP8TENSOR and isinstance(tensor, Float8BlockwiseQTensor)
+
+
+def fp8_need_transpose_data(tensor: torch.Tensor) -> bool:
+    """Check if a FP8 tensor needs transpose data."""
+    return HAVE_TE_MXFP8TENSOR and isinstance(tensor, MXFP8Tensor)
+
+
+def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngineBaseModule) -> bool:
+    """Check if a FP8 tensor needs transpose data, for meta device init scenario."""
+    return HAVE_TE_MXFP8TENSOR and module.fp8_meta["recipe"].mxfp8()
+
+
+def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None:
+    """Discard the transpose cache of a FP8 tensor."""
+    assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor"
+
+    if hasattr(tensor, "_transpose_invalid"):
+        tensor._transpose_invalid = True
+        tensor._transpose = None
+    elif not fp8_need_transpose_data(tensor):
+        tensor.update_usage(rowwise_usage=True, columnwise_usage=False)
+
+
+def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None:
+    """Create the transpose cache of a FP8 tensor."""
+    if HAVE_TE_POST_ALL_GATHER_PROCESSING:
+        post_all_gather_processing(tensors)
+    else:
+        _fp8_create_transpose_cache_fallback(tensors)
+
+
+def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) -> None:
+    if not isinstance(tensors, list):
+        tensors = [tensors]
+    for tensor in tensors:
+        assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor"
+        if hasattr(tensor, "_create_transpose"):
+            tensor._create_transpose()
+        else:
+            tensor._create_columnwise()
+
+
+def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_transpose: bool = False) -> None:
+    """Set the raw data of a Transformer Engine Float8Tensor."""
+    assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor"
+
+    if set_transpose:
+        assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data"
+        data_attr = "_columnwise_data"
+    else:
+        data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data"
+
+    old_data = getattr(tensor, data_attr)
+    if old_data is not None:
+        assert (
+            old_data.dtype == data.dtype
+        ), f"The data types of raw data don't match {old_data.dtype} vs {data.dtype}"
+        assert (
+            old_data.shape == data.shape
+        ), f"Shape {old_data.shape} of old_data doesn't match {data.shape} of new_data"
+    setattr(tensor, data_attr, data)
+
+
+def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) -> torch.Tensor:
+    """Get the underlying raw storage of a FP8 tensor."""
+    assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor"
+
+    if get_transpose:
+        assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data"
+        data_attr = "_columnwise_data"
+    else:
+        data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data"
+
+    return getattr(tensor, data_attr)
+
+
+def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor:
+    """Dequantize a FP8 tensor to a higher precision."""
+    assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor"
+    assert is_te_min_version(
+        "2.0"
+    ), "Transformer Engine >= 2.0 is required for dequantizing parameters."
+    return tensor.dequantize()
+
+
+def fp8_quantize(
+    model_params: List[torch.Tensor],
+    main_params: List[torch.Tensor],
+    start_offsets: List[int],
+    data_parallel_group: torch.distributed.ProcessGroup,
+    fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]],
+) -> None:
+    """Quantize sharded parameters to FP8."""
+    if len(model_params) == 0:
+        return
+    fsdp_shard_model_params = [x[0] if x[1] is None else x for x in fsdp_shard_model_params]
+
+    if HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8:
+        cast_master_weights_to_fp8(
+            model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params
+        )
+    else:
+        _fp8_quantize_fallback(
+            model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params
+        )
+
+
+def _fp8_quantize_fallback(
+    model_params: List[torch.Tensor],
+    main_params: List[torch.Tensor],
+    start_offsets: List[int],
+    data_parallel_group: torch.distributed.ProcessGroup,
+    fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]],
+) -> None:
+    for model_param, main_param, start_offset, fsdp_shard_model_param in zip(
+        model_params, main_params, start_offsets, fsdp_shard_model_params
+    ):
+        if main_param is None:
+            continue
+
+        if fsdp_shard_model_param is not None:
+            shard_model_param = fsdp_shard_model_param
+        else:
+            shard_model_param = model_param._data.view(-1)[
+                start_offset : start_offset + main_param.numel()
+            ]
+
+        quantizer = model_param._quantizer
+        # When not using fp8 params, the main_param (fp32) is first cast to bf16/fp16, and then
+        # cast to fp8 during forward. This logic keeps numerical consistency with bf16 params.
+        main_param = main_param.to(model_param.dtype)
+        out = Float8Tensor(
+            shape=main_param.size(),
+            dtype=model_param.dtype,
+            requires_grad=False,
+            data=shard_model_param,
+            fp8_scale_inv=model_param._scale_inv,
+            fp8_dtype=model_param._fp8_dtype,
+            quantizer=quantizer,
+        )
+        quantizer.update_quantized(main_param, out)
+
+        amaxes = []
+        scales = []
+        scale_invs = []
+        for model_param in model_params:
+            quantizer = model_param._quantizer
+            amaxes.append(quantizer.amax.view(1))
+            scales.append(quantizer.scale.view(1))
+            scale_invs.append(model_param._scale_inv.view(1))
+            model_param._reset_caches()
+
+        dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda")
+
+        # Update scaling factors.
+        packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device)
+        packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))]
+        _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf)
+        torch.reciprocal(packed_scales, out=packed_scales)
+        _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf)
+
+        # Reduce amaxes.
+        # Note: Assume each param has a separate amax.
+        packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device)
+        packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))]
+        _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf)
+        torch.distributed.all_reduce(
+            packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group
+        )
+        _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf)
+
+
+def get_quantized_model_init_context_cls():
+    """
+    Get the TransformerEngine model parameter quantization context manager.
+    """
+    if QUANTIZED_MODEL_INIT_CLASS is nullcontext:
+        logger.warning(
+            f"quantized_model_init / fp8_model_init context was requested but does not exist. "
+            f"Verify TransformerEngine is installed (TE_INSTALLED={HAVE_TE})."
+        )
+    return QUANTIZED_MODEL_INIT_CLASS
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py
index a3a282a01c0..ef12b1edbe5 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 3
 PATCH = 0
-PRE_RELEASE = 'rc0'
+PRE_RELEASE = ""
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
index 6a294b69602..3ec117ebd9e 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
@@ -31,8 +31,19 @@
 import torch
 from torch.distributed import _coalescing_manager
 from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.distributed.tensor.device_mesh import _mesh_resources
 
+from .mixed_precision import (
+    fp8_discard_transpose_cache,
+    fp8_get_raw_data,
+    fp8_need_transpose_data,
+    fp8_need_transpose_data_for_meta_device_init,
+    fp8_quantize,
+    fp8_set_raw_data,
+    get_quantized_model_init_context_cls,
+    is_blockwise_float8tensor,
+    is_float8tensor,
+    is_te_min_version,
+)
 from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor
 from .utils import (
     _MODEL_PARALLEL_RNG_TRACKER_NAME,
@@ -51,32 +62,32 @@
     from megatron.core.distributed.distributed_data_parallel_config import (
         DistributedDataParallelConfig,
     )
-    from megatron.core.fp8_utils import (
-        is_float8tensor,
-        modify_underlying_storage,
-        quantize_param_shard,
-    )
     from megatron.core.tensor_parallel import get_cuda_rng_tracker
-    from megatron.core.utils import is_submodule, is_te_min_version
+    from megatron.core.utils import is_submodule, log_single_rank
 
+    HAVE_MCORE = True
     logger.info("Detected Megatron Core, using Megatron-FSDP with Megatron.")
 
 except ImportError:
     # Megatron-LM is not installed, use Megatron-FSDP as a standalone module.
     from .distributed_data_parallel_config import DistributedDataParallelConfig
-    from .utils import (
-        get_cuda_rng_tracker,
-        is_float8tensor,
-        is_submodule,
-        is_te_min_version,
-        modify_underlying_storage,
-        quantize_param_shard,
-    )
+    from .utils import get_cuda_rng_tracker, is_submodule
+
+    HAVE_MCORE = False
+
+    def log_single_rank(
+        logger_: logging.Logger, level: int, msg: str, *args, rank: int = 0, **kwargs
+    ):
+        """Fallback log_single_rank when Megatron Core is not available."""
+        if torch.distributed.is_initialized():
+            if torch.distributed.get_rank() == rank:
+                logger_.log(level, msg, *args, **kwargs)
+        else:
+            logger_.log(level, msg, *args, **kwargs)
 
     logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.")
 
 try:
-    from transformer_engine.pytorch import fp8_model_init
     from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
 
     HAVE_TE = True
@@ -217,11 +228,12 @@ def __exit__(self, *args):
         self.mem_allocator.__exit__(*args)
         for group in self.groups[1:]:
             backend = group._get_backend(torch.device("cuda", torch.cuda.current_device()))
-            if torch.distributed.get_rank() == 0:
-                logger.info(
-                    f"[MultiGroupUBRAllocator] Registering mem pool to group {group}, "
-                    f"group.group_desc:{group.group_desc}"
-                )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[MultiGroupUBRAllocator] Registering mem pool to group {group}, "
+                f"group.group_desc:{group.group_desc}",
+            )
             backend.register_mem_pool(self.pool)
 
 
@@ -817,7 +829,7 @@ def __init__(
         data_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
         dp_rank: Optional[int] = None,
         temporary_bucket_allocator: Optional[TemporaryBucketAllocator] = None,
-        is_dtype_float8: bool = False,
+        is_transpose_buffer: bool = False,
         gradient_scaling_factor: Optional[float] = None,
         chunk_size_factor: int = 1,
         mem_alloc_context: Optional[Callable] = None,
@@ -850,7 +862,7 @@ def __init__(
         self.temporary_bucket_allocator = (
             temporary_bucket_allocator if temporary_bucket_allocator else TemporaryBucketAllocator()
         )
-        self.is_dtype_float8 = is_dtype_float8
+        self.is_transpose_buffer = is_transpose_buffer
         self.gradient_scaling_factor = gradient_scaling_factor
         self.mem_alloc_context = mem_alloc_context if mem_alloc_context else nullcontext
 
@@ -946,11 +958,11 @@ def fetch_bucket(
             for p in self.params:
                 item_id = self.param_idx[p]
                 p = to_local_if_dtensor(p)
+                data = self.get_item_from_bucket(bucket, item_id).view(p.shape)
                 if is_float8tensor(p):
-                    p._data = self.get_item_from_bucket(bucket, item_id).view(p.shape)
+                    fp8_set_raw_data(p, data, self.is_transpose_buffer)
                 else:
-                    p.data = self.get_item_from_bucket(bucket, item_id).view(p.shape)
-
+                    p.data = data
         return bucket
 
     def free_bucket_storage(self):
@@ -1119,6 +1131,9 @@ def set_item(self, item_id: int, item_data: torch.Tensor) -> None:
         # When fully sharded, we need to get the slice of the item to be stored in this shard.
         # Otherwise, we can just flatten the entire item since this buffer contains
         # the entire bucket.
+        if is_float8tensor(item_data):
+            item_data = fp8_get_raw_data(item_data, self.is_transpose_buffer)
+
         if self.is_data_distributed:
             # Get the coordinates of the slice of the item that is contained in this shard.
             slice_start, slice_end = self._get_item_slice_in_shard(item_id)
@@ -1225,6 +1240,8 @@ class ParameterGroup:
             Factor determining chunk size for grouped parameter processing.
         model_weight_buffer (Optional[DataParallelBuffer]):
             Buffer used to store model weights for data-parallel operations.
+        transpose_weight_buffer (Optional[DataParallelBuffer]):
+            Buffer used to store transpose weights for data-parallel operations.
         main_weight_buffer (Optional[DataParallelBuffer]):
             Buffer used to store main model weights for data-parallel operations.
         main_grad_buffer (Optional[DataParallelBuffer]):
@@ -1244,6 +1261,7 @@ class ParameterGroup:
     fsdp_unit_id: Optional[int] = None
     chunk_size_factor: int = 1
     model_weight_buffer: Optional[DataParallelBuffer] = None
+    transpose_weight_buffer: Optional[DataParallelBuffer] = None
     main_weight_buffer: Optional[DataParallelBuffer] = None
     main_grad_buffer: Optional[DataParallelBuffer] = None
     hsdp_wbuf: Optional[DataParallelBuffer] = None
@@ -1314,12 +1332,10 @@ def _does_param_require_new_bucket(param):
     parameter_groups = []
     for name, param in module.named_parameters():
         # We need this information to correctly dynamically allocate Tensors!
+        is_fp8 = is_float8tensor(param)
+        is_fp8_meta_device_init = meta_device_init_fp8_params.get(name, (False, False))[0]
         param_attrs = dict(
-            dtype=(
-                "float8"
-                if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False)
-                else param.dtype
-            ),
+            dtype="float8" if (is_fp8 or is_fp8_meta_device_init) else param.dtype,
             is_expert_param=is_expert_parameter(name, param),
             requires_grad=param.requires_grad,
             fsdp_unit_id=None,
@@ -1567,6 +1583,7 @@ def __init__(
             reset_parameters_for_meta_device_init_module
         )
         self.ubr_groups = None
+        self.already_registered = False
         # User buffer registration related settings
         if self.ddp_config.nccl_ub:
             assert nccl_allocator is not None, (
@@ -1583,14 +1600,17 @@ def __init__(
             NCCL_MEMORY_POOL = nccl_allocator.create_nccl_mem_pool(
                 symmetric=not self.ddp_config.disable_symmetric_registration
             )
-            if torch.distributed.get_rank() == 0:
-                logging.info(
-                    f"[Rank {torch.distributed.get_rank()}] Created NCCL memory pool for \
-                        UserBuffer Registration"
-                )
-                logging.info(
-                    f"[Rank {torch.distributed.get_rank()}] FSDP double buffer is enabled."
-                )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Rank {torch.distributed.get_rank()}] Created NCCL memory pool for "
+                "UserBuffer Registration",
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Rank {torch.distributed.get_rank()}] FSDP double buffer is enabled.",
+            )
             # Select the communicator groups to register FSDP buffers.
             self.ubr_groups = [self.dist_index.get_fsdp_group(is_expert_parallel=False)]
             if self.dist_index.get_fsdp_group(is_expert_parallel=True) is not None:
@@ -1599,28 +1619,42 @@ def __init__(
             if self.dist_index.get_outer_fsdp_group() is not None:
                 # Outer/Inter-FSDP group when using hybrid FSDP
                 self.ubr_groups.append(self.dist_index.get_outer_fsdp_group())
-
-            if torch.distributed.get_rank() == 0:
-                logging.info(
-                    f"[ParamAndGradBuffer] FSDP UBRegistration Groups ({len(self.ubr_groups)}):"
+            if (
+                self.dist_index.get_fsdp_group(
+                    is_expert_parallel=False, independent_all_gather=True
                 )
+                is not None
+            ):
+                # All-gather group used when overlapping all-gather and gradient reduction.
+                self.ubr_groups.append(
+                    self.dist_index.get_fsdp_group(
+                        is_expert_parallel=False, independent_all_gather=True
+                    )
+                )
+
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[ParamAndGradBuffer] FSDP UBRegistration Groups ({len(self.ubr_groups)}):",
+            )
             # All ranks in each group must participate in the collective to avoid deadlock.
             for i, group in enumerate(self.ubr_groups):
-                if torch.distributed.get_rank() == 0:
-                    logging.info(
-                        f"Group [{i+1}/{len(self.ubr_groups)}] \
-                            group.group_desc: {group.group_desc}, group.size(): {group.size()}"
-                    )
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    f"Group [{i+1}/{len(self.ubr_groups)}] "
+                    f"group.group_desc: {group.group_desc}, group.size(): {group.size()}",
+                )
                 torch.distributed.barrier(group=group, async_op=False)
-                if torch.distributed.get_rank() == 0:
-                    logging.info(
-                        f"Call Success with the group [{i+1}/{len(self.ubr_groups)}] \
-                            group.group_desc: {group.group_desc}"
-                    )
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    f"Call Success with the group [{i+1}/{len(self.ubr_groups)}] "
+                    f"group.group_desc: {group.group_desc}",
+                )
             # Call barrier from the global communitcator group
             torch.distributed.barrier(async_op=False)
-            if torch.distributed.get_rank() == 0:
-                logging.info(f"Call Success with the global communicator group")
+            log_single_rank(logger, logging.INFO, "Call Success with the global communicator group")
 
         # If using nccl_ub, it returns a function that registers buffers to the NCCL memory pool
         # Buffer is registered to data_parallel_group and expert_data_parallel_group if it exists
@@ -1641,7 +1675,10 @@ def __init__(
                     # to determine whether this parameter is fp8 or not.
                     fp8_meta_index = m.param_init_meta[name].fp8_meta_index
                     if m.primary_weights_in_fp8 and fp8_meta_index is not None:
-                        meta_device_init_fp8_params[self.param_to_name[param]] = True
+                        meta_device_init_fp8_params[self.param_to_name[param]] = (
+                            True,
+                            fp8_need_transpose_data_for_meta_device_init(m),
+                        )
 
         # Get the parameter groups.
         (self.parameter_groups, self.param_to_param_group, self.bucket_to_bucket_group) = (
@@ -1670,6 +1707,10 @@ def get_mem_alloc_context(self, groups=None, symmetric=True):
                 groups = [self.dist_index.get_fsdp_group(is_expert_parallel=False)]
 
             if NCCL_ALLOCATOR == "MCORE":
+                if self.ddp_config.fsdp_manual_registration:
+                    return functools.partial(
+                        nccl_allocator.MemPoolAllocatorWithoutRegistration, NCCL_MEMORY_POOL
+                    )
                 if len(groups) == 1:
                     # register buffers to the default group directly using nccl memory allocator
                     mem_alloc_context = functools.partial(
@@ -1686,6 +1727,12 @@ def get_mem_alloc_context(self, groups=None, symmetric=True):
                         symmetric=symmetric,
                     )
             elif NCCL_ALLOCATOR == "APEX":
+                if self.ddp_config.fsdp_manual_registration:
+                    logging.warning(
+                        "FSDP manual registration is not supported for APEX NCCL allocator."
+                        "falling back to default registration. "
+                        "Please use Megatron Core NCCL allocator for manual registration."
+                    )
                 if symmetric:
                     logging.warning(
                         "Symmetric registration is not supported for APEX NCCL allocator."
@@ -1709,6 +1756,41 @@ def get_mem_alloc_context(self, groups=None, symmetric=True):
         else:
             return nullcontext
 
+    def manual_buffer_registration(self):
+        """
+        Manually register the FSDP communication buffers to NCCL user buffer.
+        """
+        assert self.ddp_config.nccl_ub, "NCCL UBR is not enabled"
+        assert self.ddp_config.fsdp_double_buffer, "FSDP double buffer is not enabled"
+        assert self.ddp_config.fsdp_manual_registration, "FSDP manual registration is not enabled"
+        assert not self.already_registered, "Mem pool is already registered"
+
+        self.already_registered = True
+
+        global NCCL_MEMORY_POOL
+        torch.cuda.synchronize()
+        torch.distributed.barrier(async_op=False)
+        torch.cuda.synchronize()
+
+        for group in self.ubr_groups:
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[MCORE][FSDP][Manual REG] Registering mem pool to group {group},"
+                f"group.group_desc:{group.group_desc}, group.size(): {group.size()}",
+            )
+            nccl_allocator.register_mem_pool(
+                NCCL_MEMORY_POOL,
+                group,
+                symmetric=not self.ddp_config.disable_symmetric_registration,
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[MCORE][FSDP][Manual REG] Registered mem pool to group {group},"
+                f"group.group_desc:{group.group_desc}, group.size(): {group.size()}",
+            )
+
     def _log_parameter_groups(self):
         """Compact log of FSDP parameter groups and their parameters."""
 
@@ -1725,6 +1807,7 @@ def _bytes_to_mb(bytes_val: int) -> str:
             numel = sum(to_local_if_dtensor(p).shape.numel() for p in group.params)
             buffers = {
                 "weight": group.model_weight_buffer,
+                "transpose_weight": group.transpose_weight_buffer,
                 "main_weight": group.main_weight_buffer,
                 "grad": group.main_grad_buffer,
             }
@@ -1755,8 +1838,7 @@ def _bytes_to_mb(bytes_val: int) -> str:
             f"Total pad: {_bytes_to_mb(total_padded_bytes)}"
         )
 
-        if torch.distributed.get_rank() == 0:
-            logger.info("\n".join(log_lines))
+        log_single_rank(logger, logging.INFO, "\n".join(log_lines))
 
     def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
         """
@@ -1794,12 +1876,18 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
             self.weight_alloc = FixedPoolAllocator(
                 name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM
             )
+            self.transpose_weight_alloc = FixedPoolAllocator(
+                name="fsdp_fp8_transpose_params",
+                fsdp_param_groups=self.parameter_groups,
+                size=UB_BUFFER_NUM,
+            )
             self.main_grad_alloc = FixedPoolAllocator(
                 name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM
             )
             self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units
         else:
             self.weight_alloc = StorageResizeBasedBucketAllocator()
+            self.transpose_weight_alloc = StorageResizeBasedBucketAllocator()
             self.main_grad_alloc = None
             self.double_buf_units = []
 
@@ -1832,6 +1920,18 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     is_expert_parallel=group.is_expert_param
                 )
 
+            # When --create-all-gather-group is enabled, use a separate process group for
+            # all-gather operations (model_weight_buffer) to enable overlap with gradient reduction
+            # operations (main_grad_buffer). This avoids head-of-line blocking between forward
+            # all-gather and backward reduce-scatter on the same communicator.
+            model_wbuf_dp_group = main_buf_dp_group
+            if not group.is_expert_param and not should_create_hfsdp_wbuf_and_gbuf:
+                ag_group = self.dist_index.get_fsdp_group(
+                    is_expert_parallel=False, independent_all_gather=True
+                )
+                if ag_group is not None:
+                    model_wbuf_dp_group = ag_group
+
             gradient_scaling_factor = (
                 self.gradient_scaling_factor
                 if not group.is_expert_param
@@ -1839,8 +1939,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
             )
             # Check if the parameter group is FP8.
             one_param = group.params[0]
-            is_dtype_float8 = is_float8tensor(one_param) or meta_device_init_fp8_params.get(
-                self.param_to_name[one_param], False
+            is_dtype_float8 = (
+                is_float8tensor(one_param)
+                or meta_device_init_fp8_params.get(self.param_to_name[one_param], (False, False))[0]
             )
             if is_dtype_float8:
                 param_dtype = torch.uint8
@@ -1849,6 +1950,16 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                 param_dtype = group.params[0].dtype
                 grad_dtype = param_dtype
 
+            # Check if the parameter group needs a transpose buffer for model weights.
+            # Currently, only mxfp8 needs it.
+            need_transpose_data = is_float8tensor(one_param) and fp8_need_transpose_data(one_param)
+            need_transpose_data_for_meta_device_init = meta_device_init_fp8_params.get(
+                self.param_to_name[one_param], (False, False)
+            )[1]
+            should_create_transpose_weight_buffer = (
+                need_transpose_data or need_transpose_data_for_meta_device_init
+            )
+
             # Check if the parameter group requires a grad buffer or main weight buffer.
             should_create_grad_buffer_or_main_weight_buffer = (
                 not self.only_create_grad_buffer_and_main_weight_buffer_for_param_requires_grad
@@ -1861,17 +1972,33 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     self.ddp_config,
                     group.params,
                     is_data_distributed=is_model_weight_buffer_distributed
-                    and main_buf_dp_group.size() > 1,
+                    and model_wbuf_dp_group.size() > 1,
                     dtype=param_dtype,
                     device=self.device,
-                    data_parallel_group=main_buf_dp_group,
-                    is_dtype_float8=is_dtype_float8,
+                    data_parallel_group=model_wbuf_dp_group,
+                    is_transpose_buffer=False,
                     temporary_bucket_allocator=self.weight_alloc,
                     bucket_id=group_id,
                     chunk_size_factor=group.chunk_size_factor,
                     mem_alloc_context=self.mem_alloc_context,
                     **main_buf_extra_kwargs,
                 )
+                if should_create_transpose_weight_buffer:
+                    group.transpose_weight_buffer = DataParallelBuffer(
+                        self.ddp_config,
+                        group.params,
+                        is_data_distributed=is_model_weight_buffer_distributed
+                        and main_buf_dp_group.size() > 1,
+                        dtype=param_dtype,
+                        device=self.device,
+                        data_parallel_group=main_buf_dp_group,
+                        is_transpose_buffer=True,
+                        temporary_bucket_allocator=self.transpose_weight_alloc,
+                        bucket_id=group_id,
+                        chunk_size_factor=group.chunk_size_factor,
+                        mem_alloc_context=self.mem_alloc_context,
+                        **main_buf_extra_kwargs,
+                    )
 
             # Initialize the main weight buffer.
             if should_create_grad_buffer_or_main_weight_buffer and preserve_fp32_weights:
@@ -1903,7 +2030,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     dtype=torch.float32 if grad_reduce_in_fp32 else grad_dtype,
                     device=self.device,
                     data_parallel_group=main_buf_dp_group,
-                    is_dtype_float8=False,
+                    is_transpose_buffer=False,
                     temporary_bucket_allocator=self.main_grad_alloc,
                     gradient_scaling_factor=gradient_scaling_factor,
                     bucket_id=group_id,
@@ -1927,7 +2054,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     dtype=wbuf.dtype,
                     device=wbuf.device,
                     data_parallel_group=hsdp_buf_dp_group,
-                    is_dtype_float8=wbuf.is_dtype_float8,
+                    is_transpose_buffer=False,
                     temporary_bucket_allocator=self.weight_alloc,
                     bucket_id=group_id,
                     chunk_size_factor=group.chunk_size_factor,
@@ -1943,6 +2070,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                     ),
                 )
 
+                if group.transpose_weight_buffer is not None:
+                    raise NotImplementedError("HSDP for transpose buffer is not implemented yet")
+
                 if should_create_grad_buffer_or_main_weight_buffer:
                     # Initialize the HSDP grad buffer.
                     gbuf = group.main_grad_buffer
@@ -1954,7 +2084,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                         dtype=gbuf.dtype,
                         device=gbuf.device,
                         data_parallel_group=hsdp_buf_dp_group,
-                        is_dtype_float8=gbuf.is_dtype_float8,
+                        is_transpose_buffer=False,
                         temporary_bucket_allocator=self.main_grad_alloc,
                         gradient_scaling_factor=gradient_scaling_factor,
                         bucket_id=group_id,
@@ -1998,8 +2128,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                 f"CUDA params numel: {cuda_params_numel / 1_000_000:.2f} M, "
                 f"CPU params numel: {cpu_params_numel / 1_000_000:.2f} M"
             )
-            if torch.distributed.get_rank() == 0:
-                logger.info(log_str)
+            log_single_rank(logger, logging.INFO, log_str)
 
         # Initialize the model weight buffer data of each parameter group.
         # Specifically, replace the Torch module's parameter data with tensors
@@ -2037,6 +2166,20 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                             torch.empty(wbuf.data_size, dtype=wbuf.dtype, device=self.device)
                         )
                 bucket = wbuf.fetch_bucket()
+
+            tbuf = group.transpose_weight_buffer
+            if tbuf:
+                with self.mem_alloc_context():
+                    if group.hsdp_wbuf:
+                        raise NotImplementedError(
+                            "HSDP for transpose buffer is not implemented yet"
+                        )
+                    else:
+                        tbuf.init_data(
+                            torch.empty(tbuf.data_size, dtype=tbuf.dtype, device=self.device)
+                        )
+                transpose_bucket = tbuf.fetch_bucket()
+
             mbuf = group.main_weight_buffer
             if mbuf:
                 # Manually instantiate an empty tensor into the main weight buffer.
@@ -2090,25 +2233,41 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                             if not self.ddp_config.keep_fp8_transpose_cache:
                                 for _param in m.parameters(recurse=False):
                                     if is_float8tensor(_param):
-                                        _param._transpose_invalid = True
-                                        _param._transpose = None
+                                        fp8_discard_transpose_cache(_param)
                     # Raise error if a meta parameter still exists after initialization.
                     assert not p.is_meta, (self.param_to_name[p], module_reset_flag)
 
+                    p_local = to_local_if_dtensor(p)
+
                     # Copy the model weight parameter tensor into the buffer.
                     # When distributed, this shards and preserves the data across all ranks.
-                    wbuf.set_item(item_id, to_local_if_dtensor(p))
+                    wbuf.set_item(item_id, p_local)
+                    if tbuf:
+                        tbuf.set_item(item_id, p_local)
 
                     # Retrieve the newly allocated parameter data from the global bucket.
                     # Attach the bucket-allocated parameter data to the module parameter,
                     # to use the bucket-allocated data for autograd and NCCL.
-                    new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view(
-                        to_local_if_dtensor(p).shape
-                    )
-                    if is_float8tensor(p):
-                        # Needed to instantiate FP8 parameters. Requires installing
-                        # TransformerEngine.
-                        modify_underlying_storage(p, new_param_data)
+                    new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view(p_local.shape)
+                    if tbuf:
+                        new_transpose_data = tbuf.get_item_from_bucket(
+                            transpose_bucket, item_id
+                        ).view(p_local.shape)
+                    else:
+                        new_transpose_data = None
+
+                    if is_float8tensor(p_local):
+                        old_param_data = fp8_get_raw_data(p_local)
+                        assert old_param_data._base is None
+                        new_param_data.detach().copy_(old_param_data)
+                        fp8_set_raw_data(p_local, new_param_data)
+                        del old_param_data
+                        if new_transpose_data is not None:
+                            old_transpose_data = fp8_get_raw_data(p_local, True)
+                            assert old_transpose_data._base is None
+                            new_transpose_data.detach().copy_(old_transpose_data)
+                            fp8_set_raw_data(p_local, new_transpose_data, True)
+                            del old_transpose_data
                     elif isinstance(p, DTensor):
                         old_param_data = p._local_tensor.data
                         p._local_tensor.data = new_param_data
@@ -2146,7 +2305,12 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                         # the (high-precision) main weight buffer.
                         # Nothing else needs to be done, because the main weights
                         # do not require autograd operations, only possibly sharding.
-                        mbuf.set_item(item_id, to_local_if_dtensor(p))
+                        p_local = to_local_if_dtensor(p)
+                        assert not is_float8tensor(p_local), (
+                            self.param_to_name[p],
+                            "fp8 param should use get_high_precision_init_val method.",
+                        )
+                        mbuf.set_item(item_id, p_local)
 
             if wbuf and wbuf.is_data_distributed:
                 # Free the memory backing the temporarily-allocated bucket associated
@@ -2158,6 +2322,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
                 # before forward activations and gradients are allocated in training.
                 wbuf.free_bucket_storage()
 
+            if tbuf and tbuf.is_data_distributed:
+                tbuf.free_bucket_storage()
+
         # Allocate the main_weight buffer and main_grad buffer data in one buffer.
         if self.buffer_all_in_one:
             with self.mem_alloc_context():
@@ -2281,6 +2448,7 @@ def _reset_parameters(self, old_params, new_params):
                 group.params[item_id] = new_p
                 for buf in [
                     group.model_weight_buffer,
+                    group.transpose_weight_buffer,
                     group.main_weight_buffer,
                     group.main_grad_buffer,
                     group.hsdp_wbuf,
@@ -2328,6 +2496,7 @@ def _init_distributed_params(self):
         dist_main_weight = {}
         for pg in self.parameter_groups:
             wbuf = pg.model_weight_buffer
+            tbuf = pg.transpose_weight_buffer
             mbuf = pg.main_weight_buffer
             for item_id, orig_param in enumerate(pg.params):
                 param_name = self.param_to_name[orig_param]
@@ -2355,6 +2524,7 @@ def _init_distributed_params(self):
                     )
                     dist_main_weight[param_name] = dist_param
                 elif wbuf:
+                    assert tbuf is None, "Transpose buffer should only exist when main params exist"
                     dist_param = make_fsdp_dtensor(
                         local_tensor=wbuf.get_item(item_id, only_shard=sharded_optimizer_state),
                         param=orig_param,
@@ -2474,8 +2644,9 @@ def update_main_grads(self):
                 item_id, only_shard=sharded_optimizer_state
             )
             if group.main_weight_buffer is not None:
-                # Convert the gradient to the main weight buffer dtype.
-                optimizer_grad = optimizer_grad.to(param.dtype)
+                if not getattr(self, "use_precision_aware_optimizer", False):
+                    # Convert the gradient to the main weight buffer dtype.
+                    optimizer_grad = optimizer_grad.to(param.dtype)
 
             if name not in self.dist_main_grad:
                 # Register the gradient as a distributed tensor.
@@ -2513,7 +2684,12 @@ def num_buckets(self):
 
     @torch.no_grad()
     def copy_main_weights_to_model_weights(self):
-        """Update the model weights from the main weights."""
+        """
+        Update the model weights from the main weights.
+
+        If FP8 parameters are utilized, this function will quantize the high-precision
+        main weights prior to installation into the model compute weight buffers.
+        """
         dense_param_quantize_kwargs = {
             "model_params": [],
             "main_params": [],
@@ -2523,9 +2699,54 @@ def copy_main_weights_to_model_weights(self):
         expert_param_quantize_kwargs = copy.deepcopy(dense_param_quantize_kwargs)
         data_parallel_group = None
         expert_data_parallel_group = None
+        clear_quantize_kwargs = lambda kwargs: [d.clear() for d in kwargs.values()]
+
+        def _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs):
+            if len(dense_param_quantize_kwargs["model_params"]) > 0:
+                # If we have FP8 parameters, we need to quantize them.
+                fp8_quantize(data_parallel_group=data_parallel_group, **dense_param_quantize_kwargs)
+
+            if len(expert_param_quantize_kwargs["model_params"]) > 0:
+                # If we have FP8 expert parameters, we need to quantize them.
+                fp8_quantize(
+                    data_parallel_group=expert_data_parallel_group, **expert_param_quantize_kwargs
+                )
+
+            clear_quantize_kwargs(dense_param_quantize_kwargs)
+            clear_quantize_kwargs(expert_param_quantize_kwargs)
+
+        # Special handling of blockwise FP8
+        BATCH_QUANT_MEMORY_LIMIT_BYTES = 5 * 1024**3  # 5 GB
+        blockwise_fp8_weight_buffers = []
+        blockwise_fp8_param_buffers = []
+
+        def _batch_quantize_blockwise_fp8_params(
+            dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers
+        ):
+            if len(blockwise_fp8_param_buffers) == 0:
+                return
+
+            # Copy original param shards into their blockwise FP8 working buffers
+            for bufs in blockwise_fp8_param_buffers:
+                bufs["bucket_param"].copy_(bufs["param"])
+
+            # Apply FP8 quantization to blockwise FP8 parameters
+            _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs)
+
+            # Copy quantized params back from working buffers to original param tensors
+            for bufs in blockwise_fp8_param_buffers:
+                bufs["param"].copy_(bufs["bucket_param"])
+            blockwise_fp8_param_buffers.clear()
+
+            # Free bucket storage for blockwise FP8 weight buffers
+            for wbuf in blockwise_fp8_weight_buffers:
+                wbuf.free_bucket_storage()
+            blockwise_fp8_weight_buffers.clear()
+
         for pg in self.parameter_groups:
             mbuf = pg.main_weight_buffer
             wbuf = pg.model_weight_buffer
+            tbuf = pg.transpose_weight_buffer
             if mbuf is None:
                 continue
 
@@ -2541,44 +2762,96 @@ def copy_main_weights_to_model_weights(self):
             shard_offsets_in_fp8 = quantize_func_kwargs["start_offsets"]
             shard_model_params = quantize_func_kwargs["fsdp_shard_model_params"]
 
+            has_blockwise_fp8_param = False
             for param in pg.params:
                 item_id = mbuf.param_idx[param]
                 if wbuf:
                     if wbuf.is_data_distributed or mbuf.is_data_distributed:
                         model_param = wbuf.get_item(item_id, only_shard=True)
+                        if tbuf:
+                            transpose_param = tbuf.get_item(item_id, only_shard=True)
+                        else:
+                            transpose_param = None
                         main_weight = mbuf.get_item(item_id, only_shard=True)
                     else:
                         model_param = wbuf.get_item(item_id)
+                        if tbuf:
+                            transpose_param = tbuf.get_item(item_id)
+                        else:
+                            transpose_param = None
                         main_weight = mbuf.get_item(item_id)
                 else:
                     assert not mbuf.is_data_distributed
                     model_param = to_local_if_dtensor(param)
                     main_weight = mbuf.get_item(item_id)
 
+                # TODO(@kunlunl, @cspades): Currently, we only support FP8 parameters
+                # for FSDP, i.e. fully-sharded compute parameters with a high-precision
+                # main weight buffer. Would it be possible to add if branches here to
+                # quantize the original param (no_shard) or wbuf data (optim, optim_grads)
+                # for a seamless user experience and coverage for ZeRO-1 and ZeRO-2?
+
+                if is_blockwise_float8tensor(param):
+                    fp8_params.append(param)
+                    if model_param.numel() == 0:
+                        # Empty parameter.
+                        shard_fp32_from_fp8.append(None)
+                        shard_offsets_in_fp8.append(None)
+                        shard_model_params.append([None, None])
+                    else:
+                        shard_fp32_from_fp8.append(main_weight)
+                        shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0])
+                        bucket = wbuf.fetch_bucket()
+                        b_model_param = wbuf.get_item_from_bucket(bucket, item_id)[
+                            slice(*wbuf.locate_item_in_global_item(item_id))
+                        ]
+                        assert (
+                            transpose_param is None
+                        ), "Blockwise FP8 does not support transpose param."
+                        shard_model_params.append([b_model_param, None])
+                        assert b_model_param.numel() == model_param.numel(), (
+                            f"Blockwise FP8 bucket param numel {b_model_param.numel()} does"
+                            f" not match model param numel {model_param.numel()}"
+                            f" name: {self.param_to_name[param]}"
+                        )
+                        blockwise_fp8_param_buffers.append(
+                            {"bucket_param": b_model_param, "param": model_param}
+                        )
+                        has_blockwise_fp8_param = True
+                    continue
+
                 if is_float8tensor(param):
                     fp8_params.append(param)
                     if model_param.numel() == 0:
+                        # Empty parameter.
                         shard_fp32_from_fp8.append(None)
                         shard_offsets_in_fp8.append(None)
-                        shard_model_params.append(None)
+                        shard_model_params.append([None, None])
                     else:
                         shard_fp32_from_fp8.append(main_weight)
                         shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0])
-                        shard_model_params.append(model_param)
+                        shard_model_params.append([model_param, transpose_param])
                     continue
 
                 if model_param.numel() > 0:
                     model_param.data.copy_(main_weight.view(model_param.shape))
 
-        if len(dense_param_quantize_kwargs["model_params"]) > 0:
-            # If we have FP8 parameters, we need to quantize them.
-            dense_param_quantize_kwargs["data_parallel_group"] = data_parallel_group
-            quantize_param_shard(**dense_param_quantize_kwargs)
+            if has_blockwise_fp8_param:
+                blockwise_fp8_weight_buffers.append(wbuf)
+                if (
+                    sum([wbuf.bucket_index.size for wbuf in blockwise_fp8_weight_buffers])
+                    > BATCH_QUANT_MEMORY_LIMIT_BYTES
+                ):
+                    _batch_quantize_blockwise_fp8_params(
+                        dense_param_quantize_kwargs,
+                        expert_param_quantize_kwargs,
+                        blockwise_fp8_param_buffers,
+                    )
 
-        if len(expert_param_quantize_kwargs["model_params"]) > 0:
-            # If we have FP8 expert parameters, we need to quantize them.
-            expert_param_quantize_kwargs["data_parallel_group"] = expert_data_parallel_group
-            quantize_param_shard(**expert_param_quantize_kwargs)
+        _batch_quantize_blockwise_fp8_params(
+            dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers
+        )
+        _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs)
 
     @torch.no_grad()
     def copy_model_weights_to_main_weights(self):
@@ -2596,6 +2869,7 @@ def copy_model_weights_to_main_weights(self):
                 f"Master weight buffer size {mbuf.data.numel()} does not match "
                 f"model weight buffer size {copyin_data.numel()}"
             )
+            # TODO(mxfp8): Make sure it's not a fp8 buf?
             mbuf.data.copy_(copyin_data.data)
 
     def all_gather_parameters(self, async_op: bool = True):
@@ -2613,15 +2887,18 @@ def all_gather_parameters(self, async_op: bool = True):
 
         all_gather_ops = []
         for g in self.parameter_groups:
-            shard = g.model_weight_buffer.get_shard_from_local_buffer()
-            all_gather_handler = torch.distributed.all_gather_into_tensor(
-                output_tensor=g.model_weight_buffer.data,
-                input_tensor=shard,
-                group=g.model_weight_buffer.data_parallel_group,
-                async_op=async_op,
-            )
-            if async_op:
-                all_gather_ops.append(all_gather_handler)
+            for buf in [g.model_weight_buffer, g.transpose_weight_buffer]:
+                if buf is None:
+                    continue
+                shard = buf.get_shard_from_local_buffer()
+                all_gather_handler = torch.distributed.all_gather_into_tensor(
+                    output_tensor=buf.data,
+                    input_tensor=shard,
+                    group=buf.data_parallel_group,
+                    async_op=async_op,
+                )
+                if async_op:
+                    all_gather_ops.append(all_gather_handler)
 
         for op in all_gather_ops:
             op.wait()
@@ -2642,7 +2919,7 @@ def reduce_scatter_gradients(self, async_op: bool = True):
         reduce_scatter_ops = []
         for g in self.parameter_groups:
             gbuf = g.main_grad_buffer
-            if gbuf is not None:
+            if gbuf is None:
                 continue
             scaling_factor = gbuf.gradient_scaling_factor
             reduce_op = gradient_reduce_preprocessing(gbuf.data, scaling_factor, self.ddp_config)
@@ -2943,9 +3220,9 @@ def _bucket_group_gradient_reduce(
                     # Scale gradients.
                     scaling_factor = gbuf.gradient_scaling_factor
                     reduce_op = gradient_reduce_preprocessing(
-                        gbuf.data, scaling_factor, gbuf.ddp_config
+                        bucket.data, scaling_factor, gbuf.ddp_config
                     )
-                    if not gbuf.is_data_distributed:
+                    if ddp_config.data_parallel_sharding_strategy == "no_shard":
                         # All-reduce the gradients on every rank. No scattering
                         # or sharding necessary.
                         torch.distributed.all_reduce(
@@ -3092,9 +3369,16 @@ def __init__(
         # Track the status of all-gather operations for each bucket.
         self.param_gather_event_map = {}
         # All buckets are initially deallocated / empty after initialization of ParamAndGradBuffer.
-        self.bucket_status = {i: BucketStatus.EMPTY for i in range(self.buffer.num_buckets)}
+        self.bucket_status = {}
+        for i in range(self.buffer.num_buckets):
+            for bwd in [False, True]:
+                self.bucket_status[self.get_bucket_key(i, bwd)] = BucketStatus.EMPTY
+
         # Track whether each bucket can be deallocated.
-        self.bucket_can_be_released = {i: False for i in range(self.buffer.num_buckets)}
+        self.bucket_can_be_released = {}
+        for i in range(self.buffer.num_buckets):
+            for bwd in [False, True]:
+                self.bucket_can_be_released[self.get_bucket_key(i, bwd)] = False
 
         # Map each bucket to the bucket group it belongs to by enumerated ID.
         # Made to collect a subset of buckets in the same bucket group.
@@ -3119,6 +3403,13 @@ def __init__(
             # all-gather parameters across groups.
             self.outer_fsdp_group_param_gather_stream = torch.cuda.Stream()
 
+    def get_bucket_key(self, bucket_id, bwd):
+        """Get the key for the bucket."""
+        has_transpose_buffer = (
+            self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None
+        )
+        return (bucket_id, has_transpose_buffer and bwd)
+
     @property
     def num_buckets(self):
         """Return the number of buckets."""
@@ -3135,10 +3426,11 @@ def reset(self):
                 UserWarning,
             )
             while len(self.param_gather_event_map) > 0:
-                bucket_id = next(iter(self.param_gather_event_map))
-                self.wait_bucket_ready(bucket_id)
+                (bucket_id, bwd) = next(iter(self.param_gather_event_map))
+                self.wait_bucket_ready(bucket_id, bwd)
         for bucket_id in range(self.num_buckets):
-            self.bucket_can_be_released[bucket_id] = True
+            for bwd in [False, True]:
+                self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = True
         self.recycle_unused_buckets()
 
         assert all([status is BucketStatus.EMPTY for status in self.bucket_status.values()]), (
@@ -3160,6 +3452,7 @@ def all_gather_params(
         suggested_AG_prefetch_size: Optional[int] = None,
         async_param_gather: bool = True,
         outer_fsdp_group_param_gather: bool = False,
+        bwd: bool = False,
     ):
         """All-gather the params. If prefetch is enabled, prefetch next buckets
         in the order of `prefetch_order`.
@@ -3194,7 +3487,7 @@ def all_gather_params(
 
         # Do not release the buckets that are being all-gathered.
         for bucket_id in ag_buckets:
-            self.bucket_can_be_released[bucket_id] = False
+            self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = False
 
         # If prefetch is enabled, we will add prefetch buckets to ag_buckets.
         if prefetch:
@@ -3266,7 +3559,11 @@ def need_skip_prefetch(bucket_id):
                 bucket_id = next_bucket_id(ag_buckets)
 
         # Only all-gather on buckets that have not been allocated yet.
-        ag_buckets = [i for i in ag_buckets if self.bucket_status[i] == BucketStatus.EMPTY]
+        ag_buckets = [
+            bucket_id
+            for bucket_id in ag_buckets
+            if self.bucket_status[self.get_bucket_key(bucket_id, bwd)] == BucketStatus.EMPTY
+        ]
         if len(ag_buckets) == 0:
             return
 
@@ -3285,6 +3582,7 @@ def need_skip_prefetch(bucket_id):
                 self.ag_stream if self.ag_stream is not None else torch.cuda.current_stream()
             )
             if outer_fsdp_group_param_gather:
+                # TODO(mxfp8): Support hsdp
                 self.outer_fsdp_group_param_gather_stream.wait_stream(torch.cuda.current_stream())
                 with torch.cuda.stream(self.outer_fsdp_group_param_gather_stream):
                     outer_fsdp_group = self.buffer.dist_index.get_outer_fsdp_group()
@@ -3312,12 +3610,13 @@ def need_skip_prefetch(bucket_id):
                     for bucket_id in buckets:
                         # All-gather the module weights from each FSDP buffer shard
                         # into an allocated bucket containing unsharded weights.
-                        self.async_bucket_gather(bucket_id)
+                        self.async_bucket_gather(bucket_id, bwd)
 
             # Replace the parameter all-gather event with coalescing event.
             for bucket_id in buckets:
-                _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_id]
-                self.param_gather_event_map[bucket_id] = (
+                bucket_key = self.get_bucket_key(bucket_id, bwd)
+                _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_key]
+                self.param_gather_event_map[bucket_key] = (
                     coalescing_event,
                     mark_bucket_ready_to_use,
                 )
@@ -3325,14 +3624,16 @@ def need_skip_prefetch(bucket_id):
         # Wait for all-gather to finish
         if not async_param_gather:
             for bucket_id in buckets:
-                self.wait_bucket_ready(bucket_id)
+                self.wait_bucket_ready(bucket_id, bwd)
 
-    def wait_bucket_ready(self, bucket_id, empty_ok=False):
+    def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False):
         """Wait for the bucket to be ready."""
-        if self.bucket_status[bucket_id] == BucketStatus.READY_TO_USE:
+        bucket_key = self.get_bucket_key(bucket_id, bwd)
+
+        if self.bucket_status[bucket_key] == BucketStatus.READY_TO_USE:
             # Already ready to use.
             return
-        if self.bucket_status[bucket_id] == BucketStatus.EMPTY:
+        if self.bucket_status[bucket_key] == BucketStatus.EMPTY:
             if empty_ok:
                 return
             # Bucket shouldn't be empty, this implies that the bucket
@@ -3340,48 +3641,92 @@ def wait_bucket_ready(self, bucket_id, empty_ok=False):
             raise ValueError(f"Bucket {bucket_id} is empty.")
 
         # Wait for asynchronous / overlapped NCCL operations to complete.
-        param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_id)
+        param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_key)
         param_gather_event.wait()
         mark_bucket_ready_to_use()
 
     @torch.no_grad()
-    def release_bucket(self, bucket_id: int):
-        """Release the bucket."""
-        if self.bucket_status[bucket_id] == BucketStatus.EMPTY:
+    def release_bucket(self, bucket_id, bwd, lazy: bool = False):
+        """
+        Release the specified parameter bucket, freeing its associated buffer storage.
+
+        This function marks or frees the memory of a parameter bucket depending on
+        whether lazy release is enabled. It ensures that buckets are not released
+        while still being communicated or in use by the pipeline.
+
+        Args:
+            bucket_id (int): Identifier of the bucket to be released.
+            bwd (bool): Indicates if the release is triggered during the backward pass.
+            lazy (bool, optional): Determines when the parameter buffer (bucket) is released.
+                - If False, the buffer is released immediately.
+                - If True, the release is deferred until just before the all-gather pipeline
+                requests a new buffer. The delayed release is performed by invoking
+                `recycle_unused_buckets`.
+
+        Raises:
+            ValueError: If the specified bucket is currently in communication and
+                cannot be safely released.
+
+        Notes:
+            - Buckets marked as lazy will be released later when the pipeline determines
+            they are no longer needed.
+            - If the bucket has a transpose weight buffer (used in FP8 backward passes),
+            this buffer is freed; otherwise, the model weight buffer is released.
+        """
+        bucket_key = self.get_bucket_key(bucket_id, bwd)
+        if self.bucket_status[bucket_key] == BucketStatus.EMPTY:
+            return
+
+        if lazy:
+            # Mark the bucket can be released later.
+            self.bucket_can_be_released[bucket_key] = True
             return
 
-        self.wait_bucket_ready(bucket_id, empty_ok=True)
-        if self.bucket_status[bucket_id] == BucketStatus.COMMUNICATING:
+        self.wait_bucket_ready(bucket_id, bwd, empty_ok=True)
+        if self.bucket_status[bucket_key] == BucketStatus.COMMUNICATING:
             raise ValueError(f"Bucket {bucket_id} is communicating.")
 
-        wbuf = self.buffer.parameter_groups[bucket_id].model_weight_buffer
-        wbuf.free_bucket_storage()
-        self.bucket_status[bucket_id] = BucketStatus.EMPTY
+        if bwd and self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None:
+            buf = self.buffer.parameter_groups[bucket_id].transpose_weight_buffer
+        else:
+            buf = self.buffer.parameter_groups[bucket_id].model_weight_buffer
+
+        buf.free_bucket_storage()
+        self.bucket_status[bucket_key] = BucketStatus.EMPTY
 
     def recycle_unused_buckets(self):
         """Recycle the unused buckets."""
-        for bucket_id, can_be_released in self.bucket_can_be_released.items():
+        for bucket_key, can_be_released in self.bucket_can_be_released.items():
             if can_be_released:
-                self.release_bucket(bucket_id)
-                self.bucket_can_be_released[bucket_id] = False
+                bucket_id, is_transpose_weight = bucket_key[0], bucket_key[1]
+                self.release_bucket(bucket_id, is_transpose_weight)
+                self.bucket_can_be_released[bucket_key] = False
 
-    def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer:
+    def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBuffer:
         """Get the FSDP buffer with the given bucket ID."""
         param_group = self.buffer.parameter_groups[bucket_id]
         if self.buffer.ddp_config.outer_dp_sharding_strategy != "no_shard":
-            return param_group.hsdp_wbuf
-        return param_group.model_weight_buffer
+            if bwd and param_group.transpose_weight_buffer is not None:
+                raise RuntimeError("Transpose buffer is not supported for HSDP")
+            else:
+                return param_group.hsdp_wbuf
+        if bwd and param_group.transpose_weight_buffer is not None:
+            return param_group.transpose_weight_buffer
+        else:
+            return param_group.model_weight_buffer
 
     @torch.no_grad()
-    def async_bucket_gather(self, bucket_id: int) -> None:
+    def async_bucket_gather(self, bucket_id, bwd) -> None:
         """All-gather the bucket and set the items."""
-        self.bucket_can_be_released[bucket_id] = False
-        if self.bucket_status[bucket_id] != BucketStatus.EMPTY:
+        bucket_key = self.get_bucket_key(bucket_id, bwd)
+
+        self.bucket_can_be_released[bucket_key] = False
+        if self.bucket_status[bucket_key] != BucketStatus.EMPTY:
             return
 
-        self.bucket_status[bucket_id] = BucketStatus.COMMUNICATING
+        self.bucket_status[bucket_key] = BucketStatus.COMMUNICATING
 
-        wbuf = self.get_fsdp_buffer(bucket_id)
+        wbuf = self.get_fsdp_buffer(bucket_id, bwd)
 
         # Lazy release the unused buckets.
         self.recycle_unused_buckets()
@@ -3396,18 +3741,21 @@ def async_bucket_gather(self, bucket_id: int) -> None:
             async_op=True,
         )
 
-        def get_closure(bucket_id):
+        def get_closure(bucket_id, bwd):
             @torch.no_grad()
             def mark_bucket_ready_to_use():
                 # Mark the bucket as ready to use - all NCCL operations are complete.
-                self.bucket_status[bucket_id] = BucketStatus.READY_TO_USE
+                self.bucket_status[self.get_bucket_key(bucket_id, bwd)] = BucketStatus.READY_TO_USE
 
             return mark_bucket_ready_to_use
 
-        mark_bucket_ready_to_use = get_closure(bucket_id)
+        mark_bucket_ready_to_use = get_closure(bucket_id, bwd)
 
         # Track the async all-gather operation for the bucket.
-        self.param_gather_event_map[bucket_id] = (param_gather_event, mark_bucket_ready_to_use)
+        self.param_gather_event_map[self.get_bucket_key(bucket_id, bwd)] = (
+            param_gather_event,
+            mark_bucket_ready_to_use,
+        )
 
 
 @torch.no_grad()
@@ -3450,8 +3798,12 @@ def check_gpu_memory(threshold=0.9):
 
     near_full = allocated_ratio >= threshold or reserved_ratio >= threshold
 
-    if near_full and torch.distributed.get_rank() == 0:
-        logger.info(f"GPU Memory: Allocated: {allocated_ratio:.2%}, Reserved: {reserved_ratio:.2%}")
+    if near_full:
+        log_single_rank(
+            logger,
+            logging.INFO,
+            f"GPU Memory: Allocated: {allocated_ratio:.2%}, Reserved: {reserved_ratio:.2%}",
+        )
     return near_full
 
 
@@ -3467,11 +3819,26 @@ def __init__(self, init_param_with_fp8=False, with_cuda_rng_tracker=False):
     def __enter__(self):
         self.stack = ExitStack()
         if self.init_param_with_fp8:
-            assert HAVE_TE
-            args = {"enabled": True}
-            if "preserve_high_precision_init_val" in inspect.signature(fp8_model_init).parameters:
-                args["preserve_high_precision_init_val"] = True
-            self.stack.enter_context(fp8_model_init(**args))
+            # FIXME(@cspades): This appears to be a legacy dependency that is not needed for
+            # more recent versions of TransformerEngine, which only requires this context during
+            # TransformerEngineBaseModule.__init__. Should be removed if backwards compatibility
+            # is confirmed, because overwrites the quantized_model_init context specified by user.
+            assert (
+                HAVE_TE
+            ), "TransformerEngine is required for using FP8 parameters with Megatron-FSDP."
+            # Retrieve import for quantized_model_init (new) or fp8_model_init (old).
+            # Will be nullcontext if TE is not installed.
+            te_quantized_model_init_cls = get_quantized_model_init_context_cls()
+            if te_quantized_model_init_cls is not nullcontext:
+                # Enable TE quantized parameter context manager.
+                args = {"enabled": True}
+                if (
+                    "preserve_high_precision_init_val"
+                    in inspect.signature(te_quantized_model_init_cls).parameters
+                ):
+                    # Required for Megatron-FSDP + FP8 parameters.
+                    args["preserve_high_precision_init_val"] = True
+                self.stack.enter_context(te_quantized_model_init_cls(**args))
 
         if self.with_cuda_rng_tracker:
             # Megatron / TE RNG tracker needs to be initialized and seeded by the user or FW
@@ -3500,15 +3867,13 @@ def override_sharded_param_methods_with_safety_checks(params, all_gather_pipelin
 
         def override_sharded_param_to_function_closure(p, to_function):
             def override_sharded_param_to_function(*args, **kwargs):
-                bucket_id = all_gather_pipeline.buffer.param_to_param_group[p]
-                status = all_gather_pipeline.bucket_status[bucket_id]
-                if status == BucketStatus.READY_TO_USE:
-                    return to_function(*args, **kwargs)
-                raise RuntimeError(
-                    "This parameter is already shard by MCore FSDP and the "
-                    "shared-state parameter does not support 'to' function."
-                    "please define the dtype and device of the parameter before FSDP wrap."
-                )
+                if p._typed_storage()._size() == 0:
+                    warnings.warn(
+                        "The parameter may be sharded by Megatron-FSDP, "
+                        "no actual 'to' operation is performed."
+                    )
+                    return torch.empty([])
+                return to_function(*args, **kwargs)
 
             return override_sharded_param_to_function
 
@@ -3516,15 +3881,13 @@ def override_sharded_param_to_function(*args, **kwargs):
 
         def override_sharded_param_cpu_function_closure(p, cpu_function):
             def override_sharded_param_cpu_function(*args, **kwargs):
-                bucket_id = all_gather_pipeline.buffer.param_to_param_group[p]
-                status = all_gather_pipeline.bucket_status[bucket_id]
-                if status == BucketStatus.READY_TO_USE:
-                    return cpu_function(*args, **kwargs)
-                warnings.warn(
-                    "The parameters are sharded by MCore FSDP, and no actual cpu "
-                    "operation is performed."
-                )
-                return torch.empty([], device="cpu")
+                if p._typed_storage()._size() == 0:
+                    warnings.warn(
+                        "The parameter may be sharded by Megatron-FSDP, "
+                        "no actual 'cpu' operation is performed."
+                    )
+                    return torch.empty([], device="cpu")
+                return cpu_function(*args, **kwargs)
 
             return override_sharded_param_cpu_function
 
@@ -3576,20 +3939,6 @@ def _get_fsdp_tensor_spec(
     if isinstance(param, DTensor) and cast(DTensor, param)._spec.num_shards > 1:
         # Retrieve original DTensorSpec (for TP).
         dtensor_spec = cast(DTensor, param)._spec
-        dtensor_mesh = getattr(dtensor_spec, "mesh", None)
-
-        # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh.
-        megatron_fsdp_global_mesh = dist_index.get_root_mesh(is_expert_parallel=is_expert_param)
-        dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh)
-        # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh:
-        # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh
-        if dtensor_global_mesh is None:
-            raise ValueError(
-                f"When utilizing DTensor-based modules with Megatron-FSDP, the DTensor root "
-                f"device mesh must be identical to the Megatron-FSDP root device mesh.\n"
-                f"DTensor Root Mesh: {dtensor_global_mesh} / Megatron-FSDP "
-                f"Root Mesh: {megatron_fsdp_global_mesh}"
-            )
 
         # Get the placements for the parameter.
         assert len(dtensor_spec.placements) == 1, (
@@ -3777,7 +4126,7 @@ def make_fsdp_dtensor(
                 device_mesh=tp_mesh,
                 placements=placements,
                 run_check=run_check,
-                shape=global_shape,
+                shape=tuple(global_shape),
                 stride=torch.empty(global_shape).stride(),
             )
 
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
index d358ae6cab7..f0b93056c9c 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
@@ -278,7 +278,18 @@ def gather_uneven_dtensor_to_full_tensor(
         full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
         if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
             # Retrieve the existing flattened DeviceMesh ProcessGroup.
-            process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
+            try:
+                # Two Cases: Name is a root dimension, or using the old DeviceMesh
+                # API which allows us to get flattened dimensions.
+                process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
+            except:
+                # Name is a flattened dimension that cannot be retrieved from the
+                # DeviceMesh.__getitem__, so fall-back to new DeviceMesh API.
+                process_group = (
+                    device_mesh._get_root_mesh()
+                    ._flatten_mapping[full_flattened_mesh_dim_name]
+                    .get_group()
+                )
         else:
             # Create the _-separated flattened DeviceMesh ProcessGroup.
             process_group = device_mesh._flatten().get_group()
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
index 83677a9b955..d5fbc91fcf8 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py
@@ -19,7 +19,14 @@
 from contextlib import nullcontext
 from functools import reduce
 from importlib.metadata import version
-from typing import Callable, List, Optional, Sequence, Union
+from typing import Callable, Optional, Sequence, Union
+
+try:
+    import megatron.core.parallel_state as parallel_state
+
+    HAVE_MEGATRON_CORE = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_MEGATRON_CORE = False
 
 try:
     import einops
@@ -34,7 +41,6 @@
 from torch.cuda import _lazy_call, _lazy_init
 from torch.cuda import device as device_ctx_manager
 from torch.distributed import DeviceMesh, ProcessGroup
-from torch.distributed.device_mesh import _mesh_resources
 
 logger = logging.getLogger(__name__)
 
@@ -79,52 +85,6 @@ def is_te_min_version(vers, check_equality=True):
     return te_version > PkgVersion(vers)
 
 
-# Check if Transformer Engine has class for fp8 tensors.
-try:
-    if is_te_min_version("2.0"):
-        # In TE2.x, QuantizedTensor is the base class for all different type of fp8 tensors,
-        # including fp8 tensor for delayed scaling, current scaling and mxfp8, etc.
-        from transformer_engine.pytorch.tensor import QuantizedTensor as FP8_TENSOR_CLASS
-    else:
-        from transformer_engine.pytorch.float8_tensor import Float8Tensor as FP8_TENSOR_CLASS
-
-    HAVE_TE_FP8_TENSOR_CLASS = True
-except (ImportError, ModuleNotFoundError):
-    # FP8 tensor class not found
-    HAVE_TE_FP8_TENSOR_CLASS = False
-
-try:
-    from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale
-
-    multi_tensor_scale_impl = multi_tensor_scale
-except ImportError:
-    try:
-        import amp_C
-        from apex.multi_tensor_apply import multi_tensor_applier
-
-        multi_tensor_scale_impl = amp_C.multi_tensor_scale
-    except ImportError:
-        import warnings
-
-        warnings.warn(
-            "Transformer Engine and Apex are not installed. "
-            "Falling back to local implementations of "
-            "multi_tensor_applier and multi_tensor_scale"
-        )
-
-        def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args):
-            """Multi tensor op applier"""
-            return op(2048 * 32, noop_flag_buffer, tensor_lists, *args)
-
-        def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale):
-            """Works as a drop-in replacement for amp_C.multi_tensor_scale."""
-            for src, dst in zip(tensor_lists[0], tensor_lists[1]):
-                dst.copy_(src * scale)
-
-        multi_tensor_applier = local_multi_tensor_applier
-        multi_tensor_scale_impl = local_multi_tensor_scale
-
-
 def is_submodule(module, parent_module, strict=True):
     """
     Check if a module is a submodule of another module.
@@ -138,39 +98,50 @@ def is_submodule(module, parent_module, strict=True):
     return False
 
 
-def is_float8tensor(tensor: torch.Tensor) -> bool:
-    """Check if a tensor is a Transformer Engine Float8Tensor.
-
-    Note that in TE2.x, in order to support more recipes, the design of the fp8 tensor class has
-    changed. Now Float8Tensor is only used for current scaling and delayed scaling. And mxfp8
-    and blockwise scaling have their own fp8 tensor classes. These different fp8 tensor classes
-    are both inherited from QuantizedTensor. So, for TE1.x, FP8_TENSOR_CLASS is Float8Tensor,
-    and for TE2.x, FP8_TENSOR_CLASS is QuantizedTensor.
-    """
-    return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS)
-
-
-def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
+def get_mesh_names(
+    device_mesh: Optional[DeviceMesh] = None, only_submesh_dims: bool = False
+) -> list[str]:
     """
-    Get all the sub-mesh names in the DeviceMesh.
+    Get all the sub-mesh ("dp", "cp", etc.) and flattened-mesh ("dp_cp", etc.) names
+    in the DeviceMesh. When only_submesh_dims=True, only checks for sub-mesh dimensions.
     """
     if device_mesh is None:
         # Device mesh does not exist.
         return []
-    # Order of the returned list of mesh dimension names must match the order / index
-    # of the root mesh dimension names followed by children / flattened sub-meshes:
-    # [<root mesh dimension names>, <child mesh dimension names>]
-    mesh_dim_names = (
+
+    # Sub-mesh dimension names.
+    submesh_dim_names = (
         list(device_mesh.mesh_dim_names) if device_mesh.mesh_dim_names is not None else []
     )
-    submesh_dim_names = [
-        submesh_dim_name
-        for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
-        for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-        # Add flattened or other unaccounted for children of the root mesh.
-        if root_mesh == device_mesh and submesh_dim_name not in mesh_dim_names
-    ]
-    return mesh_dim_names + submesh_dim_names
+
+    # Flattened mesh dimension names.
+    try:
+        # Retrieve all flattened meshes associated with DeviceMesh.
+        # The flattened DeviceMesh are all located in the _flatten_mapping
+        # dictionary of the root DeviceMesh.
+        flatten_mesh_names = [
+            flat_dim
+            for flat_dim, flat_mesh in device_mesh._get_root_mesh()._flatten_mapping.items()
+        ]
+    except AttributeError:
+        # Fallback to the DeviceMesh global state to retrieve flattened
+        # meshes associated with the DeviceMesh.
+        from torch.distributed.device_mesh import _mesh_resources
+
+        flatten_mesh_names = [
+            child_mesh_dim_name
+            for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
+            for child_mesh_dim_name in (child_mesh.mesh_dim_names or [])
+            if root_mesh == device_mesh and child_mesh_dim_name not in submesh_dim_names
+        ]
+
+    # Order of the returned list of mesh dimension names must match the index
+    # of the root mesh dimension names followed by flattened sub-meshes:
+    # [<root mesh dimension names>, <flattened mesh dimension names>]
+    if only_submesh_dims:
+        return submesh_dim_names
+    else:
+        return submesh_dim_names + flatten_mesh_names
 
 
 def contains_submesh(
@@ -188,198 +159,6 @@ def contains_submesh(
     return all(submesh_name in device_mesh_names for submesh_name in submesh_names)
 
 
-def _multi_tensor_copy_this_to_that(
-    this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None
-):
-    """
-    Use multi-tensor-applier to copy values from one list to another.
-    We don't have a bfloat16 implementation so for now if the overflow_buf
-    is not provided, we default back to simple loop copy to be compatible
-    with bfloat16.
-    """
-    if overflow_buf is not None:
-        overflow_buf.fill_(0)
-        # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0)
-    else:
-        for this_, that_ in zip(this, that):
-            that_.copy_(this_)
-
-
-"""
-The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into
-several functions. It provides different implementations for each function based on different
-versions of TE, ensuring compatibility across various TE versions.
-
-Currently, there are three functions:
-    - modify_underlying_storage
-        This function is used in DDP to place all parameters into a contiguous buffer. For
-        non-fp8 tensors, replacing their data is simple, just using code like
-        "tensor.data = new_data". However, for fp8 tensors, their raw data is not stored in the
-        ".data" attribute, and it varies with different TE versions and different recipes. This
-        function provides a unified interface to replace the underlying storage of a fp8 tensor.
-    - quantize_param_shard
-        This function is used in dist-opt to cast fp32 main params to fp8 params. For non-fp8
-        params, this casting is as simple as "bf16_params.copy_(fp32_main_params)"; but for fp8
-        params, the casting logic varies with different TE versions and different recipes. This
-        function provides a unified interface to cast fp32 main params to fp8 params, and also
-        updates the necessary attributes (like amax, scale, scale_inv or transpose cache) of the
-        fp8 model params.
-    - correct_amax_history_if_needed
-        This function is used to correct the amax history of fp8 tensors. In TE1.x, some inplace
-        copy operations will write unwanted values to the amax_history of fp8 tensors. This function
-        corrects the amax_history back. For TE2.x, it's an empty function.
-        Only useful for delayed scaling.
-"""
-if HAVE_TE and is_te_min_version("2.2"):
-    # Supported TE versions: 2.2+
-    from transformer_engine.pytorch.tensor import QuantizedTensor
-
-    def _modify_underlying_storage_impl(
-        fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor
-    ) -> None:
-        from transformer_engine.pytorch.tensor.utils import replace_raw_data
-
-        replace_raw_data(fp8_tensor, new_raw_data)
-
-    def _quantize_param_shard_impl(
-        model_params: List[QuantizedTensor],
-        main_params: List[torch.Tensor],
-        start_offsets: List[int],
-        data_parallel_group: ProcessGroup,
-        fsdp_shard_model_params: Optional[List[torch.Tensor]] = None,
-    ) -> None:
-        if len(model_params) == 0:
-            return
-
-        from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8
-
-        args = [model_params, main_params, start_offsets, data_parallel_group]
-        if fsdp_shard_model_params is not None:
-            if get_te_version() == PkgVersion("2.3.0.dev0+5fdd7bb") or is_te_min_version("2.3.0"):
-                args.append(fsdp_shard_model_params)
-            else:
-                raise NotImplementedError(
-                    f"FSDP with --fp8-param-gather is not supported in TE v{get_te_version()}"
-                )
-        cast_master_weights_to_fp8(*args)
-
-elif HAVE_TE and is_te_min_version("2.0"):
-    # Supported TE versions: 2.0
-    from transformer_engine.pytorch.tensor import QuantizedTensor
-    from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor
-
-    def _modify_underlying_storage_impl(
-        fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor
-    ) -> None:
-        old_raw_data = fp8_tensor._data
-        assert old_raw_data.dtype == new_raw_data.dtype
-        new_raw_data.detach().copy_(old_raw_data)
-        fp8_tensor._data = new_raw_data
-        del old_raw_data
-
-    def _quantize_param_shard_impl(
-        model_params: List[QuantizedTensor],
-        main_params: List[torch.Tensor],
-        start_offsets: List[int],
-        data_parallel_group: ProcessGroup,
-        fsdp_shard_model_params: Optional[List[torch.Tensor]] = None,
-    ) -> None:
-        if len(model_params) == 0:
-            return
-
-        if fsdp_shard_model_params is None:
-            fsdp_shard_model_params = [None] * len(model_params)
-
-        for model_param, main_param, start_offset, fsdp_shard_model_param in zip(
-            model_params, main_params, start_offsets, fsdp_shard_model_params
-        ):
-            if main_param is None:
-                continue
-
-            if fsdp_shard_model_param is not None:
-                shard_model_param = fsdp_shard_model_param
-            else:
-                shard_model_param = model_param._data.view(-1)[
-                    start_offset : start_offset + main_param.numel()
-                ]
-
-            quantizer = model_param._quantizer
-            # When not using --fp8-param-gather, the main_param (fp32) is first cast to bf16/fp16,
-            # and then cast to fp8 during forward.
-            # Although it's not necessary when --fp8-param-gather is enabled, we still keep this
-            # logic to keep numerical consistency. So here cast the main_param to model_param.dtype.
-            main_param = main_param.to(model_param.dtype)
-            out = Float8Tensor(
-                shape=main_param.size(),
-                dtype=model_param.dtype,
-                requires_grad=False,
-                data=shard_model_param,
-                fp8_scale_inv=model_param._scale_inv,
-                fp8_dtype=model_param._fp8_dtype,
-                quantizer=quantizer,
-            )
-            quantizer.update_quantized(main_param, out)
-
-        amaxes = []
-        scales = []
-        scale_invs = []
-        for model_param in model_params:
-            quantizer = model_param._quantizer
-            amaxes.append(quantizer.amax.view(1))
-            scales.append(quantizer.scale.view(1))
-            scale_invs.append(model_param._scale_inv.view(1))
-            model_param._reset_caches()
-
-        dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda")
-
-        # Update scaling factors.
-        packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device)
-        packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))]
-        _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf)
-        torch.reciprocal(packed_scales, out=packed_scales)
-        _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf)
-
-        # Reduce amaxes.
-        # Note: Assume each param has a separate amax.
-        packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device)
-        packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))]
-        _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf)
-        torch.distributed.all_reduce(
-            packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group
-        )
-        _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf)
-
-else:
-    # Fallback impl if TE version is invalid or TE is not installed.
-    def _modify_underlying_storage_impl(*args, **kwargs):
-        raise RuntimeError(
-            "Invalid Transformer Engine version for FP8 distributed optimizer, "
-            "please install Transformer Engine 2.0+ or install Megatron-Core"
-        )
-
-    def _quantize_param_shard_impl(*args, **kwargs):
-        raise RuntimeError(
-            "Invalid Transformer Engine version for FP8 distributed optimizer, "
-            "please install Transformer Engine 2.0+ or install Megatron-Core"
-        )
-
-
-def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch.Tensor):
-    """Replace the underlying raw data of a tensor with new data."""
-    _modify_underlying_storage_impl(tensor, new_raw_data)
-
-
-def quantize_param_shard(
-    model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params=None
-):
-    """Cast shard fp32 main params to fp8 model params."""
-    assert HAVE_TE, "Transformer Engine is required for quantizing parameters."
-    _quantize_param_shard_impl(
-        model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params
-    )
-
-
 def _get_cuda_rng_state(
     device: Union[int, str, torch.device] = "cuda", clone: bool = False, graph_safe: bool = False
 ) -> torch.Tensor:
@@ -714,6 +493,13 @@ def __init__(
             if contains_submesh(self.device_mesh, self.dp_shard_dim)
             else None
         )
+        # AG group comes from parallel_state, not the mesh
+        # the purpose of this independent group is to overlap all-gather and gradient reduction.
+        self.fsdp_group_ag = None
+        if HAVE_MEGATRON_CORE and parallel_state.has_separate_all_gather_group():
+            self.fsdp_group_ag = parallel_state.get_data_parallel_group(
+                with_context_parallel=True, independent_all_gather=True
+            )
         # Retrieve the outer-FSDP process group from the DeviceMesh.
         self.outer_fsdp_group = (
             self.device_mesh[self.dp_outer_dim].get_group()
@@ -733,16 +519,14 @@ def __init__(
         )
 
         """
-        Store a persistent reference to the core device meshes that back Megatron-FSDP.
-        This is necessary because _MeshEnv (_mesh_resources) may not persist:
-            - _mesh_resources.child_to_root_mapping
-            - _mesh_resources.root_to_flatten_mapping
-            - _mesh_resources.flatten_name_to_root_dims
-            - ...
-        during Torch Autograd, so child and flattened sub-meshes may be cleared.
-        For example, this breaks Megatron-FSDP when self.dp_shard_dim is the flattened
-        sub-mesh of the DP and CP root mesh dimensions.
-        FIXME(@cspades): Identify the root cause of this behavior.
+        Megatron-FSDP is responsible for storing all required DeviceMesh
+        as per best practices recommended by the DeviceMesh API.
+
+        NOTE(@cspades): In PyTorch 2.11, retrieving flattened mesh dimensions
+        will be impossible via the device_mesh[...] API. We will require all
+        users to correctly _unflatten() their DeviceMesh such that all
+        dimensions used by Megatron-FSDP are sub-meshes of the DeviceMesh.
+        contains_submesh(...) -> get_mesh_names(only_submesh_dims=True).
         """
         self.mesh_library = {}
 
@@ -769,6 +553,8 @@ def register_submesh(device_mesh, submesh, is_expert_parallel):
 
         # Register EP submeshes
         if self.expt_device_mesh is not None:
+            register_submesh(self.device_mesh, hsdp_submesh, True)
+            register_submesh(self.device_mesh, hsdp_tp_submesh, True)
             register_submesh(self.expt_device_mesh, tp_submesh, True)
             register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True)
             register_submesh(self.expt_device_mesh, fsdp_submesh, True)
@@ -848,10 +634,14 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
             return self.hybrid_fsdp_group
         return self.fsdp_group
 
-    def get_fsdp_group(self, is_expert_parallel: bool = False) -> ProcessGroup:
+    def get_fsdp_group(
+        self, is_expert_parallel: bool = False, independent_all_gather: bool = False
+    ) -> ProcessGroup:
         """Get the FSDP process group."""
         if is_expert_parallel:
             return self.expt_fsdp_group
+        if independent_all_gather:
+            return self.fsdp_group_ag
         return self.fsdp_group
 
     def get_outer_fsdp_group(self) -> ProcessGroup:
@@ -862,6 +652,9 @@ def get_outer_fsdp_group(self) -> ProcessGroup:
 
     def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh:
         """Get the device mesh."""
+        # NOTE(@cspades): This is FSDPDistributedIndex's root mesh, NOT the actual
+        # root mesh that the DeviceMesh or expert DeviceMesh was un-flattened from.
+        # To get the root mesh, use: DeviceMesh._get_root_mesh().
         if is_expert_parallel:
             return self.expt_device_mesh
         return self.device_mesh
diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py
index d34fdebaf75..b192f182d9b 100644
--- a/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/core/distributed/param_and_grad_buffer.py
@@ -16,6 +16,7 @@
 from megatron.core import parallel_state
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import get_rerun_state_machine
+from megatron.core.utils import log_single_rank
 
 from ..fp8_utils import (
     is_float8tensor,
@@ -142,9 +143,7 @@ def __init__(
             self.data_parallel_group = collective_group
 
         # State for bookkeeping: params is the set of parameters this bucket group is
-        # responsible for, params_with_grad is the set of parameters with grads
-        # available. When overlap_grad_reduce is True, communication (all-reduce
-        # or reduce-scatter) is issued when params_with_grad equals params.
+        # responsible for, param_to_bucket maps params to the corresponding bucket.
         self.param_to_bucket = {}
         self.params = set()
         for bucket in self.buckets:
@@ -164,8 +163,28 @@ def __init__(
         global dist_reduce_scatter_func
         if self.ddp_config.reduce_scatter_with_fp32_accumulation:
             dist_reduce_scatter_func = reduce_scatter_with_fp32_accumulation
+            log_single_rank(
+                logger,
+                logging.INFO,
+                "Using reduce_scatter_with_fp32_accumulation as reduce-scatter implementation",
+            )
+
+        # per_param_grad_ready_counts is a dict mapping parameters to number of times
+        # `register_grad_ready` is called for that parameter *when
+        # self.is_last_microbatch is True*. Should be 1 for most params but could be greater
+        # than 1 if control flow passes through the same parameter multiple times. We lazily
+        # populate this in the first batch, hence the .is_first_batch attribute.
+        # When overlap_grad_reduce is True, communication (all-reduce or reduce-scatter)
+        # is issued when per_param_grad_ready_counts equals golden_per_param_grad_ready_counts.
+        # In other words, communication is dispatched as soon as all gradients in this bucket
+        # are *ready*, as marked by the backward hook.
+        # The set of keys in per_param_grad_ready_counts should be equal to `params`.
+        self.golden_per_param_grad_ready_counts = {}
+        self.per_param_grad_ready_counts = {}
+        self.is_last_microbatch = True
+        self.is_first_batch = True
 
-        self.reset()
+        # Other metadata to keep track of collectives.
         self.param_gather_handle = None
         self.param_gather_dispatched = False
         self.grad_reduce_handle = None
@@ -182,7 +201,12 @@ def reset(self):
         """
         Reset metadata in bucket group in preparation for the next iteration of training.
         """
-        self.params_with_grad = set()
+        if self.is_first_batch and len(self.per_param_grad_ready_counts) > 0:
+            # Record golden per_param_grad_ready_counts.
+            assert len(self.per_param_grad_ready_counts) == len(self.params)
+            self.golden_per_param_grad_ready_counts = self.per_param_grad_ready_counts
+            self.is_first_batch = False
+        self.per_param_grad_ready_counts = {}
         self.is_last_microbatch = True
 
     def check_grads(self, check_for_nan_or_inf, check_for_large):
@@ -337,7 +361,7 @@ def finish_param_sync(self, skip_next_bucket_dispatch: bool = False):
                 if len(fp8_params) > 0:
                     post_all_gather_processing(fp8_params)
 
-    def start_grad_sync(self):
+    def start_grad_sync(self, force_all_reduce: Optional[bool] = False):
         """
         Initiates grad sync (all-reduce or reduce-scatter) communication operations
         for all buckets in the bucket group.
@@ -346,6 +370,11 @@ def start_grad_sync(self):
         communication call. When ddp_config.overlap_grad_reduce is set to False, makes
         synchronous call.
         """
+        if self.is_first_batch and self.grad_reduce_handle is not None:
+            # Make this start_grad_sync call a no-op if in first batch and collective has
+            # already been dispatched.
+            return
+
         assert (
             self.grad_reduce_handle is None
         ), "Should not have multiple communication calls outstanding at once"
@@ -403,7 +432,7 @@ def start_grad_sync(self):
         grad_reduce_handle = None
         with stream_context, _coalescing_manager(communication_group, async_ops=async_op) as cm:
             for idx, bucket in enumerate(self.buckets):
-                if self.ddp_config.use_distributed_optimizer:
+                if self.ddp_config.use_distributed_optimizer and not force_all_reduce:
                     if self.cached_grad_buffer_shard_list[idx] is None:
                         self.cached_grad_buffer_shard_list[idx] = shard_buffer(
                             bucket.grad_data, self.intra_distributed_optimizer_instance_size
@@ -419,6 +448,10 @@ def start_grad_sync(self):
                         async_op=async_op,
                     )
                 else:
+                    if torch.distributed.get_rank() == 0 and force_all_reduce:
+                        logger.info(
+                            f"Performing reduction using all_reduce because {force_all_reduce=}"
+                        )
                     torch.distributed.all_reduce(
                         bucket.grad_data, op=reduce_op, group=communication_group, async_op=async_op
                     )
@@ -453,7 +486,7 @@ def start_grad_sync(self):
                     )
 
         if async_op:
-            if self.ddp_config.reduce_scatter_with_fp32_accumulation:
+            if self.ddp_config.reduce_scatter_with_fp32_accumulation and not force_all_reduce:
                 assert (
                     len(self.buckets) == 1
                 ), "Only 1 bucket supported with reduce_scatter_with_fp32_accumulation=True"
@@ -471,7 +504,7 @@ def start_grad_sync(self):
             # None.
             self.grad_reduce_handle = None
 
-    def finish_grad_sync(self):
+    def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
         """
         Finishes grad sync (all-reduce or reduce-scatter) communication operations
         for all buckets in the bucket group.
@@ -483,8 +516,13 @@ def finish_grad_sync(self):
         self.param_gather_dispatched = False
         # If overlap_grad_reduce is False, start (and finish) synchronous communication call here.
         if not self.ddp_config.overlap_grad_reduce:
-            self.start_grad_sync()
+            self.start_grad_sync(force_all_reduce=force_all_reduce)
             return
+        # If first batch, start asynchronous communication here. register_grad_ready() launches
+        # asynchronous communication only once self.golden_per_param_grad_ready_counts is
+        # populated at the end of this first batch.
+        if self.is_first_batch:
+            self.start_grad_sync(force_all_reduce=force_all_reduce)
         # When using multiple DistOpt instances, we don't need to sync here as we launch
         # communications on a separate communication stream.
         if self.ddp_config.num_distributed_optimizer_instances > 1:
@@ -492,12 +530,15 @@ def finish_grad_sync(self):
             return
         assert self.grad_reduce_handle is not None, (
             f"Communication call has not been issued for this bucket "
-            f"({len(self.params_with_grad)}/{len(self.params)} params have grad available)"
+            f"({len(self.per_param_grad_ready_counts)}/{len(self.params)} "
+            "params have grad available)"
         )
         self.grad_reduce_handle.wait()
         self.grad_reduce_handle = None
 
-    def register_grad_ready(self, param: torch.nn.Parameter):
+    def register_grad_ready(
+        self, param: torch.nn.Parameter, force_all_reduce: Optional[bool] = False
+    ):
         """
         Registers grads for the passed-in param to be "ready" for grad sync.
 
@@ -510,11 +551,14 @@ def register_grad_ready(self, param: torch.nn.Parameter):
         ), "register_grad_ready() should only be called when overlap_grad_reduce is True"
         if self.is_last_microbatch:
             assert param in self.param_to_bucket, "Param is not in the bucket group"
-            assert param not in self.params_with_grad, "Cannot set grad twice"
-            self.params_with_grad.add(param)
+            if param not in self.per_param_grad_ready_counts:
+                self.per_param_grad_ready_counts[param] = 0
+            self.per_param_grad_ready_counts[param] += 1
             # If all params in bucket group have grads available, issue communication call.
-            if len(self.params_with_grad) == len(self.params):
-                self.start_grad_sync()
+            if not self.is_first_batch:
+                if self.per_param_grad_ready_counts == self.golden_per_param_grad_ready_counts:
+                    assert len(self.per_param_grad_ready_counts) == len(self.params)
+                    self.start_grad_sync(force_all_reduce=force_all_reduce)
 
 
 class _ParamAndGradBuffer:
@@ -718,6 +762,12 @@ def _does_param_require_new_bucket(param):
                 group=self.data_parallel_group,
                 symmetric=not self.ddp_config.disable_symmetric_registration,
             )
+            # Since nccl communicator group is created lazily, we need to perform a warmup call to
+            # initialize NCCL comm buffers for this dp_group before doing buffer registration.
+            torch.distributed.barrier()
+            tmp_warmup_tensor = torch.zeros([1], device="cuda")
+            torch.distributed.all_reduce(tmp_warmup_tensor, group=self.data_parallel_group)
+            torch.distributed.barrier()
         else:
             # If nccl_ub is False, mem_alloc_context is nullcontext.
             mem_alloc_context = nullcontext
@@ -757,6 +807,10 @@ def _does_param_require_new_bucket(param):
                     requires_grad=False,
                 )
 
+        self.grad_data_size = 0
+        self.param_data_size = 0
+        self.param_data_cpu = None
+
         # Finally, map param.data and param.main_grad fields to buffers.
         bucket_params = []
         bucket_start_index = 0
@@ -907,6 +961,38 @@ def reset(self):
         """
         self.grad_data.zero_()
 
+    def offload_to_cpu(self, move_params: bool = True, move_grads: bool = True) -> None:
+        """
+        Offload the buffers to CPU.
+        """
+        if move_grads and self.grad_data is not None and self.grad_data.storage().size() > 0:
+            self.grad_data_size = self.grad_data.storage().size()
+            self.grad_data.storage().resize_(0)
+        if move_params and self.param_data is not None and self.param_data.storage().size() > 0:
+            self.param_data_size = self.param_data.storage().size()
+            if self.param_data_cpu is not None:
+                self.param_data_cpu.copy_(self.param_data, non_blocking=True)
+            else:
+                self.param_data_cpu = self.param_data.cpu().pin_memory()
+            self.param_data.storage().resize_(0)
+
+    def reload_from_cpu(self, move_params: bool = True, move_grads: bool = True):
+        """
+        Reload the buffers from CPU.
+        """
+        if (
+            move_params
+            and self.param_data is not None
+            and self.param_data_cpu is not None
+            and self.param_data.storage().size() == 0
+        ):
+            self.param_data.storage().resize_(self.param_data_size)
+            self.param_data.copy_(self.param_data_cpu, non_blocking=True)
+        if move_grads and self.grad_data is not None and self.grad_data_size > 0:
+            self.grad_data.storage().resize_(self.grad_data_size)
+            self.grad_data.zero_()
+            self.grad_data_size = 0
+
 
 def partition_buckets(
     buffers: List[_ParamAndGradBuffer], force_single_bucket_group: bool = False
diff --git a/megatron/core/enums.py b/megatron/core/enums.py
index fcca219badd..9b76bc52a87 100644
--- a/megatron/core/enums.py
+++ b/megatron/core/enums.py
@@ -7,8 +7,6 @@ class ModelType(enum.Enum):
     """Model type."""
 
     encoder_or_decoder = 1
-    retro_encoder = 2
-    retro_decoder = 3
 
     @property
     def encoder_and_decoder(self):
diff --git a/megatron/core/extensions/TransformerEngineMixedPrecision.md b/megatron/core/extensions/TransformerEngineMixedPrecision.md
new file mode 100644
index 00000000000..4e98eb4acb4
--- /dev/null
+++ b/megatron/core/extensions/TransformerEngineMixedPrecision.md
@@ -0,0 +1,111 @@
+# About
+
+Megatron training exposes the argument "--te-precision-config-file"
+to allow experimentation with fine-grained control over the precision
+of modules within a megatron network.
+
+## Design Goals
+
+The design aims to support configuration of the precision of linear
+and grouped linear modules via the selection of a transformer engine
+quantization recipe.
+
+The fp8_autocast abstraction is already used to enable and disable a
+single quantization recipe when evaluating the forward pass of a network.
+This same mechanism is extended to execute targeted layers with the
+desired quantization recipe, permitting mixed precision recipes.
+
+The configurations function by optionally overriding the precision a module
+would execute in. Not every module must have a configured override. Modules
+are checked by module name against a sequence of patterns to determine if
+an override recipe is applicable. By default, if the non-overridden precision
+of a layer is non-quantized, as the primary desired use case is to customize
+modules that are already quantized, and it is useful to respect other arguments
+like `--first-last-layers-bf16`.
+
+## Limitations
+
+Relying on the module name to match against a configuration means the match is
+executed post-initialization, and initialization customization for a recipe
+override such as `fp4-param` and `fp8-param` are not in scope.
+
+The validation precision configurations rely on self.training. They have not
+yet been verified compatible with cuda-graphs and/or activation recompute.
+
+There are some decisions in megatron that are made using the TransformerConfig's
+settings for fp4 and fp8, possibly including layer number rather than using the
+quantization autocast context. The configured overrides do not inform these
+decisions with the current implementation.
+
+## Validation precision
+
+It is supported to configure a different precision when evaluating against the
+validation set (when module.training is False). When evaluating a quantization
+recipe, having a consistent forward pass for evaluation versus a baseline isolates
+the quality of learning from the ability to infer with the quantization.
+
+## Recipe configuration
+
+Recipe configurations are named entries in a "configs" dictionary.
+
+These examples show an mxfp8 recipe, a bf16 recipe, an mxfp8 recipe that
+evaluates in bf16, and an nvfp4 recipe that evaluates in bf16.
+```
+configs:
+  mxfp8:
+    transformer_engine_config_type: "TEQuantizationParams"
+    training_recipe:
+      fp8_quantization_recipe: "mxfp8"
+  bf16:
+    transformer_engine_config_type: "TEQuantizationParams"
+    training_recipe: {}
+  mxfp8_evaluate_bf16:
+    transformer_engine_config_type: "TEQuantizationParams"
+    training_recipe:
+      fp8_quantization_recipe: "mxfp8"
+    evaluation_recipe: {}
+  nvfp4_evaluate_bf16:
+    transformer_engine_config_type: "TEQuantizationParams"
+    training_recipe:
+      fp4_quantization_recipe: "nvfp4"
+    evaluation_recipe: {}
+```
+
+Recipes are selected by matchers. Currently implemented are glob style
+expressions.
+
+Matchers are ordered, and the first enabled matcher to match against
+a module name chooses the config from the configs list.
+
+In this example, assuming a default quantization recipe is enabled,
+attention linear modules `linear_qkv` and `linear_proj` are selected
+for the "bf16" recipe override and mamba mixer linear layers `out_proj`
+and `in_proj` are selected for the "mxfp8" recipe override.
+
+```
+matchers:
+  attn_qkv_bf16:
+    config: "bf16"
+    type: "glob"
+    pattern: "*.linear_qkv"
+    enabled: true
+  attn_proj_bf16:
+    config: "bf16"
+    type: "glob"
+    pattern: "*.linear_proj"
+    enabled: true
+  mamba_outproj_mxfp8:
+    config: "mxfp8"
+    type: "glob"
+    pattern: "*mixer.out_proj"
+    enabled: true
+  mamba_inproj_mxfp8:
+    config: "mxfp8"
+    type: "glob"
+    pattern: "*mixer.in_proj"
+    enabled: true
+```
+
+Matches or modules that do not match to a configuration, and execute with their
+default precision, will be logged so that quantization configurations can be
+observed. Make sure to set `--logging-level` (to 20) in order to emit to logs.
diff --git a/megatron/core/extensions/kitchen.py b/megatron/core/extensions/kitchen.py
index 7f2f1fac9c8..a8a83fb341c 100644
--- a/megatron/core/extensions/kitchen.py
+++ b/megatron/core/extensions/kitchen.py
@@ -1,1092 +1,30 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
-import warnings
-from dataclasses import dataclass, fields
-from enum import Enum
-from typing import Any, Callable, Dict, Optional, Set, Tuple
-
-import torch
-
-from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
-from megatron.core.model_parallel_config import ModelParallelConfig
-from megatron.core.models.backends import BackendSpecProvider
-from megatron.core.parallel_state import (
-    get_expert_data_parallel_rank,
-    get_expert_model_parallel_rank,
-    get_expert_model_parallel_world_size,
-)
-from megatron.core.quantization.quant_config import MatchContext, QuantizationConfig
-from megatron.core.tensor_parallel.random import (
-    get_cuda_rng_tracker,
-    get_data_parallel_rng_tracker_name,
-    get_expert_parallel_rng_tracker_name,
-)
-from megatron.core.tensor_parallel.utils import divide
-from megatron.core.transformer.mlp import MLPSubmodules
-from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP
-from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
-from megatron.core.utils import get_tensor_model_parallel_group_if_none
-
-# Parsing constant
-_KITCHEN_CONFIG_TYPE_KEY = "kitchen_config_type"
-try:
-    import nvidia_kitchen
-    from nvidia_kitchen.config import QLinearParams, get_qlinear_params_from_qat_params
-
-    HAVE_KITCHEN = True
-except ImportError:
-    from unittest.mock import MagicMock
-
-    HAVE_KITCHEN = False
-    nvidia_kitchen = MagicMock()
-    QLinearParams = MagicMock()
-    get_qlinear_params_from_qat_params = MagicMock()
-
-
-class KitchenConfigType(Enum):
-    """Configuration object types in config dictionary"""
-
-    QLINEAR_PARAMS = "QLinearParams"
-    # Could be extended with attention params e.g. QAttentionParams
-
-
-@dataclass
-class QLinearParamsConfigSchema:
-    """Dataclass to parse values from config dict of 'QLinearParams' type"""
-
-    kitchen_config_type: KitchenConfigType
-    recipe_idx: int
-
-    @classmethod
-    def parse_config_dict(cls, config_dict: Dict[Any, Any]) -> "QLinearParamsConfigSchema":
-        """
-        Parse config dictionary and return a schema instance.
-
-
-        Expected config format: {"kitchen_config_type": "QLinearParams", "recipe_idx": <int>}
-        """
-        expected_keys = cls.get_expected_keys()
-        actual_keys = set(config_dict.keys())
-
-        # Check for missing keys
-        missing = expected_keys - actual_keys
-        if missing:
-            raise KeyError(f"Missing required keys: {missing}")
-
-        # Check for unexpected keys
-        unexpected = actual_keys - expected_keys
-        if unexpected:
-            raise KeyError(f"Unexpected keys in config: {unexpected}")
-
-        try:
-            config_type = KitchenConfigType(config_dict[_KITCHEN_CONFIG_TYPE_KEY])
-        except ValueError:
-            raise ValueError(f"Unsupported config type '{config_dict['kitchen_config_type']}'.")
-
-        if config_type != KitchenConfigType.QLINEAR_PARAMS:
-            raise ValueError(f"Parsing config dict of incorrect type '{config_type}'")
-
-        # Create instance with converted enum
-        return cls(kitchen_config_type=config_type, recipe_idx=config_dict["recipe_idx"])
-
-    @classmethod
-    def get_expected_keys(cls) -> Set[str]:
-        """Get expected keys from the dataclass fields."""
-        return {field.name for field in fields(cls)}
-
-    def __post_init__(self):
-        # config type check
-        if not isinstance(self.kitchen_config_type, KitchenConfigType):
-            raise TypeError(
-                "kitchen_config_type must be KitchenConfigType, "
-                f"got {type(self.kitchen_config_type)}"
-            )
-
-        if self.kitchen_config_type != KitchenConfigType.QLINEAR_PARAMS:
-            raise TypeError(
-                f"kitchen_config_type must be QLinearParams got {self.kitchen_config_type}"
-            )
-        # recipe_idx check
-        if not isinstance(self.recipe_idx, int) or self.recipe_idx <= 0:
-            raise ValueError(f"recipe_idx must be a positive integer, got {self.recipe_idx}")
-
-    def to_kitchen_qlinear(self) -> QLinearParams:
-        """Converts to kitchen library's QLinearParams object."""
-        return get_qlinear_params_from_qat_params(self.recipe_idx)
-
-
-@dataclass
-class KitchenQuantizationParams:
-    """Quantization parameters used for kitchen extensions"""
-
-    qlinear_params: Optional[QLinearParams]
-    # Could be extended with attention params,
-    # sparsity, etc.
-    # match_input is what selected the config.
-    match_input: MatchContext
-    params_config_key: str
-
-    @staticmethod
-    def parse_from_config(quant_config: QuantizationConfig) -> "KitchenQuantizationParams":
-        """Parses quantization config for a layer or throw an error."""
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        assert (
-            quant_config is not None
-        ), "Kitchen extension expects a quantization config for linear layers."
-        config = quant_config.config
-        try:
-            config_type = KitchenConfigType(config[_KITCHEN_CONFIG_TYPE_KEY])
-        except KeyError:
-            raise ValueError(
-                f"Kitchen config dictionary must have '{_KITCHEN_CONFIG_TYPE_KEY}' key."
-            )
-        except ValueError:
-            raise ValueError(f"Unsupported config type '{config['kitchen_config_type']}'.")
-
-        if config_type == KitchenConfigType.QLINEAR_PARAMS:
-            return KitchenQuantizationParams(
-                qlinear_params=QLinearParamsConfigSchema.parse_config_dict(
-                    config
-                ).to_kitchen_qlinear(),
-                match_input=quant_config.match_input,
-                params_config_key=quant_config.config_key,
-            )
-        else:
-            raise NotImplementedError(f"Unhandled configuration type {config_type}")
-
-
-def _get_extra_kitchen_kwargs(config: TransformerConfig):
-    extra_kitchen_kwargs = {"params_dtype": config.params_dtype}
-
-    if config.use_cpu_initialization:
-        raise ValueError("Kitchen backend does not support use_cpu_initialization.")
-    elif config.init_model_with_meta_device:
-        extra_kitchen_kwargs["device"] = "meta"
-    else:
-        extra_kitchen_kwargs["device"] = torch.cuda.current_device()
-    return extra_kitchen_kwargs
-
-
-class KitchenLinear(nvidia_kitchen.Linear):
-    """
-    Wrapper for Kitchen's `Linear` layer.
-
-    Note that if Megatron's parallel_state has not been initialized
-    yet, the tp_group passed to Kitchen will be None and must be set later
-    via set_tensor_parallel_group().
-
-    parallel_mode currently supports 3 different values:
-        - "column": Split the weight matrix along output dimension (for KitchenColumnParallelLinear)
-        - "row": Split the weight matrix along input dimension (for KitchenRowParallelLinear)
-        - "duplicated": No tensor parallelism and weight is duplicated across TP ranks
-        - Note: For expert linear layers, we will disable communication logic here
-                as TP communication is handled in token_dispatcher.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        parallel_mode: Optional[str],
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        skip_bias_add: bool,
-        skip_weight_param_allocation: bool,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        is_expert: bool = False,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-        self.config = config
-
-        # Kitchen returns a zero length Tensor when bias=False and
-        # return_bias=True, but we prefer None.  So in that case we
-        # tell TE to not return the bias, and return None
-        # ourselves. This way our forward always returns two values
-        # and we don't have to deal with the zero length Tensor.
-        self.kitchen_return_bias = skip_bias_add and bias
-        self.is_first_microbatch = True
-        self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
-        if skip_weight_param_allocation:
-            raise ValueError("Kitchen linear layers do not support skip_weight_param_allocation")
-
-        # Save params for finish_init
-        self.stashed_input_size = input_size
-        self.stashed_output_size = output_size
-        self.stashed_parallel_mode = parallel_mode
-        self.stashed_init_method = init_method
-        self.stashed_bias = bias
-        self.stashed_tp_comm_buffer_name = tp_comm_buffer_name
-        self.stashed_layer_number = layer_number
-        self.stashed_is_expert = is_expert
-        self.stashed_tp_group = tp_group
-
-        self.init_finished = False
-
-    def finish_init(self, quantization_config: QuantizationConfig):
-        """Required post-init of quantization configuration."""
-        extra_kwargs = _get_extra_kitchen_kwargs(self.config)
-
-        # Restore args from stash
-        input_size = self.stashed_input_size
-        output_size = self.stashed_output_size
-        parallel_mode = self.stashed_parallel_mode
-        init_method = self.stashed_init_method
-        bias = self.stashed_bias
-        tp_comm_buffer_name = self.stashed_tp_comm_buffer_name
-        layer_number = self.stashed_layer_number
-        is_expert = self.stashed_is_expert
-        tp_group = self.stashed_tp_group
-
-        self.kitchen_quant_params = KitchenQuantizationParams.parse_from_config(quantization_config)
-        assert self.kitchen_quant_params.qlinear_params is not None
-        extra_kwargs["qlinear_params"] = self.kitchen_quant_params.qlinear_params
-
-        if tp_comm_buffer_name:
-            self.config.tp_comm_overlap = False
-            warnings.warn(
-                f"The user buffer name {tp_comm_buffer_name} is not supported in "
-                "Kitchen. Disabling TP communication overlap for this layer."
-            )
-            extra_kwargs["ub_name"] = tp_comm_buffer_name
-
-        extra_kwargs["layer_number"] = layer_number
-
-        if parallel_mode == "duplicated":
-            assert tp_group is None, "duplicated linear should not have tp_group set"
-            tp_size = 1
-        else:
-            assert tp_group is not None, "Parallel linear should always have tp_group set"
-            tp_size = tp_group.size()
-
-        self.expert_parallel = self.config.expert_model_parallel_size > 1
-        if is_expert:
-            rng_tracker_name = get_expert_parallel_rng_tracker_name()
-        else:
-            if parallel_mode == "duplicated":
-                rng_tracker_name = get_data_parallel_rng_tracker_name()
-            else:
-                rng_tracker_name = None
-        extra_kwargs["rng_tracker_name"] = rng_tracker_name
-
-        kitchen_parallel_mode = parallel_mode
-        if parallel_mode == "duplicated":
-            # Handle non-parallel case
-            tp_group = None
-            tp_size = 1
-            explicit_expert_comm = False
-            kitchen_parallel_mode = None
-        else:
-            # Disable communications in kitchen when using TP or EP by megatron
-            explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
-
-            if explicit_expert_comm:
-                if parallel_mode == "column":
-                    output_size = divide(output_size, tp_size)
-                elif parallel_mode == "row":
-                    input_size = divide(input_size, tp_size)
-                kitchen_parallel_mode = None
-                tp_size = 1
-                tp_group = None
-
-        super().__init__(
-            in_features=input_size,
-            out_features=output_size,
-            sequence_parallel=self.config.sequence_parallel,
-            fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-            # Pass None if not initialized for backward compatibility with the ckpt converter.
-            tp_group=tp_group if torch.distributed.is_initialized() else None,
-            tp_size=tp_size,
-            get_rng_state_tracker=(
-                get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
-            ),
-            init_method=(init_method if self.config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            return_bias=self.kitchen_return_bias,
-            parallel_mode=kitchen_parallel_mode,
-            **extra_kwargs,
-        )
-
-        for param in self.parameters():
-            if is_expert:
-                # Reduce the gradient on the expert_data_parallel group for expert linear layers
-                setattr(param, "allreduce", not self.expert_parallel)
-            else:
-                # Reduce the gradient on DP group
-                setattr(param, "allreduce", True)
-                if parallel_mode == "duplicated":
-                    # Reduce the gradient further on the TP group since the weight is
-                    # duplicated across TP ranks
-                    setattr(param, "sequence_parallel", self.config.sequence_parallel)
-
-        del self.stashed_input_size
-        del self.stashed_output_size
-        del self.stashed_parallel_mode
-        del self.stashed_init_method
-        del self.stashed_bias
-        del self.stashed_tp_comm_buffer_name
-        del self.stashed_layer_number
-        del self.stashed_is_expert
-        del self.stashed_tp_group
-        self.init_finished = True
-
-    def forward(self, x):
-        """Forward."""
-        assert self.init_finished
-        _is_first_microbatch = (
-            None if self.disable_parameter_transpose_cache else self.is_first_microbatch
-        )
-        out = super().forward(x, is_first_microbatch=_is_first_microbatch)
-        self.is_first_microbatch = False
-
-        # Kitchen only returns a tuple when return_bias is True, otherwise
-        # it returns a single Tensor, we always want to return two
-        # values regardless of the arguments.
-        if self.kitchen_return_bias:
-            return out
-        return out, None
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Replicate cross TP/DP."""
-
-        # Provide the dist-ckpt support when KitchenLinear is directly used
-        # It can only happen with duplicated parallel mode
-        assert (
-            self.parallel_mode is None
-        ), "KitchenLinear sharded_state_dict can only be used with duplicated parallel mode"
-        state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(state_dict, prefix, None, sharded_offsets)
-
-
-class KitchenColumnParallelLinear(KitchenLinear):
-    """
-    Wrapper for the Kitchen's `Linear` layer but specialized similar
-    to megatron's `ColumnParallelLinear` layer.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        gather_output: bool,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        skip_weight_param_allocation: bool = False,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        if gather_output:
-            raise ValueError("Kitchen linear layers do not support gather_output = True")
-        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        world_size = tp_group.size()
-        rank = tp_group.rank()
-
-        super().__init__(
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="column",
-            config=config,
-            init_method=(init_method if config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            is_expert=is_expert,
-            skip_weight_param_allocation=skip_weight_param_allocation,
-            tp_comm_buffer_name=tp_comm_buffer_name,
-            layer_number=layer_number,
-            tp_group=tp_group,
-        )
-
-        if config.use_cpu_initialization:
-            raise ValueError("Kitchen extension doesn't support use_cpu_initialization.")
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Sharding along axis 0, bias sharded"""
-        state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
-        )
-
-    def __repr__(self):
-        return (
-            f"{type(self).__name__}(in_features={self.in_features}, "
-            f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})"
-        )
-
-
-class KitchenRowParallelLinear(KitchenLinear):
-    """
-    Wrapper for Kitchen's `Linear` layer but specialized similar
-    to megatron's `RowParallelLinear` layer.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        input_is_parallel: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        if not input_is_parallel:
-            raise ValueError("Kitchen linear layers do not support input_is_parallel = False")
-        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-
-        super().__init__(
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="row",
-            config=config,
-            init_method=(init_method if config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            skip_weight_param_allocation=False,
-            # We don't currently use this for row parallel layers # pylint: disable=line-too-long
-            is_expert=is_expert,
-            tp_comm_buffer_name=tp_comm_buffer_name,
-            layer_number=layer_number,
-            tp_group=tp_group,
-        )
-        if config.use_cpu_initialization:
-            raise ValueError("Kitchen extension does not support use_cpu_initialization.")
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Sharding along axis 1, bias not sharded"""
-        state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 1}, sharded_offsets
-        )
-
-    def __repr__(self):
-        return (
-            f"{type(self).__name__}(in_features={self.in_features}, "
-            f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})"
-        )
-
-
-class KitchenGroupedLinear(nvidia_kitchen.GroupedLinear):
-    """
-    Wrapper for Kitchen's `GroupedLinear` layer.
-
-    Note that if Megatron's parallel_state has not been initialized
-    yet, the tp_group passed to TE will be None and must be set later
-    via set_tensor_parallel_group().
-    """
-
-    def __init__(
-        self,
-        num_gemms: int,
-        input_size: int,
-        output_size: int,
-        *,
-        parallel_mode: Optional[str],
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool = False,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        self.config = config
-
-        # Kitchen returns a zero length Tensor when bias=False and
-        # return_bias=True, but we prefer None.  So in that case we
-        # tell TE to not return the bias, and return None
-        # ourselves. This way our forward always returns two values
-        # and we don't have to deal with the zero length Tensor.
-        self.kitchen_return_bias = skip_bias_add and bias
-        self.is_first_microbatch = True
-        self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
-
-        # Stash parameters for finish_init
-        self.stashed_num_gemms = num_gemms
-        self.stashed_input_size = input_size
-        self.stashed_output_size = output_size
-        self.stashed_parallel_mode = parallel_mode
-        self.stashed_init_method = init_method
-        self.stashed_bias = bias
-        self.stashed_is_expert = is_expert
-        self.stashed_tp_comm_buffer_name = tp_comm_buffer_name
-        self.stashed_layer_number = layer_number
-        self.stashed_tp_group = tp_group
-        self.init_finished = False
-
-    def finish_init(self, quantization_config: QuantizationConfig) -> None:
-        """Required post-init of quantization configuration."""
-        # Restore parameters from stash
-        num_gemms = self.stashed_num_gemms
-        input_size = self.stashed_input_size
-        output_size = self.stashed_output_size
-        parallel_mode = self.stashed_parallel_mode
-        init_method = self.stashed_init_method
-        bias = self.stashed_bias
-        is_expert = self.stashed_is_expert
-        tp_comm_buffer_name = self.stashed_tp_comm_buffer_name
-        layer_number = self.stashed_layer_number
-        tp_group = self.stashed_tp_group
-
-        extra_kwargs = _get_extra_kitchen_kwargs(self.config)
-        extra_kwargs["ub_name"] = tp_comm_buffer_name
-        extra_kwargs["layer_number"] = layer_number
-
-        self.kitchen_quant_params = KitchenQuantizationParams.parse_from_config(quantization_config)
-        assert self.kitchen_quant_params.qlinear_params is not None
-        extra_kwargs["qlinear_params"] = self.kitchen_quant_params.qlinear_params
-
-        self.expert_parallel = self.config.expert_model_parallel_size > 1
-        if is_expert:
-            extra_kwargs["rng_tracker_name"] = get_expert_parallel_rng_tracker_name()
-
-        # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
-        # So we disable comms by making Kitchen agnostic of model parallel.
-        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        tp_size = tp_group.size()
-
-        self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
-
-        if self.explicit_expert_comm:
-            if parallel_mode == "column":
-                output_size = divide(output_size, tp_size)
-            elif parallel_mode == "row":
-                input_size = divide(input_size, tp_size)
-            parallel_mode = None
-            tp_size = 1
-            tp_group = None
-
-        super().__init__(
-            num_gemms=num_gemms,
-            in_features=input_size,
-            out_features=output_size,
-            sequence_parallel=self.config.sequence_parallel,
-            fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-            tp_group=tp_group if torch.distributed.is_initialized() else None,
-            tp_size=tp_size,
-            get_rng_state_tracker=(
-                get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
-            ),
-            init_method=(init_method if self.config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            return_bias=self.kitchen_return_bias,
-            parallel_mode=parallel_mode,
-            **extra_kwargs,
-        )
-
-        for param in self.parameters():
-            setattr(param, "allreduce", not (is_expert and self.expert_parallel))
-
-        def merge_extra_states(
-            self,
-            state_dict,
-            prefix,
-            local_metadata,
-            strict,
-            missing_keys,
-            unexpected_keys,
-            error_msgs,
-        ):
-            """
-            Merge multiple "_extra_state" into one.
-            """
-            self.init_fp8_metadata(num_gemms=self.num_gemms)
-            fp8_checkpoint = self.fp8_meta["fp8_checkpoint"] or self.fp8 or self.fp8_calibration
-
-            try:
-                state_list = [
-                    state_dict.pop(f"{prefix}_extra_state{i}") for i in range(1, self.num_gemms)
-                ]
-            except KeyError:
-                # "_extra_state{i}" only exists for dist-ckpt. Return for torch native ckpt.
-                return
-
-            if not fp8_checkpoint:
-                return
-            state_list = [state_dict.pop(f"{prefix}_extra_state")] + state_list
-            state_list = [self._decode_extra_state(state) for state in state_list]
-            extra_fp8_variables = state_list[0]["extra_fp8_variables"]
-            extra_fp8_variables["num_gemms"] = self.num_gemms
-            extra_state = {"extra_fp8_variables": extra_fp8_variables}
-            state_dict[f"{prefix}_extra_state"] = self._encode_extra_state(extra_state)
-
-        self._register_load_state_dict_pre_hook(merge_extra_states, with_module=True)
-        del self.stashed_num_gemms
-        del self.stashed_input_size
-        del self.stashed_output_size
-        del self.stashed_parallel_mode
-        del self.stashed_init_method
-        del self.stashed_bias
-        del self.stashed_is_expert
-        del self.stashed_tp_comm_buffer_name
-        del self.stashed_layer_number
-        del self.stashed_tp_group
-        self.init_finished = True
-
-    def forward(self, x, m_splits):
-        """Forward."""
-        assert self.init_finished
-        _is_first_microbatch = (
-            None if self.disable_parameter_transpose_cache else self.is_first_microbatch
-        )
-        out = super().forward(x, m_splits, is_first_microbatch=_is_first_microbatch)
-        self.is_first_microbatch = False
-
-        # Kitchen only returns a tuple when return_bias is True, otherwise
-        # it returns a single Tensor, we always want to return two
-        # values regardless of the arguments.
-        if self.kitchen_return_bias:
-            return out
-        return out, None
-
-    def _encode_extra_state(self, state):
-        torch.cuda.synchronize()
-        state_serialized = bytearray(pickle.dumps(state))
-        state_serialized = torch.frombuffer(state_serialized, dtype=torch.uint8)
-        return state_serialized
-
-    def _decode_extra_state(self, state):
-        if isinstance(state, torch.Tensor):
-            return pickle.loads(state.detach().cpu().numpy().tobytes())
-        elif isinstance(state, io.BytesIO):
-            state.seek(0)
-            return torch.load(state, map_location="cuda")
-        else:
-            raise RuntimeError("Unsupported checkpoint format.")
-
-    def _split_extra_state(self, state):
-        fp8_checkpoint = self.fp8_meta["fp8_checkpoint"]
-        # Kitchen is compatible with TE checkpoint format, but never
-        # uses fp8_checkpoints.
-        assert not fp8_checkpoint
-        return [state] * self.num_gemms
-
-    def _sharded_state_dict_grouped(
-        self, tp_axis_map, prefix="", sharded_offsets=(), metadata=None
-    ):
-        """
-        prefix should be module_name to make keys identical to sequetial ones.
-        """
-        assert self.init_finished
-        sharded_state_dict = {}
-        full_state_dict = self.state_dict(prefix="", keep_vars=True)
-        num_global_experts = get_expert_model_parallel_world_size() * self.num_gemms
-        local_expert_indices_offset = get_expert_model_parallel_rank() * self.num_gemms
-        ep_axis = len(sharded_offsets)
-        extra_states = self._split_extra_state(full_state_dict["_extra_state"])
-        for gemm_idx in range(self.num_gemms):
-            state_dict = {
-                f"{gemm_idx}.weight": full_state_dict[f"weight{gemm_idx}"],
-                f"{gemm_idx}._extra_state": extra_states[gemm_idx],
-            }
-            if self.use_bias:
-                state_dict[f"{gemm_idx}.bias"] = full_state_dict[f"bias{gemm_idx}"]
-            sub_sd = make_sharded_tensors_for_checkpoint(
-                state_dict,
-                "",
-                tp_axis_map,
-                (
-                    *sharded_offsets,
-                    (ep_axis, local_expert_indices_offset + gemm_idx, num_global_experts),
-                ),
-            )
-            # Remove expert layers indexing from sharded keys
-            replace_prefix_for_sharding(sub_sd, f"{gemm_idx}.", prefix)
-            sharded_state_dict.update(
-                {
-                    f"{prefix}weight{gemm_idx}": sub_sd[f"{gemm_idx}.weight"],
-                    f"{prefix}_extra_state{'' if gemm_idx == 0 else gemm_idx}": sub_sd[
-                        f"{gemm_idx}._extra_state"
-                    ],
-                }
-            )
-            if self.use_bias:
-                sharded_state_dict[f"{prefix}bias{gemm_idx}"] = sub_sd[f"{gemm_idx}.bias"]
-        # Adjust replica ids - replication along DP modulo EP
-        for k, sh_ten in sharded_state_dict.items():
-            replica_id = sh_ten.replica_id
-            assert (
-                len(replica_id) == 3
-            ), f"Expected replica_id for {k} to be in (PP, TP, DP) format, got: {replica_id}"
-            if getattr(sh_ten, "is_data_parallel_fully_shard", False):
-                edp_replica_id = 0
-            else:
-                edp_replica_id = get_expert_data_parallel_rank()
-            sh_ten.replica_id = (*replica_id[:2], edp_replica_id)
-        return sharded_state_dict
-
-
-class KitchenColumnParallelGroupedLinear(KitchenGroupedLinear):
-    """
-    Wrapper for Kitchen's `GroupedLinear` layer but specialized
-    to column-parallel style.
-    """
-
-    def __init__(
-        self,
-        num_gemms: int,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        super().__init__(
-            num_gemms=num_gemms,
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="column",
-            config=config,
-            init_method=(init_method if config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            is_expert=is_expert,
-            tp_comm_buffer_name=tp_comm_buffer_name,
-            layer_number=layer_number,
-            tp_group=tp_group,
-        )
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """
-        For each gemm, sharding along axis 0, bias sharded.
-        Assume sharded_offsets[-1] is the expert parallel offset.
-        """
-        tp_axis_map = {}
-        for gemm_idx in range(self.num_gemms):
-            tp_axis_map.update({f"{gemm_idx}.weight": 0, f"{gemm_idx}.bias": 0})
-        return super()._sharded_state_dict_grouped(tp_axis_map, prefix, sharded_offsets, metadata)
-
-
-class KitchenRowParallelGroupedLinear(KitchenGroupedLinear):
-    """
-    Wrapper for Kitchen's `GroupedLinear` layer but specialized
-    to row-parallel style.
-    """
-
-    def __init__(
-        self,
-        num_gemms: int,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        tp_comm_buffer_name: Optional[str] = None,
-        layer_number: Optional[int] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        super().__init__(
-            num_gemms=num_gemms,
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="row",
-            config=config,
-            init_method=(init_method if config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            is_expert=is_expert,
-            tp_comm_buffer_name=tp_comm_buffer_name,
-            layer_number=layer_number,
-            tp_group=tp_group,
-        )
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """
-        For each gemm, sharding along axis 1, bias not sharded.
-        Assume sharded_offsets[-1] is the expert parallel offset.
-        """
-        tp_axis_map = {f"{gemm_idx}.weight": 1 for gemm_idx in range(self.num_gemms)}
-        return super()._sharded_state_dict_grouped(tp_axis_map, prefix, sharded_offsets, metadata)
-
-
-class KitchenLayerNormColumnParallelLinear(nvidia_kitchen.LayerNormLinear):
-    """
-    Wrapper for Kitchen's `LayerNormLinear` layer that combines
-    layernorm and linear layers
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: TransformerConfig,
-        init_method: Callable,
-        gather_output: bool,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        skip_weight_param_allocation: bool = False,
-        layer_number: Optional[int] = None,
-        tp_comm_buffer_name: Optional[str] = None,
-        tp_group: Optional[torch.distributed.ProcessGroup] = None,
-    ):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        self.config = config
-
-        if gather_output:
-            raise ValueError("Kitchen linear layers do not support gather_output = True")
-
-        if is_expert:
-            raise ValueError("Kitchen linear layers do not yet support MoE")
-
-        if skip_weight_param_allocation:
-            raise ValueError("Kitchen linear layers do not support skip_weight_param_allocation")
-
-        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        # Kitchen returns a zero length Tensor when bias=False and
-        # return_bias=True, but we prefer None.  So in that case we
-        # tell Kitchen to not return the bias, and return None
-        # ourselves. This way our forward always returns two values
-        # and we don't have to deal with the zero length Tensor.
-        self.kitchen_return_bias = skip_bias_add and bias
-        self.is_first_microbatch = True
-        self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
-        self.tp_size = tp_group.size()
-        self.tp_rank = tp_group.rank()
-
-        if self.config.tp_comm_overlap:
-            raise ValueError("Kitchen LayerNormLinear does not support tp_comm_overlap")
-
-        if self.config.symmetric_ar_type is not None:
-            raise ValueError("Kitchen LayerNormLinear does not support symmetric all-reduce")
-
-        if config.use_cpu_initialization:
-            raise ValueError("Kitchen extension does not support use_cpu_initialization")
-
-        # Stash parameters for finish_init.
-        self.stashed_input_size = input_size
-        self.stashed_output_size = output_size
-        self.stashed_init_method = init_method
-        self.stashed_gather_output = gather_output
-        self.stashed_bias = bias
-        self.stashed_skip_bias_add = skip_bias_add
-        self.stashed_is_expert = is_expert
-        self.stashed_skip_weight_param_allocation = skip_weight_param_allocation
-        self.stashed_layer_number = layer_number
-        self.stashed_tp_comm_buffer_name = tp_comm_buffer_name
-        self.stashed_tp_group = tp_group
-        self.init_finished = False
-
-    def finish_init(self, quantization_config: QuantizationConfig) -> None:
-        """Required post-init of quantization configuration."""
-        # Restore parameters from stash
-        input_size = self.stashed_input_size
-        output_size = self.stashed_output_size
-        init_method = self.stashed_init_method
-        gather_output = self.stashed_gather_output
-        bias = self.stashed_bias
-        skip_bias_add = self.stashed_skip_bias_add
-        is_expert = self.stashed_is_expert
-        skip_weight_param_allocation = self.stashed_skip_weight_param_allocation
-        layer_number = self.stashed_layer_number
-        tp_comm_buffer_name = self.stashed_tp_comm_buffer_name
-        tp_group = self.stashed_tp_group
-
-        extra_kwargs = _get_extra_kitchen_kwargs(self.config)
-        extra_kwargs["normalization"] = self.config.normalization
-        self.kitchen_quant_params = KitchenQuantizationParams.parse_from_config(quantization_config)
-        assert self.kitchen_quant_params.qlinear_params is not None
-        extra_kwargs["qlinear_params"] = self.kitchen_quant_params.qlinear_params
-        extra_kwargs["ub_name"] = tp_comm_buffer_name
-
-        super().__init__(
-            in_features=input_size,
-            out_features=output_size,
-            eps=self.config.layernorm_epsilon,
-            sequence_parallel=self.config.sequence_parallel,
-            fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-            tp_group=tp_group if torch.distributed.is_initialized() else None,
-            tp_size=self.config.tensor_model_parallel_size,
-            get_rng_state_tracker=(
-                get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
-            ),
-            init_method=(init_method if self.config.perform_initialization else (lambda w: None)),
-            bias=bias,
-            return_bias=self.kitchen_return_bias,
-            parallel_mode="column",
-            return_layernorm_output=False,
-            zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
-            layer_number=layer_number,
-            **extra_kwargs,
-        )
-        del self.stashed_input_size
-        del self.stashed_output_size
-        del self.stashed_init_method
-        del self.stashed_gather_output
-        del self.stashed_bias
-        del self.stashed_skip_bias_add
-        del self.stashed_is_expert
-        del self.stashed_skip_weight_param_allocation
-        del self.stashed_layer_number
-        del self.stashed_tp_comm_buffer_name
-        del self.stashed_tp_group
-        self.init_finished = True
-
-    def forward(self, x):
-        """Forward."""
-        assert self.init_finished
-        _is_first_microbatch = (
-            None if self.disable_parameter_transpose_cache else self.is_first_microbatch
-        )
-        out = super().forward(x, is_first_microbatch=_is_first_microbatch)
-        self.is_first_microbatch = False
-
-        # Kitchen only returns a tuple when return_bias is True, otherwise
-        # it returns a single Tensor, we always want to return two
-        # values regardless of the arguments.
-        if self.kitchen_return_bias:
-            return out
-        return out, None
-
-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Sharding along axis 0, bias sharded"""
-        assert self.init_finished
-        state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
-        )
-
-    def __repr__(self):
-        return (
-            f"{type(self).__name__}(in_features={self.in_features}, "
-            f"out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})"
-        )
-
-
-class KitchenSpecProvider(BackendSpecProvider):
-    """A protocol for providing the submodules used in Spec building."""
-
-    def __init__(self, fallback: BackendSpecProvider):
-        if not HAVE_KITCHEN:
-            raise ImportError(
-                "Kitchen extension requires the nvidia_kitchen package. "
-                "Please install it with `pip install nvidia-kitchen`."
-            )
-
-        self.fallback = fallback
-
-    def column_parallel_linear(self) -> type:
-        """Which column parallel linear module kitchen backend uses"""
-        return KitchenColumnParallelLinear
-
-    def row_parallel_linear(self) -> type:
-        """Which row parallel linear module kitchen backend uses"""
-        return KitchenRowParallelLinear
-
-    def fuse_layernorm_and_linear(self) -> bool:
-        """Does kitchen backend support a single module for layernorm and linear"""
-        # NOTE(kwyss): This is coupled with get_mlp_module_spec_for_backend and
-        # the initialization of TransformerLayerSubmodules such as in
-        # get_gpt_layer_local_spec or get_gpt_layer_with_transformer_engine_spec
-        # where an explicit norm may be provided. Kitchen extension chooses to
-        # match the topology of the fallback with this code.
-        # Arguably, we should pass the info down to get_mlp_module_spec_for_backend
-        # explicitly about whether to include a norm.
-        return self.fallback.fuse_layernorm_and_linear()
-
-    def column_parallel_layer_norm_linear(self) -> Optional[type]:
-        """Which module for sequential layernorm and linear"""
-        return KitchenLayerNormColumnParallelLinear
-
-    def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type:
-        """Which module to use for layer norm"""
-        return self.fallback.layer_norm(rms_norm=rms_norm, for_qk=for_qk)
-
-    def core_attention(self) -> type:
-        """Which module to use for attention"""
-        return self.fallback.core_attention()
-
-    def grouped_mlp_modules(
-        self, moe_use_grouped_gemm: bool, moe_use_legacy_grouped_gemm: bool
-    ) -> Tuple[type, Optional[MLPSubmodules]]:
-        """Which module and submodules to use for grouped mlp"""
-        if moe_use_grouped_gemm and not moe_use_legacy_grouped_gemm:
-            # NOTE: TEGroupedMLP is a bit of a misnomer.
-            # It doesn't strictly require TE except for the GroupedLinear,
-            # which Kitchen also provides an implementation of.
-            return TEGroupedMLP, MLPSubmodules(
-                linear_fc1=KitchenColumnParallelGroupedLinear,
-                linear_fc2=KitchenRowParallelGroupedLinear,
-            )
-        elif moe_use_grouped_gemm:
-            warnings.warn(
-                "The legacy GroupedMLP will be deprecated in Megatron-Core v0.12.0. "
-                "Please update the TransformerEngine to version>=1.7.0 and use TEGroupedMLP."
-            )
-            return GroupedMLP, None
-        else:
-            return SequentialMLP, MLPSubmodules(
-                linear_fc1=KitchenColumnParallelLinear, linear_fc2=KitchenRowParallelLinear
-            )
-
-    def activation_func(self) -> type:
-        """Which module to use for activation function"""
-        return None
+HAVE_KITCHEN = False
+
+from unittest.mock import MagicMock
+
+AutogradFunctionImplementation = MagicMock()
+KitchenSpecProvider = MagicMock()
+
+QAttentionParamsConfigSchema = MagicMock()
+QFlashAttentionParamsConfigSchema = MagicMock()
+QLinearParamsConfigSchema = MagicMock()
+QLinearParams = MagicMock()
+QuantizeRecipe = MagicMock()
+QuantizeRecipeAttnBMM = MagicMock()
+get_qattention_params_from_predefined = MagicMock()
+get_qfa_params_from_recipe_name = MagicMock()
+get_qlinear_params_from_predefined = MagicMock()
+get_qlinear_params_from_qat_params = MagicMock()
+
+KitchenColumnParallelGroupedLinear = MagicMock()
+KitchenColumnParallelLinear = MagicMock()
+KitchenDotProductAttention = MagicMock()
+KitchenFlashAttention = MagicMock()
+KitchenLayerNormColumnParallelLinear = MagicMock()
+KitchenRowParallelGroupedLinear = MagicMock()
+KitchenRowParallelLinear = MagicMock()
+
+# N.B. Kitchen extension is not released publicly.
+# This extension is just a stub.
diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py
index 32f42a079a7..aa0760ec6ea 100644
--- a/megatron/core/extensions/transformer_engine.py
+++ b/megatron/core/extensions/transformer_engine.py
@@ -1,12 +1,14 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import dataclasses
+import enum
 import inspect
 import io
 import os
 import pickle
 import warnings
-from typing import Any, Callable, List, Optional, Tuple
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -16,18 +18,19 @@
 
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
+from megatron.core.enums import Fp4Recipe, Fp8Recipe
 from megatron.core.model_parallel_config import ModelParallelConfig
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.parallel_state import (
+    get_amax_reduction_group,
     get_context_parallel_group,
-    get_expert_data_parallel_rank,
-    get_expert_model_parallel_rank,
-    get_expert_model_parallel_world_size,
     get_hierarchical_context_parallel_groups,
     get_tensor_model_parallel_group,
     get_tensor_model_parallel_world_size,
+    model_parallel_is_initialized,
 )
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.quantization.quant_config import QuantizationConfig
 from megatron.core.tensor_parallel.layers import (
     _initialize_affine_weight_cpu,
     set_tensor_model_parallel_attributes,
@@ -42,6 +45,7 @@
 from megatron.core.transformer.mlp import MLP
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
     is_layer_window_attention,
     make_sharded_tensors_for_checkpoint,
 )
@@ -56,13 +60,237 @@
 
 try:
     import transformer_engine as te
+    from transformer_engine.pytorch.fp8 import FP8GlobalStateManager, fp8_autocast
 
     HAVE_TE = True
 except ImportError:
-    from unittest.mock import MagicMock
+    if TYPE_CHECKING:
+        # For type checking, treat transformer_engine as always available.
+        import transformer_engine as te
+        from transformer_engine.pytorch.fp8 import FP8GlobalStateManager, fp8_autocast
 
-    te = MagicMock()
-    HAVE_TE = False
+        HAVE_TE = True
+    else:
+        from unittest.mock import MagicMock
+
+        te = MagicMock()
+        HAVE_TE = False
+
+_TE_CONFIG_TYPE_KEY = "transformer_engine_config_type"
+
+
+class TransformerEngineConfigType(enum.Enum):
+    """Configuration object types in config dictionary"""
+
+    TEQuantizationParams = "TEQuantizationParams"
+
+
+@dataclasses.dataclass
+class TEQuantizationRecipe:
+    """Class to capture options for opening an autocast context in forward"""
+
+    fp8_quantization_recipe: Optional[Fp8Recipe] = None
+    """
+    An FP8 quantization override if the module should use FP8.
+    If no FP8 or FP4 quantization is configured, the recipe is execution
+    in high-precision (BF16).
+    """
+    fp4_quantization_recipe: Optional[Fp4Recipe] = None
+    """
+    An FP4 quantization override if the module should use FP4.
+    If no FP8 or FP4 quantization is configured, the recipe is execution
+    in high-precision (BF16).
+    """
+    custom_recipe_factory: Optional[str] = None
+    """The path to a custom recipe factory if a custom Fp4 or Fp8 recipe is configured"""
+    fp8_format: str = "e4m3"
+    """A format to select from an FP8Recipe"""
+    override_quantized_autocast: bool = True
+    """
+    If the quantization autocast context for a targeted module is enabled,
+    whether to override it and change (or disable) the quantization recipe.
+    """
+    override_nonquantized_autocast: bool = False
+    """
+    If the quantization autocast context for a targeted module is not enabled,
+    whether to override it and enable a quantization recipe.
+    """
+    tp_only_amax_red: bool = False
+    """
+    If an amax reduction is applicable, such as in per-tensor quantization recipe,
+    whether to reduce only along TP groups.
+    """
+
+    @classmethod
+    def parse_from_config(cls, quant_config: Dict[Any, Any]) -> "TEQuantizationRecipe":
+        """
+        Parse config from quantization dictionary.
+        """
+        kwargs = {}
+        class_keys = cls.get_config_keys()
+        for field in class_keys:
+            if field in quant_config:
+                kwargs[field] = quant_config[field]
+        for field in quant_config:
+            if field not in class_keys:
+                raise ValueError(f"Field '{field}' not valid for this configuration.")
+        instance = TEQuantizationRecipe(**kwargs)
+        if instance.fp8_quantization_recipe == Fp8Recipe.delayed:
+            raise ValueError("Delayed scaling not in scope of te per-module quantization config.")
+        if (
+            instance.fp8_quantization_recipe is not None
+            and instance.fp4_quantization_recipe is not None
+        ):
+            raise ValueError("fp8 and fp4 quantization settings are mutually exclusive.")
+        if (
+            instance.fp8_quantization_recipe == Fp8Recipe.custom
+            or instance.fp4_quantization_recipe == Fp4Recipe.custom
+        ):
+            if instance.custom_recipe_factory is None:
+                raise ValueError("custom fp8 or fp4 recipe requires custom_recipe_factory")
+        return instance
+
+    @classmethod
+    def get_config_keys(cls) -> Set[str]:
+        """Get expected keys from the dataclass fields."""
+        return {field.name for field in dataclasses.fields(cls)}
+
+
+@dataclasses.dataclass
+class TEQuantizationParams:
+    """Class to capture precision options for training and evaluation."""
+
+    training_recipe: TEQuantizationRecipe
+    """Precision override for when self.training is True"""
+    evaluation_recipe: Optional[TEQuantizationRecipe]
+    """
+    Precision override for when self.training is False.
+    If None, training_recipe is used.
+    """
+
+    @staticmethod
+    def parse_from_config(quant_config: QuantizationConfig) -> "TEQuantizationParams":
+        """Parses quantization config for a layer or throw an error."""
+        config = quant_config.config
+        try:
+            config_type = TransformerEngineConfigType(config[_TE_CONFIG_TYPE_KEY])
+        except KeyError:
+            raise ValueError(
+                f"TransformerEngine config dictionary must have '{_TE_CONFIG_TYPE_KEY}' key."
+            )
+        except ValueError:
+            raise ValueError(f"Unsupported config type '{config[_TE_CONFIG_TYPE_KEY]}'.")
+
+        if config_type == TransformerEngineConfigType.TEQuantizationParams:
+            if 'training_recipe' not in config.keys():
+                raise ValueError(
+                    "TransformerEngine config dictionary must have 'training_recipe' key"
+                )
+            training_recipe = TEQuantizationRecipe.parse_from_config(config['training_recipe'])
+            if 'evaluation_recipe' not in config.keys():
+                evaluation_recipe = None
+                assert len(config.keys()) == 2
+            else:
+                evaluation_recipe = TEQuantizationRecipe.parse_from_config(
+                    config['evaluation_recipe']
+                )
+                assert len(config.keys()) == 3
+            return TEQuantizationParams(
+                training_recipe=training_recipe, evaluation_recipe=evaluation_recipe
+            )
+        else:
+            raise NotImplementedError(f"Unhandled configuration type {config_type}")
+
+
+def _get_fp8_autocast_for_quant_recipe(qrecipe: TEQuantizationRecipe):
+    if FP8GlobalStateManager.is_fp8_enabled():
+        if not qrecipe.override_quantized_autocast:
+            return nullcontext()
+    else:
+        if not qrecipe.override_nonquantized_autocast:
+            return nullcontext()
+
+    if qrecipe.fp8_quantization_recipe is None and qrecipe.fp4_quantization_recipe is None:
+        # Force BF16 for this layer and override autocast
+        return fp8_autocast(enabled=False)
+    else:
+        amax_group = None
+        if model_parallel_is_initialized():
+            amax_group = get_amax_reduction_group(
+                with_context_parallel=True, tp_only_amax_red=qrecipe.tp_only_amax_red
+            )
+        if (
+            qrecipe.fp8_quantization_recipe == Fp8Recipe.custom
+            or qrecipe.fp4_quantization_recipe == Fp4Recipe.custom
+        ):
+            from megatron.core.fp8_utils import _get_custom_recipe
+
+            assert qrecipe.custom_recipe_factory is not None
+            quant_recipe = _get_custom_recipe(qrecipe.custom_recipe_factory)
+        elif qrecipe.fp8_quantization_recipe is not None:
+            if qrecipe.fp8_format == "e4m3":
+                fp8_format = te.common.recipe.Format.E4M3
+            elif qrecipe.fp8_format == "hybrid":
+                fp8_format = te.common.recipe.Format.HYBRID
+            else:
+                raise ValueError(f"Unhandled fp8_format {qrecipe.fp8_format}")
+
+            if qrecipe.fp8_quantization_recipe == Fp8Recipe.tensorwise:
+                quant_recipe = te.common.recipe.Float8CurrentScaling(fp8_format=fp8_format)
+            elif qrecipe.fp8_quantization_recipe == Fp8Recipe.blockwise:
+                quant_recipe = te.common.recipe.Float8BlockScaling(fp8_format=fp8_format)
+            elif qrecipe.fp8_quantization_recipe == Fp8Recipe.mxfp8:
+                quant_recipe = te.common.recipe.MXFP8BlockScaling(fp8_format=fp8_format)
+            else:
+                raise ValueError(f"Unhandled fp8 recipe: {qrecipe.fp8_quantization_recipe}")
+        else:
+            # Fp4 configured.
+            if qrecipe.fp4_quantization_recipe == Fp4Recipe.nvfp4:
+                quant_recipe = te.common.recipe.NVFP4BlockScaling()
+            else:
+                raise ValueError(f"Unhandled fp4 recipe: {qrecipe.fp8_quantization_recipe}")
+
+        return fp8_autocast(enabled=True, fp8_recipe=quant_recipe, fp8_group=amax_group)
+
+
+def _get_fp8_autocast_for_quant_params(qparams: TEQuantizationParams | None, training: bool):
+    if qparams is None:
+        return nullcontext()
+    elif not training and qparams.evaluation_recipe is not None:
+        return _get_fp8_autocast_for_quant_recipe(qparams.evaluation_recipe)
+    else:
+        return _get_fp8_autocast_for_quant_recipe(qparams.training_recipe)
+
+
+def _get_should_context_be_quantized_recipe(
+    qrecipe: TEQuantizationRecipe, is_original_context_quantized: bool
+):
+    if is_original_context_quantized:
+        if not qrecipe.override_quantized_autocast:
+            return is_original_context_quantized
+    else:
+        if not qrecipe.override_nonquantized_autocast:
+            return is_original_context_quantized
+    if qrecipe.fp8_quantization_recipe is None and qrecipe.fp4_quantization_recipe is None:
+        # Force BF16 for this layer and override autocast
+        return False
+    else:
+        return True
+
+
+def _get_should_context_be_quantized_params(
+    qparams: TEQuantizationParams | None, training: bool, is_context_quantized: bool
+):
+    if qparams is None:
+        return is_context_quantized
+    elif not training and qparams.evaluation_recipe is not None:
+        return _get_should_context_be_quantized_recipe(
+            qparams.evaluation_recipe, is_context_quantized
+        )
+    else:
+        return _get_should_context_be_quantized_recipe(
+            qparams.training_recipe, is_context_quantized
+        )
 
 
 def _get_extra_te_kwargs(config: TransformerConfig):
@@ -299,6 +527,7 @@ def __init__(
                 extra_kwargs["delay_wgrad_compute"] = self.config.delay_wgrad_compute
             else:
                 raise RuntimeError("Only TE with version >=2.3.0 supports delay_wgrad_compute now.")
+
         if (
             self.config.tp_comm_overlap
             and tp_comm_buffer_name
@@ -312,7 +541,7 @@ def __init__(
             )
 
         if is_te_min_version("0.8.0"):
-            if self.config.tp_comm_overlap:
+            if self.config.tp_comm_overlap and parallel_mode != "duplicated":
                 if is_te_min_version("1.5.0"):
                     # Use old overlap flags if they were supplied instead
                     extra_kwargs["ub_overlap_ag"] = (
@@ -370,9 +599,10 @@ def __init__(
             extra_kwargs["rng_tracker_name"] = rng_tracker_name
 
         te_parallel_mode = parallel_mode
+        tp_group_for_te = tp_group
         if parallel_mode == "duplicated":
             # Handle non-parallel case
-            tp_group = None
+            tp_group_for_te = None
             tp_size = 1
             explicit_expert_comm = False
             te_parallel_mode = None
@@ -387,7 +617,7 @@ def __init__(
                     input_size = divide(input_size, tp_size)
                 te_parallel_mode = None
                 tp_size = 1
-                tp_group = None
+                tp_group_for_te = None
 
         super().__init__(
             in_features=input_size,
@@ -395,7 +625,7 @@ def __init__(
             sequence_parallel=self.config.sequence_parallel,
             fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
             # Pass None if not initialized for backward compatibility with the ckpt converter.
-            tp_group=tp_group if torch.distributed.is_initialized() else None,
+            tp_group=tp_group_for_te if torch.distributed.is_initialized() else None,
             tp_size=tp_size,
             get_rng_state_tracker=(
                 get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
@@ -406,6 +636,7 @@ def __init__(
             parallel_mode=te_parallel_mode,
             **extra_kwargs,
         )
+        self.te_quant_params: Optional[TEQuantizationParams] = None
 
         for param in self.parameters():
             if is_expert:
@@ -419,12 +650,31 @@ def __init__(
                     # duplicated across TP ranks
                     setattr(param, "sequence_parallel", self.config.sequence_parallel)
 
+        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
+
+    def finish_init(self, quantization_config: QuantizationConfig):
+        """Post-init of quantization override"""
+        if quantization_config is None:
+            self.te_quant_params = None
+        else:
+            self.te_quant_params = TEQuantizationParams.parse_from_config(quantization_config)
+
+    def will_execute_quantized(self, is_context_quantized: bool) -> bool:
+        """Returns whether the module is configured to execute quantized."""
+        return _get_should_context_be_quantized_params(
+            self.te_quant_params, self.training, is_context_quantized
+        )
+
     def forward(self, x):
         """Forward."""
         _is_first_microbatch = (
             None if self.disable_parameter_transpose_cache else self.is_first_microbatch
         )
-        out = super().forward(x, is_first_microbatch=_is_first_microbatch)
+        quant_context = _get_fp8_autocast_for_quant_params(self.te_quant_params, self.training)
+
+        with quant_context:
+            out = super().forward(x, is_first_microbatch=_is_first_microbatch)
         self.is_first_microbatch = False
 
         # TE only returns a tuple when return_bias is True, otherwise
@@ -443,7 +693,14 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
             self.parallel_mode is None
         ), "TELinear sharded_state_dict can only be used with duplicated parallel mode"
         state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(state_dict, prefix, None, sharded_offsets)
+        return make_sharded_tensors_for_checkpoint(
+            state_dict,
+            prefix,
+            None,
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
+        )
 
     def backward_dw(self):
         """Compute weight gradients during the backward pass if delay_wgrad_compute is enabled."""
@@ -469,6 +726,7 @@ def __init__(
         skip_weight_param_allocation: bool = False,
         tp_comm_buffer_name: Optional[str] = None,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
+        stride: int = 1,
     ):
         if not HAVE_TE:
             raise ImportError(
@@ -491,6 +749,7 @@ def __init__(
 
         # TODO: For backward compatibility, remove in v0.15.
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
 
         # TE returns a zero length Tensor when bias=False and
         # return_bias=True, but we prefer None.  So in that case we
@@ -559,6 +818,8 @@ def __init__(
             ), "Must have at least TE version 2.3 or higher to use symmetric memory all reduce"
             extra_kwargs["symmetric_ar_type"] = self.config.symmetric_ar_type
 
+        self.stride = stride
+
         super().__init__(
             in_features=input_size,
             out_features=output_size,
@@ -582,6 +843,12 @@ def __init__(
             zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
             **extra_kwargs,
         )
+        self.te_quant_params: Optional[TEQuantizationParams] = None
+
+        # Set proper partition_stride
+        setattr(self.weight, 'partition_stride', stride)
+        if bias and hasattr(self, 'bias') and self.bias is not None:
+            setattr(self.bias, 'partition_stride', stride)
 
         if config.use_cpu_initialization:
             output_size_per_partition = divide(output_size, self.tp_size)
@@ -592,7 +859,7 @@ def __init__(
                 output_size_per_partition,
                 0,
                 init_method=condition_init_method(config, init_method),
-                stride=1,
+                stride=stride,
                 return_master_weight=False,
                 rank=self.tp_rank,
                 world_size=self.tp_size,
@@ -602,17 +869,34 @@ def __init__(
                 self.bias = Parameter(
                     torch.empty(output_size_per_partition, dtype=config.params_dtype)
                 )
-                set_tensor_model_parallel_attributes(self.bias, True, 0, 1)
+                set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
                 with torch.no_grad():
                     self.bias.zero_()
                 setattr(self.bias, "allreduce", True)
 
+    def finish_init(self, quantization_config: QuantizationConfig):
+        """Post-init of quantization override"""
+        if quantization_config is None:
+            self.te_quant_params = None
+        else:
+            self.te_quant_params = TEQuantizationParams.parse_from_config(quantization_config)
+
+    def will_execute_quantized(self, is_context_quantized: bool) -> bool:
+        """Returns whether the module is configured to execute quantized."""
+        return _get_should_context_be_quantized_params(
+            self.te_quant_params, self.training, is_context_quantized
+        )
+
     def forward(self, x):
         """Forward."""
         _is_first_microbatch = (
             None if self.disable_parameter_transpose_cache else self.is_first_microbatch
         )
-        out = super().forward(x, is_first_microbatch=_is_first_microbatch)
+        quant_context = _get_fp8_autocast_for_quant_params(self.te_quant_params, self.training)
+
+        with quant_context:
+            out = super().forward(x, is_first_microbatch=_is_first_microbatch)
+
         self.is_first_microbatch = False
 
         # TE only returns a tuple when return_bias is True, otherwise
@@ -624,9 +908,15 @@ def forward(self, x):
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias sharded"""
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0, "bias": 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )
 
     def __repr__(self):
@@ -659,6 +949,7 @@ def __init__(
         skip_weight_param_allocation: bool = False,
         tp_comm_buffer_name: Optional[str] = None,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
+        stride: int = 1,
     ):
         if not HAVE_TE:
             raise ImportError(
@@ -669,8 +960,10 @@ def __init__(
         if gather_output:
             raise ValueError("Transformer Engine linear layers do not support gather_output = True")
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
         world_size = get_pg_size(tp_group)
         rank = get_pg_rank(tp_group)
+        self.stride = stride
 
         super().__init__(
             input_size=input_size,
@@ -691,6 +984,11 @@ def __init__(
             tp_group=tp_group,
         )
 
+        # Set proper partition_stride
+        setattr(self.weight, 'partition_stride', stride)
+        if bias and hasattr(self, 'bias') and self.bias is not None:
+            setattr(self.bias, 'partition_stride', stride)
+
         if config.use_cpu_initialization:
             output_size_per_partition = divide(output_size, world_size)
             _ = _initialize_affine_weight_cpu(
@@ -700,7 +998,7 @@ def __init__(
                 output_size_per_partition,
                 0,
                 init_method=condition_init_method(config, init_method),
-                stride=1,
+                stride=stride,
                 return_master_weight=False,
                 rank=rank,
                 world_size=world_size,
@@ -710,7 +1008,7 @@ def __init__(
                 self.bias = Parameter(
                     torch.empty(output_size_per_partition, dtype=config.params_dtype)
                 )
-                set_tensor_model_parallel_attributes(self.bias, True, 0, 1)
+                set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
                 with torch.no_grad():
                     self.bias.zero_()
                 setattr(self.bias, "allreduce", True)
@@ -719,7 +1017,12 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0, "bias": 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )
 
     def __repr__(self):
@@ -763,6 +1066,7 @@ def __init__(
                 "Transformer Engine linear layers do not support input_is_parallel = False"
             )
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
 
         super().__init__(
             input_size=input_size,
@@ -813,7 +1117,12 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 1, bias not sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 1}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 1},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )
 
     def __repr__(self):
@@ -849,8 +1158,9 @@ def __init__(
         softmax_scale: Optional[float] = None,
         k_channels: Optional[int] = None,
         v_channels: Optional[int] = None,
-        cp_comm_type: str = "p2p",
-        pg_collection: ProcessGroupCollection = None,
+        num_splits: Optional[int] = None,
+        cp_comm_type: Optional[str] = "p2p",
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
         if not HAVE_TE:
             raise ImportError(
@@ -861,6 +1171,10 @@ def __init__(
         self.config = config
         self.te_forward_mask_type = False
         self.qkv_format: str = "sbhd"
+        # Default to 1 split when batch-invariant mode is enabled, unless explicitly overridden
+        self.num_splits: Optional[int] = (
+            1 if (num_splits is None and self.config.batch_invariant_mode) else num_splits
+        )
 
         if self.config.apply_query_key_layer_scaling != bool(
             int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))
@@ -900,6 +1214,7 @@ def __init__(
                 assert hasattr(
                     pg_collection, "hcp"
                 ), "TEDotProductAttention pg_collection must have hierarchical cp pg"
+        self._tp_group = pg_collection.tp
 
         if is_te_min_version("0.10.0"):
             extra_kwargs["attention_type"] = attention_type
@@ -975,6 +1290,7 @@ def __init__(
         self.kept_packed_seq_params = set(
             field.name for field in dataclasses.fields(PackedSeqParams)
         )
+
         if get_te_version() < PkgVersion("1.3.0"):
             # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
             # copies (#555)
@@ -989,6 +1305,14 @@ def __init__(
             self.kept_packed_seq_params.discard("cu_seqlens_q_padded")
             self.kept_packed_seq_params.discard("cu_seqlens_kv_padded")
 
+        if config.qk_clip or config.log_max_attention_logit:
+            # qk-clip is only supported in TE 2.9.0 and later
+            assert is_te_min_version("2.9.0"), "qk-clip is only supported in TE 2.9.0 and later"
+
+            # TE 2.9.0 introduces return_max_logit for qk-clip getting the max attention logits
+            extra_kwargs["return_max_logit"] = True
+            self.current_max_attn_logits = None
+
         super().__init__(
             num_attention_heads=self.config.num_attention_heads,
             kv_channels=kv_channels,
@@ -1011,12 +1335,41 @@ def forward(
         query: Tensor,
         key: Tensor,
         value: Tensor,
-        attention_mask: Tensor,
+        attention_mask: Optional[Tensor],
         attn_mask_type: AttnMaskType,
-        attention_bias: Tensor = None,
-        packed_seq_params: PackedSeqParams = None,
-    ):
+        attention_bias: Optional[Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        num_splits: Optional[int] = None,
+    ) -> torch.Tensor:
         """Forward."""
+        if packed_seq_params is not None:
+            # If Dynamic CP group is provided, update TE DPA CP group
+            if packed_seq_params.cp_group is not None:
+                self.cp_group = packed_seq_params.cp_group
+                super().set_context_parallel_group(
+                    self.cp_group,
+                    torch.distributed.get_process_group_ranks(self.cp_group),
+                    TEDotProductAttention.cp_stream,
+                    self.cp_comm_type,
+                )
+            # If cp_group is None but local_cp_size is provided,
+            # Indicates to turn off CP dynamically
+            elif packed_seq_params.local_cp_size is not None:
+                assert (
+                    packed_seq_params.local_cp_size == 1
+                ), "local_cp_size must be == 1 if provided without cp_group"
+                super().set_context_parallel_group(None, None, None, self.cp_comm_type)
+            self.kept_packed_seq_params.discard("cp_group")
+            self.kept_packed_seq_params.discard("local_cp_size")
+
+        # Default to constructor-provided num_splits unless explicitly overridden
+        if num_splits is None:
+            num_splits = self.num_splits
+        if num_splits is not None:
+            assert is_te_min_version("2.10.0"), (
+                f"Transformer-Engine v{get_te_version()} must be >= 2.10.0 to support" "num_splits."
+            )
+
         packed_seq_kwargs = (
             {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params}
             if packed_seq_params is not None
@@ -1049,19 +1402,34 @@ def forward(
                     attn_mask_type = AttnMaskType.padding_causal
                 elif attn_mask_type == AttnMaskType.no_mask:
                     attn_mask_type = AttnMaskType.padding
-            core_attn_out = super().forward(
-                query,
-                key,
-                value,
-                attention_mask,
-                attn_mask_type=attn_mask_type.name,
-                **attention_bias_kwargs,
-                **packed_seq_kwargs,
+            _fa_kwargs = dict(
+                attn_mask_type=attn_mask_type.name, **attention_bias_kwargs, **packed_seq_kwargs
             )
+            if num_splits is not None:
+                _fa_kwargs["num_splits"] = num_splits
+
+            core_attn_out = super().forward(query, key, value, attention_mask, **_fa_kwargs)
+
+            if self.config.qk_clip or self.config.log_max_attention_logit:
+                # qk-clip is only supported in TE 2.9.0 and later
+                assert is_te_min_version("2.9.0"), "qk-clip is only supported in TE 2.9.0 and later"
+
+                # Update Q K outside of TE Attention API
+                core_attn_out, batch_max_attention_logits = core_attn_out
+
+                # Update QK_Clip balancing eta
+                if self.current_max_attn_logits is None:
+                    self.current_max_attn_logits = batch_max_attention_logits
+                else:
+                    self.current_max_attn_logits = torch.max(
+                        self.current_max_attn_logits, batch_max_attention_logits
+                    )
+
         else:
-            core_attn_out = super().forward(
-                query, key, value, attention_mask, **attention_bias_kwargs, **packed_seq_kwargs
-            )
+            _fa_kwargs = dict(**attention_bias_kwargs, **packed_seq_kwargs)
+            if num_splits is not None:
+                _fa_kwargs["num_splits"] = num_splits
+            core_attn_out = super().forward(query, key, value, attention_mask, **_fa_kwargs)
 
         return core_attn_out
 
@@ -1077,7 +1445,12 @@ def sharded_state_dict(
         else:
             state_dict = {}
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {'softmax_offset': 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {'softmax_offset': 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )
 
 
@@ -1105,7 +1478,7 @@ def __init__(
             skip_bias_add: bool,
             is_expert: bool = False,
             tp_comm_buffer_name: Optional[str] = None,
-            tp_group: Optional[torch.distributed.ProcessGroup] = None,
+            pg_collection: Optional[ProcessGroupCollection] = None,
         ):
             self.config = config
 
@@ -1136,8 +1509,14 @@ def __init__(
 
             # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
             # So we disable comms by making TE agnostic of model parallel.
-            tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+            if pg_collection is None:
+                pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+            self._pg_collection = pg_collection
+            assert is_expert, "TEGroupedLinear only supports expert parallelism"
+            tp_group = pg_collection.expt_tp
+            self._tp_group = tp_group
             tp_size = get_pg_size(tp_group)
+            tp_group_for_te = tp_group
 
             self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
 
@@ -1148,7 +1527,7 @@ def __init__(
                     input_size = divide(input_size, tp_size)
                 parallel_mode = None
                 tp_size = 1
-                tp_group = None
+                tp_group_for_te = None
 
             super().__init__(
                 num_gemms=num_gemms,
@@ -1156,7 +1535,7 @@ def __init__(
                 out_features=output_size,
                 sequence_parallel=self.config.sequence_parallel,
                 fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-                tp_group=tp_group if torch.distributed.is_initialized() else None,
+                tp_group=tp_group_for_te if torch.distributed.is_initialized() else None,
                 tp_size=tp_size,
                 get_rng_state_tracker=(
                     get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
@@ -1167,7 +1546,7 @@ def __init__(
                 parallel_mode=parallel_mode,
                 **extra_kwargs,
             )
-
+            self.te_quant_params: Optional[TEQuantizationParams] = None
             for param in self.parameters():
                 setattr(param, "allreduce", not (is_expert and self.expert_parallel))
 
@@ -1261,12 +1640,28 @@ def merge_extra_states(
 
             self._register_load_state_dict_pre_hook(merge_extra_states, with_module=True)
 
+        def finish_init(self, quantization_config: QuantizationConfig):
+            """Post-init of quantization override"""
+            if quantization_config is None:
+                self.te_quant_params = None
+            else:
+                self.te_quant_params = TEQuantizationParams.parse_from_config(quantization_config)
+
+        def will_execute_quantized(self, is_context_quantized: bool) -> bool:
+            """Returns whether the module is configured to execute quantized."""
+            return _get_should_context_be_quantized_params(
+                self.te_quant_params, self.training, is_context_quantized
+            )
+
         def forward(self, x, m_splits):
             """Forward."""
             _is_first_microbatch = (
                 None if self.disable_parameter_transpose_cache else self.is_first_microbatch
             )
-            out = super().forward(x, m_splits, is_first_microbatch=_is_first_microbatch)
+            quant_context = _get_fp8_autocast_for_quant_params(self.te_quant_params, self.training)
+
+            with quant_context:
+                out = super().forward(x, m_splits, is_first_microbatch=_is_first_microbatch)
             self.is_first_microbatch = False
 
             # TE only returns a tuple when return_bias is True, otherwise
@@ -1349,8 +1744,8 @@ def _sharded_state_dict_grouped(
             singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
             sharded_state_dict = {}
             full_state_dict = self.state_dict(prefix="", keep_vars=True)
-            num_global_experts = get_expert_model_parallel_world_size() * self.num_gemms
-            local_expert_indices_offset = get_expert_model_parallel_rank() * self.num_gemms
+            num_global_experts = get_pg_size(self._pg_collection.ep) * self.num_gemms
+            local_expert_indices_offset = get_pg_rank(self._pg_collection.ep) * self.num_gemms
             ep_axis = len(sharded_offsets)
             extra_states = self._split_extra_state(full_state_dict["_extra_state"])
             for gemm_idx in range(self.num_gemms):
@@ -1371,7 +1766,12 @@ def _sharded_state_dict_grouped(
                         (ep_axis, global_expert_idx, num_global_experts),
                     )
                 sub_sd = make_sharded_tensors_for_checkpoint(
-                    state_dict, '', tp_axis_map, new_sharded_offsets
+                    state_dict,
+                    '',
+                    tp_axis_map,
+                    new_sharded_offsets,
+                    tp_group=self._tp_group,
+                    dp_cp_group=metadata["dp_cp_group"],
                 )
                 # Remove expert layers indexing from sharded keys
                 replace_prefix_for_sharding(sub_sd, f"{gemm_idx}.", expert_prefix)
@@ -1394,7 +1794,7 @@ def _sharded_state_dict_grouped(
                 if getattr(sh_ten, "is_data_parallel_fully_shard", False):
                     edp_replica_id = 0
                 else:
-                    edp_replica_id = get_expert_data_parallel_rank()
+                    edp_replica_id = get_pg_rank(self._pg_collection.expt_dp)
                 sh_ten.replica_id = (*replica_id[:2], edp_replica_id)
             return sharded_state_dict
 
@@ -1424,7 +1824,7 @@ def __init__(
             skip_bias_add: bool,
             is_expert: bool,
             tp_comm_buffer_name: Optional[str] = None,
-            tp_group: Optional[torch.distributed.ProcessGroup] = None,
+            pg_collection: Optional[ProcessGroupCollection] = None,
         ):
             super().__init__(
                 num_gemms=num_gemms,
@@ -1437,7 +1837,7 @@ def __init__(
                 skip_bias_add=skip_bias_add,
                 is_expert=is_expert,
                 tp_comm_buffer_name=tp_comm_buffer_name,
-                tp_group=tp_group,
+                pg_collection=pg_collection,
             )
 
         def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
@@ -1470,7 +1870,7 @@ def __init__(
             skip_bias_add: bool,
             is_expert: bool,
             tp_comm_buffer_name: Optional[str] = None,
-            tp_group: Optional[torch.distributed.ProcessGroup] = None,
+            pg_collection: Optional[ProcessGroupCollection] = None,
         ):
             super().__init__(
                 num_gemms=num_gemms,
@@ -1483,7 +1883,7 @@ def __init__(
                 skip_bias_add=skip_bias_add,
                 is_expert=is_expert,
                 tp_comm_buffer_name=tp_comm_buffer_name,
-                tp_group=tp_group,
+                pg_collection=pg_collection,
             )
 
         def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
@@ -1761,7 +2161,7 @@ def forward_post_hook(module, *_) -> None:
                     "TEFusedMLP module does not support submodules with post-backward hooks"
                 )
 
-        def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:
+        def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]:
             """Forward."""
 
             # Construct fused impl if needed
@@ -2116,3 +2516,18 @@ def set_save_original_input(module):
             "set_save_original_input is only needed on transformer-engine modules that save "
             "quantized tensors by default. It needs transformer-engine>=2.6.0dev0."
         )
+
+
+try:
+    # pylint: disable=unused-import
+    from transformer_engine.pytorch import cpu_offload_v1 as cpu_offload
+except ImportError:
+    try:
+        from transformer_engine.pytorch import cpu_offload
+    except ImportError:
+        cpu_offload = None
+try:
+    # pylint: disable=unused-import
+    from transformer_engine.pytorch.float8_tensor import Float8Tensor
+except ImportError:
+    Float8Tensor = None
diff --git a/megatron/core/fp4_utils.py b/megatron/core/fp4_utils.py
index 4f9e7e5d026..42e7f875826 100644
--- a/megatron/core/fp4_utils.py
+++ b/megatron/core/fp4_utils.py
@@ -61,13 +61,23 @@ def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int:
     Note that since we are also random hadamard transform for NVFP4 training, we want
     fused group nvfp4 quantize plus hadamard transform. Hadamard transform will leverage
     tensor core instructions for better performance, while group quantize kernels also
-    prefer a more aligned size in token dimension M. Therefore, we apply align size 64
-    here for better performance in MOE.
+    prefer a more aligned size in token dimension M. The efficiently leverage grouped
+    kernels, padding needs to be 64 multiple, but 128 multiple will bring even faster.
+
+    When it comes to MOE cuda graph support, the number of tokens for each expert should
+    be a buffer on device memory, which means that we don't know the token dimension for
+    each expertin host, therefore we cannot calculate the zero padded scaling factors shape
+    on host to comply with the NVFP4 GEMM scaling factor layout. However, if we have already
+    zero padded the tokens to 128 multiple, then there is no need for such padding, so that
+    host doesn't need to copy the token distribution from device to host (which will break
+    the CUDA graph).
 
     Paper link: https://arxiv.org/pdf/2509.25149
+    Scaling factor layout: https://docs.nvidia.com/cuda/cublas/#d-block-scaling-factors-layout
+    TE NVFP4 Grouped Quantization: https://github.com/NVIDIA/TransformerEngine/pull/2411
     """
     # pylint: disable=unused-argument
-    return 64
+    return 128
 
 
 def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor:
diff --git a/megatron/core/fp8_utils.py b/megatron/core/fp8_utils.py
index 1c52e965cd7..fa6be91dfbf 100644
--- a/megatron/core/fp8_utils.py
+++ b/megatron/core/fp8_utils.py
@@ -571,6 +571,7 @@ def get_fp8_recipe(config: TransformerConfig):
                     fp8_format=fp8_format
                 )
             elif config.fp8_recipe == Fp8Recipe.custom:
+                assert config.fp8_quantizer_factory is not None
                 fp8_recipe = _get_custom_recipe(config.fp8_quantizer_factory)
             else:
                 raise ValueError(
@@ -689,8 +690,13 @@ def _wrap_te_linear_for_padding(module: torch.nn.Module):
 
         @wraps(original_forward)
         def padded_forward(input_tensor, *args, **kwargs):
-            # Only do padding for fp8 if we are in fp8 context
-            if not FP8GlobalStateManager.is_fp8_enabled():
+            is_context_quantized = FP8GlobalStateManager.is_fp8_enabled()
+            if hasattr(module, "will_execute_quantized"):
+                module_uses_quant = module.will_execute_quantized(is_context_quantized)
+            else:
+                module_uses_quant = is_context_quantized
+            # Only do padding for fp8 if we are in fp8 or fp4 context
+            if not module_uses_quant:
                 return original_forward(input_tensor, *args, **kwargs)
 
             # With sequence parallelism we need to all-gather before padding
diff --git a/megatron/core/full_cuda_graph.py b/megatron/core/full_cuda_graph.py
index 22d56511852..7c11195f33b 100644
--- a/megatron/core/full_cuda_graph.py
+++ b/megatron/core/full_cuda_graph.py
@@ -179,7 +179,7 @@ def __call__(self, *args, **kwargs):
                 )
             torch.cuda.synchronize()
             torch.distributed.barrier()
-            logger.info(f'CUDA graph capture done!!!')
+            logger.info(f'CUDA graph capture done for {training_str}!!!')
 
         if FullCudaGraphWrapper.cuda_graph[training_str] is None:
             FullCudaGraphWrapper.result[training_str] = self.forward_backward_func(*args, **kwargs)
diff --git a/megatron/core/hyper_comm_grid.py b/megatron/core/hyper_comm_grid.py
index dce2aa16a7f..f624ba7bdb3 100644
--- a/megatron/core/hyper_comm_grid.py
+++ b/megatron/core/hyper_comm_grid.py
@@ -158,9 +158,11 @@ def create_pg(self, dims: Union[str, list[str]], **kwargs: Any) -> dist.ProcessG
         rank_enum = self._gen_rank_enum(ordered_dims)
         pg, _ = dist.new_subgroups_by_enumeration(rank_enum, backend=self.backend, **kwargs)
 
-        logging.info(f"Generated process group for {unique_group_key} with enumeration {rank_enum}")
+        if dist.get_rank() == 0:
+            logging.info(
+                f"Generated process group for {unique_group_key} with enumeration {rank_enum}"
+            )
         self._pgs[unique_group_key] = pg
-
         return pg
 
     def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup:
@@ -178,6 +180,22 @@ def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup:
 
         return self._pgs[unique_group_key]
 
+    def get_rank_enum(self, dims: Union[str, list[str]]) -> list[list[int]]:
+        r"""Get the rank enumeration for the requested dimension(s).
+
+        This is the exact enumeration that would be used by create_pg for the same
+        dims. It is useful for creating additional groups whose membership is derived from
+        the grid (e.g., embedding/position-embedding groups derived from PP groups).
+
+        Args:
+            dims: Dimension name or list of dimension names.
+
+        Returns:
+            List of rank lists (one per subgroup).
+        """
+        ordered_dims, _ = self._order_dims(dims)
+        return self._gen_rank_enum(ordered_dims)
+
     def _gen_rank_enum(self, dims: list[str]) -> list[list[int]]:
         r"""Generate rank enumeration before calling new_subgroups_by_enumeration
 
diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 637092b468e..e6ea32a6df8 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -12,6 +12,10 @@
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
+import torch
+
+from megatron.core.utils import get_pg_size
+
 
 @dataclass(order=True, frozen=True)
 class InferenceBatchDimensions:
@@ -21,6 +25,7 @@ class InferenceBatchDimensions:
         token_count : number of total input tokens
         prefill_req_count : number of prefill requests
         decode_req_count : number of decode requests
+        has_explicit_chunked_prefill_req : whether the batch has an explicit chunked prefill request
 
     The batch dimensions are ordered by token_count, then by prefill_req_count,
     then by decode_req_count.
@@ -30,6 +35,7 @@ class InferenceBatchDimensions:
     token_count: int = 0
     prefill_req_count: int = 0
     decode_req_count: int = 0
+    has_explicit_chunked_prefill_req: bool = False
 
     def __str__(self):
         """
@@ -49,6 +55,9 @@ def is_applicable_for_batch_dim(
         for prefill or decode requests. Otherwise, prefill slots
         can only be used for prefill requests.
         """
+        if real_batch_dim.has_explicit_chunked_prefill_req != self.has_explicit_chunked_prefill_req:
+            return False
+
         if real_batch_dim.prefill_req_count == 0:
             return (
                 self.token_count >= real_batch_dim.token_count
@@ -95,6 +104,10 @@ def is_valid(self, max_requests: int, max_sequence_length: int) -> bool:
         if self.token_count > self.prefill_req_count * max_sequence_length + self.decode_req_count:
             return False
 
+        # Check if there is an invalid chunked prefill request.
+        if self.prefill_req_count == 0 and self.has_explicit_chunked_prefill_req:
+            return False
+
         return True
 
     def __hash__(self):
@@ -102,7 +115,14 @@ def __hash__(self):
         Returns a hash of the batch dimension.
         In cuda graph quick matching, the batch dimension is used as a key in a dictionary.
         """
-        return hash((self.token_count, self.prefill_req_count, self.decode_req_count))
+        return hash(
+            (
+                self.token_count,
+                self.prefill_req_count,
+                self.decode_req_count,
+                self.has_explicit_chunked_prefill_req,
+            )
+        )
 
     def __eq__(self, other: "InferenceBatchDimensions") -> bool:
         """
@@ -110,10 +130,16 @@ def __eq__(self, other: "InferenceBatchDimensions") -> bool:
         """
         if other is None:
             return False
-        return (self.token_count, self.prefill_req_count, self.decode_req_count) == (
+        return (
+            self.token_count,
+            self.prefill_req_count,
+            self.decode_req_count,
+            self.has_explicit_chunked_prefill_req,
+        ) == (
             other.token_count,
             other.prefill_req_count,
             other.decode_req_count,
+            other.has_explicit_chunked_prefill_req,
         )
 
     @property
@@ -123,6 +149,73 @@ def req_count(self) -> int:
         """
         return self.prefill_req_count + self.decode_req_count
 
+    @staticmethod
+    def adjust_batch_dims_for_expert_parallelism(
+        local_batch_dims,
+        strict: bool,
+        decode_only_cuda_graphs: bool,
+        ep_group: Optional[torch.distributed.ProcessGroup] = None,
+    ) -> Optional["InferenceBatchDimensions"]:
+        """Adjusted cuda graph batch dimensions for expert parallelism.
+            We take the max token count across expert model parallel group.
+
+        Args:
+            local_batch_dims: The local batch dimensions to adjust.
+            strict: Whether to use strict matching for batch dimensions.
+            decode_only_cuda_graphs: Whether CUDA graphs are only used for decode steps.
+            ep_group: Optional expert parallel process group. If None, uses global parallel state.
+                      When using different EP sizes for inference vs training, pass the
+                      inference EP group explicitly.
+
+        Return:
+            (InferenceBatchDimensions) A new InferenceBatchDimensions object with
+            adjusted dimensions, or None if eager mode should be used.
+        """
+        ep_size = get_pg_size(ep_group)
+        if ep_size <= 1:
+            return local_batch_dims
+        # all reduce local work across expert model parallel group
+
+        has_explicit_chunked_prefill_req = local_batch_dims.has_explicit_chunked_prefill_req
+        is_non_decode = local_batch_dims.prefill_req_count > 0
+        sync_tensor = torch.tensor(
+            [
+                local_batch_dims.token_count,
+                int(is_non_decode),
+                int(has_explicit_chunked_prefill_req),
+            ],
+            dtype=torch.int32,
+            device=torch.cuda.current_device(),
+        )
+
+        torch.distributed.all_reduce(sync_tensor, op=torch.distributed.ReduceOp.MAX, group=ep_group)
+
+        sync_tensor = sync_tensor.cpu()
+        is_any_ep_rank_in_non_decode = sync_tensor[1].item() == 1
+        any_ep_rank_has_explicit_chunked_prefill_req = sync_tensor[2].item() == 1
+
+        # We force eager mode for scenarios where some ranks will run with CUDA graphs
+        # while others will not. Without this check, the all-to-all communication in the
+        # expert routing layer would pad up to the maximum capacity only for the ranks that
+        # are using CUDA graphs in this step, leading to a NCCL hang.
+        # This can happen in the following cases:
+        #   1. If we only allow decode CUDA graphs but some ranks are running non-decode batches
+        #   2. Some ranks are running explicit chunked prefill requests
+        #       (graphs are not recorded for batches with explicit chunked prefill requests)
+        if (
+            decode_only_cuda_graphs and is_any_ep_rank_in_non_decode
+        ) or any_ep_rank_has_explicit_chunked_prefill_req:
+            return None  # indicate no match, run in eager mode
+
+        assert not has_explicit_chunked_prefill_req
+        adjusted_batch_dim = InferenceBatchDimensions(
+            token_count=int(sync_tensor[0].item()),
+            prefill_req_count=local_batch_dims.prefill_req_count,
+            decode_req_count=local_batch_dims.decode_req_count,
+            has_explicit_chunked_prefill_req=False,
+        )
+        return adjusted_batch_dim
+
 
 class CUDAGraphBatchDimensionBuilder:
     """Builder for creating and managing CUDA graph batch dimensions.
@@ -355,6 +448,8 @@ def match_graph_config(
         real_batch_dim: InferenceBatchDimensions,
         cuda_graph_batch_dimensions_list: List[InferenceBatchDimensions],
         strict: bool = False,
+        decode_only_cuda_graphs: bool = False,
+        ep_group: Optional[torch.distributed.ProcessGroup] = None,
     ) -> Optional[InferenceBatchDimensions]:
         """
         Matches the best CUDA graph batch dimension for the given real batch dimension.
@@ -364,16 +459,39 @@ def match_graph_config(
             cuda_graph_batch_dimensions_list: List of available CUDA graph batch dimensions
             strict: If False, prefill slots can be used for prefill or decode requests.
                    If True, prefill slots can only be used for prefill requests.
-
+            decode_only_cuda_graphs: Used by expert parallel matching. If this is true,
+            and one of the EP ranks is running a non-decode step, we elect to run in
+            eager mode instead of matching a decode-only cuda graph.
+            ep_group: Optional expert parallel process group. If None, uses global parallel state.
+                      When using different EP sizes for inference vs training, pass the
+                      inference EP group explicitly.
         Returns:
             The best matching CUDA graph batch dimension, or None if no applicable match is found
         """
+
+        if not cuda_graph_batch_dimensions_list:
+            # no need to match if no cuda graph batch dimensions are provided
+            return None
+
+        adjusted_batch_dim = InferenceBatchDimensions.adjust_batch_dims_for_expert_parallelism(
+            real_batch_dim,
+            strict=strict,
+            decode_only_cuda_graphs=decode_only_cuda_graphs,
+            ep_group=ep_group,
+        )
+
+        if adjusted_batch_dim is None:
+            # we hit this scenario if decode_only_cuda_graphs is true,
+            # and one of the EP ranks is running a non-decode step
+            # in that case, all ranks have to run in eager mode
+            return None
+
         # first filter out batch dimensions with smaller token count, prefill req count,
         # or decode req count, as they are not applicable
         graph_batch_dims_applicable = [
             graph_batch_dim
             for graph_batch_dim in cuda_graph_batch_dimensions_list
-            if graph_batch_dim.is_applicable_for_batch_dim(real_batch_dim, strict=strict)
+            if graph_batch_dim.is_applicable_for_batch_dim(adjusted_batch_dim, strict=strict)
         ]
         if len(graph_batch_dims_applicable) == 0:
             return None
diff --git a/megatron/core/inference/communication/torch_symm_triton/__init__.py b/megatron/core/inference/communication/torch_symm_triton/__init__.py
new file mode 100644
index 00000000000..ca58663d9ec
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from .collectives import multimem_all_gather, multimem_reduce_scatter
+from .fused_collectives import fused_multimem_rs_add_norm_ag
diff --git a/megatron/core/inference/communication/torch_symm_triton/barrier.py b/megatron/core/inference/communication/torch_symm_triton/barrier.py
new file mode 100644
index 00000000000..d26b094828d
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/barrier.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Adapted from: https://github.com/meta-pytorch/kraken.git
+
+from unittest.mock import MagicMock
+
+from megatron.core.utils import null_decorator
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError:
+    triton = MagicMock()
+    tl = MagicMock()
+    triton.jit = null_decorator
+
+from .utils import get_flat_bid, get_flat_tid
+
+
+@triton.jit
+def _send_signal(addrs, sem: tl.constexpr):
+    tl.inline_asm_elementwise(
+        f"""
+        {{
+            .reg .u32   %tmp32_<1>;
+            .reg .pred  %p<1>;
+
+            send_signal:
+                atom.global.{sem}.sys.cas.b32 %tmp32_0, [$1], 0, 1;
+                setp.eq.u32 %p0, %tmp32_0, 0;
+                @!%p0 bra send_signal;
+        }}
+        """,
+        "=r, l",
+        [addrs],
+        dtype=addrs.dtype,
+        is_pure=False,
+        pack=1,
+    )
+
+
+@triton.jit
+def _wait_signal(addrs, sem: tl.constexpr):
+    tl.inline_asm_elementwise(
+        f"""
+        {{
+            .reg .u32   %tmp32_<1>;
+            .reg .pred  %p<1>;
+
+            wait_signal:
+                atom.global.sys.{sem}.cas.b32 %tmp32_0, [$1], 1, 0;
+                setp.eq.u32 %p0, %tmp32_0, 1;
+                @!%p0 bra wait_signal;
+        }}
+        """,
+        "=r, l",
+        [addrs],
+        dtype=tl.int32,
+        is_pure=False,
+        pack=1,
+    )
+
+
+@triton.jit
+def symm_mem_sync(
+    signal_pad_ptrs,
+    block_id,
+    rank: tl.constexpr,
+    world_size: tl.constexpr,
+    hasPreviousMemAccess: tl.constexpr = False,
+    hasSubsequentMemAccess: tl.constexpr = False,
+):
+    """
+    Synchronizes blocks with matching block_id across participating devices.
+
+    Note: the function itself is not a system level barrier/fence. It is a
+    building block for expressing different synchronization patterns.
+
+    Pattern 0: Ensures that all writes to symm_mem buffers from previous
+    kernels across all devices are visible to the current kernel:
+
+        symm_mem_sync(..., hasPreviousMemAccess=False, hasSubsequentMemAccess=True)
+
+    Pattern 1: Ensures that all writes to symm_mem buffers from the current
+    block are visible to all remote blocks with matching blockIdx:
+
+        symm_mem_sync(..., hasPreviousMemAccess=True, hasSubsequentMemAccess=True)
+
+    Pattern 2: Ensures that symm_mem buffers read by the current kernel are safe
+    for writing by subsequent kernels across all devices.
+
+        symm_mem_sync(..., hasPreviousMemAccess=True, hasSubsequentMemAccess=False)
+
+    CUDA graph friendliness:
+
+        This barrier operates through atomic operations on a zero-filled signal
+        pad, which resets to a zero-filled state after each successful
+        synchronization. This design eliminates the need for incrementing a
+        flag from host.
+    """
+    if block_id is None:
+        block_id = get_flat_bid()
+    flat_tid = get_flat_tid()
+
+    remote_ranks = tl.arange(0, world_size)
+    signal_pad_ptrs = signal_pad_ptrs.to(tl.pointer_type(tl.uint64))
+    remote_signal_pad_addrs = tl.load(signal_pad_ptrs + remote_ranks).to(tl.pointer_type(tl.uint32))
+    send_addrs = remote_signal_pad_addrs + block_id * world_size + rank
+
+    local_signal_pad_addr = tl.load(signal_pad_ptrs + rank).to(tl.pointer_type(tl.uint32))
+    wait_addrs = local_signal_pad_addr + block_id * world_size + remote_ranks
+
+    if flat_tid < world_size:
+        _send_signal(send_addrs, "release" if hasPreviousMemAccess else "relaxed")
+        _wait_signal(wait_addrs, "acquire" if hasSubsequentMemAccess else "relaxed")
diff --git a/megatron/core/inference/communication/torch_symm_triton/collectives.py b/megatron/core/inference/communication/torch_symm_triton/collectives.py
new file mode 100644
index 00000000000..4bc4dbde42b
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/collectives.py
@@ -0,0 +1,231 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from unittest.mock import MagicMock
+
+import torch
+
+from megatron.core.utils import null_decorator
+
+try:
+    import triton
+    import triton.language as tl
+
+    HAVE_TRITON = True
+except ImportError:
+    triton = MagicMock()
+    triton.jit = null_decorator
+    tl = MagicMock()
+    HAVE_TRITON = False
+try:
+    from torch._C._distributed_c10d import _SymmetricMemory
+except ImportError:
+    _SymmetricMemory = MagicMock()
+
+from .barrier import symm_mem_sync
+from .multimem_asm import ld_128, st_128
+from .utils import get_flat_tid, sync_threads
+
+
+@triton.jit
+def _multimem_all_gather_kernel(
+    local_ptr,
+    multicast_ptr,
+    signal_pad_ptrs,
+    numel,
+    BLOCK_SIZE: tl.constexpr,
+    NUMEL_PER_THREAD: tl.constexpr,
+    RANK: tl.constexpr,
+    WORLD_SIZE: tl.constexpr,
+):
+    """
+    Triton kernel to perform multicast all-gather over nvlink using multimem instructions.
+    """
+    # an all-gather is simply a multicast store operation
+    # we only need a barrier at the end to ensure visibility of writes
+
+    pid = tl.program_id(axis=0)
+    tid = get_flat_tid()
+
+    # From this point on, we pretend each element is 128-bit
+    numel = numel // NUMEL_PER_THREAD
+    numel_per_rank = tl.cdiv(numel, WORLD_SIZE)
+    block_start = pid * BLOCK_SIZE
+
+    while block_start < numel_per_rank:
+        offsets = block_start + tid
+        mask = offsets < numel_per_rank
+
+        # Each pointer points to a 128-bit bit pack
+        # RANK * numel_per_rank -> brings us to the start of our rank's segment
+        # offsets -> brings us to the right offset within our rank's segment
+        multicast_ptrs = (
+            multicast_ptr.to(tl.pointer_type(tl.uint64)) + (RANK * numel_per_rank + offsets) * 2
+        )
+        local_ptrs = local_ptr.to(tl.pointer_type(tl.uint64)) + offsets * 2
+        (x, y, z, w) = ld_128(local_ptrs, mask=mask, multicast_op=False)
+        st_128(multicast_ptrs, x, y, z, w, mask=mask, multicast_op=True)
+
+        block_start += tl.num_programs(axis=0) * BLOCK_SIZE
+
+    sync_threads()
+    symm_mem_sync(
+        signal_pad_ptrs,
+        None,
+        RANK,
+        WORLD_SIZE,
+        hasPreviousMemAccess=True,
+        hasSubsequentMemAccess=True,
+    )
+
+
+def multimem_all_gather(
+    output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    symm_mem_hdl: _SymmetricMemory,
+    **kwargs,
+) -> torch.Tensor:
+    """
+    Calls a multicast all-gather triton kernel on the given tensor.
+    Output tensor must be a symmetric memory buffer.
+    Input tensor can be a regular torch tensor
+    Arguments:
+        output_tensor: torch.Tensor - output tensor to be all-gathered into
+        input_tensor: torch.Tensor - input tensor to be all-gathered from
+        symm_mem_hdl: _SymmetricMemory - handle to the symmetric memory buffer for output_tensor
+    Returns:
+        torch.Tensor - all-gathered tensor, which is output_tensor
+    """
+    assert HAVE_TRITON, "Triton is required for multimem all-gather."
+
+    config = {
+        "max_num_blocks": kwargs.get("max_num_blocks", 24),
+        "num_warps": kwargs.get("num_warps", 32),
+        "BLOCK_SIZE": kwargs.get("BLOCK_SIZE", 1024),
+    }
+    assert input_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    assert output_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    numel_per_thread = 128 // (input_tensor.element_size() * 8)
+
+    assert (
+        output_tensor.numel() % numel_per_thread == 0
+    ), "The number of elements must be 128-bit aligned."
+
+    num_threads = triton.cdiv(output_tensor.numel() // numel_per_thread, symm_mem_hdl.world_size)
+    num_blocks = min(triton.cdiv(num_threads, config["BLOCK_SIZE"]), config["max_num_blocks"])
+
+    _multimem_all_gather_kernel[(num_blocks, 1, 1)](
+        input_tensor.data_ptr(),
+        symm_mem_hdl.multicast_ptr,
+        symm_mem_hdl.signal_pad_ptrs_dev,
+        numel=output_tensor.numel(),
+        BLOCK_SIZE=config["BLOCK_SIZE"],
+        NUMEL_PER_THREAD=numel_per_thread,
+        RANK=symm_mem_hdl.rank,
+        WORLD_SIZE=symm_mem_hdl.world_size,
+        num_warps=config["num_warps"],
+    )
+
+    return output_tensor
+
+
+@triton.jit
+def _multimem_reduce_scatter_kernel(
+    local_ptr,
+    multicast_ptr,
+    signal_pad_ptrs,
+    numel,
+    BLOCK_SIZE: tl.constexpr,
+    NUMEL_PER_THREAD: tl.constexpr,
+    RANK: tl.constexpr,
+    WORLD_SIZE: tl.constexpr,
+):
+    """
+    Triton kernel to perform multicast reduce-scatter over nvlink using multimem instructions.
+    """
+    symm_mem_sync(
+        signal_pad_ptrs,
+        None,
+        RANK,
+        WORLD_SIZE,
+        hasPreviousMemAccess=False,
+        hasSubsequentMemAccess=False,
+    )
+    sync_threads()
+
+    pid = tl.program_id(axis=0)
+    tid = get_flat_tid()
+
+    # From this point on, we pretend each element is 128-bit
+    numel = numel // NUMEL_PER_THREAD
+    numel_per_rank = tl.cdiv(numel, WORLD_SIZE)
+    block_start = pid * BLOCK_SIZE
+
+    while block_start < numel_per_rank:
+        offsets = block_start + tid
+        mask = offsets < numel_per_rank
+
+        # Each pointer points to a 128-bit bit pack
+        multicast_ptrs = (
+            multicast_ptr.to(tl.pointer_type(tl.uint64)) + (RANK * numel_per_rank + offsets) * 2
+        )
+        local_ptrs = local_ptr.to(tl.pointer_type(tl.uint64)) + offsets * 2
+        (x, y, z, w) = ld_128(multicast_ptrs, mask=mask, multicast_op=True)
+        st_128(local_ptrs, x, y, z, w, mask=mask, multicast_op=False)
+
+        block_start += tl.num_programs(axis=0) * BLOCK_SIZE
+
+
+def multimem_reduce_scatter(
+    output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    symm_mem_hdl: _SymmetricMemory,
+    **kwargs,
+) -> torch.Tensor:
+    """
+    Calls a multicast reduce-scatter triton kernel on the given tensor.
+    Input tensor must be a symmetric memory buffer.
+    Output tensor can be a regular torch tensor
+    Arguments:
+        output_tensor: torch.Tensor - output tensor to be reduce-scattered into
+        input_tensor: torch.Tensor - input tensor to be reduce-scattered from
+        symm_mem_hdl: _SymmetricMemory - handle to the symmetric memory buffer for input_tensor
+        **kwargs: Additional keyword arguments for kernel configuration:
+            max_num_blocks (int, optional): The maximum number of blocks to launch.
+            num_warps (int, optional): The number of warps per block.
+            BLOCK_SIZE (int, optional): The BLOCK_SIZE parameter for the kernel.
+    Returns:
+        torch.Tensor - reduce-scattered tensor, which is output_tensor
+    """
+
+    assert HAVE_TRITON, "Triton is required for multimem reduce-scatter."
+
+    config = {
+        "max_num_blocks": kwargs.get("max_num_blocks", 24),
+        "num_warps": kwargs.get("num_warps", 32),
+        "BLOCK_SIZE": kwargs.get("BLOCK_SIZE", 1024),
+    }
+
+    assert input_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    assert output_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    numel_per_thread = 128 // (output_tensor.element_size() * 8)
+
+    assert (
+        input_tensor.numel() % numel_per_thread == 0
+    ), "The number of elements must be 128-bit aligned."
+
+    num_threads = triton.cdiv(input_tensor.numel() // numel_per_thread, symm_mem_hdl.world_size)
+    num_blocks = min(triton.cdiv(num_threads, config["BLOCK_SIZE"]), config["max_num_blocks"])
+
+    _multimem_reduce_scatter_kernel[(num_blocks, 1, 1)](
+        output_tensor.data_ptr(),
+        symm_mem_hdl.multicast_ptr,
+        symm_mem_hdl.signal_pad_ptrs_dev,
+        numel=input_tensor.numel(),
+        BLOCK_SIZE=config["BLOCK_SIZE"],
+        NUMEL_PER_THREAD=numel_per_thread,
+        RANK=symm_mem_hdl.rank,
+        WORLD_SIZE=symm_mem_hdl.world_size,
+        num_warps=config["num_warps"],
+    )
+
+    return output_tensor
diff --git a/megatron/core/inference/communication/torch_symm_triton/fused_collectives.py b/megatron/core/inference/communication/torch_symm_triton/fused_collectives.py
new file mode 100644
index 00000000000..875a8ff8d96
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/fused_collectives.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import torch
+
+from .barrier import symm_mem_sync
+from .multimem_asm import add_v8_bf16_from_u32, asm_rsqrt, ld_128, st_128
+from .utils import sync_threads
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError:
+    from unittest.mock import MagicMock
+
+    from megatron.core.utils import null_decorator
+
+    triton = MagicMock()
+    tl = MagicMock()
+    triton.jit = null_decorator
+
+
+@triton.jit
+def unpack_bf16x2(x, mask):
+    """
+    Unpack x, which is in bf16x2 packed format stored in uint32,
+    into two float32 tensors representing the high and low bf16 values.
+
+    Args:
+        x: tl.uint32 tensor containing packed bf16x2 values.
+        mask: boolean mask tensor, 1 denotes that x is valid.
+    Returns:
+        x_hi: float32 tensor containing the high bf16 values.
+        x_lo: float32 tensor containing the low bf16 values.
+    """
+    x = x * mask
+    x_hi = (x >> 16).cast(tl.uint16).cast(tl.bfloat16, bitcast=True).cast(tl.float32)
+    x_lo = x.cast(tl.uint16).cast(tl.bfloat16, bitcast=True).cast(tl.float32)
+    return x_hi, x_lo
+
+
+@triton.jit
+def sum_sq(x, y, z, w, mask):
+    """
+    First computes the squared sum of 8 bf16 values
+    packed in x, y, z, w. Then does an SM-wide
+    reduction to get the total sqaured sum.
+    Args:
+        x, y, z, w: tl.uint32 tensors containing packed bf16x2 values.
+        mask: boolean mask tensor, 1 denotes that x,y,z,w are valid.
+    Returns:
+        sq_sum: float32 scalar, the total squared sum.
+    """
+    x_hi, x_lo = unpack_bf16x2(x, mask)
+    y_hi, y_lo = unpack_bf16x2(y, mask)
+    z_hi, z_lo = unpack_bf16x2(z, mask)
+    w_hi, w_lo = unpack_bf16x2(w, mask)
+    # thread local sum
+    sq_sum = (
+        x_hi * x_hi
+        + x_lo * x_lo
+        + y_hi * y_hi
+        + y_lo * y_lo
+        + z_hi * z_hi
+        + z_lo * z_lo
+        + w_hi * w_hi
+        + w_lo * w_lo
+    )
+    # sm-wide reduction
+    sq_sum = tl.sum(sq_sum)
+    return sq_sum
+
+
+@triton.jit
+def apply_norm(x, y, z, w, wx, wy, wz, ww, rrms, mask):
+    """
+    Apply RMS norm to the input bf16x2 tensors x,y,z,w using
+    the rms norm weights wx,wy,wz,ww and the reciprocal
+    root mean square rrms.
+    """
+    # todo: try converting to pure ASM code
+    x_hi, x_lo = unpack_bf16x2(x, mask)
+    y_hi, y_lo = unpack_bf16x2(y, mask)
+    z_hi, z_lo = unpack_bf16x2(z, mask)
+    w_hi, w_lo = unpack_bf16x2(w, mask)
+    wx_hi, wx_lo = unpack_bf16x2(wx, mask)
+    wy_hi, wy_lo = unpack_bf16x2(wy, mask)
+    wz_hi, wz_lo = unpack_bf16x2(wz, mask)
+    ww_hi, ww_lo = unpack_bf16x2(ww, mask)
+
+    x_hi = (x_hi * rrms * wx_hi).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(
+        tl.uint32
+    ) << 16
+    x_lo = (x_lo * rrms * wx_lo).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(tl.uint32)
+    y_hi = (y_hi * rrms * wy_hi).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(
+        tl.uint32
+    ) << 16
+    y_lo = (y_lo * rrms * wy_lo).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(tl.uint32)
+    z_hi = (z_hi * rrms * wz_hi).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(
+        tl.uint32
+    ) << 16
+    z_lo = (z_lo * rrms * wz_lo).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(tl.uint32)
+    w_hi = (w_hi * rrms * ww_hi).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(
+        tl.uint32
+    ) << 16
+    w_lo = (w_lo * rrms * ww_lo).cast(tl.bfloat16).cast(tl.uint16, bitcast=True).cast(tl.uint32)
+    # pack back to bf16x2, to be used by nvls multicast store.
+    x = x_hi | x_lo
+    y = y_hi | y_lo
+    z = z_hi | z_lo
+    w = w_hi | w_lo
+    return x, y, z, w
+
+
+@triton.jit
+def _multimem_reduce_scatter_residual_add_kernel(
+    residual_output_ptr,
+    residual_input_ptr,
+    rms_norm_weights_ptr,
+    multicast_ptr,  # points to symmetric memory buffer
+    signal_pad_ptrs,
+    num_tokens,
+    eps,
+    HIDDEN_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    NUMEL_PER_THREAD: tl.constexpr,
+    RANK: tl.constexpr,
+    WORLD_SIZE: tl.constexpr,
+):
+    symm_mem_sync(
+        signal_pad_ptrs,
+        None,
+        RANK,
+        WORLD_SIZE,
+        hasPreviousMemAccess=False,
+        hasSubsequentMemAccess=False,
+    )
+    sync_threads()
+
+    pid = tl.program_id(axis=0)
+    tid = tl.arange(0, BLOCK_SIZE)
+
+    tokens_per_rank = tl.cdiv(num_tokens, WORLD_SIZE)
+    numel_per_token = tl.cdiv(HIDDEN_SIZE, NUMEL_PER_THREAD)
+    numel_per_rank = tokens_per_rank * numel_per_token
+
+    # each program handles 1 token at a time
+    program_offset = pid * numel_per_token
+    thread_mask = tid < numel_per_token
+
+    for token_offset in range(pid, tokens_per_rank, tl.num_programs(axis=0)):
+        # Step 1: - reduce-scatter + residual add for this token + collect sq sum
+        program_offset = token_offset * numel_per_token
+        sq_sum_ = 0.0
+        for thread_offset in range(0, numel_per_token, BLOCK_SIZE):
+            offsets = program_offset + thread_offset + tid
+            mask = (offsets < numel_per_rank) & (thread_mask)
+            multicast_ptrs = (
+                multicast_ptr.to(tl.pointer_type(tl.uint64)) + (RANK * numel_per_rank + offsets) * 2
+            )
+            res_out_ptrs = residual_output_ptr.to(tl.pointer_type(tl.uint64)) + offsets * 2
+            res_in_ptrs = residual_input_ptr.to(tl.pointer_type(tl.uint64)) + offsets * 2
+            # reduce-scatter
+            (x, y, z, w) = ld_128(multicast_ptrs, mask=mask, multicast_op=True)
+            # load residual
+            (rx, ry, rz, rw) = ld_128(res_in_ptrs, mask=mask, multicast_op=False)
+            # add residual
+            (x, y, z, w) = add_v8_bf16_from_u32(x, y, z, w, rx, ry, rz, rw)
+            # store residual
+            st_128(res_out_ptrs, x, y, z, w, mask=mask, multicast_op=False)
+            # update squared sum for computing the norm later
+            sq_sum_ += sum_sq(x, y, z, w, mask=mask)
+
+        # sum_sq is now the sum of squares for this token
+        # it is a SM-wide reduction, so no need to sync_threads()
+        mean_sq = sq_sum_ / HIDDEN_SIZE
+        rrms = asm_rsqrt(mean_sq, eps)
+
+        # Step 2 - apply-rms-norm + all-gather
+        for thread_offset in range(0, numel_per_token, BLOCK_SIZE):
+            offsets = program_offset + thread_offset + tid
+            # first offset is a token offset
+            # second offset is a hidden-dim offset (in units of 128-bit)
+            mask = (offsets < numel_per_rank) & (thread_mask)
+
+            multicast_ptrs = (
+                multicast_ptr.to(tl.pointer_type(tl.uint64)) + (RANK * numel_per_rank + offsets) * 2
+            )
+            res_out_ptrs = residual_output_ptr.to(tl.pointer_type(tl.uint64)) + offsets * 2
+
+            rms_norm_weights_ptrs = (
+                rms_norm_weights_ptr.to(tl.pointer_type(tl.uint64)) + (thread_offset + tid) * 2
+            )
+
+            (rx, ry, rz, rw) = ld_128(res_out_ptrs, mask=mask, multicast_op=False)
+            (wx, wy, wz, ww) = ld_128(rms_norm_weights_ptrs, mask=mask, multicast_op=False)
+            (nx, ny, nz, nw) = apply_norm(rx, ry, rz, rw, wx, wy, wz, ww, rrms, mask)
+            st_128(multicast_ptrs, nx, ny, nz, nw, mask=mask, multicast_op=True)
+
+    sync_threads()
+    symm_mem_sync(
+        signal_pad_ptrs,
+        None,
+        RANK,
+        WORLD_SIZE,
+        hasPreviousMemAccess=True,
+        hasSubsequentMemAccess=True,
+    )
+
+
+def fused_multimem_rs_add_norm_ag(
+    residual_output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    symm_mem_hdl,
+    residual_input_tensor: torch.Tensor,
+    rms_norm_weights: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    """
+    Calls a multicast reduce-scatter + residual add + rms norm + all-gather
+    triton kernel. Writes out the output of the residual add to residual_output_tensor.
+    The output of the full kernel is written in-place to the symmetric memory buffer.
+    input_tensor must be a symmetric memory buffer.
+    Args:
+        residual_output_tensor: torch.Tensor to write the output of the residual add.
+        input_tensor: torch.Tensor, symmetric memory buffer to read the input from.
+        symm_mem_hdl: _SymmetricMemory handle for the symmetric memory buffer.
+        residual_input_tensor: torch.Tensor, the residual input to be added.
+        rms_norm_weights: torch.Tensor, the weights for rms norm.
+        eps: float, epsilon value for rms norm.
+    Returns:
+        residual_output_tensor: torch.Tensor, the output of the full fused operation.
+    """
+    WARP_SIZE = 32
+    MAX_NUM_BLOCKS = 128
+    MAX_BLOCK_SIZE = 1024
+    BYTES_PER_THREAD = 16
+
+    assert input_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    assert residual_output_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+    assert residual_input_tensor.dtype == torch.bfloat16, "Only bfloat16 is supported for now."
+
+    # this evaluates to 128 for bf16.
+    # each thread will process 128 bits (8 bf16 values) at a time.
+    numel_per_thread = BYTES_PER_THREAD // residual_input_tensor.element_size()
+
+    assert (
+        input_tensor.numel() % numel_per_thread == 0
+    ), "The number of elements must be 128-bit aligned."
+
+    num_threads = triton.cdiv(input_tensor.numel() // numel_per_thread, symm_mem_hdl.world_size)
+
+    if num_threads < MAX_BLOCK_SIZE:
+        block_size = 1
+        while block_size < num_threads:
+            block_size *= 2
+        num_warps = block_size // WARP_SIZE
+        num_blocks = 1
+    else:
+        block_size = MAX_BLOCK_SIZE
+        num_warps = MAX_BLOCK_SIZE // WARP_SIZE
+        num_blocks = min(triton.cdiv(num_threads, MAX_BLOCK_SIZE), MAX_NUM_BLOCKS)
+
+    hsize = input_tensor.size(-1)
+    _multimem_reduce_scatter_residual_add_kernel[(num_blocks, 1, 1)](
+        residual_output_tensor.data_ptr(),
+        residual_input_tensor.data_ptr(),
+        rms_norm_weights.data_ptr(),
+        symm_mem_hdl.multicast_ptr,
+        symm_mem_hdl.signal_pad_ptrs_dev,
+        input_tensor.numel() // hsize,
+        eps=eps,
+        HIDDEN_SIZE=hsize,
+        BLOCK_SIZE=block_size,
+        NUMEL_PER_THREAD=numel_per_thread,
+        RANK=symm_mem_hdl.rank,
+        WORLD_SIZE=symm_mem_hdl.world_size,
+        num_warps=num_warps,
+    )
+
+    return residual_output_tensor
diff --git a/megatron/core/inference/communication/torch_symm_triton/multimem_asm.py b/megatron/core/inference/communication/torch_symm_triton/multimem_asm.py
new file mode 100644
index 00000000000..774c3f6d2bf
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/multimem_asm.py
@@ -0,0 +1,213 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Adapted from https://github.com/yifuwang/symm-mem-recipes.git
+
+from unittest.mock import MagicMock
+
+from megatron.core.utils import null_decorator
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError:
+    triton = MagicMock()
+    tl = MagicMock()
+    triton.jit = null_decorator
+
+
+@triton.jit
+def ld_128(ptr, mask, multicast_op: tl.constexpr):
+    """
+    Loads 128 bits (8 x bf16) from memory into registers.
+
+    This function abstracts two distinct hardware behaviors based on `multicast_op`:
+
+    1.  **Standard Load (`multicast_op=False`)**:
+        -   **Semantics:** Local Global Memory Load.
+        -   **Action:** Reads 128 bits from `ptr` in global memory into the local register file.
+        -   **Use Case:** Standard tensor processing.
+
+    2.  **Multicast Reduce-Load (`multicast_op=True`)**:
+        -   **Semantics:** "Pull" Reduction over NVLink.
+        -   **Action:** Simultaneously reads 128 bits from the *same* address across all peer GPUs
+            in the multicast group, sums them (add reduction), and loads the result into the
+            local register file.
+        -   **Hardware:** Uses `multimem.ld_reduce` (Hopper+).
+        -   **Use Case:** The "Reduce" step in collective operations.
+
+    Args:
+        ptr: Memory pointer to the source buffer.
+        mask: Boolean predicate. If False, the operation is skipped (no-op).
+        multicast_op (tl.constexpr): Toggles between standard load (False)
+        and multicast-reduce (True).
+
+    Returns:
+        Four 32-bit registers (tl.uint32), representing 128 bits of loaded data.
+        Note: When interpreting as bf16, this equates to 8 values (2 per register).
+    """
+    # PTX Assembly Logic:
+    # 1. @$5: Predication. Only execute if argument 5 (mask) is True (1).
+    # 2. Opcode Selection:
+    #    - 'multimem.ld_reduce...add.v4.bf16x2': Hardware-accelerated reduction across peers.
+    #    - 'ld.global...v4.u32': Standard 128-bit memory read.
+    # 3. Operands:
+    #    - {$0, $1, $2, $3}: Destination registers (Output).
+    #    - [$4]: Source memory address (Input).
+    if multicast_op:
+        return tl.inline_asm_elementwise(
+            """
+            {
+                .reg .pred %p0;
+                setp.ne.s32 %p0, $5, 1;
+                @%p0 bra end;
+                multimem.ld_reduce.relaxed.sys.global.add.acc::f32.v4.bf16x2 {$0, $1, $2, $3}, [$4];
+                end:
+            }
+            """,
+            "=r,=r,=r,=r,l,r",
+            args=[ptr, mask.to(tl.int32)],
+            dtype=(tl.uint32, tl.uint32, tl.uint32, tl.uint32),
+            is_pure=True,
+            pack=1,
+        )
+    else:
+        return tl.inline_asm_elementwise(
+            """
+        {
+            .reg .pred %p0;
+            setp.ne.s32 %p0, $5, 1;
+            @%p0 bra end;
+            ld.global.v4.u32 {$0, $1, $2, $3}, [$4];
+            end:
+        }
+        """,
+            "=r,=r,=r,=r,l,r",
+            args=[ptr, mask.to(tl.int32)],
+            dtype=(tl.uint32, tl.uint32, tl.uint32, tl.uint32),
+            is_pure=True,
+            pack=1,
+        )
+
+
+@triton.jit
+def st_128(ptr, x, y, z, w, mask, multicast_op):
+    """
+    Stores 128 bits (8 x bf16) from registers to memory.
+
+    This function abstracts two distinct hardware behaviors based on `multicast_op`:
+
+    1.  **Standard Store (`multicast_op=False`)**:
+        -   **Semantics:** Local Global Memory Store.
+        -   **Action:** Writes 128 bits from local registers to `ptr` in global memory.
+
+    2.  **Multicast Store (`multicast_op=True`)**:
+        -   **Semantics:** "Push" Broadcast over NVLink.
+        -   **Action:** Writes 128 bits from local registers to the `ptr` address in
+            the global memory of **all** peer GPUs in the multicast group simultaneously.
+        -   **Hardware:** Uses `multimem.st` (Hopper+).
+        -   **Use Case:** The "Broadcast" or "All-Gather" step in collective operations.
+
+    Args:
+        ptr: Memory pointer to the destination buffer.
+        x, y, z, w: Four 32-bit registers containing the data to store.
+        mask: Boolean predicate. If False, the store is skipped.
+        multicast_op (tl.constexpr): Toggles between standard store (False)
+        and multicast broadcast (True).
+    """
+    # PTX Assembly Logic:
+    # 1. @$6: Predication. Only execute if argument 6 (mask) is True.
+    # 2. Opcode Selection:
+    #    - 'multimem.st...v4.f32': Broadcasts data to all peers.
+    #      (Note: .f32 type used for bit-movement, equivalent to .u32 for storage).
+    #    - 'st.global...v4.u32': Standard 128-bit memory write.
+    # 3. Operands:
+    #    - [$1]: Destination memory address.
+    #    - {$2, $3, $4, $5}: Source registers containing data.
+    if multicast_op:
+        return tl.inline_asm_elementwise(
+            """
+            {
+                .reg .pred %p0;
+                setp.ne.s32 %p0, $6, 1;
+                @%p0 bra end;
+                multimem.st.relaxed.sys.global.v4.f32 [$1], {$2, $3, $4, $5};
+                end:
+            }
+            """,
+            "=r,l,r,r,r,r,r",
+            args=[ptr, x, y, z, w, mask.to(tl.int32)],
+            dtype=(tl.uint32),
+            is_pure=False,
+            pack=1,
+        )
+    else:
+        return tl.inline_asm_elementwise(
+            """
+        {
+            .reg .pred %p0;
+            setp.ne.s32 %p0, $6, 1;
+            @%p0 bra end;
+            st.global.v4.f32 [$1], {$2, $3, $4, $5};
+            end:
+        }
+        """,
+            "=r,l,r,r,r,r,r",
+            args=[ptr, x, y, z, w, mask.to(tl.int32)],
+            dtype=(tl.uint32),
+            is_pure=False,
+            pack=1,
+        )
+
+
+@triton.jit
+def add_v8_bf16_from_u32(
+    a0,
+    a1,
+    a2,
+    a3,  # First vector of 8 bf16s, packed in 4 uint32s
+    b0,
+    b1,
+    b2,
+    b3,  # Second vector of 8 bf16s, packed in 4 uint32s
+):
+    """
+    Adds two vectors of 8 bfloat16 numbers.
+    Each vector is passed as four tl.uint32 tensors.
+    Returns the result as a tuple of four tl.uint32 tensors.
+    """
+    return tl.inline_asm_elementwise(
+        """
+        {
+            add.bf16x2 $0, $4, $8;
+            add.bf16x2 $1, $5, $9;
+            add.bf16x2 $2, $6, $10;
+            add.bf16x2 $3, $7, $11;
+        }
+        """,
+        # 8 outputs (=r), 8 inputs (r)
+        "=r,=r,=r,=r,r,r,r,r,r,r,r,r",
+        args=[a0, a1, a2, a3, b0, b1, b2, b3],
+        dtype=(tl.uint32, tl.uint32, tl.uint32, tl.uint32),
+        is_pure=True,
+        pack=1,
+    )
+
+
+@triton.jit
+def asm_rsqrt(x, eps):
+    """
+    Computes the reciprocal square root of a float32 number using inline assembly.
+    """
+    return tl.inline_asm_elementwise(
+        """
+        {
+            add.f32 $1, $1, $2;
+            rsqrt.approx.f32 $0, $1;
+        }
+        """,
+        "=f, f, f",
+        args=[x, eps],
+        dtype=(tl.float32),
+        is_pure=True,
+        pack=1,
+    )
diff --git a/megatron/core/inference/communication/torch_symm_triton/utils.py b/megatron/core/inference/communication/torch_symm_triton/utils.py
new file mode 100644
index 00000000000..785481dfba6
--- /dev/null
+++ b/megatron/core/inference/communication/torch_symm_triton/utils.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# Adapted from: https://github.com/meta-pytorch/kraken.git
+
+from unittest.mock import MagicMock
+
+from megatron.core.utils import null_decorator
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError:
+    triton = MagicMock()
+    tl = MagicMock()
+    triton.jit = null_decorator
+
+
+@triton.jit
+def get_tid():
+    """
+    Returns the thread IDs in x, y, z dimensions.
+    """
+    return tl.inline_asm_elementwise(
+        """
+        mov.u32 $0, %tid.x;
+        mov.u32 $1, %tid.y;
+        mov.u32 $2, %tid.z;
+        """,
+        "=r,=r,=r",
+        [],
+        dtype=(tl.uint32, tl.uint32, tl.uint32),
+        is_pure=True,
+        pack=1,
+    )
+
+
+@triton.jit
+def get_ntid():
+    """
+    Returns the number of threads in x, y, z dimensions.
+    """
+    return tl.inline_asm_elementwise(
+        """
+        mov.u32 $0, %ntid.x;
+        mov.u32 $1, %ntid.y;
+        mov.u32 $2, %ntid.z;
+        """,
+        "=r,=r,=r",
+        [],
+        dtype=(tl.uint32, tl.uint32, tl.uint32),
+        is_pure=True,
+        pack=1,
+    )
+
+
+@triton.jit
+def get_flat_tid():
+    """
+    Calculates a unique, one-dimensional ID for each thread within its thread block.
+    """
+    tid_x, tid_y, tid_z = get_tid()
+    ntid_x, ntid_y, _ = get_ntid()
+    return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x
+
+
+@triton.jit
+def get_flat_bid():
+    """
+    Calculates a unique, one-dimensional ID for each block within the grid."""
+    return (
+        tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0)
+        + tl.program_id(1) * tl.num_programs(0)
+        + tl.program_id(0)
+    )
+
+
+@triton.jit
+def sync_threads():
+    """
+    Synchronize all threads within a block.
+    """
+    tl.inline_asm_elementwise("bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1)
diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py
index ecb0296559f..6cf45aeb9e1 100644
--- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py
+++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py
@@ -5,10 +5,18 @@
 
 import torch
 
+from megatron.core.inference.batch_dimensions_utils import InferenceBatchDimensions
+
 
 @dataclass
 class MambaInferenceStateConfig:
-    """Config for initializing Mamba model inference state tensors."""
+    """
+    Config for initializing Mamba model inference state tensors.
+
+    Note that we maintain separate metadata for decode, regular prefill, and
+    chunked prefill requests because the Mamba kernels do not yet support mixing
+    these. Once the kernels have been updated we can simplify this code.
+    """
 
     layer_type_list: List[str]
     """
@@ -26,7 +34,7 @@ class MambaInferenceStateConfig:
 class MambaMetadata:
     """Manages the metadata tensors required for Mamba layers during inference."""
 
-    def __init__(self, max_requests: int):
+    def __init__(self, max_requests: int, max_tokens: int):
         """
         Initializes the Mamba slot allocator.
 
@@ -34,15 +42,50 @@ def __init__(self, max_requests: int):
             max_requests (int): The maximum number of concurrent requests.
         """
         self.max_requests = max_requests
+        self.max_tokens = max_tokens
+        self.device = torch.cuda.current_device()
 
-        # Metadata for mapping requests to slots in the static Mamba state buffer
+        # Map from requests to slots in the static Mamba state buffer
         self.request_to_mamba_state_idx = torch.full(
             (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
         )
 
-        # Separate mapping used only for CUDA graph compatibility
-        self.request_to_mamba_state_idx_cudagraph_only = torch.full(
-            (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
+        # Map from requests to slots in the static Mamba state buffer for active decode requests
+        self._batch_indices_decode_buffer = torch.full(
+            (self.max_requests,), -1, dtype=torch.int32, device=self.device
+        )
+
+        # Map from requests to slots in the static Mamba state buffer for active prefill requests
+        self._batch_indices_prefill_buffer = torch.full(
+            (self.max_requests,), -1, dtype=torch.int32, device=self.device
+        )
+
+        # Map from the active chunked prefill request to its slot in the static Mamba state buffer
+        self._batch_indices_chunked_prefill_buffer = torch.full(
+            (1,), -1, dtype=torch.int32, device=self.device
+        )
+
+        # Map from token id to request id for active prefill requests
+        self._seq_idx_buffer = torch.full(
+            (1, self.max_tokens), -1, dtype=torch.int32, device=self.device
+        )
+
+        # Cumulative sequence lengths for active prefill requests
+        self._cu_seqlens_buffer = torch.zeros(
+            (self.max_requests + 1,), dtype=torch.int32, device=self.device
+        )
+
+        # Tuple of (active decode request count, active prefill request count)
+        self._device_decode_prefill_buffer = torch.zeros(
+            (2,), dtype=torch.int32, device=self.device
+        )
+
+        # Tuple of (
+        #   total prefill sequence length excluding chunked prefill,
+        #   chunked prefill sequence length
+        # )
+        self._device_chunked_prefill_buffer = torch.zeros(
+            (2,), dtype=torch.int32, device=self.device
         )
 
         # Allocator for Mamba state slots
@@ -56,7 +99,8 @@ def reset(self) -> None:
         Resets all Mamba states and frees all allocated slots.
         """
         self.request_to_mamba_state_idx.fill_(-1)
-        self.request_to_mamba_state_idx_cudagraph_only.fill_(-1)
+
+        self.reset_varlen_metadata()
 
         # Re-initialize the free slot pool
         self.mamba_state_free_slots = torch.arange(
@@ -64,14 +108,23 @@ def reset(self) -> None:
         )
         self.mamba_state_free_slot_count = self.max_requests
 
-    def reset_cudagraph_mapping(self) -> None:
-        """
-        Resets only the CUDA graph mapping tensor.
-        """
-        self.request_to_mamba_state_idx_cudagraph_only.fill_(-1)
+    def reset_varlen_metadata(self) -> None:
+        """Resets varlen metadata."""
+        self.batch_indices_decode = None
+        self.batch_indices_prefill = None
+        self.batch_indices_chunked_prefill = None
+        self.cu_seqlens = None
+        self.seq_idx = None
+        self.device_decode_prefill = None
+        self.device_chunked_prefill = None
 
-    def update_cudagraph_mapping(
-        self, active_mamba_indices: torch.Tensor, num_active_requests: int
+    def update(
+        self,
+        active_mamba_indices: torch.Tensor,
+        token_to_request_idx: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        batch_dimensions: InferenceBatchDimensions,
+        padded_batch_dimensions: InferenceBatchDimensions,
     ) -> None:
         """
         Updates the dedicated CUDA graph mapping tensor with the indices
@@ -82,7 +135,104 @@ def update_cudagraph_mapping(
                                            for active requests.
             num_active_requests (int): The number of active requests.
         """
-        self.request_to_mamba_state_idx_cudagraph_only[0:num_active_requests] = active_mamba_indices
+        real_decode_count = batch_dimensions.decode_req_count
+        real_prefill_count = batch_dimensions.prefill_req_count
+        real_token_count = batch_dimensions.token_count
+        has_explicit_chunked_prefill_req = batch_dimensions.has_explicit_chunked_prefill_req
+
+        padded_decode_count = padded_batch_dimensions.decode_req_count
+        padded_prefill_count = padded_batch_dimensions.prefill_req_count
+        padded_token_count = padded_batch_dimensions.token_count
+        assert (
+            has_explicit_chunked_prefill_req
+            == padded_batch_dimensions.has_explicit_chunked_prefill_req
+        )
+
+        if padded_decode_count > 0:
+            # Update decode indices
+            self._batch_indices_decode_buffer[:real_decode_count].copy_(
+                active_mamba_indices[:real_decode_count]
+            )
+            if padded_decode_count > real_decode_count:
+                self._batch_indices_decode_buffer[real_decode_count:padded_decode_count] = -1
+            self.batch_indices_decode = self._batch_indices_decode_buffer[:padded_decode_count]
+
+        # Determine if we have a chunked prefill request and adjust counts for regular prefill
+        regular_prefill_count = real_prefill_count
+        if has_explicit_chunked_prefill_req:
+            # The last prefill request is the chunked one
+            regular_prefill_count -= 1
+            chunked_req_idx = real_decode_count + regular_prefill_count
+
+            # Update chunked prefill indices
+            self._batch_indices_chunked_prefill_buffer[0] = active_mamba_indices[chunked_req_idx]
+            self.batch_indices_chunked_prefill = self._batch_indices_chunked_prefill_buffer
+        else:
+            self.batch_indices_chunked_prefill = None
+
+        if padded_prefill_count > 0:
+            # Update prefill indices (excluding chunked prefill from regular prefill buffer)
+            if regular_prefill_count > 0:
+                self._batch_indices_prefill_buffer[:regular_prefill_count].copy_(
+                    active_mamba_indices[
+                        real_decode_count : real_decode_count + regular_prefill_count
+                    ]
+                )
+
+            if padded_prefill_count > regular_prefill_count:
+                self._batch_indices_prefill_buffer[regular_prefill_count:padded_prefill_count] = -1
+
+            self.batch_indices_prefill = self._batch_indices_prefill_buffer[:padded_prefill_count]
+
+            # Update seq_idx
+            end_regular_prefill_token_idx = cu_seqlens[real_decode_count + regular_prefill_count]
+
+            # The length of tokens belonging to regular prefill requests (excluding decode tokens)
+            seq_len = end_regular_prefill_token_idx - real_decode_count
+
+            if seq_len > 0:
+                self._seq_idx_buffer[:, :seq_len].copy_(
+                    token_to_request_idx[real_decode_count:end_regular_prefill_token_idx]
+                    - real_decode_count
+                )
+
+            if padded_token_count > seq_len:
+                self._seq_idx_buffer[:, seq_len:padded_token_count] = -1
+            self.seq_idx = self._seq_idx_buffer[:, :padded_token_count]
+
+            # Update cu_seqlens
+            self._cu_seqlens_buffer[0] = 0
+            if regular_prefill_count > 0:
+                self._cu_seqlens_buffer[1 : regular_prefill_count + 1].copy_(
+                    cu_seqlens[
+                        real_decode_count + 1 : real_decode_count + regular_prefill_count + 1
+                    ]
+                    - real_decode_count
+                )
+
+            # Pad the rest with the last value (effectively length 0 segments)
+            last_val = self._cu_seqlens_buffer[regular_prefill_count]
+            self._cu_seqlens_buffer[regular_prefill_count + 1 : padded_prefill_count + 1].fill_(
+                last_val
+            )
+            self.cu_seqlens = self._cu_seqlens_buffer[: padded_prefill_count + 1]
+
+        if padded_decode_count > 0 and padded_prefill_count > 0:
+            self._device_decode_prefill_buffer[0] = real_decode_count
+            self._device_decode_prefill_buffer[1] = regular_prefill_count
+            self.device_decode_prefill = self._device_decode_prefill_buffer
+
+        # If using chunked prefill for this batch, store the number of regular prefill tokens
+        # and the number of tokens in the chunked prefill request
+        if has_explicit_chunked_prefill_req:
+            chunked_prefill_token_count = (
+                cu_seqlens[real_decode_count + real_prefill_count]
+                - cu_seqlens[real_decode_count + real_prefill_count - 1]
+            )
+            assert self.cu_seqlens is not None
+            self._device_chunked_prefill_buffer[0] = self.cu_seqlens[regular_prefill_count]
+            self._device_chunked_prefill_buffer[1] = chunked_prefill_token_count
+            self.device_chunked_prefill = self._device_chunked_prefill_buffer
 
     def allocate_slot(self) -> Optional[int]:
         """
diff --git a/megatron/core/inference/contexts/attention_context/triton/tensor_ops.py b/megatron/core/inference/contexts/attention_context/triton/tensor_ops.py
new file mode 100644
index 00000000000..2f3210488f5
--- /dev/null
+++ b/megatron/core/inference/contexts/attention_context/triton/tensor_ops.py
@@ -0,0 +1,462 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from typing import Optional
+
+import torch
+import triton  # type: ignore
+import triton.language as tl  # type: ignore
+
+
+@triton.jit
+def _tensor_get_slice_after_kernel(
+    INPUT_TENSOR,
+    OUTPUT_TENSOR,
+    POS_ON_DEVICE,
+    INPUT_BATCH_SIZE: tl.constexpr,
+    OUTPUT_BATCH_SIZE: tl.constexpr,
+    ROW_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Kernel to copy rows from INPUT_TENSOR[pos_on_device:] into OUTPUT_TENSOR."""
+
+    pid = tl.program_id(0)
+    pos_on_device = tl.load(POS_ON_DEVICE)
+    copy_size = INPUT_BATCH_SIZE - pos_on_device
+
+    if pid < copy_size and pid < OUTPUT_BATCH_SIZE:
+        input_idx = pos_on_device + pid
+
+        if input_idx < INPUT_BATCH_SIZE:
+            row_offsets = tl.arange(0, BLOCK_SIZE)
+            row_mask = row_offsets < ROW_SIZE
+
+            input_ptr = INPUT_TENSOR + input_idx * ROW_SIZE + row_offsets
+            output_ptr = OUTPUT_TENSOR + pid * ROW_SIZE + row_offsets
+
+            input_data = tl.load(input_ptr, mask=row_mask, other=0.0)
+            tl.store(output_ptr, input_data, mask=row_mask)
+
+
+@triton.jit
+def _tensor_merge_kernel(
+    TENSOR_A,
+    TENSOR_B,
+    OUTPUT_TENSOR,
+    POS_ON_DEVICE,
+    TENSOR_B_BATCH_SIZE: tl.constexpr,
+    ROW_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    OUTPUT_BATCH_SIZE: tl.constexpr,
+    IS_INPLACE: tl.constexpr,
+):
+    """
+    Kernel to merge rows from tensor_a and tensor_b into output_tensor.
+
+    - output[:pos_on_device] = tensor_a[:pos_on_device]
+    - output[pos_on_device:pos_on_device + tensor_b_batch] = tensor_b[:tensor_b_batch]
+    """
+
+    pid = tl.program_id(0)
+    pos_on_device = tl.load(POS_ON_DEVICE)
+
+    if pid < pos_on_device:
+        if not IS_INPLACE:
+            row_offsets = tl.arange(0, BLOCK_SIZE)
+            row_mask = row_offsets < ROW_SIZE
+
+            tensor_a_ptr = TENSOR_A + pid * ROW_SIZE + row_offsets
+            output_ptr = OUTPUT_TENSOR + pid * ROW_SIZE + row_offsets
+
+            tensor_a_data = tl.load(tensor_a_ptr, mask=row_mask, other=0.0)
+            tl.store(output_ptr, tensor_a_data, mask=row_mask)
+
+    elif pid < pos_on_device + TENSOR_B_BATCH_SIZE and pid < OUTPUT_BATCH_SIZE:
+        tensor_b_idx = pid - pos_on_device
+
+        if tensor_b_idx < TENSOR_B_BATCH_SIZE:
+            row_offsets = tl.arange(0, BLOCK_SIZE)
+            row_mask = row_offsets < ROW_SIZE
+
+            tensor_b_ptr = TENSOR_B + tensor_b_idx * ROW_SIZE + row_offsets
+            output_ptr = OUTPUT_TENSOR + pid * ROW_SIZE + row_offsets
+
+            tensor_b_data = tl.load(tensor_b_ptr, mask=row_mask, other=0.0)
+            tl.store(output_ptr, tensor_b_data, mask=row_mask)
+
+
+@triton.jit
+def _tensor_masked_update_kernel_2d(
+    STATES_PTR,
+    IDX_PTR,
+    NEW_STATES_PTR,
+    stride_state_b,
+    stride_state_d0,
+    stride_new_b,
+    stride_new_d0,
+    ROW_SIZE,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Kernel to update values in a 2D states tensor using a mask."""
+    pid_batch = tl.program_id(0).to(tl.int64)
+    pid_row_chunk = tl.program_id(1).to(tl.int64)
+
+    target_idx = tl.load(IDX_PTR + pid_batch)
+    if target_idx == -1:
+        return
+
+    row_start_offset = pid_row_chunk * BLOCK_SIZE
+    row_offsets = row_start_offset + tl.arange(0, BLOCK_SIZE)
+    mask = row_offsets < ROW_SIZE
+
+    # 2D Calculation: base + batch * stride0 + col * stride1
+    dst_ptr = (
+        STATES_PTR
+        + (target_idx.to(tl.int64) * stride_state_b)
+        + (row_offsets.to(tl.int64) * stride_state_d0)
+    )
+    src_ptr = (
+        NEW_STATES_PTR
+        + (pid_batch * stride_new_b.to(tl.int64))
+        + (row_offsets.to(tl.int64) * stride_new_d0)
+    )
+
+    val = tl.load(src_ptr, mask=mask)
+    tl.store(dst_ptr, val, mask=mask)
+
+
+@triton.jit
+def _tensor_masked_update_kernel_3d(
+    STATES_PTR,
+    IDX_PTR,
+    NEW_STATES_PTR,
+    stride_state_b,
+    stride_state_d0,
+    stride_state_d1,
+    stride_new_b,
+    stride_new_d0,
+    stride_new_d1,
+    SIZE_D0,
+    SIZE_D1,  # Dimensions of the non-batch axes
+    ROW_SIZE,  # Total elements per batch item (D0 * D1)
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Kernel to update values in a 3D states tensor using a mask."""
+    pid_batch = tl.program_id(0).to(tl.int64)
+    pid_row_chunk = tl.program_id(1).to(tl.int64)
+
+    target_idx = tl.load(IDX_PTR + pid_batch)
+    if target_idx == -1:
+        return
+
+    # Linear index within the "row" (flattened 3D volume)
+    row_start_offset = pid_row_chunk * BLOCK_SIZE
+    flat_offsets = row_start_offset + tl.arange(0, BLOCK_SIZE)
+    mask = flat_offsets < ROW_SIZE
+
+    # Reconstruct 3D coordinates from linear index
+    # Given shape (batch, D0, D1)
+    # idx_d1 = flat_idx % D1
+    # idx_d0 = flat_idx // D1
+    idx_d1 = flat_offsets % SIZE_D1.to(tl.int64)
+    idx_d0 = flat_offsets // SIZE_D1.to(tl.int64)
+
+    # Calculate pointers using specific strides
+    dst_offset = (
+        (target_idx.to(tl.int64) * stride_state_b.to(tl.int64))
+        + (idx_d0 * stride_state_d0)
+        + (idx_d1 * stride_state_d1)
+    )
+
+    src_offset = (
+        (pid_batch * stride_new_b.to(tl.int64))
+        + (idx_d0 * stride_new_d0)
+        + (idx_d1 * stride_new_d1)
+    )
+
+    dst_ptr = STATES_PTR + dst_offset
+    src_ptr = NEW_STATES_PTR + src_offset
+
+    val = tl.load(src_ptr, mask=mask)
+    tl.store(dst_ptr, val, mask=mask)
+
+
+@triton.jit
+def _tensor_masked_update_kernel_4d(
+    STATES_PTR,
+    IDX_PTR,
+    NEW_STATES_PTR,
+    stride_state_b,
+    stride_state_d0,
+    stride_state_d1,
+    stride_state_d2,
+    stride_new_b,
+    stride_new_d0,
+    stride_new_d1,
+    stride_new_d2,
+    SIZE_D0,
+    SIZE_D1,
+    SIZE_D2,  # Dimensions (C, H, W)
+    ROW_SIZE,  # Total elements (C * H * W)
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Kernel to update values in a 4D states tensor using a mask."""
+    pid_batch = tl.program_id(0).to(tl.int64)
+    pid_row_chunk = tl.program_id(1).to(tl.int64)
+
+    target_idx = tl.load(IDX_PTR + pid_batch)
+    if target_idx == -1:
+        return
+
+    # Linear index
+    row_start_offset = pid_row_chunk * BLOCK_SIZE
+    flat_offsets = row_start_offset + tl.arange(0, BLOCK_SIZE)
+    mask = flat_offsets < ROW_SIZE
+
+    # Reconstruct 4D coordinates from linear index
+    # Given shape (batch, D0, D1, D2)
+    # idx_d2 = flat % D2
+    # temp   = flat // D2
+    # idx_d1 = temp % D1
+    # idx_d0 = temp // D1
+
+    idx_d2 = flat_offsets % SIZE_D2.to(tl.int64)
+    temp = flat_offsets // SIZE_D2.to(tl.int64)
+    idx_d1 = temp % SIZE_D1.to(tl.int64)
+    idx_d0 = temp // SIZE_D1.to(tl.int64)
+
+    # Calculate pointers using specific strides
+    dst_offset = (
+        (target_idx.to(tl.int64) * stride_state_b.to(tl.int64))
+        + (idx_d0 * stride_state_d0)
+        + (idx_d1 * stride_state_d1)
+        + (idx_d2 * stride_state_d2)
+    )
+
+    src_offset = (
+        (pid_batch * stride_new_b.to(tl.int64))
+        + (idx_d0 * stride_new_d0)
+        + (idx_d1 * stride_new_d1)
+        + (idx_d2 * stride_new_d2)
+    )
+
+    dst_ptr = STATES_PTR + dst_offset
+    src_ptr = NEW_STATES_PTR + src_offset
+
+    val = tl.load(src_ptr, mask=mask)
+    tl.store(dst_ptr, val, mask=mask)
+
+
+def _compute_row_size(tensor):
+    if tensor.ndim == 1:
+        return 1
+
+    row_size = 1
+    for dim in tensor.shape[1:]:
+        row_size *= dim
+    return row_size
+
+
+def tensor_get_slice_after(input_tensor, output_tensor, pos_on_device, check_bounds: bool = False):
+    """
+    Copy from input_tensor[pos_on_device:] to output_tensor[:copy_size].
+    """
+
+    assert (
+        input_tensor.device == output_tensor.device
+    ), "Input and output tensors must be on the same device"
+    assert (
+        input_tensor.dtype == output_tensor.dtype
+    ), "Input and output tensors must have the same dtype"
+    assert (
+        input_tensor.is_contiguous() and output_tensor.is_contiguous()
+    ), "Input and output tensors must be contiguous"
+
+    if check_bounds:
+        assert (
+            input_tensor.ndim == output_tensor.ndim
+        ), "Input and output tensors must have the same number of dimensions"
+
+        for i in range(1, input_tensor.ndim):
+            assert (
+                input_tensor.shape[i] == output_tensor.shape[i]
+            ), f"Dimension {i} must match between input and output tensors"
+
+        pos_on_device_val = pos_on_device[0].item()
+        assert (
+            0 <= pos_on_device_val <= input_tensor.shape[0]
+        ), "pos_on_device must be between 0 and input_tensor.shape[0]"
+
+        copy_size = input_tensor.shape[0] - pos_on_device_val
+        assert (
+            copy_size <= output_tensor.shape[0]
+        ), f"Copy size ({copy_size}) exceeds output_tensor batch size ({output_tensor.shape[0]})"
+
+    input_batch_size = input_tensor.shape[0]
+    output_batch_size = output_tensor.shape[0]
+
+    row_size = _compute_row_size(input_tensor)
+    block_size = triton.next_power_of_2(row_size)
+
+    grid = (input_batch_size,) if input_batch_size > 0 else (1,)
+
+    if input_batch_size > 0:
+        _tensor_get_slice_after_kernel[grid](
+            input_tensor,
+            output_tensor,
+            POS_ON_DEVICE=pos_on_device,
+            INPUT_BATCH_SIZE=input_batch_size,
+            OUTPUT_BATCH_SIZE=output_batch_size,
+            ROW_SIZE=row_size,
+            BLOCK_SIZE=block_size,
+        )
+
+
+def tensor_merge(
+    tensor_a: torch.Tensor,
+    tensor_b: torch.Tensor,
+    pos_on_device: torch.Tensor,
+    output_tensor: Optional[torch.Tensor] = None,
+    check_bounds: bool = False,
+):
+    """
+    Merge tensor_a and tensor_b.
+
+    If output_tensor is None, the operation is performed in-place on tensor_a.
+    """
+
+    is_inplace = False
+    if output_tensor is None:
+        output_tensor = tensor_a
+        is_inplace = True
+
+    assert (
+        tensor_a.device == tensor_b.device == output_tensor.device
+    ), "All tensors must be on the same device"
+    assert (
+        tensor_a.dtype == tensor_b.dtype == output_tensor.dtype
+    ), "All tensors must have the same dtype"
+    assert (
+        tensor_a.is_contiguous() and tensor_b.is_contiguous() and output_tensor.is_contiguous()
+    ), "All tensors must be contiguous"
+
+    if check_bounds:
+        assert (
+            tensor_a.ndim == tensor_b.ndim == output_tensor.ndim
+        ), "All tensors must have the same number of dimensions"
+
+        for i in range(1, tensor_a.ndim):
+            assert (
+                tensor_a.shape[i] == tensor_b.shape[i] == output_tensor.shape[i]
+            ), f"Dimension {i} must match across all tensors"
+
+        assert (
+            output_tensor.shape[0] >= tensor_a.shape[0]
+        ), "output_tensor batch size must be >= tensor_a batch size"
+
+        pos_on_device_val = pos_on_device[0].item()
+        assert (
+            0 <= pos_on_device_val <= tensor_a.shape[0]
+        ), "pos_on_device must be between 0 and tensor_a batch size"
+
+    tensor_b_batch_size = tensor_b.shape[0]
+    output_batch_size = output_tensor.shape[0]
+
+    row_size = _compute_row_size(tensor_a)
+    block_size = triton.next_power_of_2(row_size)
+
+    grid = (output_batch_size,)
+
+    _tensor_merge_kernel[grid](
+        tensor_a,
+        tensor_b,
+        output_tensor,
+        POS_ON_DEVICE=pos_on_device,
+        TENSOR_B_BATCH_SIZE=tensor_b_batch_size,
+        ROW_SIZE=row_size,
+        BLOCK_SIZE=block_size,
+        OUTPUT_BATCH_SIZE=output_batch_size,
+        IS_INPLACE=is_inplace,
+    )
+
+
+def tensor_masked_update(states: torch.Tensor, idx: torch.Tensor, new_states: torch.Tensor):
+    """
+    Update `states` to `new_states` at `idx`, but ignore any -1 values in `idx`.
+    Works for 2D, 3D, or 4D tensors.
+
+    Args:
+        states: (N, ...) - Destination tensor (2D, 3D, or 4D)
+        idx: (B,) - Indices to update. -1 means skip.
+        new_states: (B, ...) - Source tensor. Must match states shape[1:]
+    """
+    assert states.is_cuda and idx.is_cuda and new_states.is_cuda
+    assert idx.ndim == 1
+    assert states.shape[1:] == new_states.shape[1:], "State dimensions must match"
+
+    ndim = states.ndim
+    assert ndim in [2, 3, 4], "Only 2D, 3D, and 4D tensors are supported"
+
+    n_updates = idx.shape[0]
+
+    row_size = 1
+    for dim in states.shape[1:]:
+        row_size *= dim
+
+    BLOCK_SIZE = 1024
+    grid = lambda meta: (n_updates, triton.cdiv(row_size, meta["BLOCK_SIZE"]))
+
+    if ndim == 2:
+        _tensor_masked_update_kernel_2d[grid](
+            STATES_PTR=states,
+            IDX_PTR=idx,
+            NEW_STATES_PTR=new_states,
+            stride_state_b=states.stride(0),
+            stride_state_d0=states.stride(1),
+            stride_new_b=new_states.stride(0),
+            stride_new_d0=new_states.stride(1),
+            ROW_SIZE=row_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+
+    elif ndim == 3:
+        # Shapes: (N, D0, D1)
+        _tensor_masked_update_kernel_3d[grid](
+            STATES_PTR=states,
+            IDX_PTR=idx,
+            NEW_STATES_PTR=new_states,
+            # Strides
+            stride_state_b=states.stride(0),
+            stride_state_d0=states.stride(1),
+            stride_state_d1=states.stride(2),
+            stride_new_b=new_states.stride(0),
+            stride_new_d0=new_states.stride(1),
+            stride_new_d1=new_states.stride(2),
+            # Dims
+            SIZE_D0=states.shape[1],
+            SIZE_D1=states.shape[2],
+            ROW_SIZE=row_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+
+    elif ndim == 4:
+        # Shapes: (N, D0, D1, D2)
+        _tensor_masked_update_kernel_4d[grid](
+            STATES_PTR=states,
+            IDX_PTR=idx,
+            NEW_STATES_PTR=new_states,
+            # Strides
+            stride_state_b=states.stride(0),
+            stride_state_d0=states.stride(1),
+            stride_state_d1=states.stride(2),
+            stride_state_d2=states.stride(3),
+            stride_new_b=new_states.stride(0),
+            stride_new_d0=new_states.stride(1),
+            stride_new_d1=new_states.stride(2),
+            stride_new_d2=new_states.stride(3),
+            # Dims
+            SIZE_D0=states.shape[1],
+            SIZE_D1=states.shape[2],
+            SIZE_D2=states.shape[3],
+            ROW_SIZE=row_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
diff --git a/megatron/core/inference/contexts/dynamic_block_allocator.py b/megatron/core/inference/contexts/dynamic_block_allocator.py
index aaf7d4444fc..8207a17550c 100644
--- a/megatron/core/inference/contexts/dynamic_block_allocator.py
+++ b/megatron/core/inference/contexts/dynamic_block_allocator.py
@@ -16,21 +16,20 @@ class BlockAllocator:
 
     Args:
         context (DynamicInferenceContext): Dynamic inference context.
-        active_count (int): Total number of active blocks available in the buffer.
-            The full buffer size is 2*active_count, to accommodate an equal-size
-            space for paused requests that live on the CPU.
+        total_count (int): Total number of blocks in the buffer.
+        paused_count (int): Number of paused blocks in the buffer. Must be less
+            than `total_count`.
     """
 
-    def __init__(self, context: "DynamicInferenceContext", total_count: int):
+    def __init__(self, context: "DynamicInferenceContext", total_count: int, paused_count: int):
 
         self.context = context
 
-        active_count = (total_count - 1) // 2  # -1 for dummy_block_idx (see below)
-        active_count = max(1, active_count)  # need at least one block
-        self.total_count = 2 * active_count + 1  # +1 for dummy_block_idx
-        self.total_avail = self.total_count - 1  # -1 for dummy_block_idx
-        self.active_count = active_count
-        self.paused_count = self.total_count - self.active_count - 1  # -1 for dummy_block_idx
+        self.total_count = total_count
+        self.total_avail = total_count - 1  # -1 for dummy_block_idx (see below)
+        self.paused_count = paused_count
+        self.active_count = total_count - paused_count - 1  # -1 for dummy_block_idx
+        assert self.active_count >= 1  # ensures paused_count < total_count - 1
         self.dummy_block_idx = self.total_count - 1
 
         # Initialize block pool as a "stack" data structure
@@ -40,10 +39,15 @@ def __init__(self, context: "DynamicInferenceContext", total_count: int):
 
     def __str__(self):
         return (
-            f"total avail {self.total_avail} / {self.total_count - 1}"
-            f"; active {self.active_count}"
+            f"using: total {self.get_total_used()}/{self.total_count - 1}"
+            f"; active {self.get_active_used()}/{self.active_count}"
+            f"; paused {self.get_paused_used()}/{self.paused_count}"
         )
 
+    def get_total_used(self):
+        """Compute number of total blocks used."""
+        return self.total_count - self.total_avail - 1
+
     def get_active_used(self):
         """Compute number of active blocks used."""
         return (
@@ -77,7 +81,7 @@ def is_memory_available(self, num_blocks: int) -> bool:
         Return:
             (bool) Is memory available?
         """
-        return self.get_active_avail() >= num_blocks
+        return self.total_avail >= num_blocks
 
     def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]:
         """Allocate memory blocks if available, else return None.
@@ -116,4 +120,16 @@ def reset(self) -> None:
         (except for the dummy block).
         """
 
+        # Reset block bag to so we start consuming from the beginning of the pool
+        # for UVM performance.
+        # *Note*: Resetting the block bag is essential because if engine has been
+        # suspended, then the block bag contains non-unique IDs since the
+        # right-most IDs have been 'popped' off and are owned by the context.
+        # Without resetting the block bag, context request memory will clash and
+        # requests will point to each other's memory blocks, resulting in faulty
+        # generations.
+        self.block_bag = torch.arange(
+            self.total_count, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+
         self.total_avail = self.total_count - 1
diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py
index 2f559bf581d..5dc2d503097 100644
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -28,10 +28,11 @@
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
-from megatron.core.utils import internal_api
+from megatron.core.utils import get_attr_wrapped_model, get_pg_size, internal_api
 
 from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata
 from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
@@ -57,6 +58,14 @@
 except ImportError:
     HAVE_FLASHINFER = False
 
+try:
+    from torch_memory_saver import torch_memory_saver
+
+    torch_memory_saver.hook_mode = "torch"
+    HAVE_TORCH_MEMORY_SAVER = True
+except ImportError:
+    HAVE_TORCH_MEMORY_SAVER = False
+
 try:
     import wandb  # pylint: disable=unused-import
 
@@ -116,7 +125,7 @@ class BlockOverflowError(ContextOverflowError):
 
 class ActiveRequestCountOverflowError(ContextOverflowError):
     '''Used when `initialize_attention_state()` is called with
-    `num_warmup_requests > max_active_requests.'''
+    `num_warmup_requests > max_requests.'''
 
     def __init__(self, max_request_count, active_request_count):
         assert active_request_count > max_request_count
@@ -174,7 +183,7 @@ def deserialize(cls, obj: dict) -> ContextOverflowError:
             "ActiveRequestCountOverflowError": ActiveRequestCountOverflowError,
         }[obj["type"]]
         error = ContextOverflowError(**{k: v for k, v in obj.items() if k != "type"})
-        error.__class__ = error_cls  # todo (@lmcafe): better/safer alternative?
+        error.__class__ = error_cls  # todo (@lmcafee): better/safer alternative?
         return error
 
 
@@ -199,9 +208,9 @@ class DynamicInferenceContext(BaseInferenceContext):
     at any step. The only constraint is the maximum number of requests or tokens
     that the context is defined to support. For the block-level KV cache, a memory
     buffer is allocated up front (size `buffer_size_gb` if `unified_memory_level`
-    == 0, or `2 * buffer_size_gb` if `unified_memory_level` == 1), that is
-    divided into blocks and dynamically assigned to requests. At any given step,
-    any unassigned blocks equate to unused space.
+    == 0, or `buffer_size_gb + paused_buffer_size_gb` if `unified_memory_level` ==
+    1), that is divided into blocks and dynamically assigned to requests. At any
+    given step, any unassigned blocks equate to unused space.
 
     Args:
         params_dtype (torch.dtype): Dtype used for KV cache.
@@ -212,16 +221,24 @@ class DynamicInferenceContext(BaseInferenceContext):
             that will occur.
         buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache.
             if `unified_memory_level` >= 1, then CPU memory is additionally
-            utilized, resulting in a total buffer size of `2 * buffer_size_gb`.
-            Regardless of total buffer size, the KV cache is conceptually divided
-            into 50% active requests and 50% paused requests.
+            utilized, resulting in a total buffer size of `buffer_size_gb +
+            paused_buffer_size_gb`.
+        paused_buffer_size_gb (float | None): Portion of buffer reserved for
+            paused requests. Active requests are paused when there are not enough
+            active blocks available to continue generating a request. The total
+            buffer size (active + paused) depends on `unified_memory_level` (uvm):
+            - uvm 0: buffer_size_gb (paused buffer is inclusive)
+            - uvm 1: buffer_size_gb + paused_buffer_size_gb
+        max_requests (int): Max number of active requests to use for
+            decode-only forward passes. This value is primarily limited by the
+            combination of `buffer_size_gb` and `max_sequence_length`.
         max_tokens (int): Max number of tokens to use for forward passes. This is
             primarily limited by prefill activation memory usage. (Defaults to
             16384).
         block_size_tokens (int): Size of KV cache block size.
         tensor_model_parallel_size (Optional[int]): Tensor model parallel size.
         num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture,
-            where the cuda graph batch sizes range from 1 to `max_active_requests`
+            where the cuda graph batch sizes range from 1 to `max_requests`
             (as computed below). Due to rounding, the actual number of cuda graphs
             may not equal this argument.
         materialize_only_last_token_logits (Optional[bool]): Whether to only
@@ -238,9 +255,9 @@ class DynamicInferenceContext(BaseInferenceContext):
         use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
             If None, defaults to using flash-infer if available.
         metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
-        num_request_metadata (Optional[int]): Number of metadata fields to track per request.
-            These represent metadata that is needed by the text generation controller,
-            and that must be kept in sync with active requests through update_requests.
+        request_metadata_types (Optional[List[Tuple[str, torch.dtype, bool]]]): A list of the
+            per-request metadata types to track. Each entry is a tuple consisting of the string
+            label, the target dtype, and whether to store the data on GPU.
     """
 
     DEFAULT_MAX_TOKENS = 16384
@@ -256,9 +273,13 @@ def __init__(
         num_attention_heads: int,
         max_sequence_length: int,
         buffer_size_gb: float,
+        paused_buffer_size_gb: float | None = None,
+        max_requests: int = None,
         max_tokens: int = DEFAULT_MAX_TOKENS,
         block_size_tokens: int = 256,
         tensor_model_parallel_size: Optional[int] = None,
+        pipeline_model_parallel_size: Optional[int] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
         cache_mla_latent: bool = False,
         kv_lora_rank: Optional[int] = None,
         qk_pos_emb_head_dim: Optional[int] = None,
@@ -267,11 +288,13 @@ def __init__(
         mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None,
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
-        unified_memory_level: Optional[int] = 1,
+        unified_memory_level: Optional[int] = 0,
         cuda_graph_max_tokens: Optional[int] = None,
         cuda_graph_mixed_prefill_count: Optional[int] = 16,
         metrics_writer: Optional['WandbModule'] = None,
-        num_request_metadata: Optional[int] = None,
+        request_metadata_types: Optional[List[Tuple[str, torch.dtype, bool]]] = None,
+        persist_cuda_graphs: Optional[bool] = False,
+        offload_kv_cache: Optional[bool] = False,
     ):
         super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
 
@@ -281,17 +304,58 @@ def __init__(
                 block_size_tokens == 64
             ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
 
+        # give deprecated args warning for cuda_graph_max_tokens
+        if cuda_graph_max_tokens is not None:
+            warnings.warn(
+                "`cuda_graph_max_tokens` is deprecated and will be removed in a future release. "
+                "The context now automatically sets the max tokens for cuda graphs based on "
+                "`max_requests`.",
+                DeprecationWarning,
+            )
+
         self.metrics_writer = metrics_writer
 
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
-            tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            tp_size = (
+                get_pg_size(pg_collection.tp)
+                if pg_collection is not None
+                else parallel_state.get_tensor_model_parallel_world_size()
+            )
         else:
             tp_size = tensor_model_parallel_size
         self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
         self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
 
+        if pipeline_model_parallel_size is None:
+            pp_size = (
+                get_pg_size(pg_collection.pp)
+                if pg_collection is not None
+                else parallel_state.get_pipeline_model_parallel_world_size()
+            )
+        else:
+            pp_size = pipeline_model_parallel_size
+
+        # Cache the PP group we should use for PP collectives inside the context.
+        # If the model provides a pg_collection with a pp group, prefer it.
+        # Otherwise:
+        # - for PP=1 we don't need a PP group at all
+        # - for PP>1 we require Megatron parallel_state to be initialized
+        if pg_collection is not None and get_pg_size(pg_collection.pp) > 1:
+            self.pipeline_parallel_group = pg_collection.pp
+        elif pp_size > 1:
+            self.pipeline_parallel_group = parallel_state.get_pipeline_model_parallel_group()
+        else:
+            self.pipeline_parallel_group = None
+
+        if pg_collection is not None:
+            self.expert_model_parallel_group = pg_collection.ep
+        elif parallel_state.get_expert_model_parallel_world_size() > 1:
+            self.expert_model_parallel_group = parallel_state.get_expert_model_parallel_group()
+        else:
+            self.expert_model_parallel_group = None
+
         # Mamba states.
         self.is_hybrid_model = mamba_inference_state_config is not None
         if self.is_hybrid_model:
@@ -303,9 +367,6 @@ def __init__(
             assert (
                 mamba_ssm_states_shape is not None
             ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-            assert not (
-                num_cuda_graphs is not None and use_cuda_graphs_for_non_decode_steps
-            ), "Non-decode CUDA graphs not yet supported for hybrid models"
 
             # For hybrid models, the layer map converts the global layer index to the
             # corresponding attention layer index or Mamba layer index depending on the
@@ -362,6 +423,7 @@ def __init__(
 
         # Unified memory.
         self.unified_memory_level = unified_memory_level
+        self.persist_cuda_graphs = persist_cuda_graphs
         if unified_memory_level > 0:
             try:
                 self.unified_memory_mempool = create_unified_mempool()
@@ -374,34 +436,44 @@ def __init__(
 
         # Initialize block allocator.
         buffer_size_bytes = int(buffer_size_gb * 1024**3)
-        block_count_total = buffer_size_bytes // (
+        paused_buffer_size_bytes = (
+            0 if paused_buffer_size_gb is None else int(paused_buffer_size_gb * 1024**3)
+        )
+        # TODO: Add parameter to control fraction of memory assigned to KV cache
+        # versus Mamba state.
+        block_count = buffer_size_bytes // (self.block_size_bytes + mamba_states_memory_per_request)
+        block_count = max(2, block_count)  # need >= 1 active block + 1 dummy block
+        paused_block_count = paused_buffer_size_bytes // (
             self.block_size_bytes + mamba_states_memory_per_request
         )
+
+        # If using pipeline parallelism synchronize the total block count in case the
+        # pipeline stages have different layer allocations. Non-uniform block counts
+        # can lead to some ranks pausing requests earlier than other ranks
+        # (i.e., divergence in the scheduling behavior).
+        if pp_size > 1:
+            block_count_tensor = torch.tensor(
+                block_count, dtype=torch.int32, device=torch.cuda.current_device()
+            )
+            torch.distributed.all_reduce(
+                block_count_tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.pipeline_parallel_group,
+            )
+            block_count = block_count_tensor.item()
+
         self.block_allocator = BlockAllocator(
             context=self,
             total_count=(
-                block_count_total if self.unified_memory_level == 0 else 2 * block_count_total
+                block_count if self.unified_memory_level == 0 else block_count + paused_block_count
             ),
-        )
-
-        # Set max_total_requests, max_active_requests, max_tokens.
-        self.max_total_requests = self.block_allocator.total_count - 1  # -1 for dummy block
-        max_active_requests = self.block_allocator.active_count // tp_size * tp_size
-        self.max_active_requests = (
-            max_active_requests // self.REQUEST_ROUNDER * self.REQUEST_ROUNDER
-        )
-        self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS
-
-        assert self.max_tokens >= self.max_active_requests, (
-            f"max_tokens ({self.max_tokens}) must be >= "
-            f"max_active_requests ({self.max_active_requests}), "
-            "to have consistency between cuda graph sizes and the block table size."
+            paused_count=paused_block_count,
         )
 
         # Track request metadata.
-        if num_request_metadata is None:
-            num_request_metadata = len(DynamicInferenceRequest.get_metadata_labels())
-        self.num_request_metadata = num_request_metadata
+        if request_metadata_types is None:
+            request_metadata_types = DynamicInferenceRequest.get_metadata_types()
+        self.request_metadata_types = request_metadata_types
 
         # Initialize context state.
         self.params_dtype = params_dtype
@@ -424,6 +496,24 @@ def __init__(
         # Block ids.
         self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens)
 
+        # Set max_requests, max_tokens.
+        if max_requests is None:
+            # Maximize compute utilization by defaulting to 1 block per request.
+            self.max_requests = self.block_allocator.total_count - 1  # -1 for dummy block
+            self.max_requests = self.max_requests // tp_size * tp_size
+            self.max_requests = self.max_requests // self.REQUEST_ROUNDER * self.REQUEST_ROUNDER
+        else:
+            # User can control request overflow via max_requests.
+            self.max_requests = max_requests
+
+        self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS
+
+        assert self.max_tokens >= self.max_requests, (
+            f"max_tokens ({self.max_tokens}) must be >= "
+            f"max_requests ({self.max_requests}), "
+            "to have consistency between cuda graph sizes and the block table size."
+        )
+
         # Attention metadata initialization (tensors are now handled by MHAMetadata classes)
 
         self.num_prefill_requests = 0
@@ -434,7 +524,7 @@ def __init__(
         self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata(
             block_count_total=self.block_allocator.total_count,
             max_kv_block_count=self.max_kv_block_count,
-            max_requests=self.max_total_requests,
+            max_requests=self.max_requests,
             block_size_tokens=self.block_size_tokens,
             max_seqlen=self.max_sequence_length,
         )
@@ -442,7 +532,7 @@ def __init__(
         self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata(
             block_count_total=self.block_allocator.total_count,
             max_kv_block_count=self.max_kv_block_count,
-            max_requests=self.max_total_requests,
+            max_requests=self.max_requests,
             block_size_tokens=self.block_size_tokens,
             max_seqlen=self.max_sequence_length,
         )
@@ -452,18 +542,26 @@ def __init__(
             CUDAGraphBatchDimensionBuilder.generate_cuda_graph_batch_dimensions_list(
                 tp_size=tp_size,
                 num_cuda_graphs=num_cuda_graphs,
-                cuda_graph_max_tokens=cuda_graph_max_tokens,
+                cuda_graph_max_tokens=self.max_requests,
                 cuda_graph_mixed_prefill_count=cuda_graph_mixed_prefill_count,
-                max_requests=self.max_active_requests,
+                max_requests=self.max_requests,
                 max_tokens=self.max_tokens,
                 max_sequence_length=self.max_sequence_length,
                 use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps,
             )
         )
 
+        # Whether to offload the KV cache. Determines where the KV cache is allocated within memory.
+        self.offload_kv_cache = offload_kv_cache
+        assert not (
+            self.offload_kv_cache and self.unified_memory_level
+        ), "The KV cache should not be instantiated in unified memory when it is offloaded during training."
+
         self._using_cuda_graph_this_step = False
+        self.use_cuda_graphs_for_non_decode_steps = use_cuda_graphs_for_non_decode_steps
         # Deal with chunked prefill
         self.chunked_prefill_request_id = -1
+        self.has_explicit_chunked_prefill_req = False
 
         # FlashInfer.
         if use_flashinfer_fused_rope is True:
@@ -474,6 +572,7 @@ def __init__(
 
         # Allocate GPU state.
         self.is_tensor_state_allocated = False
+        self.is_symmetric_memory_initialized = False
         self.allocate_all_tensors(is_init=True)
 
         # Print info.
@@ -517,7 +616,7 @@ def allocate_all_tensors(self, *, is_init: bool) -> None:
 
         # Per-request state.
         self.request_ids = torch.full(
-            (self.max_total_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
+            (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
         )
         # request_query_lengths is the input prompt tokens length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation)
         self.request_query_lengths = torch.empty_like(self.request_ids)
@@ -530,18 +629,19 @@ def allocate_all_tensors(self, *, is_init: bool) -> None:
         # request_last_kv_block_offset represents number of tokens in the last kv block
         self.request_last_kv_block_offset = torch.empty_like(self.request_ids)
         self.request_to_kv_block_ids = torch.full(
-            (self.max_total_requests, self.max_kv_block_count),
+            (self.max_requests, self.max_kv_block_count),
             -1,
             dtype=torch.int,
             device=torch.cuda.current_device(),
         )
 
         # Track request metadata.
-        self.request_metadata = torch.empty(
-            (self.max_total_requests, self.num_request_metadata),
-            dtype=torch.float32,
-            device=torch.cuda.current_device(),
-        )
+        self.request_metadata = {
+            label: torch.empty(
+                (self.max_requests,), dtype=dtype, device=torch.cuda.current_device()
+            )
+            for label, dtype, _ in self.request_metadata_types
+        }
 
         # Per-token state.
         self.token_to_input_ids = torch.full(
@@ -572,32 +672,41 @@ def allocate_memory_buffer():
                     device=torch.cuda.current_device(),
                 )
             else:
-                self.memory_buffer = torch.empty(
-                    (
-                        2,  # key and value
-                        self.num_attention_layers,
-                        self.block_allocator.total_count,
-                        self.block_size_tokens,
-                        self.num_attention_heads_per_partition,
-                        self.hidden_size_per_attention_head,
-                    ),
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
+                ctx = (
+                    torch_memory_saver.region(tag="kv_cache", enable_cpu_backup=True)
+                    if HAVE_TORCH_MEMORY_SAVER and self.offload_kv_cache
+                    else nullcontext()
                 )
 
+                with ctx:
+                    self.memory_buffer = torch.empty(
+                        (
+                            2,  # key and value
+                            self.num_attention_layers,
+                            self.block_allocator.total_count,
+                            self.block_size_tokens,
+                            self.num_attention_heads_per_partition,
+                            self.hidden_size_per_attention_head,
+                        ),
+                        dtype=self.params_dtype,
+                        device=torch.cuda.current_device(),
+                    )
+
         # Optional state tensors for hybrid models
         def allocate_mamba_states():
             """Allocate Mamba states. This function is called below within
             `with ctx_manager:`."""
             if self.is_hybrid_model:
-                self.mamba_metadata = MambaMetadata(max_requests=self.max_total_requests)
+                self.mamba_metadata = MambaMetadata(
+                    max_requests=self.max_requests, max_tokens=self.max_tokens
+                )
                 self.mamba_conv_states = torch.empty(
-                    (self.num_mamba_layers, self.max_total_requests) + self.mamba_conv_states_shape,
+                    (self.num_mamba_layers, self.max_requests) + self.mamba_conv_states_shape,
                     dtype=self.params_dtype,
                     device=torch.cuda.current_device(),
                 )
                 self.mamba_ssm_states = torch.empty(
-                    (self.num_mamba_layers, self.max_total_requests) + self.mamba_ssm_states_shape,
+                    (self.num_mamba_layers, self.max_requests) + self.mamba_ssm_states_shape,
                     dtype=self.params_dtype,
                     device=torch.cuda.current_device(),
                 )
@@ -675,28 +784,51 @@ def from_config(
         buffer_size_gb: float = 40,
         num_cuda_graphs: int = None,
         mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None,
+        unified_memory_level: int = 0,
     ):
         """
         Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`.
         """
         # TODO: Add other necessary configs from inference_config
 
-        model_config = model.config
-        max_sequence_length = (
-            inference_config.inference_max_seq_length or model_config.max_sequence_length
-        )
+        # Max sequence length.
+        position_embedding_type = get_attr_wrapped_model(model, "position_embedding_type")
+        model_max_seq_len = get_attr_wrapped_model(model, "max_sequence_length")
+        inf_max_seq_len = inference_config.inference_max_seq_length
+
+        if position_embedding_type == "learned_absolute":
+            # When using absolute position embeddings, it is critical that the
+            # context's `max_sequence_length` is less than or equal to the model's
+            # `max_sequence_length`. Otherwise, the context's `position_ids` will
+            # contain ids greater than the dimension of the position embedding
+            # tensor, which will result in an index error.
+            if inf_max_seq_len:
+                max_sequence_length = min(model_max_seq_len, inf_max_seq_len)
+            else:
+                max_sequence_length = model_max_seq_len
+            assert max_batch_size <= model_max_seq_len
+        else:
+            max_sequence_length = (
+                inference_config.inference_max_seq_length or model_config.max_sequence_length
+            )
         max_sequence_length = max(max_sequence_length, max_batch_size)
+
+        # Context.
+        model_config = model.config
         return cls(
             params_dtype=inference_config.params_dtype,
             num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size,
             kv_channels=model_config.kv_channels,
             num_attention_heads=model_config.num_query_groups,
-            max_sequence_length=inference_config.inference_max_seq_length,
+            tensor_model_parallel_size=model_config.tensor_model_parallel_size,
+            pipeline_model_parallel_size=model_config.pipeline_model_parallel_size,
+            max_sequence_length=max_sequence_length,
             buffer_size_gb=buffer_size_gb,
             materialize_only_last_token_logits=False,
             num_cuda_graphs=num_cuda_graphs,
             use_flashinfer_fused_rope=None,
             mamba_inference_state_config=mamba_inference_state_config,
+            unified_memory_level=unified_memory_level,
         )
 
     @classmethod
@@ -976,7 +1108,7 @@ def reset_attention_state(self) -> None:
         self.active_attn_metadata = None
 
         if self.is_hybrid_model:
-            self.mamba_metadata.reset_cudagraph_mapping()
+            self.mamba_metadata.reset_varlen_metadata()
 
     def reset_mamba_state(self) -> None:
         """Reset state used within Mamba layers."""
@@ -992,14 +1124,14 @@ def add_dummy_requests_parallel(
             return
 
         num_new_requests = len(requests)
-        if self.total_request_count + num_new_requests > self.max_active_requests:
+        if self.total_request_count + num_new_requests > self.max_requests:
             raise RequestOverflowError(requests[-1].request_id)
 
         lengths: List[int] = []
         num_tokens_to_generate: List[int] = []
         request_ids: List[int] = []
         prompt_tokens: List[Tensor] = []
-        metadata_rows: List[List[float]] = []
+        metadata_cols: List[List] = [[] for _ in self.request_metadata_types]
 
         for req in requests:
             assert isinstance(
@@ -1020,7 +1152,8 @@ def add_dummy_requests_parallel(
                     device=self.token_to_input_ids.device, dtype=self.token_to_input_ids.dtype
                 )
             )
-            metadata_rows.append(req.tracked_metadata)
+            for i, m in enumerate(req.tracked_metadata):
+                metadata_cols[i].append(m)
 
         total_new_tokens = sum(lengths)
         if self.active_token_count + total_new_tokens > self.max_tokens:
@@ -1034,9 +1167,6 @@ def add_dummy_requests_parallel(
             num_tokens_to_generate, dtype=self.request_query_lengths.dtype, device=device
         )
         request_ids_tensor = torch.tensor(request_ids, dtype=self.request_ids.dtype, device=device)
-        metadata_tensor = torch.tensor(
-            metadata_rows, dtype=self.request_metadata.dtype, device=self.request_metadata.device
-        )
 
         block_counts = torch.div(
             lengths_tensor + (self.block_size_tokens - 1),
@@ -1053,7 +1183,10 @@ def add_dummy_requests_parallel(
         self.request_output_lengths[request_slice] = lengths_tensor + tokens_to_generate_tensor
         self.request_kv_length_offsets[request_slice] = 0
         self.request_kv_block_counts[request_slice] = block_counts
-        self.request_metadata[request_slice] = metadata_tensor
+        for i, (label, dtype, _) in enumerate(self.request_metadata_types):
+            self.request_metadata[label][request_slice] = torch.tensor(
+                metadata_cols[i], dtype=dtype, device=torch.cuda.current_device()
+            )
 
         dummy_block_idx = self.block_allocator.dummy_block_idx
         self.request_last_kv_block_id[request_slice] = dummy_block_idx
@@ -1200,16 +1333,17 @@ def initialize_attention_state(
             token_count=self.active_token_count,
             prefill_req_count=self.num_prefill_requests,
             decode_req_count=self.num_decode_requests,
+            has_explicit_chunked_prefill_req=self.has_explicit_chunked_prefill_req,
         )
         self.batch_dimensions = batch_dimensions
         best_graph = CUDAGraphBatchDimensionBuilder.match_graph_config(
-            batch_dimensions, self.cuda_graph_batch_dimensions_list
+            batch_dimensions,
+            self.cuda_graph_batch_dimensions_list,
+            strict=self.is_hybrid_model,
+            decode_only_cuda_graphs=(not self.use_cuda_graphs_for_non_decode_steps),
+            ep_group=self.expert_model_parallel_group,
         )
         self._using_cuda_graph_this_step = best_graph is not None
-        if construct_graph_dimensions is not None:
-            assert (
-                batch_dimensions == construct_graph_dimensions == best_graph
-            ), f"batch_dimensions: {batch_dimensions}, construct_graph_dimensions: {construct_graph_dimensions}, best_graph: {best_graph}"
 
         if self.using_cuda_graph_this_step():
             self.padded_batch_dimensions = best_graph
@@ -1218,14 +1352,14 @@ def initialize_attention_state(
             if self.is_decode_only():
                 padded_token_count = min(
                     self.max_tokens,
-                    self.max_active_requests,
+                    self.max_requests,
                     self.round_up_tokens(self.active_token_count),
                 )
                 padded_decode_req_count = padded_token_count
                 padded_prefill_req_count = 0
             else:
                 target_padding_req_count = min(
-                    self.max_active_requests,
+                    self.max_requests,
                     self.round_up_requests(self.total_request_count - self.paused_request_count),
                 )
                 padded_decode_req_count = self.num_decode_requests
@@ -1234,9 +1368,11 @@ def initialize_attention_state(
                 token_count=padded_token_count,
                 prefill_req_count=padded_prefill_req_count,
                 decode_req_count=padded_decode_req_count,
+                has_explicit_chunked_prefill_req=self.has_explicit_chunked_prefill_req,
             )
         self.padded_active_token_count = self.padded_batch_dimensions.token_count
         self.padded_active_request_count = self.padded_batch_dimensions.req_count
+        self.padding_slice = slice(self.active_token_count, self.padded_active_token_count)
 
         # Update token position indexes.
         self.token_to_block_idx[self.active_token_count : self.padded_active_token_count] = (
@@ -1263,6 +1399,8 @@ def initialize_attention_state(
 
         attn_dimensions = batch_dimensions
         if self.using_cuda_graph_this_step():
+            assert not self.has_explicit_chunked_prefill_req
+
             # Treat some decode requests as prefill requests to fit the cuda graph batch dimension.
             if batch_dimensions.decode_req_count > self.padded_batch_dimensions.decode_req_count:
                 total_req = batch_dimensions.req_count
@@ -1272,6 +1410,7 @@ def initialize_attention_state(
                     token_count=batch_dimensions.token_count,
                     prefill_req_count=adjusted_prefill_req_count,
                     decode_req_count=adjusted_decode_req_count,
+                    has_explicit_chunked_prefill_req=False,
                 )
 
         self.active_attn_metadata["mha_metadata"].update(
@@ -1282,15 +1421,19 @@ def initialize_attention_state(
             padded_batch_dimensions=self.padded_batch_dimensions,
         )
 
-        # Create Mamba state block table if it's a hybrid model
         if self.is_hybrid_model:
-            active_mamba_indices = self.mamba_metadata.request_to_mamba_state_idx[
-                self.paused_request_count : self.total_request_count
+            active_mamba_indices_view = self.mamba_metadata.request_to_mamba_state_idx[active_slice]
+            token_to_request_idx_view = self.token_to_request_idx[: self.active_token_count]
+            cu_seqlens = self.active_attn_metadata["mha_metadata"].state_data[
+                "cu_query_seq_lengths"
             ]
-            if self.is_decode_only() or self.using_cuda_graph_this_step():
-                self.mamba_metadata.update_cudagraph_mapping(
-                    active_mamba_indices, self.total_request_count - self.paused_request_count
-                )
+            self.mamba_metadata.update(
+                active_mamba_indices_view,
+                token_to_request_idx_view,
+                cu_seqlens,
+                batch_dimensions=attn_dimensions,
+                padded_batch_dimensions=self.padded_batch_dimensions,
+            )
 
     def reset(self) -> None:
         """Reset entire context.
@@ -1325,7 +1468,10 @@ def reset(self) -> None:
         self.request_last_kv_block_id.fill_(-1)
         self.request_last_kv_block_offset.fill_(0)
         self.request_to_kv_block_ids.fill_(-1)
-        self.request_metadata.fill_(0)
+
+        # Reset request metadata.
+        for metadata_tensor in self.request_metadata.values():
+            metadata_tensor.fill_(0)
 
         # Reset token indexes.
         self.token_to_input_ids.fill_(0)
@@ -1343,6 +1489,7 @@ def reset(self) -> None:
 
         # Reset chunked prefill state
         self.chunked_prefill_request_id = -1
+        self.has_explicit_chunked_prefill_req = False
         self.num_prefill_requests = 0
         self._using_cuda_graph_this_step = False
         self.padded_batch_dimensions = InferenceBatchDimensions(
@@ -1403,7 +1550,7 @@ def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool)
         Check if the request can be added to the context.
         """
         request_can_be_added = (
-            self.total_request_count - self.paused_request_count < self.max_active_requests
+            self.total_request_count < self.max_requests and self.paused_request_count == 0
         )
         request_tokens_can_be_added = (
             self.active_token_count + req.remaining_prompt_length <= self.max_tokens
@@ -1475,21 +1622,31 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int]
         else:
             current_id = self.total_request_count
 
-        if current_id >= self.max_active_requests:
+        if current_id >= self.max_requests:
             raise RequestOverflowError(req.request_id)
 
         if self.active_token_count + chunk_length > self.max_tokens:
             raise TokenOverflowError(req.request_id)
 
         self.request_ids[current_id] = req.request_id
+
         # Handle request metadata.
-        metadata = req.tracked_metadata
         assert (
-            len(metadata) == self.num_request_metadata
-        ), "Request added to context with invalid metadata length"
-        self.request_metadata[current_id] = torch.tensor(
-            metadata, dtype=torch.float32, device=self.request_metadata.device
-        )
+            req.get_metadata_types() == self.request_metadata_types
+        ), "Request added to context with invalid metadata types"
+        metadata = req.tracked_metadata
+        metadata_types = req.get_metadata_types()
+        for m, m_type in zip(metadata, metadata_types):
+            label, _, _ = m_type
+            if not isinstance(m, torch.Tensor):
+                m = torch.as_tensor(
+                    m,
+                    device=self.request_metadata[label].device,
+                    dtype=self.request_metadata[label].dtype,
+                )
+
+            self.request_metadata[label][current_id] = m
+
         # Handle length and block assignments.
         self.request_query_lengths[current_id] = chunk_length
         self.request_output_lengths[current_id] = (
@@ -1556,7 +1713,6 @@ def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         self.request_kv_length_offsets[dst_idxs] = self.request_kv_length_offsets[src_idxs]
         self.request_query_lengths[dst_idxs] = self.request_query_lengths[src_idxs]
         self.request_output_lengths[dst_idxs] = self.request_output_lengths[src_idxs]
-        self.request_metadata[dst_idxs] = self.request_metadata[src_idxs]
         self.request_ids[dst_idxs] = self.request_ids[src_idxs]
         next_tokens[dst_idxs] = next_tokens[src_idxs]
 
@@ -1565,6 +1721,9 @@ def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         self.request_last_kv_block_id[dst_idxs] = self.request_last_kv_block_id[src_idxs]
         self.request_last_kv_block_offset[dst_idxs] = self.request_last_kv_block_offset[src_idxs]
 
+        for metadata_tensor in self.request_metadata.values():
+            metadata_tensor[dst_idxs] = metadata_tensor[src_idxs]
+
         if self.is_hybrid_model:
             self.mamba_metadata.request_to_mamba_state_idx[dst_idxs] = (
                 self.mamba_metadata.request_to_mamba_state_idx[src_idxs]
@@ -1577,7 +1736,6 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         tensor_swap(self.request_kv_length_offsets, src_idxs, dst_idxs)
         tensor_swap(self.request_query_lengths, src_idxs, dst_idxs)
         tensor_swap(self.request_output_lengths, src_idxs, dst_idxs)
-        tensor_swap(self.request_metadata, src_idxs, dst_idxs)
         tensor_swap(self.request_ids, src_idxs, dst_idxs)
         tensor_swap(next_tokens, src_idxs, dst_idxs)
         tensor_swap(self.request_to_kv_block_ids, src_idxs, dst_idxs)
@@ -1585,6 +1743,9 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         tensor_swap(self.request_last_kv_block_id, src_idxs, dst_idxs)
         tensor_swap(self.request_last_kv_block_offset, src_idxs, dst_idxs)
 
+        for metadata_tensor in self.request_metadata.values():
+            tensor_swap(metadata_tensor, src_idxs, dst_idxs)
+
         if self.is_hybrid_model:
             tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)
 
@@ -1596,7 +1757,217 @@ def get_index_of_chunked_prefill_request(self) -> int:
         """
         return torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0]
 
-    # TODO: see if we can compile this function
+    def release_memory_blocks_from_request_indexes(self, request_indexes) -> None:
+        """Release memory blocks used by the given request idxs.
+
+        Args:
+            request_indexes (torch.Tensor): Request indexes. (*Note*, NOT request
+                ids.)
+        """
+        kv_blocks_assigned = self.request_to_kv_block_ids[request_indexes]
+        non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
+        self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)
+
+        # Reset the KV blocks for finished requests.
+        # Note: do not use fill_() (or add_() and similar inplace ops) here.
+        # The combinition of indexing with a tensor (like finished_idxs) and
+        # fill_()/add_() creates a clone and updates it instead of the original
+        # tensor.
+        self.request_to_kv_block_ids[request_indexes] = -1
+
+        # Free Mamba slots.
+        if self.is_hybrid_model:
+            self.mamba_metadata.free_slots(request_indexes)
+
+    def resume_paused_requests(
+        self,
+        active_request_count: int,
+        newly_paused_request_ids: torch.Tensor,
+        next_tokens: torch.Tensor,
+    ) -> tuple[int, int, torch.Tensor]:
+        """Resume as many paused requests as we have space for in the active buffer.
+
+        Args:
+            active_request_count (int): Number of active requests.
+            newly_paused_request_ids (torch.Tensor): List of newly paused request ids.
+            next_tokens (torch.Tensor): Sampled tokens.
+
+        Returns:
+            (tuple[int, torch.Tensor]) active_request_count, newly_paused_request_ids.
+        """
+
+        # Assign released blocks to paused requests.
+        # todo: @shanmugamr, un-pause requests using FIFO, rather than LIFO.
+        resume_request_count = 0
+        if self.paused_request_count > 0:
+            active_block_count_avail = self.block_allocator.get_active_avail()
+            paused_block_counts = self.request_kv_block_counts[: self.paused_request_count]
+            # Flip counts before cumsum, since paused requests are resumed from
+            # the right-most index, so we must count resumed blocks starting from
+            # the right side.
+            paused_block_counts = paused_block_counts.flip(dims=[0])
+            # Add +1 to all block counts, since any time a paused request is
+            # resumed, it will be starting a new memory block. For background,
+            # pausing happens after a request has generated the final token of a
+            # memory block (i.e., token 256 of that block), which means the very
+            # next token (whenever that request gets unpaused) will be in a new
+            # block. So, when we resume a paused request, we have to account for
+            # the fact that it will need an extra block beyond the ones that it
+            # has already used.
+            paused_block_counts += 1  # +1 for newly added block
+            paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0)
+            resume_request_count = min(
+                torch.nonzero(paused_block_counts_cumsum <= active_block_count_avail).numel(),
+                self.block_allocator.total_avail,
+            )
+
+        self.paused_request_count -= resume_request_count
+        active_request_count += resume_request_count
+
+        # Resume requests by assigning blocks and updating bookkeeping tensors.
+        if resume_request_count > 0:
+            assert torch.all(
+                self.request_last_kv_block_offset[
+                    self.paused_request_count : (self.paused_request_count + resume_request_count)
+                ]
+                == self.block_size_tokens - 1
+            ), "The request_last_kv_block_offset should be 0 for the requests that just got resumed this step."
+
+            assert resume_request_count <= self.block_allocator.total_avail
+            block_ids = self.block_allocator.allocate_memory_blocks(resume_request_count)
+            row_idx = torch.arange(
+                self.paused_request_count,
+                self.paused_request_count + resume_request_count,
+                device=torch.cuda.current_device(),
+            )
+            col_idx = self.request_kv_block_counts[
+                self.paused_request_count : (self.paused_request_count + resume_request_count)
+            ]
+            self.request_to_kv_block_ids[row_idx, col_idx] = block_ids
+            self.request_kv_block_counts[
+                self.paused_request_count : (self.paused_request_count + resume_request_count)
+            ] += 1
+            self.request_last_kv_block_id[
+                self.paused_request_count : (self.paused_request_count + resume_request_count)
+            ] = block_ids
+
+        # Remove resumed requests from newly_paused_request_ids. We do this by
+        # truncating the end of newly_paused_request_ids, which works because we
+        # resume requests in LIFO order. If resume_request_count >
+        # len(newly_paused_request_ids), this means that none of the paused
+        # requests are newly paused during this update.
+        if newly_paused_request_ids is not None and resume_request_count > 0:
+            newly_paused_request_ids = newly_paused_request_ids[:-resume_request_count]
+
+        return active_request_count, newly_paused_request_ids
+
+    def evict_overflow_paused_requests(
+        self, active_request_count: int, next_tokens: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Evict requests that overflow the paused buffer.
+
+        Args:
+            active_request_count (int): Number of active requests.
+            next_tokens (torch.Tensor): Sampled tokens.
+
+        Returns:
+            (torch.Tensor) Evicted request ids.
+        """
+
+        # Overflow paused block count.
+        overflow_paused_block_count = (
+            self.block_allocator.get_paused_used() - self.block_allocator.paused_count
+        )
+
+        # Nothing to evict?
+        if overflow_paused_block_count <= 0:
+            return None
+
+        # Overflow paused block count.
+        paused_block_counts = self.request_kv_block_counts[: self.paused_request_count]
+        paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0)
+        valid_paused_request_count = torch.nonzero(
+            paused_block_counts_cumsum <= self.block_allocator.paused_count
+        ).numel()
+        overflow_paused_request_count = self.paused_request_count - valid_paused_request_count
+
+        # Nothing to evict? (Similar to checking overflow_paused_block_count
+        # above, but here we allow up to one paused request to overflow into the
+        # active buffer.
+        if overflow_paused_request_count == 0:
+            return None
+
+        # Evict request count. (Flip paused_block_counts because evictions are
+        # counted from the right-most paused requests.
+        paused_block_counts = paused_block_counts[-overflow_paused_request_count:].flip(dims=[0])
+        paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0)
+        remaining_paused_request_counts = torch.arange(
+            overflow_paused_request_count - 1,
+            -1,
+            -1,
+            dtype=paused_block_counts_cumsum.dtype,
+            device=torch.cuda.current_device(),
+        )
+        net_block_counts = paused_block_counts_cumsum - remaining_paused_request_counts
+        evict_request_count = torch.nonzero(net_block_counts >= 0)[0].item() + 1
+
+        # Eviction index range.
+        evict_start_idx = self.paused_request_count - evict_request_count
+        evict_end_idx = self.paused_request_count
+        evict_request_idxs = torch.arange(
+            evict_start_idx, evict_end_idx, device=torch.cuda.current_device()
+        )
+        evict_request_ids = self.request_ids[evict_start_idx:evict_end_idx].clone()
+
+        # Release memory.
+        self.release_memory_blocks_from_request_indexes(evict_request_idxs)
+
+        # Move evicted requests to the right of active requests, while minimizing
+        # movement.
+        if evict_request_count < active_request_count:
+            # Swap all evicted requests with right-most active requests.
+            src_idxs = torch.arange(
+                self.paused_request_count - evict_request_count,
+                self.paused_request_count,
+                device=torch.cuda.current_device(),
+            )
+            dst_idxs = torch.arange(
+                self.total_request_count - evict_request_count,
+                self.total_request_count,
+                device=torch.cuda.current_device(),
+            )
+        else:
+            # Swap all active requests with left-most evicted requests.
+            src_idxs = torch.arange(
+                self.paused_request_count - evict_request_count,
+                self.paused_request_count - evict_request_count + active_request_count,
+                device=torch.cuda.current_device(),
+            )
+            dst_idxs = torch.arange(
+                self.paused_request_count,
+                self.paused_request_count + active_request_count,
+                device=torch.cuda.current_device(),
+            )
+
+        # Swap evicted and active requests.
+        self._swap_book_keeping_tensors(
+            src_idxs=src_idxs, dst_idxs=dst_idxs, next_tokens=next_tokens
+        )
+
+        # Update tracking vars.
+        self.paused_request_count -= evict_request_count
+        self.total_request_count -= evict_request_count
+
+        # Reset unused block ids.
+        evict_slice = slice(
+            self.total_request_count, self.total_request_count + evict_request_count
+        )
+        self.request_to_kv_block_ids[evict_slice] = -1
+        if self.is_hybrid_model:
+            self.mamba_metadata.request_to_mamba_state_idx[evict_slice] = -1
+
+        return evict_request_ids
+
     def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
         """Update context state after calling engine.step().
 
@@ -1613,7 +1984,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
         between these request groups.
         - 0:paused_request_count -> paused requests
         - paused_request_count:total_request_count -> active requests
-        - total_request_count:max_active_requests -> completed requests are moved here.
+        - total_request_count:max_requests -> completed requests are moved here.
         The reason for maintaining contiguous tensors rather than multiple
         smaller (e.g., per-group or per-request) tensors is for both 1) speed
         (avoid unnecessary tensor allocations), and 2) compatibility with the
@@ -1625,10 +1996,9 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
         3. Concatenate the paused tokens to the active tokens
         4. For the finished requests we release memory blocks and move them to the right
         5. We identify requests that require a new block and add them to the paused requests (i.e move them left)
-        6. We determine how many requests we can resume and resume them
+        6. Resume paused requests & evict overflowing paused requests.
         7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
-        8. We resume those requests by assigning blocks and updating bookkeeping tensors
-        9. We make relevant changes to the token bookkeeping tensors
+        8. We make relevant changes to the token bookkeeping tensors
 
         Args:
             active_requests_mask (Tensor): 1D Mask tensor marking active requests.
@@ -1647,6 +2017,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
             active_requests_mask[-1] = (
                 1  # must keep this, next iteration will add a new chunk to it
             )
+        self.has_explicit_chunked_prefill_req = False
 
         active_request_count = (active_requests_mask == 1).sum().item()
         finished_request_count = (active_requests_mask == 0).sum().item()
@@ -1668,12 +2039,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
                     torch.nonzero(active_requests_mask == 0, as_tuple=True)[0]
                     + self.paused_request_count
                 )
-                kv_blocks_assigned = self.request_to_kv_block_ids[finished_idxs]
-                non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
-                self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)
-
-                if self.is_hybrid_model:
-                    self.mamba_metadata.free_slots(finished_idxs)
+                self.release_memory_blocks_from_request_indexes(finished_idxs)
 
             # Reset request/token counts.
             self.request_to_kv_block_ids.fill_(-1)
@@ -1682,7 +2048,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
 
             # Reset Mamba state.
             self.reset_mamba_state()
-
             return
 
         # 3. Concatenate the paused tokens to the active tokens if present.
@@ -1700,19 +2065,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
                 torch.nonzero(active_requests_mask == 0, as_tuple=True)[0]
                 + self.paused_request_count
             )
-            kv_blocks_assigned = self.request_to_kv_block_ids[finished_idxs]
-            non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
-            self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)
-
-            # Reset the KV blocks for finished requests.
-            # Note: do not use fill_() (or add_() and similar inplace ops) here.
-            # The combinition of indexing with a tensor (like finished_idxs) and fill_()/add_() creates a clone
-            # and updates it instead of the original tensor.
-            self.request_to_kv_block_ids[finished_idxs] = -1
-
-            if self.is_hybrid_model:
-                # Get the Mamba state indices for finished requests and free them
-                self.mamba_metadata.free_slots(finished_idxs)
+            self.release_memory_blocks_from_request_indexes(finished_idxs)
 
             if active_request_count > 0:
                 finished_idxs_on_left = (
@@ -1753,9 +2106,9 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
 
             if self.chunked_prefill_request_id != -1:
                 # find the id in request_ids that is the chunked_prefill_request_id. Only one request should be chunked.
-                active_requests_requiring_new_block[self.get_index_of_chunked_prefill_request()] = (
-                    0  # chunked prefill should not be paused
-                )
+                active_requests_requiring_new_block[
+                    self.get_index_of_chunked_prefill_request() - self.paused_request_count
+                ] = 0  # chunked prefill should not be paused
 
             active_requests_requiring_new_block_count = (
                 (active_requests_requiring_new_block == 1).sum().item()
@@ -1804,41 +2157,33 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
 
         # 6. Now that we have the requests in following order [Paused, Active, Finished]
         # We determine how many requests we can resume and resume them
-        # Assign released blocks to paused requests.
-        # todo: @shanmugamr, un-pause requests using FIFO, rather than LIFO.
-        resume_request_count = 0
-        if self.paused_request_count > 0:
-            active_block_count_avail = self.block_allocator.get_active_avail()
-            paused_block_counts = self.request_kv_block_counts[: self.paused_request_count]
-            paused_block_counts = paused_block_counts.flip(dims=[0])
-            paused_block_counts += 1  # +1 for newly added block
-            paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0)
-            resume_request_count = min(
-                torch.nonzero(paused_block_counts_cumsum <= active_block_count_avail).numel(),
-                self.block_allocator.total_avail,
-            )
 
-        self.paused_request_count -= resume_request_count
-        active_request_count += resume_request_count
+        # 6.a. First, resume temporarily paused requests.
+        active_request_count, newly_paused_request_ids = self.resume_paused_requests(
+            active_request_count, newly_paused_request_ids, next_tokens
+        )
+
+        # 6.b. Evict requests that overflow the paused buffer.
+        evict_request_ids = self.evict_overflow_paused_requests(active_request_count, next_tokens)
+
+        # 6.c. Resume any additional requests.
+        active_request_count, newly_paused_request_ids = self.resume_paused_requests(
+            active_request_count, newly_paused_request_ids, next_tokens
+        )
+
         assert active_request_count > 0, "active_request_count == %d." % active_request_count
 
-        # finally, swap the chunked prefill to the end of the active requests to obey the invariance
+        # 6.d. Swap the chunked prefill request to the end of the active requests
+        # to obey the invariance.
         if self.chunked_prefill_request_id != -1:
             self._swap_book_keeping_tensors(
                 src_idxs=torch.tensor([self.get_index_of_chunked_prefill_request()]),
-                dst_idxs=torch.tensor([active_request_count + self.paused_request_count - 1]),
+                dst_idxs=torch.tensor([self.total_request_count - 1]),
                 next_tokens=next_tokens,
             )
-        # Remove resumed requests from newly_paused_request_ids. We do this by
-        # truncating the end of newly_paused_request_ids, which works because we
-        # resume requests in LIFO order. If resume_request_count >
-        # len(newly_paused_request_ids), this means that none of the paused
-        # requests are newly paused during this update.
-        if newly_paused_request_ids is not None and resume_request_count > 0:
-            newly_paused_request_ids = newly_paused_request_ids[:-resume_request_count]
 
         # 7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
-        self.total_request_count = active_request_count + self.paused_request_count
+        assert self.total_request_count == active_request_count + self.paused_request_count
 
         # All these active requests are in decode phase, so they need only 1 token per request
         self.active_token_count = active_request_count
@@ -1865,34 +2210,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
             + 1
         ) % self.block_size_tokens
 
-        # 8. We resume those requests by assigning blocks and updating bookkeeping tensors
-        if resume_request_count > 0:
-            assert torch.all(
-                self.request_last_kv_block_offset[
-                    self.paused_request_count : (self.paused_request_count + resume_request_count)
-                ]
-                == 0
-            ), "The request_last_kv_block_offset should be 0 for the requests that just got resumed this step. "
-
-            assert resume_request_count <= self.block_allocator.total_avail
-            block_ids = self.block_allocator.allocate_memory_blocks(resume_request_count)
-            row_idx = torch.arange(
-                self.paused_request_count,
-                self.paused_request_count + resume_request_count,
-                device=torch.cuda.current_device(),
-            )
-            col_idx = self.request_kv_block_counts[
-                self.paused_request_count : (self.paused_request_count + resume_request_count)
-            ]
-            self.request_to_kv_block_ids[row_idx, col_idx] = block_ids
-            self.request_kv_block_counts[
-                self.paused_request_count : (self.paused_request_count + resume_request_count)
-            ] += 1
-            self.request_last_kv_block_id[
-                self.paused_request_count : (self.paused_request_count + resume_request_count)
-            ] = block_ids
-
-        # 9. We make relevant changes to the token bookkeeping tensors
+        # 8. We make relevant changes to the token bookkeeping tensors
         self.token_to_request_idx[: self.active_token_count] = torch.arange(
             self.paused_request_count, self.total_request_count, device=torch.cuda.current_device()
         )
@@ -1907,7 +2225,10 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
             self.request_last_kv_block_offset[self.paused_request_count : self.total_request_count]
         )
 
-        return newly_paused_request_ids
+        return {
+            "newly_paused_request_ids": newly_paused_request_ids,
+            "evict_request_ids": evict_request_ids,
+        }
 
     def calculate_log_probs(
         self, logits: Tensor, new_tokens: Tensor, only_last_token_logits: Optional[bool] = False
@@ -2038,6 +2359,13 @@ def get_kvcache_utilization_stats(self) -> dict:
             'block_count_avail': int(block_count_avail),
             'active_token_count': int(self.active_token_count),
             'total_request_count': int(total_request_count),
-            'max_total_requests': int(self.max_total_requests),
-            'max_active_requests': int(self.max_active_requests),
+            'max_requests': int(self.max_requests),
         }
+
+    def maybe_initialize_symmetric_memory(self):
+        """
+        Initializes symmetric memory for inference, if not already initialized
+        """
+        if not self.is_symmetric_memory_initialized:
+            parallel_state._set_global_symmetric_memory_buffer()
+            self.is_symmetric_memory_initialized = True
diff --git a/megatron/core/inference/data_parallel_inference_coordinator.py b/megatron/core/inference/data_parallel_inference_coordinator.py
index e1fe7b21566..9a1a11a8c2b 100644
--- a/megatron/core/inference/data_parallel_inference_coordinator.py
+++ b/megatron/core/inference/data_parallel_inference_coordinator.py
@@ -1,11 +1,14 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import errno
 import faulthandler
 import logging
 import signal
+import socket
 from collections import deque
 from itertools import cycle
 from multiprocessing import Event
+from multiprocessing.connection import Connection
 
 import torch
 
@@ -65,7 +68,13 @@ class DataParallelInferenceCoordinator:
         next_request_id (int): A counter for generating unique server-side request IDs.
     """
 
-    def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
+    def __init__(
+        self,
+        pipe_connection: Connection,
+        data_parallel_size: int,
+        tokenizer,
+        inference_coordinator_port: int | None = None,
+    ):
         """
         Initializes the inference coordinator.
 
@@ -74,9 +83,11 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
         ranks to connect before proceeding.
 
         Args:
-            inference_coordinator_port (int): The TCP port number to bind the server to.
+            pipe_connection (Connection): A connecting pipe to the parent process.
             data_parallel_size (int): The number of TP-coordinator workers that are
                 expected to connect.
+            tokenizer: The tokenizer to use for prompt tokenization and detokenization.
+            inference_coordinator_port (Optional[int]): The TCP port number to bind the server to.
         """
         assert HAVE_ZMQ, (
             "please install the pyzmq library to use DataParallelInferenceCoordinator\n"
@@ -86,6 +97,8 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
             "please install the messagepack library to use DataParallelInferenceCoordinator\n"
             "pip install msgpack"
         )
+        self.pipe_connection = pipe_connection
+        self.data_parallel_size = data_parallel_size
         self.context = zmq.Context()
 
         # This is the central router socket
@@ -95,9 +108,33 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
         # 3. data parallel ranks return completed requests to this socket. We route them back to
         #    the user that had submitted the request originally.
 
+        # Get local IP.
+        local_ip = socket.gethostname()
+
         self.router_socket = self.context.socket(zmq.ROUTER)
-        self.router_socket.bind(f"tcp://0.0.0.0:{inference_coordinator_port}")
-        self.data_parallel_size = data_parallel_size
+        is_bound = False
+        if inference_coordinator_port is not None:
+            try:
+                self.router_socket.bind(f"tcp://{local_ip}:{inference_coordinator_port}")
+                is_bound = True
+            except zmq.error.ZMQError as e:
+                if e.errno == errno.EADDRINUSE:
+                    logging.warning(
+                        f"Port {inference_coordinator_port} is already in use. "
+                        "Binding to a random available port instead."
+                    )
+            except Exception:
+                logging.warning(
+                    f"Unknown error when binding to port {inference_coordinator_port}. "
+                    "Attempting to bind to a random available port instead."
+                )
+        if not is_bound:
+            self.router_socket.bind_to_random_port(f"tcp://{local_ip}")
+        self.addr = self.router_socket.getsockopt_string(zmq.LAST_ENDPOINT)
+
+        # Send the address to the parent process.
+        self.pipe_connection.send(self.addr)
+        self.pipe_connection.close()
 
         logging.info("Inference Coordinator: waiting for connections from data parallel ranks...")
         # First wait for all data parallel ranks to establish connections.
@@ -116,6 +153,7 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
         self.request_id_to_client_request_id = {}
 
         self.next_request_id = 0
+        self.tokenizer = tokenizer
 
     def get_next_data_parallel_rank(self):
         """
@@ -254,12 +292,14 @@ def start(self):
                                 msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True),
                             ]
                         )
+                    break  # Exit the main loop after STOP_ACKs have been processed.
             elif header == Headers.ENGINE_REPLY:
                 # This is the output of a single engine step on some data parallel rank.
                 assert sender_identity in self.identities_of_data_parallel_ranks
                 finished_request_records = deserialized_payload[1]
 
                 for finished_request_record in finished_request_records:
+                    self.detokenize(finished_request_record)
                     fid = finished_request_record["requests"][0]["request_id"]
                     client_identity = self.request_id_to_client_id[fid]
                     client_request_identity = self.request_id_to_client_request_id[fid]
@@ -279,9 +319,30 @@ def start(self):
             else:
                 raise UnknownHeaderError(header)
 
+    def detokenize(self, finished_request_record):
+        """
+        Detokenizes the generated tokens in the finished request record.
+
+        This method uses the coordinator's tokenizer to convert the list of
+        generated token IDs back into human-readable text.
+
+        Args:
+            finished_request_record (dict): The record containing the generated
+                tokens to be detokenized. It is modified in place.
+        """
+        for request in finished_request_record["requests"]:
+            if request["prompt"] is None:
+                request["prompt"] = self.tokenizer.detokenize(request["prompt_tokens"][1])
+            request["generated_text"] = self.tokenizer.detokenize(request["generated_tokens"])
+
     @classmethod
     def entrypoint(
-        cls, ready_event: Event, inference_coordinator_port: int, data_parallel_size: int
+        cls,
+        pipe_connection: Connection,
+        ready_event: Event,
+        data_parallel_size: int,
+        tokenizer,
+        inference_coordinator_port: int | None = None,
     ):
         """
         Class method to instantiate and run the coordinator, for use in a separate process.
@@ -290,18 +351,22 @@ def entrypoint(
         that it is fully initialized and listening, and then starts the main event loop.
 
         Args:
+            pipe_connection (Connection): A connecting pipe to the parent process.
             ready_event (Event): A threading or multiprocessing event object that is set()
                 once the coordinator is ready to accept connections.
             inference_coordinator_port (int): The port to bind to.
             data_parallel_size (int): The number of expected TP-coordinators.
         """
-        coordinator = cls(inference_coordinator_port, data_parallel_size)
+        coordinator = cls(
+            pipe_connection, data_parallel_size, tokenizer, inference_coordinator_port
+        )
         ready_event.set()
         try:
             coordinator.start()
         except KeyboardInterrupt:
             logging.info("Coordinator process interrupted. Exiting...")
-            coordinator.stop()
+        coordinator.stop()
+        logging.info("Inference Coordinator: shut down successfully.")
 
     def stop(self):
         """
diff --git a/megatron/core/inference/engines/async_zmq_communicator.py b/megatron/core/inference/engines/async_zmq_communicator.py
new file mode 100644
index 00000000000..7076bb283bd
--- /dev/null
+++ b/megatron/core/inference/engines/async_zmq_communicator.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import asyncio
+import socket
+import struct
+
+import torch.distributed as dist
+
+try:
+    import zmq
+
+    HAVE_ZMQ = True
+except ImportError:
+    from unittest.mock import MagicMock
+
+    zmq = MagicMock()
+    HAVE_ZMQ = False
+
+
+class AsyncZMQCommunicator:
+    """
+    An asyncio-friendly communicator abstraction using ZMQ.
+    Can be used to implement collective operations like all-reduce,
+    and bcast which are asyncio friendly on top of ZMQ sockets.
+    Only to be used with small amounts of data (e.g., 1 integer)
+    on the CPU.
+    """
+
+    def __init__(self, zmq_context: zmq.Context, process_group: dist.ProcessGroup):
+        """
+        Constructor for AsyncZMQCommunicator. Sets up ZMQ sockets
+        for communication among ranks in the given process group.
+        Args:
+            zmq_context (zmq.Context): ZMQ context to create sockets.
+            process_group (dist.ProcessGroup): Process group for communication.
+        """
+        self.rank = dist.get_rank(process_group)
+        self.world_size = dist.get_world_size(process_group)
+        self.is_leader = self.rank == 0
+        # Get the global rank of the leader (first rank in the process group)
+        src_rank = dist.get_process_group_ranks(process_group)[0]
+
+        if self.is_leader:
+            local_ip = socket.gethostname()
+            self.gather_sock = zmq_context.socket(zmq.PULL)
+            self.gather_sock.bind_to_random_port(f"tcp://{local_ip}")
+            gather_socket_addr = self.gather_sock.getsockopt_string(zmq.LAST_ENDPOINT)
+
+            self.bcast_sock = zmq_context.socket(zmq.PUB)
+            self.bcast_sock.bind_to_random_port(f"tcp://{local_ip}")
+            bcast_socket_addr = self.bcast_sock.getsockopt_string(zmq.LAST_ENDPOINT)
+
+            # Share the socket addresses with all peers
+            dist.broadcast_object_list(
+                [gather_socket_addr, bcast_socket_addr], src=src_rank, group=process_group
+            )
+
+        else:
+            bcast_output = [None, None]
+            dist.broadcast_object_list(bcast_output, src=src_rank, group=process_group)
+            gather_socket_addr, bcast_socket_addr = bcast_output
+            self.gather_sock = zmq_context.socket(zmq.PUSH)
+            self.gather_sock.connect(gather_socket_addr)
+            self.bcast_sock = zmq_context.socket(zmq.SUB)
+            self.bcast_sock.connect(bcast_socket_addr)
+            self.bcast_sock.setsockopt_string(zmq.SUBSCRIBE, "")
+
+    async def all_reduce_max(self, local_val: int) -> int:
+        """
+        Asyncio friendly all reduce max operation. Gathers on rank 0, computes max,
+        and broadcasts the result.
+        """
+        if self.world_size <= 1:
+            return local_val
+
+        payload = struct.pack('!i', local_val)
+
+        if self.is_leader:
+            # Rank 0: Gather -> Max -> Broadcast
+            values = [local_val]
+
+            # Non-blocking gather from N-1 peers
+            while len(values) < self.world_size:
+                try:
+                    msg = self.gather_sock.recv(flags=zmq.NOBLOCK)
+                    values.append(struct.unpack('!i', msg)[0])
+                except zmq.Again:
+                    await asyncio.sleep(0.001)  # Yield to event loop
+
+            max_val = max(values)
+            self.bcast_sock.send(struct.pack('!i', max_val))
+            return max_val
+
+        else:
+            # Worker: Send -> Wait for Broadcast
+            self.gather_sock.send(payload)
+
+            while True:
+                try:
+                    msg = self.bcast_sock.recv(flags=zmq.NOBLOCK)
+                    return struct.unpack('!i', msg)[0]
+                except zmq.Again:
+                    await asyncio.sleep(0.001)  # Yield to event loop
+
+    def close(self):
+        """
+        Close the ZMQ sockets.
+        """
+        self.gather_sock.close()
+        self.bcast_sock.close()
diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
index ed107af9bfb..0a95e8f4a53 100644
--- a/megatron/core/inference/engines/dynamic_engine.py
+++ b/megatron/core/inference/engines/dynamic_engine.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import asyncio
+import concurrent.futures
 import logging
 import multiprocessing
-import os
 import socket
 import struct
 import time
@@ -38,10 +38,11 @@
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.inference.utils import Counter, await_process_event
+from megatron.core.inference.utils import Counter, await_process_call
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.cuda_graphs import delete_cuda_graphs
 from megatron.core.utils import (
+    experimental_api,
     get_asyncio_loop,
     get_pg_rank,
     get_pg_size,
@@ -50,6 +51,8 @@
     trace_async_exceptions,
 )
 
+from .async_zmq_communicator import AsyncZMQCommunicator
+
 try:
     from tqdm import tqdm
 
@@ -86,6 +89,11 @@
 except ImportError:
     HAVE_PSUTIL = False
 
+from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER
+
+if HAVE_TORCH_MEMORY_SAVER:
+    from torch_memory_saver import torch_memory_saver
+
 
 class EngineSuspendedError(Exception):
     """Engine is currently suspended and not performing steps."""
@@ -111,6 +119,7 @@ class RequestEntry:
 
 
 # pylint: disable=line-too-long
+@experimental_api
 class DynamicInferenceEngine(AbstractEngine):
     """The dynamic inference engine.
 
@@ -180,6 +189,7 @@ def __init__(
         self.enable_chunked_prefill = enable_chunked_prefill
         self.inference_logging_step_interval = inference_logging_step_interval
         self.unified_memory_level = context.unified_memory_level
+        self.persist_cuda_graphs = context.persist_cuda_graphs
 
         if enable_cuda_graph is not None:
             self.cuda_graph_impl = "local" if enable_cuda_graph else "none"
@@ -189,6 +199,11 @@ def __init__(
         # Initialize engine.
         self.reset()
 
+        # Set callback for getting stop word finished request IDs
+        self.controller.set_stop_word_finished_ids_callback(
+            self._get_and_clear_stop_word_finished_ids
+        )
+
         # Configure wandb to use separate step counter for inference metrics (only once)
         if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
             logging.info(
@@ -225,10 +240,15 @@ def reset(self) -> None:
         # Request state.
         self.request_counter = Counter()
         self.finished_request_count = 0
+        self.evicted_request_count = 0
 
         self.requests: Dict[int, RequestEntry] = {}
         self.waiting_request_ids = deque()
         self.failed_request_ids = []
+        # Track requests that should stop due to stop words (detected in post_process_requests)
+        self.stop_word_finished_request_ids: set[int] = set()
+        # Track requests currently being finished due to stop words (to skip extra token)
+        self.stop_word_being_finished_ids: set[int] = set()
 
         # Timing and logging variables.
         self.rank = torch.distributed.get_rank()
@@ -269,21 +289,6 @@ def create_cuda_graphs(self, reset_context: bool = True):
         controller = self.controller
 
         config = controller.inference_wrapped_model.inference_wrapper_config
-        moe_pad_experts = config.moe_pad_experts_for_cuda_graph_inference
-
-        if moe_pad_experts:
-            filtered_cuda_graph_batch_dimensions_list = []
-            for config in context.cuda_graph_batch_dimensions_list:
-                if config.prefill_req_count == 0:
-                    filtered_cuda_graph_batch_dimensions_list.append(config)
-            if len(filtered_cuda_graph_batch_dimensions_list) != len(
-                context.cuda_graph_batch_dimensions_list
-            ):
-                warnings.warn(
-                    "MoE models do not support non-decode cuda graphs. "
-                    "Forcing non_decode_cuda_graphs to False."
-                )
-            context.cuda_graph_batch_dimensions_list = filtered_cuda_graph_batch_dimensions_list
 
         time_start = time.time()
         mem_stats_start = torch.cuda.memory_stats()
@@ -337,12 +342,14 @@ def create_cuda_graphs(self, reset_context: bool = True):
 
         self.capture_stats = capture_stats
 
+        if HAVE_TORCH_MEMORY_SAVER:
+            torch_memory_saver.pause("kv_cache")
+
     @internal_api
     async def start_listening_to_data_parallel_coordinator(
         self,
-        inference_coordinator_port: int,
+        inference_coordinator_port: int | None = None,
         launch_inference_coordinator: bool = True,
-        verbose: bool = False,
         *,
         loop: Optional[asyncio.AbstractEventLoop] = None,
     ):
@@ -371,12 +378,20 @@ async def start_listening_to_data_parallel_coordinator(
         (`self.run_engine`) as a background asyncio task.
 
         Args:
-            inference_coordinator_port (int): The network port where the central
+            inference_coordinator_port (int | None): The network port where the central
                 `InferenceCoordinator` is or will be listening.
+                If None, a random available port will be selected.
+                If not None, the coordinator will attempt to bind to this port, but should it
+                not succeed (e.g., if the port is already in use), it may bind to a different port.
+                The actual port used is returned by this method.
             launch_inference_coordinator (bool, optional): If True, the global rank 0
                 process will spawn and manage the `InferenceCoordinator`
                 process. Defaults to True.
-            verbose (bool): Whether to run in verbose mode.
+
+        Returns:
+            inference_coordinator_addresss (str): The network address of the central
+                `InferenceCoordinator`, which may not have the same port as what the user requested
+                with `inference_coordinator_port`.
         """
 
         assert HAVE_ZMQ, (
@@ -404,23 +419,43 @@ async def start_listening_to_data_parallel_coordinator(
         self.is_mp_coordinator = tp_rank == 0 and pp_rank == 0
         self.is_dp_coordinator = (dp_rank == 0) and self.is_mp_coordinator
 
+        local_ip = socket.gethostname()
+
         # Spawn a DP coordinator process and get the connection info.
         if launch_inference_coordinator and self.is_dp_coordinator:
             spawn_context = multiprocessing.get_context('spawn')
+            dp_pipe, dp_process_pipe = spawn_context.Pipe()
             coordinator_ready_event = spawn_context.Event()
             self.inference_coordinator_process = spawn_context.Process(
                 target=DataParallelInferenceCoordinator.entrypoint,
                 args=(
+                    dp_process_pipe,
                     coordinator_ready_event,
-                    inference_coordinator_port,
                     get_pg_size(self.pg_collection.dp),
+                    self.controller.tokenizer,
+                    inference_coordinator_port,
                 ),
             )
             self.inference_coordinator_process.start()
+            await await_process_call(dp_pipe.poll, self.inference_coordinator_process)
+            dp_addr = dp_pipe.recv()
+            dp_pipe.close()
+
+            # Check if the port number is not inference_coordinator_port
+            actual_port = int(dp_addr.rsplit(":", 1)[-1])
+            if inference_coordinator_port != None and actual_port != inference_coordinator_port:
+                logging.warning(
+                    f"Requested InferenceCoordinator port {inference_coordinator_port} "
+                    f"but got port {actual_port} instead. This happens if the request port "
+                    f"is already in use."
+                )
+        elif not launch_inference_coordinator:
+            dp_addr = f"tcp://{local_ip}:{inference_coordinator_port}"
+        else:
+            dp_addr = None
 
         # Find available ports for MP and bind to them.
         if self.is_mp_coordinator:
-            local_ip = socket.gethostname()
             mp_req_sock = self.zmq_context.socket(zmq.PUB)
             mp_req_sock.bind_to_random_port(f"tcp://{local_ip}")
             mp_req_addr = mp_req_sock.getsockopt_string(zmq.LAST_ENDPOINT)
@@ -433,12 +468,13 @@ async def start_listening_to_data_parallel_coordinator(
             mp_len_addr = None
 
         # Broadcast addresses to respective ranks.
+        bcast = [dp_addr]
+        torch.distributed.broadcast_object_list(bcast, src=dp_src, group=dp_group)
+        [dp_addr] = bcast
         bcast = [mp_req_addr, mp_len_addr]
         torch.distributed.broadcast_object_list(bcast, src=mp_src, group=mp_group)
         [mp_req_addr, mp_len_addr] = bcast
 
-        ip_address_of_dp_coordinator = os.getenv('MASTER_ADDR', '127.0.0.1')
-        dp_addr = f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}"
         identity = f'mp-coord-{dp_rank}'
         if self.is_mp_coordinator:
             # 1. Create dealer sockets where tp_rank = 0 and pp_rank = 0
@@ -478,15 +514,26 @@ async def start_listening_to_data_parallel_coordinator(
 
         torch.distributed.barrier(mp_group)
 
+        # initialize zmq-based EP communicator
+        self.ep_rank = get_pg_rank(self.pg_collection.ep)
+        self.ep_world_size = get_pg_size(self.pg_collection.ep)
+        if self.ep_world_size > 1:
+            self.expert_parallel_zmq_communicator = AsyncZMQCommunicator(
+                self.zmq_context, process_group=self.pg_collection.ep
+            )
+
         if launch_inference_coordinator and self.is_dp_coordinator:
-            await await_process_event(coordinator_ready_event, self.inference_coordinator_process)
+            await await_process_call(
+                coordinator_ready_event.wait, self.inference_coordinator_process
+            )
             logging.info("Inference co-ordinator is ready to receive requests!")
+            logging.info(f"Data parallel coordinator can be found at {dp_addr}")
 
         # Finally run the engine infinite loop
         loop = get_asyncio_loop(loop)
-        self.engine_loop_task = loop.create_task(
-            self.run_engine_with_coordinator(loop=loop, verbose=verbose)
-        )
+        self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop))
+
+        return dp_addr
 
     @contextmanager
     @staticmethod
@@ -564,10 +611,10 @@ def suspend(self):
         ):
             self.context.deallocate_all_tensors()
 
-        # Delete cuda graphs when not using unified memory at all (level 0). For
-        # levels 1 and 2, the context's tensors maintain static memory addresses,
-        # so the cuda graphs are re-used.
-        if self.unified_memory_level == 0:
+        # Delete cuda graphs when not using unified memory at all (level 0) and
+        # `--rl-training-cuda-graphs` is not passed. For UVM levels 1 and 2, the context's tensors
+        # maintain static memory addresses, so the cuda graphs are re-used.
+        if self.unified_memory_level == 0 and not self.persist_cuda_graphs:
             delete_cuda_graphs()
 
         # Maintain references to requests before reset.
@@ -578,7 +625,7 @@ def suspend(self):
 
         # Suspend requests objects.
         for request_id in active_request_ids:
-            self.requests[request_id].record.suspend()
+            self.requests[request_id].record.checkpoint()
 
     def resume(self):
         """Resume engine by reallocating context's GPU state."""
@@ -609,7 +656,7 @@ def resume(self):
             # 0). For levels 1 and 2, the context's tensors maintain static
             # memory addresses, so the cuda graphs are re-used.
             capture_time = time.time()
-            if self.unified_memory_level == 0:
+            if self.unified_memory_level == 0 and not self.persist_cuda_graphs:
                 self.create_cuda_graphs()
             capture_time = time.time() - capture_time
 
@@ -678,14 +725,20 @@ def _add_request(
             request.sampling_params.num_tokens_to_generate is None
             or request.sampling_params.num_tokens_total is None
         )
-        if request.sampling_params.return_prompt_top_n_logprobs:
-            assert (
-                request.sampling_params.return_log_probs
-            ), "return_prompt_top_n_logprobs requires sampling_params.return_log_probs to be True"
         if request.sampling_params.top_n_logprobs > 0:
             assert (
                 request.sampling_params.return_log_probs
             ), "top_n_logprobs requires sampling_params.return_log_probs to be True"
+        if (
+            request.sampling_params.return_log_probs
+            and not request.sampling_params.skip_prompt_log_probs
+        ):
+            assert not self.context.materialize_only_last_token_logits, (
+                "Prompt log probs cannot be calculated if only last token logits are materialized. "
+                "Set materialize_only_last_token_logits to False in DynamicInferenceContext "
+                "or skip_prompt_log_probs to True in SamplingParams."
+            )
+
         if request.sampling_params.num_tokens_total is not None:
             request.sampling_params.num_tokens_to_generate = (
                 request.sampling_params.num_tokens_total - len(request.prompt_tokens)
@@ -709,7 +762,7 @@ def _add_request(
         if (
             len(request.prompt_tokens) + request.sampling_params.num_tokens_to_generate
             > self.context.max_sequence_length
-        ):
+        ) or (request.sampling_params.num_tokens_to_generate < 0):
             request.status = Status.FAILED
             request.add_event_error_nontransient(MaxSequenceLengthOverflowError(request_id))
 
@@ -717,6 +770,14 @@ def _add_request(
             request.status = Status.FAILED
             request.add_event_error_nontransient(TokenOverflowError(request_id))
 
+        # Tokenize stop words if provided
+        if request.sampling_params.stop_words:
+            stop_word_ids = [
+                self.controller.tokenize_prompt(stop_word, add_BOS=False)
+                for stop_word in request.sampling_params.stop_words
+            ]
+            request.stop_word_ids = stop_word_ids
+
         if request.status != Status.FAILED:
             self.waiting_request_ids.append(request_id)
         else:
@@ -781,6 +842,7 @@ def post_process_requests(
         self,
         request_ids: torch.Tensor,
         finished_request_ids: torch.Tensor,
+        evict_request_ids: torch.Tensor,
         step_time: float,
         sample: torch.Tensor,
         log_probs: torch.Tensor,
@@ -792,6 +854,7 @@ def post_process_requests(
         Args:
             request_ids (torch.Tensor): A list of request_ids
             finished_request_ids (torch.Tensor): A list of finished request ids
+            evict_request_ids (torch.Tensor): A list of evicted request ids.
             step_time (float): The latency of the last step
             sample: (torch.Tensor): The newly generated tokens for each request
             log_probs: (List): Log probs for each request
@@ -805,6 +868,8 @@ def post_process_requests(
         finished_request_ids = set(finished_request_ids.tolist())
         finished_request_records: list[DynamicInferenceRequestRecord] = []
         self.finished_request_count += len(finished_request_ids)
+        if evict_request_ids is not None:
+            self.evicted_request_count += evict_request_ids.numel()
 
         log_probs_iter = log_probs if log_probs else repeat(None)
 
@@ -813,12 +878,19 @@ def post_process_requests(
         ):
             request: DynamicInferenceRequest = self.get_request(request_id)
             if request_id != self.context.chunked_prefill_request_id:
-                request.generated_tokens.append(token)
-                if request.tpot is None:
-                    request.tpot = []
-                request.tpot.append(step_time)
+                # Skip appending token for requests being finished due to stop words
+                # (they already have their final token from the previous step)
+                if request_id not in self.stop_word_being_finished_ids:
+                    request.generated_tokens.append(token)
+                    if request.tpot is None:
+                        request.tpot = []
+                    request.tpot.append(step_time)
+
+                # Check for stop words (after token is appended)
+                stop_word_hit = self._check_stop_words_for_request_post_append(request)
 
                 if request_id in finished_request_ids:
+                    # Request finished by normal means (termination_id, max_length, or stop word from previous step)
                     request.generated_length = len(request.generated_tokens)
                     request.status = Status.COMPLETED
                     finished_entry = self.requests.pop(request_id)
@@ -826,6 +898,11 @@ def post_process_requests(
                     finished_request.generated_length = len(finished_request.generated_tokens)
                     finished_request_records.append(finished_entry.record)
                     finished_entry.future.set_result(finished_entry.record)
+                elif stop_word_hit:
+                    # Stop word detected - mark for removal in next step's bookkeeping
+                    # Don't pop yet; let the next step handle it properly via callback
+                    self.stop_word_finished_request_ids.add(request_id)
+                    active_request_ids.append(request_id)
                 else:
                     active_request_ids.append(request_id)
             else:
@@ -912,8 +989,79 @@ def post_process_requests(
                     else:
                         request.generated_top_n_logprobs.append(logit_dict)
 
+        # Handle evicted requests.
+        if evict_request_ids is not None and evict_request_ids.numel() > 0:
+
+            evict_request_ids = evict_request_ids.tolist()
+
+            # Insert into waiting_request_ids after any chunk prefill request.
+            self.waiting_request_ids.extendleft(evict_request_ids)
+            if self.context.chunked_prefill_request_id != -1:
+                chunked_prefill_id = self.waiting_request_ids[len(evict_request_ids)]
+                del self.waiting_request_ids[len(evict_request_ids)]
+                self.waiting_request_ids.appendleft(chunked_prefill_id)
+
+            # Checkpoint requests (i.e., prompt += generations) + add eviction event.
+            for request_id in evict_request_ids:
+                self.requests[request_id].record.checkpoint()
+                self.get_request(request_id).add_event_evict()
+
+        # Clear the stop word being finished set after processing
+        self.stop_word_being_finished_ids.clear()
+
         return active_request_ids, finished_request_records
 
+    def _get_and_clear_stop_word_finished_ids(self, active_request_ids: list[int]) -> set[int]:
+        """Get and clear the set of request IDs that should be finished due to stop words.
+
+        This callback is called from the controller during bookkeeping to get request IDs
+        that were detected as hitting stop words in the previous step's post_process_requests.
+
+        Args:
+            active_request_ids: List of currently active request IDs.
+
+        Returns:
+            Set of request IDs from active_request_ids that should be marked as finished.
+        """
+        if not self.stop_word_finished_request_ids:
+            return set()
+
+        # Find which stop word finished IDs are in the current active requests
+        result = self.stop_word_finished_request_ids & set(active_request_ids)
+        # Move to "being finished" set so post_process_requests can skip the extra token
+        self.stop_word_being_finished_ids = result
+        # Clear the IDs that we're returning (they'll be marked as finished)
+        self.stop_word_finished_request_ids -= result
+        return result
+
+    def _check_stop_words_for_request_post_append(self, request: DynamicInferenceRequest) -> bool:
+        """Check if a request should stop due to stop words (after token is appended).
+
+        This method is called from post_process_requests after the token has already
+        been appended to request.generated_tokens.
+
+        Args:
+            request: The request to check.
+
+        Returns:
+            bool: True if the generated sequence ends with a stop word, False otherwise.
+        """
+        # Check if request has stop words configured
+        if request.stop_word_ids is None or len(request.stop_word_ids) == 0:
+            return False
+
+        generated_tokens = request.generated_tokens
+
+        # Check if the sequence ends with any stop word
+        for stop_word_ids in request.stop_word_ids:
+            stop_len = len(stop_word_ids)
+            if len(generated_tokens) >= stop_len:
+                # Check if the last stop_len tokens match the stop word
+                if list(generated_tokens[-stop_len:]) == stop_word_ids:
+                    return True
+
+        return False
+
     def schedule_waiting_requests(self):
         """Tries to schedule any requests in the waiting pool."""
         if self.enable_chunked_prefill:
@@ -976,6 +1124,12 @@ def schedule_chunked_prefill(self):
 
             if request_can_be_added and kv_cache_available:
                 if token_fully_can_be_added:
+                    # For Mamba models we need to ensure that the last prefill chunk
+                    # is still tagged as a chunked prefill request.
+                    self.context.has_explicit_chunked_prefill_req = (
+                        self.context.is_hybrid_model
+                        and self.context.chunked_prefill_request_id == req.request_id
+                    )
                     self.context.chunked_prefill_request_id = -1
                     self.context.add_request(req)
                     self._loop.call_soon_threadsafe(
@@ -986,7 +1140,10 @@ def schedule_chunked_prefill(self):
                     # Fully scheduled, so we remove from waiting pool
                     self.waiting_request_ids.popleft()
                     # Only this case we keep checking the rest of the waiting queue
-                    can_schedule = True
+                    # We break early for Mamba models running a final prefill chunk
+                    # so that no additional requests are scheduled beyond the chunked
+                    # prefill request.
+                    can_schedule = not self.context.has_explicit_chunked_prefill_req
                 elif token_partially_can_be_added:
                     chunk_length = self.context.max_tokens - self.context.active_token_count
                     self.context.add_request(req, chunk_length=chunk_length)
@@ -994,6 +1151,7 @@ def schedule_chunked_prefill(self):
                         self._loop.create_task, self._notify_cond_for_new_request()
                     )
                     self.context.chunked_prefill_request_id = req.request_id
+                    self.context.has_explicit_chunked_prefill_req = self.context.is_hybrid_model
                     req.remaining_prompt_tokens = req.remaining_prompt_tokens[chunk_length:]
                     req.finished_chunk_token_count += chunk_length
                     # Still have tokens to prefill, so we break and keep the
@@ -1023,6 +1181,7 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]:
         is_decode_only = self.context.is_decode_only()
         pre_step_context_state = {
             "is_decode_only": is_decode_only,
+            "max_requests": self.context.max_requests,
             "total_request_count": self.context.total_request_count,
             "paused_request_count": self.context.paused_request_count,
             "active_token_count": self.context.active_token_count,
@@ -1044,8 +1203,8 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]:
 
         if (
             self.inference_logging_step_interval > 0
-            and step_count > 0
-            and step_count % self.inference_logging_step_interval == 0
+            and self.step_count > 0
+            and self.step_count % self.inference_logging_step_interval == 0
             and self.context.metrics_writer is not None
         ):
             kvcache_util_stats = self.context.get_kvcache_utilization_stats()
@@ -1055,6 +1214,7 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]:
         post_step_context_state = {
             "waiting_request_count": len(self.waiting_request_ids),
             "finished_request_count": self.finished_request_count,
+            "evicted_request_count": self.evicted_request_count,
             "kv_stats": kvcache_util_stats,
             "padded_active_token_count": self.context.padded_active_token_count,
             "using_cuda_graph_this_step": self.context.using_cuda_graph_this_step(),
@@ -1069,13 +1229,7 @@ async def async_forward(self) -> Tuple[Dict, Dict, float, int]:
         return result, context_state, step_time, self.step_count
 
     async def async_bookkeep(
-        self,
-        step_result: Optional[Dict],
-        context_state: Dict,
-        step_time: float,
-        step_count: int,
-        *,
-        verbose: bool = False,
+        self, step_result: Optional[Dict], context_state: Dict, step_time: float, step_count: int
     ):
         """Uses `asyncio` for continuous bookkeeping.
 
@@ -1084,7 +1238,6 @@ async def async_bookkeep(
             context_state (Dict): is_decode_only, total/paused request count, active token count.
             step_time (float): How long this step took.
             step_count (int): The count of the step.
-            verbose (bool): Whether to run in verbose mode.
 
         Returns:
             A dictionary containing:
@@ -1094,12 +1247,14 @@ async def async_bookkeep(
                 cuda_graph_request_count (int): The CUDA graph batch size matching this step.
         """
         # Increment finished_request_count.
+        range_push("bookkeeping")
         cuda_graph_request_count = None
 
         if step_result is not None:
             active_request_ids = step_result["active_request_ids"]
-            newly_paused_request_ids = step_result["newly_paused_request_ids"]
             finished_request_ids = step_result["finished_request_ids"]
+            newly_paused_request_ids = step_result.get("newly_paused_request_ids")
+            evict_request_ids = step_result.get("evict_request_ids")
             sample = step_result["sample"]
             log_probs = step_result["log_probs"]
             top_n_logprobs = step_result.get("top_n_logprobs", None)
@@ -1116,6 +1271,7 @@ async def async_bookkeep(
             (active_request_ids, finished_request_records) = self.post_process_requests(
                 active_request_ids,
                 finished_request_ids,
+                evict_request_ids,
                 step_time,
                 sample,
                 log_probs,
@@ -1135,26 +1291,33 @@ async def async_bookkeep(
             finished_request_records.append(failed_entry.record)
             failed_entry.future.set_result(failed_entry.record)
         self.failed_request_ids.clear()
+        range_pop()
 
-        # Detokenize all finished requests (critical for InferenceClient, which
-        # doesn't necessarily have the tokenizer).
-        for record in finished_request_records:
-            for request in record.requests:
-                if request.prompt is None:
-                    request.prompt = self.controller.tokenizer.detokenize(
-                        request.prompt_tokens.tolist()
+        # Detokenize all finished requests if not using
+        # the coordinator. Otherwise, the coordinator will
+        # overlap detokenization with the engine.
+        if not self.use_coordinator:
+            range_push("detokenization")
+            for record in finished_request_records:
+                for request in record.requests:
+                    if request.prompt is None:
+                        request.prompt = self.controller.tokenizer.detokenize(
+                            request.prompt_tokens.tolist()
+                        )
+                    request.generated_text = self.controller.tokenizer.detokenize(
+                        request.generated_tokens
                     )
-                request.generated_text = self.controller.tokenizer.detokenize(
-                    request.generated_tokens
-                )
+            range_pop()
 
         # Handle necessary ZMQ DP coordinator communication.
         if self.use_coordinator and self.is_mp_coordinator and finished_request_records:
+            range_push("coordinator_communication")
             payload = msgpack.packb(
                 [Headers.ENGINE_REPLY.value, [r.serialize() for r in finished_request_records]],
                 use_bin_type=True,
             )
             self.socket_for_receiving_requests.send(payload)
+            range_pop()
 
         # Log KV cache utilization stats to W&B
         if context_state["kv_stats"] is not None:
@@ -1183,12 +1346,15 @@ async def async_bookkeep(
                 )
 
         # Print context state.
-        if verbose:
+        if (
+            self.inference_logging_step_interval > 0
+            and step_count % self.inference_logging_step_interval == 0
+        ):
             mem = torch.cuda.memory_stats()
             step_type = "decode" if context_state["is_decode_only"] else "non-decode"
             output_str = (
                 "* rank %d | step %d | %s ... time: %.3f%s ... "
-                "reqs: a %d/%d, p %d/%d, w %d, f %d ... "
+                "reqs: a %d/%d, p %d, w %d, f %d, e %d ... "
                 "blocks: a %d/%d, p %d/%d ... "
                 "mem: tensors %d, alloc %.1f gb, res %.1f gb."
                 % (
@@ -1209,11 +1375,11 @@ async def async_bookkeep(
                         )
                     ),
                     context_state["total_request_count"] - context_state["paused_request_count"],
-                    context_state["total_active_block_count"],
+                    context_state["max_requests"],
                     context_state["paused_request_count"],
-                    context_state["total_paused_block_count"],
                     context_state["waiting_request_count"],
                     context_state["finished_request_count"],
+                    context_state["evicted_request_count"],
                     context_state["total_active_used_blocks"],
                     context_state["total_active_block_count"],
                     context_state["total_paused_used_blocks"],
@@ -1235,16 +1401,13 @@ async def async_bookkeep(
         }
 
     async def async_step(
-        self, *, verbose: bool = False
+        self,
     ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """
         Wrapper for controller.generate_output_tokens_dynamic_batch(), to
         match vLLM API. Uses `asyncio` for continuous generation which allows this
         method to sleep and wake up when new requests are available.
 
-        Args:
-            verbose (bool): Whether to run in verbose mode.
-
         Returns:
             A tuple comprised of:
                 1. Requests that ran in the last step and are still active.
@@ -1252,18 +1415,36 @@ async def async_step(
                 3. The step time in seconds.
         """
         last_step_data = await self.async_forward()
-        ret = await self.async_bookkeep(*last_step_data, verbose=verbose)
+        ret = await self.async_bookkeep(*last_step_data)
         # Keep for compatibility with current test suite.
         return ret
 
+    def _run_coroutine_sync(self, coro):
+        """Run a coroutine synchronously, handling the case when already in an event loop.
+
+        This method safely runs an async coroutine from synchronous code, even when
+        called from within an already running event loop (e.g., when used with async
+        frameworks like pytriton).
+        """
+        try:
+            # Check if there's already a running event loop
+            asyncio.get_running_loop()
+            # We're inside a running loop - run in a separate thread
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()
+        except RuntimeError:
+            # No running loop - safe to use run_until_complete
+            return self._loop.run_until_complete(coro)
+
     def step_modern(
-        self, *, verbose: bool = False
+        self,
     ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """Synchronous wrapper for `self.async_step`."""
-        return self._loop.run_until_complete(self.async_step(verbose=verbose))
+        return self._run_coroutine_sync(self.async_step())
 
     def step_legacy(
-        self, sampling_params: SamplingParams, *, verbose: bool = False
+        self, sampling_params: SamplingParams
     ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """Synchronous wrapper for `self.async_step`."""
         warnings.warn(
@@ -1271,7 +1452,7 @@ def step_legacy(
             "0.16. Please use `step_modern()` going forward, which will eventually "
             "be renamed to `step()`."
         )
-        result = self._loop.run_until_complete(self.async_step(verbose=verbose))
+        result = self._run_coroutine_sync(self.async_step())
         active_requests = [self.get_request(i) for i in result["active_request_ids"]]
         finished_requests = [r.merge() for r in result["finished_request_records"]]
         return active_requests, finished_requests, result["step_time"]
@@ -1330,7 +1511,7 @@ def schedule_requests(self) -> int:
             int: The number of messages that were received and processed in this batch.
         """
 
-        torch.cuda.nvtx.range_push("drain_zmq_socket")
+        range_push("drain_zmq_socket")
         all_messages = []
         if self.is_mp_coordinator:
             while True:
@@ -1363,7 +1544,7 @@ def schedule_requests(self) -> int:
             else:
                 all_messages = []
 
-        torch.cuda.nvtx.range_pop()
+        range_pop()
         for message in all_messages:
             data = msgpack.unpackb(message, raw=False)
             header = Headers(data[0])
@@ -1376,7 +1557,9 @@ def schedule_requests(self) -> int:
             if header == Headers.SUBMIT_REQUEST:
                 request_id, prompt, sampling_params = data[1:]
                 sampling_params = SamplingParams.deserialize(sampling_params)
+                range_push("add_request")
                 self.add_request(request_id, prompt, sampling_params)
+                range_pop()
             elif header == Headers.PAUSE:
                 # Pause thyself.
                 self.received_pause = True
@@ -1398,7 +1581,7 @@ def schedule_requests(self) -> int:
                 self.received_pause = False
             elif header == Headers.STOP_ACK:
                 self.stopped.set()
-                self.stop()
+                self.received_stop = False
             elif header == Headers.UNPAUSE:
                 self.paused.clear()
                 self.running.set()
@@ -1407,7 +1590,7 @@ def schedule_requests(self) -> int:
             elif header == Headers.RESUME:
                 self.suspend_signal = False
             elif header == Headers.STOP:
-                self.stopped = True
+                self.received_stop = True
             else:
                 raise UnknownHeaderError(header)
 
@@ -1423,15 +1606,15 @@ def stop(self):
         """
 
         if hasattr(self, "inference_coordinator_process"):
-            self.inference_coordinator_process.terminate()
+            self.inference_coordinator_process.join()
         for socket in self.zmq_sockets:
             socket.close()
+        if hasattr(self, "expert_parallel_zmq_communicator"):
+            self.expert_parallel_zmq_communicator.close()
         self.zmq_context.term()
 
     @trace_async_exceptions
-    async def run_engine(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
-    ):
+    async def run_engine(self, *, loop: Optional[asyncio.AbstractEventLoop] = None):
         """Continually steps the engine asynchronously."""
         self._loop = get_asyncio_loop(loop)
         self.use_coordinator = False
@@ -1448,13 +1631,51 @@ async def run_engine(
                             )
                         )
                     )
-                await self.async_step(verbose=verbose)
+                await self.async_step()
         except asyncio.CancelledError:
             pass
 
+    async def _ep_group_has_work(self, local_work: int) -> bool:
+        """Determines if there are some pending requests in the expert parallel group this
+        rank is a part of.
+        Args:
+            local_work (int): The local work count for this rank. This is a sum of active
+            and waiting requests.
+        Returns:
+            bool: True if there is some work in the EP group, False otherwise.
+        """
+        range_push("_ep_group_has_work")
+
+        is_stopped = self.stopped.is_set() or self.received_stop
+        is_paused = self.paused.is_set() or self.received_pause
+        is_suspended = self.suspend_signal
+        if is_stopped or is_paused or is_suspended:
+            # Signals can be received asynchronously on EP ranks.
+            # We do not want a rank to pause/stop/suspend prematurely if one of it's peers
+            # is yet to receive the signal.
+            # So this is an *attempt* to process the signal. This rank has received the signal
+            # and passes 0 to the all-reduce. If any other rank in the EP group has not received the signal yet,
+            # it will pass a non-zero value to the all-reduce, and hence the global work will be non-zero,
+            # and we will defer processing the signal.
+            # When all ranks receive the signal, global work will be zero, and we can process the signal safely.
+            local_work = 0
+
+        if self.ep_world_size > 1:
+            # Perform all-reduce to get max global work across EP group.
+            # Note that it is important to use a non-blocking asyncio-friendly all-reduce here.
+            # The user may have other tasks running in the event loop that need to be serviced.
+            # Do not using a torch.distributed blocking all-reduce here using nccl/gloo.
+            # We have tried that and it blocks the event loop is megatron-rl.
+            max_global_work = await self.expert_parallel_zmq_communicator.all_reduce_max(local_work)
+        else:
+            max_global_work = local_work
+
+        range_pop()
+        return max_global_work > 0
+
     @trace_async_exceptions
     async def run_engine_with_coordinator(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
+        self, *, loop: Optional[asyncio.AbstractEventLoop] = None
     ):
         """Continually steps the engine asynchronously."""
         self._loop = get_asyncio_loop(loop)
@@ -1462,8 +1683,6 @@ async def run_engine_with_coordinator(
         try:
             while True:
                 self.schedule_requests()
-                if self.stopped.is_set():
-                    break
 
                 # for the cases below (no active requests, or undergoing a state-change)
                 # do not use asyncio.sleep(0)
@@ -1475,30 +1694,51 @@ async def run_engine_with_coordinator(
                 # needed to send one message on an IPC socket. However
                 # just to be safe, we use 20ms here.
 
-                # todo [Siddharth]: Can this hardcoded sleep be avoided
-                # with asyncio zmq sockets?
-                if self.paused.is_set() or self.received_pause or self.received_stop:
-                    await asyncio.sleep(0.02)
-                    continue
-
-                # Suspend, resume.
-                if self.suspend_signal:
-                    self.suspend()
-                    await asyncio.sleep(0.02)
+                local_pending_requests = self.context.get_active_request_count() + len(
+                    self.waiting_request_ids
+                )
+                # 1. Check for work availability (Consensus Step)
+                ep_group_has_work = await self._ep_group_has_work(local_pending_requests)
+
+                # 2. Dummy Work Logic (Keep group alive if peers have work)
+                if ep_group_has_work and local_pending_requests == 0:
+                    # run dummy forward pass if EP group as a whole has work,
+                    # but this rank does not have any work.
+                    self.controller.dummy_forward()
                     continue
 
-                else:
-                    self.resume()
-
-                # No requests.
-                if (
-                    self.context.get_active_request_count() == 0
-                    and len(self.waiting_request_ids) == 0
-                ):
-                    await asyncio.sleep(0.02)
+                # 3. No work in EP group
+                # We handle control signals (PAUSE/STOP/SUSPEND) only when
+                # the entire EP group has received the signal. It is important to
+                # not process these signals immediately upon receipt, because
+                # other ranks in the EP group may not have received them yet.
+                # If we exit prematurely, other ranks will deadlock at the all-to-all.
+                # We use self._ep_group_has_work() to build consensus across the EP group
+                # as to when it is safe to process these signals. The function returns False
+                # when all ranks have received the signal.
+                if not ep_group_has_work:
+                    # Priority A: STOP
+                    if self.stopped.is_set():
+                        if self.rank == 0:
+                            logging.info("Stopping engine.")
+                        self.stop()
+                        break
+
+                    # Priority B: SUSPEND
+                    if self.suspend_signal:
+                        self.suspend()
+                    else:
+                        self.resume()
+
+                    # Priority C: PAUSE or no work - nothing needs to be done
+                    # To avoid flooding the TP publisher socket with packets,
+                    # we sleep for 20 ms here.
+                    # todo [Siddharth]: Can this hardcoded sleep be avoided
+                    # with asyncio zmq sockets?
+                    await asyncio.sleep(0.02)  # Yield to event loop
                     continue
 
-                await self.async_step(verbose=verbose)
+                await self.async_step()
 
         except asyncio.CancelledError:
             pass
diff --git a/megatron/core/inference/inference_client.py b/megatron/core/inference/inference_client.py
index 8a19e226c46..a927a393b8c 100644
--- a/megatron/core/inference/inference_client.py
+++ b/megatron/core/inference/inference_client.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import logging
-import os
 import time
 from typing import Awaitable, List, Optional, Union
 
@@ -54,12 +53,12 @@ class InferenceClient:
             completed requests.
     """
 
-    def __init__(self, inference_coordinator_port: int):
+    def __init__(self, inference_coordinator_address: str):
         """
         Initializes the InferenceClient.
 
         Args:
-            inference_coordinator_port (int): The port number on which the
+            inference_coordinator_address (str): The address on which the
                 inference coordinator is listening.
         """
         assert (
@@ -70,8 +69,7 @@ def __init__(self, inference_coordinator_port: int):
         ), "please install the messagepack library to use InferenceClient - pip install msgpack"
         self.context = zmq.Context()
         socket = self.context.socket(zmq.DEALER)
-        inference_coordinator_address = os.getenv('MASTER_ADDR', '127.0.0.1')
-        socket.connect(f"tcp://{inference_coordinator_address}:{inference_coordinator_port}")
+        socket.connect(inference_coordinator_address)
 
         self._loop = None
         self.running = asyncio.Event()
@@ -111,7 +109,7 @@ def add_request(
         payload_serialized = msgpack.packb(payload, use_bin_type=True)
         self.socket.send(payload_serialized)
         assert request_id not in self.completion_futures
-        self.completion_futures[request_id] = self._loop.create_future()
+        self.completion_futures[request_id] = asyncio.get_running_loop().create_future()
         self.request_submission_times[request_id] = time.perf_counter()
         return self.completion_futures[request_id]
 
@@ -141,7 +139,10 @@ async def _recv_task(self):
                     if completion_future.done():
                         logging.warning(f"Client: The future for {request_id} has been cancelled!")
                         continue
-                    completion_future.set_result(DynamicInferenceRequestRecord.deserialize(reply))
+                    completed_request = DynamicInferenceRequestRecord.deserialize(reply)
+                    completion_future.get_loop().call_soon_threadsafe(
+                        completion_future.set_result, completed_request
+                    )
                 elif header == Headers.PAUSE_ACK:
                     self.paused.set()
                 elif header == Headers.STOP_ACK:
diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py
index 6d0ff898bad..6a7354220f9 100644
--- a/megatron/core/inference/inference_request.py
+++ b/megatron/core/inference/inference_request.py
@@ -1,46 +1,47 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import copy
-import io
 import time
 import warnings
 from dataclasses import asdict, dataclass, field
 from enum import Enum, auto
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 
 from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.tokenizers import MegatronTokenizer
+from megatron.core.utils import experimental_api
 
 
-def serialize_tensor(tensor: torch.Tensor) -> bytes:
+def serialize_tensor(tensor: torch.Tensor) -> List:
     """Serialize tensor to bytes.
 
     Args:
         tensor (Tensor): Tensor.
 
     Returns:
-        (bytes) Byte representation of tensor.
+        (List) Tensor as a list
     """
-    buffer = io.BytesIO()
-    torch.save(tensor, buffer)
-    buffer.seek(0)
-    tensor_bytes = buffer.read()
-    return tensor_bytes
+    torch.cuda.nvtx.range_push("serialize_tensor")
 
+    # simply convert tensor into a list
+    tensor = tensor.cpu().tolist()
 
-def deserialize_tensor(tensor_bytes: bytes) -> torch.Tensor:
+    torch.cuda.nvtx.range_pop()
+    return tensor
+
+
+def deserialize_tensor(tensor_as_list: List) -> torch.Tensor:
     """Deserialize tensor from bytes.
 
     Args:
-        tensor_bytes (bytes): Byte representation of tensor.
+        tensor_as_list (List): List representation of tensor.
 
     Returns:
         (Tensor) Tensor.
     """
-    buffer = io.BytesIO(tensor_bytes)
-    tensor = torch.load(buffer)
+    tensor = torch.tensor(tensor_as_list)
     return tensor
 
 
@@ -98,17 +99,21 @@ def serialize(self) -> dict:
             (dict) A dictionary representation of the instance suitable for
                 serialization.
         """
-
         # Dataclass to dict.
-        obj = asdict(self)
+        # do not use asdict(self) - it has very high CPU overheads
+        # and if there are tensors, it will try to deepcopy them
+        obj = self.__dict__.copy()  # shallow dict copy
         obj["status"] = self.status.name if self.status else None
+        obj["sampling_params"] = self.sampling_params.serialize() if self.sampling_params else None
+        obj["inference_parameters"] = (
+            self.inference_parameters.serialize() if self.inference_parameters else None
+        )
 
         # Serialize tensors.
         obj = {
             k: (("tensor", serialize_tensor(v)) if isinstance(v, torch.Tensor) else v)
             for k, v in obj.items()
         }
-
         return obj
 
     @classmethod
@@ -124,14 +129,31 @@ def deserialize(cls, obj: dict) -> "InferenceRequest":
 
         # Initialize request.
         request = cls(**obj)
-        request.status = None if obj["status"] is None else Status[obj["status"]]
+        request._post_deserialize(obj)
+        return request
 
-        # Deserialize tensors.
+    def _post_deserialize(self, obj: dict):
+        """
+        This is called after the dataclass is initialized to handle any special
+        deserialization logic.
+        """
+        # Deserialize status.
+        self.status = None if obj["status"] is None else Status[obj["status"]]
+        self.sampling_params = (
+            None
+            if obj["sampling_params"] is None
+            else SamplingParams.deserialize(obj["sampling_params"])
+        )
+        self.inference_parameters = (
+            None
+            if obj["inference_parameters"] is None
+            else SamplingParams.deserialize(obj["inference_parameters"])
+        )
+
+        # Deserialize tensors and sampling params.
         for k, v in obj.items():
             if isinstance(v, list) and len(v) == 2 and v[0] == "tensor":
-                setattr(request, k, deserialize_tensor(v[1]))
-
-        return request
+                setattr(self, k, deserialize_tensor(v[1]))
 
 
 class DynamicInferenceEventType(Enum):
@@ -139,6 +161,7 @@ class DynamicInferenceEventType(Enum):
 
     ADD = auto()
     PAUSE = auto()
+    EVICT = auto()
     FINISH = auto()
     FAIL = auto()
     ERROR_TRANSIENT = auto()
@@ -153,6 +176,7 @@ class DynamicInferenceEvent:
 
     - request added
     - request paused
+    - request evicted
     - request finished
     - request failed
     - request error (transient)
@@ -194,7 +218,10 @@ def serialize(self) -> dict:
         """
 
         # Dataclass to dict.
-        obj = asdict(self)
+        torch.cuda.nvtx.range_push("DynamicInferenceEvent.serialize")
+        # do not use asdict(self) - it has very high CPU overheads
+        # and if there are tensors, it will try to deepcopy them
+        obj = self.__dict__.copy()
         obj["type"] = self.type.name
 
         # Serialize payload.
@@ -202,7 +229,7 @@ def serialize(self) -> dict:
             from .contexts.dynamic_context import ContextErrorFactory  # avoid circular import.
 
             obj["payload"] = ContextErrorFactory.serialize(self.payload)
-
+        torch.cuda.nvtx.range_pop()
         return obj
 
     @classmethod
@@ -228,6 +255,7 @@ def deserialize(cls, obj: dict) -> "DynamicInferenceEvent":
         return event
 
 
+@experimental_api
 @dataclass(kw_only=True)
 class DynamicInferenceRequest(InferenceRequest):
     """Class for one inference request
@@ -243,7 +271,8 @@ class DynamicInferenceRequest(InferenceRequest):
     # remaining prompt tokens are used for chunked prefill
     remaining_prompt_tokens: Optional[torch.Tensor] = None
     latency: Optional[float] = None
-    finished_chunk_token_count = 0
+    finished_chunk_token_count: int = 0
+    stop_word_ids: Optional[List[List[int]]] = None  # Tokenized stop words (populated internally)
 
     def __post_init__(self):
         self.sampling_params = copy.deepcopy(self.sampling_params)
@@ -270,30 +299,22 @@ def __str__(self):
             )
         )
 
-    def serialize(self):
+    def serialize(self) -> dict:
         """Converts the instance into a serializable dictionary.
 
         Returns:
             (dict) A dictionary representation of the instance suitable for
                 serialization.
         """
+        torch.cuda.nvtx.range_push("DynamicInferenceRequest.serialize")
         obj = super().serialize()
         obj["events"] = [e.serialize() for e in self.events]
+        torch.cuda.nvtx.range_pop()
         return obj
 
-    @classmethod
-    def deserialize(cls, obj: dict) -> "DynamicInferenceRequest":
-        """Deserialize request.
-
-        Args:
-            obj (dict): Serialized request data.
-
-        Returns:
-            (DynamicInferenceRequest) Deserialized request.
-        """
-        request = super().deserialize(obj)
-        request.events = [DynamicInferenceEvent.deserialize(e) for e in obj["events"]]
-        return request
+    def _post_deserialize(self, obj):
+        super()._post_deserialize(obj)
+        self.events = [DynamicInferenceEvent.deserialize(e) for e in obj["events"]]
 
     @property
     def tracked_metadata(self) -> List[Any]:
@@ -313,21 +334,27 @@ def tracked_metadata(self) -> List[Any]:
                     "in its sampling_params. Defaulting to -1."
                 )
             sp.termination_id = -1
-        return [getattr(sp, field) for field in self.get_metadata_labels().keys()]
+        return [getattr(sp, field) for field, _, _ in self.get_metadata_types()]
 
     @staticmethod
-    def get_metadata_labels() -> Dict[str, int]:
-        """Provides human-readable labels for the tracked metadata fields."""
-        ret = [
-            "temperature",
-            "top_k",
-            "top_p",
-            "termination_id",
-            "return_log_probs",
-            "skip_prompt_log_probs",
-            "top_n_logprobs",
+    def get_metadata_types() -> List[Tuple[str, torch.dtype, bool]]:
+        """Keeps track of all request metadata names, dtypes, and target device.
+
+        Returns:
+            List[Tuple[str, torch.dtype, bool]]: Mapping from metadata name to:
+                name (str) - The name of the metadata field.
+                dtype (torch.dtype) - The datatype of the metadata.
+                on_device (bool) - Whether the metadata lives on GPU (True) or CPU (False).
+        """
+        return [
+            ("temperature", torch.float32, False),  # CPU for torch sampling
+            ("top_k", torch.int32, False),  # CPU for torch sampling
+            ("top_p", torch.float32, False),  # CPU for torch sampling
+            ("termination_id", torch.int64, True),
+            ("return_log_probs", torch.bool, False),  # CPU for non-selective logprobs
+            ("skip_prompt_log_probs", torch.bool, False),  # CPU for non-selective logprobs
+            ("top_n_logprobs", torch.int32, False),  # CPU for torch sampling
         ]
-        return {k: v for v, k in enumerate(ret)}
 
     def add_event(self, type: DynamicInferenceEventType, payload: Optional[Any] = None) -> None:
         """Add event."""
@@ -341,6 +368,10 @@ def add_event_pause(self):
         """Add 'pause' event."""
         return self.add_event(DynamicInferenceEventType.PAUSE)
 
+    def add_event_evict(self):
+        """Add 'evict' event."""
+        return self.add_event(DynamicInferenceEventType.EVICT)
+
     def add_event_finish(self):
         """Add 'finish' event."""
         return self.add_event(DynamicInferenceEventType.FINISH)
@@ -368,8 +399,8 @@ def failed(self) -> bool:
 
 @dataclass(kw_only=True)
 class DynamicInferenceRequestRecord:
-    """History of DynamicInferenceRequest objects over multiple suspend and
-    resumes."""
+    """History of DynamicInferenceRequest objects over multiple request
+    checkpoints."""
 
     requests: list[DynamicInferenceRequest] = field(default_factory=list)
     latency: Optional[float] = None
@@ -408,9 +439,9 @@ def request_id(self) -> int:
         """
         return self.requests[0].request_id
 
-    def suspend(self, tokenizer: MegatronTokenizer | None = None):
-        """Suspend request by storing references to previous prompt, generations,
-        and sampling params.
+    def checkpoint(self, tokenizer: MegatronTokenizer | None = None):
+        """Maintain reference to previous request, and then append a new request
+        that concatenates the previous prompt and generations.
 
         Args:
             tokenizer (MegatronTokenizer | None): (Deprecated) Tokenizer.
@@ -451,7 +482,7 @@ def suspend(self, tokenizer: MegatronTokenizer | None = None):
         self.requests.append(new_request)
 
     def merge(self, tokenizer: MegatronTokenizer | None = None) -> DynamicInferenceRequest:
-        """Merge requests into a single suspend-agnostic request object.
+        """Merge requests into a single checkpoint-agnostic request object.
 
         Args:
             tokenizer (MegatronTokenizer | None): (Deprecated) Tokenizer.
@@ -469,7 +500,10 @@ def merge_lists(key):
         prompt_tokens = self.requests[0].prompt_tokens
         prompt_text = self.requests[0].prompt
         generated_tokens = merge_lists("generated_tokens")
-        generated_text = "".join(r.generated_text for r in self.requests)
+        try:
+            generated_text = "".join(r.generated_text for r in self.requests)
+        except TypeError as e:  # generally means r.generated_text is None
+            generated_text = None
 
         # Merged request.
         request = DynamicInferenceRequest(
@@ -499,8 +533,10 @@ def serialize(self) -> dict:
             (dict) A dictionary representation of the instance suitable for
                 serialization.
         """
-        obj = asdict(self)
-        obj["requests"] = [r.serialize() for r in self.requests]
+        torch.cuda.nvtx.range_push("DynamicInferenceRequestRecord.serialize")
+        obj = self.__dict__.copy()  # shallow dict copy
+        obj["requests"] = [r.serialize() for r in obj["requests"]]
+        torch.cuda.nvtx.range_pop()
         return obj
 
     @classmethod
diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
index 95d476a9f83..6a17de685bf 100644
--- a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
+++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
@@ -7,7 +7,6 @@
 
 import torch
 
-from megatron.core import parallel_state
 from megatron.core.fp8_utils import prepare_model_for_fp8_inference
 from megatron.core.inference.communication_utils import (
     is_pipeline_first_stage,
@@ -21,7 +20,7 @@
 )
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.utils import get_model_config
+from megatron.core.utils import get_attr_wrapped_model, get_model_config
 
 
 # pylint: disable=line-too-long
@@ -73,10 +72,7 @@ def __init__(
         self.inference_context = inference_context
 
         if pg_collection is None:
-            pg_collection = ProcessGroupCollection(
-                tp=parallel_state.get_tensor_model_parallel_group(),
-                pp=parallel_state.get_pipeline_model_parallel_group(),
-            )
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups()
 
         self.tp_group = pg_collection.tp
         self.pp_group = pg_collection.pp
@@ -166,6 +162,23 @@ def _forward(self, inference_input):
             runtime_gather_output=True,  # Inference should always gather the logits
         )
 
+    @torch.no_grad()
+    def dummy_forward(self):
+        """Run a dummy forward pass through the model, with a single token.
+        Use-case: Used in EP on ranks which do not have any work, but are needed
+        for the all-to-all communication."""
+        # we use num_dummy_tokens equal to tensor model parallel size
+        # so that the dummy forward pass will work with sequence parallel
+        num_dummy_tokens = self.tp_size
+        tokens = torch.zeros(
+            (1, num_dummy_tokens), dtype=torch.long, device=torch.cuda.current_device()
+        )
+        position_ids = torch.zeros(
+            (1, num_dummy_tokens), dtype=torch.long, device=torch.cuda.current_device()
+        )
+        attention_mask = None
+        return self.model(tokens, position_ids, attention_mask)
+
     def _get_batch_size_and_seq_len(
         self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None
     ):
@@ -239,7 +252,8 @@ def forward_pass_with_pipeline_parallel_small_input_batch(
             recv_buffer = self._allocate_recv_buffer(batch_size, seq_len)
             recv_from_prev_pipeline_rank_(recv_buffer, self.pp_group)
 
-        self.model.set_input_tensor(recv_buffer)
+        set_input_tensor = get_attr_wrapped_model(self.model, "set_input_tensor")
+        set_input_tensor(recv_buffer)
         output_tensor = self._forward(inference_input)
 
         if not is_pipeline_last_stage(self.pp_group):
@@ -364,9 +378,7 @@ def run_one_forward_step(
             torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models.
         """
         # Check if we are in a PP model
-        if not (
-            parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
-        ):
+        if not (is_pipeline_first_stage(self.pp_group) and is_pipeline_last_stage(self.pp_group)):
             tokens = inference_input["tokens"]
             current_batch_size, seq_len = self._get_batch_size_and_seq_len(
                 tokens, recv_buffer_seq_len
diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
index 430126816a7..ba89fbc2f6c 100644
--- a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
+++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
@@ -12,6 +12,7 @@
 )
 from megatron.core.inference.utils import get_attention_mask
 from megatron.core.models.gpt import GPTModel
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.enums import AttnBackend
 from megatron.core.utils import get_model_config
 
@@ -28,6 +29,8 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
             size, etc.
         inference_context (BaseInferenceContext): Manages KV cache, and tracks
             sequence/token/batch offsets.
+        pg_collection (ProcessGroupCollection): Process groups for model communication.
+            If not provided, defaults to global parallel state groups.
     """
 
     def __init__(
@@ -35,8 +38,9 @@ def __init__(
         model: GPTModel,
         inference_wrapper_config: InferenceWrapperConfig,
         inference_context: Optional[BaseInferenceContext] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
-        super().__init__(model, inference_wrapper_config, inference_context)
+        super().__init__(model, inference_wrapper_config, inference_context, pg_collection)
 
     def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]:
         """Prepares the inference input data.
diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py
index 6a4c5736706..ba1acae4c57 100644
--- a/megatron/core/inference/sampling_params.py
+++ b/megatron/core/inference/sampling_params.py
@@ -2,7 +2,7 @@
 
 import warnings
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional
 
 
 @dataclass
@@ -30,6 +30,9 @@ class SamplingParams:
     top_n_logprobs: int = 0
     return_prompt_top_n_logprobs: bool = False  # Deprecated field for backwards compatibility
     add_BOS: bool = False
+    stop_words: Optional[List[str]] = (
+        None  # List of strings that will stop generation when produced
+    )
 
     def __post_init__(self):
         """Ensure backward compatibility for return_prompt_top_n_logprobs.
@@ -48,7 +51,7 @@ def _sync_prompt_logprobs_fields(self):
                 DeprecationWarning,
             )
             assert (
-                self.skip_prompt_log_probs
+                not self.skip_prompt_log_probs
             ), "return_prompt_top_n_logprobs requires skip_prompt_log_probs to be False"
         if self.top_n_logprobs > 0:
             self.return_prompt_top_n_logprobs = not self.skip_prompt_log_probs
diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py
index dcb8b419e74..a5233983ed0 100644
--- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py
+++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py
@@ -6,7 +6,7 @@
 import functools
 import inspect
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, OrderedDict, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, OrderedDict, Tuple, Union
 
 import torch
 import torch.nn.functional as F
@@ -20,16 +20,13 @@
     is_pipeline_last_stage,
 )
 from megatron.core.inference.contexts.dynamic_context import MaxSequenceLengthOverflowError
-from megatron.core.inference.inference_request import (
-    DynamicInferenceRequest,
-    InferenceRequest,
-    Status,
-)
+from megatron.core.inference.inference_request import InferenceRequest, Status
 from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
     AbstractModelInferenceWrapper,
 )
 from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.moe.moe_layer import BaseMoELayer
 from megatron.core.transformer.utils import set_model_to_sequence_parallel
 from megatron.core.utils import get_asyncio_loop, get_model_config, unwrap_model
@@ -81,32 +78,46 @@ def __init__(
         if self.inference_wrapped_model.inference_context.is_dynamic_batching():
             self._init_dynamic_sampling_tensors()
 
+    def set_stop_word_finished_ids_callback(self, callback):
+        """Set a callback to get request IDs that should be marked as finished due to stop words.
+
+        The callback should have signature: callback(active_request_ids: List[int]) -> Set[int]
+        Returns a set of request IDs from active_request_ids that should be marked as finished.
+
+        Args:
+            callback: Function that returns request IDs to mark as finished.
+        """
+        self._get_stop_word_finished_ids_callback = callback
+
     def _init_dynamic_sampling_tensors(self):
         """Initialize tensors needed for dynamic sampling."""
         context = self.inference_wrapped_model.inference_context
-        max_requests = context.max_total_requests
+        max_requests = context.max_requests
+
+        # Callback to get request IDs that should be marked as finished due to stop words
+        self._get_stop_word_finished_ids_callback = None
 
         device = torch.cuda.current_device()
         logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype
         # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2.
         vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size
 
-        # Initialize bookkeeping tensors.
-        self.sampling_logits_cuda = torch.empty(
-            max_requests, vocab_size, dtype=logits_dtype, device=device
-        )
-        self.sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device)
+        self._sampling_backend = "torch"
+        self._sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device)
 
-        self.temperature_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float)
-        self.top_k_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.int32)
-        self.top_p_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float)
-        self.termination_id_cuda = torch.empty(max_requests, dtype=torch.int64, device=device)
-        self.return_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device)
-        self.skip_prompt_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device)
-        self.top_n_logprobs_cuda = torch.empty(max_requests, dtype=torch.int32, device=device)
+        # Keep track of request metadata.
+        self._request_metadata: Dict[str, Tensor] = {}
+        for label, dtype, on_gpu in context.request_metadata_types:
+            tensor = context.request_metadata[label]
+            if not on_gpu:
+                # Create pinned tensors for request metadata that lives on CPU.
+                # This is metadata which requires D2H copies, such as top_k for torch sampling.
+                tensor = torch.empty_like(tensor, device="cpu", pin_memory=True)
+            self._request_metadata[label] = tensor
 
         # Used for inefficient torch sampling.
-        self.torch_sampling_buckets: List[Tensor] = []
+        if self._sampling_backend == "torch":
+            self._torch_sampling_buckets: Iterator[Tuple] = []
 
     def tokenize_prompt(self, prompt: str, add_BOS: bool = False) -> List[int]:
         """Utility to tokenize the input prompts.
@@ -478,13 +489,16 @@ def unpad_input_prompt_tokens(
         return padded_batch_prompt_tokens[:original_batch_size]
 
     def _dynamic_step_context_init(
-        self, construct_graph_dimensions: Optional[InferenceBatchDimensions] = None
+        self,
+        construct_graph_dimensions: Optional[InferenceBatchDimensions] = None,
+        is_dummy_forward: bool = False,
     ):
         """Initializes the inference context for dynamic batching.
 
         Args:
             construct_graph_dimensions (Optional[InferenceBatchDimensions]): The graph config to use
                 for constructing the cuda graphs.
+            is_dummy_forward (bool): Whether we are running an expert parallel dummy forward pass
 
         Return:
             input_ids (Tensor): The active input IDs.
@@ -492,6 +506,7 @@ def _dynamic_step_context_init(
         """
         context = self.inference_wrapped_model.inference_context
         inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config
+        active_request_slice = slice(context.paused_request_count, context.total_request_count)
 
         # Remove Float16Module wrapper if it exists
         unwrapped_model = unwrap_model(self.inference_wrapped_model.model)
@@ -504,17 +519,21 @@ def _dynamic_step_context_init(
         # for prefill turn off symmetric kernels
         symmetric_ar_type = model_config.symmetric_ar_type
         nccl_all_reduce_for_prefill = inference_wrapper_config.nccl_all_reduce_for_prefill
-        # Turning on/off MoE padding for prefill
+        # Turning on/off MoE padding for cuda-graphs
         moe_pad_experts_for_cuda_graph_inference = (
             inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference
         )
         if moe_pad_experts_for_cuda_graph_inference:
-            if context.is_decode_only():
+            if context.using_cuda_graph_this_step():
                 capacity_factor = model_config.num_moe_experts / model_config.moe_router_topk
                 set_decode_expert_padding(unwrapped_model, True, capacity_factor=capacity_factor)
             else:
                 set_decode_expert_padding(unwrapped_model, False)
 
+        # initialize symmetric memory if needed
+        if model_config.transformer_impl == "inference_optimized":
+            context.maybe_initialize_symmetric_memory()
+
         if nccl_all_reduce_for_prefill and symmetric_ar_type is not None:
             if context.is_decode_only():
                 # Turn on symmetric all reduce when in decode mode
@@ -523,8 +542,18 @@ def _dynamic_step_context_init(
                 # Turn off symmetric all reduces for prefill
                 unwrapped_model.set_symmetric_ar(None)
 
+        # Get request metadata for this step.
+        for label, dtype, on_gpu in context.request_metadata_types:
+            if not on_gpu:
+                # We need a D2H copy from the context to the pinned memory buffer.
+                self._request_metadata[label].copy_(
+                    context.request_metadata[label], non_blocking=True
+                )
+
         # Get flat tokens, position ids.
-        if construct_graph_dimensions is not None:
+        # If we are running a dummy forward step we want to use the token count agreed upon
+        # by all EP ranks rather than the minimum number of tokens.
+        if construct_graph_dimensions is not None and not is_dummy_forward:
             return context.current_input_and_position_ids(
                 num_warmup_tokens=construct_graph_dimensions.token_count
             )
@@ -543,8 +572,6 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
         inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config
 
         context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
-
         active_request_count = context.total_request_count - context.paused_request_count
 
         with torch.inference_mode():
@@ -554,7 +581,9 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
 
         if self.model_is_pipeline_parallel:
             logits_seq_len = (
-                active_request_count if materialize_only_last_token_logits else input_ids.shape[1]
+                active_request_count
+                if context.materialize_only_last_token_logits
+                else input_ids.shape[1]
             )
             vocab_size = inference_wrapper_config.padded_vocab_size
             logits_shape = [1, logits_seq_len, vocab_size]
@@ -568,178 +597,94 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor)
                 tensor=logits,
                 pp_group=self.pp_group,
             )
-        return logits
 
-    def _dynamic_step_sample_bookkeeping(
-        self,
-        *,
-        backend: str = "torch",
-        request_metadata: Optional[Tensor] = None,
-        request_metadata_labels: Dict[str, int] = None,
-    ):
-        """Perform bookkeeping necessary to sample logits for dynamic batching.
-
-        The ability to override the context's data is solely intended for
-            standalone use or testing, and should never be used in a running system.
+        return logits
 
-        Args:
-            backend (str): The sampling backend to use.
-            request_metadata (Optional[Tensor]): An override for the tensor that manages all
-                request metadata, such as sampling parameters. By default, this metadata is
-                retrieved from the context.
-            request_metadata_labels (Optional[Dict]): An override for the map of metadata labels
-                to their index in the request_metadata tensor. By default, this metadata is
-                retrieved from the request object.
-        """
-        assert backend in ["torch"]
+    def _dynamic_step_sample_bookkeeping(self):
+        """Perform bookkeeping necessary to sample logits for dynamic batching."""
         context = self.inference_wrapped_model.inference_context
+        active_request_slice = slice(context.paused_request_count, context.total_request_count)
 
-        if request_metadata is None:
-            request_metadata = context.request_metadata[
-                context.paused_request_count : context.total_request_count, :
-            ]
-        if request_metadata_labels is None:
-            request_metadata_labels = DynamicInferenceRequest.get_metadata_labels()
-        active_request_count = request_metadata.size(0)
-
-        # Shorthand these, because the torch backend needs them.
-        temp = request_metadata[:, request_metadata_labels["temperature"]]
-        top_k = request_metadata[:, request_metadata_labels["top_k"]]
-        top_p = request_metadata[:, request_metadata_labels["top_p"]]
-
-        # Copy data into relevant tensors.
-        self.temperature_cuda[:active_request_count].copy_(temp, non_blocking=True)
-        self.top_k_cuda[:active_request_count] = top_k.to(
-            dtype=torch.int32, copy=True, non_blocking=True
-        )
-        self.top_p_cuda[:active_request_count].copy_(top_p, non_blocking=True)
-        self.termination_id_cuda[:active_request_count] = request_metadata[
-            :, request_metadata_labels["termination_id"]
-        ].to(dtype=torch.int64, copy=True, non_blocking=True)
-        self.return_log_probs_cuda[:active_request_count] = request_metadata[
-            :, request_metadata_labels["return_log_probs"]
-        ].to(dtype=torch.bool, copy=True, non_blocking=True)
-        self.skip_prompt_log_probs_cuda[:active_request_count] = request_metadata[
-            :, request_metadata_labels["skip_prompt_log_probs"]
-        ].to(dtype=torch.bool, copy=True, non_blocking=True)
-        self.top_n_logprobs_cuda[:active_request_count] = request_metadata[
-            :, request_metadata_labels["top_n_logprobs"]
-        ].to(dtype=torch.int32, copy=True, non_blocking=True)
-
-        if backend == "torch":
+        if self._sampling_backend == "torch":
             # Bucketize the core sampling parameters.
-            core_params = torch.stack((temp, top_k, top_p), dim=1)
-            _, inv_indices, cnts = torch.unique(
-                core_params, dim=0, return_inverse=True, return_counts=True
-            )
-            order = torch.argsort(inv_indices, stable=True)
-            sampling_buckets = torch.split(order, cnts.tolist())
-            # Perform the D2H sync needed by `_torch_sampling_func` here.
-            group_reps = torch.stack([indices[0] for indices in sampling_buckets], dim=0)
-            core_params_reps = core_params[group_reps].detach().cpu()
-            temp_reps = core_params_reps[:, 0].tolist()
-            top_k_reps = core_params_reps[:, 1].to(torch.int32).tolist()
-            top_p_reps = core_params_reps[:, 2].tolist()
+            # Doing so via list comprehension is orders of magnitude faster than via torch.
+            bucket_map = {}
+
+            # Shorthands for the dictionary comprehension.
+            temp = self._request_metadata["temperature"][active_request_slice].tolist()
+            top_k = self._request_metadata["top_k"][active_request_slice].tolist()
+            top_p = self._request_metadata["top_p"][active_request_slice].tolist()
+
+            for i, (t, k, p) in enumerate(zip(temp, top_k, top_p)):
+                h = (t, k, p)
+                bucket = bucket_map.get(h, None)
+                if bucket is None:
+                    bucket_map[h] = ([i], i)
+                else:
+                    bucket[0].append(i)
+
             # Store the buckets and their equivalence class representatives.
-            self.torch_sampling_buckets = (
-                (sampling_buckets[idx], temp_reps[idx], top_k_reps[idx], top_p_reps[idx])
-                for idx in range(len(sampling_buckets))
+            self._torch_sampling_buckets = (
+                (indices, temp[rep], top_k[rep], top_p[rep]) for indices, rep in bucket_map.values()
             )
 
-    def _dynamic_step_sample_logits(self, logits: Tensor, backend: str = "torch") -> Tensor:
+    def _dynamic_step_sample_logits(self, logits: Tensor):
         """Sample tokens from logits for dynamic batching.
 
         Args:
-            logits (Tensor): The logits to sample from.
-            backend (str): The sampling backend to use.
-
-        Returns:
-            new_sample (Tensor): The sampled tokens.
+            logits (Tensor): The logits from the forward pass.
         """
         # TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank
         # and then broadcast the sampled tokens rather than broadcasting the raw logits.
-        assert backend in ["torch"]
-
-        context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
 
         # Last token logits.
-        if materialize_only_last_token_logits:
+        context = self.inference_wrapped_model.inference_context
+        if context.materialize_only_last_token_logits:
             # When materialize_only_last_token_logits is true, last_token_logits is
             # already called in the forward pass of GPT.
             last_token_logits = logits.squeeze(0)
         else:
             last_token_logits = context.last_token_logits(logits)
-        active_request_count = last_token_logits.size(0)
-        # Copy last_token_logits to contiguous buffer.
-        self.sampling_logits_cuda[:active_request_count].copy_(last_token_logits, non_blocking=True)
 
-        if backend == "torch":
+        if self._sampling_backend == "torch":
             # Concatenate the outputs once to prevent repeated small writes.
             token_list = []
             indices_list = []
 
-            for indices, temp, top_k, top_p in self.torch_sampling_buckets:
+            for indices, temp, top_k, top_p in self._torch_sampling_buckets:
                 token_list.append(
-                    self._torch_sampling_func(
-                        self.sampling_logits_cuda[indices, :], temp, top_k, top_p
-                    )
+                    self._torch_sampling_func(last_token_logits[indices, :], temp, top_k, top_p)
                 )
-                indices_list.append(indices)
+                indices_list.append(torch.tensor(indices))
 
             # Single write to the output tensor.
             sampled_tokens = torch.cat(token_list, dim=0)
             sampled_indices = torch.cat(indices_list, dim=0)
-            self.sampled_tokens_cuda.index_copy_(0, sampled_indices, sampled_tokens)
-        return self.sampled_tokens_cuda[:active_request_count].clone()
-
-    def _dynamic_step_log_probs_bookkeeping(self) -> bool:
-        """Perform bookkeeping necessary to compute log probs for dynamic batching."""
-        context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
-
-        active_request_count = context.total_request_count - context.paused_request_count
-
-        # Create a copy to avoid modifying the original tensor with in-place operations
-        to_check = self.return_log_probs_cuda[:active_request_count].clone()
-        to_check &= ~self.skip_prompt_log_probs_cuda[:active_request_count]
+            self._sampled_tokens_cuda[sampled_indices] = sampled_tokens
 
-        assert not (
-            to_check.any() and materialize_only_last_token_logits
-        ), "Prompt log probs cannot be calculated if only last token logits are materialized. Set materialize_only_last_token_logits to False in DynamicInferenceContext or skip_prompt_log_probs to True in SamplingParams."
+    def _dynamic_step_log_probs_bookkeeping(self) -> Tuple[bool, bool]:
+        """Perform bookkeeping necessary to compute log probs for dynamic batching.
 
-        return self.return_log_probs_cuda[:active_request_count].any()
-
-    def _dynamic_step_top_n_logprobs_bookkeeping(self) -> bool:
-        """Perform bookkeeping necessary to compute top-n log probs for dynamic batching."""
+        Returns:
+            return_log_probs (bool): Whether to return the sampled log_probs.
+        """
         context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
-
-        active_request_count = context.total_request_count - context.paused_request_count
-
-        # Check if any request wants prompt top-n logprobs (top_n > 0 AND skip_prompt_log_probs = False)
-        # Create a copy to avoid modifying the original tensor with in-place operations
-        to_check = (self.top_n_logprobs_cuda[:active_request_count] > 0).clone()
-        to_check &= ~self.skip_prompt_log_probs_cuda[:active_request_count]
+        active_request_slice = slice(context.paused_request_count, context.total_request_count)
 
-        assert not (
-            to_check.any() and materialize_only_last_token_logits
-        ), "Prompt top-n logprobs cannot be calculated if only last token logits are materialized. Set materialize_only_last_token_logits to False in DynamicInferenceContext or set skip_prompt_log_probs to True in SamplingParams."
+        return_log_probs = self._request_metadata["return_log_probs"][active_request_slice]
+        top_n_log_probs = self._request_metadata["top_n_logprobs"][active_request_slice] > 0
 
-        # Check if any request has top_n_logprobs > 0
-        return (self.top_n_logprobs_cuda[:active_request_count] > 0).any()
+        return return_log_probs.any(), top_n_log_probs.any()
 
     def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]:
         """Calculate log probs from logits."""
         context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
-
         active_request_count = context.total_request_count - context.paused_request_count
 
         return context.calculate_log_probs(
             logits,
-            self.sampled_tokens_cuda[:active_request_count],
-            only_last_token_logits=materialize_only_last_token_logits,
+            self._sampled_tokens_cuda[:active_request_count],
+            only_last_token_logits=context.materialize_only_last_token_logits,
         )
 
     def _dynamic_step_calculate_top_n_logprobs(
@@ -763,19 +708,20 @@ def _dynamic_step_calculate_top_n_logprobs(
         )
 
         context = self.inference_wrapped_model.inference_context
-        materialize_only_last_token_logits = context.materialize_only_last_token_logits
-
         active_request_count = context.total_request_count - context.paused_request_count
+        active_request_slice = slice(context.paused_request_count, context.total_request_count)
 
         # Handle decode-only mode (only last token)
-        if materialize_only_last_token_logits or context.is_decode_only():
+        if context.materialize_only_last_token_logits or context.is_decode_only():
             # In decode mode or when only last token logits are materialized,
             # logits already represent only the last tokens
             log_probs = log_probs_tensor[:active_request_count]
 
             top_n_results = {}
             for req_idx in range(active_request_count):
-                top_n = int(self.top_n_logprobs_cuda[req_idx].item())
+                top_n = int(
+                    self._request_metadata["top_n_logprobs"][active_request_slice][req_idx].item()
+                )
                 if top_n > 0:
                     # Get top-n logprobs and indices for this request (single token)
                     top_n_logits = torch.topk(log_probs[req_idx], k=top_n)
@@ -789,9 +735,7 @@ def _dynamic_step_calculate_top_n_logprobs(
         # Note: logits may be padded, so we only take the first active_token_count tokens
         log_probs = log_probs_tensor[: context.active_token_count]
 
-        active_query_lengths = context.request_query_lengths[
-            context.paused_request_count : context.total_request_count
-        ]
+        active_query_lengths = context.request_query_lengths[active_request_slice]
 
         # Split log_probs across request boundaries
         # log_probs has shape [active_token_count, vocab_size]
@@ -799,12 +743,14 @@ def _dynamic_step_calculate_top_n_logprobs(
 
         top_n_results = {}
         for req_idx in range(active_request_count):
-            top_n = int(self.top_n_logprobs_cuda[req_idx].item())
+            top_n = int(
+                self._request_metadata["top_n_logprobs"][active_request_slice][req_idx].item()
+            )
             if top_n > 0:
                 request_log_probs = log_probs_per_request[
                     req_idx
                 ]  # [num_tokens_for_request, vocab_size]
-                skip_prompt = bool(self.skip_prompt_log_probs_cuda[req_idx].item())
+                skip_prompt = bool(self._request_metadata["skip_prompt_log_probs"][req_idx].item())
 
                 # If skip_prompt_log_probs is True, only compute for last token
                 if skip_prompt and request_log_probs.size(0) > 1:
@@ -825,9 +771,45 @@ def _dynamic_step_calculate_top_n_logprobs(
 
         return top_n_results if top_n_results else None
 
-    def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]:
+    def dummy_forward(self):
+        """Perform a dummy forward pass. This is used in expert model parallelism
+        on ranks that do not have any real requests."""
+
+        context = self.inference_wrapped_model.inference_context
+        # if no cuda graphs, directly use dummy forward
+        if not context.cuda_graph_batch_dimensions_list:
+            return self.inference_wrapped_model.dummy_forward()
+
+        # attempt to use cuda-graph if possible
+        # here we try to reuse the cuda-graph warmup code to run
+        # a dummy cuda graph.
+        input_ids, position_ids = self._dynamic_step_context_init(
+            # try to use the smallest cuda-graph config for dummy forward
+            construct_graph_dimensions=min(context.cuda_graph_batch_dimensions_list),
+            is_dummy_forward=True,
+        )
+
+        # _dynamic_step_context_init tries to find a cuda-graph that is compatible
+        # with all EP ranks. It can also return no match, in which case
+        # we run in eager mode.
+
+        if context.using_cuda_graph_this_step():
+            # we found a cuda-graph to run
+            self._dynamic_step_forward_logits(input_ids, position_ids)
+        else:
+            # fallback to eager dummy forward
+            self.inference_wrapped_model.dummy_forward()
+        context.reset()
+
+    def _dynamic_step_context_bookkeeping(self) -> Dict[str, Tensor]:
         """Update the dynamic inference context after sampling.
 
+        Args:
+            new_sample (Tensor): The newly sampled tokens.
+            request_metadata (Optional[Dict[str, Tensor]]): An override for the tensors
+                that manage request metadata, such as sampling parameters. By default, this
+                metadata is retrieved from the context.
+
         Return:
             Dict [str, Tensor]: A dictionary containing:
                 active_request_ids (Tensor): Current active request IDs.
@@ -835,13 +817,11 @@ def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]:
                 finished_request_ids (Tensor): Finished request IDs.
         """
         context = self.inference_wrapped_model.inference_context
-
         active_request_count = context.total_request_count - context.paused_request_count
+        active_request_slice = slice(context.paused_request_count, context.total_request_count)
 
         # Active sequence lengths.
-        active_request_ids = context.request_ids[
-            context.paused_request_count : context.total_request_count
-        ].long()
+        active_request_ids = context.request_ids[active_request_slice].long()
         active_sequence_lengths = context.get_active_sequence_lengths()
         active_sequence_lengths += 1  # Account for the token we just generated
         max_sequence_lengths = context.get_max_sequence_lengths()
@@ -849,24 +829,34 @@ def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]:
         # Request finished if termination_id or length >= max_sequence_length.
         # Note: termination_id tensor has per-request termination IDs from mixed sampling
         active_request_mask = (
-            self.sampled_tokens_cuda[:active_request_count]
-            != self.termination_id_cuda[:active_request_count]
+            self._sampled_tokens_cuda[:active_request_count]
+            != self._request_metadata["termination_id"][active_request_slice]
         ).byte() & torch.less(active_sequence_lengths, max_sequence_lengths).byte()
+
+        # Mark requests as finished if they hit stop words (detected in previous step's post_process_requests)
+        if self._get_stop_word_finished_ids_callback is not None:
+            request_ids_list = active_request_ids.tolist()
+            stop_word_finished_ids = self._get_stop_word_finished_ids_callback(request_ids_list)
+            if stop_word_finished_ids:
+                for idx, request_id in enumerate(request_ids_list):
+                    if request_id in stop_word_finished_ids:
+                        active_request_mask[idx] = 0
+
         finished_idxs = (
             torch.nonzero(active_request_mask == 0, as_tuple=True)[0] + context.paused_request_count
         )
         finished_request_ids = context.request_ids[finished_idxs]
 
         # New sample gets updated in update_requests, so we pass in a clone
-        new_sample_copy = new_sample.clone()
+        new_sample_copy = self._sampled_tokens_cuda[:active_request_count].clone()
 
         # Update requests.
-        newly_paused_request_ids = context.update_requests(active_request_mask, new_sample_copy)
+        update_result = context.update_requests(active_request_mask, new_sample_copy)
 
         return {
             "active_request_ids": active_request_ids,
-            "newly_paused_request_ids": newly_paused_request_ids,
             "finished_request_ids": finished_request_ids,
+            **(update_result or {}),
         }
 
     @torch.inference_mode()
@@ -888,6 +878,7 @@ async def async_generate_output_tokens_dynamic_batch(
                 cuda_graph_request_count (Optional[int]): Size of cuda graph used for this step.
         """
         context = self.inference_wrapped_model.inference_context
+        active_request_count = context.total_request_count - context.paused_request_count
 
         # No tokens?
         if context.active_token_count == 0:
@@ -910,11 +901,9 @@ async def async_generate_output_tokens_dynamic_batch(
         # NOTE [TDE]: This will be moved once CPU and GPU methods are separated.
         await asyncio.sleep(0)
 
+        return_log_probs, return_top_n_logprobs = self._dynamic_step_log_probs_bookkeeping()
         self._dynamic_step_sample_bookkeeping()
-        new_sample = self._dynamic_step_sample_logits(logits)
-
-        return_log_probs = self._dynamic_step_log_probs_bookkeeping()
-        return_top_n_logprobs = self._dynamic_step_top_n_logprobs_bookkeeping()
+        self._dynamic_step_sample_logits(logits)
 
         log_probs = None
         top_n_logprobs = None
@@ -928,10 +917,10 @@ async def async_generate_output_tokens_dynamic_batch(
         if skip_bookkeeping:
             request_bookkeeping = {}
         else:
-            request_bookkeeping = self._dynamic_step_context_bookkeeping(new_sample)
+            request_bookkeeping = self._dynamic_step_context_bookkeeping()
 
         ret = {
-            "sample": new_sample,
+            "sample": self._sampled_tokens_cuda[:active_request_count],
             "log_probs": log_probs,
             "top_n_logprobs": top_n_logprobs,
             "cuda_graph_request_count": cuda_graph_request_count,
@@ -1029,7 +1018,7 @@ def generate_all_output_tokens_static_batch(
         # Check whether CUDA graphs are enabled
         enable_cuda_graph = (
             model_config.cuda_graph_impl == "local"
-            and model_config.cuda_graph_scope != "full_iteration"
+            and CudaGraphScope.full_iteration not in model_config.cuda_graph_scope
         )
 
         # Pad batch tokens if necessary
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/__init__.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/__init__.py
new file mode 100644
index 00000000000..3ad54686261
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from .flask_server import run_flask_server
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/__init__.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/__init__.py
new file mode 100644
index 00000000000..f2b0661dace
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+
+try:
+    from .chat_completions import bp as ChatCompletions
+    from .completions import bp as Completions
+    from .health import bp as Health
+
+    __all__ = [Completions, ChatCompletions, Health]
+except ImportError:
+    __all__ = []
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py
new file mode 100644
index 00000000000..0c3379bc53f
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/chat_completions.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import asyncio
+import logging
+import time
+
+from megatron.core.inference.sampling_params import SamplingParams
+
+logger = logging.getLogger(__name__)
+
+try:
+    from flask import Blueprint, current_app, jsonify, request
+
+    bp = Blueprint('chat_completions_api', __name__)
+
+    @bp.route('/chat/completions', methods=['POST'])
+    @bp.route('/v1/chat/completions', methods=['POST'])
+    async def chat_completions():
+        """Handles async POST requests for chat completions."""
+        client = current_app.config['client']
+        tokenizer = current_app.config['tokenizer']
+
+        req = request.get_json()
+
+        # --- 1. Parse Messages ---
+        messages = req.get("messages")
+        if not messages:
+            return "Missing 'messages' field", 400
+        if not isinstance(messages, list):
+            return "'messages' must be a list", 400
+
+        try:
+            prompt_tokens = tokenizer.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True
+            )
+        except AttributeError:
+            return (
+                "Tokenizer does not support 'apply_chat_template'. "
+                "Chat completions requires a tokenizer with a configured chat template."
+            ), 500
+        except Exception as e:
+            return f"Error processing 'messages': {e}", 500
+
+        # --- 2. Parse Sampling Params ---
+        try:
+            temperature = float(req.get("temperature", 1.0))
+            top_p = float(req.get("top_p", 1.0))
+            top_k = int(req.get("top_k", 0))
+            n = int(req.get("n", 1))  # Number of choices to generate
+
+            if temperature == 0.0:
+                top_k = 1
+                top_p = 0.0
+
+            # Check for 'logprobs' (bool) and 'top_logprobs' (int)
+            return_log_probs = bool(req.get("logprobs", False))
+            top_n_logprobs = int(req.get("top_logprobs", 0)) if return_log_probs else 0
+
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                return_log_probs=return_log_probs,
+                top_n_logprobs=top_n_logprobs,
+                num_tokens_to_generate=int(req.get("max_tokens", 16)),
+            )
+        except ValueError as e:
+            return f"Invalid sampling parameter: {e}", 400
+
+        # --- 3. Send Requests to Engine ---
+        # For chat, we run the *same* prompt 'n' times.
+        tasks = []
+        for _ in range(n):
+            per_req_params = SamplingParams(
+                temperature=sampling_params.temperature,
+                top_k=sampling_params.top_k,
+                top_p=sampling_params.top_p,
+                return_log_probs=sampling_params.return_log_probs,
+                top_n_logprobs=sampling_params.top_n_logprobs,
+                num_tokens_to_generate=sampling_params.num_tokens_to_generate,
+            )
+            tasks.append(client.add_request(prompt_tokens, per_req_params))
+
+        start_time = time.perf_counter()
+        try:
+            batch_results = await asyncio.gather(*tasks)
+        except Exception as e:
+            return f"Error during inference: {e}", 500
+
+        logger.info(
+            f"Batch of {len(tasks)} requests (n={n}) processed in "
+            f"{time.perf_counter() - start_time:.2f}s"
+        )
+
+        # --- 4. Format OpenAI Response ---
+        choices = []
+        total_completion_tokens = 0
+        prompt_token_count = len(prompt_tokens)  # Calculated once
+
+        request_idx = 0
+        for record in batch_results:
+            for result in record.requests:
+                text_output = result.generated_text
+
+                logprobs_content = None
+                if sampling_params.return_log_probs:
+                    token_logprobs = getattr(result, 'log_probs', [])
+                    tokens = [tokenizer.detokenize([tok]) for tok in result.generated_tokens]
+
+                    # Get top_n_logprobs if available
+                    generated_top_n_logprobs = getattr(result, 'generated_top_n_logprobs', None)
+
+                    logprobs_content = []
+                    for i, (tok, lp) in enumerate(zip(tokens, token_logprobs)):
+                        # Build top_logprobs list for this token position
+                        top_logprobs_list = []
+                        if generated_top_n_logprobs and i < len(generated_top_n_logprobs):
+                            top_n_dict = generated_top_n_logprobs[i]
+                            for token_str, logprob in top_n_dict.items():
+                                top_logprobs_list.append(
+                                    {
+                                        "token": token_str,
+                                        "logprob": logprob,
+                                        "bytes": list(token_str.encode("utf-8")),
+                                    }
+                                )
+
+                        entry = {
+                            "token": tok,
+                            "logprob": lp,
+                            "bytes": list(tok.encode("utf-8")),
+                            "top_logprobs": top_logprobs_list,
+                        }
+                        logprobs_content.append(entry)
+
+                choice_data = {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": text_output},
+                    # 'logprobs' in chat API is an object containing 'content'
+                    "logprobs": {"content": logprobs_content} if logprobs_content else None,
+                    "finish_reason": "length",  # Original code hardcoded this.
+                }
+                choices.append(choice_data)
+                total_completion_tokens += len(result.generated_tokens)
+                request_idx += 0
+
+        response = {
+            "choices": choices,
+            "usage": {
+                "prompt_tokens": prompt_token_count,
+                "completion_tokens": total_completion_tokens,
+                "total_tokens": prompt_token_count + total_completion_tokens,
+            },
+        }
+        return jsonify(response)
+
+except ImportError as e:
+    logger.warning(f"Could not import flask: {e}")
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py
new file mode 100644
index 00000000000..6efdba5cdb2
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/common.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import threading
+
+import torch
+
+GENERATE_NUM = 0
+LOCK = threading.Lock()
+
+
+def send_do_generate():
+    """Broadcasts a message to perform a generation to all tensor parallel ranks."""
+    choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device=torch.cuda.current_device())
+    torch.distributed.broadcast(choice, 0)
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py
new file mode 100644
index 00000000000..b749205cdfd
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/completions.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import asyncio
+import logging
+import time
+
+from megatron.core.inference.sampling_params import SamplingParams
+
+logger = logging.getLogger(__name__)
+
+
+try:
+    from flask import Blueprint, current_app, jsonify, request
+
+    bp = Blueprint('completions_api', __name__)
+
+    @bp.route('/completions', methods=['POST'])
+    @bp.route('/v1/completions', methods=['POST'])
+    async def completions():
+        """Handles async POST requests for completions."""
+        client = current_app.config['client']
+        tokenizer = current_app.config['tokenizer']
+
+        req = request.get_json()
+
+        # --- 1. Parse Prompt ---
+        prompt_data = req.get("prompt")
+        if not prompt_data:
+            return "Missing 'prompt' field", 400
+
+        try:
+            if isinstance(prompt_data, str):
+                prompts_as_tokens = [tokenizer.tokenize(prompt_data)]
+                prompts_as_strings = [prompt_data]
+            elif isinstance(prompt_data, list):
+                if not prompt_data:
+                    return "'prompt' list is empty", 400
+                if all(isinstance(p, str) for p in prompt_data):
+                    prompts_as_tokens = [tokenizer.tokenize(p) for p in prompt_data]
+                    prompts_as_strings = prompt_data
+                elif all(isinstance(p, int) for p in prompt_data):
+                    prompts_as_tokens = [prompt_data]
+                    prompts_as_strings = [tokenizer.detokenize(prompt_data)]
+                elif all(
+                    isinstance(p, list) and all(isinstance(t, int) for t in p) for p in prompt_data
+                ):
+                    prompts_as_tokens = prompt_data
+                    prompts_as_strings = [tokenizer.detokenize(p) for p in prompt_data]
+                else:
+                    return (
+                        (
+                            "Invalid 'prompt' format. Must be str, list[str], "
+                            "list[int], or list[list[int]]"
+                        ),
+                        400,
+                    )
+            else:
+                return "Invalid 'prompt' type. Must be str or list", 400
+        except Exception as e:
+            return f"Error tokenizing prompt: {e}", 500
+
+        # --- 2. Parse Sampling Params ---
+        try:
+            temperature = float(req.get("temperature", 1.0))
+            top_p = float(req.get("top_p", 1.0))
+            top_k = int(req.get("top_k", 0))
+            echo = bool(req.get("echo", False))
+
+            if temperature == 0.0:
+                top_k = 1
+                top_p = 0.0
+
+            # Parse logprobs - can be an integer (number of top logprobs to return) or None
+            logprobs_param = req.get("logprobs", None)
+
+            if logprobs_param is not None:
+                top_n_logprobs = int(logprobs_param)
+                return_log_probs = True
+            else:
+                top_n_logprobs = 0
+                return_log_probs = False
+
+            # When echo=True and logprobs are requested, we need prompt logprobs
+            # skip_prompt_log_probs=False ensures the engine computes logprobs for prompt tokens
+            skip_prompt_log_probs = not (echo and return_log_probs)
+
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                return_log_probs=return_log_probs,
+                top_n_logprobs=top_n_logprobs,
+                skip_prompt_log_probs=skip_prompt_log_probs,
+                num_tokens_to_generate=int(req.get("max_tokens", 16)),
+            )
+        except ValueError as e:
+            return f"Invalid sampling parameter: {e}", 400
+
+        # --- 3. Send Requests to Engine ---
+        tasks = []
+        for prompt_tokens in prompts_as_tokens:
+            per_req_params = SamplingParams(
+                temperature=sampling_params.temperature,
+                top_k=sampling_params.top_k,
+                top_p=sampling_params.top_p,
+                return_log_probs=sampling_params.return_log_probs,
+                top_n_logprobs=sampling_params.top_n_logprobs,
+                skip_prompt_log_probs=sampling_params.skip_prompt_log_probs,
+                num_tokens_to_generate=sampling_params.num_tokens_to_generate,
+            )
+            tasks.append(client.add_request(prompt_tokens, per_req_params))
+
+        start_time = time.perf_counter()
+        try:
+            batch_results = await asyncio.gather(*tasks)
+        except Exception as e:
+            return f"Error during inference: {e}", 500
+
+        logger.info(
+            f"Batch of {len(tasks)} requests processed in {time.perf_counter() - start_time:.2f}s"
+        )
+
+        # --- 4. Format Response (matching old_completions.py) ---
+        choices = []
+
+        request_idx = 0
+        for record in batch_results:
+            for result in record.requests:
+                full_text = result.generated_text or ""
+                text_output = (prompts_as_strings[request_idx] + full_text) if echo else full_text
+
+                logprobs_data = None
+                if sampling_params.return_log_probs:
+                    # Get prompt tokens and logprobs
+                    prompt_tokens_list = []
+                    if result.prompt_tokens is not None:
+                        if hasattr(result.prompt_tokens, 'tolist'):
+                            prompt_tokens_list = result.prompt_tokens.tolist()
+                        else:
+                            prompt_tokens_list = list(result.prompt_tokens)
+
+                    prompt_log_probs = getattr(result, 'prompt_log_probs', None) or []
+                    prompt_top_n_logprobs = getattr(result, 'prompt_top_n_logprobs', None) or []
+
+                    # Get generated tokens and logprobs
+                    generated_tokens_list = (
+                        list(result.generated_tokens) if result.generated_tokens else []
+                    )
+                    generated_log_probs = getattr(result, 'generated_log_probs', None) or []
+                    generated_top_n_logprobs = (
+                        getattr(result, 'generated_top_n_logprobs', None) or []
+                    )
+
+                    if echo:
+                        # When echo=True, include prompt tokens and their logprobs
+                        # Prompt logprobs are for tokens [1:] (first token has no logprob)
+                        all_token_ids = prompt_tokens_list + generated_tokens_list
+                        tokens = [tokenizer.detokenize([tok]) for tok in all_token_ids]
+
+                        # Build token_logprobs: [None] for first token, then prompt logprobs,
+                        # then generated logprobs
+                        token_logprobs = [None] + list(prompt_log_probs) + list(generated_log_probs)
+
+                        # Build top_logprobs: [None] for first token, then prompt top_n,
+                        # then generated top_n
+                        top_logprobs = None
+                        if prompt_top_n_logprobs or generated_top_n_logprobs:
+                            top_logprobs = (
+                                [None]
+                                + list(prompt_top_n_logprobs)
+                                + list(generated_top_n_logprobs)
+                            )
+
+                        # Calculate text_offset: cumulative character positions starting from 0
+                        text_offset = []
+                        current_offset = 0
+                        for tok_str in tokens:
+                            text_offset.append(current_offset)
+                            current_offset += len(tok_str)
+                    else:
+                        # When echo=False, only return generated tokens and their logprobs
+                        tokens = [tokenizer.detokenize([tok]) for tok in generated_tokens_list]
+
+                        # Prepend [None] to match OpenAI format
+                        token_logprobs = [None] + list(generated_log_probs)
+
+                        # Build top_logprobs
+                        top_logprobs = None
+                        if generated_top_n_logprobs:
+                            top_logprobs = [None] + list(generated_top_n_logprobs)
+
+                        # Calculate text_offset for generated tokens only
+                        text_offset = []
+                        current_offset = 0
+                        for tok_str in tokens:
+                            text_offset.append(current_offset)
+                            current_offset += len(tok_str)
+
+                    logprobs_data = {
+                        "token_logprobs": token_logprobs,
+                        "tokens": tokens,
+                        "text_offset": text_offset,
+                        "top_logprobs": top_logprobs,
+                    }
+
+                choices.append(
+                    {"index": request_idx, "text": text_output, "logprobs": logprobs_data}
+                )
+                request_idx += 1
+
+        return jsonify({"choices": choices})
+
+except ImportError as e:
+    logger.warning(f"Could not import flask: {e}")
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py
new file mode 100644
index 00000000000..a9d0a678b44
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/endpoints/health.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+try:
+    from flask import Blueprint, current_app, jsonify
+
+    bp = Blueprint('health_api', __name__)
+
+    @bp.route('/health', methods=['GET'])
+    @bp.route('/v1/health', methods=['GET'])
+    async def health():
+        """
+        Handles GET requests for service health.
+        Checks if the inference client is initialized and reachable.
+        """
+        status_response = {"status": "ok", "service": "Megatron Inference Server", "ready": False}
+
+        try:
+            client = current_app.config.get('client')
+
+            if client is not None:
+                status_response["ready"] = True
+                return jsonify(status_response), 200
+            else:
+                logger.error("Health check failed: Client not found in app config.")
+                status_response["status"] = "error"
+                status_response["details"] = "Inference client not initialized"
+                return jsonify(status_response), 503
+
+        except Exception as e:
+            logger.error(f"Health check failed with exception: {e}")
+            return jsonify({"status": "error", "details": str(e)}), 500
+
+except ImportError as e:
+    logger.warning(f"Could not import flask: {e}")
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py
new file mode 100644
index 00000000000..2b0469b340a
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/flask_server.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import logging
+import socket
+from contextlib import contextmanager
+
+try:
+    from flask import Flask
+    from hypercorn.asyncio import serve
+    from hypercorn.config import Config
+
+    HAS_FLASK = True
+except ImportError as e:
+    HAS_FLASK = False
+
+import megatron.core.inference.text_generation_server.dynamic_text_gen_server.endpoints as endpoints
+from megatron.core.inference.inference_client import InferenceClient
+from megatron.core.utils import trace_async_exceptions
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def temp_log_level(level, logger=None):
+    """Enables temporarily overriding the logging level."""
+    logger = logger or logging.getLogger()
+    old_level = logger.level
+    logger.setLevel(level)
+    try:
+        yield
+    finally:
+        logger.setLevel(old_level)
+
+
+@trace_async_exceptions
+async def run_flask_server(coordinator_port: int, tokenizer, rank: int, flask_port: int):
+    """Initializes and runs the async Flask server."""
+    if not HAS_FLASK:
+        raise RuntimeError(f"Flask not available")
+
+    try:
+        hostname = socket.gethostname()
+    except Exception as e:
+        logger.warning(f"Could not get hostname: {e}")
+        hostname = "0.0.0.0"
+
+    inference_client = InferenceClient(coordinator_port)
+    await inference_client.start()
+    logger.info(f"Rank {rank}: InferenceClient connected.")
+
+    app = Flask(__name__)
+
+    # Store client and tokenizer in app config for Blueprints to use
+    app.config['client'] = inference_client
+    app.config['tokenizer'] = tokenizer
+
+    # Register all blueprints from the 'endpoints' package
+    for endpoint in endpoints.__all__:
+        app.register_blueprint(endpoint)
+
+    @app.route('/')
+    def health_check():
+        return "Megatron Dynamic Inference Server is running."
+
+    config = Config()
+    config.bind = [f"0.0.0.0:{flask_port}"]
+
+    # Force logging level to INFO to ensure that hostname is printed
+    with temp_log_level(logging.INFO, logger):
+        logger.info(f"Starting Flask server on http://{hostname}:{flask_port}")
+
+    try:
+        await serve(app, config)
+    finally:
+        await inference_client.stop()
+        logger.info(f"Rank {rank}: Flask server and client shut down.")
diff --git a/megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py
new file mode 100644
index 00000000000..ca645c8f7d6
--- /dev/null
+++ b/megatron/core/inference/text_generation_server/dynamic_text_gen_server/tokenization.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+"""Tokenization utilities."""
+
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.inference.communication_utils import broadcast_int_list, broadcast_tensor
+
+
+def tokenize_prompts(
+    tokenizer, prompts=None, tokens_to_generate=None, add_BOS=None, rank=0, data_parallel=False
+):
+    """Tokenize prompts and make them avaiable on all ranks.
+
+    Args:
+        data_parallel (bool): Broadcast tokens across a single data parallel model replica.
+    """
+
+    # On all ranks set to None so we can pass them to functions
+    sizes_list = None
+    prompts_tokens_cuda_long_tensor = None
+    prompts_length_cuda_long_tensor = None
+
+    # On the specified rank, build the above.
+    src_rank = torch.distributed.get_rank()
+    if data_parallel:
+        src_rank = parallel_state.get_data_parallel_src_rank()
+
+    if src_rank == rank:
+        assert prompts is not None
+        assert tokens_to_generate is not None
+        # Tensor of tokens padded and their unpadded length.
+        prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = (
+            _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, add_BOS)
+        )
+        # We need the sizes of these tensors for the boradcast
+        sizes_list = [
+            prompts_tokens_cuda_long_tensor.size(0),  # Batch size
+            prompts_tokens_cuda_long_tensor.size(1),
+        ]  # Sequence lenght
+
+    # First, broadcast the sizes.
+    sizes_tensor = broadcast_int_list(
+        2, int_list=sizes_list, rank=rank, data_parallel=data_parallel
+    )
+
+    # Now that we have the sizes, we can boradcast the tokens
+    # and length tensors.
+    sizes = sizes_tensor.tolist()
+    prompts_tokens_cuda_long_tensor = broadcast_tensor(
+        sizes,
+        torch.int64,
+        tensor=prompts_tokens_cuda_long_tensor,
+        rank=rank,
+        data_parallel=data_parallel,
+    )
+    prompts_length_cuda_long_tensor = broadcast_tensor(
+        sizes[0],
+        torch.int64,
+        tensor=prompts_length_cuda_long_tensor,
+        rank=rank,
+        data_parallel=data_parallel,
+    )
+
+    return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
+
+
+def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, add_BOS):
+    """Given a set of prompts and number of tokens to generate:
+    - tokenize prompts
+    - set the sequence length to be the max of length of prompts
+      plus the number of tokens we would like to generate
+    - pad all the sequences to this length so we can convert them
+      into a 2D tensor.
+    """
+
+    # Tokenize all the prompts.
+    if hasattr(tokenizer, 'eod'):
+        eod_token = tokenizer.eod
+    elif hasattr(tokenizer, 'eos_id'):
+        eod_token = tokenizer.eos_id
+    else:
+        raise AttributeError('No eod token found in Tokenizer')
+    if add_BOS:
+        prompts_tokens = [[eod_token] + tokenizer.tokenize(prompt) for prompt in prompts]
+    else:
+        prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
+
+    # Now we have a list of list of tokens which each list has a different
+    # size. We want to extend this list to:
+    #   - incorporate the tokens that need to be generated
+    #   - make all the sequences equal length.
+    # Get the prompts length.
+    prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
+    # Get the max prompts length.
+    max_prompt_len = max(prompts_length)
+    # Number of tokens in the each sample of the batch.
+    samples_length = max_prompt_len + tokens_to_generate
+    # Now update the list of list to be of the same size: samples_length.
+    for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
+        padding_size = samples_length - prompt_length
+        prompt_tokens.extend([eod_token] * padding_size)
+
+    # Now we are in a structured format, we can convert to tensors.
+    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.long, device='cuda')
+    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.long, device='cuda')
+
+    return prompts_tokens_tensor, prompts_length_tensor
diff --git a/megatron/core/inference/unified_memory.py b/megatron/core/inference/unified_memory.py
index 56073df063f..6b58e845812 100644
--- a/megatron/core/inference/unified_memory.py
+++ b/megatron/core/inference/unified_memory.py
@@ -1,12 +1,15 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+import ctypes
 import os
 import signal
+import threading
 import warnings
 from contextlib import contextmanager
 from enum import Enum, auto
 from pathlib import Path
 
+import torch
 from torch.cuda.memory import CUDAPluggableAllocator
 from torch.utils.cpp_extension import CUDA_HOME, load_inline
 
@@ -42,6 +45,10 @@ class UnifiedMemoryCompileTimeoutError(UnifiedMemoryUnsupportedError):
 _compilation_state = CompilationState.UNATTEMPTED
 _alloc = None  # must remain global until process exit.
 _mod = None  # must remain global until process exit.
+_so_path = None  # path to compiled extension .so (must remain global until exit).
+_ctypes_lib = None  # ctypes handle to compiled extension
+_ctypes_lock = threading.Lock()
+_compilation_error: str | None = None  # store last failure reason for better error messages
 
 
 @contextmanager
@@ -74,11 +81,19 @@ def _handler(signum, frame):
 def compile_allocator():
     """Attempt to compile UVM allocator."""
 
-    global _compilation_state, _alloc, _mod
+    global _compilation_state, _alloc, _mod, _so_path, _ctypes_lib, _compilation_error
 
     if _compilation_state != CompilationState.UNATTEMPTED:
         return
 
+    if not _has_mem_pool:
+        _compilation_state = CompilationState.FAILURE
+        _compilation_error = (
+            "PyTorch does not expose CUDA MemPool on this build/version. "
+            "UVM mempool requires torch.cuda.MemPool or torch.cuda.memory.MemPool."
+        )
+        return
+
     _mempool_c_src = r"""
     #include <cuda_runtime_api.h>
     #include <cstddef>
@@ -134,6 +149,59 @@ def compile_allocator():
       (void)size; (void)device; (void)stream;
       if (ptr) cudaFree(ptr);
     }
+
+    // Prefetch managed memory to a device (or to CPU with cudaCpuDeviceId == -1).
+    EXPORT int managed_prefetch(void* ptr, size_t size, int device, void* stream) {
+      cudaStream_t s = (cudaStream_t)stream;
+      cudaError_t err;
+      #if CUDART_VERSION >= 13000
+        cudaMemLocation location;
+        if (device == (int)-1) {
+          location.type = cudaMemLocationTypeHost;
+          location.id = 0;
+        } else {
+          location.type = cudaMemLocationTypeDevice;
+          location.id = device;
+        }
+        err = cudaMemPrefetchAsync(ptr, (size_t)size, location, 0, s);
+      #else
+        err = cudaMemPrefetchAsync(ptr, (size_t)size, device, s);
+      #endif
+      return (int)err;
+    }
+
+    // Update preferred location advice for managed memory (GPU device id, or CPU with cudaCpuDeviceId == -1).
+    EXPORT int managed_advise_preferred_location(void* ptr, size_t size, int device) {
+      cudaError_t err;
+      #if CUDART_VERSION >= 13000
+        cudaMemLocation location;
+        if (device == (int)-1) {
+          location.type = cudaMemLocationTypeHost;
+          location.id = 0;
+        } else {
+          location.type = cudaMemLocationTypeDevice;
+          location.id = device;
+        }
+        err = cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, location);
+      #else
+        err = cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+      #endif
+      return (int)err;
+    }
+
+    // Ensure a device is in the page table for this managed region.
+    EXPORT int managed_advise_accessed_by(void* ptr, size_t size, int device) {
+      cudaError_t err;
+      #if CUDART_VERSION >= 13000
+        cudaMemLocation location;
+        location.type = cudaMemLocationTypeDevice;
+        location.id = device;
+        err = cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, location);
+      #else
+        err = cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+      #endif
+      return (int)err;
+    }
     """
 
     # Define a timeout of 30s for how long the build is allowed to run.
@@ -160,14 +228,16 @@ def compile_allocator():
                 _cpa = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free")
                 _alloc = _cpa.allocator()
                 _compilation_state = CompilationState.SUCCESS
+                _compilation_error = None
         except (RuntimeError, ImportError, OSError, UnifiedMemoryCompileTimeoutError) as e:
+            _compilation_error = str(e)
             warnings.warn(f"Failed to create unified memory mempool: '{e}'.")
             _compilation_state = CompilationState.FAILURE
+            _so_path = None
+            _ctypes_lib = None
 
         # Synchronize failure state across ranks. (For currently unknown reasons,
         # one rank can show as FAILURE while the remaining ranks show as SUCCESS.)
-        import torch
-
         local_state = torch.tensor(
             [_compilation_state.value], dtype=torch.uint8, device=torch.cuda.current_device()
         )
@@ -193,6 +263,264 @@ def create_unified_mempool() -> "MemPool":
 
     # Return mempool.
     if _compilation_state != CompilationState.SUCCESS:
-        raise UnifiedMemoryUnsupportedError()
+        details = _compilation_error
+        if details is None:
+            details = "Unknown reason (allocator compilation did not succeed)."
+        raise UnifiedMemoryUnsupportedError(
+            "Unified virtual memory (UVM) mempool is unsupported or failed to initialize: "
+            + details
+        )
     else:
         return MemPool(allocator=_alloc)
+
+
+def _get_ctypes_lib() -> "ctypes.CDLL":
+    """Return a ctypes handle to the compiled UVM extension (.so)."""
+    global _ctypes_lib
+    compile_allocator()
+    if _compilation_state != CompilationState.SUCCESS or _so_path is None:
+        raise UnifiedMemoryUnsupportedError()
+    if _ctypes_lib is not None:
+        return _ctypes_lib
+    with _ctypes_lock:
+        if _ctypes_lib is None:
+            _ctypes_lib = ctypes.CDLL(_so_path)
+            # Configure argtypes/restype for exported helpers.
+            _ctypes_lib.managed_prefetch.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_size_t,
+                ctypes.c_int,
+                ctypes.c_void_p,
+            ]
+            _ctypes_lib.managed_prefetch.restype = ctypes.c_int
+            _ctypes_lib.managed_advise_preferred_location.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_size_t,
+                ctypes.c_int,
+            ]
+            _ctypes_lib.managed_advise_preferred_location.restype = ctypes.c_int
+            _ctypes_lib.managed_advise_accessed_by.argtypes = [
+                ctypes.c_void_p,
+                ctypes.c_size_t,
+                ctypes.c_int,
+            ]
+            _ctypes_lib.managed_advise_accessed_by.restype = ctypes.c_int
+    return _ctypes_lib
+
+
+def prefetch_managed_tensor(tensor, *, device: int, stream=None) -> None:
+    """Prefetch a CUDA tensor allocated from the UVM mempool to a specific device.
+
+    This uses `cudaMemPrefetchAsync` to physically migrate the pages backing the tensor.
+    The virtual address (pointer) remains unchanged, making this safe for use with
+    recorded CUDA graphs.
+
+    Args:
+        tensor (torch.Tensor): CUDA tensor allocated from the UVM mempool.
+        device (int): Target device ID. Use -1 (cudaCpuDeviceId) to prefetch to CPU.
+        stream (torch.cuda.Stream, optional): Stream to use for the asynchronous prefetch.
+            Defaults to the current stream.
+    """
+    if tensor is None:
+        return
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError("prefetch_managed_tensor expects a torch.Tensor")
+    if tensor.numel() == 0:
+        return
+    if not tensor.is_cuda:
+        raise ValueError("prefetch_managed_tensor expects a CUDA tensor")
+
+    lib = _get_ctypes_lib()
+    nbytes = tensor.nbytes
+    if stream is None:
+        stream = torch.cuda.current_stream()
+    # torch.cuda.Stream exposes a cuda_stream integer handle.
+    stream_ptr = ctypes.c_void_p(int(stream.cuda_stream))
+    err = lib.managed_prefetch(
+        ctypes.c_void_p(int(tensor.data_ptr())), ctypes.c_size_t(nbytes), int(device), stream_ptr
+    )
+    if err != 0:
+        raise RuntimeError(f"cudaMemPrefetchAsync failed with cudaError={err}")
+
+
+def advise_managed_tensor_preferred_location(tensor, *, device: int) -> None:
+    """Set the preferred physical location hint for a managed tensor.
+
+    This uses `cudaMemAdviseSetPreferredLocation`. It tells the CUDA driver where the
+    pages should ideally reside. Unlike prefetch, this is a hint and does not
+    immediately trigger migration unless the driver decides it is necessary.
+
+    Args:
+        tensor (torch.Tensor): CUDA tensor allocated from the UVM mempool.
+        device (int): Preferred device ID. Use -1 (cudaCpuDeviceId) for CPU.
+    """
+    if tensor is None:
+        return
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError("advise_managed_tensor_preferred_location expects a torch.Tensor")
+    if tensor.numel() == 0:
+        return
+    if not tensor.is_cuda:
+        raise ValueError("advise_managed_tensor_preferred_location expects a CUDA tensor")
+
+    lib = _get_ctypes_lib()
+    nbytes = tensor.nbytes
+    err = lib.managed_advise_preferred_location(
+        ctypes.c_void_p(int(tensor.data_ptr())), ctypes.c_size_t(nbytes), int(device)
+    )
+    if err != 0:
+        raise RuntimeError(f"cudaMemAdviseSetAccessedBy failed with cudaError={err}")
+
+
+def advise_managed_tensor_accessed_by(tensor, *, device: int) -> None:
+    """Hint that a specific device will access the managed tensor.
+
+    This uses `cudaMemAdviseSetAccessedBy`. It ensures that the mapping for this
+    memory region is established in the page tables of the specified device,
+    reducing page fault latency when the device first touches the data.
+
+    Args:
+        tensor (torch.Tensor): CUDA tensor allocated from the UVM mempool.
+        device (int): Device ID that will access the tensor. Must be a GPU ID.
+    """
+    if tensor is None:
+        return
+    if not isinstance(tensor, torch.Tensor):
+        raise TypeError("advise_managed_tensor_accessed_by expects a torch.Tensor")
+    if tensor.numel() == 0:
+        return
+    if not tensor.is_cuda:
+        raise ValueError("advise_managed_tensor_accessed_by expects a CUDA tensor")
+
+    lib = _get_ctypes_lib()
+    nbytes = tensor.nbytes
+    err = lib.managed_advise_accessed_by(
+        ctypes.c_void_p(int(tensor.data_ptr())), ctypes.c_size_t(nbytes), int(device)
+    )
+    if err != 0:
+        raise RuntimeError(f"cudaMemAdviseSetAccessedBy failed with cudaError={err}")
+
+
+def prefetch_managed_module_parameters(
+    module, *, device: int, include_buffers: bool = False
+) -> int:
+    """Prefetch all UVM-allocated parameters (and optionally buffers) of a module.
+
+    Iterates through all parameters of the module and initiates an asynchronous
+    migration to the target device. This is typically used to offload weights to
+    CPU during training or prefetch them to GPU before inference.
+
+    Args:
+        module (torch.nn.Module): The module containing UVM parameters.
+        device (int): Target device ID (-1 for CPU).
+        include_buffers (bool, optional): Whether to also prefetch module buffers.
+            Defaults to False.
+
+    Returns:
+        int: The total number of bytes for which prefetch was initiated.
+    """
+    if module is None:
+        return 0
+
+    # Avoid duplicate prefetch on shared tensors.
+    seen_ptrs: set[int] = set()
+    total_nbytes = 0
+    stream = torch.cuda.current_stream()
+
+    for name, p in module.named_parameters(recurse=True):
+        if p is None:
+            continue
+        t = p.data
+        if not isinstance(t, torch.Tensor) or not t.is_cuda or t.numel() == 0:
+            continue
+        ptr = int(t.data_ptr())
+        if ptr in seen_ptrs:
+            continue
+        seen_ptrs.add(ptr)
+        nbytes = t.nbytes
+        err = prefetch_managed_tensor(t, device=device, stream=stream)
+        if err:
+            raise RuntimeError(
+                f"cudaMemPrefetchAsync failed (cudaError={err}) for parameter '{name}': "
+                f"shape={tuple(t.shape)}, dtype={t.dtype}, device={t.device}, "
+                f"data_ptr=0x{t.data_ptr():x}, nbytes={nbytes}. "
+                "This tensor is not UVM-allocated."
+            )
+        total_nbytes += nbytes
+
+    if include_buffers:
+        for name, b in module.named_buffers(recurse=True):
+            if b is None:
+                continue
+            if not isinstance(b, torch.Tensor) or not b.is_cuda or b.numel() == 0:
+                continue
+            ptr = int(b.data_ptr())
+            if ptr in seen_ptrs:
+                continue
+            seen_ptrs.add(ptr)
+            nbytes = b.nbytes
+            err = prefetch_managed_tensor(b, device=device, stream=stream)
+            if err:
+                raise RuntimeError(
+                    f"cudaMemPrefetchAsync failed (cudaError={err}) for buffer '{name}': "
+                    f"shape={tuple(b.shape)}, dtype={b.dtype}, device={b.device}, "
+                    f"data_ptr=0x{b.data_ptr():x}, nbytes={nbytes}. "
+                    "This tensor is not UVM-allocated."
+                )
+            total_nbytes += nbytes
+
+    return total_nbytes
+
+
+def advise_managed_module_parameters_preferred_location(
+    module, *, device: int, include_buffers: bool = False
+) -> None:
+    """Set the preferred physical location hint for all UVM parameters in a module.
+
+    Args:
+        module (torch.nn.Module): The module containing UVM parameters.
+        device (int): Preferred device ID (-1 for CPU).
+        include_buffers (bool, optional): Whether to also advise on module buffers.
+            Defaults to False.
+    """
+    if module is None:
+        return
+
+    seen_ptrs: set[int] = set()
+    for name, p in module.named_parameters(recurse=True):
+        if p is None:
+            continue
+        t = p.data
+        if not isinstance(t, torch.Tensor) or not t.is_cuda or t.numel() == 0:
+            continue
+        ptr = int(t.data_ptr())
+        if ptr in seen_ptrs:
+            continue
+        seen_ptrs.add(ptr)
+        err = advise_managed_tensor_preferred_location(t, device=device)
+        if err:
+            raise RuntimeError(
+                f"cudaMemAdviseSetPreferredLocation failed (cudaError={err}) for param '{name}': "
+                f"shape={tuple(t.shape)}, dtype={t.dtype}, device={t.device}, "
+                f"data_ptr=0x{t.data_ptr():x}, nbytes={t.nbytes}. "
+                "This tensor is not UVM-allocated."
+            )
+
+    if include_buffers:
+        for name, b in module.named_buffers(recurse=True):
+            if b is None:
+                continue
+            if not isinstance(b, torch.Tensor) or not b.is_cuda or b.numel() == 0:
+                continue
+            ptr = int(b.data_ptr())
+            if ptr in seen_ptrs:
+                continue
+            seen_ptrs.add(ptr)
+            err = advise_managed_tensor_preferred_location(b, device=device)
+            if err:
+                raise RuntimeError(
+                    f"cudaMemAdviseSetPreferredLocation failed (err={err}) for buf '{name}': "
+                    f"shape={tuple(b.shape)}, dtype={b.dtype}, device={b.device}, "
+                    f"data_ptr=0x{b.data_ptr():x}, nbytes={b.nbytes}. "
+                    "This tensor is not UVM-allocated."
+                )
diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py
index 55536a52088..0bdaff64be1 100644
--- a/megatron/core/inference/utils.py
+++ b/megatron/core/inference/utils.py
@@ -139,10 +139,8 @@ def tensor_swap(x, src_idxs, dst_idxs):
     x[dst_idxs], x[src_idxs] = x[src_idxs], x[dst_idxs]
 
 
-async def await_process_event(
-    event: multiprocessing.Event, process: multiprocessing.Process, timeout: float = 1.0
-) -> None:
-    """Repeatedly wait for a multiprocessing event to be set, aborting upon process failure.
+async def await_process_call(call, process: multiprocessing.Process, timeout: float = 1.0):
+    """Repeatedly wait for a multiprocessing callable to resolve, aborting upon process failure.
 
     Note that the timeout in this function is only for checking process liveness.
     Its value should be set to a relatively high number. The only problem a high timeout
@@ -155,8 +153,7 @@ async def await_process_event(
         timeout: The timeout for each wait iteration in seconds.
     """
     while True:
-        signal = await asyncio.to_thread(event.wait, timeout)
-        if signal:
+        if await asyncio.to_thread(call, timeout):
             return
         if not process.is_alive():
             raise RuntimeError(
diff --git a/megatron/core/jit.py b/megatron/core/jit.py
index b1aa3e0b611..b67810f2e34 100644
--- a/megatron/core/jit.py
+++ b/megatron/core/jit.py
@@ -7,12 +7,27 @@
 jit_fuser = torch.jit.script
 # nvFuser is deprecated in PyTorch JIT starting from 2.2
 
-try:
-    if is_torch_min_version("2.2.0a0"):
-        jit_fuser = torch.compile
-except ImportError:
 
-    def noop_decorator(func):
-        return func
+def noop_decorator(func):
+    '''No-op decorator'''
+    return func
 
+
+def enable_jit_fuser():
+    '''Enable the JIT fuser'''
+    global jit_fuser
+    try:
+        if is_torch_min_version("2.2.0a0"):
+            jit_fuser = torch.compile
+    except ImportError:
+
+        jit_fuser = noop_decorator
+
+
+def disable_jit_fuser():
+    '''Disable the JIT fuser'''
+    global jit_fuser
     jit_fuser = noop_decorator
+
+
+enable_jit_fuser()
diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py
index d57679aab3d..0f0c1a060fb 100644
--- a/megatron/core/model_parallel_config.py
+++ b/megatron/core/model_parallel_config.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import warnings
-from dataclasses import dataclass
-from typing import Callable, ContextManager, Optional
+from dataclasses import dataclass, field
+from typing import Callable, ContextManager, Literal, Optional
 
 import torch
 
@@ -20,7 +20,7 @@ class ModelParallelConfig:
     tensor_model_parallel_size: int = 1
     """Intra-layer model parallelism. Splits tensors across GPU ranks."""
 
-    pipeline_model_parallel_comm_backend: Optional[str] = None
+    pipeline_model_parallel_comm_backend: Optional[Literal["nccl", "ucc"]] = None
     """Configuring backend option of pipeline parallel communication (e.g., nccl, ucc)
        If None, the default backend will be used.
     """
@@ -53,11 +53,29 @@ class ModelParallelConfig:
        type.
     """
 
+    max_seqlen_per_dp_cp_rank: Optional[int] = None
+    """
+    Maximum sequence length per DPxCP rank. This is the maximum sequence length each rank
+    can handle without overflowing the memory. Typically, a good starting point is to set this
+    to maximum sequence length / context parallel size.
+    This is used to calculate the number and length of sub-samples assigned to 
+    each rank when using hybrid_context_parallel.
+    """
+
+    hybrid_context_parallel: bool = False
+    """
+    If true, enables hybrid context parallel. This is used to balance the workload of 
+    each CP rank when we use packed samples with variable sequence lengths.
+    Please set max_seqlen_per_dp_cp_rank when using hybrid_context_parallel.
+    """
+
     expert_model_parallel_size: int = 1
     """Distributes Moe Experts across sub data parallel dimension."""
 
     expert_tensor_parallel_size: Optional[int] = None
-    """Intra-layer tensor model parallelsm for expert layer. Splits tensors across GPU ranks."""
+    """Intra-layer tensor model parallelism for expert layer. Splits tensors across GPU ranks.
+       Default is None, which will be set to the value of tensor_model_parallel_size.
+    """
 
     moe_extended_tp: bool = False
     """NOTE: Deprecated from MCore v0.10. This flag is ignored.
@@ -67,12 +85,16 @@ class ModelParallelConfig:
     ###################
     # Initialization
     ###################
-    perform_initialization: bool = True
-    """If true, weights are initialized. This option can be useful when you know you are going to
+    perform_initialization: bool = field(
+        default=True, metadata={"argparse_meta": {"arg_names": ["--no-initialization"]}}
+    )
+    """Controls weights initialization. This option can be useful when you know you are going to
        load values from a checkpoint.
     """
 
-    use_cpu_initialization: bool = False
+    use_cpu_initialization: bool = field(
+        default=False, metadata={"argparse_meta": {"default": None}}
+    )
     """When set to False, we initialize the weights directly on the GPU. CPU initialization is the
        same regardless of tensor model parallelism, but GPU initialization is not. Transferring
        weights from CPU to GPU can take a significant amount of time for large models.
@@ -151,11 +173,14 @@ class ModelParallelConfig:
        must turn off gradient accumulation fusion.
     """
 
-    async_tensor_model_parallel_allreduce: bool = False
+    async_tensor_model_parallel_allreduce: bool = True
     """NOTE: Deprecated. This flag is ignored."""
 
-    use_te_rng_tracker: bool = False
+    use_te_rng_tracker: bool = field(
+        default=False, metadata={"argparse_meta": {"arg_names": ["--te-rng-tracker"]}}
+    )
     """If true, uses RNG state tracker in TransformerEngine if exists.
+    Required for CUDA graphs support.
     """
 
     tp_comm_overlap: bool = False
@@ -165,22 +190,22 @@ class ModelParallelConfig:
     """
 
     tp_comm_bulk_wgrad: bool = True
-    """If true, allows All-Gather overlap with Bprop activation gradient GEMM. Don't care if
+    """Controls All-Gather overlap with Bprop activation gradient GEMM. Don't care if
        tp_comm_overlap is False.
     """
 
     tp_comm_bulk_dgrad: bool = True
-    """If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't care if
+    """Controls Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't care if
        tp_comm_overlap is False.
     """
 
     tp_comm_overlap_ag: bool = True
-    """If true, allows All-Gather overlap with GEMM by pipelining the GEMM and All-Gather.
+    """Controls All-Gather overlap with GEMM by pipelining the GEMM and All-Gather.
        Don't care if tp_comm_overlap is False.
     """
 
     tp_comm_overlap_rs: bool = True
-    """If true, allows Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter.
+    """Controls Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter.
        Don't care if tp_comm_overlap is False.
     """
 
@@ -191,7 +216,7 @@ class ModelParallelConfig:
 
     tp_comm_split_ag: bool = True
     """Deprecated from TransformerEngine v1.6.0.
-       If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather
+       Controls All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather
        splits. Don't care if tp_comm_overlap is False.
     """
 
@@ -203,7 +228,7 @@ class ModelParallelConfig:
 
     tp_comm_split_rs: bool = True
     """Deprecated from TransformerEngine v1.6.0.
-       If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
+       Controls Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
        Reduce-Scatter splits. Don't care if tp_comm_overlap is False.
     """
 
@@ -218,7 +243,7 @@ class ModelParallelConfig:
        Defaults to False.
     """
 
-    cross_entropy_fusion_impl: str = 'native'
+    cross_entropy_fusion_impl: Literal['native', 'te'] = 'native'
     """If 'native', MCore based CE loss fusion is used, if 'te', Parallel CE loss
        from Transformer Engine library is used. Defaults to 'native'.
     """
@@ -233,10 +258,8 @@ class ModelParallelConfig:
        If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled
     """
 
-    tp_comm_bootstrap_backend: str = 'nccl'
-    """
-       Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo'
-    """
+    tp_comm_bootstrap_backend: Literal['nccl', 'mpi', 'gloo'] = 'nccl'
+    """Set the bootstrapping backend of Tensor parallel communications."""
 
     overlap_moe_expert_parallel_comm: bool = False
     """Overlap EP A2A communications with independent computations of different micro-batches
@@ -246,6 +269,19 @@ class ModelParallelConfig:
     delay_wgrad_compute: bool = False
     """Delay the weight gradient computation to improve batch-level communication overlapping"""
 
+    ep_overlap_early_attn_memory_release: bool = False
+    """Enable early memory release of attention activations during EP overlap.
+    EP overlap can increase peak memory usage when the overlapped forward module allocates 
+    more memory than what is freed by the backward module. This flag addresses this by 
+    reordering the attention backward pass to occur earlier in the schedule.
+    Specifically:
+    - Without this flag: attn_bwd executes after moe_combine_fwd
+    - With this flag: attn_bwd executes before mlp_fwd
+    The earlier execution releases attention activations sooner, reducing peak memory.
+    Note: This may impact performance as moe_combine_fwd and moe_dispatch_bwd become 
+    exposed (not overlapped with other computation).
+    """
+
     ###################
     # Pipeline Parallel
     ###################
@@ -295,13 +331,21 @@ class ModelParallelConfig:
        Defaults to 0, which means all micro-batches are deferred.
     """
 
-    overlap_p2p_comm_warmup_flush: bool = False
+    overlap_p2p_comm_warmup_flush: bool = field(
+        default=False,
+        metadata={"argparse_meta": {"arg_names": ["--overlap-p2p-communication-warmup-flush"]}},
+    )
     """If true, overlap communication and computation in warm up and flush phase.
        Only valid when overlap_p2p_comm is True and batch_p2p_comm is False. 
        Defaults to False.
     """
 
-    microbatch_group_size_per_vp_stage: Optional[int] = None
+    microbatch_group_size_per_vp_stage: Optional[int] = field(
+        default=None,
+        metadata={
+            "argparse_meta": {"arg_names": ["--microbatch-group-size-per-virtual-pipeline-stage"]}
+        },
+    )
     """This value specifies the number of micro-batches that are executed 
        at a time for a given virtual stage (both forward and backward).
        Default (in __post_init__() method below) to pipeline_parallel_size 
@@ -316,6 +360,10 @@ class ModelParallelConfig:
        rank 1 |   0 1 2 0 1 2 3 4 3 4
     """
 
+    mtp_standalone: bool = False
+    """This will be set automatically according to the pipeline layout, 
+    and will be set to True if MTP is in a separate vpp stage."""
+
     ###################
     # CPU Offloading
     ###################
@@ -344,8 +392,11 @@ class ModelParallelConfig:
     ###################
     # Timing
     ###################
-    barrier_with_L1_time: bool = True
-    """If true, use barrier with level 1 time measurements. It is up to the user to make sure
+    barrier_with_L1_time: bool = field(
+        default=True,
+        metadata={"argparse_meta": {"arg_names": ["--no-barrier-with-level-1-timing"]}},
+    )
+    """Controls barrier with level 1 time measurements. It is up to the user to make sure
        calling barrier with their timers will not result in hangs. This can happen if for example
        the user adds a level 1 timer that is not called by all ranks.
     """
diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py
index a3ad84a21d6..50aecf0a950 100644
--- a/megatron/core/models/T5/t5_spec.py
+++ b/megatron/core/models/T5/t5_spec.py
@@ -14,6 +14,7 @@
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from megatron.core.typed_torch import not_none
 
 try:
     import transformer_engine as te  # pylint: disable=unused-import
@@ -28,6 +29,13 @@
 
     HAVE_TE = True
 except ImportError:
+    (
+        TEColumnParallelLinear,
+        TEDotProductAttention,
+        TELayerNormColumnParallelLinear,
+        TENorm,
+        TERowParallelLinear,
+    ) = (None, None, None, None, None)
     HAVE_TE = False
 
 try:
@@ -57,8 +65,8 @@ def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
                 module=SelfAttention,
                 params={"attn_mask_type": AttnMaskType.padding},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,
@@ -86,8 +94,8 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
                 module=SelfAttention,
                 params={"attn_mask_type": AttnMaskType.causal},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,
@@ -99,9 +107,9 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
                 module=CrossAttention,
                 params={"attn_mask_type": AttnMaskType.padding},
                 submodules=CrossAttentionSubmodules(
-                    linear_q=TEColumnParallelLinear,
-                    linear_kv=TEColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_q=not_none(TEColumnParallelLinear),
+                    linear_kv=not_none(TEColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                 ),
             ),
diff --git a/megatron/core/models/backends.py b/megatron/core/models/backends.py
index 29169285b3e..7f84599a04c 100644
--- a/megatron/core/models/backends.py
+++ b/megatron/core/models/backends.py
@@ -153,7 +153,7 @@ def fuse_layernorm_and_linear(self) -> bool:
         """TE backend chooses a single module for layernorm and linear"""
         return True
 
-    def column_parallel_layer_norm_linear(self) -> Optional[type]:
+    def column_parallel_layer_norm_linear(self) -> type[InferenceLayerNormColumnParallelLinear]:
         """Which module for sequential layernorm and linear"""
         return InferenceLayerNormColumnParallelLinear
 
@@ -166,7 +166,7 @@ def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type:
             return FusedLayerNorm
         return TENorm
 
-    def core_attention(self) -> type:
+    def core_attention(self) -> type[TEDotProductAttention]:
         """Which module to use for attention"""
         return TEDotProductAttention
 
diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py
index 69cec788b2c..8415ef02cc5 100644
--- a/megatron/core/models/bert/bert_layer_specs.py
+++ b/megatron/core/models/bert/bert_layer_specs.py
@@ -10,6 +10,7 @@
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from megatron.core.typed_torch import not_none
 
 try:
     import transformer_engine as te  # pylint: disable=unused-import
@@ -22,6 +23,11 @@
 
     HAVE_TE = True
 except ImportError:
+    (TEDotProductAttention, TELayerNormColumnParallelLinear, TERowParallelLinear) = (
+        None,
+        None,
+        None,
+    )
     HAVE_TE = False
 
 try:
@@ -57,8 +63,8 @@ def get_bert_layer_with_transformer_engine_spec():
                 module=SelfAttention,
                 params={"attn_mask_type": AttnMaskType.padding},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
+                    linear_qkv=not_none(TELayerNormColumnParallelLinear),
+                    core_attention=not_none(TEDotProductAttention),
                     linear_proj=TERowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,
diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py
index b7b9bfc73f3..abe9bc1c9b7 100644
--- a/megatron/core/models/bert/bert_model.py
+++ b/megatron/core/models/bert/bert_model.py
@@ -14,6 +14,7 @@
 from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.models.common.language_module.language_module import LanguageModule
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.dot_product_attention import (
     DotProductAttention as MCoreDotProductAttention,
 )
@@ -74,8 +75,9 @@ def __init__(
         add_binary_head=True,
         return_embeddings=False,
         vp_stage: Optional[int] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
-        super(BertModel, self).__init__(config=config)
+        super(BertModel, self).__init__(config=config, pg_collection=pg_collection)
 
         if has_config_logger_enabled(config):
             log_config_to_disk(config, locals(), prefix=type(self).__name__)
diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py
index afecb72bdb8..20fcb264643 100644
--- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py
+++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py
@@ -25,7 +25,7 @@
     apply_rotary_pos_emb,
     get_pos_emb_on_this_cp_rank,
 )
-from megatron.core.utils import deprecate_inference_params
+from megatron.core.utils import deprecate_inference_params, internal_api
 
 logger = logging.getLogger(__name__)
 
@@ -147,14 +147,12 @@ def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, Tensor):
         sin = torch.sin(freqs)
         return cos, sin
 
-    @lru_cache(maxsize=32)
-    def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor:
-        """Forward pass of RoPE embedding.
+    def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor:
+        """Forward pass of RoPE embedding before CP sharding.
 
         Args:
             max_seq_len (int): Maximum size of sequence
             offset (int, optional): RoPE offset. Defaults to 0.
-            packed_seq (bool, optional): Whether to use packed sequence. Defaults to False.
 
         Returns:
             Tensor: Embeddings after applying RoPE.
@@ -174,10 +172,37 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -
             )
         # emb [seq_length, .., dim]
         emb = emb[:, None, None, :]
-        if self.cp_group is not None and self.cp_group.size() > 1 and not packed_seq:
-            # slice rotary_pos_emb along sequence dimension and select the parition of the current
-            # CP rank
-            emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group)
+        return emb
+
+    @lru_cache(maxsize=32)
+    @internal_api
+    def forward(
+        self,
+        max_seq_len: int,
+        offset: int = 0,
+        packed_seq: bool = False,
+        cp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ) -> Tensor:
+        """Forward pass of RoPE embedding.
+
+        Args:
+            max_seq_len (int): Maximum size of sequence
+            offset (int, optional): RoPE offset. Defaults to 0.
+            packed_seq (bool, optional): Whether to use packed sequence. Defaults to False.
+            cp_group (torch.distributed.ProcessGroup, optional): Context parallel group.
+                Defaults to None.
+
+        Returns:
+            Tensor: Embeddings after applying RoPE.
+        """
+        emb = self.get_emb(max_seq_len, offset)
+        if cp_group is None:
+            cp_group = self.cp_group
+        if cp_group is not None and cp_group.size() > 1 and not packed_seq:
+            # slice rotary_pos_emb along sequence dimension
+            # and select the parition of the current CP rank
+            emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group)
+
         return emb
 
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
@@ -279,13 +304,20 @@ def __init__(
             else parallel_state.get_context_parallel_group(check_initialized=False)
         )
 
-    def forward(self, position_ids: torch.Tensor, mrope_section: List[int]) -> Tensor:
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        mrope_section: List[int],
+        cp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ) -> Tensor:
         """Forward pass of multimodal RoPE embedding.
 
         Args:
             position_ids (torch.Tensor): A postion_id tensor with shape [3, batchsize, seqlens]
             mrope_section (list[int]): Multimodal rope section is for channel dimension of temporal,
                 height and width in rope calculation.
+            cp_group (torch.distributed.ProcessGroup, optional): Context parallel group.
+                Defaults to None.
 
         Returns:
             Tensor: Embeddings after applying RoPE.
@@ -318,8 +350,10 @@ def forward(self, position_ids: torch.Tensor, mrope_section: List[int]) -> Tenso
 
         # shape (seq_length, bs, 1, 2 * dim)
         emb = emb[..., None, :].transpose(0, 1).contiguous()
-        if self.cp_group is not None and self.cp_group.size() > 1:
+        if cp_group is None:
+            cp_group = self.cp_group
+        if cp_group is not None and cp_group.size() > 1:
             # slice rotary_pos_emb along sequence dimension and select the parition of the current
             # CP rank
-            emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group)
+            emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group)
         return emb
diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
index 79bcd144f30..166ef9b41e7 100644
--- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
+++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
@@ -13,6 +13,7 @@
 from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.transformer import TransformerConfig
+from megatron.core.utils import internal_api
 
 logger = logging.getLogger(__name__)
 
@@ -102,14 +103,12 @@ def __init__(
             # method causes a memory leak in NeMo-RL.
             self.forward.cache_clear()
 
-    @lru_cache(maxsize=32)
-    def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor:
+    def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor:
         """Forward pass of Yarn Rotary Embedding.
 
         Args:
             max_seq_len (int): Maximum size of sequence
             offset (int, optional): RoPE offset. Defaults to 0.
-            packed_seq (bool, optional): Whether to use packed sequence. Defaults to False.
 
         Returns:
             Tensor: Embeddings after applying Yarn RoPE.
@@ -155,10 +154,36 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -
         emb = torch.cat((freqs, freqs), dim=-1)
         # emb [seq_length, .., dim]
         emb = emb[:, None, None, :]
-        if self.cp_group is not None and self.cp_group.size() > 1 and not packed_seq:
+        return emb, _mscale
+
+    @lru_cache(maxsize=32)
+    @internal_api
+    def forward(
+        self,
+        max_seq_len: int,
+        offset: int = 0,
+        packed_seq: bool = False,
+        cp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ) -> Tensor:
+        """Forward pass of Yarn Rotary Embedding.
+
+        Args:
+            max_seq_len (int): Maximum size of sequence
+            offset (int, optional): RoPE offset. Defaults to 0.
+            packed_seq (bool, optional): Whether to use packed sequence. Defaults to False.
+            cp_group (torch.distributed.ProcessGroup, optional): Context parallel group.
+                Defaults to None.
+
+        Returns:
+            Tensor: Embeddings after applying Yarn RoPE.
+        """
+        emb, _mscale = self.get_emb(max_seq_len, offset)
+        if cp_group is None:
+            cp_group = self.cp_group
+        if cp_group is not None and cp_group.size() > 1 and not packed_seq:
             # slice rotary_pos_emb along sequence dimension
             # and select the parition of the current CP rank
-            emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group)
+            emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group)
         return emb, _mscale
 
     def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False):
diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index e836fad4ed2..f65cae70110 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import logging
 import os
 from typing import Optional, Tuple
@@ -21,10 +21,15 @@
     is_vp_last_stage,
 )
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import is_te_min_version, make_tp_sharded_tensor_for_checkpoint
+from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group
+from megatron.core.utils import (
+    get_tensor_model_parallel_group_if_none,
+    is_te_min_version,
+    make_tp_sharded_tensor_for_checkpoint,
+)
 from megatron.plugin.decorators import plugin_method
 
 
@@ -45,6 +50,7 @@ def __init__(
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
         self.pg_collection = pg_collection
         self.cp_group = pg_collection.cp
+        self.tp_group = get_tensor_model_parallel_group_if_none(pg_collection.tp)
         self.pp_group = pg_collection.pp
         assert hasattr(self.pg_collection, 'embd'), (
             "pg_collection must have a embd. In previous version, it used default "
@@ -64,6 +70,8 @@ def _is_in_embd_group(self):
         if torch.distributed.get_rank() in torch.distributed.get_process_group_ranks(
             self.embd_group
         ):
+            if getattr(self, 'mtp_process', False):
+                return True
             if (
                 torch.distributed.get_rank()
                 == torch.distributed.get_process_group_ranks(self.embd_group)[0]
@@ -138,7 +146,7 @@ def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor:
                     # Use is_cg_capturable=True for full iteration CUDA graphs to avoid torch.equal checks
                     is_cg_capturable = (
                         hasattr(self.config, 'cuda_graph_scope')
-                        and self.config.cuda_graph_scope == 'full_iteration'
+                        and CudaGraphScope.full_iteration in self.config.cuda_graph_scope
                     )
                     if is_cg_capturable and not is_te_min_version("2.7.0"):
                         from megatron.core.utils import get_te_version
@@ -204,7 +212,10 @@ def setup_embeddings_and_output_layer(self) -> None:
         ):
             self.shared_embedding_or_output_weight().shared_embedding = True
 
-        if (self.post_process or getattr(self, 'mtp_process', False)) and not self.pre_process:
+        if (
+            (self.post_process and self.share_embeddings_and_output_weights)
+            or getattr(self, 'mtp_process', False)
+        ) and not self.pre_process:
             assert not (
                 is_vp_first_stage(self.vp_stage, self.vp_size) and is_pp_first_stage(self.pp_group)
             )
@@ -275,6 +286,10 @@ def sharded_state_dict(
             ShardedStateDict: sharded state dict for the LanguageModel
         """
         assert not sharded_offsets, "Unexpected sharded offsets"
+
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
         sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
 
         first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight'
@@ -283,7 +298,7 @@ def sharded_state_dict(
 
         if self.share_embeddings_and_output_weights:
             self.tie_embeddings_and_output_weights_state_dict(
-                sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key
+                sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key, metadata
             )
         elif self.post_process:
             # Make sure the output layer follows the embeddings padding logic
@@ -300,6 +315,7 @@ def tie_embeddings_and_output_weights_state_dict(
         sharded_state_dict: ShardedStateDict,
         output_layer_weight_key: str,
         first_stage_word_emb_key: str,
+        metadata: dict = {},
     ) -> None:
         """Ties the embedding and output weights in a given sharded state dict.
 
@@ -344,4 +360,6 @@ def tie_embeddings_and_output_weights_state_dict(
             key=first_stage_word_emb_key,
             replica_id=last_stage_word_emb_replica_id,
             allow_shape_mismatch=True,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py
index d501c11a0a9..033e8e808f9 100644
--- a/megatron/core/models/common/model_chunk_schedule_plan.py
+++ b/megatron/core/models/common/model_chunk_schedule_plan.py
@@ -14,7 +14,6 @@
     get_comm_stream,
     get_comp_stream,
 )
-from megatron.core.transformer.multi_token_prediction import get_mtp_num_layers_to_build
 
 
 class ModelChunkState:
@@ -35,23 +34,20 @@ class TransformerLayerSchedulePlan:
     mtp post process nodes.
 
     layer (TransformerLayerSchedulePlan)
-    ├── attn (TransformerLayerNode): attention module
-    ├── post_attn (TransformerLayerNode): layernorm -> router -> dispatch preprocess
+    ├── attn (TransformerLayerNode): attention -> layernorm -> router -> dispatch preprocess
     ├── moe_dispatch (TransformerLayerNode): dispatch All2All
     ├── mlp (TransformerLayerNode): mlp module
     ├── moe_combine (TransformerLayerNode): combine All2All
     └── mtp_post_process (PostProcessNode): mtp post process
 
     Note that MTP layer has the same operation and execution order with TransformerLayer regarding
-    post_attn, moe_dispatch, mlp, moe_combine, but contains extra operations in attn and
-    mtp_post_process:
+    moe_dispatch, mlp, moe_combine, but contains extra operations in attn and mtp_post_process:
     * mtp.attn wraps around transformer_layer.attn with extra norm, proj and embedding operations.
     * mtp.mtp_post_process contains output_layer, mtp loss operations, whereas
       transformer_layer.mtp_post_process is empty.
     """
 
     attn = None
-    post_attn = None
     moe_dispatch = None
     mlp = None
     moe_combine = None
@@ -75,6 +71,7 @@ def __init__(self, layer, event, chunk_state, comp_stream, comm_stream, extra_ar
         """
         from megatron.core.models.gpt.fine_grained_callables import TransformerLayerState
 
+        self.config = layer.config
         self.layer_state = TransformerLayerState()
         self.chunk_state = chunk_state
         self.layer = layer
@@ -85,10 +82,33 @@ def __init__(self, layer, event, chunk_state, comp_stream, comm_stream, extra_ar
         # get callable nodes for transformer/mtp layer
         self._build_callable_nodes(event, comp_stream, comm_stream, extra_args)
 
+    def release_state(self):
+        """Release reference, this helps avoid memory leak."""
+        if hasattr(self, 'attn') and self.attn is not None:
+            del self.attn
+            self.attn = None
+        if hasattr(self, 'moe_dispatch') and self.moe_dispatch is not None:
+            del self.moe_dispatch
+            self.moe_dispatch = None
+        if hasattr(self, 'mlp') and self.mlp is not None:
+            del self.mlp
+            self.mlp = None
+        if hasattr(self, 'moe_combine') and self.moe_combine is not None:
+            del self.moe_combine
+            self.moe_combine = None
+        if hasattr(self, 'mtp_post_process') and self.mtp_post_process is not None:
+            del self.mtp_post_process
+            self.mtp_post_process = None
+        if hasattr(self, 'layer_state') and self.layer_state is not None:
+            del self.layer_state
+            self.layer_state = None
+        if hasattr(self, 'layer'):
+            del self.layer
+
     def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args):
         """
         Builds the callable nodes for the transformer/mtp layer:
-            attn, post_attn, mlp, moe_dispatch and moe_combine, and mtp_post_process.
+            attn, mlp, moe_dispatch and moe_combine, and mtp_post_process.
         """
         from megatron.core.models.gpt.fine_grained_callables import (
             TransformerLayerNode,
@@ -108,11 +128,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args):
             else isinstance(self.layer.mlp, MoELayer)
         )
 
-        enable_deepep = (
-            self.layer.config.moe_token_dispatcher_type == "flex"
-            and self.layer.config.moe_flex_dispatcher_backend == "deepep"
-        )
-        extra_args["enable_deepep"] = enable_deepep
+        extra_args["config"] = self.layer.config
         extra_args["is_moe"] = is_moe
         extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute
         extra_args["is_mtp"] = is_mtp
@@ -133,7 +149,6 @@ def create_node(stream, module, name):
 
         (
             attn_module,
-            post_attn_module,
             moe_dispatch_module,
             mlp_module,
             moe_combine_module,
@@ -145,11 +160,9 @@ def create_node(stream, module, name):
         self.attn = create_node(comp_stream, attn_module, "attn")
         self.mlp = create_node(comp_stream, mlp_module, "mlp")
         if is_moe:
-            self.post_attn = create_node(comp_stream, post_attn_module, "post_attn")
             self.moe_dispatch = create_node(comm_stream, moe_dispatch_module, "moe_dispatch")
             self.moe_combine = create_node(comm_stream, moe_combine_module, "moe_combine")
         else:
-            self.post_attn = NoopScheduleNode()
             self.moe_dispatch = NoopScheduleNode()
             self.moe_combine = NoopScheduleNode()
 
@@ -182,8 +195,8 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False)
         to maximize parallelism and efficiency.
 
         When f_layer and b_layer are not None, forward and backward pass are overlapped as follows:
-        comm_stream: combine_bwd            | dispatch_fwd->dispatch_bwd  | combine_fwd
-        comp_stream: attn_fwd->post_attn_fwd| mlp_bwd->mlp_bwd_dw->mlp_fwd| post_attn_bwd->attn_bwd
+        comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd  | combine_fwd
+        comp_stream: attn_fwd    | mlp_bwd->mlp_bwd_dw->mlp_fwd| attn_bwd
         For MTP, mtp_post_process_fwd is executed after the combine_fwd in the comp_stream,
         and mtp_post_process_bwd is executed before the combine_bwd in the comp_stream.
 
@@ -206,7 +219,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False)
         if f_layer is not None:
             with f_layer.get_fp8_context():
                 f_input = f_layer.attn.forward(f_input)
-                f_input = f_layer.post_attn.forward(f_input)
 
         if b_layer is not None:
             b_grad = b_layer.mlp.backward(b_grad)
@@ -219,6 +231,9 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False)
             b_layer.mlp.backward_dw()
             b_grad = b_layer.moe_dispatch.backward(b_grad)
 
+        if b_layer is not None and b_layer.config.ep_overlap_early_attn_memory_release:
+            b_grad = b_layer.attn.backward(b_grad)
+
         if f_layer is not None:
             with f_layer.get_fp8_context():
                 f_input = f_layer.mlp.forward(f_input)
@@ -228,8 +243,7 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False)
                 f_input = f_layer.moe_combine.forward(f_input)
                 f_input = f_layer.mtp_post_process.forward(f_input)
 
-        if b_layer is not None:
-            b_grad = b_layer.post_attn.backward(b_grad)
+        if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release:
             b_grad = b_layer.attn.backward(b_grad)
 
         # Delay the last attn_dw in backward pass (attn_dw of the first layer)
@@ -267,6 +281,7 @@ def __init__(
         extra_block_kwargs=None,
         runtime_gather_output: Optional[bool] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask=None,
     ):
         """Initialize the schedule plan of all Transformer layers' sub-modules.
 
@@ -309,6 +324,7 @@ def __init__(
         self._model_chunk_state.mtp_hidden_states = None
         self._model_chunk_state.loss_mask = loss_mask
         self._model_chunk_state.packed_seq_params = packed_seq_params
+        self._model_chunk_state.padding_mask = padding_mask
         self._model_chunk_state.extra_block_kwargs = extra_block_kwargs
         self._model_chunk_state.runtime_gather_output = runtime_gather_output
         self._model_chunk_state.model = model
@@ -316,37 +332,40 @@ def __init__(
         self._model_chunk_state.context_mask = None
         self._model_chunk_state.attention_bias = None
 
-        transformer_num_layers = model.decoder.num_layers_per_pipeline_rank
-        mtp_num_layers = get_mtp_num_layers_to_build(model.config, vp_stage=self.vp_stage)
-
         # build preprocess
         self.pre_process = PreProcessNode(model, self._model_chunk_state, self._event, comp_stream)
-        # build layer schedule plan for each layer
-        for layer_idx in range(transformer_num_layers):
-            layer = model.decoder._get_layer(layer_idx)
-            layer_plan = TransformerLayerSchedulePlan(
-                layer, self._event, self._model_chunk_state, comp_stream, comm_stream
+
+        # build layer schedule plan for each layer.
+        # The methods to obtain layers are different for MTP so we need the other build plan for
+        # MTP. Also, this can help annotate MTP layer so that it can know where MTP is.
+        self._build_layer_schedule_plan(model.decoder, comp_stream, comm_stream)
+        self._build_layer_schedule_plan(getattr(model, "mtp", None), comp_stream, comm_stream)
+
+        # build post process
+        if model.post_process:
+            self.post_process = PostProcessNode(
+                model, self._model_chunk_state, self._event, comp_stream
             )
-            self._transformer_layers.append(layer_plan)
 
-        # build mtp layers
-        for layer_idx in range(mtp_num_layers):
+    def _build_layer_schedule_plan(self, module, comp_stream, comm_stream):
+        if module is None:
+            return
+        num_layers = len(module.layers)
+        for layer_idx in range(num_layers):
             extra_args = {
                 "is_first_layer": layer_idx == 0,
-                "is_last_layer": layer_idx == mtp_num_layers - 1,
+                "is_last_layer": layer_idx == num_layers - 1,
             }
-            layer = model.mtp.layers[layer_idx]
             layer_plan = TransformerLayerSchedulePlan(
-                layer, self.event, self.state, comp_stream, comm_stream, extra_args
+                module.layers[layer_idx],
+                self.event,
+                self.state,
+                comp_stream,
+                comm_stream,
+                extra_args,
             )
             self._transformer_layers.append(layer_plan)
 
-        # build post process
-        if model.post_process:
-            self.post_process = PostProcessNode(
-                model, self._model_chunk_state, self._event, comp_stream
-            )
-
     @property
     def event(self):
         """Gets the CUDA event for synchronization."""
@@ -367,6 +386,10 @@ def get_layer(self, i):
         assert i < self.num_layers()
         return self._transformer_layers[i]
 
+    def pop_layer(self):
+        """Pops the transformer layer in FILO order."""
+        return self._transformer_layers.pop()
+
     def num_layers(self):
         """Gets the number of transformer layers."""
         return len(self._transformer_layers)
@@ -445,11 +468,12 @@ def run(
         b_num_layers = b_schedule_plan.num_layers() if b_schedule_plan is not None else 0
         overlapped_layers = min(f_num_layers, b_num_layers)
 
+        f_layer = b_layer = None
         # combined forward and backward pass for overlapped layers
         for i in range(overlapped_layers):
             f_layer = f_schedule_plan.get_layer(i)
-            b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i)
-            torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_num_layers - 1 - i}b")
+            b_layer = b_schedule_plan.pop_layer()
+            torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_schedule_plan.num_layers()}b")
             f_input, b_grad = TransformerLayerSchedulePlan.run(
                 f_layer,
                 b_layer,
@@ -457,15 +481,19 @@ def run(
                 b_grad=b_grad,
                 is_last_layer_in_bwd=(i == b_num_layers - 1),
             )
+            if i < b_num_layers - 1:
+                b_layer.release_state()
             torch.cuda.nvtx.range_pop()
 
         # backward pass for the remaining layers
         for i in range(overlapped_layers, b_num_layers):
-            b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i)
-            torch.cuda.nvtx.range_push(f"layer_{b_num_layers - 1 - i}b")
+            b_layer = b_schedule_plan.pop_layer()
+            torch.cuda.nvtx.range_push(f"layer_{b_schedule_plan.num_layers()}b")
             _, b_grad = TransformerLayerSchedulePlan.run(
                 None, b_layer, b_grad=b_grad, is_last_layer_in_bwd=(i == b_num_layers - 1)
             )
+            if i < b_num_layers - 1:
+                b_layer.release_state()
             torch.cuda.nvtx.range_pop()
 
         # forward pass for the remaining layers
@@ -491,7 +519,9 @@ def run(
         # Delay the last attn_dw in backward pass (attn_dw of the first layer)
         # for overlapping with the p2p comm
         if b_num_layers > 0:
-            b_schedule_plan.get_layer(0).attn.backward_dw()
+            assert b_layer is not None
+            b_layer.attn.backward_dw()
+            b_layer.release_state()
 
         # post process forward
         if f_schedule_plan is not None and f_schedule_plan.post_process is not None:
@@ -504,9 +534,7 @@ def run(
             f_schedule_plan.wait_current_stream()
         if b_schedule_plan:
             b_schedule_plan.wait_current_stream()
-
-        # Release reference as early as possible, this helps avoid memory leak.
-        if b_schedule_plan is not None:
+            # Release reference as early as possible, this helps avoid memory leak.
             b_schedule_plan.release_state()
 
         return f_input
diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
new file mode 100644
index 00000000000..a7cc7cc0a55
--- /dev/null
+++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
@@ -0,0 +1,451 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+from typing import List, Optional
+
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.models.backends import BackendSpecProvider
+from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules
+from megatron.core.transformer.enums import AttnMaskType, LayerType
+from megatron.core.transformer.experimental_attention_variant.dsa import (
+    DSAIndexer,
+    DSAIndexerSubmodules,
+    DSAttention,
+    DSAttentionSubmodules,
+)
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.multi_latent_attention import (
+    MLASelfAttention,
+    MLASelfAttentionSubmodules,
+)
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_block import (
+    TransformerBlockSubmodules,
+    get_num_layers_to_build,
+)
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.transformer_layer import (
+    TransformerLayer,
+    TransformerLayerSubmodules,
+    get_transformer_layer_offset,
+)
+
+try:
+    import transformer_engine as te  # type: ignore[import-untyped]  # pylint: disable=unused-import
+
+    from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
+
+    HAVE_TE = True
+except ImportError:
+    HAVE_TE = False
+
+try:
+    import nvidia_kitchen  # type: ignore[import-not-found]  # pylint: disable=unused-import
+
+    from megatron.core.extensions.kitchen import KitchenSpecProvider
+
+    HAVE_KITCHEN = True
+except ImportError:
+    HAVE_KITCHEN = False
+
+
+##########
+# Experimental Attention Variant Module Specs
+##########
+
+
+def get_gated_delta_net_module_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Build module spec for GatedDeltaNet attention."""
+
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
+
+    rms_norm = config.normalization == "RMSNorm"
+    attention = ModuleSpec(
+        module=GatedDeltaNet,
+        submodules=GatedDeltaNetSubmodules(
+            in_proj=backend.column_parallel_layer_norm_linear(),
+            out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False),
+            out_proj=backend.row_parallel_linear(),
+        ),
+        metainfo={"fuse_input_layernorm": True},
+    )
+    return attention
+
+
+def get_dsa_module_spec_for_backend(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Helper function to get module spec for Sparse Attention."""
+    assert config.multi_latent_attention, "Currently only MLA supports sparse attention."
+    assert config.qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
+
+    linear_q_up_proj = (
+        backend.column_parallel_layer_norm_linear()
+        if config.qk_layernorm
+        else backend.column_parallel_linear()
+    )
+    linear_kv_up_proj = (
+        backend.column_parallel_layer_norm_linear()
+        if config.qk_layernorm
+        else backend.column_parallel_linear()
+    )
+
+    # Because TransformerEngine does not support sparse attention yet, we use local
+    # implementation whether the backend is TransformerEngine or not.
+    core_attention = ModuleSpec(
+        module=DSAttention,
+        submodules=DSAttentionSubmodules(
+            indexer=ModuleSpec(
+                module=DSAIndexer,
+                submodules=DSAIndexerSubmodules(
+                    linear_wq_b=backend.linear(),
+                    linear_wk=backend.linear(),
+                    k_norm=backend.layer_norm(rms_norm=False, for_qk=True),
+                    linear_weights_proj=backend.linear(),
+                ),
+            )
+        ),
+    )
+
+    attention = ModuleSpec(
+        module=MLASelfAttention,
+        params={"attn_mask_type": AttnMaskType.causal},
+        submodules=MLASelfAttentionSubmodules(
+            linear_q_proj=backend.column_parallel_linear(),
+            linear_q_down_proj=backend.linear(),
+            linear_q_up_proj=linear_q_up_proj,
+            linear_kv_down_proj=backend.linear(),
+            linear_kv_up_proj=linear_kv_up_proj,
+            core_attention=core_attention,
+            linear_proj=backend.row_parallel_linear(),
+            q_layernorm=IdentityOp,
+            kv_layernorm=IdentityOp,
+        ),
+    )
+
+    return attention
+
+
+def get_experimental_attention_variant_module_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Helper function to get module spec for experimental attention variant"""
+
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
+
+    if config.experimental_attention_variant == "gated_delta_net":
+        return get_gated_delta_net_module_spec(config=config, backend=backend)
+    else:
+        raise ValueError(
+            f"Invalid experimental attention variant: {config.experimental_attention_variant}"
+        )
+
+
+##########
+# Experimental GPT Decoder Block Spec
+##########
+
+
+def get_transformer_block_with_experimental_attention_variant_spec(
+    config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
+) -> TransformerBlockSubmodules:
+    """Build transformer block spec with experimental attention variants (e.g., linear attention).
+
+    This function constructs a heterogeneous transformer block that supports mixing different
+    attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers.
+    **Note that, this API is a experimental API in the short term, and might be deprecated in the
+    future. In the long run, we will move to a new design that better support hybrid models.**
+
+    Key Design:
+        1. Attention and MLP patterns: The attention pattern and MLP pattern are orthogonal
+           and determined independently. This allows flexible combinations (e.g., linear attention
+           with MoE, or standard attention with dense MLP).
+           - Attention pattern: derived from `config.linear_attention_freq` or
+             `config.experimental_attention_variant`.
+           - MLP pattern: derived from `config.moe_layer_freq`.
+
+        2. Per-Layer Spec Construction: Iterates through layers, constructing transformer
+           layer specs based on attention and MLP patterns.
+
+        3. Pipeline Slicing: Extracts layer specs for the current pipeline stage.
+
+    Args:
+        config: Transformer configuration containing model hyperparameters and feature flags.
+        vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism.
+        pp_rank: Pipeline model parallel rank.
+
+    Returns:
+        TransformerBlockSubmodules containing per-layer specs and final layer norm.
+
+    Note:
+        Currently only supports transformer_engine backend. Kitchen backend can be used as a
+        wrapper with TE fallback for unsupported operations.
+    """
+
+    backend = _get_backend_spec_provider(config=config)
+
+    # Get attention patterns and specs
+    experimental_attention_pattern = [0] * config.num_layers
+    if is_linear_attention_variant(config.experimental_attention_variant):
+        experimental_attention_pattern = get_linear_attention_pattern(config=config)
+    elif config.experimental_attention_variant is not None:
+        experimental_attention_pattern = [1] * config.num_layers
+
+    if 1 in experimental_attention_pattern:
+        experimental_attention_spec = get_experimental_attention_variant_module_spec(
+            config=config, backend=backend
+        )
+    else:
+        experimental_attention_spec = None
+
+    if 0 in experimental_attention_pattern:
+        standard_attention_spec = _get_self_attention_module_spec(config=config, backend=backend)
+    else:
+        standard_attention_spec = None
+
+    # Get MLP patterns and specs
+    if config.num_moe_experts is not None:
+        moe_layer_pattern = get_moe_layer_pattern(config=config)
+    else:
+        moe_layer_pattern = [0] * config.num_layers
+
+    if 1 in moe_layer_pattern:
+        moe_layer_spec = _get_moe_module_spec(config=config, backend=backend)
+    else:
+        moe_layer_spec = None
+
+    if 0 in moe_layer_pattern:
+        dense_mlp_layer_spec = _get_dense_mlp_module_spec(config=config, backend=backend)
+    else:
+        dense_mlp_layer_spec = None
+
+    # Get GPT decoder block layer specs
+    rms_norm = config.normalization == "RMSNorm"
+    layer_specs = []
+    for layer_number in range(config.num_layers):
+        attention = (
+            experimental_attention_spec
+            if experimental_attention_pattern[layer_number] == 1
+            else standard_attention_spec
+        )
+        mlp = moe_layer_spec if moe_layer_pattern[layer_number] == 1 else dense_mlp_layer_spec
+        input_layernorm = (
+            IdentityOp
+            if attention.metainfo["fuse_input_layernorm"]
+            else backend.layer_norm(rms_norm=rms_norm, for_qk=False)
+        )
+        pre_mlp_layernorm = (
+            IdentityOp
+            if mlp.metainfo["fuse_pre_mlp_layernorm"]
+            else backend.layer_norm(rms_norm=rms_norm, for_qk=False)
+        )
+
+        layer_specs.append(
+            ModuleSpec(
+                module=TransformerLayer,
+                submodules=TransformerLayerSubmodules(
+                    input_layernorm=input_layernorm,
+                    self_attention=attention,
+                    self_attn_bda=get_bias_dropout_add,
+                    pre_mlp_layernorm=pre_mlp_layernorm,
+                    mlp=mlp,
+                    mlp_bda=get_bias_dropout_add,
+                ),
+            )
+        )
+
+    # Slice the layer specs to only include the layers that are built in this pipeline stage.
+    if config.pipeline_model_parallel_layout is not None:
+        local_layer_ids = config.pipeline_model_parallel_layout.get_layer_id_list(
+            layer_type=LayerType.decoder, vp_stage=vp_stage, pp_rank=pp_rank
+        )
+    else:
+        offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank)
+        num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank)
+        local_layer_ids = range(offset, offset + num_layers_to_build)
+
+    layer_specs = [layer_specs[layer_id] for layer_id in local_layer_ids]
+
+    # Get GPT decoder block spec
+    gpt_decoder_block_spec = TransformerBlockSubmodules(
+        layer_specs=layer_specs, layer_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False)
+    )
+
+    return gpt_decoder_block_spec
+
+
+##########
+# Utilities
+##########
+
+
+def is_linear_attention_variant(experimental_attention_variant: Optional[str]) -> bool:
+    """Check if the experimental attention variant is a linear attention variant."""
+    linear_attention_variants = ["gated_delta_net"]
+    return experimental_attention_variant in linear_attention_variants
+
+
+def get_moe_layer_pattern(config: TransformerConfig) -> List[int]:
+    """Parse config.moe_layer_freq to get per-layer MoE pattern (1=MoE, 0=dense).
+
+    - int N: one MoE layer every N layers (e.g., N=2 -> [1,0,1,0,...])
+    - list: use directly as the pattern."""
+
+    if isinstance(config.moe_layer_freq, int):
+        # [1,0,0,...,0,1,0,0,...,0,...]
+        moe_layer_pattern = [
+            1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.num_layers)
+        ]
+    elif isinstance(config.moe_layer_freq, list):
+        moe_layer_pattern = config.moe_layer_freq
+        assert len(moe_layer_pattern) == config.num_layers, (
+            f"Invalid length of moe_layer_pattern: {len(moe_layer_pattern)}, "
+            f"expected {config.num_layers}, "
+            f"current moe layer pattern: {config.moe_layer_freq}"
+        )
+    else:
+        raise ValueError(
+            f"Invalid moe_layer_freq: {type(config.moe_layer_freq)}, {config.moe_layer_freq}"
+        )
+    return moe_layer_pattern
+
+
+def get_linear_attention_pattern(config: TransformerConfig) -> List[int]:
+    """Parse config.linear_attention_freq to get per-layer attention pattern (1=LA, 0=SDPA).
+
+    - int N: one SDPA layer every N layers (e.g., N=4 -> [1,1,1,0,1,1,1,0,...])
+    - list: use directly as the pattern."""
+
+    if isinstance(config.linear_attention_freq, int):
+        linear_attention_pattern = [
+            # [1,1,...,1,0,1,1,...,1,0,...]
+            0 if ((i + 1) % config.linear_attention_freq == 0) else 1
+            for i in range(config.num_layers)
+        ]
+    elif isinstance(config.linear_attention_freq, list):
+        linear_attention_pattern = config.linear_attention_freq
+        assert len(linear_attention_pattern) == config.num_layers, (
+            f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, "
+            f"expected {config.num_layers}, "
+            f"current linear attention pattern: {config.linear_attention_freq}"
+        )
+    elif config.linear_attention_freq is None:
+        if not is_linear_attention_variant(config.experimental_attention_variant):
+            linear_attention_pattern = [0] * config.num_layers
+        else:
+            # This should be caught by config validation, but raise here as a safety check
+            raise ValueError(
+                f"Linear attention type {config.experimental_attention_variant} is specified "
+                "but linear_attention_freq is None. "
+                "Please set linear_attention_freq to specify the LA/SDPA layer pattern."
+            )
+    else:
+        raise ValueError(
+            f"Invalid linear_attention_freq: {type(config.linear_attention_freq)},"
+            f" {config.linear_attention_freq}"
+        )
+    return linear_attention_pattern
+
+
+def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpecProvider:
+    """Get backend spec provider for experimental attention variant."""
+
+    assert config.transformer_impl == "transformer_engine", (
+        "Experimental GPT decoder block spec only supports "
+        "transformer engine implementation for now."
+    )
+    backend: BackendSpecProvider = (
+        KitchenSpecProvider(
+            fallback=TESpecProvider(),
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
+        )
+        if config.use_kitchen
+        else TESpecProvider()
+    )
+    return backend
+
+
+##########
+# Spec functions for non-experimental self attention and MLP layer.
+##########
+
+
+def _get_self_attention_module_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Get non-experimental self-attention module spec.
+    For hybrid models that mix experimental and non-experimental attention architectures.
+
+    Warning: This function may be deprecated in the future."""
+
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
+
+    from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+
+    layer_spec = get_gpt_layer_with_transformer_engine_spec(
+        num_experts=config.num_moe_experts,
+        moe_grouped_gemm=config.moe_grouped_gemm,
+        qk_layernorm=config.qk_layernorm,
+        multi_latent_attention=config.multi_latent_attention,
+        moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm,
+        qk_l2_norm=config.qk_l2_norm,
+        use_kitchen=config.use_kitchen,
+        use_te_activation_func=config.use_te_activation_func,
+        use_kitchen_attention=config.use_kitchen_attention,
+        kitchen_attention_backend=config.kitchen_attention_backend,
+    )
+    attn_spec = layer_spec.submodules.self_attention
+    if config.multi_latent_attention:
+        attn_spec.metainfo["fuse_input_layernorm"] = False
+    else:
+        attn_spec.metainfo["fuse_input_layernorm"] = backend.fuse_layernorm_and_linear()
+
+    return attn_spec
+
+
+def _get_dense_mlp_module_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Get dense MLP module spec.
+    For hybrid models that mix dense MLP and experimental attention architectures.
+
+    Warning: This function may be deprecated in the future."""
+
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
+
+    from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec_for_backend
+
+    mlp_spec = get_mlp_module_spec_for_backend(backend=backend, num_experts=None)
+    mlp_spec.metainfo["fuse_pre_mlp_layernorm"] = backend.fuse_layernorm_and_linear()
+
+    return mlp_spec
+
+
+def _get_moe_module_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Get MoE module spec.
+    For hybrid models that mix MoE and experimental attention architectures.
+
+    Warning: This function may be deprecated in the future."""
+
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
+
+    from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend
+
+    moe_spec = get_moe_module_spec_for_backend(
+        backend=backend,
+        num_experts=config.num_moe_experts,
+        moe_grouped_gemm=config.moe_grouped_gemm,
+        moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm,
+        use_te_activation_func=config.use_te_activation_func,
+    )
+    moe_spec.metainfo["fuse_pre_mlp_layernorm"] = False
+    return moe_spec
diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py
index fd1cc3d33c6..7cee9d2973c 100644
--- a/megatron/core/models/gpt/fine_grained_callables.py
+++ b/megatron/core/models/gpt/fine_grained_callables.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import weakref
 from contextlib import nullcontext
@@ -6,16 +6,23 @@
 from typing import Optional
 
 import torch
+from torch import Tensor
 
 from megatron.core import tensor_parallel
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless
-from megatron.core.transformer.module import float16_to_fp32
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.module import GraphableMegatronModule, float16_to_fp32
 from megatron.core.transformer.moe.moe_layer import MoELayer
 from megatron.core.transformer.multi_token_prediction import (
     MultiTokenPredictionLayer,
     get_mtp_layer_offset,
 )
 from megatron.core.transformer.transformer_layer import TransformerLayer, make_viewless_tensor
+from megatron.core.utils import internal_api
 
 
 def weak_method(method):
@@ -35,13 +42,14 @@ def wrapped_func(*args, **kwarg):
     return wrapped_func
 
 
-def should_free_input(name, is_moe, is_deepep):
+@internal_api
+def should_free_input(name, is_moe, config):
     """Determine if the node should free its input memory.
 
     Args:
         name: Node name
         is_moe: Whether it's a MoE model
-        is_deepep: Whether it's a DeepEP model
+        config: TransformerConfig object
 
     Returns:
         bool: Whether to free input memory
@@ -49,18 +57,30 @@ def should_free_input(name, is_moe, is_deepep):
     # For dense layers [attn, fake, mlp, fake], the input is needed during backward pass
     if not is_moe:
         return False
+    enable_deepep = (
+        config.moe_token_dispatcher_type == "flex"
+        and config.moe_flex_dispatcher_backend == "deepep"
+    )
+    enable_hybridep = (
+        config.moe_token_dispatcher_type == "flex"
+        and config.moe_flex_dispatcher_backend == "hybridep"
+    )
     # Define which nodes should free input memory
     # Since we split the computing graph into multiple nodes, we can manually control
     # when and how to free the input memory.
     # The input and output of A2A are not needed anymore after the forward pass,
     # so we can free the input memory after the forward pass.
     free_input_nodes = {
-        "mlp": True,
+        "mlp": not enable_hybridep,
         "moe_combine": True,
-        # For non-deepep mode, the input is the un-dispatched tokens and probs before dispatch A2A
-        # and it's not needed anymore after the forward pass
-        # For deepep mode, they are both needed in backward pass, so they cannot be freed.
-        "moe_dispatch": not is_deepep,
+        # For non-DeepEP and non-HybridEP dispatcher mode, the input is the un-dispatched tokens
+        # and probs before dispatch A2A and it's not needed anymore after the forward pass
+        # For DeepEP and HybridEP dispatcher mode, they are both needed in backward pass
+        # and cannot be freed.
+        # If moe_preprocess is in cuda graph scope, tokens and probs are fixed size tensors,
+        # so they cannot be freed.
+        "moe_dispatch": not (enable_deepep or enable_hybridep)
+        and (CudaGraphScope.moe_preprocess not in config.cuda_graph_scope),
     }
 
     return free_input_nodes.get(name, False)
@@ -111,13 +131,19 @@ def forward_impl(self):
         if not self.gpt_model.pre_process:
             self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor
         # Run GPTModel._preprocess
-        decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = (
-            self.gpt_model._preprocess(
-                input_ids=self.chunk_state.input_ids,
-                position_ids=self.chunk_state.position_ids,
-                decoder_input=self.chunk_state.decoder_input,
-                packed_seq_params=self.chunk_state.packed_seq_params,
-            )
+        (
+            decoder_input,
+            rotary_pos_emb,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            sequence_len_offset,
+            padding_mask,
+        ) = self.gpt_model._preprocess(
+            input_ids=self.chunk_state.input_ids,
+            position_ids=self.chunk_state.position_ids,
+            decoder_input=self.chunk_state.decoder_input,
+            packed_seq_params=self.chunk_state.packed_seq_params,
+            padding_mask=self.chunk_state.padding_mask,
         )
 
         # Saved for later use
@@ -126,6 +152,7 @@ def forward_impl(self):
         self.chunk_state.rotary_pos_cos = rotary_pos_cos
         self.chunk_state.rotary_pos_sin = rotary_pos_sin
         self.chunk_state.sequence_len_offset = sequence_len_offset
+        self.chunk_state.padding_mask = padding_mask
         return decoder_input
 
 
@@ -153,9 +180,8 @@ def forward_impl(self, hidden_states):
         """Implements the forward pass for postprocessing.
 
         This method handles:
-        1. Final layer normalization
-        2. Output layer computation
-        3. Loss computation if labels are provided
+        1. Output layer computation
+        2. Loss computation if labels are provided
 
         Args:
             hidden_states: The hidden states from the transformer layers.
@@ -163,12 +189,11 @@ def forward_impl(self, hidden_states):
         Returns:
             The logits or loss depending on whether labels are provided.
         """
-        # Final layer norm from Decoder
-        if self.gpt_model.decoder.final_layernorm and not self.gpt_model.mtp_process:
-            hidden_states = self.gpt_model.decoder.final_layernorm(hidden_states)
-            # TENorm produces a "viewed" tensor. This will result in schedule.py's
-            # deallocate_output_tensor() throwing an error, so a viewless tensor is
-            # created to prevent this.
+
+        empty_decoder = len(self.gpt_model.decoder.layers) == 0
+        layer_norm = self.gpt_model.decoder.final_layernorm
+        if not self.gpt_model.config.mtp_num_layers and empty_decoder and layer_norm:
+            hidden_states = layer_norm(hidden_states)
             hidden_states = make_viewless_tensor(
                 inp=hidden_states, requires_grad=True, keep_graph=True
             )
@@ -225,12 +250,13 @@ def __init__(
             it's the per_batch_state_context, o.w. nullcontext
             name (str): Node name, also used to determine memory strategy
             bwd_dw_callables (list): List of weight gradient functions for the layer.
-            extra_args (dict): Extra arguments for the node: is_moe, enable_deepep.
+            extra_args (dict): Extra arguments for the node: is_moe, config.
         """
         # determine whether to free input memory
+        config = extra_args.get("config", None)
+        assert config is not None, "model config must be passed to TransformerLayerNode."
         is_moe = extra_args.get("is_moe", False)
-        enable_deepep = extra_args.get("enable_deepep", False)
-        free_input = should_free_input(name, is_moe, enable_deepep)
+        free_input = should_free_input(name, is_moe, config)
         self.delay_wgrad_compute = extra_args.get("delay_wgrad_compute", False)
 
         super().__init__(
@@ -246,6 +272,7 @@ def __init__(
         self.submodule = submodule
         self.detached = tuple()
         self.before_detached = tuple()
+        self.is_mtp = extra_args.get("is_mtp", False)
 
         # Create flags to indicate first and last layer
         self.is_first_layer = extra_args.get("is_first_layer", False)
@@ -275,7 +302,13 @@ def backward_impl(self, outputs, output_grad):
         detached_grad = tuple([e.grad for e in self.detached])
         grads = output_grad + detached_grad
         self.default_backward_func(outputs + self.before_detached, grads)
-        self._release_state()
+        # release the output grad memory after backward finishes,
+        # except when delay_wgrad_comptue is enabled, the grad should be
+        # kept until all modules' backward_dw has been invoked.
+        if self.delay_wgrad_compute:
+            self.output_grads = grads
+            self.delay_grads_release = len(self.bwd_dw_callables) > 0
+
         # return grads for record stream
         return grads
 
@@ -286,9 +319,17 @@ def backward_dw(self):
         with torch.cuda.nvtx.range(f"{self.name} wgrad"):
             for module in self.bwd_dw_callables:
                 module.backward_dw()
+
+        # the output grad memory is last used in wgrad compute, should be safe to release.
+        assert self.delay_grads_release, "output grad memory should be valid before wgrad."
+        if self.manual_release_grads:
+            for tensor in self.output_grads:
+                tensor.untyped_storage().resize_(0)
+        self.output_grads = None
+
         self.bwd_dw_callables = None
 
-    def _release_state(self):
+    def __del__(self):
         # Release reference as early as possible, this helps avoid memory leak.
         self.before_detached = None
         self.detached = None
@@ -297,6 +338,55 @@ def _release_state(self):
         self.submodule = None
 
 
+class _BackwardDWWrapper:
+    """Wrapper for managing backward weight gradient computation of attn module.
+
+    This class handles the execution of weight gradient computations for transformer layers,
+    coordinating between CUDA graphed and non-graphed components. It is used when
+    overlap_moe_expert_parallel_comm and delay_wgrad_compute are enabled to manage
+    the delayed weight gradient computation in MoE models.
+
+    The wrapper stores references to the attention and shared expert backward weight gradient
+    callables, and determines which components should be executed based on whether CUDA graphs
+    are being replayed and which scopes are covered by the graphs.
+    """
+
+    def __init__(self, layer):
+        assert isinstance(
+            layer, GraphableMegatronModule
+        ), "cuda graphed ep overlap only supports GraphableMegatronModule."
+        assert isinstance(
+            layer, TransformerLayer
+        ), "cuda graphed ep overlap only supports TransformerLayer for now."
+        self.layer = layer
+        self.graphed_backward_dw_callable = None
+        self.attn_dw_callable = layer.self_attention.backward_dw
+        if layer.is_moe_layer:
+            self.shared_expert_dw_callable = partial(
+                layer.mlp.backward_dw, routed_experts=False, shared_experts=True
+            )
+        else:
+            self.shared_expert_dw_callable = None
+        self.cuda_graph_scope = layer.config.cuda_graph_scope
+
+    def backward_dw(self):
+        """Execute weight gradients, skipping CUDA graphed components during replay."""
+        is_replay = hasattr(self.layer, 'cuda_graphs') and self.layer.cuda_graphs
+        if self.shared_expert_dw_callable is not None and (
+            not is_replay or CudaGraphScope.moe_router not in self.cuda_graph_scope
+        ):
+            self.shared_expert_dw_callable()
+        if not is_replay or CudaGraphScope.attn not in self.cuda_graph_scope:
+            self.attn_dw_callable()
+        if is_replay and self.graphed_backward_dw_callable is not None:
+            self.graphed_backward_dw_callable()
+        self.layer = None
+
+    def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable):
+        """Store the CUDA graphed backward weight gradient callable."""
+        self.graphed_backward_dw_callable = graphed_backward_dw_callable
+
+
 def build_transformer_layer_callables(layer: TransformerLayer):
     """Create callables for transformer layer nodes.
     Divides the transformer layer's operations into a sequence of smaller, independent
@@ -329,12 +419,69 @@ def build_transformer_layer_callables(layer: TransformerLayer):
         layer.config.moe_token_dispatcher_type == "flex"
         and layer.config.moe_flex_dispatcher_backend == "deepep"
     )
+    enable_hybridep = (
+        layer.config.moe_token_dispatcher_type == "flex"
+        and layer.config.moe_flex_dispatcher_backend == "hybridep"
+    )
 
     def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor):
         """
-        Performs same attnention forward logic as GPT Model.
+        Performs same attnention forward logic as GPT Model and forward pass for
+        computations between attention and dispatch:
+            pre mlp layernorm->router->dispatch preprocess
         """
-        hidden_states, _ = layer._forward_attention(
+
+        if (
+            isinstance(layer, GraphableMegatronModule)
+            and hasattr(layer, 'cuda_graphs')
+            and layer.cuda_graphs
+        ):
+            layer.set_te_cuda_graph_backward_dw_wrapper()
+            forward_func = layer._te_cuda_graph_replay
+        else:
+            # wrapper function that keeps consistent api with cuda graph replay
+            def forward_func(
+                hidden_states: Tensor,
+                attention_mask: Optional[Tensor] = None,
+                rotary_pos_emb: Optional[Tensor] = None,
+                rotary_pos_cos: Optional[Tensor] = None,
+                rotary_pos_sin: Optional[Tensor] = None,
+                packed_seq_params: Optional[PackedSeqParams] = None,
+                sequence_len_offset: Optional[Tensor] = None,
+            ):
+                hidden_states, _ = layer._forward_attention(
+                    hidden_states=hidden_states,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
+                    rotary_pos_cos=rotary_pos_cos,
+                    rotary_pos_sin=rotary_pos_sin,
+                    packed_seq_params=packed_seq_params,
+                    sequence_len_offset=sequence_len_offset,
+                )
+                if not isinstance(layer.mlp, MoELayer):
+                    return hidden_states, None, None, None
+                if layer.recompute_pre_mlp_layernorm:
+                    layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
+                    with off_interface(
+                        layer.offload_mlp_norm, hidden_states, "mlp_norm"
+                    ) as hidden_states:
+                        pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint(
+                            layer.pre_mlp_layernorm, hidden_states
+                        )
+                else:
+                    with off_interface(
+                        layer.offload_mlp_norm, hidden_states, "mlp_norm"
+                    ) as hidden_states:
+                        pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states)
+
+                shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output)
+                probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output)
+                local_tokens, probs = layer.mlp.preprocess(
+                    pre_mlp_layernorm_output, probs, routing_map
+                )
+                return hidden_states, local_tokens, probs, shared_expert_output
+
+        hidden_states, local_tokens, probs, shared_expert_output = forward_func(
             hidden_states=hidden_states,
             attention_mask=node.chunk_state.attention_mask,
             rotary_pos_emb=node.chunk_state.rotary_pos_emb,
@@ -343,28 +490,14 @@ def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor):
             packed_seq_params=node.chunk_state.packed_seq_params,
             sequence_len_offset=node.chunk_state.sequence_len_offset,
         )
-        return hidden_states
-
-    def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor):
-        """
-        Run forward pass for computations between attention and dispatch:
-            pre mlp layernorm->router->dispatch preprocess
-        """
-        if layer.recompute_pre_mlp_layernorm:
-            layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint(
-                layer.pre_mlp_layernorm, hidden_states
-            )
-        else:
-            pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states)
-
-        local_tokens, probs, _ = layer.mlp.router_and_preprocess(pre_mlp_layernorm_output)
+        if not isinstance(layer.mlp, MoELayer):
+            return hidden_states
 
         # Detach here for mlp_bda residual connection
         node.layer_state.residual = node.detach(hidden_states)
         if layer.mlp.use_shared_expert and not layer.mlp.shared_expert_overlap:
-            # Detach here for shared expert connection
-            node.layer_state.pre_mlp_layernorm_output = node.detach(pre_mlp_layernorm_output)
+            # Detach here for shared expert connection in moe_combine
+            node.layer_state.shared_expert_output = node.detach(shared_expert_output)
 
         return local_tokens, probs
 
@@ -375,7 +508,7 @@ def submodule_dispatch_forward(
         Dispatches tokens to the experts based on the router output.
         """
         token_dispatcher = layer.mlp.token_dispatcher
-        if enable_deepep:
+        if enable_deepep or enable_hybridep:
             # update token_probs to be the detached version, prevents
             # backward graph from connecting to attn submodule
             token_dispatcher._comm_manager.token_probs = probs
@@ -389,19 +522,14 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor):
         Run forward pass for computations between dispatch and combine:
             post dispatch->experts->combine preprocess
         """
-        shared_expert_output = None
         dispatched_probs = node.layer_state.dispatched_probs
         token_dispatcher = layer.mlp.token_dispatcher
-        if enable_deepep:
+        if enable_deepep or enable_hybridep:
             # update dispatched_probs to be detached version, prevents
             # backward graph from connecting to dispatch submodule
             token_dispatcher._comm_manager.dispatched_probs = dispatched_probs
 
-        pre_mlp_layernorm_output = getattr(node.layer_state, 'pre_mlp_layernorm_output', None)
-        shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output)
-        expert_output, mlp_bias = layer.mlp.routed_experts_compute(
-            dispatched_tokens, dispatched_probs, pre_mlp_layernorm_output
-        )
+        expert_output, _ = layer.mlp.routed_experts_compute(dispatched_tokens, dispatched_probs)
 
         if layer.recompute_pre_mlp_layernorm:
             # discard the output of the pre-mlp layernorm and register the recompute
@@ -410,16 +538,10 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor):
         # release tensor reference after use
         node.layer_state.dispatched_probs = None
         node.layer_state.pre_mlp_layernorm_output = None
-        if shared_expert_output is None:
-            # Return only expert_output, since shared_expert_output causes backward on None
-            return expert_output
-        return expert_output, shared_expert_output
-
-    def submodule_combine_forward(
-        node: ScheduleNode,
-        output: torch.Tensor,
-        shared_expert_output: Optional[torch.Tensor] = None,
-    ):
+
+        return expert_output
+
+    def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor):
         """
         # Triggers token combine and the remaining computation in the transformer layer.
         # The `mlp_bda` computation is placed after `mlp.combine` due to data dependency.
@@ -429,14 +551,23 @@ def submodule_combine_forward(
         # with another microbatch's computation and expose the communication.
         """
         residual = node.layer_state.residual
+        shared_expert_output = getattr(node.layer_state, 'shared_expert_output', None)
+        output = layer.mlp.combine(output)
+        output = layer.mlp.postprocess(output, shared_expert_output)
 
-        output = layer.mlp.combine(output, shared_expert_output)
         mlp_output_with_bias = (output, None)
-
+        if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs:
+            layer.mlp.cudagraph_tensor_store.clear()
         with layer.bias_dropout_add_exec_handler():
             hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)(
                 mlp_output_with_bias, residual, layer.hidden_dropout
             )
+        # Delay the offload of the mlp norm until after the mlp_bda has been computed
+        # because the residual is needed in the mlp_bda.
+        if layer.offload_mlp_norm:
+            hidden_states = off_interface.group_commit(
+                hidden_states, name="mlp_norm", forced_released_tensors=[residual]
+            )
         output = make_viewless_tensor(
             inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
         )
@@ -446,6 +577,12 @@ def submodule_combine_forward(
 
         # release tensor reference after use
         node.layer_state.residual = None
+
+        # final layer norm from decoder
+        final_layernorm = node.chunk_state.model.decoder.final_layernorm
+        if not node.is_mtp and final_layernorm and node.is_last_layer:
+            output = final_layernorm(output)
+            output = make_viewless_tensor(inp=output, requires_grad=True, keep_graph=True)
         return output
 
     def mlp_wrapper(node: ScheduleNode, *args, **kwargs):
@@ -458,13 +595,14 @@ def raise_not_implemented(*args):
 
     # Build forward and backward callable functions
     attn_func = submodule_attn_forward
-    post_attn_func = submodule_post_attn_forward if is_moe else raise_not_implemented
     dispatch_func = submodule_dispatch_forward if is_moe else raise_not_implemented
     mlp_func = submodule_moe_forward if is_moe else mlp_wrapper
     combine_func = submodule_combine_forward if is_moe else raise_not_implemented
 
-    forward_funcs = [attn_func, post_attn_func, dispatch_func, mlp_func, combine_func, None]
-    backward_dw = {"attn": layer.self_attention, "mlp": layer.mlp}
+    layer.init_backward_dw_wrapper()
+
+    forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, None]
+    backward_dw = {"attn": layer.backward_dw_wrapper, "mlp": layer.mlp}
     return forward_funcs, backward_dw
 
 
@@ -476,24 +614,14 @@ def build_mtp_layer_callables(layer):
     """
 
     forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer)
-    attn_forward, post_attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = (
-        forward_funcs
-    )
+    attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs
     is_moe = isinstance(layer.transformer_layer.mlp, MoELayer)
     assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now."
 
     def submodule_mtp_attn_forward(node, hidden_states):
         # MTP Block Preprocess
         if node.is_first_layer:
-            # Final layer norm from Decoder
-            final_layernorm = node.chunk_state.model.decoder.final_layernorm
-            if final_layernorm:
-                hidden_states = final_layernorm(hidden_states)
-                hidden_states = make_viewless_tensor(
-                    inp=hidden_states, requires_grad=True, keep_graph=True
-                )
-                hidden_states = node.detach(hidden_states)
-            offset = get_mtp_layer_offset(layer.config)
+            offset = get_mtp_layer_offset(layer.config, node.chunk_state.model.vp_stage)
             node.chunk_state.mtp_hidden_states = list(torch.chunk(hidden_states, 1 + offset, dim=0))
             hidden_states = node.chunk_state.mtp_hidden_states[offset]
 
@@ -547,24 +675,17 @@ def rng_context_wrapper(func, *args, **kwargs):
     # Build forward and backward callable functions
     # attn_forward already has rng context, no need to wrap
     attn_func = submodule_mtp_attn_forward
-    post_attn_func = partial(rng_context_wrapper, post_attn_forward)
     dispatch_func = partial(rng_context_wrapper, dispatch_forward)
     mlp_func = partial(rng_context_wrapper, mlp_forward)
     combine_func = partial(rng_context_wrapper, combine_forward)
     mtp_post_process_func = submodule_mtp_postprocess_forward
 
-    forward_funcs = [
-        attn_func,
-        post_attn_func,
-        dispatch_func,
-        mlp_func,
-        combine_func,
-        mtp_post_process_func,
-    ]
-    backward_dw = {
-        "attn": [layer.transformer_layer.self_attention, layer.eh_proj],
-        "mlp": layer.transformer_layer.mlp,
-    }
+    forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, mtp_post_process_func]
+    if isinstance(backward_dw["attn"], list):
+        backward_dw["attn"].append(layer.eh_proj)
+    else:
+        backward_dw["attn"] = [backward_dw["attn"], layer.eh_proj]
+
     return forward_funcs, backward_dw
 
 
diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py
index 1344694e091..4b00fa82b73 100755
--- a/megatron/core/models/gpt/gpt_layer_specs.py
+++ b/megatron/core/models/gpt/gpt_layer_specs.py
@@ -24,6 +24,7 @@
     get_mtp_layer_spec_for_backend,
     get_mtp_num_layers_to_build,
 )
+from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.torch_norm import L2Norm
 from megatron.core.transformer.transformer_block import (
@@ -36,9 +37,10 @@
     TransformerLayerSubmodules,
     get_transformer_layer_offset,
 )
+from megatron.core.utils import is_te_min_version
 
 try:
-    import transformer_engine as te  # pylint: disable=unused-import
+    import transformer_engine as te  # type: ignore[import-untyped]  # pylint: disable=unused-import
 
     from megatron.core.extensions.transformer_engine import TEFusedMLP, TENorm
     from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
@@ -48,16 +50,13 @@
     HAVE_TE = False
 
 try:
-    import nvidia_kitchen  # pylint: disable=unused-import
+    from megatron.core.extensions.kitchen import HAVE_KITCHEN, KitchenSpecProvider
 
-    from megatron.core.extensions.kitchen import KitchenSpecProvider
-
-    HAVE_KITCHEN = True
 except ImportError:
     HAVE_KITCHEN = False
 
 try:
-    import apex  # pylint: disable=unused-import
+    import apex  # type: ignore[import-untyped]  # pylint: disable=unused-import
 
     from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
 
@@ -180,6 +179,8 @@ def get_gpt_layer_with_transformer_engine_spec(
     use_te_op_fuser: Optional[bool] = False,
     use_kitchen: bool = False,
     use_te_activation_func: bool = False,
+    use_kitchen_attention: bool = False,
+    kitchen_attention_backend: str = "sdpa",
 ) -> ModuleSpec:
     """Use this spec to use lower-level Transformer Engine modules (required for fp8 training).
 
@@ -188,6 +189,7 @@ def get_gpt_layer_with_transformer_engine_spec(
         num_experts (int, optional): Number of experts. Defaults to None.
         moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
         qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
+        multi_latent_attention (bool, optional): To use MLA. Defaults to False.
         fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
         moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
                                                       Defaults to False.
@@ -207,7 +209,11 @@ def get_gpt_layer_with_transformer_engine_spec(
 
     if use_kitchen:
         assert HAVE_KITCHEN
-        backend: BackendSpecProvider = KitchenSpecProvider(fallback=TESpecProvider())
+        backend: BackendSpecProvider = KitchenSpecProvider(
+            fallback=TESpecProvider(),
+            use_kitchen_attention=use_kitchen_attention,
+            kitchen_attention_backend=kitchen_attention_backend,
+        )
         if use_te_op_fuser:
             raise AssertionError("use_te_op_fuser not compatible with using kitchen in mlp.")
         if use_te_activation_func:
@@ -307,6 +313,8 @@ def get_gpt_layer_local_spec(
     normalization: Optional[str] = None,
     qk_l2_norm: Optional[bool] = False,
     use_kitchen: bool = False,
+    use_kitchen_attention: bool = False,
+    kitchen_attention_backend: str = "sdpa",
 ) -> ModuleSpec:
     """Use this spec for an implementation using only modules in Megatron-Core.
 
@@ -315,6 +323,7 @@ def get_gpt_layer_local_spec(
         num_experts (int, optional): Number of experts. Defaults to None.
         moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
         qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
+        multi_latent_attention (bool, optional): To use MLA. Defaults to False.
         fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
         moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
                                                       Defaults to False.
@@ -326,7 +335,11 @@ def get_gpt_layer_local_spec(
 
     if use_kitchen:
         assert HAVE_KITCHEN
-        backend = KitchenSpecProvider(fallback=LocalSpecProvider())
+        backend = KitchenSpecProvider(
+            fallback=LocalSpecProvider(),
+            use_kitchen_attention=use_kitchen_attention,
+            kitchen_attention_backend=kitchen_attention_backend,
+        )
     else:
         backend = LocalSpecProvider()
     # Adjust for RMS norm.
@@ -501,14 +514,13 @@ def get_mlp_module_spec_for_backend(
         )
 
 
-def get_gpt_decoder_block_spec(
+def get_gpt_decoder_layer_specs(
     config: TransformerConfig,
     use_transformer_engine: bool,
     normalization: Optional[str] = None,
     qk_l2_norm: Optional[bool] = False,
     vp_stage: Optional[int] = None,
     pp_rank: Optional[int] = None,
-    is_dualpipev_first_chunk: Optional[bool] = False,
 ) -> TransformerBlockSubmodules:
     """GPT block spec."""
     if use_transformer_engine:
@@ -522,6 +534,8 @@ def get_gpt_decoder_block_spec(
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
         moe_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             num_experts=config.num_moe_experts,
@@ -532,6 +546,8 @@ def get_gpt_decoder_block_spec(
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
     else:
         layer_norm_impl = LNImpl
@@ -544,6 +560,8 @@ def get_gpt_decoder_block_spec(
             normalization=normalization,
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
         moe_layer_spec = get_gpt_layer_local_spec(
             num_experts=config.num_moe_experts,
@@ -554,6 +572,8 @@ def get_gpt_decoder_block_spec(
             normalization=normalization,
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
+            use_kitchen_attention=config.use_kitchen_attention,
+            kitchen_attention_backend=config.kitchen_attention_backend,
         )
 
     # Parse config.moe_layer_freq to determine the pattern of expert/dense layers.
@@ -586,15 +606,33 @@ def get_gpt_decoder_block_spec(
         else:
             raise ValueError(f"Invalid layer pattern: {moe_layer_pattern}")
 
+    return layer_specs
+
+
+def get_gpt_decoder_block_spec(
+    config: TransformerConfig,
+    use_transformer_engine: bool,
+    normalization: Optional[str] = None,
+    qk_l2_norm: Optional[bool] = False,
+    vp_stage: Optional[int] = None,
+    pp_rank: Optional[int] = None,
+    is_dualpipev_first_chunk: Optional[bool] = False,
+) -> TransformerBlockSubmodules:
+    """GPT block spec."""
+    layer_specs = get_gpt_decoder_layer_specs(
+        config, use_transformer_engine, normalization, qk_l2_norm
+    )
     # Slice the layer specs to only include the layers that are built in this pipeline stage.
     # Note: MCore layer_number starts at 1
     ######### FlagScale Modify ########
     num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank, is_dualpipev_first_chunk=is_dualpipev_first_chunk)
 
     if config.pipeline_model_parallel_layout is not None:
+        layout = config.pipeline_model_parallel_layout
+        assert isinstance(layout, PipelineParallelLayerLayout)
         local_layer_specs = [
             layer_specs[layer_id]
-            for layer_id in config.pipeline_model_parallel_layout.get_layer_id_list(
+            for layer_id in layout.get_layer_id_list(
                 layer_type=LayerType.decoder, vp_stage=vp_stage, pp_rank=pp_rank
             )
         ]
@@ -603,6 +641,10 @@ def get_gpt_decoder_block_spec(
         offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank, is_dualpipev_first_chunk=is_dualpipev_first_chunk)
         local_layer_specs = layer_specs[offset : offset + num_layers_to_build]
 
+    if use_transformer_engine:
+        layer_norm_impl = TENorm
+    else:
+        layer_norm_impl = LNImpl
     # Block spec.
     block_spec = TransformerBlockSubmodules(
         layer_specs=local_layer_specs, layer_norm=layer_norm_impl
@@ -621,13 +663,21 @@ def get_gpt_mtp_block_spec(
     """GPT Multi-Token Prediction (MTP) block spec."""
     if use_transformer_engine:
         backend: BackendSpecProvider = (
-            KitchenSpecProvider(fallback=TESpecProvider())
+            KitchenSpecProvider(
+                fallback=TESpecProvider(),
+                use_kitchen_attention=config.use_kitchen_attention,
+                kitchen_attention_backend=config.kitchen_attention_backend,
+            )
             if config.use_kitchen
             else TESpecProvider()
         )
     else:
         backend = (
-            KitchenSpecProvider(fallback=LocalSpecProvider())
+            KitchenSpecProvider(
+                fallback=LocalSpecProvider(),
+                use_kitchen_attention=config.use_kitchen_attention,
+                kitchen_attention_backend=config.kitchen_attention_backend,
+            )
             if config.use_kitchen
             else LocalSpecProvider()
         )
@@ -662,13 +712,13 @@ def get_gpt_mtp_block_spec_for_backend(
     mtp_num_layers = config.mtp_num_layers if config.mtp_num_layers else 0
     mtp_layer_specs = [mtp_layer_spec] * mtp_num_layers
 
-    offset = get_mtp_layer_offset(config)
+    offset = get_mtp_layer_offset(config, vp_stage=vp_stage)
     # split the mtp layer specs to only include the layers that are built in this pipeline stage.
     mtp_layer_specs = mtp_layer_specs[offset : offset + num_layers_to_build]
     if len(mtp_layer_specs) > 0:
         assert (
             len(mtp_layer_specs) == config.mtp_num_layers
-        ), +f"currently all of the mtp layers must stage in the same pipeline stage."
+        ), f"currently all of the mtp layers must stage in the same pipeline stage."
         mtp_block_spec = MultiTokenPredictionBlockSubmodules(layer_specs=mtp_layer_specs)
     else:
         mtp_block_spec = None
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 25546d36629..e287344c13d 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from collections import OrderedDict
 from typing import Dict, Literal, Optional
@@ -18,22 +18,28 @@
 )
 from megatron.core.models.common.language_module.language_module import LanguageModule
 from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.quantization.utils import get_quant_config_or_none
 from megatron.core.tensor_parallel import gather_from_sequence_parallel_region
-from megatron.core.transformer.enums import ModelType
+from megatron.core.transformer.enums import CudaGraphScope, ModelType
 from megatron.core.transformer.multi_token_prediction import (
     MTPLossAutoScaler,
     MTPLossLoggingHelper,
     MultiTokenPredictionBlock,
     roll_tensor,
-    tie_output_layer_state_dict,
     tie_word_embeddings_state_dict,
 )
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_block import TransformerBlock
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import WrappedTensor, deprecate_inference_params
+from megatron.core.utils import (
+    WrappedTensor,
+    deprecate_inference_params,
+    is_using_quantization_scales,
+)
 
 
 class GPTModel(LanguageModule):
@@ -117,6 +123,7 @@ def __init__(
         self.parallel_output = parallel_output
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
         self.vp_stage = vp_stage
+        self.disable_param_offloading = True
 
         if hasattr(self.config, 'position_embedding_type'):
             self.position_embedding_type = self.config.position_embedding_type
@@ -246,7 +253,7 @@ def __init__(
                 tp_group=self.pg_collection.tp,
             )
 
-        if self.pre_process or self.post_process:
+        if self.pre_process or self.post_process or self.mtp_process:
             self.setup_embeddings_and_output_layer()
 
         if has_config_logger_enabled(self.config):
@@ -281,6 +288,7 @@ def _preprocess(
         decoder_input: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
+        padding_mask: Optional[Tensor] = None,
     ):
         """Preprocesses inputs for the transformer decoder.
 
@@ -297,7 +305,20 @@ def _preprocess(
         if decoder_input is not None:
             pass
         elif self.pre_process:
+            if padding_mask is not None:
+                assert padding_mask.shape == input_ids.shape, (
+                    f"padding_mask shape {padding_mask.shape} does not match "
+                    f"input_ids shape {input_ids.shape}"
+                )
             decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
+            if padding_mask is not None and self.config.sequence_parallel:
+                padding_mask = (
+                    tensor_parallel.scatter_to_sequence_parallel_region(
+                        padding_mask.transpose(0, 1).contiguous()
+                    )
+                    .transpose(0, 1)
+                    .contiguous()
+                )
         else:
             # intermediate stage of pipeline
             # decoder will get hidden_states from encoder.input_tensor
@@ -344,13 +365,19 @@ def _preprocess(
                     rotary_seq_len,
                     packed_seq=packed_seq_params is not None
                     and packed_seq_params.qkv_format == 'thd',
+                    cp_group=packed_seq_params.cp_group if packed_seq_params is not None else None,
                 )
         elif self.position_embedding_type == 'yarn':
             if self.training or not self.config.flash_decode:
                 rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
                     inference_context, self.decoder, decoder_input, self.config, packed_seq_params
                 )
-                rotary_pos_emb, _ = self.rotary_pos_emb(rotary_seq_len)
+                rotary_pos_emb, _ = self.rotary_pos_emb(
+                    rotary_seq_len,
+                    packed_seq=packed_seq_params is not None
+                    and packed_seq_params.qkv_format == 'thd',
+                    cp_group=packed_seq_params.cp_group if packed_seq_params is not None else None,
+                )
             else:
                 raise NotImplementedError(
                     "Flash decoding uses precomputed cos and sin for RoPE, not implemented in "
@@ -358,7 +385,11 @@ def _preprocess(
                 )
         elif self.position_embedding_type == 'mrope' and not self.config.multi_latent_attention:
             if self.training or not self.config.flash_decode:
-                rotary_pos_emb = self.rotary_pos_emb(position_ids, self.mrope_section)
+                rotary_pos_emb = self.rotary_pos_emb(
+                    position_ids,
+                    self.mrope_section,
+                    cp_group=packed_seq_params.cp_group if packed_seq_params is not None else None,
+                )
             else:
                 # Flash decoding uses precomputed cos and sin for RoPE
                 raise NotImplementedError(
@@ -371,7 +402,7 @@ def _preprocess(
             and (
                 (
                     self.config.cuda_graph_impl == "local"
-                    and self.config.cuda_graph_scope != "full_iteration"
+                    and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope
                 )
                 or self.config.flash_decode
             )
@@ -386,11 +417,19 @@ def _preprocess(
         else:
             sequence_len_offset = None
 
-        # Wrap decoder_input to allow the decoder (TransformerBlock) to delete the
-        # reference held by this caller function, enabling early garbage collection for
-        # inference. Skip wrapping if decoder_input is logged after decoder completion.
-        if in_inference_mode and not has_config_logger_enabled(self.config):
-            decoder_input = WrappedTensor(decoder_input)
+        if in_inference_mode:
+            # Clear the outputs for padding tokens when using dynamic batching with
+            # quantization scales to avoid corrupting amax calculations
+            if inference_context.is_dynamic_batching() and is_using_quantization_scales(
+                self.config
+            ):
+                decoder_input[inference_context.padding_slice] = 0.0
+
+            # Wrap decoder_input to allow the decoder (TransformerBlock) to delete the
+            # reference held by this caller function, enabling early garbage collection for
+            # inference. Skip wrapping if decoder_input is logged after decoder completion.
+            if not has_config_logger_enabled(self.config):
+                decoder_input = WrappedTensor(decoder_input)
 
         preproc_output = (
             decoder_input,
@@ -398,6 +437,7 @@ def _preprocess(
             rotary_pos_cos,
             rotary_pos_sin,
             sequence_len_offset,
+            padding_mask,
         )
         if rotary_pos_cos_sin is not None:
             # only in the case of flashinfer fused rope will we
@@ -409,6 +449,24 @@ def _preprocess(
 
         return preproc_output
 
+    def preprocess_for_fine_grained_offloading(self):
+        """Preprocess for fine-grained activation offloading."""
+        off_interface.init_chunk_handler(
+            vp_size=self.config.virtual_pipeline_model_parallel_size,
+            vp_stage=self.vp_stage,
+            min_offloaded_tensor_size=self.config.min_offloaded_tensor_size,
+        )
+        if self.disable_param_offloading:
+            for param in self.decoder.parameters():
+                off_interface.mark_not_offloadable(param)
+            if self.mtp_process:
+                for param in self.mtp.parameters():
+                    off_interface.mark_not_offloadable(param)
+            if self.post_process:
+                for param in self.output_layer.parameters():
+                    off_interface.mark_not_offloadable(param)
+            self.disable_param_offloading = False
+
     def forward(
         self,
         input_ids: Tensor,
@@ -423,6 +481,7 @@ def forward(
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """Forward function of the GPT Model This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -433,7 +492,12 @@ def forward(
         Args:
             runtime_gather_output (bool): Gather output at runtime. Default None means
                 `parallel_output` arg in the constructor will be used.
+            padding_mask (Tensor, optional): Padding mask for MoE routing.
+                Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
+                Only used for MoE layers to exclude padding tokens from routing computations.
         """
+        if self.config.fine_grained_activation_offloading:
+            self.preprocess_for_fine_grained_offloading()
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
@@ -443,13 +507,19 @@ def forward(
             decoder_input=decoder_input,
             inference_context=inference_context,
             packed_seq_params=packed_seq_params,
+            padding_mask=padding_mask,
         )
 
-        (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = (
-            preproc_output[:5]
-        )
+        (
+            decoder_input,
+            rotary_pos_emb,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            sequence_len_offset,
+            padding_mask,
+        ) = preproc_output[:6]
 
-        rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None
+        rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None
 
         # Run decoder.
         hidden_states = self.decoder(
@@ -462,6 +532,7 @@ def forward(
             rotary_pos_cos_sin=rotary_pos_cos_sin,
             packed_seq_params=packed_seq_params,
             sequence_len_offset=sequence_len_offset,
+            padding_mask=padding_mask,
             **(extra_block_kwargs or {}),
         )
 
@@ -518,7 +589,6 @@ def _postprocess(
         output_weight = None
         if self.share_embeddings_and_output_weights:
             output_weight = self.shared_embedding_or_output_weight()
-
         if mtp_in_postprocess:
             hidden_states = self.mtp(
                 input_ids=input_ids,
@@ -538,7 +608,7 @@ def _postprocess(
         if not self.post_process:
             return hidden_states
 
-        if self.mtp_process:
+        if self.config.mtp_num_layers is not None:
             mtp_labels = labels.clone()
             hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0)
             hidden_states = hidden_states_list[0]
@@ -553,9 +623,19 @@ def _postprocess(
                     runtime_gather_output=runtime_gather_output,
                 )
                 # Calc loss for the current Multi-Token Prediction (MTP) layers.
-                mtp_labels, _ = roll_tensor(mtp_labels, shifts=-1, dims=-1, cp_group=self.cp_group)
+                mtp_labels, _ = roll_tensor(
+                    mtp_labels,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
+                )
                 loss_mask, num_tokens = roll_tensor(
-                    loss_mask, shifts=-1, dims=-1, cp_group=self.cp_group
+                    loss_mask,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
                 )
                 mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits)
                 mtp_loss = loss_mask * mtp_loss
@@ -580,6 +660,7 @@ def _postprocess(
                         hidden_states, mtp_loss_scale * mtp_loss / num_tokens
                     )
         sequence_parallel_override = False
+
         if in_inference_mode and inference_context.materialize_only_last_token_logits:
             if inference_context.is_static_batching():
                 hidden_states = hidden_states[-1:, :, :]
@@ -668,6 +749,7 @@ def build_schedule_plan(
         runtime_gather_output: Optional[bool] = None,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
     ):
         """Builds a computation schedule plan for the model.
 
@@ -693,11 +775,15 @@ def build_schedule_plan(
             inference_params (InferenceParams, optional):
                 Parameters for inference. Defaults to None.
             loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None.
+            padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None.
 
         Returns:
             TransformerModelChunkSchedulePlan: The model chunk schedule plan.
         """
 
+        if self.config.fine_grained_activation_offloading:
+            self.preprocess_for_fine_grained_offloading()
+
         from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan
 
         return TransformerModelChunkSchedulePlan(
@@ -711,6 +797,7 @@ def build_schedule_plan(
             extra_block_kwargs,
             runtime_gather_output,
             loss_mask,
+            padding_mask,
         )
 
     def sharded_state_dict(
@@ -739,27 +826,20 @@ def sharded_state_dict(
             output_extra_state and output_extra_state.data
         ), f'Expected output layer extra state to be empty, got: {output_extra_state}'
 
-        # Multi-Token Prediction (MTP) need both embedding layer and output layer in
-        # mtp process stage.
+        # Multi-Token Prediction (MTP) need embedding layer in mtp process stage.
         # If MTP is not placed in the pre processing stage, we need to maintain a copy of
         # embedding layer in the mtp process stage and tie it to the embedding in the pre
         # processing stage.
-        # Also, if MTP is not placed in the post processing stage, we need to maintain a copy
-        # of output layer in the mtp process stage and tie it to the output layer in the post
-        # processing stage.
+        # Now MTP loss is computed in post processing stage, so the output_layer is not needed.
         if self.mtp_process and not self.pre_process:
             emb_weight_key = f'{prefix}embedding.word_embeddings.weight'
             emb_weight = self.embedding.word_embeddings.weight
-            tie_word_embeddings_state_dict(sharded_state_dict, emb_weight, emb_weight_key)
-        if self.mtp_process and not self.post_process:
-            # We only need to tie the output layer weight if share_embeddings_and_output_weights
-            # is False. Because if share_embeddings_and_output_weights is True, the shared weight
-            # will be stored in embedding layer, and output layer will not have any weight.
-            if not self.share_embeddings_and_output_weights:
-                output_layer_weight_key = f'{prefix}output_layer.weight'
-                output_layer_weight = self.output_layer.weight
-                tie_output_layer_state_dict(
-                    sharded_state_dict, output_layer_weight, output_layer_weight_key
-                )
+            tie_word_embeddings_state_dict(
+                sharded_state_dict,
+                emb_weight,
+                emb_weight_key,
+                tp_group=self.tp_group,
+                dp_cp_group=metadata['dp_cp_group'],
+            )
 
         return sharded_state_dict
diff --git a/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py b/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py
index b1c2fb79a11..5e9687b09a3 100644
--- a/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py
+++ b/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py
@@ -27,6 +27,7 @@
     TransformerLayerSubmodules,
     get_transformer_layer_offset,
 )
+from megatron.core.typed_torch import not_none
 from megatron.core.utils import is_te_min_version
 
 try:
@@ -44,6 +45,13 @@
 
     HAVE_TE = True
 except ImportError:
+    (
+        TEDotProductAttention,
+        TELayerNormColumnParallelLinear,
+        TENorm,
+        TERowParallelLinear,
+        TELayerNormColumnParallelLinearGathered,
+    ) = (None, None, None, None, None)
     HAVE_TE = False
 
 from megatron.core.transformer.torch_norm import WrappedTorchNorm
@@ -110,8 +118,10 @@ def _get_heterogenous_attention_spec(
             module=SelfAttention,
             params={"attn_mask_type": AttnMaskType.causal},
             submodules=SelfAttentionSubmodules(
-                linear_qkv=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear,
-                core_attention=TEDotProductAttention if use_te else DotProductAttention,
+                linear_qkv=(
+                    not_none(TELayerNormColumnParallelLinear) if use_te else ColumnParallelLinear
+                ),
+                core_attention=not_none(TEDotProductAttention) if use_te else DotProductAttention,
                 linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
                 q_layernorm=ln,
                 k_layernorm=ln,
diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py
index 4f090e4a024..62ee4537cfc 100755
--- a/megatron/core/models/gpt/moe_module_specs.py
+++ b/megatron/core/models/gpt/moe_module_specs.py
@@ -57,10 +57,12 @@ def get_moe_module_spec_for_backend(
     experts = ModuleSpec(module=expert_module, submodules=expert_submodule)
 
     # shared experts spec
-    shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp)
+    shared_experts = ModuleSpec(module=SharedExpertMLP, submodules=mlp)
 
     # MoE module spec
     moe_module_spec = ModuleSpec(
-        module=MoELayer, submodules=MoESubmodules(experts=experts, shared_experts=shared_experts)
+        module=MoELayer,
+        submodules=MoESubmodules(experts=experts, shared_experts=shared_experts),
+        metainfo={"fuse_pre_mlp_layernorm": False},
     )
     return moe_module_spec
diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py
index bfe38c2bbc8..b87124bab1d 100755
--- a/megatron/core/models/mamba/mamba_layer_specs.py
+++ b/megatron/core/models/mamba/mamba_layer_specs.py
@@ -12,11 +12,19 @@
 from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
 from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
 from megatron.core.ssm.mlp_layer import MLPLayer
+from megatron.core.tensor_parallel import (
+    InferenceLayerNormColumnParallelLinear,
+    InferenceRowParallelLinear,
+)
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.spec_utils import ModuleSpec
-from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from megatron.core.transformer.transformer_layer import (
+    MoETransformerLayer,
+    TransformerLayer,
+    TransformerLayerSubmodules,
+)
 
 moe = get_moe_module_spec(
     use_te=True,
@@ -73,6 +81,65 @@
                 mlp_bda=get_bias_dropout_add,
             ),
         ),
+        moe_layer=ModuleSpec(
+            module=MoETransformerLayer,
+            submodules=TransformerLayerSubmodules(
+                pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add
+            ),
+        ),
+    ),
+)
+
+mamba_inference_stack_spec = ModuleSpec(
+    module=MambaStack,
+    submodules=MambaStackSubmodules(
+        mamba_layer=ModuleSpec(
+            module=MambaLayer,
+            submodules=MambaLayerSubmodules(
+                mixer=ModuleSpec(
+                    module=MambaMixer,
+                    submodules=MambaMixerSubmodules(
+                        in_proj=InferenceLayerNormColumnParallelLinear,
+                        out_proj=InferenceRowParallelLinear,
+                    ),
+                ),
+                mamba_bda=get_bias_dropout_add,
+            ),
+        ),
+        # Started with spec from gpt_layer_specs.py (with MLP removed)
+        # Using the TE spec because we had problems getting the non-TE spec
+        # working
+        attention_layer=ModuleSpec(
+            module=TransformerLayer,
+            submodules=TransformerLayerSubmodules(
+                self_attention=ModuleSpec(
+                    module=SelfAttention,
+                    params={"attn_mask_type": AttnMaskType.causal},
+                    submodules=SelfAttentionSubmodules(
+                        linear_qkv=InferenceLayerNormColumnParallelLinear,
+                        core_attention=TEDotProductAttention,
+                        linear_proj=InferenceRowParallelLinear,
+                    ),
+                ),
+                self_attn_bda=get_bias_dropout_add,
+            ),
+        ),
+        # Started with spec from gpt_layer_specs.py
+        # Using the TE spec because we had problems getting the non-TE spec
+        # working
+        mlp_layer=ModuleSpec(
+            module=MLPLayer,
+            submodules=TransformerLayerSubmodules(
+                mlp=ModuleSpec(
+                    module=MLP,
+                    submodules=MLPSubmodules(
+                        linear_fc1=InferenceLayerNormColumnParallelLinear,
+                        linear_fc2=InferenceRowParallelLinear,
+                    ),
+                ),
+                mlp_bda=get_bias_dropout_add,
+            ),
+        ),
         moe_layer=ModuleSpec(
             # TODO (rwaleffe): change this to be an "MoELayer" to work with CudaGraphs?
             module=TransformerLayer,
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 378cf7e47d6..8d45e1d0147 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -10,13 +10,18 @@
 from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.models.common.language_module.language_module import LanguageModule
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.quantization.utils import get_quant_config_or_none
 from megatron.core.tensor_parallel import gather_from_sequence_parallel_region
 from megatron.core.transformer import TransformerConfig
 from megatron.core.transformer.enums import ModelType
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
-from megatron.core.utils import WrappedTensor, deprecate_inference_params
+from megatron.core.utils import (
+    WrappedTensor,
+    deprecate_inference_params,
+    is_using_quantization_scales,
+)
 
 
 class MambaModel(LanguageModule):
@@ -179,6 +184,8 @@ def forward(
         runtime_gather_output: Optional[bool] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """Forward function of the Mamba model. This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -201,6 +208,15 @@ def forward(
             pass
         elif self.pre_process:
             decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
+
+            # Clear the outputs for padding tokens when using dynamic batching with
+            # quantization scales to avoid corrupting amax calculations
+            if (
+                in_inference_mode
+                and inference_context.is_dynamic_batching()
+                and is_using_quantization_scales(self.config)
+            ):
+                decoder_input[inference_context.padding_slice] = 0.0
         else:
             # intermediate stage of pipeline
             # decoder will get hidden_states from encoder.input_tensor
@@ -209,9 +225,12 @@ def forward(
         rotary_pos_emb = None
         if self.position_embedding_type == 'rope':
             rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
-                inference_context, self.decoder, decoder_input, self.config
+                inference_context, self.decoder, decoder_input, self.config, packed_seq_params
+            )
+            rotary_pos_emb = self.rotary_pos_emb(
+                rotary_seq_len,
+                packed_seq=packed_seq_params is not None and packed_seq_params.qkv_format == 'thd',
             )
-            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
 
         # Wrap decoder_input to allow the decoder (MambaBlock) to delete the
         # reference held by this caller function, enabling early garbage collection
@@ -235,6 +254,8 @@ def forward(
             attention_mask=attention_mask,
             inference_context=inference_context,
             rotary_pos_emb=rotary_pos_emb,
+            packed_seq_params=packed_seq_params,
+            padding_mask=padding_mask,
         )
 
         if not self.post_process:
diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py
index 2ac2657c1cd..af0bcf6e9fd 100644
--- a/megatron/core/models/multimodal/llava_model.py
+++ b/megatron/core/models/multimodal/llava_model.py
@@ -124,6 +124,7 @@ def __init__(
         max_num_tiles: int = 0,
         tokenizer_type: str = "",
         vp_stage: Optional[int] = None,
+        use_vision_backbone_fp8_arch: bool = False,
     ) -> None:
         super().__init__(config=language_transformer_config)
 
@@ -295,7 +296,7 @@ def __init__(
                     ln_post_impl = None
                     use_mask_token = False
 
-                if vision_transformer_config.fp8:
+                if vision_transformer_config.fp8 or use_vision_backbone_fp8_arch:
                     # FP8 padding for final sequence length to be a multiple of 16 or 32.
                     class_token_len = 32 if vision_transformer_config.fp8_recipe == "mxfp8" else 16
 
@@ -923,27 +924,16 @@ def forward(
                 )
             )
 
-        if isinstance(self.language_model, MambaModel):
-            output = self.language_model(
-                input_ids=None,
-                position_ids=None,
-                attention_mask=attention_mask,
-                decoder_input=combined_embeddings,
-                labels=new_labels,
-                inference_context=inference_context,
-                runtime_gather_output=runtime_gather_output,
-            )
-        else:
-            output = self.language_model(
-                input_ids=None,
-                position_ids=None,
-                attention_mask=attention_mask,
-                decoder_input=combined_embeddings,
-                labels=new_labels,
-                inference_context=inference_context,
-                runtime_gather_output=runtime_gather_output,
-                packed_seq_params=packed_seq_params,
-            )
+        output = self.language_model(
+            input_ids=None,
+            position_ids=None,
+            attention_mask=attention_mask,
+            decoder_input=combined_embeddings,
+            labels=new_labels,
+            inference_context=inference_context,
+            runtime_gather_output=runtime_gather_output,
+            packed_seq_params=packed_seq_params,
+        )
 
         return output, new_loss_mask
 
diff --git a/megatron/core/models/retro/__init__.py b/megatron/core/models/retro/__init__.py
deleted file mode 100644
index ea7cea6d8fb..00000000000
--- a/megatron/core/models/retro/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-Exports:
-
-  - RetroConfig: configuration dataclass for RetroModel.
-  - RetroModel: The Retro model.
-  - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block.
-"""
-
-from .config import RetroConfig
-from .decoder_spec import get_retro_decoder_block_spec
-from .model import RetroModel
diff --git a/megatron/core/models/retro/base_attention.py b/megatron/core/models/retro/base_attention.py
deleted file mode 100644
index fa07a51ddcd..00000000000
--- a/megatron/core/models/retro/base_attention.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Base class for decoder and encoder attention modules."""
-
-from megatron.core.models.retro.config import RetroConfig
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
-from megatron.core.transformer.enums import AttnMaskType
-from megatron.core.transformer.module import MegatronModule
-
-
-class BaseRetroCrossAttention(MegatronModule):
-    """Base class for Retro cross attention, for both encoder & decoder layers.
-
-    This class collects the retro arguments below (i.e., num neighbors, chunk
-    length, and retrieve length) for use in Retro's custom cross attention
-    operators.
-
-    Args:
-        config (RetroConfig): Retro config.
-        submodules (CrossAttentionSubmodules): Cross attention submodules.
-        layer_number (int): Layer number within transformer block.
-        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-        pg_collection (ProcessGroupCollection): Model communication process groups.
-    """
-
-    def __init__(
-        self,
-        config: RetroConfig,
-        submodules: CrossAttentionSubmodules,
-        layer_number: int = 1,
-        attn_mask_type: AttnMaskType = AttnMaskType.padding,
-        pg_collection: ProcessGroupCollection = None,
-    ):
-        super().__init__(config=config)
-
-        self.attn = CrossAttention(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-            pg_collection=pg_collection,
-        )
-
-        self.retro_num_neighbors = config.retro_num_neighbors
-        self.retro_chunk_length = config.retro_chunk_length
-        self.retro_retrieved_length = config.retro_retrieved_length
diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py
deleted file mode 100644
index 1b486767264..00000000000
--- a/megatron/core/models/retro/config.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
-"""Configuration dataclass for a RetroModel."""
-
-import os
-from dataclasses import dataclass
-
-from megatron.core.transformer import TransformerConfig
-from megatron.core.transformer.enums import AttnBackend
-from megatron.core.utils import is_te_min_version
-
-
-@dataclass
-class RetroConfig(TransformerConfig):
-    """Configuration object for Retro models."""
-
-    # Retro.
-    retro_project_dir: str = None
-    """Retro project directory, which contains the preprocessed data for for pretraining. This
-       directory is built during preprocessing (see tools/retro/README.md), and contains
-       subdirectories for the chunk database and pretraining neighbors.
-    """
-
-    retro_block_size: int = None
-    """Number of records to load per data file, as saved during preprocessing. Block processing is
-       used for efficient data preprocessing.
-    """
-
-    retro_chunk_length: int = None
-    """Chunk length used for performing chunked- cross-attention (CCA)."""
-
-    retro_encoder_num_layers: int = 2
-    """Number of layers to use for the retrieval encoder."""
-
-    retro_encoder_hidden_dropout: float = 0.1
-    """Hidden dropout for retrieval encoder."""
-
-    retro_encoder_attention_dropout: float = 0.1
-    """Attention dropout for retrieval encoder."""
-
-    retro_neighbor_dirs: dict = None
-    """Directory names of saved neighbor id files for train, valid, and test datasets."""
-
-    retro_num_neighbors: int = 2
-    """Number of neighbors to retrieve during pretraining."""
-
-    retro_num_retrieved_chunks: int = 2
-    """Number of chunks to retrieve from the retrieval database."""
-
-    retro_retrieved_length: int = None
-    """Cached value of retro_num_retrieved_chunks * retro_chunk_length (i.e., the total number of
-       retrieved tokens; neighbor + continuation).
-    """
-
-    retro_split_preprocessing: str = None
-    """Data split used during data preprocessing."""
-
-    retro_verify_neighbor_count: bool = True
-    """Verify that len(GPT dataset) == len(saved neighbors)."""
-
-    def __post_init__(self) -> None:
-        """Validate Retro config."""
-
-        super().__post_init__()
-
-        self.attention_backend = AttnBackend.unfused
-
-        # Validate Transformer Engine version.
-        if is_te_min_version("1.3"):
-            try:
-                assert os.getenv("NVTE_FLASH_ATTN") == "0"
-                assert os.getenv("NVTE_FUSED_ATTN") == "0"
-            except Exception as e:
-                raise Exception(
-                    "When using Transformer Engine >= 1.3, environment vars NVTE_FLASH_ATTN "
-                    "and NVTE_FUSED_ATTN most both be defined and set to '0'. "
-                    "Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s."
-                    % (
-                        os.getenv("NVTE_FLASH_ATTN", "[unset]"),
-                        os.getenv("NVTE_FUSED_ATTN", "[unset]"),
-                    )
-                )
-
-        # Preprocessing split should be defined.
-        assert self.retro_split_preprocessing is not None
-
-        # Pre-compute retrieved length.
-        self.retro_retrieved_length = self.retro_num_retrieved_chunks * self.retro_chunk_length
diff --git a/megatron/core/models/retro/decoder_attention.py b/megatron/core/models/retro/decoder_attention.py
deleted file mode 100644
index 5aedb053112..00000000000
--- a/megatron/core/models/retro/decoder_attention.py
+++ /dev/null
@@ -1,319 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
-"""Retro's cross attention modules for the decoder block."""
-
-from functools import partial
-from typing import Callable, Optional
-
-import numpy as np
-import torch
-from torch import Tensor
-
-from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron.core.inference.contexts import BaseInferenceContext
-from megatron.core.models.retro.base_attention import BaseRetroCrossAttention
-from megatron.core.models.retro.config import RetroConfig
-from megatron.core.models.retro.utils import get_all_true_mask
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer import ModuleSpec
-from megatron.core.transformer.attention import CrossAttentionSubmodules
-from megatron.core.transformer.enums import AttnMaskType
-from megatron.core.transformer.module import MegatronModule
-from megatron.core.transformer.transformer_block import TransformerBlock
-from megatron.core.utils import deprecate_inference_params
-
-
-class RetroDecoderCrossAttention(BaseRetroCrossAttention):
-    """Retro decoder's chunked cross attention operator.
-
-    See this paper for more details: https://arxiv.org/abs/2112.04426.
-    Neighboring chunks retrieved from the chunk database are used here for
-    chunked-cross attention.
-
-    ** Note about 'encoder_block_spec' **
-
-    Retro is an encoder-decoder model that uses its encoder for encoding
-    neighboring chunks that are retrieved from a chunk database. These
-    encoded neighbors are then used in the decoder stack for performing
-    chunked-cross attention (see paper link above).
-
-    In contrast to the T5 model, the encoder and decoder are computationally
-    intertwined, since the input to the encoder is the output of the self-
-    attention of the first decoder layer. As such, the encoder block itself
-    is instantiated within the first Retro decoder layer, in order to receive
-    the self-attention's output. (Note, that only the first decoder layer
-    instantiates an encoder block, and the remaining decoder layers use the
-    encoder output from the first decoder layer.)
-
-    Args:
-        config (RetroConfig): Retro config.
-        submodules (CrossAttentionSubmodules): Cross attention submodules.
-        layer_number (int): Layer number within transformer block.
-        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-        encoder_block_spec (ModuleSpec): The first Retro decoder layer is
-            provided with a transformer block spec to construct the neighbor encoder.
-        pg_collection (ProcessGroupCollection): Model communication process groups.
-    """
-
-    def __init__(
-        self,
-        config: RetroConfig,
-        submodules: CrossAttentionSubmodules,
-        layer_number: int = 1,
-        attn_mask_type: AttnMaskType = AttnMaskType.padding,
-        encoder_block_spec: ModuleSpec = None,
-        pg_collection: ProcessGroupCollection = None,
-    ):
-        super().__init__(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-            pg_collection=pg_collection,
-        )
-
-        if encoder_block_spec:
-            self.encoder = TransformerBlock(
-                config=config,
-                spec=encoder_block_spec,
-                pre_process=True,
-                post_process=False,
-                pg_collection=pg_collection,
-            )
-            # self._encoder_key = 'encoder' # ... necessary?
-        else:
-            self.encoder = None
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        key_value_states: Tensor = None,
-        inference_context: BaseInferenceContext = None,
-        # rotary_pos_emb: Tensor = None, # ... unsupported for retro.
-        *,
-        inference_params: Optional[BaseInferenceContext] = None,
-    ) -> dict:
-        """Cross attention for Retro decoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            m  : Number of tokens per chunk.
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-
-        Args:
-            hidden_states (Tensor): Transformer layer hidden states.
-            attention_mask (Tensor): Attention mask.
-            key_value_states (Tensor): Neighbor embeddings if first decoder layer,
-                else encoder output.
-            inference_context (BaseInferenceContext): Inference context.
-
-        Returns:
-            A dict consisting of the attention output and context, along with
-                other scalars necessary for performing the downstream bias-dropout-add.
-        """
-
-        # hidden_states: [ ns, bs, d ]
-        # key_value_states: [ r, k*bs*l, d ]
-
-        inference_context = deprecate_inference_params(inference_context, inference_params)
-
-        ns, bs, d = hidden_states.shape
-        l = int(np.ceil(ns / self.retro_chunk_length))
-
-        # Retrieve neighbors.
-        if self.encoder:
-            # Sequence length remainder.
-            first_ns = ns % self.retro_chunk_length
-
-            # Case 1: Sequence length not divisible by chunk length.
-            if first_ns > 0:
-                # Split sequence into first partial chunk & remaining chunks.
-                first_chunk, rest_chunk = (hidden_states[:first_ns], hidden_states[first_ns:])
-
-                # Pad partial chunk with zeros.
-                first_chunk = torch.nn.functional.pad(
-                    first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), "constant", 0
-                )
-
-                # Concatenate padded chunk with remaining chunks.
-                chunked_output = torch.cat((first_chunk, rest_chunk), dim=0)  # [ l*m, bs, d ]
-
-            # Case 2: Sequence length is divisible by chunk length.
-            else:
-                chunked_output = hidden_states  # [ l*m, bs, d ]
-
-            # Chunk & permute hidden states.
-            # - hidden_states:  [ l*m, bs, d ]
-            # - chunked_output: [ m, bs*l, d ]
-            chunked_output = (
-                chunked_output.reshape(l, self.retro_chunk_length, bs, d)
-                .permute(1, 2, 0, 3)
-                .reshape(self.retro_chunk_length, bs * l, d)
-                .contiguous()
-            )
-
-            # flash attn: [ b, h, sq, sk ]
-            # fused attn: [ b, 1, 1, sq ]
-            chunked_output_mask = get_all_true_mask(
-                size=(1, 1, chunked_output.shape[0], key_value_states.shape[0]),
-                device=chunked_output.device,
-            )
-
-            # Encode neighbors. (Note: 'key_value_states' re-assigned here.)
-            key_value_states = self.encoder(
-                hidden_states=key_value_states,
-                attention_mask=attention_mask,
-                context=chunked_output,
-                context_mask=chunked_output_mask,
-                inference_context=inference_context,
-            )  # [ r, k*bs*l, d ]
-            key_value_states = key_value_states.reshape(
-                self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d
-            )  # [ r*k, bs*l, d ]
-
-        # Attend starting at last token of first chunk.
-        pad = (ns - 1) % self.retro_chunk_length
-        attending_chunks = hidden_states[pad:]
-
-        # Pad attending tokens to sequence length.
-        padded_chunks = torch.nn.functional.pad(
-            attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), "constant", 0
-        )
-
-        # Permute attending chunks.
-        # - padded_chunks:         [ l*m, bs, d ]
-        # - padded_chunked_output: [ m, bs*l, d ] (matches 'chunked_output' above)
-        padded_chunked_output = padded_chunks.reshape(l, self.retro_chunk_length, bs, d).permute(
-            1, 2, 0, 3
-        )
-        padded_chunked_output = padded_chunked_output.reshape(
-            self.retro_chunk_length, bs * l, d
-        ).contiguous()
-
-        # flash attn: [ b, h, sq, sk ]
-        # fused attn: [ b, 1, 1, sq ]
-        padded_chunked_output_mask = get_all_true_mask(
-            size=(1, 1, padded_chunked_output.shape[0], key_value_states.shape[0]),
-            device=padded_chunked_output.device,
-        )
-
-        # Attend to encoded neighbors.
-        attention_output, attention_bias = self.attn(
-            hidden_states=padded_chunked_output,
-            attention_mask=padded_chunked_output_mask,
-            key_value_states=key_value_states,
-        )
-
-        # Return dimensions for bias-dropout step.
-        return {
-            "ns": ns,
-            "bs": bs,
-            "d": d,
-            "l": l,
-            "pad": pad,
-            "attention_output": attention_output,  # [ m, bs*l, d ]
-            "attention_bias": attention_bias,  # [ d ]
-            "context": key_value_states,  # [ r*k, bs*l, d ]
-        }
-
-
-class RetroDecoderBiasDropoutAdd(MegatronModule):
-    """Retro decoder's bias-dropout-add operator.
-
-    This operator takes care of reshaping and permuting the output from the
-    chunk dimension to the sequence dimension.
-
-    Args:
-        config (RetroConfig): Retro config.
-    """
-
-    def __init__(self, config: RetroConfig):
-        super().__init__(config=config)
-        self.retro_chunk_length = config.retro_chunk_length
-
-    @classmethod
-    def _forward(
-        cls,
-        x_with_bias: dict,
-        residual: Tensor,
-        prob: float,
-        retro_chunk_length: int,
-        bias_dropout_add: Callable,
-    ) -> Tensor:
-        """Per-chunk bias-dropout-add.
-
-        Args:
-            x_with_bias (dict): Attention output and bias, along with other Retro
-                relevant parameters.
-            residual (Tensor): Transformer layer residual.
-            prob (float): Dropout probability.
-            retro_chunk_length (int): Retro chunk length (e.g., 64).
-            bias_dropout_add (Callable): Bias-dropout-add function.
-
-        Returns:
-            Output of bias-dropout-add.
-        """
-
-        # Extract input dict.
-        ns = x_with_bias["ns"]
-        bs = x_with_bias["bs"]
-        d = x_with_bias["d"]
-        l = x_with_bias["l"]
-        pad = x_with_bias["pad"]
-        attention_output = x_with_bias["attention_output"]  # [ m, bs*l, d ]
-        attention_bias = x_with_bias["attention_bias"]  # [ d ]
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-            # Bias-dropout-add.
-            x = bias_dropout_add(
-                (
-                    attention_output,
-                    None if attention_bias is None else attention_bias.expand_as(attention_output),
-                ),
-                torch.zeros_like(attention_output),
-                prob,
-            )
-
-            # Permute chunks back to sequence dimension.
-            # 1. [ m, bs*l, d ]
-            # 2. [ m, bs, l, d ]
-            # 3. [ l, m, bs, d ]
-            # 4. [ m*l, bs, d ] == [ ns, bs, d ]
-            x = (
-                x.reshape(retro_chunk_length, bs, l, d)
-                .permute(2, 0, 1, 3)
-                .reshape(retro_chunk_length * l, bs, d)
-            )
-
-            # Prepend zeros for non-attending tokens.
-            x = torch.nn.functional.pad(x, (0, 0, 0, 0, pad, 0), "constant", 0)[
-                :ns
-            ]  # [ ns, bs, d ]
-
-            # Add residual. [ ns, bs, d ]
-            x = x + residual
-
-        # Output. [ ns, bs, d ]
-        return x
-
-    def forward(self, training: bool, fused: bool) -> partial:
-        """Retro decoder bias-dropout-add.
-
-        Args:
-            training (bool): If training, then apply dropout.
-            fused (bool): Fuse bias-dropout-add.
-
-        Returns:
-            The partial function for performing bias-dropout-add.
-        """
-        return partial(
-            self._forward,
-            retro_chunk_length=self.retro_chunk_length,
-            bias_dropout_add=get_bias_dropout_add(training, fused),
-        )
diff --git a/megatron/core/models/retro/decoder_spec.py b/megatron/core/models/retro/decoder_spec.py
deleted file mode 100644
index 6539348143f..00000000000
--- a/megatron/core/models/retro/decoder_spec.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
-"""Specs for Retro decoder."""
-
-import typing
-from typing import Optional
-
-from megatron.core.models.gpt.gpt_layer_specs import (
-    get_gpt_layer_local_spec,
-    get_gpt_layer_with_transformer_engine_spec,
-)
-from megatron.core.models.retro.config import RetroConfig
-from megatron.core.models.retro.decoder_attention import (
-    RetroDecoderBiasDropoutAdd,
-    RetroDecoderCrossAttention,
-)
-from megatron.core.models.retro.encoder_spec import get_retro_encoder_block_spec
-from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron.core.transformer import ModuleSpec
-from megatron.core.transformer.attention import CrossAttentionSubmodules
-from megatron.core.transformer.dot_product_attention import DotProductAttention
-from megatron.core.transformer.transformer_block import (
-    TransformerBlockSubmodules,
-    get_num_layers_to_build,
-)
-
-try:
-    import apex  # pylint: disable=unused-import
-
-    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
-
-    HAVE_APEX = True
-    LNImpl = FusedLayerNorm
-except ImportError:
-    import warnings
-
-    from megatron.core.transformer.torch_norm import WrappedTorchNorm
-
-    warnings.warn(f"Apex is not installed. Falling back to Torch Norm")
-    LNImpl = WrappedTorchNorm
-    HAVE_APEX = False
-
-try:
-    import transformer_engine as te  # pylint: disable=unused-import
-
-    from megatron.core.extensions.transformer_engine import (
-        TEColumnParallelLinear,
-        TEDotProductAttention,
-        TENorm,
-        TERowParallelLinear,
-    )
-
-    HAVE_TE = True
-except ImportError:
-    HAVE_TE = False
-
-
-def get_retro_decoder_layer_te_spec(
-    encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None
-) -> ModuleSpec:
-    """Retro decoder TE spec (uses Transformer Engine components).
-
-    A Retro decoder layer uses custom attention and bias-dropout-add operators
-    to perform chunked-cross attention. Additionally, the first Retro decoder
-    layer instantiates an entire encoder transformer block. As such, the decoder
-    cross attention module takes an optional encoder block spec, which is only
-    provided for the first Retro decoder layer.
-
-    Args:
-        encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for
-            the first Retro decoder layer.
-
-    Returns:
-        A module spec with Transformer Engine modules.
-    """
-    spec = get_gpt_layer_with_transformer_engine_spec()
-    spec.submodules.pre_cross_attn_layernorm = TENorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroDecoderCrossAttention,
-        params={"encoder_block_spec": encoder_block_spec},
-        submodules=CrossAttentionSubmodules(
-            linear_q=TEColumnParallelLinear,
-            linear_kv=TEColumnParallelLinear,
-            core_attention=TEDotProductAttention,
-            linear_proj=TERowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
-    return spec
-
-
-def get_retro_decoder_layer_local_spec(
-    encoder_block_spec: typing.Optional[ModuleSpec] = None,
-) -> ModuleSpec:
-    """Retro decoder local spec (uses Megatron-Core components).
-
-    A Retro decoder layer uses custom attention and bias-dropout-add operators
-    to perform chunked-cross attention. Additionally, the first Retro decoder
-    layer instantiates an entire encoder transformer block. As such, the decoder
-    cross attention module takes an optional encoder block spec, which is only
-    provided for the first Retro decoder layer.
-
-    Args:
-        encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided
-            for the first Retro decoder layer.
-
-    Returns:
-        A module spec with local modules.
-    """
-    spec = get_gpt_layer_local_spec()
-    spec.submodules.pre_cross_attn_layernorm = LNImpl
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroDecoderCrossAttention,
-        params={"encoder_block_spec": encoder_block_spec},
-        submodules=CrossAttentionSubmodules(
-            linear_q=ColumnParallelLinear,
-            linear_kv=ColumnParallelLinear,
-            core_attention=DotProductAttention,
-            linear_proj=RowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
-    return spec
-
-
-def get_retro_decoder_block_spec(
-    config: RetroConfig,
-    use_transformer_engine: bool,
-    vp_stage: Optional[int] = None,
-    pp_rank: Optional[int] = None,
-) -> TransformerBlockSubmodules:
-    """Retro decoder block spec.
-
-    Retro decoder block implementation details:
-    - The retro decoder block consists of interleaved GPT layers
-        and customized Retro decoder layers.
-    - The Retro decoder layers are spaced three layers apart,
-        and start on layer 6 or 9 (depending on the total number of layers).
-    - The first decoder layer instantiates an encoder block,
-        and it therefore passes in an encoder_block_spec.
-
-    Args:
-        config (RetroConfig): Retro config.
-        use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules.
-        vp_stage (Optional[int]): Virtual pipeline stage number.
-        pp_rank (Optional[int]): Pipeline parallel rank.
-
-    Returns:
-        Transformer block submodules for the given spec.
-    """
-
-    assert (
-        config.pipeline_model_parallel_size == 1
-    ), "retro does not currently support pipeline parallelism."
-
-    assert (
-        config.virtual_pipeline_model_parallel_size is None
-    ), "retro does not currently support virtual pipeline parallelism."
-
-    # Num layers.
-    num_layers = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank)
-
-    # Retro layer numbers.
-    retro_layer_start = 6 if num_layers <= 15 else 9
-    retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3))
-
-    # Layer specs.
-    gpt_layer_spec = (
-        get_gpt_layer_with_transformer_engine_spec()
-        if use_transformer_engine
-        else get_gpt_layer_local_spec()
-    )
-    get_retro_decoder_layer_spec = (
-        get_retro_decoder_layer_te_spec
-        if use_transformer_engine
-        else get_retro_decoder_layer_local_spec
-    )
-    retro_layer_spec = get_retro_decoder_layer_spec()
-    retro_layer_spec_with_retriever = get_retro_decoder_layer_spec(
-        get_retro_encoder_block_spec(config, use_transformer_engine)
-    )
-
-    layer_specs = []
-    for layer_number in range(1, num_layers + 1):
-        if layer_number == retro_layer_numbers[0]:
-            layer_specs.append(retro_layer_spec_with_retriever)
-        elif layer_number in retro_layer_numbers:
-            layer_specs.append(retro_layer_spec)
-        else:
-            layer_specs.append(gpt_layer_spec)
-
-    # Block spec.
-    block_spec = TransformerBlockSubmodules(layer_specs=layer_specs)
-
-    return block_spec
diff --git a/megatron/core/models/retro/encoder_attention.py b/megatron/core/models/retro/encoder_attention.py
deleted file mode 100644
index 19fdae5b250..00000000000
--- a/megatron/core/models/retro/encoder_attention.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Retro's cross attention modules for the encoder block."""
-
-from functools import partial
-from typing import Callable, List, Optional, Tuple, Type
-
-import torch
-from torch import Tensor
-
-from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron.core.inference.contexts import BaseInferenceContext
-from megatron.core.models.retro.base_attention import BaseRetroCrossAttention
-from megatron.core.models.retro.config import RetroConfig
-from megatron.core.models.retro.utils import get_all_true_mask
-from megatron.core.transformer.module import MegatronModule
-from megatron.core.utils import deprecate_inference_params
-
-
-class RetroEncoderCrossAttention(BaseRetroCrossAttention):
-    """Retro encoder's cross attention operator.
-
-    See this paper for more details: https://arxiv.org/abs/2112.04426.
-    Neighboring chunks are retrieved from the chunk database, encoded, and
-    used by the decoder layers for chunked cross attention.
-
-    Args:
-        config (RetroConfig): Retro config.
-        submodules (CrossAttentionSubmodules): Cross attention submodules.
-        layer_number (int): Layer number within transformer block.
-        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-    """
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        key_value_states: Tensor = None,
-        inference_context: BaseInferenceContext = None,
-        # rotary_pos_emb: Tensor = None, # unsupported for retro.
-        *,
-        inference_params: Optional[BaseInferenceContext] = None,
-    ) -> List[Tuple[Tensor, Optional[Tensor], Tensor]]:
-        """Cross attention for Retro encoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-
-        Args:
-            hidden_states (Tensor): Transformer layer hidden states.
-            attention_mask (Tensor): Attention mask.
-            key_value_states (Tensor): Neighbor embeddings.
-            inference_context (BaseInferenceContext): Inference context.
-
-        Returns:
-            List of tuples, where each tuple is (attention_output, attention_bias, residual).
-        """
-
-        inference_context = deprecate_inference_params(inference_context, inference_params)
-
-        # Input shape. [ r, bs*l*k, d ]
-        ns, bs, d = hidden_states.shape
-
-        # Reshape sequence into neighboring chunks.
-        # - hidden_states:   [ r, bs*l*k, d ]
-        # - chunked_outputs: [ r, bs*l, k, d ]
-        chunked_outputs = hidden_states.reshape(
-            self.retro_retrieved_length, -1, self.retro_num_neighbors, d
-        )
-
-        # flash attn: [ b, h, sq, sk ]
-        # fused attn: [ b, 1, 1, sq ]
-        chunked_output_mask = get_all_true_mask(
-            size=(1, 1, chunked_outputs.shape[0], key_value_states.shape[0]),
-            device=chunked_outputs.device,
-        )
-
-        # Per-chunk attention.
-        attention_output_tuples = []
-        for k in range(self.retro_num_neighbors):
-
-            # Attend to current neighboring chunks.
-            # - chunked_output:   [ r, bs*l, d ]
-            # - key_value_states: [ m, bs*l, d ]
-            # - attention_output: [ r, bs*l, d ]
-            # - attention_bias:   [ d ]
-            chunked_output = chunked_outputs[:, :, k].contiguous()
-            attention_output, attention_bias = self.attn(
-                hidden_states=chunked_output,  # Q (neighbor embedding)
-                attention_mask=chunked_output_mask,
-                key_value_states=key_value_states,  # K, V (hidden act)
-            )
-
-            # Residual connection. [ r, bs*l, d ]
-            residual = chunked_output
-
-            # Collect tensors.
-            attention_output_tuples.append((attention_output, attention_bias, residual))
-
-        # Output. (List[Tuple[( [ r, bs*l, d ], [ d ] )]])
-        return attention_output_tuples
-
-
-class RetroEncoderBiasDropoutAdd(MegatronModule):
-    """Retro encoder's bias-dropout-add operator.
-
-    This operator applies bias-dropout-add individually on each neighboring
-    chunk that is retrieved from the chunk database.
-
-    Args:
-        config (RetroConfig): Retro config.
-    """
-
-    def __init__(self, config: RetroConfig):
-        super().__init__(config=config)
-        self.retro_num_neighbors = config.retro_num_neighbors
-
-    @classmethod
-    def _forward(
-        cls,
-        x_with_bias: List[Tuple[Tensor, Optional[Tensor], Tensor]],
-        residual: Tensor,
-        prob: float,
-        retro_num_neighbors: int,
-        bias_dropout_add: Callable,
-    ) -> Tensor:
-        """Per-chunk bias-dropout-add.
-
-        Args:
-            x_with_bias (dict): Attention output and bias tuple.
-            residual (Tensor): Transformer layer residual.
-            prob (float): Dropout probability.
-            retro_num_neighbors (int): Number of retrieved neighbor chunks (e.g., 2).
-            bias_dropout_add (Callable): Bias-dropout-add function.
-
-        Returns:
-            Output of bias-dropout-add.
-        """
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-
-            # Per-neighbor bias-dropout-add.
-            # - attention_output: [ r, bs*l, d ]
-            # - attention_bias:   [ d ]
-            # - residual:         [ r, bs*l, d ]
-            # - output:           [ r, bs*l, d ]
-            outputs = [
-                bias_dropout_add(
-                    (
-                        attention_output,
-                        None if attention_bias is None else attention_bias.expand_as(residual),
-                    ),
-                    residual,
-                    prob,
-                )
-                for attention_output, attention_bias, residual in x_with_bias
-            ]
-
-        # Concatenate outputs (to shape [r, k*bs*l, d]; see notation above).
-        r, _, d = outputs[0].shape
-        output = torch.stack(outputs, dim=1).reshape(r, -1, d)
-
-        # Output. [ r, k*bs*l, d ]
-        return output
-
-    def forward(self, training: bool, fused: bool) -> partial:
-        """Retro decoder bias-dropout-add.
-
-        Args:
-            training (bool): If training, then apply dropout.
-            fused (bool): Fuse bias-dropout-add.
-
-        Returns:
-            A partial function for performing bias-dropout-add.
-        """
-        return partial(
-            self._forward,
-            retro_num_neighbors=self.retro_num_neighbors,
-            bias_dropout_add=get_bias_dropout_add(training, fused),
-        )
-
-
-class RetroEncoderLayerNorm(MegatronModule):
-    """Retro encoder's layernorm operator.
-
-    This operator applies layernorm individually on each neighboring chunk that
-    is retrieved from the chunk database, and then concatenates the chunks into
-    a single tensor.
-
-    Args:
-        config (RetroConfig): Retro config.
-        submodules (Type): Layer norm class. (Named 'submodules' to fit external interface.)
-    """
-
-    def __init__(self, config: RetroConfig, submodules: Type, **kwargs: dict):
-        super().__init__(config=config)
-        norm_class = submodules
-        self.norm = norm_class(config=config, **kwargs)
-        self.retro_num_neighbors = config.retro_num_neighbors
-
-    def forward(self, input: Tensor) -> Tensor:
-        """Per-chunk layer norm.
-
-        Args:
-            input (Tensor): Input chunks, concatenated into a single tensor.
-
-        Returns:
-            Output of the layer norm.
-        """
-
-        # Input shape: [ r, k*bs*l, d ]. (see notation above in attention module)
-
-        # Split input into 'num_neighbors' tensors.
-        chunk_size = input.shape[1] // self.retro_num_neighbors
-        inputs = torch.split(input, chunk_size, dim=1)
-
-        # Norm.
-        outputs = [self.norm(inp.contiguous()) for inp in inputs]
-
-        # Concatenate layer norms (to shape [r, k*bs*l, d]; see notation above).
-        r, _, d = inputs[0].shape
-        output = torch.stack(outputs, dim=1).reshape(r, -1, d)
-
-        # Output. [ r, k*bs*l, d ]
-        return output
diff --git a/megatron/core/models/retro/encoder_spec.py b/megatron/core/models/retro/encoder_spec.py
deleted file mode 100644
index a7cb76ca19b..00000000000
--- a/megatron/core/models/retro/encoder_spec.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Specs for Retro encoder."""
-
-from megatron.core.models.gpt.gpt_layer_specs import (
-    get_gpt_layer_local_spec,
-    get_gpt_layer_with_transformer_engine_spec,
-)
-from megatron.core.models.retro.config import RetroConfig
-from megatron.core.models.retro.encoder_attention import (
-    RetroEncoderBiasDropoutAdd,
-    RetroEncoderCrossAttention,
-    RetroEncoderLayerNorm,
-)
-from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron.core.transformer import ModuleSpec
-from megatron.core.transformer.attention import CrossAttentionSubmodules
-from megatron.core.transformer.dot_product_attention import DotProductAttention
-from megatron.core.transformer.enums import AttnMaskType
-from megatron.core.transformer.mlp import MLP, MLPSubmodules
-from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
-
-try:
-    import transformer_engine as te  # pylint: disable=unused-import
-
-    from megatron.core.extensions.transformer_engine import (
-        TEColumnParallelLinear,
-        TEDotProductAttention,
-        TENorm,
-        TERowParallelLinear,
-    )
-
-    HAVE_TE = True
-except ImportError:
-    HAVE_TE = False
-
-try:
-    import apex  # pylint: disable=unused-import
-
-    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
-
-    HAVE_APEX = True
-    LNImpl = FusedLayerNorm
-except ImportError:
-    import warnings
-
-    from megatron.core.transformer.torch_norm import WrappedTorchNorm
-
-    warnings.warn(f'Apex is not installed. Falling back to Torch Norm')
-    LNImpl = WrappedTorchNorm
-    HAVE_APEX = False
-
-
-def get_retro_encoder_layer_te_spec() -> ModuleSpec:
-    """Retro encoder TE spec (uses Transformer Engine components).
-
-    A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm
-    operators to encode neighboring chunks that are retrieved from the chunk
-    database. Each operator is responsible for iterating the retrieved chunks
-    and processing them individually.
-
-    Returns:
-        A module spec if Transformer Engine modules.
-    """
-    spec = get_gpt_layer_with_transformer_engine_spec()
-    spec.submodules.pre_cross_attn_layernorm = TENorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroEncoderCrossAttention,
-        params={"attn_mask_type": AttnMaskType.padding},
-        submodules=CrossAttentionSubmodules(
-            linear_q=TEColumnParallelLinear,
-            linear_kv=TEColumnParallelLinear,
-            core_attention=TEDotProductAttention,
-            linear_proj=TERowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)
-    spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=TENorm)
-    spec.submodules.mlp = ModuleSpec(
-        module=MLP,
-        submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear),
-    )
-    return spec
-
-
-def get_retro_encoder_layer_local_spec() -> ModuleSpec:
-    """Retro encoder local spec (uses Megatron-Core components).
-
-    A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm
-    operators to encode neighboring chunks that are retrieved from the chunk
-    database. Each operator is responsible for iterating the retrieved chunks
-    and processing them individually.
-
-    Returns:
-        A module spec if local modules.
-    """
-    spec = get_gpt_layer_local_spec()
-    spec.submodules.pre_cross_attn_layernorm = LNImpl
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroEncoderCrossAttention,
-        params={"attn_mask_type": AttnMaskType.padding},
-        submodules=CrossAttentionSubmodules(
-            linear_q=ColumnParallelLinear,
-            linear_kv=ColumnParallelLinear,
-            core_attention=DotProductAttention,
-            linear_proj=RowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)
-    spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=LNImpl)
-    spec.submodules.mlp = ModuleSpec(
-        module=MLP,
-        submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear),
-    )
-    spec.submodules.sharded_state_dict_keys_map = {
-        'input_layernorm.': 'self_attention.linear_qkv.layer_norm_'
-    }  # pre_mlp_layernorm doesn't need remapping
-    return spec
-
-
-def get_retro_encoder_block_spec(
-    config: RetroConfig, use_transformer_engine: bool
-) -> TransformerBlockSubmodules:
-    """Retro encoder block spec.
-
-    The retro encoder block consists of one customized Retro encoder layer
-    (layer 1), and all of the following layers are standard GPT layers.
-
-    Args:
-      config (RetroConfig): Retro config.
-      use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules).
-
-    Returns:
-        Transformer block submodules for the given spec.
-    """
-
-    # Num layers.
-    num_layers = config.retro_encoder_num_layers
-    retro_layer_numbers = [1]
-
-    # Layer specs.
-    gpt_layer_spec = (
-        get_gpt_layer_with_transformer_engine_spec()
-        if use_transformer_engine
-        else get_gpt_layer_local_spec()
-    )
-    get_retro_encoder_layer_spec = (
-        get_retro_encoder_layer_te_spec
-        if use_transformer_engine
-        else get_retro_encoder_layer_local_spec
-    )
-    retro_layer_spec = get_retro_encoder_layer_spec()
-    for spec in (gpt_layer_spec, retro_layer_spec):
-        spec.params["hidden_dropout"] = config.retro_encoder_hidden_dropout
-        spec.submodules.self_attention.params["attn_mask_type"] = AttnMaskType.padding
-        spec.submodules.self_attention.submodules.core_attention = ModuleSpec(
-            module=TEDotProductAttention if use_transformer_engine else DotProductAttention,
-            params={"attention_dropout": config.retro_encoder_attention_dropout},
-        )
-
-    layer_specs = []
-    for layer_number in range(1, num_layers + 1):
-        if layer_number in retro_layer_numbers:
-            layer_specs.append(retro_layer_spec)
-        else:
-            layer_specs.append(gpt_layer_spec)
-
-    # Block spec.
-    block_spec = TransformerBlockSubmodules(layer_specs=layer_specs)
-
-    return block_spec
diff --git a/megatron/core/models/retro/model.py b/megatron/core/models/retro/model.py
deleted file mode 100644
index 35fe7e8c878..00000000000
--- a/megatron/core/models/retro/model.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Retro Model."""
-from typing import Dict, Optional
-
-from torch import Tensor
-
-from megatron.core.dist_checkpointing.mapping import ShardedStateDict
-from megatron.core.inference.contexts import BaseInferenceContext
-from megatron.core.models.gpt import GPTModel
-from megatron.core.utils import deprecate_inference_params
-
-
-class RetroModel(GPTModel):
-    """Retro Model.
-
-    A Retro model mostly re-uses the GPTModel interface, with the only difference
-    being the embedding of the 'context' this is used by Retro for processing
-    neighbor tokens. This embedded context is then forwarded to the Transformer
-    Block.
-    """
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        attention_mask: Tensor,
-        context_input_ids: Tensor = None,
-        context_position_ids: Tensor = None,
-        context_mask: Tensor = None,
-        decoder_input: Tensor = None,
-        labels: Tensor = None,
-        inference_context: BaseInferenceContext = None,
-        *,
-        inference_params: Optional[BaseInferenceContext] = None,
-    ) -> Tensor:
-        """RetroModel forward method.
-
-        Foward input tokens & mask, along with neighbor tokens & mask, through
-        the Retro model..
-
-        Args:
-            input_ids (Tensor): Input token IDs.
-            position_ids (Tensor): Input position IDs.
-            attention_mask (Tensor): Input attention mask.
-            context_input_ids (Tensor): Context (i.e., neighbor) token IDs.
-            context_position_ids (Tensor): Context (i.e., neighbor) position IDs.
-            context_mask (Tensor): Context (i.e., neighbor) attention mask.
-            decoder_input (Tensor): When using pipeline parallelism, input_ids
-                and position_ids will only be used on the first stage, and for
-                all other stages decoder_input will be provided via communication
-                from the previous stage.
-            labels (Tensor): The labels of dimension [batch size, seq length].
-            inference_context (BaseInferenceContext): Inference context.
-
-        Returns:
-            Output tensor of forward pass.
-        """
-
-        # Argument shapes:
-        #   Notation:
-        #     ns : Sequence length.
-        #     bs : Batch size.
-        #     d  : Hidden size.
-        #     l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-        #     k  : Number of neighbors.
-        #     r  : Number of retrieved tokens (neighbors + continuation).
-        # - input_ids:   [ bs, ns ]
-        # - context_ids: [ k*bs*l, r ]
-        # - context:     [ r, k*bs*l, d ]
-        # - output:      [ ns, bs, d ]
-
-        inference_context = deprecate_inference_params(inference_context, inference_params)
-
-        # Context embedding (e.g., for Retro neighbor tokens).
-        if context_input_ids is not None:
-            context = self.embedding(context_input_ids, context_position_ids)
-        else:
-            context = None
-
-        # Call GPTModel.forward, and pass in embedded context.
-        return super().forward(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            decoder_input=decoder_input,
-            labels=labels,
-            inference_context=inference_context,
-            extra_block_kwargs={"context": context, "context_mask": context_mask},
-        )
-
-    def sharded_state_dict(
-        self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None
-    ) -> ShardedStateDict:
-        """Get sharded state dict.
-
-        Args:
-            prefix (str): Module name prefix.
-            sharded_offsets (tuple): Offsets of local shard within global tensor.
-            metadata (Optional[Dict]): Shard metadata.
-
-        Returns:
-            A <ShardedStateDict> ?
-        """
-        metadata = metadata or {}
-        metadata['non_homogeneous_layers'] = True
-        return super().sharded_state_dict(prefix, sharded_offsets, metadata)
diff --git a/megatron/core/models/retro/utils.py b/megatron/core/models/retro/utils.py
deleted file mode 100644
index 7d83c5d306f..00000000000
--- a/megatron/core/models/retro/utils.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-import os
-
-import torch
-
-
-def get_config_path(project_dir: str) -> str:
-    """Config copy stored within retro project dir."""
-    return os.path.join(project_dir, "config.json")
-
-
-def get_gpt_data_dir(project_dir: str) -> str:
-    """Get project-relative directory of GPT bin/idx datasets."""
-    return os.path.join(project_dir, "data")
-
-
-# ** Note ** : Retro's compatibility between cross attention and Flash/Fused
-#   Attention is currently a work in progress. We default to returning None for
-#   now.
-# def get_all_true_mask(size, device):
-#     return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device)
-def get_all_true_mask(size, device):
-    return None
diff --git a/megatron/core/nccl_allocator.py b/megatron/core/nccl_allocator.py
index b46157e9d00..d475684f0d3 100644
--- a/megatron/core/nccl_allocator.py
+++ b/megatron/core/nccl_allocator.py
@@ -10,7 +10,9 @@
 # pylint: disable=unused-import
 from torch.utils import cpp_extension
 
-from megatron.core.utils import is_torch_min_version
+from megatron.core.utils import is_torch_min_version, log_single_rank
+
+logger = logging.getLogger(__name__)
 
 # MCORE NCCL Allocator copies and modifies the APEX NCCL allocator.
 # The original APEX NCCL allocator is available at:
@@ -153,7 +155,40 @@ def init() -> None:
     # Disables the use of the tensor register allocator hook
     os.environ["TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK"] = "0"
     _build_nccl_allocator()
-    logging.info(f"[MCORE][NCCL_ALLOCATOR] Initialized NCCL Allocator")
+    log_single_rank(logger, logging.INFO, "[MCORE][NCCL_ALLOCATOR] Initialized NCCL Allocator")
+
+
+# register_mem_pool/deregister_mem_pool are used for manual (de)registration of the memory pool.
+# They are used in the case of FSDP manual registration.
+def register_mem_pool(pool, group, symmetric=True):
+    """
+    Register a memory pool to a group.
+    symmetric: bool, this is for future use.
+    """
+    backend = group._get_backend(torch.device("cuda", torch.cuda.current_device()))
+    if symmetric:
+        try:
+            backend.register_mem_pool(pool, symm=symmetric)
+        except TypeError:
+            # Older PyTorch/APIs without 'symm' keyword.
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "[MCORE][NCCL_ALLOCATOR] Failed in symmetric registration. "
+                "Falling back to registration api without 'symm' keyword!!",
+            )
+            backend.register_mem_pool(pool)
+    else:
+        backend.register_mem_pool(pool)
+
+
+def deregister_mem_pool(pool, group):
+    """
+    Deregister a memory pool from a group.
+    """
+    backend = group._get_backend(torch.device("cuda", torch.cuda.current_device()))
+    if pool.snapshot():
+        backend.deregister_mem_pool(pool)
 
 
 # Preserve the original APEX NCCL allocator interface for backward compatibility
@@ -199,9 +234,11 @@ def __enter__(self):
                     backend.deregister_mem_pool(self.pool)
                 except RuntimeError:
                     desc = getattr(self.group, "group_desc", None)
-                    logging.warning(
+                    log_single_rank(
+                        logger,
+                        logging.WARNING,
                         f"[MCORE][NCCL_ALLOCATOR] Failed to deregister mem pool from"
-                        f"{repr(self.group)}({desc}) group!!"
+                        f"{repr(self.group)}({desc}) group!!",
                     )
 
     def __exit__(self, *args):
@@ -215,18 +252,22 @@ def __exit__(self, *args):
                         backend.register_mem_pool(self.pool, symm=self.symmetric)
                     except TypeError:
                         # Older PyTorch/APIs without 'symm' keyword.
-                        logging.warning(
-                            f"[MCORE][NCCL_ALLOCATOR] Failed in symmetric registration."
-                            f"Falling back to non-symmetric registration!!"
+                        log_single_rank(
+                            logger,
+                            logging.WARNING,
+                            "[MCORE][NCCL_ALLOCATOR] Failed in symmetric registration. "
+                            "Falling back to non-symmetric registration!!",
                         )
                         backend.register_mem_pool(self.pool)
                 else:
                     backend.register_mem_pool(self.pool)
             except RuntimeError:
                 desc = getattr(self.group, "group_desc", None)
-                logging.warning(
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
                     f"[MCORE][NCCL_ALLOCATOR] Failed to register mem pool to"
-                    f"{repr(self.group)}({desc}) group!!"
+                    f"{repr(self.group)}({desc}) group!!",
                 )
 
         self.mem_context.__exit__(*args)
@@ -284,9 +325,11 @@ def __enter__(self):
                     backend.deregister_mem_pool(self.pool)
                 except RuntimeError:
                     desc = getattr(group, "group_desc", None)
-                    logging.warning(
+                    log_single_rank(
+                        logger,
+                        logging.WARNING,
                         f"[MCORE][MultiGroupMemPoolAllocator] Failed to deregister mem pool from"
-                        f"{repr(group)}({desc}) group!!"
+                        f"{repr(group)}({desc}) group!!",
                     )
 
     def __exit__(self, *args):
@@ -300,17 +343,39 @@ def __exit__(self, *args):
                         backend.register_mem_pool(self.pool, symm=self.symmetric)
                     except TypeError:
                         # Older PyTorch/APIs without 'symm' keyword.
-                        logging.warning(
-                            f"[MCORE][MultiGroupMemPoolAllocator] Failed in symmetric registration."
-                            f"Falling back to non-symmetric registration!!"
+                        log_single_rank(
+                            logger,
+                            logging.WARNING,
+                            "[MCORE][MultiGroupMemPoolAllocator] "
+                            "Failed in symmetric registration. "
+                            "Falling back to non-symmetric registration!!",
                         )
                         backend.register_mem_pool(self.pool)
                 else:
                     backend.register_mem_pool(self.pool)
             except RuntimeError:
                 desc = getattr(group, "group_desc", None)
-                logging.warning(
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
                     f"[MCORE][MultiGroupMemPoolAllocator] Failed to register mem pool to"
-                    f"{repr(group)}({desc}) group!!"
+                    f"{repr(group)}({desc}) group!!",
                 )
         self.mem_context.__exit__(*args)
+
+
+class MemPoolAllocatorWithoutRegistration:
+    """
+    An allocator class that uses allocates memory without registering to any communication group.
+    Users are expected to register the memory manually to the communication groups.
+    """
+
+    def __init__(self, pool):
+        self.pool = pool
+        self.mem_context = torch.cuda.use_mem_pool(self.pool)
+
+    def __enter__(self):
+        self.mem_context.__enter__()
+
+    def __exit__(self, *args):
+        self.mem_context.__exit__(*args)
diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py
index 0276c5e0e70..95cc7ec02f8 100644
--- a/megatron/core/optimizer/__init__.py
+++ b/megatron/core/optimizer/__init__.py
@@ -3,12 +3,11 @@
 import logging
 import warnings
 from dataclasses import astuple
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch.optim import SGD as CPUSGD
 from torch.optim import AdamW as CPUAdam
-from .muon import Muon
 
 try:
     from transformer_engine.pytorch.optimizers import FusedAdam as Adam
@@ -36,6 +35,11 @@
 
 from megatron.core import parallel_state
 from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer
+from megatron.core.optimizer_param_scheduler import (
+    ParamGroupOverride,
+    combine_param_group_overrides,
+    param_group_override_to_tuple,
+)
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.fsdp_dtensor_checkpoint import get_global_unique_param_name
 
@@ -51,67 +55,92 @@
     MegatronOptimizer,
     param_group_identifier_keys,
 )
-from .optimizer_config import AdamOptimizerConfig, OptimizerConfig, ParamKey, SGDOptimizerConfig, MuonOptimizerConfig
+from .optimizer_config import (
+    AdamOptimizerConfig,
+    OptimizerConfig,
+    ParamKey,
+    ParamPredicate,
+    ParamWithNamePredicate,
+    SGDOptimizerConfig,
+)
 
 logger = logging.getLogger(__name__)
 
 
-def _matches(param: torch.nn.Parameter, param_name: str, param_key: ParamKey) -> bool:
-    """Returns true if passed-in parameter (with name) matches `param_key`.
+def get_standard_config_overrides(config: OptimizerConfig) -> Dict[ParamKey, ParamGroupOverride]:
+    """Get standard config overrides for the optimizer, handling decoupled LR and common wd skips.
 
     Args:
-        param (torch.nn.Parameter): Handle to parameter object.
-        param_name (str): Name of parameter in underlying PyTorch module.
-        param_key (ParamKey): ParamKey object.
+        config (OptimizerConfig): optimizer configuration object.
 
     Returns:
-        bool: True if parameter matches passed-in param_key.
+        Dict[ParamKey, ParamGroupOverride]: standard config overrides.
     """
-
-    # Check if name matches.
-    if isinstance(param_key.name, str):
-        target_names = [param_key.name]
-    else:
-        target_names = list(param_key.name)
-    for target_name in target_names:
-        if param_name in target_name:
-            return True
-
-    # Check if attribute matches.
-    if isinstance(param_key.attr, str):
-        target_attrs = [param_key.attr]
+    config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = {}
+    # First, figure out how we are going to do wd skipping. The two main approaches are:
+    #  1. The classic megatron approach of skipping all len 1 and bias parameters.
+    #  2. The Qwen3-Next approach of doing 1, other than qk layernorm parameters.
+    if config.apply_wd_to_qk_layernorm:
+        shape_1_not_qkln_param = ParamWithNamePredicate(
+            name="s1_not_qkln",
+            fn=lambda param, name: (len(param.shape) == 1 or name.endswith(".bias"))
+            and not ("q_layernorm." in name or "k_layernorm." in name),
+        )
+        param_wd_mult_key = ParamKey(with_name_predicate=shape_1_not_qkln_param)
     else:
-        target_attrs = list(param_key.attr)
-    for target_attr in target_attrs:
-        if getattr(param, target_attr, False):
-            return True
+        param_length_1_match = ParamPredicate(
+            name="param_len_1", fn=lambda param: len(param.shape) == 1
+        )
+        param_wd_mult_key = ParamKey(name="*.bias", predicate=param_length_1_match)
 
-    return False
+    config_overrides[param_wd_mult_key] = ParamGroupOverride(wd_mult=0.0)
+
+    if config.decoupled_lr is not None:
+        decoupled_lr_config: ParamGroupOverride = {"max_lr": config.decoupled_lr}
+        decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter")
+        if config.decoupled_min_lr is not None:
+            decoupled_lr_config["min_lr"] = config.decoupled_min_lr
+        config_overrides[decoupled_param_key] = decoupled_lr_config
+
+    return config_overrides
 
 
 def _get_param_groups(
     model_chunks: List[MegatronModule],
     config: OptimizerConfig,
-    config_overrides: Optional[Dict[ParamKey, OptimizerConfig]],
+    config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]],
 ) -> List[Dict]:
     """Create parameter groups for optimizer.
 
     Creates parameter groups from provided optimizer config object.
 
+    NOTE There can be more than one match between a ParamKey and a parameter.
+        What we do is merge all of the matching ParamKey overrides into a single ParamGroupOverride
+        for that parameter and use that as the key for that parameter. Any parameters that get
+        the same set of merged overrides will be mapped into the same parameter group.
+
     Args:
         model_chunks (List[MegatronModule]): model chunks to create parameter
             groups for.
         config (OptimizerConfig): optimizer configuration object.
-        config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides,
-            specified on a per-layer basis.
+        config_overrides (Optional[Dict[ParamKey, ParamGroupOverride]): optimizer overrides,
+            specified on a per-layer basis. NOTE: if you want to skip applying weight decay on bias
+            and length 1 parameters, and also do not want to do any other overrides, set this to an
+            empty dictionary rather than the default value of None.
     Returns:
         List of parameter groups.
     """
 
-    # Map (wd_mult, is_expert_parallel, param_group_hyperparameters_config) to params.
+    # Map (pg_overrides, is_expert_parallel) to params.
     params_map = {}
-    configs_map = {}
-    muon_params_map = {}
+
+    if config_overrides is None:
+        # TODO remove this default behavior eventually.
+        #  This is only needed for backwards compatibility with the old config overrides API where
+        #  the config_overrides argument by default lead to bias parameters and length 1 parameters.
+        #  We assume that users of decoupled LR already provide config overrides so will adapt
+        #  to the new API.
+        config_overrides = get_standard_config_overrides(config=config)
 
     for model_chunk in model_chunks:
         for name, param in model_chunk.named_parameters():
@@ -119,32 +148,22 @@ def _get_param_groups(
                 continue
 
             uses_default_config = False
-            # Get optimizer config for this parameter.
-            if config_overrides is None:
-                config_for_param = config
-                uses_default_config = True
+            # Get optimizer config overrides for this parameter.
+            param_overrides_list: list[ParamGroupOverride] = []
+            if config_overrides is not None:
+                for param_key, param_override in config_overrides.items():
+                    if param_key.matches(param, name):
+                        param_overrides_list.append(param_override)
+
+            if param_overrides_list:
+                param_override: ParamGroupOverride | None = combine_param_group_overrides(
+                    param_overrides_list
+                )
             else:
-                config_for_param = None
-                for param_key in config_overrides:
-                    if _matches(param, name, param_key):
-                        config_for_param = config_overrides[param_key]
-                        break
-                # Fall back to default config.
-                if config_for_param is None:
-                    config_for_param = config
-                    uses_default_config = True
+                param_override = None
 
             is_expert_parallel = not getattr(param, 'allreduce', True)
 
-            # TODO: Make sure there is a way to support old no_weight_decay_func functionality
-            # and default_skip_embedding_weight_decay:
-            #     or (default_skip_embedding_weight_decay and "embedding" in name)
-            no_wd = name.endswith(".bias") or len(param.shape) == 1
-            if not no_wd:
-                wd_mult = 1.0
-            else:
-                wd_mult = 0.0
-
             # NOTE(lizhiyu): hack for qwen2.5vl
             is_vision_model_param = False
             if "vision_model" in name:
@@ -152,33 +171,14 @@ def _get_param_groups(
             else:
                 is_vision_model_param = False
 
-
-            # Create config_tuple that is hash-able. Remove timers object before
-            # creating config_tuple.
-            config_for_param_copy = copy.deepcopy(config_for_param)
-            config_for_param_copy.timers = None
-            config_tuple = astuple(config_for_param_copy)
-
-            bias_flag = name.endswith(".bias")
-            shape_flag = param.dim() == 2
-            embedding_flag = "embedding" in name or "output_layer" in name
-            use_muon = config.optimizer == 'muon'
-            muon_flag = use_muon and shape_flag and (not bias_flag) and (not embedding_flag)
-            if muon_flag:
-                key = (wd_mult, is_expert_parallel, config_tuple)
-                if key not in muon_params_map:
-                    muon_params_map[key] = []
-                muon_params_map[key].append(param)
-            else:
-                key = (wd_mult, is_expert_parallel, is_vision_model_param, config_tuple)
-                if key not in params_map:
-                    params_map[key] = []
-                params_map[key].append(param)
-
-            if key in configs_map:
-                assert (config_for_param, uses_default_config) == configs_map[key]
-            else:
-                configs_map[key] = (config_for_param, uses_default_config)
+            # Create config_tuple that is hash-able, and has a consistent ordering of the keys.
+            param_override_tuple: tuple[tuple[str, Any], ...] | None = (
+                param_group_override_to_tuple(param_override)
+            )
+            key = (param_override_tuple, is_expert_parallel, is_vision_model_param)
+            if key not in params_map:
+                params_map[key] = []
+            params_map[key].append(param)
 
     # Distributed checkpoint requires all ranks to have the same param groups,
     # so we need to align the param groups across ranks, otherwise we may have
@@ -190,71 +190,50 @@ def _get_param_groups(
         for key in keys:
             if key not in params_key:
                 params_key.append(key)
-
-    # for muon optimizer
-    # For muon optimizer, we need to add the muon params key to the params_key
-    # so we need to align the param groups across ranks, otherwise we may have
-    # runtime error when loading the checkpoint or numerical error when resuming training.
-    muon_params_key = list(muon_params_map.keys())
-    gathered_muon_params_key = [None for _ in range(torch.distributed.get_world_size())]
-    torch.distributed.all_gather_object(gathered_muon_params_key, muon_params_key)
-    for keys in gathered_muon_params_key:
-        for key in keys:
-            if key not in muon_params_key:
-                muon_params_key.append(key)
-
+    # Need to pick one of the param_override_tuples to use for the param group.
     param_groups = []
-    for key in params_key:
-        wd_mult, is_expert_parallel, is_vision_model_param, _ = key
+    # Sort keys, None first.
+    for key in sorted(params_key, key=lambda x: (x[0] is not None, x[0])):
+        param_override_tuple, is_expert_parallel, is_vision_model_param = key
         params = params_map[key] if key in params_map else []
-        config, uses_default_config = None, True
-        if key not in configs_map:
-            assert params == []
+        if param_override_tuple is None:
+            param_override: ParamGroupOverride = {}
         else:
-            config, uses_default_config = configs_map[key]
-            assert config is not None
+            param_override: ParamGroupOverride = {k: v for (k, v) in param_override_tuple}
+
+        # False if param_group_override is None or empty tuple or if we do not modify the
+        #  LR schedule.
+        #  NOTE: "default_config" is used for logging the learning rate in training.py.
+        #   so set to True if we do not modify the learning rate.
+        #  if param_group['default_config']:
+        #    learning_rate = param_group['lr']
+        uses_default_lr_schedule: bool = (not bool(param_override_tuple)) or not any(
+            ["lr" in k for k in param_override]
+        )
 
         # TODO: Remove "backwards compatible" fields below eventually.
+        default_config: ParamGroupOverride = {
+            'wd_mult': 1.0,
+            'lr_mult': 1.0,
+            'is_decoupled_lr': False,
+            # The following two fields may be important to keep even when we remove the
+            #   above "backwards compatible" fields.
+            "max_lr": config.lr if not is_vision_model_param else config.lr * config.vision_ration,  # user may override this in param_override, # NOTE(lizhiyu): change the ration here
+            "min_lr": config.min_lr,  # user may override this in param_override
+            'is_vision_model_param': is_vision_model_param,
+        }
+        assert (
+            "params" not in param_override
+        ), "'params' should not be in param_override, this is a protected key"
         param_group = {
             'params': params,
-            'wd_mult': wd_mult,  # For backwards compatibility.
-            'lr_mult': 1.0,  # For backwards compatibility.
             'is_expert_parallel': is_expert_parallel,
-            'is_decoupled_lr': False,  # For backwards compatibility.
-            'default_config': uses_default_config,
-            'is_vision_model_param': is_vision_model_param,
+            'default_config': uses_default_lr_schedule,
+            **default_config,
+            **param_override,  # keep **param_override last so that users can override other fields.
         }
-
-        # Stick relevant fields into param_group from config object.
-        if config is not None:
-            param_group['max_lr'] = config.lr if not is_vision_model_param else config.lr * config.vision_ration # NOTE(lizhiyu): change the ration here
-            param_group['min_lr'] = config.min_lr
-            # TODO: Add other relevant arguments (e.g., weight decay, optimizer)
-            # here as well.
         param_groups.append(param_group)
 
-    for key in muon_params_key:
-        wd_mult, is_expert_parallel, _ = key
-        params = muon_params_map[key] if key in muon_params_map else []
-        config, uses_default_config = None, True
-        if key not in configs_map:
-            assert params == []
-        else:
-            config, uses_default_config = configs_map[key]
-            assert config is not None
-
-        param_groups.append(
-            {
-                'params': params,
-                'wd_mult': wd_mult,  # For backwards compatibility.
-                'lr_mult': 1.0,  # For backwards compatibility.
-                'is_expert_parallel': is_expert_parallel,
-                'is_decoupled_lr': False,  # For backwards compatibility.
-                'default_config': uses_default_config,
-                'use_muon': True,
-            }
-        )
-
     return param_groups
 
 
@@ -262,7 +241,7 @@ def _get_param_groups_and_buffers(
     model_chunks: List[MegatronModule],
     model_chunk_offset: int,
     config: OptimizerConfig,
-    config_overrides: Optional[Dict[ParamKey, OptimizerConfig]],
+    config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]],
     filter_fn: Callable,
     buffer_name: str,
 ) -> Tuple[List[Dict], Dict[int, List[_ParamAndGradBuffer]]]:
@@ -273,8 +252,8 @@ def _get_param_groups_and_buffers(
             groups for.
         model_chunk_offset (int): offset of model_chunks in global model_chunks list.
         config (OptimizerConfig): optimizer configuration object.
-        config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides,
-            specified on a per-layer basis.
+        config_overrides (Optional[Dict[ParamKey, ParamGroupOverride]): optimizer/scheduler
+            overrides, specified on the basis of ParamKey matches with each parameter.
         lr (float): learning rate.
         min_lr (float): minimum learning rate.
         filter_fn (callable): filtering function for param_groups.
@@ -304,6 +283,7 @@ def _get_megatron_optimizer_based_on_param_groups(
     data_parallel_group_idx: Optional[int] = None,
     intra_dist_opt_group: Optional[torch.distributed.ProcessGroup] = None,
     distributed_optimizer_instance_id: Optional[int] = 0,
+    pg_collection: Optional[ProcessGroupCollection] = None,
 ) -> MegatronOptimizer:
     """Get Megatron optimizer based on parameter groups.
 
@@ -434,25 +414,6 @@ def init_state_fn(opt, config=None):
                 momentum=config.sgd_momentum,
             )
             init_state_fn = None
-        elif config.optimizer == 'muon':
-            optimizer = Muon(param_groups,
-                             lr=config.lr, weight_decay=config.weight_decay,
-                             matched_adamw_rms=config.muon_matched_adamw_rms,
-                             momentum=config.muon_momentum,
-                             nesterov=config.muon_nesterov,
-                             ns_steps=config.muon_ns_steps,
-                             adamw_betas=(config.adam_beta1, config.adam_beta2),
-                             adamw_eps=config.adam_eps)
-
-            def init_state_fn(opt, config=None):
-                for group in opt.param_groups:
-                    for p in group['params']:
-                        if len(opt.state[p]) == 0:
-                            if config is None or not config.use_precision_aware_optimizer:
-                                opt.state[p]['exp_avg'] = torch.zeros_like(p.data)
-                                opt.state[p]['exp_avg_sq'] = torch.zeros_like(p.data)
-                            else:
-                                opt.initialize_state(p)
         else:
             raise Exception('{} optimizer is not supported.'.format(config.optimizer))
     else:
@@ -512,13 +473,47 @@ def init_state_fn(opt, config=None):
         optimizer = FP32Optimizer(optimizer, config, init_state_fn)
         setattr(optimizer, 'grad_stats_parallel_group', model_parallel_group)
 
+    if pg_collection is None or not hasattr(pg_collection, 'tp'):
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+    else:
+        tp_group = pg_collection.tp
+    # TODO(M4): plumb tp_group through optimizer constructors so this setattr disappears.
+    setattr(optimizer, 'tp_group', tp_group)
+
     return optimizer
 
 
+def check_config_overrides_consistency(
+    config: OptimizerConfig, config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]]
+):
+    """Check if the config overrides are consistent with the config."""
+
+    # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and
+    # Adam for other layers). This would need some more refactoring to work though (param_groups
+    # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups).
+    if config_overrides is not None:
+        fields_to_check_for_consistency = [
+            'overlap_param_gather_with_optimizer_step',
+            'optimizer',
+            'optimizer_cpu_offload',
+        ]
+        for field_name in fields_to_check_for_consistency:
+            base_field = getattr(config, field_name, None)
+            all_config_overrides = list(config_overrides.values())
+            for config_override in all_config_overrides:
+                if field_name in config_override:
+                    field = config_override[field_name]
+                    if field != base_field:
+                        raise ValueError(
+                            f"Field {field_name} should not be overriden in a config override."
+                        )
+    return True
+
+
 def get_megatron_optimizer(
     config: OptimizerConfig,
     model_chunks: List[MegatronModule],
-    config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None,
+    config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = None,
     use_gloo_process_groups: bool = True,
     pg_collection: Optional[ProcessGroupCollection] = None,
     dump_param_to_param_group_map: Optional[str] = None,
@@ -544,19 +539,7 @@ def get_megatron_optimizer(
 
     log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}')
 
-    # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and
-    # Adam for other layers). This would need some more refactoring to work though (param_groups
-    # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups).
-    fields_to_check_for_consistency = [
-        'overlap_param_gather_with_optimizer_step',
-        'optimizer',
-        'optimizer_cpu_offload',
-    ]
-    for field_name in fields_to_check_for_consistency:
-        field = getattr(config, field_name, None)
-        if config_overrides is not None:
-            all_configs = list(config_overrides.values())
-            assert all([getattr(x, field_name, None) == field for x in all_configs])
+    check_config_overrides_consistency(config, config_overrides)
 
     # Separate out first model chunk if overlapping param AG with optimizer step.
     if config.overlap_param_gather_with_optimizer_step:
@@ -567,27 +550,26 @@ def get_megatron_optimizer(
         overlap_param_gather_with_optimizer_step_flags = [False]
 
     # Setup process groups using helper method
-    process_groups = ProcessGroupCollection.setup_process_groups_for_optimizer(
+    process_groups_dict = ProcessGroupCollection.setup_process_groups_for_optimizer(
         pg_collection, model_chunks, use_gloo_process_groups
     )
 
-    dp_cp_group = process_groups['dp_cp_group']
-    intra_dp_cp_group = process_groups['intra_dp_cp_group']
-    intra_expt_dp_group = process_groups['intra_expt_dp_group']
-    mp_group = process_groups['mp_group']
+    dp_cp_group = process_groups_dict['dp_cp_group']
+    intra_dp_cp_group = process_groups_dict['intra_dp_cp_group']
+    intra_expt_dp_group = process_groups_dict['intra_expt_dp_group']
+    mp_group = process_groups_dict['mp_group']
     ########## FlagScale Begin ##########
     mp_group = [mp_group] if not isinstance(mp_group, list) else mp_group
-    model_parallel_rank = mp_group[0].rank()
     ########## FlagScale End ##########
-    expt_tp_pp_group = process_groups['expt_tp_pp_group']
-    intra_dp_cp_group_gloo = process_groups['intra_dp_cp_group_gloo']
-    intra_expt_dp_group_gloo = process_groups['intra_expt_dp_group_gloo']
-    intra_dist_opt_group = process_groups['intra_dist_opt_group']
+    expt_tp_pp_group = process_groups_dict['expt_tp_pp_group']
+    intra_dp_cp_group_gloo = process_groups_dict['intra_dp_cp_group_gloo']
+    intra_expt_dp_group_gloo = process_groups_dict['intra_expt_dp_group_gloo']
+    intra_dist_opt_group = process_groups_dict['intra_dist_opt_group']
 
     model_parallel_rank = get_pg_rank(mp_group)
 
     if get_pg_size(dp_cp_group) > get_pg_size(intra_dp_cp_group):
-        inter_dist_opt_group = process_groups['inter_dist_opt_group']
+        inter_dist_opt_group = process_groups_dict['inter_dist_opt_group']
         distributed_optimizer_instance_id = get_pg_rank(inter_dist_opt_group)
     else:
         distributed_optimizer_instance_id = 0
@@ -620,6 +602,7 @@ def get_megatron_optimizer(
                     data_parallel_group_idx=model_parallel_rank,
                     intra_dist_opt_group=intra_dist_opt_group,
                     distributed_optimizer_instance_id=distributed_optimizer_instance_id,
+                    pg_collection=pg_collection,
                 )
             )
             model_chunk_offset += 1
@@ -667,6 +650,7 @@ def get_megatron_optimizer(
                 data_parallel_group_idx=model_parallel_rank,
                 intra_dist_opt_group=intra_dist_opt_group,
                 distributed_optimizer_instance_id=distributed_optimizer_instance_id,
+                pg_collection=pg_collection,
             )
         )
         model_chunk_offset += 1
@@ -708,6 +692,7 @@ def get_megatron_optimizer(
                 data_parallel_group_idx=expt_model_parallel_rank,
                 intra_dist_opt_group=intra_dist_opt_group,
                 distributed_optimizer_instance_id=distributed_optimizer_instance_id,
+                pg_collection=pg_collection,
             )
         )
 
diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py
index 6ec0002b70e..1f5174bc265 100644
--- a/megatron/core/optimizer/clip_grads.py
+++ b/megatron/core/optimizer/clip_grads.py
@@ -183,6 +183,7 @@ def count_zeros_fp32(
     parameters: Union[List[torch.Tensor], torch.Tensor],
     grad_stats_parallel_group: torch.distributed.ProcessGroup,
     use_decoupled_grad: bool = False,
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
 ) -> float:
     """Counts the number of zeros in gradients associated with the passed-in list of
     parameters.
@@ -220,7 +221,7 @@ def count_zeros_fp32(
         grad_attr = "decoupled_grad" if use_decoupled_grad else "grad"
         grad_not_none = hasattr(param, grad_attr) and getattr(param, grad_attr) is not None
         is_not_shared = param_is_not_shared(param)
-        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param, tp_group=tp_group)
         if grad_not_none and is_not_shared and is_not_tp_duplicate:
             grad_obj = getattr(param, grad_attr)
             data_parallel_group = get_data_parallel_group_if_dtensor(grad_obj, data_parallel_group)
diff --git a/megatron/core/optimizer/cpu_offloading/README.md b/megatron/core/optimizer/cpu_offloading/README.md
index 1486226aa86..68bfba54ba2 100644
--- a/megatron/core/optimizer/cpu_offloading/README.md
+++ b/megatron/core/optimizer/cpu_offloading/README.md
@@ -8,6 +8,6 @@ Add these flags to enable optimizer cpu offload in MCore.
 --use-precision-aware-optimizer
 ```
 
-## Configuration Recommendataions
+## Configuration Recommendations
 
 Gradient copy from GPU to CPU, CPU optimizer step, and subsequent parameter copy from CPU to GPU can be time-consuming operations, and it is recommended to use the flag `--overlap-cpu-optimizer-d2h-h2d` to execute them concurrently.
diff --git a/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py b/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py
index 28487c3b367..c87ccd5ff31 100644
--- a/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py
+++ b/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py
@@ -122,7 +122,7 @@ def param_copy_back_gpu_hook(optimizer, args, kwargs):
                     for param in _param_generator(optimizer):
                         gpu_param = self.cpu_copys_map_gpu_param[param]
                         gpu_param.data.copy_(param.data, non_blocking=True)
-                self._d2h_stream.record_event().wait(torch.cuda.current_stream())
+                self._h2d_stream.record_event().wait(torch.cuda.current_stream())
 
             return param_copy_back_gpu_hook
 
diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py
index 9de232cf7d1..0c5805f1908 100644
--- a/megatron/core/optimizer/distrib_optimizer.py
+++ b/megatron/core/optimizer/distrib_optimizer.py
@@ -5,6 +5,7 @@
 
 import gc
 import itertools
+import logging
 from collections import ChainMap
 from dataclasses import replace
 from logging import getLogger
@@ -13,6 +14,8 @@
 import torch
 import torch.nn.functional
 
+from megatron.core.utils import log_single_rank
+
 from ..dist_checkpointing.optimizer import KEEP_VARS_HINT
 
 HAVE_APEX_OR_TE = True
@@ -53,17 +56,6 @@
 from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys
 from .optimizer_config import OptimizerConfig
 
-from .muon import Muon, MuonDistMeta
-from megatron.core.parallel_state import get_tensor_model_parallel_group
-
-try:
-    # This will be used when "--fp8-param-gather" is enabled.
-    # When BF16/FP16 parameters don't exist, we need to cast the FP32 main parameters to
-    # FP8 directly in the optimizer.
-    from transformer_engine.pytorch.cpp_extensions import cast_to_fp8
-except:
-    pass
-
 logger = getLogger(__name__)
 
 
@@ -170,7 +162,6 @@ def _build_model_gbuf_param_range_map(
                 sub_param_start = max(0, gbuf_world_range.start - param_world_start)
                 sub_param_range = param_local_range.normalize(sub_param_start)
                 param_range_map[param] = {
-                    "world_indexes": (param_world_start, param_world_end),
                     "gbuf_world": param_world_range,
                     "gbuf_world_in_bucket": param_world_range_in_bucket,
                     "gbuf_local": param_local_range,
@@ -358,24 +349,14 @@ def _build_model_and_main_param_groups(
             shard_fp32_groups.append(shard_fp32_params_this_group)
             shard_fp32_from_float16_groups.append(shard_fp32_from_float16_params_this_group)
 
-            dist_metas = {}
-
             for model_param in group_range["params"]:
 
                 assert model_param.requires_grad
 
                 gbuf_index, dtype, bucket_index = param_gbuf_map[model_param]
                 gbuf_range = gbuf_ranges[gbuf_index][dtype][bucket_index]
-                # param_range = gbuf_range["param_map"][model_param]["param"]
-                param_gbuf_ranges = gbuf_range["param_map"][model_param]
-                param_range = param_gbuf_ranges["param"]
-
-                # gen dist meta
-                param_world_indexes = param_gbuf_ranges["world_indexes"]
-                tp_split_dim = -1 if not getattr(model_param, 'tensor_model_parallel', False) else \
-                    getattr(model_param, 'partition_dim')
-                dist_meta = MuonDistMeta(gbuf_index, bucket_index, model_param.shape, param_world_indexes, tp_split_dim)
-                
+                param_range = gbuf_range["param_map"][model_param]["param"]
+
                 # fp16, bf16 params.
                 if model_param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']:
 
@@ -435,9 +416,6 @@ def _build_model_and_main_param_groups(
                     shard_float16_params_this_group.append(shard_model_param)
                     shard_fp32_from_float16_params_this_group.append(shard_main_param)
 
-                    # add to dist metas
-                    dist_metas[shard_main_param] = dist_meta
-
                 # fp32 params.
                 elif model_param.type() == 'torch.cuda.FloatTensor':
                     shard_model_param = model_param.view(-1)[param_range.start : param_range.end]
@@ -476,7 +454,7 @@ def _build_model_and_main_param_groups(
             shard_float16_groups,
             shard_fp32_groups,
             shard_fp32_from_float16_groups,
-        ), dist_metas
+        )
 
     def __init__(
         self,
@@ -533,22 +511,14 @@ def __init__(
             assert self.ddp_config == model_chunk.ddp_config
         self.distributed_optimizer_instance_id = distributed_optimizer_instance_id
 
-        # assert (
-        #     isinstance(optimizer, (Adam, torch.optim.AdamW, HybridDeviceOptimizer))
-        #     or optimizer is None
-        # ), (
-        #     "Only Adam and HybridDeviceOptimizer currently supported, "
-        #     "due to checkpointing requirements."
-        # )
         assert (
-            isinstance(optimizer, (Adam, torch.optim.AdamW, HybridDeviceOptimizer, Muon))
+            isinstance(optimizer, (Adam, torch.optim.AdamW, HybridDeviceOptimizer))
             or optimizer is None
         ), (
-            "Only Adam / HybridDeviceOptimizer / Muon currently supported, "
+            "Only Adam and HybridDeviceOptimizer currently supported, "
             "due to checkpointing requirements."
         )
 
-
         # when freezing sub-models we have no real optimizer
         # but still need a stub DistributedOptimizer class
         if optimizer is None:
@@ -625,7 +595,7 @@ def __init__(
             self.shard_float16_groups,
             self.shard_fp32_groups,
             self.shard_fp32_from_float16_groups,
-        ), dist_metas = self._build_model_and_main_param_groups(
+        ) = self._build_model_and_main_param_groups(
             self.gbuf_ranges, self.model_param_gbuf_map, self.opt_group_ranges, config
         )
 
@@ -636,19 +606,6 @@ def __init__(
         else:
             self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges]
             self.optimizer.load_state_dict(self.optimizer.state_dict())
-            # for muon optimizer, enable distributed mode
-            if isinstance(self.optimizer, Muon):
-                assert all(grad_buffer.grad_dtype == torch.float32 for grad_buffer in self.buffers), \
-                    "all grad buffer should only contains float32 type for muon optimizer"
-                gbuf_sizes = [ [(bucket.grad_data.numel(), bucket.offset) for bucket in buffer.buckets ]
-                                for buffer in self.buffers ]
-                self.optimizer.enable_distributed_mode(
-                    gbuf_sizes, self.data_parallel_group,
-                    get_tensor_model_parallel_group(),
-                    dist_metas,
-                )
-
-        self.is_stub_optimizer = False
 
     def _get_model_param_range_map(self, param: torch.nn.Parameter):
         """
@@ -786,10 +743,10 @@ def make_needed_groups(param_group):
                     pass
                 elif f"pre_{key}" in param_group:
                     key = f"pre_{key}"
-                elif key == 'use_muon':
-                    param_group[key] = False
+                ######### FlagScale Begin #########
                 elif key == 'is_vision_model_param':
                     param_group[key] = False
+                ######### FlagScale End #########
                 else:
                     raise ValueError(
                         f"Key {key} (or pre_{key}) not found in param_group {param_group}."
@@ -840,18 +797,10 @@ def make_needed_groups(param_group):
 
                             # For precision_aware_optimizer, the empty tensors should also be
                             #  initialized with the correct dtype.
-                            # For muon optimizer link state ( for load state dict )
-                            if isinstance(self.optimizer, Muon):
-                                group = self.optimizer.param_groups[group_index]
-                                is_muon_group = group.get("use_muon", False)
-                                if is_muon_group:
-                                    tensors = {"muon_buffer": init_shard(self.config.exp_avg_dtype)}
-                                else:
-                                    tensors = {"adamw_exp_avg": init_shard(self.config.exp_avg_dtype), "adamw_exp_avg_sq": init_shard(self.config.exp_avg_sq_dtype)}
-                            
-                            else:
-                                tensors = {"exp_avg": init_shard(self.config.exp_avg_dtype), "exp_avg_sq": init_shard(self.config.exp_avg_sq_dtype)}         
-
+                            tensors = {
+                                "exp_avg": init_shard(self.config.exp_avg_dtype),
+                                "exp_avg_sq": init_shard(self.config.exp_avg_sq_dtype),
+                            }
                             if self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8:
                                 if self.config.store_param_remainders and self.config.bf16:
                                     tensors["master_param"] = init_shard(torch.int16)
@@ -898,24 +847,32 @@ def make_needed_groups(param_group):
         # Grad scaler.
         if 'grad_scaler' not in state_dict:
             if self.config.fp16:
-                logger.info(
-                    '***WARNING*** found an old checkpoint, will not ' 'load grad scaler ...'
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    '***WARNING*** found an old checkpoint, will not load grad scaler ...',
                 )
         else:
             if self.grad_scaler:
                 self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
             else:
-                logger.info(
+                log_single_rank(
+                    logger,
+                    logging.INFO,
                     '***WARNING*** fould the grad scaler in the '
                     'checkpoint but it is None in the class. '
-                    'Skipping loading grad scaler ...'
+                    'Skipping loading grad scaler ...',
                 )
 
         if 'param_state' in state_dict:
             assert 'param_state_sharding_type' in state_dict, state_dict.keys()
             param_state = state_dict['param_state']
             sharding_type = state_dict['param_state_sharding_type']
-            logger.info(f'Loading distributed optimizer sharded state of type {sharding_type}')
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f'Loading distributed optimizer sharded state of type {sharding_type}',
+            )
             if sharding_type == 'dp_zero_gather_scatter':
                 self.load_parameter_state_from_dp_zero(param_state)
             elif sharding_type == 'fully_reshardable':
@@ -953,15 +910,6 @@ def _get_main_param_and_optimizer_states(self, model_param):
             main_param = self.optimizer.param_groups[group_index]["params"][group_order]
             optim_state = self.optimizer.state[main_param]
             tensors = {"param": main_param, **optim_state}
-        # process muon to be compatiable with adam ( always save to exp_avg / exp_avg_sq )
-        if isinstance(self.optimizer, Muon):
-            use_muon = self.optimizer.param_groups[group_index].get("use_muon", False)
-            if use_muon:
-                tensors["exp_avg"] = tensors["muon_buffer"]
-                tensors["exp_avg_sq"] = torch.zeros_like(tensors["param"])
-            else:
-                tensors["exp_avg"] = tensors["adamw_exp_avg"]
-                tensors["exp_avg_sq"] = tensors["adamw_exp_avg_sq"]
         return tensors
 
     def _set_main_param_and_optimizer_states(self, model_param, tensors):
@@ -991,31 +939,9 @@ def _set_main_param_and_optimizer_states(self, model_param, tensors):
         else:
             main_param = self.optimizer.param_groups[group_index]["params"][group_order]
             optim_state = self.optimizer.state[main_param]
-            
-            if isinstance(self.optimizer, Muon):
-                if "param" in tensors:
-                    main_param.copy_(tensors["param"])
-                use_muon = self.optimizer.param_groups[group_index].get("use_muon", False)
-                if use_muon:
-                    if "muon_buffer" not in optim_state:
-                        optim_state["muon_buffer"] = torch.zeros_like(main_param)
-                    if "exp_avg" in tensors:
-                        optim_state["muon_buffer"].copy_(tensors["exp_avg"])
-                else:
-                    if "adamw_exp_avg" not in optim_state:
-                        optim_state["adamw_exp_avg"] = torch.zeros_like(main_param)
-                    if "adamw_exp_avg_sq" not in optim_state:
-                        optim_state["adamw_exp_avg_sq"] = torch.zeros_like(main_param)
-                    if "exp_avg" in tensors:
-                        optim_state["adamw_exp_avg"].copy_(tensors["exp_avg"])
-                    if "exp_avg_sq" in tensors:
-                        optim_state["adamw_exp_avg_sq"].copy_(tensors["exp_avg_sq"])
-            else:
-                dst_tensors = {"param": main_param, **optim_state}
-                for key in dst_tensors:
-                    if not key in tensors:
-                        continue
-                    dst_tensors[key].copy_(tensors[key])
+            dst_tensors = {"param": main_param, **optim_state}
+            for key in dst_tensors:
+                dst_tensors[key].copy_(tensors[key])
 
     def get_parameter_state_dp_reshardable(self):
         """Get internal representation of parameter state without any copies and modifications.
@@ -1291,10 +1217,12 @@ def sharded_state_dict(
         Regular state dict parameters are saved on DP rank 0 and loaded on all ranks.
         """
         if sharding_type is not None:
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 'DistributedOptimizer.sharded_state_dict parameter `sharding_type`'
                 ' is deprecated and will be removed.'
-                ' Use `metadata["distrib_optim_sharding_type"] instead`.'
+                ' Use `metadata["distrib_optim_sharding_type"] instead`.',
             )
         else:
             sharding_type = (metadata or {}).get(
@@ -1311,10 +1239,12 @@ def sharded_state_dict(
             return state_dict
 
         if not is_loading and sharding_type == 'fully_sharded_bucket_space':
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 '`fully_sharded_bucket_space` sharding for DistributedOptimizer'
                 ' checkpoint is deprecated and will be removed in the future.'
-                ' Please switch to `full_sharded_model_space`.'
+                ' Please switch to `full_sharded_model_space`.',
             )
 
         state_dict = self.state_dict()
@@ -1735,6 +1665,11 @@ def sharded_param_state_dp_reshardable(
                             if key == 'padding':
                                 tensors[key] = LocalNonpersistentObject(tensors[key])
                                 continue
+                            if key == 'step':
+                                # The optimizer state of STEP is a 0-dim tensor and is handled
+                                # separately via param_groups, not as part of the gradient buffer.
+                                tensors[key] = LocalNonpersistentObject(tensors[key])
+                                continue
                             assert tensors[key].shape == (gbuf_local_end - gbuf_local_start,), (
                                 tensors[key].shape,
                                 gbuf_local_start,
diff --git a/megatron/core/optimizer/layer_wise_optimizer.py b/megatron/core/optimizer/layer_wise_optimizer.py
new file mode 100644
index 00000000000..de4396a5b4f
--- /dev/null
+++ b/megatron/core/optimizer/layer_wise_optimizer.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import logging
+from typing import Callable, List, Optional
+
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from megatron.core.dist_checkpointing.dict_utils import nested_values
+from megatron.core.dist_checkpointing.mapping import LocalNonpersistentObject, ShardedStateDict
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.utils import get_pg_rank, get_pg_size
+
+from .clip_grads import count_zeros_fp32, get_grad_norm_fp32
+from .optimizer import (
+    ChainedOptimizer,
+    Float16OptimizerWithFloat16Params,
+    FP32Optimizer,
+    MegatronOptimizer,
+)
+from .optimizer_config import OptimizerConfig
+
+logger = logging.getLogger(__name__)
+
+
+class LayerWiseDistributedOptimizer(ChainedOptimizer):
+    """Layer-wise distributed optimizer for Megatron-core models.
+
+    Experimental distributed optimizer wrapper that distributes weight to DP ranks by layer.
+    Implemented as ChainedOptimizer to support multiple optimizers (e.g. muon + adamW)
+    When using, keep all megatron distributed-optimizer related options OFF.
+
+    How LayerWiseDistributedOptimizer work:
+    1. weights are splited into lists and each rank only keep its shard in its optimizer
+    2. Megatron DDP handle allreduce grad, note that each rank have full model and grad
+    3. optimizer is already modified so only param belong to this DP rank is updated
+    4. grad_norm and zero counting will reduce metrics globally in step function
+    5. Do regular update with chained optimizers, modified optimizer only update shard
+    6. allgather updated params to every rank
+    """
+
+    def __init__(
+        self,
+        optimizers: List[MegatronOptimizer],
+        config: OptimizerConfig,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        init_state_fn_list: Optional[List[Callable]] = None,
+    ) -> None:
+        """
+        Initialize LayerWiseDistributedOptimizer.
+
+        Args:
+            optimizers: List of MegatronOptimizers.
+            config: OptimizerConfig.
+            pg_collection: ProcessGroupCollection.
+            init_state_fn_list: List of init state functions.
+        """
+
+        self.pg_collection = pg_collection
+        self.shard_params(optimizers)
+        if init_state_fn_list:
+            assert len(init_state_fn_list) == len(
+                optimizers
+            ), "init_state_fn_list must be the same length as optimizers if provided"
+
+        # wrap optimizer after sharding to avoid unnecessary master weight creation
+        # for higher precision, optimizers are wrapped with megatron already
+        if config.bf16:
+            # unwrap FP32 optimizer, possibly from reusing get_megatron_optimizer for adam
+            for i in range(len(optimizers)):
+                opt = optimizers[i]
+                if isinstance(opt, Float16OptimizerWithFloat16Params):
+                    raise TypeError(
+                        'LayerWiseDistributedOptimizer received Float16 optimizer already.'
+                    )
+                # unwrap FP32 optimizer from reusing get_megatron_optimizer for adam
+                if isinstance(opt, FP32Optimizer):
+                    opt = opt.optimizer
+                optimizers[i] = Float16OptimizerWithFloat16Params(
+                    opt, config, None, init_state_fn_list[i] if init_state_fn_list else None
+                )
+
+        super().__init__(optimizers)
+
+        # TODO(kunlun, deyuf): potential future perf optimization
+        # since allreduce is unchanged and handled by megatron DDP, they're already in
+        # contiguous gbuf. So instead of shard param by layer randomly, we can shard by
+        # buf range but keep some "extras" to keep boundary weight not sharded.
+        # This way each rank do some duplicated work but allgather_v is no longer needed
+        # All current distopt optimization can also be potentially applied
+
+    def shard_params(self, optimizers):
+        """Shard all params into lists by rank."""
+        # list of parameter are sorted by numel and assigned to ranks in ping-pong style
+        # example of 4 ranks and 10 parameters p0-p9 after sorting, then dp_cp_params_list will be
+        # [[p0, p7, p8], [p1, p6, p9], [p2, p5], [p3, p4]]
+
+        # simplify when dp_cp group size is 1
+        if get_pg_size(self.pg_collection.dp_cp) == 1:
+            self.dp_cp_params_list = None
+            self.expt_dp_params_list = None
+            return
+
+        dp_cp_idx, expt_dp_idx = 0, 0
+        dp_cp_size = get_pg_size(self.pg_collection.dp_cp)
+        expt_dp_size = get_pg_size(self.pg_collection.expt_dp)
+        # create ping-pong style loop so memory is more balanced
+        dp_cp_loop = list(range(dp_cp_size)) + list(range(dp_cp_size))[::-1]
+        expt_dp_loop = list(range(expt_dp_size)) + list(range(expt_dp_size))[::-1]
+        self.dp_cp_params_list = [[] for _ in range(dp_cp_size)]
+        self.expt_dp_params_list = [[] for _ in range(expt_dp_size)]
+        # get all param groups
+        param_groups = []
+        for optimizer in optimizers:
+            param_groups += optimizer.param_groups
+
+        # sort param in all groups by param numel and assign to each rank evenly
+        param_list = []
+        for group_index, group in enumerate(param_groups):
+            for p in group["params"]:
+                param_list.append((p, group_index))
+        param_list.sort(key=lambda x: x[0].numel())
+        param_groups_this_rank = [[] for g in param_groups]
+
+        # assign params to rank in ping-pong style loop
+        for p, group_index in param_list:
+            if param_groups[group_index].get("is_expert_parallel", False):
+                if expt_dp_loop[expt_dp_idx] == get_pg_rank(self.pg_collection.expt_dp):
+                    param_groups_this_rank[group_index].append(p)
+                self.expt_dp_params_list[expt_dp_loop[expt_dp_idx]].append(p)
+                expt_dp_idx = (expt_dp_idx + 1) % len(expt_dp_loop)
+            else:
+                if dp_cp_loop[dp_cp_idx] == get_pg_rank(self.pg_collection.dp_cp):
+                    param_groups_this_rank[group_index].append(p)
+                self.dp_cp_params_list[dp_cp_loop[dp_cp_idx]].append(p)
+                dp_cp_idx = (dp_cp_idx + 1) % len(dp_cp_loop)
+
+        # now we modify the group to only handle local params
+        for groups, params in zip(param_groups, param_groups_this_rank):
+            groups["params"] = params
+
+        # simplify when expt_dp group size is 1 or expert parallel is off
+        if expt_dp_size == 1 or len(self.expt_dp_params_list[0]) == 0:
+            self.expt_dp_params_list = None
+
+    @torch.no_grad()
+    def allgather_params(self) -> None:
+        """All-gather updated params from all ranks."""
+
+        # helper function to flatten local params, allgather, unflatten and copy to model params
+        def _allgather_helper(params_list, group):
+            # flatten this rank's params and create empty tensor output list
+            device = params_list[0][0].device
+            dtype = params_list[0][0].dtype
+            rank = get_pg_rank(group)
+            # for rank without params create empty tensor and participate in allgather
+            src = (
+                _flatten_dense_tensors(params_list[rank])
+                if len(params_list[rank]) > 0
+                else torch.empty(0, device=device, dtype=dtype)
+            )
+            output_list = [
+                torch.empty(sum([p.numel() for p in params]), device=device, dtype=dtype)
+                for params in params_list
+            ]
+            # single all_gather_v to collect all updated params
+            torch.distributed.all_gather(output_list, src, group=group)
+            # unflatten and copy gathered params for each rank i
+            for idx, (flat_params, params) in enumerate(zip(output_list, params_list)):
+                # skip local params and empty tensors
+                if len(params) == 0 or idx == rank:
+                    continue
+                updated_params = _unflatten_dense_tensors(flat_params, params)
+                for updated_p, model_p in zip(updated_params, params):
+                    model_p.data.copy_(updated_p)
+
+        if self.pg_collection is None:
+            return
+        if self.dp_cp_params_list:
+            _allgather_helper(self.dp_cp_params_list, self.pg_collection.dp_cp)
+        if self.expt_dp_params_list:
+            _allgather_helper(self.expt_dp_params_list, self.pg_collection.expt_dp)
+
+    @torch.no_grad()
+    def broadcast_params(self):
+        """All rank broadcast updated local params."""
+        # Broadcast linear layer weights to all other ranks. Kept as reference test.
+        if self.dp_cp_params_list is None:
+            return
+        for i, params in enumerate(self.dp_cp_params_list):
+            src_global_rank = torch.distributed.get_global_rank(self.pg_collection.dp_cp, i)
+            for p in params:
+                torch.distributed.broadcast(p, src_global_rank, self.pg_collection.dp_cp)
+        if self.expt_dp_params_list is None:
+            return
+        for i, params in enumerate(self.expt_dp_params_list):
+            src_global_rank = torch.distributed.get_global_rank(self.pg_collection.expt_dp, i)
+            for p in params:
+                torch.distributed.broadcast(p, src_global_rank, self.pg_collection.expt_dp)
+
+    @torch.no_grad()
+    def get_grad_norm(self):
+        # similar to dist opt, always aggregate globally
+        grads_for_norm = []
+        for optimizer in self.chained_optimizers:
+            grads_for_norm += optimizer.get_main_grads_for_grad_norm()
+        grad_norm = get_grad_norm_fp32(grads_for_norm, grad_stats_parallel_group=None)
+        return grad_norm
+
+    @torch.no_grad()
+    def count_zeros(self):
+        params = []
+        for optimizer in self.chained_optimizers:
+            params += optimizer.get_parameters()
+        return count_zeros_fp32(
+            params,
+            grad_stats_parallel_group=None,
+            use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8,
+        )
+
+    @torch.no_grad()
+    def step(self):  # type: ignore[no-untyped-def]
+        """step function for layer-wise optimizer."""
+        update_successful, grad_norm, num_zeros_in_grad = super().step()
+
+        # All gather updated params.
+        self.allgather_params()
+
+        return update_successful, grad_norm, num_zeros_in_grad
+
+    # TODO(deyuf): need to improve dist checkpointing design to properly handle this
+    # fp32_from_fp16_params is list, each sub list could be empty if group is empty
+    # this breaks dist checkpointing assumption since extract_sharded_base drop list structure
+    # for now, we convert it to dict with index as key and convert back in load_state_dict
+    def load_state_dict(self, state_dict):
+        if len(self.chained_optimizers) == 1:
+            wrapped_state_dict = {1: state_dict}
+        else:
+            wrapped_state_dict = state_dict
+        for sd in wrapped_state_dict.values():
+            if 'fp32_from_fp16_params' in sd and isinstance(sd['fp32_from_fp16_params'], dict):
+                logger.info('[layerwise] converting fp32_from_fp16_params from dict to list')
+                sd['fp32_from_fp16_params'] = [
+                    v for k, v in sorted(sd['fp32_from_fp16_params'].items())
+                ]
+        super().load_state_dict(state_dict)
+
+    def sharded_state_dict(
+        self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False, **kwargs
+    ):
+        """
+        Sharded state dict for torch_dist format checkpointing.
+        For fixed DP usage only, set replica_id to 0 for all ShardedTensor.
+        """
+        sharded_state_dict = super().sharded_state_dict(
+            model_sharded_state_dict, is_loading, **kwargs
+        )
+
+        # for fixed DP usage only
+        for sh_base in nested_values(sharded_state_dict):
+            if hasattr(sh_base, 'replica_id'):
+                assert (
+                    isinstance(sh_base.replica_id, int) or len(sh_base.replica_id) == 3
+                ), f'Expected replica_id as int or (PP, TP, DP), got: {sh_base}'
+                sh_base.replica_id = (
+                    0 if isinstance(sh_base.replica_id, int) else (*sh_base.replica_id[:2], 0)
+                )
+
+        # later code assume list but chained optimizer fallback to non-list if there's only one
+        if len(self.chained_optimizers) == 1:
+            wrapped_sharded_state_dict = {1: sharded_state_dict}
+        else:
+            wrapped_sharded_state_dict = sharded_state_dict
+
+        # Adjust dict rank 0 output correct global metadata into common_dict
+        for sd in wrapped_sharded_state_dict.values():
+            # wrap empty containers into LocalNonpersistentObject so it won't be saved/loaded
+            # params is already wrapped, we only need to handle fp32_from_fp16_params and state
+            # more details in load_state_dict comment
+            if 'fp32_from_fp16_params' in sd:
+                sd['fp32_from_fp16_params'][:] = [
+                    group if group else LocalNonpersistentObject(group)
+                    for group in sd['fp32_from_fp16_params']
+                ]
+                sd['fp32_from_fp16_params'] = {
+                    i: v for i, v in enumerate(sd['fp32_from_fp16_params'])
+                }
+            # state is a single dict and will be empty if optimizer is fully empty
+            if not sd['optimizer']['state']:
+                sd['optimizer']['state'] = LocalNonpersistentObject(sd['optimizer']['state'])
+            # group keys(e.g. 'step') might be missing or not updated
+            for i, group in enumerate(sd['optimizer']['param_groups']):
+                # keep local param tensor so we only gather metadata
+                local_params = group.pop('params')
+                # save whether this group is empty, so we can use non-empty rank for metadata
+                group['params'] = bool(local_params.unwrap())
+                all_rank_groups = [None for _ in range(torch.distributed.get_world_size())]
+                torch.distributed.all_gather_object(all_rank_groups, group)
+                # find first non-empty group if it exists
+                nonempty_rank_group = next((g for g in all_rank_groups if g['params']), group)
+                nonempty_rank_group['params'] = local_params
+                sd['optimizer']['param_groups'][i] = nonempty_rank_group
+        return sharded_state_dict
+
+    def save_state_dict_to_file(self, filename: str) -> None:
+        """Save the parameter state of the optimizer. For torch format only.
+        Args:
+            filename: The filename to save the parameter state.
+        """
+        torch.save(super().state_dict(), filename)
+
+    def load_state_dict_from_file(self, filename: str) -> None:
+        """Load the parameter state of the optimizer. For torch format only."""
+        super().load_state_dict(torch.load(filename))
diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py
index afb838a7209..57eb1e94478 100644
--- a/megatron/core/optimizer/muon.py
+++ b/megatron/core/optimizer/muon.py
@@ -1,315 +1,350 @@
-from typing import Tuple, Dict
-
-import torch
-import math
-import torch.distributed as dist
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+"""Megatron muon optimizer wrapper to handle tensor-parallel."""
 
-# copy from https://github.com/KellerJordan/Muon/tree/master
-# @torch.compile
-def zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    a, b, c = (3.4445, -4.7750,  2.0315)
-    X = G
-    if G.size(0) > G.size(1):
-        X = X.T
-
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    # Perform the NS iterations
-    for _ in range(steps):
-        A = X @ X.T
-        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
-        X = a * X + B @ X
-    
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X
-
-def normalize_range(range: Tuple[int, int], start):
-    return (range[0] - start, range[1] - start)
-
-class MuonDistMeta:
-
-    # which buffer and bucket param belongs to
-    buffer_idx: int = 0
-    bucket_idx: int = 0
-    # param shape after tp
-    shape: torch.Size = None
-    # param location in global buffer
-    global_range: Tuple[int, int] = None
-    tp_split_dim: int = -1
-    # param location in global buffer (current dp slice)
-    local_range: Tuple[int, int] = None
-
-    def __init__(self, buffer_idx: int, bucket_idx: int, shape: torch.Size, global_range: Tuple[int, int], tp_split_dim: int):
-        self.buffer_idx = buffer_idx
-        self.bucket_idx = bucket_idx
-        self.shape = shape
-        self.global_range = global_range
-        self.tp_split_dim = tp_split_dim
-    
-    def set_local_buffer_range(self, local_buffer_range: Tuple[int, int]):
-        start = max(self.global_range[0], local_buffer_range[0])
-        end = min(self.global_range[1], local_buffer_range[1])
-        self.local_range = (start, end) if start < end else (local_buffer_range[0], local_buffer_range[0])
-
-# adjust LR based on: https://github.com/MoonshotAI/Moonlight
-def adjust_lr_wd_for_muon(lr, matched_adamw_rms, param_shape):
-    A, B = param_shape[:2]
-    adjusted_ratio = math.sqrt(max(A, B)) * matched_adamw_rms
-    adjusted_lr = lr * adjusted_ratio
-    return adjusted_lr
-
-# copy from https://github.com/KellerJordan/Muon/tree/master and support distributed solution
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-
-    Arguments:
-        param_groups: The parameters to be optimized.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        matched_adamw_rms: The AdamW Update RMS that Muon is designed to match. (0.2~0.4 recommended)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (5 is probably always enough)
-        {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        adamw_wd: The weight decay for the internal AdamW.
-    """
-    def __init__(self, param_groups, lr=2e-2, weight_decay=0.1,
-                 matched_adamw_rms=0.2, momentum=0.95, nesterov=True, ns_steps=5,
-                 adamw_betas=(0.95, 0.95), adamw_eps=1e-8):
-
-        defaults = dict(lr=lr, weight_decay=weight_decay,
-                        matched_adamw_rms=matched_adamw_rms,
-                        momentum=momentum, nesterov=nesterov, ns_steps=ns_steps,
-                        adamw_betas=adamw_betas, adamw_eps=adamw_eps,)
-
-        super().__init__(param_groups, defaults)
-        self.distributed_mode = False
+import logging
+from typing import Any, Callable, Dict, List, Literal, Optional
 
+import torch
+from torch.optim.optimizer import ParamsT
+
+from megatron.core.optimizer_param_scheduler import ParamGroupOverride
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.utils import get_pg_size, log_single_rank
+
+from . import _get_param_groups, get_megatron_optimizer
+from .layer_wise_optimizer import LayerWiseDistributedOptimizer
+from .optimizer import (
+    ChainedOptimizer,
+    Float16OptimizerWithFloat16Params,
+    FP32Optimizer,
+    MegatronOptimizer,
+)
+from .optimizer_config import OptimizerConfig, ParamKey
+
+try:
+    from emerging_optimizers.orthogonalized_optimizers import (
+        OrthogonalizedOptimizer,
+        get_muon_scale_factor,
+    )
+    from emerging_optimizers.orthogonalized_optimizers.muon_utils import newton_schulz_tp
+
+    HAVE_EMERGING_OPTIMIZERS = True
+except ImportError:
+    HAVE_EMERGING_OPTIMIZERS = False
+    OrthogonalizedOptimizer = object
+
+
+logger = logging.getLogger(__name__)
+
+
+class TensorParallelMuon(OrthogonalizedOptimizer):
+    """Tensor Parallel Muon optimizer."""
+
+    def __init__(
+        self,
+        params: ParamsT,
+        lr: float = 3e-4,
+        momentum_beta: float = 0.95,
+        use_nesterov: bool = True,
+        weight_decay: float = 0.01,
+        use_decoupled_weight_decay: bool = True,
+        split_qkv: bool = False,
+        is_qkv_fn: Callable[[torch.Tensor], bool] | None = None,
+        qkv_split_shapes: tuple[int, int, int] | None = None,
+        fp32_matmul_prec: str = "medium",
+        coefficient_type: str = "quintic",
+        num_ns_steps: int = 5,
+        scale_mode: str = "spectral",
+        extra_scale_factor: float = 1.0,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        mode: Literal["blockwise", "duplicated", "distributed"] = "duplicated",
+    ) -> None:
+        if num_ns_steps < 1:
+            raise ValueError(f"num_ns_steps must be at least 1, got {num_ns_steps}")
+
+        def scaled_orthogonalize_fn(
+            grad: torch.Tensor,
+            tp_group: torch.distributed.ProcessGroup,
+            partition_dim: int | None = None,
+        ) -> torch.Tensor:
+            log_single_rank(
+                logger,
+                logging.DEBUG,
+                f'Orthogonalizing grad with {num_ns_steps} steps, {coefficient_type} coefficient, '
+                f'{scale_mode} scale mode, extra_scale_factor={extra_scale_factor}',
+            )
+            size = [grad.size(-2), grad.size(-1)]
+            if partition_dim is not None:
+                size[partition_dim] *= get_pg_size(tp_group)
+            orth_grad = newton_schulz_tp(
+                grad,
+                steps=num_ns_steps,
+                coefficient_type=coefficient_type,
+                tp_group=tp_group,
+                partition_dim=partition_dim,
+                mode="duplicated" if mode == "blockwise" else mode,
+            )
+            scale_factor = get_muon_scale_factor(size[0], size[1], mode=scale_mode)
+            return orth_grad * scale_factor * extra_scale_factor
+
+        self.pg_collection = pg_collection
+        self.mode = mode
+        self.split_qkv = split_qkv
+        self.is_qkv_fn = is_qkv_fn
+        self.qkv_split_shapes = qkv_split_shapes
+
+        weight_decay_method = "decoupled" if use_decoupled_weight_decay else "l2"
+        super().__init__(
+            params,
+            lr,
+            momentum_beta,
+            use_nesterov=use_nesterov,
+            weight_decay=weight_decay,
+            weight_decay_method=weight_decay_method,
+            fp32_matmul_prec=fp32_matmul_prec,
+            scaled_orthogonalize_fn=scaled_orthogonalize_fn,
+        )
+
+    def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        """Orthogonalize the momentum.
 
-    def enable_distributed_mode(self, global_buffer_sizes, dist_group, tp_group,
-                                dist_metas: Dict[torch.nn.Parameter, MuonDistMeta]):
-        """
-        enable distributed mode
         Args:
-            global_buffer_size: global buffer size
-            dist group: optimizer sharding group
-            tp group: param tp group
-            dist metas: dist metas for all param
-        """
+            p: The parameter tensor. i is necessary to pass param tensor in addition to momentum
+                because a lot of information is only available in the param tensor,
+                attributes for example.
+            grad: The momentum tensor.
 
-        self.global_buffer_sizes = global_buffer_sizes
-        self.dist_group = dist_group
-        self.tp_group = tp_group
-        self.dist_metas = dist_metas
-
-        world_size = dist.get_world_size(dist_group)
-        rank = dist.get_rank(dist_group)
-
-        # calc local buffer range
-        self.local_buffer_sizes = []
-        self.local_buffer_ranges = []
-        for bucket_sizes in global_buffer_sizes:
-            local_bucket_sizes = []
-            local_bucket_ranges = []
-            for (global_bucket_size, bucket_offset) in bucket_sizes:
-                assert global_bucket_size % world_size == 0
-                local_buffer_size = global_bucket_size // world_size
-                local_buffer_start = local_buffer_size * rank + bucket_offset
-                local_buffer_range = (local_buffer_start, local_buffer_start + local_buffer_size)
-                local_bucket_sizes.append(local_buffer_size)
-                local_bucket_ranges.append(local_buffer_range)
-
-            self.local_buffer_sizes.append(local_bucket_sizes)
-            self.local_buffer_ranges.append(local_bucket_ranges)
-
-        # calc local range for params
-        for dist_meta in dist_metas.values():
-            local_buffer_range = self.local_buffer_ranges[dist_meta.buffer_idx][dist_meta.bucket_idx]
-            dist_meta.set_local_buffer_range(local_buffer_range)
-
-        self.distributed_mode = True
-
-    def step(self):
-
-        for group in self.param_groups:
-            if 'step' in group:
-                group['step'] += 1
-            else:
-                group['step'] = 1
-
-        dtype = torch.bfloat16
-        device = torch.cuda.current_device()
-
-        ns_inputs = {}
-
-        # update muon momentum first
-        for group in self.param_groups:
-
-            if not group.get("use_muon", False):
-                continue
-
-            momentum = group['momentum']
-            params = group["params"]
-
-            for p in params:
-
-                g = p.grad
-                assert g is not None
-                # 1-dim grad for distributed mode
-                assert self.distributed_mode or g.dim() == 2
-
-                # prepare muon buffer in state
-                state = self.state[p]
-                if not "muon_buffer" in state:
-                    state["muon_buffer"] = torch.zeros_like(g)
-                buf = state["muon_buffer"]
-                buf.mul_(momentum).add_(g)
-
-                # save to ns input
-                g = g.add(buf, alpha=momentum) if group['nesterov'] else buf
-                ns_inputs[p] = g.bfloat16()
-        
-        # rewrite ns_inputs if distributed
-        if self.distributed_mode:
-
-            # initialize buffers
-            ns_input_local_buffers = [
-                [ torch.empty((local_buffer_size), device=device, dtype=dtype)
-                    for local_buffer_size in local_bucket_sizes ]
-                for local_bucket_sizes in self.local_buffer_sizes
-            ]
-            ns_input_global_buffers = [
-                [ torch.empty((global_buffer_size), device=device, dtype=dtype)
-                    for (global_buffer_size, bucket_offset) in global_bucket_sizes ]
-                for global_bucket_sizes in self.global_buffer_sizes
+        Returns:
+            The orthogonalized gradient tensor.
+        """
+        # TODO(deyuf): switch to group
+        if self.pg_collection:
+            tp_group = (
+                self.pg_collection.expt_tp
+                if getattr(p, 'expert_tp', False)
+                else self.pg_collection.tp
+            )
+        else:
+            tp_group = None
+        partition_dim = None if self.mode == "blockwise" else getattr(p, "partition_dim", None)
+        if partition_dim == -1:
+            # emerging-optimizers use None instead of -1 to indicate no tensor parallel
+            partition_dim = None
+
+        if self.split_qkv and self.is_qkv_fn(p):  # type: ignore[misc]
+            # split grouped attention parameters (e.g., QKV, GQA, etc.)
+            grad_shape = grad.shape
+            log_single_rank(
+                logger,
+                logging.DEBUG,
+                f'qkv split grad shape {grad_shape}, split shapes {self.qkv_split_shapes}',
+            )
+            num_query_groups = grad_shape[0] // sum(self.qkv_split_shapes)
+            qkv_grads = torch.split(
+                grad.view(num_query_groups, sum(self.qkv_split_shapes), -1),
+                self.qkv_split_shapes,
+                dim=1,
+            )
+            qkv_grads = [g.reshape(-1, grad_shape[-1]) for g in qkv_grads]
+
+            # Apply Newton-Schulz and scales to each component, concat back
+            qkv_grads = [
+                self.scaled_orthogonalize_fn(g, tp_group, partition_dim).view(
+                    num_query_groups, -1, grad_shape[-1]
+                )
+                for g in qkv_grads
             ]
-
-            # fill ns input data to local buffer
-            for param, ns_input in ns_inputs.items():
-                dist_meta = self.dist_metas[param]
-                ns_input_local_buffer = ns_input_local_buffers[dist_meta.buffer_idx][dist_meta.bucket_idx]
-                local_buffer_range = self.local_buffer_ranges[dist_meta.buffer_idx][dist_meta.bucket_idx]
-                local_range = normalize_range(dist_meta.local_range, local_buffer_range[0])
-                ns_input_local_buffer[local_range[0]:local_range[1]].copy_(ns_input.view(-1))
-
-            # all gather buffers
-            for ns_input_global_buffer, ns_input_local_buffer in zip(ns_input_global_buffers, ns_input_local_buffers):
-                for ns_input_global_bucket, ns_input_local_bucket in zip(ns_input_global_buffer, ns_input_local_buffer):
-                    dist.all_gather_into_tensor(ns_input_global_bucket, ns_input_local_bucket, group=self.dist_group)
-
-            # overwrite ns input
-            for p in ns_inputs.keys():
-                dist_meta = self.dist_metas[p]
-                ns_input_global_buffer = ns_input_global_buffers[dist_meta.buffer_idx][dist_meta.bucket_idx]
-                global_range = dist_meta.global_range
-                offset = self.global_buffer_sizes[dist_meta.buffer_idx][dist_meta.bucket_idx][1]
-                ns_inputs[p] = ns_input_global_buffer[global_range[0] - offset : global_range[1] - offset].view(dist_meta.shape)
-
-            # set tp info
-            tp_world_size = dist.get_world_size(self.tp_group)
-            tp_rank = dist.get_rank(self.tp_group)
-
-        # update muon momentum first
-        for group in self.param_groups:
-
-            if not group.get('use_muon', False):
-                continue
-
-            lr = group["lr"]
-            ns_steps = group["ns_steps"]
-            weight_decay = group["weight_decay"]
-            matched_adamw_rms = group["matched_adamw_rms"]
-            params = group["params"]
-
-            for p in params:
-
-                ns_input = ns_inputs[p]
-                tp_split_dim = -1
-
-                if self.distributed_mode:
-                    dist_meta = self.dist_metas[p]
-                    tp_split_dim = dist_meta.tp_split_dim
-
-                # gather tensor parallel ( if tp )
-                if tp_split_dim != -1:
-                    ns_input_shards = [ torch.empty_like(ns_input) for _ in range(tp_world_size) ]
-                    dist.all_gather(ns_input_shards, ns_input, self.tp_group)
-                    ns_input = torch.cat(ns_input_shards, dim=tp_split_dim)
-
-                # calc update
-                update = zeropower_via_newtonschulz5(ns_input, steps=ns_steps)
-
-                # only local tp part
-                if tp_split_dim != -1:
-                    update = update.chunk(tp_world_size, dim=tp_split_dim)[tp_rank]
-
-                # only local buffer part
-                if self.distributed_mode:
-                    local_range_in_global_range = normalize_range(dist_meta.local_range, dist_meta.global_range[0])
-                    update = update.reshape(-1)[local_range_in_global_range[0]:local_range_in_global_range[1]]
-
-                # apply weight decay
-                p.data.mul_(1 - lr*weight_decay)
-
-                #  adjust lr and apply update
-                adjusted_lr = adjust_lr_wd_for_muon(lr, matched_adamw_rms, ns_input.shape)
-                p.data.add_(update, alpha=-adjusted_lr)
-
-        # use adam for other params
-        for group in self.param_groups:
-
-            if group.get('use_muon', False):
+            grad = torch.cat(qkv_grads, dim=1).view(grad_shape)
+        else:
+            grad = self.scaled_orthogonalize_fn(grad, tp_group, partition_dim)
+        return grad
+
+
+def get_megatron_muon_optimizer(
+    config: OptimizerConfig,
+    model_chunks: List[MegatronModule],
+    config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = None,
+    use_gloo_process_groups: bool = True,
+    layer_wise_distributed_optimizer: bool = False,
+    pg_collection: Optional[ProcessGroupCollection] = None,
+) -> MegatronOptimizer:
+    """This function is used to get the muon optimizer for the model chunks.
+    It is used to get the muon optimizer for the model chunks.
+
+    Args:
+        config (OptimizerConfig): optimizer configuration object.
+        model_chunks (List[MegatronModule]): model chunks to get optimizer for.
+        use_gloo_process_groups (bool): if false, disable use of Gloo process groups
+            in underlying Megatron optimizers.
+        layer_wise_distributed_optimizer (bool): if true, use layer-wise distributed optimizer.
+            Defaults to False.
+    """
+    # Muon currently use adam config. setting str here to call regular get for adam creation
+    # side effect is muon optimizer will have wrong name, i.e. config.optimizer == 'adam'
+    config.optimizer = 'adam'
+
+    assert HAVE_EMERGING_OPTIMIZERS, "Emerging Optimizers is not installed."
+
+    # Dist-opt is not supported due to strong coupling with how DDP init grad buffer
+    # In theory we can change DDP to enable use muon and dist-opt-adam together
+    if config.use_distributed_optimizer:
+        raise Exception('muon with dist optimizer is not supported.')
+    # only support bf16 w/o loss scale now
+    if config.fp16:
+        raise Exception('muon with fp16 is not supported.')
+
+    # before this function receive properly created collection
+    if pg_collection is None:
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+
+    log_single_rank(logger, logging.INFO, f'Setting up emerging optimizer with config {config}')
+
+    # Needed for torch_dist ckpt_format, unlike torch ckpt_format
+    # For other emerging optimizers, need to implement init_state_fn as well
+    # TODO(boxiangw): Improve usability after optimizer refactor
+    # TODO(boxiangw): support precision aware optimizer
+    def muon_init_state_fn(opt, config=None):
+        for group in opt.param_groups:
+            for p in group['params']:
+                if len(opt.state[p]) == 0:
+                    opt.state[p]['momentum_buffer'] = torch.zeros_like(p.data)
+
+    def adam_init_state_fn(opt, config=None):
+        for group in opt.param_groups:
+            for p in group['params']:
+                if len(opt.state[p]) == 0:
+                    if config is None or not config.use_precision_aware_optimizer:
+                        opt.state[p]['exp_avg'] = torch.zeros_like(p.data)
+                        opt.state[p]['exp_avg_sq'] = torch.zeros_like(p.data)
+                    else:
+                        opt.initialize_state(p)
+
+    optimizers = []
+    # record list of non/linear params
+    linear_params = []
+    nonlinear_params = []
+    for model_chunk in model_chunks:
+        # use config to determine qkv split shapes.
+        # no need to check tp since tp splits by head and this is per head(group) dimension
+        num_attention_heads = model_chunk.config.num_attention_heads
+        num_query_groups = model_chunk.config.num_query_groups
+        kv_channels = model_chunk.config.kv_channels
+        qkv_split_shapes = [
+            num_attention_heads // num_query_groups * kv_channels,
+            kv_channels,
+            kv_channels,
+        ]
+        for name, param in model_chunk.named_parameters():
+            if not param.requires_grad:
                 continue
-
-            step = group['step']
-            params = group["params"]
-            lr = group['lr']
-            weight_decay = group['weight_decay']
-            beta1, beta2 = group['adamw_betas']
-            eps = group['adamw_eps']
-
-            for p in params:
-
-                g = p.grad
-                assert g is not None
-                state = self.state[p]
-
-                if len(state) == 0:
-                    state['adamw_exp_avg'] = torch.zeros_like(g)
-                    state['adamw_exp_avg_sq'] = torch.zeros_like(g)
-
-                buf1 = state['adamw_exp_avg']
-                buf2 = state['adamw_exp_avg_sq']
-                buf1.lerp_(g, 1-beta1)
-                buf2.lerp_(g.square(), 1-beta2)
-
-                g = buf1 / (eps + buf2.sqrt())
-
-                bias_correction1 = 1 - beta1**step
-                bias_correction2 = 1 - beta2**step
-                scale = bias_correction1 / bias_correction2**0.5
-                p.data.mul_(1 - lr * weight_decay)
-                p.data.add_(g, alpha=-lr/scale)
+            # add flag for expert weight so optimizer can figure which tp group it uses
+            # alternatively, create new param group and save tp_group. this require more
+            # change in optimizer
+            if 'experts' in name and 'shared' not in name:
+                param.expert_tp = True
+            # add flag for qkv parameter
+            # TODO(deyuf): support MLA
+            if 'linear_qkv.weight' in name and len(param.shape) == 2:
+                param.is_qkv = True
+            # TODO(deyuf): currently only allow 2D non-embedding weight to avoid breaking
+            if (
+                not getattr(param, 'is_embedding_or_output_parameter', False)
+                and len(param.shape) == 2
+            ):
+                linear_params.append(param)
+            else:
+                nonlinear_params.append(param)
+
+    muon_kwargs = {
+        "lr": config.lr,
+        "momentum_beta": config.muon_momentum,
+        "use_nesterov": config.muon_use_nesterov,
+        "weight_decay": config.weight_decay,
+        "fp32_matmul_prec": config.muon_fp32_matmul_prec,
+        "num_ns_steps": config.muon_num_ns_steps,
+        "scale_mode": config.muon_scale_mode,
+        "split_qkv": config.muon_split_qkv,
+        "is_qkv_fn": lambda p: getattr(p, "is_qkv", False),
+        "qkv_split_shapes": qkv_split_shapes,
+        "extra_scale_factor": config.muon_extra_scale_factor,
+        "pg_collection": pg_collection,
+        "mode": config.muon_tp_mode,
+    }
+
+    # freezing nonlinear params and get param groups for muon
+    for param in nonlinear_params:
+        param.requires_grad = False
+
+    linear_param_groups = _get_param_groups(model_chunks, config, config_overrides)
+    # if layerwise distributed optimizer is not used, need to handle ep params separately
+    expert_param_groups = []
+    if not layer_wise_distributed_optimizer:
+        for group in linear_param_groups:
+            if group['is_expert_parallel']:
+                expert_param_groups.append(group)
+                linear_param_groups.remove(group)
+
+    optimizer = TensorParallelMuon(linear_param_groups, **muon_kwargs)
+
+    reset_config_bf16 = False
+    if config.bf16:
+        if layer_wise_distributed_optimizer:
+            # creating master weight before layerwise sharding will lead to unnecessary master
+            # weight so here we delay master weight creation into layer_wise unset config.bf16
+            # will also result in all optimizers below(adam) to also not be wrapped
+            config.bf16 = False
+            reset_config_bf16 = True
+        else:
+            # if not using layer_wise wrapper, just create master weight here is fine
+            optimizer = Float16OptimizerWithFloat16Params(
+                optimizer, config, None, muon_init_state_fn
+            )
+    else:
+        optimizer = FP32Optimizer(optimizer, config, muon_init_state_fn)
+
+    optimizers.append(optimizer)
+
+    # expert optimizer exists meaning layerwise distributed optimizer is not used
+    if len(expert_param_groups) > 0:
+        expert_optimizer = TensorParallelMuon(expert_param_groups, **muon_kwargs)
+        if config.bf16:
+            expert_optimizer = Float16OptimizerWithFloat16Params(
+                expert_optimizer, config, None, muon_init_state_fn
+            )
+        else:
+            expert_optimizer = FP32Optimizer(expert_optimizer, config, muon_init_state_fn)
+        setattr(expert_optimizer, 'grad_stats_parallel_group', pg_collection.tp_ep_pp)
+        optimizers.append(expert_optimizer)
+
+    # done with muon, unfreeze nonlinear and freeze linear
+    for param in nonlinear_params:
+        param.requires_grad = True
+    for param in linear_params:
+        param.requires_grad = False
+
+    # call original get. linear params will be skipped since they're freezed
+    chained_adam = get_megatron_optimizer(
+        config,
+        model_chunks,
+        config_overrides=config_overrides,
+        use_gloo_process_groups=use_gloo_process_groups,
+    )
+
+    # unfreeze everything
+    for param in linear_params:
+        param.requires_grad = True
+
+    # chain everything together
+    init_fns = [muon_init_state_fn] + len(chained_adam.chained_optimizers) * [adam_init_state_fn]
+    optimizers += chained_adam.chained_optimizers
+
+    if layer_wise_distributed_optimizer:
+        log_single_rank(logger, logging.INFO, 'Using LayerWiseDistributedOptimizer for Muon')
+        if reset_config_bf16:
+            config.bf16 = True
+        return LayerWiseDistributedOptimizer(
+            optimizers, config, pg_collection, init_state_fn_list=init_fns
+        )
+    return ChainedOptimizer(optimizers)
diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py
index 67dad6efebe..71b34cc7736 100644
--- a/megatron/core/optimizer/optimizer.py
+++ b/megatron/core/optimizer/optimizer.py
@@ -94,8 +94,8 @@ def _multi_tensor_copy_this_to_that(
         for this_, that_ in zip(this, that):
             that_.copy_(this_)
 
-
-param_group_identifier_keys = ('wd_mult', 'lr_mult', 'is_expert_parallel', 'is_decoupled_lr', 'use_muon', 'is_vision_model_param') ####FlagScale add is_vision_model_param
+#### FlagScale add is_vision_model_param
+param_group_identifier_keys = ('wd_mult', 'lr_mult', 'is_expert_parallel', 'is_decoupled_lr', 'is_vision_model_param')
 
 
 class MegatronOptimizer(ABC):
@@ -155,7 +155,9 @@ def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]:
                 grad = param.grad
             grad_not_none = grad is not None
             is_not_shared = param_is_not_shared(param)
-            is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param)
+            is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(
+                param, getattr(self, 'tp_group', None)
+            )
             if grad_not_none and is_not_shared and is_not_tp_duplicate:
                 grads_for_norm.append(grad)
 
@@ -227,6 +229,7 @@ def count_zeros(self) -> float:
             params,
             grad_stats_parallel_group=self.get_grad_stats_parallel_group(),
             use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8,
+            tp_group=getattr(self, 'tp_group', None),
         )
 
     @abstractmethod
diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py
index 185c11c7d25..c0c97785ca1 100644
--- a/megatron/core/optimizer/optimizer_config.py
+++ b/megatron/core/optimizer/optimizer_config.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import fnmatch
 from dataclasses import dataclass, field
 from typing import Callable, Optional, Tuple, Union
 
@@ -8,6 +9,58 @@
 from ..utils import is_te_min_version
 
 
+@dataclass(frozen=True)
+class ParamPredicate:
+    """Wraps a matching function to make it hashable for ParamKey.
+    Example:
+        >>> shape_1_param = ParamPredicate(name="s1", fn=lambda param: len(param.shape) == 1)
+        >>> shape_1_param(torch.empty(10))
+        True
+        >>> shape_1_param_copy = ParamPredicate(name="s1", fn=lambda param: len(param.shape) == 1)
+        >>> shape_1_param == shape_1_param_copy  # name is used to match
+        True
+        >>> {shape_1_param, shape_1_param_copy} == {shape_1_param}  # set hashing works properly
+
+    NOTE:
+        __hash__ and __eq__ are automatically generated by @dataclass(frozen=True)
+        based solely on 'name' because we set compare=False/hash=False on 'fn'.
+    """
+
+    name: str
+    fn: Callable[[torch.nn.Parameter], bool] = field(compare=False, hash=False)
+
+    def __call__(self, param: torch.nn.Parameter) -> bool:
+        return self.fn(param)
+
+
+@dataclass(frozen=True)
+class ParamWithNamePredicate:
+    """Wraps a matching function to make it hashable for ParamKey.
+    Example:
+        >>> shape_1_not_qkln_param = ParamWithNamePredicate(
+                name="s1_not_qkln",
+                fn=lambda param, name: (
+                    len(param.shape) == 1 or name.endswith(".bias")
+                    and not ("q_layernorm." in name or "k_layernorm." in name)
+                )
+            )
+        >>> shape_1_not_qkln_param(torch.empty(10), "interesting.bias")
+        True
+        >>> shape_1_not_qkln_param(torch.empty(10), "interesting.q_layernorm.bias")
+        False
+
+    NOTE:
+        __hash__ and __eq__ are automatically generated by @dataclass(frozen=True)
+        based solely on 'name' because we set compare=False/hash=False on 'fn'.
+    """
+
+    name: str
+    fn: Callable[[torch.nn.Parameter, str], bool] = field(compare=False, hash=False)
+
+    def __call__(self, param: torch.nn.Parameter, name: str) -> bool:
+        return self.fn(param, name)
+
+
 @dataclass(frozen=True, slots=True)
 class ParamKey:
     """Key to group parameters by. All such grouped parameters can share an
@@ -16,11 +69,71 @@ class ParamKey:
     # TODO: Can add layer_id here later.
 
     name: Union[str, Tuple[str]] = field(default_factory=tuple)
-    """Parameter name(s)."""
+    """Parameter name(s), will use unix filesystem path syntax for matching."""
 
     attr: Union[str, Tuple[str]] = field(default_factory=tuple)
     """Parameter attribute(s)."""
 
+    predicate: Union[ParamPredicate, Tuple[ParamPredicate]] = field(default_factory=tuple)
+    """Predicate(s) to match parameters by. If multiple predicates are provided, any must match."""
+
+    with_name_predicate: Union[ParamWithNamePredicate, Tuple[ParamWithNamePredicate]] = field(
+        default_factory=tuple
+    )
+    """
+    Predicate(s) to match parameters with their name. If multiple predicates are provided, 
+      any must match. This is useful if you need to filter out some parameters from an otherwise 
+      positive match by their name.
+    """
+
+    def matches(self, param: torch.nn.Parameter, param_name: str) -> bool:
+        """Returns true if passed-in parameter (with name) matches `param_key`.
+
+        Args:
+            param (torch.nn.Parameter): Handle to parameter object.
+            param_name (str): Name of parameter in underlying PyTorch module.
+
+        Returns:
+            bool: True if parameter matches passed-in param_key.
+        """
+
+        # Check if name matches.
+        if isinstance(self.name, str):
+            target_names = [self.name]
+        else:
+            target_names = list(self.name)
+        for target_name in target_names:
+            if fnmatch.fnmatch(param_name, target_name):
+                return True
+
+        # Check if attribute matches.
+        if isinstance(self.attr, str):
+            target_attrs = [self.attr]
+        else:
+            target_attrs = list(self.attr)
+        for target_attr in target_attrs:
+            if getattr(param, target_attr, False):
+                return True
+
+        # Check if predicate matches.
+        if isinstance(self.predicate, ParamPredicate):
+            if self.predicate(param):
+                return True
+        else:
+            for predicate in self.predicate:
+                if predicate(param):
+                    return True
+
+        # Check if with_name_predicate matches.
+        if isinstance(self.with_name_predicate, ParamWithNamePredicate):
+            if self.with_name_predicate(param, param_name):
+                return True
+        else:
+            for predicate in self.with_name_predicate:
+                if predicate(param, param_name):
+                    return True
+        return False
+
 
 @dataclass
 class OptimizerConfig:
@@ -38,9 +151,20 @@ class OptimizerConfig:
     min_lr: Optional[float] = None
     """Minumum value for learning rate. The scheduler clip values below this threshold."""
 
+    decoupled_lr: Optional[float] = None
+    """Separate learning rate for the input and output layer."""
+
+    decoupled_min_lr: Optional[float] = None
+    """Minimum value for learning rate for the input and output layer. The scheduler clip values
+       below this threshold.
+    """
+
     weight_decay: float = 0.01
     """Weight decay coefficient for L2 regularization."""
 
+    apply_wd_to_qk_layernorm: bool = False
+    """If true, apply weight decay to qk layernorm as a special case."""
+
     ##############
     # Precision
     ##############
@@ -131,6 +255,31 @@ class OptimizerConfig:
     sgd_momentum: float = 0.9
     """Momentum factor for SGD optimizer."""
 
+    # Muon.
+    # TODO: move muon configs to it's own `MuonConfig`.
+    muon_momentum: float = 0.95
+    """The momentum used by the internal SGD."""
+
+    muon_split_qkv: bool = True
+    """Whether to split QKV parameters for Muon optimizer."""
+
+    muon_use_nesterov: bool = False
+    """Whether to use Nesterov-style momentum in the internal SGD."""
+
+    muon_scale_mode: str = "spectral"
+    """The mode to use for the scale factor. Defaults to "spectral"."""
+
+    muon_fp32_matmul_prec: str = "medium"
+    """The precision to use for the fp32 matmul. Defaults to "medium"."""
+
+    muon_num_ns_steps: int = 5
+    """The number of iteration steps to use in the Newton-Schulz iteration."""
+
+    muon_tp_mode: str = "blockwise"
+    """How to perform NS calculation for tensor parallel weights. Defaults to "blockwise"."""
+
+    muon_extra_scale_factor: float = 1.0
+    """Additional scale factor for the muon update."""
 
     #######################
     # Distributed optimizer
@@ -314,14 +463,3 @@ class SGDOptimizerConfig(OptimizerConfig):
 
     sgd_momentum: float = 0.9
     """Momentum factor for SGD optimizer."""
-
-
-@dataclass
-class MuonOptimizerConfig(OptimizerConfig):
-    # Muon.
-    muon_momentum: float = 0.95
-    """Momentum factor for Muon optimizer."""
-    muon_nesterov: bool = True
-    muon_ns_steps: int = 5
-    """The adamw update rms that muon is designed to matched, typicially 0.2 ~ 0.4"""
-    muon_matched_adamw_rms: float = 0.2
\ No newline at end of file
diff --git a/megatron/core/optimizer/qk_clip.py b/megatron/core/optimizer/qk_clip.py
new file mode 100644
index 00000000000..72127f94712
--- /dev/null
+++ b/megatron/core/optimizer/qk_clip.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from megatron.core import parallel_state
+
+
+def clip_qk(model, log_max_only=False) -> float:
+    """
+    Clip the QK attention logits to the threshold, recommended for Muon optimizer.
+
+    Args:
+        model: The model to clip the QK attention logits, a list of model chunks.
+        log_only: Whether to only log the max attention logit, without updating the weights.
+
+    Returns:
+        The maximum attention logit, a float.
+    """
+
+    with torch.no_grad():
+        log_max_attention_logit = 0
+        for model_chunk in model:
+            for transformer_layer in model_chunk.module.module.decoder.layers:
+                if hasattr(transformer_layer.self_attention, 'clip_qk'):
+                    torch.distributed.all_reduce(
+                        transformer_layer.self_attention.core_attention.current_max_attn_logits,
+                        op=torch.distributed.ReduceOp.MAX,
+                        group=parallel_state.get_data_parallel_group(with_context_parallel=True),
+                    )
+                    log_max_attention_logit = max(
+                        log_max_attention_logit,
+                        torch.max(
+                            transformer_layer.self_attention.core_attention.current_max_attn_logits
+                        ).item(),
+                    )
+                    if not log_max_only:
+                        transformer_layer.self_attention.clip_qk()
+
+    return log_max_attention_logit
diff --git a/megatron/core/optimizer_param_scheduler.py b/megatron/core/optimizer_param_scheduler.py
index f8533f3d8b7..a38016692ef 100644
--- a/megatron/core/optimizer_param_scheduler.py
+++ b/megatron/core/optimizer_param_scheduler.py
@@ -3,16 +3,99 @@
 """Learning rate decay and weight decay incr functions."""
 import logging
 import math
-from typing import Optional
+from typing import TYPE_CHECKING, Any, Optional, TypedDict
 
-from megatron.core.optimizer import MegatronOptimizer
 from megatron.core.utils import log_single_rank
 
+if TYPE_CHECKING:
+    # Avoid circular import.
+    from megatron.core.optimizer import MegatronOptimizer
+
 logger = logging.getLogger(__name__)
 
 from megatron.plugin.decorators import plugin_method
 
 
+class ParamGroupOverride(TypedDict):
+    """Override values for a parameter group. These values may be optimizer-state/scheduler related.
+
+    These are the values you see later in param_group.get(...) calls in the
+        OptimizerParamScheduler.get_lr and get_wd methods. If you use a custom optimizer
+        or scheduler, you could override those variables instead.
+
+    Example:
+        >>> param_group_override = ParamGroupOverride(min_lr=1e-4, wd_mult=0.1)
+        >>> param_group_override == ParamGroupOverride(newvar=3) # this is ok too
+
+    """
+
+    max_lr: float
+    min_lr: float
+    start_wd: float
+    end_wd: float
+    wd_mult: float
+
+
+def get_canonical_lr_for_logging(param_groups: list[dict]) -> float | None:
+    """Return the lr of the first ``default_config=True`` param group.
+
+    All ``default_config`` groups share the same LR schedule, so the first one
+    is representative.  This includes empty rank-alignment stub groups, which
+    the scheduler still writes a valid lr onto.
+
+    Args:
+        param_groups (list[dict]): parameter groups from the optimizer.
+
+    Returns:
+        float | None: the canonical learning rate, or None if no
+            ``default_config=True`` group is found.
+    """
+    for param_group in param_groups:
+        if param_group.get('default_config', False):
+            return param_group.get('lr')
+    return None
+
+
+def param_group_override_to_tuple(
+    param_group_override: ParamGroupOverride | None,
+) -> tuple[tuple[str, Any], ...] | None:
+    """Convert a param group override to a tuple for use as a key in a dictionary.
+
+    The tuple is sorted by the keys of the param group override to handle different orderings of
+     the keys in different override dictionaries which still mean the same thing.
+    """
+    if param_group_override is None:
+        return None
+    return tuple(sorted(param_group_override.items()))
+
+
+def combine_param_group_overrides(
+    param_group_overrides: list[ParamGroupOverride | None],
+) -> ParamGroupOverride:
+    """Combine a list of param group overrides into a single param group override.
+
+    This function ensures that the overrides are not conflicting as well.
+
+    Args:
+        param_group_overrides (list[ParamGroupOverride]): list of param group overrides to combine
+
+    Returns:
+        ParamGroupOverride: combined param group override
+    """
+    combined_override = ParamGroupOverride()
+    for override in param_group_overrides:
+        if override is None:
+            continue
+        for key, value in override.items():
+            if key in combined_override:
+                if combined_override[key] != value:
+                    raise ValueError(
+                        f"Conflicting overrides for {key}: {combined_override[key]} and {value}"
+                    )
+            combined_override[key] = value
+    return combined_override
+
+
 class OptimizerParamScheduler:
     """Anneals learning rate and weight decay
 
@@ -40,7 +123,7 @@ class OptimizerParamScheduler:
 
     def __init__(
         self,
-        optimizer: MegatronOptimizer,
+        optimizer: "MegatronOptimizer",
         init_lr: float,
         max_lr: float,
         min_lr: float,
@@ -215,6 +298,10 @@ def step(self, increment: int) -> None:
             increment (int): number of steps to increment
         """
         self.num_steps += increment
+        # Do not skip empty param groups: get_canonical_lr_for_logging reads lr
+        # from default_config groups regardless of whether they hold parameters.
+        # This is important for logging under model parallelism that may leave
+        # some ranks with empty default_config parameter groups.
         for param_group in self.optimizer.param_groups:
             param_group['lr'] = self.get_lr(param_group)
             param_group['weight_decay'] = self.get_wd(param_group) * param_group.get('wd_mult', 1.0)
diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py
index 8a8bafb414a..5a37e6efc67 100644
--- a/megatron/core/package_info.py
+++ b/megatron/core/package_info.py
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 16
 PATCH = 0
-PRE_RELEASE = 'rc0'
+PRE_RELEASE = ""
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py
index 330d0e03471..08ebdac67d8 100644
--- a/megatron/core/packed_seq_params.py
+++ b/megatron/core/packed_seq_params.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 from dataclasses import dataclass
 
+import torch.distributed as dist
 from torch import Tensor
 
 
@@ -18,3 +19,5 @@ class PackedSeqParams:
     cu_seqlens_kv_padded: Tensor = None
     max_seqlen_q: int = None
     max_seqlen_kv: int = None
+    local_cp_size: int = None
+    cp_group: dist.ProcessGroup = None
diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py
index f666451ba59..1d76995bda8 100644
--- a/megatron/core/parallel_state.py
+++ b/megatron/core/parallel_state.py
@@ -6,12 +6,13 @@
 import os
 import warnings
 from datetime import timedelta
+from math import log2
 from typing import Callable, List, Optional
 
 import numpy as np
 import torch
 
-from .utils import GlobalMemoryBuffer, is_torch_min_version
+from .utils import GlobalMemoryBuffer, GlobalSymmetricMemoryBuffer, is_torch_min_version
 
 from megatron.plugin.hetero.parallel_context import get_parallel_context  
 
@@ -25,8 +26,6 @@
 except ImportError:
     HAVE_EINOPS = False
 
-logger = logging.getLogger(__name__)
-
 # Intra-layer model parallel group that the current rank belongs to.
 _TENSOR_MODEL_PARALLEL_GROUP = None
 # Inter-layer model parallel group that the current rank belongs to.
@@ -105,6 +104,10 @@
 # the first local rank in the tensor model parallel group
 _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None
 
+# A list of global ranks for each expert model parallel group to ease calculation of
+# the first local rank in the expert model parallel group
+_EXPERT_MODEL_PARALLEL_RANKS = None
+
 # A list of global ranks for each model parallel group to ease calculation of
 # the first local rank in the model parallel group
 _MODEL_PARALLEL_GLOBAL_RANKS = None
@@ -116,9 +119,12 @@
 _CONTEXT_PARALLEL_GLOBAL_RANKS = None
 # Hierarchical context parallel groups
 _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS = None
+# Hybrid context parallel groups
+_HYBRID_DP_CP_GROUPS = {}
 
 # Data parallel group information with context parallel combined.
 _DATA_PARALLEL_GROUP_WITH_CP = None
+_DATA_PARALLEL_GROUP_WITH_CP_AG = None
 _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None
 _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None
 
@@ -140,6 +146,9 @@
 # Memory buffers to avoid dynamic memory allocation
 _GLOBAL_MEMORY_BUFFER = None
 
+# Global symmetric memory buffer for inference
+_GLOBAL_SYMMETRIC_MEMORY_BUFFER = None
+
 # List of all process groups
 # Used for updating the timeout for all process groups
 # None represents the default process group
@@ -418,6 +427,31 @@ def create_hierarchical_groups(
     return hierarchical_groups, hierarchical_groups_gloo
 
 
+def create_hybrid_dp_cp_groups(rank, ranks, pg_options):
+    """
+    Creates groups required for hybrid DPxCP.
+    Creates a new group for every power of 2 up to the number of DPxCP ranks.
+    Returns a dictionary indexed by group size.
+    """
+    hybrid_dp_cp_groups = {}
+    # Generate group for every power of 2 up to the number of CP ranks
+    # We limit the allowed group sizes in order to avoid excessive overhead.
+    group_sizes = [2**i for i in range(int(log2(len(ranks))))][1:]
+    for group_size in group_sizes:
+        for i in range(0, len(ranks), group_size):
+            group = create_group(
+                ranks[i : i + group_size],
+                pg_options=pg_options,
+                group_desc=f"HYBRID_DP_CP_GROUP_{group_size}",
+            )
+            if rank in ranks[i : i + group_size]:
+                assert (
+                    group_size not in hybrid_dp_cp_groups
+                ), f"Rank {rank} appears in multiple Hybrid DP CP groups of size {group_size}"
+                hybrid_dp_cp_groups[group_size] = group
+    return hybrid_dp_cp_groups
+
+
 class RankGenerator(object):
     """A class for generating rank groups for different modes of parallelism."""
 
@@ -527,6 +561,7 @@ def initialize_model_parallel(
     use_sharp: bool = False,
     context_parallel_size: int = 1,
     hierarchical_context_parallel_sizes: Optional[List[int]] = None,
+    hybrid_context_parallel: bool = False,
     expert_model_parallel_size: int = 1,
     num_distributed_optimizer_instances: int = 1,
     expert_tensor_parallel_size: Optional[int] = None,
@@ -538,6 +573,7 @@ def initialize_model_parallel(
     create_gloo_process_groups: bool = True,
     high_priority_stream_groups: Optional[List[str]] = None,
     sharp_enabled_group: Optional[str] = None,
+    create_all_gather_group: Optional[bool] = False,
     create_dualpipev_parallel_size: bool = False,
 ) -> None:
     """Initialize model data parallel groups.
@@ -653,6 +689,13 @@ def initialize_model_parallel(
             By default (None), it is enabled from dp group.
             Available options (choose one): [dp, dp_replica]
 
+        create_all_gather_group (bool, default = False):
+            Create a separate process group for all-gather operations to avoid
+            head-of-line blocking with reduce-scatter operations. When enabled,
+            creates an additional NCCL communicator with identical ranks as the
+            dp-cp group but with independent progress engines for better communication
+            overlap.
+
     Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
     use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
     the model pipeline. The present function will
@@ -795,6 +838,7 @@ def initialize_model_parallel(
     global _DATA_PARALLEL_GROUP_GLOO
     global _DATA_PARALLEL_GLOBAL_RANKS
     global _DATA_PARALLEL_GROUP_WITH_CP
+    global _DATA_PARALLEL_GROUP_WITH_CP_AG
     global _DATA_PARALLEL_GROUP_WITH_CP_GLOO
     global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP
     global _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP
@@ -826,6 +870,15 @@ def initialize_model_parallel(
             pg_options=get_nccl_options("dp_cp", nccl_comm_cfgs),
             group_desc="DATA_PARALLEL_GROUP_WITH_CP",
         )
+        if create_all_gather_group:
+            group_with_cp_ag = create_group(
+                ranks_with_cp,
+                timeout=timeout,
+                pg_options=get_nccl_options("dp_cp", nccl_comm_cfgs),
+                group_desc="DATA_PARALLEL_GROUP_WITH_CP_AG",
+            )
+        else:
+            group_with_cp_ag = None
         if create_gloo_process_groups:
             group_with_cp_gloo = create_group(
                 ranks_with_cp,
@@ -837,6 +890,7 @@ def initialize_model_parallel(
             group_with_cp_gloo = None
         if rank in ranks_with_cp:
             _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp
+            _DATA_PARALLEL_GROUP_WITH_CP_AG = group_with_cp_ag
             _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo
             _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp
 
@@ -896,6 +950,19 @@ def initialize_model_parallel(
         if "NCCL_COLLNET_ENABLE" in os.environ:
             del os.environ["NCCL_COLLNET_ENABLE"]
 
+    if hybrid_context_parallel:
+        global _HYBRID_DP_CP_GROUPS
+        for ranks_with_cp in decoder_rank_generator.get_ranks('dp-cp'):
+            assert (
+                len(ranks_with_cp) % 2 == 0
+            ), "Hybrid context parallel requires an even number of ranks"
+            _HYBRID_DP_CP_GROUPS.update(
+                create_hybrid_dp_cp_groups(
+                    rank, ranks_with_cp, get_nccl_options("dp_cp", nccl_comm_cfgs)
+                )
+            )
+        # TODO: Are gloo groups needed for hybrid cp?
+
     for ranks in decoder_rank_generator.get_ranks('dp'):
         group = create_group(
             ranks,
@@ -1136,7 +1203,7 @@ def initialize_model_parallel(
 
     ### Expert-related parallel groups initialization
     # Build the expert model parallel group
-    global _EXPERT_MODEL_PARALLEL_GROUP
+    global _EXPERT_MODEL_PARALLEL_GROUP, _EXPERT_MODEL_PARALLEL_RANKS
     assert _EXPERT_MODEL_PARALLEL_GROUP is None, 'Expert parallel group is already initialized'
     for ranks in expert_decoder_rank_generator.get_ranks('ep'):
         group = create_group(
@@ -1147,6 +1214,7 @@ def initialize_model_parallel(
         )
         if rank in ranks:
             _EXPERT_MODEL_PARALLEL_GROUP = group
+            _EXPERT_MODEL_PARALLEL_RANKS = ranks
 
     # Build the expert tensor parallel group
     global _EXPERT_TENSOR_PARALLEL_GROUP
@@ -1374,7 +1442,9 @@ def get_pipeline_model_parallel_group(check_initialized=True):
     return _PIPELINE_MODEL_PARALLEL_GROUP
 
 
-def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=False):
+def get_data_parallel_group(
+    with_context_parallel=False, partial_data_parallel=False, independent_all_gather=False
+):
     """Get the data-parallel group the caller rank belongs to."""
     para_ctx = get_parallel_context() 
     if para_ctx is not None:
@@ -1388,6 +1458,11 @@ def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=F
                 _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP is not None
             ), "Intra partial data parallel group is not initialized"
             return _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP
+        if independent_all_gather:
+            assert (
+                _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
+            ), "data parallel group with context parallel AG is not initialized"
+            return _DATA_PARALLEL_GROUP_WITH_CP_AG
         assert (
             _DATA_PARALLEL_GROUP_WITH_CP is not None
         ), "data parallel group with context parallel combined is not initialized"
@@ -1398,6 +1473,15 @@ def get_data_parallel_group(with_context_parallel=False, partial_data_parallel=F
         return _DATA_PARALLEL_GROUP
 
 
+def has_separate_all_gather_group() -> bool:
+    """Check if a separate all-gather process group has been created.
+
+    Returns True if a dedicated all-gather process group exists for improved
+    communication overlap, False otherwise.
+    """
+    return _DATA_PARALLEL_GROUP_WITH_CP_AG is not None
+
+
 def get_data_parallel_group_gloo(with_context_parallel=False, partial_data_parallel=False):
     """Get the Gloo data-parallel group the caller rank belongs to."""
     para_ctx = get_parallel_context() 
@@ -1457,6 +1541,18 @@ def get_hierarchical_context_parallel_groups(check_initialized=True):
     return _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS
 
 
+def get_hybrid_data_context_parallel_groups(check_initialized=True, group_size=None):
+    """Get the hybrid context parallel groups the caller rank belongs to."""
+    # If the group size is the same as the entire DPxCP group, return the original group
+    if get_data_parallel_world_size(with_context_parallel=True) == group_size:
+        if check_initialized:
+            assert _DATA_PARALLEL_GROUP_WITH_CP is not None
+        return _DATA_PARALLEL_GROUP_WITH_CP
+    if check_initialized:
+        assert _HYBRID_DP_CP_GROUPS is not None
+    return _HYBRID_DP_CP_GROUPS[group_size]
+
+
 def get_embedding_group(check_initialized=True):
     """Get the embedding group the caller rank belongs to."""
     para_ctx = get_parallel_context() 
@@ -1944,6 +2040,15 @@ def get_expert_model_parallel_group(check_initialized=True):
     return _EXPERT_MODEL_PARALLEL_GROUP
 
 
+def get_expert_model_parallel_src_rank():
+    """Calculate the global rank corresponding to the first local rank
+    in the expert model parallel group."""
+    assert (
+        _EXPERT_MODEL_PARALLEL_RANKS is not None
+    ), "Expert model parallel group is not initialized"
+    return _EXPERT_MODEL_PARALLEL_RANKS[0]
+
+
 def get_expert_model_parallel_world_size():
     """Return world size for the expert-model-parallel group."""
     para_ctx = get_parallel_context() 
@@ -2225,6 +2330,17 @@ def _set_global_memory_buffer():
     _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
 
 
+def _set_global_symmetric_memory_buffer():
+    """Initialize global buffer."""
+    global _GLOBAL_SYMMETRIC_MEMORY_BUFFER
+    assert _GLOBAL_SYMMETRIC_MEMORY_BUFFER is None, "global memory buffer is already initialized"
+
+    _GLOBAL_SYMMETRIC_MEMORY_BUFFER = GlobalSymmetricMemoryBuffer(
+        size_in_mb=256,  # todo: set from an argument?
+        process_group=get_tensor_model_parallel_group(),
+    )
+
+
 def get_global_memory_buffer():
     """Return the global GlobalMemoryBuffer object"""
     para_ctx = get_parallel_context() 
@@ -2235,6 +2351,14 @@ def get_global_memory_buffer():
     return _GLOBAL_MEMORY_BUFFER
 
 
+def get_global_symmetric_memory_buffer():
+    """Return the global GlobalSymmetricMemoryBuffer object"""
+    assert (
+        _GLOBAL_SYMMETRIC_MEMORY_BUFFER is not None
+    ), "global symmetric memory buffer is not initialized"
+    return _GLOBAL_SYMMETRIC_MEMORY_BUFFER
+
+
 def destroy_global_memory_buffer():
     """Sets the global memory buffer to None"""
     para_ctx = get_parallel_context() 
@@ -2244,6 +2368,12 @@ def destroy_global_memory_buffer():
     _GLOBAL_MEMORY_BUFFER = None
 
 
+def destroy_global_symmetric_memory_buffer():
+    """Sets the global symmetric memory buffer to None"""
+    global _GLOBAL_SYMMETRIC_MEMORY_BUFFER
+    _GLOBAL_SYMMETRIC_MEMORY_BUFFER = None
+
+
 def get_all_ranks():
     """Get caller's rank in tensor-model-parallel, data-parallel, context-parallel,
     pipeline-model-parallel and expert-model-parallel groups."""
@@ -2274,6 +2404,9 @@ def destroy_model_parallel():
     global _DATA_PARALLEL_GROUP_WITH_CP
     _DATA_PARALLEL_GROUP_WITH_CP = None
 
+    global _DATA_PARALLEL_GROUP_WITH_CP_AG
+    _DATA_PARALLEL_GROUP_WITH_CP_AG = None
+
     global _CONTEXT_PARALLEL_GROUP
     _CONTEXT_PARALLEL_GROUP = None
 
@@ -2324,6 +2457,9 @@ def destroy_model_parallel():
     global _GLOBAL_MEMORY_BUFFER
     _GLOBAL_MEMORY_BUFFER = None
 
+    global _GLOBAL_SYMMETRIC_MEMORY_BUFFER
+    _GLOBAL_SYMMETRIC_MEMORY_BUFFER = None
+
     global _DATA_PARALLEL_GROUP_GLOO
     if (
         _DATA_PARALLEL_GROUP_GLOO is not None
diff --git a/megatron/core/pipeline_parallel/bridge_communicator.py b/megatron/core/pipeline_parallel/bridge_communicator.py
index a67ded6bf08..f1e74a2f16d 100644
--- a/megatron/core/pipeline_parallel/bridge_communicator.py
+++ b/megatron/core/pipeline_parallel/bridge_communicator.py
@@ -628,9 +628,6 @@ def send_forward_recv_backward(
                 dist.broadcast(
                     shape_tensor, src=self.current_rank, group=self.src_grid_broadcast_pg
                 )
-                dist.broadcast(
-                    shape_tensor, src=self.current_rank, group=self.src_grid_broadcast_pg
-                )
 
                 # Broadcast the tensors to all ranks in the group
                 dist.broadcast(
diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py
new file mode 100644
index 00000000000..08e46a039e2
--- /dev/null
+++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py
@@ -0,0 +1,1267 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from collections import deque
+from contextlib import nullcontext
+from typing import Any, Dict, Tuple
+
+import torch
+
+# CPU offload implementation for pipeline parallelism
+DEBUG = False
+DEBUG_RANK = 0
+
+from megatron.core.transformer.cuda_graphs import is_graph_capturing
+
+
+def debug_rank(message):
+    """Print debug message for a specific rank when DEBUG is enabled."""
+    # pylint: disable=bad-builtin
+    if not DEBUG:
+        return
+    assert torch.distributed.is_initialized()
+    if torch.distributed.get_rank() == DEBUG_RANK:
+        print(message)
+
+
+def print_offload_summary_table(total_offload_bytes: Dict[str, int]):
+    """
+    Print an ASCII table summarizing offload bytes across all ranks.
+
+    Gathers offload data from all ranks and prints a formatted table on rank 0,
+    with rows representing ranks and columns representing groups.
+
+    Args:
+        total_offload_bytes: Dict mapping group names to offload bytes for this rank.
+    """
+    # pylint: disable=bad-builtin
+    assert torch.distributed.is_initialized()
+    rank = torch.distributed.get_rank()
+    world_size = torch.distributed.get_world_size()
+
+    # Gather all group names across ranks
+    local_names = list(total_offload_bytes.keys())
+    all_names_list = [None] * world_size
+    torch.distributed.all_gather_object(all_names_list, local_names)
+    all_group_names = sorted(set(name for names in all_names_list for name in names))
+
+    # Gather offload bytes from all ranks: each rank sends a list of bytes per group
+    local_bytes = [total_offload_bytes.get(name, 0) for name in all_group_names]
+    all_bytes_list = [None] * world_size
+    torch.distributed.all_gather_object(all_bytes_list, local_bytes)
+
+    # Print ASCII table on rank 0
+    if rank == 0:
+        # Calculate column widths
+        col_width = max(12, max((len(name) for name in all_group_names), default=8) + 2)
+        rank_col_width = max(6, len(f"Rank {world_size - 1}") + 2)
+
+        # Build header
+        header = "Rank".ljust(rank_col_width)
+        header += "".join(name.rjust(col_width) for name in all_group_names)
+        header += "Total".rjust(col_width)
+        separator = "-" * len(header)
+
+        print("\n" + "=" * len(header))
+        print("Activation Offload Summary (MB)".center(len(header)))
+        print("=" * len(header))
+        print(header)
+        print(separator)
+
+        # Build rows for each rank
+        grand_total = 0
+        col_totals = [0] * len(all_group_names)
+        for r in range(world_size):
+            row_bytes = all_bytes_list[r]
+            row_total = sum(row_bytes)
+            grand_total += row_total
+            for i, b in enumerate(row_bytes):
+                col_totals[i] += b
+            row_str = f"Rank {r}".ljust(rank_col_width)
+            for b in row_bytes:
+                row_str += f"{b / (1024 * 1024):.2f}".rjust(col_width)
+            row_str += f"{row_total / (1024 * 1024):.2f}".rjust(col_width)
+            print(row_str)
+
+        # Print totals row
+        print(separator)
+        totals_row = "Total".ljust(rank_col_width)
+        for ct in col_totals:
+            totals_row += f"{ct / (1024 * 1024):.2f}".rjust(col_width)
+        totals_row += f"{grand_total / (1024 * 1024):.2f}".rjust(col_width)
+        print(totals_row)
+        print("=" * len(header) + "\n")
+
+    torch.distributed.barrier()
+
+
+class GPUTensorPool:
+    """
+    GPU memory pool for efficient allocation and deallocation of tensors.
+
+    Features:
+    - Supports multiple tensor shapes and dtypes, each with its own pool
+    - Dynamic allocation: tensors are created on-demand during allocation
+    - Efficient reuse: freed tensors are returned to the pool for reuse
+    - Uses queue-based management for O(1) allocation and deallocation
+
+    Example:
+        pool = GPUTensorPool(device='cuda:0')
+        tensor = pool.allocate((128, 512), dtype=torch.float32)
+        # ... use tensor ...
+        pool.free(tensor, (128, 512), dtype=torch.float32)
+    """
+
+    def __init__(self, device: str = 'cuda', pin_memory: bool = False):
+        """
+        Initialize GPU tensor pool.
+
+        Args:
+            device: GPU device, default 'cuda'
+            pin_memory: Whether to use pinned memory (mainly for CPU tensors)
+        """
+        self.device = torch.device(device)
+        self.pin_memory = pin_memory
+
+        # Maintain a separate pool for each (shape, dtype) combination
+        # Structure: {(shape, dtype): {'free': deque, 'all': list, 'allocated_count': int}}
+        self._pools: Dict[Tuple, Dict[str, Any]] = {}
+
+        # Statistics
+        self._stats = {
+            'total_allocated': 0,  # Total number of tensors ever allocated
+            'current_in_use': 0,  # Number of tensors currently in use
+            'allocation_requests': 0,  # Number of allocation requests
+            'free_requests': 0,  # Number of free requests
+            'pool_hits': 0,  # Number of times a tensor was reused from pool
+            'pool_misses': 0,  # Number of times a new tensor was created
+        }
+
+        debug_rank("GPUTensorPool: Initialized with dynamic allocation")
+
+    def _get_pool_key(self, shape: Tuple, dtype: torch.dtype) -> Tuple:
+        """Generate a unique key for the pool based on shape and dtype."""
+        return (shape, dtype)
+
+    @staticmethod
+    def _calculate_memory_size(shape: Tuple, dtype: torch.dtype) -> int:
+        """Calculate memory size in bytes."""
+        element_size = torch.tensor([], dtype=dtype).element_size()
+        numel = 1
+        for dim in shape:
+            numel *= dim
+        return numel * element_size
+
+    def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+        """
+        Allocate a tensor with the specified shape and dtype.
+
+        Args:
+            shape: Shape of the tensor
+            dtype: Data type of the tensor, default torch.float32
+
+        Returns:
+            Allocated tensor
+        """
+        self._stats['allocation_requests'] += 1
+
+        pool_key = self._get_pool_key(shape, dtype)
+
+        # Create pool for this (shape, dtype) if it doesn't exist
+        if pool_key not in self._pools:
+            self._pools[pool_key] = {
+                'free': deque(),  # Queue of available tensors
+                'all': [],  # List of all tensors (for tracking)
+                'allocated_count': 0,  # Number of allocated tensors
+            }
+
+        pool = self._pools[pool_key]
+
+        # Try to reuse a tensor from the pool
+        if len(pool['free']) > 0:
+            tensor = pool['free'].popleft()
+            self._stats['pool_hits'] += 1
+            debug_rank(
+                f"GPUTensorPool.allocate: Reused tensor from pool, "
+                f"shape={shape}, dtype={dtype}, "
+                f"remaining in pool={len(pool['free'])}"
+            )
+        else:
+            # Allocate a new tensor
+            tensor = torch.empty(shape, dtype=dtype, device=self.device, pin_memory=self.pin_memory)
+            pool['all'].append(tensor)
+            self._stats['total_allocated'] += 1
+            self._stats['pool_misses'] += 1
+
+            memory_mb = self._calculate_memory_size(shape, dtype) / (1024**2)
+            debug_rank(
+                f"GPUTensorPool.allocate: Created new tensor, "
+                f"shape={shape}, dtype={dtype}, "
+                f"memory={memory_mb:.2f} MB, "
+                f"total_created={len(pool['all'])}"
+            )
+
+        pool['allocated_count'] += 1
+        self._stats['current_in_use'] += 1
+
+        return tensor
+
+    def free(self, tensor: torch.Tensor):
+        """
+        Return a tensor to the pool for reuse.
+
+        Args:
+            tensor: Tensor to free
+
+        Raises:
+            ValueError: If tensor doesn't belong to this pool
+        """
+        self._stats['free_requests'] += 1
+
+        shape = tensor.shape
+        dtype = tensor.dtype
+
+        pool_key = self._get_pool_key(shape, dtype)
+
+        if pool_key not in self._pools:
+            raise ValueError(
+                f"No pool exists for shape={shape}, dtype={dtype}. "
+                f"Available pools: {list(self._pools.keys())}"
+            )
+
+        pool = self._pools[pool_key]
+
+        # Verify tensor belongs to this pool (use identity check, not value comparison)
+        tensor_found = any(tensor is t for t in pool['all'])
+        if not tensor_found:
+            raise ValueError(
+                f"Attempting to free a tensor that doesn't belong to this pool "
+                f"(shape={shape}, dtype={dtype})"
+            )
+
+        # Return tensor to the free queue
+        pool['free'].append(tensor)
+        pool['allocated_count'] -= 1
+        self._stats['current_in_use'] -= 1
+
+        debug_rank(
+            f"GPUTensorPool.free: shape={shape}, dtype={dtype}, "
+            f"available in pool={len(pool['free'])}"
+        )
+
+    def get_pool_status(self, shape: Tuple = None, dtype: torch.dtype = None) -> Dict[str, Any]:
+        """
+        Get the status of the memory pool.
+
+        Args:
+            shape: If specified along with dtype, return status for that specific pool
+            dtype: Data type (required if shape is specified)
+
+        Returns:
+            Dictionary containing status information
+        """
+        if shape is not None:
+            if dtype is None:
+                raise ValueError("dtype must be specified when shape is provided")
+
+            pool_key = self._get_pool_key(shape, dtype)
+
+            if pool_key not in self._pools:
+                raise ValueError(f"No pool exists for shape={shape}, dtype={dtype}")
+
+            pool = self._pools[pool_key]
+            total_count = len(pool['all'])
+
+            return {
+                'shape': shape,
+                'dtype': dtype,
+                'total_count': total_count,
+                'allocated_count': pool['allocated_count'],
+                'free_count': len(pool['free']),
+                'utilization': (
+                    pool['allocated_count'] / total_count * 100 if total_count > 0 else 0
+                ),
+            }
+        else:
+            # Return status for all pools
+            status = {'global_stats': self._stats.copy(), 'pools': {}}
+
+            for pool_key in self._pools:
+                shape, dtype = pool_key
+                status['pools'][pool_key] = self.get_pool_status(shape, dtype)
+
+            return status
+
+    def reset(self):
+        """Reset the pool, marking all tensors as available."""
+        debug_rank("GPUTensorPool: Resetting pool...")
+
+        for pool_key, pool in self._pools.items():
+            # Clear and refill the free queue
+            pool['free'].clear()
+            for tensor in pool['all']:
+                pool['free'].append(tensor)
+            pool['allocated_count'] = 0
+
+        self._stats['current_in_use'] = 0
+        debug_rank("GPUTensorPool: Reset complete")
+
+    def clear(self):
+        """Clear the pool and release all GPU memory."""
+        debug_rank("GPUTensorPool: Clearing pool...")
+
+        for pool_key, pool in self._pools.items():
+            # Clear all references, allowing PyTorch GC to reclaim memory
+            pool['free'].clear()
+            pool['all'].clear()
+
+        self._pools.clear()
+        self._stats['current_in_use'] = 0
+
+        # Trigger GPU cache cleanup
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        debug_rank("GPUTensorPool: Clear complete")
+
+    def __del__(self):
+        """Destructor to ensure resources are released."""
+        self.clear()
+
+
+class OffloadTensorGroup:
+    """
+    A group of tensors to be offloaded together.
+    """
+
+    def __init__(self, name):
+        self._name = name
+        self._tensors = {}
+        self._offload_event = torch.cuda.Event()
+        self._reload_event = torch.cuda.Event()
+        self.offload = True
+        self.total_offload_bytes = 0
+        self.total_tensor_count = 0
+        # Using memory pool is for the compatibility with cuda graph.
+        # Shapes of tensors for expert_fc1 and moe_act are not known in advance,
+        # so we do not use CPU pool for them.
+        if name == "expert_fc1" or name == "moe_act":
+            self.use_cpu_pool = False
+        else:
+            self.use_cpu_pool = True
+
+    def push_tensor(self, tag, tensor):
+        """Push a tensor to the group."""
+        self._tensors[tag] = tensor
+
+    def pop_tensor(self, tag):
+        """Pop a tensor from the group."""
+        return self._tensors.pop(tag)
+
+    def record_offload_event(self, stream):
+        """Record the offload event."""
+        self._offload_event.record(stream)
+
+    def wait_offload_event(self, stream):
+        """Wait for the offload event."""
+        stream.wait_event(self._offload_event)
+
+    def record_reload_event(self, stream):
+        """Record the reload event."""
+        self._reload_event.record(stream)
+
+    def wait_reload_event(self, stream):
+        """Wait for the reload event."""
+        stream.wait_event(self._reload_event)
+
+    def update_offload_info(self, tensor):
+        """Update the offload information."""
+        self.total_offload_bytes += tensor.numel() * tensor.element_size()
+        self.total_tensor_count += 1
+
+
+class PipelineOffloadManager:
+    """
+    Singleton manager for coordinating activation offloading across pipeline stages.
+    Manages chunk handlers, synchronizes GPU-CPU transfers,
+    and handles virtual pipeline parallelism.
+    """
+
+    OFFLOAD_MGR = None
+
+    @classmethod
+    def get_instance(cls):
+        """Get the singleton instance of PipelineOffloadManager."""
+        if cls.OFFLOAD_MGR is None:
+            cls.OFFLOAD_MGR = PipelineOffloadManager()
+        return cls.OFFLOAD_MGR
+
+    @classmethod
+    def reset_instance(cls):
+        """Reset the singleton instance of PipelineOffloadManager."""
+        cls.OFFLOAD_MGR = None
+        cls.OFFLOAD_MGR = PipelineOffloadManager()
+
+    def __init__(self):
+        """Initialize the manager with queues and dedicated CUDA streams."""
+        # Queue to store chunk handlers for backward pass
+        self._queue = deque()
+        # Cache chunk handlers for each virtual pipeline stage
+        self._stages = None
+        # allocate streams and events for synchronization
+        self._d2h_stream = torch.cuda.Stream()
+        self._h2d_stream = torch.cuda.Stream()
+        # Shared CPU tensor pool for all chunks to improve reuse efficiency
+        self._cpu_tensor_pool = GPUTensorPool(device="cpu", pin_memory=True)
+
+        # Whether the manager is in warmup phase.
+        self._is_warmup = True
+        # Cache OffloadChunkHandler objects for each virtual pipeline stage and each forward pass.
+        self._cached_chunks_forward = []
+        # Cache OffloadChunkHandler objects for each virtual pipeline stage and each backward pass.
+        self._cached_chunks_backward = []
+        # Index of the current backward chunk in the cached chunks backward.
+        self._cached_chunks_index_backward = 0
+        # Index of the current forward chunk in the cached chunks forward.
+        self._cached_chunks_index_forward = 0
+
+        self.do_offload = True
+
+        # Do not offload the last X groups so that the reloading won't block the computing stream.
+        self._offload_margin = 0
+        # Sometimes we need to delay the offloading and launch it later.
+        # The delayed offload groups are stored in a queue.
+        self._delayed_offload_groups = []
+        self.reset()
+
+    @property
+    def d2h_stream(self):
+        """Get the device-to-host (GPU to CPU) transfer stream."""
+        return self._d2h_stream
+
+    @property
+    def h2d_stream(self):
+        """Get the host-to-device (CPU to GPU) transfer stream."""
+        return self._h2d_stream
+
+    @property
+    def cpu_tensor_pool(self):
+        """Get the shared CPU tensor pool."""
+        return self._cpu_tensor_pool
+
+    def push_offload_groups(self, group_hook, forced_released_tensors):
+        """Push the offload groups to the delayed queue."""
+        debug_rank(f"pushing offload groups to the delayed queue")
+        self._delayed_offload_groups.append((group_hook, forced_released_tensors))
+
+    def flush_delayed_groups(self):
+        """Flush the delayed groups."""
+        debug_rank("flushing delayed groups")
+        # Flush the delayed groups in reverse order to maintain the order of the groups.
+        for group_hook, forced_released_tensors in reversed(self._delayed_offload_groups):
+            group_hook(forced_released_tensors)
+        self._delayed_offload_groups = []
+
+    def reset(self):
+        """Reset manager state for a new training iteration."""
+        self._inside_context = False
+        self._cur_forward_chunk = None
+        self._cur_backward_chunk = None
+        # Reset CPU tensor pool to reuse all CPU tensors for next iteration
+        if hasattr(self, '_cpu_tensor_pool'):
+            self._cpu_tensor_pool.reset()
+
+        # Call post_warmup_callback after warmup to collect the offload information.
+        if self._is_warmup and len(self._cached_chunks_forward) > 0:
+            self.post_warmup_callback()
+        self._cached_chunks_index_backward = 0
+        self._cached_chunks_index_forward = 0
+
+        for chunk in self._cached_chunks_forward:
+            chunk.reset()
+        self._delayed_offload_groups = []
+
+    @property
+    def offload_summary_bytes(self) -> Dict[str, int]:
+        """Offload summary bytes per group collected after warmup."""
+        return self._offload_summary_bytes
+
+    @property
+    def offload_summary_total_bytes(self) -> int:
+        """Total offloaded bytes collected after warmup."""
+        return self._offload_summary_total_bytes
+
+    def flush(self):
+        """Flush all staged chunks to the backward queue in reverse order."""
+        # Ensure all virtual pipeline stages have the same number of chunks
+        if len(self._stages[0]) == len(self._stages[-1]):
+            lens = [len(e) for e in self._stages]
+            assert min(lens) == max(lens), "All stages must have same chunk count"
+            # Clear the last stage and push all chunks in reverse order for backward
+            self._stages[-1] = []
+            for chunks in reversed(self._stages):
+                for chunk in chunks:
+                    self.push(chunk)
+            # Clear all stages after flushing
+            for i in range(self._vpp):
+                self._stages[i] = []
+
+    def disable_offload(self):
+        """Disable the offload."""
+        debug_rank("disable_offload")
+        self.do_offload = False
+        for chunk in self._cached_chunks_forward:
+            chunk.do_offload = False
+
+    def enable_offload(self):
+        """Enable the offload."""
+        debug_rank("enable_offload")
+        self.do_offload = True
+        for chunk in self._cached_chunks_forward:
+            chunk.do_offload = True
+
+    def post_warmup_callback(self):
+        """Callback after warmup."""
+        # pylint: disable=bad-builtin
+        debug_rank("post_warmup_callback")
+        self._is_warmup = False
+        assert len(self._cached_chunks_forward) == len(
+            self._cached_chunks_backward
+        ), "Cached chunks forward and backward must have the same length"
+        for chunk in self._cached_chunks_forward:
+            chunk.is_warmup = False
+            assert (
+                chunk in self._cached_chunks_backward
+            ), "Chunk not found in cached chunks backward"
+            # Update the offload margin to the maximum number of deduplicated groups
+            self._offload_margin = max(self._offload_margin, chunk.get_max_deduplicated_groups())
+            debug_rank(f"offload margin {self._offload_margin}")
+        # Find the last group with the same name in the cached chunks backward
+        last_group_with_same_name = {}
+        for chunk_idx, chunk in enumerate(reversed(self._cached_chunks_backward)):
+            for group in chunk.offload_groups:
+                last_group_with_same_name[group._name] = group
+        # Mark the last group with the same name as not offloadable to make sure
+        # the reloading won't block the main stream.
+        for name, group in last_group_with_same_name.items():
+            if self._offload_margin > 0:
+                group.offload = False
+                self._offload_margin -= 1
+                debug_rank(f"setting offload to false for group {name} at chunk index {chunk_idx}")
+            else:
+                break
+        debug_rank(f"offload margin {self._offload_margin}")
+        assert self._offload_margin == 0, "Offload margin is not 0"
+        # Dump the offload information
+        total_tensor_count = {}
+        total_offload_bytes = {}
+        for chunk in self._cached_chunks_forward:
+            for group in chunk.offload_groups:
+                if group.offload:
+                    if group._name not in total_tensor_count:
+                        total_tensor_count[group._name] = 0
+                    total_tensor_count[group._name] += group.total_tensor_count
+                    if group._name not in total_offload_bytes:
+                        total_offload_bytes[group._name] = 0
+                    total_offload_bytes[group._name] += group.total_offload_bytes
+            # Stop statistics at the first backward chunk after which 1F1B is running,
+            # where the memory cost will not increase anymore.
+            if chunk is self._cached_chunks_backward[0]:
+                break
+        # Cache summary for downstream consumers (e.g., unit tests).
+        self._offload_summary_bytes = dict(total_offload_bytes)
+        self._offload_summary_total_bytes = int(sum(total_offload_bytes.values()))
+        print_offload_summary_table(total_offload_bytes)
+
+    def push(self, handler):
+        """Add a chunk handler to the backward queue."""
+        debug_rank(f"pushing handler {handler}")
+        self._queue.append(handler)
+        if self._is_warmup:
+            self._cached_chunks_backward.append(handler)
+
+    def pop_backward_chunk(self, name=None):
+        """Get the next non-empty backward chunk containing the group with the given name."""
+        self._cur_backward_chunk = None
+        debug_rank(f"popping backward chunk {self._cached_chunks_index_backward}")
+        debug_rank(f"cached chunks backward {self._cached_chunks_backward}")
+        for idx, handler in enumerate(
+            self._cached_chunks_backward[self._cached_chunks_index_backward :]
+        ):
+            self._cached_chunks_index_backward += 1
+            if not handler.is_empty_chunk(name):
+                self._cur_backward_chunk = (
+                    handler  # set the first non-empty chunk as the current backward chunk
+                )
+                debug_rank(f"handler {handler} at index {idx} is not empty")
+                break
+        assert self._cur_backward_chunk is not None, "No non-empty chunk found"
+
+    def front_backward_chunk(self, name=None):
+        """Get the first non-empty backward chunk containing the group with the given name."""
+        for idx, handler in enumerate(
+            self._cached_chunks_backward[self._cached_chunks_index_backward :]
+        ):
+            if not handler.is_empty_chunk(name):
+                debug_rank(f"front handler {handler} at index {idx}")
+                return handler
+        return None
+
+    def init_model_chunk_offload_handler(
+        self, vp_size, vp_stage, min_offloaded_tensor_size=1024 * 1024
+    ):
+        """
+        Initialize a chunk offload handler for a model chunk (microbatch).
+
+        Args:
+            vp_size: Virtual pipeline size
+            vp_stage: Virtual pipeline stage index (None means stage 0)
+            min_offloaded_tensor_size: Minimum tensor size (in elements) to offload
+        """
+        if not self._is_warmup:
+            return
+
+        vp_size = 1 if vp_size is None else vp_size
+        if self._stages is None:
+            self._vpp = vp_size
+            self._stages = [[] for _ in range(vp_size)]
+
+        if vp_stage is None:
+            cur_vpp_rank = 0
+        else:
+            cur_vpp_rank = vp_stage
+
+        # Flush staged chunks when reaching the last virtual pipeline stage
+        if cur_vpp_rank == self._vpp - 1:
+            self.flush()
+
+        # Use shared CPU tensor pool for better reuse across chunks
+        cur_chunk = ChunkOffloadHandler(min_offloaded_tensor_size, self._cpu_tensor_pool)
+        debug_rank(f"init_model_chunk_offload_handler {cur_chunk}")
+        self._stages[cur_vpp_rank].append(cur_chunk)
+        # For the last stage, push immediately and flush
+        if cur_vpp_rank == self._vpp - 1:
+            self.push(cur_chunk)
+            self.flush()
+        self._cur_forward_chunk = cur_chunk
+        cur_chunk.vpp_rank = cur_vpp_rank
+        self._cached_chunks_forward.append(cur_chunk)
+
+    def pop_forward_chunk(self, name=None):
+        """Get the next forward pass chunk handler."""
+        debug_rank(f"pop_forward_chunk {self._cur_forward_chunk}")
+        if not self.do_offload:
+            return self._cur_forward_chunk
+        while not self._is_warmup and (
+            self._cur_forward_chunk is None or self._cur_forward_chunk.finish_all_groups(name)
+        ):
+            if self._cached_chunks_index_forward >= len(self._cached_chunks_forward):
+                self._cur_forward_chunk = None
+                break
+            self._cur_forward_chunk = self._cached_chunks_forward[self._cached_chunks_index_forward]
+            self._cached_chunks_index_forward += 1
+            debug_rank(f"new cur_forward_chunk {self._cur_forward_chunk}")
+        return self._cur_forward_chunk
+
+    def cur_forward_chunk(self):
+        """Get the current forward pass chunk handler."""
+        return self._cur_forward_chunk
+
+    def cur_backward_chunk(self):
+        """Get the current backward pass chunk handler."""
+        return self._cur_backward_chunk
+
+    def mark_not_offloadable(self, tensor: torch.Tensor):
+        """Mark the current forward chunk as not offloadable."""
+        if tensor is not None:
+            tensor.offloading_activation = False
+
+    def __enter__(self):
+        """Enter context manager to enable activation offloading hooks."""
+        debug_rank("----__enter__")
+        if self._cur_forward_chunk is None or not self.cur_forward_chunk().do_offload:
+            return
+        from megatron.core.extensions.transformer_engine import cpu_offload
+
+        if cpu_offload is not None:
+            cpu_offload.CPUOffloadEnabled = True
+        else:
+            raise RuntimeError("TE CPU offload is not available")
+        self.inside_context = True
+
+        torch._C._autograd._push_saved_tensors_default_hooks(
+            self.on_save_for_backward, self.on_get_saved_tensor
+        )
+
+    def __exit__(self, *args: Any):
+        """Exit context manager and restore original tensor saving behavior."""
+        debug_rank("----__exit__")
+        if self._cur_forward_chunk is None or not self.cur_forward_chunk().do_offload:
+            return
+        from megatron.core.extensions.transformer_engine import cpu_offload
+
+        if cpu_offload is not None:
+            cpu_offload.CPUOffloadEnabled = False
+        else:
+            raise RuntimeError("TE CPU offload is not available")
+        self.inside_context = False
+        torch._C._autograd._pop_saved_tensors_default_hooks()
+
+    def on_save_for_backward(self, tensor: torch.Tensor) -> Any:
+        """
+        Hook called when autograd saves a tensor for backward pass.
+        Returns a tag to identify the tensor later.
+        """
+        debug_rank(f"------on_save_for_backward {tensor.shape}")
+        assert self.inside_context, "Must be inside offload context"
+        return self.cur_forward_chunk().tensor_push(tensor)
+
+    def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor:
+        """
+        Hook called when autograd retrieves a saved tensor during backward pass.
+        Returns the actual tensor (potentially reloading from CPU).
+        """
+        debug_rank(f"----on_get_saved_tensor {saved_state}")
+        return self.cur_backward_chunk().tensor_pop(saved_state)
+
+
+class ChunkOffloadHandler:
+    """
+    Handles activation offloading and reloading for a single pipeline chunk (microbatch).
+    Manages tensor groups, coordinates asynchronous GPU-CPU transfers, and handles synchronization.
+    """
+
+    def offload(self, src_tensor, pin_memory=True, use_cpu_pool=True):
+        """Offload."""
+        debug_rank("--------offload")
+
+        if not src_tensor.is_contiguous():
+            src_tensor = src_tensor.contiguous()
+
+        if use_cpu_pool:
+            cpu_backup = self.cpu_tensor_pool.allocate(src_tensor.shape, dtype=src_tensor.dtype)
+        else:
+            cpu_backup = torch.empty(
+                src_tensor.shape, dtype=src_tensor.dtype, device="cpu", pin_memory=pin_memory
+            )
+
+        cpu_backup.copy_(src_tensor, non_blocking=pin_memory)
+        state = (src_tensor.device, cpu_backup, use_cpu_pool)
+        return state
+
+    def reload(self, state, non_blocking=None):
+        """Reload."""
+        debug_rank("------reload")
+        dev, cpu_backup, use_cpu_pool = state
+        if non_blocking is None:
+            non_blocking = cpu_backup.is_pinned()
+        gpu_tensor = torch.empty(
+            cpu_backup.size(), dtype=cpu_backup.dtype, layout=cpu_backup.layout, device=dev
+        )
+        gpu_tensor.copy_(cpu_backup, non_blocking=non_blocking)
+        if use_cpu_pool:
+            self.cpu_tensor_pool.free(cpu_backup)
+        return gpu_tensor
+
+    def __init__(self, min_offloaded_tensor_size, cpu_tensor_pool):
+        self.do_offload = True
+
+        # Group management for batching offload/reload operations
+        self.offload_groups = []
+        self._offloaded_group_index = 0
+        # Groups to be offloaded.
+        self._groups_to_offload = []
+        # Groups to be reloaded.
+        self._groups_to_reload = []
+        # Tensor count for the current group.
+        self._tensor_count_current_group = 0
+        # Maximum number of groups to offload or reload.
+        self._max_group_size = 0
+        # Groups being reloaded.
+        self._reloading_group = []
+        # Counter for special torch tensor types (FakeTensor, FunctionalTensor)
+        self.torch_tensor_count = 0
+        self.d2h_stream = PipelineOffloadManager.get_instance().d2h_stream
+        self.h2d_stream = PipelineOffloadManager.get_instance().h2d_stream
+        self.min_offloaded_tensor_size = min_offloaded_tensor_size
+        self.cpu_tensor_pool = cpu_tensor_pool
+        self.is_warmup = True
+
+    def reset(self):
+        """Reset the chunk offload handler."""
+        self._offloaded_group_index = 0
+        self._groups_to_offload = []
+        self._groups_to_reload = []
+        self._tensor_count_current_group = 0
+        self._reloading_group = []
+
+    def find_group_with_name(self, name: str, start_index: int = 0):
+        """Find the group with the given name starting from the given index."""
+        return next(
+            (group for group in self.offload_groups[start_index:] if group._name == name), None
+        )
+
+    def is_empty_chunk(self, name=None):
+        """Check if this chunk has no tensors to manage."""
+        debug_rank(f"------is_empty_chunk {self._max_group_size}")
+        if name is not None:
+            return self.find_group_with_name(name) is None
+        return self._max_group_size == 0
+
+    def finish_all_groups(self, name=None) -> bool:
+        """Finish all groups."""
+        debug_rank(
+            f"------finish_all_groups {self} {self._max_group_size} {self._offloaded_group_index}"
+        )
+        # TODO: check if this is correct
+        # Mark it as finished when there are no groups to offload or reload
+        if (
+            len(self._groups_to_reload) == 0
+            and len(self._groups_to_offload) == 0
+            and self._offloaded_group_index > 0
+        ):
+            return True
+        assert name is not None, "Name is required"
+        return self.find_group_with_name(name, self._offloaded_group_index) is None
+
+    def find_next_group(self, name=None):
+        """Find the next group with the given name."""
+        assert name is not None, "Name is required"
+        return self.find_group_with_name(name, self._offloaded_group_index)
+
+    def tensor_push(self, tensor):
+        """Push tensor to the offload handler."""
+        torch_stray_tensor = isinstance(
+            tensor,
+            (
+                torch._subclasses.fake_tensor.FakeTensor,
+                torch._subclasses.functional_tensor.FunctionalTensor,
+            ),
+        )
+        assert not torch_stray_tensor, "Stray tensor should not be offloaded"
+
+        # Assign unique tag based on group index and position within group
+        tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group)
+        self._tensor_count_current_group += 1
+        self.offload_groups[self._offloaded_group_index - 1].push_tensor(tensor_tag, tensor)
+        debug_rank(f"--------tensor_push {tensor_tag}")
+        return tensor_tag
+
+    def tensor_pop(self, tensor_tag):
+        """Pop tensor from the offload handler."""
+        debug_rank(f"--------tensor_pop {tensor_tag}")
+        group_id, idx = tensor_tag
+        tensor = self.offload_groups[group_id - 1].pop_tensor(tensor_tag)
+        # If tensor is offloaded (stored as tuple), reload it
+        if isinstance(tensor, tuple):
+            tensor = self.reload(tensor)
+        debug_rank(f"--------tensor_pop {tensor.shape}")
+        return tensor
+
+    def tensor_need_offloading_checker(self, tensor):
+        """Check if the tensor needs to be offloaded."""
+        debug_rank(
+            f"tensor_need_offloading_checker {getattr(tensor, 'offloading_activation', None)}"
+        )
+        if tensor.numel() < self.min_offloaded_tensor_size:
+            return False
+        # Respect tensor's offload preference if specified
+        if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation:
+            return False
+        return True
+
+    def bulk_offload_group(self):
+        """offload a group of tensors recorded in tensor_push()."""
+        debug_rank("------bulk_offload_group")
+        group_to_offload = self._groups_to_offload[-1]
+        torch.cuda.nvtx.range_push("activation offloading " + group_to_offload._name)
+        with torch.cuda.stream(self.d2h_stream):
+            for tensor_tag, tensor_on_device in group_to_offload._tensors.items():
+                if self.tensor_need_offloading_checker(tensor_on_device):
+                    state = self.offload(
+                        tensor_on_device, use_cpu_pool=group_to_offload.use_cpu_pool
+                    )
+                    if self.is_warmup:
+                        group_to_offload.update_offload_info(tensor_on_device)
+                    tensor_on_device.record_stream(self.d2h_stream)
+                    group_to_offload.push_tensor(tensor_tag, state)
+            group_to_offload.record_offload_event(self.d2h_stream)
+        self._groups_to_offload.pop()
+        torch.cuda.nvtx.range_pop()
+
+    def get_max_deduplicated_groups(self):
+        """Get the maximum number of deduplicated groups."""
+        count_modules = []
+        for group in self.offload_groups:
+            if group._name not in count_modules:
+                count_modules.append(group._name)
+        return len(count_modules)
+
+    def bulk_reload_group(self):
+        """Bulk reload group."""
+        debug_rank("----bulk_reload_group")
+        group_to_reload = self._groups_to_reload[-1]
+        torch.cuda.nvtx.range_push("activation reloading " + group_to_reload._name)
+        with torch.cuda.stream(self.h2d_stream):
+            # Wait for offload to complete before reloading
+            if not is_graph_capturing():
+                group_to_reload.wait_offload_event(self.h2d_stream)
+            for tensor_tag, state in group_to_reload._tensors.items():
+                # Only reload if tensor was offloaded (stored as tuple)
+                if isinstance(state, tuple):
+                    recovered_tensor = self.reload(state)
+                    debug_rank(f"----recovered_tensor {recovered_tensor.shape}")
+                    group_to_reload.push_tensor(tensor_tag, recovered_tensor)
+            group_to_reload.record_reload_event(self.h2d_stream)
+        self._groups_to_reload.pop()
+        # Add the group to the reloading group to wait for the reload event.
+        self._reloading_group.append(group_to_reload)
+        torch.cuda.nvtx.range_pop()
+
+    def pre_reload_last_layer(self):
+        """Pre-reload the last layer of this chunk to hide reload latency."""
+        debug_rank("pre_reload_last_layer")
+        debug_rank(f"len(self._groups_to_reload) {len(self._groups_to_reload)}")
+        if len(self._groups_to_reload) > 0:
+            # Reload the last group (last layer) early
+            self.bulk_reload_group()
+
+    def should_bulk_offload(self):
+        """Determine if the current group should be offloaded."""
+        assert len(self._groups_to_offload) > 0, "No groups to offload"
+        group = self._groups_to_offload[-1]
+        debug_rank(f"should_bulk_offload {self.is_warmup} {group.offload}")
+        # Don't offload if the chunk is not in warmup stage
+        if self.is_warmup:
+            return True
+        # Don't offload if the group is marked as not offloadable
+        if not group.offload:
+            return False
+
+        # Check if next backward chunk is this chunk (for last pipeline stage)
+        next_backward_chunk = PipelineOffloadManager.get_instance().front_backward_chunk(
+            group._name
+        )
+        if next_backward_chunk is not None and next_backward_chunk is self:
+            # Don't offload the last group with the same name if it's about to be used immediately
+            if self.find_next_group(group._name) is None:
+                debug_rank(f"next group {group._name} is not found")
+                return False
+
+        return True
+
+    def bulk_offload(self, forced_released_tensors):
+        """Offload a group of tensors and optionally release their GPU memory."""
+        debug_rank("----bulk_offload")
+        if self.should_bulk_offload():
+            self._groups_to_reload.append(self._groups_to_offload[-1])
+            self.bulk_offload_group()
+            # Manually release tensors not auto-freed by torch GC
+            if len(forced_released_tensors) > 0:
+                cur_stream = torch.cuda.current_stream()
+                for release_tensor in forced_released_tensors:
+                    if self.tensor_need_offloading_checker(release_tensor):
+                        # Ensure tensor is not in use before freeing
+                        release_tensor.record_stream(cur_stream)
+                        release_tensor.untyped_storage().resize_(0)
+
+    def on_group_commit_forward(self, forced_released_tensors):
+        """Called at the end of a layer group's forward pass to trigger offloading."""
+        if not self.do_offload:
+            return
+        debug_rank("--on_group_commit_forward")
+        # Wait for compute to finish before starting offload
+        self.d2h_stream.wait_stream(torch.cuda.current_stream())
+        self.bulk_offload(forced_released_tensors)
+
+    def bulk_reload(self):
+        """Reload the next group of tensors from CPU to GPU."""
+        debug_rank("--bulk_reload")
+        if len(self._groups_to_reload) > 0:
+            # Reload the next layer group
+            self.bulk_reload_group()
+        else:
+            # Pre-load the last layer of the next backward chunk to hide latency
+            next_backward_chunk = PipelineOffloadManager.get_instance().front_backward_chunk()
+            # Don't pre-reload the last layer if the next backward chunk hasn't finished fprop yet.
+            if (
+                next_backward_chunk is not None
+                and next_backward_chunk._offloaded_group_index
+                == next_backward_chunk._max_group_size
+            ):
+                next_backward_chunk.pre_reload_last_layer()
+
+    def on_group_commit_backward(self, name):
+        """
+        Called at the end of a layer group's backward pass.
+        Ensures correct chunk is active and synchronizes reloads.
+        """
+        if not self.do_offload:
+            return
+        debug_rank("--on_group_commit_backward")
+        cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk()
+        # Switch to this chunk if it's not already current
+        if cur_backward_chunk is not self:
+            PipelineOffloadManager.get_instance().pop_backward_chunk(name)
+        cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk()
+        assert cur_backward_chunk is self, f"Chunk mismatch {cur_backward_chunk} {self}"
+        # Wait for reload to complete before using tensors
+        if not is_graph_capturing() and len(self._reloading_group) > 0:
+            for reloading_group in self._reloading_group:
+                if reloading_group._name == name:
+                    reloading_group.wait_reload_event(torch.cuda.current_stream())
+                    self._reloading_group.remove(reloading_group)
+                    break
+
+    def on_group_start_forward(self, name):
+        """
+        Called at the start of a layer group's forward pass.
+        Increments group index and prepares for offloading.
+        """
+        if not self.do_offload:
+            return
+        debug_rank(f"--on_group_start_forward {name}")
+        self._offloaded_group_index = self._offloaded_group_index + 1
+        if self.is_warmup:
+            self.offload_groups.append(OffloadTensorGroup(name))
+            self._max_group_size = max(self._max_group_size, self._offloaded_group_index)
+            debug_rank(f"max group size {self._max_group_size}")
+        else:
+            for group in self.offload_groups[self._offloaded_group_index - 1 :]:
+                if group._name == name:
+                    break
+                self._offloaded_group_index = self._offloaded_group_index + 1
+        self._tensor_count_current_group = 0
+        self._groups_to_offload.append(self.offload_groups[self._offloaded_group_index - 1])
+        debug_rank(f"groups to offload {self._groups_to_offload}")
+
+    def on_group_start_backward(self):
+        """
+        Called at the start of a layer group's backward pass.
+        Triggers reloading of tensors from CPU.
+        """
+        if not self.do_offload:
+            return
+        debug_rank(f"--on_group_start_backward {self}")
+        # Wait for compute to finish before starting reload
+        self.h2d_stream.wait_stream(torch.cuda.current_stream())
+        self.bulk_reload()
+
+
+def fine_grained_offloading_disable_offload():
+    """Disable the offload."""
+    debug_rank("fine_grained_offloading_disable_offload")
+    PipelineOffloadManager.get_instance().disable_offload()
+
+
+def fine_grained_offloading_enable_offload():
+    """Enable the offload."""
+    debug_rank("fine_grained_offloading_enable_offload")
+    PipelineOffloadManager.get_instance().enable_offload()
+
+
+class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function):
+    """
+    Identity operation that marks the end of a layer group for offload synchronization.
+    Triggers offload during forward and synchronizes reload during backward.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, cur_forward_chunk, name, forced_released_tensors, delay_offload):
+        # pylint: disable=missing-function-docstring
+        debug_rank("FineGrainedOffloadingGroupCommitFunction forward")
+
+        if delay_offload:
+            PipelineOffloadManager.get_instance().push_offload_groups(
+                cur_forward_chunk.on_group_commit_forward, forced_released_tensors
+            )
+        else:
+            cur_forward_chunk.on_group_commit_forward(forced_released_tensors)
+        ctx.cpu_offload_handler = cur_forward_chunk
+        ctx.name = name
+        return tensor
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        # pylint: disable=missing-function-docstring
+        debug_rank("FineGrainedOffloadingGroupCommitFunction backward")
+
+        cpu_offload_handler = ctx.cpu_offload_handler
+        cpu_offload_handler.on_group_commit_backward(ctx.name)
+        return grad_output + (None, None, None, None)
+
+
+def fine_grained_offloading_group_commit(
+    tensor, name, forced_released_tensors=None, delay_offload=False
+):
+    """
+    Specify the tensors to be released after offloading.
+    forced_released_tensors is a list of tensors to be released after offloading.
+    The tensors will be untyped_storage().resize_(0) after offloading.
+    Note: specify the tensors only when they are not automatically released by torch gc.
+    """
+    # Be permissive: callers may pass a tuple/list of outputs (e.g., (q, k, v)).
+    # We only need to insert a single identity op into the autograd graph; applying
+    # it to the first tensor output is sufficient and keeps callers' code minimal.
+    if forced_released_tensors is None:
+        forced_released_tensors = []
+    if isinstance(tensor, tuple):
+        if len(tensor) == 0:
+            return tensor
+        committed0 = fine_grained_offloading_group_commit(
+            tensor[0],
+            name=name,
+            forced_released_tensors=forced_released_tensors,
+            delay_offload=delay_offload,
+        )
+        return (committed0,) + tensor[1:]
+    if isinstance(tensor, list):
+        if len(tensor) == 0:
+            return tensor
+        committed0 = fine_grained_offloading_group_commit(
+            tensor[0],
+            name=name,
+            forced_released_tensors=forced_released_tensors,
+            delay_offload=delay_offload,
+        )
+        return [committed0] + tensor[1:]
+
+    cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk()
+    if cur_forward_chunk is None:
+        return tensor
+    return FineGrainedOffloadingGroupCommitFunction.apply(
+        tensor, cur_forward_chunk, name, forced_released_tensors, delay_offload
+    )
+
+
+def fine_grained_offloading_group_flush_delayed_groups():
+    """Flush the delayed groups."""
+    debug_rank("fine_grained_offloading_group_flush_delayed_groups")
+    PipelineOffloadManager.get_instance().flush_delayed_groups()
+
+
+class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function):
+    """
+    Identity operation that marks the start of a layer group for offload/reload.
+    Prepares for offload during forward and triggers reload during backward.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, cpu_offload_handler, name):
+        # pylint: disable=missing-function-docstring
+        ctx.cpu_offload_handler = cpu_offload_handler
+        debug_rank("FineGrainedOffloadingGroupStartFunction forward")
+
+        cpu_offload_handler.on_group_start_forward(name)
+        # return the identical tensor
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # pylint: disable=missing-function-docstring
+        debug_rank("FineGrainedOffloadingGroupStartFunction backward")
+        cpu_offload_handler = ctx.cpu_offload_handler
+        cpu_offload_handler.on_group_start_backward()
+        return grad_output, None, None, None
+
+
+def fine_grained_offloading_group_start(tensor, name=None):
+    """Mark the start of a layer group and prepare for offload/reload."""
+    cur_forward_chunk = PipelineOffloadManager.get_instance().pop_forward_chunk(name=name)
+    if cur_forward_chunk is None:
+        return tensor
+    return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name)
+
+
+def fine_grained_offloading_forward_record(event: torch.cuda.Event) -> None:
+    """Record the forward event for cuda graph capture."""
+    d2h_stream = PipelineOffloadManager.get_instance().d2h_stream
+    torch.cuda.current_stream().record_event(event)
+    torch.cuda.current_stream().wait_stream(d2h_stream)
+
+
+class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function):
+    """
+    Identity operation that marks the end of a layer group for offload synchronization.
+    Triggers offload during forward and synchronizes reload during backward.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, event: torch.cuda.Event) -> torch.Tensor:
+        """Forward pass for cuda graph capture."""
+        ctx.event = event
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Record the backward event and wait for the h2d stream on cuda graph stream."""
+        h2d_stream = PipelineOffloadManager.get_instance().h2d_stream
+        torch.cuda.current_stream().record_event(ctx.event)
+        torch.cuda.current_stream().wait_stream(h2d_stream)
+        return grad_output, None
+
+
+def fine_grained_offloading_backward_record(tensor, event: torch.cuda.Event) -> torch.Tensor:
+    """Record the backward event for cuda graph capture."""
+    return FineGrainedOffloadingBackwardRecordFunction.apply(tensor, event)
+
+
+class FineGrainedActivationOffloadingInterface:
+    """Interface for fine-grained activation offloading."""
+
+    def __init__(self, offload: bool, tensor: torch.Tensor, name: str):
+        self.offload = offload
+        self.tensor = tensor
+        self.name = name
+
+    def __enter__(self):
+        """Enter context manager to enable activation offloading hooks."""
+        if self.offload:
+            self.tensor = fine_grained_offloading_group_start(self.tensor, self.name)
+            PipelineOffloadManager.get_instance().__enter__()
+        return self.tensor
+
+    def __exit__(self, *args: Any):
+        """Exit context manager to disable activation offloading hooks."""
+        if self.offload:
+            PipelineOffloadManager.get_instance().__exit__()
+
+    @staticmethod
+    def init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size):
+        """Initialize the chunk handler, called at the start of a microbatch forward pass."""
+        PipelineOffloadManager.get_instance().init_model_chunk_offload_handler(
+            vp_size, vp_stage, min_offloaded_tensor_size
+        )
+
+    @staticmethod
+    def get_context(flag):
+        """Get the fine-grained offload context"""
+        return PipelineOffloadManager.get_instance() if flag else nullcontext()
+
+    @staticmethod
+    def group_commit(tensor, name, forced_released_tensors=None, delay_offload=False):
+        """Group commit the tensors."""
+        return fine_grained_offloading_group_commit(
+            tensor, name, forced_released_tensors, delay_offload
+        )
+
+    @staticmethod
+    def mark_not_offloadable(tensor: torch.Tensor):
+        """Mark the tensor as not offloadable."""
+        PipelineOffloadManager.get_instance().mark_not_offloadable(tensor)
+
+    @staticmethod
+    def forward_record(event: torch.cuda.Event) -> None:
+        """Record the forward event for cuda graph capture."""
+        d2h_stream = PipelineOffloadManager.get_instance().d2h_stream
+        torch.cuda.current_stream().record_event(event)
+        torch.cuda.current_stream().wait_stream(d2h_stream)
+
+    @staticmethod
+    def reset():
+        """Reset the chunk handler."""
+        PipelineOffloadManager.get_instance().reset()
+
+    @staticmethod
+    def reset_instance():
+        """Reset the singleton instance."""
+        PipelineOffloadManager.reset_instance()
diff --git a/megatron/core/pipeline_parallel/hybrid_cp_schedule.py b/megatron/core/pipeline_parallel/hybrid_cp_schedule.py
new file mode 100644
index 00000000000..27b5fc87945
--- /dev/null
+++ b/megatron/core/pipeline_parallel/hybrid_cp_schedule.py
@@ -0,0 +1,660 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+from collections import deque
+from functools import lru_cache
+from math import ceil, log2
+from typing import Callable, List, Optional, Tuple
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.rerun_state_machine import RerunDataIterator
+
+
+class BalancedCPScheduler:
+    """
+    This class provides the functionality to form groups of sub-samples
+    such that all DPxCP ranks have a roughly balanced workload in the group.
+    """
+
+    def __init__(self, max_seq_len_per_rank: int, dp_cp_group: torch.distributed.ProcessGroup):
+        self.max_seq_len_per_rank = max_seq_len_per_rank
+        self.num_subsamples = 0
+        self.num_subsamples_processed = 0
+        self.free_resources = []
+        self.total_hdp_gpus = dp_cp_group.size()
+
+    @lru_cache(maxsize=128)
+    def get_total_workload(self, seq_length: int, cp_size: Optional[int] = None):
+        """
+        seq_length: sequence length of a sub-sample
+        cp_size: total number of CP ranks working on this sub-sample
+
+        Note:
+        This function is used to estimate the relative workload intensity
+        of a sub-sample. This is not meant to be an accurate flops calculator.
+
+        Returns: workload of a sub-sample
+        """
+        if cp_size is None:
+            cp_size = self.gpus_needed(seq_length)
+        return (seq_length * seq_length) / cp_size
+
+    @lru_cache(maxsize=128)
+    def gpus_needed(self, seq_len: int) -> int:
+        """
+        Calculates the number of GPUs needed for a given sequence length
+        and max sequence length per CP rank.
+        This is used to determine the CP size of a sub-sample.
+
+        The number is rounded up to the next power of 2 to match the available
+        hybrid context parallel process group sizes.
+        """
+        return max(1, 2 ** ceil(log2((seq_len / self.max_seq_len_per_rank))))
+
+    def make_buckets_equal(
+        self,
+        sample_seqlens: List[Tuple[int, int]],  # List of (sample_id, sequence_length) tuples
+        compute_estimator: Callable[[int], float],
+    ) -> List[deque]:
+        """
+        Makes as many buckets as unique CP sizes needed.
+        This keeps sample IDs tethered to their sequence lengths throughout the bucketing process.
+        """
+        # Extract just the sequence lengths for determining k
+        seqlens = [seq_len for _, seq_len in sample_seqlens]
+
+        # Determine k based on unique GPU categories needed
+        k = len({self.gpus_needed(L) for L in seqlens})
+
+        # Create a work target for each bucket
+        # This is the total work divided by the number of buckets
+        work = []
+        for _, s in sample_seqlens:
+            cp_size = self.gpus_needed(s)
+            work.append(compute_estimator(s, cp_size))
+        total_work = sum(work)
+        target = total_work / k
+        buckets, cur, cur_work = [], [], 0.0
+        remaining_work = total_work
+        remaining_k = k
+
+        for i, (sample_id, seq_len) in enumerate(sample_seqlens):
+            work = compute_estimator(seq_len)
+            projected = cur_work + work
+
+            # Check if we should close this bucket
+            if cur and (
+                projected > target * 1.1  # Too much work
+                or len(sample_seqlens) - i <= remaining_k - len(buckets)
+            ):  # Need to save sequences for remaining buckets
+                buckets.append(deque(cur))
+                cur, cur_work = [], 0.0
+                remaining_work -= sum(compute_estimator(seq_len) for _, seq_len in cur)
+                remaining_k -= 1
+
+            cur.append((sample_id, seq_len))
+            cur_work += work
+
+        if cur:
+            buckets.append(deque(cur))
+
+        return buckets
+
+    def next_hdp_group(
+        self,
+        sample_seqlens: List[Tuple[int, int]],  # List of (sample_id, sequence_length) tuples
+        compute_estimator: Callable[[int], float],
+        total_gpus: int,
+        delta: float = 0.05,  # balance slack (e.g. 5 %)
+        strategy: str = "dp",  # "dp" or "pp"
+        eps_bucket: float = 0.10,  # ε target for bucket balance
+    ) -> Tuple[List[List[int]], List[Tuple[int, int]], List[float], List[List[int]]]:
+        """
+        Given a list of (sample_id, sequence_length) tuples, this function aims to assign
+        sequences in a group such that all GPUs in the DPxCP group have a roughly balanced
+        workload. Once each group is roughly balanced, we exit and return the
+        group and the leftover sequences.
+
+        The function performs the following passes in order to form a balanced microbatch:
+        1. We create buckets of sequences that are roughly balanced.
+        We try to create as many buckets as possible CP sizes.
+        2. Given a bucket has sequences available, we assign the sample
+            a. To a new set of GPUs if there are enough free GPUs.
+            b. To an existing set of GPUs with the lowest load.
+        3. We check if the group is balanced whenever we need to move onto a new CP size
+        in the same set of GPUs.
+        4. We trim the group if removing the last added sequence helps improve balance.
+        5. If we run out of sequences to assign and there are empty GPUs,
+        we redistribute work to empty GPUs by recursively increasing the CP size of a
+        sample until no empty GPUs are left.
+
+        Returns (micro_batches, leftover_sample_seqlens, exec_times, sample_ids_per_gpu).
+        """
+        if not sample_seqlens:
+            return (
+                [[] for _ in range(total_gpus)],
+                [],
+                [0.0 for _ in range(total_gpus)],
+                [[] for _ in range(total_gpus)],
+            )
+
+        # Get buckets of sequences with balanced work
+        buckets = self.make_buckets_equal(sample_seqlens, compute_estimator)
+
+        # Initialize tracking structures
+        micro_batches = [[] for _ in range(total_gpus)]
+        exec_times = [0.0 for _ in range(total_gpus)]
+        sample_ids_per_gpu = [[] for _ in range(total_gpus)]
+
+        gpu_group_id = [None] * total_gpus
+        group_members = {}
+        group_size = {}
+        next_gid = 0
+
+        pp_cursor = 0
+        prev_needed = None
+        check_balance = False
+
+        while buckets:
+            # ---- Step 1 – pick the next sequence we COULD place ------------------
+            sample_seq_tuple = bucket_idx = None
+            needed = None
+
+            scan_order = (
+                range(len(buckets))
+                if strategy == "dp"
+                else [(pp_cursor + i) % len(buckets) for i in range(len(buckets))]
+            )
+
+            for idx in scan_order:
+                if not buckets[idx]:
+                    continue
+                cand_tuple = buckets[idx][0]  # This is now (sample_id, seq_len)
+                cand_seq_len = cand_tuple[1]
+                needed = self.gpus_needed(cand_seq_len)
+
+                # (a) Do we have an *existing* group of size `needed`?
+                candidate_gids = [gid for gid, sz in group_size.items() if sz == needed]
+
+                # (b) Or enough completely free GPUs to start a new group?
+                free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
+                if candidate_gids or len(free_ranks) >= needed:
+                    sample_seq_tuple, bucket_idx = cand_tuple, idx
+                    break
+
+            # No place to put any remaining sequence – finish this micro‑batch
+            if sample_seq_tuple is None:
+                break
+
+            # TODO[pmannan]: PP not yet supported. Add PP scheduling.
+            if strategy == "pp":
+                pp_cursor = (bucket_idx + 1) % len(buckets)
+
+            sample_id, seq_len = sample_seq_tuple
+            needed = self.gpus_needed(seq_len)
+            if prev_needed is None:
+                prev_needed = needed
+
+            # (a)  Existing groups of exactly this size
+            candidate_gids = [gid for gid, sz in group_size.items() if sz == needed]
+            if candidate_gids:
+                best_gid, best_load = min(
+                    (
+                        (gid, max(exec_times[r] for r in group_members[gid]))
+                        for gid in candidate_gids
+                    ),
+                    key=lambda t: t[1],
+                )
+            else:
+                best_gid, best_load = None, float("inf")
+
+            # (b)  Hypothetical **new** group from completely free GPUs
+            free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
+            if len(free_ranks) >= needed:
+                free_sorted = sorted(free_ranks, key=lambda r: exec_times[r])
+                new_members = free_sorted[:needed]
+                new_load = exec_times[new_members[-1]]
+
+                if new_load < best_load:
+                    best_gid = None
+                    chosen_members = new_members
+                else:
+                    chosen_members = group_members[best_gid]
+            else:
+                chosen_members = group_members[best_gid]
+
+            # ---- Step 2 – if we decided to create a fresh group ----------------
+            if best_gid is None:
+                best_gid = next_gid
+                next_gid += 1
+                group_members[best_gid] = chosen_members
+                group_size[best_gid] = needed
+                for r in chosen_members:
+                    gpu_group_id[r] = best_gid
+
+            # ---- Step 3 – assign the sequence to every member of that group ------
+            per_gpu_cost = compute_estimator(seq_len)
+
+            for r in chosen_members:
+                micro_batches[r].append(seq_len)
+                exec_times[r] += per_gpu_cost
+                sample_ids_per_gpu[r].append(sample_id)
+
+            # Remove the sequence definitively from its bucket
+            buckets[bucket_idx].popleft()
+
+            # ---- Step 4 – tidy, balance‑check, maybe early‑exit ------------------
+            while buckets and not buckets[0]:
+                buckets.pop(0)
+                pp_cursor %= max(1, len(buckets))
+
+            # TODO: Removing this helps reduce the number of groups when we have
+            # lots of samples with same CP size.
+            # But because we don't exit as soon as we get balanced,
+            # even if there is one group available that can take the next sample,
+            # we will keep adding samples to the same group.
+            # trim_overload() does not help because it only checks if removing the
+            # last added sample helps.
+            # We cannot check after adding every sample because there will always be imbalance
+            # if we don't wait for future scheduling.
+
+            # IMPORTANT: So we need a solution here
+            if needed < prev_needed:
+                # When we get into a lower CP size in the same group,
+                # we can start checking for balance. There is still a gotcha here.
+                # Let's say we have a group of 3 GPU 0-2, then we move onto group of 2.
+                # We keep assigning group of 2 as we do in descending order but GPU 7/15
+                # never sees a microbatch assigned to it
+                # until we run out of samples with CP2.
+                # This means we are never balanced as min(exec_times) will always be 0.
+                # We need a smart way of identifying that we have run out of big samples
+                # and if we are having to assign work to a GPU already working,
+                # is it because there are empty GPUs?
+                # Would assigning work to empty GPUs first by moving onto next CP bucket help?
+                # But we need to remember to come back to this CP size bucket and then
+                # check for balance. Maybe the scheduling algorithm should look at empty
+                # GPUs and find work rather than going sequence by sequence.
+                check_balance = True
+
+            if (
+                check_balance
+                and buckets
+                and max(exec_times) - min(exec_times) <= delta * max(exec_times)
+            ):
+                break
+
+        # Gather leftovers (flatten remaining buckets, preserve order)
+        leftovers = []
+        for b in buckets:
+            for sample_seq_tuple in b:
+                leftovers.append(sample_seq_tuple)
+
+        # ---------------------------------------------------------------------------
+        def trim_overload():
+            """
+            Iteratively pop the most‑recent sequence from the *most‑loaded group*
+            whenever doing so reduces the global slack.
+            """
+            while True:
+                cur_max = max(exec_times)
+                cur_min = min(exec_times)
+                cur_slack = cur_max - cur_min
+                if cur_slack <= delta * cur_max:
+                    # Slack is already within limit.
+                    break
+                if cur_min == 0:
+                    # There are empty GPUs that will be
+                    # handled in the next step.
+                    break
+
+                max_r = exec_times.index(cur_max)
+                gid = gpu_group_id[max_r]
+                members = group_members[gid]
+
+                if not micro_batches[max_r] or len(micro_batches[max_r]) <= 1:
+                    break
+
+                seq = micro_batches[max_r][-1]
+                need = group_size[gid]
+                per_gpu_cost = compute_estimator(seq)
+
+                proj_times = exec_times[:]
+                for r in members:
+                    proj_times[r] -= per_gpu_cost
+
+                proj_slack = max(proj_times) - min(proj_times)
+
+                # Check if trimming the workload helps imbalance
+                if proj_slack < cur_slack:
+                    sample_id_to_remove = sample_ids_per_gpu[max_r][-1]
+                    for r in members:
+                        micro_batches[r].pop()
+                        exec_times[r] -= per_gpu_cost
+                        sample_ids_per_gpu[r].pop()
+                    leftovers.append((sample_id_to_remove, seq))
+                else:
+                    break
+
+        trim_overload()
+
+        # Track samples in this group before redistribution to empty GPUs
+        total_work_before = sum(len(mb) for mb in micro_batches)
+
+        # Check for empty GPUs and redistribute work
+        def fill_empty_gpus(
+            micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size
+        ):
+            """
+            Recursively check for empty GPUs and redistribute work by increasing
+            the number of GPUs sharing samples. This ensures all GPUs have work.
+            GPUs must be allocated consecutively so we may need to push existing
+            work to other ranks in order to expand samples.
+            """
+            # Find empty GPUs
+            empty_gpus = [i for i in range(total_gpus) if not micro_batches[i]]
+            if not empty_gpus:
+                return (
+                    micro_batches,
+                    exec_times,
+                    sample_ids_per_gpu,
+                    group_members,
+                    group_size,
+                )  # No empty GPUs, we're done
+
+            # Find the smallest group size that exists
+            existing_group_sizes = set(group_size.values())
+            assert (
+                existing_group_sizes
+            ), "There should be at least one group existing, cannot reditribute, "
+            "try to increase 'max-seqlen-per-cp-rank'."
+
+            min_group_size = min(existing_group_sizes)
+            # We have Hybrid DPxCP groups for every power of 2 of GPUs or the entire DPxCP group.
+            next_power = min(min_group_size * 2, total_gpus)
+
+            # Find the first group of min_group_size that can be expanded
+            expandable_gid = None
+            expandable_members = None
+            expandable_new_gpus = None
+
+            for gid, size in group_size.items():
+                if size == min_group_size:
+                    members = group_members[gid]
+                    needed_count = next_power - min_group_size
+                    group_start_gpu = members[0]
+                    group_end_gpu = members[-1]
+                    empty_gpu = [idx for idx, work in enumerate(micro_batches) if not work][0]
+                    assert not all(
+                        work for work in micro_batches[empty_gpu : empty_gpu + needed_count]
+                    ), f"Empty GPUs were detected but not enough to expand."
+                    work_to_push = micro_batches[
+                        group_end_gpu + 1 : empty_gpu
+                    ]  # This is work of all other subsequent sub-samples
+                    exec_times_to_push = exec_times[group_end_gpu + 1 : empty_gpu]
+                    sample_ids_to_push = sample_ids_per_gpu[group_end_gpu + 1 : empty_gpu]
+
+                    new_micro_batches = [[]] * len(micro_batches)
+                    new_exec_times = [0.0] * len(exec_times)
+                    new_sample_ids_per_gpu = [[]] * len(sample_ids_per_gpu)
+
+                    # No change in work until the group selected for expansion
+                    for i in range(group_start_gpu):
+                        new_micro_batches[i] = micro_batches[i]
+                        new_exec_times[i] = exec_times[i]
+                        new_sample_ids_per_gpu[i] = sample_ids_per_gpu[i]
+
+                    # The work is distributed across the expanded group
+                    for i in range(group_start_gpu, group_end_gpu + needed_count + 1):
+                        new_micro_batches[i] = micro_batches[group_end_gpu]
+                        new_exec_times[i] = self.get_total_workload(
+                            micro_batches[group_end_gpu][0], next_power
+                        )
+                        new_sample_ids_per_gpu[i] = sample_ids_per_gpu[group_end_gpu]
+
+                    # Any assigned work on expanded GPUs is pushed
+                    for i, work in enumerate(work_to_push):
+                        new_micro_batches[group_end_gpu + needed_count + 1 + i] = work
+                        new_exec_times[group_end_gpu + needed_count + 1 + i] = exec_times_to_push[i]
+                        new_sample_ids_per_gpu[group_end_gpu + needed_count + 1 + i] = (
+                            sample_ids_to_push[i]
+                        )
+
+                    group_size[gid] = next_power
+                    group_members[gid] = list(range(members[0], members[-1] + needed_count + 1))
+                    for pushed_gid in group_size.keys():
+                        if pushed_gid > gid:
+                            group_members[pushed_gid] = [
+                                x + needed_count for x in group_members[pushed_gid]
+                            ]
+
+                    return (
+                        new_micro_batches,
+                        new_exec_times,
+                        new_sample_ids_per_gpu,
+                        group_members,
+                        group_size,
+                    )
+
+        empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
+        while empty_gpus:
+            micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size = (
+                fill_empty_gpus(
+                    micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size
+                )
+            )
+            empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
+
+        # Assert that no sample has been completely removed
+        total_work_after = sum(len(mb) for mb in micro_batches)
+        assert (
+            total_work_after >= total_work_before
+        ), f"Samples were removed: {total_work_before} -> {total_work_after}"
+
+        return micro_batches, leftovers, exec_times, sample_ids_per_gpu
+
+    def get_groups_and_subsamples(self, sample_id_seqlens, config):
+        """
+        This function recursively forms groups of sub-samples such that all DPxCP ranks
+        have a roughly balanced workload in the group.
+        """
+        groups = []
+        sample_id_groups = []
+        # We assign a sample_id to each sub-sample in order to track assignment to each GPU.
+        sample_id_seqlens = sorted(sample_id_seqlens, key=lambda x: x[1], reverse=True)
+        while sample_id_seqlens:
+            mb, sample_id_seqlens, exec_times, sample_ids = self.next_hdp_group(
+                sample_id_seqlens, self.get_total_workload, self.total_hdp_gpus
+            )
+            groups.append(mb)
+            if len(sample_ids) < self.total_hdp_gpus:
+                sample_ids.extend([] * (self.total_hdp_gpus - len(sample_ids)))
+            sample_id_groups.append(sample_ids)
+
+        return groups, sample_id_groups
+
+
+def hybrid_context_parallel_forward_backward(
+    forward_step_func,
+    data_iterator,
+    model,
+    num_microbatches,
+    input_tensor,
+    output_tensor_grad,
+    forward_data_store,
+    config,
+    collect_non_loss_data,
+    first_val_step,
+    forward_only,
+    no_sync_func,
+    total_num_tokens,
+    check_first_val_step,
+    model_type,
+):
+    """
+    Scheduler for Hybrid Context Parallel.
+
+    This function performs the packed sample scheduling and determines
+    1. The number of microbatches to schedule for each CP rank
+    2. The number of groups each CP rank should execute
+    3. The number of sub-samples per group each CP rank should execute
+
+    A group is defined by a set of samples that can run across the CP domain without any barrier.
+    There are many reasons why we may not be able to run endless samples within a single group.
+    For example, if we have 8 GPUs,
+    if GPU 0-5 are assigned a long sample that requires CP6,
+    GPU 6-7 are assigned a short sample that requires CP2,
+    The next sample which requires CP4 can be assigned GPU 4-7.
+    But GPU 6-7 will finish first and get deadlocked if GPU 4-5 are not participating in the group.
+    """
+    from .schedules import backward_step, forward_step
+
+    def _broadcast(item):
+        if item is not None:
+            torch.distributed.broadcast(
+                item,
+                parallel_state.get_tensor_model_parallel_src_rank(),
+                group=parallel_state.get_tensor_model_parallel_group(),
+            )
+
+    def _broadcast_num_samples_this_group(num_samples_this_group):
+        dev = torch.cuda.current_device()
+        torch.distributed.barrier()
+
+        n = 0 if num_samples_this_group is None else int(num_samples_this_group.numel())
+        n = torch.tensor([n], dtype=torch.int64, device=dev)
+
+        _broadcast(n)
+        n = int(n.item())
+
+        assert n > 0, "there should be at least 1 sub samples in the group"
+        num_samples_this_group_broadcast = (
+            torch.empty(n, dtype=torch.int32, device=dev)
+            if num_samples_this_group is None
+            else num_samples_this_group
+        )
+        _broadcast(num_samples_this_group_broadcast)
+        return num_samples_this_group_broadcast
+
+    def _get_new_data_iterator(sample_id_in_group, group_id):
+        if is_first_tp_rank:
+            sub_sample_id = sample_ids_this_group[sample_id_in_group]
+            sample = batch[sub_sample_id]
+            partner_cp_size = len(
+                [True for sample_ids in sample_id_groups[group_id] if sub_sample_id in sample_ids]
+            )
+            sample["local_cp_size"] = torch.tensor(partner_cp_size, dtype=torch.int32)
+            new_data_iterator = RerunDataIterator(iter([sample]))
+            return new_data_iterator
+        else:
+            return None
+
+    # We get data once per global batch and schedule the sub-samples.
+    # TODO(pmannan): Should we wrap the data_iterator here instead of the training.py file?
+    hdp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
+    is_first_tp_rank = parallel_state.get_tensor_model_parallel_rank() == 0
+
+    if is_first_tp_rank:
+        data = next(data_iterator)
+        sample_id_groups = data[1]
+        batch = data[0]
+    else:
+        data, sample_id_groups, batch = None, None, None
+
+    num_samples_this_group = None
+    if is_first_tp_rank:
+        num_samples_this_group = torch.tensor(
+            [len(group[hdp_rank]) for group in sample_id_groups], dtype=torch.int32, device='cuda'
+        )
+
+    num_samples_this_group = _broadcast_num_samples_this_group(num_samples_this_group)
+    num_samples_this_group = num_samples_this_group.cpu().numpy()
+    num_total_groups = num_samples_this_group.shape[0]
+
+    current_microbatch = 0
+
+    # Upto last group, we don't need any sync.
+    with no_sync_func():
+        for j in range(num_total_groups - 1):
+            sample_ids_this_group = sample_id_groups[j][hdp_rank] if is_first_tp_rank else None
+            for i in range(num_samples_this_group[j]):
+                # Call forward step for each sub-sample
+                new_data_iterator = _get_new_data_iterator(i, j)
+                # TODO: Find the usage of current_microbatch and is_first_microbatch and
+                # how that may affect my usage.
+                output_tensor, num_tokens = forward_step(
+                    forward_step_func,
+                    new_data_iterator,
+                    model,
+                    num_microbatches,
+                    input_tensor,
+                    forward_data_store,
+                    config,
+                    collect_non_loss_data,
+                    is_first_microbatch=check_first_val_step(
+                        first_val_step, forward_only, current_microbatch == 0
+                    ),
+                    current_microbatch=current_microbatch,
+                )
+                current_microbatch += 1
+                total_num_tokens += num_tokens.item()
+                if not forward_only:
+                    backward_step(
+                        input_tensor, output_tensor, output_tensor_grad, model_type, config
+                    )
+
+            # Create a barrier at end of each group.
+            # This barrier ensures that all ranks are prepared to change assigned CP group sizes and
+            # no rank is starting a sub-sample ahead of it's partner ranks.
+            torch.distributed.barrier(
+                parallel_state.get_data_parallel_group(with_context_parallel=True)
+            )
+
+    # For the last group, we need to run the last sub-sample out of the context handler.
+    with no_sync_func():
+        sample_ids_this_group = sample_id_groups[-1][hdp_rank] if is_first_tp_rank else None
+        for i in range(num_samples_this_group[-1] - 1):
+            new_data_iterator = _get_new_data_iterator(i, -1)
+            # Call forward step for each sub-sample
+            output_tensor, num_tokens = forward_step(
+                forward_step_func,
+                new_data_iterator,
+                model,
+                num_microbatches,
+                input_tensor,
+                forward_data_store,
+                config,
+                collect_non_loss_data,
+                is_first_microbatch=check_first_val_step(
+                    first_val_step, forward_only, current_microbatch == 0
+                ),
+                current_microbatch=current_microbatch,
+            )
+            current_microbatch += 1
+            total_num_tokens += num_tokens.item()
+            if not forward_only:
+                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
+
+    # The last sub-sample of the last group of the last microbatch is
+    # run out of the context handler.
+    new_data_iterator = _get_new_data_iterator(-1, -1)
+    # Call forward step for each sub-sample
+    output_tensor, num_tokens = forward_step(
+        forward_step_func,
+        new_data_iterator,
+        model,
+        num_microbatches,
+        input_tensor,
+        forward_data_store,
+        config,
+        collect_non_loss_data,
+        is_first_microbatch=check_first_val_step(
+            first_val_step, forward_only, current_microbatch == 0
+        ),
+        current_microbatch=current_microbatch,
+    )
+    total_num_tokens += num_tokens.item()
+    if not forward_only:
+        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
+
+    return forward_data_store, total_num_tokens
diff --git a/megatron/core/pipeline_parallel/multimodule_communicator.py b/megatron/core/pipeline_parallel/multimodule_communicator.py
new file mode 100644
index 00000000000..1e8da3468e2
--- /dev/null
+++ b/megatron/core/pipeline_parallel/multimodule_communicator.py
@@ -0,0 +1,531 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import torch
+import torch.distributed as dist
+
+from megatron.core.hyper_comm_grid import HyperCommGrid
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.pipeline_parallel.bridge_communicator import BridgeCommunicator
+from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator
+
+# Types
+Shape = Union[List[int], torch.Size]
+
+
+@dataclass
+class RankModuleInfo:
+    """Information about a rank in a module.
+
+    Attributes:
+        pp_rank: The stage index of the current rank within the module's pipeline.
+        pp_size: The total number of pipeline stages (ranks) in the module.
+        p2p_communicator: Intra-module point-to-point communicator.
+        bridge_comms_as_src_module: Bridge communicators for outgoing connections
+            from this module to downstream modules. One module may have multiple
+            bridge communicators if it has multiple outgoing connections.
+        bridge_comms_as_dest_module: Bridge communicators for incoming connections
+            to this module from upstream modules. One module may have multiple
+            bridge communicators if it has multiple incoming connections.
+        is_source_stage: True if this rank is at the absolute first stage in the
+            overall model (no incoming connections).
+        is_terminal_stage: True if this rank is at the absolute last stage in the
+            overall model (no outgoing connections).
+    """
+
+    pp_rank: int
+    pp_size: int
+    p2p_communicator: Optional[P2PCommunicator]
+    bridge_comms_as_src_module: Optional[List[BridgeCommunicator]]
+    bridge_comms_as_dest_module: Optional[List[BridgeCommunicator]]
+    is_source_stage: Optional[bool] = True
+    is_terminal_stage: Optional[bool] = True
+
+
+class MultiModulePipelineCommunicator:
+    """Communicator for a multi-module pipeline."""
+
+    def __init__(
+        self,
+        module_to_grid_map: Dict[str, HyperCommGrid],
+        topology: Dict[str, List[str]],
+        config: ModelParallelConfig,
+        dim_mapping: Dict[str, List[int]] = None,
+    ):
+        """
+        Initialize the MultiModulePipelineCommunicator.
+
+        Args:
+            module_to_grid_map (dict): A dictionary mapping module names to HyperCommGrids.
+                Example:
+                    module_to_grid_map = {
+                        'image_encoder': image_encoder_grid,
+                        'audio_encoder': audio_encoder_grid,
+                        'llm': llm_grid,
+                        'generator': generator_grid
+                    }
+            topology (dict): A dictionary mapping module names to lists of outgoing modules.
+                Example:
+                    topology = {
+                        'image_encoder': ['llm'],
+                        'audio_encoder': ['llm'],
+                        'llm': ['generator'],
+                        'generator': []
+                    }
+            config (ModelParallelConfig): A ModelParallelConfig object.
+            dim_mapping (Dict[str, List[int]]): Dimension mapping for sequence, batch, hidden.
+                Example:
+                    dim_mapping = {'s': 0, 'h': 2, 'b': 1}
+                Default: None
+        """
+        self.module_to_grid_map = module_to_grid_map
+        self.topology = topology
+        self.config = config
+        self.dim_mapping = dim_mapping
+        self.current_rank = dist.get_rank()
+
+        # Build bridge communicators for all modules
+        self.bridge_comms = []
+        self._build_bridge_comms()
+
+        self.rank_module_map = {}
+        self._build_rank_module_info_map()
+
+    def _build_bridge_comms(self):
+        """Construct and store BridgeCommunicator objects that describe the outgoing
+        communication relationships for all of the modules.
+        """
+        for src_module_name, src_grid in self.module_to_grid_map.items():
+            for dest_module_name in self.topology[src_module_name]:
+                dest_grid = self.module_to_grid_map[dest_module_name]
+                bridge_comm = BridgeCommunicator(
+                    src_grid=src_grid,
+                    dest_grid=dest_grid,
+                    dim_mapping=self.dim_mapping,
+                    comm_dtype=self.config.pipeline_dtype,
+                    src_module_name=src_module_name,
+                    dest_module_name=dest_module_name,
+                )
+                self.bridge_comms.append(bridge_comm)
+
+    @property
+    def is_pp_first_stage(self):
+        """Return True if the current rank has the absolute first stage in the overall model.
+
+        The absolute first stage is defined as:
+        1. The current rank must be in the first PP stage (pp_rank == 0) of some module
+        2. That module must be a source module (no incoming connections in topology)
+        """
+        for module_name, rank_module_info in self.rank_module_map.items():
+            # Check if this rank is at the first PP stage of this module
+            if rank_module_info.pp_rank == 0:
+                # Check if this module is a source module (no incoming connections)
+                if self._is_source_module(module_name):
+                    return True
+        return False
+
+    @property
+    def is_pp_last_stage(self):
+        """Return True if the current rank has the absolute last stage in the overall model.
+
+        The absolute last stage is defined as:
+        1. The current rank must be in the last PP stage of some module
+        2. That module must be a sink module (no outgoing connections in topology)
+        """
+        for module_name, rank_module_info in self.rank_module_map.items():
+            # Check if this rank is at the last PP stage of this module
+            if rank_module_info.pp_rank == rank_module_info.pp_size - 1:
+                # Check if this module is a sink module (no outgoing connections)
+                if self._is_sink_module(module_name):
+                    return True
+        return False
+
+    def _is_source_module(self, module_name: str) -> bool:
+        """Check if a module is a source module (has no incoming connections)."""
+        # A module is a source if no other module lists it as a destination
+        for src_module, dest_modules in self.topology.items():
+            if module_name in dest_modules:
+                return False
+        return True
+
+    def _is_sink_module(self, module_name: str) -> bool:
+        """Check if a module is a sink module (has no outgoing connections)."""
+        return len(self.topology.get(module_name, [])) == 0
+
+    def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool:
+        """Check if the current rank is in the grid."""
+        return grid.rank_offset <= self.current_rank < grid.rank_offset + grid.size
+
+    @property
+    def num_warmup_microbatches(self):
+        """Calculate the number of warmup microbatches for the current rank.
+
+        Uses the same simple logic as P2PCommunicator:
+        total_pipeline_stages - current_rank_stage - 1
+
+        Returns:
+            int: Number of warmup microbatches for this rank
+        """
+        # Get total pipeline depth across all modules
+        total_stages = self.compute_total_pipeline_stages(self.topology, self.module_to_grid_map)
+
+        # Get current rank's position in the overall pipeline (0-indexed)
+        # Use compute_total_pipeline_stages with current rank to get cumulative position
+        if self.rank_module_map:
+            # Take the first module this rank belongs to
+            # TODO: ykarnati - improve this logic.
+            module_name = next(iter(self.rank_module_map.keys()))
+            current_stage = (
+                self.compute_total_pipeline_stages(
+                    self.topology,
+                    self.module_to_grid_map,
+                    rank=self.current_rank,
+                    module_name=module_name,
+                )
+                - 1
+            )  # Convert from 1-indexed to 0-indexed
+        else:
+            current_stage = 0
+
+        assert (
+            current_stage <= total_stages
+        ), f"current_stage: {current_stage} is greater than total_stages: {total_stages}"
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"current_stage: {current_stage} total_stages: {total_stages} "
+            f"num_warmup_microbatches: {total_stages - current_stage - 1}"
+        )
+        return total_stages - current_stage - 1
+
+    def _build_rank_module_info_map(self):
+        """For each module in the current rank, initialize the P2P communicator
+        and build the bridge communicator info for the module.
+        Each rank may hold multiple modules when colocated.
+        """
+        for module_name, module_grid in self.module_to_grid_map.items():
+            if self.is_current_rank_in_grid(module_grid):
+                # Initialize P2P communicator
+                pp_group = module_grid.get_pg('pp')
+                p2p_comm = P2PCommunicator(pp_group, self.config)
+                pp_size = dist.get_world_size(pp_group)
+                rank_in_pp_group = dist.get_group_rank(pp_group, self.current_rank)
+                pp_rank = rank_in_pp_group % pp_size
+
+                bridge_comms_as_dest_module = []
+                bridge_comms_as_src_module = []
+                # If first stage, check if the module has any incoming modules
+                # If so, initialize bridge communicator
+                if pp_rank == 0:
+                    for bridge_comm in self.bridge_comms:
+                        if (
+                            bridge_comm.is_current_rank_in_grid(bridge_comm.dest_grid)
+                            and bridge_comm.dest_module_name == module_name
+                        ):
+                            bridge_comms_as_dest_module.append(bridge_comm)
+                # If last stage, check if the module has any outgoing modules
+                # If so, initialize bridge communicator
+                if pp_rank == pp_size - 1:
+                    for bridge_comm in self.bridge_comms:
+                        if (
+                            bridge_comm.is_current_rank_in_grid(bridge_comm.src_grid)
+                            and bridge_comm.src_module_name == module_name
+                        ):
+                            bridge_comms_as_src_module.append(bridge_comm)
+                # Build RankModuleInfo for the module
+                rank_module_info = RankModuleInfo(
+                    pp_rank=pp_rank,
+                    pp_size=pp_size,
+                    p2p_communicator=p2p_comm,
+                    bridge_comms_as_dest_module=bridge_comms_as_dest_module,
+                    bridge_comms_as_src_module=bridge_comms_as_src_module,
+                )
+                self.rank_module_map[module_name] = rank_module_info
+
+    def recv_forward(
+        self, tensor_shape: Optional[Shape] = None, is_first_stage: bool = False
+    ) -> Dict[str, torch.Tensor]:
+        """Receive forward activation tensor.
+
+        Args:
+            tensor_shape: Expected activation tensor shape
+
+        Returns:
+            A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[receive_forward] tensors_shape: {tensor_shape}, is_first_stage: {is_first_stage}"
+        )
+        input_dict = {}
+        for module_name, rank_module_info in self.rank_module_map.items():
+
+            if rank_module_info.pp_rank == 0:
+                # If first stage, and has incoming modules, receive forward activation
+                # from incoming modules.
+                for bridge_comm in rank_module_info.bridge_comms_as_dest_module:
+                    input_dict[bridge_comm.src_module_name] = bridge_comm.recv_forward()
+            else:
+                # If not first stage, receive forward activation tensor from P2P communicator.
+                input_dict[module_name] = rank_module_info.p2p_communicator.recv_forward(
+                    tensor_shapes=tensor_shape, is_first_stage=False
+                )
+        return input_dict
+
+    def send_forward(self, output_dict: Dict[str, torch.Tensor], is_last_stage: bool = False):
+        """Send forward activation tensor.
+
+        Args:
+            output_dict: A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[send_forward] output_dict keys: {output_dict.keys()}, is_last_stage: {is_last_stage}"
+        )
+        for module_name, rank_module_info in self.rank_module_map.items():
+            if rank_module_info.pp_rank == rank_module_info.pp_size - 1:
+                # If last stage, and has outgoing modules, send forward activation
+                # by using bridge communicator.
+                for bridge_comm in rank_module_info.bridge_comms_as_src_module:
+                    bridge_comm.send_forward(output_dict[module_name])
+            else:
+                # If not last stage, send forward activation by using P2P communicator.
+                rank_module_info.p2p_communicator.send_forward(
+                    output_dict[module_name], is_last_stage=False
+                )
+
+    def send_forward_recv_backward(
+        self,
+        output_dict: Dict[str, torch.Tensor],
+        tensor_shape: Optional[Shape] = None,
+        is_last_stage: bool = False,
+    ) -> Dict[str, torch.Tensor]:
+        """Send forward activation tensor and receive backward activation tensor.
+
+        Args:
+            output_dict: A dictionary mapping module names to tensors.
+            tensor_shape: Expected gradient tensor shape
+
+        Returns:
+            A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[send_forward_recv_backward] output_dict keys: {output_dict.keys()}, "
+            f"tensor_shape: {tensor_shape}, is_last_stage: {is_last_stage}"
+        )
+        grad_dict = {}
+        for module_name, rank_module_info in self.rank_module_map.items():
+            if rank_module_info.pp_rank == rank_module_info.pp_size - 1:
+                # If last stage, and has outgoing modules, send forward activation and
+                # receive backward gradient by using bridge communicator.
+                for bridge_comm in rank_module_info.bridge_comms_as_src_module:
+                    grad_dict[bridge_comm.src_module_name] = bridge_comm.send_forward_recv_backward(
+                        output_dict[module_name]
+                    )
+            else:
+                # If not last stage, send forward activation and receive backward gradient
+                # by using P2P communicator.
+                grad_dict[module_name] = (
+                    rank_module_info.p2p_communicator.send_forward_recv_backward(
+                        output_dict[module_name], tensor_shapes=tensor_shape, is_last_stage=False
+                    )
+                )
+        return grad_dict
+
+    def send_backward_recv_forward(
+        self,
+        grad_dict: Dict[str, torch.Tensor],
+        tensor_shape: Optional[Shape] = None,
+        is_first_stage: bool = False,
+    ) -> Dict[str, torch.Tensor]:
+        """Send backward activation tensor and receive forward activation tensor.
+
+        Args:
+            grad_dict: A dictionary mapping module names to tensors.
+            tensor_shape: Expected gradient tensor shape
+
+        Returns:
+            A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[send_backward_recv_forward] grad_dict keys: {grad_dict.keys()}, "
+            f"tensor_shape: {tensor_shape}, is_first_stage: {is_first_stage}"
+        )
+        input_dict = {}
+        for module_name, rank_module_info in self.rank_module_map.items():
+            if rank_module_info.pp_rank == 0:
+                for bridge_comm in rank_module_info.bridge_comms_as_dest_module:
+                    # If first stage, and has incoming modules, send backward gradient and
+                    # receive forward activation by using bridge communicator.
+                    input_dict[bridge_comm.src_module_name] = (
+                        bridge_comm.send_backward_recv_forward(
+                            grad_dict[bridge_comm.src_module_name]
+                        )
+                    )
+            else:
+                # If not first stage, send backward gradient and receive forward activation
+                # by using P2P communicator.
+                input_dict[module_name] = (
+                    rank_module_info.p2p_communicator.send_backward_recv_forward(
+                        grad_dict[module_name], tensor_shapes=tensor_shape, is_first_stage=False
+                    )
+                )
+        return input_dict
+
+    def recv_backward(
+        self, tensor_shape: Optional[Shape] = None, is_last_stage: bool = False
+    ) -> Dict[str, torch.Tensor]:
+        """Receive backward activation tensor.
+
+        Args:
+            tensor_shape: Expected gradient tensor shape
+
+        Returns:
+            A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[recv_backward] tensor_shape: {tensor_shape}, is_last_stage: {is_last_stage}"
+        )
+        grad_dict = {}
+        for module_name, rank_module_info in self.rank_module_map.items():
+            if rank_module_info.pp_rank == rank_module_info.pp_size - 1:
+                # If last stage, and has incoming modules, receive backward gradient
+                # by using bridge communicator.
+                for bridge_comm in rank_module_info.bridge_comms_as_src_module:
+                    grad_dict[bridge_comm.src_module_name] = bridge_comm.recv_backward()
+            else:
+                # If not last stage, receive backward gradient by using P2P communicator.
+                grad_dict[module_name] = rank_module_info.p2p_communicator.recv_backward(
+                    tensor_shapes=tensor_shape, is_last_stage=False
+                )
+        return grad_dict
+
+    def send_backward(self, grad_dict: Dict[str, torch.Tensor], is_first_stage: bool = False):
+        """Send backward activation tensor.
+
+        Args:
+            grad_dict: A dictionary mapping module names to tensors.
+        """
+        logging.debug(
+            f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] "
+            f"[send_backward] grad_dict keys: {grad_dict.keys()}, is_first_stage: {is_first_stage}"
+        )
+        for module_name, rank_module_info in self.rank_module_map.items():
+            if rank_module_info.pp_rank == 0:
+                # If first stage, and has incoming modules, send backward activation
+                # by using bridge communicator.
+                for bridge_comm in rank_module_info.bridge_comms_as_dest_module:
+                    bridge_comm.send_backward(grad_dict[bridge_comm.src_module_name])
+            else:
+                # If not first stage, send backward activation by using P2P communicator.
+                rank_module_info.p2p_communicator.send_backward(
+                    grad_dict[module_name], is_first_stage=False
+                )
+
+    @staticmethod
+    def compute_total_pipeline_stages(
+        topology: Dict[str, List[str]],
+        module_to_grid_map: Dict[str, HyperCommGrid],
+        rank: Optional[int] = None,
+        module_name: Optional[str] = None,
+    ) -> int:
+        """Compute the total number of pipeline stages across a multi-module chain.
+
+        Interprets ``topology`` as a directed acyclic graph (DAG) where nodes are modules
+        and edges indicate forward data flow from source to destination modules. Each node
+        is assigned a weight equal to its pipeline parallel size (number of PP stages).
+
+        The total number of stages is defined as the length of the longest path in this DAG
+        under node weights.
+
+        If ``rank`` is None (default), returns the maximum over all terminal (sink) modules of
+        the sum of PP sizes along a path ending at that terminal. For example, given:
+
+            image_encoder ->\
+                              -> llm -> generator
+            audio_encoder  ->/
+
+        the total is: max(pp(image_encoder), pp(audio_encoder)) + pp(llm) + pp(generator).
+
+        If ``rank`` is provided, the result is the total number of pipeline stages up to (and
+        including) the PP stage that ``rank`` occupies inside its module. In this case, the
+        weight of the target module equals (pp_rank_index(rank) + 1) instead of the module's
+        full PP size; other modules still contribute their full PP sizes. If the rank belongs to
+        multiple modules (colocation), pass ``module_name`` to disambiguate; otherwise the
+        maximum across all candidate modules containing the rank is returned.
+
+        Args:
+            topology: Mapping from a module to its list of outgoing modules.
+            module_to_grid_map: Mapping from module name to its ``HyperCommGrid``.
+
+        Returns:
+            The total number of pipeline stages along the longest path given the constraints.
+
+        Raises:
+            ValueError: If the topology contains cycles; or has no terminal nodes when
+                ``rank`` is None
+        """
+        nodes = set(module_to_grid_map.keys())
+        # Build adjacency and reverse-adjacency (predecessors).
+        adj: Dict[str, List[str]] = {node: list(topology.get(node, [])) for node in nodes}
+        preds: Dict[str, List[str]] = {node: [] for node in nodes}
+        for src, outs in adj.items():
+            for dst in outs:
+                preds[dst].append(src)
+
+        # Identify terminal nodes (no outgoing edges) for the rank=None case.
+        sinks = [node for node, outs in adj.items() if not outs]
+        if rank is None and not sinks:
+            raise ValueError(
+                "Topology must be a DAG with at least one terminal (no outgoing) module."
+            )
+
+        def pp_size(name: str) -> int:
+            grid = module_to_grid_map[name]
+            pp_dim_index = grid.dim_names.index('pp')
+            return grid.shape[pp_dim_index]
+
+        def partial_weight_for_target(target: str) -> Optional[int]:
+            if rank is None:
+                return None
+            grid = module_to_grid_map.get(target)
+            rank_groups = grid._gen_rank_enum(['pp'])
+            stage_index: Optional[int] = None
+            for group in rank_groups:
+                if rank in group:
+                    stage_index = group.index(rank)
+                    break
+            return stage_index + 1
+
+        def longest_path_to(target: str) -> int:
+            visiting = set()
+            partial = partial_weight_for_target(target)
+
+            def weight(name: str) -> int:
+                if partial is not None and name == target:
+                    return partial
+                return pp_size(name)
+
+            def dfs(node: str) -> int:
+                if node in visiting:
+                    raise ValueError("Topology contains cycles; expected a DAG.")
+                visiting.add(node)
+                best = 0
+                for p in preds.get(node, []):
+                    val = dfs(p)
+                    if val > best:
+                        best = val
+                visiting.remove(node)
+                return weight(node) + best
+
+            return dfs(target)
+
+        if rank is None:
+            return max(longest_path_to(sink) for sink in sinks)
+
+        return longest_path_to(module_name)
diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py
index 9aeff883c48..a223d64e6a3 100644
--- a/megatron/core/pipeline_parallel/p2p_communication.py
+++ b/megatron/core/pipeline_parallel/p2p_communication.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 
 from typing import List, Optional, Tuple, Union
@@ -217,22 +217,22 @@ def _communicate_shapes(self, tensor_send_next, tensor_send_prev, recv_prev, rec
             ops = []
             if send_prev_shape_tensor is not None:
                 send_prev_op = torch.distributed.P2POp(
-                    torch.distributed.isend, send_prev_shape_tensor, self.prev_rank
+                    torch.distributed.isend, send_prev_shape_tensor, self.prev_rank, self.pp_group
                 )
                 ops.append(send_prev_op)
             if recv_prev_shape_tensor is not None:
                 recv_prev_op = torch.distributed.P2POp(
-                    torch.distributed.irecv, recv_prev_shape_tensor, self.prev_rank
+                    torch.distributed.irecv, recv_prev_shape_tensor, self.prev_rank, self.pp_group
                 )
                 ops.append(recv_prev_op)
             if send_next_shape_tensor is not None:
                 send_next_op = torch.distributed.P2POp(
-                    torch.distributed.isend, send_next_shape_tensor, self.next_rank
+                    torch.distributed.isend, send_next_shape_tensor, self.next_rank, self.pp_group
                 )
                 ops.append(send_next_op)
             if recv_next_shape_tensor is not None:
                 recv_next_op = torch.distributed.P2POp(
-                    torch.distributed.irecv, recv_next_shape_tensor, self.next_rank
+                    torch.distributed.irecv, recv_next_shape_tensor, self.next_rank, self.pp_group
                 )
                 ops.append(recv_next_op)
             if len(ops) > 0:
@@ -302,13 +302,13 @@ def _communicate(
         tensor_recv_prev_func = None
         tensor_recv_next_func = None
 
-        if not config.variable_seq_lengths:
-            recv_prev_shape = tensor_shape
-            recv_next_shape = tensor_shape
-        else:
+        if config.variable_seq_lengths or config.mtp_standalone:
             recv_prev_shape, recv_next_shape = self._communicate_shapes(
                 tensor_send_next, tensor_send_prev, recv_prev, recv_next
             )
+        else:
+            recv_prev_shape = tensor_shape
+            recv_next_shape = tensor_shape
 
         def create_tensor_recv_prev():
             return torch.empty(
diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py
index 8269be657c4..91cda2db6ad 100644
--- a/megatron/core/pipeline_parallel/schedules.py
+++ b/megatron/core/pipeline_parallel/schedules.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import contextlib
 from functools import partial
@@ -9,6 +9,9 @@
 
 from megatron.core import parallel_state
 from megatron.core.enums import ModelType
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator
 from megatron.core.pipeline_parallel.utils import (
     is_pp_first_stage,
@@ -18,6 +21,7 @@
 )
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.cuda_graphs import create_cudagraphs
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler
 from megatron.core.utils import (
     drain_embedding_wgrad_compute,
@@ -32,12 +36,13 @@
     combined_1f1b_schedule_for_interleaved_pipelining,
     combined_1f1b_schedule_for_no_pipelining,
 )
+from .hybrid_cp_schedule import hybrid_context_parallel_forward_backward
 
 # Types
 Shape = Union[List[int], torch.Size]
 
 
-def get_forward_backward_func():
+def get_forward_backward_func(pp_size: Optional[int] = None, vp_size: Optional[int] = None):
     """Retrieves the appropriate forward_backward function given the
     configuration of parallel_state.
 
@@ -106,9 +111,9 @@ def forward_step(data_iterator, model):
     decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack
         transformer. This is ignored for a single-stack transformer.
 
-    forward_only (optional, default = False): Perform only the forward step
+    forward_only (optional, default = False): Perform only the forward step.
 
-    collect_non_loss_data (optional, bool, default=False): TODO
+    collect_non_loss_data (optional, bool, default=False): TODO.
 
     first_val_step (bool, optional): Is the first step of the validation phase. Used by
         Transformer Engine modules to only update their fp8 weights only on the first validation
@@ -120,14 +125,28 @@ def forward_step(data_iterator, model):
         respective list of shapes. Thus it is not used in the other forward-backward functions
         which have different shape handling.
 
+    force_all_reduce (bool, optional): If true, force use of all-reduce for gradient reduction
+        instead of reduce-scatter (if using distributed optimizer) in this iteration to ensure all
+        data-parallel ranks have fully reduced gradients. This is useful for easier wgrad saving
+        (can just inspect DP replica 0 to get full set of wgrads for entire model).
+
+    Args:
+        pp_size (Optional[int]): Pipeline model parallel size to use.
+        vp_size (Optional[int]): Virtual pipeline model parallel size to use.
+            If both pp_size and vp_size are None, both values fall back to parallel_state.
+            Otherwise, provided values are used as-is and None is treated as an explicit input.
+
     """
-    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
-    if pipeline_model_parallel_size > 1:
-        ######### FlagScale Modify ########
-        if parallel_state.get_dualpipev_pipeline_model_parallel_world_size() is not None:
+    if pp_size is None and vp_size is None:
+        pp_size = parallel_state.get_pipeline_model_parallel_world_size()
+        vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
+
+    if pp_size > 1:
+        dualpipev_size = parallel_state.get_dualpipev_pipeline_model_parallel_world_size()
+        if dualpipev_size is not None:
             from megatron.plugin.dualpipev.dualpipev_schedules import forward_backward_pipelining_with_dualpipev
             forward_backward_func = forward_backward_pipelining_with_dualpipev
-        elif parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
+        elif vp_size is not None:
             forward_backward_func = forward_backward_pipelining_with_interleaving
         else:
             forward_backward_func = forward_backward_pipelining_without_interleaving
@@ -235,7 +254,9 @@ def forward_step_calc_loss(
 
     num_tokens = torch.tensor(0, dtype=torch.int)
     if is_last_stage:
-        if not collect_non_loss_data:
+        if loss_func is None:
+            forward_data_store.append(output_tensor)
+        elif not collect_non_loss_data:
             outputs = loss_func(output_tensor)
             if len(outputs) == 3:
                 output_tensor, num_tokens, loss_reduced = outputs
@@ -272,7 +293,7 @@ def forward_step_calc_loss(
         if config.calculate_per_token_loss:
             MoEAuxLossAutoScaler.set_loss_scale(loss_scale)
         else:
-            MoEAuxLossAutoScaler.set_loss_scale(loss_scale / num_microbatches)
+            MoEAuxLossAutoScaler.set_loss_scale(loss_scale * cp_group_size / num_microbatches)
 
     # Set the loss scale for Multi-Token Prediction (MTP) loss.
     if hasattr(config, 'mtp_num_layers') and config.mtp_num_layers is not None:
@@ -511,7 +532,9 @@ def forward_backward_no_pipelining(
     collect_non_loss_data: bool = False,
     first_val_step: Optional[bool] = None,
     adjust_tensor_shapes_fn: Optional[Callable] = None,  # unused
+    p2p_communicator: Optional[P2PCommunicator] = None,  # unused
     pg_collection: Optional[ProcessGroupCollection] = None,
+    force_all_reduce: Optional[bool] = False,
 ):
     """Run forward and backward passes with no pipeline parallelism"""
 
@@ -593,6 +616,24 @@ def forward_backward_no_pipelining(
             total_num_tokens,
             partial(check_first_val_step, first_val_step, forward_only),
         )
+    elif config.hybrid_context_parallel:
+        forward_data_store, total_num_tokens = hybrid_context_parallel_forward_backward(
+            forward_step_func,
+            data_iterator,
+            model,
+            num_microbatches,
+            input_tensor,
+            output_tensor_grad,
+            forward_data_store,
+            config,
+            collect_non_loss_data,
+            first_val_step,
+            forward_only,
+            no_sync_func,
+            total_num_tokens,
+            check_first_val_step,
+            model_type,
+        )
     else:
         with no_sync_func():
             for i in range(num_microbatches - 1):
@@ -644,15 +685,19 @@ def forward_backward_no_pipelining(
             [model],
             total_num_tokens if config.calculate_per_token_loss else None,
             pg_collection=pg_collection,
+            force_all_reduce=force_all_reduce,
         )
 
+    if not forward_only and config.fine_grained_activation_offloading:
+        off_interface.reset()
+
     if config.timers is not None:
         config.timers('forward-backward').stop()
 
     if (
         hasattr(config, 'cuda_graph_impl')
         and config.cuda_graph_impl == "local"
-        and config.cuda_graph_scope != "full_iteration"
+        and CudaGraphScope.full_iteration not in config.cuda_graph_scope
     ):
         create_cudagraphs()
 
@@ -784,32 +829,6 @@ def get_schedule_table(num_microbatches, num_model_chunks, microbatch_group_size
     return schedule_table
 
 
-def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, schedule_table):
-    """Convert a tunable schedule lookup table to the te.make_graphed_callables() accepted
-    order format. For example, the tunable schedule table for PP2 N3M5 with VP2 is as below:
-    virtual_microbatch_id | 0 1 2 3 4 5 6 7 8 9
-    microbatch_id         | 0 1 2 0 1 2 3 4 3 4
-    model_chunk_id        | 0 0 0 1 1 1 0 0 1 1
-
-    Then the forward backward separated order is:
-    forward               | 1 1 1 2 2 2 1 1 2 2
-    backward              | -2 -2 -2 -1 -1 -1 -2 -2 -1 -1
-
-    If num_warmup_microbatches is 5, the output order is:
-    1 1 1 2 2 2 -2 1 -2 1 -2 2 -1 2 -1 -1 -2 -2 -1 -1
-    """
-    _, model_chunk_id_table = zip(*schedule_table)
-    forward_order = [chunk_id + 1 for chunk_id in model_chunk_id_table]
-    backward_order = [chunk_id - num_model_chunks for chunk_id in model_chunk_id_table]
-    order = forward_order[:num_warmup_microbatches]
-    for i in range(num_warmup_microbatches, len(forward_order)):
-        order.append(forward_order[i])
-        order.append(backward_order[i - num_warmup_microbatches])
-    if num_warmup_microbatches > 0:
-        order.extend(backward_order[-num_warmup_microbatches:])
-    return order
-
-
 def forward_backward_pipelining_with_interleaving(
     *,
     forward_step_func,
@@ -825,6 +844,7 @@ def forward_backward_pipelining_with_interleaving(
     adjust_tensor_shapes_fn: Optional[Callable] = None,  # unused
     p2p_communicator: Optional[P2PCommunicator] = None,
     pg_collection: Optional[ProcessGroupCollection] = None,
+    force_all_reduce: Optional[bool] = False,
 ):
     """Run interleaved 1F1B schedule (model split into model chunks), with
     communication between pipeline stages as needed.
@@ -1904,8 +1924,11 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
             model,
             total_num_tokens if config.calculate_per_token_loss else None,
             pg_collection=pg_collection,
+            force_all_reduce=force_all_reduce,
         )
 
+    if not forward_only and config.fine_grained_activation_offloading:
+        off_interface.reset()
     # Restore config.grad_sync_func and config.param_sync_func.
     if forward_only:
         config.grad_sync_func, config.param_sync_func = grad_sync_func, param_sync_func
@@ -1916,7 +1939,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
     if (
         hasattr(config, 'cuda_graph_impl')
         and config.cuda_graph_impl == "local"
-        and config.cuda_graph_scope != "full_iteration"
+        and CudaGraphScope.full_iteration not in config.cuda_graph_scope
     ):
         create_cudagraphs()
     nvtx_range_pop(suffix="misc")
@@ -1965,6 +1988,7 @@ def forward_backward_pipelining_without_interleaving(
     adjust_tensor_shapes_fn: Optional[Callable] = None,
     p2p_communicator: Optional[P2PCommunicator] = None,
     pg_collection: Optional[ProcessGroupCollection] = None,
+    force_all_reduce: Optional[bool] = False,
 ):
     """Run non-interleaved 1F1B schedule, with communication between pipeline
     stages. Returns dictionary with losses if the last stage, empty dict otherwise."""
@@ -2301,15 +2325,19 @@ def enable_grad_sync():
             [model],
             total_num_tokens if config.calculate_per_token_loss else None,
             pg_collection=pg_collection,
+            force_all_reduce=force_all_reduce,
         )
 
+    if not forward_only and config.fine_grained_activation_offloading:
+        off_interface.reset()
+
     if config.timers is not None:
         config.timers('forward-backward').stop()
 
     if (
         hasattr(config, 'cuda_graph_impl')
         and config.cuda_graph_impl == "local"
-        and config.cuda_graph_scope != "full_iteration"
+        and CudaGraphScope.full_iteration not in config.cuda_graph_scope
     ):
         create_cudagraphs()
 
diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py
index fae8e5466da..03c5f01f443 100644
--- a/megatron/core/pipeline_parallel/utils.py
+++ b/megatron/core/pipeline_parallel/utils.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+import logging
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Callable, Optional
@@ -7,7 +8,9 @@
 import torch
 from torch.autograd import Variable
 
-from megatron.core.utils import get_pg_rank, get_pg_size, make_viewless_tensor
+from megatron.core.utils import get_pg_rank, get_pg_size, log_single_rank, make_viewless_tensor
+
+logger = logging.getLogger(__name__)
 
 
 def is_pp_first_stage(pp_group: torch.distributed.ProcessGroup):
@@ -80,6 +83,39 @@ def make_viewless(e):
     return e
 
 
+def set_ideal_affinity_for_current_gpu():
+    """Set CPU affinity for the current GPU to optimize host-device transfers."""
+    import uuid
+
+    try:
+        import cuda.bindings.driver as cuda_driver
+        import cuda.bindings.runtime as cuda_runtime
+    except:
+        try:
+            import cuda.cuda as cuda_driver
+            import cuda.cudart as cuda_runtime
+        except:
+            raise RuntimeError("Please install cuda-python to enable GPU affinity setting")
+    import pynvml
+
+    # Get current CUDA device ID
+    err, device_id = cuda_runtime.cudaGetDevice()
+    assert err == cuda_runtime.cudaError_t.cudaSuccess
+    # Get device UUID
+    err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id)
+    assert err == cuda_driver.CUresult.CUDA_SUCCESS
+    # Set CPU affinity based on GPU's NUMA node
+    pynvml.nvmlInit()
+    handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes)))
+    pynvml.nvmlDeviceSetCpuAffinity(handle)
+
+    log_single_rank(
+        logger,
+        logging.WARNING,
+        f"Set CPU affinity for all GPUs for optimal host-device transfer performance",
+    )
+
+
 @contextmanager
 def stream_acquire_context(stream, event):
     """Stream acquire context"""
@@ -149,6 +185,8 @@ def __init__(
         self.free_input = free_input
         self.inputs = None
         self.outputs = None
+        self.delay_grads_release = False
+        self.manual_release_grads = False
 
     def default_backward_func(self, outputs, output_grad):
         """Default backward function"""
@@ -230,6 +268,12 @@ def _backward(self, *output_grad):
             for g in output_grad:
                 if g is not None:
                     g.record_stream(self.stream)
+                    # Manually trigger the memory release of dgrad tensor
+                    # to avoid delayed garbage collection. If
+                    # delay_grads_release is True, dgrad is last used in
+                    # wgrad compute and skip the release here.
+                    if self.manual_release_grads and not self.delay_grads_release:
+                        g.untyped_storage().resize_(0)
 
         grads = self.get_grad()
         self._release_state()
diff --git a/megatron/core/post_training/modelopt/__init__.py b/megatron/core/post_training/modelopt/__init__.py
index 885d2b3f019..7390d1bb24d 100644
--- a/megatron/core/post_training/modelopt/__init__.py
+++ b/megatron/core/post_training/modelopt/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-"""Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt).
+"""Integrations with NVIDIA Model Optimizer (referred to as ModelOpt).
 
 ModelOpt is a library comprising state-of-the-art model optimization techniques
 including quantization and sparsity to compress model for efficient inference on
 NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
 experience for users to optimize their Megatron-core models for inference.
 More details on ModelOpt including installation and usage can be found at
-https://github.com/NVIDIA/TensorRT-Model-Optimizer.
+https://github.com/NVIDIA/Model-Optimizer.
 """
diff --git a/megatron/core/post_training/modelopt/gpt/model_specs.py b/megatron/core/post_training/modelopt/gpt/model_specs.py
index 0b62b0c057c..b705997ad40 100644
--- a/megatron/core/post_training/modelopt/gpt/model_specs.py
+++ b/megatron/core/post_training/modelopt/gpt/model_specs.py
@@ -50,7 +50,7 @@ def get_gpt_modelopt_spec(
         config: model's transformer config
         local_core_attention: whether to use local DotProductAttention or TEDotProductAttention
         remap_te_layernorm: whether to perform sharded state_dict prefix mapping on layernorm
-        real_quant_cfg: TensorRT Model Optimizer real quantization config
+        real_quant_cfg: Model Optimizer real quantization config
         qk_l2_norm: whether to use Llama4 L2 norm for Q and K
         use_arbitrary_attention_mask: whether to use arbitrary attention mask instead of causal
     """
diff --git a/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py b/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py
index 596d210f581..22624d9ab2a 100644
--- a/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py
+++ b/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
+import logging
 from logging import getLogger
 
-import torch
+from megatron.core.utils import log_single_rank
 
 logger = getLogger(__name__)
 
@@ -58,7 +59,6 @@ def mcore_gpt_load_te_state_dict_pre_hook(
                 key_rewrite_list += [(key, key.replace(old_name, new_name))]
 
     for old_key, new_key in key_rewrite_list:
-        if torch.distributed.get_rank() == 0:
-            logger.info("replace {} with {}".format(old_key, new_key))
+        log_single_rank(logger, logging.INFO, "replace {} with {}".format(old_key, new_key))
         state_dict[new_key] = state_dict[old_key]
         state_dict.pop(old_key)
diff --git a/megatron/core/post_training/modelopt/layers.py b/megatron/core/post_training/modelopt/layers.py
index 0ca4a8e4070..698fae4478a 100644
--- a/megatron/core/post_training/modelopt/layers.py
+++ b/megatron/core/post_training/modelopt/layers.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+import logging
 from typing import Callable, List, Optional
 
 import torch
@@ -10,6 +11,8 @@
 from megatron.core.transformer.transformer_layer import TransformerLayer
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 
+logger = logging.getLogger(__name__)
+
 try:
     import transformer_engine as te
 
@@ -17,6 +20,7 @@
 except ImportError:
     HAVE_TE = False
 
+
 FP8_PER_TENSOR_REAL_QUANT_CFG = {
     "quant_cfg": {
         "*weight_quantizer": {"num_bits": (4, 3), "axis": None},
@@ -116,6 +120,7 @@ def __init__(
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
     ):
         self.config = config
+        self.tp_group = tp_group
 
         self._return_bias = skip_bias_add and bias
 
@@ -153,7 +158,11 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
                 if v.ndim == 0:
                     state_dict[k] = v.view(1)
         sharded_state_dict = make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, sharded_offsets=sharded_offsets
+            state_dict,
+            prefix,
+            sharded_offsets=sharded_offsets,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
         return sharded_state_dict
 
@@ -170,7 +179,7 @@ class RealQuantTransformerLayer(TransformerLayer):
     """Real quantization transformer layer base class.
 
     This base class iniitialize the default TransformerLayer and immediately
-    perform weight-only real quantization via TensorRT Model Optimizer.
+    perform weight-only real quantization via Model Optimizer.
     All linear weights (Linear, ColumnParallelLinear, RowParallelLinear) picked
     up will be replaced with low-bit data type (default torch.uint8). If sub-byte
     real_quant_cfg is used, the weight shape will further be half.
@@ -229,7 +238,7 @@ def _report_quantize_tensor_info(self):
                 if not isinstance(v, torch.Tensor):
                     continue
                 original_dtype, original_shape = self._original_tensor_info.get(k, ("-", "-"))
-                print(
+                logger.info(
                     "{:<64} {:<16} {:<32} {:<16} {:<32}".format(
                         k, original_dtype, original_shape, str(v.dtype), str(v.shape)
                     )
diff --git a/megatron/core/post_training/modelopt/mamba/model_specs.py b/megatron/core/post_training/modelopt/mamba/model_specs.py
index e8a14212bc3..0a38d05b980 100755
--- a/megatron/core/post_training/modelopt/mamba/model_specs.py
+++ b/megatron/core/post_training/modelopt/mamba/model_specs.py
@@ -2,6 +2,7 @@
 
 from megatron.core.extensions.transformer_engine import TEDotProductAttention
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
 from megatron.core.post_training.modelopt.layers import Norm
 from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
 from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
@@ -83,9 +84,23 @@ def get_mamba_stack_modelopt_spec(
         ),
     )
 
+    moe_layer = ModuleSpec(
+        module=TransformerLayer,
+        submodules=TransformerLayerSubmodules(
+            pre_mlp_layernorm=Norm,
+            mlp=get_moe_module_spec(
+                use_te=False, num_experts=8, moe_grouped_gemm=False  # Can be anything non None
+            ),
+            mlp_bda=get_bias_dropout_add,
+        ),
+    )
+
     return ModuleSpec(
         module=MambaStack,
         submodules=MambaStackSubmodules(
-            mamba_layer=mamba_layer, attention_layer=attention_layer, mlp_layer=mlp_layer
+            mamba_layer=mamba_layer,
+            attention_layer=attention_layer,
+            mlp_layer=mlp_layer,
+            moe_layer=moe_layer,
         ),
     )
diff --git a/megatron/core/quantization/quant_config.py b/megatron/core/quantization/quant_config.py
index 0005b4d39fb..81aed23e927 100644
--- a/megatron/core/quantization/quant_config.py
+++ b/megatron/core/quantization/quant_config.py
@@ -50,6 +50,8 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 
+from megatron.core.utils import log_single_rank
+
 logger = logging.getLogger(__name__)
 
 try:
@@ -134,11 +136,13 @@ def __init__(self, matchers: List[Matcher], config_dict: Dict[str, Dict]):
         self.matchers = matchers
 
     @staticmethod
-    def _build_matchers(matchers_dict: Dict) -> List[Matcher]:
+    def _build_matchers(matchers_dict: Dict | None) -> List[Matcher]:
         # NOTE(slayton): We rely on order for matchers because it allows us to specify an
         # override ordering from the yaml structure. Process matchers in order of
         # definition, so we can have fallthrus.
         matchers: List[Matcher] = []
+        if matchers_dict is None:
+            return matchers
 
         for name, matcher in matchers_dict.items():
             enabled = matcher.get("enabled", False)
@@ -172,7 +176,13 @@ def from_yaml_file(recipe_yaml_path: str) -> "RecipeConfig":
             raise ImportError("yaml is not installed. Please install it with `pip install pyyaml`.")
 
         with open(recipe_yaml_path, "r") as f:
-            config = yaml.load(f, Loader=yaml.FullLoader)
+            config = yaml.load(f, Loader=yaml.SafeLoader)
+
+        log_single_rank(
+            logger,
+            logging.INFO,
+            f"Loaded quantization recipe from path '{recipe_yaml_path}'. " f"Contents: '{config}'",
+        )
 
         return RecipeConfig.from_config_dict(config)
 
@@ -182,7 +192,7 @@ def from_config_dict(config: Dict) -> "RecipeConfig":
 
         matchers_config = config.get("matchers", None)
         matchers = RecipeConfig._build_matchers(matchers_config)
-        config_dict = config.get("configs", None)
+        config_dict = config.get("configs", {})
 
         return RecipeConfig(matchers, config_dict)
 
@@ -194,9 +204,15 @@ def match_to_config_key(self, operator_context: MatchContext) -> str | None:
         for matcher in self.matchers:
             config_key = matcher.match(operator_context)
             if config_key is not None:
-                logger.info(f'Context ({operator_context}) matched to quant config "{config_key}"')
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    f'Context ({operator_context}) matched to quant config "{config_key}"',
+                )
                 return config_key
-        logger.info(f"No config key match found for Context ({operator_context})")
+        log_single_rank(
+            logger, logging.INFO, f"No config key match found for Context ({operator_context})"
+        )
         return None
 
     def match(self, operator_context: MatchContext) -> QuantizationConfig | None:
diff --git a/megatron/core/quantization/utils.py b/megatron/core/quantization/utils.py
index 519cd8efd9d..5a013be27e4 100644
--- a/megatron/core/quantization/utils.py
+++ b/megatron/core/quantization/utils.py
@@ -10,21 +10,19 @@ def get_quant_config_or_none(
     module_path: str, recipe: Optional[RecipeConfig] = None
 ) -> Union[QuantizationConfig, None]:
     """Resolve quantization config for a layer."""
+    if recipe is None:
+        return None
     re_match = re.search(r'layers\.(\d+)', module_path)
     if re_match:
         layer_number: Optional[int] = int(re_match.group(1))
     else:
         layer_number = None
-    if recipe is not None:
-        return recipe.match(MatchContext(module_path=module_path, layer_number=layer_number))
-    else:
-        return None
+    return recipe.match(MatchContext(module_path=module_path, layer_number=layer_number))
 
 
 def load_quantization_recipe(recipe_path: str) -> RecipeConfig:
     """Loads a quantization recipe from a path."""
     recipe = RecipeConfig.from_yaml_file(recipe_path)
-
     return recipe
 
 
diff --git a/megatron/core/rerun_state_machine.py b/megatron/core/rerun_state_machine.py
index 3cd6999ab2b..8fce2beaa85 100644
--- a/megatron/core/rerun_state_machine.py
+++ b/megatron/core/rerun_state_machine.py
@@ -9,11 +9,12 @@
 import re
 from collections import defaultdict
 from enum import Enum
-from typing import Any, Callable, Iterable, NamedTuple, Optional, Set, Tuple, Union
+from typing import Any, Callable, Iterable, List, NamedTuple, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
 
+from megatron.core._rank_utils import log_single_rank, safe_get_rank
 from megatron.core.dist_checkpointing.mapping import ShardedObject
 
 """DISCLAIMER: THIS IS AN EXPERIMENTAL FEATURE.
@@ -234,14 +235,12 @@ def __init__(
 
         self.saved_results: dict[Call, Any] = {}
         self.stats: dict[Caller, QuickStats] = defaultdict(lambda: QuickStats())
-        if _safe_get_rank() == 0:
-            logger.warning(f"RerunStateMachine initialized in mode {mode}")
+        log_single_rank(logger, logging.WARNING, f"RerunStateMachine initialized in mode {mode}")
 
     def set_mode(self, mode: RerunMode) -> None:
         """Method to set the operating mode"""
 
-        if _safe_get_rank() == 0:
-            logger.warning(f"Setting RerunStateMachine mode {mode}")
+        log_single_rank(logger, logging.WARNING, f"Setting RerunStateMachine mode {mode}")
         self.mode = mode
 
     def get_mode(self) -> RerunMode:
@@ -249,6 +248,24 @@ def get_mode(self) -> RerunMode:
 
         return self.mode
 
+    def _reduce_any(self, value: Union[bool, List[bool]]) -> Union[bool, Tuple[bool, ...]]:
+        """
+        All-reduce a boolean value (or multiple boolean values) across the world group.
+
+        If any of the ranks have a True value, return True.
+        If all the ranks have a False value, return False.
+
+        For multiple inputs, returns a tuple.
+        """
+        if isinstance(value, list):
+            val_tensor: torch.Tensor = torch.tensor(value, dtype=torch.int32, device='cuda')
+            torch.distributed.all_reduce(val_tensor)
+            return tuple([x > 0 for x in val_tensor.tolist()])
+        else:
+            val_tensor: torch.Tensor = torch.tensor([value], dtype=torch.int32, device='cuda')
+            torch.distributed.all_reduce(val_tensor)
+            return val_tensor.item() > 0
+
     def should_run_forward_backward(self, data_iterator: DataIteratorArgType) -> bool:
         """Method instructing whether to (re)run the forward-backward pass.
 
@@ -306,14 +323,11 @@ def train_step(data_iterator, ...):
             if self.mode == RerunMode.DISABLED:
                 self.state = RerunState.NOT_RUNNING_YET
                 return False
-            will_rerun_tensor: torch.Tensor = torch.tensor(
-                [self.rerun_requested], dtype=torch.int32, device="cuda"
-            )
-            torch.distributed.all_reduce(will_rerun_tensor)
-            if will_rerun_tensor.item() == 0:
+            will_rerun = self._reduce_any(self.rerun_requested)
+            if not will_rerun:
                 self.state = RerunState.NOT_RUNNING_YET
                 return False
-            if self.mode == RerunMode.VALIDATE_RESULTS and _safe_get_rank() == 0:
+            if self.mode == RerunMode.VALIDATE_RESULTS and safe_get_rank() == 0:
                 logger.warning("Need to rerun step to check reproducibility of initial result")
             self.state = RerunState.RERUNNING_IN_PLACE
             self._restore_state()
@@ -330,11 +344,22 @@ def train_step(data_iterator, ...):
                 self._maybe_report_stats()
                 self.saved_results = defaultdict(list)
                 return False
-            will_checkpoint_tensor: torch.Tensor = torch.tensor(
-                [self.checkpoint_requested], dtype=torch.int32, device="cuda"
+            # N.B. We may be able to rely on the behavior of the state machine
+            # to produce an equivalent value of self.continue_requested across
+            # ranks, since it depends on "fatal". That logic coupling seems
+            # brittle though.
+            will_continue, will_checkpoint = self._reduce_any(
+                [self.continue_requested, self.checkpoint_requested]
             )
-            torch.distributed.all_reduce(will_checkpoint_tensor)
-            if will_checkpoint_tensor.item() > 0:
+            if will_continue:
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "Continuing normal execution because failed validation was not fatal",
+                )
+                self.state = RerunState.NOT_RUNNING_YET
+                return False
+            if will_checkpoint:
                 self.state = RerunState.WILL_RERUN_FROM_CHECKPOINT
             self._restore_state()
             if data_iterators:
@@ -347,27 +372,24 @@ def train_step(data_iterator, ...):
             return True
         # Are we done re-running from a checkpoint?
         elif self.state == RerunState.RERUNNING_FROM_CHECKPOINT:
-            will_restart_again_tensor: torch.Tensor = torch.tensor(
-                [self.restart_again_requested], dtype=torch.int32, device="cuda"
+            will_restart_again, will_continue = self._reduce_any(
+                [self.restart_again_requested, self.continue_requested]
             )
-            torch.distributed.all_reduce(will_restart_again_tensor)
-            if will_restart_again_tensor.item() > 0:
-                if _safe_get_rank() == 0:
-                    logger.warning(
-                        "Need to restart job from the same checkpoint "
-                        "because it was scheduled on the same node/GPU"
-                    )
+            if will_restart_again:
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "Need to restart job from the same checkpoint "
+                    "because it was scheduled on the same node/GPU",
+                )
                 self.state = RerunState.RERUNNING_AGAIN_FROM_CHECKPOINT
             else:
-                will_continue_tensor: torch.Tensor = torch.tensor(
-                    [self.continue_requested], dtype=torch.int32, device="cuda"
-                )
-                torch.distributed.all_reduce(will_continue_tensor)
-                if will_continue_tensor.item() > 0:
-                    if _safe_get_rank() == 0:
-                        logger.warning(
-                            "Continuing normal execution because failed validation was not fatal"
-                        )
+                if will_continue:
+                    log_single_rank(
+                        logger,
+                        logging.WARNING,
+                        "Continuing normal execution because failed validation was not fatal",
+                    )
                     self.state = RerunState.NOT_RUNNING_YET
             return False
         raise RuntimeError("Should not be here")
@@ -403,33 +425,37 @@ def train_step(data_iterator, ...):
         if self.mode in [RerunMode.DISABLED, RerunMode.REPORT_DETERMINISM_STATS]:
             return False, False, 0
         if self.state == RerunState.RERUNNING_IN_PLACE:
-            if _safe_get_rank() == 0:
-                logger.warning(
-                    "Exiting now. A checkpoint at the last iteration is being saved "
-                    "if further examination is needed"
-                )
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "Exiting now. A checkpoint at the last iteration is being saved "
+                "if further examination is needed",
+            )
             return True, True, EXIT_CODE_FAILED_ON_RESULT_VALIDATION
         elif self.state == RerunState.WILL_RERUN_FROM_CHECKPOINT:
-            if _safe_get_rank() == 0:
-                logger.warning(
-                    "Saving a checkpoint and exiting now. Please resume the job "
-                    "from the checkpoint to rerun the last iteration "
-                    "and establish a diagnostic"
-                )
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "Saving a checkpoint and exiting now. Please resume the job "
+                "from the checkpoint to rerun the last iteration "
+                "and establish a diagnostic",
+            )
             return True, True, EXIT_CODE_RESUME_TO_DISAMBIGUATE
         elif self.state == RerunState.RERUNNING_FROM_CHECKPOINT:
-            if _safe_get_rank() == 0:
-                logger.warning(
-                    "Exiting now. A checkpoint at the last iteration already exists "
-                    "if further examination is needed"
-                )
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "Exiting now. A checkpoint at the last iteration already exists "
+                "if further examination is needed",
+            )
             return False, True, EXIT_CODE_FAILED_ON_RESULT_VALIDATION
         elif self.state == RerunState.RERUNNING_AGAIN_FROM_CHECKPOINT:
-            if _safe_get_rank() == 0:
-                logger.warning(
-                    "Exiting now. Please resume the job from the same checkpoint "
-                    "to rerun the last iteration and establish a diagnostic"
-                )
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "Exiting now. Please resume the job from the same checkpoint "
+                "to rerun the last iteration and establish a diagnostic",
+            )
             return False, True, EXIT_CODE_RESUME_TO_DISAMBIGUATE
         return False, False, 0
 
@@ -454,8 +480,8 @@ def validate_result(
                 the 2. The default implementation is for 0-dim float tensors.
             tolerance: tolerance used in combination with comparison_func to determine
                 reproducibility of results. Default is no tolerance (deterministic calculations).
-            fatal: whether to abort the job when no HW fault was identified (unexpected result is
-                reproducible and correct).
+            fatal: whether to abort the job when fault attribution is complete
+                (transient/permanent/not HW)
         Returns:
             None
 
@@ -487,14 +513,14 @@ def train_step(data_iterator, ...):
         """
 
         # If reruns are disabled, still validate the result and throw a RuntimeError if it is
-        # rejected. This is a backward-compatible behavior.
+        # rejected when fatal. This is a backward-compatible behavior for infs and NaNs.
         if self.mode == RerunMode.DISABLED:
             result_rejected: bool = rejection_func(result)
             if result_rejected:
                 self._log_validation_error_to_file(
                     status=RerunValidationStatus.RERUN_DISABLED, result=result, message=message
                 )
-                rank: int = _safe_get_rank()
+                rank: int = safe_get_rank()
                 node: str = os.uname()[1]
                 device: int = torch.cuda.current_device()
                 full_message: str = (
@@ -502,7 +528,10 @@ def train_step(data_iterator, ...):
                     f"iteration {self.current_iteration}: "
                     f"Unexpected result {result} (message='{message}')"
                 )
-                raise RuntimeError(full_message)
+                if fatal:
+                    raise RuntimeError(full_message)
+                else:
+                    logger.warning(full_message)
             return
 
         if comparison_func is None:
@@ -517,7 +546,7 @@ def train_step(data_iterator, ...):
         # Handle the stats reporting mode. In that mode, we rerun every iteration once to collect
         # stats about any non-determinism in the calculations (as a relative difference between the
         # calculations in the initial run and in the re-run). The only assumption here is that the
-        # control flow is deterministic (so that the results corresponding to the nth invokation of
+        # control flow is deterministic (so that the results corresponding to the nth invocation of
         # validate_result() can be compared).
 
         if self.mode == RerunMode.REPORT_DETERMINISM_STATS:
@@ -532,19 +561,27 @@ def train_step(data_iterator, ...):
                 self.stats[caller].record(diff)
             return
 
-        def log_failure(message: str) -> None:
-            rank: int = _safe_get_rank()
+        def log_failure(message: str, fatal: bool = True) -> None:
+            rank: int = safe_get_rank()
             node: str = os.uname()[1]
             device: int = torch.cuda.current_device()
-            logger.error(f"Rank {rank}, node {node}, device {device}: {message}!")
+            if fatal:
+                logger.error(
+                    f"Rank {rank}, node {node}, device {device}, "
+                    f"iteration #{self.current_iteration}: {message}!"
+                )
+            else:
+                logger.warning(
+                    f"Rank {rank}, node {node}, device {device}, "
+                    f"iteration #{self.current_iteration}: {message}!"
+                )
 
         # Emit message in log so that we can identify which jobs have this instrumentation
         # enabled. We do this from the validate_result() method because some jobs may run with
         # the check_for_nan_in_loss_and_grad option but never call validate_result.
         if not self.logged_sdc_enabled:
             self.logged_sdc_enabled = True
-            if _safe_get_rank() == 0:
-                logger.warning("Result validation enabled")
+            log_single_rank(logger, logging.WARNING, "Result validation enabled")
 
         # If this the initial run of the iteration, and no unexpected result has already been
         # identified?
@@ -555,7 +592,7 @@ def log_failure(message: str) -> None:
             if not self.first_iteration_complete:
                 return
 
-            result_rejected: bool = self.error_injector.maybe_inject() or rejection_func(result)
+            result_rejected = self.error_injector.maybe_inject() or rejection_func(result)
             if result_rejected:
                 self.failed_validation_call = validation_call
                 self.initial_result = result
@@ -565,9 +602,9 @@ def log_failure(message: str) -> None:
                 )
                 logger.error(
                     f"Unexpected result {result} "
-                    f"on rank {_safe_get_rank()} "
+                    f"on rank {safe_get_rank()} "
                     f"at iteration #{self.current_iteration} "
-                    f"invokation #{validation_call.sequence} "
+                    f"invocation #{validation_call.sequence} "
                     f"(message='{message}')"
                 )
         # If this the first rerun (same GPU) or second 2nd rerun (different GPU), and have we
@@ -582,18 +619,25 @@ def log_failure(message: str) -> None:
             # This is the first re-run.
             if self.state == RerunState.RERUNNING_IN_PLACE:
                 if comparison > tolerance:
-                    logger.warning(
+                    if not fatal:
+                        self.continue_requested = True
+                    log_failure(
                         "First rerun: unexpected result is not reproducible within the tolerance "
-                        f"({result} != {self.initial_result})"
+                        f"({result} != {self.initial_result})",
+                        fatal=fatal,
                     )
                     self._log_validation_error_to_file(
                         status=RerunValidationStatus.FIRST_RERUN_NOT_REPRODUCIBLE,
                         result=result,
                         message=message,
                     )
-                    log_failure("Possible transient error!")
+                    log_failure("Possible transient error!", fatal=fatal)
+
                 else:
-                    self.checkpoint_requested = True
+                    if fatal:
+                        self.checkpoint_requested = True
+                    else:
+                        self.continue_requested = True
                     # Remember the node and device we're running on so that we can check we're not
                     # rerunning on the same GPU when we resume from the checkpoint.
                     self.suspicious_node = os.uname()[1]
@@ -603,16 +647,17 @@ def log_failure(message: str) -> None:
                         result=result,
                         message=message,
                     )
-                    logger.warning(
+                    log_failure(
                         "First rerun: unexpected result is reproducible within the tolerance "
                         f"({result} = {self.initial_result}). "
-                        "Need to rerun on a different GPU to verify correctness"
+                        "Need to rerun on a different GPU to verify correctness.",
+                        fatal=fatal,
                     )
             # This is the second re-run.
             elif self.state == RerunState.RERUNNING_FROM_CHECKPOINT:
                 # Ensure we're not on the same GPU as the first rerun.
-                node: str = os.uname()[1]
-                device: int = torch.cuda.current_device()
+                node = os.uname()[1]
+                device = torch.cuda.current_device()
                 if node == self.suspicious_node and device == self.suspicious_device:
                     logger.error(
                         f"Got rescheduled on the same GPU. Need to resume again from the same "
@@ -630,6 +675,8 @@ def log_failure(message: str) -> None:
                         f"therefore was likely incorrect ({result} != {self.initial_result})"
                     )
                     log_failure("Possible persistent error!")
+                    if not fatal:
+                        self.continue_requested = True
                 else:
                     self._log_validation_error_to_file(
                         status=RerunValidationStatus.SECOND_RERUN_REPRODUCIBLE,
@@ -666,7 +713,7 @@ def is_unexpectedly_large(
             threshold: a float representing the minimum trigger threshold
                 e.g. 10 means > 10x max absolute value observed.
             context: a string identifying the value. This is used to differentiate
-                between different invokations of validate_results targetting different
+                between different invocations of validate_results targeting different
                 values, e.g. loss and grads.
             num_samples: the sample size used to estimate the max value.
                 Default is 100 value samples.
@@ -747,11 +794,12 @@ def save_my_model_checkpoint(data_iterator, ...):
                 return None
 
             if ckpt_format != "torch_dist":
-                if _safe_get_rank() == 0:
-                    logger.warning(
-                        "RerunStateMachine checkpoints ONLY SUPPORTED "
-                        "for checkpoint format torch_dist"
-                    )
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "RerunStateMachine checkpoints ONLY SUPPORTED "
+                    "for checkpoint format torch_dist",
+                )
                 return None
 
         data_iterators: list[RerunDataIterator] = self._sanitize_data_iterators(data_iterator)
@@ -829,13 +877,17 @@ def load_checkpoint(checkpoint, ...)
         """
 
         if self.mode == RerunMode.DISABLED:
-            if _safe_get_rank() == 0:
-                logger.warning(
-                    "RerunStateMachine disabled via CLI, ignoring machine state saved in checkpoint"
-                )
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "RerunStateMachine disabled via CLI, ignoring machine state saved in checkpoint",
+            )
             return
-        if _safe_get_rank() == 0:
-            logger.warning("Getting RerunStateMachine state from checkpoint. Will rerun step.")
+        log_single_rank(
+            logger,
+            logging.WARNING,
+            "Getting RerunStateMachine state from checkpoint. Will rerun step.",
+        )
         self.mode = state_dict["mode"]
         self.current_iteration = state_dict["current_iteration"]
         self.state = state_dict["state"]
@@ -874,11 +926,14 @@ def _sanitize_data_iterators(
     def _get_validation_call_info(self, message: str) -> Call:
         """Internal method to get the context about the caller to validate_result()."""
 
-        frame: inspect.frame = inspect.currentframe()
+        frame = inspect.currentframe()
+        assert frame is not None
+        assert frame.f_back is not None
         frame = frame.f_back.f_back
+        assert frame is not None
         filename: str = inspect.getframeinfo(frame).filename
         lineno: int = frame.f_lineno
-        rank: int = _safe_get_rank()
+        rank: int = safe_get_rank()
         caller = Caller(message=message, rank=rank)
         self.validation_counts[caller] += 1
         sequence: int = self.validation_counts[caller]
@@ -949,16 +1004,15 @@ def _log_validation_error_to_file(
         if self.result_rejected_tracker_filename is not None:
             # Append to log.
             try:
-                rank: int = _safe_get_rank()
+                rank: int = safe_get_rank()
                 node: str = os.uname()[1]
                 device: int = torch.cuda.current_device()
                 with open(self.result_rejected_tracker_filename, "a") as f:
-                    print(
+                    f.write(
                         f"ts={datetime.datetime.now()} node={node} device={device} "
                         f"jobID={os.getenv('SLURM_JOBID', 'N/A')} rank={rank} "
                         f"iteration={self.current_iteration} status={status} result={result} "
-                        f"message='{message}'",
-                        file=f,
+                        f"message='{message}'\n"
                     )
             except Exception as e:
                 logger.error(f"Could not log validation error! ({e})")
@@ -1013,13 +1067,21 @@ def get_skipped_iterations_from_tracker_file(cls, tracker_file_name: str) -> lis
                             if len(iterations_seen_by_job[job][iteration]) > 1:
                                 iterations_to_ignore.add(iteration)
         except Exception as e:
-            logger.error(f"Could not parse iterations to skip in tracker file! ({e})")
+            log_single_rank(
+                logger, logging.ERROR, f"Could not parse iterations to skip in tracker file! ({e})"
+            )
         iterations_to_skip = sorted(iterations_to_potentially_skip - iterations_to_ignore)
-        logger.warning(f"Will skip these iterations from tracker file: {iterations_to_skip}")
+        log_single_rank(
+            logger,
+            logging.WARNING,
+            f"Will skip these iterations from tracker file: {iterations_to_skip}",
+        )
         if len(iterations_to_ignore) > 0:
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 "Will not skip these iterations due to multiple rank errors: "
-                f"{sorted(iterations_to_ignore)}"
+                f"{sorted(iterations_to_ignore)}",
             )
         return iterations_to_skip
 
@@ -1059,7 +1121,7 @@ def __next__(self) -> Any:
             n = self.saved_microbatches[self.replay_pos]
             self.replay_pos += 1
             return n
-        n: Any = next(self.iterable)
+        n = next(self.iterable)
         if get_rerun_state_machine().get_mode() != RerunMode.DISABLED:
             self.saved_microbatches.append(n)
         return n
@@ -1116,7 +1178,7 @@ def record(self, data: float) -> None:
             if self.pos < self.max_size:
                 self.samples.append(data)
             else:
-                self.samples[self.pos % self.self.max_size] = data
+                self.samples[self.pos % self.max_size] = data
             self.pos += 1
             if data > self.max:
                 self.max = data
@@ -1210,7 +1272,7 @@ def maybe_inject(self) -> bool:
         if not self.should_inject_errors or self.injected_error_type is not None:
             return False
         r: int = (
-            random.randint(0, self.error_injection_rate - 1) + _safe_get_rank()
+            random.randint(0, self.error_injection_rate - 1) + safe_get_rank()
         ) % self.error_injection_rate
         if r != 0:
             return False
@@ -1297,8 +1359,9 @@ def get_rerun_state_machine() -> RerunStateMachine:
     """Helper function to return the singleton instance of the rerun machine."""
 
     if _GLOBAL_RERUN_STATE_MACHINE is None:
-        logger.warning("Implicit initialization of Rerun State Machine!")
+        log_single_rank(logger, logging.WARNING, "Implicit initialization of Rerun State Machine!")
         initialize_rerun_state_machine()
+        assert _GLOBAL_RERUN_STATE_MACHINE is not None
     return _GLOBAL_RERUN_STATE_MACHINE
 
 
@@ -1310,19 +1373,6 @@ def _set_rerun_state_machine(rerun_state_machine) -> None:
     _GLOBAL_RERUN_STATE_MACHINE = rerun_state_machine
 
 
-def _safe_get_rank() -> int:
-    """Internal function that safely checks and returns the rank of the caller."""
-
-    if torch.distributed.is_initialized():
-        return torch.distributed.get_rank()
-
-    # If torch.distributed is not initialized, try to read environment variables.
-    try:
-        return int(os.environ.get("RANK", 0))
-    except (ValueError, TypeError):
-        return 0
-
-
 def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float:
     """Internal function that implements the default compare_func.
 
diff --git a/megatron/core/resharding/__init__.py b/megatron/core/resharding/__init__.py
new file mode 100644
index 00000000000..083c4518c0e
--- /dev/null
+++ b/megatron/core/resharding/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from .execution import execute_reshard_plan
+from .planner import build_centralized_reshard_plan
+from .refit import (
+    clear_service_cache,
+    get_or_create_service,
+    reshard_model_weights,
+    swap_model_weights,
+)
+from .utils import ParameterMetadata, ReshardPlan, ShardingDescriptor, TransferOp
+
+__all__ = [
+    "build_centralized_reshard_plan",
+    "execute_reshard_plan",
+    "swap_model_weights",
+    "reshard_model_weights",
+    "get_or_create_service",
+    "clear_service_cache",
+    "ParameterMetadata",
+    "ShardingDescriptor",
+    "TransferOp",
+    "ReshardPlan",
+]
diff --git a/megatron/core/resharding/copy_services/__init__.py b/megatron/core/resharding/copy_services/__init__.py
new file mode 100644
index 00000000000..447588f7b3a
--- /dev/null
+++ b/megatron/core/resharding/copy_services/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+from .base import CopyService
+from .nccl_copy_service import NCCLCopyService
+from .nvshmem_copy_service import NVSHMEMCopyService
+
+__all__ = ["CopyService", "NCCLCopyService", "NVSHMEMCopyService"]
diff --git a/megatron/core/resharding/copy_services/base.py b/megatron/core/resharding/copy_services/base.py
new file mode 100644
index 00000000000..d7b9205ba83
--- /dev/null
+++ b/megatron/core/resharding/copy_services/base.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class CopyService(ABC):
+    """Abstract interface for submitting and executing batched P2P copy operations."""
+
+    @abstractmethod
+    def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
+        """Register a tensor send from the current rank to ``dest_rank``."""
+        ...
+
+    @abstractmethod
+    def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
+        """Register a tensor receive into ``dest_tensor`` from ``src_rank``."""
+        ...
+
+    @abstractmethod
+    def run(self):
+        """Execute all previously submitted send/recv operations as a single batch."""
+        ...
diff --git a/megatron/core/resharding/copy_services/gloo_copy_service.py b/megatron/core/resharding/copy_services/gloo_copy_service.py
new file mode 100644
index 00000000000..c9c83ca74a5
--- /dev/null
+++ b/megatron/core/resharding/copy_services/gloo_copy_service.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import torch
+import torch.distributed as dist
+
+from .base import CopyService
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SendOp:
+    """Simple container describing a single send operation."""
+
+    task_id: int | None
+    tensor: torch.Tensor
+    dest_rank: int
+
+
+@dataclass
+class RecvOp:
+    """Simple container describing a single receive operation."""
+
+    task_id: int | None
+    tensor: torch.Tensor
+    src_rank: int
+
+
+class GlooCopyService(CopyService):
+    """
+    CopyService implementation that routes refit traffic over a CPU/Gloo
+    process group instead of NCCL.
+    """
+
+    def __init__(self):
+        self.rank = dist.get_rank()
+        self.world_size = dist.get_world_size()
+        self.gloo_pg = dist.new_group(backend="gloo")
+        self.send_ops: List[SendOp] = []
+        self.recv_ops: List[Tuple[RecvOp, torch.Tensor]] = []
+        self._copy_stream = torch.cuda.Stream()
+        if self.rank == 0:
+            logger.info(
+                f"GlooCopyService initialized on rank {self.rank} with {self.world_size} ranks"
+            )
+
+    def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
+        self.send_ops.append(SendOp(task_id=None, tensor=src_tensor, dest_rank=dest_rank))
+
+    def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, dest_rank: int):
+        """Submit a send operation with a unique task identifier."""
+        self.send_ops.append(SendOp(task_id=task_id, tensor=src_tensor, dest_rank=dest_rank))
+
+    def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
+        """Submit a receive operation."""
+        # Allocate a CPU buffer that matches the destination view; we'll
+        # copy into dest_tensor after the Gloo recv completes.
+        cpu_buffer = torch.empty_like(dest_tensor, device="cpu").contiguous()
+        self.recv_ops.append(
+            (RecvOp(task_id=None, tensor=cpu_buffer, src_rank=src_rank), dest_tensor)
+        )
+
+    def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor, src_rank: int):
+        """Submit a receive operation with a unique task identifier."""
+        cpu_buffer = torch.empty_like(dest_tensor, device="cpu").contiguous()
+        self.recv_ops.append(
+            (RecvOp(task_id=task_id, tensor=cpu_buffer, src_rank=src_rank), dest_tensor)
+        )
+
+    def run(self):
+        total_ops = len(self.send_ops) + len(self.recv_ops)
+        if self.rank == 0:
+            logger.info(
+                f"GlooCopyService rank {self.rank}: executing batched communication: "
+                f"{len(self.send_ops)} sends + {len(self.recv_ops)} recvs = {total_ops} ops"
+            )
+
+        p2p_ops: List[dist.P2POp] = []
+
+        # Short-circuit self transfers into local device copies.
+        local_sends = [op for op in self.send_ops if op.dest_rank == self.rank]
+        remote_sends = [op for op in self.send_ops if op.dest_rank != self.rank]
+        local_recvs = [(recv, dst) for (recv, dst) in self.recv_ops if recv.src_rank == self.rank]
+        remote_recvs = [(recv, dst) for (recv, dst) in self.recv_ops if recv.src_rank != self.rank]
+
+        if local_sends or local_recvs:
+            local_sends_by_id = {op.task_id: op for op in local_sends}
+            if None in local_sends_by_id:
+                raise RuntimeError(
+                    "GlooCopyService: local send missing task_id; "
+                    "use submit_send_with_id/submit_recv_with_id for local copies"
+                )
+            local_recvs_by_id = {recv.task_id: (recv, dst) for (recv, dst) in local_recvs}
+            if None in local_recvs_by_id:
+                raise RuntimeError(
+                    "GlooCopyService: local recv missing task_id; "
+                    "use submit_send_with_id/submit_recv_with_id for local copies"
+                )
+            if len(local_sends_by_id) != len(local_sends) or len(local_recvs_by_id) != len(
+                local_recvs
+            ):
+                raise RuntimeError(
+                    f"GlooCopyService: unmatched local ops on rank {self.rank}: "
+                    f"{len(local_sends)} local sends vs {len(local_recvs)} local recvs"
+                )
+            for task_id, (recv_op, dst_tensor) in local_recvs_by_id.items():
+                send_op = local_sends_by_id.get(task_id)
+                if send_op is None:
+                    raise RuntimeError(
+                        f"GlooCopyService: missing local send for task_id={task_id} "
+                        f"on rank {self.rank}"
+                    )
+                with torch.no_grad():
+                    src_tensor = send_op.tensor
+                    if dst_tensor.device != src_tensor.device:
+                        dst_tensor.copy_(src_tensor.to(dst_tensor.device))
+                    else:
+                        dst_tensor.copy_(src_tensor)
+
+        # Build Gloo P2P ops over CPU tensors. For sends we clone to CPU;
+        # for recvs we use the preallocated CPU buffers.
+        for op in remote_sends:
+            cpu_tensor = op.tensor.detach().to("cpu").contiguous()
+            p2p_ops.append(dist.P2POp(dist.isend, cpu_tensor, op.dest_rank, group=self.gloo_pg))
+        for recv, _dst_tensor in remote_recvs:
+            p2p_ops.append(dist.P2POp(dist.irecv, recv.tensor, recv.src_rank, group=self.gloo_pg))
+
+        if p2p_ops:
+            reqs = dist.batch_isend_irecv(p2p_ops)
+            for req in reqs:
+                req.wait()
+
+        # Copy received CPU buffers back into the original destination tensors.
+        for recv, dst_tensor in remote_recvs:
+            if dst_tensor.is_cuda:
+                dst_tensor.copy_(recv.tensor.to(dst_tensor.device))
+            else:
+                dst_tensor.copy_(recv.tensor)
+
+        if self._copy_stream is not None:
+            torch.cuda.current_stream().wait_stream(self._copy_stream)
+
+        if self.rank == 0:
+            logger.info("GlooCopyService: batched communication completed")
+        self.send_ops.clear()
+        self.recv_ops.clear()
diff --git a/megatron/core/resharding/copy_services/nccl_copy_service.py b/megatron/core/resharding/copy_services/nccl_copy_service.py
new file mode 100644
index 00000000000..8724279b991
--- /dev/null
+++ b/megatron/core/resharding/copy_services/nccl_copy_service.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import List
+
+import torch
+import torch.distributed as dist
+
+from .base import CopyService
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SendOp:
+    """Simple container describing a single NCCL send operation."""
+
+    task_id: int | None
+    tensor: torch.Tensor
+    dest_rank: int
+
+
+@dataclass
+class RecvOp:
+    """Simple container describing a single NCCL receive operation."""
+
+    task_id: int | None
+    tensor: torch.Tensor
+    src_rank: int
+
+
+class NCCLCopyService(CopyService):
+    """
+    Thin wrapper around torch.distributed batch_isend_irecv to submit and execute
+    a batch of point-to-point sends and recvs.
+    """
+
+    def __init__(self):
+        self.rank = dist.get_rank()
+        self.world_size = dist.get_world_size()
+        self.send_ops: List[SendOp] = []
+        self.recv_ops: List[RecvOp] = []
+        # Dedicated stream for local (same-rank) copies to avoid unnecessary
+        # serialization with work on the default stream.
+        self._copy_stream = torch.cuda.Stream()
+        if self.rank == 0:
+            logger.info(f"NCCLCopyService initialized with {self.world_size} ranks")
+
+    def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
+        self.send_ops.append(SendOp(task_id=None, tensor=src_tensor, dest_rank=dest_rank))
+
+    def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, dest_rank: int):
+        """Submit a send operation with a unique task identifier."""
+        self.send_ops.append(SendOp(task_id=task_id, tensor=src_tensor, dest_rank=dest_rank))
+
+    def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
+        """Submit a receive operation."""
+        self.recv_ops.append(RecvOp(task_id=None, tensor=dest_tensor, src_rank=src_rank))
+
+    def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor, src_rank: int):
+        """Submit a receive operation with a unique task identifier."""
+        self.recv_ops.append(RecvOp(task_id=task_id, tensor=dest_tensor, src_rank=src_rank))
+
+    def run(self):
+        total_ops = len(self.send_ops) + len(self.recv_ops)
+        if self.rank == 0:
+            logger.info(
+                "Executing batched communication: %d sends + %d recvs = %d ops",
+                len(self.send_ops),
+                len(self.recv_ops),
+                total_ops,
+            )
+
+        local_sends = [op for op in self.send_ops if op.dest_rank == self.rank]
+        remote_sends = [op for op in self.send_ops if op.dest_rank != self.rank]
+        local_recvs = [op for op in self.recv_ops if op.src_rank == self.rank]
+        remote_recvs = [op for op in self.recv_ops if op.src_rank != self.rank]
+
+        if local_sends or local_recvs:
+            local_sends_by_id = {op.task_id: op for op in local_sends}
+            if None in local_sends_by_id:
+                raise RuntimeError(
+                    "NCCLCopyService: local send missing task_id; "
+                    "use submit_send_with_id/submit_recv_with_id for local copies"
+                )
+            local_recvs_by_id = {op.task_id: op for op in local_recvs}
+            if None in local_recvs_by_id:
+                raise RuntimeError(
+                    "NCCLCopyService: local recv missing task_id; "
+                    "use submit_send_with_id/submit_recv_with_id for local copies"
+                )
+            if len(local_sends_by_id) != len(local_sends) or len(local_recvs_by_id) != len(
+                local_recvs
+            ):
+                raise RuntimeError(
+                    f"NCCLCopyService: unmatched local ops on rank {self.rank}: "
+                    f"{len(local_sends)} local sends vs {len(local_recvs)} local recvs"
+                )
+            for task_id, recv_op in local_recvs_by_id.items():
+                send_op = local_sends_by_id.get(task_id)
+                if send_op is None:
+                    raise RuntimeError(
+                        f"NCCLCopyService: missing local send for task_id={task_id} "
+                        f"on rank {self.rank}"
+                    )
+                with torch.no_grad():
+                    with torch.cuda.stream(self._copy_stream):
+                        recv_op.tensor.copy_(send_op.tensor)
+
+        p2p_ops = []
+        for op in remote_sends:
+            p2p_ops.append(dist.P2POp(dist.isend, op.tensor, op.dest_rank))
+        for op in remote_recvs:
+            p2p_ops.append(dist.P2POp(dist.irecv, op.tensor, op.src_rank))
+
+        if p2p_ops:
+            reqs = dist.batch_isend_irecv(p2p_ops)
+            for req in reqs:
+                req.wait()
+
+        # Make sure the copy stream is finished
+        torch.cuda.current_stream().wait_stream(self._copy_stream)
+
+        if self.rank == 0:
+            logger.info("Batched communication completed")
+        self.send_ops.clear()
+        self.recv_ops.clear()
diff --git a/megatron/core/resharding/copy_services/nvshmem_copy_service.py b/megatron/core/resharding/copy_services/nvshmem_copy_service.py
new file mode 100644
index 00000000000..8d231de5339
--- /dev/null
+++ b/megatron/core/resharding/copy_services/nvshmem_copy_service.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+from __future__ import annotations
+
+import logging
+from typing import Dict
+
+import torch
+import torch.distributed as dist
+
+from ..nvshmem_copy_service import RemoteCopyService
+from .base import CopyService
+
+logger = logging.getLogger(__name__)
+
+
+class NVSHMEMCopyService(CopyService):
+    """CopyService implementation backed by NVSHMEM RemoteCopyService."""
+
+    def __init__(self):
+        if not dist.is_initialized():
+            raise RuntimeError("torch.distributed must be initialized before NVSHMEMCopyService()")
+
+        self.rank = dist.get_rank()
+        self._remote = RemoteCopyService()
+        # Lazily initialized on first use to avoid side effects at import time
+        self._initialized = False
+
+        # NOTE: keep the original typed tensors here (not uint8 views) so local copies
+        # preserve shape/strides semantics and avoid byte-offset pitfalls.
+        self._local_send_ops: Dict[int, torch.Tensor] = {}
+        self._local_recv_ops: Dict[int, torch.Tensor] = {}
+        self._local_copy_stream = torch.cuda.Stream()
+
+        logger.info("NVSHMEMCopyService constructed")
+
+    def _ensure_initialized(self):
+        if not self._initialized:
+            self._remote.init(log_level="INFO")
+            self._initialized = True
+            logger.info(
+                "NVSHMEMCopyService initialized: PE %d / %d", self._remote.my_pe, self._remote.n_pes
+            )
+
+    def submit_send(self, src_tensor: torch.Tensor, dest_rank: int):
+        """
+        Basic CopyService API is not rich enough to drive the NVSHMEM planner
+        (it lacks a globally shared task identifier), so this method is kept
+        only for interface compatibility and should not be used directly.
+
+        The resharding path calls into NVSHMEMCopyService via the
+        submit_send_with_id/submit_recv_with_id helpers instead.
+        """
+        raise RuntimeError(
+            "NVSHMEMCopyService.submit_send() is not supported; "
+            "use submit_send_with_id(...) from execute_reshard_plan."
+        )
+
+    def submit_recv(self, dest_tensor: torch.Tensor, src_rank: int):
+        raise RuntimeError(
+            "NVSHMEMCopyService.submit_recv() is not supported; "
+            "use submit_recv_with_id(...) from execute_reshard_plan."
+        )
+
+    #
+    # New helper API used from execute_reshard_plan via monkey-patching:
+    # we avoid changing the existing execute_reshard_plan signature by adding
+    # a small adapter layer that batches up matched send/recv slices.
+    #
+
+    def submit_send_with_id(self, task_id: int, src_tensor: torch.Tensor, dest_rank: int):
+        """Register a send with an explicit, globally shared task_id."""
+        self._ensure_initialized()
+
+        if not src_tensor.is_contiguous():
+            src_tensor = src_tensor.contiguous()
+
+        # Local transfers: keep them out of RemoteCopyService entirely.
+        if dest_rank == self.rank:
+            self._local_send_ops[task_id] = src_tensor
+            return
+
+        num_bytes = src_tensor.numel() * src_tensor.element_size()
+        src_bytes = src_tensor.view(torch.uint8)
+
+        logger.debug(
+            "NVSHMEMCopyService: register_send task_id=%d, %d bytes (%d → %d)",
+            task_id,
+            num_bytes,
+            self.rank,
+            dest_rank,
+        )
+
+        # Use public API on RemoteCopyService
+        self._remote.register_send(
+            task_id=task_id, src_tensor=src_bytes, src_pos=0, size=num_bytes, dest_pe=dest_rank
+        )
+
+    def submit_recv_with_id(self, task_id: int, dest_tensor: torch.Tensor, src_rank: int):
+        """Register a recv with an explicit, globally shared task_id."""
+        self._ensure_initialized()
+
+        if not dest_tensor.is_contiguous():
+            dest_tensor = dest_tensor.contiguous()
+
+        # Local transfers: keep them out of RemoteCopyService entirely.
+        if src_rank == self.rank:
+            self._local_recv_ops[task_id] = dest_tensor
+            return
+
+        num_bytes = dest_tensor.numel() * dest_tensor.element_size()
+        dst_bytes = dest_tensor.view(torch.uint8)
+
+        logger.debug(
+            "NVSHMEMCopyService: register_recv task_id=%d, %d bytes (%d ← %d)",
+            task_id,
+            num_bytes,
+            self.rank,
+            src_rank,
+        )
+
+        self._remote.register_receive(
+            task_id=task_id, dest_tensor=dst_bytes, dest_pos=0, size=num_bytes, src_pe=src_rank
+        )
+
+    def run(self):
+        """
+        Execute all registered transfer pairs via NVSHMEM.
+
+        This converts the registered pairs into RemoteCopyService send/receive
+        requests, builds a schedule, runs the pipelined NVSHMEM transfer, and
+        then clears internal state.
+        """
+        self._ensure_initialized()
+
+        # 1) Run same-rank copies (match by task_id), like NCCL backend.
+        if self._local_send_ops or self._local_recv_ops:
+            missing_sends = set(self._local_recv_ops.keys()) - set(self._local_send_ops.keys())
+            missing_recvs = set(self._local_send_ops.keys()) - set(self._local_recv_ops.keys())
+            if missing_sends or missing_recvs:
+                raise RuntimeError(
+                    "NVSHMEMCopyService: unmatched local ops on rank "
+                    f"{self.rank}: missing_sends={sorted(list(missing_sends))[:10]} "
+                    f"missing_recvs={sorted(list(missing_recvs))[:10]}"
+                )
+
+            with torch.no_grad():
+                with torch.cuda.stream(self._local_copy_stream):
+                    for task_id, dst in self._local_recv_ops.items():
+                        src = self._local_send_ops[task_id]
+                        if src.numel() != dst.numel() or src.element_size() != dst.element_size():
+                            raise RuntimeError(
+                                "NVSHMEMCopyService: local copy size mismatch on rank "
+                                f"{self.rank} task_id={task_id}: "
+                                f"src=({tuple(src.shape)}, {src.dtype}) "
+                                f"dst=({tuple(dst.shape)}, {dst.dtype})"
+                            )
+                        dst.copy_(src, non_blocking=True)
+
+            torch.cuda.current_stream().wait_stream(self._local_copy_stream)
+            self._local_send_ops.clear()
+            self._local_recv_ops.clear()
+
+        # 2) Execute remote schedule (if any remote sends/recvs were registered).
+        if not self._remote.send_requests and not self._remote.receive_requests:
+            logger.info("NVSHMEMCopyService: no remote requests; local copies complete")
+            return
+
+        logger.info("NVSHMEMCopyService: building NVSHMEM schedule and executing")
+        self._remote.schedule()
+        self._remote.run()
+        self._remote.clear_requests()
+        logger.info("NVSHMEMCopyService: NVSHMEM transfers complete")
diff --git a/megatron/core/resharding/execution.py b/megatron/core/resharding/execution.py
new file mode 100644
index 00000000000..6a7779406d0
--- /dev/null
+++ b/megatron/core/resharding/execution.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+import logging
+from typing import List, Tuple
+
+import torch
+import torch.distributed as dist
+
+from .copy_services.base import CopyService
+from .utils import ReshardPlan
+
+logger = logging.getLogger(__name__)
+
+
+def execute_reshard_plan(
+    plan: ReshardPlan,
+    src_module: torch.nn.Module,
+    dst_module: torch.nn.Module,
+    service: CopyService,
+) -> None:
+    """
+    Execute a reshard plan (from centralized controller).
+    A communication service must be provided to abstract transport.
+    Expected service API: submit_send(tensor, dest_rank), submit_recv(tensor, src_rank), run().
+    """
+
+    src_params = {name: p for name, p in src_module.named_parameters(recurse=True)}
+    dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)}
+    submit_send_with_id = getattr(service, "submit_send_with_id", None)
+    submit_recv_with_id = getattr(service, "submit_recv_with_id", None)
+
+    # Submit sends
+    for op in plan.send_ops:
+        src_param = src_params.get(op.param_name)
+        if src_param is not None:
+            src_view = src_param.data[op.my_slice].contiguous()
+            if submit_send_with_id is not None and op.task_id is not None:
+                submit_send_with_id(op.task_id, src_view, op.peer_rank)
+            else:
+                service.submit_send(src_view, op.peer_rank)
+
+    # Submit recvs
+    recv_writebacks: List[Tuple[torch.Tensor, torch.nn.Parameter, tuple[slice, ...]]] = []
+    for op in plan.recv_ops:
+        dst_param = dst_params.get(op.param_name)
+        if dst_param is not None:
+            dst_slice_view = dst_param.data[op.my_slice]
+            recv_buffer = torch.empty_like(dst_slice_view.contiguous())
+            if submit_recv_with_id is not None and op.task_id is not None:
+                submit_recv_with_id(op.task_id, recv_buffer, op.peer_rank)
+            else:
+                service.submit_recv(recv_buffer, op.peer_rank)
+            recv_writebacks.append((recv_buffer, dst_param, op.my_slice))
+
+    # Execute
+    logger.info(f"Executing {len(plan.send_ops)} sends + {len(plan.recv_ops)} recvs")
+    service.run()
+    dist.barrier()
+
+    # Write back received buffers into their destination parameter slices
+    for recv_buffer, dst_param, dst_slice in recv_writebacks:
+        with torch.no_grad():
+            dst_param.data[dst_slice].copy_(recv_buffer)
+
+    logger.info("Reshard complete")
diff --git a/megatron/core/resharding/nvshmem_copy_service/__init__.py b/megatron/core/resharding/nvshmem_copy_service/__init__.py
new file mode 100644
index 00000000000..2ab8cde81fe
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+NVSHMEM-based remote copy service and supporting components.
+
+This package is an in-tree integration of the standalone
+`nvshmem_copy_service/python` implementation so that Megatron
+can use it without relying on an external library.
+"""
+
+from . import nvshmem_types
+from .core import GPUResourceManager, KernelLauncher, PipelineExecutor
+from .memory import DoubleBufferManager, TensorPointerExtractor
+from .planning import CommunicationScheduler, GPUExecutionPlanner, TaskSegmenter, WorkloadPacker
+from .service import RemoteCopyService
+
+__all__ = [
+    "RemoteCopyService",
+    "nvshmem_types",
+    "GPUResourceManager",
+    "KernelLauncher",
+    "PipelineExecutor",
+    "DoubleBufferManager",
+    "TensorPointerExtractor",
+    "CommunicationScheduler",
+    "GPUExecutionPlanner",
+    "TaskSegmenter",
+    "WorkloadPacker",
+]
diff --git a/megatron/core/resharding/nvshmem_copy_service/core/__init__.py b/megatron/core/resharding/nvshmem_copy_service/core/__init__.py
new file mode 100644
index 00000000000..f466e925899
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/core/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Core execution components for NVSHMEM operations."""
+
+from .gpu_resource_manager import GPUResourceManager
+from .kernel_launcher import KernelLauncher
+from .pipeline_executor import PipelineExecutor
+
+__all__ = ["GPUResourceManager", "KernelLauncher", "PipelineExecutor"]
diff --git a/megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py b/megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py
new file mode 100644
index 00000000000..6e03b914b26
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/core/gpu_resource_manager.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+GPU resource management for NVSHMEM operations.
+
+Handles NVSHMEM initialization, CUDA device setup, stream management,
+and event lifecycle.
+"""
+
+import logging
+from typing import Dict, Optional
+
+try:
+    import nvshmem.core
+    from cuda.core.experimental import Device
+
+    HAVE_NVSHMEM = True
+except ImportError:
+    HAVE_NVSHMEM = False
+
+import torch
+import torch.distributed as dist
+
+logger = logging.getLogger(__name__)
+
+
+class GPUResourceManager:
+    """Manages GPU resources including NVSHMEM, streams, and events."""
+
+    def __init__(self):
+        self.device = None
+        self.my_pe: int = -1
+        self.n_pes: int = -1
+        self.initialized: bool = False
+
+        # CUDA streams (cuda.core.experimental)
+        self.pack_stream = None
+        self.unpack_stream = None
+        self.send_stream = None
+        self.copy_stream = None
+
+        # PyTorch stream wrappers
+        self.torch_pack_stream = None
+        self.torch_unpack_stream = None
+        self.torch_send_stream = None
+        self.torch_copy_stream = None
+
+        # Stream name to PyTorch stream mapping
+        self._torch_streams: Dict[str, torch.cuda.ExternalStream] = {}
+
+    def init(self) -> None:
+        """
+        Initialize NVSHMEM, CUDA device, and streams.
+
+        Expects torch.distributed to be already initialized.
+        """
+        if self.initialized:
+            return
+
+        if not HAVE_NVSHMEM:
+            raise RuntimeError(
+                "nvshmem.core is not available. Please install nvshmem to use GPUResourceManager."
+            )
+
+        # torch.distributed must be initialized before calling this
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "torch.distributed must be initialized before " "GPUResourceManager.init()"
+            )
+
+        # Get current CUDA device (already set by caller based on LOCAL_RANK)
+        local_rank = torch.cuda.current_device()
+
+        # nvshmem4py requires a cuda.core Device at init time
+        self.device = Device(local_rank)
+        self.device.set_current()
+
+        # Extract rank, nranks from the default process group
+        num_ranks = dist.get_world_size()
+        rank_id = dist.get_rank()
+
+        # Create/Broadcast UniqueID using broadcast_object_list
+        uniqueid = nvshmem.core.get_unique_id(empty=True)
+        if rank_id == 0:
+            uniqueid = nvshmem.core.get_unique_id()
+            broadcast_objects = [uniqueid]
+        else:
+            broadcast_objects = [None]
+
+        # Broadcast ID to all ranks using the default group
+        dist.broadcast_object_list(broadcast_objects, src=0)
+
+        # Barrier to ensure everyone has the ID before NVSHMEM init
+        dist.barrier()
+
+        # Initialize NVSHMEM with the broadcasted UID
+        nvshmem.core.init(
+            device=self.device,
+            uid=broadcast_objects[0],
+            rank=rank_id,
+            nranks=num_ranks,
+            initializer_method="uid",
+        )
+
+        logger.info("NVSHMEM initialized")
+
+        self.my_pe = nvshmem.core.my_pe()
+        self.n_pes = nvshmem.core.n_pes()
+
+        # Create CUDA streams
+        self.pack_stream = self.device.create_stream()
+        self.unpack_stream = self.device.create_stream()
+        self.send_stream = self.device.create_stream()
+        self.copy_stream = self.device.create_stream()
+
+        # Get stream pointers and create PyTorch wrappers
+        _, pack_stream_ptr = self.pack_stream.__cuda_stream__()
+        _, unpack_stream_ptr = self.unpack_stream.__cuda_stream__()
+        _, send_stream_ptr = self.send_stream.__cuda_stream__()
+        _, copy_stream_ptr = self.copy_stream.__cuda_stream__()
+
+        self.torch_pack_stream = torch.cuda.ExternalStream(pack_stream_ptr)
+        self.torch_unpack_stream = torch.cuda.ExternalStream(unpack_stream_ptr)
+        self.torch_send_stream = torch.cuda.ExternalStream(send_stream_ptr)
+        self.torch_copy_stream = torch.cuda.ExternalStream(copy_stream_ptr)
+
+        # Build stream mapping
+        self._torch_streams = {
+            "pack": self.torch_pack_stream,
+            "unpack": self.torch_unpack_stream,
+            "send": self.torch_send_stream,
+            "copy": self.torch_copy_stream,
+        }
+
+        logger.info("Stream mapping built")
+
+        self.initialized = True
+
+        # Initial barrier to ensure all PEs are ready
+        nvshmem.core.barrier_all(stream=self.send_stream)
+
+    def get_stream(self, name: str):
+        """
+        Get CUDA stream by name.
+
+        Args:
+            name: Stream name ('pack', 'unpack', 'send', 'copy')
+
+        Returns:
+            CUDA stream object
+        """
+        streams = {
+            "pack": self.pack_stream,
+            "unpack": self.unpack_stream,
+            "send": self.send_stream,
+            "copy": self.copy_stream,
+        }
+        return streams.get(name)
+
+    def get_torch_stream(self, name: str) -> Optional[torch.cuda.ExternalStream]:
+        """
+        Get PyTorch ExternalStream by name.
+
+        Args:
+            name: Stream name ('pack', 'unpack', 'send', 'copy')
+
+        Returns:
+            PyTorch ExternalStream
+        """
+        return self._torch_streams.get(name)
+
+    def create_events(self, num_events: int = 2):
+        """
+        Create double-buffered CUDA events for pack and unpack operations.
+
+        Args:
+            num_events: Number of events to create for each type
+                (default: 2 for double buffering)
+
+        Returns:
+            tuple: (pack_events, unpack_events) lists of torch.cuda.Event
+        """
+        pack_events = [torch.cuda.Event(enable_timing=False) for _ in range(num_events)]
+        unpack_events = [torch.cuda.Event(enable_timing=False) for _ in range(num_events)]
+        return pack_events, unpack_events
+
+    def finalize(self) -> None:
+        """Cleanup resources (streams are automatically managed by CUDA)."""
+        self.initialized = False
+        self.my_pe = -1
+        self.n_pes = -1
+        # Streams are automatically cleaned up when objects are deleted
diff --git a/megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py b/megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py
new file mode 100644
index 00000000000..4e86d6a9505
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/core/kernel_launcher.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+CUDA kernel management and launching for pack/unpack operations.
+
+Handles kernel compilation, launching, and stream coordination.
+"""
+
+import os
+from typing import Any, Tuple
+
+try:
+    import cupy as cp
+
+    HAVE_CUPY = True
+except ImportError:
+    HAVE_CUPY = False
+
+import torch
+import torch.cuda.nvtx as nvtx
+
+
+class KernelLauncher:
+    """Manages CUDA kernel loading and launching for data pack/unpack operations."""
+
+    def __init__(self):
+        self.chunked_copy_kernel = None
+        # Cached CuPy stream wrappers for efficient kernel launching
+        self.cp_pack_stream = None
+        self.cp_unpack_stream = None
+
+    def load_kernels(self) -> None:
+        """Load and compile CUDA kernels from source."""
+        if not HAVE_CUPY:
+            raise RuntimeError("cupy is not available. Please install cupy to use KernelLauncher.")
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        kernel_path = os.path.join(current_dir, "..", "kernels", "chunked_kernel.cu")
+
+        with open(kernel_path, "r") as f:
+            kernel_source = f.read()
+
+        self.chunked_copy_kernel = cp.RawKernel(
+            kernel_source, "chunked_batched_copy_kernel", options=("-std=c++11",)
+        )
+
+    def set_streams(self, pack_stream, unpack_stream) -> None:
+        """
+        Cache CuPy stream wrappers for kernel launching.
+
+        This eliminates per-launch overhead of stream pointer extraction
+        and CuPy ExternalStream creation.
+
+        Args:
+            pack_stream: CUDA stream for pack operations
+            unpack_stream: CUDA stream for unpack operations
+        """
+        _, pack_stream_ptr = pack_stream.__cuda_stream__()
+        _, unpack_stream_ptr = unpack_stream.__cuda_stream__()
+        self.cp_pack_stream = cp.cuda.ExternalStream(pack_stream_ptr)
+        self.cp_unpack_stream = cp.cuda.ExternalStream(unpack_stream_ptr)
+
+    def launch_pack(
+        self,
+        gpu_plan: Tuple[Any, Any, Any, int],
+        pack_stream,
+        torch_pack_stream: torch.cuda.ExternalStream,
+        pack_event: torch.cuda.Event,
+    ) -> None:
+        """
+        Launch pack kernel to copy data from user tensors to send buffer.
+
+        Args:
+            gpu_plan: Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks)
+                as CuPy arrays
+            pack_stream: CUDA stream (cuda.core.experimental.Stream) - unused,
+                kept for compatibility
+            torch_pack_stream: PyTorch external stream wrapper
+            pack_event: CUDA event to record after kernel launch
+        """
+        nvtx.range_push("Launch Pack Kernel")
+        if not gpu_plan:
+            nvtx.range_pop()
+            return
+
+        # Unpack cached CuPy arrays from gpu_plan
+        cp_src, cp_dst, cp_sizes, num_chunks = gpu_plan
+
+        # Grid/Block configuration
+        THREADS_PER_BLOCK = 1024
+        NUM_BLOCKS = 75
+
+        # Launch kernel using cached CuPy stream
+        assert self.chunked_copy_kernel is not None
+        assert self.cp_pack_stream is not None
+        self.chunked_copy_kernel(
+            (NUM_BLOCKS,),
+            (THREADS_PER_BLOCK,),
+            (cp_src, cp_dst, cp_sizes, num_chunks),
+            stream=self.cp_pack_stream,
+        )
+        nvtx.range_pop()
+        # Record event on PyTorch stream
+        pack_event.record(stream=torch_pack_stream)
+
+    def launch_unpack(
+        self,
+        gpu_plan: Tuple[Any, Any, Any, int],
+        unpack_stream,
+        torch_unpack_stream: torch.cuda.ExternalStream,
+        unpack_event: torch.cuda.Event,
+    ) -> None:
+        """
+        Launch unpack kernel to copy data from receive buffer to user tensors.
+
+        Args:
+            gpu_plan: Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks)
+                as CuPy arrays
+            unpack_stream: CUDA stream (cuda.core.experimental.Stream) - unused,
+            kept for compatibility
+            torch_unpack_stream: PyTorch external stream wrapper
+            unpack_event: CUDA event to record after kernel launch
+        """
+        nvtx.range_push("Launch Unpack Kernel")
+        if not gpu_plan:
+            nvtx.range_pop()
+            return
+
+        # Unpack cached CuPy arrays from gpu_plan
+        cp_src, cp_dst, cp_sizes, num_chunks = gpu_plan
+
+        # Grid/Block configuration
+        THREADS_PER_BLOCK = 1024
+        NUM_BLOCKS = 75
+
+        # Launch kernel using cached CuPy stream
+        assert self.chunked_copy_kernel is not None
+        assert self.cp_unpack_stream is not None
+        self.chunked_copy_kernel(
+            (NUM_BLOCKS,),
+            (THREADS_PER_BLOCK,),
+            (cp_src, cp_dst, cp_sizes, num_chunks),
+            stream=self.cp_unpack_stream,
+        )
+        nvtx.range_pop()
+        # Record event on PyTorch stream
+        unpack_event.record(stream=torch_unpack_stream)
diff --git a/megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py b/megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py
new file mode 100644
index 00000000000..5ba07f9956a
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/core/pipeline_executor.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Pipelined communication execution engine.
+
+Orchestrates the pack/send/unpack pipeline with double-buffering
+and proper stream synchronization.
+"""
+
+from typing import Dict, List, Optional
+
+try:
+    import nvshmem.core
+
+    HAVE_NVSHMEM = True
+except ImportError:
+    HAVE_NVSHMEM = False
+
+import torch
+
+from ..logger import PELogger
+from ..memory.double_buffer_manager import DoubleBufferManager
+from ..nvshmem_types import ReceiveRequest, ScheduledBatch, SendRequest
+from .kernel_launcher import KernelLauncher
+
+
+class PipelineExecutor:
+    """Executes pipelined NVSHMEM communication with pack/send/unpack overlap."""
+
+    def __init__(
+        self, kernel_launcher: KernelLauncher, buffer_manager: DoubleBufferManager, my_pe: int
+    ):
+        """
+        Initialize pipeline executor.
+
+        Args:
+            kernel_launcher: KernelLauncher instance for pack/unpack kernels
+            buffer_manager: DoubleBufferManager for send/recv buffers
+            my_pe: This PE's rank
+        """
+        self.kernel_launcher = kernel_launcher
+        self.buffer_manager = buffer_manager
+        self.my_pe = my_pe
+
+        # Streams (will be set by service)
+        self.pack_stream = None
+        self.unpack_stream = None
+        self.send_stream = None
+        self.copy_stream = None
+
+        self.torch_pack_stream = None
+        self.torch_unpack_stream = None
+        self.torch_copy_stream = None
+
+        # Events for double-buffered synchronization
+        self.pack_events = []
+        self.unpack_events = []
+
+    def set_streams(
+        self,
+        pack_stream,
+        unpack_stream,
+        send_stream,
+        copy_stream,
+        torch_pack_stream,
+        torch_unpack_stream,
+        torch_copy_stream,
+    ):
+        """Set CUDA streams for execution."""
+        self.pack_stream = pack_stream
+        self.unpack_stream = unpack_stream
+        self.send_stream = send_stream
+        self.copy_stream = copy_stream
+
+        self.torch_pack_stream = torch_pack_stream
+        self.torch_unpack_stream = torch_unpack_stream
+        self.torch_copy_stream = torch_copy_stream
+
+    def set_events(self, pack_events: List, unpack_events: List):
+        """Set double-buffered CUDA events."""
+        self.pack_events = pack_events
+        self.unpack_events = unpack_events
+
+    def execute_pipeline(
+        self, iter_schedules: List[Dict[str, Optional[ScheduledBatch]]], num_iterations: int
+    ) -> None:
+        """
+        Execute pipelined communication.
+
+        Pipeline stages:
+        1. Pack NEXT iteration (async)
+        2. Unpack PRIOR iteration (async)
+        3. Send CURRENT iteration (sync)
+        4. Barrier
+        5. Wait for async pack/unpack to complete
+
+        Args:
+            iter_schedules: List of iteration schedules
+            num_iterations: Total number of iterations
+        """
+        PELogger.info(f"Executing pipeline: {num_iterations} iterations")
+
+        # Priming: Pack iteration 0 and WAIT for completion
+        if num_iterations > 0 and iter_schedules[0]["send"]:
+            torch.cuda.nvtx.range_push("Priming")
+            PELogger.debug("Priming: Packing iteration 0")
+            self._launch_pack(0, iter_schedules[0]["send"])
+            self.pack_events[0].synchronize()
+            torch.cuda.nvtx.range_pop()
+
+        for i in range(num_iterations):
+            torch.cuda.nvtx.range_push(f"Iteration {i}")
+            has_send = iter_schedules[i]["send"] is not None
+            has_recv = iter_schedules[i]["recv"] is not None
+            has_next_send = i + 1 < num_iterations and iter_schedules[i + 1]["send"] is not None
+            has_prior_recv = i > 0 and iter_schedules[i - 1]["recv"] is not None
+
+            slot = i % 2
+
+            # Log iteration start
+            send_info = (
+                f" → PE {iter_schedules[i]['send'].dest_pe} "
+                f"({iter_schedules[i]['send'].total_size} bytes)"
+                if has_send
+                else ""
+            )
+            recv_info = (
+                f" ← PE {iter_schedules[i]['recv'].src_pe} "
+                f"({iter_schedules[i]['recv'].total_size} bytes)"
+                if has_recv
+                else ""
+            )
+            PELogger.debug(f"Iteration {i}/{num_iterations}: slot={slot}{send_info}{recv_info}")
+
+            # Step 1: Pack NEXT iteration (async)
+            if has_next_send:
+                torch.cuda.nvtx.range_push("Step 1: Pack Next")
+                next_batch = iter_schedules[i + 1]["send"]
+                assert next_batch is not None
+                PELogger.debug(
+                    f"  Pack next (iter {i+1}): {len(next_batch.tasks)} tasks "
+                    f"→ PE {next_batch.dest_pe}"
+                )
+                self._launch_pack(i + 1, next_batch)
+                torch.cuda.nvtx.range_pop()
+
+            # Step 2: Unpack PRIOR iteration (async)
+            if has_prior_recv:
+                torch.cuda.nvtx.range_push("Step 2: Unpack Prior")
+                prior_batch = iter_schedules[i - 1]["recv"]
+                assert prior_batch is not None
+                PELogger.debug(
+                    f"  Unpack prior (iter {i-1}): {prior_batch.total_size} bytes "
+                    f"← PE {prior_batch.src_pe}"
+                )
+                self._launch_unpack(i - 1, prior_batch)
+                torch.cuda.nvtx.range_pop()
+
+            # Step 3: Send CURRENT iteration
+            if has_send:
+                torch.cuda.nvtx.range_push("Step 3: Send Current")
+                batch = iter_schedules[i]["send"]
+                assert batch is not None
+                transfer_size = batch.total_size
+                PELogger.debug(f"  Send current: {transfer_size} bytes → PE {batch.dest_pe}")
+
+                nvshmem.core.put(
+                    self.buffer_manager.recv_slots[slot][0:transfer_size],
+                    self.buffer_manager.send_slots[slot][0:transfer_size],
+                    batch.dest_pe,
+                    stream=self.send_stream,
+                )
+                torch.cuda.nvtx.range_pop()
+
+            # Ensure send completes
+            self.send_stream.sync()
+            nvshmem.core.quiet(stream=self.send_stream)
+
+            # Step 4: Global barrier
+            torch.cuda.nvtx.range_push("Step 4: Barrier")
+            nvshmem.core.barrier_all(stream=self.send_stream)
+            self.send_stream.sync()
+            torch.cuda.nvtx.range_pop()
+
+            # Step 5: Wait for async pack/unpack to complete
+            torch.cuda.nvtx.range_push("Step 5: Wait Async")
+            if has_prior_recv:
+                self.unpack_events[(i - 1) % 2].synchronize()
+            if has_next_send:
+                self.pack_events[(i + 1) % 2].synchronize()
+            torch.cuda.nvtx.range_pop()
+
+            torch.cuda.nvtx.range_pop()
+
+        # Final unpack for last iteration
+        if num_iterations > 0 and iter_schedules[num_iterations - 1]["recv"]:
+            torch.cuda.nvtx.range_push("Final Unpack")
+            PELogger.debug(f"Final unpack: iteration {num_iterations-1}")
+            last_recv = iter_schedules[num_iterations - 1]["recv"]
+            assert last_recv is not None
+            self._launch_unpack(num_iterations - 1, last_recv)
+            self.unpack_events[(num_iterations - 1) % 2].synchronize()
+            torch.cuda.nvtx.range_pop()
+
+        PELogger.info(f"Pipeline complete: {num_iterations} iterations")
+
+    def _launch_pack(self, iteration: int, batch: ScheduledBatch) -> None:
+        """Launch pack kernel for given iteration."""
+        if not batch.gpu_plan:
+            return
+
+        self.kernel_launcher.launch_pack(
+            batch.gpu_plan,
+            self.pack_stream,
+            self.torch_pack_stream,
+            self.pack_events[iteration % 2],
+        )
+
+    def _launch_unpack(self, iteration: int, batch: ScheduledBatch) -> None:
+        """Launch unpack kernel for given iteration."""
+        if not batch.gpu_plan:
+            return
+
+        self.kernel_launcher.launch_unpack(
+            batch.gpu_plan,
+            self.unpack_stream,
+            self.torch_unpack_stream,
+            self.unpack_events[iteration % 2],
+        )
+
+    def process_self_moves(
+        self, send_requests: List[SendRequest], receive_requests: List[ReceiveRequest]
+    ) -> None:
+        """
+        Handle same-PE transfers (where src_pe == dest_pe == my_pe).
+
+        Uses PyTorch copy on the copy stream for efficiency.
+
+        Args:
+            send_requests: List of send requests
+            receive_requests: List of receive requests
+        """
+        # Match send/recv requests where src_pe == dest_pe == my_pe
+        local_sends = {r.task_id: r for r in send_requests if r.dest_pe == self.my_pe}
+        local_recvs = [r for r in receive_requests if r.src_pe == self.my_pe]
+
+        if local_recvs:
+            PELogger.debug(f"Processing {len(local_recvs)} self-moves")
+
+        num_processed = 0
+        with torch.cuda.stream(self.torch_copy_stream):
+            for recv_req in local_recvs:
+                if recv_req.task_id in local_sends:
+                    send_req = local_sends[recv_req.task_id]
+                    PELogger.debug(
+                        "  Self-move: task_id=%d, size=%d bytes", recv_req.task_id, send_req.size
+                    )
+
+                    # Create views of the tensors with offsets
+                    src_view = send_req.src_tensor[
+                        send_req.src_pos : send_req.src_pos + send_req.size
+                    ]
+                    dest_view = recv_req.dest_tensor[
+                        recv_req.dest_pos : recv_req.dest_pos + send_req.size
+                    ]
+
+                    # Async copy on the copy stream
+                    dest_view.copy_(src_view, non_blocking=True)
+                    num_processed += 1
+
+        # Synchronize the PyTorch stream
+        self.torch_copy_stream.synchronize()
+
+        if num_processed > 0:
+            PELogger.info("Self-moves complete: %d transfers", num_processed)
diff --git a/megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu b/megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu
new file mode 100644
index 00000000000..e5b8fcc9a85
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/kernels/chunked_kernel.cu
@@ -0,0 +1,103 @@
+
+#include <cuda_runtime.h>
+
+// CUDA-compatible types (no C++ standard library headers for NVRTC)
+typedef unsigned char uint8_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t uintptr_t;
+
+// ============================================================================
+// Kernel Configuration Constants (from ChunkedKernel.h)
+// ============================================================================
+
+constexpr int CHUNK_SIZE = 128 * 1024;       // 128KB per chunk
+constexpr int NUM_BLOCKS = 75;               // Fixed grid size
+constexpr int THREADS_PER_BLOCK = 1024;      // Fixed block size
+constexpr int FLOAT4_SIZE = 16;              // 16 bytes per float4
+constexpr int MAX_CHUNKS_PER_BLOCK = 512;    // Max chunks per block for shared memory
+
+extern "C" {
+
+/**
+ * Chunked batched copy kernel implementation
+ *
+ * This kernel performs efficient batched memory copies using:
+ * 1. Contiguous block assignment for better load balancing
+ * 2. Shared memory prefetching of chunk metadata
+ * 3. Vectorized float4 (16-byte) copies for aligned data
+ * 4. Byte-by-byte fallback for unaligned or small data
+ */
+__global__ void chunked_batched_copy_kernel(
+    uint8_t** src_addrs,
+    uint8_t** dst_addrs,
+    size_t* sizes,
+    int total_chunks
+) {
+    // Shared memory for metadata prefetching
+    __shared__ uint8_t* s_src_addrs[MAX_CHUNKS_PER_BLOCK];
+    __shared__ uint8_t* s_dst_addrs[MAX_CHUNKS_PER_BLOCK];
+    __shared__ size_t s_sizes[MAX_CHUNKS_PER_BLOCK];
+
+    // Contiguous block assignment: block i processes chunks [start_chunk, end_chunk)
+    int chunks_per_block = (total_chunks + gridDim.x - 1) / gridDim.x;  // Ceiling division
+    int start_chunk = blockIdx.x * chunks_per_block;
+    int end_chunk = start_chunk + chunks_per_block;
+    if (end_chunk > total_chunks) {
+        end_chunk = total_chunks;
+    }
+    int num_chunks_this_block = end_chunk - start_chunk;
+
+    // Phase 1: Cooperative loading of metadata to shared memory
+    // All 1024 threads cooperate to load metadata from global memory
+    for (int i = threadIdx.x; i < num_chunks_this_block; i += blockDim.x) {
+        int global_chunk_id = start_chunk + i;
+        s_src_addrs[i] = src_addrs[global_chunk_id];
+        s_dst_addrs[i] = dst_addrs[global_chunk_id];
+        s_sizes[i] = sizes[global_chunk_id];
+    }
+    __syncthreads();
+
+    // Phase 2: Process each chunk using metadata from shared memory
+    for (int chunk_id = 0; chunk_id < num_chunks_this_block; chunk_id++) {
+        uint8_t* src = s_src_addrs[chunk_id];
+        uint8_t* dst = s_dst_addrs[chunk_id];
+        size_t size = s_sizes[chunk_id];
+
+        // Check if both src and dst are aligned to 16 bytes for float4 access
+        uintptr_t src_addr = (uintptr_t)src;
+        uintptr_t dst_addr = (uintptr_t)dst;
+        bool is_aligned = ((src_addr % FLOAT4_SIZE) == 0) && ((dst_addr % FLOAT4_SIZE) == 0);
+
+        if (is_aligned && size >= FLOAT4_SIZE) {
+            // Fast path: vectorized float4 copies
+            size_t aligned_size = (size / FLOAT4_SIZE) * FLOAT4_SIZE;
+
+            // All 1024 threads cooperate on float4 copies
+            #pragma unroll 4
+            for (size_t offset = threadIdx.x * FLOAT4_SIZE;
+                 offset < aligned_size;
+                 offset += blockDim.x * FLOAT4_SIZE) {
+                // Vectorized 16-byte load and store
+                float4 data = *((float4*)(src + offset));
+                *((float4*)(dst + offset)) = data;
+            }
+
+            // Handle remaining bytes (< 16 bytes) with byte-by-byte copy
+            for (size_t offset = aligned_size + threadIdx.x;
+                 offset < size;
+                 offset += blockDim.x) {
+                dst[offset] = src[offset];
+            }
+        } else {
+            // Fallback path: byte-by-byte copy for unaligned addresses
+            // Still use all threads for parallelism
+            for (size_t offset = threadIdx.x; offset < size; offset += blockDim.x) {
+                dst[offset] = src[offset];
+            }
+        }
+    }
+}
+
+}
+
+
diff --git a/megatron/core/resharding/nvshmem_copy_service/logger.py b/megatron/core/resharding/nvshmem_copy_service/logger.py
new file mode 100644
index 00000000000..a3c7c1699ad
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/logger.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+
+Per-PE Logger with colored console and file output.
+
+
+
+Similar to the C++ Logger implementation, provides:
+
+- Per-PE colored console output
+
+- Per-PE file logging
+
+- Support for TRACE, DEBUG, INFO, SUMMARY, WARN, ERROR levels
+
+"""
+
+import logging
+import os
+from datetime import datetime
+from typing import Optional
+
+
+class ColoredFormatter(logging.Formatter):
+    """Custom formatter that adds color codes for console output."""
+
+    def __init__(self, fmt: str, pe_id: int, use_color: bool = True):
+        super().__init__(fmt)
+        self.pe_id = pe_id
+        self.use_color = use_color
+
+        # ANSI color codes matching C++ implementation
+        self.colors = {
+            0: "\033[31m",  # Red
+            1: "\033[32m",  # Green
+            2: "\033[33m",  # Yellow
+            3: "\033[34m",  # Blue
+            4: "\033[35m",  # Magenta
+            5: "\033[36m",  # Cyan
+            6: "\033[91m",  # Bright Red
+            7: "\033[92m",  # Bright Green
+        }
+        self.reset = "\033[0m"
+
+    def formatTime(self, record, datefmt=None):
+        ct = self.converter(record.created)
+        if datefmt:
+            s = datetime.fromtimestamp(record.created).strftime(datefmt)
+            # For file logs, replace %f with milliseconds
+            if "%f" in datefmt:
+                s = s.replace("%f", f"{int(record.msecs):03d}")
+        else:
+            s = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
+            s = f"{s}.{int(record.msecs):03d}"
+        return s
+
+    def format(self, record):
+        # Save original message
+        original_msg = record.msg
+
+        if self.use_color and self.pe_id >= 0:
+            color = self.colors.get(self.pe_id, "\033[37m")  # White for others
+            record.msg = f"{color}{record.msg}{self.reset}"
+
+        result = super().format(record)
+
+        # Restore original message for other handlers
+        record.msg = original_msg
+
+        return result
+
+
+class PELogger:
+    """Per-PE logger with colored console and file output."""
+
+    _logger: Optional[logging.Logger] = None
+    _pe_id: int = -1
+    _level: int = logging.INFO
+
+    @classmethod
+    def init(cls, pe_id: int, level: str = "INFO", logs_dir: str = "logs"):
+        """
+        Initialize logger for this PE.
+
+        Args:
+            pe_id: Process element ID
+            level: Log level (TRACE, DEBUG, INFO, WARN, ERROR)
+            logs_dir: Directory for log files
+        """
+        cls._pe_id = pe_id
+
+        # Convert level string to logging level
+        level_map = {
+            "TRACE": logging.DEBUG - 5,  # Custom level below DEBUG
+            "DEBUG": logging.DEBUG,
+            "INFO": logging.INFO,
+            "SUMMARY": logging.INFO,
+            "WARN": logging.WARNING,
+            "WARNING": logging.WARNING,
+            "ERROR": logging.ERROR,
+            "CRITICAL": logging.CRITICAL,
+        }
+        cls._level = level_map.get(level.upper(), logging.INFO)
+
+        # Create logs directory if it doesn't exist
+        os.makedirs(logs_dir, exist_ok=True)
+
+        # Create logger
+        logger_name = f"PE_{pe_id}"
+        cls._logger = logging.getLogger(logger_name)
+        cls._logger.setLevel(cls._level)
+        cls._logger.propagate = False
+
+        # Remove existing handlers to avoid duplicates
+        cls._logger.handlers.clear()
+
+        # 1. Console handler with color
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(cls._level)
+        console_format = "[PE %d] [%%(asctime)s] [%%(levelname)s] %%(message)s" % pe_id
+        console_formatter = ColoredFormatter(console_format, pe_id, use_color=True)
+        console_handler.setFormatter(console_formatter)
+        cls._logger.addHandler(console_handler)
+
+        # 2. File handler without color
+        log_filename = os.path.join(logs_dir, f"pe_{pe_id}.log")
+        file_handler = logging.FileHandler(log_filename, mode="w")
+        file_handler.setLevel(cls._level)
+        file_format = "[PE %d] [%%(asctime)s] [%%(levelname)s] %%(message)s" % pe_id
+        file_formatter = ColoredFormatter(file_format, pe_id, use_color=False)
+        file_handler.setFormatter(file_formatter)
+        cls._logger.addHandler(file_handler)
+
+    @classmethod
+    def set_level(cls, level: str):
+        """Set the logging level."""
+        level_map = {
+            "TRACE": logging.DEBUG - 5,
+            "DEBUG": logging.DEBUG,
+            "INFO": logging.INFO,
+            "SUMMARY": logging.INFO,
+            "WARN": logging.WARNING,
+            "WARNING": logging.WARNING,
+            "ERROR": logging.ERROR,
+            "CRITICAL": logging.CRITICAL,
+        }
+        cls._level = level_map.get(level.upper(), logging.INFO)
+        if cls._logger:
+            cls._logger.setLevel(cls._level)
+            for handler in cls._logger.handlers:
+                handler.setLevel(cls._level)
+
+    @classmethod
+    def trace(cls, msg: str):
+        """Log at TRACE level (most detailed)."""
+        if cls._logger:
+            cls._logger.log(logging.DEBUG - 5, msg)
+
+    @classmethod
+    def debug(cls, msg: str):
+        """Log at DEBUG level."""
+        if cls._logger:
+            cls._logger.debug(msg)
+
+    @classmethod
+    def info(cls, msg: str):
+        """Log at INFO level."""
+        if cls._logger:
+            cls._logger.info(msg)
+
+    @classmethod
+    def summary(cls, msg: str):
+        """Log summary information (INFO level with [SUMMARY] prefix)."""
+        if cls._logger:
+            cls._logger.info(f"[SUMMARY] {msg}")
+
+    @classmethod
+    def warn(cls, msg: str):
+        """Log at WARNING level."""
+        if cls._logger:
+            cls._logger.warning(msg)
+
+    @classmethod
+    def warning(cls, msg: str):
+        """Log at WARNING level (alias for warn)."""
+        cls.warn(msg)
+
+    @classmethod
+    def error(cls, msg: str):
+        """Log at ERROR level."""
+        if cls._logger:
+            cls._logger.error(msg)
+
+    @classmethod
+    def critical(cls, msg: str):
+        """Log at CRITICAL level."""
+        if cls._logger:
+            cls._logger.critical(msg)
+
+    @classmethod
+    def shutdown(cls):
+        """Shutdown the logger and flush all handlers."""
+        if cls._logger:
+            for handler in cls._logger.handlers:
+                handler.flush()
+                handler.close()
+            cls._logger.handlers.clear()
+            cls._logger = None
diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py b/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py
new file mode 100644
index 00000000000..5cd8aac704b
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/memory/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Memory management utilities for NVSHMEM operations."""
+
+from .double_buffer_manager import DoubleBufferManager
+from .tensor_pointer_utils import TensorPointerExtractor
+
+__all__ = ["DoubleBufferManager", "TensorPointerExtractor"]
diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py b/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py
new file mode 100644
index 00000000000..079b2c17610
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/memory/double_buffer_manager.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Double buffer management for NVSHMEM symmetric memory.
+
+Manages send and receive buffers with double-buffering for pipelined communication.
+"""
+
+try:
+    import nvshmem.core.interop.torch
+
+    HAVE_NVSHMEM = True
+except ImportError:
+    HAVE_NVSHMEM = False
+
+import torch
+
+from ..nvshmem_types import MAX_SEGMENT_SIZE
+
+
+class DoubleBufferManager:
+    """Manages double-buffered NVSHMEM symmetric buffers for send/receive operations."""
+
+    def __init__(self, slot_size: int = MAX_SEGMENT_SIZE):
+        """
+        Initialize buffer manager.
+
+        Args:
+            slot_size: Size of each buffer slot in bytes (default: 256MB)
+        """
+        self.slot_size = slot_size
+        self.send_slots = [None, None]
+        self.recv_slots = [None, None]
+
+    def allocate(self) -> None:
+        """Allocate NVSHMEM symmetric buffers for double-buffering."""
+        if not HAVE_NVSHMEM:
+            raise RuntimeError(
+                "nvshmem.core.interop.torch is not available. "
+                "Please install nvshmem to use DoubleBufferManager."
+            )
+
+        for i in range(2):
+            self.send_slots[i] = nvshmem.core.interop.torch.bytetensor(
+                (self.slot_size,), dtype=torch.uint8
+            )
+            self.recv_slots[i] = nvshmem.core.interop.torch.bytetensor(
+                (self.slot_size,), dtype=torch.uint8
+            )
+            # Zero out buffers
+            self.send_slots[i].zero_()
+            self.recv_slots[i].zero_()
+
+    def get_send_slot(self, iteration: int):
+        """
+        Get send buffer for given iteration.
+
+        Args:
+            iteration: Iteration number
+
+        Returns:
+            NVSHMEM tensor for sending
+        """
+        return self.send_slots[iteration % 2]
+
+    def get_recv_slot(self, iteration: int):
+        """
+        Get receive buffer for given iteration.
+
+        Args:
+            iteration: Iteration number
+
+        Returns:
+            NVSHMEM tensor for receiving
+        """
+        return self.recv_slots[iteration % 2]
+
+    def free(self) -> None:
+        """Free NVSHMEM symmetric buffers."""
+        for i in range(2):
+            if self.send_slots[i] is not None:
+                nvshmem.core.interop.torch.free_tensor(self.send_slots[i])
+                self.send_slots[i] = None
+            if self.recv_slots[i] is not None:
+                nvshmem.core.interop.torch.free_tensor(self.recv_slots[i])
+                self.recv_slots[i] = None
diff --git a/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py b/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py
new file mode 100644
index 00000000000..ee250618ee7
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/memory/tensor_pointer_utils.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Utilities for extracting data pointers from different tensor types.
+
+Supports PyTorch tensors, CuPy arrays, and raw integer pointers.
+"""
+
+from typing import Any
+
+import torch
+
+
+class TensorPointerExtractor:
+    """Extract memory pointers from various tensor types."""
+
+    @staticmethod
+    def get_pointer(tensor: Any) -> int:
+        """
+        Extract the data pointer from a tensor.
+
+        Args:
+            tensor: Can be torch.Tensor, CuPy array, or raw int pointer
+
+        Returns:
+            int: Memory address of the tensor data
+
+        Examples:
+
+            >>> import torch
+
+            >>> t = torch.zeros(100, device='cuda')
+
+            >>> ptr = TensorPointerExtractor.get_pointer(t)
+
+            >>> isinstance(ptr, int)
+
+            True
+        """
+        if isinstance(tensor, torch.Tensor):
+            return tensor.data_ptr()
+        elif hasattr(tensor, "data"):  # CuPy array
+            return tensor.data.ptr
+        else:  # Assume raw integer pointer
+            return tensor
diff --git a/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py b/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py
new file mode 100644
index 00000000000..731cace0502
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/nvshmem_types.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+from dataclasses import dataclass, field
+from typing import Any, List
+
+# Constants
+MAX_SEGMENT_SIZE = 256 * 1024 * 1024  # 256MB
+MAX_TASKS_PER_BATCH = 10000
+
+
+@dataclass
+class SendRequest:
+    """Container for a send operation request."""
+
+    task_id: int
+    src_tensor: Any  # cupy.ndarray or pointer
+    src_pos: int
+    size: int
+    dest_pe: int
+
+
+@dataclass
+class ReceiveRequest:
+    """Container for a receive operation request."""
+
+    task_id: int
+    dest_tensor: Any  # cupy.ndarray or pointer
+    dest_pos: int
+    size: int
+    src_pe: int
+
+
+@dataclass
+class WorkloadGroup:
+    """Container for a group of send requests to a specific destination PE."""
+
+    dest_pe: int
+    tasks: List[SendRequest] = field(default_factory=list)
+    total_size: int = 0
+
+
+@dataclass
+class ScheduledBatch:
+    """Metadata for a scheduled communication batch."""
+
+    src_pe: int
+    dest_pe: int
+    batch_index: int
+    iteration: int
+    # Metadata for GPU execution
+    gpu_plan: Any = None  # Placeholder for GPU-resident plan
+    tasks: List[SendRequest] = field(default_factory=list)
+    total_size: int = 0
+    tasks_summary: Any = None  # WorkloadSummary
+
+
+@dataclass
+class WorkloadSummary:
+    """Summary of a workload group for communication with other PEs."""
+
+    total_size: int
+    task_ids: List[int]
+    task_sizes: List[int]
+
+
+@dataclass
+class TransferMetadata:
+    """GPU-resident metadata for communication tasks."""
+
+    ptrs: Any  # cupy array of uint64 (pointers)
+    sizes: Any  # cupy array of uint64 (sizes)
+    num_tasks: int
+    total_size: int
diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py b/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py
new file mode 100644
index 00000000000..9df0b3ac318
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/planning/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Planning components for task segmentation, workload packing, and scheduling."""
+
+from .communication_scheduler import CommunicationScheduler
+from .gpu_execution_planner import GPUExecutionPlanner
+from .task_segmenter import TaskSegmenter
+from .workload_packer import WorkloadPacker
+
+__all__ = ["CommunicationScheduler", "GPUExecutionPlanner", "TaskSegmenter", "WorkloadPacker"]
diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py
new file mode 100644
index 00000000000..0f299a84e40
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/planning/communication_scheduler.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Dict, List, Tuple
+
+from ..logger import PELogger
+from ..nvshmem_types import ScheduledBatch, WorkloadGroup, WorkloadSummary
+
+
+class CommunicationScheduler:
+    """
+    Builds a conflict-free, iteration-based schedule for communication.
+    Ensures that in any given iteration, a PE is not overloaded.
+    """
+
+    def __init__(self):
+        self.num_iterations = 0
+
+    def build_schedule(
+        self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int
+    ) -> Tuple[Dict[int, List[ScheduledBatch]], Dict[Tuple[int, int, int], WorkloadSummary]]:
+        """
+        Main scheduling method.
+        1. Exchanges workload info with other PEs.
+        2. Assigns batches to iterations.
+        3. Returns:
+           - local schedule (iteration -> list of batches)
+           - global workload summaries (key: (src, dest, batch_idx) -> summary)
+        """
+        total_local_batches = sum(len(groups) for groups in workloads.values())
+        PELogger.info(f"Building schedule: {total_local_batches} local batches, {n_pes} PEs")
+
+        # Step 1: Collect all batches across all PE pairs
+        PELogger.debug("Collecting batches from all PEs...")
+        all_batches = self._collect_all_batches(workloads, my_pe, n_pes)
+        PELogger.debug(f"Collected {len(all_batches)} total batches globally")
+
+        # Step 2: Assign batches to iterations using conflict-free algorithm
+        PELogger.debug("Assigning batches to iterations...")
+        self._assign_iterations(all_batches)
+        PELogger.info(f"Schedule built: {self.num_iterations} iterations")
+
+        # Step 3: Exchange detailed workload summaries (Task IDs/Sizes)
+        # This is needed for receivers to know what tasks are in each batch
+        PELogger.debug("Exchanging workload summaries...")
+        global_summaries = self._exchange_workload_summaries(workloads, my_pe, n_pes)
+        PELogger.debug(f"Exchanged {len(global_summaries)} workload summaries")
+
+        # Step 4: Build schedule map for this PE
+        my_batches = [b for b in all_batches if b.src_pe == my_pe or b.dest_pe == my_pe]
+        my_batches.sort(key=lambda x: x.iteration)
+
+        final_schedule: Dict[int, List[ScheduledBatch]] = {}
+        for b in my_batches:
+            final_schedule.setdefault(b.iteration, []).append(b)
+
+        return final_schedule, global_summaries
+
+    def _collect_all_batches(
+        self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int
+    ) -> List[ScheduledBatch]:
+        """
+        Exchanges batch counts and details with all PEs to build a global view.
+        Uses torch.distributed for reliable communication.
+        """
+        import torch.distributed as dist
+
+        # Build local batch list
+        local_batches: List[Tuple[int, int, int]] = []
+        for dest_pe, groups in workloads.items():
+            if dest_pe == my_pe:
+                continue
+            for i, _ in enumerate(groups):
+                local_batches.append((my_pe, dest_pe, i))  # (src, dest, batch_idx)
+
+        PELogger.debug(f"  Local batch count: {len(local_batches)}")
+        PELogger.debug(f"  Local batches: {local_batches}")
+
+        # Gather all batches from all PEs using torch.distributed
+        all_batches_list: List[List[Tuple[int, int, int]] | None] = [None] * n_pes
+        dist.all_gather_object(all_batches_list, local_batches)
+
+        # Flatten into global batch list
+        global_batches: List[ScheduledBatch] = []
+        for pe_batches in all_batches_list:
+            if pe_batches is None:
+                continue
+            for src, dest, idx in pe_batches:
+                global_batches.append(
+                    ScheduledBatch(src_pe=src, dest_pe=dest, batch_index=idx, iteration=-1)
+                )
+
+        PELogger.debug(f"  Global batches collected: {len(global_batches)} total")
+
+        # Group by source for readability
+        batches_by_src: Dict[int, List[Tuple[int, int]]] = {}
+        for b in global_batches:
+            batches_by_src.setdefault(b.src_pe, []).append((b.dest_pe, b.batch_index))
+        for src_pe in sorted(batches_by_src.keys()):
+            PELogger.debug(f"    PE {src_pe} sends to: {batches_by_src[src_pe]}")
+
+        return global_batches
+
+    def _assign_iterations(self, batches: List[ScheduledBatch]):
+        self.num_iterations = 0
+        batches.sort(key=lambda x: (x.src_pe, x.dest_pe, x.batch_index))
+
+        for batch in batches:
+            iteration = 0
+            assigned = False
+            while not assigned:
+                if not self._has_conflict(batch, iteration, batches):
+                    batch.iteration = iteration
+                    self.num_iterations = max(self.num_iterations, iteration + 1)
+                    assigned = True
+                    PELogger.debug(
+                        f"  Assigned batch ({batch.src_pe} → {batch.dest_pe}, "
+                        f"idx={batch.batch_index}) to iteration {iteration}"
+                    )
+                else:
+                    iteration += 1
+
+    def _has_conflict(
+        self, batch: ScheduledBatch, iteration: int, all_batches: List[ScheduledBatch]
+    ) -> bool:
+        for other in all_batches:
+            if other.iteration == iteration and other is not batch:
+                if other.src_pe == batch.src_pe or other.dest_pe == batch.dest_pe:
+                    return True
+        return False
+
+    def _exchange_workload_summaries(
+        self, workloads: Dict[int, List[WorkloadGroup]], my_pe: int, n_pes: int
+    ) -> Dict[Tuple[int, int, int], WorkloadSummary]:
+        """
+        Exchange detailed workload content using torch.distributed.
+        Simple and reliable - no NVSHMEM symmetric memory issues.
+        """
+        import torch.distributed as dist
+
+        # Build local summaries as a simple dict:
+        # (src, dest, batch_idx) -> {total_size, task_ids, task_sizes}
+        local_summaries: Dict[Tuple[int, int, int], Dict[str, object]] = {}
+        batch_count = 0
+        total_tasks = 0
+
+        for dest_pe, groups in workloads.items():
+            if dest_pe == my_pe:
+                continue
+            for batch_idx, group in enumerate(groups):
+                key = (my_pe, dest_pe, batch_idx)
+                local_summaries[key] = {
+                    "total_size": group.total_size,
+                    "task_ids": [t.task_id for t in group.tasks],
+                    "task_sizes": [t.size for t in group.tasks],
+                }
+                batch_count += 1
+                total_tasks += len(group.tasks)
+
+        PELogger.debug(f"  Local summaries: {batch_count} batches, {total_tasks} tasks")
+
+        # Gather all summaries from all PEs using torch.distributed
+        all_summaries_list: List[Dict[Tuple[int, int, int], Dict[str, object]] | None] = [
+            None
+        ] * n_pes
+        dist.all_gather_object(all_summaries_list, local_summaries)
+
+        # Merge into global map
+        global_map: Dict[Tuple[int, int, int], WorkloadSummary] = {}
+        for pe_summaries in all_summaries_list:
+            if pe_summaries is None:
+                continue
+            for key, data in pe_summaries.items():
+                summary = WorkloadSummary(
+                    total_size=int(data["total_size"]),
+                    task_ids=list(data["task_ids"]),
+                    task_sizes=list(data["task_sizes"]),
+                )
+                global_map[key] = summary
+
+        PELogger.debug(f"  Exchanged {len(global_map)} workload summaries")
+        return global_map
diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py b/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py
new file mode 100644
index 00000000000..68c4d11d7e5
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/planning/gpu_execution_planner.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+GPU execution planning for pack/unpack operations.
+
+Converts high-level task descriptions into GPU-ready metadata
+(pointer arrays, sizes, chunking) for kernel execution.
+"""
+
+from typing import Dict, List, Optional, Tuple
+
+try:
+    import cupy as cp
+
+    HAVE_CUPY = True
+except ImportError:
+    HAVE_CUPY = False
+
+import torch
+
+from ..logger import PELogger
+from ..memory.tensor_pointer_utils import TensorPointerExtractor
+from ..nvshmem_types import ReceiveRequest, ScheduledBatch
+
+
+class GPUExecutionPlanner:
+    """Plans GPU kernel execution by building pointer arrays and metadata."""
+
+    def __init__(self):
+        self.tensor_utils = TensorPointerExtractor()
+        self.CHUNK_SIZE = 128 * 1024  # 128KB chunks
+
+    def create_gpu_plans(
+        self,
+        iter_schedules: List[Dict[str, Optional[ScheduledBatch]]],
+        send_slots: List,
+        recv_slots: List,
+        receive_requests: List[ReceiveRequest],
+    ) -> None:
+        """
+        Build GPU execution plans for all iterations.
+
+        Modifies iter_schedules in-place by adding gpu_plan to each batch.
+
+        Args:
+            iter_schedules: List of iteration schedules (dicts with 'send' and 'recv')
+            send_slots: List of send buffer slots
+            recv_slots: List of receive buffer slots
+            receive_requests: List of all receive requests for matching
+        """
+        if not HAVE_CUPY:
+            raise RuntimeError(
+                "cupy is not available. Please install cupy to use GPUExecutionPlanner."
+            )
+
+        PELogger.debug(f"Creating GPU plans for {len(iter_schedules)} iterations")
+        for i, sched in enumerate(iter_schedules):
+            send_batch = sched["send"]
+            if send_batch:
+                # Build Pack Metadata
+                ptrs: List[int] = []
+                positions: List[int] = []
+                sizes: List[int] = []
+
+                for t in send_batch.tasks:
+                    # Extract pointer from tensor
+                    ptr = self.tensor_utils.get_pointer(t.src_tensor)
+                    ptrs.append(ptr)
+                    positions.append(t.src_pos)
+                    sizes.append(t.size)
+
+                # Plan kernel args for packing
+                send_batch.gpu_plan = self._plan_kernel_args(
+                    ptrs, positions, sizes, is_pack=True, buffer_base=send_slots[i % 2].data_ptr()
+                )
+                task_ids = [t.task_id for t in send_batch.tasks]
+                PELogger.debug(
+                    f"  Iter {i} send plan: {len(send_batch.tasks)} tasks → "
+                    f"PE {send_batch.dest_pe}, {send_batch.total_size} bytes"
+                )
+                displayed_ids = task_ids[:10] if len(task_ids) <= 10 else task_ids[:10] + ["..."]
+                PELogger.debug(f"    Send task IDs: {displayed_ids}")
+
+            recv_batch = sched["recv"]
+            if recv_batch:
+                # Build Unpack Metadata
+                summary = recv_batch.tasks_summary
+
+                # Skip if no summary available (shouldn't happen in normal operation)
+                if summary is None:
+                    PELogger.error(
+                        f"Iter {i}: recv batch from PE {recv_batch.src_pe} has no "
+                        "tasks_summary - UNPACK WILL BE SKIPPED!"
+                    )
+                    recv_batch.gpu_plan = None
+                    continue
+
+                PELogger.debug(
+                    f"  Iter {i} recv from PE {recv_batch.src_pe}: "
+                    f"{len(summary.task_ids)} tasks in summary"
+                )
+
+                ptrs = []
+                positions = []
+                sizes = []
+
+                # Create fast lookup map for receive requests
+                relevant_reqs: Dict[int, ReceiveRequest] = {
+                    r.task_id: r for r in receive_requests if r.src_pe == recv_batch.src_pe
+                }
+
+                # Match summary tasks with receive requests
+                matched_task_ids: List[int] = []
+                unmatched_task_ids: List[int] = []
+                for t_id, t_size in zip(summary.task_ids, summary.task_sizes):
+                    if t_id in relevant_reqs:
+                        req = relevant_reqs[t_id]
+                        ptr = self.tensor_utils.get_pointer(req.dest_tensor)
+                        ptrs.append(ptr)
+                        positions.append(req.dest_pos)
+                        sizes.append(t_size)  # Use sender's size
+                        matched_task_ids.append(t_id)
+                    else:
+                        unmatched_task_ids.append(t_id)
+                        PELogger.error(
+                            f"Iter {i}: Unexpected task {t_id} from PE "
+                            f"{recv_batch.src_pe} - no matching recv request!"
+                        )
+
+                if unmatched_task_ids:
+                    PELogger.error(
+                        f"  Iter {i}: {len(unmatched_task_ids)} unmatched tasks "
+                        f"from PE {recv_batch.src_pe}: {unmatched_task_ids[:10]}"
+                    )
+
+                # Plan kernel args for unpacking
+                recv_batch.gpu_plan = self._plan_kernel_args(
+                    ptrs, positions, sizes, is_pack=False, buffer_base=recv_slots[i % 2].data_ptr()
+                )
+
+                if recv_batch.gpu_plan is None:
+                    PELogger.error(
+                        f"  Iter {i} recv plan: FAILED - no gpu_plan created for "
+                        f"{len(sizes)} tasks from PE {recv_batch.src_pe}"
+                    )
+                else:
+                    PELogger.debug(
+                        f"  Iter {i} recv plan: {len(sizes)} tasks ← "
+                        f"PE {recv_batch.src_pe}, {recv_batch.total_size} bytes"
+                    )
+                    displayed_recv_ids = (
+                        matched_task_ids[:10]
+                        if len(matched_task_ids) <= 10
+                        else matched_task_ids[:10] + ["..."]
+                    )
+                    PELogger.debug(f"    Recv task IDs: {displayed_recv_ids}")
+
+    def _plan_kernel_args(
+        self,
+        ptrs: List[int],
+        positions: List[int],
+        sizes: List[int],
+        is_pack: bool,
+        buffer_base: int,
+    ) -> Optional[Tuple[object, object, object, int]]:
+        """
+        Generate GPU-ready pointer arrays for kernel execution.
+
+        Applies 128KB chunking to break large transfers into smaller pieces.
+
+        Args:
+            ptrs: List of tensor data pointers
+            positions: List of positions within tensors
+            sizes: List of transfer sizes
+            is_pack: True for pack (user->buffer), False for unpack (buffer->user)
+            buffer_base: Base pointer of the buffer
+
+        Returns:
+            Tuple of (cp_src_addrs, cp_dst_addrs, cp_sizes, num_chunks) as
+            CuPy arrays, or None if no work.
+        """
+        h_src_addrs: List[int] = []
+        h_dst_addrs: List[int] = []
+        h_sizes: List[int] = []
+
+        packed_offset = 0
+
+        for ptr, pos, size in zip(ptrs, positions, sizes):
+            num_chunks = (size + self.CHUNK_SIZE - 1) // self.CHUNK_SIZE
+
+            for c in range(num_chunks):
+                chunk_offset = c * self.CHUNK_SIZE
+                chunk_size = min(self.CHUNK_SIZE, size - chunk_offset)
+
+                if is_pack:
+                    # Pack: user tensor -> buffer
+                    h_src_addrs.append(ptr + pos + chunk_offset)
+                    h_dst_addrs.append(buffer_base + packed_offset + chunk_offset)
+                else:
+                    # Unpack: buffer -> user tensor
+                    h_src_addrs.append(buffer_base + packed_offset + chunk_offset)
+                    h_dst_addrs.append(ptr + pos + chunk_offset)
+
+                h_sizes.append(chunk_size)
+
+            packed_offset += size
+
+        total_chunks = len(h_sizes)
+        if total_chunks == 0:
+            return None
+
+        # Move to GPU using PyTorch, then convert to CuPy for kernel launching
+        d_src_addrs = torch.tensor(h_src_addrs, dtype=torch.int64, device="cuda")
+        d_dst_addrs = torch.tensor(h_dst_addrs, dtype=torch.int64, device="cuda")
+        d_sizes = torch.tensor(h_sizes, dtype=torch.int64, device="cuda")
+
+        # Convert to CuPy arrays (zero-copy) for kernel launching
+        cp_src_addrs = cp.asarray(d_src_addrs)
+        cp_dst_addrs = cp.asarray(d_dst_addrs)
+        cp_sizes = cp.asarray(d_sizes)
+
+        return (cp_src_addrs, cp_dst_addrs, cp_sizes, total_chunks)
diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py b/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py
new file mode 100644
index 00000000000..fdeaea33ae5
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/planning/task_segmenter.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import logging
+from typing import List
+
+from ..nvshmem_types import MAX_SEGMENT_SIZE, ReceiveRequest, SendRequest
+
+logger = logging.getLogger(__name__)
+
+# Constants for ID encoding (from C++ implementation)
+REQUEST_ID_BASE = 1000000000
+SEGMENT_ID_MULTIPLIER = 1000
+MAX_REQUESTS = 1000000
+MAX_SEGMENTS_PER_REQUEST = 1000
+
+
+class TaskSegmenter:
+    """
+    Splits large tasks (>256MB) into smaller segments to fit
+    into the fixed-size communication slots.
+    """
+
+    def _encode_segment_id(self, task_id: int, segment_index: int) -> int:
+        return REQUEST_ID_BASE + (task_id * SEGMENT_ID_MULTIPLIER) + segment_index
+
+    def _calculate_num_segments(self, size: int) -> int:
+        return (size + MAX_SEGMENT_SIZE - 1) // MAX_SEGMENT_SIZE
+
+    def _validate_segmentation(self, task_id: int, size: int) -> bool:
+        num_segments = self._calculate_num_segments(size)
+        if num_segments > MAX_SEGMENTS_PER_REQUEST:
+            logger.error(
+                f"Error: Task {task_id} requires {num_segments} segments, "
+                f"exceeds max {MAX_SEGMENTS_PER_REQUEST}"
+            )
+            return False
+        if task_id >= MAX_REQUESTS:
+            logger.error(f"Error: Task ID {task_id} exceeds max {MAX_REQUESTS}")
+            return False
+        return True
+
+    def segment_send_request(self, req: SendRequest) -> List[SendRequest]:
+        """
+        Splits a single send request into multiple segments
+        if larger than MAX_SEGMENT_SIZE.
+        """
+        if req.size <= MAX_SEGMENT_SIZE:
+            return [req]
+
+        if not self._validate_segmentation(req.task_id, req.size):
+            raise ValueError(f"Task {req.task_id} validation failed")
+
+        num_segments = self._calculate_num_segments(req.size)
+        output_requests: List[SendRequest] = []
+
+        for i in range(num_segments):
+            segment_offset = i * MAX_SEGMENT_SIZE
+            segment_size = min(MAX_SEGMENT_SIZE, req.size - segment_offset)
+            segment_task_id = self._encode_segment_id(req.task_id, i)
+
+            new_req = SendRequest(
+                task_id=segment_task_id,
+                src_tensor=req.src_tensor,
+                src_pos=req.src_pos + segment_offset,
+                size=segment_size,
+                dest_pe=req.dest_pe,
+            )
+            output_requests.append(new_req)
+
+        return output_requests
+
+    def segment_receive_request(self, req: ReceiveRequest) -> List[ReceiveRequest]:
+        """
+        Splits a single receive request into multiple segments
+        if larger than MAX_SEGMENT_SIZE.
+        """
+        if req.size <= MAX_SEGMENT_SIZE:
+            return [req]
+
+        if not self._validate_segmentation(req.task_id, req.size):
+            raise ValueError(f"Task {req.task_id} validation failed")
+
+        num_segments = self._calculate_num_segments(req.size)
+        output_requests: List[ReceiveRequest] = []
+
+        for i in range(num_segments):
+            segment_offset = i * MAX_SEGMENT_SIZE
+            segment_size = min(MAX_SEGMENT_SIZE, req.size - segment_offset)
+            segment_task_id = self._encode_segment_id(req.task_id, i)
+
+            new_req = ReceiveRequest(
+                task_id=segment_task_id,
+                dest_tensor=req.dest_tensor,
+                dest_pos=req.dest_pos + segment_offset,
+                size=segment_size,
+                src_pe=req.src_pe,
+            )
+            output_requests.append(new_req)
+
+        return output_requests
diff --git a/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py b/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py
new file mode 100644
index 00000000000..1f2374bc187
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/planning/workload_packer.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Dict, List
+
+from ..logger import PELogger
+from ..nvshmem_types import MAX_SEGMENT_SIZE, MAX_TASKS_PER_BATCH, SendRequest, WorkloadGroup
+
+
+class WorkloadPacker:
+    """
+    Packs individual SendRequests into WorkloadGroups (batches)
+    destined for the same PE, respecting size limits.
+    """
+
+    def pack_workloads(
+        self, send_requests: List[SendRequest], n_pes: int
+    ) -> Dict[int, List[WorkloadGroup]]:
+        """
+        Groups requests by destination PE and packs them into batches.
+        Returns a map: dest_pe -> list of batches
+        """
+        PELogger.debug(f"Packing {len(send_requests)} send requests for {n_pes} PEs")
+        workloads: Dict[int, List[WorkloadGroup]] = {}
+
+        # Group requests by destination PE
+        tasks_by_dest: Dict[int, List[SendRequest]] = {}
+        for req in send_requests:
+            tasks_by_dest.setdefault(req.dest_pe, []).append(req)
+
+        # Pack tasks for each destination
+        for dest_pe in range(n_pes):
+            if dest_pe not in tasks_by_dest:
+                workloads[dest_pe] = []
+                PELogger.debug(f"  Dest PE {dest_pe}: 0 tasks → 0 batches")
+                continue
+
+            tasks = tasks_by_dest[dest_pe]
+            workloads[dest_pe] = self._pack_single_destination(tasks, dest_pe)
+
+            if workloads[dest_pe]:
+                total_size = sum(b.total_size for b in workloads[dest_pe])
+                PELogger.debug(
+                    f"  Dest PE {dest_pe}: {len(tasks)} tasks → "
+                    f"{len(workloads[dest_pe])} batches, {total_size} bytes total"
+                )
+            else:
+                PELogger.debug(
+                    f"  Dest PE {dest_pe}: {len(tasks)} tasks → 0 batches (empty after packing)"
+                )
+
+        return workloads
+
+    def _pack_single_destination(
+        self, tasks: List[SendRequest], dest_pe: int
+    ) -> List[WorkloadGroup]:
+        if not tasks:
+            return []
+
+        # Sort tasks by size (descending) for better bin packing efficiency
+        tasks.sort(key=lambda x: x.size, reverse=True)
+
+        batches: List[WorkloadGroup] = []
+        current_batch = WorkloadGroup(dest_pe=dest_pe, tasks=[], total_size=0)
+
+        for task in tasks:
+            # Check if adding this task would exceed batch constraints
+            would_exceed_size = current_batch.total_size + task.size > MAX_SEGMENT_SIZE
+            would_exceed_task_cap = len(current_batch.tasks) >= MAX_TASKS_PER_BATCH
+
+            if (would_exceed_size or would_exceed_task_cap) and current_batch.tasks:
+                # Finalize current batch
+                batches.append(current_batch)
+                task_first_10_string = ", ".join([str(t.task_id) for t in current_batch.tasks[:10]])
+                PELogger.debug(
+                    f"  Packed batch to PE {dest_pe} idx {len(batches) - 1}: "
+                    f"{task_first_10_string}... (total {len(current_batch.tasks)} tasks)"
+                )
+                # Start new batch
+                current_batch = WorkloadGroup(dest_pe=dest_pe, tasks=[], total_size=0)
+
+            # Add task to current batch
+            current_batch.tasks.append(task)
+            current_batch.total_size += task.size
+
+        # Add final batch if not empty
+        if current_batch.tasks:
+            batches.append(current_batch)
+
+        return batches
diff --git a/megatron/core/resharding/nvshmem_copy_service/service.py b/megatron/core/resharding/nvshmem_copy_service/service.py
new file mode 100644
index 00000000000..631e63ae41b
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/service.py
@@ -0,0 +1,408 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Remote Copy Service - Main orchestrator for NVSHMEM-based GPU-to-GPU transfers.
+
+This service coordinates task segmentation, workload packing, scheduling,
+
+GPU resource management, and pipelined execution.
+"""
+
+from typing import Dict, List, Optional, Tuple
+
+try:
+    import nvshmem.core
+
+    HAVE_NVSHMEM = True
+except ImportError:
+    HAVE_NVSHMEM = False
+
+import torch.cuda.nvtx as nvtx
+
+from .core import GPUResourceManager, KernelLauncher, PipelineExecutor
+from .logger import PELogger
+from .memory import DoubleBufferManager
+from .nvshmem_types import ReceiveRequest, ScheduledBatch, SendRequest, WorkloadSummary
+from .planning import CommunicationScheduler, GPUExecutionPlanner, TaskSegmenter, WorkloadPacker
+
+
+class RemoteCopyService:
+    """
+    Main service for managing remote GPU-to-GPU data transfers.
+
+    Provides high-level API for registering transfers, scheduling,
+    and executing pipelined communication with NVSHMEM.
+    """
+
+    def __init__(self):
+        # Core components
+        self.gpu_resources = GPUResourceManager()
+        self.buffer_manager = DoubleBufferManager()
+        self.kernel_launcher = KernelLauncher()
+        self.pipeline_executor = None  # Created after init
+
+        # Planning components
+        self.task_segmenter = TaskSegmenter()
+        self.workload_packer = WorkloadPacker()
+        self.comm_scheduler = CommunicationScheduler()
+        self.gpu_planner = GPUExecutionPlanner()
+
+        # State
+        self.send_requests: List[SendRequest] = []
+        self.receive_requests: List[ReceiveRequest] = []
+        self.iter_schedules: Optional[List[Dict]] = None
+        self.num_iterations: int = 0
+
+        # Events for double-buffering
+        self.pack_events = []
+        self.unpack_events = []
+
+    @property
+    def my_pe(self) -> int:
+        """Get this PE's rank."""
+        return self.gpu_resources.my_pe
+
+    @property
+    def n_pes(self) -> int:
+        """Get total number of PEs."""
+        return self.gpu_resources.n_pes
+
+    @property
+    def device(self):
+        """Get CUDA device."""
+        return self.gpu_resources.device
+
+    @property
+    def initialized(self) -> bool:
+        """Check if service is initialized."""
+        return self.gpu_resources.initialized
+
+    def init(self, log_level: str = "INFO") -> None:
+        """
+        Initialize the service.
+
+        Sets up NVSHMEM, CUDA device, streams, buffers, and kernels.
+        Expects to be launched with torchrun.
+
+        Args:
+            log_level: Logging level (TRACE, DEBUG, INFO, WARN, ERROR)
+        """
+        if not HAVE_NVSHMEM:
+            raise RuntimeError(
+                "nvshmem.core is not available. Please install nvshmem to use NVSHMEMCopyService."
+            )
+
+        # Initialize GPU resources (NVSHMEM, device, streams)
+        self.gpu_resources.init()
+
+        # Initialize logger after PE ID is known
+        PELogger.init(self.my_pe, level=log_level)
+        PELogger.info(f"Initializing RemoteCopyService on PE {self.my_pe}/{self.n_pes}")
+
+        # Allocate double-buffered send/recv slots
+        self.buffer_manager.allocate()
+        PELogger.debug("Allocated double-buffered send/recv slots")
+
+        # Load CUDA kernels
+        self.kernel_launcher.load_kernels()
+        PELogger.debug("Loaded CUDA kernels")
+
+        # Cache CuPy stream wrappers for efficient kernel launching
+        self.kernel_launcher.set_streams(
+            self.gpu_resources.pack_stream, self.gpu_resources.unpack_stream
+        )
+        PELogger.debug("Cached CuPy stream wrappers")
+
+        # Create pipeline executor with dependencies
+        self.pipeline_executor = PipelineExecutor(
+            self.kernel_launcher, self.buffer_manager, self.my_pe
+        )
+
+        # Set streams on pipeline executor
+        self.pipeline_executor.set_streams(
+            self.gpu_resources.pack_stream,
+            self.gpu_resources.unpack_stream,
+            self.gpu_resources.send_stream,
+            self.gpu_resources.copy_stream,
+            self.gpu_resources.torch_pack_stream,
+            self.gpu_resources.torch_unpack_stream,
+            self.gpu_resources.torch_copy_stream,
+        )
+        PELogger.info("Initialization complete")
+
+    def register_send(
+        self, task_id: int, src_tensor, src_pos: int, size: int, dest_pe: int
+    ) -> None:
+        """
+        Register a send operation.
+
+        Args:
+            task_id: Unique task identifier
+            src_tensor: Source tensor (PyTorch/CuPy tensor or pointer)
+            src_pos: Starting position in source tensor
+            size: Number of bytes to send
+            dest_pe: Destination PE rank
+        """
+        if dest_pe >= self.n_pes or dest_pe < 0:
+            PELogger.error(f"Error: Invalid destination PE {dest_pe}")
+            return
+
+        req = SendRequest(task_id, src_tensor, src_pos, size, dest_pe)
+        self.send_requests.append(req)
+
+    def register_receive(
+        self, task_id: int, dest_tensor, dest_pos: int, size: int, src_pe: int
+    ) -> None:
+        """
+        Register a receive operation.
+
+        Args:
+            task_id: Unique task identifier
+            dest_tensor: Destination tensor (PyTorch/CuPy tensor or pointer)
+            dest_pos: Starting position in destination tensor
+            size: Number of bytes to receive
+            src_pe: Source PE rank
+        """
+        if src_pe >= self.n_pes or src_pe < 0:
+            PELogger.error(f"Error: Invalid source PE {src_pe}")
+            return
+
+        req = ReceiveRequest(task_id, dest_tensor, dest_pos, size, src_pe)
+        self.receive_requests.append(req)
+
+    def schedule(self) -> None:
+        """
+        Build execution schedule.
+
+        Can be called once and followed by multiple run() calls for
+        repeated execution with the same communication pattern.
+
+        Steps:
+        1. Segment large tasks into manageable chunks
+        2. Pack tasks into batches
+        3. Schedule batches to iterations (conflict-free)
+        4. Build GPU execution plans (pointer arrays, chunking)
+        5. Create synchronization events
+        """
+        if not self.initialized:
+            raise RuntimeError("RemoteCopyService not initialized")
+
+        PELogger.info(
+            f"Starting schedule: {len(self.send_requests)} send requests, "
+            f"{len(self.receive_requests)} receive requests"
+        )
+
+        # Step 1: Segment tasks (break large tasks into chunks)
+        PELogger.debug("Step 1: Segmenting tasks...")
+        orig_send_count = len(self.send_requests)
+        orig_recv_count = len(self.receive_requests)
+        self._segment_tasks()
+        PELogger.info(
+            f"Segmented: {orig_send_count} sends → {len(self.send_requests)} segments, "
+            f"{orig_recv_count} recvs → {len(self.receive_requests)} segments"
+        )
+
+        # Step 2: Pack tasks into workload groups
+        PELogger.debug("Step 2: Packing workloads...")
+        workloads = self.workload_packer.pack_workloads(self.send_requests, self.n_pes)
+        total_batches = sum(len(batches) for batches in workloads.values())
+        active_pes = sum(1 for batches in workloads.values() if batches)
+        PELogger.info(f"Packed: {total_batches} batches across {active_pes} destination PEs")
+
+        # Step 3: Schedule workloads to iterations
+        PELogger.debug("Step 3: Building communication schedule...")
+        schedule, global_summaries = self.comm_scheduler.build_schedule(
+            workloads, self.my_pe, self.n_pes
+        )
+
+        self.num_iterations = self.comm_scheduler.num_iterations
+        PELogger.info(f"Scheduled: {total_batches} batches → {self.num_iterations} iterations")
+
+        # Step 4: Prepare iteration schedules
+        PELogger.debug("Step 4: Preparing iteration schedules...")
+        self.iter_schedules = self._prepare_iter_schedules(
+            schedule, workloads, global_summaries, self.num_iterations
+        )
+
+        # Step 5: Build GPU execution plans
+        PELogger.debug("Step 5: Building GPU execution plans...")
+        self.gpu_planner.create_gpu_plans(
+            self.iter_schedules,
+            self.buffer_manager.send_slots,
+            self.buffer_manager.recv_slots,
+            self.receive_requests,
+        )
+
+        # Step 6: Create double-buffered events
+        PELogger.debug("Step 6: Creating synchronization events...")
+        self.pack_events, self.unpack_events = self.gpu_resources.create_events(num_events=2)
+        self.pipeline_executor.set_events(self.pack_events, self.unpack_events)
+
+        PELogger.info(f"Schedule complete: {self.num_iterations} iterations ready")
+
+    def run(self) -> None:
+        """
+        Execute the scheduled communication.
+
+        Can be called multiple times after a single schedule() call
+        to repeat the same communication pattern.
+        """
+        # import torch
+        # torch.save(self.send_requests, f"send_requests_{torch.distributed.get_rank()}.pt")
+        # torch.save(self.receive_requests, f"receive_requests_{torch.distributed.get_rank()}.pt")
+
+        if not self.initialized:
+            raise RuntimeError("RemoteCopyService not initialized")
+        if self.iter_schedules is None:
+            raise RuntimeError("Must call schedule() before run()")
+
+        PELogger.info(f"Starting execution: {self.num_iterations} iterations")
+
+        # Start timing
+        nvtx.range_push("RemoteCopyService.run_total")
+
+        # Global barrier before execution
+        PELogger.debug("Barrier: Synchronizing all PEs before execution")
+        nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream)
+        self.gpu_resources.send_stream.sync()
+
+        # Execute pipelined communication
+        nvtx.range_push("execute_pipeline")
+        self.pipeline_executor.execute_pipeline(self.iter_schedules, self.num_iterations)
+        nvtx.range_pop()  # execute_pipeline
+
+        # Global barrier after execution
+        PELogger.debug("Barrier: Synchronizing all PEs after pipeline")
+        nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream)
+
+        # Process same-PE transfers
+        self.pipeline_executor.process_self_moves(self.send_requests, self.receive_requests)
+
+        # End timing range
+        nvtx.range_pop()  # RemoteCopyService.run_total
+
+    def clear_requests(self) -> None:
+        """
+        Clear registered requests and schedule.
+
+        Call this before registering a new set of transfers.
+        """
+        self.send_requests = []
+        self.receive_requests = []
+        self.iter_schedules = None
+        self.num_iterations = 0
+        self.pack_events = []
+        self.unpack_events = []
+
+    def finalize(self) -> None:
+        """Cleanup resources."""
+        PELogger.info("Finalizing RemoteCopyService")
+
+        # Barrier to ensure all PEs are ready to finalize
+        try:
+            PELogger.debug("Barrier: Synchronizing all PEs before finalize")
+            nvshmem.core.barrier_all(stream=self.gpu_resources.send_stream)
+            self.gpu_resources.send_stream.sync()
+        except Exception as e:
+            PELogger.error(f"Error in final barrier: {e}")
+
+        # Free buffers
+        self.buffer_manager.free()
+
+        # Finalize GPU resources (this will call nvshmem.core.finalize internally)
+        self.gpu_resources.finalize()
+
+        PELogger.info("RemoteCopyService finalized")
+        PELogger.shutdown()
+
+    def _segment_tasks(self) -> None:
+        """Segment tasks into manageable chunks."""
+        new_sends: List[SendRequest] = []
+        for req in self.send_requests:
+            segments = self.task_segmenter.segment_send_request(req)
+            new_sends.extend(segments)
+            if len(segments) > 1:
+                PELogger.debug(
+                    f"  Segmented send task {req.task_id}: "
+                    f"{req.size} bytes → {len(segments)} segments"
+                )
+        self.send_requests = new_sends
+
+        new_recvs: List[ReceiveRequest] = []
+        for req in self.receive_requests:
+            segments = self.task_segmenter.segment_receive_request(req)
+            new_recvs.extend(segments)
+            if len(segments) > 1:
+                PELogger.debug(
+                    f"  Segmented recv task {req.task_id}: "
+                    f"{req.size} bytes → {len(segments)} segments"
+                )
+        self.receive_requests = new_recvs
+
+    def _prepare_iter_schedules(
+        self,
+        schedule_batches: Dict[int, List[ScheduledBatch]],
+        workloads: Dict[int, List],
+        global_summaries: Dict[Tuple[int, int, int], WorkloadSummary],
+        num_iterations: int,
+    ) -> List[Dict]:
+        """
+        Organize schedule into iteration-based structure.
+
+        Returns:
+            List of dicts with 'send' and 'recv' keys for each iteration
+        """
+        iter_schedules: List[Dict[str, Optional[ScheduledBatch]]] = []
+
+        for i in range(num_iterations):
+            sched: Dict[str, Optional[ScheduledBatch]] = {"send": None, "recv": None}
+
+            if i in schedule_batches:
+                batches = schedule_batches[i]
+
+                for b in batches:
+                    # Skip same-PE transfers (handled separately by process_self_moves)
+                    if b.src_pe == b.dest_pe:
+                        PELogger.debug(
+                            f"  Iter {i}: Skipping same-PE batch " f"({b.src_pe} → {b.dest_pe})"
+                        )
+                        continue
+
+                    if b.src_pe == self.my_pe:
+                        # This PE sends in this iteration
+                        b.tasks = workloads[b.dest_pe][b.batch_index].tasks
+                        b.total_size = workloads[b.dest_pe][b.batch_index].total_size
+                        sched["send"] = b
+                        PELogger.debug(
+                            f"  Iter {i}: Send to PE {b.dest_pe}, batch "
+                            f"{b.batch_index}, {len(b.tasks)} tasks, "
+                            f"{b.total_size} bytes"
+                        )
+
+                    elif b.dest_pe == self.my_pe:
+                        # This PE receives in this iteration
+                        key = (b.src_pe, b.dest_pe, b.batch_index)
+                        if key in global_summaries:
+                            summary = global_summaries[key]
+                            b.tasks_summary = summary
+                            b.total_size = summary.total_size
+                        else:
+                            PELogger.error(
+                                f"  Iter {i}: Missing workload summary for "
+                                f"recv from PE {b.src_pe}, batch {b.batch_index}"
+                            )
+                            PELogger.error(
+                                "  Available keys in global_summaries: "
+                                f"{list(global_summaries.keys())}"
+                            )
+                            b.tasks_summary = None
+                            b.total_size = 0
+                        sched["recv"] = b
+                        PELogger.debug(
+                            f"  Iter {i}: Recv from PE {b.src_pe}, batch "
+                            f"{b.batch_index}, {b.total_size} bytes"
+                        )
+
+            iter_schedules.append(sched)
+
+        return iter_schedules
diff --git a/megatron/core/resharding/nvshmem_copy_service/validation.py b/megatron/core/resharding/nvshmem_copy_service/validation.py
new file mode 100644
index 00000000000..fafb1321024
--- /dev/null
+++ b/megatron/core/resharding/nvshmem_copy_service/validation.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Validation utilities for GPU-to-GPU communication.
+
+Provides deterministic data generation and validation for verifying
+
+correctness of communication operations."""
+
+from dataclasses import dataclass
+from typing import List
+
+import torch
+
+from .logger import PELogger
+
+
+@dataclass
+class ValidationResult:
+    """Result of validating a single task."""
+
+    task_id: int
+    size: int
+    passed: bool
+    src_pe: int = -1
+    mismatches: int = 0
+    first_mismatch_idx: int = -1
+    first_mismatch_expected: int = 0
+    first_mismatch_actual: int = 0
+    # Scheduling info - which batch/iteration this task was supposed to be handled
+    batch_index: int = -1
+    iteration: int = -1
+
+
+@dataclass
+class ValidationSummary:
+    """Summary of validation across all tasks."""
+
+    total_tasks: int
+    passed_tasks: int
+    failed_tasks: int
+    total_bytes: int
+    results: List[ValidationResult]
+
+    @property
+    def all_passed(self) -> bool:
+        """Check if all validated tasks passed."""
+        return self.failed_tasks == 0
+
+
+def generate_deterministic_data(task_id: int, size: int, device: str = "cuda") -> torch.Tensor:
+    """
+    Generate deterministic data pattern for a task.
+
+    Pattern: Each byte = (task_id * 31 + position) % 256
+    This creates a unique pattern per task that varies along the data.
+
+    Args:
+        task_id: Unique task identifier
+        size: Number of bytes to generate
+        device: Device to create tensor on ('cuda' or 'cpu')
+
+    Returns:
+        torch.Tensor of uint8 with deterministic pattern
+    """
+    positions = torch.arange(size, dtype=torch.int64, device=device)
+    pattern = ((task_id * 31 + positions) % 256).to(torch.uint8)
+    return pattern
+
+
+def validate_received_data(
+    task_id: int, tensor: torch.Tensor, size: int, src_pe: int = -1
+) -> ValidationResult:
+    """
+    Validate received data against expected deterministic pattern.
+
+    Args:
+        task_id: Task identifier to regenerate expected data
+        tensor: Received tensor to validate
+        size: Number of bytes to validate
+
+    Returns:
+        ValidationResult with pass/fail status and details
+    """
+    # Get the data slice to validate
+    recv_data = tensor[:size]
+
+    # Generate expected pattern on same device
+    expected = generate_deterministic_data(task_id, size, device=recv_data.device.type)
+
+    # Compare
+    mismatches_mask = recv_data != expected
+    num_mismatches = mismatches_mask.sum().item()
+
+    result = ValidationResult(
+        task_id=task_id,
+        size=size,
+        passed=(num_mismatches == 0),
+        src_pe=src_pe,
+        mismatches=num_mismatches,
+    )
+
+    if num_mismatches > 0:
+        # Find first mismatch for debugging
+        first_idx = mismatches_mask.nonzero(as_tuple=True)[0][0].item()
+        result.first_mismatch_idx = first_idx
+        result.first_mismatch_expected = expected[first_idx].item()
+        result.first_mismatch_actual = recv_data[first_idx].item()
+
+    return result
+
+
+def log_validation_summary(summary: ValidationSummary) -> None:
+    """Log validation summary."""
+    if summary.all_passed:
+        PELogger.info(
+            "Validation PASSED: %d/%d tasks, %d bytes validated",
+            summary.passed_tasks,
+            summary.total_tasks,
+            summary.total_bytes,
+        )
+    else:
+        PELogger.error(
+            "Validation FAILED: %d/%d tasks passed, %d failed",
+            summary.passed_tasks,
+            summary.total_tasks,
+            summary.failed_tasks,
+        )
+
+        # Group failures by source PE
+        failures_by_src = {}
+        for r in summary.results:
+            if not r.passed:
+                failures_by_src.setdefault(r.src_pe, []).append(r)
+
+        PELogger.error("  Failures by source PE:")
+        for src_pe in sorted(failures_by_src.keys()):
+            failed_tasks = failures_by_src[src_pe]
+            task_ids = [r.task_id for r in failed_tasks]
+            PELogger.error(
+                "    PE %d: %d failed tasks: %s",
+                src_pe,
+                len(failed_tasks),
+                task_ids[:15] if len(task_ids) <= 15 else task_ids[:15] + ["..."],
+            )
diff --git a/megatron/core/resharding/planner.py b/megatron/core/resharding/planner.py
new file mode 100644
index 00000000000..31045fbfc01
--- /dev/null
+++ b/megatron/core/resharding/planner.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+import logging
+import math
+
+import torch
+import torch.distributed as dist
+
+from .utils import (
+    ParameterMetadata,
+    ReshardPlan,
+    ShardingDescriptor,
+    TransferOp,
+    _build_layer_module_prefix_map,
+    _get_rank_in_group,
+    extract_param_metadata,
+    select_src_metadata_balanced,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _build_descriptors_for_param(
+    src_metadata: ParameterMetadata, dst_metadata: ParameterMetadata
+) -> list[ShardingDescriptor]:
+    """Construct sharding descriptors (currently TP) for this parameter based on actual layout.
+    Guard TP descriptor with size conservation so we don't mis-classify replicated tensors.
+    """
+    descriptors: list[ShardingDescriptor] = []
+
+    # TP descriptor: allow when either side participates in TP
+    if src_metadata.is_tp or dst_metadata.is_tp:
+        # Prefer destination partition_dim, else source
+        tp_dim = dst_metadata.partition_dim if dst_metadata.is_tp else src_metadata.partition_dim
+        src_tp_ranks = src_metadata.tensor_parallel_group_ranks
+        dst_tp_ranks = dst_metadata.tensor_parallel_group_ranks
+        if src_tp_ranks is None or dst_tp_ranks is None:
+            # Not enough context to build TP descriptor
+            return descriptors
+        src_stride = src_metadata.partition_stride if src_metadata.is_tp else 1
+        dst_stride = dst_metadata.partition_stride if dst_metadata.is_tp else 1
+
+        # Size conservation check on partition dim
+        src_world = len(src_tp_ranks)
+        dst_world = len(dst_tp_ranks)
+        src_local = src_metadata.shape[tp_dim]
+        dst_local = dst_metadata.shape[tp_dim]
+        if src_world * src_local != dst_world * dst_local:
+            raise RuntimeError(
+                f"Cannot build TP descriptor for {dst_metadata.name} dim{tp_dim}: "
+                f"src_world*src_local={src_world}*{src_local} != {dst_world}*{dst_local}. "
+                "This usually means the param is marked TP but is effectively replicated on that "
+                "dim or partition_dim/metadata is inconsistent between source and destination."
+            )
+
+        descriptors.append(
+            ShardingDescriptor(
+                name="tp",
+                dim=tp_dim,
+                src_stride=src_stride,
+                dst_stride=dst_stride,
+                src_dim_ranks=src_tp_ranks,
+                dst_dim_ranks=dst_tp_ranks,
+            )
+        )
+    return descriptors
+
+
+def _plan_multi_dim_lcm(
+    param_name: str,
+    src_metadata: ParameterMetadata,
+    dst_metadata: ParameterMetadata,
+    descriptors: list[ShardingDescriptor],
+    my_global_rank: int,
+) -> list[tuple[int, tuple[slice, ...], tuple[slice, ...]]]:
+    """
+    TP-only planner using LCM tiling to support strides on source/destination.
+    - Requires exactly one TP descriptor
+    - Supports arbitrary integer strides (contiguous micro-tiles)
+    """
+    if not descriptors:
+        return []
+    if len(descriptors) != 1:
+        raise NotImplementedError(
+            f"{param_name}: _plan_multi_dim_lcm supports TP-only (one descriptor)"
+        )
+    if descriptors[0].name != "tp":
+        raise NotImplementedError(f"{param_name}: _plan_multi_dim_lcm expects TP descriptor")
+    d = descriptors[0]
+    if my_global_rank not in d.dst_dim_ranks:
+        return []
+    src_shape = tuple(src_metadata.shape)
+    dst_shape = tuple(dst_metadata.shape)
+    dim = d.dim
+    src_world = len(d.src_dim_ranks)
+    dst_world = len(d.dst_dim_ranks)
+    src_local = src_shape[dim]
+    dst_local = dst_shape[dim]
+    if src_world * src_local != dst_world * dst_local:
+        raise RuntimeError(
+            f"{param_name}: size mismatch on TP dim{dim} "
+            f"(src_world={src_world}, src_local={src_local}, "
+            f"dst_world={dst_world}, dst_local={dst_local})"
+        )
+    # LCM tiling with strides
+    Ns = src_world * max(1, d.src_stride)
+    Nd = dst_world * max(1, d.dst_stride)
+    full_len = dst_local * dst_world
+    g = math.gcd(Ns, Nd)
+    L = (Ns // g) * Nd
+    if full_len % L != 0:
+        raise RuntimeError(
+            f"{param_name}: TP dim{dim} full_len {full_len} not divisible by LCM {L} "
+            f"(Ns={Ns}, Nd={Nd})"
+        )
+    unit = full_len // L  # micro-tile length
+    cps = L // Ns  # micro-tiles per source segment
+    cpd = L // Nd  # micro-tiles per destination segment
+    seg_src = cps * unit  # contiguous length per source segment
+    seg_dst = cpd * unit  # contiguous length per destination segment
+    dst_local_rank = _get_rank_in_group(my_global_rank, d.dst_dim_ranks)
+    ops: list[tuple[int, tuple[slice, ...], tuple[slice, ...]]] = []
+    # Sweep destination segments owned by this rank (handle destination stride)
+    for k in range(max(1, d.dst_stride)):
+        g_dst_seg = dst_local_rank + k * dst_world
+        # Within this segment, enumerate the cpd micro-tiles
+        for off in range(cpd):
+            g_micro = g_dst_seg * cpd + off
+            s_idx = g_micro // cps
+            in_seg = g_micro % cps
+            src_owner_in_dim = s_idx % src_world
+            src_global_rank = d.src_dim_ranks[src_owner_in_dim]
+            src_local_seg_idx = s_idx // src_world
+            src_start = src_local_seg_idx * seg_src + in_seg * unit
+            dst_start = k * seg_dst + off * unit
+            # Build full N-D slices
+            src_slice = [slice(None)] * len(src_shape)
+            dst_slice = [slice(None)] * len(dst_shape)
+            src_slice[dim] = slice(src_start, src_start + unit)
+            dst_slice[dim] = slice(dst_start, dst_start + unit)
+            ops.append((src_global_rank, tuple(src_slice), tuple(dst_slice)))
+
+    # Stable order by destination offset
+    def dst_key(op):
+        _, _, dsl = op
+        s = dsl[dim]
+        return s.start if isinstance(s, slice) else 0
+
+    ops.sort(key=dst_key)
+    return ops
+
+
+def _finalize_dp_transfers(
+    param_name: str,
+    src_metadata: ParameterMetadata,
+    dst_metadata: ParameterMetadata,
+    my_global_rank: int,
+) -> list[tuple[int, tuple[slice, ...], tuple[slice, ...]]]:
+    """Return receiver-side transfer for a parameter that is not TP-sharded.
+
+    This is reached when we cannot build a TP sharding descriptor for the parameter
+    (i.e., it is effectively replicated with respect to sharding).  We use this when the
+    destination and source mode have no TP or the parameter is replicted on all ranks
+    such as layernorm. If the source and destination DP groups match, we return a local
+    full-tensor copy; otherwise we pick a source rank from the source DP group in a
+    deterministic round-robin manner based on the receiver's global rank for better load
+    distribution.
+    """
+    dst_dp_ranks = dst_metadata.data_parallel_group_ranks
+    src_dp_ranks = src_metadata.data_parallel_group_ranks
+    if my_global_rank not in dst_dp_ranks:
+        return []
+
+    dst_shape = dst_metadata.shape
+
+    # Same DP layout - local copy
+    if src_dp_ranks == dst_dp_ranks:
+        full_slice = tuple(slice(None) for _ in range(len(dst_shape)))
+        return [(my_global_rank, full_slice, full_slice)]
+
+    # Different DP groups - use round-robin based on destination global rank for
+    # better load balancing across source ranks. This ensures that destination
+    # ranks are distributed across source ranks even when they have the same
+    # position within their respective DP groups.
+    src_global_rank = src_dp_ranks[my_global_rank % len(src_dp_ranks)]
+    full_slice = tuple(slice(None) for _ in range(len(dst_shape)))
+    return [(src_global_rank, full_slice, full_slice)]
+
+
+def _determine_source_ranks_for_dst_param(
+    param_name: str,
+    src_metadata: ParameterMetadata,
+    dst_metadata: ParameterMetadata,
+    my_global_rank: int,
+) -> list[tuple[int, tuple[slice, ...], tuple[slice, ...]]]:
+    """Route to dimension-specific planner based on parameter sharding type."""
+
+    # Regular TP/DP planning with EP-resolved metadata
+    descriptors = _build_descriptors_for_param(src_metadata=src_metadata, dst_metadata=dst_metadata)
+    if descriptors:
+        return _plan_multi_dim_lcm(
+            param_name=param_name,
+            src_metadata=src_metadata,
+            dst_metadata=dst_metadata,
+            descriptors=descriptors,
+            my_global_rank=my_global_rank,
+        )
+    # DP / replicated fallback
+    return _finalize_dp_transfers(param_name, src_metadata, dst_metadata, my_global_rank)
+
+
+def build_centralized_reshard_plan(
+    src_module: torch.nn.Module, dst_module: torch.nn.Module, num_experts: int = None
+) -> ReshardPlan:
+    """
+    Centralized planning: Rank 0 builds complete plan for all ranks, then scatters.
+    """
+    my_global_rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    # Get process groups
+    src_pg = getattr(src_module, "pg_collection", None)
+    dst_pg = getattr(dst_module, "pg_collection", None)
+    if src_pg is None or dst_pg is None:
+        raise ValueError("Both modules must have pg_collection")
+
+    # Gather param metadata from all ranks
+    my_src_params = {name: p for name, p in src_module.named_parameters(recurse=True)}
+    my_dst_params = {name: p for name, p in dst_module.named_parameters(recurse=True)}
+
+    # Build PP layer prefix maps to be used for parameter name rewriting
+    src_layer_prefix_map = _build_layer_module_prefix_map(src_module)
+    dst_layer_prefix_map = _build_layer_module_prefix_map(dst_module)
+
+    my_src_metadata = [
+        extract_param_metadata(
+            p,
+            name,
+            my_global_rank,
+            src_pg,
+            num_experts=num_experts,
+            layer_module_prefix_map=src_layer_prefix_map,
+        )
+        for name, p in my_src_params.items()
+    ]
+    my_dst_metadata = [
+        extract_param_metadata(
+            p,
+            name,
+            my_global_rank,
+            dst_pg,
+            num_experts=num_experts,
+            layer_module_prefix_map=dst_layer_prefix_map,
+        )
+        for name, p in my_dst_params.items()
+    ]
+
+    all_src_metadata_by_rank = [None] * world_size
+    all_dst_metadata_by_rank = [None] * world_size
+    dist.all_gather_object(all_src_metadata_by_rank, my_src_metadata)
+    dist.all_gather_object(all_dst_metadata_by_rank, my_dst_metadata)
+
+    # Parameter to metadata maps keyed by resolved_name
+    src_param_metadata_by_rank = {}
+    dst_param_metadata_by_rank = {}
+    src_param_metadata: dict[str, list[ParameterMetadata]] = {}
+
+    for rank_id, rank_metadata_list in enumerate(all_src_metadata_by_rank):
+        src_param_metadata_by_rank[rank_id] = {m.resolved_name: m for m in rank_metadata_list}
+    for rank_id, rank_metadata_list in enumerate(all_dst_metadata_by_rank):
+        dst_param_metadata_by_rank[rank_id] = {m.resolved_name: m for m in rank_metadata_list}
+    for rank_metadata_list in all_src_metadata_by_rank:
+        for metadata in rank_metadata_list:
+            key = metadata.resolved_name
+            if key not in src_param_metadata:
+                src_param_metadata[key] = []
+            src_param_metadata[key].append(metadata)
+
+    # Build the plan on global rank 0 and broadcast to all ranks
+    if my_global_rank == 0:
+        plans_for_all_ranks = {r: ReshardPlan([], []) for r in range(world_size)}
+        # Global monotonically increasing ID for non-local transfers.
+        # This is shared between the corresponding send/recv ops so that
+        # NVSHMEM can build schedule.
+        next_task_id = 0
+
+        # Pipeline-parallel (PP) "mapping" is handled implicitly.
+        # Each rank contributes metadata only for the parameters it actually owns
+        # (i.e., the module partitioning for its PP stage). When PP sizes differ
+        # between source and destination, we don't compute an explicit stage-to-stage
+        # mapping here; instead, we iterate destination ranks and plan copies for the
+        # parameters present on those ranks. Any source rank that has the same logical
+        # parameter (matched by resolved_name) can serve as a sender (with DP balancing),
+        # and TP slicing is applied when applicable.
+        for dst_rank in range(world_size):
+            dst_rank_params = dst_param_metadata_by_rank.get(dst_rank, {})
+            for resolved_name, dst_metadata in dst_rank_params.items():
+                src_meta_list = src_param_metadata.get(resolved_name)
+                if not src_meta_list:
+                    raise RuntimeError(
+                        f"Destination parameter '{resolved_name}' on rank {dst_rank} "
+                        "not found in source model."
+                    )
+                # Choose a representative source metadata with DP round-robin balancing
+                src_metadata = select_src_metadata_balanced(src_meta_list, dst_metadata, dst_rank)
+                sources = _determine_source_ranks_for_dst_param(
+                    resolved_name, src_metadata, dst_metadata, dst_rank
+                )
+                for src_rank, src_slice, dst_slice in sources:
+                    task_id = next_task_id
+                    next_task_id += 1
+
+                    plans_for_all_ranks[dst_rank].recv_ops.append(
+                        TransferOp(
+                            param_name=dst_metadata.name,
+                            peer_rank=src_rank,
+                            is_send=False,
+                            my_slice=dst_slice,
+                            peer_slice=src_slice,
+                            task_id=task_id,
+                        )
+                    )
+                    plans_for_all_ranks[src_rank].send_ops.append(
+                        TransferOp(
+                            param_name=src_metadata.name,
+                            peer_rank=dst_rank,
+                            is_send=True,
+                            my_slice=src_slice,
+                            peer_slice=dst_slice,
+                            task_id=task_id,
+                        )
+                    )
+        plans_list = [plans_for_all_ranks[r] for r in range(world_size)]
+    else:
+        plans_list = [None] * world_size
+    torch.distributed.broadcast_object_list(plans_list, src=0)
+    my_plan = plans_list[my_global_rank]
+
+    logger.info(
+        f"Rank {my_global_rank}: Received plan - {len(my_plan.recv_ops)} recvs, "
+        f"{len(my_plan.send_ops)} sends"
+    )
+
+    return my_plan
diff --git a/megatron/core/resharding/refit.py b/megatron/core/resharding/refit.py
new file mode 100644
index 00000000000..5461b8d3900
--- /dev/null
+++ b/megatron/core/resharding/refit.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+"""
+High-level refit/reshard orchestration:
+- swap_model_weights: public API; accepts a backend name or CopyService and delegates.
+- reshard_model_weights: transport-agnostic core; builds/caches plan and executes.
+"""
+
+from typing import Any, Literal, Optional, Union
+
+from megatron.core import parallel_state
+from megatron.core.models.common.language_module.language_module import LanguageModule
+from megatron.core.utils import unwrap_model
+
+from . import build_centralized_reshard_plan, execute_reshard_plan
+from .copy_services.base import CopyService
+from .copy_services.gloo_copy_service import GlooCopyService
+from .copy_services.nccl_copy_service import NCCLCopyService
+from .copy_services.nvshmem_copy_service import NVSHMEMCopyService
+
+# Supported refit backend names
+RefitBackendName = Literal["nccl", "gloo", "nvshmem"]
+
+# Module-level cache for refit services to avoid repeated allocations
+_service_cache: dict[str, CopyService] = {}
+
+
+def get_or_create_service(backend: RefitBackendName) -> CopyService:
+    """Get or create a cached CopyService instance for the given backend.
+
+    This avoids expensive repeated allocations (especially for NVSHMEM buffers)
+    when swap_model_weights is called multiple times with the same backend.
+    """
+    if backend in _service_cache:
+        return _service_cache[backend]
+
+    if backend == "nccl":
+        service = NCCLCopyService()
+    elif backend == "gloo":
+        service = GlooCopyService()
+    elif backend == "nvshmem":
+        service = NVSHMEMCopyService()
+    else:
+        raise ValueError(f"Unknown backend '{backend}'")
+
+    _service_cache[backend] = service
+    return service
+
+
+def clear_service_cache():
+    """Clear the cached refit services.
+
+    Call this if you need to invalidate the cache, for example when
+    reinitializing distributed state.
+    """
+    global _service_cache
+    _service_cache.clear()
+
+
+def swap_model_weights(
+    src_model: LanguageModule,
+    target_model: LanguageModule,
+    refit_method: Union[RefitBackendName, CopyService],
+):
+    """
+    Orchestrate weight swap/refit.
+    - refit_method can be:
+        * a string backend name (one of the supported refit backends), or
+        * a CopyService instance.
+    """
+    if isinstance(refit_method, CopyService):
+        service = refit_method
+        reshard_model_weights(src_model, target_model, service=service)
+    elif isinstance(refit_method, str):
+        service = get_or_create_service(refit_method)
+        reshard_model_weights(src_model, target_model, service=service)
+    else:
+        raise TypeError("refit_method must be a str backend name or a CopyService instance")
+
+
+def reshard_model_weights(
+    src_model: LanguageModule, target_model: LanguageModule, service: CopyService
+):
+    """Reshard and copy model weights from ``src_model`` to ``target_model`` using ``service``."""
+    # Handle list-wrapped modules used throughout training utils
+    src_lm = src_model[0] if isinstance(src_model, (list, tuple)) else src_model
+    tgt_lm = target_model[0] if isinstance(target_model, (list, tuple)) else target_model
+
+    num_experts = src_lm.config.num_moe_experts
+
+    # Unwrap to get owning modules (with parameters and pg_collection)
+    src_core = unwrap_model(src_lm)
+    tgt_core = unwrap_model(tgt_lm)
+
+    # Ensure pg_collection exists
+    if not hasattr(src_core, "pg_collection") or src_core.pg_collection is None:
+        raise RuntimeError("Source model missing pg_collection required for NCCL reshard")
+    if not hasattr(tgt_core, "pg_collection") or tgt_core.pg_collection is None:
+        raise RuntimeError("Target model missing pg_collection required for NCCL reshard")
+
+    # Fill missing DP group on the source using Megatron's parallel state if not provided
+    if getattr(src_core.pg_collection, "dp", None) is None:
+        src_core.pg_collection.dp = parallel_state.get_data_parallel_group()
+
+    # caching plan for reuse
+    cached_plan: Optional[Any] = getattr(tgt_core, "_cached_reshard_plan", None)
+    if cached_plan is None:
+        plan = build_centralized_reshard_plan(src_core, tgt_core, num_experts=num_experts)
+        setattr(tgt_core, "_cached_reshard_plan", plan)
+    else:
+        plan = cached_plan
+
+    execute_reshard_plan(plan, src_core, tgt_core, service=service)
diff --git a/megatron/core/resharding/utils.py b/megatron/core/resharding/utils.py
new file mode 100644
index 00000000000..7fc9e9ad3a7
--- /dev/null
+++ b/megatron/core/resharding/utils.py
@@ -0,0 +1,361 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Mapping, Optional
+
+import torch
+import torch.distributed as dist
+
+# -----------------------------------------------------------------------------
+# Dataclasses used by the planner
+# -----------------------------------------------------------------------------
+
+
+@dataclass
+class TransferOp:
+    """Single logical send/recv operation used in a reshard plan."""
+
+    param_name: str
+    peer_rank: int  # Who to send to / receive from
+    is_send: bool  # True=send, False=recv
+
+    # Slice information (for when we execute the plan)
+    my_slice: tuple[slice, ...]  # My tensor slice
+    peer_slice: tuple[slice, ...]  # Peer's tensor slice (for reference)
+
+    # Optional global task identifier for advanced backends (e.g., NVSHMEM)
+    # When present, this ID is shared between the matching send/recv ops
+    # across ranks and can be used to build richer communication schedules.
+    task_id: int | None = None
+
+
+@dataclass
+class ParameterMetadata:
+    """Metadata for a parameter (used when param is on different rank)."""
+
+    name: str
+    shape: tuple[int, ...]
+    dtype: torch.dtype
+    element_size: int
+
+    # TP sharding info
+    is_tp: bool = False
+    partition_dim: int = 0
+    partition_stride: int = 1
+
+    # EP sharding info (fused/grouped MoE)
+    is_ep: bool = False
+    num_experts: Optional[int] = None
+
+    # Which rank owns this param
+    owner_rank: int = -1
+
+    tensor_parallel_group_ranks: list[int] | None = None
+    expert_parallel_group_ranks: list[int] | None = None
+    data_parallel_group_ranks: list[int] | None = None
+    pipeline_parallel_group_ranks: list[int] | None = None
+
+    # Canonical name for matching parameters across models with different EP/PP configurations.
+    #
+    # - EP (expert parallel): each rank owns a subset of experts with local indices
+    #   (e.g., rank 1 has "weight0" locally, but it's actually global expert 4). The raw param
+    #   name can't be used to match across source/destination because the same local name refers
+    #   to different global experts on different ranks. `resolved_name` remaps local expert indices
+    #   to global indices (e.g., "layer.experts.weight0" on rank 1 → "layer.experts.weight4").
+    #
+    # - PP (pipeline parallel): transformer blocks are often named with rank-local indices
+    #   (e.g., PP stage 1 may have "decoder.layers.0" even though that corresponds to global
+    #   layer 16). For reshard/refit across different PP partitionings (e.g., PP2 ↔ PP1),
+    #   `resolved_name` may be further canonicalized to global layer indices.
+    #
+    # For non-EP and non-PP cases, resolved_name == name.
+    resolved_name: Optional[str] = None
+    # The global expert index this parameter belongs to (e.g., 4 for global expert 4).
+    # Computed alongside resolved_name; None for non-EP or fused expert tensors.
+    global_expert_index: Optional[int] = None
+
+
+@dataclass
+class ShardingDescriptor:
+    """Descriptor for a sharded dimension for a parameter."""
+
+    name: str  # "tp" | "ep" | custom label
+    dim: int
+    src_stride: int
+    dst_stride: int
+    src_dim_ranks: list[int]
+    dst_dim_ranks: list[int]
+
+
+@dataclass
+class ReshardPlan:
+    """Reshard plan - operations for this rank."""
+
+    send_ops: list[TransferOp]
+    recv_ops: list[TransferOp]
+
+    def __str__(self):
+        return f"ReshardPlan(sends={len(self.send_ops)}, recvs={len(self.recv_ops)})"
+
+
+# -----------------------------------------------------------------------------
+# EP + Metadata helpers
+# -----------------------------------------------------------------------------
+
+
+def _get_rank_in_group(global_rank: int, group_ranks: list[int]) -> int:
+    try:
+        return group_ranks.index(global_rank)
+    except ValueError:
+        raise ValueError(
+            f"Rank {global_rank} not found in process group {group_ranks}. "
+            f"This likely indicates a configuration mismatch."
+        )
+
+
+def _detect_expert_index_from_param_name(param_name: str) -> Optional[int]:
+    """Extract expert index from parameter name for TEGroupedMLP per-expert tensors."""
+    for part in param_name.split('.'):
+        if (
+            part.startswith('weight')
+            and len(part) > len('weight')
+            and part[len('weight') :].isdigit()
+        ):
+            return int(part[len('weight') :])
+        if part.startswith('bias') and len(part) > len('bias') and part[len('bias') :].isdigit():
+            return int(part[len('bias') :])
+    return None
+
+
+def assign_ep_resolved_name_inplace(
+    meta: ParameterMetadata, *, base_name: str | None = None
+) -> None:
+    """
+    EP-only canonicalization for per-expert parameters.
+
+    Under Expert Parallelism (EP), each rank owns a subset of experts with local indices
+    (e.g., rank 1 has "weight0" locally, but it's actually global expert 4). The raw param
+    name can't be used to match across source/destination because the same local name refers
+    to different global experts on different ranks. This function remaps local expert indices
+    to global indices in `resolved_name` and sets `global_expert_index`.
+
+    Effects:
+    - Sets meta.resolved_name (defaults to base_name/meta.name for non-EP).
+    - Sets meta.global_expert_index for per-expert parameters; otherwise leaves it as None.
+    """
+    base = meta.name if base_name is None else base_name
+    meta.resolved_name = base
+    meta.global_expert_index = None
+    if not meta.is_ep:
+        return
+
+    local_idx = _detect_expert_index_from_param_name(base)
+    if local_idx is None:
+        # Fused experts tensor: leave name as-is; TP planner will handle slicing
+        return
+    ep_group = meta.expert_parallel_group_ranks
+    ep_size = len(ep_group)
+    ep_local_rank = ep_group.index(meta.owner_rank)
+    experts_per_rank = meta.num_experts // ep_size
+    global_idx = ep_local_rank * experts_per_rank + local_idx
+    meta.global_expert_index = global_idx
+
+    # Replace trailing integer in "weightK"/"biasK" with global_idx
+    parts = base.split('.')
+    new_parts = []
+    for p in parts:
+        if p.startswith('weight') and len(p) > len('weight') and p[len('weight') :].isdigit():
+            new_parts.append('weight' + str(global_idx))
+        elif p.startswith('bias') and len(p) > len('bias') and p[len('bias') :].isdigit():
+            new_parts.append('bias' + str(global_idx))
+        else:
+            new_parts.append(p)
+    meta.resolved_name = '.'.join(new_parts)
+
+
+def assign_resolved_name_inplace(
+    meta: ParameterMetadata,
+    *,
+    layer_module_prefix_map: Mapping[str, str] | None = None,
+    base_name: str | None = None,
+) -> None:
+    """Set meta.resolved_name so the planner can match the same weights across models.
+
+    It rewrites PP layer indices to global layer indices (when layer_module_prefix_map is
+    provided) and
+    rewrites EP per-expert indices (weightK/biasK) to global expert indices.
+    """
+    name = meta.name if base_name is None else base_name
+    if layer_module_prefix_map:
+        name = _resolve_global_layer_number_in_name(name, layer_module_prefix_map)
+    assign_ep_resolved_name_inplace(meta, base_name=name)
+
+
+def _build_layer_module_prefix_map(module: torch.nn.Module) -> dict[str, str]:
+    """Build a mapping local_module_prefix -> global_module_prefix for PP layer modules.
+
+    Megatron assigns a global, 1-indexed layer_number to each transformer layer module at
+    construction time (including PP/VPP/layout offsets). We convert that to the 0-indexed naming
+    convention used in parameter names and build a map such as:
+
+    - "decoder.layers.0" → "decoder.layers.16"  (if layer_number == 17)
+    """
+    prefix_map: dict[str, str] = {}
+    for module_name, submodule in module.named_modules():
+        if not module_name:
+            continue
+        layer_number = getattr(submodule, 'layer_number', None)
+        if not isinstance(layer_number, int):
+            continue
+        parts = module_name.split('.')
+        if not parts[-1].isdigit():
+            continue
+        parts[-1] = str(layer_number - 1)  # convert 1-indexed to 0-indexed
+        prefix_map[module_name] = '.'.join(parts)
+    return prefix_map
+
+
+def _resolve_global_layer_number_in_name(
+    name: str, layer_module_prefix_map: Mapping[str, str]
+) -> str:
+    """Rewrite a parameter name to use global layer indices (PP-aware).
+
+    Given a parameter name like decoder.layers.0.self_attention..., this function rewrites
+    the decoder.layers.0 prefix to the corresponding global layer index using the owning
+    layer module's layer_number.
+
+    Implementation:
+    - Build a {local_prefix -> global_prefix} map once (outside the per-parameter loop).
+    - Perform a longest-prefix match replacement so we only rewrite the module path portion.
+    """
+    if not layer_module_prefix_map:
+        return name
+
+    parts = name.split('.')
+    for i in range(len(parts), 0, -1):
+        prefix = '.'.join(parts[:i])
+        mapped = layer_module_prefix_map.get(prefix)
+        if mapped is None:
+            continue
+        rest = '.'.join(parts[i:])
+        return mapped if not rest else mapped + '.' + rest
+    return name
+
+
+def extract_param_metadata(
+    param: torch.nn.Parameter,
+    param_name: str,
+    owner_rank: int,
+    pg_collection,
+    num_experts: Optional[int] = None,
+    layer_module_prefix_map: Mapping[str, str] | None = None,
+) -> ParameterMetadata:
+    """Extract metadata from a parameter for cross-rank communication."""
+    # TP flags from attributes (set by Megatron linear layers)
+    is_tp = bool(getattr(param, 'tensor_model_parallel', False))
+    partition_dim = int(getattr(param, 'partition_dim', 0))
+    partition_stride = int(getattr(param, 'partition_stride', 1))
+
+    # SwiGLU/GLU compatibility: For gated linear units, fc1 stores interleaved [gate, up] portions
+    # and requires partition_stride=2 for correct resharding. New models set this at construction
+    # time (MLP sets partition_stride=2 on weight when gated_linear_unit=True). For legacy models
+    # where stride=1 was left as default, we apply stride=2 as a fallback for fc1 parameters.
+    # This is safe because: (1) gated models need it, and (2) non-gated models have smaller fc1
+    # and stride doesn't affect single-block transfers.
+    # if 'mlp.linear_fc1' in param_name and is_tp and partition_stride == 1:
+    #     partition_stride = 2
+
+    # EP detection: Megatron convention - expert params are not allreduced
+    is_ep = not bool(getattr(param, 'allreduce', True))
+
+    tensor_parallel_group_ranks: list[int] | None = None
+    expert_parallel_group_ranks: list[int] | None = None
+    data_parallel_group_ranks: list[int] | None = None
+    pipeline_parallel_group_ranks: list[int] | None = None
+
+    if is_ep:
+        expert_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.ep)
+        # For MoE params, prefer expert TP group when available, else regular TP
+        if is_tp and hasattr(pg_collection, 'expt_tp') and pg_collection.expt_tp is not None:
+            tensor_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.expt_tp)
+        elif is_tp and hasattr(pg_collection, 'tp') and pg_collection.tp is not None:
+            tensor_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.tp)
+        data_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.dp)
+    elif is_tp:
+        # Non-EP: use regular TP group
+        if hasattr(pg_collection, 'tp') and pg_collection.tp is not None:
+            tensor_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.tp)
+        data_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.dp)
+    else:
+        data_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.dp)
+
+    if hasattr(pg_collection, 'pp') and pg_collection.pp is not None:
+        pipeline_parallel_group_ranks = dist.get_process_group_ranks(pg_collection.pp)
+    else:
+        pipeline_parallel_group_ranks = list(range(dist.get_world_size()))
+
+    meta = ParameterMetadata(
+        name=param_name,
+        shape=tuple(param.shape),
+        dtype=param.dtype,
+        element_size=param.element_size(),
+        is_tp=is_tp,
+        partition_dim=partition_dim,
+        partition_stride=partition_stride,
+        is_ep=is_ep,
+        num_experts=num_experts,
+        owner_rank=owner_rank,
+        tensor_parallel_group_ranks=tensor_parallel_group_ranks,
+        expert_parallel_group_ranks=expert_parallel_group_ranks,
+        data_parallel_group_ranks=data_parallel_group_ranks,
+        pipeline_parallel_group_ranks=pipeline_parallel_group_ranks,
+    )
+    assign_resolved_name_inplace(
+        meta, layer_module_prefix_map=layer_module_prefix_map, base_name=param_name
+    )
+
+    return meta
+
+
+def select_src_metadata_balanced(
+    src_meta_list: list[ParameterMetadata], dst_metadata: ParameterMetadata, dst_rank: int
+) -> ParameterMetadata:
+    """Choose a representative source `ParameterMetadata` for a destination rank.
+
+    Multiple source data-parallel (DP) groups may hold the same logical parameter.
+    To avoid always reading from the same group, we:
+      - bucket `src_meta_list` by their DP group (tuple of ranks)
+      - if there is only one bucket, just return the first entry
+      - otherwise, use the destination rank's global rank to select a source
+        DP group in a round-robin fashion, ensuring even distribution of load
+        across all source DP groups.
+    """
+    if not src_meta_list:
+        raise ValueError("src_meta_list must be non-empty")
+
+    # Group source metadata by their DP group layout so we can balance across groups.
+    #   (dp_rank0, dp_rank1, ...) -> [ParameterMetadata for that DP group]
+    grouped_by_dp: dict[tuple[int, ...], list[ParameterMetadata]] = {}
+    for meta in src_meta_list:
+        dp_group = tuple(meta.data_parallel_group_ranks or [])
+        grouped_by_dp.setdefault(dp_group, []).append(meta)
+
+    # Fast path: only one DP layout present; no balancing necessary.
+    if len(grouped_by_dp) == 1:
+        return src_meta_list[0]
+
+    # Use the destination rank's global rank to select a source DP group in a
+    # round-robin fashion. This ensures that even when multiple destination ranks
+    # have the same DP index (e.g., ranks 0,1,2,3 all being at position 0 in their
+    # respective DP groups), they still get distributed across different source
+    # DP groups based on their global rank.
+    sorted_dp_groups = sorted(grouped_by_dp.keys())
+    chosen_group = sorted_dp_groups[dst_rank % len(sorted_dp_groups)]
+
+    # Within the chosen group, any representative metadata works; use the first.
+    return grouped_by_dp[chosen_group][0]
+
+
+logger = logging.getLogger(__name__)
diff --git a/megatron/core/safe_globals.py b/megatron/core/safe_globals.py
index cc5eb8809e8..8bcfe788f60 100755
--- a/megatron/core/safe_globals.py
+++ b/megatron/core/safe_globals.py
@@ -3,6 +3,7 @@
 from argparse import Namespace
 from io import BytesIO
 from pathlib import PosixPath
+from signal import Signals
 from types import SimpleNamespace
 
 import torch
@@ -13,7 +14,7 @@
 from megatron.core.enums import ModelType
 from megatron.core.optimizer import OptimizerConfig
 from megatron.core.rerun_state_machine import RerunDiagnostic, RerunMode, RerunState
-from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 
 SAFE_GLOBALS = [
     SimpleNamespace,
@@ -24,12 +25,14 @@
     UInt32DType,
     Namespace,
     AttnBackend,
+    CudaGraphScope,
     ModelType,
     OptimizerConfig,
     RerunDiagnostic,
     RerunMode,
     RerunState,
     BytesIO,
+    Signals,
 ]
 
 
diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py
new file mode 100644
index 00000000000..70e749724dc
--- /dev/null
+++ b/megatron/core/ssm/gated_delta_net.py
@@ -0,0 +1,664 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, Songlin Yang, Jan Kautz, Ali Hatamizadeh.
+
+# Some of this code was adopted from https://github.com/huggingface/transformers
+# This source code is licensed under the Apache license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, replace
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from megatron.core.dist_checkpointing import ShardedTensor
+from megatron.core.dist_checkpointing.mapping import ReplicaId, ShardedTensorFactory
+from megatron.core.fp8_utils import get_fp8_align_size
+from megatron.core.inference.contexts import BaseInferenceContext
+from megatron.core.jit import jit_fuser
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel import get_cuda_rng_tracker
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
+    make_sharded_tensors_for_checkpoint,
+    sharded_state_dict_default,
+)
+from megatron.core.utils import deprecate_inference_params, nvtx_range_pop, nvtx_range_push
+
+# TODO: Implement GatedDeltaNetContextParallel
+# from .gated_delta_net_context_parallel import GatedDeltaNetContextParallel
+
+try:
+    from fla.modules.l2norm import l2norm
+    from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+
+    HAVE_FLA = True
+except ImportError:
+    chunk_gated_delta_rule = None
+
+    HAVE_FLA = False
+
+try:
+    from causal_conv1d import causal_conv1d_fn
+except ImportError:
+    causal_conv1d_fn = None
+    causal_conv1d_update = None
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GatedDeltaNetSubmodules:
+    """
+    Contains the module specs for the input linear, output norm, and output linear layers.
+    """
+
+    in_proj: Union[ModuleSpec, type] = IdentityOp
+    out_norm: Union[ModuleSpec, type] = IdentityOp
+    out_proj: Union[ModuleSpec, type] = IdentityOp
+
+
+class GatedDeltaNet(MegatronModule):
+    """Gated Delta Net (GDN) layer class
+
+    GDN layer takes input with size [s, b, h]
+    and returns output of the same size.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: GatedDeltaNetSubmodules,
+        layer_number: int = None,
+        bias: bool = False,
+        conv_bias: bool = False,
+        conv_init: Optional[float] = None,
+        use_qk_l2norm: bool = True,
+        A_init_range: Tuple[float, float] = (1, 16),
+        pg_collection: ProcessGroupCollection = None,
+    ):
+        """
+        Args:
+            config: The config of the model.
+            submodules: Contains the module specs for the input and output linear layers.
+            layer_number: The layer number of this GDN layer.
+            bias: Whether to use bias in the linear layers.
+            conv_bias: Whether to use bias in the causal convolution.
+            conv_init: The initialization range for the causal convolution weights.
+            use_qk_l2norm: Whether to use L2 normalization in the kernel of the gated delta rule.
+            A_init_range: The initialization range for the attention weights.
+            pg_collection: The required process groups to use for tensor model parallel and context
+                parallel.
+        """
+
+        if not HAVE_FLA:
+            raise ImportError(
+                "FLA is not installed. Please install it with `pip install flash-linear-attention`."
+            )
+
+        super().__init__(config)
+
+        # Attributes from arguments
+        self.layer_number = layer_number
+        self.bias = bias
+        self.conv_bias = conv_bias
+        self.conv_init = conv_init
+        assert A_init_range[0] >= 0 and A_init_range[1] >= A_init_range[0]
+        self.A_init_range = A_init_range
+        self.use_qk_l2norm = use_qk_l2norm
+        assert pg_collection is not None, "pg_collection must be provided for GatedDeltaNet"
+        self.pg_collection = pg_collection
+        self.tp_size = self.pg_collection.tp.size()
+        self.sp_size = self.tp_size if config.sequence_parallel else 1
+
+        # Attributes from config
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.act_fn = config.activation_func
+        self.activation = self.act_fn.__name__
+        self.conv_kernel_dim = config.linear_conv_kernel_dim
+        self.key_head_dim = config.linear_key_head_dim
+        self.value_head_dim = config.linear_value_head_dim
+        self.num_key_heads = config.linear_num_key_heads
+        self.num_value_heads = config.linear_num_value_heads
+        self.qk_dim = self.key_head_dim * self.num_key_heads
+        self.v_dim = self.value_head_dim * self.num_value_heads
+
+        # Input projection (hidden_states -> q, k, v, gate, beta, alpha)
+        # TODO: for now, output gate is forced for GDN.
+        # We may remove this restriction in the future.
+        self.in_proj_dim = self.qk_dim * 2 + self.v_dim * 2 + self.num_value_heads * 2
+        if self.config.fp8:
+            fp8_align_size = get_fp8_align_size(self.config.fp8_recipe)
+            assert self.in_proj_dim % fp8_align_size == 0, (
+                "For FP8, the innermost dimension of the GDN layer "
+                "input projection output tensor must be a multiple of 16."
+            )
+        self.in_proj = build_module(
+            submodules.in_proj,
+            self.hidden_size,
+            self.in_proj_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=bias,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name="fc1",
+            tp_group=self.pg_collection.tp,
+        )
+
+        # Conv1d for QKV
+        self.conv_dim = self.qk_dim * 2 + self.v_dim
+        self.conv_dim_local_tp = self.conv_dim // self.tp_size
+
+        # weight shape: [conv_dim, 1, d_conv]
+        # bias shape: [conv_dim]
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim_local_tp,
+            out_channels=self.conv_dim_local_tp,
+            bias=conv_bias,
+            kernel_size=self.conv_kernel_dim,
+            groups=self.conv_dim_local_tp,
+            padding=self.conv_kernel_dim - 1,
+            device=torch.cuda.current_device(),
+            dtype=config.params_dtype,
+        )
+        setattr(self.conv1d.weight, "tensor_model_parallel", True)
+        if conv_bias:
+            setattr(self.conv1d.bias, "tensor_model_parallel", True)
+
+        # Time step projection (discretization)
+        self.num_v_heads_local_tp = self.num_value_heads // self.tp_size
+        # dt_bias parameter
+        self.dt_bias = nn.Parameter(
+            torch.empty(
+                self.num_v_heads_local_tp,
+                dtype=config.params_dtype,
+                device=torch.cuda.current_device(),
+            )
+        )
+        setattr(self.dt_bias, "tensor_model_parallel", True)
+        # A_log parameter
+        self.A_log = nn.Parameter(
+            torch.empty(
+                self.num_v_heads_local_tp,
+                dtype=config.params_dtype,
+                device=torch.cuda.current_device(),
+            )
+        )
+        setattr(self.A_log, "tensor_model_parallel", True)
+
+        # Output layernorm before projection
+        self.out_norm = build_module(
+            submodules.out_norm,
+            config=self.config,
+            hidden_size=self.value_head_dim,
+            eps=self.config.layernorm_epsilon,
+        )
+
+        self.out_proj = build_module(
+            submodules.out_proj,
+            self.v_dim,
+            self.hidden_size,
+            config=self.config,
+            init_method=self.config.output_layer_init_method,
+            bias=bias,
+            input_is_parallel=True,
+            skip_bias_add=True,
+            is_expert=False,
+            tp_comm_buffer_name="fc2",
+            tp_group=self.pg_collection.tp,
+        )
+
+        # TODO: support CP
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Reset the parameters."""
+        if self.config.perform_initialization:
+            with get_cuda_rng_tracker().fork():
+                # conv1d.weight
+                if self.conv_init is not None:
+                    nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
+                # dt_bias
+                torch.ones(
+                    self.num_v_heads_local_tp,
+                    out=self.dt_bias.data,
+                    dtype=self.config.params_dtype,
+                    device=torch.cuda.current_device(),
+                )
+                # A_log
+                A = torch.empty(
+                    self.num_v_heads_local_tp,
+                    dtype=self.config.params_dtype,
+                    device=torch.cuda.current_device(),
+                ).uniform_(*self.A_init_range)
+                self.A_log.data.copy_(torch.log(A))
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Tensor,
+        key_value_states: Optional[Tensor] = None,
+        inference_context: Optional[BaseInferenceContext] = None,
+        attention_bias: Optional[Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        sequence_len_offset: Optional[int] = None,
+        *,
+        inference_params: Optional[BaseInferenceContext] = None,
+        **kwargs,
+    ):
+        """
+        Perform a forward pass through the GDN module.
+
+        Args:
+            hidden_states (Tensor): Hidden states.
+            attention_mask (Tensor): Attention mask.
+            key_value_states (Optional[Tensor]): Key/value states (for cross attention).
+            inference_context (Optional[BaseInferenceContext]): Inference context that manages
+                KV cache.
+            attention_bias (Optional[Tensor]): Attention bias.
+            packed_seq_params (Optional[PackedSeqparams]): Parameters used for THD format.
+            sequence_len_offset (Optional[int]): Sequence length offset used for
+                inference CUDA graphs.
+
+        Return:
+            (Tuple[Tensor, Tensor]) GDN output and bias.
+
+        """
+        # TODO: Deal with attention_mask
+
+        inference_context = deprecate_inference_params(inference_context, inference_params)
+
+        seq_len, batch, _ = hidden_states.shape
+        seq_len = seq_len * self.sp_size
+
+        if inference_context is not None:
+            assert (
+                inference_context.is_static_batching()
+            ), "GDN does not currently support dynamic inference batching."
+            assert not self.config.sequence_parallel
+            # TODO: support inference
+            raise NotImplementedError("GDN does not support inference for now.")
+
+        if packed_seq_params is not None:
+            # TODO: support packed sequence
+            raise NotImplementedError("GDN does not support packed sequence for now.")
+
+        # Input projection
+        nvtx_range_push(suffix="in_proj")
+        qkvzba, _ = self.in_proj(hidden_states)
+        nvtx_range_pop(suffix="in_proj")
+
+        # Transpose: s b x --> b s x
+        # From sbhd to bshd format
+        qkvzba = qkvzba.transpose(0, 1)
+
+        # Split, reorder, and reshape the tensor into q, k, v, gate, beta, alpha
+        qkv, gate, beta, alpha = torch.split(
+            qkvzba,
+            [
+                (self.qk_dim * 2 + self.v_dim) // self.tp_size,
+                self.v_dim // self.tp_size,
+                self.num_value_heads // self.tp_size,
+                self.num_value_heads // self.tp_size,
+            ],
+            dim=-1,
+        )
+        gate = gate.reshape(batch, seq_len, -1, self.value_head_dim)
+        beta = beta.reshape(batch, seq_len, -1)
+        alpha = alpha.reshape(batch, seq_len, -1)
+
+        # Convolution on qkv
+        qkv = qkv.transpose(1, 2).contiguous()  # b, s, d -> b, d, s
+        nvtx_range_push(suffix="conv1d")
+        if (causal_conv1d_fn is None) or self.config.deterministic_mode:
+            qkv = self.act_fn(self.conv1d(qkv)[..., :seq_len])
+        else:
+            assert self.activation in ["silu", "swish"]
+            qkv = causal_conv1d_fn(
+                x=qkv,
+                weight=self.conv1d.weight.squeeze(1),  # d, 1, w -> d, w
+                bias=self.conv1d.bias,
+                activation=self.activation,
+            )
+        nvtx_range_pop(suffix="conv1d")
+        # Split qkv into query, key, and value
+        qkv = qkv.transpose(1, 2)  # b, d, s -> b, s, d
+        query, key, value = torch.split(
+            qkv,
+            [self.qk_dim // self.tp_size, self.qk_dim // self.tp_size, self.v_dim // self.tp_size],
+            dim=-1,
+        )
+        query = query.reshape(batch, seq_len, -1, self.key_head_dim)
+        key = key.reshape(batch, seq_len, -1, self.key_head_dim)
+        value = value.reshape(batch, seq_len, -1, self.value_head_dim)
+        # Apply L2 norm to query and key
+        if self.use_qk_l2norm:
+            query = l2norm(query.contiguous())
+            key = l2norm(key.contiguous())
+        if self.num_value_heads // self.num_key_heads > 1:
+            query = query.repeat_interleave(self.num_value_heads // self.num_key_heads, dim=2)
+            key = key.repeat_interleave(self.num_value_heads // self.num_key_heads, dim=2)
+
+        # Make contiguous
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+        gate = gate.contiguous()
+        beta = beta.contiguous()
+        alpha = alpha.contiguous()
+
+        # Calculate g and beta
+        nvtx_range_push(suffix="g_and_beta")
+        g = -self.A_log.exp() * F.softplus(alpha.float() + self.dt_bias)  # In fp32
+        beta = beta.sigmoid()
+        nvtx_range_pop(suffix="g_and_beta")
+
+        nvtx_range_push(suffix="gated_delta_rule")
+        if self.config.deterministic_mode:
+            core_attn_out, last_recurrent_state = torch_chunk_gated_delta_rule(
+                query,
+                key,
+                value,
+                g=g,
+                beta=beta,
+                initial_state=None,
+                output_final_state=False,
+                use_qk_l2norm_in_kernel=False,
+            )
+        else:
+            core_attn_out, last_recurrent_state = chunk_gated_delta_rule(
+                query,
+                key,
+                value,
+                g=g,
+                beta=beta,
+                initial_state=None,
+                output_final_state=False,
+                use_qk_l2norm_in_kernel=False,
+            )
+        nvtx_range_pop(suffix="gated_delta_rule")
+
+        # RMSNorm
+        nvtx_range_push(suffix="gated_norm")
+        norm_out = self._apply_gated_norm(core_attn_out, gate)
+        nvtx_range_pop(suffix="gated_norm")
+
+        # Transpose: b s x --> s b x
+        # From bshd back to sbhd format
+        norm_out = norm_out.reshape(batch, seq_len, -1)
+        norm_out = norm_out.transpose(0, 1).contiguous()
+
+        # Output projection
+        nvtx_range_push(suffix="out_proj")
+        out, out_bias = self.out_proj(norm_out)
+        nvtx_range_pop(suffix="out_proj")
+
+        return out, out_bias
+
+    @jit_fuser
+    def _apply_gated_norm(self, x, gate):
+        # Output Norm
+        x_dtype = x.dtype
+        x = x.reshape(-1, x.shape[-1])
+        y = self.out_norm(x)
+        # Output gate
+        gate = gate.reshape(-1, gate.shape[-1])
+        y = y * self.act_fn(gate.float())
+        y = y.to(x_dtype)
+        return y
+
+    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_group=None):
+        """Provide a sharded state dictionary for distributed checkpointing."""
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
+        sharded_state_dict = {}
+        # Parameters
+        self._save_to_state_dict(sharded_state_dict, "", keep_vars=True)
+        sharded_state_dict = make_sharded_tensors_for_checkpoint(
+            sharded_state_dict,
+            prefix,
+            tensor_parallel_layers_axis_map={
+                "A_log": 0,
+                "dt_bias": 0,
+            },  # parameters sharded across TP
+            sharded_offsets=sharded_offsets,
+            tp_group=(tp_group if tp_group is not None else self.pg_collection.tp),
+            dp_cp_group=metadata['dp_cp_group'],
+        )
+        # Submodules
+        tp_group = tp_group if tp_group is not None else self.pg_collection.tp
+        for name, module in self.named_children():
+            if name == "conv1d":
+                # Add TP sharding for Conv1d
+                module_sd = module.state_dict(prefix="", keep_vars=True)
+                tp_sharding_map = {f"weight": 0}
+                if self.conv_bias:
+                    tp_sharding_map[f"bias"] = 0
+                module_sharded_sd = make_sharded_tensors_for_checkpoint(
+                    module_sd,
+                    f"{prefix}{name}.",
+                    tp_sharding_map,
+                    sharded_offsets,
+                    tp_group=tp_group,
+                    dp_cp_group=metadata['dp_cp_group'],
+                )
+            else:
+                module_sharded_sd = sharded_state_dict_default(
+                    module, f"{prefix}{name}.", sharded_offsets, metadata, tp_group=tp_group
+                )
+
+            sharded_state_dict.update(module_sharded_sd)
+
+        # At this point the TP sharding is correctly defined for each tensor, but some of the
+        # tensors must be additionally split into separate parts
+        in_proj_dim_local_tp = self.in_proj_dim // self.tp_size
+        assert sharded_state_dict[f"{prefix}in_proj.weight"].data.size(0) == in_proj_dim_local_tp, (
+            in_proj_dim_local_tp,
+            sharded_state_dict[f"{prefix}in_proj.weight"],
+        )
+
+        sharded_state_dict[f"{prefix}in_proj.weight"] = _split_tensor_factory(
+            sharded_state_dict[f"{prefix}in_proj.weight"],
+            [
+                self.qk_dim // self.tp_size,
+                self.qk_dim // self.tp_size,
+                self.v_dim // self.tp_size,
+                self.v_dim // self.tp_size,
+                self.num_value_heads // self.tp_size,
+                self.num_value_heads // self.tp_size,
+            ],
+            ["query", "key", "value", "z", "beta", "alpha"],
+            0,
+        )
+
+        conv_layer_name_list = ["conv1d.weight"]
+        assert (
+            sharded_state_dict[f"{prefix}conv1d.weight"].data.size(0) == self.conv_dim_local_tp
+        ), (self.conv_dim_local_tp, sharded_state_dict[f"{prefix}conv1d.weight"])
+        if self.conv_bias:
+            conv_layer_name_list.append("conv1d.bias")
+            assert (
+                sharded_state_dict[f"{prefix}conv1d.bias"].data.size(0) == self.conv_dim_local_tp
+            ), (self.conv_dim_local_tp, sharded_state_dict[f"{prefix}conv1d.bias"])
+        for conv_layer_name in conv_layer_name_list:
+            sharded_state_dict[f"{prefix}{conv_layer_name}"] = _split_tensor_factory(
+                sharded_state_dict[f"{prefix}{conv_layer_name}"],
+                [
+                    self.qk_dim // self.tp_size,
+                    self.qk_dim // self.tp_size,
+                    self.v_dim // self.tp_size,
+                ],
+                ["query", "key", "value"],
+                0,
+            )
+
+        return sharded_state_dict
+
+
+def _split_tensor_factory(
+    orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int
+) -> ShardedTensorFactory:
+    """Builds a factory that splits a given ShardedTensor into several independent chunks."""
+    assert isinstance(orig_sh_ten, ShardedTensor), type(orig_sh_ten)
+    orig_sh_ten_no_data = orig_sh_ten.without_data()  # remove `data` reference
+
+    if sum(split_sections) != orig_sh_ten_no_data.local_shape[split_dim]:
+        raise ValueError(
+            f"Split sections must cover the whole dimension size, "
+            f"got {split_sections=} vs dimensions size "
+            f"{orig_sh_ten_no_data.local_shape[split_dim]}"
+        )
+
+    assert not isinstance(
+        split_sections, int
+    ), "Splitting into predefined section sizes is supported (`split_sections` must be a list)"
+    assert len(split_sections) == len(split_names), (len(split_sections), len(split_names))
+
+    @torch.no_grad()
+    def sh_ten_build_fn(
+        key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice]
+    ):
+        factory_sh_ten = replace(
+            orig_sh_ten_no_data,
+            key=key,
+            data=t,
+            dtype=t.dtype,
+            replica_id=replica_id,
+            flattened_range=flattened_range,
+        )
+
+        chunk_sh_tens = []
+        split_start = 0
+        for split_size, split_name in zip(split_sections, split_names):
+            split_chunks = factory_sh_ten.narrow(split_dim, split_start, split_size)
+            for sh_ten in split_chunks:
+                sh_ten.key = f"{sh_ten.key}.{split_name}"
+            chunk_sh_tens.extend(split_chunks)
+            split_start += split_size
+
+        assert split_start == orig_sh_ten_no_data.local_shape[split_dim], (
+            split_start,
+            orig_sh_ten_no_data.local_shape[split_dim],
+        )
+        assert sum(sh_ten.data.numel() for sh_ten in chunk_sh_tens) == t.numel(), (
+            chunk_sh_tens,
+            t.shape,
+        )
+        return chunk_sh_tens
+
+    @torch.no_grad()
+    def sh_ten_merge_fn(sub_state_dict):
+        return torch.cat(sub_state_dict)
+
+    return ShardedTensorFactory(
+        orig_sh_ten.key, orig_sh_ten.data, sh_ten_build_fn, sh_ten_merge_fn, orig_sh_ten.replica_id
+    )
+
+
+def torch_chunk_gated_delta_rule(
+    query,
+    key,
+    value,
+    g,
+    beta,
+    chunk_size=64,
+    initial_state=None,
+    output_final_state=False,
+    use_qk_l2norm_in_kernel=False,
+):
+    # pylint: disable=line-too-long
+    '''
+    Torch-native implementation of chunked gated delta rule for deterministic mode.
+    Need this because FLA is not deterministic.
+
+    Reference: https://github.com/huggingface/transformers/blob/144c8ce2809a2e21914017652700e1ecb450501e/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L470-L547
+    '''
+
+    initial_dtype = query.dtype
+    if use_qk_l2norm_in_kernel:
+        query = l2norm(query, dim=-1, eps=1e-6)
+        key = l2norm(key, dim=-1, eps=1e-6)
+    query, key, value, beta, g = [
+        x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)
+    ]
+
+    batch_size, num_heads, sequence_length, k_head_dim = key.shape
+    v_head_dim = value.shape[-1]
+    pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size
+    query = F.pad(query, (0, 0, 0, pad_size))
+    key = F.pad(key, (0, 0, 0, pad_size))
+    value = F.pad(value, (0, 0, 0, pad_size))
+    beta = F.pad(beta, (0, pad_size))
+    g = F.pad(g, (0, pad_size))
+    total_sequence_length = sequence_length + pad_size
+    scale = 1 / (query.shape[-1] ** 0.5)
+    query = query * scale
+
+    v_beta = value * beta.unsqueeze(-1)
+    k_beta = key * beta.unsqueeze(-1)
+    # reshape to chunks
+    query, key, value, k_beta, v_beta = [
+        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1])
+        for x in (query, key, value, k_beta, v_beta)
+    ]
+    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
+    mask = torch.triu(
+        torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0
+    )
+
+    # chunk decay
+    g = g.cumsum(dim=-1)
+    decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()
+    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
+    for i in range(1, chunk_size):
+        row = attn[..., i, :i].clone()
+        sub = attn[..., :i, :i].clone()
+        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
+    value = attn @ v_beta
+    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
+    last_recurrent_state = (
+        torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value)
+        if initial_state is None
+        else initial_state.to(value)
+    )
+    core_attn_out = torch.zeros_like(value)
+    mask = torch.triu(
+        torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1
+    )
+
+    # for each chunk
+    for i in range(0, total_sequence_length // chunk_size):
+        q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
+        attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+        v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+        v_new = v_i - v_prime
+        attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+        core_attn_out[:, :, i] = attn_inter + attn @ v_new
+        last_recurrent_state = (
+            last_recurrent_state * g[:, :, i, -1, None, None].exp()
+            + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
+        )
+
+    if not output_final_state:
+        last_recurrent_state = None
+    core_attn_out = core_attn_out.reshape(
+        core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1]
+    )
+    core_attn_out = core_attn_out[:, :, :sequence_length]
+    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
+    return core_attn_out, last_recurrent_state
diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py
index 33333e1d530..ef41faae143 100644
--- a/megatron/core/ssm/mamba_block.py
+++ b/megatron/core/ssm/mamba_block.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 
 # Some of this code was adopted from https://github.com/state-spaces/mamba/
@@ -18,10 +18,12 @@
 from megatron.core.extensions.transformer_engine import TENorm
 from megatron.core.fp8_utils import get_fp8_context
 from megatron.core.inference.contexts import BaseInferenceContext
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols
 from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers
 from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
@@ -93,6 +95,7 @@ def __init__(
         assert pg_collection is not None, "pg_collection must be provided for MambaStack"
 
         self.pp_group = pg_collection.pp
+        self.tp_group = pg_collection.tp
 
         # Required for pipeline parallel schedules
         self.input_tensor = None
@@ -146,7 +149,10 @@ def __init__(
                 elif layer_type == LayerSymbols.MOE:
                     # Transformer layers apply their own pp_layer_offset
                     layer = build_module(
-                        submodules.moe_layer, config=self.config, layer_number=i + 1
+                        submodules.moe_layer,
+                        config=self.config,
+                        layer_number=i + 1,
+                        pg_collection=pg_collection,
                     )
                 else:
                     assert False, "unexpected layer_type"
@@ -204,6 +210,8 @@ def forward(
         rotary_pos_emb: Optional[Tensor] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        padding_mask=None,
     ):
         """
         Forward function of the MambaStack class.
@@ -244,7 +252,7 @@ def forward(
             (
                 (
                     self.config.cuda_graph_impl == "local"
-                    and self.config.cuda_graph_scope != "full_iteration"
+                    and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope
                 )
                 or self.config.flash_decode
             )
@@ -285,12 +293,15 @@ def forward(
                             inference_context=inference_context,
                             rotary_pos_emb=rotary_pos_emb,
                             sequence_len_offset=sequence_len_offset,
+                            packed_seq_params=packed_seq_params,
+                            padding_mask=padding_mask,
                         )
                     else:  # MambaLayer
                         hidden_states = layer(
                             hidden_states=hidden_states,
                             attention_mask=attention_mask,
                             inference_context=inference_context,
+                            packed_seq_params=packed_seq_params,
                         )
 
                 # The attention layer (currently a simplified transformer layer)
@@ -359,7 +370,11 @@ def sharded_state_dict(
             if not module is self.layers:
                 sharded_state_dict.update(
                     sharded_state_dict_default(
-                        module, f'{prefix}{name}.', sharded_offsets, metadata
+                        module,
+                        f'{prefix}{name}.',
+                        sharded_offsets,
+                        metadata,
+                        tp_group=self.tp_group,
                     )
                 )
 
diff --git a/megatron/core/ssm/mamba_context_parallel.py b/megatron/core/ssm/mamba_context_parallel.py
index d59d451fba8..3925f8bd8df 100644
--- a/megatron/core/ssm/mamba_context_parallel.py
+++ b/megatron/core/ssm/mamba_context_parallel.py
@@ -1,10 +1,14 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+from typing import Optional
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.tensor_parallel import all_to_all
+from megatron.core.utils import is_te_min_version
 
 try:
     from einops import repeat
@@ -13,6 +17,16 @@
 except ImportError:
     HAVE_EINOPS = False
 
+try:
+    # Register the TE CUDA kernels
+    import transformer_engine  # pylint: disable=unused-import
+
+    # Alias the PyTorch wrapper so we can call tex.* APIs
+    import transformer_engine_torch as tex
+except ImportError:
+    # TE isn’t installed or the torch wrapper is missing
+    tex = None
+
 
 class MambaContextParallel:
     """
@@ -116,7 +130,9 @@ def __init__(
         # and also `nheads_local_tpcp = nheads_local_tp // cp_size` whilst ngroups_local_tpcp is
         # either 1 or `ngroups_local_tp // cp_size`
 
-    def pre_conv_ssm(self, input_: torch.Tensor) -> torch.Tensor:
+    def pre_conv_ssm(
+        self, input_: torch.Tensor, packed_seq_params: Optional[PackedSeqParams] = None
+    ) -> torch.Tensor:
         """Method to be applied before the convolution and SSM"""
         if self.cp_size == 1:
             return input_
@@ -171,17 +187,20 @@ def pre_conv_ssm(self, input_: torch.Tensor) -> torch.Tensor:
 
         output = torch.cat([z, x, B, C, dt], dim=-1)
         # TODO(duncan): for hybrid models, consider isolating load-balancing to attention layers
-        output = _undo_attention_load_balancing(output, self.cp_size)
+        output = _undo_attention_load_balancing(output, self.cp_size, packed_seq_params)
 
         return output
 
-    def post_conv_ssm(self, input_: torch.Tensor) -> torch.Tensor:
+    def post_conv_ssm(
+        self, input_: torch.Tensor, packed_seq_params: Optional[PackedSeqParams] = None
+    ) -> torch.Tensor:
         """Method to be applied after the convolution and SSM"""
         if self.cp_size == 1:
             return input_
         else:
             return _all_to_all_hp2cp(
-                _redo_attention_load_balancing(input_, self.cp_size), self.cp_group
+                _redo_attention_load_balancing(input_, self.cp_size, packed_seq_params),
+                self.cp_group,
             )
 
     def conv1d(self, input_: torch.Tensor) -> torch.Tensor:
@@ -357,33 +376,78 @@ def _all_to_all_hp2cp(
     return output
 
 
-def _undo_attention_load_balancing(input_: torch.Tensor, cp_size: int) -> torch.Tensor:
+def _undo_attention_load_balancing(
+    input_: torch.Tensor, cp_size: int, packed_seq_params: Optional[PackedSeqParams] = None
+) -> torch.Tensor:
     """
-    Undoes the context parallel attention load balancing
-    For example, for cp_size=3, converts 162534 to 123456 for sequential
-    processing by the convolution and SSM.
+    Undoes the context parallel attention load balancing.
+    For example (non-packed), for cp_size=3, converts 162534 to 123456 for
+    sequential processing by the convolution and SSM.
     """
-    num_chunks_div_2 = cp_size
-    num_chunks = num_chunks_div_2 * 2
-    chunks = torch.chunk(input_, chunks=num_chunks, dim=0)
-    order = [2 * i for i in range(num_chunks_div_2)] + [
-        num_chunks - 2 * i - 1 for i in range(num_chunks_div_2)
-    ]
-    reordered_chunks = [chunks[i] for i in order]
-    return torch.cat(reordered_chunks, dim=0)
+    if packed_seq_params is None:
+        num_chunks_div_2 = cp_size
+        num_chunks = num_chunks_div_2 * 2
+        chunks = torch.chunk(input_, chunks=num_chunks, dim=0)
+        order = [2 * i for i in range(num_chunks_div_2)] + [
+            num_chunks - 2 * i - 1 for i in range(num_chunks_div_2)
+        ]
+        reordered_chunks = [chunks[i] for i in order]
+        return torch.cat(reordered_chunks, dim=0)
+    else:
+        assert tex is not None and is_te_min_version("1.10.0"), (
+            "Please update Transformer Engine to >= 1.10 to use "
+            "Context Parallel with THD format data"
+        )
+        if packed_seq_params.cu_seqlens_q_padded is not None:
+            cu_seqlens = packed_seq_params.cu_seqlens_q_padded
+        else:
+            cu_seqlens = packed_seq_params.cu_seqlens_q
+        total_tokens = input_.size(0)
+        assert total_tokens % cp_size == 0
+        seqlen_per_rank = total_tokens // cp_size
+        output = torch.empty_like(input_)
+        for cp_rank in range(cp_size):
+            start = cp_rank * seqlen_per_rank
+            end = start + seqlen_per_rank
+            index = tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, cp_size, cp_rank)
+            output[index] = input_[start:end]
+        return output
 
 
-def _redo_attention_load_balancing(input_: torch.Tensor, cp_size: int) -> torch.Tensor:
+def _redo_attention_load_balancing(
+    input_: torch.Tensor, cp_size: int, packed_seq_params: Optional[PackedSeqParams] = None
+) -> torch.Tensor:
     """
-    Redo the context parallel attention load balancing
-    For example, for cp_size=3, converts 123456 to 162534 for efficient
-    processing by attention.
+    Redo the context parallel attention load balancing.
+    For example (non-packed), for cp_size=3, converts 123456 to 162534 for
+    efficient processing by attention.
     """
-    num_chunks_div_2 = cp_size
-    num_chunks = num_chunks_div_2 * 2
-    chunks = torch.chunk(input_, chunks=num_chunks, dim=0)
-    order = [None] * num_chunks
-    order[::2] = range(num_chunks_div_2)  # order[even]
-    order[1::2] = reversed(range(num_chunks_div_2, num_chunks))  # order[odd]
-    reordered_chunks = [chunks[i] for i in order]
-    return torch.cat(reordered_chunks, dim=0)
+    if packed_seq_params is None:
+        num_chunks_div_2 = cp_size
+        num_chunks = num_chunks_div_2 * 2
+        chunks = torch.chunk(input_, chunks=num_chunks, dim=0)
+        order = [None] * num_chunks
+        order[::2] = range(num_chunks_div_2)  # order[even]
+        order[1::2] = reversed(range(num_chunks_div_2, num_chunks))  # order[odd]
+        reordered_chunks = [chunks[i] for i in order]
+        return torch.cat(reordered_chunks, dim=0)
+    else:
+        assert tex is not None and is_te_min_version("1.10.0"), (
+            "Please update Transformer Engine to >= 1.10 to use "
+            "Context Parallel with THD format data"
+        )
+        if packed_seq_params.cu_seqlens_q_padded is not None:
+            cu_seqlens = packed_seq_params.cu_seqlens_q_padded
+        else:
+            cu_seqlens = packed_seq_params.cu_seqlens_q
+        total_tokens = input_.size(0)
+        assert total_tokens % cp_size == 0
+        seqlen_per_rank = total_tokens // cp_size
+        index = torch.empty(total_tokens, device=input_.device, dtype=torch.int32)
+        for cp_rank in range(cp_size):
+            start = cp_rank * seqlen_per_rank
+            end = start + seqlen_per_rank
+            index[start:end] = tex.thd_get_partitioned_indices(
+                cu_seqlens, total_tokens, cp_size, cp_rank
+            )
+        return input_.index_select(0, index)
diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py
index 6514050ac63..ac6e8b5bf40 100644
--- a/megatron/core/ssm/mamba_layer.py
+++ b/megatron/core/ssm/mamba_layer.py
@@ -14,7 +14,9 @@
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.dist_checkpointing.utils import apply_prefix_mapping
 from megatron.core.inference.contexts import BaseInferenceContext
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.module import GraphableMegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
@@ -84,6 +86,13 @@ def __init__(
         self.mamba_bda = build_module(submodules.mamba_bda)
         self.bias_dropout_add_exec_handler = torch.enable_grad
 
+    def create_mcore_cudagraph_manager(self, config):
+        """Register the mamba layer for cudagraphs."""
+        from megatron.core.transformer.cuda_graphs import CudaGraphManager
+
+        if not self.config.cuda_graph_scope or CudaGraphScope.mamba in self.config.cuda_graph_scope:
+            self.cudagraph_manager = CudaGraphManager(config)
+
     def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[int]]:
         """Returns the Mamba conv and ssm states shapes per request."""
         return self.mixer.mamba_state_shapes_per_request()
@@ -96,6 +105,7 @@ def forward(
         rotary_pos_emb: Optional[Tensor] = None,  # Not used in MambaLayer
         *,
         inference_params: Optional[BaseInferenceContext] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
     ):
         """
         Perform a forward pass through the Mamba layer.
@@ -124,7 +134,9 @@ def forward(
         hidden_states = hidden_states.to(dtype=self.config.params_dtype)
         hidden_states = self.norm(hidden_states)
 
-        mixer_out_with_bias = self.mixer(hidden_states, inference_context=inference_context)
+        mixer_out_with_bias = self.mixer(
+            hidden_states, inference_context=inference_context, packed_seq_params=packed_seq_params
+        )
 
         with self.bias_dropout_add_exec_handler():
             hidden_states = self.mamba_bda(
@@ -176,11 +188,11 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
         # Training and validation mode CUDA graphs
         if hasattr(self, 'cudagraph_manager') and kwargs.get('inference_context') is None:
             return True
-        # Inference mode. CUDA graphs are used in the decode phase only, when attn mask is None
         elif not self.training and (
             hasattr(self, 'cudagraph_manager')
             and kwargs.get('attention_mask') is None
-            and kwargs['inference_context'].is_decode_only()
+            and kwargs.get('inference_context') is not None
         ):
-            return True
+            using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step()
+            return using_cuda_graph
         return False
diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py
index 929d1d9c6b5..cc71cdc32f6 100644
--- a/megatron/core/ssm/mamba_mixer.py
+++ b/megatron/core/ssm/mamba_mixer.py
@@ -15,23 +15,31 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from megatron.core import parallel_state
 from megatron.core.dist_checkpointing import ShardedTensor
 from megatron.core.dist_checkpointing.mapping import ReplicaId, ShardedTensorFactory
 from megatron.core.inference.contexts import BaseInferenceContext, DynamicInferenceContext
+from megatron.core.inference.contexts.attention_context.triton.tensor_ops import (
+    tensor_get_slice_after,
+    tensor_masked_update,
+    tensor_merge,
+)
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel import get_cuda_rng_tracker
 from megatron.core.transformer import TransformerConfig
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
     make_sharded_tensors_for_checkpoint,
     sharded_state_dict_default,
 )
 from megatron.core.utils import (
-    check_mamba_sequence_packing_support,
     deprecate_inference_params,
+    is_causal_conv1d_min_version,
+    is_mamba_min_version,
     log_single_rank,
-    maybe_cat,
 )
 
 from .mamba_context_parallel import MambaContextParallel
@@ -79,9 +87,16 @@ class ExtendedRMSNorm(RMSNormGated):
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias not sharded"""
+        if not hasattr(self, 'tp_group'):
+            self.tp_group = parallel_state.get_tensor_model_parallel_group()
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0},
+            sharded_offsets,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )
 
 
@@ -290,11 +305,11 @@ def __init__(
             )
             setattr(self.conv1d.weight, "tensor_model_parallel", True)
             setattr(self.conv1d.bias, "tensor_model_parallel", True)
-
-            if self.conv_init is not None:
-                nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
-            else:
-                nn.init.kaiming_uniform_(self.conv1d.weight, a=math.sqrt(5))
+            if self.config.perform_initialization:
+                if self.conv_init is not None:
+                    nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
+                else:
+                    nn.init.kaiming_uniform_(self.conv1d.weight, a=math.sqrt(5))
 
         self.activation = "silu"
         self.act = nn.SiLU()
@@ -319,7 +334,9 @@ def __init__(
             assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
             A = torch.empty(
                 self.nheads_local_tp, dtype=torch.float32, device=torch.cuda.current_device()
-            ).uniform_(*A_init_range)
+            )
+            if self.config.perform_initialization:
+                A = A.uniform_(*A_init_range)
             A_log = torch.log(A)  # Keep A_log in fp32
             self.A_log = nn.Parameter(A_log)
             setattr(self.A_log, "tensor_model_parallel", True)
@@ -378,6 +395,7 @@ def __init__(
             D_cp1=self.D,
             D_has_hdim=self.D_has_hdim,
         )
+        self.tp_group = pg_collection.tp
 
     def forward(
         self,
@@ -385,6 +403,7 @@ def forward(
         inference_context=None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
     ):
         """
         hidden_states: (nL, B, D) / (L B D)
@@ -400,148 +419,181 @@ def forward(
 
         if in_inference_mode:
             if inference_context.is_dynamic_batching():
-                return self.dynamic_inference(hidden_states, inference_context)
+                return self._dynamic_inference(hidden_states, inference_context)
             else:
                 assert inference_context.is_static_batching()
                 assert not self.config.sequence_parallel
                 conv_state, ssm_state = self._get_states_from_cache(inference_context, batch)
                 if inference_context.seqlen_offset > 0:
                     # The states are updated inplace
-                    out, out_bias = self.decode(hidden_states, conv_state, ssm_state)
+                    out, out_bias = self._decode(hidden_states, conv_state, ssm_state)
                     return out, out_bias
 
         zxBCdt, _ = self.in_proj(hidden_states)
 
-        zxBCdt = self.cp.pre_conv_ssm(zxBCdt)
+        zxBCdt = self.cp.pre_conv_ssm(zxBCdt, packed_seq_params)
 
         if in_inference_mode or not self.use_mem_eff_path:
             # TODO(ksanthanam): Consider deprecating this path for training
-            y = self.ssm_prefill(zxBCdt, conv_state=conv_state, ssm_state=ssm_state)
+            assert packed_seq_params is None, (
+                "Training with packed sequences is not supported "
+                "in the non-memory-efficient code path."
+            )
+            y = self._ssm_prefill(zxBCdt, conv_state=conv_state, ssm_state=ssm_state)
         else:
             assert ssm_state is None
-            y = self.ssm_training(zxBCdt)
+            y = self._ssm_training(zxBCdt, packed_seq_params)
 
         out, out_bias = self.out_proj(y)
 
         return out, out_bias
 
-    def dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferenceContext):
+    def _dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferenceContext):
         """
         Executes dynamic inference by separating decode and prefill requests and
         running them independently. Also runs the chunked prefill request independently
         if it exists.
         """
         sequence_packing_available, reason_for_no_sequence_packing = (
-            check_mamba_sequence_packing_support()
+            _check_mamba_sequence_packing_support(for_inference_not_training=True)
         )
         assert sequence_packing_available, reason_for_no_sequence_packing
 
         conv_state, ssm_state = context.mamba_states_cache(self.layer_number - self.pp_layer_offset)
 
-        # Fast path: decode-only
-        if context.is_decode_only():
-            batch_indices = context.mamba_metadata.request_to_mamba_state_idx_cudagraph_only[
-                : context.padded_active_token_count
-            ]
-            out, out_bias = self.decode(
-                hidden_states, conv_state, ssm_state, batch_indices=batch_indices
-            )
-            return out, out_bias
+        padded_dims = context.padded_batch_dimensions
 
-        # Compute input projection before splitting into prefill and decode
-        # to ensure sequence parallel all-gather.
+        token_count = padded_dims.token_count
+        decode_req_count = padded_dims.decode_req_count
+        prefill_req_count = padded_dims.prefill_req_count
+        has_explicit_chunked_prefill_req = padded_dims.has_explicit_chunked_prefill_req
+
+        # Input projection
         zxBCdt, _ = self.in_proj(hidden_states)
 
-        # Compute split between decode and prefill.
-        seq_idx, cu_seqlens, return_varlen_states = self._get_varlen_generation_state(context)
-        active_query_lengths = context.request_query_lengths[
-            context.paused_request_count : context.total_request_count
-        ]
-        batch_indices = context.mamba_metadata.request_to_mamba_state_idx
-
-        # First request with query len > 1 is prefill-start.
-        first_prefill_token_idx = torch.nonzero(active_query_lengths > 1)[0].int()
-
-        # Process decode requests if there are any.
-        if first_prefill_token_idx > 0:
-            zxBCdt_decode = zxBCdt[:first_prefill_token_idx]
-            batch_indices_decode = batch_indices[:first_prefill_token_idx]
-            y_decode = self.ssm_decode(
-                zxBCdt_decode.transpose(0, 1), conv_state, ssm_state, batch_indices_decode
+        if decode_req_count > 0 and prefill_req_count == 0:
+            # Decode-only
+            y = self._ssm_decode(
+                zxBCdt.transpose(0, 1),
+                conv_state,
+                ssm_state,
+                context.mamba_metadata.batch_indices_decode,
             ).transpose(0, 1)
+        elif decode_req_count == 0 and (prefill_req_count > 0 or has_explicit_chunked_prefill_req):
+            if prefill_req_count > 0:
+                # Prefill only (regular prefill requests)
+                y_prefill = self._ssm_prefill(
+                    zxBCdt,
+                    conv_state=conv_state,
+                    ssm_state=ssm_state,
+                    seq_idx=context.mamba_metadata.seq_idx,
+                    cu_seqlens=context.mamba_metadata.cu_seqlens,
+                    return_varlen_states=True,
+                    batch_indices=context.mamba_metadata.batch_indices_prefill,
+                )
+            if has_explicit_chunked_prefill_req:
+                # Prefill only (chunked prefill request)
+                zxBCdt_chunked_prefill = torch.empty_like(zxBCdt)
+                tensor_get_slice_after(
+                    zxBCdt,
+                    zxBCdt_chunked_prefill,
+                    context.mamba_metadata.device_chunked_prefill,
+                    check_bounds=False,
+                )
+                y_chunked_prefill = self._ssm_prefill(
+                    zxBCdt_chunked_prefill[: context.mamba_metadata.device_chunked_prefill[1]],
+                    conv_state=conv_state,
+                    ssm_state=ssm_state,
+                    batch_indices=context.mamba_metadata.batch_indices_chunked_prefill,
+                    is_chunked_prefill=True,
+                )
+            if prefill_req_count > 0 and has_explicit_chunked_prefill_req:
+                # Merge regular prefill and chunked prefill parts
+                tensor_merge(
+                    y_prefill, y_chunked_prefill, context.mamba_metadata.device_chunked_prefill
+                )
+                y = y_prefill
+            elif prefill_req_count > 0:
+                # Prefill-only without chunked prefill
+                y = y_prefill
+            else:
+                # Prefill-only with only chunked prefill
+                y = y_chunked_prefill
         else:
-            y_decode = None
-
-        active_token_count = context.active_token_count
-        active_request_count = context.get_active_request_count()
-        padded_active_token_count = context.padded_active_token_count
-
-        # Process the chunked prefill request if it exists.
-        if context.chunked_prefill_request_id != -1:
-            chunked_prefill_request_token_count = active_query_lengths[-1]
-            zxBCdt_chunked_prefill = zxBCdt[
-                active_token_count - chunked_prefill_request_token_count : active_token_count
-            ]
-
-            batch_index_chunked_prefill = batch_indices[
-                context.get_index_of_chunked_prefill_request()
-            ]
-
-            y_prefill_chunked = self.ssm_prefill(
-                zxBCdt_chunked_prefill,
-                conv_state=conv_state[batch_index_chunked_prefill].unsqueeze(0),
-                ssm_state=ssm_state[batch_index_chunked_prefill].unsqueeze(0),
-                is_chunked_prefill=True,
-            )
-
-            # Remove the chunked prefill request from the request / token counts so
-            # the subsequent prefill computation ignores the chunked prefill request.
-            active_token_count -= chunked_prefill_request_token_count
-            active_request_count -= 1
-        else:
-            y_prefill_chunked = None
-
-        # Process non-chunked prefill requests if there are any.
-        if (remaining_prefill_tokens := active_token_count - first_prefill_token_idx) > 0:
-            zxBCdt_prefill = zxBCdt[first_prefill_token_idx:active_token_count]
-            cu_seqlens_prefill = F.pad(
-                cu_seqlens[first_prefill_token_idx + 1 : active_request_count + 1]
-                - first_prefill_token_idx,
-                (1, 0),
+            # Mix of decode and prefill
+            zxBCdt_prefill = torch.empty_like(zxBCdt)
+            tensor_get_slice_after(
+                zxBCdt,
+                zxBCdt_prefill,
+                context.mamba_metadata.device_decode_prefill,
+                check_bounds=False,
             )
-            seq_idx_prefill = (
-                seq_idx[:, first_prefill_token_idx:active_token_count] - first_prefill_token_idx
+            # Decode requests
+            y_decode = self._ssm_decode(
+                zxBCdt[:decode_req_count].transpose(0, 1),
+                conv_state,
+                ssm_state,
+                context.mamba_metadata.batch_indices_decode,
+            ).transpose(0, 1)
+            y_prefill, y_chunked_prefill = None, None
+            if prefill_req_count > 0:
+                # Regular prefill requests
+                y_prefill = self._ssm_prefill(
+                    zxBCdt_prefill,
+                    conv_state=conv_state,
+                    ssm_state=ssm_state,
+                    seq_idx=context.mamba_metadata.seq_idx,
+                    cu_seqlens=context.mamba_metadata.cu_seqlens,
+                    return_varlen_states=True,
+                    batch_indices=context.mamba_metadata.batch_indices_prefill,
+                )
+            if has_explicit_chunked_prefill_req:
+                # Chunked prefill request
+                zxBCdt_chunked_prefill = torch.empty_like(zxBCdt_prefill)
+                tensor_get_slice_after(
+                    zxBCdt_prefill,
+                    zxBCdt_chunked_prefill,
+                    context.mamba_metadata.device_chunked_prefill,
+                    check_bounds=False,
+                )
+                y_chunked_prefill = self._ssm_prefill(
+                    zxBCdt_chunked_prefill[: context.mamba_metadata.device_chunked_prefill[1]],
+                    conv_state=conv_state,
+                    ssm_state=ssm_state,
+                    batch_indices=context.mamba_metadata.batch_indices_chunked_prefill,
+                    is_chunked_prefill=True,
+                )
+            if prefill_req_count > 0 and has_explicit_chunked_prefill_req:
+                # Merge regular prefill and chunked prefill parts
+                assert y_prefill is not None
+                assert y_chunked_prefill is not None
+                tensor_merge(
+                    y_prefill, y_chunked_prefill, context.mamba_metadata.device_chunked_prefill
+                )
+            elif has_explicit_chunked_prefill_req:
+                # Chunked prefill only
+                assert y_prefill is None
+                assert y_chunked_prefill is not None
+                y_prefill = y_chunked_prefill
+            else:
+                # Regular prefill only; y_prefill is already set, nothing more to be done
+                assert y_prefill is not None
+            # Merge decode and prefill parts
+            y = torch.empty(
+                [token_count, 1, y_prefill.shape[-1]],
+                dtype=y_prefill.dtype,
+                device=y_prefill.device,
             )
-            batch_indices_prefill = batch_indices[first_prefill_token_idx:active_request_count]
-
-            y_prefill = self.ssm_prefill(
-                zxBCdt_prefill,
-                conv_state=conv_state,
-                ssm_state=ssm_state,
-                seq_idx=seq_idx_prefill,
-                cu_seqlens=cu_seqlens_prefill,
-                return_varlen_states=return_varlen_states,
-                batch_indices=batch_indices_prefill,
+            tensor_merge(
+                y_decode, y_prefill, context.mamba_metadata.device_decode_prefill, output_tensor=y
             )
-        else:
-            y_prefill = None
 
-        # Assemble the final output by concatenating the decode output,
-        # non-chunked prefill output, and chunked prefill output together.
-        y_prefill = maybe_cat(y_prefill, y_prefill_chunked, required=True)
-        y = maybe_cat(y_decode, y_prefill, required=True)
-
-        # Add padding tokens back if necessary. Note that we use the context active token count
-        # in case we modified the local count for chunked prefill above.
-        if (num_padding_tokens := padded_active_token_count - context.active_token_count) > 0:
-            y = torch.cat((y, y.new_zeros(num_padding_tokens, *y.shape[1:])), dim=0)
-
-        # The output projection will perform the sequence parallel reduce-scatter if necessary.
+        # Output projection
         out, out_bias = self.out_proj(y)
 
         return out, out_bias
 
-    def decode(
+    def _decode(
         self, hidden_states, conv_state, ssm_state, batch_indices: Optional[torch.Tensor] = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Performs inference step for decoding."""
@@ -562,7 +614,7 @@ def decode(
 
         assert self.cp.cp_size == 1, "Context parallel not supported for Mamba inferenece decode"
 
-        y = self.ssm_decode(
+        y = self._ssm_decode(
             zxBCdt, conv_state=conv_state, ssm_state=ssm_state, batch_indices=batch_indices
         )
 
@@ -575,7 +627,9 @@ def decode(
 
         return out, out_bias
 
-    def ssm_training(self, zxBCdt: torch.Tensor) -> torch.Tensor:
+    def _ssm_training(
+        self, zxBCdt: torch.Tensor, packed_seq_params: Optional[PackedSeqParams] = None
+    ) -> torch.Tensor:
         """
         Performs SSM computation for training step.
 
@@ -594,6 +648,14 @@ def ssm_training(self, zxBCdt: torch.Tensor) -> torch.Tensor:
         if self.conv1d.bias is not None:
             self.conv1d.bias.data_ptr()
 
+        seq_idx = None
+        if packed_seq_params is not None:
+            sequence_packing_available, reason_for_no_sequence_packing = (
+                _check_mamba_sequence_packing_support(for_inference_not_training=False)
+            )
+            assert sequence_packing_available, reason_for_no_sequence_packing
+            seq_idx = self._create_packed_seq_idx(packed_seq_params, zxBCdt.shape[1])
+
         y = mamba_split_conv1d_scan_combined(
             zxBCdt,
             rearrange(self.cp.get_conv1d_weight(), "d 1 w -> d w"),
@@ -610,17 +672,48 @@ def ssm_training(self, zxBCdt: torch.Tensor) -> torch.Tensor:
             headdim=None if self.D_has_hdim else self.headdim,
             ngroups=self.cp.ngroups_local_tpcp,
             norm_before_gate=self.norm_before_gate,
+            seq_idx=seq_idx,
         )
 
         y = rearrange(y, "b l d -> l b d").contiguous()
-        y = self.cp.post_conv_ssm(y)
+        y = self.cp.post_conv_ssm(y, packed_seq_params)
 
         if self.rmsnorm:
             y = self.norm(y)
 
         return y
 
-    def ssm_prefill(
+    def _create_packed_seq_idx(self, packed_seq_params: PackedSeqParams, total_tokens: int):
+        """
+        If total_tokens is 16 (for example), this method takes packed_seq_params.cu_seqlens_q_padded
+        (or cu_seqlens_q) which is of the form [0, 5, 7, 11] and returns a tensor of the form
+        [0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3],
+        which is [0]*(5-0) + [1]*(7-5) + [2]*(11-7) + [3]*(16-11)
+        In the above example, there are three sequences in the pack.
+        In general, the output has an additional sequence index (e.g. 0, 1, 2, 3) so that any tokens
+        beyond the last padded input sequence are accounted for as an extra sequence. However, If
+        cu_seqlens_q_padded[-1] == max_seqlen then this additional sequence index will not be
+        included.
+        """
+        # Example: [0, 5, 7, 11] -> [0, 5, 7, 11, 16]
+        if packed_seq_params.cu_seqlens_q_padded is not None:
+            cu_seqlens = packed_seq_params.cu_seqlens_q_padded
+        else:
+            cu_seqlens = packed_seq_params.cu_seqlens_q
+        total_tokens_tensor = torch.tensor(
+            [total_tokens], dtype=cu_seqlens.dtype, device=cu_seqlens.device
+        )
+        cu_seqlens_with_max = torch.cat([cu_seqlens, total_tokens_tensor])
+        # Example: [0, 5, 7, 11, 16] -> [5, 2, 4, 5]
+        seq_lengths = cu_seqlens_with_max[1:] - cu_seqlens_with_max[:-1]
+        # Example: [5, 2, 4, 5] -> [0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3]
+        seq_idx = torch.repeat_interleave(
+            torch.arange(seq_lengths.numel(), device=cu_seqlens.device), seq_lengths
+        )
+        seq_idx = seq_idx.to(torch.int32).unsqueeze(0)  # Add a batch dimension
+        return seq_idx
+
+    def _ssm_prefill(
         self,
         zxBCdt: torch.Tensor,
         conv_state: Optional[torch.Tensor],
@@ -671,12 +764,14 @@ def ssm_prefill(
         )
 
         # Compute short convolution
+        initial_conv_state = None
         if conv_state is not None and is_dynamic_batching:
             # xBC should have shape (b l d) for causal_conv1d_varlen_states
             assert batch_indices is not None
-            conv_state[batch_indices] = causal_conv1d_varlen_states(
+            conv_varlen_states = causal_conv1d_varlen_states(
                 xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
             )
+            tensor_masked_update(conv_state, batch_indices, conv_varlen_states)
 
             # Maintain channels-last memory layout to use seq_idx for causal_conv1d_fn
             # See https://github.com/Dao-AILab/causal-conv1d/blob/69e6dadc28b169a4c49cb86b586f64ee90242c70/csrc/causal_conv1d.cpp#L174 # pylint: disable=line-too-long
@@ -684,7 +779,14 @@ def ssm_prefill(
         elif is_chunked_prefill:
             # Maintain channels-last memory layout to use initial_states for causal_conv1d_fn
             # See https://github.com/Dao-AILab/causal-conv1d/blob/69e6dadc28b169a4c49cb86b586f64ee90242c70/csrc/causal_conv1d.cpp#L200 # pylint: disable=line-too-long
+            assert batch_indices is not None
+            initial_conv_state = (
+                conv_state[batch_indices, :, 1:].permute(0, 2, 1).contiguous().transpose(1, 2)
+            )
             xBC = xBC.transpose(1, 2)
+            tensor_masked_update(
+                conv_state, batch_indices, F.pad(xBC, (self.d_conv - xBC.shape[-1], 0))
+            )
         else:
             # transpose: b l pd --> b pd l
             xBC = rearrange(xBC, "b l d -> b d l").contiguous()
@@ -700,12 +802,6 @@ def ssm_prefill(
             xBC = self.act(self.cp.conv1d(xBC)[..., :seqlen])
         else:
             assert self.activation in ["silu", "swish"]
-            if is_chunked_prefill:
-                initial_conv_state = (
-                    conv_state[:, :, 1:].permute(0, 2, 1).contiguous().transpose(1, 2)
-                )
-            else:
-                initial_conv_state = None
             xBC = causal_conv1d_fn(
                 x=xBC,
                 weight=rearrange(self.cp.get_conv1d_weight(), "d 1 w -> d w"),
@@ -744,7 +840,7 @@ def ssm_prefill(
         ), "Context parallel not supported for use_mem_eff_path==False and rmsnorm==False"
 
         if is_chunked_prefill:
-            initial_ssm_state = ssm_state
+            initial_ssm_state = ssm_state[batch_indices]
         else:
             initial_ssm_state = None
 
@@ -777,12 +873,16 @@ def ssm_prefill(
             if return_varlen_states:
                 assert batch_indices is not None
 
-                y, _, varlen_states = y
+                y, _, ssm_varlen_states = y
 
                 # This has to be varlen_states, NOT last_state
                 # See reference implementation:
                 # https://github.com/state-spaces/mamba/blob/e0761ece1db07e0949dd88b4f4cd440420a19fd9/mamba_ssm/modules/mamba2.py#L267 # pylint: disable=line-too-long
-                ssm_state[batch_indices] = varlen_states
+                tensor_masked_update(ssm_state, batch_indices, ssm_varlen_states)
+            elif is_chunked_prefill:
+                assert batch_indices is not None
+                y, last_state = y
+                tensor_masked_update(ssm_state, batch_indices, last_state)
             else:
                 y, last_state = y
                 ssm_state.copy_(last_state)
@@ -797,7 +897,7 @@ def ssm_prefill(
 
         return y
 
-    def ssm_decode(
+    def _ssm_decode(
         self,
         zxBCdt: torch.Tensor,
         conv_state: torch.Tensor,
@@ -949,46 +1049,6 @@ def ssm_decode(
         # Restore sequence dimension
         return y.unsqueeze(0)
 
-    def _get_varlen_generation_state(
-        self, inference_context: Optional[BaseInferenceContext] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor, bool]:
-        """Constructs the variable length generation state for non-decode dynamic inference.
-
-        The returned state includes the following:
-            `seq_idx` (Tensor): A map from token idx to request idx.
-            `cu_seqlens` (Tensor): The cumulative sequence lengths.
-            `return_varlen_states` (bool): Whether to return a varlen states tensor for
-                `mamba_chunk_scan_combined`.
-
-        Returns empty state for training, static inference, or decode-only dynamic inference.
-
-        Args:
-            inference_context (InferenceContext): The inference context.
-
-        Returns:
-            A tuple of (`seq_idx`, `cu_seqlens`, `return_varlen_states`)
-        """
-
-        if (
-            inference_context is None
-            or not inference_context.is_dynamic_batching()
-            or inference_context.is_decode_only()
-        ):
-            return None, None, False
-
-        active_token_count = inference_context.active_token_count
-        seq_idx = (
-            inference_context.token_to_request_idx[:active_token_count]
-            .clone()
-            .to(torch.int32)
-            .unsqueeze(0)
-        )
-
-        # Get the list of cumulative sequence lengths for active requests.
-        cu_seqlens, _ = inference_context.cu_query_lengths()
-
-        return seq_idx, cu_seqlens, True
-
     def mamba_state_shapes_per_request(self) -> Tuple[Tuple[int], Tuple[int]]:
         """Returns the Mamba conv and ssm states shapes per request."""
         conv_states_shape = (self.conv1d.weight.shape[0], self.d_conv)
@@ -1037,6 +1097,9 @@ def _get_states_from_cache(self, inference_context, batch_size, *, inference_par
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Provide a sharded state dictionary for distributed checkpointing."""
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
         sharded_state_dict = {}
         # Parameters
         self._save_to_state_dict(sharded_state_dict, "", keep_vars=True)
@@ -1056,12 +1119,17 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
                 # Add TP sharding for Conv1d
                 module_sd = module.state_dict(prefix="", keep_vars=True)
                 module_sharded_sd = make_sharded_tensors_for_checkpoint(
-                    module_sd, f"{prefix}{name}.", {f"weight": 0, f"bias": 0}, sharded_offsets
+                    module_sd,
+                    f"{prefix}{name}.",
+                    {"weight": 0, "bias": 0},
+                    sharded_offsets,
+                    tp_group=self.tp_group,
+                    dp_cp_group=metadata['dp_cp_group'],
                 )
 
             else:
                 module_sharded_sd = sharded_state_dict_default(
-                    module, f"{prefix}{name}.", sharded_offsets, metadata
+                    module, f"{prefix}{name}.", sharded_offsets, metadata, tp_group=self.tp_group
                 )
 
             sharded_state_dict.update(module_sharded_sd)
@@ -1174,3 +1242,22 @@ def sh_ten_merge_fn(sub_state_dict):
     return ShardedTensorFactory(
         orig_sh_ten.key, orig_sh_ten.data, sh_ten_build_fn, sh_ten_merge_fn, orig_sh_ten.replica_id
     )
+
+
+def _check_mamba_sequence_packing_support(
+    for_inference_not_training: bool = True,
+) -> Tuple[bool, Optional[str]]:
+    """Checks whether `causal_conv1d` and `mamba_ssm` support sequence packing."""
+    if for_inference_not_training:
+        # https://github.com/Dao-AILab/causal-conv1d/commit/d87608f78f87d1288a7821d9e6ff4b10a8d5bf07
+        conv1d_min = "1.5.3.post1"
+        # https://github.com/state-spaces/mamba/commit/4f77d5306e19f5c7ae37665a44c3e61e24cafcb5
+        mamba_min = "2.2.6.post3"
+    else:
+        conv1d_min = "1.4.0"
+        mamba_min = "2.0.0"
+    if not is_causal_conv1d_min_version(conv1d_min):
+        return False, f"causal_conv1d >= {conv1d_min} is required"
+    elif not is_mamba_min_version(mamba_min):
+        return False, f"mamba_ssm >= {mamba_min} is required"
+    return True, None
diff --git a/megatron/core/tensor_parallel/__init__.py b/megatron/core/tensor_parallel/__init__.py
index afa53bdc6e1..2140ee54b37 100644
--- a/megatron/core/tensor_parallel/__init__.py
+++ b/megatron/core/tensor_parallel/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 from .cross_entropy import vocab_parallel_cross_entropy
 from .data import broadcast_data
+from .inference_layers import InferenceLayerNormColumnParallelLinear, InferenceRowParallelLinear
 from .layers import (
     ColumnParallelLinear,
     RowParallelLinear,
@@ -28,9 +29,11 @@
 from .random import (
     CheckpointWithoutOutput,
     checkpoint,
+    convert_cuda_rng_state,
     get_cuda_rng_tracker,
     get_data_parallel_rng_tracker_name,
     get_expert_parallel_rng_tracker_name,
+    is_graph_safe_cuda_rng_tracker,
     model_parallel_cuda_manual_seed,
 )
 from .utils import (
@@ -63,9 +66,11 @@
     "scatter_to_sequence_parallel_region",
     # random.py
     "checkpoint",
+    "convert_cuda_rng_state",
     "get_cuda_rng_tracker",
     "model_parallel_cuda_manual_seed",
     "get_expert_parallel_rng_tracker_name",
+    "is_graph_safe_cuda_rng_tracker",
     "CheckpointWithoutOutput",
     # utils.py
     "split_tensor_along_last_dim",
diff --git a/megatron/core/tensor_parallel/inference_layers.py b/megatron/core/tensor_parallel/inference_layers.py
index 05f7b88d095..9c1adbc6717 100644
--- a/megatron/core/tensor_parallel/inference_layers.py
+++ b/megatron/core/tensor_parallel/inference_layers.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-
-
-from typing import Callable, Optional
+from typing import Callable, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -10,7 +8,13 @@
     TELayerNormColumnParallelLinear,
     TERowParallelLinear,
 )
+from megatron.core.inference.communication.torch_symm_triton import (
+    fused_multimem_rs_add_norm_ag,
+    multimem_all_gather,
+    multimem_reduce_scatter,
+)
 from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.parallel_state import get_global_symmetric_memory_buffer
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import get_tensor_model_parallel_group_if_none
 
@@ -53,6 +57,7 @@ def __init__(
         bias: bool,
         skip_bias_add: bool,
         is_expert: bool,
+        stride: int = 1,
         skip_weight_param_allocation: bool = False,
         tp_comm_buffer_name: Optional[str] = None,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
@@ -67,6 +72,7 @@ def __init__(
             bias=bias,
             skip_bias_add=skip_bias_add,
             is_expert=is_expert,
+            stride=stride,
             skip_weight_param_allocation=skip_weight_param_allocation,
             tp_comm_buffer_name=tp_comm_buffer_name,
             tp_group=tp_group,
@@ -85,15 +91,73 @@ def __init__(
                 config.sequence_parallel
             ), "--transformer-impl=inference_optimized requires --sequence-parallel"
 
+        # Boolean to be toggled externally for skipping norm and all-gather.
+        # This is used when enabling fused reduce-scatter + add + rms-norm + all-gather
+        # in tensor parallelism. In this case, the preceeding RowParallelLinear layer
+        # has already applied the rms-norm and all-gather.
+        self.skip_norm_and_all_gather = False
+
+    def _maybe_allocate_symmetric_buffer(self, x: torch.Tensor):
+        """
+        Attempt to allocate symmetric memory buffer for all-gather.
+        """
+        symm_mem_buffer_dims = list(x.size())
+        symm_mem_buffer_dims[0] *= self.tp_size
+        symm_mem_buffer = get_global_symmetric_memory_buffer().maybe_get_tensor(
+            symm_mem_buffer_dims, dtype=x.dtype
+        )
+        return symm_mem_buffer
+
+    def _all_gather(self, x: torch.Tensor, symm_mem_buffer: dict) -> None:
+        """
+        Attempt an NVLS all-gather into symmetric memory. If not possible,
+        revert to torch dist (NCCL) all-gather.
+        """
+        if self.tp_size == 1:
+            return x
+
+        # 1. check if bf16
+        is_bf16 = x.dtype == torch.bfloat16
+        # 2. check if hopper or newer
+        is_hopper_or_newer = torch.cuda.get_device_properties(x.device).major >= 9
+        # 3. check if symmetric memory buffer is available
+        has_enough_symmetric_memory = symm_mem_buffer["handle"] is not None
+        can_use_custom_nvls_collectives = (
+            is_bf16 and is_hopper_or_newer and has_enough_symmetric_memory
+        )
+        if can_use_custom_nvls_collectives:
+            # do multimem all gather
+            multimem_all_gather(symm_mem_buffer["tensor"], x, symm_mem_buffer["handle"])
+            return symm_mem_buffer["tensor"]
+        else:
+            # revert to torch dist (NCCL) all gather
+            x, _ = gather_along_first_dim(x, process_group=self.tp_group)
+            return x
+
     @torch.no_grad()
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]:
         """
         Forward pass.
         """
-        x = _te_rms_norm_kernel(x=x, weight=self.layer_norm_weight, eps=self.eps)
-        if self.tp_size > 1:
-            x, _ = gather_along_first_dim(x, process_group=self.tp_group)
+        # Necessary conditions to ensure we are executing the fused rs-add-rmsnorm-ag
+        # in the preceeding RowParallelLinear layer.
+        # 1. skip_norm_and_all_gather is True
+        # 2. tp_size > 1
+        # 3. enough symmetric memory is available - if available it already has the output
+        symm_mem_buffer = self._maybe_allocate_symmetric_buffer(x)
+        is_in_fused_mode = (
+            self.skip_norm_and_all_gather
+            and self.tp_size > 1
+            and symm_mem_buffer["handle"] is not None
+        )
+        if is_in_fused_mode:
+            x = symm_mem_buffer["tensor"]
+        else:
+            x = _te_rms_norm_kernel(x=x, weight=self.layer_norm_weight, eps=self.eps)
+            x = self._all_gather(x, symm_mem_buffer)
+
         x = torch.matmul(x, self.weight.t())
+
         return x, None
 
 
@@ -140,12 +204,89 @@ def __init__(
                 config.sequence_parallel
             ), "--transformer-impl=inference_optimized requires --sequence-parallel"
 
+        # Placeholder for next layer norm weights for fused
+        # reduce-scatter + add + rms-norm + all-gather
+        self.next_layer_norm_weights = None
+        self.config = config
+
+    def _matmul_reduce_scatter(self, x, residual=None):
+        """
+        Multiplies x by the weight matrix and performs a reduce-scatter.
+        It will first try to write the matmul output to symmetric memory
+        and perform an NVLS multicast reduce-scatter. If that is not possible,
+        it will revert to torch.dist (NCCL) reduce-scatter.
+        """
+        # 1. check if bf16
+        is_bf16 = x.dtype == torch.bfloat16
+        # 2. check if hopper
+        is_hopper_or_newer = torch.cuda.get_device_properties(x.device).major >= 9
+        # 3. attempt to ask for symmetric memory
+        symm_mem_buffer_dims = list(x.size())
+        symm_mem_buffer_dims[-1] = self.weight.size(0)
+        symm_mem_buffer = get_global_symmetric_memory_buffer().maybe_get_tensor(
+            symm_mem_buffer_dims, dtype=x.dtype
+        )
+        has_enough_symmetric_memory = symm_mem_buffer["handle"] is not None
+        can_use_custom_nvls_collectives = (
+            is_bf16 and is_hopper_or_newer and has_enough_symmetric_memory
+        )
+        if can_use_custom_nvls_collectives:
+            # Write output of matmul directly onto the symmetric memory buffer
+            torch.matmul(x, self.weight.t(), out=symm_mem_buffer["tensor"])
+            x = symm_mem_buffer["tensor"]
+            # perform nvls reduce-scatter
+            if self.next_layer_norm_weights is None:
+                output_dims = list(x.size())
+                output_dims[0] = x.size(0) // self.tp_size
+                output = torch.empty(output_dims, dtype=x.dtype, device=x.device)
+                multimem_reduce_scatter(output, x, symm_mem_buffer["handle"])
+                return output
+            else:
+                assert hasattr(self, "residual"), (
+                    "For fused reduce-scatter + add + rms-norm + all-gather, "
+                    "residual must be set via _set_residual()"
+                )
+                residual = self.residual
+                fused_multimem_rs_add_norm_ag(
+                    residual,
+                    symm_mem_buffer["tensor"],
+                    symm_mem_buffer["handle"],
+                    residual,
+                    self.next_layer_norm_weights,
+                    self.config.layernorm_epsilon,
+                )
+                # 1. Residual has the output of the reduce-scatter + residual add
+                #    Care must be taken in the model definition, so as to not apply the
+                #    residual again.
+                # 2. The output of the full reduce-scatter + add + rms-norm + all-gather is
+                #    written into symm_mem_buffer["tensor"] and will be accessible there.
+                return residual
+        else:
+            # revert to torch dist (NCCL) reduce-scatter
+            x = torch.matmul(x, self.weight.t())
+            x, _ = reduce_scatter_along_first_dim(x, tp_group=self.tp_group)
+        return x
+
+    def _set_next_layer_norm_weights(self, weights: torch.Tensor):
+        """
+        Set next layer norm weights for fused reduce-scatter + add + rms-norm + all-gather.
+        """
+        self.next_layer_norm_weights = weights
+
+    def _set_residual(self, residual: torch.Tensor):
+        """
+        Set residual for fused reduce-scatter + add + rms-norm + all-gather.
+        """
+        self.residual = residual
+
     @torch.no_grad()
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
         """
         Forward pass.
         """
-        x = torch.matmul(x, self.weight.t())
-        if self.tp_size > 1:
-            x, _ = reduce_scatter_along_first_dim(x, tp_group=self.tp_group)
-        return x, None
+        if self.tp_size == 1:
+            x = torch.matmul(x, self.weight.t())
+            return x, None
+        else:
+            x = self._matmul_reduce_scatter(x)
+            return x, None
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index 2b559021481..d68c8b63a3b 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 # Parts of the code here are adapted from PyTorch
 # repo: https://github.com/pytorch/pytorch
@@ -56,6 +56,8 @@
     HAVE_TE = False
 
 _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {
+    "expert_tp": False,
+    "is_qkv": False,
     "tensor_model_parallel": False,
     "partition_dim": -1,
     "partition_stride": 1,
@@ -84,12 +86,16 @@
     dist_reduce_scatter_func = torch.distributed._reduce_scatter_base
 
 
-def param_is_not_tensor_parallel_duplicate(param):
+def param_is_not_tensor_parallel_duplicate(param, tp_group=None):
     """Returns true if the passed-in parameter is not a duplicate parameter
     on another TP rank."""
-    return (hasattr(param, "tensor_model_parallel") and param.tensor_model_parallel) or (
-        get_tensor_model_parallel_rank() == 0
-    )
+    if hasattr(param, "tensor_model_parallel") and param.tensor_model_parallel:
+        return True
+    # Prefer provided tp_group when available (new explicit path).
+    if tp_group is not None:
+        return tp_group.rank() == 0
+    # Fallback to legacy global state (back-compat).
+    return get_tensor_model_parallel_rank() == 0
 
 
 def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
@@ -310,6 +316,8 @@ def sharded_state_dict(
                 key=weight_prefix,
                 allow_shape_mismatch=True,
                 prepend_offsets=sharded_offsets,
+                tp_group=self.tp_group,
+                dp_cp_group=metadata["dp_cp_group"],
             )
         }
 
@@ -816,7 +824,7 @@ def __init__(
         embedding_activation_buffer: Optional[List[torch.Tensor]] = None,
         grad_output_buffer: Optional[List[torch.Tensor]] = None,
         is_expert: bool = False,
-        tp_comm_buffer_name: str = None,  # Not used
+        tp_comm_buffer_name: Optional[str] = None,  # Not used
         disable_grad_reduce: bool = False,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
     ):
@@ -960,7 +968,7 @@ def forward(
         input_: torch.Tensor,
         weight: Optional[torch.Tensor] = None,
         runtime_gather_output: Optional[bool] = None,
-    ):
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -1054,11 +1062,23 @@ def forward(
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
 
+    def backward_dw(self) -> None:
+        """Compute weight gradients during the backward pass if delay_wgrad_compute is enabled.
+
+        Not supported - does nothing.
+        """
+        pass
+
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0, "bias": 0},
+            sharded_offsets,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
 
     def set_extra_state(self, state: Any):
@@ -1296,7 +1316,12 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 1, bias not sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 1}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 1},
+            sharded_offsets,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
 
     def set_extra_state(self, state: Any):
diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py
index 54cac0e41e3..bf00717ab6c 100644
--- a/megatron/core/tensor_parallel/random.py
+++ b/megatron/core/tensor_parallel/random.py
@@ -111,6 +111,41 @@ def cb():
     _lazy_call(cb)
 
 
+def convert_cuda_rng_state(
+    state: Union[torch.Tensor, torch.Generator], to_graphable: bool = False
+) -> Union[torch.Tensor, torch.Generator]:
+    """
+    Convert the cuda rng state tensor to the graphable version,
+    or from the graphable version to the non-graphable tensor version.
+    """
+    if to_graphable:
+        if isinstance(state, torch.Tensor):
+            # Convert to the graphable version.
+            # Store current rng state.
+            orig_cuda_rng_state = _get_cuda_rng_state(graph_safe=False)
+            # Set rng state to the desired one
+            _set_cuda_rng_state(state, graph_safe=False)
+            # Get the graphable state
+            graphable_state = _get_cuda_rng_state(clone=True, graph_safe=True)
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state, graph_safe=False)
+            return graphable_state
+        elif isinstance(state, torch.Generator):
+            # already graphable, just return it.
+            return state
+        else:
+            raise ValueError(f"Invalid state type: {type(state)}")
+    else:
+        if isinstance(state, torch.Tensor):
+            # already non-graphable, just return it.
+            return state
+        elif isinstance(state, torch.Generator):
+            # Convert to the non-graphable tensor version.
+            return state.get_state()
+        else:
+            raise ValueError(f"Invalid state type: {type(state)}")
+
+
 def get_expert_parallel_rng_tracker_name():
     """Get the expert parallel rng tracker name"""
     global _EXPERT_PARALLEL_RNG_TRACKER_NAME
@@ -161,6 +196,10 @@ def reset(self):
         # Seeds are just for book keeping and ensure no seed is set twice.
         self.seeds_ = set()
 
+        # Name of the rng state currently being used in the generator.
+        # The default one is "default-rng" and won't be pushed to the self.states_ dictionary.
+        self._current_state_name = "default-rng"
+
     def get_states(self):
         """Get rng states. Copy the dictionary so we have direct
         pointers to the states, not just a pointer to the dictionary."""
@@ -207,10 +246,14 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
         # Check if we have added the state
         if name not in self.states_:
             raise Exception('cuda rng state {} is not added'.format(name))
-        # Store current rng state.
+        # Store current rng state and name. Store in self.states_ if it's not the default state.
         orig_cuda_rng_state = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng)
-        # Set rng state to the desired one
+        orig_state_name = self._current_state_name
+        if orig_state_name != "default-rng":
+            self.states_[orig_state_name] = orig_cuda_rng_state
+        # Set rng state and name to the desired one.
         _set_cuda_rng_state(self.states_[name], graph_safe=self.use_cudagraphable_rng)
+        self._current_state_name = name
         # Record cpu RNG state
         cpu_rng_state = torch.get_rng_state()
         # Do the stuff we wanted to do.
@@ -220,10 +263,19 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
             # Throw a warning if cpu RNG state changed
             if not torch.all(cpu_rng_state == torch.get_rng_state()).item():
                 logging.getLogger(__name__).warning('CPU RNG state changed within GPU RNG context')
+            # Check if the current state name is the same as the desired state name.
+            if self._current_state_name != name:
+                raise Exception(
+                    f'current state name {self._current_state_name} is not the same as the desired '
+                    f'state name {name}.'
+                )
             # Update the current rng state for later use.
             self.states_[name] = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng)
-            # And set the state to the original state we started with.
+            # And set the state and name to the original state we started with.
+            if orig_state_name != "default-rng":
+                orig_cuda_rng_state = self.states_[orig_state_name]
             _set_cuda_rng_state(orig_cuda_rng_state, graph_safe=self.use_cudagraphable_rng)
+            self._current_state_name = orig_state_name
 
 
 # RNG tracker object.
@@ -377,10 +429,24 @@ def model_parallel_cuda_manual_seed(
     _CUDA_RNG_STATE_TRACKER.add(_EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed)
 
 
+def is_graph_safe_cuda_rng_tracker(cuda_rng_tracker):
+    """Check if the cuda rng tracker is graph safe version."""
+    if HAVE_TE and is_te_min_version("1.5.0"):
+        from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker
+
+        if isinstance(cuda_rng_tracker, TECudaRNGStatesTracker):
+            return True
+    if getattr(cuda_rng_tracker, "use_cudagraphable_rng", False):
+        return True
+    return False
+
+
 def _get_all_rng_states():
     """Get all the rng states."""
     cpu_rng_state = torch.get_rng_state()
-    cuda_rng_state = _get_cuda_rng_state()
+    cuda_rng_state = _get_cuda_rng_state(
+        graph_safe=is_graph_safe_cuda_rng_tracker(get_cuda_rng_tracker())
+    )
     cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
     return cpu_rng_state, cuda_rng_state, cuda_rng_state_tracker
 
@@ -388,7 +454,9 @@ def _get_all_rng_states():
 def _set_all_rng_states(cpu_rng_state, cuda_rng_state, cuda_rng_state_tracker):
     """Set all the rng states."""
     torch.set_rng_state(cpu_rng_state)
-    _set_cuda_rng_state(cuda_rng_state)
+    _set_cuda_rng_state(
+        cuda_rng_state, graph_safe=is_graph_safe_cuda_rng_tracker(get_cuda_rng_tracker())
+    )
     get_cuda_rng_tracker().set_states(cuda_rng_state_tracker)
 
 
@@ -404,6 +472,27 @@ def _fork_rng():
         _set_all_rng_states(*current_states)
 
 
+# Global flag that's toggled whenever inside a checkpointing context
+IS_CHECKPOINTING = False
+
+
+def _set_checkpointing():
+    """Set state to checkpointing enabled."""
+    global IS_CHECKPOINTING
+    IS_CHECKPOINTING = True
+
+
+def _unset_checkpointing():
+    """Unset state to checkpointing enabled."""
+    global IS_CHECKPOINTING
+    IS_CHECKPOINTING = False
+
+
+def is_checkpointing():
+    """Check if currently in a checkpoint context."""
+    return IS_CHECKPOINTING
+
+
 class CheckpointFunction(torch.autograd.Function):
     """Checkpoint Function
 
@@ -416,6 +505,8 @@ class CheckpointFunction(torch.autograd.Function):
     @staticmethod
     def forward(ctx, run_function, distribute_saved_activations, *args):
         """Forward pass."""
+        _set_checkpointing()
+
         ctx.run_function = run_function
         ctx.distribute_saved_activations = distribute_saved_activations
 
@@ -436,6 +527,7 @@ def forward(ctx, run_function, distribute_saved_activations, *args):
         # Store everything.
         ctx.save_for_backward(*args)
 
+        _unset_checkpointing()
         return outputs
 
     # pylint: disable=missing-function-docstring
@@ -447,6 +539,8 @@ def backward(ctx, *args):
                 "Checkpointing is not compatible with .grad(), "
                 "please use .backward() if possible"
             )
+        _set_checkpointing()
+
         inputs = ctx.saved_tensors
         if ctx.distribute_saved_activations:
             safely_set_viewless_tensor_data(
@@ -471,6 +565,8 @@ def backward(ctx, *args):
         )
         torch.autograd.backward(outputs, args)
         grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs)
+
+        _unset_checkpointing()
         return (None, None) + grads
 
 
@@ -510,10 +606,14 @@ def forward(ctx, run_function, checkpoint_without_output_obj, *args):
     @staticmethod
     def backward(ctx, *args):
         """Backward pass."""
-        inputs = ctx.saved_tensors
+        # Get the inputs from the context instead of the saved tensors
+        # because the saved tensors are already cached by the recomputation.
+        # This is to avoid double-reloading the inputs in CPU offloading scenario.
+        inputs = ctx.inputs
         outputs = ctx.outputs
         torch.autograd.backward(outputs, args)
         ctx.outputs = None
+        ctx.inputs = None
         grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs)
         return (None, None) + grads
 
@@ -543,6 +643,14 @@ def __init__(self, fp8=False):
 
     def checkpoint(self, run_function, *args):
         """Checkpoint function."""
+
+        # If in cuda graph warmup, disable checkpointing, as 'discard_output_and_register_recompute'
+        # may be called in a separate graph warmup.
+        from megatron.core.transformer.cuda_graphs import is_graph_warmup
+
+        if is_graph_warmup():
+            return run_function(*args)
+
         self.run_function = run_function
 
         self.rng_states = _get_all_rng_states()
@@ -555,7 +663,15 @@ def checkpoint(self, run_function, *args):
 
     def _recompute(self, _):
         """Used as a hook to recompute the output."""
-        if not torch.autograd._is_checkpoint_valid():
+
+        from megatron.core.transformer.cuda_graphs import is_graph_capturing, is_graph_warmup
+
+        # The recomputation has been triggered already. Just return.
+        # Handle cudagraphs, do nothing if currently in graph warmup
+        if self.ctx is None or is_graph_warmup():
+            return
+
+        if not torch.autograd._is_checkpoint_valid() and not is_graph_capturing():
             raise RuntimeError(
                 "Checkpointing is not compatible with .grad(), "
                 "please use .backward() if possible"
@@ -573,8 +689,19 @@ def _recompute(self, _):
                 recompute_ctx = contextlib.nullcontext()
                 fp8_ctx = contextlib.nullcontext()
 
+            # Store the inputs for backward pass
+            inputs = self.ctx.saved_tensors
+
+            def detach(t):
+                if isinstance(t, torch.Tensor):
+                    requires_grad = t.requires_grad
+                    t = t.detach()
+                    t.requires_grad_(requires_grad)
+                return t
+
+            inputs = tuple(detach(t) for t in inputs)
             with torch.enable_grad(), fp8_ctx, recompute_ctx:
-                outputs = self.run_function(*self.ctx.saved_tensors)
+                outputs = self.run_function(*inputs)
 
         self.run_function = None
         self.rng_states = None
@@ -590,6 +717,7 @@ def _recompute(self, _):
                 output.untyped_storage().copy_(recomputation_output.untyped_storage())
 
         self.ctx.outputs = outputs
+        self.ctx.inputs = inputs
         self.outputs = None
         self.ctx = None
 
@@ -602,6 +730,12 @@ def discard_output_and_register_recompute(self, hook_tensor):
         in the forward pass and the gradient of the hook_tensor is computed before the recomputed
         tensors are used.
         """
+
+        from megatron.core.transformer.cuda_graphs import is_graph_warmup
+
+        if is_graph_warmup():
+            return
+
         # use resize to release the output tensor memory and still keep the metadata in the tensors.
         # the metadata is still needed for backward
         for output in self.outputs:
diff --git a/megatron/core/timers.py b/megatron/core/timers.py
index 1dd0a55b983..2209d4cfc7c 100644
--- a/megatron/core/timers.py
+++ b/megatron/core/timers.py
@@ -174,6 +174,17 @@ def reset(self):
         self._elapsed = 0.0
         self._started = False
 
+    def set_elapsed(self, value):
+        """Directly set the elapsed time.
+
+        This is useful for injecting pre-computed timing values (e.g., startup
+        timestamps) into the timer so they can be reported via timers.log().
+
+        Args:
+            value (float): The elapsed time value in seconds.
+        """
+        self._elapsed = value
+
     def elapsed(self, reset=True, barrier=False):
         """Calculates the elapsed time and restarts timer.
 
diff --git a/megatron/core/tokenizers/megatron_tokenizer.py b/megatron/core/tokenizers/megatron_tokenizer.py
index be01d0e554f..14b273e909b 100644
--- a/megatron/core/tokenizers/megatron_tokenizer.py
+++ b/megatron/core/tokenizers/megatron_tokenizer.py
@@ -15,7 +15,6 @@
         ("mamba", "MambaTokenizer"),
         ("bert", "BertTokenizer"),
         ("t5", "T5Tokenizer"),
-        ("retro", "RetroTokenizer"),
     ]
 )
 
@@ -104,7 +103,7 @@ def write_metadata(
             tokenizer_path (str): path to tokenizer model.
             tokenizer_library (str): tokenizer model library.
             model_type (str): type of the model to be used with tokenizer.
-                list of available model types: [gpt, bert, t5, mamba, retro, default].
+                list of available model types: [gpt, bert, t5, mamba, default].
                 `DefaultTokenizerText` will be used if model_type is not specified.
             tokenizer_class (MegatronTokenizerBase): pre-defined tokenizer class.
             chat_template (str): tokenizer chat template in jinja format.
diff --git a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py
index 458689fa1f4..965f43733a6 100644
--- a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py
+++ b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py
@@ -10,6 +10,8 @@
 except ModuleNotFoundError:
     HAVE_TRANSFORMERS = False
 
+from megatron.core.utils import log_single_rank
+
 from .abstract_tokenizer import MegatronTokenizerTextAbstract
 
 logger = logging.getLogger(__name__)
@@ -166,9 +168,11 @@ def __init__(
             tokenizer.resize_token_embeddings(tokenizer_default.vocab_size)
             """
 
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 f'{new_tokens_in_vocab} \n will be added to the vocabulary.\n'
-                f'Please resize your model accordingly.'
+                f'Please resize your model accordingly.',
             )
         self.add_special_tokens(special_tokens_dict)
         self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens(
@@ -196,7 +200,11 @@ def add_special_tokens(self, special_tokens_dict: dict) -> int:
         num_tokens_added = self.tokenizer.add_special_tokens(special_tokens_dict)
 
         if num_tokens_added > 0:
-            logger.info(f'{num_tokens_added} special tokens added, resize your model accordingly.')
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f'{num_tokens_added} special tokens added, resize your model accordingly.',
+            )
         for k in self.tokenizer.SPECIAL_TOKENS_ATTRIBUTES:
             setattr(self, k, getattr(self.tokenizer, k, None))
         return num_tokens_added
diff --git a/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py b/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py
index e9d486d4e60..46f7f9a1059 100644
--- a/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py
+++ b/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py
@@ -2,6 +2,7 @@
 
 import base64
 import json
+import logging
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -14,14 +15,18 @@
 from .abstract_tokenizer import MegatronTokenizerTextAbstract
 from .chat_template import MegatronTokenizerChatTemplate
 
-PATTERN_TIKTOKEN_V1 = (
+logger = logging.getLogger(__name__)
+
+_PATTERN_TIKTOKEN_V1 = (
     r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"
 )
-PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"  # pylint: disable=line-too-long
+_PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"  # pylint: disable=line-too-long
 DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
 SPECIAL_TOKENS = ["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]
 SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"
 
+__all__ = ["TikTokenTokenizer", "reload_mergeable_ranks"]
+
 
 def reload_mergeable_ranks(
     path: str, max_vocab: Optional[int] = None, num_special_tokens: Optional[int] = None
@@ -39,15 +44,18 @@ def reload_mergeable_ranks(
     """
 
     assert path.endswith(".json")
+    from megatron.core.utils import log_single_rank
 
     # reload vocab
     with open(path, "r") as f:
         vocab = json.load(f)
     assert isinstance(vocab, list)
-    print(f"Vocab size: {len(vocab)}")
+    log_single_rank(logger, logging.INFO, f"Vocab size: {len(vocab)}")
     if max_vocab is not None:
         vocab = vocab[:max_vocab]
-        print(f"Cutting vocab to first {len(vocab)} tokens.")
+        from megatron.core.utils import log_single_rank
+
+        log_single_rank(logger, logging.INFO, f"Cutting vocab to first {len(vocab)} tokens")
 
     # build ranks
     ranks: Dict[bytes, int] = {}
@@ -94,9 +102,9 @@ def __init__(
             special_tokens = SPECIAL_TOKENS.copy()
 
         if pattern == "v1":
-            pattern = PATTERN_TIKTOKEN_V1
+            pattern = _PATTERN_TIKTOKEN_V1
         elif pattern == "v2":
-            pattern = PATTERN_TIKTOKEN_V2
+            pattern = _PATTERN_TIKTOKEN_V2
         else:
             raise ValueError(f"Expected tiktoken pattern to be `v1` or `v2`, but got {pattern}.")
 
@@ -124,10 +132,14 @@ def __init__(
             for i in range(len(special_tokens), num_special_tokens)
         ]
         self.special_filler = special_filler
+        from megatron.core.utils import log_single_rank
+
         if special_filler:
-            print(
+            log_single_rank(
+                logger,
+                logging.INFO,
                 "Adding special tokens: "
-                f"{', '.join(special_tokens)}, {special_filler[0]}, ..., {special_filler[-1]}"
+                f"{', '.join(special_tokens)}, {special_filler[0]}, ..., {special_filler[-1]}",
             )
         self.special_tokens = special_tokens + special_filler
         assert (
diff --git a/megatron/core/tokenizers/text/models/__init__.py b/megatron/core/tokenizers/text/models/__init__.py
index 3610a8b98e4..d1788adb417 100644
--- a/megatron/core/tokenizers/text/models/__init__.py
+++ b/megatron/core/tokenizers/text/models/__init__.py
@@ -4,5 +4,4 @@
 from megatron.core.tokenizers.text.models.default_tokenizer import DefaultTokenizerText
 from megatron.core.tokenizers.text.models.gpt_tokenizer import GPTTokenizer
 from megatron.core.tokenizers.text.models.mamba_tokenizer import MambaTokenizer
-from megatron.core.tokenizers.text.models.retro_tokenizer import RetroTokenizer
 from megatron.core.tokenizers.text.models.t5_tokenizer import T5Tokenizer
diff --git a/megatron/core/tokenizers/text/models/retro_tokenizer.py b/megatron/core/tokenizers/text/models/retro_tokenizer.py
deleted file mode 100644
index a81af0c00f7..00000000000
--- a/megatron/core/tokenizers/text/models/retro_tokenizer.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
-from megatron.core.tokenizers.text.text_tokenizer import MegatronTokenizerText
-
-
-class RetroTokenizer(MegatronTokenizerText):
-    """Base class for Megatron Retro tokenizer."""
-
-    def __init__(self, path: str = None, config: dict = None, **kwargs) -> None:
-        config['class_name'] = self.__class__.__name__
-        config['class_path'] = self.__class__.__module__
-        super().__init__(path, config, **kwargs)
diff --git a/megatron/core/tokenizers/text/utils/build_tokenizer.py b/megatron/core/tokenizers/text/utils/build_tokenizer.py
index 7b4a19435ba..2017945204d 100644
--- a/megatron/core/tokenizers/text/utils/build_tokenizer.py
+++ b/megatron/core/tokenizers/text/utils/build_tokenizer.py
@@ -15,15 +15,23 @@ def build_tokenizer(args):
     if args.tokenizer_type in MEGATRON_TOKENIZERS:
         tokenizer_library = 'megatron'
         tokenizer_path = args.tokenizer_type
+        kwargs['additional_special_tokens'] = (
+            args.tokenizer_special_tokens if args.tokenizer_special_tokens else []
+        )
         if tokenizer_path == 'BertWordPieceCase':
             special_tokens = {}
             special_tokens['additional_special_tokens'] = [f'<extra_id_{i}>' for i in range(100)]
             kwargs = special_tokens
         kwargs['vocab_file'] = args.vocab_file
         kwargs['merges_file'] = args.merge_file
+        kwargs['use_fast'] = args.tokenizer_hf_use_fast
+        kwargs['trust_remote_code'] = args.trust_remote_code
+        kwargs['include_special_tokens'] = args.tokenizer_hf_include_special_tokens
     elif args.tokenizer_type in SP_TOKENIZERS:
         tokenizer_library = 'sentencepiece'
         tokenizer_path = args.tokenizer_model
+        kwargs['legacy'] = args.tokenizer_sentencepiece_legacy
+        kwargs['special_tokens'] = args.tokenizer_special_tokens
     elif args.tokenizer_type == 'TikTokenizer':
         tokenizer_library = 'tiktoken'
         tokenizer_path = args.tokenizer_model
@@ -32,12 +40,18 @@ def build_tokenizer(args):
         if args.vocab_size:
             kwargs['vocab_size'] = args.vocab_size
         kwargs['num_special_tokens'] = args.tiktoken_num_special_tokens
-        kwargs['special_tokens'] = args.tiktoken_special_tokens
+        kwargs['special_tokens'] = args.tokenizer_special_tokens
     elif args.tokenizer_type == 'HuggingFaceTokenizer':
         tokenizer_library = 'huggingface'
         tokenizer_path = args.tokenizer_model
         kwargs['vocab_file'] = args.vocab_file
         kwargs['merges_file'] = args.merge_file
+        kwargs['additional_special_tokens'] = (
+            args.tokenizer_special_tokens if args.tokenizer_special_tokens else []
+        )
+        kwargs['use_fast'] = args.tokenizer_hf_use_fast
+        kwargs['trust_remote_code'] = args.trust_remote_code
+        kwargs['include_special_tokens'] = args.tokenizer_hf_include_special_tokens
     elif args.tokenizer_type == 'NullTokenizer':
         tokenizer_library = 'null'
         metadata = {'library': tokenizer_library}
diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py
index 72c9924ecb9..7a082feb559 100644
--- a/megatron/core/transformer/attention.py
+++ b/megatron/core/transformer/attention.py
@@ -1,14 +1,18 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+from __future__ import annotations
 
+import copy
+import inspect
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import NoReturn, Optional, Tuple, Union
+from typing import Callable, Optional, Protocol, Tuple, Union
 
 import torch
 from torch import Tensor
 
 from megatron.core import tensor_parallel
 from megatron.core.inference.contexts import BaseInferenceContext
+from megatron.core.jit import jit_fuser
 from megatron.core.models.common.embeddings.rope_utils import (
     apply_rotary_pos_emb,
     apply_rotary_pos_emb_with_cos_sin,
@@ -22,10 +26,15 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.mappings import all_gather_last_dim_from_tensor_parallel_region
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.typed_torch import apply_module, not_none
 from megatron.core.utils import (
     deprecate_inference_params,
     divide,
@@ -33,6 +42,7 @@
     get_pg_size,
     is_fa_min_version,
     is_te_min_version,
+    is_using_quantization_scales,
     nvtx_range_pop,
     nvtx_range_push,
 )
@@ -40,7 +50,7 @@
 from ..models.common.embeddings.yarn_rotary_pos_embedding import (
     _yarn_get_concentration_factor_from_config,
 )
-from .enums import AttnMaskType
+from .enums import AttnMaskType, CudaGraphScope
 from .transformer_config import TransformerConfig
 
 try:
@@ -60,7 +70,7 @@
 
 if not HAVE_FA3:
     try:
-        from flash_attn_3.flash_attn_interface import _flash_attn_forward
+        from flashattn_hopper.flash_attn_interface import _flash_attn_forward
         from flashattn_hopper.flash_attn_interface import (
             flash_attn_with_kvcache as flash_attn3_with_kvcache,
         )
@@ -107,14 +117,107 @@
     HAVE_FUSED_QKV_ROPE = False
 
 
+class LinearQkv(Protocol):
+    """Protocol for linear_qkv modules."""
+
+    def forward(self, input: Tensor, /) -> tuple[Tensor, object]:
+        """Applies linear_qkv."""
+        ...
+
+    def backward_dw(self) -> None:
+        """Backward pass for the linear_qkv module."""
+        ...
+
+
+class LinearQkvBuilder(Protocol):
+    """Protocol for building linear_qkv layers."""
+
+    def __call__(
+        self,
+        input_size: int,
+        output_size: int,
+        /,
+        *,
+        config: TransformerConfig,
+        init_method: Callable[[torch.Tensor], None],
+        gather_output: bool,
+        bias: bool,
+        skip_bias_add: bool,
+        is_expert: bool,
+        tp_comm_buffer_name: str,
+        tp_group: torch.distributed.ProcessGroup | None = None,
+    ) -> LinearQkv: ...
+
+
+class LinearLayer(Protocol):
+    """Protocol for linear_q and linear_kv modules."""
+
+    def forward(self, input: Tensor, /) -> Tuple[Tensor, object]:
+        """Applies linear_q/linear_kv."""
+        ...
+
+
+class LinearLayerBuilder(Protocol):
+    """Protocol for building linear_q and linear_kv layers."""
+
+    def __call__(
+        self,
+        input_size: int,
+        output_size: int,
+        /,
+        *,
+        config: TransformerConfig,
+        init_method: Callable[[torch.Tensor], None],
+        gather_output: bool,
+        bias: bool,
+        skip_bias_add: bool,
+        is_expert: bool,
+    ) -> LinearLayer: ...
+
+
+class CoreAttention(Protocol):
+    """Protocol for core_attention modules."""
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        attention_mask: Optional[Tensor],
+        /,
+        *,
+        attn_mask_type: AttnMaskType,
+        attention_bias: Optional[Tensor],
+        packed_seq_params: Optional[PackedSeqParams],
+    ) -> Tensor:
+        """Applies dot product attention."""
+        ...
+
+
+class CoreAttentionBuilder(Protocol):
+    """Protocol for building core_attention layers."""
+
+    def __call__(
+        self,
+        *,
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        cp_comm_type: Optional[str],
+        softmax_scale: Optional[float],
+        pg_collection: Optional[ProcessGroupCollection],
+    ) -> CoreAttention: ...
+
+
 @dataclass
 class SelfAttentionSubmodules:
     """
     Configuration class for specifying the submodules of a self-attention.
     """
 
-    linear_qkv: Union[ModuleSpec, type] = None
-    core_attention: Union[ModuleSpec, type] = None
+    linear_qkv: LinearQkvBuilder
+    core_attention: CoreAttentionBuilder
     linear_proj: Union[ModuleSpec, type] = None
     q_layernorm: Union[ModuleSpec, type] = None
     k_layernorm: Union[ModuleSpec, type] = None
@@ -126,9 +229,9 @@ class CrossAttentionSubmodules:
     Configuration class for specifying the submodules of a cross-attention.
     """
 
-    linear_q: Union[ModuleSpec, type] = None
-    linear_kv: Union[ModuleSpec, type] = None
-    core_attention: Union[ModuleSpec, type] = None
+    linear_q: LinearLayerBuilder
+    linear_kv: LinearLayerBuilder
+    core_attention: CoreAttentionBuilder
     linear_proj: Union[ModuleSpec, type] = None
 
 
@@ -146,8 +249,8 @@ def __init__(
         layer_number: int,
         attn_mask_type: AttnMaskType,
         attention_type: str,
-        cp_comm_type: str = None,
-        pg_collection: ProcessGroupCollection = None,
+        cp_comm_type: str | None = None,
+        pg_collection: ProcessGroupCollection | None = None,
     ):
         super().__init__(config=config)
 
@@ -156,6 +259,10 @@ def __init__(
 
         self.attn_mask_type = attn_mask_type
         self.attention_type = attention_type
+        self.batch_invariant_mode = config.batch_invariant_mode
+
+        assert self.config.kv_channels is not None
+        assert self.config.num_query_groups is not None
 
         # For normal attention without groups, num_query_groups == num_attention_heads,
         # so these two will be the same
@@ -172,22 +279,44 @@ def __init__(
                 pg_collection, 'cp'
             ), "Attention pg_collection must have cp process group"
         self.pg_collection = pg_collection
+        self.tp_group = pg_collection.tp
 
         # Per attention head and per partition values
         world_size = get_pg_size(self.pg_collection.tp)
         self.hidden_size_per_attention_head = divide(
             self.query_projection_size, self.config.num_attention_heads
         )
-        self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
-        self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size)
+        if self.config.num_query_groups < world_size:
+            # When num_kv_heads < tp_size, each TP rank (post AG) initially produces
+            # activations for 1 kv_head and (num_q_heads / num_kv_heads) q_heads.
+            # We then pull out the appropriate (num_q_heads / tp_size) q_heads.
+            self.num_query_groups_per_partition = 1
+            self.num_attention_heads_per_partition = divide(
+                self.config.num_attention_heads, self.config.num_query_groups
+            )
+        else:
+            # When num_kv_heads >= tp_size, each TP rank produces activations for
+            # (num_kv_heads / tp_size) kv_heads and (num_q_heads / tp_size) q_heads.
+            self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size)
+            self.num_attention_heads_per_partition = divide(
+                self.config.num_attention_heads, world_size
+            )
+        self.world_size = world_size
 
         # To support both CUDA Graphs and key value with different hidden size
         self.key_hidden_size = self.hidden_size_per_attention_head
         self.val_hidden_size = self.hidden_size_per_attention_head
 
-        self.core_attention = build_module(
-            submodules.core_attention,
-            config=self.config,
+        if self.config.num_query_groups < world_size:
+            # TE throws an assertion error if num_kv_heads / num_query_groups
+            # is not divisible by TP size.
+            # TODO(rwaleffe/dnarayanan): Clean this up eventually.
+            tmp_config = copy.deepcopy(self.config)
+            tmp_config.num_query_groups = world_size
+        else:
+            tmp_config = self.config
+        self.core_attention = submodules.core_attention(
+            config=tmp_config,
             layer_number=self.layer_number,
             attn_mask_type=self.attn_mask_type,
             attention_type=self.attention_type,
@@ -201,6 +330,21 @@ def __init__(
             and "core_attn" in self.config.recompute_modules
         )
 
+        self.offload_qkv_linear = (
+            self.config.fine_grained_activation_offloading
+            and "qkv_linear" in self.config.offload_modules
+        )
+
+        self.offload_core_attention = (
+            self.config.fine_grained_activation_offloading
+            and "core_attn" in self.config.offload_modules
+        )
+
+        self.offload_attn_proj = (
+            self.config.fine_grained_activation_offloading
+            and "attn_proj" in self.config.offload_modules
+        )
+
         # Output.
         self.linear_proj = build_module(
             submodules.linear_proj,
@@ -254,7 +398,7 @@ def custom_forward(*inputs):
             attention_mask = inputs[3]
             attn_mask_type = inputs[5]
             attn_mask_type = AttnMaskType(attn_mask_type.item())
-            output_ = self.core_attention(
+            output_ = apply_module(self.core_attention)(
                 query,
                 key,
                 value,
@@ -312,7 +456,7 @@ def _adjust_key_value_for_inference(
         sequence_len_offset: Optional[int] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
-    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+    ) -> tuple[Tensor, Tensor, Tensor, Tensor, AttnMaskType, Tensor]:
         """
         Saves the generated key and value tensors to the end of the buffers in inference_context.
         Returns the full size keys and values from the provided inference_context, as well as
@@ -479,7 +623,17 @@ def _adjust_key_value_for_inference(
         return query, key, value, rotary_pos_emb, attn_mask_type, block_table
 
     @abstractmethod
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=True):
+    def get_query_key_value_tensors(
+        self,
+        hidden_states: Tensor,
+        key_value_states: Tensor | None,
+        output_gate: bool = False,
+        split_qkv: bool = True,
+    ) -> (
+        tuple[Tensor, Tensor, Tensor, Tensor]
+        | tuple[Tensor, Tensor, Tensor]
+        | tuple[Tensor, list[int]]
+    ):
         """
         This method needs to be implemented based on whether the derived class
         is "self-attn" or "cross-attn".
@@ -496,7 +650,7 @@ def flash_decode(
         rotary_cos: Tensor,
         rotary_sin: Tensor,
         rotary_interleaved: bool = False,
-    ) -> (Tensor, Tensor):
+    ) -> tuple[Tensor, Tensor]:
         """
         The flash decoding kernel will do the following in a single execution:
         1. Compute RoPE embedding with precomputed cos & sin tensors
@@ -531,6 +685,74 @@ def flash_decode(
         )
         return out
 
+    def _flash_attention_3_forward_wrapper(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        max_seqlen_q,
+        max_seqlen_k,
+        cu_seqlens_q,
+        seqlens_k,
+        block_table,
+        softmax_scale,
+    ):
+        """
+        Wrapper for calling the FA3 _flash_attn_forward function.
+        Handles argument conversion for different versions of the _flash_attn_forward API.
+        """
+        candidate_kwargs = {
+            "q": q,
+            "k": k,
+            "v": v,
+            "k_new": None,
+            "v_new": None,
+            "qv": None,
+            "out": None,
+            "out_": None,
+            "cu_seqlens_q": cu_seqlens_q,
+            "cu_seqlens_k": None,
+            "cu_seqlens_k_new": None,
+            "seqused_q": None,
+            "seqused_k": seqlens_k,
+            "max_seqlen_q": max_seqlen_q,
+            "max_seqlen_k": max_seqlen_k,
+            "page_table": block_table,
+            "kv_batch_idx": None,
+            "leftpad_k": None,
+            "rotary_cos": None,
+            "rotary_sin": None,
+            "seqlens_rotary": None,
+            "q_descale": None,
+            "k_descale": None,
+            "v_descale": None,
+            "softmax_scale": softmax_scale,
+            "causal": True,
+            "attention_chunk": 0,
+            "softcap": 0.0,
+            "window_size": (-1, -1),
+            "window_size_left": -1,
+            "window_size_right": -1,
+            "rotary_interleaved": True,
+            "scheduler_metadata": None,
+            "num_splits": 0 if not self.batch_invariant_mode else 1,
+            "pack_gqa": None,
+            "sm_margin": 0,
+        }
+
+        # Parse the expect argument names from the function signature
+        if inspect.isfunction(_flash_attn_forward):
+            sig = inspect.signature(_flash_attn_forward)
+        else:
+            assert isinstance(_flash_attn_forward, torch._library.custom_ops.CustomOpDef)
+            sig = inspect.signature(_flash_attn_forward._init_fn)
+        valid_kwargs = set(sig.parameters.keys())
+        final_kwargs = {k: candidate_kwargs[k] for k in valid_kwargs if k in candidate_kwargs}
+
+        output_total, *unused = _flash_attn_forward(**final_kwargs)
+
+        return output_total
+
     def flash_decode_and_prefill(
         self,
         q: Tensor,
@@ -542,6 +764,7 @@ def flash_decode_and_prefill(
         cu_seqlens_k,
         seqlens_k,
         block_table,
+        is_decode_only,
     ) -> Tensor:
         """Flash attention kernel for mixed decode and prefill samples.
 
@@ -555,6 +778,7 @@ def flash_decode_and_prefill(
             cu_seqlens_k (Tensor): Cumulative key sequence lengths.
             seqlens_k (Tensor): key sequence lengths.
             block_table (Tensor): KV cache block ids for all samples.
+            is_decode_only (bool): True if batch is decode only.
         Return:
             (Tensor) Attention output.
         """
@@ -563,7 +787,7 @@ def flash_decode_and_prefill(
         assert block_table is not None
 
         # Flash attn kernel.
-        if max_seqlen_q > 1:
+        if not is_decode_only:
             q = q.squeeze(1)
             if getattr(self, "softmax_scale", None) is not None:
                 softmax_scale = self.softmax_scale
@@ -572,42 +796,21 @@ def flash_decode_and_prefill(
             if HAVE_FA3:
                 # TODO(ksanthanam): Replace with call to flash_attn_varlen_func once
                 # it accepts block_table
-                output_total, *unused = _flash_attn_forward(
-                    q=q,
-                    k=k,
-                    v=v,
-                    k_new=None,
-                    v_new=None,
-                    qv=None,
-                    out=None,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=None,
-                    cu_seqlens_k_new=None,
-                    seqused_q=None,
-                    seqused_k=seqlens_k,
-                    max_seqlen_q=max_seqlen_q,
-                    max_seqlen_k=max_seqlen_k,
-                    page_table=block_table,
-                    kv_batch_idx=None,
-                    leftpad_k=None,
-                    rotary_cos=None,
-                    rotary_sin=None,
-                    seqlens_rotary=None,
-                    q_descale=None,
-                    k_descale=None,
-                    v_descale=None,
-                    softmax_scale=softmax_scale,
-                    causal=True,
-                    window_size=(-1, -1),
-                    attention_chunk=0,
-                    softcap=0.0,
-                    rotary_interleaved=True,
-                    scheduler_metadata=None,
-                    num_splits=0,
-                    pack_gqa=None,
-                    sm_margin=0,
+                output_total = self._flash_attention_3_forward_wrapper(
+                    q,
+                    k,
+                    v,
+                    max_seqlen_q,
+                    max_seqlen_k,
+                    cu_seqlens_q,
+                    seqlens_k,
+                    block_table,
+                    softmax_scale,
                 )
             else:
+                assert (
+                    self.batch_invariant_mode is False
+                ), "Batch invariant mode is not supported for flash attention 2"
                 output_total = flash_attn_varlen_func(
                     q,
                     k,
@@ -658,10 +861,14 @@ def flash_decode_and_prefill(
                     "cache_seqlens": seqlens_k,
                     "causal": True,
                     "page_table" if HAVE_FA3 else "block_table": block_table,
+                    "num_splits": 0 if not self.batch_invariant_mode else 1,
                 }
                 if HAVE_FA3:
                     output_total = flash_attn3_with_kvcache(**flash_attn_args)
                 else:
+                    assert (
+                        not self.batch_invariant_mode
+                    ), "Batch invariant mode is not supported for flash attention 2"
                     output_total = flash_attn_with_kvcache(**flash_attn_args)
         return output_total
 
@@ -680,7 +887,7 @@ def forward(
         sequence_len_offset: Optional[int] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
-    ) -> Tuple[Tensor, Tensor]:
+    ) -> tuple[Tensor, Tensor]:
         """
         Perform a forward pass through the attention module.
 
@@ -770,14 +977,31 @@ def forward(
                 self.config.fused_single_qkv_rope and split_qkv
             ), "fused_single_qkv_rope requested but not available/supported for the config."
 
-        qkv_output = self.get_query_key_value_tensors(
-            hidden_states, key_value_states, split_qkv=split_qkv
-        )
+        with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states:
+            qkv_output = self.get_query_key_value_tensors(
+                hidden_states,
+                key_value_states,
+                split_qkv=split_qkv,
+                output_gate=self.config.attention_output_gate,
+            )
+        if self.offload_qkv_linear:
+            # `qkv_output` may be a tuple; commit supports tuple/list and will keep structure.
+            qkv_output = off_interface.group_commit(
+                qkv_output, name="qkv_linear", forced_released_tensors=[]
+            )
         attn_mask_type = self.attn_mask_type
         block_table = None
+        gate = None
         if split_qkv:
-            query, key, value = qkv_output
+            if self.config.attention_output_gate:
+                query, key, value, gate = qkv_output
+            else:
+                query, key, value = qkv_output
+            mixed_qkv = qkv_split_arg_list = None
         else:
+            assert (
+                not self.config.attention_output_gate
+            ), "attention_output_gate is not supported for unsplit mixed_qkv tensor."
             mixed_qkv, qkv_split_arg_list = qkv_output
         nvtx_range_pop(suffix="qkv")
 
@@ -819,7 +1043,7 @@ def forward(
         if (
             in_decode_mode
             and self.config.cuda_graph_impl == "local"
-            and self.config.cuda_graph_scope != "full_iteration"
+            and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope
             and inference_context.is_static_batching()
         ):
             raise ValueError(f"CUDA graphs must use flash decode with static batching!")
@@ -839,7 +1063,7 @@ def forward(
                 )
             )
 
-        if packed_seq_params is not None:
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
             query = query.squeeze(1)
             key = key.squeeze(1)
             value = value.squeeze(1)
@@ -854,7 +1078,7 @@ def forward(
         ):
             q_pos_emb, k_pos_emb = rotary_pos_emb
 
-            if packed_seq_params is not None:
+            if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
                 if packed_seq_params.cu_seqlens_q_padded is not None:
                     cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded
                 else:
@@ -920,15 +1144,18 @@ def forward(
         else:
             if inference_context is None or inference_context.is_static_batching():
                 # Static batching attention kernel.
-                core_attn_out = self.core_attention(
-                    query,
-                    key,
-                    value,
-                    attention_mask,
-                    attn_mask_type=attn_mask_type,
-                    attention_bias=attention_bias,
-                    packed_seq_params=packed_seq_params,
-                )
+                with off_interface(
+                    self.offload_core_attention and self.training, query, "core_attn"
+                ) as query:
+                    core_attn_out = apply_module(self.core_attention)(
+                        query,
+                        key,
+                        value,
+                        attention_mask,
+                        attn_mask_type=attn_mask_type,
+                        attention_bias=attention_bias,
+                        packed_seq_params=packed_seq_params,
+                    )
 
             else:
                 # Dynamic batching attention kernel.
@@ -946,9 +1173,19 @@ def forward(
                     cu_kv_lengths,
                     kv_lengths,
                     block_table,
+                    inference_context.is_decode_only(),
                 )
                 core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)')
 
+                # Clear the outputs for padding tokens when using quantization scales
+                # to avoid corrupting amax calculations
+                if is_using_quantization_scales(self.config):
+                    core_attn_out[inference_context.padding_slice] = 0.0
+
+            if self.offload_core_attention and self.training:
+                core_attn_out = off_interface.group_commit(
+                    core_attn_out, name="core_attn", forced_released_tensors=[query, key, value]
+                )
         if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
             # reshape to same output shape as unpacked case
             # (t, np, hn) -> (t, b=1, h=np*hn)
@@ -957,20 +1194,46 @@ def forward(
             core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1)
         nvtx_range_pop(suffix="core_attention")
 
+        # Output gate
+        if gate is not None:
+            nvtx_range_push(suffix="output_gate")
+            core_attn_out = self._apply_output_gate(core_attn_out, gate)
+            nvtx_range_pop(suffix="output_gate")
+
         # =================
         # Output. [sq, b, h]
         # =================
-
         nvtx_range_push(suffix="linear_proj")
-        output, bias = self.linear_proj(core_attn_out)
+        with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out:
+            output, bias = self.linear_proj(core_attn_out)
+        if self.offload_attn_proj:
+            output = off_interface.group_commit(
+                output, name="attn_proj", forced_released_tensors=[core_attn_out]
+            )
         nvtx_range_pop(suffix="linear_proj")
 
         return output, bias
 
+    @jit_fuser
+    def _apply_output_gate(self, x, gate):
+        x_dtype = x.dtype
+        gate = gate.contiguous()
+        gate = gate.view(*x.shape)
+        x = x * torch.sigmoid(gate.float())
+        x = x.to(x_dtype)
+        return x
+
     def set_for_recompute_input_layernorm(self):
         """Set the attention layer for recompute input_layernorm. Only needed for fp8."""
         raise NotImplementedError("set_for_recompute_input_layernorm is not implemented.")
 
+    def clip_qk(self):
+        """
+        QK Clipping is a technique to clip the query and key attention logits to prevent the
+        attention logits from exploding.
+        """
+        raise NotImplementedError("clip_qk is not implemented.")
+
 
 class SelfAttention(Attention):
     """Self-attention layer class
@@ -984,9 +1247,9 @@ def __init__(
         config: TransformerConfig,
         submodules: SelfAttentionSubmodules,
         layer_number: int,
-        attn_mask_type=AttnMaskType.padding,
-        cp_comm_type: str = None,
-        pg_collection: ProcessGroupCollection = None,
+        attn_mask_type: AttnMaskType = AttnMaskType.padding,
+        cp_comm_type: str | None = None,
+        pg_collection: ProcessGroupCollection | None = None,
     ):
         super().__init__(
             config=config,
@@ -998,12 +1261,14 @@ def __init__(
             pg_collection=pg_collection,
         )
 
-        self.linear_qkv = build_module(
-            submodules.linear_qkv,
+        self.linear_qkv_out_dim = self.query_projection_size + 2 * self.kv_projection_size
+        if self.config.attention_output_gate:
+            self.linear_qkv_out_dim += self.config.kv_channels * self.config.num_attention_heads
+        self.linear_qkv = submodules.linear_qkv(
             self.config.hidden_size,
-            self.query_projection_size + 2 * self.kv_projection_size,
+            self.linear_qkv_out_dim,
             config=self.config,
-            init_method=self.config.init_method,
+            init_method=not_none(self.config.init_method),
             gather_output=False,
             bias=self.config.add_bias_linear or self.config.add_qkv_bias,
             skip_bias_add=False,
@@ -1125,52 +1390,112 @@ def _compare(srcs, tgts, names, parallelism):
                 "TP",
             )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=True):
+    def get_query_key_value_tensors(
+        self,
+        hidden_states: Tensor,
+        key_value_states: Tensor | None = None,
+        output_gate: bool = False,
+        split_qkv: bool = True,
+    ) -> (
+        tuple[Tensor, Tensor, Tensor, Tensor]
+        | tuple[Tensor, Tensor, Tensor]
+        | tuple[Tensor, list[int]]
+    ):
         """
-        Derives `query`, `key` and `value` tensors from `hidden_states`. If `split_qkv=False`, then
-        the unsplit mixed_qkv tensor is returned.
+        Derives `query`, `key` and `value` tensors from `hidden_states`.
+        If `output_gate` is True, then also derives `gate` tensor.
+        If `split_qkv=False`, then the unsplit mixed_qkv tensor is returned.
         """
-        # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
-        mixed_qkv, _ = self.linear_qkv(hidden_states)
+        # If no output gate: Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
+        # If have output gate: Attention heads [sq, b, h] --> [sq, b, ng * (2 * np/ng + 2) * hn)]
+        mixed_qkv, _ = apply_module(self.linear_qkv)(hidden_states)
+        num_query_heads_per_group = (
+            self.num_attention_heads_per_partition // self.num_query_groups_per_partition
+        )
+        num_qkv_heads_per_group = num_query_heads_per_group + 2
+        if output_gate:
+            num_qkv_heads_per_group += num_query_heads_per_group
+
+        assert self.config.num_query_groups is not None
+        if self.config.num_query_groups < self.world_size:
+            # Note that weights are interleaved in the following manner:
+            # q1 q2 k1 v1 | q3 q4 k2 v2 | q5 q6 k3 v3 | ...
+            # When tp_size > num_kv_heads, we split "q1 q2 k1 v1" over multiple
+            # ranks, so a rank does not have a clean partitioning of just the q_heads
+            # it needs. Instead, we perform the following steps:
+            # 1. Assemble the full "q1 q2 k1 v1 | q3 q4 k2 v2 | q5 q6 k3 v3 | ..."
+            #    through an AG.
+            # 2. Pull out the right slice (e.g., "q1 q2 k1 v1" or "q3 q4 k2 v2").
+            # 3. Split q_heads (e.g., q1, q2), k_heads (e.g., k1), v_heads (e.g., v1).
+            # 4. Further index into query to get only the q_heads that this rank is
+            #    responsible for (e.g., q1).
+            # The block of code below performs steps 1 and 2.
+            mixed_qkv = all_gather_last_dim_from_tensor_parallel_region(mixed_qkv)
+            idx = get_tensor_model_parallel_rank() // (
+                self.world_size // self.config.num_query_groups
+            )
+            size = mixed_qkv.size()[-1] // self.config.num_query_groups
+            mixed_qkv = mixed_qkv[:, :, idx * size : (idx + 1) * size]
 
-        # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
+        # If no output gate: [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
+        # If have output gate: [sq, b, hp] --> [sq, b, ng, (2 * np/ng + 2) * hn]
         new_tensor_shape = mixed_qkv.size()[:-1] + (
             self.num_query_groups_per_partition,
-            (
-                (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
-                * self.hidden_size_per_attention_head
-            ),
+            num_qkv_heads_per_group * self.hidden_size_per_attention_head,
         )
         mixed_qkv = mixed_qkv.view(*new_tensor_shape)
 
-        split_arg_list = [
-            (
-                self.num_attention_heads_per_partition
-                // self.num_query_groups_per_partition
-                * self.hidden_size_per_attention_head
-            ),
-            self.hidden_size_per_attention_head,
-            self.hidden_size_per_attention_head,
-        ]
-
-        # Return unsplit mixed_qkv and split_arg_list
-        if not split_qkv:
-            return mixed_qkv, split_arg_list
-
-        if SplitAlongDim is not None:
+        # Split the tensor into query, gate, key, and value.
+        if output_gate:
+            if not split_qkv:
+                raise ValueError("split_qkv not supported for gated attention yet.")
+            # If have output gate: [sq, b, ng, (2 * np/ng + 2) * hn]
+            # --> [sq, b, ng, np/ng * hn], [sq, b, ng, np/ng * hn],
+            # [sq, b, ng, hn], [sq, b, ng, hn]
+            split_arg_list = [
+                num_query_heads_per_group * self.hidden_size_per_attention_head,
+                num_query_heads_per_group * self.hidden_size_per_attention_head,
+                self.hidden_size_per_attention_head,
+                self.hidden_size_per_attention_head,
+            ]
 
-            # [sq, b, ng, (np/ng + 2) * hn]
-            # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-            (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list)
+            if SplitAlongDim is not None:
+                (query, gate, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list)
+            else:
+                (query, gate, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3)
         else:
+            # If no output gate: [sq, b, ng, (np/ng + 2) * hn]
+            # --> [sq, b, ng, np/ng * hn], None, [sq, b, ng, hn], [sq, b, ng, hn]
+            split_arg_list = [
+                num_query_heads_per_group * self.hidden_size_per_attention_head,
+                self.hidden_size_per_attention_head,
+                self.hidden_size_per_attention_head,
+            ]
 
-            # [sq, b, ng, (np/ng + 2) * hn]
-            # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-            (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3)
+            # Return unsplit mixed_qkv and split_arg_list
+            if not split_qkv:
+                return mixed_qkv, split_arg_list
+
+            if SplitAlongDim is not None:
+                (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list)
+            else:
+                (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3)
 
-        # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
+        # Query [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
         query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)
 
+        if self.config.num_query_groups < self.world_size:
+            # query above corresponds to (num_q_heads / num_kv_heads) q_heads.
+            # Index appropriately into query to get (num_q_heads / tp_size) q_heads.
+            # This is step 4 in the list of steps above.
+            idx = get_tensor_model_parallel_rank() % (
+                self.world_size // self.config.num_query_groups
+            )
+            size = self.num_attention_heads_per_partition // (
+                self.world_size // self.config.num_query_groups
+            )
+            query = query[:, :, idx * size : (idx + 1) * size, :]
+
         if self.q_layernorm is not None:
             if not self.config.qk_layernorm_hidden_dim:
                 query = self.q_layernorm(query)
@@ -1194,9 +1519,14 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None, spli
         if self.config.test_mode:
             self.run_realtime_tests()
 
+        if output_gate:
+            # Gate [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
+            gate = gate.reshape(*gate.shape[:2], -1, self.hidden_size_per_attention_head)
+            return query, key, value, gate
+
         return query, key, value
 
-    def backward_dw(self) -> NoReturn:
+    def backward_dw(self) -> None:
         """Execute weight update operations"""
         self._backward_qkv_proj()
         self._backward_output_proj()
@@ -1215,6 +1545,103 @@ def set_for_recompute_input_layernorm(self):
 
         set_save_original_input(self.linear_qkv)
 
+    def clip_qk(self):
+        """
+        QK Clipping is a technique to clip the query and key attention logits to prevent the
+        attention logits from exploding. This function is experimental on GQA.
+        """
+        if not self.config.qk_clip:
+            raise ValueError("qk_clip option needs to be enabled")
+
+        if self.core_attention.current_max_attn_logits is None:
+            raise ValueError("current_max_attn_logits is None")
+
+        assert self.core_attention.current_max_attn_logits.shape == (
+            self.num_attention_heads_per_partition,
+        ), f"current_max_attn_logits shape is not ({self.num_attention_heads_per_partition}, ) \
+                    but {self.core_attention.current_max_attn_logits.shape}"
+
+        grouped_max_attn_logits = torch.max(
+            self.core_attention.current_max_attn_logits.view(
+                self.num_query_groups_per_partition, -1
+            ),
+            dim=1,
+        ).values
+
+        # only update the weight if any head has
+        # current_max_attn_logits > qk_clip_threshold
+        if torch.any(grouped_max_attn_logits > self.config.qk_clip_threshold):
+            # Use num_query_groups_per_partition for tensor parallel scenarios
+
+            # qk_clip_balancing_eta (g, 1, 1)
+            assert grouped_max_attn_logits.shape == (
+                self.num_query_groups_per_partition,
+            ), f"current_max_attn_logits shape is not ({self.num_query_groups_per_partition},) \
+                but {grouped_max_attn_logits.shape}"
+            self.qk_clip_balancing_eta = torch.clamp(
+                self.config.qk_clip_threshold / grouped_max_attn_logits, max=1.0
+            ).view(self.num_query_groups_per_partition, 1, 1)
+            assert torch.all(self.qk_clip_balancing_eta <= 1.0)
+
+            # Handle different weight access patterns (main_param vs direct access)
+            if hasattr(self.linear_qkv.weight, 'main_param'):
+                self.linear_qkv.weight.main_param.data.copy_(
+                    self._clip_linear_qkv(self.linear_qkv.weight.main_param.data)
+                )
+
+            self.linear_qkv.weight.data.copy_(self._clip_linear_qkv(self.linear_qkv.weight.data))
+
+        # reset current_max_attn_logits
+        self.core_attention.current_max_attn_logits = None
+
+    def _clip_linear_qkv(self, weight):
+        """Apply qkclip to linear_qkv layer"""
+        # Reshape to (g, query_projection_size + 2 * kv_projection_size, -1)
+        weight_reshaped = weight.view(
+            self.num_query_groups_per_partition,
+            (self.query_projection_size + 2 * self.kv_projection_size)
+            // self.num_query_groups_per_partition,
+            -1,
+        )
+
+        # Split into query_projection_size and 2 * kv_projection_size parts:
+        # (n, a, -1) and (n, b, -1)
+        weight_q = weight_reshaped[
+            :, : self.query_projection_size // self.num_query_groups_per_partition, :
+        ]
+        weight_k = weight_reshaped[
+            :,
+            self.query_projection_size
+            // self.num_query_groups_per_partition : (
+                self.query_projection_size + self.kv_projection_size
+            )
+            // self.num_query_groups_per_partition,
+            :,
+        ]
+        weight_v = weight_reshaped[
+            :,
+            (self.query_projection_size + self.kv_projection_size)
+            // self.num_query_groups_per_partition :,
+            :,
+        ]
+
+        # extend the qk_clip_balancing_eta to the same shape as weight_q and weight_k
+        self.qk_clip_balancing_eta_extended = self.qk_clip_balancing_eta.repeat(
+            1, weight_q.size(1), 1
+        )
+
+        # Clipping
+        weight_q.mul_(torch.pow(self.qk_clip_balancing_eta_extended, self.config.qk_clip_alpha))
+        weight_k.mul_(torch.pow(self.qk_clip_balancing_eta, 1 - self.config.qk_clip_alpha))
+
+        # Concatenate back and reshape to original shape
+        weight_updated = torch.cat([weight_q, weight_k, weight_v], dim=1)
+        weight_updated = weight_updated.view(
+            self.query_projection_size + 2 * self.kv_projection_size, -1
+        )
+
+        return weight_updated
+
 
 class CrossAttention(Attention):
     """Cross-attention layer class
@@ -1228,9 +1655,9 @@ def __init__(
         config: TransformerConfig,
         submodules: CrossAttentionSubmodules,
         layer_number: int,
-        attn_mask_type=AttnMaskType.padding,
-        cp_comm_type: str = None,
-        pg_collection: ProcessGroupCollection = None,
+        attn_mask_type: AttnMaskType = AttnMaskType.padding,
+        cp_comm_type: str | None = None,
+        pg_collection: ProcessGroupCollection | None = None,
     ):
         super().__init__(
             config=config,
@@ -1246,38 +1673,45 @@ def __init__(
             raise ValueError("Group query attention is not currently supported in cross attention.")
         assert self.query_projection_size == self.kv_projection_size
 
-        self.linear_q = build_module(
-            submodules.linear_q,
+        self.linear_q = submodules.linear_q(
             self.config.hidden_size,
             self.query_projection_size,
             config=self.config,
-            init_method=self.config.init_method,
+            init_method=not_none(self.config.init_method),
             gather_output=False,
             bias=self.config.add_bias_linear,
             skip_bias_add=False,
             is_expert=False,
         )
 
-        self.linear_kv = build_module(
-            submodules.linear_kv,
+        self.linear_kv = submodules.linear_kv(
             self.config.hidden_size,
             2 * self.kv_projection_size,
             config=self.config,
-            init_method=self.config.init_method,
+            init_method=not_none(self.config.init_method),
             gather_output=False,
             bias=self.config.add_bias_linear,
             skip_bias_add=False,
             is_expert=False,
         )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=True):
+    def get_query_key_value_tensors(
+        self,
+        hidden_states: Tensor,
+        key_value_states: Optional[Tensor],
+        output_gate: bool = False,
+        split_qkv: bool = True,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
         """
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
         """
         assert split_qkv, "split_qkv must be True for CrossAttention"
+        assert not output_gate, "Output gate is not supported in cross attention for now."
+
+        assert key_value_states is not None, "key_value_states cannot be None for CrossAttention"
         # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
-        mixed_kv, _ = self.linear_kv(key_value_states)
+        mixed_kv, _ = apply_module(self.linear_kv)(key_value_states)
 
         # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
         new_tensor_shape = mixed_kv.size()[:-1] + (
@@ -1290,7 +1724,7 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv
         (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2)
 
         # Attention head [sq, b, h] --> [sq, b, hp]
-        query, _ = self.linear_q(hidden_states)
+        query, _ = apply_module(self.linear_q)(hidden_states)
 
         # [sq, b, hp] --> [sq, b, np, hn]
         new_tensor_shape = query.size()[:-1] + (
diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py
index 7b81eb723ed..3643c42c3ce 100644
--- a/megatron/core/transformer/cuda_graphs.py
+++ b/megatron/core/transformer/cuda_graphs.py
@@ -1,18 +1,24 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import dataclasses
 import gc
 import inspect
 import logging
+import math
 import os
 import time
 from collections import defaultdict
 from contextlib import nullcontext
-from dataclasses import fields, is_dataclass
+from copy import deepcopy
+from dataclasses import dataclass, is_dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from functools import partial
+from itertools import chain, zip_longest
+from math import ceil
+from typing import Any, Dict, List
 
 import torch
-from torch.utils._pytree import tree_flatten
+from torch.utils._pytree import tree_map as tree_map_pyt
 
 from megatron.core import parallel_state
 from megatron.core.num_microbatches_calculator import get_num_microbatches
@@ -20,9 +26,10 @@
     CudaRNGStatesTracker,
     get_all_rng_states,
     get_cuda_rng_tracker,
+    is_checkpointing,
 )
-from megatron.core.transformer.identity_op import IdentityOp
-from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import (
     get_attr_wrapped_model,
@@ -34,6 +41,7 @@
 
 try:
     import transformer_engine as te  # pylint: disable=unused-import
+    from transformer_engine.pytorch.distributed import is_fp8_activation_recompute_enabled
     from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
     from transformer_engine.pytorch.graph import (
         make_graphed_callables,
@@ -43,6 +51,7 @@
     from transformer_engine.pytorch.graph import set_capture_end as te_set_capture_end
     from transformer_engine.pytorch.graph import set_capture_start as te_set_capture_start
     from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
+    from transformer_engine.pytorch.utils import make_weak_ref
 
     HAVE_TE_GRAPHS = True
 except:
@@ -56,7 +65,7 @@
     HAVE_TQDM = False
 
 _IS_GRAPH_CAPTURING = False
-
+_IS_GRAPH_WARMUP = False
 logger = logging.getLogger(__name__)
 
 # Freeze GC during capture.
@@ -74,7 +83,6 @@
 
 def is_graph_capturing():
     """Query if currently capturing."""
-    global _IS_GRAPH_CAPTURING
     return _IS_GRAPH_CAPTURING
 
 
@@ -90,6 +98,39 @@ def _set_capture_end():
     _IS_GRAPH_CAPTURING = False
 
 
+def is_graph_warmup():
+    """Query if currently warming up for graph capture."""
+    return _IS_GRAPH_WARMUP
+
+
+def _set_warmup_start():
+    """Set graph warmup has started."""
+    global _IS_GRAPH_WARMUP
+    _IS_GRAPH_WARMUP = True
+
+
+def _set_warmup_end():
+    """Set graph warmup has ended."""
+    global _IS_GRAPH_WARMUP
+
+
+@dataclass
+class CudagraphBufferMetadata:
+    """
+    Metadata saved to tensors during cudagraph capture. This data will be used to determine
+    during graph captue when a cudagraph can reuse a buffer or directly write its output into
+    a subsequent's graph's input.
+    """
+
+    is_cudagraph_input: bool = False
+    is_cudagraph_output: bool = False
+    input_use_count: int = 0
+    cudagraph_reuse_ref_count: int = 0
+    capture_reuse_count: int = 0
+    fwd_cudagraph_buffer: torch.Tensor = None
+    bwd_cudagraph_buffer: torch.Tensor = None
+
+
 class ArgMetadata:
     """Arg meta."""
 
@@ -99,9 +140,84 @@ def __init__(self, arg):
             self.shape = arg.shape
             self.dtype = arg.dtype
             self.device = arg.device
+            self.value = arg.data_ptr()
+            self.requires_grad = arg.requires_grad
+            if hasattr(arg, "cg_buffer_metadata"):
+                # Its important this is a reference copy
+                self.cg_buffer_metadata = arg.cg_buffer_metadata
         else:
             self.value = arg
 
+    def zeros_like(self):
+        """Reconstruct a tensor with the properties as the meta arg."""
+
+        assert self.type == torch.Tensor
+        return torch.zeros(
+            *self.shape, dtype=self.dtype, device=self.device, requires_grad=self.requires_grad
+        )
+
+
+class TensorReusePool:
+    """
+    A pool-like list of tensors that can be reused as input and output buffers during graph capture.
+    Also maintains strong references to all tensors created by this pool, so that they will never be
+    freed by the memory allocator.
+    """
+
+    """Record strong references to buffers created by the pool so they cannot be deallocated between
+    graph captures."""
+    tensor_strong_refs: list = []
+
+    """Record the data_ptrs of buffers created by the pool to check when a tensor came was 
+    allocated from this pool. """
+    tensor_strong_refs_dataptrs: set = set()
+
+    """Buffers that have been returned to the pool and are available for reuse. """
+    pool: list[torch.Tensor] = []
+
+    def insert(self, tensor: torch.Tensor):
+        """Return a tensor to the pool reuse."""
+        assert self.owns(tensor)
+        self.pool.append(tensor)
+
+    def owns(self, tensor: torch.Tensor):
+        """Check if a tensor was created from this pool."""
+        return tensor.data_ptr() in self.tensor_strong_refs_dataptrs
+
+    def get(self, meta: ArgMetadata):
+        """Try to get a buffer from the pool. If a matching tensor is already in the pool, its
+        assumed to be available and returned. Otherwise, allocate a new buffer."""
+
+        assert isinstance(meta, ArgMetadata)
+        # Find first matching buffer in pool
+        for i, buf in enumerate(self.pool):
+            if buf.shape == meta.shape and buf.dtype == meta.dtype and buf.device == meta.device:
+                return self.pool.pop(i)
+
+        out = meta.zeros_like()
+        self.tensor_strong_refs.append(out)
+        self.tensor_strong_refs_dataptrs.add(out.data_ptr())
+        return out
+
+
+def tree_map(func, tree):
+    """
+    Wrapper around pytorch's tree_map, but also recurses into dataclasses.
+    """
+
+    def wrapper(arg):
+        # If it's a dataclass, map over its fields
+        if is_dataclass(arg) and not isinstance(arg, type):
+            changes = {
+                f.name: tree_map_pyt(func, getattr(arg, f.name)) for f in dataclasses.fields(arg)
+            }
+            return dataclasses.replace(arg, **changes)
+
+        # Otherwise, apply the user function
+        return func(arg)
+
+    return tree_map_pyt(wrapper, tree)
+
 
 def _check_supported_type(meta):
     """Check if arg meta is a supported type for cudagraph input/outputs."""
@@ -119,35 +235,16 @@ def _check_supported_type(meta):
         int,
         str,
         float,
+        dataclass,
         StaticInferenceContext,
         DynamicInferenceContext,
+        ArgMetadata,
     }
     assert meta.type in _SUPPORTED_TYPES or is_dataclass(
         meta.value
     ), f"Cudagraphs recieved an arg of type {meta.type} which is not supported."
 
 
-def _determine_if_transformer_decoder_layer(base_module):
-    """Determine if the given module is a transformer decoder layer."""
-    # import modules here to avoid a circular import
-    from megatron.core.ssm.mamba_layer import MambaLayer
-    from megatron.core.transformer.transformer_layer import BaseTransformerLayer, TransformerLayer
-
-    is_potential_decoder_layer = isinstance(
-        base_module, (TransformerLayer, BaseTransformerLayer, MambaLayer)
-    )
-    if not is_potential_decoder_layer:
-        return False
-    if isinstance(base_module, TransformerLayer) and not isinstance(
-        base_module.cross_attention, IdentityOp
-    ):
-        # If the layer has a cross attention, it is not a decoder layer
-        return False
-    else:
-        # Otherwise it is a decoder layer
-        return True
-
-
 def _determine_if_first_last_layer_of_this_vp_chunk(base_module):
     """Determine if the given module is the first/last layer of the PP+VPP chunk it belongs to.
     Returns a tuple of two booleans indicating if the module is the first/last layer of the chunk.
@@ -157,6 +254,9 @@ def _determine_if_first_last_layer_of_this_vp_chunk(base_module):
     from megatron.core.transformer.transformer_block import get_num_layers_to_build
     from megatron.core.transformer.transformer_layer import get_transformer_layer_offset
 
+    if not hasattr(base_module, "layer_number"):
+        return True, True
+
     # find all first/last layers of this PP stage
     first_layer_numbers = []
     last_layer_numbers = []
@@ -174,6 +274,48 @@ def _determine_if_first_last_layer_of_this_vp_chunk(base_module):
     )
 
 
+def _clone_nested_tensors(value: Any) -> Any:
+    """Recursively clone tensors inside nested containers."""
+    if torch.is_tensor(value):
+        return value.clone()
+    if isinstance(value, (tuple, list)):
+        return type(value)(_clone_nested_tensors(v) for v in value)
+    if isinstance(value, dict):
+        return {k: _clone_nested_tensors(v) for k, v in value.items()}
+    if isinstance(value, set):
+        raise TypeError(
+            "Sets of tensors are unsupported in cudagraph helpers; use list/tuple instead"
+        )
+    return value
+
+
+def _ensure_generator_state_is_cudagraph_safe(gen: torch.Generator) -> torch.Generator:
+    """Make generator state safe for CUDA graph capture/replay.
+
+    Generator state tensors can become inference tensors if created under `torch.inference_mode()`.
+    CUDA graph capture may later attempt in-place updates on that state; this fails for inference
+    tensors. Fix the generator *in-place* (preserving identity) by cloning its state outside
+    inference mode and setting it back.
+    """
+    with torch.inference_mode(mode=False):
+        if hasattr(gen, "graphsafe_get_state"):
+            state = gen.graphsafe_get_state()
+        else:
+            state = gen.get_state()
+
+        cloned_state = _clone_nested_tensors(state)
+        if hasattr(gen, "graphsafe_set_state"):
+            gen.graphsafe_set_state(cloned_state)
+        else:
+            gen.set_state(cloned_state)
+
+    return gen
+
+
+fwd_buffer_reuse_ref_count = 0
+bwd_buffer_reuse_ref_count = 0
+
+
 class _CudagraphGlobalRecord:
     """A global datastructure that records of the ordering of all _CudaGraphRunner's
     first fwd or bwd passes. 'create_cudagraphs' will use this to create
@@ -185,13 +327,16 @@ class _CudagraphGlobalRecord:
 
     """A record of fwd and bwd graph creation, populated with 'record_fwd_graph' and
     'record_bwd_graph."""
-    cudagraph_record = []
-    cudagraph_inference_record = []
+    cudagraph_record: list[tuple] = []
+    cudagraph_inference_record: list[tuple] = []
+
+    """A pool-like data structure to reuse input and output buffers across cudagraph."""
+    tensor_reuse_pool = TensorReusePool()
 
     @classmethod
-    def record_fwd_graph(cls, runner, args, kwargs):
+    def record_fwd_graph(cls, runner, args, kwargs, out):
         """Record a fwd graph to 'cudagraph_record"""
-        cls.cudagraph_record.append((runner, "fwd", args, kwargs))
+        cls.cudagraph_record.append((runner, "fwd", args, kwargs, out))
 
     @classmethod
     def record_bwd_graph(cls, runner):
@@ -202,7 +347,6 @@ def record_bwd_graph(cls, runner):
     def create_cudagraphs(cls):
         """Iterate through 'cudagraph_record' creating graphs in the order in which
         they were recorded."""
-
         # Cudagraphs have already been created, check that no cudagraphed modules ran in eager mode
         if cls.cudagraph_created:
             assert len(cls.cudagraph_record) == 0, (
@@ -216,8 +360,6 @@ def create_cudagraphs(cls):
             return
 
         # Otherwise, create all the recorded cudagraphs.
-        logging.getLogger(__name__).info(f"Creating {len(cls.cudagraph_record)} CUDA graphs")
-
         has_te_modules = False
         if HAVE_TE_GRAPHS:
             for g in cls.cudagraph_record:
@@ -226,21 +368,27 @@ def create_cudagraphs(cls):
                     [isinstance(m, TransformerEngineBaseModule) for m in base_module.modules()]
                 )
 
-        # If graphing only transformer layers with self attention, then apply the following
-        # transformer layer specific optimizations that reduce memory usage and tensor copies:
-        # These eventually will become unneccessary with:
-        # https://github.com/pytorch/pytorch/pull/137318
-        # 1. Some inputs to TransformerLayer (e.g. rotary_emb) are the same over all layers
-        #    and only need to be set once.
-        # 2. Because the next layer consumes the previous layer's hidden states, all fwd
-        #    cudagraphs can alternate reusing the same hidden_state input, output buffer.
-        #    Similarly, bwd graphs can alternate the same output, input grad buffers.
-        optimize_transformer_layer_graph_buffers = all(
-            [g[0].reuse_input_output_buffer for g in cls.cudagraph_record]
-        )
-        if optimize_transformer_layer_graph_buffers:
-            prev_fwd_hidden_state_output = None
-            prev_bwd_hidden_state_inputgrad = None
+        progress_bar = enumerate(cls.cudagraph_record)
+        time_start = time.time()
+        mem_stats_start = torch.cuda.memory_stats()
+
+        if torch.distributed.get_rank() == 0:
+            if HAVE_TQDM:
+                progress_bar = tqdm(
+                    progress_bar, "create cuda graphs", total=len(cls.cudagraph_record)
+                )
+
+            logger.info(f"Creating {len(cls.cudagraph_record)} CUDA graphs")
+            if not HAVE_TE_GRAPHS:
+                logger.warning(
+                    "Transformer Engine was not detected while capturing training cudagraphs."
+                    "As a result cudagraph memory overhead may significantly increase as "
+                    "Transformer Engine's weak reference feature is used on cudagraph input and "
+                    "output buffers. This allows the memory of input and output buffers to be "
+                    " reclaimed across graphs while remaining valid buffers for when the graph "
+                    "is replayed. For more information see: "
+                    "https://github.com/NVIDIA/TransformerEngine/blob/v2.10/transformer_engine/pytorch/utils.py#L759"  # pylint: disable=line-too-long
+                )
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -249,6 +397,8 @@ def create_cudagraphs(cls):
         if has_te_modules:
             te_set_capture_start()
 
+        global bwd_buffer_reuse_ref_count, fwd_buffer_reuse_ref_count
+
         def format_mem_bytes(mem_bytes):
             for power, suffix in [(4, "tb"), (3, "gb"), (2, "mb"), (1, "kb"), (0, "bytes")]:
                 suffix_bytes = 1024**power
@@ -256,58 +406,25 @@ def format_mem_bytes(mem_bytes):
                     return "%.1f %s" % (mem_bytes / suffix_bytes, suffix)
             return "%d bytes" % mem_bytes
 
-        time_start = time.time()
-        mem_stats_start = torch.cuda.memory_stats()
-        progress_bar = enumerate(cls.cudagraph_record)
-        if HAVE_TQDM:
-            progress_bar = tqdm(progress_bar, "create cuda graphs", total=len(cls.cudagraph_record))
         for g_idx, g in progress_bar:
+            if torch.distributed.get_rank() == 0:
+                mem_stats = torch.cuda.memory_stats()
+                progress_str = "create cuda graphs | mem: alloc %s, res %s" % (
+                    format_mem_bytes(mem_stats["allocated_bytes.all.current"]),
+                    format_mem_bytes(mem_stats["reserved_bytes.all.current"]),
+                )
+                if HAVE_TQDM:
+                    progress_bar.set_description(progress_str)
+                elif g_idx % 100 == 0 or g_idx == len(cls.cudagraph_record) - 1:
+                    logger.info(f"{g_idx}/{len(cls.cudagraph_record)}. {progress_str}")
 
             runner, graph_type = g[0:2]
-
-            mem_stats = torch.cuda.memory_stats()
-            progress_str = "create cuda graphs | mem: alloc %s, res %s" % (
-                format_mem_bytes(mem_stats["allocated_bytes.all.current"]),
-                format_mem_bytes(mem_stats["reserved_bytes.all.current"]),
-            )
-            if HAVE_TQDM:
-                progress_bar.set_description(progress_str)
-            elif g_idx % 100 == 0 or g_idx == len(cls.cudagraph_record) - 1:
-                logger.info(f"{g_idx}/{len(cls.cudagraph_record)}. {progress_str}")
-
-            if optimize_transformer_layer_graph_buffers:
-                if graph_type == 'fwd':
-                    args, kwargs = g[2:]
-
-                    if not runner.is_first_layer:
-                        kwargs['hidden_states'] = prev_fwd_hidden_state_output
-                    runner.create_fwd_graph(args, kwargs, clone_inputs=False)
-
-                    # The output of TransformerLayer is: (hidden_states, None)
-                    # The output of MambaLayer is: (hidden_states,)
-                    # make sure to get the hidden states tensor from the tuple
-                    prev_fwd_hidden_state_output = runner.fwd_graph_outputs[0]
-
-                else:
-                    # In vision models, encoder and decoder transformers have different
-                    # hidden_states shapes. Each has its own first and last layers that
-                    # are noncontiguous. Reset prev_bwd_hidden_state_inputgrad to None at
-                    # each last layer to avoid shape mismatch when transitioning between
-                    # encoder and decoder.
-                    if runner.is_last_layer:
-                        prev_bwd_hidden_state_inputgrad = None
-
-                    runner.create_bwd_graph(prev_bwd_hidden_state_inputgrad)
-
-                    # The first input grad TransformerLayer is for 'hidden_states'
-                    prev_bwd_hidden_state_inputgrad = runner.static_grad_inputs[0]
+            if graph_type == 'fwd':
+                args, kwargs, out = g[2:]
+                runner.create_fwd_graph(args, kwargs, out, clone_inputs=True)
             else:
-                runner, graph_type = g[0:2]
-                if graph_type == 'fwd':
-                    args, kwargs = g[2:]
-                    runner.create_fwd_graph(args, kwargs)
-                else:
-                    runner.create_bwd_graph()
+                assert fwd_buffer_reuse_ref_count == 0
+                runner.create_bwd_graph()
 
         # Memory usage.
         time_end = time.time()
@@ -323,7 +440,10 @@ def format_mem_bytes(mem_bytes):
                 - mem_stats_start["reserved_bytes.all.current"]
             ),
         }
-        logger.info(
+
+        log_single_rank(
+            logger,
+            logging.INFO,
             "> built %d cuda graph(s) in %.2f sec, with total memory usage: "
             "allocated %s, reserved %s."
             % (
@@ -331,7 +451,7 @@ def format_mem_bytes(mem_bytes):
                 capture_stats["time"],
                 format_mem_bytes(capture_stats["allocated_bytes"]),
                 format_mem_bytes(capture_stats["reserved_bytes"]),
-            )
+            ),
         )
 
         # Mark cuda graphs as created.
@@ -348,6 +468,8 @@ def format_mem_bytes(mem_bytes):
         if has_te_modules:
             te_set_capture_end()
 
+        torch.cuda.set_stream(torch.cuda.default_stream())
+
         # Return capture time and memory usage.
         return capture_stats
 
@@ -381,8 +503,7 @@ def delete_cuda_graphs():
         runner.bwd_graph_recorded = False
         runner.fwd_graph = None
         runner.bwd_graph = None
-        runner.fwd_mempool = None
-        runner.bwd_mempool = None
+        runner.mempool = None
 
     # Reset global tracking state
     _CudagraphGlobalRecord.cudagraph_created = False
@@ -394,8 +515,6 @@ def delete_cuda_graphs():
     torch.cuda.empty_cache()
 
     CudaGraphManager.global_mempool = None
-    CudaGraphManager.fwd_mempools = None
-    CudaGraphManager.bwd_mempool = None
 
 
 class _GraphStatus(Enum):
@@ -456,38 +575,42 @@ def forward(ctx, runner, is_first_microbatch, *inputs):
         ), "Fwd cudagraph received a different number of tensors than what it was graphed with!"
 
         # Copy new data into fwd graph input buffer
+        need_copy_inputs = []
         for user_input, cudagraph_input in zip(inputs, runner.fwd_graph_input_surface):
-            if user_input.data_ptr() != cudagraph_input.data_ptr():
+            if (
+                hasattr(cudagraph_input, "can_skip_replay_copy")
+                and cudagraph_input.can_skip_replay_copy
+            ):
+                need_copy_inputs.append(user_input)
+                assert user_input.data_ptr() == cudagraph_input.data_ptr()
+            else:
                 cudagraph_input.copy_(user_input)
 
         ctx.runner = runner
-        if runner.fp8_enabled or runner.fp4_enabled:
-            for m in runner.base_module.modules():
-                if isinstance(m, TransformerEngineBaseModule):
-                    m.fp8_meta["fp8_group"] = FP8GlobalStateManager.get_fp8_group()
-                    m.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
+        ctx.save_for_backward(*need_copy_inputs)
 
-                    if is_te_min_version("1.13.0"):
-                        FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(m.fp8_meta)
-                    else:
-                        FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(
-                            m.fp8_meta, fp8_weights=m._get_fp8_params()
-                        )
+        if runner.fp8_enabled or runner.fp4_enabled:
+            if isinstance(FP8GlobalStateManager.get_fp8_recipe(), te.common.recipe.DelayedScaling):
+                for m in runner.base_module.modules():
+                    if isinstance(m, TransformerEngineBaseModule):
+                        m.fp8_meta["fp8_group"] = FP8GlobalStateManager.get_fp8_group()
+                        m.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
+
+                        if is_te_min_version("1.13.0"):
+                            FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(m.fp8_meta)
+                        else:
+                            FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(
+                                m.fp8_meta, fp8_weights=m._get_fp8_params()
+                            )
 
-            is_first_fp8_module = FP8GlobalStateManager.is_first_fp8_module()
-            if is_first_fp8_module:
+            # Note that FP8GlobalStateManager.is_first_fp8_module() is inacccurate as each
+            # layer may be in its own fp8 context, when the fp8 recipe != delayed_scaling
+            if runner.is_first_layer and (runner.fp8_param_cache_updated != is_first_microbatch):
                 FP8GlobalStateManager.set_skip_fp8_weight_update_tensor(not is_first_microbatch)
-            ctx.is_first_fp8_module = is_first_fp8_module
+                runner.fp8_param_cache_updated = is_first_microbatch
 
         runner.fwd_graph.replay()
-
-        # if last transformer layer, return a clone of the cudagraph output buffer, as releasing
-        # the cudagraph output buffer into the rest of the system may allow it to be corrupted
-        if runner.is_last_layer:
-            out = tuple(o.clone().detach() for o in runner.fwd_graph_output_surface)
-        else:
-            out = tuple(o.detach() for o in runner.fwd_graph_output_surface)
-        return out
+        return runner.fwd_graph_output_surface
 
     @staticmethod
     def backward(ctx, *grads):
@@ -504,16 +627,28 @@ def backward(ctx, *grads):
             runner.static_grad_outputs
         ), "Bwd cudagraph received a different number of tensors than what it was graphed with!"
 
+        need_copy_inputs = list(ctx.saved_tensors)
+        for cudagraph_input in runner.fwd_graph_input_surface:
+            if (
+                hasattr(cudagraph_input, "can_skip_replay_copy")
+                and cudagraph_input.can_skip_replay_copy
+            ):
+                cudagraph_input.copy_(need_copy_inputs.pop(0))
+
         # Copy new data into bwd graph input buffer
         for user_output_grad, cudagraph_output_grad in zip(grads, runner.static_grad_outputs):
+            if cudagraph_output_grad is None:
+                continue
             if user_output_grad.data_ptr() != cudagraph_output_grad.data_ptr():
                 cudagraph_output_grad.copy_(user_output_grad)
 
         runner.bwd_graph.replay()
         runner.status = _GraphStatus.FWD_READY
 
-        # Update FP8/FP4 scale factors if needed
-        if (runner.fp8_enabled or runner.fp4_enabled) and ctx.is_first_fp8_module:
+        # Update FP8 scale factors if needed
+        if runner.fp8_enabled and isinstance(
+            FP8GlobalStateManager.get_fp8_recipe(), te.common.recipe.DelayedScaling
+        ):
             FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False)
 
         # If using gradient_accumulation_fusion, whenever `main_grad` is calculated
@@ -522,18 +657,12 @@ def backward(ctx, *grads):
         for param, grad_added in runner.groundtruth_grad_added_to_main_grad.items():
             param.grad_added_to_main_grad = grad_added
 
-        grads, is_dummy_grad = runner.get_input_grads_with_dummy_flags()
-        if runner.is_first_layer:
-            output_grads = tuple(
-                b.clone().detach() if not (b is None or dummy) else b
-                for dummy, b in zip(is_dummy_grad, grads)
-            )
-        else:
-            output_grads = tuple(
-                b.detach() if not (b is None or dummy) else b
-                for dummy, b in zip(is_dummy_grad, grads)
-            )
-        return None, None, *output_grads
+        # Replaying the next bwd graph destroys the data held in static_grad_inputs, so clone
+        # wgrads as autograd may launch the next graph before wgrads are accumulated
+        dgrads = runner.static_grad_inputs[: runner.num_dgrads]
+        wgrads = (g.clone() for g in runner.static_grad_inputs[runner.num_dgrads :])
+
+        return None, None, *dgrads, *wgrads
 
 
 class _CudaGraphRunner(torch.nn.Module):
@@ -544,23 +673,20 @@ class _CudaGraphRunner(torch.nn.Module):
     def __init__(
         self,
         base_module: MegatronModule,
-        fwd_mempool: int,
-        bwd_mempool: int,
+        mempool: int,
         fwd_graph_input_args: List[Any],
         fwd_graph_input_kwargs: Dict[str, Any],
-        share_cudagraph_io_buffers=None,
+        func,
+        need_backward,
     ):
         """Creates a _CudaGraphRunner, which holds a single pair of fwd and bwd cudagraphs, which
         are not created until this runner records its graph creation into
-        '_CudagraphGlobalRecord', and 'create_cudagraphs()' is called. share_cudagraph_io_buffers
-        is a boolean flag to indicate whether to reuse the cudagraph input and output buffers for
-        transformer layer specific optimizations that reduce memory usage and tensor copies."""
+        '_CudagraphGlobalRecord', and 'create_cudagraphs()' is called."""
 
         super().__init__()
 
         self.base_module = base_module
-        self.fwd_mempool = fwd_mempool
-        self.bwd_mempool = bwd_mempool
+        self.mempool = mempool
 
         self.fwd_graph_input_arg_metas = [ArgMetadata(a) for a in fwd_graph_input_args]
         self.fwd_graph_input_kwarg_metas = {
@@ -580,14 +706,30 @@ def __init__(
         self.fp8_enabled = False
         self.fp4_enabled = False
         self.deallocate_pipeline_outputs = False
-        self.num_warmup_steps = 2
-        if isinstance(self.base_module.config, TransformerConfig):
+
+        self.grad_enabled = need_backward and torch.is_grad_enabled()
+        self.func = super(MegatronModule, self.base_module).__call__ if func is None else func
+        self.is_first_layer, self.is_last_layer = _determine_if_first_last_layer_of_this_vp_chunk(
+            base_module
+        )
+
+        # We use this attribute to record the value of 'is_first_microbatch' each fwd cudagraph
+        # replay so that way we only update the value of this flag in FP8GlobalStateManager when
+        # it changes which incurs an HtoD sync
+        if self.is_first_layer:
+            self.fp8_param_cache_updated = None
+
+        if hasattr(self.base_module, "config") and isinstance(
+            self.base_module.config, TransformerConfig
+        ):
             self.fuse_wgrad_accumulation = self.base_module.config.gradient_accumulation_fusion
             self.backward_retain_grad = self.base_module.config.cuda_graph_retain_backward_graph
-            self.fp8_enabled = self.base_module.config.fp8 is not None
-            self.fp4_enabled = self.base_module.config.fp4 is not None
             self.deallocate_pipeline_outputs = self.base_module.config.deallocate_pipeline_outputs
             self.num_warmup_steps = self.base_module.config.cuda_graph_warmup_steps
+            self.fp8_enabled = self.base_module.config.fp8 is not None
+            self.fp4_enabled = self.base_module.config.fp4 is not None
+            self.fp8_runtime_enabled = None
+            self.fp4_runtime_enabled = None
 
             if self.fp8_enabled:
                 self.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe()
@@ -599,259 +741,435 @@ def __init__(
                 self.fp4_recipe = get_fp4_recipe(self.base_module.config)
                 FP8GlobalStateManager.set_skip_fp8_weight_update_tensor(False)
 
-        # Decide whether to reuse the input and output buffer, and if so,
-        # whether this layer is the first layer (which needs an input buffer)
-        # or the last layer (which needs an output buffer)
-
-        self.is_transformer_decoder_layer = _determine_if_transformer_decoder_layer(base_module)
-        self.reuse_input_output_buffer = (
-            share_cudagraph_io_buffers and self.is_transformer_decoder_layer
-        )
-        if self.reuse_input_output_buffer:
-            self.is_first_layer, self.is_last_layer = (
-                _determine_if_first_last_layer_of_this_vp_chunk(base_module)
-            )
-        else:
-            self.is_first_layer, self.is_last_layer = True, True
-
     def __str__(self):
         return "%s; hid %s" % (
             self.base_module.__class__.__name__,
             tuple(self.fwd_graph_input_kwarg_metas["hidden_states"].shape),
         )
 
-    def get_fp8_context(self):
-        """Return a new fp8 context in cudagraph mode."""
-        from megatron.core.fp8_utils import get_fp8_context  # to avoid circular import
-
-        return get_fp8_context(self.base_module.config, self.base_module.layer_number - 1)
-
-    def get_fp4_context(self):
-        """Return a new fp4 context in cudagraph mode."""
-        from megatron.core.fp4_utils import get_fp4_context  # to avoid circular import
-
-        return get_fp4_context(self.base_module.config, self.base_module.layer_number - 1)
-
     def get_quantization_context(self):
         """Return appropriate quantization context (FP8 or FP4) in cudagraph mode."""
-        if self.fp8_enabled:
-            return self.get_fp8_context()
-        elif self.fp4_enabled:
-            return self.get_fp4_context()
+        if self.fp8_runtime_enabled:
+            from megatron.core.fp8_utils import get_fp8_context  # to avoid circular import
+
+            return get_fp8_context(self.base_module.config, self.base_module.layer_number - 1)
+        elif self.fp4_runtime_enabled:
+            from megatron.core.fp4_utils import get_fp4_context  # to avoid circular import
+
+            return get_fp4_context(self.base_module.config, self.base_module.layer_number - 1)
         else:
             return nullcontext()
 
-    def create_fwd_graph(self, args, kwargs, clone_inputs=True):
+    def get_connected_params(self, outputs):
+        """Iterate through the autograd graph of 'outputs' and returns all parameters connected.
+        In theory this should return all parameters that return a nonzero wgrad when computing
+        the backward pass of 'outputs'."""
+        # Flatten outputs and start traversal from roots that require gradients
+        args = (outputs,) if torch.is_tensor(outputs) else outputs
+        stack = [
+            t.grad_fn
+            for t in self.get_tensors(args, check_types=False)
+            if t.requires_grad and t.grad_fn
+        ]
+        visited, p_ids = set(), set()
+
+        while stack:
+            if (fn := stack.pop()) not in visited:
+                visited.add(fn)
+                # AccumulateGrad nodes (leafs) hold the 'variable' (Parameter) they accumulate into
+                if hasattr(fn, 'variable'):
+                    p_ids.add(id(fn.variable))
+                stack.extend(f for f, _ in fn.next_functions if f)
+
+        # Return module params that were found in the graph, preserving original order
+        return tuple(p for p in self.base_module.parameters() if id(p) in p_ids)
+
+    def create_fwd_graph(self, args, kwargs, outputs=None, clone_inputs=True):
         """Create a fwd cudagraph for this runner. Should be called inside
         'create_cudagraphs()'."""
 
-        # Freeze GC, to speed up capture time ~15-20x.
-        if FREEZE_GC:
-            gc.freeze()
+        global fwd_buffer_reuse_ref_count
+
+        self.args = args
+        self.kwargs = kwargs
+        self.outputs = outputs
 
         # save grads and other variables that may be affected by graph warmup
         if self.training and torch.is_grad_enabled():
-            save_main_grads = [
-                param.main_grad.clone()
-                for param in self.base_module.parameters()
-                if hasattr(param, 'main_grad')
-            ]
+            grad_backup = []
+            for param in self.base_module.parameters():
+                grad_backup.append(param.main_grad.clone() if hasattr(param, "main_grad") else None)
+
+            saved_fp8_tensors = None
+            if self.fp8_enabled:
+                if is_te_min_version("1.13.0"):
+                    saved_fp8_tensors = save_fp8_tensors([self.base_module], self.fp8_recipe)
+                else:
+                    saved_fp8_tensors = save_fp8_tensors(
+                        [self.base_module], self.fp8_recipe.amax_history_len
+                    )
+            elif self.fp4_enabled:
+                if is_te_min_version("2.7.0.dev0"):
+                    saved_fp8_tensors = save_fp8_tensors([self.base_module], self.fp4_recipe)
+                else:
+                    raise ValueError("FP4 requires TE >= 2.7.0.dev0 for NVFP4BlockScaling support.")
 
-        saved_fp8_tensors = None
+        # cache the moe aux loss if needed, which is accumulated inside the forward pass
+        from megatron.core.transformer.transformer_layer import MoETransformerLayer
 
-        if self.fp8_enabled:
-            if is_te_min_version("1.13.0"):
-                saved_fp8_tensors = save_fp8_tensors([self.base_module], self.fp8_recipe)
-            else:
-                saved_fp8_tensors = save_fp8_tensors(
-                    [self.base_module], self.fp8_recipe.amax_history_len
+        is_moe = isinstance(self.base_module, MoETransformerLayer)
+        if is_moe:
+            from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
+
+            tracker = get_moe_layer_wise_logging_tracker()
+            cached_aux_losses = {}
+            for name in tracker:
+                if "values" in tracker[name]:
+                    cached_aux_losses[name] = torch.clone(tracker[name]["values"])
+
+        self.fwd_graph = torch.cuda.CUDAGraph()
+
+        # For cases with multiple active RNG states, e.g. TP.
+        rng_states = get_all_rng_states()
+        with torch.inference_mode(mode=False):
+            for gen in rng_states.values():
+                self.fwd_graph.register_generator_state(
+                    _ensure_generator_state_is_cudagraph_safe(gen)
+                )
+
+        def _resolve_input_buffer(ten):
+            if not isinstance(ten, ArgMetadata):
+                return ten
+
+            # the input tensor is resued from another cudagraph's input or output
+            if (
+                hasattr(ten, "cg_buffer_metadata")
+                and ten.cg_buffer_metadata.fwd_cudagraph_buffer is not None
+            ):
+                global fwd_buffer_reuse_ref_count
+                buf = ten.cg_buffer_metadata.fwd_cudagraph_buffer
+
+                assert (
+                    ten.cg_buffer_metadata.is_cudagraph_input
+                    and buf.cg_buffer_metadata.capture_reuse_count > 0
                 )
-        elif self.fp4_enabled:
-            if is_te_min_version("2.7.0.dev0"):
-                saved_fp8_tensors = save_fp8_tensors([self.base_module], self.fp4_recipe)
+
+                if (
+                    ten.cg_buffer_metadata.input_use_count > 1
+                    and ten.cg_buffer_metadata.input_use_count
+                    == buf.cg_buffer_metadata.capture_reuse_count
+                ):
+                    can_skip_replay_copy = False
+                else:
+                    can_skip_replay_copy = True
+
+                buf.cg_buffer_metadata.capture_reuse_count -= 1
+                if buf.cg_buffer_metadata.capture_reuse_count == 0:
+                    ten.cg_buffer_metadata.fwd_cudagraph_buffer = None
+                    fwd_buffer_reuse_ref_count -= 1
             else:
-                raise ValueError("FP4 requires TE >= 2.7.0.dev0 for NVFP4BlockScaling support.")
+                # need to provide a fresh buffer from the reuse pool
+                buf = _CudagraphGlobalRecord.tensor_reuse_pool.get(ten)
+                can_skip_replay_copy = False
+
+            buf = buf.detach().requires_grad_(ten.requires_grad)
+            buf.can_skip_replay_copy = can_skip_replay_copy
+            return buf
 
         if clone_inputs:
-            args, kwargs = self.zero_out_tensors(args, kwargs)
+            # if a buffer is used for multiple inputs, create it now
+            for ten in self.get_tensors(args, kwargs):
+                if (
+                    hasattr(ten, 'cg_buffer_metadata')
+                    and ten.cg_buffer_metadata.input_use_count > 1
+                    and ten.cg_buffer_metadata.fwd_cudagraph_buffer is None
+                ):
+                    buf = _CudagraphGlobalRecord.tensor_reuse_pool.get(ten)
+                    buf.cg_buffer_metadata = deepcopy(ten.cg_buffer_metadata)
+                    buf.cg_buffer_metadata.capture_reuse_count = (
+                        ten.cg_buffer_metadata.input_use_count
+                    )
+                    ten.cg_buffer_metadata.fwd_cudagraph_buffer = buf
+                    fwd_buffer_reuse_ref_count += 1
 
-        input_tensors = self.get_tensors(args, kwargs)
-        self.fwd_graph_input_surface = input_tensors + tuple(self.base_module.parameters())
+            self.fwd_graph_input_args = tree_map(_resolve_input_buffer, args)
+            self.fwd_graph_input_kwargs = tree_map(_resolve_input_buffer, kwargs)
+        else:
+            self.fwd_graph_input_args, self.fwd_graph_input_kwargs = args, kwargs
 
-        self.fwd_graph = torch.cuda.CUDAGraph()
+        self.fwd_graph_input_surface = self.get_tensors(
+            self.fwd_graph_input_args, self.fwd_graph_input_kwargs
+        )
 
-        # For cases with multiple active RNG states, e.g. TP.
-        for _, state in get_all_rng_states().items():
-            self.fwd_graph.register_generator_state(state)
+        ctx = torch.no_grad() if not self.grad_enabled else nullcontext()
+        with ctx:
+            # warmup again as case graph capture mode may execute a different codepath
+            _set_warmup_start()
+            for _ in range(self.num_warmup_steps):
+                with self.get_quantization_context():
+
+                    def clone_ten(ten):
+                        if not torch.is_tensor(ten):
+                            return ten
+                        return torch.zeros_like(ten).requires_grad_(ten.requires_grad)
+
+                    warmup_args = tree_map(clone_ten, self.fwd_graph_input_args)
+                    warmup_kwargs = tree_map(clone_ten, self.fwd_graph_input_kwargs)
+                    warmup_outputs = self.func(*warmup_args, **warmup_kwargs)
+
+                if self.grad_enabled:
+                    warmup_outputs = self.get_tensors(warmup_outputs)
+                    warmup_outputs = tuple(o for o in warmup_outputs if o.requires_grad)
+                    input_tensors = self.get_tensors(warmup_args, warmup_kwargs)
+                    torch.autograd.grad(
+                        outputs=warmup_outputs,
+                        inputs=tuple(i for i in input_tensors if i.requires_grad),
+                        grad_outputs=tuple(torch.zeros_like(o) for o in warmup_outputs),
+                        only_inputs=True,
+                        allow_unused=True,
+                    )
+            _set_warmup_end()
 
-        # warmup again as case graph capture mode may execute a different codepath
-        for _ in range(self.num_warmup_steps):
             with self.get_quantization_context():
-                outputs = self.base_module.forward(*args, **kwargs)
-            if self.training and torch.is_grad_enabled():
-                if isinstance(outputs, torch.Tensor):
-                    outputs = (outputs,)
-                outputs = self.get_tensors(outputs)
-                grad_inputs = torch.autograd.grad(
-                    outputs=tuple(o for o in outputs if o.requires_grad),
-                    inputs=tuple(i for i in self.fwd_graph_input_surface if i.requires_grad),
-                    grad_outputs=tuple(
-                        torch.zeros_like(o) if o.requires_grad else None for o in outputs
-                    ),
-                    only_inputs=True,
-                    allow_unused=True,
-                )
+                torch.cuda.synchronize()
+                # Register default CUDA generators ourselves (fixed in-place to have normal tensors)
+                # before capture begins, to avoid inference-tensor state issues during capture.
+                with torch.inference_mode(mode=False):
+                    for device_idx in range(torch.cuda.device_count()):
+                        default_gen = torch.cuda.default_generators[device_idx]
+                        self.fwd_graph.register_generator_state(
+                            _ensure_generator_state_is_cudagraph_safe(default_gen)
+                        )
 
-        with self.get_quantization_context():
-            torch.cuda.synchronize()
-            with torch.cuda.graph(
-                self.fwd_graph, pool=self.fwd_mempool, capture_error_mode="thread_local"
-            ):
-                outputs = self.base_module.forward(*args, **kwargs)
+                # Freeze GC, to speed up capture time ~15-20x.
+                if FREEZE_GC:
+                    gc.freeze()
+
+                with torch.cuda.graph(
+                    self.fwd_graph, pool=self.mempool, capture_error_mode="thread_local"
+                ):
+                    fwd_graph_outputs = self.func(
+                        *self.fwd_graph_input_args, **self.fwd_graph_input_kwargs
+                    )
+
+                # Unfreeze GC.
+                if FREEZE_GC:
+                    gc.unfreeze()
+
+                    # gc.collect() drops references to unreachable tensors created during capture,
+                    # returning their storage to the allocator to avoid a slowdown during replay.
+                    # However, it forces expensive global garbage collection, so must be done
+                    # only on the last layer per-device to avoid slowing down graph creation.
+                    if self.is_last_layer:
+                        gc.collect()
 
         # save cudagraph output buffer
-        if isinstance(outputs, torch.Tensor):
-            outputs = (outputs,)
-        self.fwd_graph_outputs = outputs
-        self.fwd_graph_output_surface = self.get_tensors(outputs)
+        self.fwd_graph_outputs = fwd_graph_outputs
+        self.fwd_graph_output_surface = self.get_tensors(fwd_graph_outputs)
+
+        for fwd_graph_out, o in zip(
+            self.fwd_graph_output_surface, self.get_arg_metas(self.outputs)
+        ):
+            assert hasattr(o, "cg_buffer_metadata") and o.cg_buffer_metadata.is_cudagraph_output
+
+            if (
+                o.cg_buffer_metadata.is_cudagraph_input
+                and o.cg_buffer_metadata.fwd_cudagraph_buffer is None
+            ):
+                fwd_graph_out.cg_buffer_metadata = deepcopy(o.cg_buffer_metadata)
+                fwd_graph_out.cg_buffer_metadata.capture_reuse_count = (
+                    o.cg_buffer_metadata.cudagraph_reuse_ref_count
+                )
+                o.cg_buffer_metadata.fwd_cudagraph_buffer = fwd_graph_out
+                fwd_buffer_reuse_ref_count += 1
+
+        # if an input buffer requires a copy, and does not have metadata attached to it at this
+        # point, it will not be reused after this forward pass, so return it to the pool
+        for buf in self.fwd_graph_input_surface:
+            if (
+                hasattr(buf, "can_skip_replay_copy")
+                and not buf.can_skip_replay_copy
+                and not hasattr(buf, "cg_buffer_metadata")
+            ):
+                assert _CudagraphGlobalRecord.tensor_reuse_pool.owns(buf)
+                _CudagraphGlobalRecord.tensor_reuse_pool.insert(buf)
 
         if self.training and torch.is_grad_enabled():
             assert (
                 len(self.fwd_graph_output_surface) > 0
-            ), """Tried graphing a moudule that returned no tensors in training mode,
-                however the graphed module must output at least one tensor,
+            ), """Tried graphing a module that returned no tensors in training mode, 
+                however the graphed module must output at least one tensor, 
                 so that a corresponding backward node may be registered in the autograd graph."""
 
-            # restore cached grads
-            for param in self.base_module.parameters():
-                if hasattr(param, 'main_grad'):
-                    saved_grad = save_main_grads.pop(0)
-                    assert (
-                        param.main_grad.shape == saved_grad.shape
-                    ), "Error restoring grads while cudagraphing!"
-                    param.main_grad.copy_(saved_grad)
-
-        if self.fp8_enabled or self.fp4_enabled:
-            restore_fp8_tensors([self.base_module], saved_fp8_tensors)
+            self.params_to_backprop = self.get_connected_params(fwd_graph_outputs)
+            self.num_wgrads = len(self.params_to_backprop)
+            self.num_dgrads = len(self.fwd_graph_input_surface)
+            self.fwd_graph_input_surface = self.fwd_graph_input_surface + self.params_to_backprop
 
-        # Unfreeze GC.
-        if FREEZE_GC:
-            gc.unfreeze()
+            if self.fp8_enabled:
+                restore_fp8_tensors([self.base_module], saved_fp8_tensors)
+            # restore cached grads
+            for main_grad_copy, param in zip(grad_backup, self.base_module.parameters()):
+                if main_grad_copy is not None:
+                    param.main_grad.copy_(main_grad_copy)
 
-            # gc.collect() drops references to unreachable tensors created during capture,
-            # returning their storage to the allocator to avoid a slowdown during replay. However,
-            # it forces expensive global garbage collection, so must be done only on the last layer
-            # per-device to avoid slowing down graph creation.
-            if self.is_last_layer:
-                gc.collect()
+        if is_moe:
+            for name in tracker:
+                tracker[name]["values"].copy_(cached_aux_losses[name])
 
-    def create_bwd_graph(self, static_grad_outputs=None):
+    def create_bwd_graph(self):
         """Create a bwd cudagraph for this runner. Should be called inside
         'create_cudagraphs()'."""
 
-        # Freeze GC, to speed up capture time ~15-20x.
-        if FREEZE_GC:
-            gc.freeze()
+        # unlike 'fwd_buffer_reuse_ref_count', 'bwd_buffer_reuse_ref_count' may not decrement
+        # to 0 when activation checkpointing is used. See [interaction with recompute].
+        global bwd_buffer_reuse_ref_count
 
+        assert self.grad_enabled
         self.bwd_graph = torch.cuda.CUDAGraph()
 
         # For cases with multiple active RNG states, e.g. TP.
         for _, state in get_all_rng_states().items():
             self.bwd_graph.register_generator_state(state)
 
-        if static_grad_outputs is None:
-            static_grad_outputs = tuple(
-                torch.zeros_like(o) if o.requires_grad else None
-                for o in self.fwd_graph_output_surface
-            )
-        else:
-            # canoncalize as tuple
-            if torch.is_tensor(static_grad_outputs):
-                static_grad_outputs = (static_grad_outputs,)
+        self.static_grad_outputs = []
+        for o in self.get_arg_metas(self.outputs):
+            out_grad = None
+            if o.requires_grad:
+                # TODO: (jiemingz) [interaction with recompute]
+                # for activation recompute, the fwd pass is rerun in the backward pass and
+                # the metadata we attach in record_graph_capture is lost. As a result the next
+                # cudagraph expects the buffer to be provided 'fwd_cudagraph_buffer' but is missing.
+                # So, we cannot always assume this metadata exists. Consequently, there are extra
+                # copies between the outputs of the fwd-bwd pass and the bwd pass.
+                if (
+                    o.cg_buffer_metadata.is_cudagraph_input
+                    and o.cg_buffer_metadata.bwd_cudagraph_buffer is not None
+                ):
+                    o.cg_buffer_metadata.bwd_cudagraph_buffer.shape == o.shape
+
+                    out_grad = o.cg_buffer_metadata.bwd_cudagraph_buffer
+                    o.cg_buffer_metadata.bwd_cudagraph_buffer = None
+                    out_grad.cg_buffer_metadata.capture_reuse_count -= 1
+                    bwd_buffer_reuse_ref_count -= 1
+                else:
+                    out_grad = _CudagraphGlobalRecord.tensor_reuse_pool.get(o)
+                out_grad.requires_grad = True
+            self.static_grad_outputs.append(out_grad)
 
-        torch.cuda.synchronize()
-        with torch.cuda.graph(
-            self.bwd_graph, pool=self.bwd_mempool, capture_error_mode="thread_local"
-        ):
+        # Freeze GC, to speed up capture time ~15-20x.
+        if FREEZE_GC:
+            gc.freeze()
+
+        with torch.cuda.graph(self.bwd_graph, pool=self.mempool):
             grad_inputs = torch.autograd.grad(
                 outputs=tuple(o for o in self.fwd_graph_output_surface if o.requires_grad),
                 inputs=tuple(i for i in self.fwd_graph_input_surface if i.requires_grad),
-                grad_outputs=tuple(o for o in static_grad_outputs if o is not None),
+                grad_outputs=tuple(o for o in self.static_grad_outputs if o is not None),
                 retain_graph=self.backward_retain_grad,
                 only_inputs=True,
                 allow_unused=True,
             )
 
+        # Unfreeze GC.
+        if FREEZE_GC:
+            gc.unfreeze()
+
         # Constructs a tuple suitable for returning from Graphed.backward:
         # Pads out the actually-needed grads with Nones in gradient slots for inputs
-        # that don't require grad. I couldn't think of a one-liner for this pattern.
-        static_grad_inputs = []
-        grad_idx = 0
-        for arg in self.fwd_graph_input_surface:
-            has_wgrad_fusion = self.fuse_wgrad_accumulation and getattr(
-                arg, "grad_added_to_main_grad", False
-            )
-            if arg.requires_grad:
-                if has_wgrad_fusion:
-                    static_grad_inputs.append(None)
-                else:
-                    static_grad_inputs.append(grad_inputs[grad_idx])
-                grad_idx += 1
+        # that don't require grad
+        grad_inputs = list(grad_inputs)
+        self.static_grad_inputs = []
+        for input_tensor in self.get_arg_metas(self.args, self.kwargs):
+            if input_tensor.requires_grad:
+                input_grad = grad_inputs.pop(0)
+                input_grad.cg_buffer_metadata = deepcopy(input_tensor.cg_buffer_metadata)
+                if input_tensor.cg_buffer_metadata.is_cudagraph_output:
+                    if input_tensor.cg_buffer_metadata.bwd_cudagraph_buffer is None:
+                        input_tensor.cg_buffer_metadata.bwd_cudagraph_buffer = input_grad
+                        input_grad.cg_buffer_metadata.capture_reuse_count += 1
+                        bwd_buffer_reuse_ref_count += 1
+                self.static_grad_inputs.append(input_grad)
             else:
-                static_grad_inputs.append(None)
+                self.static_grad_inputs.append(None)
+
+        # at this point static_grad_inputs hold the input dgrads, add the wgrads next
+        assert self.num_wgrads == len(grad_inputs)
+        self.static_grad_inputs.extend(grad_inputs)
+        self.static_grad_inputs = tuple(self.static_grad_inputs)
+        self.static_grad_outputs = tuple(self.static_grad_outputs)
 
         self.groundtruth_grad_added_to_main_grad = {}
         if self.fuse_wgrad_accumulation:
-            for param in self.base_module.parameters():
+            for param in self.params_to_backprop:
                 if hasattr(param, "grad_added_to_main_grad"):
                     self.groundtruth_grad_added_to_main_grad[param] = param.grad_added_to_main_grad
 
-        self.static_grad_outputs = static_grad_outputs
-        self.static_grad_inputs = static_grad_inputs
+        # After backward pass grad_output buffers are no longer used and returned to the pool
+        for ten in self.static_grad_outputs:
+            if torch.is_tensor(ten):
+                # Check that the tensor is not in use. This scenario may occur when a cudagraph
+                # passes its input directly as an output, and places this output as the
+                # input of a subsequent cudgraph, leading to a grad output buffer to be still in use
+                # even after the backward pass.
+                reuse_count = (
+                    ten.cg_buffer_metadata.capture_reuse_count
+                    if hasattr(ten, "cg_buffer_metadata")
+                    else 0
+                )
 
-        # Unfreeze GC.
-        if FREEZE_GC:
-            gc.unfreeze()
+                if _CudagraphGlobalRecord.tensor_reuse_pool.owns(ten) and reuse_count == 0:
+                    _CudagraphGlobalRecord.tensor_reuse_pool.insert(ten)
 
-            if self.is_first_layer:
-                gc.collect()
+        # now weakref everything
+        if HAVE_TE_GRAPHS:
 
-    def get_input_grads_with_dummy_flags(self):
-        """Get the inputs grads that are returned by the bwd cudagraph call. If using grad accum
-        fusion, wgrads have already been accumulated, so return dummy wgrads."""
+            def replace_with_weak_ref(arg):
+                if not torch.is_tensor(arg):
+                    return arg
 
-        is_dummy_grad = [False] * len(self.static_grad_inputs)
-        if not self.fuse_wgrad_accumulation:
-            return self.static_grad_inputs, is_dummy_grad
-        else:
-            num_dgrads = len(self.static_grad_inputs) - len(list(self.base_module.parameters()))
-            dgrads = self.static_grad_inputs[:num_dgrads]
-            wgrads = self.static_grad_inputs[num_dgrads:]
-
-            wgrads_with_placeholders = []
-            is_dummy_grad = [False] * len(dgrads)
-            for idx, param in enumerate(self.base_module.parameters()):
-                wgrad_is_dummy = getattr(param, "grad_added_to_main_grad", False)
-                if wgrad_is_dummy:
-                    if getattr(param, "zero_out_wgrad", False):
-                        wgrad = torch.zeros(
-                            param.main_grad.shape,
-                            dtype=param.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
-                        )
-                    else:
-                        wgrad = torch.empty(
-                            param.main_grad.shape,
-                            dtype=param.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
-                        )
-                else:
-                    wgrad = wgrads[idx]
-                wgrads_with_placeholders.append(wgrad)
-                is_dummy_grad.append(wgrad_is_dummy)
-            return tuple(dgrads + wgrads_with_placeholders), is_dummy_grad
+                ref = make_weak_ref(arg)
+                ref.requires_grad = arg.requires_grad
+                if hasattr(arg, "can_skip_replay_copy"):
+                    ref.can_skip_replay_copy = arg.can_skip_replay_copy
+                return ref
+
+            self.fwd_graph_input_surface = tree_map(
+                replace_with_weak_ref, self.fwd_graph_input_surface
+            )
+            self.fwd_graph_input_args = tree_map(replace_with_weak_ref, self.fwd_graph_input_args)
+            self.fwd_graph_input_kwargs = tree_map(
+                replace_with_weak_ref, self.fwd_graph_input_kwargs
+            )
+            self.fwd_graph_output_surface = tree_map(
+                replace_with_weak_ref, self.fwd_graph_output_surface
+            )
+            # It is safe to weakref static_grad_inputs as any inuse input grads have a strong ref
+            # stored in 'bwd_cudagraph_buffer'
+            self.static_grad_inputs = tree_map(replace_with_weak_ref, self.static_grad_inputs)
+            self.static_grad_outputs = tree_map(replace_with_weak_ref, self.static_grad_outputs)
+
+        delattr(self, "args")
+        delattr(self, "kwargs")
+        delattr(self, "outputs")
+
+    def apply_cudagraph_record_metadata(self, args, kwargs, outputs):
+        """Attaches graph capture metadata to all passed in tensors."""
+
+        for t in self.get_tensors(args, kwargs):
+            if not hasattr(t, "cg_buffer_metadata"):
+                t.cg_buffer_metadata = CudagraphBufferMetadata()
+
+            t.cg_buffer_metadata.is_cudagraph_input = True
+            t.cg_buffer_metadata.input_use_count += 1
+
+            if t.cg_buffer_metadata.is_cudagraph_output:
+                t.cg_buffer_metadata.cudagraph_reuse_ref_count += 1
+
+        # mark all outputs, so that the fwd graph we may reuse cudagraph output buffers as inputs
+        for o in self.get_tensors(outputs):
+            o.cg_buffer_metadata = CudagraphBufferMetadata()
+            o.cg_buffer_metadata.is_cudagraph_output = True
 
     def record_graph_capture(self, args, kwargs):
         """Records the data needed to create this runner's forward cudagraph.
@@ -859,21 +1177,8 @@ def record_graph_capture(self, args, kwargs):
         The actual cudagraph will be created when 'create_cudagraphs()` is called. Subsequent
         passes should replay the graph."""
 
-        if not self.fwd_graph_recorded:
-            logger.debug(f"Recording forward graph creation...")
-            if self.is_transformer_decoder_layer and not self.is_first_layer:
-                # transformer layers hidden_states are already saved as the output of the previous
-                # layer's cudagraph so avoid saving again
-                kwargs_copy = dict(kwargs)
-                kwargs_copy['hidden_states'] = None
-                _CudagraphGlobalRecord.record_fwd_graph(self, args, kwargs_copy)
-            else:
-                _CudagraphGlobalRecord.record_fwd_graph(self, args, kwargs)
-
-            self.fwd_graph_recorded = True
-
         # Run the forward pass as normal in eager mode.
-        out = super(MegatronModule, self.base_module).__call__(*args, **kwargs)
+        out = self.func(*args, **kwargs)
 
         if type(out) != tuple:
             out = (out,)
@@ -890,9 +1195,38 @@ def record_graph_capture(self, args, kwargs):
             ]
         )
 
-        # autograd nodes return inputs as views, so clone the tensor as returning views may cause
-        # issues, for instance with pipeline parallelism
-        return tuple(o.clone() if torch.is_tensor(o) else o for o in out)
+        if not self.fwd_graph_recorded:
+            logger.debug(f"Recording forward graph creation...")
+
+            self.apply_cudagraph_record_metadata(args, kwargs, out)
+
+            def _replace_with_meta(arg):
+                return ArgMetadata(arg) if torch.is_tensor(arg) else arg
+
+            m_args = tree_map(_replace_with_meta, args)
+            m_kwargs = tree_map(_replace_with_meta, kwargs)
+            m_out = tree_map(_replace_with_meta, out)
+            _CudagraphGlobalRecord.record_fwd_graph(self, m_args, m_kwargs, m_out)
+
+            if HAVE_TE_GRAPHS:
+                if FP8GlobalStateManager.is_fp8_enabled():
+                    # check if the low precision recipe is either fp4 or fp8
+                    if is_te_min_version("2.7.0.dev0"):
+                        from transformer_engine.common.recipe import NVFP4BlockScaling
+
+                        recipe = FP8GlobalStateManager.get_fp8_recipe()
+                        if isinstance(recipe, NVFP4BlockScaling):
+                            self.fp4_runtime_enabled = True
+                        else:
+                            self.fp8_runtime_enabled = True
+                    else:
+                        self.fp8_runtime_enabled = True
+
+            self.fwd_graph_recorded = True
+
+        if len(out) == 1:
+            return out[0]
+        return tuple(out)
 
     def replay_graph_capture(self, is_first_microbatch, args, kwargs):
         """Replay the fwd cuda graph with autograd."""
@@ -905,15 +1239,17 @@ def replay_graph_capture(self, is_first_microbatch, args, kwargs):
             error_msg = "CUDA graph argument mismatch:\n" + "\n".join(mismatch_errors)
             raise AssertionError(error_msg)
 
-        inp_tensors = self.get_tensors(args, kwargs)
-        func_args = inp_tensors + tuple(self.parameters())
-        out = _CudagraphReplayNode.apply(self, is_first_microbatch, *func_args)
-        out = list(out)
+        inp_tensors = self.get_tensors(args, kwargs, check_types=False)
+        if self.grad_enabled:
+            func_args = inp_tensors + self.params_to_backprop
+        else:
+            func_args = inp_tensors
 
-        if torch.is_tensor(self.fwd_graph_outputs):
-            self.fwd_graph_outputs = [self.fwd_graph_outputs]
+        out = _CudagraphReplayNode.apply(self, is_first_microbatch, *func_args)
 
-        return tuple(out.pop(0) if torch.is_tensor(o) else o for o in self.fwd_graph_outputs)
+        out_iter = iter(self.to_list(out))
+        fwd_outputs = self.to_list(self.fwd_graph_outputs)
+        return tuple(next(out_iter) if torch.is_tensor(o) else o for o in fwd_outputs)
 
     def get_mismatch_errors(self, args, kwargs):
         """Return list of detailed errors for mismatched cudagraph args."""
@@ -946,7 +1282,7 @@ def check(val, ref, context):
                     add_error(f"Tensor mismatch at {context}: {', '.join(mismatches)}")
 
             elif is_dataclass(ref.value):
-                for field in fields(ref.value):
+                for field in dataclasses.fields(ref.value):
                     check(
                         ArgMetadata(getattr(val.value, field.name)),
                         ArgMetadata(getattr(ref.value, field.name)),
@@ -978,66 +1314,55 @@ def check(val, ref, context):
 
         return errors
 
-    def zero_out_tensors(self, args, kwargs=None):
-        """Replace all tensors inside arg, kwargs with zeroed copies."""
+    def get_arg_metas(self, args, kwargs=None):
+        """Replaces all passed in tensors with 'ArgMetadata' and returns them as a list."""
+        arg_metas = []
 
-        def clone_tensor(ten):
-            cloned = torch.zeros_like(ten)
-            cloned.requires_grad = ten.requires_grad
-            return cloned
+        def collect(item):
+            if isinstance(item, ArgMetadata):
+                arg_metas.append(item)
+            return item  # tree_map expects a return value to rebuild the tree
 
-        def process_arg(arg):
-            _check_supported_type(ArgMetadata(arg))
-            if torch.is_tensor(arg):
-                return clone_tensor(arg)
-            elif is_dataclass(arg):
-                for field in fields(arg):
-                    attr = getattr(arg, field.name)
-                    if torch.is_tensor(attr):
-                        setattr(arg, field.name, clone_tensor(attr))
-            return arg
-
-        args_replaced = []
-        for arg in args:
-            args_replaced.append(process_arg(arg))
-        if kwargs is None:
-            return args_replaced
-
-        kwargs_replaced = {}
-        for k, v in kwargs.items():
-            kwargs_replaced[k] = process_arg(v)
-
-        return args_replaced, kwargs_replaced
+        tree_map(collect, args)
+        if kwargs is not None:
+            tree_map(collect, kwargs)
 
-    @classmethod
-    def get_tensors(cls, args, kwargs=None):
-        """Filter and flatten all tensors from args and kwargs."""
+        return arg_metas
+
+    def get_tensors(self, args, kwargs=None, check_types=True):
+        """
+        Filter and flatten all tensors from args and kwargs using list comprehensions
+        and itertools.chain for faster flattening.
+        """
 
         def extract_tensors(arg):
-            _check_supported_type(ArgMetadata(arg))
+            if check_types:
+                _check_supported_type(ArgMetadata(arg))
             if torch.is_tensor(arg):
                 return [arg]
-            elif is_dataclass(arg):
-                tens = []
-                for field in fields(arg):
-                    attr = getattr(arg, field.name)
-                    if torch.is_tensor(attr):
-                        tens.append(attr)
-                return tens
-            else:
-                return []
 
-        tens = []
-        args, _ = tree_flatten(args)
-        for a in args:
-            tens.extend(extract_tensors(a))
+            if is_dataclass(arg):
+                return [
+                    attr
+                    for field in dataclasses.fields(arg)
+                    if torch.is_tensor(attr := getattr(arg, field.name))
+                ]
 
-        if kwargs is not None:
-            kwargs, _ = tree_flatten(kwargs)
-            for k in kwargs:
-                tens.extend(extract_tensors(k))
+            return []
+
+        if torch.is_tensor(args):
+            return (args,)
+
+        args_tens = [tensor for arg in args for tensor in extract_tensors(arg)] if args else []
+        kwargs_tens = (
+            [tensor for val in kwargs.values() for tensor in extract_tensors(val)] if kwargs else []
+        )
 
-        return tuple(tens)
+        return tuple(chain(args_tens, kwargs_tens))
+
+    def to_list(self, x):
+        """Helper function to wrap an input into a list"""
+        return [x] if torch.is_tensor(x) else list(x)
 
 
 class CudaGraphManager(torch.nn.Module):
@@ -1046,17 +1371,8 @@ class CudaGraphManager(torch.nn.Module):
     """A global mempool for when 'cuda_graph_use_single_mempool' is used."""
     global_mempool = None
 
-    """Forward pass mempools, used with cudagraph reuse mode."""
-    fwd_mempools = None
-
-    """Backward pass mempool, used with cudagraph reuse mode."""
-    bwd_mempool = None
-
     def __init__(
-        self,
-        config: TransformerConfig,
-        share_cudagraph_io_buffers: bool = True,
-        vp_stage: Optional[int] = None,
+        self, config: TransformerConfig, base_module=None, function_name=None, need_backward=True
     ):
         super().__init__()
         """Creates a CudaGraphManager to manage CUDA graphs for a Megatron module.
@@ -1064,14 +1380,21 @@ def __init__(
         Args:
             config: TransformerConfig object containing CUDA graph settings for memory
                 pooling, graph retention, gradient accumulation, FP8/FP4, and warmup steps.
-            share_cudagraph_io_buffers (bool, optional): (DEPRECATED, will be replaced by
-                config.cuda_graph_share_io_buffers) If None (default) or True, enables
-                buffer reuse optimizations for transformer and mamba layers. If False,
-                disables buffer reuse.
         """
         rng_tracker = get_cuda_rng_tracker()
-        self.share_cudagraph_io_buffers = share_cudagraph_io_buffers
-        self.vp_stage = vp_stage
+        self.need_backward = need_backward
+
+        if function_name is not None:
+            func = getattr(base_module, function_name)
+
+            def wrapped_func(*args, **kwargs):
+                out = self(base_module, args, kwargs)
+                return out
+
+            setattr(base_module, function_name, wrapped_func)
+        else:
+            func = None
+        self.func = func
 
         # need to delay the import here to avoid a circular import
         global HAVE_TE_GRAPHS
@@ -1087,54 +1410,28 @@ def __init__(
         ), "RNG tracker does not support cudagraphs!"
 
         assert config.cuda_graph_impl == "local", "Option cuda_graph_impl=local not enabled."
-        assert "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", ""), (
-            "expandable_segments:True may not be safe when using CUDA Graphs, and may result in"
-            "a crash due to illegal memory access or other undefined behaviour."
-        )
+        if torch.cuda.get_device_capability()[0] < 10:
+            assert (
+                "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", "")
+                or os.getenv("NCCL_GRAPH_REGISTER", "") == "0"
+            ), (
+                "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using "
+                "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True."
+            )
 
-        self.cudagraph_runners = []
-        self.inference_cudagraphs_lookup_table = defaultdict(lambda: None)
+        self.cudagraph_runners: list[_CudaGraphRunner] = []
+        self.inference_cudagraphs_lookup_table: dict = defaultdict(lambda: None)
         self.is_first_microbatch = False
 
         # Without pipeline parallelism, microbatches execute one at a time.
         # Therefore modules will always execute in the same order, so cudagraphs
         # can both be reused and share a single mempool.
-        if parallel_state.get_pipeline_model_parallel_world_size() == 1:
-            self.reuse_cudagraphs = True
-            self.use_single_mempool = True
-        else:
-            if config.cuda_graph_use_single_mempool:
-                self.reuse_cudagraphs = False
-                self.use_single_mempool = True
-            else:
-                self.reuse_cudagraphs = True
-                self.use_single_mempool = False
-
-        # Mempools are static so that multiple cudagraph managers may share the same mempool
-        if self.use_single_mempool:
-            if CudaGraphManager.global_mempool is None:
-                CudaGraphManager.global_mempool = torch.cuda.graph_pool_handle()
-        else:
-            # All cudagraphs in the same microbatch use the same mempool. For pipeline parallelism,
-            # additonally all bwd passes share the same mempool
-            if CudaGraphManager.fwd_mempools is None:
-                CudaGraphManager.fwd_mempools = defaultdict(
-                    lambda: defaultdict(torch.cuda.graph_pool_handle)
-                )
-                CudaGraphManager.bwd_mempool = torch.cuda.graph_pool_handle()
-
-        # Cudagraph stream capture requires no operations on the default stream prior to the
-        # capture, so change to a side stream.
-        self.stream = torch.cuda.current_stream()
-        torch.cuda.set_stream(torch.cuda.Stream())
-
-    def set_is_first_microbatch(self, is_first_microbatch: bool):
-        """Update the is_first_microbatch flag for weight caching.
-
-        Args:
-            is_first_microbatch (bool): Whether this is the first microbatch in the step.
-        """
-        self.is_first_microbatch = is_first_microbatch
+        self.reuse_cudagraphs = parallel_state.get_pipeline_model_parallel_world_size() == 1
+        if CudaGraphManager.global_mempool is None:
+            CudaGraphManager.global_mempool = torch.cuda.graph_pool_handle()
+            # Cudagraph stream capture requires no operations on the default stream prior to the
+            # capture, so change to a side stream.
+            torch.cuda.set_stream(torch.cuda.Stream())
 
     def call_ddp_preforward_hook(self, module):
         """Call any DDP pre-forward hooks which are used to launch async data parallel
@@ -1151,31 +1448,14 @@ def call_ddp_preforward_hook(self, module):
                 # Only hooks from Mcore DDP, which take no args, should be called at this point.
                 hook(module)
 
-    def get_cudagraph_runner(self, megatron_module, args, kwargs):
+    def get_cudagraph_runner(self, megatron_module, args, kwargs, reuse_cudagraphs):
         '''Returns a valid cudagraph runner for the current forward call.
-        For single mempool mode, we create a cudagraph for each call, if the module is called
-        multiple times per step, for instance in the case of pipeline parallelism.
         The cudagraph corresponding to this call is the first element of 'self.cudagraph_runners'.
         We iterate through the list by 1 for each call, and the number of calls is equal to the
         length of 'self.cudagraph_runners'.
         Otherwise, we assign a mempool per microbatch, which allows cudagraphs to be reused
         over different microbatches by tracking their respective fwd and bwd passes.'''
-
-        if self.use_single_mempool:
-            fwd_mempool = CudaGraphManager.global_mempool
-            bwd_mempool = CudaGraphManager.global_mempool
-        else:
-            if megatron_module.config.virtual_pipeline_model_parallel_size is not None:
-                assert (
-                    self.vp_stage is not None
-                ), "vp_stage must be passed if virtual pipeline is enabled"
-                vpp_rank = self.vp_stage
-            else:
-                vpp_rank = 0
-            fwd_mempool = CudaGraphManager.fwd_mempools[vpp_rank][len(self.cudagraph_runners)]
-            bwd_mempool = CudaGraphManager.bwd_mempool
-
-        if self.reuse_cudagraphs:
+        if reuse_cudagraphs:
             is_inference_mode = 'inference_context' in kwargs.keys() and kwargs['inference_context']
             if is_inference_mode:
                 is_static_batching = kwargs['inference_context'].is_static_batching()
@@ -1188,15 +1468,20 @@ def get_cudagraph_runner(self, megatron_module, args, kwargs):
                     runner = self.inference_cudagraphs_lookup_table[padded_batch_dimensions]
             else:
                 # Todo: For training, we could also cache runners based on input shape.
-                runner = next(
-                    (
-                        r
-                        for r in self.cudagraph_runners
-                        if r.status == _GraphStatus.FWD_READY
+                # If autograd is currently disabled, it doesnt matter if a runner was created
+                # with or without autograd, so just get the first fwd ready runner.
+                require_grad = self.need_backward and torch.is_grad_enabled()
+
+                def is_valid(r):
+                    return (
+                        r.status == _GraphStatus.FWD_READY
                         and not r.get_mismatch_errors(args, kwargs)
-                    ),
-                    None,
-                )
+                        and (not require_grad or r.grad_enabled)
+                    )
+
+                # We must choose the first available runner, as the order of
+                # self.cudagraph_runners corresponds to the capture order.
+                runner = next((r for r in self.cudagraph_runners if is_valid(r)), None)
 
             if runner is None:
                 if _CudagraphGlobalRecord.cudagraph_created:
@@ -1208,11 +1493,11 @@ def get_cudagraph_runner(self, megatron_module, args, kwargs):
                 else:
                     runner = _CudaGraphRunner(
                         megatron_module,
-                        fwd_mempool,
-                        bwd_mempool,
+                        CudaGraphManager.global_mempool,
                         args,
                         kwargs,
-                        self.share_cudagraph_io_buffers,
+                        self.func,
+                        self.need_backward,
                     )
                     self.cudagraph_runners.append(runner)
                     if is_inference_mode:
@@ -1232,11 +1517,11 @@ def get_cudagraph_runner(self, megatron_module, args, kwargs):
             else:
                 runner = _CudaGraphRunner(
                     megatron_module,
-                    fwd_mempool,
-                    bwd_mempool,
+                    CudaGraphManager.global_mempool,
                     args,
                     kwargs,
-                    self.share_cudagraph_io_buffers,
+                    self.func,
+                    self.need_backward,
                 )
                 self.cudagraph_runners.append(runner)
 
@@ -1252,38 +1537,30 @@ def __call__(self, megatron_module, args, kwargs):
 
             kwargs (dict):  The keyword args to be passed to the module.
         """
-        # Set the is_first_microbatch flag on the megatron module if it's the first microbatch
-        if self.is_first_microbatch and hasattr(megatron_module, 'set_is_first_microbatch'):
-            megatron_module.set_is_first_microbatch()
+        is_inference_mode = 'inference_context' in kwargs.keys() and kwargs['inference_context']
+        is_in_checkpoint_fwd = is_checkpointing()
+        if HAVE_TE_GRAPHS:
+            is_in_checkpoint_fwd = is_in_checkpoint_fwd or is_fp8_activation_recompute_enabled()
 
         if _CudagraphGlobalRecord.cudagraph_created:
             if self.training and torch.is_grad_enabled():
-                # param.data_ptr() below is used to trigger any hooks that have attached to the
-                # parameter. Specifically, this is trying to trigger the param sync hook for the
-                # APEX optimizer, which triggers param syncs by hooking into any param references.
-                # However cudagraphs disables this, so we workaround by manually referencing
-                # params here. For more information see:
-                # https://github.com/NVIDIA/apex/blob/7001836/apex/contrib/optimizers/distributed_fused_adam.py#L885C9
-                for param in megatron_module.parameters():
-                    param.data_ptr()
-
                 # Trigger Mcore DDP pre-forward hooks
                 self.call_ddp_preforward_hook(megatron_module)
                 for module in megatron_module.modules():
                     self.call_ddp_preforward_hook(module)
 
-            runner = self.get_cudagraph_runner(megatron_module, args, kwargs)
+            runner = self.get_cudagraph_runner(megatron_module, args, kwargs, self.reuse_cudagraphs)
             out = runner.replay_graph_capture(self.is_first_microbatch, args, kwargs)
         else:
-            if 'inference_context' in kwargs.keys() and kwargs['inference_context']:
+            if is_inference_mode:
                 # Inference generation mode creates graphs immediately
-                runner = self.get_cudagraph_runner(megatron_module, args, kwargs)
+                runner = self.get_cudagraph_runner(megatron_module, args, kwargs, True)
                 runner.eval()
 
                 if not runner.fwd_graph_recorded:
                     # Reuse graph input-output buffers for inference
                     local_args, local_kwargs = args, kwargs
-                    if runner.reuse_input_output_buffer and not runner.is_first_layer:
+                    if not runner.is_first_layer:
                         # Find previous layer's runner in the global record
                         try:
                             previous_runner = next(
@@ -1304,10 +1581,9 @@ def __call__(self, megatron_module, args, kwargs):
                             # No match found for previous layer, continue with no buffer reuse
                             pass
 
-                    clone_inputs = not (
-                        runner.reuse_input_output_buffer and not runner.is_first_layer
+                    runner.create_fwd_graph(
+                        local_args, local_kwargs, outputs=None, clone_inputs=runner.is_first_layer
                     )
-                    runner.create_fwd_graph(local_args, local_kwargs, clone_inputs=clone_inputs)
                     runner.fwd_graph_recorded = True
                     runner.cudagraph_created = True
 
@@ -1318,10 +1594,10 @@ def __call__(self, megatron_module, args, kwargs):
 
                 # Now replay the graph
                 out = runner.replay_graph_capture(self.is_first_microbatch, args, kwargs)
-
-            elif self.training:
-                # Training mode
-                runner = self.get_cudagraph_runner(megatron_module, args, kwargs)
+            elif self.training or is_in_checkpoint_fwd:
+                runner = self.get_cudagraph_runner(
+                    megatron_module, args, kwargs, self.reuse_cudagraphs
+                )
                 # check if a layer is frozen during training.
                 if not torch.is_grad_enabled():
                     # If the layer is frozen, we need to set the runner to eval mode.
@@ -1330,13 +1606,17 @@ def __call__(self, megatron_module, args, kwargs):
             else:
                 # No cudagraphs were found in training mode with grad disabled, so fallback to
                 # eager since autograd is needed to correctly trace the backward graph.
-                return super(MegatronModule, megatron_module).__call__(*args, **kwargs)
+                if self.func is not None:
+                    return self.func(*args, **kwargs)
+                else:
+                    return super(MegatronModule, megatron_module).__call__(*args, **kwargs)
 
+        self.is_first_microbatch = False
         # If forward only, next replay should be a forward pass as well
-        if self.training and torch.is_grad_enabled():
-            runner.status = _GraphStatus.BWD_READY
-        else:
+        if is_inference_mode or not torch.is_grad_enabled():
             runner.status = _GraphStatus.FWD_READY
+        else:
+            runner.status = _GraphStatus.BWD_READY
 
         return out
 
@@ -1347,23 +1627,40 @@ def _layer_is_graphable(layer, config):
     Check if a layer is graphable.
     """
 
+    # Only GraphableMegatronModule can be graphed.
+    if not isinstance(layer, GraphableMegatronModule):
+        return False
+
+    # If cuda_graph_scope is not set, every layer is graphed.
+    if not config.cuda_graph_scope:
+        return True
+
     # import modules here to avoid a circular import
     from megatron.core.ssm.mamba_layer import MambaLayer
     from megatron.core.transformer.identity_op import IdentityOp
+    from megatron.core.transformer.mlp import MLP
+    from megatron.core.transformer.moe.moe_layer import MoELayer
     from megatron.core.transformer.transformer_layer import TransformerLayer
 
-    if isinstance(layer, MambaLayer) and config.cuda_graph_scope == "full":
+    if isinstance(layer, MambaLayer) and CudaGraphScope.mamba in config.cuda_graph_scope:
         # mamba layer.
         return True
     if isinstance(layer, TransformerLayer):
-        if config.cuda_graph_scope == 'attn':
-            if not (
-                isinstance(layer.self_attention, IdentityOp)
-                and isinstance(layer.cross_attention, IdentityOp)
-            ):
-                # attn layer.
-                return True
-        else:
+        if CudaGraphScope.attn in config.cuda_graph_scope and not (
+            isinstance(layer.self_attention, IdentityOp)
+            and isinstance(layer.cross_attention, IdentityOp)
+        ):
+            # attn layer.
+            return True
+        if (
+            CudaGraphScope.moe in config.cuda_graph_scope
+            or CudaGraphScope.moe_router in config.cuda_graph_scope
+            or CudaGraphScope.moe_preprocess in config.cuda_graph_scope
+        ) and isinstance(layer.mlp, MoELayer):
+            # moe layer.
+            return True
+        if CudaGraphScope.mlp in config.cuda_graph_scope and isinstance(layer.mlp, MLP):
+            # mlp layer.
             return True
     return False
 
@@ -1382,18 +1679,17 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]):
         assert (
             config.cuda_graph_impl == "transformer_engine"
         ), "Option cuda_graph_impl=transformer_engine not enabled."
-        assert "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", ""), (
-            "expandable_segments:True may not be safe when using CUDA Graphs, and may result in"
-            "a crash due to illegal memory access or other undefined behaviour."
+        assert (
+            "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", "")
+            or os.getenv("NCCL_GRAPH_REGISTER", "") == "0"
+        ), (
+            "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using "
+            "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True."
         )
-        assert config.cuda_graph_scope != "full_iteration", (
+        assert CudaGraphScope.full_iteration not in config.cuda_graph_scope, (
             "full_iteration cuda graph is not supported for cuda_graph_impl=transformer_engine. "
             "Please use cuda_graph_impl=local instead."
         )
-        assert config.cuda_graph_scope in [
-            'full',
-            'attn',
-        ], f"--cuda-graph-scope should be full or attn, got {config.cuda_graph_scope}."
 
         self.model = model
         self.config = config
@@ -1402,6 +1698,9 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]):
         self.optimizers = optimizers
         self.num_model_chunks = len(model)
 
+        # Number of microbatches to capture. The value will be set in _get_cuda_graph_input_data().
+        self.num_microbatches = None
+
         # Get callables with captureable layers.
         self.chunks_with_decoder = []
         self.num_layers_per_chunk = []
@@ -1476,83 +1775,293 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]):
             f'{len(self.flattened_callables)} graphable layers.',
         )
 
-    def _get_cuda_graph_input_data(self):
+        # One helper object can only capture CUDA Graphs once. Use this flag to check if the graphs
+        # have been created.
+        self._graphs_created = False
+
+    def graphs_created(self):
         """
-        Create the CUDA Graph capturing input data.
-        The data is organized per-chunk per-microbatch per-layer.
+        Returns whether the CUDA Graphs have been created.
         """
+        return self._graphs_created
 
-        rotary_pos_emb_cache = {}
+    def _get_sample_arguments(self, order, chunk_id_list=None):
+        """
+        Generate sample arguments and keyword arguments for CUDA Graph capturing with
+        memory-optimized buffer reuse.
+
+        This method creates static input tensors for each (layer, microbatch) pair needed
+        by TE's make_graphed_callables(). It optimizes memory usage by reusing input buffers
+        across non-overlapping forward passes based on the pipeline parallel schedule.
+        This optimization is essential for reducing peak memory during CUDA Graph capturing with
+        many microbatches, as it allows buffers to be reused instead of allocating new ones for
+        later microbatches.
+
+        Memory Optimization Strategy:
+            The 1F1B (one-forward-one-backward) interleaved schedule in pipeline parallelism
+            means that once a microbatch's backward pass completes, its input buffers are no
+            longer needed. This method tracks buffer lifecycle and reuses "consumed" buffers
+            (those whose backward has completed) for new forward passes with matching tensor
+            signatures (shape, dtype, layout).
+
+            Example schedule: [1, 1, 1, 2, 2, 2, -2, 1, -2, 1, -2, 2, -1, 2, -1, -1, -2, -2, -1, -1]
+            - Positive values indicate forward passes (chunk_id = value)
+            - Negative values indicate backward passes (chunk_id = -value)
+            - When processing -2 (backward of chunk 2), its buffers become available for reuse
+            - The next forward with matching signature can reuse those buffers
 
-        def get_rotary_pos_emb(transformer_module, transformer_input):
-            if (
-                transformer_module.position_embedding_type == 'rope'
-                and not self.config.multi_latent_attention
-            ):
-                rotary_seq_len = transformer_module.rotary_pos_emb.get_rotary_seq_len(
-                    None, transformer_module.decoder, transformer_input, self.config, None
-                )
-                if rotary_seq_len not in rotary_pos_emb_cache:
-                    rotary_pos_emb_cache[rotary_seq_len] = transformer_module.rotary_pos_emb(
-                        rotary_seq_len
-                    )
-                return rotary_pos_emb_cache[rotary_seq_len]
-            else:
-                return None
+        Args:
+            order (List[int]): The forward/backward execution order from
+                convert_schedule_table_to_order(). Positive integers represent forward passes
+                (1-indexed chunk ID), negative integers represent backward passes.
+            chunk_id_list (List[Tuple[int, int]]): The list of chunk IDs and layer IDs in the
+                order. This is useful only when overlap_moe_expert_parallel_comm is enabled,
+                the order maps each layers' idx to their original chunk id.
+
+        Returns:
+            Tuple[List[Tuple], List[Dict]]: A tuple containing:
+                - sample_args: List of positional argument tuples for each (layer, microbatch).
+                    Length = num_layers * num_microbatches. Elements with the same tensor
+                    signature may share references to reduce memory allocation.
+                - sample_kwargs: List of keyword argument dicts for each (layer, microbatch).
+                    Length = num_layers * num_microbatches. Elements with the same tensor
+                    signature may share references to reduce memory allocation.
+
+        Data Structures:
+            - fwd_sample_queues: Dict[chunk_id, List[Tuple[sample_keys, fwd_idx]]]
+                Queue of forward samples per chunk awaiting their backward pass.
+            - consumed_sample_queue: Dict[sample_keys, List[fwd_idx]]
+                Pool of buffer indices whose backward is complete, keyed by tensor signature.
+            - sample_keys: Tuple of (shape, dtype, layout) for args + (key, shape, dtype, layout)
+                for kwargs, used to match compatible buffers for reuse.
+        """
+        assert self.num_model_chunks == max(
+            order
+        ), "num_model_chunks must match the max chunk id in order."
+        if chunk_id_list is None:
+            # check only if 1f1b overlap is disabled.
+            assert (
+                self.num_microbatches == len(order) // self.num_model_chunks // 2
+            ), "num_microbatches must match the number of microbatches in order."
 
         # Generate sample arguments and keyword arguments for capturing.
-        sample_args = []
-        sample_kwargs = []
-        for chunk_number, chunk_with_decoder in enumerate(self.chunks_with_decoder):
-            if chunk_with_decoder is None:
-                continue
-            layers = self.callables_per_chunk[chunk_number]
-            for _ in range(get_num_microbatches()):
-                for layer in layers:
-                    static_inputs = layer.get_layer_static_inputs(
-                        self.seq_length, self.micro_batch_size
+        sample_args = [None] * (len(self.flattened_callables) * self.num_microbatches)
+        sample_kwargs = [None] * (len(self.flattened_callables) * self.num_microbatches)
+
+        rotary_pos_emb_cache = {}
+
+        def _get_layer_static_inputs(layer, chunk_of_the_layer):
+            """
+            Get the static inputs for a layer.
+            """
+            assert layer in chunk_of_the_layer.decoder.layers or any(
+                layer is mtp_layer.transformer_layer for mtp_layer in chunk_of_the_layer.mtp.layers
+            ), "Layer is not in the chunk"
+
+            def get_rotary_pos_emb(transformer_module, transformer_input):
+                if (
+                    transformer_module.position_embedding_type == 'rope'
+                    and not self.config.multi_latent_attention
+                ):
+                    rotary_seq_len = transformer_module.rotary_pos_emb.get_rotary_seq_len(
+                        None, transformer_module.decoder, transformer_input, self.config, None
                     )
+                    if rotary_seq_len not in rotary_pos_emb_cache:
+                        rotary_pos_emb_cache[rotary_seq_len] = transformer_module.rotary_pos_emb(
+                            rotary_seq_len
+                        )
+                    return rotary_pos_emb_cache[rotary_seq_len]
+                else:
+                    return None
+
+            static_inputs = layer.get_layer_static_inputs(self.seq_length, self.micro_batch_size)
 
-                    from megatron.core.transformer.identity_op import IdentityOp
-                    from megatron.core.transformer.transformer_layer import TransformerLayer
+            from megatron.core.transformer.identity_op import IdentityOp
+            from megatron.core.transformer.transformer_layer import TransformerLayer
+
+            contains_self_attn = (
+                isinstance(layer, TransformerLayer)
+                and not isinstance(layer.self_attention, IdentityOp)
+                and (
+                    not self.config.cuda_graph_scope
+                    or CudaGraphScope.attn in self.config.cuda_graph_scope
+                )
+            )
 
-                    contains_self_attn = isinstance(layer, TransformerLayer) and not isinstance(
-                        layer.self_attention, IdentityOp
+            _sample_kwargs = {}
+            if is_te_min_version("1.10.0"):
+                # te.make_graphed_callables() accepts keyword arguments since 1.10.0.
+                hidden_states = static_inputs.pop("hidden_states")
+                _sample_args = (hidden_states,)
+                if contains_self_attn:
+                    rotary_pos_emb = get_rotary_pos_emb(chunk_of_the_layer, hidden_states)
+                    if rotary_pos_emb is not None:
+                        static_inputs["rotary_pos_emb"] = rotary_pos_emb
+                _sample_kwargs = static_inputs
+            elif contains_self_attn:
+                _sample_args = (
+                    static_inputs.pop("hidden_states"),
+                    static_inputs.pop("attention_mask"),
+                )
+            else:
+                _sample_args = (static_inputs.pop("hidden_states"),)
+            return _sample_args, _sample_kwargs
+
+        # Calculate the starting index of each chunk in callables for future use.
+        prefix_num_layers = [0]
+        for model_chunk_idx in range(self.num_model_chunks):
+            num_layers = self.num_layers_per_chunk[model_chunk_idx]
+            prefix_num_layers.append(prefix_num_layers[-1] + num_layers)
+
+        # Reorganize args and kwargs for input tensor reuse.
+        # fwd_sample_queues is keyed by model chunk index. The value is a queue of tuples.
+        # Each tuple contains the sample key signature and its fwd_idx. When we finish a backward
+        # chunk, we pop the corresponding fwd_idx and push to the consumed_sample_queue.
+        # consumed_sample_queue is keyed by the sample key signature. The value is a queue of the
+        # fwd_idx whose backward has been called so that we can reuse the same static buffers.
+        # In this way, we can reuse the same static input buffers for the non-overlapping samples
+        # with the same input signature.
+        fwd_sample_queues = {}
+        consumed_sample_queue = {}
+        layer_sample_keys_cache = {}
+        fwd_idx = [0] * self.num_model_chunks
+        for idx, chunk_id in enumerate(order):
+            model_chunk_idx = abs(ceil(chunk_id)) - 1
+
+            if chunk_id > 0:
+                if model_chunk_idx not in fwd_sample_queues:
+                    fwd_sample_queues[model_chunk_idx] = []
+
+                sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + (
+                    fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx]
+                )
+                if chunk_id_list:
+                    model_chunk_idx = chunk_id_list[idx][0]
+                    callables_curr_chunk = [
+                        self.callables_per_chunk[model_chunk_idx][chunk_id_list[idx][1]]
+                    ]
+                else:
+                    callables_curr_chunk = self.callables_per_chunk[model_chunk_idx]
+                for layer_idx, layer in enumerate(callables_curr_chunk):
+                    per_callable_fwd_idx = sample_start_idx + layer_idx
+
+                    # Get sample_args and sample_kwargs for index per_callable_fwd_idx.
+                    assert (
+                        sample_args[per_callable_fwd_idx] is None
+                        and sample_kwargs[per_callable_fwd_idx] is None
+                    ), (
+                        f"sample_args and sample_kwargs must be None before assigning static data, "
+                        f"but got sample_args[{per_callable_fwd_idx}] = "
+                        f"{sample_args[per_callable_fwd_idx]} and "
+                        f"sample_kwargs[{per_callable_fwd_idx}] = "
+                        f"{sample_kwargs[per_callable_fwd_idx]}."
                     )
-                    if is_te_min_version("1.10.0"):
-                        # te.make_graphed_callables() accepts keyword arguments since 1.10.0.
-                        hidden_states = static_inputs.pop("hidden_states")
-                        sample_args.append((hidden_states,))
-                        if contains_self_attn:
-                            rotary_pos_emb = get_rotary_pos_emb(chunk_with_decoder, hidden_states)
-                            if rotary_pos_emb is not None:
-                                static_inputs["rotary_pos_emb"] = rotary_pos_emb
-                        sample_kwargs.append(static_inputs)
-                    elif contains_self_attn:
-                        sample_args.append(
-                            (
-                                static_inputs.pop("hidden_states"),
-                                static_inputs.pop("attention_mask"),
+                    if id(layer) not in layer_sample_keys_cache:
+                        # Have not generated the static inputs for this layer yet. So we don't
+                        # know the input signature of this layer. Generate the static inputs, and
+                        # cache the signature.
+                        sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = (
+                            _get_layer_static_inputs(
+                                layer, self.chunks_with_decoder[model_chunk_idx]
                             )
                         )
+                        sample_args_keys = tuple(
+                            (t.shape, t.dtype, t.layout) for t in sample_args[per_callable_fwd_idx]
+                        )
+                        sample_kwargs_keys = tuple(
+                            (k, v.shape, v.dtype, v.layout)
+                            for k, v in sorted(sample_kwargs[per_callable_fwd_idx].items())
+                        )
+                        sample_keys = sample_args_keys + sample_kwargs_keys
+                        layer_sample_keys_cache[id(layer)] = sample_keys
                     else:
-                        sample_args.append((static_inputs.pop("hidden_states"),))
+                        # Get signature from cache. This signature will be used to see if we can
+                        # reuse the static inputs of a previous forward pass for this forward pass.
+                        # If not, we still need to generate the new static inputs.
+                        sample_keys = layer_sample_keys_cache[id(layer)]
+                    model_chunk_idx = abs(chunk_id) - 1
+                    fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx))
+                    if consumed_sample_queue.get(sample_keys, []):
+                        # We can reuse the static inputs of a previous forward pass for this
+                        # forward pass, because they are of the same input signature and the
+                        # backward pass of the previous forward pass has completed.
+                        reuse_fwd_idx = consumed_sample_queue[sample_keys].pop(0)
+                        assert (
+                            sample_args[reuse_fwd_idx] is not None
+                            and sample_kwargs[reuse_fwd_idx] is not None
+                        ), (
+                            f"sample_args and sample_kwargs must not be None when reusing, but got "
+                            f"sample_args[{reuse_fwd_idx}] = {sample_args[reuse_fwd_idx]} and "
+                            f"sample_kwargs[{reuse_fwd_idx}] = {sample_kwargs[reuse_fwd_idx]}.",
+                        )
+                        sample_args[per_callable_fwd_idx] = sample_args[reuse_fwd_idx]
+                        sample_kwargs[per_callable_fwd_idx] = sample_kwargs[reuse_fwd_idx]
+
+                    if sample_args[per_callable_fwd_idx] is None:
+                        # Unfortunately, no previous static inputs are available for reuse,
+                        # sample_args is still None. Last attempt: generate the new static inputs
+                        # for this forward pass.
+                        if chunk_id_list:
+                            model_chunk_idx = chunk_id_list[idx][0]
+                        sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = (
+                            _get_layer_static_inputs(
+                                layer, self.chunks_with_decoder[model_chunk_idx]
+                            )
+                        )
+                        model_chunk_idx = abs(chunk_id) - 1
+                fwd_idx[model_chunk_idx] += 1
+            elif ceil(chunk_id) == chunk_id:
+                num_consumed_samples = min(
+                    len(fwd_sample_queues[model_chunk_idx]),
+                    self.num_layers_per_chunk[model_chunk_idx],
+                )
+                for sample_keys, per_callable_fwd_idx in fwd_sample_queues[model_chunk_idx][
+                    :num_consumed_samples
+                ]:
+                    if sample_keys not in consumed_sample_queue:
+                        consumed_sample_queue[sample_keys] = []
+                    consumed_sample_queue[sample_keys].append(per_callable_fwd_idx)
+                fwd_sample_queues[model_chunk_idx] = fwd_sample_queues[model_chunk_idx][
+                    num_consumed_samples:
+                ]
+            else:
+                # skip register static inputs for wgrad backward graphs
+                continue
+
+        return sample_args, sample_kwargs
+
+    def _get_cuda_graph_input_data(self):
+        """
+        Create the CUDA Graph capturing input data.
+        The data is organized per-chunk per-microbatch per-layer.
+        """
 
         # Get the PP and VPP scheduling order.
         from megatron.core.pipeline_parallel.schedules import (
-            convert_schedule_table_to_order,
             get_pp_rank_microbatches,
             get_schedule_table,
         )
 
+        # If PP is not enabled, we only need to capture one microbatch.
+        if (
+            parallel_state.get_pipeline_model_parallel_world_size() == 1
+            and not self.config.overlap_moe_expert_parallel_comm
+        ):
+            assert (
+                self.num_model_chunks == 1
+            ), "If PP is not enabled, there should be only one model chunk."
+            self.num_microbatches = 1
+        else:
+            self.num_microbatches = get_num_microbatches()
+
         _, _, num_warmup_microbatches, _ = get_pp_rank_microbatches(
-            get_num_microbatches(),
+            self.num_microbatches,
             self.num_model_chunks,
             self.config.microbatch_group_size_per_vp_stage,
             False,
         )
         schedule_table = get_schedule_table(
-            get_num_microbatches(),
+            self.num_microbatches,
             self.num_model_chunks,
             self.config.microbatch_group_size_per_vp_stage,
         )
@@ -1566,9 +2075,57 @@ def get_rotary_pos_emb(transformer_module, transformer_input):
             level=logging.DEBUG,
             msg=f'Rank {torch.distributed.get_rank()}: ORDER {order}',
         )
+        chunk_id_list = None
+        if self.config.overlap_moe_expert_parallel_comm:
+            wgrad_in_graph_scope = CudaGraphScope.attn in self.config.cuda_graph_scope or (
+                CudaGraphScope.moe_router in self.config.cuda_graph_scope
+                and self.config.moe_shared_expert_intermediate_size is not None
+                and not self.config.moe_shared_expert_overlap
+            )
+            capture_wgrad_graph = self.config.delay_wgrad_compute and wgrad_in_graph_scope
+            order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order(
+                order, self.num_layers_per_chunk, capture_wgrad_graph
+            )
+            self.num_layers_per_chunk = [1] * sum(self.num_layers_per_chunk)
+            self.num_model_chunks = max(order)
+            _order_without_wgrad = []
+            for c_id in order:
+                if ceil(c_id) != c_id:
+                    continue
+                _order_without_wgrad.append(c_id)
+            self.num_microbatches = len(_order_without_wgrad) // self.num_model_chunks // 2
+            log_on_each_pipeline_stage(
+                logger=logger,
+                tp_group=None,
+                dp_cp_group=None,
+                level=logging.DEBUG,
+                msg=f'Rank {torch.distributed.get_rank()}: '
+                f'ORDER after overlap_moe_expert_parallel_comm {order}',
+            )
+
+        # Generate sample arguments and keyword arguments for capturing.
+        sample_args, sample_kwargs = self._get_sample_arguments(order, chunk_id_list)
 
         def get_make_graphed_callables_kwargs():
-            kwargs = {'num_warmup_iters': 11, 'allow_unused_input': True, '_order': order}
+            kwargs = {
+                'allow_unused_input': True,
+                '_order': order,
+                'retain_graph_in_backward': self.config.cuda_graph_retain_backward_graph,
+            }
+
+            # Calculate the number of warmup iterations per layer per microbatch inside TE
+            # make_graphed_callables(). There are two rules:
+            # 1. There should be at least 1 warmup iteration per layer per microbatch inside TE
+            # make_graphed_callables().
+            # 2. There should be at least 10 warmup iterations per layer, counting the MCore warmup
+            # steps before going into this capture routine.
+            kwargs['num_warmup_iters'] = max(
+                1,
+                math.ceil(
+                    (10 - self.config.cuda_graph_warmup_steps * get_num_microbatches())
+                    / self.num_microbatches
+                ),
+            )
 
             if is_te_min_version("2.6.0"):
                 # Starting from TE 2.6.0, make_graphed_callables() accepts different number
@@ -1626,9 +2183,13 @@ def _start_capturing(self):
         """
         Start capturing CUDA Graphs.
         """
+        assert not self._graphs_created, "CUDA Graphs have already been created."
+
         torch.distributed.barrier()
         gc.collect()
         torch.cuda.empty_cache()
+        if FREEZE_GC:
+            gc.freeze()
 
         _set_capture_start()
         log_single_rank(logger, logging.INFO, f'Start CUDA Graphs capture...')
@@ -1656,9 +2217,14 @@ def _finish_capturing(self, start_time):
             optimizer.zero_grad()
         clear_aux_losses_tracker()
         reset_model_temporary_tensors(self.config, self.model)
+
+        if FREEZE_GC:
+            gc.unfreeze()
         gc.collect()
         torch.cuda.empty_cache()
 
+        self._graphs_created = True
+
     def create_cudagraphs(self):
         """
         Capture CUDA Graphs per TransformerLayer per microbatch.
@@ -1667,21 +2233,30 @@ def create_cudagraphs(self):
 
         # Prepare CUDA Graph capturing input data and call `make_graphed_callables`.
         sample_args, kwargs = self._get_cuda_graph_input_data()
-        graphs = make_graphed_callables(tuple(self.flattened_callables), sample_args, **kwargs)
+        if self.config.sequence_parallel:
+            rng_context = get_cuda_rng_tracker().fork()
+        else:
+            rng_context = nullcontext()
+        with rng_context:
+            graphs = make_graphed_callables(tuple(self.flattened_callables), sample_args, **kwargs)
 
         # Push the captured graphs to the corresponding TransformerBlock.
         num_layers_accumulated = 0
         for layers in self.callables_per_chunk:
             for layer_number, layer in enumerate(layers):
                 layer.cuda_graphs = []
-                for batch_number in range(get_num_microbatches()):
-                    layer.cuda_graphs.append(
-                        graphs[
-                            num_layers_accumulated * get_num_microbatches()
+                for batch_number in range(self.num_microbatches):
+                    if self.config.overlap_moe_expert_parallel_comm:
+                        graph_idx = (
+                            num_layers_accumulated + layer_number
+                        ) * self.num_microbatches + batch_number
+                    else:
+                        graph_idx = (
+                            num_layers_accumulated * self.num_microbatches
                             + batch_number * len(layers)
                             + layer_number
-                        ]
-                    )
+                        )
+                    layer.cuda_graphs.append(graphs[graph_idx])
             num_layers_accumulated += len(layers)
 
         self._finish_capturing(start_time)
@@ -1695,3 +2270,163 @@ def cuda_graph_set_manual_hooks(self):
             model_chunk = self.model[chunk_number]
             for layer in layers:
                 layer.setup_manual_hooks(model_chunk._make_forward_pre_hook)
+
+    def delete_cuda_graphs(self):
+        """
+        Delete all CUDA graphs.
+        """
+        assert self._graphs_created, "CUDA Graphs have not been created."
+
+        graph_resettable = is_te_min_version("2.10.0")
+        graphs_reset, graphs_not_reset = 0, 0
+        for layers in self.callables_per_chunk:
+            for layer in layers:
+                for graph in layer.cuda_graphs:
+                    if graph_resettable:
+                        graph.reset()
+                        graphs_reset += 1
+                    else:
+                        graphs_not_reset += 1
+                layer.cuda_graphs = []
+                layer.cuda_graph_manual_hooks = []
+
+        log_on_each_pipeline_stage(
+            logger=logger,
+            tp_group=None,
+            dp_cp_group=None,
+            level=logging.INFO,
+            msg=f'Rank {torch.distributed.get_rank()}: '
+            f'{graphs_reset} graphs deleted with explicit reset, '
+            f'{graphs_not_reset} graphs deleted without explicit reset.',
+        )
+        self._graphs_created = False
+
+
+def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, schedule_table):
+    """Convert a tunable schedule lookup table to the te.make_graphed_callables() accepted
+    order format. For example, the tunable schedule table for PP2 N3M5 with VP2 is as below:
+    virtual_microbatch_id | 0 1 2 3 4 5 6 7 8 9
+    microbatch_id         | 0 1 2 0 1 2 3 4 3 4
+    model_chunk_id        | 0 0 0 1 1 1 0 0 1 1
+
+    Then the forward backward separated order is:
+    forward               | 1 1 1 2 2 2 1 1 2 2
+    backward              | -2 -2 -2 -1 -1 -1 -2 -2 -1 -1
+
+    If num_warmup_microbatches is 5, the output order is:
+    1 1 1 2 2 2 -2 1 -2 1 -2 2 -1 2 -1 -1 -2 -2 -1 -1
+    """
+    _, model_chunk_id_table = zip(*schedule_table)
+    forward_order = [chunk_id + 1 for chunk_id in model_chunk_id_table]
+    backward_order = [chunk_id - num_model_chunks for chunk_id in model_chunk_id_table]
+    order = forward_order[:num_warmup_microbatches]
+    for i in range(num_warmup_microbatches, len(forward_order)):
+        order.append(forward_order[i])
+        order.append(backward_order[i - num_warmup_microbatches])
+    if num_warmup_microbatches > 0:
+        order.extend(backward_order[-num_warmup_microbatches:])
+    return order
+
+
+def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chunk, capture_wgrad_graph):
+    """
+    This functions gets the order for overlap_moe_expert_parallel_comm schedule for the original
+    chunk-wise order list. Each chunk is transformered to chunks with only 1 layer so that
+    layers between 2 chunks can now overlap with each other while following the graph order.
+    If capture_wgrad_graph is True, the wgrad backward graph is also added to the order by
+    decreasing the layer id by 0.5.
+
+    Args:
+        order (List[int]): The original chunk-wise order list. Positive values represent forward
+            passes for chunks, negative values represent backward passes. The absolute value
+            indicates the chunk ID (1-indexed).
+        num_layers_per_chunk (List[int]): Number of graphable layers in each chunk. The length
+            of this list equals the number of chunks.
+        capture_wgrad_graph (bool): If True, weight gradient computation graphs are added to the
+            order by appending entries with layer_id - 0.5.
+
+    Returns:
+        Tuple[List[float], List[Optional[List[int]]]]: A tuple containing:
+            - new_order: The layer-wise order list where each chunk is expanded to individual
+              layers. Positive values are forward passes, negative values are backward passes.
+              Values with .5 suffix indicate weight gradient computations.
+            - chunk_id_list: A list parallel to new_order. For forward passes, contains
+              [chunk_id, layer_index_within_chunk]. For backward passes, contains None.
+
+    Example:
+        original_order: [1, 2, -2, 1, -1, -1]
+        num_layers_per_chunk: [1, 2]
+        capture_wgrad_graph=True:
+            new_order: [1, 2, 3, 1, -3, -3.5, -2, -2.5, -1, -1.5, -1, -1.5]
+            chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None,
+                            None, None, None, None, None, None, None]
+        capture_wgrad_graph=False:
+            new_order: [1, 2, 3, 1, -3, -2, -1, -1]
+            chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, None, None, None]
+    """
+
+    def _add_order(new_order, chunk_id_list, c_id, layer_id, is_wgrad=False, index=None):
+        if is_wgrad:
+            new_order.append(layer_id - 0.5)
+        else:
+            new_order.append(layer_id)
+        if c_id > 0:
+            chunk_id_list.append([abs(c_id) - 1, index])
+        else:
+            chunk_id_list.append(None)
+
+    new_order = []
+    chunk_id_list = []
+    add_order = partial(_add_order, new_order, chunk_id_list)
+    first_backward_idx, last_forward_idx = None, None
+    for idx, c_id in enumerate(order):
+        if first_backward_idx is None and c_id < 0:
+            first_backward_idx = idx
+        if c_id > 0:
+            last_forward_idx = idx
+
+    def get_layer_range(c_id):
+        num_layers = num_layers_per_chunk[abs(c_id) - 1]
+        num_layers_previous_chunks = sum(num_layers_per_chunk[: abs(c_id) - 1])
+        if c_id > 0:
+            return list(
+                range(num_layers_previous_chunks + 1, num_layers_previous_chunks + num_layers + 1)
+            )
+        return list(range(-num_layers_previous_chunks - num_layers, -num_layers_previous_chunks))
+
+    # warmup stage
+    for c_id in order[:first_backward_idx]:
+        layer_range = get_layer_range(c_id)
+        new_order += layer_range
+        chunk_id_list.extend([abs(c_id) - 1, i] for i in range(len(layer_range)))
+
+    # 1f1b overlap stage
+    if first_backward_idx < last_forward_idx:
+        for c_id_b, c_id_f in zip(
+            order[first_backward_idx : last_forward_idx + 1 : 2],
+            order[first_backward_idx + 1 : last_forward_idx + 1 : 2],
+        ):
+            layer_range_f = get_layer_range(c_id_f)
+            layer_range_b = get_layer_range(c_id_b)
+            index = 0
+            for l_b, l_f in zip_longest(layer_range_b, layer_range_f, fillvalue=0):
+                # always forward graph before backward graph
+                if l_f != 0:
+                    add_order(c_id_f, l_f, index=index)
+                if l_b != 0:
+                    add_order(c_id_b, l_b)
+                    if capture_wgrad_graph and index < len(layer_range_b) - 1:
+                        add_order(c_id_b, l_b, is_wgrad=True)
+                index += 1
+            # last wgrad backward
+            if capture_wgrad_graph and layer_range_b:
+                add_order(c_id_b, layer_range_b[-1], is_wgrad=True)
+
+    # cool down stage, backward graphs only
+    for c_id in order[last_forward_idx + 1 :]:
+        for l_b in get_layer_range(c_id):
+            add_order(c_id, l_b)
+            if capture_wgrad_graph:
+                add_order(c_id, l_b, is_wgrad=True)
+
+    return new_order, chunk_id_list
diff --git a/megatron/core/transformer/custom_layers/batch_invariant_kernels.py b/megatron/core/transformer/custom_layers/batch_invariant_kernels.py
new file mode 100644
index 00000000000..c44bbf4c8fb
--- /dev/null
+++ b/megatron/core/transformer/custom_layers/batch_invariant_kernels.py
@@ -0,0 +1,1006 @@
+# Copyright 2025 Thinking Machines Lab
+# The following code has been adapted
+# from the following repo: https://github.com/thinking-machines-lab/batch_invariant_ops
+
+
+import contextlib
+import importlib
+import importlib.util
+import logging
+from collections import namedtuple
+from collections.abc import Callable
+from typing import Any, Dict, List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+__all__ = [
+    "set_batch_invariant_mode",
+    "is_batch_invariant_mode_enabled",
+    "disable_batch_invariant_mode",
+    "enable_batch_invariant_mode",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def _matmul_launch_metadata(
+    grid: Callable[..., Any], kernel: Any, args: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Build launch metadata for Triton matmul kernels used in BIK matmul."""
+    ret = {}
+    m, n, k = args["M"], args["N"], args["K"]
+    ret["name"] = f"{kernel.name} [M={m}, N={n}, K={k}]"
+    if "tiles_per_update" in args:
+        ret["name"] = (
+            f"{kernel.name} [M={m}, N={n}, K={k}, tiles_per_update={args['tiles_per_update']:02}]"
+        )
+    if "c_ptr" in args:
+        bytes_per_elem = args["c_ptr"].element_size()
+    else:
+        bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2
+    ret[f"flops{bytes_per_elem * 8}"] = 2.0 * m * n * k
+    ret["bytes"] = bytes_per_elem * (m * k + n * k + m * n)
+    return ret
+
+
+@triton.jit
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton.jit(launch_metadata=_matmul_launch_metadata)
+def matmul_kernel_persistent(
+    a_ptr,
+    b_ptr,
+    c_ptr,  #
+    bias_ptr,
+    M,
+    N,
+    K,  #
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    BLOCK_SIZE_M: tl.constexpr,  #
+    BLOCK_SIZE_N: tl.constexpr,  #
+    BLOCK_SIZE_K: tl.constexpr,  #
+    GROUP_SIZE_M: tl.constexpr,  #
+    NUM_SMS: tl.constexpr,  #
+    A_LARGE: tl.constexpr,
+    B_LARGE: tl.constexpr,
+    C_LARGE: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    """Persistent matmul Triton kernel backing `matmul_persistent`."""
+    start_pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
+    num_tiles = num_pid_m * num_pid_n
+
+    tile_id_c = start_pid - NUM_SMS
+
+    offs_k_for_mask = tl.arange(0, BLOCK_SIZE_K)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
+        pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
+        start_m = pid_m * BLOCK_SIZE_M
+        start_n = pid_n * BLOCK_SIZE_N
+        offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
+        offs_bn = start_n + tl.arange(0, BLOCK_SIZE_N)
+        if A_LARGE:
+            offs_am = offs_am.to(tl.int64)
+        if B_LARGE:
+            offs_bn = offs_bn.to(tl.int64)
+        offs_am = tl.where(offs_am < M, offs_am, 0)
+        offs_bn = tl.where(offs_bn < N, offs_bn, 0)
+        offs_am = tl.max_contiguous(tl.multiple_of(offs_am, BLOCK_SIZE_M), BLOCK_SIZE_M)
+        offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn, BLOCK_SIZE_N), BLOCK_SIZE_N)
+
+        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+        for ki in range(k_tiles):
+            if A_LARGE or B_LARGE:
+                offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
+            else:
+                offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+            a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+            b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+            a = tl.load(a_ptrs, mask=offs_k_for_mask[None, :] < K - ki * BLOCK_SIZE_K, other=0.0)
+            b = tl.load(b_ptrs, mask=offs_k_for_mask[:, None] < K - ki * BLOCK_SIZE_K, other=0.0)
+            accumulator = tl.dot(a, b, accumulator)
+
+        tile_id_c += NUM_SMS
+        pid_m, pid_n = _compute_pid(tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
+        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        if C_LARGE:
+            offs_cm = offs_cm.to(tl.int64)
+            offs_cn = offs_cn.to(tl.int64)
+        c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+        if HAS_BIAS:
+            bias_ptrs = bias_ptr + offs_cn
+            bias = tl.load(bias_ptrs, mask=offs_cn < N, other=0.0).to(tl.float32)
+            accumulator += bias
+        c = accumulator.to(c_ptr.dtype.element_ty)
+        tl.store(c_ptrs, c, mask=c_mask)
+
+
+def get_compute_units():
+    """
+    Returns the number of streaming multiprocessors (SMs) or equivalent compute units
+    for the available accelerator. Assigns the value to NUM_SMS.
+    """
+    NUM_SMS = None
+    device_type = getattr(torch.accelerator.current_accelerator(), "type", "cpu")
+
+    # Use match/case for device-specific logic (Python 3.10+)
+    match device_type:
+        case "cuda":
+            device_properties = torch.cuda.get_device_properties(0)
+            NUM_SMS = device_properties.multi_processor_count
+        case "xpu":
+            device_properties = torch.xpu.get_device_properties(0)
+            NUM_SMS = device_properties.max_compute_units
+        case _:
+            _LOGGER.warning("No CUDA or XPU device available. Using CPU.")
+            # For CPU, you might want to use the number of CPU cores
+            NUM_SMS = torch.get_num_threads()
+
+    return NUM_SMS
+
+
+def matmul_persistent(a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None):
+    """Persistent matmul kernel used by batch-invariant GEMM."""
+    # Check constraints.
+    assert a.shape[1] == b.shape[0], "Incompatible dimensions"
+    assert a.dtype == b.dtype, "Incompatible dtypes"
+    assert (
+        bias is None or bias.dim() == 1
+    ), "Currently assuming bias is 1D, let Horace know if you run into this"
+
+    NUM_SMS = get_compute_units()
+    M, K = a.shape
+    K, N = b.shape
+    dtype = a.dtype
+    # Allocates output.
+    c = torch.empty((M, N), device=a.device, dtype=dtype)
+
+    # 1D launch kernel where each block gets its own program.
+    def grid(META):
+        blocks_m = triton.cdiv(M, META["BLOCK_SIZE_M"])
+        blocks_n = triton.cdiv(N, META["BLOCK_SIZE_N"])
+        return (min(NUM_SMS, blocks_m * blocks_n),)
+
+    configs = {
+        torch.bfloat16: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+        torch.float16: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 256,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+        torch.float32: {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "num_stages": 3,
+            "num_warps": 8,
+        },
+    }
+    matmul_kernel_persistent[grid](
+        a,
+        b,
+        c,  #
+        bias,
+        M,
+        N,
+        K,  #
+        a.stride(0),
+        a.stride(1),  #
+        b.stride(0),
+        b.stride(1),  #
+        c.stride(0),
+        c.stride(1),  #
+        NUM_SMS=NUM_SMS,  #
+        A_LARGE=a.numel() > 2**31,
+        B_LARGE=b.numel() > 2**31,
+        C_LARGE=c.numel() > 2**31,
+        HAS_BIAS=bias is not None,
+        **configs[dtype],
+    )
+    return c
+
+
+@triton.jit
+def _log_softmax_kernel(
+    input_ptr, output_ptr, input_row_stride, output_row_stride, n_cols, BLOCK_SIZE: tl.constexpr
+):
+    """
+    Compute log_softmax along the last dimension of a 2D tensor.
+    Each block handles one row of the input tensor.
+    """
+    # Get the row index for this block
+    row_idx = tl.program_id(0).to(tl.int64)
+
+    # Compute base pointers for input and output rows
+    row_start_ptr = input_ptr + row_idx * input_row_stride
+    output_row_start_ptr = output_ptr + row_idx * output_row_stride
+
+    # Step 1: Find maximum value in the row for numerical stability
+    max_val = -float("inf")
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=-float("inf"))
+
+        # Update maximum
+        max_val = tl.max(tl.maximum(vals, max_val))
+
+    # Step 2: Compute sum of exp(x - max_val)
+    sum_exp = 0.0
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+
+        # Compute exp(x - max_val) and accumulate
+        exp_vals = tl.exp(vals - max_val)
+        sum_exp += tl.sum(tl.where(mask, exp_vals, 0.0))
+
+    # Compute log(sum_exp)
+    log_sum_exp = tl.log(sum_exp)
+
+    # Step 3: Compute final log_softmax values: x - max_val - log_sum_exp
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        # Load values
+        vals = tl.load(row_start_ptr + col_idx, mask=mask)
+
+        # Compute log_softmax
+        output = vals - max_val - log_sum_exp
+
+        # Store results
+        tl.store(output_row_start_ptr + col_idx, output, mask=mask)
+
+
+def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    """
+    Compute log_softmax using Triton kernel.
+
+    Args:
+        input: Input tensor
+        dim: Dimension along which to compute log_softmax (only -1 or last dim supported)
+    >> Stashed changes
+    Returns:
+        Tensor with log_softmax applied along the specified dimension
+    """
+    if dim != -1 and dim != input.ndim - 1:
+        raise ValueError("This implementation only supports log_softmax along the last dimension")
+    # Flatten all dimensions except the last one
+    original_shape = input.shape
+    input_2d = input.reshape(-1, input.shape[-1])
+    input_2d = input_2d.contiguous()
+
+    n_rows, n_cols = input_2d.shape
+
+    # Allocate output tensor
+    output = torch.empty_like(input_2d)
+
+    # Choose block size based on the number of columns
+    BLOCK_SIZE = 1024
+
+    # Launch kernel with one block per row
+    grid = (n_rows,)
+    _log_softmax_kernel[grid](
+        input_2d, output, input_2d.stride(0), output.stride(0), n_cols, BLOCK_SIZE=BLOCK_SIZE
+    )
+    # Reshape output back to original shape
+    return output.reshape(original_shape)
+
+
+@triton.jit
+def mean_kernel(
+    input_ptr,
+    output_ptr,
+    input_stride0,
+    input_stride1,
+    input_stride2,
+    output_stride0,
+    output_stride1,
+    M,  # size before reduction dim
+    N,  # size of reduction dim
+    K,  # size after reduction dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Kernel for computing mean along a single dimension.
+    Input is viewed as (M, N, K) where N is the dimension being reduced.
+    """
+    # Program ID gives us which output element we're computing
+    pid = tl.program_id(0)
+
+    # Compute output indices
+    m_idx = pid // K
+    k_idx = pid % K
+
+    # Bounds check
+    if m_idx >= M or k_idx >= K:
+        return
+
+    # Accumulate sum across reduction dimension
+    acc = 0.0
+    for n_start in range(0, N, BLOCK_SIZE):
+        n_offsets = n_start + tl.arange(0, BLOCK_SIZE)
+        mask = n_offsets < N
+
+        # Calculate input indices
+        input_idx = m_idx * input_stride0 + n_offsets * input_stride1 + k_idx * input_stride2
+
+        # Load and accumulate
+        vals = tl.load(input_ptr + input_idx, mask=mask, other=0.0)
+        acc += tl.sum(vals)
+
+    # Compute mean and store
+    mean_val = acc / N
+    output_idx = m_idx * output_stride0 + k_idx * output_stride1
+    tl.store(output_ptr + output_idx, mean_val)
+
+
+def mean_dim(
+    input: torch.Tensor, dim: int, keepdim: bool = False, dtype: torch.dtype | None = None
+) -> torch.Tensor:
+    """
+    Triton implementation of torch.mean with single dimension reduction.
+
+    Args:
+        input: Input tensor
+        dim: Single dimension along which to compute mean
+        keepdim: Whether to keep the reduced dimension
+        dtype: Output dtype. If None, uses input dtype (or float32 for integer inputs)
+
+    Returns:
+        Tensor with mean values along specified dimension
+    """
+    # Validate inputs
+    assert input.is_cuda, "Input must be a CUDA tensor"
+    assert (
+        -input.ndim <= dim < input.ndim
+    ), f"Invalid dimension {dim} for tensor with {input.ndim} dimensions"
+
+    # Handle negative dim
+    if dim < 0:
+        dim = dim + input.ndim
+
+    # Handle dtype
+    if dtype is None:
+        if input.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]:
+            dtype = torch.float32
+        else:
+            dtype = input.dtype
+
+    # Convert input to appropriate dtype if needed
+    if input.dtype != dtype:
+        input = input.to(dtype)
+
+    # Get input shape and strides
+    shape = list(input.shape)
+
+    # Calculate dimensions for kernel
+    M = 1
+    for i in range(dim):
+        M *= shape[i]
+
+    N = shape[dim]
+
+    K = 1
+    for i in range(dim + 1, len(shape)):
+        K *= shape[i]
+
+    # Reshape input to 3D view (M, N, K)
+    input_3d = input.reshape(M, N, K)
+
+    # Create output shape
+    if keepdim:
+        output_shape = shape.copy()
+        output_shape[dim] = 1
+    else:
+        output_shape = shape[:dim] + shape[dim + 1 :]
+
+    # Create output tensor
+    output = torch.empty(output_shape, dtype=dtype, device=input.device)
+
+    # Reshape output for kernel
+    if keepdim:
+        output_2d = output.reshape(M, 1, K).squeeze(1)
+    else:
+        output_2d = output.reshape(M, K)
+
+    # Launch kernel
+    grid = (M * K,)
+    BLOCK_SIZE = 1024
+
+    mean_kernel[grid](
+        input_3d,
+        output_2d,
+        input_3d.stride(0),
+        input_3d.stride(1),
+        input_3d.stride(2),
+        output_2d.stride(0),
+        output_2d.stride(1) if output_2d.ndim > 1 else 0,
+        M,
+        N,
+        K,
+        BLOCK_SIZE,
+    )
+
+    return output
+
+
+def mm_batch_invariant(a, b):
+    """Batch-invariant replacement for `aten::mm` using a persistent matmul kernel."""
+    return matmul_persistent(a, b)
+
+
+def addmm_batch_invariant(bias, a, b):
+    """Batch-invariant replacement for `aten::addmm` using a persistent matmul kernel."""
+    return matmul_persistent(a, b, bias=bias)
+
+
+def _log_softmax_batch_invariant(input, dim, _half_to_float):
+    assert not _half_to_float, "not implemented"
+    return log_softmax(input, dim=dim)
+
+
+def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype | None = None):
+    """Batch-invariant replacement for `aten::mean.dim` over one or more dimensions."""
+    assert dtype is None or dtype == torch.float32, f"unsupported dtype: {dtype}"
+    if len(dim) == 1:
+        return mean_dim(input, dim[0], keepdim=keepdim)
+    else:
+        assert input.dtype in {
+            torch.float16,
+            torch.bfloat16,
+            torch.float32,
+        }, "only float types supported for now"
+        n_elems = 1
+        for d in dim:
+            n_elems *= input.shape[d]
+        return torch.sum(input, dim=dim, keepdim=keepdim, dtype=torch.float32) / n_elems
+
+
+AttentionBlockSize = namedtuple("AttentionBlockSize", ["block_m", "block_n"])
+
+
+def get_batch_invariant_attention_block_size() -> AttentionBlockSize:
+    """Return the (block_m, block_n) tiling used for batch-invariant attention."""
+    return AttentionBlockSize(block_m=16, block_n=16)
+
+
+_batch_invariant_MODE = False
+_batch_invariant_LIB = None
+_TE_GENERAL_GEMM_ORIG = None
+_TE_RMSNORM_ORIG_FWD = None
+_MEG_TE_GENERAL_GEMM_ORIG = None
+_TE_RMSNORM_FUNC_ORIGS: Dict[str, Any] = {}
+_TE_GEMM_FUNC_ORIGS: Dict[str, Any] = {}
+
+
+def _import_module_if_available(name: str):
+    spec = importlib.util.find_spec(name)
+    if spec is None:
+        return None
+    return importlib.import_module(name)
+
+
+def _te_patch_for_batch_invariant():
+    """Patch Transformer Engine modules to use batch-invariant GEMM and RMSNorm.
+
+    This monkey-patches TE's GEMM and RMSNorm entry points to dispatch to the
+    batch-invariant implementations when batch-invariant mode is enabled.
+    Safe no-op if TE is unavailable.
+    """
+    global _TE_GENERAL_GEMM_ORIG, _TE_RMSNORM_ORIG_FWD, _MEG_TE_GENERAL_GEMM_ORIG
+    import transformer_engine.pytorch as te
+    import transformer_engine.pytorch.cpp_extensions as te_cpp
+
+    # Patch general_gemm once
+    if _TE_GENERAL_GEMM_ORIG is None and hasattr(te_cpp, "general_gemm"):
+        _TE_GENERAL_GEMM_ORIG = te_cpp.general_gemm
+        te_cpp.general_gemm = _te_general_gemm_patched
+
+    # Also patch the symbol imported inside TE's module.linear
+    # (from ..cpp_extensions import general_gemm)
+    import transformer_engine.pytorch.module.linear as te_linear_mod
+
+    if hasattr(te_linear_mod, "general_gemm"):
+        if "module.linear.general_gemm" not in _TE_GEMM_FUNC_ORIGS:
+            _TE_GEMM_FUNC_ORIGS["module.linear.general_gemm"] = te_linear_mod.general_gemm
+            te_linear_mod.general_gemm = _te_general_gemm_patched
+
+    # Also patch the symbol imported inside TE's module.layernorm_linear
+    import transformer_engine.pytorch.module.layernorm_linear as te_layernorm_linear_mod
+
+    if hasattr(te_layernorm_linear_mod, "general_gemm"):
+        if "module.layernorm_linear.general_gemm" not in _TE_GEMM_FUNC_ORIGS:
+            _TE_GEMM_FUNC_ORIGS["module.layernorm_linear.general_gemm"] = (
+                te_layernorm_linear_mod.general_gemm
+            )
+            te_layernorm_linear_mod.general_gemm = _te_general_gemm_patched
+
+    # Also patch the symbol imported into Megatron's TE wrapper module
+    import megatron.core.extensions.transformer_engine as meg_te
+
+    if _MEG_TE_GENERAL_GEMM_ORIG is None and hasattr(meg_te, "general_gemm"):
+        _MEG_TE_GENERAL_GEMM_ORIG = meg_te.general_gemm
+        meg_te.general_gemm = _te_general_gemm_patched
+
+    # Patch RMSNorm.forward once (class may be on te or te.pytorch)
+    rms_cls = getattr(te, "RMSNorm", None)
+    if rms_cls is None:
+        rms_cls = getattr(te, "pytorch", None)
+        rms_cls = getattr(rms_cls, "RMSNorm", None)
+    if rms_cls is not None and _TE_RMSNORM_ORIG_FWD is None and hasattr(rms_cls, "forward"):
+        _TE_RMSNORM_ORIG_FWD = rms_cls.forward
+        rms_cls.forward = _te_rmsnorm_forward_patched
+
+    # Patch TE module-level RMSNorm functions used by fused LayerNormLinear
+    import transformer_engine.pytorch.module.layernorm as te_layernorm_mod
+
+    def _make_rmsnorm_patched(orig_func):
+        # Module-level helpers (e.g. transformer_engine.pytorch.module.layernorm.rmsnorm)
+        # do not go through the RMSNorm class, so we also wrap those functions here.
+        def _patched(*args, **kwargs):
+            # If batch-invariant mode is off, use original
+            if not is_batch_invariant_mode_enabled():
+                return orig_func(*args, **kwargs)
+
+            # Extract x, weight, eps from args/kwargs per TE signatures
+            x = args[0] if len(args) > 0 else kwargs.get("x")
+            weight = args[1] if len(args) > 1 else kwargs.get("weight")
+            eps = (args[2] if len(args) > 2 else None) if "eps" not in kwargs else kwargs.get("eps")
+            if eps is None:
+                eps = 1e-5
+            if x is None or weight is None:
+                return orig_func(*args, **kwargs)
+
+            y = rmsnorm_batch_invariant(x, weight, float(eps))
+            # Match TE behavior: cast output to parameter dtype
+            return y.to(weight.dtype)
+
+        return _patched
+
+    for name in ("rmsnorm", "rmsnorm_forward", "rmsnorm_fwd"):
+        if hasattr(te_layernorm_mod, name) and name not in _TE_RMSNORM_FUNC_ORIGS:
+            orig = getattr(te_layernorm_mod, name)
+            _TE_RMSNORM_FUNC_ORIGS[name] = orig
+            setattr(te_layernorm_mod, name, _make_rmsnorm_patched(orig))
+
+
+def _te_unpatch_for_batch_invariant():
+    """Restore original Transformer Engine functions if they were patched."""
+    global _TE_GENERAL_GEMM_ORIG, _TE_RMSNORM_ORIG_FWD, _MEG_TE_GENERAL_GEMM_ORIG
+    te_cpp = _import_module_if_available("transformer_engine.pytorch.cpp_extensions")
+    te = _import_module_if_available("transformer_engine.pytorch")
+    if te_cpp is None or te is None:
+        _TE_GENERAL_GEMM_ORIG = None
+        _TE_RMSNORM_ORIG_FWD = None
+        _MEG_TE_GENERAL_GEMM_ORIG = None
+        return
+
+    if _TE_GENERAL_GEMM_ORIG is not None and hasattr(te_cpp, "general_gemm"):
+        te_cpp.general_gemm = _TE_GENERAL_GEMM_ORIG
+        _TE_GENERAL_GEMM_ORIG = None
+
+    rms_cls = getattr(te, "RMSNorm", None)
+    if rms_cls is None:
+        rms_cls = getattr(te, "pytorch", None)
+        rms_cls = getattr(rms_cls, "RMSNorm", None)
+    if rms_cls is not None and _TE_RMSNORM_ORIG_FWD is not None:
+        rms_cls.forward = _TE_RMSNORM_ORIG_FWD
+        _TE_RMSNORM_ORIG_FWD = None
+
+    meg_te = _import_module_if_available("megatron.core.extensions.transformer_engine")
+    if (
+        meg_te is not None
+        and _MEG_TE_GENERAL_GEMM_ORIG is not None
+        and hasattr(meg_te, "general_gemm")
+    ):
+        meg_te.general_gemm = _MEG_TE_GENERAL_GEMM_ORIG
+        _MEG_TE_GENERAL_GEMM_ORIG = None
+    elif meg_te is None:
+        _MEG_TE_GENERAL_GEMM_ORIG = None
+
+    # Restore TE module-level RMSNorm functions
+    te_layernorm_mod = _import_module_if_available("transformer_engine.pytorch.module.layernorm")
+    if te_layernorm_mod is not None:
+        for name, orig in list(_TE_RMSNORM_FUNC_ORIGS.items()):
+            if hasattr(te_layernorm_mod, name):
+                setattr(te_layernorm_mod, name, orig)
+            _TE_RMSNORM_FUNC_ORIGS.pop(name, None)
+    else:
+        _TE_RMSNORM_FUNC_ORIGS.clear()
+
+    # Restore TE module.linear imported symbol for general_gemm if patched
+    te_linear_mod = _import_module_if_available("transformer_engine.pytorch.module.linear")
+    key = "module.linear.general_gemm"
+    if (
+        te_linear_mod is not None
+        and key in _TE_GEMM_FUNC_ORIGS
+        and hasattr(te_linear_mod, "general_gemm")
+    ):
+        te_linear_mod.general_gemm = _TE_GEMM_FUNC_ORIGS[key]
+        _TE_GEMM_FUNC_ORIGS.pop(key, None)
+    else:
+        _TE_GEMM_FUNC_ORIGS.pop(key, None)
+
+    # Restore TE module.layernorm_linear imported symbol for general_gemm if patched
+    te_layernorm_linear_mod = _import_module_if_available(
+        "transformer_engine.pytorch.module.layernorm_linear"
+    )
+    key = "module.layernorm_linear.general_gemm"
+    if (
+        te_layernorm_linear_mod is not None
+        and key in _TE_GEMM_FUNC_ORIGS
+        and hasattr(te_layernorm_linear_mod, "general_gemm")
+    ):
+        te_layernorm_linear_mod.general_gemm = _TE_GEMM_FUNC_ORIGS[key]
+        _TE_GEMM_FUNC_ORIGS.pop(key, None)
+    else:
+        _TE_GEMM_FUNC_ORIGS.pop(key, None)
+
+
+def _extract_te_gemm_args(args: tuple, kwargs: Dict[str, Any]):
+    """Utility to parse TE general_gemm flexible signature.
+
+    Returns (A, B, out_dtype, layout, out, bias, grad).
+    """
+    A = args[0] if len(args) > 0 else kwargs.get("A")
+    B = args[1] if len(args) > 1 else kwargs.get("B")
+    out_dtype = kwargs.get("out_dtype")
+    layout = kwargs.get("layout", "TN")
+    out = kwargs.get("out")
+    bias = kwargs.get("bias")
+    grad = kwargs.get("grad", False)
+    return A, B, out_dtype, layout, out, bias, grad
+
+
+def _is_supported_dtype_for_bik(t: torch.dtype) -> bool:
+    return t in {torch.float16, torch.bfloat16, torch.float32}
+
+
+class BatchInvariantTEGemmFn(torch.autograd.Function):
+    """Autograd function implementing batch-invariant TE GEMM."""
+
+    @staticmethod
+    def forward(
+        ctx,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        out_dtype: Optional[torch.dtype],
+        layout: str,
+    ):
+        """Forward pass computing batch-invariant TE GEMM.
+
+        Respects TE's flexible `layout` semantics, flattens leading dimensions of
+        the input as needed, applies optional bias, and casts to `out_dtype`.
+        """
+        assert isinstance(layout, str) and len(layout) == 2, f"Unsupported layout: {layout}"
+        transa = layout[0].upper() == "T"
+        transb = layout[1].upper() == "T"
+
+        opA = A.transpose(0, 1).contiguous() if transa else A.contiguous()  # [K, O] or [I, O]
+        opB = B.transpose(0, 1).contiguous() if transb else B.contiguous()  # [..., K]
+
+        # Flatten opA to 2D if needed (weight tensors should be 2D, but validate)
+        if opA.dim() > 2:
+            opA = opA.reshape(-1, opA.shape[-1])
+        elif opA.dim() < 2:
+            raise ValueError(f"opA has insufficient dimensions: {opA.shape}")
+        assert opA.dim() == 2, f"opA must be 2D for matmul_persistent, got shape {opA.shape}"
+
+        # Flatten all leading dims of opB except the last feature dim to match TE behavior
+        if opB.dim() >= 2:
+            leading_shape = opB.shape[:-1]
+            K = opB.shape[-1]
+            opB_2d = opB.reshape(-1, K)
+        else:
+            leading_shape = ()
+            opB_2d = opB
+
+        # Perform GEMM: (N_total, K) @ (K, O) -> (N_total, O)
+        base_2d = matmul_persistent(opB_2d, opA, bias=None)
+
+        # Reshape back to original leading dims with output features at the end
+        out = base_2d.reshape(*leading_shape, base_2d.shape[-1])
+
+        # Add bias after reshaping to match output structure
+        if bias is not None:
+            out = out + bias
+
+        if out_dtype is not None:
+            out = out.to(out_dtype)
+
+        # Save for backward
+        ctx.transa = transa
+        ctx.transb = transb
+        ctx.leading_shape = leading_shape
+        ctx.bias_present = bias is not None
+        ctx.save_for_backward(A, B)
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        """Backward pass for batch-invariant TE GEMM.
+
+        Computes gradients w.r.t. A, B, and optional bias while mirroring the
+        reshaping/layout logic used in the forward pass.
+        """
+        A, B = ctx.saved_tensors
+        transa = ctx.transa
+        transb = ctx.transb
+        leading_shape = ctx.leading_shape
+
+        # Reconstruct opA/opB for gradients
+        opA = A.transpose(0, 1).contiguous() if transa else A  # [K, O]
+        opB = B.transpose(0, 1).contiguous() if transb else B  # [..., K]
+
+        # Flatten grad_output to 2D to mirror forward flatten
+        if grad_output.dim() >= 2 and isinstance(leading_shape, tuple) and len(leading_shape) > 0:
+            N_total = 1
+            for s in leading_shape:
+                N_total *= s
+            grad_out_2d = grad_output.reshape(N_total, grad_output.shape[-1])
+        else:
+            grad_out_2d = grad_output
+
+        # Y = B_flat @ A -> dB_flat = dY @ A^T ; dA = B_flat^T @ dY
+        d_opB_2d = grad_out_2d.matmul(opA.transpose(0, 1).contiguous())
+        d_opA = opB.reshape(-1, opB.shape[-1]).transpose(0, 1).contiguous().matmul(grad_out_2d)
+
+        # Reshape d_opB back to original opB shape
+        d_opB = (
+            d_opB_2d.reshape(*leading_shape, d_opB_2d.shape[-1])
+            if grad_output.dim() >= 2
+            else d_opB_2d
+        )
+
+        # Map back to dA, dB based on trans flags
+        if transa:
+            dA = d_opA.transpose(0, 1).contiguous()
+        else:
+            dA = d_opA
+
+        if transb:
+            dB = d_opB.transpose(0, 1).contiguous()
+        else:
+            dB = d_opB
+
+        # Bias grad along last dimension of Y, if bias was added in forward
+        if ctx.bias_present:
+            dbias = grad_output.reshape(-1, grad_output.shape[-1]).sum(dim=0)
+        else:
+            dbias = None
+
+        return dA, dB, dbias, None, None
+
+
+def _te_general_gemm_patched(*args, **kwargs) -> List[torch.Tensor]:
+    """
+    Batch-invariant replacement for TE general_gemm.
+    Returns a list of tensors to match TE's API: (gemm_out, bias_grad, gelu_input, extra_output)
+    """
+    global _TE_GENERAL_GEMM_ORIG
+    # If original not captured, do nothing
+    if _TE_GENERAL_GEMM_ORIG is None:
+        raise RuntimeError("TE general_gemm original not captured; patching order issue")
+
+    A, B, out_dtype, layout, out, bias, grad = _extract_te_gemm_args(args, kwargs)
+    extra_output = kwargs.get("extra_output", None)
+    ub = kwargs.get("ub", None)
+    ub_type = kwargs.get("ub_type", None)
+    bulk_overlap = kwargs.get("bulk_overlap", False)
+
+    # Guardrails: validate inputs
+    if A is None or B is None:
+        raise ValueError("Batch-invariant GEMM requires A and B tensors.")
+    if (not A.is_cuda) or (not B.is_cuda):
+        raise RuntimeError("Batch-invariant GEMM requires CUDA tensors.")
+    if not _is_supported_dtype_for_bik(A.dtype) or not _is_supported_dtype_for_bik(B.dtype):
+        raise RuntimeError(f"Unsupported dtype for batch-invariant GEMM: {A.dtype}, {B.dtype}")
+
+    # Disallow GEMM-comm overlap in batch-invariant mode
+    if extra_output is not None or ub is not None or ub_type is not None or bulk_overlap:
+        raise RuntimeError(
+            "Batch-invariant GEMM does not support Userbuffers/overlap "
+            "(extra_output/ub/ub_type/bulk_overlap)."
+        )
+
+    # Compute via autograd-aware function matching TE's layout semantics
+    result = BatchInvariantTEGemmFn.apply(A, B, bias if not grad else None, out_dtype, layout)
+
+    bias_grad = None
+    if grad and bias is not None:
+        # Flatten B to 2D and sum over batch/sequence dimension (first dim)
+        B_flat = B.reshape(-1, B.shape[-1]) if B.dim() > 2 else B
+        bias_grad = B_flat.sum(dim=0)  # Sum over batch/sequence, keeping output dim
+
+    if out is not None:
+        out.copy_(result)
+        # TE expects (gemm_out, bias_grad, gelu_input, extra_output)
+        return (out, bias_grad, None, extra_output)
+    return (result, bias_grad, None, extra_output)
+
+
+class BatchInvariantRMSNormFn(torch.autograd.Function):
+    """Autograd function implementing batch-invariant RMSNorm."""
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, weight: torch.Tensor, eps: float, zero_centered_gamma: bool):
+        """Forward pass for batch-invariant RMSNorm.
+
+        Normalizes `x` using an RMSNorm-style statistic computed via `mean_dim`,
+        applies affine `weight`, and stores intermediate rsigma for backward.
+        """
+        if not x.is_cuda:
+            raise RuntimeError("Batch-invariant RMSNorm requires CUDA tensors.")
+        if not _is_supported_dtype_for_bik(x.dtype):
+            raise RuntimeError(f"Unsupported dtype for batch-invariant RMSNorm: {x.dtype}")
+        weight_eff = weight + 1.0 if zero_centered_gamma else weight
+
+        # We do everything in rmsnorm_batch_invariant manually here so that we can
+        # save rsigma in full precision for backward to match the TE behavior.
+        x_dtype = x.dtype
+        x_fp32 = x.float()
+        w_fp32 = weight.to(device=x.device, dtype=torch.float32)
+        ms = mean_dim(x_fp32 * x_fp32, dim=-1, keepdim=True)
+        rsigma = torch.rsqrt(ms + eps)
+        out_fp32 = (x_fp32 * rsigma) * w_fp32
+        out = out_fp32.to(x_dtype)
+
+        # Save for backward
+        ctx.eps = eps
+        ctx.zero_centered_gamma = zero_centered_gamma
+        ctx.rsigma = rsigma
+
+        ctx.save_for_backward(x, weight, rsigma)
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        """Backward pass for batch-invariant RMSNorm.
+
+        Computes gradients w.r.t. input and weight while matching TE's fp32
+        accumulation and reduction behavior for numerical stability.
+        """
+        x, weight, rsigma = ctx.saved_tensors
+        w_eff = (weight + 1.0) if ctx.zero_centered_gamma else weight
+
+        go_fp32 = grad_output.float()
+        x_fp32 = x.float()
+        w_fp32 = w_eff.to(device=x.device, dtype=torch.float32)
+        r = rsigma
+        r3 = r * r * r
+        D = x.shape[-1]
+
+        red_dims = tuple(range(0, go_fp32.ndim - 1))
+        g_w = (go_fp32 * x_fp32 * r).sum(dim=red_dims).to(weight.dtype)
+
+        s = (go_fp32 * x_fp32 * w_fp32).sum(dim=-1, keepdim=True)
+        dx = go_fp32 * (w_fp32 * r) - (w_fp32 * r3) * (s * x_fp32) / D
+        dx = dx.to(x.dtype)
+
+        return dx, g_w, None, None
+
+
+def rmsnorm_batch_invariant(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Batch-invariant RMSNorm wrapper that delegates to autograd-aware implementation.
+
+    This provides a simple functional interface while using the optimized BatchInvariantRMSNormFn
+    which has better numerics (fp32 precision in forward/backward).
+    """
+    # Delegate to the autograd function with zero_centered_gamma=False (standard RMSNorm)
+    return BatchInvariantRMSNormFn.apply(x, weight, eps, False)
+
+
+def _te_rmsnorm_forward_patched(self, x: torch.Tensor) -> torch.Tensor:
+    """Patched TE RMSNorm.forward that routes to batch-invariant
+    implementation with autograd support.
+    """
+    weight = getattr(self, "weight", None)
+    if weight is None:
+        raise RuntimeError("Batch-invariant RMSNorm requires affine weight.")
+    eps = getattr(self, "eps", 1e-5)
+    zero_centered_gamma = getattr(self, "zero_centered_gamma", False)
+    return BatchInvariantRMSNormFn.apply(x, weight, eps, zero_centered_gamma)
+
+
+def is_batch_invariant_mode_enabled():
+    """Return True if global batch-invariant mode is currently enabled."""
+    return _batch_invariant_MODE
+
+
+def enable_batch_invariant_mode():
+    """Enable global batch-invariant mode and patch Aten/TE kernels."""
+    global _batch_invariant_MODE, _batch_invariant_LIB
+    if _batch_invariant_MODE:
+        return
+    dispatch_key = getattr(torch.accelerator.current_accelerator(), "type", "cpu").upper()
+    _batch_invariant_MODE = True
+    _batch_invariant_LIB = torch.library.Library("aten", "IMPL")
+    _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, dispatch_key)
+    _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, dispatch_key)
+    _batch_invariant_LIB.impl("aten::_log_softmax", _log_softmax_batch_invariant, dispatch_key)
+    _batch_invariant_LIB.impl("aten::mean.dim", mean_batch_invariant, dispatch_key)
+    # Also patch Transformer Engine kernels when available
+    _te_patch_for_batch_invariant()
+
+
+def disable_batch_invariant_mode():
+    """Disable global batch-invariant mode and restore original kernels."""
+    global _batch_invariant_MODE, _batch_invariant_LIB
+    if _batch_invariant_LIB is not None:
+        _batch_invariant_LIB._destroy()
+    _batch_invariant_MODE = False
+    _batch_invariant_LIB = None
+    # Restore Transformer Engine kernels if previously patched
+    _te_unpatch_for_batch_invariant()
+
+
+@contextlib.contextmanager
+def set_batch_invariant_mode(enabled: bool = True):
+    """Context manager to toggle global batch-invariant mode.
+
+    When `enabled` is True, batch-invariant kernels are enabled for the duration of
+    the context; when False, they are disabled for the duration. This implementation
+    is re-entrant and correctly restores the previous state even under nesting.
+    """
+    global _batch_invariant_MODE, _batch_invariant_LIB
+    # Save the previous on/off state so we can correctly restore it, even under
+    # nested usage or when toggling from True->False inside an outer True scope.
+    prev_enabled = _batch_invariant_MODE
+
+    # Apply the requested state only if it differs from the current one.
+    if enabled and not prev_enabled:
+        enable_batch_invariant_mode()
+    elif not enabled and prev_enabled:
+        disable_batch_invariant_mode()
+
+    try:
+        yield
+    finally:
+        # Restore the previous state. If we turned BIK on at entry, turn it off here.
+        # If we turned it off at entry (inside an outer True scope), turn it back on.
+        if enabled and not prev_enabled:
+            disable_batch_invariant_mode()
+        elif not enabled and prev_enabled:
+            enable_batch_invariant_mode()
diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py
index f3711c86ebd..69039e0bfd0 100644
--- a/megatron/core/transformer/dot_product_attention.py
+++ b/megatron/core/transformer/dot_product_attention.py
@@ -45,10 +45,10 @@ def __init__(
         layer_number: int,
         attn_mask_type: AttnMaskType,
         attention_type: str,
-        attention_dropout: float = None,
-        softmax_scale: float = None,
-        cp_comm_type: str = None,
-        pg_collection: ProcessGroupCollection = None,
+        attention_dropout: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        cp_comm_type: Optional[str] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
         super().__init__(config=config)
 
@@ -71,6 +71,8 @@ def __init__(
             assert hasattr(
                 pg_collection, 'tp'
             ), "DotProductAttention pg_collection must have tp process group"
+        self.pg_collection = pg_collection
+        self.tp_group = self.pg_collection.tp
 
         world_size = pg_collection.tp.size()
         self.hidden_size_per_partition = divide(projection_size, world_size)
@@ -142,9 +144,9 @@ def forward(
         query: Tensor,
         key: Tensor,
         value: Tensor,
-        attention_mask: Tensor,
-        attn_mask_type: AttnMaskType = None,
-        attention_bias: Tensor = None,
+        attention_mask: Optional[Tensor],
+        attn_mask_type: Optional[AttnMaskType] = None,
+        attention_bias: Optional[Tensor] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
     ):
         """Forward."""
@@ -251,7 +253,7 @@ def forward(
     def sharded_state_dict(
         self,
         prefix: str = '',
-        sharded_offsets: Tuple[Tuple[int, int, int]] = (),
+        sharded_offsets: Tuple[Tuple[int, int, int], ...] = (),
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         """Sharded state dict for the learnable softmax offset parameter"""
@@ -260,5 +262,10 @@ def sharded_state_dict(
         else:
             state_dict = {}
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {'softmax_offset': 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {'softmax_offset': 0},
+            sharded_offsets,
+            tp_group=self.tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py
index 52b82029f90..d06d58d65f2 100644
--- a/megatron/core/transformer/enums.py
+++ b/megatron/core/transformer/enums.py
@@ -65,3 +65,15 @@ class AttnBackend(enum.Enum):
     unfused = 3
     local = 4
     auto = 5
+
+
+class CudaGraphScope(enum.Enum):
+    """Cuda Graph Scope - defines which parts of the model to capture."""
+
+    full_iteration = 1  # Captures the entire training/inference iteration
+    attn = 2  # Captures attention layers
+    mlp = 3  # Captures MLP layers (dense layers only)
+    moe = 4  # Captures MoE layers (drop-and-pad MoE layers only)
+    moe_router = 5  # Captures MoE router part
+    moe_preprocess = 6  # Captures MoE preprocessing part (requires moe_router)
+    mamba = 7  # Captures Mamba layers
diff --git a/megatron/core/transformer/experimental_attention_variant/dsa.py b/megatron/core/transformer/experimental_attention_variant/dsa.py
new file mode 100644
index 00000000000..88b4713dc60
--- /dev/null
+++ b/megatron/core/transformer/experimental_attention_variant/dsa.py
@@ -0,0 +1,822 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.models.common.embeddings import (
+    RotaryEmbedding,
+    YarnRotaryEmbedding,
+    apply_rotary_pos_emb,
+)
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.transformer_config import TransformerConfig
+
+try:
+    from fast_hadamard_transform import hadamard_transform
+except ImportError:
+    hadamard_transform = None
+
+
+def rotate_activation(x: torch.Tensor) -> torch.Tensor:
+    """Apply Hadamard rotation activation.
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L424-L428
+
+    Args:
+        x: Input tensor (must be bfloat16).
+
+    Returns:
+        Rotated tensor.
+    """
+    assert (
+        x.dtype == torch.bfloat16
+    ), f"rotate_activation only support bf16 input, but got {x.dtype}"
+    assert hadamard_transform is not None, "fast_hadamard_transform is not installed."
+    hidden_size = x.size(-1)
+    return hadamard_transform(x, scale=hidden_size**-0.5)
+
+
+class DSAIndexerLossLoggingHelper:
+    """Helper class for logging sparse attention indexer losses."""
+
+    tracker = {}
+
+    @staticmethod
+    def save_loss_to_tracker(
+        loss: torch.Tensor,
+        layer_number: int,
+        num_layers: int,
+        reduce_group: torch.distributed.ProcessGroup = None,
+        avg_group: torch.distributed.ProcessGroup = None,
+    ):
+        """Save the indexer loss for logging.
+
+        Args:
+            loss: The loss tensor.
+            layer_number: Layer index of the loss, 1-indexed.
+            num_layers: The number of total layers.
+            reduce_group: The group for reducing the loss.
+            avg_group: The group for averaging the loss.
+        """
+        # Skip indexer loss logging if layer_number is None.
+        if layer_number is None:
+            return
+
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            tracker["values"] = torch.zeros(num_layers, device=torch.cuda.current_device())
+        tracker["values"][layer_number - 1] += loss.detach()
+        tracker["reduce_group"] = reduce_group
+        tracker["avg_group"] = avg_group
+
+    @staticmethod
+    def clean_loss_in_tracker():
+        """Clear the indexer losses."""
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" in tracker:
+            tracker["values"].zero_()
+        tracker["reduce_group"] = None
+        tracker["avg_group"] = None
+
+    @staticmethod
+    def reduce_loss_in_tracker():
+        """Collect and reduce the indexer losses across ranks."""
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            return
+        values = tracker["values"]
+
+        torch.distributed.all_reduce(
+            values, group=parallel_state.get_pipeline_model_parallel_group()
+        )
+        # Reduce indexer losses across ranks.
+        if tracker.get('reduce_group') is not None:
+            torch.distributed.all_reduce(values, group=tracker.get('reduce_group'))
+        if tracker.get('avg_group') is not None:
+            torch.distributed.all_reduce(
+                values, group=tracker['avg_group'], op=torch.distributed.ReduceOp.AVG
+            )
+        torch.distributed.all_reduce(
+            values,
+            group=parallel_state.get_data_parallel_group(with_context_parallel=False),
+            op=torch.distributed.ReduceOp.AVG,
+        )
+
+    @staticmethod
+    def track_indexer_metrics(
+        loss_scale: float,
+        iteration: int,
+        writer,
+        wandb_writer=None,
+        total_loss_dict=None,
+        per_layer_logging: bool = False,
+    ):
+        """Track the sparse attention indexer metrics for logging.
+
+        Args:
+            loss_scale: Scale factor for the loss.
+            iteration: Current training iteration.
+            writer: TensorBoard writer.
+            wandb_writer: Weights & Biases writer.
+            total_loss_dict: Dictionary to accumulate total losses.
+            per_layer_logging: Whether to log per-layer losses.
+        """
+        DSAIndexerLossLoggingHelper.reduce_loss_in_tracker()
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            return
+
+        indexer_loss_values = tracker["values"] * loss_scale
+        num_layers = indexer_loss_values.shape[0]
+
+        # Average across all layers (assuming all layers have sparse attention)
+        avg_indexer_loss = indexer_loss_values.sum() / num_layers
+
+        # Log average loss
+        if total_loss_dict is not None:
+            if "indexer loss" in total_loss_dict:
+                total_loss_dict["indexer loss"] += avg_indexer_loss
+            else:
+                total_loss_dict["indexer loss"] = avg_indexer_loss
+
+        if writer is not None:
+            writer.add_scalar("indexer loss", avg_indexer_loss, iteration)
+
+        if wandb_writer is not None:
+            wandb_writer.log({"indexer loss": avg_indexer_loss}, iteration)
+
+        DSAIndexerLossLoggingHelper.clean_loss_in_tracker()
+
+
+def compute_dsa_indexer_loss(
+    index_scores: torch.Tensor,
+    topk_indices: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    softmax_scale: float,
+    loss_coeff: float,
+    sparse_loss: bool,
+    pg_collection: ProcessGroupCollection,
+) -> torch.Tensor:
+    """
+    Compute KL divergence loss between index_scores and true attention_scores.
+
+    This loss trains the indexer to predict which tokens are important by matching the distribution
+    of true attention scores.
+
+    Reference: Section 2.1 of
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf
+
+    Args:
+        index_scores: Scores predicted by indexer [batch, seqlen_q, seqlen_k].
+        topk_indices: Top-k indices [batch, seqlen_q, index_topk].
+        query: Query tensor [seqlen_q, batch, heads, dim].
+        key: Key tensor [seqlen_k, batch, heads, dim].
+        softmax_scale: Scale coefficient after q @ k^T.
+        loss_coeff: Coefficient for the indexer KL divergence loss.
+        sparse_loss: bool, whether to use sparse indexer loss. If True, only the topk
+            indices will be used to compute the loss.
+        pg_collection: Process group collection, must have TP process group.
+
+    Returns:
+        index_loss: KL divergence loss (scalar).
+    """
+    sq, b, np, hn = query.size()
+    sk = key.size(0)
+
+    # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn]
+    query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn)
+    # [sk, b, np, hn] -> [b, np, hn, sk] -> [b * np, hn, sk]
+    key = key.permute(1, 2, 3, 0).reshape(b * np, hn, sk)
+    # Compute attention scores [b * np, sq, sk]
+    attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale
+    # Reshape to [b, np, sq, sk]
+    attention_scores = attention_scores.reshape(b, np, sq, sk)
+
+    # causal_mask [sq, sk]
+    causal_mask = torch.triu(
+        torch.full((sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device),
+        diagonal=1,
+    )
+    # index_mask [b, sq, sk]
+    index_mask = torch.full(
+        (b, sq, sk), float("-inf"), dtype=torch.float32, device=causal_mask.device
+    ).scatter_(-1, topk_indices, 0)
+
+    # [b, np, sq, skv] + [1, 1, sq, skv] -> [b, np, sq, skv]
+    attention_scores += causal_mask.view(1, 1, sq, sk)
+    if sparse_loss:
+        # [b, np, sq, sk] + [b, 1, sq, sk] -> [b, np, sq, sk]
+        attention_scores += index_mask.view(b, 1, sq, sk)
+        # [b, sq, sk] + [b, sq, sk] -> [b, sq, sk]
+        index_scores += index_mask
+
+    # [b, np, sq, sk] -> [b, np, sq, sk]
+    attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32)
+    # [b, sq, sk] -> [b, sq, sk]
+    index_scores = torch.nn.functional.softmax(index_scores, dim=-1, dtype=torch.float32)
+
+    # Sum attention scores across heads.
+    # [batch, heads, seqlen_q, seqlen_k] -> [batch, seqlen_q, seqlen_k]
+    attention_scores = attention_scores.sum(dim=1)
+    if pg_collection.tp.size() > 1:
+        # attention scores are scattered to TP ranks in head dimension.
+        torch.distributed.all_reduce(attention_scores.contiguous(), group=pg_collection.tp)
+    # L1 normalize target on the last dimension. Doesn't use abs() because attention_scores are
+    # obtained from softmax so they are already non-negative.
+    attention_scores = attention_scores / attention_scores.sum(dim=-1, keepdim=True)
+
+    # Compute KL divergence: KL(target || index) = target(x) * log(target(x) / index(x))
+    # kl_per_element [b, sq, sk]
+    kl_per_element = attention_scores * (
+        torch.log(attention_scores + 1e-10) - torch.log(index_scores + 1e-10)
+    )
+
+    # [b, sq, sk] -> [b, sq] -> [1]
+    # Each token has same weight in the loss.
+    kl_div = kl_per_element.sum(dim=-1).mean()
+
+    # Scale by coefficient.
+    indexer_loss = kl_div * loss_coeff
+
+    return indexer_loss
+
+
+class DSAIndexerLossAutoScaler(torch.autograd.Function):
+    """An AutoScaler that triggers the backward pass and scales the grad for indexer loss.
+
+    This custom autograd function attaches a KL divergence loss to the activation
+    to train the indexer to predict attention scores without affecting the forward pass.
+    """
+
+    main_loss_backward_scale: torch.Tensor = None
+
+    @staticmethod
+    def forward(ctx, output: torch.Tensor, indexer_loss: torch.Tensor):
+        """Preserve the indexer_loss by storing it in the context to avoid garbage collection.
+
+        Args:
+            output: The output tensor (activation).
+            indexer_loss: The indexer KL divergence loss tensor.
+
+        Returns:
+            torch.Tensor: The output tensor unchanged.
+        """
+        ctx.save_for_backward(indexer_loss)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        """Compute and scale the gradient for indexer loss.
+
+        Args:
+            grad_output: The gradient of the output.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled indexer loss
+                gradient.
+        """
+        (indexer_loss,) = ctx.saved_tensors
+        if DSAIndexerLossAutoScaler.main_loss_backward_scale is None:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale = torch.tensor(
+                1.0, device=indexer_loss.device
+            )
+        indexer_loss_backward_scale = DSAIndexerLossAutoScaler.main_loss_backward_scale
+        scaled_indexer_loss_grad = torch.ones_like(indexer_loss) * indexer_loss_backward_scale
+        return grad_output, scaled_indexer_loss_grad
+
+    @staticmethod
+    def set_loss_scale(scale: torch.Tensor):
+        """Set the scale of the indexer loss.
+
+        Args:
+            scale: The scale value to set.
+        """
+        if DSAIndexerLossAutoScaler.main_loss_backward_scale is None:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale = scale
+        else:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale.copy_(scale)
+
+
+@dataclass
+class DSAIndexerSubmodules:
+    """
+    Configuration class for specifying the submodules of an DSA Indexer.
+
+    Args:
+        linear_wq_b: Linear projection for query bottleneck expansion.
+        linear_wk: Linear projection for key.
+        k_norm: Layer normalization for key.
+        linear_weights_proj: Linear projection for attention weights.
+    """
+
+    linear_wq_b: Union[ModuleSpec, type] = None
+    linear_wk: Union[ModuleSpec, type] = None
+    k_norm: Union[ModuleSpec, type] = None
+    linear_weights_proj: Union[ModuleSpec, type] = None
+
+
+@dataclass
+class DSAttentionSubmodules:
+    """
+    Configuration class for specifying the submodules of DSAttention.
+
+    Args:
+        indexer: DSA Indexer module for computing sparse attention indices.
+    """
+
+    indexer: Union[ModuleSpec, type] = None
+
+
+class DSAIndexer(MegatronModule):
+    """
+    DSA Lightning Indexer for DeepSeek Sparse Attention.
+
+    Computes index scores to identify the top-k most relevant key-value pairs for each query in
+    sparse attention.
+
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L431-L480
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: DSAIndexerSubmodules,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ) -> None:
+        """Initialize the indexer.
+
+        Args:
+            config (TransformerConfig): The configuration for the transformer model.
+            submodules (DSAIndexerSubmodules): Indexer submodules specification.
+            pg_collection (ProcessGroupCollection, optional): Process groups for the indexer.
+        """
+        super().__init__(config=config)
+        self.hidden_size = self.config.hidden_size
+        self.qk_pos_emb_head_dim = self.config.qk_pos_emb_head_dim
+        self.q_lora_rank = (
+            self.config.q_lora_rank
+            if self.config.q_lora_rank is not None
+            else self.config.hidden_size
+        )
+
+        self.index_n_heads = self.config.dsa_indexer_n_heads
+        self.index_head_dim = self.config.dsa_indexer_head_dim
+        self.index_topk = self.config.dsa_indexer_topk
+
+        self.softmax_scale: float = self.index_head_dim**-0.5
+
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        self.pg_collection = pg_collection
+
+        # Initialize Position Embedding.
+        if self.config.rope_type == 'rope':
+            self.rotary_pos_emb = RotaryEmbedding(
+                self.qk_pos_emb_head_dim,
+                rotary_percent=self.config.rotary_percent,
+                rotary_base=self.config.rotary_base,
+                cp_group=self.pg_collection.cp,
+            )
+        elif self.config.rope_type == 'yarn':
+            self.rotary_pos_emb = YarnRotaryEmbedding(
+                self.qk_pos_emb_head_dim,
+                rotary_base=self.config.rotary_base,
+                scaling_factor=self.config.rotary_scaling_factor,
+                original_max_position_embeddings=self.config.original_max_position_embeddings,
+                beta_fast=self.config.beta_fast,
+                beta_slow=self.config.beta_slow,
+                mscale=self.config.mscale,
+                mscale_all_dim=self.config.mscale_all_dim,
+                cp_group=self.pg_collection.cp,
+            )
+        else:
+            raise ValueError(
+                f'Unsupported RoPE type: {self.config.rope_type}, supported types are "rope" and '
+                f'"yarn"'
+            )
+
+        self.linear_wq_b = build_module(
+            submodules.linear_wq_b,
+            self.q_lora_rank,
+            self.index_n_heads * self.index_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        self.linear_wk = build_module(
+            submodules.linear_wk,
+            self.hidden_size,
+            self.index_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        k_norm_config = copy.copy(self.config)
+        k_norm_config.normalization = "LayerNorm"
+        self.k_norm = build_module(
+            submodules.k_norm,
+            config=k_norm_config,
+            hidden_size=self.index_head_dim,
+            eps=self.config.layernorm_epsilon,
+        )
+
+        self.linear_weights_proj = build_module(
+            submodules.linear_weights_proj,
+            self.hidden_size,
+            self.index_n_heads,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+    def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, mscale: float):
+        """Apply RoPE to the input tensor."""
+        # x_nope [seqlen, batch, *, index_head_dim - qk_pos_emb_head_dim]
+        # x_pe   [seqlen, batch, *, qk_pos_emb_head_dim]
+        x_nope, x_pe = torch.split(
+            x, [self.index_head_dim - self.qk_pos_emb_head_dim, self.qk_pos_emb_head_dim], dim=-1
+        )
+        x_pe = apply_rotary_pos_emb(
+            x_pe,
+            rotary_pos_emb,
+            config=self.config,
+            cu_seqlens=None,
+            mscale=mscale,
+            cp_group=self.pg_collection.cp,
+        )
+        # [seqlen, batch, *, index_head_dim]
+        x = torch.cat([x_nope, x_pe], dim=-1)
+        return x
+
+    def _compute_index_scores(
+        self, q: torch.Tensor, weights: torch.Tensor, k: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Perform index score using BF16 precision.
+
+        Reference:
+            https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/kernel.py#L254-L274
+        This is a BF16 implementation of the `fp8_index` logic:
+            1. Compute attention scores: q @ k^T;
+            2. Apply ReLU activation;
+            3. Weight by attention weights;
+            4. Sum across attention heads.
+
+        Args:
+            q: BF16 [seqlen_q, batch, index_n_heads, index_head_dim], the query tensor.
+            weights: BF16 [seqlen_q, batch, index_n_heads], the attention weights.
+            k: BF16 [seqlen_k, batch, index_head_dim], the key tensor.
+
+        Returns:
+            index_scores: FP32 [batch, seqlen_q, seqlen_k], the index scores.
+        """
+        # Compute attention scores: q @ k^T
+        # [seqlen_q, batch, index_n_heads, index_head_dim] @ [seqlen_k, batch, index_head_dim]^T
+        #   -> [seqlen_q, batch, index_n_heads, seqlen_k]
+        index_scores = torch.einsum('sbhd,tbd->sbht', q.float(), k.float())
+
+        # Apply ReLU activation.
+        index_scores = torch.relu(index_scores)
+
+        # Weight each head by attention weights.
+        # [seqlen_q, batch, index_n_heads, seqlen_k] * [seqlen_q, batch, index_n_heads, 1]
+        #   -> [seqlen_q, batch, index_n_heads, seqlen_k]
+        index_scores = index_scores * weights.unsqueeze(-1)
+
+        # Sum across attention heads.
+        # [seqlen_q, batch, index_n_heads, seqlen_k] -> [seqlen_q, batch, seqlen_k]
+        index_scores = index_scores.sum(dim=2)
+
+        # Transpose to [batch, seqlen_q, seqlen_k].
+        index_scores = index_scores.transpose(0, 1)
+
+        return index_scores
+
+    def forward_with_scores(
+        self,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass for DSA Indexer that returns both index scores and top-k indices.
+
+        This is used when KL loss is enabled to compare indexer scores with true attention scores.
+
+        Args:
+            x: hidden states [seqlen, batch, hidden_size].
+            qr: Low-rank query tensor [seqlen, batch, q_lora_rank].
+            mask: Attention mask [batch, seqlen, seqlen].
+            packed_seq_params: Packed sequence parameters for variable length sequences.
+
+        Returns:
+            index_scores: Index scores [batch, seqlen, seqlen].
+            topk_indices: Top-k indices [batch, seqlen, index_topk].
+        """
+        assert packed_seq_params is None, "Packed sequence is not supported for DSAttention"
+
+        # =========================================
+        # Prepare RoPE params
+        # =========================================
+        rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
+            None, None, x, self.config, packed_seq_params
+        )
+        if self.config.rope_type == "rope":
+            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=False)
+            mscale = 1.0
+        else:
+            rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=False)
+
+        # =========================================
+        # Gather inputs if sp is enabled
+        # =========================================
+        if self.config.sequence_parallel and self.pg_collection.tp.size() > 1:
+            x = gather_from_sequence_parallel_region(x, group=self.pg_collection.tp)
+            qr = gather_from_sequence_parallel_region(qr, group=self.pg_collection.tp)
+
+        # =========================================
+        # Get sequence length and batch size
+        # =========================================
+        seqlen, bsz, _ = x.size()
+
+        # =========================================
+        # q linear and apply rope to q
+        # =========================================
+        # [seqlen, batch, q_lora_rank] -> [seqlen, batch, index_n_heads * index_head_dim]
+        q, _ = self.linear_wq_b(qr)
+        # [seqlen, batch, index_n_heads * index_head_dim]
+        #   -> [seqlen, batch, index_n_heads, index_head_dim]
+        q = q.reshape(seqlen, bsz, self.index_n_heads, self.index_head_dim)
+        q = self._apply_rope(q, rotary_pos_emb, mscale)
+
+        # =========================================
+        # k linear and apply rope to k
+        # =========================================
+        # [seqlen, batch, hidden_size] -> [seqlen, batch, index_head_dim]
+        k, _ = self.linear_wk(x)
+        k = self.k_norm(k)
+        # [seqlen, batch, index_head_dim] -> [seqlen, batch, 1, index_head_dim]
+        k = k.reshape(seqlen, bsz, 1, self.index_head_dim)
+        k = self._apply_rope(k, rotary_pos_emb, mscale)
+        # [seqlen, batch, 1, index_head_dim] -> [seqlen, batch, index_head_dim]
+        k = k.reshape(seqlen, bsz, self.index_head_dim)
+
+        # =========================================
+        # Rotate activation
+        # =========================================
+        q = rotate_activation(q)
+        k = rotate_activation(k)
+
+        # =========================================
+        # Compute index scores
+        # =========================================
+        # [seqlen, batch, hidden_size] -> [seqlen, batch, index_n_heads]
+        weights, _ = self.linear_weights_proj(x)
+        weights = weights * (self.index_n_heads**-0.5) * self.softmax_scale
+        # [batch, seqlen, seqlen]
+        index_scores = self._compute_index_scores(q, weights, k)
+        if mask is not None:
+            assert mask.dtype == index_scores.dtype, "Mask dtype must match index scores dtype"
+            index_scores = index_scores + mask
+
+        # =========================================
+        # Select top-k indices
+        # =========================================
+        topk_k = min(self.index_topk, seqlen)
+        # [batch, seqlen, index_topk]
+        topk_indices = index_scores.topk(topk_k, dim=-1)[1]
+
+        return index_scores, topk_indices
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+    ):
+        """
+        Forward pass for DSA Indexer.
+
+        Args:
+            x: hidden states [seqlen, batch, hidden_size].
+            qr: Low-rank query tensor [seqlen, batch, q_lora_rank].
+            mask: Attention mask [batch, seqlen, seqlen].
+            packed_seq_params: Packed sequence parameters for variable length sequences.
+
+        Returns:
+            topk_indices: Top-k indices for sparse attention [batch, seqlen, index_topk].
+        """
+        _, topk_indices = self.forward_with_scores(x, qr, mask, packed_seq_params)
+        return topk_indices
+
+
+def unfused_dsa_fn(query, key, value, topk_indices, softmax_scale):
+    """
+    Unfused sparse attention implementation.
+    """
+    sq, b, np, hn = query.size()
+    skv = key.size(0)
+    hnv = value.size(3)
+
+    # ===================================
+    # Raw attention scores [b, np, sq, skv]
+    # ===================================
+    # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn]
+    query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn)
+    # [skv, b, np, hn] -> [b, np, hn, skv] -> [b * np, hn, skv]
+    key = key.permute(1, 2, 3, 0).reshape(b * np, hn, skv)
+    # Compute attention scores [b * np, sq, skv]
+    attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale
+    # Reshape to [b, np, sq, skv]
+    attention_scores = attention_scores.reshape(b, np, sq, skv)
+
+    # ===================================
+    # Apply sparse mask from indexer
+    # ===================================
+    # index_mask [b, sq, skv]
+    index_mask = torch.full((b, sq, skv), float("-inf"), device=attention_scores.device)
+    index_mask.scatter_(-1, topk_indices, 0)
+    # causal_mask [sq, skv]
+    causal_mask = torch.triu(
+        torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=index_mask.device),
+        diagonal=1,
+    )
+    # [b, sq, skv] + [1, sq, skv] -> [b, sq, skv]
+    index_mask += causal_mask.view(1, sq, skv)
+    # [b, np, sq, skv] + [b, 1, sq, skv] -> [b, np, sq, skv]
+    attention_scores += index_mask.unsqueeze(1)
+    attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32)
+
+    # ===================================
+    # Output
+    # ===================================
+    # [skv, b, np, hnv] -> [b, np, skv, hnv] -> [b * np, skv, hnv]
+    value = value.permute(1, 2, 0, 3).reshape(b * np, skv, hnv)
+    # Reshape attention_scores: [b, np, sq, skv] -> [b * np, sq, skv]
+    attention_scores = attention_scores.reshape(b * np, sq, skv)
+    # Compute output: [b * np, sq, hnv]
+    output = torch.bmm(attention_scores.to(value.dtype), value)
+    # Reshape output: [b * np, sq, hnv] -> [b, np, sq, hnv] -> [sq, b, np, hnv]
+    output = output.reshape(b, np, sq, hnv).permute(2, 0, 1, 3).contiguous()
+    # Flatten: [sq, b, np, hnv] -> [sq, b, np * hnv]
+    output = output.reshape(sq, b, np * hnv)
+    return output
+
+
+class DSAttention(MegatronModule):
+    """
+    This module implements sparse attention mechanism using an DSA Indexer to compute top-k
+    attention indices for reducing computational complexity.
+
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L491-L597
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: DSAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        k_channels: Optional[int] = None,
+        v_channels: Optional[int] = None,
+        cp_comm_type: str = "p2p",
+        pg_collection: ProcessGroupCollection = None,
+    ):
+        super().__init__(config=config)
+
+        self.layer_number = layer_number
+
+        self.indexer = build_module(
+            submodules.indexer, config=self.config, pg_collection=pg_collection
+        )
+
+        if softmax_scale is None:
+            softmax_scale = 1.0 / math.sqrt(
+                k_channels if k_channels is not None else config.kv_channels
+            )
+        self.softmax_scale = softmax_scale
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: torch.Tensor,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        attn_mask_type: AttnMaskType = None,
+        attention_bias: torch.Tensor = None,
+        packed_seq_params: PackedSeqParams = None,
+    ):
+        """
+        Forward pass for Sparse Attention.
+
+        Args:
+            query: Query tensor [sq, b, np, hn].
+            key: Key tensor [skv, b, np, hn].
+            value: Value tensor [skv, b, np, hnv].
+            x: Original hidden states [sq, b, hidden_size].
+            qr: Low-rank query representation [sq, b, q_lora_rank].
+            attention_mask: Attention mask tensor [b, 1, sq, sk].
+            attn_mask_type: Type of attention mask.
+            attention_bias: Optional attention bias.
+            packed_seq_params: Packed sequence parameters.
+
+        Returns:
+            output: Output tensor [sq, b, hidden_size]
+        """
+        sq, b, np, hn = query.size()
+        skv = key.size(0)
+        hnv = value.size(3)
+
+        # Detach x and qr to prevent gradients of indexer from flowing back to the main model.
+        x = x.detach()
+        qr = qr.detach()
+
+        # Get a FP32 mask with -inf for masked positions.
+        if attn_mask_type is not None:
+            assert attn_mask_type == AttnMaskType.causal, 'Only causal mask is supported for now'
+            # Generate upper triangular mask with -inf above diagonal, 0 elsewhere
+            # torch.triu with diagonal=1 creates upper triangular matrix (excluding main diagonal)
+            # float_mask [sq, skv]
+            float_mask = torch.triu(
+                torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=x.device),
+                diagonal=1,
+            )
+        else:
+            assert attention_mask.shape == (b, 1, sq, skv), 'attention_mask shape mismatch'
+            # [b, 1, sq, skv] -> [b, sq, skv]
+            mask = attention_mask.squeeze()
+            # float_mask [b, sq, skv]
+            float_mask = torch.zeros_like(mask, dtype=torch.float32).masked_fill(
+                mask, float('-inf')
+            )
+
+        # ===================================
+        # Get index scores and top-k indices
+        # ===================================
+        index_scores, topk_indices = self.indexer.forward_with_scores(
+            x, qr, mask=float_mask, packed_seq_params=packed_seq_params
+        )
+
+        # ===================================
+        # Run sparse attention kernel
+        # ===================================
+        output = unfused_dsa_fn(query, key, value, topk_indices, self.softmax_scale)
+
+        # ===================================
+        # Attach indexer loss
+        # ===================================
+        if self.training and torch.is_grad_enabled():
+            # Compute KL divergence loss between indexer scores and true attention scores
+            indexer_loss_coeff = getattr(self.config, 'dsa_indexer_loss_coeff', 0.0)
+            indexer_loss = compute_dsa_indexer_loss(
+                index_scores,
+                topk_indices,
+                query.detach(),
+                key.detach(),
+                self.softmax_scale,
+                indexer_loss_coeff,
+                getattr(self.config, "dsa_indexer_use_sparse_loss", False),
+                self.indexer.pg_collection,
+            )
+            # Save indexer loss for logging
+            if indexer_loss_coeff > 0:
+                DSAIndexerLossLoggingHelper.save_loss_to_tracker(
+                    loss=indexer_loss,
+                    layer_number=self.layer_number,
+                    num_layers=self.config.num_layers,
+                )
+            # Attach loss to output
+            output = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        return output
diff --git a/megatron/core/transformer/identity_op.py b/megatron/core/transformer/identity_op.py
index 5d9388ffcc6..6d42beb5a8f 100644
--- a/megatron/core/transformer/identity_op.py
+++ b/megatron/core/transformer/identity_op.py
@@ -1,16 +1,24 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+from typing import TypeVar
+
 import torch
 
+T = TypeVar('T')
+
 
 class IdentityOp(torch.nn.Module):
     """
     This is a placeholder for IdentityOp(x) -> x
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args: object, **kwargs: object):
         super().__init__()
 
-    def forward(self, x, *args, **kwargs):
+    def forward(self, x: T, *args: object, **kwargs: object) -> T:
+        """Forward pass.
+
+        Returns x unchanged.
+        """
         return x
 
 
@@ -21,8 +29,12 @@ class IdentityFuncOp(IdentityOp):
     return a function at runtime based on passed arguments
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args: object, **kwargs: object):
         super().__init__()
 
-    def forward(self, *args, **kwargs):
+    def forward(self, *args: object, **kwargs: object):
+        """Forward pass.
+
+        Returns a function which returns its first argument unchanged, and discards all others.
+        """
         return super().forward
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 9602beb2f71..ad80bcfe4e4 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import gc
 import logging
 import warnings
@@ -78,7 +79,7 @@ def __init__(
         submodules: MLPSubmodules,
         is_expert: bool = False,
         input_size: Optional[int] = None,
-        ffn_hidden_size: int = None,
+        ffn_hidden_size: Optional[int] = None,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
     ):
         super().__init__(config=config)
@@ -87,7 +88,7 @@ def __init__(
 
         self.input_size = input_size if input_size != None else self.config.hidden_size
 
-        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self.tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
         if ffn_hidden_size is None:
             if is_expert:
                 raise ValueError("MoE MLP requires `ffn_hidden_size`, but it was not provided.")
@@ -101,12 +102,25 @@ def __init__(
 
         # If this is a gated linear unit we double the output width
         # see https://arxiv.org/pdf/2002.05202.pdf
+        # For GLU/SwiGLU, use stride=2 because each TP rank stores interleaved [gate, up] portions.
+        # This is critical for correct weight resharding across different TP sizes.
         if self.config.gated_linear_unit:
             ffn_hidden_size *= 2
+            fc1_stride = 2
+            if self.config.use_kitchen:
+                # Kitchen Linear doesn't support stride != 1.
+                # Weight resharding across TP sizes will have aforementioned problems.
+                fc1_stride = 1
+        else:
+            fc1_stride = 1
+
+        # Use moe_latent_size only for routed experts. 'is_expert' is false for
+        # shared_experts.
+        use_latent_size = (self.config.moe_latent_size is not None) and is_expert
 
         self.linear_fc1 = build_module(
             submodules.linear_fc1,
-            self.input_size,
+            self.input_size if not use_latent_size else self.config.moe_latent_size,
             ffn_hidden_size,
             config=self.config,
             init_method=self.config.init_method,
@@ -116,6 +130,7 @@ def __init__(
             is_expert=is_expert,
             tp_comm_buffer_name="fc1",
             tp_group=tp_group,
+            stride=fc1_stride,
         )
 
         if self.config.use_te_activation_func and not (submodules.activation_func is None):
@@ -126,7 +141,7 @@ def __init__(
         self.linear_fc2 = build_module(
             submodules.linear_fc2,
             self.config.ffn_hidden_size,
-            self.config.hidden_size,
+            self.config.hidden_size if not use_latent_size else self.config.moe_latent_size,
             config=self.config,
             init_method=self.config.output_layer_init_method,
             bias=self.config.add_bias_linear,
@@ -137,7 +152,7 @@ def __init__(
             tp_group=tp_group,
         )
 
-    def forward(self, hidden_states, per_token_scale=None):
+    def forward(self, hidden_states, per_token_scale=None, **kwargs):
         """Perform the forward pass through the MLP block."""
         # [s, b, 4 * h/p]
         nvtx_range_push(suffix="linear_fc1")
@@ -222,6 +237,7 @@ def glu(x):
 
         # [s, b, h]
         nvtx_range_push(suffix="linear_fc2")
+
         output, output_bias = self.linear_fc2(intermediate_parallel)
         nvtx_range_pop(suffix="linear_fc2")
 
@@ -295,89 +311,26 @@ def sh_ten_build_fn(
             )
             w_key = key
             v_key = key
-        if flattened_range is None:
-            tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis)
-            return [
-                ShardedTensor.from_rank_offsets(
-                    w_key,
-                    tensor_w,
-                    *sharded_offsets,
-                    offset_w,
-                    replica_id=replica_id,
-                    prepend_axis_num=prepend_axis_num,
-                ),
-                ShardedTensor.from_rank_offsets(
-                    v_key,
-                    tensor_v,
-                    *sharded_offsets,
-                    offset_v,
-                    replica_id=replica_id,
-                    prepend_axis_num=prepend_axis_num,
-                ),
-            ]
-        else:
-            if singleton_local_shards:
-                raise NotImplementedError(
-                    'singleton_local_shards not implemented for SwiGLU MLP flattened tensors'
-                )
-            # Here we need to map a slice `t` (`flattened_range` specifies slice start and stop)
-            # of the *original* flattened tensor into slices `w` and `v` of chunked
-            # and flattened tensor.
-            # Example:
-            # If original tensor has (16, 5) shape and flattened_range is `slice(8, 64)`,
-            # then `t` has shape `(56,)` and we need to create 2 tensors:
-            # w: first 32 elements of `t` with flattened_range slice(8, 40)
-            # v: last 24 elements of `t` with flattened_range slice(0, 24)
-            # Global offsets are the same as in the non-flattened case
-            assert t.ndim == 1, (key, t.shape)
-            non_flat_local_shape = (original_shape[0] // 2, *original_shape[1:])
-            chunk_numel = original_numel // 2
-            result = []
-            if flattened_range.start < chunk_numel:
-                # Non-empty `w` chunk
-                tensor_w = t[: chunk_numel - flattened_range.start]
-                flattened_range_w = slice(
-                    flattened_range.start, min(chunk_numel, flattened_range.stop)
-                )
-                assert len(tensor_w) == flattened_range_w.stop - flattened_range_w.start
-                result.append(
-                    ShardedTensor.from_rank_offsets_flat(
-                        key,
-                        tensor_w,
-                        non_flat_local_shape,
-                        *sharded_offsets,
-                        offset_w,
-                        replica_id=replica_id,
-                        prepend_axis_num=prepend_axis_num,
-                        flattened_range=flattened_range_w,
-                    )
-                )
-            if flattened_range.stop > chunk_numel:
-                # Non-empty `v` chunk
-                tensor_v = t[-(flattened_range.stop - chunk_numel) :]
-                flattened_range_v = slice(
-                    max(chunk_numel, flattened_range.start) - chunk_numel,
-                    flattened_range.stop - chunk_numel,
-                )
-                assert len(tensor_v) == flattened_range_v.stop - flattened_range_v.start, (
-                    len(tensor_v),
-                    flattened_range_v,
-                )
 
-                result.append(
-                    ShardedTensor.from_rank_offsets_flat(
-                        key,
-                        tensor_v,
-                        non_flat_local_shape,
-                        *sharded_offsets,
-                        offset_v,
-                        replica_id=replica_id,
-                        prepend_axis_num=prepend_axis_num,
-                        flattened_range=flattened_range_v,
-                    )
-                )
-            assert sum(sh_ten.data.numel() for sh_ten in result) == t.numel(), (result, t.shape)
-            return result
+        tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis)
+        return [
+            ShardedTensor.from_rank_offsets(
+                w_key,
+                tensor_w,
+                *sharded_offsets,
+                offset_w,
+                replica_id=replica_id,
+                prepend_axis_num=prepend_axis_num,
+            ),
+            ShardedTensor.from_rank_offsets(
+                v_key,
+                tensor_v,
+                *sharded_offsets,
+                offset_v,
+                replica_id=replica_id,
+                prepend_axis_num=prepend_axis_num,
+            ),
+        ]
 
     def sh_ten_merge_fn(sub_state_dict):
         with torch.no_grad():
diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py
index ddd9cd82b93..559bb34774d 100644
--- a/megatron/core/transformer/module.py
+++ b/megatron/core/transformer/module.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
 """Megatron Module."""
+from functools import partial
 from typing import Optional, Tuple
 
 import torch
@@ -11,6 +12,7 @@
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
     make_sharded_tensors_for_checkpoint,
     sharded_state_dict_default,
 )
@@ -56,7 +58,7 @@ def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: bool = Fal
     def sharded_state_dict(
         self,
         prefix: str = '',
-        sharded_offsets: Tuple[Tuple[int, int, int]] = (),
+        sharded_offsets: Tuple[Tuple[int, int, int], ...] = (),
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         """Default implementation for sharded state dict for distributed checkpointing.
@@ -77,13 +79,26 @@ def sharded_state_dict(
         sharded_state_dict = {}
         # Save parameters
         self._save_to_state_dict(sharded_state_dict, '', keep_vars=True)
+        if not hasattr(self, 'tp_group'):
+            # some model interface hasn't updated for m4, fallback needed
+            tp_group = parallel_state.get_tensor_model_parallel_group()
+        else:
+            tp_group = self.tp_group
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
         sharded_state_dict = make_sharded_tensors_for_checkpoint(
-            sharded_state_dict, prefix, sharded_offsets=sharded_offsets
+            sharded_state_dict,
+            prefix,
+            sharded_offsets=sharded_offsets,
+            tp_group=tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
         # Recurse into submodules
         for name, module in self.named_children():
             sharded_state_dict.update(
-                sharded_state_dict_default(module, f'{prefix}{name}.', sharded_offsets, metadata)
+                sharded_state_dict_default(
+                    module, f'{prefix}{name}.', sharded_offsets, metadata, tp_group=tp_group
+                )
             )
         return sharded_state_dict
 
@@ -154,9 +169,12 @@ def __init__(self, config: TransformerConfig, vp_stage: Optional[int] = None):
 
         # Enable cuda graphs.
         if config.cuda_graph_impl == "local":
-            from megatron.core.transformer.cuda_graphs import CudaGraphManager
+            if hasattr(self, "create_mcore_cudagraph_manager"):
+                self.create_mcore_cudagraph_manager(config)
+            else:
+                from megatron.core.transformer.cuda_graphs import CudaGraphManager
 
-            self.cudagraph_manager = CudaGraphManager(config, vp_stage=vp_stage)
+                self.cudagraph_manager = CudaGraphManager(config)
         elif config.cuda_graph_impl == "transformer_engine":
             # List to store CUDA graphs. A list of `N` CUDA graphs for this layer where N is
             # the number of microbatches. Multiple CUDA graphs per layer is required to support
@@ -170,6 +188,39 @@ def __init__(self, config: TransformerConfig, vp_stage: Optional[int] = None):
             # triggered before CUDA Graph running. This is required to ensure the correct param
             # all-gather overlap with forward compute.
             self.cuda_graph_manual_hooks = []
+            # _CudaGraphBackwardDWWrapper object used to manage the wgrad backward computation.
+            # The `backward_dw` func api is the same as `TransformerLayerNode.backward_dw` and
+            # calls wgrad computation in attention module (contains attn and shared expert)
+            # according to CUDA graph scope.
+            self.cuda_graph_backward_dw_wrapper = None
+
+    def init_backward_dw_wrapper(self):
+        """Initialize the backward_dw_wrapper."""
+        from megatron.core.models.gpt.fine_grained_callables import _BackwardDWWrapper
+
+        config = getattr(self, 'config', None)
+        assert config is not None, (
+            "TransformerLayer must be initialized before calling " "`init_backward_dw_wrapper`."
+        )
+        self.backward_dw_wrapper = _BackwardDWWrapper(self)
+
+    def set_te_cuda_graph_backward_dw_wrapper(self):
+        """Replace the backward_dw callable with dw cuda graph."""
+        assert (
+            self.backward_dw_wrapper is not None
+        ), "`backward_dw_wrapper` must be set when cuda graphs are enabled for ep overlap."
+        self.backward_dw_wrapper.set_graphed_backward_dw_callable(
+            partial(self._te_cuda_graph_backward_dw_graph, self.current_microbatch)
+        )
+
+    def _te_cuda_graph_backward_dw_graph(self, microbatch_idx):
+        """
+        CUDA Graph backward weight gradient computation for current layer.
+        """
+        cg_index = microbatch_idx % len(self.cuda_graphs)
+        if not hasattr(self.cuda_graphs[cg_index], 'backward_dw'):
+            return
+        self.cuda_graphs[cg_index].backward_dw()
 
     def get_layer_static_inputs(self, seq_length, micro_batch_size):
         """
@@ -288,11 +339,7 @@ def _should_call_te_cudagraph(self, *args, **kwargs):
         )
 
     def __call__(self, *args, **kwargs):
-
         if self._should_call_local_cudagraph(*args, **kwargs):
-            # Set the is_first_microbatch flag for weight caching
-            current_microbatch = getattr(self, 'current_microbatch', 0)
-            self.cudagraph_manager.set_is_first_microbatch(current_microbatch == 0)
             return self.cudagraph_manager(self, args, kwargs)
         elif self._should_call_te_cudagraph(*args, **kwargs):
             if not self.cuda_graphs:
@@ -379,7 +426,9 @@ def __init__(self, config: TransformerConfig, module: torch.nn.Module):
         self.config = config
         self.fp16 = config.fp16
         self.bf16 = config.bf16
+        self.vp_size = config.virtual_pipeline_model_parallel_size
         self.vp_stage = getattr(module, 'vp_stage', None)
+        self.pg_collection = getattr(module, 'pg_collection', None)
 
         if self.fp16:
             self.add_module('module', module.half())
@@ -424,7 +473,7 @@ def forward(self, *inputs, fp32_output=True, **kwargs):
             The wrapped module's outputs, potentially upcast to fp32 depending on pipeline stage
             and ``fp32_output``.
         """
-        ######### FlagScale Begin ########
+        ######### FlagScale Begin #########
         #TODO: Fix the dualpipev import issue in the latest Megatron codebase
         if self.config.use_dualpipev:
             from megatron.plugin.dualpipev.dualpipev_schedules import get_dualpipe_chunk
@@ -436,17 +485,29 @@ def forward(self, *inputs, fp32_output=True, **kwargs):
             if dualpipe_last_stage:
                 outputs = float16_to_fp32(outputs)
             return outputs
-        ######### FlagScale End ########
+        ######### FlagScale End #########
+
+        from megatron.core.pipeline_parallel.utils import (
+            is_pp_first_stage,
+            is_pp_last_stage,
+            is_vp_first_stage,
+            is_vp_last_stage,
+        )
+
+        if self.pg_collection is None:
+            pp_group = parallel_state.get_pipeline_model_parallel_group()
         else:
-            if parallel_state.is_pipeline_first_stage(ignore_virtual=False, vp_stage=self.vp_stage):
-                inputs = fp32_to_float16(inputs, self.float16_convertor)
-            outputs = self.module(*inputs, **kwargs)
-            if (
-                parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=self.vp_stage)
-                and fp32_output is True
-            ):
-                outputs = float16_to_fp32(outputs)
-            return outputs
+            pp_group = self.pg_collection.pp
+        if is_vp_first_stage(self.vp_stage, self.vp_size) and is_pp_first_stage(pp_group):
+            inputs = fp32_to_float16(inputs, self.float16_convertor)
+        outputs = self.module(*inputs, **kwargs)
+        if (
+            is_vp_last_stage(self.vp_stage, self.vp_size)
+            and is_pp_last_stage(pp_group)
+            and fp32_output is True
+        ):
+            outputs = float16_to_fp32(outputs)
+        return outputs
 
     def state_dict(
         self, destination=None, prefix='', keep_vars=False
diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md
index 0a933aed0df..558e2ccd39a 100644
--- a/megatron/core/transformer/moe/README.md
+++ b/megatron/core/transformer/moe/README.md
@@ -1,159 +1,370 @@
 # Megatron Core MoE
 
-Megatron-Core MoE provides comprehensive parallelism strategies, seamlessly integrating Expert Parallelism with tensor, data, sequence, and pipeline parallelism. With MCore v0.9, we've achieved remarkable performance of **468 TFLOPS** for Mixtral 8X7B bf16 training. Additionally, we support state-of-the-art MoE model architectures including DeepSeek-V3 and Qwen-MoE.
-
-### What's New
-- **Support for DeepSeek-V3 architecture**
-  - Enable TP for MLA and DeepSeek-V3
-  - Enable CP for MLA and DeepSeek-V3
-    - Requires TransformerEngine >= 2.5.0
-    - Many thanks to [SuperCB](https://github.com/SuperCB) from Xiaohongshu Inc. and [RandMist](https://github.com/RandMist) from WeChat Infra Department, Tencent Inc. for their contributions.
-  - Support aux-loss-free load balancing strategy
-  - Support node-limited routing
-  - Support Multi-Token Prediction (MTP)
-  - Batch-level overlapping to hide EP-A2A communication
-- **Support DeepSeek's DeepEP for efficient token dispatching and combining**
-- Support HybridEP for efficient token dispatching and combining within intra-node and MNNVL scenarios.
-- Add fusion for token permutation and unpermutation
-- Support Uneven virtual pipeline parallel split
-- Support output-discarding checkpointing on some submodules
-
-### Parallelism
-- **Expert Parallelism**
-    - A specific method of parallelism for MoE models, where experts are partitioned onto different workers and each worker processes a different batch of training samples, each worker process one or more experts for each MoE layer.
-- **3D Parallelism**: Data Parallelism, Tensor Parallelism, Pipeline Parallelism
-    - Note: When using MoE with expert parallelism and tensor parallelism, sequence parallelism must be enabled.
-- **Context Parallelism**:
-    - Split the sequence dimension to support long context training.
-- **Richer parallel mappings**: EP can be combined with DP/TP/PP/CP for handling larger MoE variants.
-- **MoE Parallel Folding**: Support for setting different parallelism strategies for Attention and MoE components, enabling more flexible and efficient model sharding. See detailed documentation below.
-- **Full distributed optimizer support.**
-
-### Router and Load Balancing
-- Router type:
-    - Top-K MLP router
-- Load Balancing algorithms:
-    - Sinkhorn (S-BASE)
-    - Aux loss / Load balancing loss
-    - Aux-loss-free load balancing strategy
-- CUDA fused routing and load balancing kernels
+Megatron Core MoE is a production-ready framework for training large-scale Mixture-of-Experts models, providing the foundational architecture, performance optimizations, and best practices that guide MoE framework development across the industry.
+
+## What's New
+For latest features and architectures, please refer to the [MCore dev roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729).
+
+### 🔥 [MCore dev] (2026/01)
+- 🚀 Pipeline-aware fine-grained activation offloading
+- 🚀 Qwen3-Next model support
+- 🚀 DeepSeek-V3.2 model support
+- 🚀 Muon and Layer-wise distributed optimizer
+- 🚀 CUDA Graph support with fine-grained scopes
+
+### 🔥 [MCore v0.15] (2025/11)
+- 🚀 Add HybridEP backend to Flex Dispatcher(GB200, B200, H100 supported)
+- 🚀 Support FSDP with EP for MoE models
+
+### 🔥 [MCore v0.14] (2025/09)
+- 🚀 Batch-level overlapping to hide EP-A2A communication (--overlap-moe-expert-parallel-comm --delay-wgrad-compute)
+- 🚀 FP8 support for Fine-grained Recomputations
+- Router fusion kernels for MoE models (--moe-router-fusion)
+- Context Parallelism (CP) support for MTP and MLA
+
+### 🔥 [MCore v0.13] (2025/07)
+- Support bf16 dtype for optimizer states to use precision-aware optimizer in TransformerEngine (--use-precision-aware-optimizer)
+- Flexible Asymmetric Virtual Pipeline Parallelism with Custom Pipeline Layout (--pipeline-model-parallel-layout)
+- Add Hybrid Shard Data-Parallel support for MoE models (--num-distributed-optimizer-instances)
+- Fine-grained recomputation to reduce activation memory. (--recompute-modules with --recompute-granularity selective)
+- Memory efficient token permutation by moving the probs multiplication from unpermutation to activation function of GroupedMLP.
+
+### 🔥 [MCore v0.12] (2025/05)
+- Support DeepSeek's DeepEP for efficient token dispatching (--moe-token-dispatcher-type flex --moe-enable-deepep)
+- Support Multi-Token Prediction (MTP) (--mtp-num-layers 1)
+- CUDA Graph support for dropless MoE models with attention only capture (--te-rng-track --external-cuda-graph --cuda-graph-scope attn)
+
+## Overview of MCore MoE Supported Features and Architectures
+
+### Model Support
+- ✅ **DeepSeek**
+  - ✅ DeepSeek-V2
+  - ✅ DeepSeek-V3, including MTP
+- ✅ **Qwen**
+  - ✅ Qwen2-57B-A14B
+  - ✅ Qwen3-30B-A3B
+  - ✅ Qwen3-235B-A22B
+- ✅ **Mixtral**
+  - ✅ Mixtral-8x7B
+  - ✅ Mixtral-8x22B
+
+### Core MoE Functionality
+- ✅ Token dropless MoE (dMoE) - Advanced routing without token dropping
+- ✅ Top-K Router with flexible K selection
+- ✅ Load balancing losses for expert utilization optimization
+
+### Advanced Parallelism
+- ✅ Expert Parallel (EP) with 3D parallelism integration
+- ✅ Full parallelism combo: EP + DP + TP + PP + SP support
+- ✅ Context Parallel (CP) for long sequence MoE training
+- ✅ Parallel Folding Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training
+- ✅ Distributed Optimizer for MoE (ZeRO-1 equivalent)
 
 ### Performance Optimizations
-- (Experimental) **DeepEP** is integrated for efficient token communication in large-scale MoE training.
-- GroupedGEMM when num local experts > 1
-    - Supported dtype: bf16
-    - Performance improvements for larger MoE models
-- Enable `--tp-comm-overlap` for MoE
-- FP8 training support
-
-### Token Dispatch Mechanism
-- Dropless / No token drop
-- Token drop, with or without padding to capacity
-- Token permutation / Unpermutation fusion
+- ✅ Memory Efficient token permutation
+- ✅ Fine-grained Recomputations (mla, moe, mlp, moe_act, norm)
+- ✅ MLA TP Support for Mixture of Linear Attention
+- ✅ GroupedGEMM and GA Fusion
+- ✅ DP/PP/TP Communication Overlapping
+- ✅ Overlapped Shared Expert execution
+- ✅ Router Fusion optimizations
+- ✅ Token (un)permutation Fusion kernels
+- ✅ cuDNN fused Attention integration
+
+### Hardware & Precision Support
+- ✅ DeepEP support for H100 and B200
+- ✅ GroupedGEMM including FP8/MXFP8 support
+- ✅ FP8 weights with BF16 optimizer states
+- ✅ FP8 training full support
+
+### Developer Experience
+- ✅ MoE Model Zoo with pre-training best practices
+- ✅ Distributed Checkpointing for MoE models
+- ✅ Upcycling Support for model scaling
+- ✅ MCore2HF Converter for ecosystem compatibility
+- ✅ Layer-wise logging for detailed monitoring
+- ✅ Runtime Upcycling capabilities
+
+## Quick Start Guide
+
+### Basic MoE Training in Megatron-LM
+
+To train a top-2 MoE model with 8 experts and auxiliary loss, add the following arguments to your megatron training script:
 
-### Ease of use
-- Checkpoint converter for Mixtral models, see the [example](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mixtral) for details.
-- MoE Layer Frequency to customize the hybrid MoE/Dense layer architecture
-- Distributed checkpoining
-- Per-layer logging
-- Upcycling Support
+```bash
+## Set MoE Hidden site
+--num-experts 8
+--moe-shared-expert-intermediate-size: 2048
+## Set router config
+--moe-router-load-balancing-type aux_loss
+--moe-router-topk 2
+--moe-aux-loss-coeff 1e-2
+## Set token dispatcher
+--moe-token-dispatcher-type alltoall
+```
 
-# User Guide
+Detailed documentation for each feature is available in the [Feature Documentation](#feature-documentation) section.
 
-## Usage
+### Use the pre-defined config to train the popular MoE models
+We have provided some pre-defined config to train the popular MoE models in the [Megatron-MoE-Model-Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo/tree/main) repository. You can use them as a reference to configure your training script. Currently we have added the config for Mixtral 8x7B, Mixtral 8x22B, DeepSeek-V3, Qwen3-30B-A3B, Qwen3-235B-A22B.
 
-### Quick Start
-To train a top-2 MoE model with 8 experts and auxiliary loss, include the following arguments:
+### General Performance Tips
+#### Training arguments
+The following flags are general performance flags that can help to achieve higher performance on almost all workloads. Check if you have enabled all of them in your training script.
 
 ```bash
---num-experts 8
---expert-model-parallel-size 8
+## Enable DeepEP token dispatcher
+--moe-token-dispatcher-type flex
+--moe-flex-dispatcher-backend deepep
+## Enable GroupedGEMM
 --moe-grouped-gemm
+## Enable fusion kernels
+--moe-router-fusion
 --moe-permute-fusion
---moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss.
---moe-router-topk 2
---moe-aux-loss-coeff 1e-2
+--cross-entropy-loss-fusion
+--cross-entropy-fusion-impl te
+
+## Communication optimization
 --use-distributed-optimizer
---moe-token-dispatcher-type alltoall
-```
+--overlap-param-gather
+--overlap-grad-reduce
+--tp-comm-overlap
 
-To enable the token drop mechanism, such as GShard and SwitchTransformer, include the following arguments:
+## Enable manual gc to prevent python jitter
+--manual-gc: true
+--manual-gc-interval: 10
+```
+#### Environment variables
 
+Below are some environment variables that can be useful.
 ```bash
---moe-expert-capacity-factor 1.0
---moe-pad-expert-input-to-capacity # Optional
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # Enable expandable segments to prevent memory fragmentation
+export NCCL_NVLS_ENABLE=0 # Disable NVLS to prevent memory overhead
 ```
+#### Dependencies
+- Use the latest version of [TransformerEngine](https://github.com/NVIDIA/TransformerEngine).
+- Use the latest [NGC PyTorch Docker Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
 
-The following figure illustrates differenting dropping strategies in MCore:
-<!-- This image is uncommented for now as Sphinx cannot resolve this path. Sphinx imports this markdown file, and from the imported location this relative path does not exist anymore. Ideally, this markdown should not live here but rather in the `docs/` directory that Sphinx uses. -->
-<!-- ![Token Droppling Strategies](../../../../docs/source/images/moe/token_drop.png) -->
-
-1. The default dropless strategy will not drop or pad any token.
-2. By setting `--moe-expert-capacity-factor`, the tokens exceed the capacity of expert will be dropped based on their selected probabilities. 
-   The dropping is performed before the token exchange operation between EP ranks when EP > 1. 
-   The formula of capacity is `capacity = num_tokens_per_rank * topk * capacity_factor / num_experts`.
-3. By setting `--moe-pad-expert-input-to-capacity`, the experts with tokens less than capacity will be padded to the capacity.
-
-### Fine-tuning Mixtral Models
-Megatron-Core has full support for Mixtral MoE models, and we provide the checkpoint converter for Mixtral models from huggingface format to MCore format. 
-<!-- See more details in the [mixtral example](../../../../examples/mixtral/README.md). -->
-
-### Distributed Checkpointing
-MCore v0.7 introduced fully parallel and asynchronous saving capabilities to distributed checkpointing, 
-which addresses the issues of low efficiency in the traditional checkpoint saving methods. 
-It also solved the problem of incompatibility between checkpoints of different parallel mappings in the traditional format.
-With the new distributed checkpointing solution, MCore can achieve flexible parallelism configurations by saving and loading the unified format checkpoints.
-Compared to native PyTorch solution, MCore achieves up to 50x reduction in checkpointing overhead.
-
-From MCore v0.8, MoE supports Distributed Checkpointing, which means users can save and load with any combination of parallelism and it is currently available, including expert parallel.
-1. Loading weight and distributed optimizer states with TPxCPxEPxPP resharding with SequentialMLP is supported in version 0.8.
-2. GroupedMLP weight resharding is supported in version 0.8.0 and optimizer state resharding is supported in version 0.10.0. Switching between GroupedMLP/SequentialMLP when loading and saving is partially supported.
-3. TEGroupedMLP has fully support on distributed checkpointing and is fully exchangable with SequentialMLP in version 0.9.0.
-4. Optimizer state resharding cannot do across EP=1 with EP>1 due to the different optimizer type.
-
-Usage
-- `--ckpt-format torch_dist` The main argument, it will attempt to save and load using distributed checkpointing.
-- `--auto-detect-ckpt-format` With this, it can load both distributed checkpointing and legacy checkpointing.
-
-Checkpoint compatibility across SequentialMLP, GroupedMLP, and TEGroupedMLP:
-```text
-    ┌───────────────┐          ┌───────────────┐          ┌───────────────┐     
-    │   GroupedMLP  │          │ SequentialMLP │          │ TEGroupedMLP  │     
-    │               │          │               │          │               │     
-    │               │          │               │          │               │     
-    │ ┌───────────┐ │          │ ┌───────────┐ │          │ ┌───────────┐ │     
-    │ │legacy ckpt│ │          │ │legacy ckpt│ │          │ │legacy ckpt│ │     
-    │ └─────┬─────┘ │          │ └─────┬─────┘ │          │ └─────┬─────┘ │     
-    │       ▼       │          │       ▼       │          │       ▼       │     
-    │  ┌─────────┐  │          │  ┌─────────┐  │          │  ┌─────────┐  │     
-    │  │dist ckpt│  │          │  │dist ckpt│  │          │  │dist ckpt│  │     
-┌──►│  │ weight  │  │◄────────►│  │ weight  │  │◄────────►│  │ weight  │  │◄──┐ 
-│   │  └─────────┘  │          │  └─────────┘  │          │  └─────────┘  │   │ 
-└───┼───────────────┼──────────┼───────────────┼──────────┼───────────────┼───┘ 
-    │┌─────────────┐│          │┌─────────────┐│          │┌─────────────┐│     
-    ││  dist ckpt  ││          ││  dist ckpt  ││          ││  dist ckpt  ││     
-    ││optim states ││          ││optim states ││◄────────►││optim states ││     
-    │└─────────────┘│          │└─────────────┘│          │└─────────────┘│     
-    └───────────────┘          └───────────────┘          └───────────────┘     
-```
+## Best Practices to achieve high performance on MoE training
+
+Distributed training involves complex trade-offs between **communication**, **memory**, and **computation**, making it challenging to find an optimal parallelism configuration. This section provides a systematic workflow to help you identify the best parallel mapping for your model and hardware.
+
+### Step 1: Find the feasible parallel mapping under the memory capacity of the GPU
+To find the best parallel mapping, we need to first know the feasible parallel mapping for the model under the memory capacity of the GPU.
+The consumption of memory consists of three parts:
+- Activation memory
+- Weight and gradient memory
+- Optimizer states memory
+Different parallel strategies will shard these tensor memory in different ways.
+
+| Parallel Strategy | Peak Activation Memory          | Weight Memory  | Optimizer states                  | Communication (Per-Layer) |
+|:-----------------:|:-------------------------------:|:--------------:|:---------------------------------:|:-------------------------:|
+| TP                | 1/N (with SP on)                | 1/N            | 1/N                               |        High               |
+| EP                | ~1 (varies with EP balancing)   | 1/N in MoELayer| 1/N                               |       Medium              |
+| PP                | 1 (>1 with virtual pipeline)    | 1/N            | 1/N                               |       Medium              |
+| CP                | 1/N                             | 1              | 1/N (with distributed optimizer)  |       Medium              |
+| DP                | 1                               | 1              | 1/N (with distributed optimizer)  |        Low                |
+
+We provide the argument of `--fake-init-process-group` to emulate distributed training on one GPU. This is useful to find the feasible parallel mapping under the memory capacity of the GPU. See https://github.com/NVIDIA/Megatron-LM/pull/2254 for detailed usage.
+
+### Step 2: Select Optimal Parallelism Strategy
+
+The optimal parallelism configuration varies based on **model architecture**, **sequence length**, and **hardware platform**. Below are general guidelines to help you achieve high throughput.
+
+#### Guideline 1: Minimize Model Parallelism, Maximize Data Parallelism
+
+| Aspect | Recommendation |
+|--------|----------------|
+| **Goal** | Keep TP/EP/PP as small as possible while avoiding OOM |
+| **Why** | Model parallelism introduces communication overhead that hurts performance |
+| **How** | Use distributed optimizer (`--use-distributed-optimizer`) to shard optimizer states across DP ranks, freeing memory for larger DP size |
+
+#### Guideline 2: Keep EP and TP Communication Within NVLink Domain
+
+| Aspect | Recommendation |
+|--------|----------------|
+| **Goal** | Ensure EP×TP fits within a single node (typically 8 GPUs) |
+| **Why** | EP and TP are communication-intensive; NVLink provides much higher bandwidth than cross-node interconnects |
+| **Scaling** | When scaling beyond one node, prefer PP over expanding TP/EP across nodes |
+
+**Note:**
+For very large MoE models like DeepSeek-V3, the EP communication may exceed the NVLink bandwidth. In this case, consider using 1F1B A2A Overlap to overlap the EP communication.
+
+#### Guideline 3: Use Pipeline Parallelism (PP) for Multi-Node Scaling
+
+| Aspect | Recommendation |
+|--------|----------------|
+| **Goal** | Use PP to distribute layers across nodes while keeping EP×TP within NVLink |
+| **VPP** | Enable Virtual Pipeline Parallelism to reduce pipeline bubbles when `PP ≥ 2` |
+| **Config** | Set `--num-layers-per-virtual-pipeline-stage` to control VPP size |
+
+**VPP Size Tuning:**
+- Valid values: all divisors of `num_layers / PP_size`
+- Example: `num_layers=24, PP=4` → valid VPP sizes: `{1, 2, 3, 6}`
+- Trade-off: Larger VPP = fewer bubbles but more P2P communications
+- Recommendation: A middle value often gives the best balance
+
+#### Guideline 4: Prefer EP over TP for Expert Layers
+
+| EP Advantages | Details |
+|---------------|---------|
+| **Better GEMM efficiency** | Larger local matrix sizes improve GPU utilization |
+| **Lower communication** | EP has less communication overhead than TP for MoE layers |
+| **Simpler computation graph** | Easier to overlap communication with computation |
+| **Token permutation** | When `EP = num_experts`, local token permutation is eliminated |
+
+**Example:** For Mixtral 8x7B, `EP8×TP1` outperforms `EP4×TP2`.
+
+#### Guideline 5: Enable Context Parallelism (CP) for Long Sequences
+
+| Aspect | Recommendation |
+|--------|----------------|
+| **When to use** | Sequence length ≥ 8K tokens |
+| **Key factor** | CP efficiency depends on overlapping communication with computation |
+| **Config** | Set `--context-parallel-size` to partition sequences across GPUs |
+
+### Step 3: Enable Performance Features Based on Profiling Bottlenecks
+
+After establishing a working parallel configuration, profile your training to identify bottlenecks and apply targeted optimizations.
+
+#### Memory Bottleneck
+
+**Symptom**: Forced to use full recomputation or excessively large parallelism degrees to avoid OOM.
+
+**Solutions**:
+| Optimization | Overhead | Config | Reference |
+|--------------|----------|--------|---------|
+| Selective Recomputation | Low | `--recompute-granularity selective --recompute-modules ...` | [Fine-grained Recomputation](#fine-grained-recomputation) |
+| Activation Offloading | Medium | `--fine-grained-activation-offloading --offload-modules ...` | [Fine-grained Activation Offloading](#fine-grained-activation-offloading) |
+| Optimizer Offloading | Medium | `--optimizer-cpu-offload` | --- |
+
+#### Communication Bottleneck
+
+**Symptom**: Profiling shows significant time spent in collective operations.
+
+**Solutions**: Identify which communication is the bottleneck and enable corresponding overlap:
+| Communication Type | Overlap Config |
+|--------------------|----------------|
+| DP gradient reduce | `--overlap-grad-reduce` |
+| DP param gather    | `--overlap-param-gather` |
+| TP communication   | `--tp-comm-overlap` |
+| EP All-to-All      | `--overlap-moe-expert-parallel-comm --delay-wgrad-compute` |
+| PP send/recv       | Enable VPP with `--num-layers-per-virtual-pipeline-stage` |
+
+#### CPU Overhead Bottleneck
+
+**Symptom**: Nsight Systems timeline shows gaps between GPU kernels where CPU cannot launch kernels fast enough.
+
+**Solutions**:
+| Optimization | Config |
+|--------------|--------|
+| Disable Python GC | `--manual-gc --manual-gc-interval 100` |
+| Enable CUDA Graphs | `--cuda-graph-impl transformer_engine --cuda-graph-scope attn moe_router moe_preprocess` |
+| Reduce kernel launches | Decrease TP size or increase micro-batch size |
+
+#### Computation Bottleneck
+
+**Symptom**: GPU utilization is low despite no communication or CPU bottlenecks.
+
+**Solutions**:
+| Optimization | Config |
+|--------------|--------|
+| Enable kernel fusions | `--moe-router-fusion --moe-grouped-gemm --moe-permute-fusion` |
+| Use FP8 precision | `--fp8-format e4m3 --fp8-recipe blockwise` |
+
+
+## Feature Documentation
+
+### Router and Load Balancing
+
+Routers determine which expert(s) handle each token. A lightweight MLP scores every token and applies `softmax` or `sigmoid` to compute routing probabilities. The router then selects the top-K experts for each token.
+
+> **Note**: The router logits is better to remain in **FP32** or **FP64** rather than BF16 by --moe-router-dtype fp32. At high expert counts, FP32 precision yields better accuracy because output hidden states of experts are multiplied by router scores and accumulated to get the final output.
+
+#### Router Types
+
+| Router Types | Description | Config |
+|-------------|-------------|----------|
+| **Top-K Router** | Standard routing with configurable K, uses softmax for probability computation | --moe-router-topk 8 |
+| **Group Top-K Router** | Selects top-K expert groups, then routes experts in selected groups | --moe-router-num-groups 8 --moe-router-group-topk 4 |
+| **Router score function** | Score function to calculate the probs from output logits of router | --moe-router-score-function softmax/sigmoid |
+
+#### Load Balancing Strategies
+
+| Strategy | Description | Config |
+|----------|-------------|--------|
+| **aux_loss** | Auxiliary loss for balancing expert usage on a micro-batch | `--moe-router-load-balancing-type aux_loss` |
+| **seq_aux_loss** | Sequence-level auxiliary loss for balancing expert usage on each sequence| `--moe-router-load-balancing-type seq_aux_loss` |
+| **global_aux_loss** | Global auxiliary loss for balancing expert usage on a global batch across all ranks | `--moe-router-load-balancing-type global_aux_loss` |
+| **sinkhorn** | Optimal transport formulation for balancing expert usage | `--moe-router-load-balancing-type sinkhorn` |
+| **aux loss free** | Dynamic bias-based load balancing strategy without auxiliary loss | `--moe-router-enable-expert-bias --moe-router-bias-update-rate 1e-3`|
+| **none** | No load balancing | `--moe-router-load-balancing-type none` |
+
+### Token Dispatching
+
+After routing, tokens are **dispatched** to the GPU hosting the assigned expert. After expert computation, tokens are sent back and **combined** to restore the original sequence.
+
+| Dispatcher | Description | Best For | Config |
+|------------|-------------|----------|--------|
+| **alltoall** | NCCL-based All-to-All communication for token exchange | Standard EP > 1 setups | `--moe-token-dispatcher-type alltoall` |
+| **FlexDispatcher with [DeepEP](https://github.com/deepseek-ai/DeepEP) backend** | Removes redundant tokens during cross-node communication, fuses intra/inter-node communication into single kernel | Cross-node EP, fine-grained MoE (DeepSeek-V3) | `--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend deepep` |
+| **FlexDispatcher with [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) backend** | NVIDIA's optimized dispatcher using TMA and IBGDA, fewer SMs, native MNNVL support | GB200 NVL72, Multi-Node NVLink | `--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep` |
+| **allgather** | Gathers all tokens to each GPU, no inter-GPU token movement | TP-only setups, small EP, large Top-K | `--moe-token-dispatcher-type allgather` |
+
+### Upcycling
+Use `--moe-use-upcycling` to enable upcycling, which loads the dense model from the `--load` directory, converts it to an MoE model at runtime, and starts training. The converted model is saved to the `--save` path before training begins. Upcycling is built on distributed checkpointing, supporting parallel modes different from existing dense checkpoints, such as arbitrary expert parallelism during upcycling.
+
+In addition to the default upcycling strategy, we also support granular upcycling strategy which is a more state-of-the-art upcycling strategy from [our recent research work](https://arxiv.org/abs/2410.07524). For the default upcycling strategy, we duplicate the existing MLP to multiple experts, with each expert starting from a copy of the MLP. For the granular upcycling strategy, we use `--moe-upcycling-granularity` to specify how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set `--moe-upcycling-granularity` as a positive integer. If this param is set to 1, it means using the default upcycling strategy.
+
+Note: The MoE model structure is defined through script arguments. All MoE-related arguments (such as `--num-experts`) can be customized; however, other model structure arguments must be consistent with those of the dense model. For granular upcycling strategy, the moe's FFN hidden size should be set as dense FFN hidden size divided by `--moe-upcycling-granularity`.
+
+## Training Optimizations
+MoE training faces three fundamental performance bottlenecks: **Memory Wall**, **Communication Wall**, and **Compute Efficiency Wall**. The following optimizations address each of these challenges.
+
+### MoE Parallel Folding
+**The Problem with Traditional Approaches:**
+- Prior MoE frameworks constrain **EP ≤ DP** (Expert Parallelism must be a sub-group of Data Parallelism), which severely limits scalability.
+- Applying the same TP/CP to both attention and MoE is suboptimal:
+  - High TP benefits attention but hurts MoE (small per-expert dims make TP overhead prohibitive)
+  - High CP benefits long-context attention but is unnecessary for MoE (tokens processed independently)
+
+**MoE Parallel Folding** is Megatron Core's solution that **decouples attention and MoE parallelism**:
 
-Best practices for distributed checkpointing:
-1. Convert a legacy checkpoint to a distributed checkpoint. To achieve this, we can add both `--ckpt-format torch_dist --auto-detect-ckpt-format`, then it will load the legacy one and save as the distributed checkpoint format later when the training progress tries to save checkpoints.
-2. Convert checkpoint of the legacy GroupedMLP to TEGroupedMLP. This is only supported for the weight parts. To achieve this, we can use the above method to convert the legacy checkpoint to a distributed checkpoint of the legacy GroupedMLP. After updating the libraries and using TEGroupedMLP, we can directly load the previously saved checkpoint by adding argument `--no-load-optim`.
+| Parallelism Group | Attention Layers | MoE Layers |
+|-------------------|------------------|------------|
+| **Dimensions** | TP × CP × DP × PP | ETP × EP × EDP × PP |
 
-### Shared Experts
-MCore v0.9 introduced the shared expert feature. We can enable this feature by setting suitable `--moe-shared-expert-intermediate-size`.
+#### Key Benefits
 
-The parallelism patterns of the shared experts follow the settings of the dense part, i.e., the attention module. The shared experts are not distributed but replicated in EP ranks.
+1. **Breaks the EP ≤ DP Constraint**
+   - Traditional: TP=4, CP=2, DP=8, PP=4 → max EP=8
+   - With Folding: Same attention config, but MoE uses ETP=1, EP=64, EDP=1 → 8× more expert parallelism
 
-We also have an experimental feature that tries to overlap the communications and computations in the shared experts and the dispatcher.
-We can set `--moe-shared-expert-overlap` and use `alltoall` dispatcher to enable it.
-The overlapping relies on the envirionment setting `CUDA_DEVICE_MAX_CONNECTIONS=1`.
-The `AllGather` and `ReduceScatter` communications in the shared experts are overlapped with `permute`/`unpermute` in the dispatcher.
-The `MLP` computation part in the shared experts are overlapped with the `AlltoAll` communications in the dispatcher.
-Both the forward and the backward pass can overlap. But to get the overlapping in the backward pass, the PyTorch version should `>= 2.2.0`.
+2. **Reduces Minimum GPU Requirements**
+   - Traditional CP=8, EP=8 requires at least 64 GPUs
+   - With Folding: CP and EP are folded together, only 8 GPUs needed
 
-### Checkpointing
+3. **Enables Independent Optimization**
+   - Use high TP for attention (memory efficiency)
+   - Use ETP=1 for MoE (better GEMM efficiency, less communication)
+
+4. **Keeps High-Bandwidth Communication in NVLink Domain**
+   - Both CP and EP communication can remain within NVLink domain
+
+> **Reference**: [MoE Parallel Folding: Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training](https://arxiv.org/abs/2504.14960)
+
+### Memory Optimization
+
+Memory optimization is critical for large-scale MoE training, as MoE models maintain all expert parameters even though only a subset is activated per token.
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **Fine-grained Recomputation** | Selectively recomputes specific modules (e.g., `mla_up_proj`, `layernorm`, `moe_act`) instead of full layers | `--recompute-granularity selective --recompute-modules mla_up_proj layernorm moe_act` |
+| **Fine-grained Activation Offloading** | Offloads activations to CPU memory, overlapping D2H/H2D transfers with computation | See `docs/source/api-guide/fine_grained_activation_offloading.md` |
+| **Precision-aware Optimizer** | Stores optimizer states (exp_avg, exp_avg_sq) in BF16 instead of FP32, reducing optimizer memory by 50% | `--use-precision-aware-optimizer --exp-avg-dtype bf16 --exp-avg-sq-dtype bf16` |
+| **Optimizer Offloading** | Offloads optimizer states to CPU memory. | `--optimizer-cpu-offload` |
+
+#### Fine-grained Recomputation
 A new output-discarding checkpointing method is also supported. This method discards the output memory of certain submodules during the forward pass and recomputes them during the backward pass, which can save memory compared to standard checkpointing. This can be enabled for specific submodules using the `--recompute-granularity selective --recompute-modules [submodule1, submodule2, ...]` argument. The supported submodules are:
 
 * `moe_act`: Recompute the GroupedMLP activation function.
@@ -163,123 +374,216 @@ A new output-discarding checkpointing method is also supported. This method disc
 * `mlp`: Recompute the dense MLP submodule (uses standard checkpointing rather than output-discarding) which is useful for hybrid-models like DeepSeek-V3.
 * `moe`: Recompute the MoE layer submodule (uses standard checkpointing rather than output-discarding).
 
-### Upcycling
-Use `--moe-use-upcycling` to enable upcycling, which loads the dense model from the `--load` directory, converts it to an MoE model at runtime, and starts training. The converted model is saved to the `--save` path before training begins. Upcycling is built on distributed checkpointing, supporting parallel modes different from existing dense checkpoints, such as arbitrary expert parallelism during upcycling.
+#### Fine-grained Activation Offloading
 
-In addition to the default upcycling strategy, we also support granular upcycling strategy which is a more state-of-the-art upcycling strategy from [our recent research work](https://arxiv.org/abs/2410.07524). For the default upcycling strategy, we duplicate the existing MLP to multiple experts, with each expert starting from a copy of the MLP. For the granular upcycling strategy, we use `--moe-upcycling-granularity` to specify how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set `--moe-upcycling-granularity` as a positive integer. If this param is set to 1, it means using the default upcycling strategy.
+Unlike recomputation (which trades compute for memory), offloading trades **GPU-CPU bandwidth for memory**: activations are transferred to CPU during forward pass and retrieved during backward pass. The key is hiding transfer latency behind computation using asynchronous D2H/H2D transfers.
 
-Note: The MoE model structure is defined through script arguments. All MoE-related arguments (such as `--num-experts`) can be customized; however, other model structure arguments must be consistent with those of the dense model. For granular upcycling strategy, the moe's FFN hidden size should be set as dense FFN hidden size divided by `--moe-upcycling-granularity`.
+**Key Features:**
+- **Module-level granularity**: Target specific modules rather than entire layers
+- **Computation-offloading overlap**: Asynchronous transfers via independent CUDA streams
+- **Compatible with PP/VPP**: Works with pipeline parallelism and fine-grained recomputation
 
-### Leverage DeepSeek's DeepEP for High-Performance Cross-Node Token Dispatching
-- [DeepSeek-DeepEP](https://github.com/deepseek-ai/deepep) provides a highly optimized implementation for MoE token dispatching and combining operations, specifically designed for large-scale MoE training scenarios.
-- DeepEP is particularly recommended for training large-scale, fine-grained MoE architectures such as DeepSeek-V3 and other advanced MoE models.
-- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-flex-dispatcher-backend=deepep` in your command line arguments.
+**Usage**
+```bash
+--fine-grained-activation-offloading
+--offload-modules expert_fc1 moe_act # Choices: attn_norm, core_attn, attn_proj, mlp_norm, expert_fc1, moe_act
+```
 
-### Integrate HybridEP for High-Performance Intra-Node Token Dispatching
-- [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is developed by NVIDIA as an optimized solution for large-scale MoE (Mixture of Experts) all-to-all communication. It is designed to leverage NVIDIA GPU hardware capabilities, significantly reducing Streaming Multiprocessor (SM) resource usage.
-- HybridEP currently supports intra-node and multi-node NVLink scenarios.
-- To enable HybridEP, set `--moe-token-dispatcher-type=flex` and
-  `--moe-flex-dispatcher-backend=hybridep` in your command line arguments.
+For more details, see `docs/source/api-guide/fine_grained_activation_offloading.md`
 
-### CUDA Graph Support
-CUDA Graph functionality can be enabled through the `--cuda-graph-impl` option. There are two implementations:
+### Communication Optimization
 
-1. `--cuda-graph-impl=local`: Captures cuda graphs using the MCore-internal cuda graph manager.
-2. `--cuda-graph-impl=transformer_engine`: Captures cuda graphs using the TE `make_graphed_callables()` interface.
+Distributed training introduces communication overhead from various parallelism strategies. Megatron Core supports overlapping communication with computation to hide latency and improve throughput.
 
-To use `--cuda-graph-impl=transformer_engine`, the user should call related methods `TECudaGraphHelper.create_cudagraphs()` and `TECudaGraphHelper.cuda_graph_set_manual_hooks()` in the training script. Please refer to the usage in `megatron/training/training.py`.
+#### Data Parallel (DP) Communication Overlap
 
-For MoE models, certain configurations may prevent CUDA Graph capture of MoE layers. Specifically, when `--moe-expert-capacity-factor` and `--moe-pad-expert-input-to-capacity` are not set, the resulting dynamic shapes make MoE layers uncapturable. In such cases, you can still leverage CUDA Graphs for the attention layers (operations in `TransformerLayer._forward_attention()`) by setting `--cuda-graph-scope=attn`, while leaving the MoE layers (operations in `TransformerLayer._forward_mlp()`) unmodified. See the argument description for more usage of `--cuda-graph-scope`.
+With distributed optimizer, DP introduces **reduce-scatter** (gradients) and **all-gather** (parameters) communications, chunked by Transformer layer granularity.
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **Gradient Reduce Overlap** | Overlaps gradient reduce-scatter with backward computation | `--overlap-grad-reduce` |
+| **Param Gather Overlap** | Overlaps parameter all-gather with forward computation | `--overlap-param-gather` |
+| **BF16 Gradient Reduce** | Reduces gradients in BF16 instead of FP32 for better performance | `--grad-reduce-in-fp32 false` (via mixed precision config) |
+| **FP8 Param Gather** | Conducts parameter all-gather in FP8, reducing overhead by 50% | `--fp8-param-gather` |
+
+#### Tensor Parallel (TP) Communication Overlap
+
+TP with sequence parallelism introduces activation all-gather and reduce-scatter operations. Communications are overlapped in **bulk** (no dependency) or **pipelined** (with dependency) fashion.
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **TP Comm Overlap** | Enables bulk and pipelined TP communication overlap | `--tp-comm-overlap` |
+
+> **Requirements**: `tensor_model_parallel_size >= 2` and `--sequence-parallel`
+
+#### Pipeline Parallel (PP) Communication Overlap
+
+PP introduces P2P activation sends/receives between pipeline stages. Overlap is automatic in the 1F1B pipelining phase when VPP is enabled.
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **P2P Comm Overlap** | Overlaps PP P2P communications with non-dependent computations | `--overlap-p2p-comm` (auto-enabled with VPP) |
+| **VPP for Better Overlap** | Increases overlap opportunities by reducing layers per virtual stage | `--num-layers-per-virtual-pipeline-stage` |
+
+#### Expert Parallel (EP) Communication Overlap
+
+EP All-to-All can consume 30-40% of training time without optimization. These features hide or reduce EP communication overhead.
 
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **EP A2A Overlap** | Overlaps All-to-All with computation by merging FWD-BWD passes of adjacent microbatches | `--overlap-moe-expert-parallel-comm --delay-wgrad-compute` |
+| **Shared Expert Overlap** | Runs shared expert computation concurrently with EP token transfer | `--moe-shared-expert-overlap` |
 
-### Batch-Level EP-A2A hidding
-Enable A2A overlap across different batches inspired by the DSv3 DualPipe implmentation. \
-**Features** 
-- Hide ep a2a communication by batch-level overlapping
-- Split weight gradient and activation gradient computations for better overlap with communications
-- Support interleaved pipelined parallelism
-- Support FP8 training
-- Support MTP (`-mtp-num-layers 1` only, multiple MTP layers are not supported yet.)
+> **Requirements for EP A2A Overlap**: `expert_model_parallel_size > 1`, CUDA_DEVICE_MAX_CONNECTIONS > 1.
 
+### Compute Optimization
+
+Fine-grained MoE produces many small operations that can underutilize GPU resources. These optimizations reduce kernel launch overhead and improve GPU utilization.
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **Grouped GEMM** | Batches multiple expert GEMM operations into a single kernel call, improving GPU utilization | `--moe-grouped-gemm` |
+| **Router Fusion** | Fuses router projection, top-k selection, softmax, and auxiliary loss into fewer kernels | `--moe-router-fusion` |
+| **Permute Fusion** | Fuses token permutation/unpermutation operations into optimized single kernels | `--moe-permute-fusion` |
+| **FP8 Training** | Uses FP8 Tensor Core operations for faster GEMMs on Hopper/Blackwell GPUs | `--fp8 --fp8-recipe blockwise` |
+
+
+### FP8 Training
+
+FP8 training provides benefits across all three performance walls:
+
+| Wall | FP8 Benefit | Impact |
+|------|-------------|--------|
+| **Memory** | 50% activation reduction | Stores linear layer inputs in FP8 instead of BF16 |
+| **Memory** | Eliminate BF16 weight copies | Native FP8 casts directly from FP32 to FP8 |
+| **Communication** | 50% EP dispatch volume | Dispatches tokens in FP8 instead of BF16 |
+| **Communication** | 50% parameter all-gather | With FP8 primary weights (except MXFP8) |
+| **Compute** | Faster Tensor Core GEMMs | FP8 ops on Hopper/Blackwell are faster than BF16 |
+
+#### FP8 Recipes
+
+| Recipe | Scaling Granularity | Format | Platform | Use Case |
+|--------|---------------------|--------|----------|----------|
+| **Per-tensor** | Whole tensor | E4M3/E5M2 hybrid | Hopper, Blackwell | Conservative, initial experimentation |
+| **Blockwise** | 1×128 (activations), 128×128 (weights) | E4M3 | Hopper | **Production-proven** (DeepSeek-V3, Minimax-M2) |
+| **MXFP8** | 1×32 | E4M3 + E8M0 scaling | Blackwell | Native hardware support on GB200 |
+
+> **Recommendation**: Use **blockwise FP8** on Hopper for production training. It has been validated at scale on DeepSeek-V3 class models.
+
+#### MoE-Specific FP8 Optimizations
+
+| Optimization | Description | Config |
+|--------------|-------------|--------|
+| **Routing Map Padding** | Pads routing map (not tokens) to align M dimension to 16/32, avoiding per-tensor padding overhead | `--moe-router-padding-for-fp8` |
+| **FP8 Primary Weights** | Casts FP32 master weights directly to FP8, eliminating BF16 intermediate copy | `--fp8-param-gather` (Need additional `--reuse-grad-buf-for-mxfp8-param-ag` for MXFP8) |
+
+
+#### Example Configuration
 
-**Usage** 
 ```bash
-# Add the following flags to your training scripts
---overlap-moe-expert-parallel-comm
-# [optional] only works with specific TE version
---delay-wgrad-compute
+# Blockwise FP8 on Hopper (recommended for production)
+--fp8-format e4m3
+--fp8-recipe blockwise
+--fp8-param-gather
+--moe-router-padding-for-fp8
+
+# MXFP8 on Blackwell
+--fp8-format e4m3
+--fp8-recipe mxfp8
+--moe-router-padding-for-fp8
+--fp8-param-gather 
+--reuse-grad-buf-for-mxfp8-param-ag
 ```
 
-### MoE Related Arguments
-| Item | Description |
-| --- | --- |
-| --num-experts | Number of Experts in MoE (None means no MoE) |
-| --expert-model-parallel-size | Degree of expert model parallelism. Default is 1. |
-| --moe-ffn-hidden-size | MoE Feed-Forward Network hidden size. Default is None. |
-
-<details>
-<summary> View all MoE related arguments. </summary>
-
-| Item | Description |
-| --- | --- |
-| --num-experts | Number of Experts in MoE (None means no MoE) |
-| --expert-model-parallel-size | Degree of expert model parallelism. Default is 1. |
-| --moe-ffn-hidden-size | MoE Feed-Forward Network hidden size. Default is None. |
-| --expert-tensor-parallel-size | Degree of tensor model parallelism of expert layer. Default is same to --tensor-model-parallel-size. |
-| --moe-layer-freq | Frequency between MoE layers and Dense layers. Accepts either: 1) An integer N for 1:N ratio (one expert layer for every N-1 dense layers), 2) A string "N" for the same ratio, or 3) A string with Python list expression for custom patterns like `([1]*3+[0]*1)*3` which gives [1,1,1,0,1,1,1,0,1,1,1,0] where 1=expert layer and 0=dense layer. Examples: `([0]+[1]*23)` for 1 dense layer followed by 23 experts layers, `([1]*3+[0]*2)*2` for three expert layers followed by two dense layers, repeated twice. Default is 1. |
-| --moe-grouped-gemm | When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine. |
-| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2 and DeepSeekV3, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". |
-| --moe-router-dtype | Data type for routing computation and expert output weighted averaging. Options are 'fp32' and 'fp64'. This can improve numerical stability, particularly when using a large number of experts. The throughput/memory impact should be negligible when used with --moe-permute-fusion. Default is None (no dtype promotion). |
-| --moe-router-topk | Number of experts to route to for each token. The default is 2. |  
-| --moe-router-score-function | Score function for MoE routing. Can be "softmax" or "sigmoid". Default is "softmax". |
-| --moe-router-pre-softmax | Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k. |
-| --moe-router-num-groups | Number of groups to divide experts into for group-limited routing. When using group-limited routing: 1) Experts are divided into equal-sized groups, 2) For each token, a subset of groups are selected based on routing scores (sum of top-2 expert scores within each group), 3) From these selected groups, moe_router_topk experts are chosen.  Two common use cases: 1) Device-limited routing: Set equal to expert parallel size (EP) to limit each token to experts on a subset of devices (See DeepSeek-V2: https://arxiv.org/pdf/2405.04434) 2) Node-limited routing: Set equal to number of nodes in EP group to limit each token to experts on a subset of nodes (See DeepSeek-V3: https://arxiv.org/pdf/2412.19437)) |
-| --moe-router-group-topk | Number of selected groups for group-limited routing. |
-| --moe-router-topk-scaling-factor | Scaling factor for routing score in top-k selection, only works when --moe-router-pre-softmax enabled. Defaults to None, which means no scaling. |
-| --moe-router-enable-expert-bias | TopK routing with dynamic per-expert bias in the aux-loss-free load balancing strategy. The routing decision is based on the sum of the routing scores and the expert bias. See https://arxiv.org/abs/2408.15664 for details. |
-| --moe-router-fusion | Enable fusion for MoE TopK routing and aux-loss computation. This is only supported in TransformerEngine 2.7.0 and above. |
-| --moe-router-bias-update-rate | The expert bias is updated based on the number of assigned tokens to each expert in a global batch, where the bias is increased for experts with less assigned tokens and decreased for experts with more assigned tokens. Default is 1e-3 same as that used in DeepSeekV3. |
-| --moe-router-force-load-balancing | (Experimental) Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only! |
-| --moe-router-padding-for-quantization | Pad the routing_map to make sure the number of tokens each expert received is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for dropless training with FP8 precision when num_local_experts > 1. This is a more efficient way to pad for FP8 which eliminates the explicit padding in the GroupedMLP layer. |
-| --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. |
-| --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. |
-| --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. |
-| --moe-token-dispatcher-type | Determines the token dispatcher type. Choices are "allgather", "alltoall". Default is "allgather". We recommend using 'alltoall' if expert parallelism is applied. We have upgraded the "alltoall" dispatcher in place during MCore v0.9, while the original implementation renamed as "alltoall_seq" is retained until MCore v0.13.|
-| --moe-flex-dispatcher-backend | (Experimental) Select the backend for the flex token dispatcher. Supported options: "deepep", "hybridep". Enables efficient token dispatching and combining for MoE models. |
-| --moe-per-layer-logging | Enable per-layer logging for MoE, currently supports auxiliary loss and z loss. |
-| --moe-expert-capacity-factor | The capacity factor for each expert, None means no token will be dropped. Default is None. |
-| --moe-pad-expert-input-to-capacity | Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set. |
-| --moe-token-drop-policy | The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. |
-| --moe-layer-recompute | Enable activation checkpointing for moe_layer, should be used when memory is not sufficient. |
-| --moe-permute-fusion | Fuse token rearrangement ops during token dispatching. |
-| --moe-shared-expert-intermediate-size | Set shared expert total ffn hidden size. It should be equal to `num_shared_experts * ffn_size_of_each_shared_expert` if there are multiple shared experts. None means no shared expert. |
-| --moe-shared-expert-overlap | (Experimental, may change) If this is set, the communications/computations in the shared experts and the dispatcher will overlap (The `alltoall` dispatcher is needed.) Otherwise, the shared expert runs after the routed experts. |
-| --moe-use-upcycling | Load the dense model checkpoint, convert it into an MoE model at runtime and start training. The converted model will be saved to the path specified by `--save` before training begins. Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.|
-| --overlap-moe-expert-parallel-comm | Enable batch-level overlapping in 1f1b stage. |
-| --delay-wgrad-compute | Enable split dgrad and wgrad for `overlap-moe-expert-parallel-comm` execution. Increasing room to hide communication latency by more finegrained control. |
-| --pipeline-model-parallel-layout | (Experimental, may change) A string containing a Python list expression that defines a custom pipeline model parallel layout. |
-| --moe-upcycling-granularity | This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.|
+> **Note**: For blockwise and MXFP8 recipes with current scaling, training loss curves show negligible difference compared to BF16 baselines.
 
-</details>
 
-## MoE training example:
-<details>
-<summary>Click here. </summary>
+### CUDA Graph
+CUDA Graph functionality can be enabled through the `--cuda-graph-impl` option. There are two implementations:
+
+1. `--cuda-graph-impl=local`: Captures cuda graphs using the MCore-internal cuda graph manager.
+2. `--cuda-graph-impl=transformer_engine`: Captures cuda graphs using the TE `make_graphed_callables()` interface.
 
+To use `--cuda-graph-impl=transformer_engine`, the user should call related methods `TECudaGraphHelper.create_cudagraphs()` and `TECudaGraphHelper.cuda_graph_set_manual_hooks()` in the training script. Please refer to the usage in `megatron/training/training.py`.
+
+For MoE models, certain configurations may prevent CUDA Graph capture of MoE layers. Specifically, when `--moe-expert-capacity-factor` and `--moe-pad-expert-input-to-capacity` are not set, the resulting dynamic shapes make MoE layers uncapturable. In such cases, you can still leverage CUDA Graphs for the attention layers (operations in `TransformerLayer._forward_attention()`) by setting `--cuda-graph-scope=attn`, while leaving the MoE layers (operations in `TransformerLayer._forward_mlp()`) unmodified. See the argument description for more usage of `--cuda-graph-scope`.
+
+## MoE Arguments Reference
+### Core Arguments
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --num-experts | Number of Experts in MoE | None |
+| --expert-model-parallel-size | Degree of expert model parallelism | 1 |
+| --moe-ffn-hidden-size | MoE FFN hidden size | FFN hidden size of the dense model |
+| --expert-tensor-parallel-size | Expert layer tensor parallelism | Same as TP(Recommeded to set to 1 for fine-grained MoE models) |
+| --moe-layer-freq | MoE layer frequency pattern | 1 |
+
+### Router Arguments
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-router-load-balancing-type | Load balancing: aux_loss, sinkhorn, seq_aux_loss, none | aux_loss |
+| --moe-router-topk | Number of experts per token | 2 |
+| --moe-router-score-function | Score function: softmax, sigmoid | softmax |
+| --moe-router-pre-softmax | Softmax before top-k | False |
+| --moe-router-num-groups | Groups for group-limited routing | None |
+| --moe-router-group-topk | Selected groups in group-limited routing | None |
+| --moe-router-enable-expert-bias | Dynamic per-expert bias | False |
+| --moe-router-bias-update-rate | Bias update rate | 1e-3 |
+| --moe-router-fusion | Enable router fusion | False |
+| --moe-router-dtype | Router precision: fp32, fp64 | None |
+| --moe-router-padding-for-fp8 | Pad for FP8 alignment | False |
+
+### Loss and Regularization
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-aux-loss-coeff | Auxiliary loss coefficient | 0.0 |
+| --moe-z-loss-coeff | Z-loss coefficient | None |
+| --moe-input-jitter-eps | Input jitter epsilon | None |
+
+### Token Dispatching
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-token-dispatcher-type | Dispatcher: allgather, alltoall, flex | allgather |
+| --moe-enable-deepep | Enable DeepEP (with flex) | False |
+| --moe-expert-capacity-factor | Capacity factor | None |
+| --moe-pad-expert-input-to-capacity | Pad to capacity | False |
+| --moe-token-drop-policy | Drop policy: probs, position | probs |
+| --moe-permute-fusion | Fuse permutation ops | False |
+
+### Performance Optimization
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-grouped-gemm | Use GroupedGEMM | False |
+| --overlap-moe-expert-parallel-comm | Batch-level EP overlap | False |
+| --delay-wgrad-compute | Split dgrad/wgrad compute | False |
+| --moe-shared-expert-intermediate-size | Shared expert FFN size | None |
+| --moe-shared-expert-overlap | Overlap shared expert | False |
+
+### Memory and Checkpointing
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-layer-recompute | Recompute MoE layer | False |
+| --moe-use-upcycling | Enable upcycling | False |
+| --moe-upcycling-granularity | Upcycling granularity | 1 |
+
+### Miscellaneous
+| Argument | Description | Default |
+|----------|-------------|---------|
+| --moe-per-layer-logging | Per-layer logging | False |
+| --moe-router-force-load-balancing | Force load balancing (experimental) | False |
+
+## Examples
 ```bash
 #!/bin/bash
 
 # Runs Mixtral 8x7B model on 32 H100/A100 GPUs
-# The Dropless MoE suffers from an imbalanced token distribution at the early stage of training (the first few hundred iterations), which may lead to poor performance and out-of-memory (OOM) issues.
-# To check the performance of a Dropless MoE model, we should run the model for at least 500 iterations or resume from trained checkpoints.
 
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 
 GPUS_PER_NODE=8
-# Change for multinode config
 MASTER_ADDR=${MASTER_ADDR:-"localhost"}
 MASTER_PORT=${MASTER_PORT:-"6000"}
-NNODES=${NNODES:-"1"}
+NNODES=${NNODES:-"4"}
 NODE_RANK=${RANK:-"0"}
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
@@ -319,11 +623,12 @@ MODEL_ARGS=(
 MOE_ARGS=(
     --num-experts 8
     --expert-model-parallel-size 8
-    --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, None. Default is aux_loss.
+    --moe-router-load-balancing-type aux_loss
     --moe-router-topk 2
     --moe-aux-loss-coeff 1e-2
     --moe-grouped-gemm
     --moe-permute-fusion
+    --moe-token-dispatcher-type alltoall
 )
 
 DATA_ARGS=(
@@ -358,24 +663,17 @@ MODEL_PARALLEL_ARGS=(
 )
 
 LOGGING_ARGS=(
-    --log-interval 1 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH \
-    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
-    --no-load-optim \
-    --no-load-rng
+    --log-interval 1
+    --save-interval 10000
+    --eval-interval 1000
+    --eval-iters 10
+    --save $CHECKPOINT_PATH
+    --load $CHECKPOINT_PATH
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
+    --ckpt-format torch_dist
+    --auto-detect-ckpt-format
 )
 
-if [ -n "${WANDB_API_KEY}" ]; then
-    LOGGING_ARGS+=(
-        --wandb-project ${WANDB_PROJECT:-"Mixtral-Finetuning"}
-        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} 
-    )
-fi
-
 torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
     ${MODEL_ARGS[@]} \
     ${MOE_ARGS[@]} \
@@ -384,107 +682,36 @@ torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
     ${MODEL_PARALLEL_ARGS[@]} \
     ${LOGGING_ARGS[@]}
 ```
+
 </details>
 
-# Performance Best Practice
+## Contributing
 
-### Tuning Guide of Parallel Mappings
+We welcome contributions! Please see [CONTRIBUTING.md](https://github.com/NVIDIA/Megatron-LM/blob/main/CONTRIBUTING.md) for guidelines.
 
-To find a good parallel mapping that help you achieve a high throughput of a new model, there are some general rule that could help. Here is an overview of properties in different aspects for each parallel strategy.
+## Support
 
-| Parallel Strategy | Peak Activation Memory          | Weight Memory  | Optimizer states                  | Communication (Per-Layer) |
-|:-----------------:|:-------------------------------:|:--------------:|:---------------------------------:|:-------------------------:|
-| TP                | 1/N (with SP on)                | 1/N            | 1/N                               |        High               |
-| EP                | 1                               | 1/N in MoELayer| 1/N                               |       Medium              |
-| PP                | 1 (>1 with virtual pipeline)    | 1/N            | 1/N                               |       Medium              |
-| CP                | 1/N                             | 1              | 1/N (with distributed optimizer)  |       Medium              |
-| DP                | 1                               | 1              | 1/N (with distributed optimizer)  |        Low                |
+- GitHub Issues: [Report bugs or request features](https://github.com/NVIDIA/Megatron-LM/issues)
+- Documentation: [Full documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html)
 
-For a specific model, the best parallel mapping varies based on the model architecture, trained sequence length and the hardware platform.
-Here we provide some general rules to get better performance:
-1. Keep the model parallism size as small as possible. 
-    - For the large language models, model parallism is often required to prevent OOM, but it will bring communication overhead and hurt performance. 
-    - With distributed optimizer, master weights and optimizer states will be sharded across all DP ranks with slight communication overhead.
-    So try to reduce the model parallism size and increase data parallism size when there are lots of free GPU memory during training.
-2. Ensure the EPxTP communication winthin the NVLink domain.
-    - Communications of EP and TP should remain within the NVLink domain as much as possible, as both are communication-intensive.
-    - If the model is too large and requires scaling across multiple nodes, consider PP before TP and EP. See item 3 for details.
-3. Use Pipeline Parallelism to scale the model further.
-    - Enable Virtual Pipeline Parallelism(VPP) to reduce pp bubbles when PP_size >= 2 by setting `num_layers_per_virtual_pipeline_stage`.
-    - VPP_size tuning: the legal values of vpp_size are all common divisors of num_layers/pp_size, E.g., num_layers=24, pp_size=4, then we can pick vpp_size from {1, 2, 3, 6}. The larger the vpp_size, the lower the pipeline bubbles, while the larger number of P2P communications between each PP stages. Empirically a value in the middle often gives the best trade-off. `VPP_size=num_layers / PP_size / num_layers_per_virtual_pipeline_stage`
-4. Prefer EP over TP for the expert layer when possible:
-    - TP saves more memory than EP, but EP can achieve better GEMM efficiency and less communication overhead than TP.
-    - If EP size increased to the number of expert, the local token permutation/un-permutation for experts computation are omitted.
-    - Simplify the computation graph of MoE layers, more convenient for performing potential comm-computation overlapping.
-    - In practice, EP8TP1 is better than EP4TP2 for 8x7B.
-5. Enable Context Parallelism for long context training.
-    - The efficiency of CP largely depends on whether its communication can be overlapped with computation. 
-    - Empirically, use CP when sequence length >= 8K.
 
-### MoE Parallel Folding
+## Citation
 
-MoE Parallel Folding separates the MoE related parallel groups from Dense groups.
-1. Traditional MoE parallel groups are entangled with dense by using a 5-dimension parallel group generator with default order `tp-cp-ep-dp-pp`. The EP group in MoE is a sub-group of DP in Attention.
-2. With MoE Parallel Folding, we use a parallel group generator with `tp-cp-dp-pp` for Attention, and another with `tp-ep-dp-pp` for MoE. The EPxTP group in MoE is a sub-group of DPxCPxTP in Attention.
-
-By setting `--expert-tensor-parallel-size`, we can set MoE-specific TP size.
-
-#### Advantages of MoE Parallel Folding
-1. The CP and EP group are folded together by defualt, such that:
-    1. It reduces the minimal required GPUs to turn on both CP and EP. For example, the traditional way with (CP=8, EP=8) needs at least 64 GPUs, for now it only requires 8 GPUs.
-    2. The CP and EP communication can be both put in the NVLink domain.
-2. We can set different TP sizes for Attention and MoE part.
-    1. For MoE, EP is often more efficient than TP. But in the traditional way, only using EP can get OOM for most models.
-    2. With MoE parallel folding, we can turn on TP for Attention part and setting TP=1 for MoE models, which often gets better MFU.
-
-### End-to-End Training Practice
-**Use the latest NVIDIA PyTorch or NeMo Docker Image**
-- [NGC PyTorch Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
-- [NGC NeMo Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)
-
-**Token Dispatcher Choices**
-- Token Dispatcher sends tokens to the designated expert, involves tensor rearangement and communications.
-- Dispatcher `allgather` is the default option. It achieves better performance and efficiency when only tensor parallelism is used or when the Top-k value is very large.
-- Dispatcher `alltoall` is recommended if expert parallelism is applied.
-- Dispatcher `flex` is a new dispatcher decouples communication group from model parallelism. It supports two backends(DeepEP and HybridEP) selectable via `--moe-flex-dispatcher-backend`.
-
-**Enable Communication Overlap**
-- Enable `--overlap-param-gather` and `--overlap-grad-reduce` with distributed optimizer.
-- Enable `--tp-comm-overlap` when TP>1.
-- Enable p2p comm overlap when PP > 1 by setting `num_layers_per_virtual_pipeline_stage`.
-
-**Enable GroupedGEMM when num_local_experts>1 with `--moe-grouped-gemm`**
-- GroupedGEMM has higher efficiency than vanilla sequential GEMMs for each expert.
-- Recommend to use the TE version of Grouped GEMM (by upgrading to MCore v0.8 and TE v1.9), which support Gradient Accumulation Fusion and FP8 Training.
-
-**OOM Caused by Token Distribution Imbalance when Training From Scratch**  
-MoE suffers from a severe load imbalance issue when the router is under-trained, leading to the model easily running out of memory (OOM), which typically occurs in the first 100~300 steps when training from scratch. 
-Therefore, there are two recommended ways during the first 200 steps to avoid the OOM problem, which can be removed after the token distribution is more stable:
-1. Increase the `expert-tensor-parallel-size` and decrease `expert-model-parallel-size` to replace EP with TP in MoELayer, this can prevent the load imbalancing between EP ranks. Since current ETP implementation has some memeory overhead, you can further enable activation recomputation only for MoE Layer by adding `--moe-layer-recompute`.
-2. Setting capacity factor to a relatively small number like 1.0 by adding `--moe-token-capacity-factor 1.0`.
-
-**Leverage DeepSeek's DeepEP for High-Performance Cross-Node Token Dispatching**
-- The primary advantage of DeepEP is its cross-node token communication efficiency, which delivers substantial performance improvements when deploying expert parallelism across multiple nodes with large TopK values.
-- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-enable-deepep` in your command line arguments.
-
-**FP8 Training Best Practice**
-- Using latest version of [TransformerEngine](https://github.com/NVIDIA/TransformerEngine).
-- Enable router padding with `--moe-router-padding-for-quantization` to reduce padding overhead.
-- Enable native FP8 weights with `--fp8-param-gather` to reduce weights memory cost.
-
-### Reference Best Parallel Mapping
-
-Here are the reference parallel mappings of MCore v0.8 for Mixtral 8x7B and 8x22B models:
-|        Model            | Vocab Size| Dispatcher | Precision | #GPUs | SEQ LEN | TP | EP | PP | VP | MBS | GBS |
-|:-----------------------:|:---------:|:----------:|:---------:|:-----:|:-------:|:--:|:--:|:--:|:--:|:---:|:---:|
-| Mixtral 8x7B(Dropless)  |   32K     | All-to-All | BF16      | 64    | 4096    | 1  | 8  | 4  | 8  | 1   | 256 |
-| Mixtral 8x22B(Dropless) |   32K     | All-to-All | BF16      | 128   | 4096    | 4  | 2  | 8  | 7  | 1   | 256 |
-
-Detailed Benchmark Information:  
-Server:
-- 8xH100 80GB HBM3 
-- NVLink 4th Generation
-- InfiniBand 8x400 Gbit/s
-
-Docker Image:
-- PyTorch 24.09 with TransformerEngine v1.11
+If you use Megatron-Core MoE in your research, please cite:
+
+```bibtex
+
+@article{megatron-lm,
+  title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
+  author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
+  journal={arXiv preprint arXiv:1909.08053},
+  year={2019}
+}
+
+@article{moe-parallel-folding,
+    title={MoE Parallel Folding: Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training with Megatron Core}, 
+    author={Liu, Dennis and Yan, Zijie and Yao, Xin and Liu, Tong and Korthikanti, Vijay and Wu, Evan and Fan, Shiqing and Deng, Gao and Bai, Hongxiao and Chang, Jianbin and Aithal, Ashwath and Andersch, Michael and Shoeybi, Mohammad and Yao, Jiajie and Zhou, Chandler and Wu, David and Li, Xipeng and Yang, June},
+    year={2025},
+    journal={arXiv preprint arXiv:2504.14960},
+}
+```
diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index 68a3d53d2be..62fb7a148c8 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import copy
-import itertools
+import logging
 from copy import deepcopy
-from functools import partial, wraps
+from functools import partial
 from math import ceil
 from typing import Optional, Tuple
 
@@ -11,7 +11,7 @@
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
-from megatron.core import parallel_state, tensor_parallel
+from megatron.core import tensor_parallel
 from megatron.core.activations import squared_relu
 from megatron.core.dist_checkpointing import ShardedTensor
 from megatron.core.dist_checkpointing.mapping import (
@@ -25,6 +25,9 @@
 from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl
 from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl
 from megatron.core.jit import jit_fuser
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.tensor_parallel.layers import (
     _initialize_affine_weight_cpu,
     _initialize_affine_weight_gpu,
@@ -40,6 +43,7 @@
 from megatron.core.transformer.spec_utils import build_module
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
     make_sharded_object_for_checkpoint,
     sharded_state_dict_default,
 )
@@ -55,48 +59,7 @@
 
     HAVE_TE = False
 
-
-# TODO(Hepteract): delete the usage of the global parallel_state.
-# Currently we still have to use the global parallel_state in expert_dist_ckpt_decorator(),
-# in order to set sub-module's process group while getting sharded_state_dict.
-# After sub-module's refactoring is done, we can pass pg_collection to sub-module
-# and delete the function expert_dist_ckpt_decorator.
-def expert_dist_ckpt_decorator(func):
-    """Decorator of shared_state_dict in expert layer for distributed checkpoint.
-
-    Since !1940, the TP size for Expert layer can be different with Attention.
-    To make distributed checkpoint work in such cases, we use a decorator to
-    replace the default TP parallel states with expert-TP parallel states.
-    """
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        # Store original states
-        original_rank = parallel_state._MPU_TENSOR_MODEL_PARALLEL_RANK
-        original_size = parallel_state._MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-        original_group = parallel_state._TENSOR_MODEL_PARALLEL_GROUP
-        try:
-            # Set new states
-            parallel_state._MPU_TENSOR_MODEL_PARALLEL_RANK = (
-                parallel_state.get_expert_tensor_parallel_rank()
-            )
-            parallel_state._MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = (
-                parallel_state.get_expert_tensor_parallel_world_size()
-            )
-            parallel_state._TENSOR_MODEL_PARALLEL_GROUP = (
-                parallel_state.get_expert_tensor_parallel_group()
-            )
-
-            # Execute the function
-            result = func(*args, **kwargs)
-        finally:
-            # Restore original states
-            parallel_state._MPU_TENSOR_MODEL_PARALLEL_RANK = original_rank
-            parallel_state._MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = original_size
-            parallel_state._TENSOR_MODEL_PARALLEL_GROUP = original_group
-        return result
-
-    return wrapper
+logger = logging.getLogger(__name__)
 
 
 class GroupedMLP(MegatronModule):
@@ -105,6 +68,7 @@ class GroupedMLP(MegatronModule):
     Executes multiple experts in parallel to maximize computational efficiency.
     """
 
+    # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection.
     def __init__(
         self,
         num_local_experts: int,
@@ -118,6 +82,9 @@ def __init__(
         assert (
             config.add_bias_linear == False
         ), "bias not supported in Grouped GEMM yet, please set '--disable-bias-linear' instead."
+        assert (
+            config.moe_latent_size is None
+        ), "MoE latent projection not supported in GroupedMLP yet."
 
         self.expert_parallel = config.expert_model_parallel_size > 1
         if self.config.gated_linear_unit:
@@ -255,6 +222,7 @@ def forward(
         permuted_probs: torch.Tensor,
     ):
         """Forward step of the GroupedMLP."""
+        assert self.config.bf16, "Currently GroupedMLP for MoE only supports bf16."
         if self.activation_recompute:
             self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()
 
@@ -309,7 +277,6 @@ def forward(
 
         return fc2_output, None
 
-    @expert_dist_ckpt_decorator
     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
         """
         Maps local expert to global experts.
@@ -458,186 +425,7 @@ def sh_ten_build_fn(
                             replica_id=replica_id,
                             prepend_axis_num=prepend_axis_num,
                         )
-            else:
-                if singleton_local_shards:
-                    raise NotImplementedError(
-                        'flattened_range not supported for'
-                        ' GroupedMLP with singleton_local_shards'
-                    )
-                # flattened optmizer states
-                # the non-flattened weight shape is [local_expert_num, hidden_size, ffn_size]
-                #
-                # For the case without GLU, it is straightforward, we just need to split each
-                # expert along the dim-0.
-                #
-                # For the case with GLU, we need to split the experts along dim-0 and split the
-                # two tensors for GLU along dim-2.
-                # To split along the non-first dim, we need to chunk the tensor into small pieces,
-                # since they belong to different tenors and are interleaved in the flattened space.
-                # Refer to the below sketch graph.
-                # |................|           |........|........|
-                # |............FFFF|           |........|....BBBB|
-                # |FFFFFFFFFFFFFFFF|     ->    |AAAAAAAA|BBBBBBBB|
-                # |FFFFFFFFFFFFFFFF|           |AAAAAAAA|BBBBBBBB|
-                # |FF..............|           |AA......|........|
-                # |................|           |........|........|
-                #
-                # But too many chunks have severe performance issues. We merge these chunks during
-                # the save process along with some length information and recover them during the
-                # load process.
-                assert t.ndim == 1, (key, t.shape)
-                if with_glu:
-                    non_flat_local_shape = (1, self.config.hidden_size, local_ffn_dim_size)
-                    chunk_numel = local_ffn_dim_size
-                    sub_states = []
-                    start_pos = 0
-                    for local_expert_idx in range(self.num_local_experts):
-                        first_glu_idx = -1
-                        w_start_range = -1
-                        v_start_range = -1
-                        w_tensors = []
-                        v_tensors = []
-                        w_lens = []
-                        v_lens = []
-                        expert_global_idx = local_expert_indices_offset + local_expert_idx
-                        for input_dim_idx in range(self.config.hidden_size):
-                            for glu_idx in range(2):
-                                local_idx = (
-                                    local_expert_idx * self.config.hidden_size * 2
-                                    + input_dim_idx * 2
-                                    + glu_idx
-                                )
-                                if (
-                                    flattened_range.start < chunk_numel * (local_idx + 1)
-                                    and flattened_range.stop > chunk_numel * local_idx
-                                ):
-                                    if first_glu_idx == -1:
-                                        first_glu_idx = glu_idx
-                                    end_pos = min(
-                                        flattened_range.stop,
-                                        chunk_numel * (local_idx + 1) - flattened_range.start,
-                                    )
-                                    local_tensor = t[start_pos:end_pos]
-                                    local_flattened_range = slice(
-                                        max(0, flattened_range.start - chunk_numel * local_idx),
-                                        min(
-                                            chunk_numel,
-                                            flattened_range.stop - chunk_numel * local_idx,
-                                        ),
-                                    )
-                                    assert (
-                                        len(local_tensor)
-                                        == local_flattened_range.stop - local_flattened_range.start
-                                    )
-                                    start_pos += len(local_tensor)
-                                    if glu_idx == 0:
-                                        w_tensors.append(local_tensor)
-                                        w_lens.append(len(local_tensor))
-                                        if w_start_range == -1:
-                                            w_start_range = max(
-                                                0, flattened_range.start - chunk_numel * local_idx
-                                            )
-                                    else:
-                                        v_tensors.append(local_tensor)
-                                        v_lens.append(len(local_tensor))
-                                        if v_start_range == -1:
-                                            v_start_range = max(
-                                                0, flattened_range.start - chunk_numel * local_idx
-                                            )
-                        sub_states.append(
-                            {
-                                'w_tensors': ShardedTensor.from_rank_offsets_flat(
-                                    key,
-                                    (
-                                        torch.cat(w_tensors, -1)
-                                        if len(w_tensors) > 0
-                                        else torch.Tensor()
-                                    ),
-                                    non_flat_local_shape,
-                                    *sharded_offsets,
-                                    (
-                                        prepend_axis_num,
-                                        expert_global_idx,  # pylint: disable=E0606
-                                        num_global_experts,
-                                    ),
-                                    (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size * 2),
-                                    replica_id=replica_id,
-                                    prepend_axis_num=prepend_axis_num,
-                                    flattened_range=slice(
-                                        w_start_range, w_start_range + sum(w_lens)
-                                    ),
-                                ),
-                                'w_lens': LocalNonpersistentObject(w_lens),
-                                'v_tensors': ShardedTensor.from_rank_offsets_flat(
-                                    key,
-                                    (
-                                        torch.cat(v_tensors, -1)
-                                        if len(v_tensors) > 0
-                                        else torch.Tensor()
-                                    ),
-                                    non_flat_local_shape,
-                                    *sharded_offsets,
-                                    (prepend_axis_num, expert_global_idx, num_global_experts),
-                                    (
-                                        prepend_axis_num + 1 + tp_axis,
-                                        tp_rank + tp_size,
-                                        tp_size * 2,
-                                    ),
-                                    replica_id=replica_id,
-                                    prepend_axis_num=prepend_axis_num,
-                                    flattened_range=slice(
-                                        v_start_range, v_start_range + sum(v_lens)
-                                    ),
-                                ),
-                                'v_lens': LocalNonpersistentObject(v_lens),
-                                'first_glu_idx': LocalNonpersistentObject(first_glu_idx),
-                            }
-                        )
-                else:
-                    non_flat_local_shape = (
-                        real_shape[0] // self.num_local_experts,
-                        *real_shape[1:],
-                    )
-                    chunk_numel = local_ffn_dim_size * self.config.hidden_size
-                    sub_states = []
-                    start_pos = 0
-                    for local_expert_idx in range(self.num_local_experts):
-                        if (
-                            flattened_range.start < chunk_numel * (local_expert_idx + 1)
-                            and flattened_range.stop > chunk_numel * local_expert_idx
-                        ):
-                            end_pos = min(
-                                flattened_range.stop,
-                                chunk_numel * (local_expert_idx + 1) - flattened_range.start,
-                            )
-                            local_tensor = t[start_pos:end_pos]
-                            local_flattened_range = slice(
-                                max(0, flattened_range.start - chunk_numel * local_expert_idx),
-                                min(
-                                    chunk_numel,
-                                    flattened_range.stop - chunk_numel * local_expert_idx,
-                                ),
-                            )
-                            assert (
-                                len(local_tensor)
-                                == local_flattened_range.stop - local_flattened_range.start
-                            )
-                            start_pos += len(local_tensor)
-                            expert_global_idx = local_expert_indices_offset + local_expert_idx
-                            sub_states.append(
-                                ShardedTensor.from_rank_offsets_flat(
-                                    key,
-                                    local_tensor,
-                                    non_flat_local_shape,
-                                    *sharded_offsets,
-                                    (prepend_axis_num, expert_global_idx, num_global_experts),
-                                    (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size),
-                                    replica_id=replica_id,
-                                    prepend_axis_num=prepend_axis_num,
-                                    flattened_range=local_flattened_range,
-                                )
-                            )
-            return sub_states
+            return sub_states  # pylint: disable=possibly-used-before-assignment
 
         @torch.no_grad()
         def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool):
@@ -650,48 +438,24 @@ def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool):
                 assert with_glu == False
             else:
                 raise ValueError("tp_axis should be 0 or 1.")
-            if isinstance(sub_state_dict, list) and isinstance(sub_state_dict[0], dict):
-                # flattened tensor with glu
-                res = []
-                for local_expert_dict in sub_state_dict:
-                    w_tensors = torch.split(
-                        local_expert_dict['w_tensors'], local_expert_dict['w_lens']
-                    )
-                    v_tensors = torch.split(
-                        local_expert_dict['v_tensors'], local_expert_dict['v_lens']
+            if isinstance(sub_state_dict, dict):
+                assert sub_state_dict['singleton_local_shards']
+                if with_glu:
+                    assert isinstance(sub_state_dict['data'], dict)
+                    sub_state_dict = torch.cat(
+                        (
+                            torch.stack(sub_state_dict['data']['w']),
+                            torch.stack(sub_state_dict['data']['v']),
+                        ),
+                        dim=-2,
                     )
-                    first_glu_idx = local_expert_dict['first_glu_idx']
-                    if first_glu_idx == 0:
-                        res += [
-                            x for x in itertools.chain(*itertools.zip_longest(w_tensors, v_tensors))
-                        ]
-                    else:
-                        res += [
-                            x for x in itertools.chain(*itertools.zip_longest(v_tensors, w_tensors))
-                        ]
-                return torch.cat(res)
-            elif isinstance(sub_state_dict, list) and sub_state_dict[0].ndim == 1:
-                # flattened tensor without glu
-                return torch.cat(sub_state_dict)
-            else:
-                if isinstance(sub_state_dict, dict):
-                    assert sub_state_dict['singleton_local_shards']
-                    if with_glu:
-                        assert isinstance(sub_state_dict['data'], dict)
-                        sub_state_dict = torch.cat(
-                            (
-                                torch.stack(sub_state_dict['data']['w']),
-                                torch.stack(sub_state_dict['data']['v']),
-                            ),
-                            dim=-2,
-                        )
-                    else:
-                        assert isinstance(sub_state_dict['data'], list)
-                        sub_state_dict = torch.stack(sub_state_dict['data'])
                 else:
-                    if with_glu:
-                        sub_state_dict = torch.cat(sub_state_dict, -2)
-                return sub_state_dict.transpose(-1, -2).reshape(weight_shape)
+                    assert isinstance(sub_state_dict['data'], list)
+                    sub_state_dict = torch.stack(sub_state_dict['data'])
+            else:
+                if with_glu:
+                    sub_state_dict = torch.cat(sub_state_dict, -2)
+            return sub_state_dict.transpose(-1, -2).reshape(weight_shape)
 
         state_dict = self.state_dict(prefix='', keep_vars=True)
         for name, tensor in state_dict.items():
@@ -705,7 +469,6 @@ def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool):
                 wkey = f'{prefix}experts.linear_fc2.weight'
 
             this_replica_id = list(copy.deepcopy(replica_id))
-            flattened_range = None
 
             sharded_state_dict[f'{prefix}{name}'] = ShardedTensorFactory(
                 wkey,
@@ -713,7 +476,6 @@ def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool):
                 partial(sh_ten_build_fn, tp_axis=tp_axis, with_glu=with_glu),
                 partial(sh_ten_merge_fn, tp_axis=tp_axis, with_glu=with_glu),
                 tuple(this_replica_id),
-                flattened_range=flattened_range,
             )
 
         replica_id = (0, tp_rank, dp_rank)
@@ -753,6 +515,7 @@ class TEGroupedMLP(MegatronModule):
     Executes multiple experts in parallel to maximize computational efficiency.
     """
 
+    # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection.
     def __init__(
         self,
         num_local_experts,
@@ -768,17 +531,17 @@ def __init__(
         ), "bias_dropout_fusion is not supported in TEGroupedMLP when add_bias_linear=True"
 
         self.ep_group = pg_collection.ep
+        self.tp_group = pg_collection.expt_tp
 
         # Double the output width with gated linear unit, see https://arxiv.org/pdf/2002.05202.pdf
         ffn_hidden_size = self.config.moe_ffn_hidden_size
         if self.config.gated_linear_unit:
             ffn_hidden_size *= 2
 
-        # TODO(Hepteract): pass pg_collection to submodule after refactoring Linear modules
         self.linear_fc1 = build_module(
             submodules.linear_fc1,
             self.num_local_experts,
-            self.input_size,
+            self.input_size if self.config.moe_latent_size is None else self.config.moe_latent_size,
             ffn_hidden_size,
             config=self.config,
             init_method=self.config.init_method,
@@ -786,7 +549,7 @@ def __init__(
             skip_bias_add=False,
             is_expert=True,
             tp_comm_buffer_name='fc1',
-            tp_group=pg_collection.expt_tp,
+            pg_collection=pg_collection,
         )
 
         if self.config.use_te_activation_func and not (submodules.activation_func is None):
@@ -794,19 +557,32 @@ def __init__(
         else:
             self.activation_func = self.config.activation_func
 
-        # TODO(Hepteract): pass pg_collection to submodule after refactoring Linear modules
         self.linear_fc2 = build_module(
             submodules.linear_fc2,
             self.num_local_experts,
             self.config.moe_ffn_hidden_size,
-            self.config.hidden_size,
+            (
+                self.config.hidden_size
+                if self.config.moe_latent_size is None
+                else self.config.moe_latent_size
+            ),
             config=self.config,
             init_method=self.config.output_layer_init_method,
             bias=self.config.add_bias_linear,
             skip_bias_add=True,
             is_expert=True,
             tp_comm_buffer_name='fc2',
-            tp_group=pg_collection.expt_tp,
+            pg_collection=pg_collection,
+        )
+
+        self.offload_expert_fc1 = (
+            self.config.fine_grained_activation_offloading
+            and "expert_fc1" in self.config.offload_modules
+        )
+
+        self.offload_moe_act = (
+            self.config.fine_grained_activation_offloading
+            and "moe_act" in self.config.offload_modules
         )
 
         self.activation_recompute = (
@@ -818,6 +594,12 @@ def __init__(
 
             set_save_original_input(self.linear_fc2)
 
+        # This is to avoid the CPU overhead of multiple d2h copies
+        if self.offload_expert_fc1:
+            from megatron.core.extensions.transformer_engine import set_save_original_input
+
+            set_save_original_input(self.linear_fc1)
+
         if self.config.fp8 or self.config.fp4:
             assert HAVE_TE, "FP8 and FP4 requires TE."
             self.quantization_padding = Fp8Padding(self.num_local_experts)
@@ -882,9 +664,18 @@ def forward(
             # Probs already applied, so reset to 1.
             permuted_probs = torch.ones_like(permuted_probs)
 
-        intermediate_parallel, bias_parallel = self.linear_fc1(
-            permuted_local_hidden_states, tokens_per_expert
-        )
+        with off_interface(
+            self.offload_expert_fc1, permuted_local_hidden_states, "expert_fc1"
+        ) as permuted_local_hidden_states:
+            fc1_output, bias_parallel = self.linear_fc1(
+                permuted_local_hidden_states, tokens_per_expert
+            )
+        if self.offload_expert_fc1:
+            fc1_output = off_interface.group_commit(
+                fc1_output,
+                name="expert_fc1",
+                forced_released_tensors=[permuted_local_hidden_states],
+            )
 
         def bias_act_func(intermediate_parallel, bias_parallel, permuted_probs):
             if self.config.use_te_activation_func:
@@ -946,27 +737,34 @@ def glu(x):
 
         if self.activation_recompute:
             self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            intermediate_parallel = self.activation_checkpoint.checkpoint(
-                bias_act_func, intermediate_parallel, bias_parallel, permuted_probs
-            )
-            output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert)
-            self.activation_checkpoint.discard_output_and_register_recompute(output)
+            with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output:
+                bias_act_output = self.activation_checkpoint.checkpoint(
+                    bias_act_func, fc1_output, bias_parallel, permuted_probs
+                )
         else:
-            intermediate_parallel = bias_act_func(
-                intermediate_parallel, bias_parallel, permuted_probs
+            with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output:
+                bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs)
+
+        output, output_bias = self.linear_fc2(bias_act_output, tokens_per_expert)
+        if self.activation_recompute:
+            self.activation_checkpoint.discard_output_and_register_recompute(output)
+
+        # Delay the offload of the moe act until after the linear_fc2 has been computed
+        # to make sure the fc1_output is reloaded to GPU before recomputing moe_act.
+        if self.offload_moe_act:
+            output = off_interface.group_commit(
+                output, name="moe_act", forced_released_tensors=[fc1_output]
             )
-            output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert)
+        output = self._apply_bias(output, output_bias, tokens_per_expert, permuted_probs)
 
         # upad and concat the output
         if self.config.fp8 or self.config.fp4:
             output = self.quantization_unpadding(output, actual_tokens_per_expert)
 
-        output = self._apply_bias(output, output_bias, tokens_per_expert, permuted_probs)
         output_bias = None
 
         return output, output_bias
 
-    @expert_dist_ckpt_decorator
     def sharded_state_dict(
         self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None
     ) -> ShardedStateDict:
@@ -974,10 +772,14 @@ def sharded_state_dict(
         Maps local expert to global experts.
         The sharded state dict is interchangable with SequentialMLP's.
         """
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
         singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
         sharded_state_dict = {}
         for name, module in self._modules.items():
-            sub_sd = sharded_state_dict_default(module, f'{name}.', sharded_offsets, metadata)
+            sub_sd = sharded_state_dict_default(
+                module, f'{name}.', sharded_offsets, metadata, tp_group=self.tp_group
+            )
             if name == 'linear_fc1' and self.config.gated_linear_unit:
                 num_global_experts = self.ep_group.size() * self.num_local_experts
                 local_expert_indices_offset = self.ep_group.rank() * self.num_local_experts
@@ -1021,6 +823,7 @@ class SequentialMLP(MegatronModule):
     This class executes each expert sequentially.
     """
 
+    # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection.
     def __init__(
         self,
         num_local_experts,
@@ -1041,6 +844,7 @@ def __init__(
         self.num_local_experts = num_local_experts
         self.local_experts = torch.nn.ModuleList()
         self.ep_group = pg_collection.ep
+        self.tp_group = pg_collection.expt_tp
         # use pg_collection.expt_dp_group as data parallel group in this module.
         # TODO (Hepteract): expt_dp wont be needed here once distributed checkpoint is refactored
         self.dp_group = pg_collection.expt_dp
@@ -1128,9 +932,11 @@ def backward_dw(self):
         for expert in self.local_experts:
             expert.backward_dw()
 
-    @expert_dist_ckpt_decorator
     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
         """Maps local expert to global experts."""
+        # Guard for cases metadata is not provided
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
+
         sharded_state_dict = {}
         num_global_experts = self.ep_group.size() * self.num_local_experts
         local_expert_indices_offset = self.ep_group.rank() * self.num_local_experts
diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py
index 60b0b11a32c..39f50a4a670 100644
--- a/megatron/core/transformer/moe/fused_a2a.py
+++ b/megatron/core/transformer/moe/fused_a2a.py
@@ -3,6 +3,7 @@
 # Copyright (c) 2025 DeepSeek
 # Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
 
+from megatron.core.utils import internal_api
 
 try:
     from deep_ep import Buffer
@@ -320,6 +321,14 @@ def init_hybrid_ep_buffer(
     )
 
 
+def reset_hybrid_ep_buffer():
+    '''
+    Reset the HybridEP buffer
+    '''
+    global _hybrid_ep_buffer
+    _hybrid_ep_buffer = None
+
+
 class HybridEPDispatch(torch.autograd.Function):
     '''
     Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend
@@ -335,7 +344,6 @@ def forward(
         num_local_experts,
         num_sms_dispatch_api=24,
         num_sms_combine_api=24,
-        num_dispatched_tokens=None,
         num_permuted_tokens=None,
         pad_multiple=None,
     ):
@@ -354,11 +362,9 @@ def forward(
                 num_sms_combine_api,
                 fp8_dispatch,
             )
-        # Defaultly, the output token_per_expert and num_dispatched_tokens_tensor
-        # will be put on the CPU to avoid the potential sync in combine/backward pass,
-        # but if we provide the num_dispatched_tokens and num_permuted_tokens on CPU,
-        # we do not need to the D2H here.
-        use_host_meta = num_dispatched_tokens is None or num_permuted_tokens is None
+        # If we provide the num_permuted_tokens, we do not need to use sync to
+        # wait for the data in pinned memory ready
+        non_blocking = num_permuted_tokens is not None
         # Process the dispatch
         (
             dispatched_hidden,
@@ -373,14 +379,12 @@ def forward(
             scaling_factor=None,
             num_of_experts_per_rank=num_local_experts,
             pad_multiple=pad_multiple,
-            num_dispatched_tokens=num_dispatched_tokens,
             num_permuted_tokens=num_permuted_tokens,
-            use_host_meta=use_host_meta,
+            non_blocking=non_blocking,
         )
 
         ctx.handle = handle
         ctx.pad_multiple = pad_multiple
-        ctx.num_dispatched_tokens = num_dispatched_tokens
         return (
             dispatched_hidden,
             dispatched_probs,
@@ -396,36 +400,27 @@ def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_exper
         '''
         handle = ctx.handle
         combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute(
-            hidden=grad_x,
-            probs=grad_probs,
-            handle=handle,
-            pad_multiple=ctx.pad_multiple,
-            num_dispatched_tokens=ctx.num_dispatched_tokens,
+            hidden=grad_x, probs=grad_probs, handle=handle, pad_multiple=ctx.pad_multiple
         )
         return combined_hidden, None, combined_probs, None, None, None, None, None, None, None
 
 
+@internal_api
 class HybridEPCombine(torch.autograd.Function):
     '''
     Fused combine operation for permute + combine a2a + permute using the HybridEP backend
     '''
 
     @staticmethod
-    def forward(
-        ctx, x, handle, num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None
-    ):
+    def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None):
         '''
         Forward pass of fused combine of the HybridEP backend
         '''
         combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute(
-            hidden=x,
-            handle=handle,
-            pad_multiple=pad_multiple,
-            num_dispatched_tokens=num_dispatched_tokens,
+            hidden=x, handle=handle, pad_multiple=pad_multiple
         )
         ctx.handle = handle
         ctx.pad_multiple = pad_multiple
-        ctx.num_dispatched_tokens = num_dispatched_tokens
         ctx.num_permuted_tokens = num_permuted_tokens
         return combined_hidden
 
@@ -440,7 +435,6 @@ def backward(ctx, grad_x):
             scaling_factor=None,
             handle=handle,
             pad_multiple=ctx.pad_multiple,
-            num_dispatched_tokens=ctx.num_dispatched_tokens,
             num_permuted_tokens=ctx.num_permuted_tokens,
         )
         return dispatched_hidden, None, None, None, None
@@ -448,6 +442,7 @@ def backward(ctx, grad_x):
 
 if HAVE_HYBRIDEP:
 
+    @internal_api
     def hybrid_ep_dispatch(
         x,
         routing_map,
@@ -456,7 +451,6 @@ def hybrid_ep_dispatch(
         num_local_experts,
         num_sms_dispatch_api=24,
         num_sms_combine_api=24,
-        num_dispatched_tokens=None,
         num_permuted_tokens=None,
         pad_multiple=None,
     ):
@@ -479,10 +473,6 @@ def hybrid_ep_dispatch(
                 Number of SMs used by the dispatch API.
             num_sms_combine_api (int):
                 Number of SMs used by the combine API.
-            num_dispatched_tokens (int):
-                Number of tokens after dispatch but before permute. HybridEP uses this
-                to allocate buffers. If not provided, HybridEP obtains the size from
-                a GPU tensor, which causes a D2H synchronization.
             num_permuted_tokens (int):
                 Number of tokens after permute. HybridEP uses this to allocate buffers.
                 If not provided, HybridEP obtains the size from a GPU tensor,
@@ -499,12 +489,12 @@ def hybrid_ep_dispatch(
             num_local_experts,
             num_sms_dispatch_api,
             num_sms_combine_api,
-            num_dispatched_tokens,
             num_permuted_tokens,
             pad_multiple,
         )
 
-    def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple):
+    @internal_api
+    def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple):
         '''
         Perform fused combine operation for unpermute + combine a2a + unpermute
         using the HybridEP backend
@@ -514,10 +504,6 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad
                 Input hidden states to combine
             handle (EventHandle):
                 Communication handle from dispatch operation
-            num_dispatched_tokens (int):
-                The number of tokens after unpermute but before combine. HybridEP uses this
-                to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor,
-                which causes a D2H synchronization.
             num_permuted_tokens (int): The number of tokens before unpermute. HybridEP uses this
                 to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor,
                 which causes a D2H synchronization.
@@ -525,9 +511,7 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad
                 The alignment multiple required for FP8 GEMM. If not provided, no padding
                 is performed.
         '''
-        return HybridEPCombine.apply(
-            x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple
-        )
+        return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple)
 
 else:
     hybrid_ep_dispatch = None
diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py
index e3de8220a54..5cfea1e8ae4 100644
--- a/megatron/core/transformer/moe/moe_layer.py
+++ b/megatron/core/transformer/moe/moe_layer.py
@@ -1,15 +1,22 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional, Protocol, Union
 
 import torch
 
 from megatron.core import parallel_state, tensor_parallel, utils
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.module import MegatronModule
-from megatron.core.transformer.moe.moe_utils import get_default_pg_collection
+from megatron.core.transformer.moe.moe_utils import (
+    MoECudaGraphPartialCaptureSignal,
+    MoECudaGraphTensorStore,
+    get_default_pg_collection,
+    maybe_skip_or_early_return_by_cudagraph,
+)
 from megatron.core.transformer.moe.router import TopKRouter
 from megatron.core.transformer.moe.token_dispatcher import (
     MoEAllGatherTokenDispatcher,
@@ -19,23 +26,53 @@
 )
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.typed_torch import apply_module
+from megatron.core.utils import internal_api
 
 try:
     import transformer_engine as te  # pylint: disable=unused-import
 
-    from megatron.core.extensions.transformer_engine import te_checkpoint
+    from megatron.core.extensions.transformer_engine import TELinear, te_checkpoint
 
     HAVE_TE = True
 except ImportError:
     HAVE_TE = False
 
 
+class RouterInterface(Protocol):
+    """Interface for the router used in an MoELayer."""
+
+    def forward(self, input: torch.Tensor, /) -> tuple[torch.Tensor, torch.Tensor]:
+        """Forward pass of the router.
+
+        Returns:
+            A tuple of (probabilities, routing_map).
+        """
+        ...
+
+    def set_layer_number(self, layer_number: int) -> None:
+        """Set the layer number for the router.
+
+        Called from transformer_layer during initialization.
+        """
+        ...
+
+
+class RouterBuilder(Protocol):
+    """Protocol for building a Router."""
+
+    def __call__(
+        self, /, *, config: TransformerConfig, pg_collection: ProcessGroupCollection | None
+    ) -> RouterInterface: ...
+
+
 @dataclass
 class MoESubmodules:
     """MoE Layer Submodule spec"""
 
     experts: Union[ModuleSpec, type] = None
     shared_experts: Union[ModuleSpec, type] = None
+    router: RouterBuilder = TopKRouter
 
 
 class BaseMoELayer(MegatronModule, ABC):
@@ -72,7 +109,7 @@ def __init__(
             local_expert_indices_offset + i for i in range(self.num_local_experts)
         ]
         assert all(map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices))
-        self.router: TopKRouter = None
+        self.router: RouterInterface = None
         self.experts = None
         self.shared_experts = None
         self.token_dispatcher: Optional[MoETokenDispatcher] = None
@@ -112,16 +149,48 @@ def __init__(
         super(MoELayer, self).__init__(
             config=config, layer_number=layer_number, pg_collection=pg_collection
         )
+        # If using mcore cudagraphs, recompute is handled by transformer_layer.MoETransformerLayer
         self.moe_layer_recompute = (
-            config.recompute_granularity == 'selective' and "moe" in config.recompute_modules
+            config.recompute_granularity == 'selective'
+            and "moe" in config.recompute_modules
+            and config.cuda_graph_impl != 'local'
         )
         self.shared_experts_recompute = (
             config.recompute_granularity == 'selective'
             and "shared_experts" in config.recompute_modules
         )
 
-        # Initialize router
-        self.router = TopKRouter(config=self.config, pg_collection=pg_collection)
+        self.tp_group = pg_collection.tp
+
+        # Initialize router.
+        self.router = submodules.router(config=self.config, pg_collection=pg_collection)
+        self.tp_group = pg_collection.tp
+
+        # Initialize latent projections.
+        if self.config.moe_latent_size:
+            assert HAVE_TE, "TransformerEngine is required for MoE latent projections."
+            self.fc1_latent_proj = TELinear(
+                self.config.hidden_size,
+                self.config.moe_latent_size,
+                parallel_mode="duplicated",
+                config=self.config,
+                init_method=self.config.init_method,
+                bias=self.config.add_bias_linear,
+                skip_bias_add=False,
+                skip_weight_param_allocation=False,
+                is_expert=False,
+            )
+            self.fc2_latent_proj = TELinear(
+                self.config.moe_latent_size,
+                self.config.hidden_size,
+                parallel_mode="duplicated",
+                config=self.config,
+                init_method=self.config.output_layer_init_method,
+                bias=self.config.add_bias_linear,
+                skip_bias_add=False,
+                skip_weight_param_allocation=False,
+                is_expert=False,
+            )
 
         # Initialize token dispatcher
         if config.moe_token_dispatcher_type == "allgather":
@@ -161,34 +230,58 @@ def __init__(
         # Initialize shared experts
         if self.use_shared_expert:
             self.shared_experts = build_module(
-                self.submodules.shared_experts, config=self.config, pg_collection=pg_collection
+                self.submodules.shared_experts,
+                config=self.config,
+                pg_collection=pg_collection,
+                gate=self.config.moe_shared_expert_gate,
             )
             if self.shared_expert_overlap:
                 self.token_dispatcher.set_shared_experts(self.shared_experts)
 
-    def router_and_preprocess(self, hidden_states: torch.Tensor):
-        """Compute and preprocess token routing for dispatch.
+        # Cudagraph tensor store for resuming the forward pass from the end of the cudagraph.
+        self.cudagraph_tensor_store = MoECudaGraphTensorStore()
+        self.fwd_execution_map = ["route", "expert_compute", "postprocess"]
+
+    @maybe_skip_or_early_return_by_cudagraph("route")
+    def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+        """Compute token routing for preprocessing.
 
         This method uses the router to determine which experts to send each token to,
-        producing routing probabilities and a mapping. It then preprocesses the
-        hidden states and probabilities for the token dispatcher. The original
-        hidden states are returned as a residual connection.
+        producing routing probabilities and a mapping.
+        """
+        probs, routing_map = apply_module(self.router)(hidden_states, padding_mask)
+        return probs, routing_map
+
+    @maybe_skip_or_early_return_by_cudagraph("preprocess")
+    def preprocess(
+        self, hidden_states: torch.Tensor, probs: torch.Tensor, routing_map: torch.Tensor
+    ):
+        """Preprocess token routing for dispatch.
+
+        This method preprocesses the hidden states and routing probabilities for the token
+        dispatcher.
         """
-        residual = hidden_states
-        probs, routing_map = self.router(hidden_states)
+        # Project the hidden_states from hidden dimension down to latent dimenion.
+        if self.config.moe_latent_size:
+            assert (
+                not self.shared_expert_overlap
+            ), "Shared expert overlap not supported when MoE latent projections are used."
+            hidden_states, _ = self.fc1_latent_proj(hidden_states)
         hidden_states, probs = self.token_dispatcher.dispatch_preprocess(
             hidden_states, routing_map, probs
         )
-        return hidden_states, probs, residual
+        return hidden_states, probs
 
     def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor):
         """Dispatches tokens to assigned expert ranks via communication.
+
         This method performs the actual communication (e.g., All-to-All) to distribute
         tokens and their associated probabilities to the devices hosting their assigned
         experts.
         """
         return self.token_dispatcher.token_dispatch(hidden_states, probs)
 
+    @maybe_skip_or_early_return_by_cudagraph("shared_experts_compute")
     def shared_experts_compute(self, hidden_states: torch.Tensor):
         """Computes the output of the shared experts.
 
@@ -216,9 +309,8 @@ def shared_experts_compute(self, hidden_states: torch.Tensor):
 
         return shared_expert_output
 
-    def routed_experts_compute(
-        self, hidden_states: torch.Tensor, probs: torch.Tensor, residual: torch.Tensor
-    ):
+    @internal_api
+    def routed_experts_compute(self, hidden_states: torch.Tensor, probs: torch.Tensor):
         """Computes the output of the routed experts on the dispatched tokens.
 
         This method first post-processes the dispatched input to get permuted tokens
@@ -234,20 +326,40 @@ def routed_experts_compute(
 
         return output, mlp_bias
 
-    def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Tensor]):
+    def combine(self, output: torch.Tensor):
         """Combines expert outputs via communication and adds shared expert output.
 
         This method uses the token dispatcher to combine the outputs from different
-        experts (e.g., via an All-to-All communication). It then adds the output
-        from the shared expert if it exists.
+        experts (e.g., via an All-to-All communication).
         """
         output = self.token_dispatcher.token_combine(output)
+        return output
+
+    def postprocess(self, output: torch.Tensor, shared_expert_output: Optional[torch.Tensor]):
+        """Project the output back from latent dimension to hidden dimension after combine
+        in latent dimension if needed. Combine expert output with shared_experts if needed."""
+
         output = self.token_dispatcher.combine_postprocess(output)
+        if self.config.moe_latent_size:
+            output, _ = self.fc2_latent_proj(output)
+
         if shared_expert_output is not None:
             output = output + shared_expert_output
         return output
 
-    def forward(self, hidden_states: torch.Tensor):
+    def router_and_preprocess(self, hidden_states: torch.Tensor):
+        """This method is a combined method of route and preprocess. Deprecated."""
+
+        probs, routing_map = self.route(hidden_states)
+        hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map)
+        return hidden_states, probs, residual
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        intermediate_tensors=None,
+        padding_mask: Optional[torch.Tensor] = None,
+    ):
         """Forward pass for the MoE layer.
 
         The forward pass comprises four main steps:
@@ -257,8 +369,10 @@ def forward(self, hidden_states: torch.Tensor):
         4. Combine: The outputs from the experts are combined and returned.
 
         Args:
-            hidden_states (torch.Tensor): The input tensor to the MoE layer.
-
+            hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size].
+            padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                                   Shape [seq_length, bsz]. True for valid tokens,
+                                                   False for padding tokens. Defaults to None.
         Returns:
             A tuple containing the output tensor and the MLP bias, if any.
         """
@@ -267,36 +381,79 @@ def forward(self, hidden_states: torch.Tensor):
                 "During training, performance may degrade if MoE and tensor parallelism"
                 "are enabled without also enabling sequence parallelism."
             )
+        # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states
+        if padding_mask is not None:
+            padding_mask = padding_mask.transpose(0, 1).bool()
 
         # MoE forward: route -> dispatch -> compute -> combine
-        def custom_forward(hidden_states):
-            shared_expert_output = self.shared_experts_compute(hidden_states)
-            hidden_states, probs, residual = self.router_and_preprocess(hidden_states)
-            dispatched_input, probs = self.dispatch(hidden_states, probs)
-            output, mlp_bias = self.routed_experts_compute(dispatched_input, probs, residual)
-            output = self.combine(output, shared_expert_output)
+        def custom_forward(hidden_states, intermediate_tensors=None, padding_mask=None):
+            try:
+                if "route" in self.fwd_execution_map:
+                    shared_expert_output = self.shared_experts_compute(hidden_states)
+                    probs, routing_map = self.route(hidden_states, padding_mask)
+                    hidden_states, probs = self.preprocess(hidden_states, probs, routing_map)
+
+                    if intermediate_tensors is not None:
+                        return hidden_states, probs, shared_expert_output
+
+            except MoECudaGraphPartialCaptureSignal as e:
+                # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator.
+                # It means we should early-return from the MoE layer forward pass.
+                # This happens when we are partially capturing the CUDA graph of the MoE layer,
+                # like cuda_graph_scope=["moe_router", "moe_preprocess"].
+                # We need to return the intermediate tensors as CUDA graph outputs.
+                return e.get_early_return_outputs(hidden_states, shared_expert_output)
+
+            if "expert_compute" in self.fwd_execution_map:
+                if intermediate_tensors is not None:
+                    hidden_states, probs = intermediate_tensors
+
+                dispatched_input, probs = self.dispatch(hidden_states, probs)
+                output, mlp_bias = self.routed_experts_compute(dispatched_input, probs)
+                assert (
+                    mlp_bias is None
+                ), f"mlp_bias is not supported for {type(self.token_dispatcher)}"
+                output = self.combine(output)
+
+                if intermediate_tensors is not None:
+                    return output, mlp_bias
+
+            if "postprocess" in self.fwd_execution_map:
+                if intermediate_tensors is not None:
+                    output, shared_expert_output = intermediate_tensors
+
+                output = self.postprocess(output, shared_expert_output)
+
+                if intermediate_tensors is not None:
+                    return output
+
             return output, mlp_bias
 
         if self.moe_layer_recompute:
             if self.config.fp8 or self.config.fp4:
-                output, mlp_bias = te_checkpoint(
+                outputs = te_checkpoint(
                     custom_forward,
                     False,
                     tensor_parallel.random.get_cuda_rng_tracker,
                     parallel_state.get_tensor_model_parallel_group(),
                     hidden_states,
+                    intermediate_tensors,
+                    padding_mask,
                 )
             else:
-                output, mlp_bias = tensor_parallel.checkpoint(custom_forward, False, hidden_states)
+                outputs = tensor_parallel.checkpoint(
+                    custom_forward, False, hidden_states, intermediate_tensors, padding_mask
+                )
         else:
-            output, mlp_bias = custom_forward(hidden_states)
+            outputs = custom_forward(hidden_states, intermediate_tensors, padding_mask)
 
-        return output, mlp_bias
+        return outputs
 
-    def backward_dw(self):
+    def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False):
         """Compute weight gradients for experts and shared experts."""
-        self.experts.backward_dw()
-        if self.use_shared_expert and not self.shared_expert_overlap:
+        if routed_experts:
+            self.experts.backward_dw()
+        if shared_experts and self.use_shared_expert and not self.shared_expert_overlap:
             self.shared_experts.backward_dw()
 
     def set_for_recompute_pre_mlp_layernorm(self):
diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py
index 828ff0c3fb6..cbf85612d26 100644
--- a/megatron/core/transformer/moe/moe_utils.py
+++ b/megatron/core/transformer/moe/moe_utils.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import functools
 import math
-from typing import List, Optional, Union
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
 
 import torch
 
@@ -9,8 +11,13 @@
 from megatron.core.fp4_utils import get_fp4_align_size
 from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name
+from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region
 from megatron.core.transformer.cuda_graphs import is_graph_capturing
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.moe.router_replay import RouterReplay
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import internal_api
 
 from megatron.plugin.utils import reduce_aux_losses_tracker_across_ranks_hetero
 
@@ -35,7 +42,7 @@
 
 
 # MOE logging
-_MOE_LAYER_WISE_LOGGING_TRACKER = {}
+_MOE_LAYER_WISE_LOGGING_TRACKER: dict = {}
 
 
 def switch_load_balancing_loss_func(
@@ -46,12 +53,13 @@ def switch_load_balancing_loss_func(
     num_experts: int,
     moe_aux_loss_coeff: float,
     fused: bool = False,
-):
+    padding_mask: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
     """Calculate the auxiliary loss for load balancing.
     Refer to the Switch Transformer (https://arxiv.org/abs/2101.03961)
     and Global Load Balancing Loss(https://arxiv.org/abs/2501.11873) for details.
 
-    ### Detailed explanation of the auxiliary loss #######
+    Detailed explanation of the auxiliary loss:
 
     The formula for the auxiliary loss is:
         loss = E * Σ_{i=1}^{E} (f_i * P_i)
@@ -85,8 +93,6 @@ def switch_load_balancing_loss_func(
       (either micro-batch or global batch)
     - total_num_tokens: Should match the total token count at the same level as tokens_per_expert
 
-    #########################################################
-
     Args:
         probs (torch.Tensor): Softmax probabilities output by the router for each token.
                               Shape in [num_tokens, num_experts].
@@ -96,9 +102,20 @@ def switch_load_balancing_loss_func(
         topk (int): The number of experts selected for each token.
         num_experts (int): The number of experts.
         moe_aux_loss_coeff (float): The coefficient for the auxiliary loss.
+        fused (bool): Whether to use the fused version of the auxiliary loss.
+        padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                               Shape in [num_tokens]. True for valid tokens,
+                                               False for padding tokens. Defaults to None.
+
     Returns:
         torch.Tensor: The auxiliary loss for load balancing.
     """
+    # Apply padding mask to probs if provided
+    if padding_mask is not None:
+        # padding_mask: [num_tokens], probs: [num_tokens, num_experts]
+        mask_expanded = padding_mask.unsqueeze(-1)
+        probs = probs * mask_expanded
+
     if fused:
         if not HAVE_TE or fused_moe_aux_loss is None:
             raise ValueError("fused_moe_aux_loss is not available. Please install TE >= 2.7.0.")
@@ -118,23 +135,48 @@ def switch_load_balancing_loss_func(
     return aux_loss
 
 
-def z_loss_func(logits, z_loss_coeff):
+def z_loss_func(
+    logits: torch.Tensor, z_loss_coeff: float, padding_mask: Optional[torch.Tensor] = None
+) -> torch.Tensor:
     """Encourages the router's logits to remain small to enhance stability.
     Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details.
 
     Args:
         logits (torch.Tensor): The logits of the router.
+        z_loss_coeff (float): The coefficient for the z-loss.
+        padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions.
+                                               Shape [num_tokens]. True = padding (exclude),
+                                               False = valid (include). Defaults to None.
 
     Returns:
         torch.Tensor: The logits after applying the z-loss.
     """
-
-    z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff
+    logsum = torch.logsumexp(logits, dim=-1)
+    z_loss_values = torch.square(logsum)
+
+    if padding_mask is not None:
+        # Invert padding_mask: True (padding) -> 0, False (valid) -> 1
+        valid_mask = ~padding_mask
+        # Only compute z_loss for valid (non-padding) tokens
+        z_loss_values = z_loss_values * valid_mask
+        # Compute mean over valid tokens only
+        num_valid_tokens = valid_mask.sum()
+        z_loss = z_loss_values.sum() / torch.clamp(num_valid_tokens, min=1.0) * z_loss_coeff
+    else:
+        z_loss = torch.mean(z_loss_values) * z_loss_coeff
     return z_loss
 
 
-def sinkhorn(cost: torch.Tensor, tol: float = 0.0001):
-    """Sinkhorn based MoE routing function"""
+def sinkhorn(cost: torch.Tensor, tol: float = 0.0001) -> torch.Tensor:
+    """Sinkhorn based MoE routing function.
+
+    Args:
+        cost (torch.Tensor): The cost tensor.
+        tol (float): The tolerance for the Sinkhorn algorithm.
+
+    Returns:
+        torch.Tensor: The routing probabilities.
+    """
     cost = torch.exp(cost)
     d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
     d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
@@ -150,7 +192,9 @@ def sinkhorn(cost: torch.Tensor, tol: float = 0.0001):
     return d1 * cost * d0.unsqueeze(1)
 
 
-def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_capacity=None):
+def get_capacity(
+    num_tokens: int, num_experts: int, capacity_factor: float, min_capacity: Optional[int] = None
+) -> int:
     """
     Calculate the capacity of each expert.
 
@@ -161,7 +205,7 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_
         min_capacity (int, optional): Minimum capacity. Defaults to None.
 
     Returns:
-        Tensor: Capacity of each expert.
+        int: Capacity of each expert.
     """
     capacity = math.ceil((num_tokens / num_experts) * capacity_factor)
     if min_capacity is not None and capacity < min_capacity:
@@ -169,13 +213,35 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_
     return capacity
 
 
+def get_tokens_per_expert_and_token_count(
+    routing_map: torch.Tensor,
+    reduce_group: torch.distributed.ProcessGroup,
+    topk: int = None,
+    with_padding_mask: bool = False,
+) -> torch.Tensor:
+    """
+    Compute global_tokens_per_expert, local_num_tokens and total_num_tokens with padding mask.
+    """
+    local_tokens_per_expert = routing_map.sum(dim=0)
+    global_tokens_per_expert = reduce_from_tensor_model_parallel_region(
+        local_tokens_per_expert, reduce_group
+    )
+    if with_padding_mask:
+        local_num_tokens = local_tokens_per_expert.sum() / topk
+        total_num_tokens = global_tokens_per_expert.sum() / topk
+    else:
+        local_num_tokens = routing_map.shape[0]
+        total_num_tokens = local_num_tokens * reduce_group.size()
+    return global_tokens_per_expert, local_num_tokens, total_num_tokens
+
+
 class MoEAuxLossAutoScaler(torch.autograd.Function):
     """An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss."""
 
-    main_loss_backward_scale: torch.Tensor = None
+    main_loss_backward_scale: Optional[torch.Tensor] = None
 
     @staticmethod
-    def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor):
+    def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor) -> torch.Tensor:
         """Preserve the aux_loss by storing it in the context to avoid garbage collection.
 
         Args:
@@ -189,7 +255,7 @@ def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor):
         return output
 
     @staticmethod
-    def backward(ctx, grad_output: torch.Tensor):
+    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Compute and scale the gradient for auxiliary loss..
 
         Args:
@@ -209,7 +275,7 @@ def backward(ctx, grad_output: torch.Tensor):
         return grad_output, scaled_aux_loss_grad
 
     @staticmethod
-    def set_loss_scale(scale: torch.Tensor):
+    def set_loss_scale(scale: torch.Tensor) -> None:
         """set the scale of the aux loss.
 
         Args:
@@ -223,13 +289,13 @@ def set_loss_scale(scale: torch.Tensor):
 
 
 def permute(
-    tokens,
-    routing_map,
+    tokens: torch.Tensor,
+    routing_map: torch.Tensor,
     probs: Optional[torch.Tensor] = None,
     num_out_tokens: Optional[int] = None,
     fused: bool = False,
     drop_and_pad: bool = False,
-):
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
     """Permute the tokens and probs based on the mask.
     Tokens with the same designated expert will be grouped together.
     The shape of mask is [tokens, num_experts], it indicates which experts were selected
@@ -249,6 +315,10 @@ def permute(
                                        and pads the number of tokens to the expert capacity.
                                        If set to true, routing_map has a fixed number of non-zeros
                                        in each column.
+
+    Returns:
+        Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+            The permuted tokens, permuted probs, and sorted indices.
     """
     if fused and probs is None:
         if not HAVE_TE or fused_permute is None:
@@ -313,11 +383,11 @@ def unpermute(
     permuted_tokens: torch.Tensor,
     sorted_indices: torch.Tensor,
     restore_shape: torch.Size,
-    probs: torch.Tensor = None,
-    routing_map: torch.Tensor = None,
+    probs: Optional[torch.Tensor] = None,
+    routing_map: Optional[torch.Tensor] = None,
     fused: bool = False,
     drop_and_pad: bool = False,
-):
+) -> torch.Tensor:
     """
     Restore the original order of tokens after permutation. If probs are provided, it
     will also apply them to the tokens before restoring the order.
@@ -405,8 +475,20 @@ def sort_chunks_by_idxs(
     sorted_idxs: torch.Tensor,
     probs: Optional[torch.Tensor] = None,
     fused: bool = False,
-):
-    """Split and sort the input tensor based on the split_sizes and sorted indices."""
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    """Split and sort the input tensor based on the split_sizes and sorted indices.
+
+    Args:
+        input (torch.Tensor): The input tensor.
+        split_sizes (torch.Tensor): The split sizes.
+        sorted_idxs (torch.Tensor): The sorted indices.
+        probs (torch.Tensor, optional): The probs tensor. Defaults to None.
+        fused (bool, optional): Whether to use the fused version of the sort_chunks_by_idxs
+                                function. Defaults to False.
+
+    Returns:
+        Tuple[torch.Tensor, Optional[torch.Tensor]]: The sorted output tensor and permuted probs.
+    """
     if fused and probs is None:
         if not HAVE_TE or fused_sort_chunks_by_index is None:
             raise ValueError(
@@ -439,7 +521,7 @@ def group_limited_topk(
     num_experts: int,
     num_groups: int,
     group_topk: int,
-):
+) -> Tuple[torch.Tensor, torch.Tensor]:
     """Perform top-k routing on a subset of expert groups.
 
     When using group-limited routing:
@@ -535,19 +617,32 @@ def topk_routing_with_score_function(
     score_function: str = "softmax",
     expert_bias: Optional[torch.Tensor] = None,
     fused: bool = False,
-):
+    router_replay: Optional['RouterReplay'] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
     """Compute the routing probabilities and map for top-k selection with score function.
+
     Args:
         logits (torch.Tensor): Logits tensor.
         topk (int): The number of experts to select for each token.
-        use_pre_softmax (bool): Whether to apply softmax or sigmoid before top-k selection.
-        num_groups (int): Number of groups for routed experts.
-        group_topk (int): Number of selected groups for each token.
-        scaling_factor (float): Scaling factor of routing score in top-k selection.
-        score_function (str): The score function to use. Can be either "softmax" or "sigmoid".
-        expert_bias (torch.Tensor): The bias added to logits for expert routing.
+        use_pre_softmax (bool, optional): Whether to apply softmax or sigmoid before top-k
+                                          selection. Defaults to False.
+        num_groups (int, optional): Number of groups for routed experts. Defaults to None.
+        group_topk (int, optional): Number of selected groups for each token. Defaults to None.
+        scaling_factor (float, optional): Scaling factor of routing score in top-k selection.
+                                         Defaults to None.
+        score_function (str, optional): The score function to use. Can be either "softmax" or
+                                        "sigmoid". Defaults to "softmax".
+        expert_bias (torch.Tensor, optional): The bias added to logits for expert routing.
+                                              Defaults to None.
+        fused (bool, optional): Whether to use the fused version. Defaults to False.
+        router_replay (Optional['RouterReplay']): For debugging and development, allows for
+                                             deterministic routing by replaying a previously
+                                             recorded routing sequence.
+
+                                              Defaults to None.
+
     Returns:
-        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        Tuple[torch.Tensor, torch.Tensor]:
             - routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing
               the routing probabilities for each token to each expert.
             - routing_map (torch.Tensor): A mask tensor of shape [num_tokens, num_experts]
@@ -572,7 +667,25 @@ def topk_routing_with_score_function(
             expert_bias=expert_bias,
         )
 
-    def compute_topk(scores, topk, num_groups=None, group_topk=None):
+    def _compute_topk(
+        scores: torch.Tensor,
+        topk: int,
+        num_groups: Optional[int] = None,
+        group_topk: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute the top-k indices for the given scores.
+
+        Args:
+            scores (torch.Tensor): The scores tensor.
+            topk (int): The number of top-k indices to compute.
+            num_groups (int, optional): The number of groups to compute the top-k indices for.
+                                        Defaults to None.
+            group_topk (int, optional): The number of top-k indices to compute for each group.
+                                        Defaults to None.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The top-k indices and the top-k scores.
+        """
         if group_topk:
             return group_limited_topk(
                 scores=scores,
@@ -585,6 +698,16 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None):
         else:
             return torch.topk(scores, k=topk, dim=1)
 
+    def compute_topk(scores, topk, num_groups=None, group_topk=None):
+        # Default behavior if no replay is active
+
+        if router_replay is None:
+            return _compute_topk(scores, topk, num_groups=num_groups, group_topk=group_topk)
+        else:
+            return router_replay.get_replay_topk(
+                scores, topk, num_groups, group_topk, _compute_topk
+            )
+
     if score_function == "softmax":
         if use_pre_softmax:
             scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
@@ -627,35 +750,52 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None):
 
 
 def compute_routing_scores_for_aux_loss(
-    logits: torch.Tensor, topk: int, score_function: str, fused: bool = False
-):
+    logits: torch.Tensor,
+    topk: int,
+    score_function: str,
+    fused: bool = False,
+    padding_mask: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
     """Compute routing scores based on the score function.
 
     Args:
         logits (torch.Tensor): The logits tensor after gating, shape: [num_tokens, num_experts].
+        topk (int): The number of top-k indices to compute.
+        score_function (str): The score function to use. Can be either "softmax" or "sigmoid".
+        fused (bool, optional): Whether to use the fused version. Defaults to False.
+        padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                               Shape in [num_tokens]. True for valid tokens,
+                                               False for padding tokens. Defaults to None.
 
     Returns:
-        torch.Tensor: The normalized routing scores.
+        Tuple[torch.Tensor, torch.Tensor]: The routing map and the normalized routing scores.
     """
     if fused:
         if not HAVE_TE or fused_compute_score_for_moe_aux_loss is None:
             raise ValueError(
                 "fused_compute_score_for_moe_aux_loss is not available. Please install TE >= 2.6.0."
             )
-        return fused_compute_score_for_moe_aux_loss(
+        routing_map, scores = fused_compute_score_for_moe_aux_loss(
             logits=logits, topk=topk, score_function=score_function
         )
-
-    if score_function == "softmax":
-        scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
-    elif score_function == "sigmoid":
-        scores = torch.sigmoid(logits)
-        scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20)
     else:
-        raise ValueError(f"Invalid score_function: {score_function}")
+        if score_function == "softmax":
+            scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
+        elif score_function == "sigmoid":
+            scores = torch.sigmoid(logits)
+            scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20)
+        else:
+            raise ValueError(f"Invalid score_function: {score_function}")
 
-    _, top_indices = torch.topk(scores, k=topk, dim=1)
-    routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
+        _, top_indices = torch.topk(scores, k=topk, dim=1)
+        routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
+
+    # Apply padding mask to scores if provided
+    if padding_mask is not None:
+        # Invert padding_mask and make True indicates valid tokens
+        valid_mask = (~padding_mask).unsqueeze(-1)
+        routing_map = routing_map * valid_mask
+        scores = scores * valid_mask
     return routing_map, scores
 
 
@@ -666,7 +806,7 @@ def apply_router_token_dropping(
     capacity_factor: float,
     drop_policy: str = "probs",
     pad_to_capacity: bool = False,
-):
+) -> Tuple[torch.Tensor, torch.Tensor]:
     """Apply token dropping to top-k expert selection.
 
     This function enforces expert capacity limits by dropping tokens that exceed
@@ -679,8 +819,9 @@ def apply_router_token_dropping(
             indicating which experts were selected for each token.
         router_topk (int): Number of experts selected per token.
         capacity_factor (float): The capacity factor of each expert.
-        drop_policy (str): Policy to drop tokens - "probs" or "position".
-        pad_to_capacity (bool): Whether to pad to capacity.
+        drop_policy (str, optional): Policy to drop tokens - "probs" or "position".
+                                     Defaults to "probs".
+        pad_to_capacity (bool, optional): Whether to pad to capacity. Defaults to False.
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]:
@@ -697,14 +838,20 @@ def apply_router_token_dropping(
     )
 
     # Create capacity mask based on drop policy
-    if drop_policy == "probs":
-        _, capacity_indices = torch.topk(routing_probs, k=expert_capacity, dim=0, sorted=False)
-        capacity_mask = torch.zeros_like(routing_probs).scatter(0, capacity_indices, 1).bool()
-    elif drop_policy == "position":
-        _, capacity_indices = torch.topk(routing_map.int(), k=expert_capacity, dim=0, sorted=False)
-        capacity_mask = torch.zeros_like(routing_probs).scatter(0, capacity_indices, 1).bool()
+    if expert_capacity > num_tokens:
+        # No need to drop tokens if capacity exceeds the number of tokens
+        capacity_mask = torch.ones_like(routing_probs).bool()
     else:
-        raise ValueError(f"Invalid drop_policy: {drop_policy}")
+        if drop_policy == "probs":
+            _, capacity_indices = torch.topk(routing_probs, k=expert_capacity, dim=0, sorted=False)
+            capacity_mask = torch.zeros_like(routing_probs).scatter(0, capacity_indices, 1).bool()
+        elif drop_policy == "position":
+            _, capacity_indices = torch.topk(
+                routing_map.int(), k=expert_capacity, dim=0, sorted=False
+            )
+            capacity_mask = torch.zeros_like(routing_probs).scatter(0, capacity_indices, 1).bool()
+        else:
+            raise ValueError(f"Invalid drop_policy: {drop_policy}")
 
     # Apply capacity constraints
     if pad_to_capacity:
@@ -723,21 +870,23 @@ def save_to_aux_losses_tracker(
     loss: torch.Tensor,
     layer_number: int,
     num_layers: int,
-    reduce_group: torch.distributed.ProcessGroup = None,
-    avg_group: torch.distributed.ProcessGroup = None,
+    reduce_group: Optional[torch.distributed.ProcessGroup] = None,
+    avg_group: Optional[torch.distributed.ProcessGroup] = None,
     reduce_group_has_dp: bool = False,
-):
+) -> None:
     """Save the auxiliary loss for logging.
     Args:
         name (str): The name of the loss.
         loss (torch.Tensor): The loss tensor.
         layer_number (int): Layer index of the loss.
         num_layers (int): The number of total layers.
-        reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss.
-        avg_group (torch.distributed.ProcessGroup): The group for averaging the loss.
-        reduce_group_has_dp (bool): Whether the reduce group has data parallel ranks.
+        reduce_group (torch.distributed.ProcessGroup, optional): The group for reducing the loss.
+                                                                 Defaults to None.
+        avg_group (torch.distributed.ProcessGroup, optional): The group for averaging the loss.
+                                                              Defaults to None.
+        reduce_group_has_dp (bool, optional): Whether the reduce group has data parallel ranks.
             Set this to True if the reduce group has data parallel ranks. This flag is used to
-            ensure the correct reduction in aux loss tracking.
+            ensure the correct reduction in aux loss tracking. Defaults to False.
     """
     # Skip aux loss logging if layer_number is None.
     if layer_number is None:
@@ -753,25 +902,43 @@ def save_to_aux_losses_tracker(
     tracker[name]["reduce_group_has_dp"] = reduce_group_has_dp
 
 
-def clear_aux_losses_tracker():
+def clear_aux_losses_tracker() -> None:
     """Clear the auxiliary losses."""
     tracker = get_moe_layer_wise_logging_tracker()
     for name in tracker:
         tracker[name]["values"].zero_()
 
 
-def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = None):
-    """Collect and reduce the auxiliary losses across ranks."""
+def reduce_aux_losses_tracker_across_ranks(
+    track_names: Optional[List[str]] = None, pg_collection: Optional[ProcessGroupCollection] = None
+) -> None:
+    """Collect and reduce the auxiliary losses across ranks.
+
+    Args:
+        track_names (Optional[List[str]], optional):
+            The names of the losses to track. Defaults to None.
+        pg_collection (Optional[ProcessGroupCollection], optional):
+            The process group collection. Defaults to None.
+    """
     tracker = get_moe_layer_wise_logging_tracker()
     if track_names is None:
         track_names = tracker.keys()
+
+    if pg_collection is None:
+        # Use parallel_state groups
+        pp_group = parallel_state.get_pipeline_model_parallel_group()
+        dp_group = parallel_state.get_data_parallel_group(
+            with_context_parallel=False, partial_data_parallel=False
+        )
+    else:
+        pp_group = pg_collection.pp
+        dp_group = pg_collection.dp
+
     for name in track_names:
         values = tracker[name]["values"]
         # TODO(Hepteract): delete the usage of the global parallel_state.
         # Collect aux losses across PP.
-        torch.distributed.all_reduce(
-            values, group=parallel_state.get_pipeline_model_parallel_group()
-        )
+        torch.distributed.all_reduce(values, group=pp_group)
         # Reduce aux losses across ranks.
         if tracker[name].get('reduce_group') is not None:
             torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group'))
@@ -779,9 +946,7 @@ def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = No
             # does not have 'dp' attribute, do it manually.
             if not tracker[name].get('reduce_group_has_dp', False):
                 torch.distributed.all_reduce(
-                    values,
-                    group=parallel_state.get_data_parallel_group(with_context_parallel=False),
-                    op=torch.distributed.ReduceOp.AVG,
+                    values, group=dp_group, op=torch.distributed.ReduceOp.AVG
                 )
         if tracker[name].get('avg_group') is not None:
             torch.distributed.all_reduce(
@@ -793,18 +958,39 @@ def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = No
 def track_moe_metrics(
     loss_scale: float,
     iteration: int,
-    writer,
-    wandb_writer=None,
-    total_loss_dict=None,
-    per_layer_logging=False,
+    writer: Optional["SummaryWriter"] = None,
+    wandb_writer: Optional["wandb.Run"] = None,
+    total_loss_dict: Optional[dict[str, torch.Tensor]] = None,
+    per_layer_logging: bool = False,
     force_initialize: bool = False,
     track_names: Optional[List[str]] = None,
     num_layers: Optional[int] = None,
     moe_layer_freq: Optional[Union[int, List[int]]] = None,
     mtp_num_layers: Optional[int] = None,
+    pg_collection: Optional[ProcessGroupCollection] = None,
     enable_hetero=False,
-):
-    """Track the MoE metrics for logging."""
+) -> None:
+    """Track the MoE metrics for logging.
+
+    Args:
+        loss_scale (float): The loss scale.
+        iteration (int): The iteration.
+        writer (SummaryWriter, optional): The tensorboard writer. Defaults to None.
+        wandb_writer (wandb.Run, optional): The wandb writer. Defaults to None.
+        total_loss_dict (dict[str, torch.Tensor], optional): The total loss dictionary.
+                                                             Defaults to None.
+        per_layer_logging (bool, optional): Whether to log per layer. Defaults to False.
+        force_initialize (bool, optional): Whether to force initialize the tracker.
+                                           Defaults to False.
+        track_names (List[str], optional): The names of the losses to track. Defaults to None.
+        num_layers (int, optional): The number of layers. Defaults to None.
+        moe_layer_freq (Union[int, List[int]], optional): The frequency of the MoE layers.
+                                                          Defaults to None.
+        mtp_num_layers (int, optional): The number of layers in the model parallel group.
+                                        Defaults to None.
+        pg_collection (ProcessGroupCollection, optional): The process group collection.
+                                                          Defaults to None.
+    """
     # Aux loss logging
     tracker = get_moe_layer_wise_logging_tracker()
     # Initialize the tracker if force_initialize is True
@@ -818,7 +1004,7 @@ def track_moe_metrics(
                     tracker[key]["avg_group"] = None
                     tracker[key]["reduce_group_has_dp"] = False
     if not enable_hetero:
-        reduce_aux_losses_tracker_across_ranks(track_names)
+        reduce_aux_losses_tracker_across_ranks(track_names, pg_collection=pg_collection)
     else:
         reduce_aux_losses_tracker_across_ranks_hetero(track_names)
 
@@ -870,13 +1056,18 @@ def track_moe_metrics(
     clear_aux_losses_tracker()
 
 
-def get_updated_expert_bias(tokens_per_expert, expert_bias, expert_bias_update_rate):
+def get_updated_expert_bias(
+    tokens_per_expert: torch.Tensor, expert_bias: torch.Tensor, expert_bias_update_rate: float
+) -> torch.Tensor:
     """Update expert bias for biased expert routing. See https://arxiv.org/abs/2408.15664v1#
 
     Args:
         tokens_per_expert (torch.Tensor): The number of tokens assigned to each expert.
         expert_bias (torch.Tensor): The bias for each expert.
         expert_bias_udpate_rate (float): The update rate for the expert bias.
+
+    Returns:
+        torch.Tensor: The updated expert bias.
     """
     with torch.no_grad():
         # All Reduce Across TPxCPxDP group
@@ -891,13 +1082,20 @@ def get_updated_expert_bias(tokens_per_expert, expert_bias, expert_bias_update_r
         return updated_expert_bias
 
 
-def maybe_move_tensor_to_cpu(tensor, as_numpy=False, record_stream=False):
+def maybe_move_tensor_to_cpu(
+    tensor: torch.Tensor, as_numpy: bool = False, record_stream: bool = False
+) -> torch.Tensor:
     """Move a tensor to CPU if it is on GPU.
     Args:
-        tensor (torch.Tensor or None): The tensor to move to CPU.
-        as_numpy (bool): Whether to convert the tensor to a numpy array.
-        record_stream (bool): Whether to record the stream of the tensor, to prevent memory leak
-                              when the DtoH data transfer is on a side stream.
+        tensor (torch.Tensor): The tensor to move to CPU.
+        as_numpy (bool, optional): Whether to convert the tensor to a numpy array.
+                                   Defaults to False.
+        record_stream (bool, optional): Whether to record the stream of the tensor, to prevent
+                                        memory leak when the DtoH data transfer is on a side
+                                        stream. Defaults to False.
+
+    Returns:
+        torch.Tensor: The tensor moved to CPU.
     """
     if torch.is_tensor(tensor) and tensor.is_cuda:
         cpu_tensor = tensor.to(torch.device("cpu"), non_blocking=True)
@@ -909,12 +1107,13 @@ def maybe_move_tensor_to_cpu(tensor, as_numpy=False, record_stream=False):
     return tensor
 
 
-def get_moe_layer_wise_logging_tracker():
+def get_moe_layer_wise_logging_tracker() -> dict:
     """Return the moe layer wise tracker."""
     global _MOE_LAYER_WISE_LOGGING_TRACKER
     return _MOE_LAYER_WISE_LOGGING_TRACKER
 
 
+@internal_api
 class RandomSTE(torch.autograd.Function):
     """
     Straight-Through Estimator(STE) function that returns random values
@@ -923,38 +1122,44 @@ class RandomSTE(torch.autograd.Function):
     This is used to generate random logits of router for load-balanced benchmark.
     """
 
-    generator = None
-    random_logits = None
-
     @staticmethod
-    def forward(ctx, logits):
+    def forward(ctx, logits: torch.Tensor) -> torch.Tensor:
         """
         Forward pass returns random logits with rank-specific seed.
-        """
-        if is_graph_capturing() and RandomSTE.random_logits is not None:
-            return RandomSTE.random_logits
 
-        if RandomSTE.generator is None:
-            global_rank = torch.distributed.get_rank()
-            base_seed = 42
-            seed = base_seed + global_rank
-            RandomSTE.generator = torch.Generator(device=logits.device)
-            RandomSTE.generator.manual_seed(seed)
+        Args:
+            logits (torch.Tensor): The logits.
 
-        RandomSTE.random_logits = logits.clone().normal_(generator=RandomSTE.generator)
-        return RandomSTE.random_logits
+        Returns:
+            torch.Tensor: The random logits.
+        """
+        with get_cuda_rng_tracker().fork(get_expert_parallel_rng_tracker_name()):
+            random_logits = logits.clone().normal_()
+        return random_logits
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
         """
         Backward pass propagates the gradient for logits.
+
+        Args:
+            grad_output (torch.Tensor): The gradient output.
+
+        Returns:
+            torch.Tensor: The gradient input.
         """
         return grad_output
 
 
-def apply_random_logits(logits):
+def apply_random_logits(logits: torch.Tensor) -> torch.Tensor:
     """
     Apply the RandomSTE function to the logits.
+
+    Args:
+        logits (torch.Tensor): The logits.
+
+    Returns:
+        torch.Tensor: The random logits.
     """
     return RandomSTE.apply(logits)
 
@@ -966,10 +1171,23 @@ class RouterGatingLinearFunction(torch.autograd.Function):
 
     @staticmethod
     def forward(
-        ctx, inp: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, router_dtype: torch.dtype
-    ):
+        ctx,
+        inp: torch.Tensor,
+        weight: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        router_dtype: torch.dtype,
+    ) -> torch.Tensor:
         """
         Forward pass of the RouterGatingLinearFunction function.
+
+        Args:
+            inp (torch.Tensor): The input tensor.
+            weight (torch.Tensor): The weight tensor.
+            bias (torch.Tensor): The bias tensor. Could be None.
+            router_dtype (torch.dtype): The router dtype.
+
+        Returns:
+            torch.Tensor: The output tensor.
         """
         ctx.save_for_backward(inp, weight, bias)
         ctx.router_dtype = router_dtype
@@ -992,9 +1210,18 @@ def forward(
         return output
 
     @staticmethod
-    def backward(ctx, grad_output: torch.Tensor):
+    def backward(
+        ctx, grad_output: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], None]:
         """
         Backward pass of the RouterGatingLinearFunction function.
+
+        Args:
+            grad_output (torch.Tensor): The gradient output.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], None]:
+                The gradient input, gradient weight, gradient bias, and None.
         """
         inp, weight, bias = ctx.saved_tensors
         inp_shape = inp.shape
@@ -1021,18 +1248,34 @@ def backward(ctx, grad_output: torch.Tensor):
 
 
 def router_gating_linear(
-    inp: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, router_dtype: torch.dtype
-):
+    inp: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], router_dtype: torch.dtype
+) -> torch.Tensor:
     """
     Customized linear layer for router gating.
     This linear layer accepts bfloat16 input and weight, and can return output with router_dtype.
     It can reduce the memory usage by avoiding saving the intermediate high precision tensors.
+
+    Args:
+        inp (torch.Tensor): The input tensor.
+        weight (torch.Tensor): The weight tensor.
+        bias (torch.Tensor): The bias tensor. Could be None.
+        router_dtype (torch.dtype): The router dtype.
+
+    Returns:
+        torch.Tensor: The output tensor.
     """
     return RouterGatingLinearFunction.apply(inp, weight, bias, router_dtype)
 
 
-def get_align_size_for_quantization(config: TransformerConfig):
-    """Get the alignment size for quantization."""
+def get_align_size_for_quantization(config: TransformerConfig) -> int:
+    """Get the alignment size for quantization.
+
+    Args:
+        config (TransformerConfig): The configuration.
+
+    Returns:
+        int: The alignment size for quantization.
+    """
     if config.fp8:
         return get_fp8_align_size(config.fp8_recipe)
     elif config.fp4:
@@ -1042,7 +1285,7 @@ def get_align_size_for_quantization(config: TransformerConfig):
 
 # TODO(Hepteract): delete the usage of the global parallel_state.
 # Initialize process groups with the global parallel_state.
-def get_default_pg_collection():
+def get_default_pg_collection() -> ProcessGroupCollection:
     """Get the default process groups for MoE.
 
     Returns:
@@ -1060,3 +1303,233 @@ def get_default_pg_collection():
         with_context_parallel=True
     )
     return pg_collection
+
+
+class MoECudaGraphPartialCaptureSignal(Exception):
+    """
+    Used to early-return from a MoE layer forward pass in CUDA graph capture.
+    This signal is raised when we are partially capturing the CUDA graph of the MoE layer,
+    and the related intermediate tensors are recorded in self.kwargs.
+    Call self.get_early_return_outputs() to collect the CUDA graph outputs.
+    """
+
+    def __init__(self, moe_layer, return_step: str, **kwargs):
+        self.moe_layer = moe_layer
+        self.return_step = return_step
+        self.kwargs = kwargs
+
+    def get_early_return_outputs(
+        self, hidden_states: torch.Tensor, shared_expert_output: torch.Tensor
+    ) -> List[torch.Tensor]:
+        """
+        Get the CUDA graph early return outputs for the MoE layer, including the intermediate
+        tensors and the intermediate attributes of the token dispatcher.
+
+        The returned output tensors are in the order of:
+        - routed experts path outputs
+          - hidden states, probs, and routing map for capturing router
+          - hidden states and probs for capturing router and preprocess
+        - intermediate attributes of the token dispatcher (if capturing the preprocess step)
+        - shared expert path output (if exists)
+        """
+        if self.return_step == "route":
+            # Capturing the router step returns three intermediate tensors:
+            # hidden states, routing probabilities, and routing map.
+            outputs = [hidden_states, self.kwargs['probs'], self.kwargs['routing_map']]
+        elif self.return_step == "preprocess":
+            # Capturing the preprocess step returns two intermediate tensors:
+            # hidden states and routing probabilities.
+            # It also returns the intermediate attributes of the token dispatcher, recorded in
+            # "token_dispatcher.cudagraph_attrs".
+            outputs = [self.kwargs['hidden_states'], self.kwargs['probs']]
+            valid_cudagraph_attrs = []
+            for attr_name in self.moe_layer.token_dispatcher.cudagraph_attrs:
+                hier_attr_name = attr_name.split('.')
+                attr = self.moe_layer.token_dispatcher
+                for name in hier_attr_name:
+                    attr = getattr(attr, name, None)
+                    if attr is None:
+                        break
+                if isinstance(attr, torch.Tensor):
+                    outputs.append(attr)
+                    valid_cudagraph_attrs.append(attr_name)
+            if self.moe_layer.token_dispatcher.valid_cudagraph_attrs is None:
+                self.moe_layer.token_dispatcher.valid_cudagraph_attrs = valid_cudagraph_attrs
+            else:
+                assert (
+                    self.moe_layer.token_dispatcher.valid_cudagraph_attrs == valid_cudagraph_attrs
+                ), (
+                    "valid_cudagraph_attrs mismatch: "
+                    f"{self.moe_layer.token_dispatcher.valid_cudagraph_attrs} != "
+                    f"{valid_cudagraph_attrs}"
+                )
+        # Also return the shared expert output, if it is not None.
+        if shared_expert_output is not None:
+            outputs.append(shared_expert_output)
+        return outputs
+
+
+@internal_api
+@dataclass
+class MoECudaGraphTensorStore:
+    """Storage for tensors used in CUDA graph replay for MoE layers.
+
+    This dataclass stores intermediate tensors computed during CUDA graph replay
+    that need to be resumed from the end of the CUDA graph scope to skip redundant computations.
+
+    Attributes:
+        hidden_states (Optional[torch.Tensor]): The hidden states output from the CUDA graph replay.
+        probs (Optional[torch.Tensor]): The routing probabilities for each token-expert pair.
+        routing_map (Optional[torch.Tensor]): The sparse mapping indicating which experts
+            were selected for each token. Used to skip the normal router step.
+        shared_expert_output (Optional[torch.Tensor]): The output from shared experts
+            computation. Used to skip the normal shared expert computation step.
+    """
+
+    hidden_states: Optional[torch.Tensor] = None
+    probs: Optional[torch.Tensor] = None
+    routing_map: Optional[torch.Tensor] = None
+    shared_expert_output: Optional[torch.Tensor] = None
+
+    def is_empty(self) -> bool:
+        """Check if the store has any non-None tensors.
+
+        Returns:
+            bool: True if all fields are None, False otherwise.
+        """
+        return all(
+            getattr(self, field_name) is None
+            for field_name in ['hidden_states', 'probs', 'routing_map', 'shared_expert_output']
+        )
+
+    def set(self, **kwargs):
+        """Set the tensors in the store from keyword arguments."""
+        for field_name, value in kwargs.items():
+            assert field_name in [
+                'hidden_states',
+                'probs',
+                'routing_map',
+                'shared_expert_output',
+            ], f"Invalid field name: {field_name}"
+            if value is not None:
+                assert isinstance(
+                    value, torch.Tensor
+                ), f"Value must be a torch.Tensor, got {type(value)} for field {field_name}"
+                setattr(self, field_name, value)
+
+    def clear(self):
+        """Reset all stored tensors to None."""
+        for field_name in ['hidden_states', 'probs', 'routing_map', 'shared_expert_output']:
+            setattr(self, field_name, None)
+
+
+def maybe_skip_or_early_return_by_cudagraph(step_condition):
+    """
+    Decorator to skip certain codepaths in the MoE layer forward pass in CUDA graph replay,
+    or early return from the MoE layer forward pass in CUDA graph capture.
+
+    Args:
+        step_condition: The step condition to check. Can be "shared_experts_compute", "route",
+        or "preprocess". If "shared_experts_compute", the shared experts computation will be
+        skipped in replay if it is in the CUDA graph scope. If "route" or "preprocess", the
+        router or preprocess will be skipped in replay if it is in the CUDA graph scope, or
+        early return from the MoE layer forward pass if it is in CUDA graph capturing mode.
+
+    Returns:
+        A decorator function that wraps the MoE layer forward pass.
+    """
+
+    def maybe_raise_signal(moe_layer, **kwargs):
+        """
+        Check if the MoE layer should early return for CUDA graph capture.
+        If so, raise a MoECudaGraphPartialCaptureSignal.
+        """
+        if (
+            moe_layer.config.cuda_graph_impl == "transformer_engine"
+            and moe_layer.training
+            and is_graph_capturing()
+        ):
+            if (
+                step_condition == "route"
+                and CudaGraphScope.moe_router in moe_layer.config.cuda_graph_scope
+                and CudaGraphScope.moe_preprocess not in moe_layer.config.cuda_graph_scope
+            ):
+                raise MoECudaGraphPartialCaptureSignal(moe_layer, "route", **kwargs)
+            elif (
+                step_condition == "preprocess"
+                and CudaGraphScope.moe_preprocess in moe_layer.config.cuda_graph_scope
+            ):
+                raise MoECudaGraphPartialCaptureSignal(moe_layer, "preprocess", **kwargs)
+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapped_func(moe_layer, *args, **kwargs):
+            """
+            Check if we should skip executing the original function based on the current
+            step condition and the tensor store status. If the tensor can be found in the store,
+            it indicates that it is already computed by the CUDA graph replay, so we can skip it.
+            Otherwise, we execute the original function and check if we should raise a signal to
+            early return in CUDA graph capture.
+            """
+
+            if moe_layer.config.cuda_graph_impl != "transformer_engine":
+                return func(moe_layer, *args, **kwargs)
+
+            # The non-cudagraph codepath just calls the original function.
+            if not is_graph_capturing() and moe_layer.cudagraph_tensor_store.is_empty():
+                return func(moe_layer, *args, **kwargs)
+
+            assert (
+                not is_graph_capturing() or moe_layer.cudagraph_tensor_store.is_empty()
+            ), "cudagraph_tensor_store cannot be used when it is capturing cuda graph."
+            if step_condition == "shared_experts_compute":
+                if moe_layer.cudagraph_tensor_store.shared_expert_output is None:
+                    # Don't skip the shared expert computation.
+                    shared_expert_output = func(moe_layer, *args, **kwargs)
+                else:
+                    # Skip the shared expert computation and get value from store.
+                    shared_expert_output = moe_layer.cudagraph_tensor_store.shared_expert_output
+                return shared_expert_output
+            elif step_condition == "route":
+                if moe_layer.cudagraph_tensor_store.probs is None:
+                    # Don't skip the router.
+                    assert (
+                        moe_layer.cudagraph_tensor_store.routing_map is None
+                    ), "routing_map must be None if probs is None"
+                    probs, routing_map = func(moe_layer, *args, **kwargs)
+
+                    # Maybe early return after the router.
+                    maybe_raise_signal(moe_layer, probs=probs, routing_map=routing_map)
+                else:
+                    # Skip the router and get value from store.
+                    probs, routing_map = (
+                        moe_layer.cudagraph_tensor_store.probs,
+                        moe_layer.cudagraph_tensor_store.routing_map,
+                    )
+                return probs, routing_map
+            elif step_condition == "preprocess":
+                if (
+                    moe_layer.cudagraph_tensor_store.is_empty()
+                    or moe_layer.cudagraph_tensor_store.routing_map is not None
+                ):
+                    # Don't skip the preprocess.
+                    hidden_states, probs = func(moe_layer, *args, **kwargs)
+
+                    # Maybe early return after the preprocess.
+                    maybe_raise_signal(moe_layer, hidden_states=hidden_states, probs=probs)
+                else:
+                    # Skip the preprocess and get value from store.
+                    assert (
+                        moe_layer.cudagraph_tensor_store.hidden_states is not None
+                        and moe_layer.cudagraph_tensor_store.probs is not None
+                    ), "hidden_states and probs must be given in moe_preprocess cudagraph replay"
+                    hidden_states, probs = (
+                        moe_layer.cudagraph_tensor_store.hidden_states,
+                        moe_layer.cudagraph_tensor_store.probs,
+                    )
+                return hidden_states, probs
+
+        return wrapped_func
+
+    return decorator
diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py
index 34d81a21ffa..4be97401748 100644
--- a/megatron/core/transformer/moe/router.py
+++ b/megatron/core/transformer/moe/router.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 
 from megatron.core.jit import jit_fuser
-from megatron.core.tensor_parallel import reduce_from_tensor_model_parallel_region
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.moe.moe_utils import (
     MoEAuxLossAutoScaler,
@@ -14,6 +13,7 @@
     apply_random_logits,
     apply_router_token_dropping,
     compute_routing_scores_for_aux_loss,
+    get_tokens_per_expert_and_token_count,
     router_gating_linear,
     save_to_aux_losses_tracker,
     sinkhorn,
@@ -21,6 +21,7 @@
     topk_routing_with_score_function,
     z_loss_func,
 )
+from megatron.core.transformer.moe.router_replay import RouterReplay
 from megatron.core.transformer.transformer_config import TransformerConfig
 
 
@@ -201,6 +202,10 @@ def __init__(
             self.global_tokens_per_expert = None
             self.ga_steps = None
 
+        self.router_replay = None
+        if self.config.moe_enable_routing_replay:
+            self.router_replay = RouterReplay()
+
     def _maintain_float32_expert_bias(self):
         """
         Maintain the expert bias in float32.
@@ -268,22 +273,29 @@ def is_aux_loss_enabled(self) -> bool:
         return False
 
     def _apply_aux_loss(
-        self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor
+        self,
+        probs: torch.Tensor,
+        scores_for_aux_loss: torch.Tensor,
+        routing_map: torch.Tensor,
+        with_padding_mask: bool = False,
     ):
         """Apply the auxiliary loss for the given scores and routing map."""
         aux_loss_coeff = self.get_aux_loss_coeff("aux_loss")
         if aux_loss_coeff == 0:
             return probs
-        tokens_per_expert = routing_map.sum(dim=0)
-        tokens_per_expert = reduce_from_tensor_model_parallel_region(
-            tokens_per_expert, self.tp_cp_group
+
+        global_tokens_per_expert, local_num_tokens, total_num_tokens = (
+            get_tokens_per_expert_and_token_count(
+                routing_map=routing_map,
+                reduce_group=self.tp_cp_group,
+                topk=self.topk,
+                with_padding_mask=with_padding_mask,
+            )
         )
-        num_tokens = routing_map.shape[0]
-        total_num_tokens = num_tokens * self.tp_cp_group.size()
 
         aux_loss = switch_load_balancing_loss_func(
             probs=scores_for_aux_loss,
-            tokens_per_expert=tokens_per_expert,
+            tokens_per_expert=global_tokens_per_expert,
             total_num_tokens=total_num_tokens,
             topk=self.topk,
             num_experts=self.config.num_moe_experts,
@@ -291,7 +303,12 @@ def _apply_aux_loss(
             fused=self.config.moe_router_fusion,
         )
         probs = self.attach_and_log_load_balancing_loss(
-            probs, aux_loss_coeff, aux_loss, "load_balancing_loss", self.tp_cp_group
+            probs,
+            aux_loss_coeff,
+            aux_loss,
+            "load_balancing_loss",
+            self.tp_cp_group,
+            valid_token_count=local_num_tokens,
         )
         return probs
 
@@ -302,6 +319,7 @@ def _apply_seq_aux_loss(
         routing_map: torch.Tensor,
         seq_length: int,
         bsz: int,
+        with_padding_mask: bool = False,
     ):
         """Apply the sequence-level auxiliary loss for the given scores and routing map.
 
@@ -315,17 +333,21 @@ def _apply_seq_aux_loss(
             return probs
 
         scores_for_aux_loss = scores_for_aux_loss.reshape(seq_length, -1)
-        tokens_per_expert = routing_map.reshape(seq_length, -1).sum(dim=0)
-        tokens_per_expert = reduce_from_tensor_model_parallel_region(
-            tokens_per_expert, self.tp_cp_group
+        routing_map = routing_map.reshape(seq_length, -1)
+
+        global_tokens_per_expert, local_num_tokens, total_num_tokens = (
+            get_tokens_per_expert_and_token_count(
+                routing_map=routing_map,
+                reduce_group=self.tp_cp_group,
+                with_padding_mask=with_padding_mask,
+                topk=self.topk * bsz,
+            )
         )
 
-        total_num_tokens = seq_length * self.tp_cp_group.size()
-
         aux_loss = (
             switch_load_balancing_loss_func(
                 probs=scores_for_aux_loss,
-                tokens_per_expert=tokens_per_expert,
+                tokens_per_expert=global_tokens_per_expert,
                 total_num_tokens=total_num_tokens,
                 topk=self.topk,
                 num_experts=self.config.num_moe_experts,
@@ -334,31 +356,43 @@ def _apply_seq_aux_loss(
             )
             / bsz
         )
+
         probs = self.attach_and_log_load_balancing_loss(
-            probs, seq_aux_loss_coeff, aux_loss, "seq_load_balancing_loss", self.tp_cp_group
+            probs,
+            seq_aux_loss_coeff,
+            aux_loss,
+            "seq_load_balancing_loss",
+            self.tp_cp_group,
+            valid_token_count=local_num_tokens,
         )
         return probs
 
     def _apply_global_aux_loss(
-        self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor
+        self,
+        probs: torch.Tensor,
+        scores_for_aux_loss: torch.Tensor,
+        routing_map: torch.Tensor,
+        with_padding_mask: bool = False,
     ):
         """Apply the global auxiliary loss for the given scores and routing map."""
         global_aux_loss_coeff = self.get_aux_loss_coeff("global_aux_loss")
         if global_aux_loss_coeff == 0:
             return probs
 
-        tokens_per_expert = routing_map.sum(dim=0)
-        tokens_per_expert = reduce_from_tensor_model_parallel_region(
-            tokens_per_expert, self.tp_dp_cp_group
+        # Use unified function to compute tokens_per_expert and num_tokens
+        global_tokens_per_expert, local_num_tokens, total_num_tokens = (
+            get_tokens_per_expert_and_token_count(
+                routing_map=routing_map,
+                reduce_group=self.tp_dp_cp_group,
+                with_padding_mask=with_padding_mask,
+                topk=self.topk,
+            )
         )
 
-        self.global_tokens_per_expert += tokens_per_expert
+        self.global_tokens_per_expert += global_tokens_per_expert
         self.ga_steps += 1
         averated_tokens_per_expert = self.global_tokens_per_expert / self.ga_steps
 
-        num_tokens = scores_for_aux_loss.shape[0]
-        total_num_tokens = num_tokens * self.tp_dp_cp_group.size()
-
         global_aux_loss = switch_load_balancing_loss_func(
             probs=scores_for_aux_loss,
             tokens_per_expert=averated_tokens_per_expert,
@@ -375,6 +409,7 @@ def _apply_global_aux_loss(
             "global_load_balancing_loss",
             self.tp_dp_cp_group,
             reduce_group_has_dp=True,
+            valid_token_count=local_num_tokens,
         )
         return probs
 
@@ -386,18 +421,22 @@ def attach_and_log_load_balancing_loss(
         aux_loss_name: str,
         reduce_group: torch.distributed.ProcessGroup,
         reduce_group_has_dp: bool = False,
+        valid_token_count: Optional[Union[int, torch.Tensor]] = None,
     ):
         """Attach aux loss function to activation and add to logging.
 
         Args:
-            activation (torch.Tensor): The activation tensor to attach the loss to.
-            aux_loss_coeff (float): The coefficient for the auxiliary loss.
-            aux_loss (torch.Tensor): The auxiliary loss tensor.
-            aux_loss_name (str): The name of the auxiliary loss for logging.
-            reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss.
+            activation (torch.Tensor): Activation tensor to attach the aux loss to.
+            aux_loss_coeff (float): Coefficient for the aux loss.
+            aux_loss (torch.Tensor): Computed aux loss.
+            aux_loss_name (str): Name of the aux loss for logging.
+            reduce_group (torch.distributed.ProcessGroup): Process group for reduction.
             reduce_group_has_dp (bool): Whether the reduce group has data parallel ranks.
                 Set this to True if the reduce group has data parallel ranks. This flag is used to
                 ensure the correct reduction in aux loss tracking.
+            valid_token_count (int or torch.Tensor, optional): Number of valid tokens excluding
+                padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor).
+                If None, uses activation.shape[0]. Defaults to None.
         """
         # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly
         # add the aux loss logging value to other layer's since it is difficult to get the
@@ -422,17 +461,22 @@ def attach_and_log_load_balancing_loss(
             # which scales both the main_loss gradient and aux_loss gradient by
             # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads function.
             # To correct this scaling, we need to scale the aux_loss by num_local_tokens here.
-            activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * activation.shape[0])
+            # Use valid_token_count (excluding padding) if provided, otherwise use total tokens.
+            num_tokens = valid_token_count if valid_token_count is not None else activation.shape[0]
+            activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * num_tokens)
         else:
             activation = MoEAuxLossAutoScaler.apply(activation, aux_loss)
         return activation
 
-    def apply_z_loss(self, logits):
+    def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None):
         """Encourages the router's logits to remain small to enhance stability.
         Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details.
 
         Args:
             logits (torch.Tensor): The logits of the router.
+            padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                                   Shape in [num_tokens]. True for valid tokens,
+                                                   False for padding tokens. Defaults to None.
 
         Returns:
             torch.Tensor: The logits after applying the z-loss.
@@ -440,8 +484,7 @@ def apply_z_loss(self, logits):
         if self.config.moe_z_loss_coeff is not None and self.training and torch.is_grad_enabled():
             # Skip Z loss calculations when using torch.no_grad() or checkpointing.
             moe_z_loss_coeff = self.config.moe_z_loss_coeff / self.tp_cp_group.size()
-            z_loss = z_loss_func(logits, moe_z_loss_coeff)
-            scale_up = 1.0
+            z_loss = z_loss_func(logits, moe_z_loss_coeff, padding_mask=padding_mask)
             if self.calculate_per_token_loss:
                 # The expected final scaling for z_loss gradients is
                 # 1/(num_micro_batches * dp_size).
@@ -450,7 +493,9 @@ def apply_z_loss(self, logits):
                 # which scales both the main_loss gradient and z_loss gradient by
                 # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads().
                 # To correct this scaling, we need to scale the z_loss by num_local_tokens here.
-                logits = MoEAuxLossAutoScaler.apply(logits, z_loss * logits.shape[0])
+                # Count valid tokens: sum of inverted mask (False -> True = valid)
+                num_tokens = (~padding_mask).sum() if padding_mask is not None else logits.shape[0]
+                logits = MoEAuxLossAutoScaler.apply(logits, z_loss * num_tokens)
             else:
                 logits = MoEAuxLossAutoScaler.apply(logits, z_loss)
 
@@ -484,20 +529,27 @@ def apply_input_jitter(self, input: torch.Tensor):
             return input
 
     @jit_fuser
-    def _apply_expert_bias(self, routing_map: torch.Tensor):
+    def _apply_expert_bias(
+        self, routing_map: torch.Tensor, padding_mask: Optional[torch.Tensor] = None
+    ):
         """
         Update expert bias and tokens_per_expert
         Prevent extra local tokens accumulation on evaluation or activation recomputation
         """
         if self.enable_expert_bias and torch.is_grad_enabled():
             with torch.no_grad():
+                if padding_mask is not None:
+                    routing_map = routing_map & (~padding_mask)
                 self.local_tokens_per_expert += routing_map.sum(dim=0)
 
-    def routing(self, logits: torch.Tensor):
+    def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
         """Top-k routing function
 
         Args:
             logits (torch.Tensor): Logits tensor after gating.
+            padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                                   Shape [seq_length, bsz]. True for valid tokens,
+                                                   False for padding tokens. Defaults to None.
 
         Returns:
             probs (torch.Tensor): The probabilities of token to experts assignment.
@@ -507,8 +559,12 @@ def routing(self, logits: torch.Tensor):
         seq_length, bsz = logits.shape[:2]
         logits = logits.view(-1, self.config.num_moe_experts)
 
+        # Flatten padding_mask to [num_tokens] if provided
+        if padding_mask is not None:
+            padding_mask = padding_mask.reshape(-1)
+
         # Apply Z-Loss
-        logits = self.apply_z_loss(logits)
+        logits = self.apply_z_loss(logits, padding_mask=padding_mask)
 
         # Calculate probs and routing_map for token dispatching
         if self.routing_type == "sinkhorn":
@@ -524,6 +580,7 @@ def routing(self, logits: torch.Tensor):
                 score_function=self.score_function,
                 expert_bias=self.expert_bias,
                 fused=self.config.moe_router_fusion,
+                router_replay=self.router_replay,
             )
 
         # Apply token dropping to probs and routing_map.
@@ -541,18 +598,35 @@ def routing(self, logits: torch.Tensor):
         if self.training and torch.is_grad_enabled() and self.is_aux_loss_enabled():
             # Calculate scores and routing_map for aux loss
             routing_map_for_aux_loss, scores_for_aux_loss = compute_routing_scores_for_aux_loss(
-                logits, self.topk, self.score_function, fused=self.config.moe_router_fusion
+                logits,
+                self.topk,
+                self.score_function,
+                fused=self.config.moe_router_fusion,
+                padding_mask=padding_mask,
+            )
+            probs = self._apply_aux_loss(
+                probs,
+                scores_for_aux_loss,
+                routing_map_for_aux_loss,
+                with_padding_mask=padding_mask is not None,
             )
-            probs = self._apply_aux_loss(probs, scores_for_aux_loss, routing_map_for_aux_loss)
             probs = self._apply_seq_aux_loss(
-                probs, scores_for_aux_loss, routing_map_for_aux_loss, seq_length, bsz
+                probs,
+                scores_for_aux_loss,
+                routing_map_for_aux_loss,
+                seq_length,
+                bsz,
+                with_padding_mask=padding_mask is not None,
             )
             probs = self._apply_global_aux_loss(
-                probs, scores_for_aux_loss, routing_map_for_aux_loss
+                probs,
+                scores_for_aux_loss,
+                routing_map_for_aux_loss,
+                with_padding_mask=padding_mask is not None,
             )
 
         # Optionally apply expert bias
-        self._apply_expert_bias(routing_map)
+        self._apply_expert_bias(routing_map, padding_mask=padding_mask)
 
         return probs, routing_map
 
@@ -562,12 +636,15 @@ def reset_global_aux_loss_tracker(self):
             self.global_tokens_per_expert.zero_()
             self.ga_steps.zero_()
 
-    def forward(self, input: torch.Tensor):
+    def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
         """
         Forward pass of the router.
 
         Args:
             input (torch.Tensor): Input tensor.
+            padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
+                                                   Shape [seq_length, bsz]. True for valid tokens,
+                                                   False for padding tokens. Defaults to None.
         """
         self._maintain_float32_expert_bias()
 
@@ -579,7 +656,7 @@ def forward(self, input: torch.Tensor):
             # Apply force load balancing with random logits for benchmark
             logits = apply_random_logits(logits)
 
-        probs, routing_map = self.routing(logits)
+        probs, routing_map = self.routing(logits, padding_mask=padding_mask)
 
         return probs, routing_map
 
diff --git a/megatron/core/transformer/moe/router_replay.py b/megatron/core/transformer/moe/router_replay.py
new file mode 100644
index 00000000000..b6b8e26a0a6
--- /dev/null
+++ b/megatron/core/transformer/moe/router_replay.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+from enum import Enum
+from typing import Callable, List, Optional, Tuple
+
+import torch
+
+
+class RouterReplayAction(Enum):
+    """
+    A Enum to define the actions for router replay.
+    """
+
+    RECORD = "record"  # Record the topk indices for replay
+    REPLAY_FORWARD = "replay_forward"  # Replay the recorded topk indices for forward pass
+    REPLAY_BACKWARD = "replay_backward"  # Replay topk indices for re-compute during backward pass
+
+
+class RouterReplay:
+    """
+    A class to manage the recording and replaying of MoE routing decisions.
+    It holds all router instances and provides static methods to globally
+    control recording and replaying.
+    """
+
+    # Static variable to hold all router instances, one per MoE layer.
+    global_router_replay_instances: List['RouterReplay'] = []
+
+    @staticmethod
+    def set_replay_data(all_layers_topk_indices: List[torch.Tensor]):
+        """
+        Distributes the topk indices for all layers to their respective RouterReplay instances.
+        :param all_layers_topk_indices: A list of tensors, where each tensor contains the
+                                        topk indices for a specific layer. The order
+                                        must match the instantiation order of the routers.
+        """
+        if len(all_layers_topk_indices) != len(RouterReplay.global_router_replay_instances):
+            raise ValueError(
+                f"The number of replay tensors ({len(all_layers_topk_indices)}) "
+                f"does not match instances ({len(RouterReplay.global_router_replay_instances)})."
+            )
+        for i, router_instance in enumerate(RouterReplay.global_router_replay_instances):
+            router_instance.set_target_indices(all_layers_topk_indices[i])
+
+    @staticmethod
+    def get_recorded_data() -> List[torch.Tensor]:
+        """
+        Collects the recorded topk indices from all RouterReplay instances.
+        :return: A list of tensors, each containing the recorded topk indices for a layer.
+        """
+        return [
+            router.get_recorded_indices() for router in RouterReplay.global_router_replay_instances
+        ]
+
+    @staticmethod
+    def clear_global_indices():
+        """Clears the recorded and target topk indices in all instances."""
+        for router in RouterReplay.global_router_replay_instances:
+            router.clear_indices()
+
+    @staticmethod
+    def set_global_router_replay_action(router_replay_action: RouterReplayAction):
+        """Sets the router replay action for all router instances."""
+        for router in RouterReplay.global_router_replay_instances:
+            router.set_router_replay_action(router_replay_action)
+
+    @staticmethod
+    def clear_global_router_replay_action():
+        """Clears the router replay action for all router instances."""
+        for router in RouterReplay.global_router_replay_instances:
+            router.clear_router_replay_action()
+
+    @staticmethod
+    def clear_global_router_replay_instances():
+        """Clear the global list of router replay instances to prevent memory leaks."""
+        RouterReplay.global_router_replay_instances.clear()
+
+    def __init__(self):
+        """Initializes a RouterReplay instance for a specific layer."""
+        self.target_topk_idx: Optional[torch.Tensor] = None  # Target topk indices for replay
+        self.recorded_topk_idx: Optional[torch.Tensor] = None  # Recorded topk indices for replay
+        self.router_replay_action: Optional[RouterReplayAction] = (
+            None  # Router replay action for this layer
+        )
+        self.replay_backward_list: List[torch.Tensor] = (
+            []
+        )  # List of tensors for backward pass replay
+        RouterReplay.global_router_replay_instances.append(self)
+
+    def set_target_indices(self, topk_indices: torch.Tensor):
+        """Sets the target topk indices for replay."""
+        self.target_topk_idx = topk_indices
+        self.replay_backward_list.append(topk_indices)
+
+    def get_recorded_indices(self) -> Optional[torch.Tensor]:
+        """Returns the recorded topk indices."""
+        return self.recorded_topk_idx
+
+    def record_indices(self, topk_indices: torch.Tensor):
+        """Records the topk indices."""
+        self.recorded_topk_idx = topk_indices
+
+    def clear_indices(self):
+        """Clears the recorded and target topk indices."""
+        self.recorded_topk_idx = None
+        self.target_topk_idx = None
+        self.replay_backward_list = []
+
+    def set_router_replay_action(self, router_replay_action: RouterReplayAction):
+        """Sets the router replay action for this layer."""
+        self.router_replay_action = router_replay_action
+
+    def clear_router_replay_action(self):
+        """Clears the router replay action for this layer."""
+        self.router_replay_action = None
+
+    def get_replay_topk(
+        self,
+        scores: torch.Tensor,
+        topk: int,
+        num_groups: Optional[int] = None,
+        group_topk: Optional[int] = None,
+        default_compute_topk: Callable[
+            [torch.Tensor, int, Optional[int], Optional[int]], Tuple[torch.Tensor, torch.Tensor]
+        ] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        A wrapper for top-k computation that handles different replay actions.
+
+        Args:
+            scores (torch.Tensor): The scores to compute top-k on.
+            topk (int): The number of top elements to select.
+            num_groups (Optional[int]): Number of expert groups for group-limited routing.
+            group_topk (Optional[int]): Number of groups to select for each token.
+            default_compute_topk (Callable): The default top-k computation function, which
+                                             should return a tuple of (values, indices).
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing the top-k values and indices.
+        """
+        if self.router_replay_action == RouterReplayAction.RECORD:
+            probs, top_indices = default_compute_topk(
+                scores, topk, num_groups=num_groups, group_topk=group_topk
+            )
+            self.record_indices(top_indices)
+            return probs, top_indices
+        elif self.router_replay_action == RouterReplayAction.REPLAY_FORWARD:
+            top_indices = self.target_topk_idx
+            # Ensure indices are on the correct device
+            top_indices = top_indices.to(scores.device)
+            # Gather the scores for the replayed indices to get the probabilities
+            probs = scores.gather(1, top_indices)
+            return probs, top_indices
+        elif self.router_replay_action == RouterReplayAction.REPLAY_BACKWARD:
+            top_indices = self.replay_backward_list.pop(0)
+            # Ensure indices are on the correct device
+            top_indices = top_indices.to(scores.device)
+            # Gather the scores for the replayed indices to get the probabilities
+            probs = scores.gather(1, top_indices)
+            return probs, top_indices
+        else:
+            return default_compute_topk(scores, topk, num_groups, group_topk)
diff --git a/megatron/core/transformer/moe/shared_experts.py b/megatron/core/transformer/moe/shared_experts.py
index 25d5db0b979..35066b1a8b0 100644
--- a/megatron/core/transformer/moe/shared_experts.py
+++ b/megatron/core/transformer/moe/shared_experts.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
 import warnings
-from copy import deepcopy
+from copy import copy
 from typing import Optional
 
 import torch
@@ -43,13 +43,13 @@ def __init__(
         gate: bool,
         pg_collection: Optional[ProcessGroupCollection] = None,
     ):
-        config = deepcopy(config)
+        config = copy(config)
         assert config.add_bias_linear == False, "bias is not supported in the shared experts, "
         "please set '--disable-bias-linear' instead."
 
         config.ffn_hidden_size = config.moe_shared_expert_intermediate_size
         # TODO(Hepteract): pass pg_collection to MLP after refactoring MLP
-        super().__init__(config=config, submodules=submodules)
+        super().__init__(config=config, submodules=submodules, tp_group=pg_collection.tp)
 
         self.use_shared_expert_gate = gate
         if self.use_shared_expert_gate:
@@ -62,9 +62,11 @@ def __init__(
         else:
             self.gate_weight = None
 
-        if (self.config.fp8 and is_te_min_version("2.6.0dev0")) or (
-            self.config.fp4 and is_te_min_version("2.7.0.dev0")
-        ):
+        if (
+            self.config.fp8
+            and self.config.fp8_recipe != 'delayed'
+            and is_te_min_version("2.6.0dev0")
+        ) or (self.config.fp4 and is_te_min_version("2.7.0.dev0")):
             # For fp8/fp4 training, the output of pre_mlp_layernorm is saved by router, and
             # the shared expert linear_fc1 also saves the quantized tensor of this output.
             # Here we set the linear_fc1 to save the original input tensors to avoid the extra
@@ -120,7 +122,7 @@ def __init__(
             if self.stream is None:
                 self.stream = torch.cuda.Stream()
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """Forward function"""
         output, _ = super().forward(hidden_states)
         if self.use_shared_expert_gate:
@@ -139,7 +141,11 @@ def sharded_state_dict(
             state_dict = self.state_dict(prefix='', keep_vars=True)
             sub_sd = {
                 f'{prefix}{name}': make_sharded_tensor_for_checkpoint(
-                    state_dict[name], f'{prefix}{name}', prepend_offsets=sharded_offsets
+                    state_dict[name],
+                    f'{prefix}{name}',
+                    prepend_offsets=sharded_offsets,
+                    tp_group=self.tp_group,
+                    dp_cp_group=metadata['dp_cp_group'],
                 )
             }
             sharded_state_dict.update(sub_sd)
diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py
index c7c7ff147e5..f2e26c63cf5 100644
--- a/megatron/core/transformer/moe/token_dispatcher.py
+++ b/megatron/core/transformer/moe/token_dispatcher.py
@@ -16,6 +16,7 @@
     gather_from_sequence_parallel_region,
     reduce_scatter_to_sequence_parallel_region,
 )
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.moe.fused_a2a import (
     fused_combine,
     fused_dispatch,
@@ -76,6 +77,11 @@ def __init__(
         self.tp_rank = utils.get_pg_rank(self.tp_group)
         self.ep_size = utils.get_pg_size(self.ep_group)
 
+        # Attributes that need to be captured in cudagraph. These attributes are returned
+        # as cudagraph outputs when the cuda_graph_scope contains moe_preprocess.
+        self.cudagraph_attrs = []
+        self.valid_cudagraph_attrs = None
+
     @abstractmethod
     def dispatch_preprocess(
         self, tokens: torch.Tensor, routing_map: torch.Tensor, probs: torch.Tensor
@@ -233,6 +239,10 @@ def __init__(
         # device token permutation is enabled and **AllGahter** is performed.
         self.global_local_map = None
 
+        # Attributes that need to be captured in cudagraph. These attributes are returned
+        # as cudagraph outputs when the cuda_graph_scope contains moe_preprocess.
+        self.cudagraph_attrs = ['routing_map']
+
     def dispatch_preprocess(
         self, hidden_states: torch.Tensor, routing_map: torch.Tensor, probs: torch.Tensor
     ):
@@ -426,11 +436,36 @@ def __init__(
             "no_sync": 4,
         }
         self.cuda_dtoh_point = "before_permutation_1"
+        if config.cuda_graph_impl != "none" and (
+            CudaGraphScope.moe_preprocess in config.cuda_graph_scope
+            or not self.config.cuda_graph_scope
+        ):
+            self.cuda_dtoh_point = "before_ep_alltoall"
         if MoEAlltoAllTokenDispatcher.cuda_dtoh_stream is None:
             MoEAlltoAllTokenDispatcher.cuda_dtoh_stream = torch.cuda.Stream()
 
+        # Attributes that need to be captured in cudagraph. These attributes are returned
+        # as cudagraph outputs when the cuda_graph_scope contains moe_preprocess.
+        self.cudagraph_attrs = [
+            'tokens_per_expert',
+            'input_splits',
+            'output_splits',
+            'output_splits_tp',
+            'num_out_tokens',
+            'num_global_tokens_per_local_expert',
+            'reversed_local_input_permutation_mapping',
+            'routing_map',
+        ]
+
         self.shared_experts = None
 
+    def set_shared_experts(self, shared_experts):
+        """Set shared expert to the dispatcher."""
+        super().set_shared_experts(shared_experts)
+        if shared_experts.use_shared_expert_gate:
+            self.cudagraph_attrs.append('shared_experts.gate_score')
+        self.cudagraph_attrs.append('shared_experts.cached_fc1_input')
+
     def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor:
         """
         Preprocesses the token routing map for All-to-All communication and token permutation.
@@ -825,7 +860,7 @@ def _maybe_update_cuda_sync_point(self, point: str):
             self.cuda_sync_point = point
 
     def _maybe_dtoh_and_synchronize(
-        self, point: str, tokens_per_expert: torch.Tensor = None
+        self, point: str, tokens_per_expert: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         """
         Move all possible GPU tensors to CPU and make a synchronization at the expected point.
@@ -947,11 +982,8 @@ def __init__(
         if self.drop_and_pad:
             assert self.capacity_factor is not None
         self.capacity = None
-        # The up-bound for the number of tokens after dispatch op, -1 means no up-bound,
-        # which will cause a CPU sync
-        self.num_dispatched_tokens = None
-        # Actually the sum of tokens_per_expert, the up-bound for the number of tokens
-        # after permute op, -1 means no up-bound, will cause a CPU sync
+        # Actually the the up-bound for the number of tokens
+        # after permute op, None means no up-bound, will cause a CPU sync
         self.num_permuted_tokens = None
 
         # Metadata
@@ -980,12 +1012,9 @@ def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
                 num_experts=self.num_experts,
                 capacity_factor=self.capacity_factor,
             )
-            # We cannot predict the actual number of tokens after the dispatch op,
-            # so we set it to the worst case in drop_and_pad mode
-            self.num_dispatched_tokens = self.capacity * self.group.size() * self.num_local_experts
             # In drop_and_pad mode, the number of tokens after the permute op
             # can be computed on the CPU
-            self.num_permuted_tokens = self.num_dispatched_tokens
+            self.num_permuted_tokens = self.capacity * self.group.size() * self.num_local_experts
             self.tokens_per_expert = torch.full(
                 (self.num_local_experts,), self.capacity * self.group.size(), dtype=torch.long
             )
@@ -1014,7 +1043,6 @@ def dispatch(
                 num_local_experts=self.num_local_experts,
                 num_sms_dispatch_api=self.config.moe_hybridep_num_sms,
                 num_sms_combine_api=self.config.moe_hybridep_num_sms,
-                num_dispatched_tokens=self.num_dispatched_tokens,
                 num_permuted_tokens=self.num_permuted_tokens,
                 pad_multiple=self.pad_multiple,
             )
@@ -1036,14 +1064,15 @@ def combine(
         hidden_states = hybrid_ep_combine(
             x=hidden_states,
             handle=self.handle,
-            num_dispatched_tokens=self.num_dispatched_tokens,
             num_permuted_tokens=self.num_permuted_tokens,
             pad_multiple=self.pad_multiple,
         )
-        # Release the used handle/num_permuted_tokens which could change in each iteration
+        # Release the used handle/num_permuted_tokens which could change in each iteration.
+        # For drop_and_pad mode, we don't need to reset the num_permuted_tokens and
+        # num_dispatched_tokens, because their values never change.
         self.handle = None
-        self.num_permuted_tokens = None
-        self.num_dispatched_tokens = None
+        if not self.drop_and_pad:
+            self.num_permuted_tokens = None
         return hidden_states
 
     def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -1323,6 +1352,7 @@ def __init__(
                 num_experts=self.tp_size * self.config.num_moe_experts,
                 config=self.config,
             )
+            self.cudagraph_attrs = ['_comm_manager.token_probs', '_comm_manager.token_indices']
         elif self.config.moe_flex_dispatcher_backend == "hybridep":
             self._comm_manager = _HybridEPManager(
                 group=self.tp_ep_group,
@@ -1330,6 +1360,7 @@ def __init__(
                 num_experts=self.tp_size * self.config.num_moe_experts,
                 config=self.config,
             )
+            self.cudagraph_attrs = ['_comm_manager.token_probs', '_comm_manager.routing_map']
         else:
             raise ValueError(
                 f"Invalid backend: {self.config.moe_flex_dispatcher_backend}"
@@ -1399,7 +1430,7 @@ def dispatch_preprocess(
     def token_dispatch(
         self,
         hidden_states: torch.Tensor,
-        probs: torch.Tensor = None,
+        probs: Optional[torch.Tensor] = None,
         async_finish: bool = True,
         allocate_on_comm_stream: bool = True,
     ):
diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py
index a0bc52a420d..cd3db50a35b 100644
--- a/megatron/core/transformer/multi_latent_attention.py
+++ b/megatron/core/transformer/multi_latent_attention.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 
 import math
@@ -15,13 +15,16 @@
     HAVE_EINOPS = False
 
 
-from megatron.core import parallel_state, tensor_parallel
+from megatron.core import tensor_parallel
 from megatron.core.models.common.embeddings import (
     RotaryEmbedding,
     YarnRotaryEmbedding,
     _yarn_get_mscale,
     apply_rotary_pos_emb,
 )
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear
 from megatron.core.tensor_parallel.mappings import (
@@ -36,7 +39,7 @@
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.transformer_config import MLATransformerConfig
-from megatron.core.utils import deprecate_inference_params, is_te_min_version
+from megatron.core.utils import deprecate_inference_params, get_pg_size, is_te_min_version
 
 try:
     from megatron.core.fusions.fused_mla_yarn_rope_apply import (
@@ -87,12 +90,12 @@ class MultiLatentAttention(Attention):
     def __init__(
         self,
         config: MLATransformerConfig,
-        submodules: Union[MLASelfAttentionSubmodules],
+        submodules: MLASelfAttentionSubmodules,
         layer_number: int,
         attn_mask_type: AttnMaskType,
         attention_type: str,
         cp_comm_type: Optional[str] = None,
-        pg_collection: ProcessGroupCollection = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ) -> None:
 
         super().__init__(
@@ -103,6 +106,7 @@ def __init__(
             attn_mask_type=attn_mask_type,
             pg_collection=pg_collection,
         )
+        self.config: MLATransformerConfig
 
         self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads
 
@@ -173,6 +177,7 @@ def __init__(
             skip_bias_add=True,
             is_expert=False,
             tp_comm_buffer_name='proj',
+            tp_group=self.pg_collection.tp,
         )
 
         if (
@@ -238,13 +243,18 @@ def forward(
         # Get the query, key and value tensors based on the type of attention -
         # self or cross attn.
         # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128]
-        query, key, value = self.get_query_key_value_tensors(
-            hidden_states,
-            key_value_states,
-            position_ids,
-            packed_seq_params,
-            inference_context=inference_context,
-        )
+        with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states:
+            query, key, value, q_compressed, kv_compressed = self.get_query_key_value_tensors(
+                hidden_states,
+                key_value_states,
+                position_ids,
+                packed_seq_params,
+                inference_context=inference_context,
+            )
+        if self.offload_qkv_linear:
+            query = off_interface.group_commit(
+                query, name="qkv_linear", forced_released_tensors=[hidden_states]
+            )
 
         # ===================================================
         # Adjust key, value for inference
@@ -272,14 +282,24 @@ def forward(
             )
         else:
             if inference_context is None or inference_context.is_static_batching():
-                core_attn_out = self.core_attention(
-                    query,
-                    key,
-                    value,
-                    attention_mask,
-                    packed_seq_params=packed_seq_params,
-                    attn_mask_type=attn_mask_type,
-                )
+                extra_kwargs = {}
+                if self.config.experimental_attention_variant == "dsa":
+                    # For dsa we need to pass in the original hidden states and the compressed
+                    # query representation.
+                    extra_kwargs["x"] = hidden_states
+                    extra_kwargs["qr"] = q_compressed
+                with off_interface(
+                    self.offload_core_attention and self.training, query, "core_attn"
+                ) as query:
+                    core_attn_out = self.core_attention(
+                        query,
+                        key,
+                        value,
+                        attention_mask,
+                        packed_seq_params=packed_seq_params,
+                        attn_mask_type=attn_mask_type,
+                        **extra_kwargs,
+                    )
             elif self.cache_mla_latents:
                 # Dynamic batching attention kernel.
                 q, k, v = (query, key, value)
@@ -300,6 +320,10 @@ def forward(
                 # Only rearrange if not in absorption mode (Flash MLA handles format correctly)
                 if not inference_context.is_decode_only():
                     core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)')
+            if self.offload_core_attention and self.training:
+                core_attn_out = off_interface.group_commit(
+                    core_attn_out, name="core_attn", forced_released_tensors=[query, key, value]
+                )
 
         # We are doing absorption with cache mla latents and decode mode.
         if self.cache_mla_latents and inference_context.is_decode_only():
@@ -325,7 +349,12 @@ def forward(
         # =================
         # Output. [sq, b, h]
         # =================
-        output, bias = self.linear_proj(core_attn_out)
+        with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out:
+            output, bias = self.linear_proj(core_attn_out)
+        if self.offload_attn_proj:
+            output = off_interface.group_commit(
+                output, name="attn_proj", forced_released_tensors=[core_attn_out]
+            )
 
         return output, bias
 
@@ -344,8 +373,11 @@ def __init__(
         layer_number: int,
         attn_mask_type=AttnMaskType.padding,
         cp_comm_type: Optional[str] = None,
-        pg_collection: ProcessGroupCollection = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+
         super().__init__(
             config=config,
             submodules=submodules,
@@ -395,6 +427,11 @@ def __init__(
                 is_expert=False,
                 tp_comm_buffer_name='q_down_proj',
                 skip_weight_param_allocation=False,
+                tp_group=(
+                    pg_collection.tp
+                    if q_down_proj_kwargs.get('parallel_mode') != 'duplicated'
+                    else None
+                ),
                 **q_down_proj_kwargs,
             )
 
@@ -409,6 +446,7 @@ def __init__(
                 skip_bias_add=False,
                 is_expert=False,
                 tp_comm_buffer_name='q_up_proj',
+                tp_group=pg_collection.tp,
             )
 
         kv_down_proj_kwargs = {}
@@ -434,6 +472,11 @@ def __init__(
             is_expert=False,
             tp_comm_buffer_name='kv_down_proj',
             skip_weight_param_allocation=False,
+            tp_group=(
+                pg_collection.tp
+                if kv_down_proj_kwargs.get('parallel_mode') != 'duplicated'
+                else None
+            ),
             **kv_down_proj_kwargs,
         )
 
@@ -448,6 +491,7 @@ def __init__(
             skip_bias_add=False,
             is_expert=False,
             tp_comm_buffer_name='kv_up_proj',
+            tp_group=pg_collection.tp,
         )
 
         if self.config.q_lora_rank is not None:
@@ -483,6 +527,11 @@ def get_query_key_value_tensors(
         assert (
             hidden_states.ndim == 3
         ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D"
+        if packed_seq_params is not None:
+            assert (
+                packed_seq_params.local_cp_size is None
+            ), "hybrid_context_parallel is not supported with MLA yet and is planned for future. \
+            Please disable hybrid_context_parallel."
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
@@ -514,7 +563,7 @@ def get_query_key_value_tensors(
             else:
                 rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=packed_seq)
 
-        if packed_seq_params is not None:
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
             if packed_seq_params.cu_seqlens_q_padded is not None:
                 cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded
             else:
@@ -568,12 +617,9 @@ def get_query_key_value_tensors(
             kv_compressed, k_pos_emb = torch.split(
                 kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1
             )
-            if (
-                parallel_state.get_tensor_model_parallel_world_size() > 1
-                and self.config.sequence_parallel
-            ):
+            if get_pg_size(self.tp_group) > 1 and self.config.sequence_parallel:
                 # k_pos_emb: [s, b, qk_pos_emb_head_dim]
-                k_pos_emb = gather_from_sequence_parallel_region(k_pos_emb)
+                k_pos_emb = gather_from_sequence_parallel_region(k_pos_emb, group=self.tp_group)
 
         if packed_seq_params is not None:
             # If sequence packing, TE expect [t, h, d] shaped qkv input.
@@ -583,6 +629,16 @@ def get_query_key_value_tensors(
             kv_compressed = kv_compressed.squeeze(1)
             k_pos_emb = k_pos_emb.squeeze(1)
 
+        # =========================================
+        # Apply norm
+        # =========================================
+
+        if self.config.q_lora_rank is not None:
+            # q_compressed: [num_tokens, q_lora_rank]
+            q_compressed = self.q_layernorm(q_compressed)
+
+        kv_compressed = self.kv_layernorm(kv_compressed)
+
         # =========================================
         # QKV up projection and RoPE apply
         # =========================================
@@ -593,7 +649,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv(
             if self.config.q_lora_rank is not None:
                 # q_compressed: [num_tokens, q_lora_rank]
                 # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)]
-                q_compressed = self.q_layernorm(q_compressed)
                 q, _ = self.linear_q_up_proj(q_compressed)
             else:
                 # q_compressed: [num_tokens, hidden_size]
@@ -603,8 +658,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv(
             # q: [num_tokens, n, q_head_dim]
             q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim)
 
-            kv_compressed = self.kv_layernorm(kv_compressed)
-
             # [num_tokens, qk_pos_emb_head_dim] -> [num_tokens, 1, qk_pos_emb_head_dim]
             k_pos_emb = torch.unsqueeze(k_pos_emb, -2)
 
@@ -668,7 +721,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             if self.config.q_lora_rank is not None:
                 # q_compressed: [num_tokens, q_lora_rank]
                 # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)]
-                q_compressed = self.q_layernorm(q_compressed)
                 q, _ = self.linear_q_up_proj(q_compressed)
             else:
                 # q_compressed: [num_tokens, hidden_size]
@@ -678,8 +730,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             # q: [num_tokens, n, q_head_dim]
             q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim)
 
-            kv_compressed = self.kv_layernorm(kv_compressed)
-
             # kv: [num_tokens, n * (qk_head_dim + v_head_dim)]
             kv, _ = self.linear_kv_up_proj(kv_compressed)
 
@@ -804,7 +854,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb
                 )
 
-        return query, key, value
+        return query, key, value, q_compressed, kv_compressed
 
     def uncompress_kv_from_cache(self, kv_cached):
         """
@@ -923,3 +973,123 @@ def set_for_recompute_input_layernorm(self):
         if self.config.q_lora_rank is not None:
             set_save_original_input(self.linear_q_down_proj)
         set_save_original_input(self.linear_kv_down_proj)
+
+    def clip_qk(self):
+        """
+        QK Clipping is a technique to clip the query and key attention logits to prevent the
+        attention logits from exploding. Per MuonClip usage, we update the weight by calling this
+        function after Muon optimizer step.
+        """
+
+        if not self.config.qk_clip:
+            raise ValueError("qk_clip option needs to be enabled")
+
+        if self.core_attention.current_max_attn_logits is None:
+            raise ValueError("current_max_attn_logits is None")
+
+        # Check if we're in absorption mode
+        if self.cache_mla_latents and not hasattr(self, 'linear_kv_up_proj'):
+            raise ValueError(
+                "qk_clip is not supported when cache_mla_latents is enabled and absorption is "
+                "active. The linear_kv_up_proj layer has been deleted during absorption "
+                "preparation."
+            )
+
+        assert self.core_attention.current_max_attn_logits.shape == (
+            self.num_attention_heads_per_partition,
+        ), f"current_max_attn_logits shape is not ({self.num_attention_heads_per_partition}, ) \
+                    but {self.core_attention.current_max_attn_logits.shape}"
+
+        # only update the weight if any head has
+        # current_max_attn_logits > qk_clip_threshold
+        if torch.any(self.core_attention.current_max_attn_logits > self.config.qk_clip_threshold):
+            # Use num_attention_heads_per_partition for tensor parallel scenarios
+
+            # qk_clip_balancing_eta (n, 1, 1)
+            assert self.core_attention.current_max_attn_logits.shape == (
+                self.num_attention_heads_per_partition,
+            ), f"current_max_attn_logits shape is not ({self.num_attention_heads_per_partition},) \
+                but {self.core_attention.current_max_attn_logits.shape}"
+            self.qk_clip_balancing_eta = torch.clamp(
+                self.config.qk_clip_threshold / self.core_attention.current_max_attn_logits, max=1.0
+            ).view(self.num_attention_heads_per_partition, 1, 1)
+            assert torch.all(self.qk_clip_balancing_eta <= 1.0)
+
+            # Update q side weight, keep qk_pos_emb_head_dim side weight unchanged
+            if self.config.q_lora_rank is None:
+                q_proj_weight = self.linear_q_proj.weight
+            else:
+                q_proj_weight = self.linear_q_up_proj.weight
+
+            # Handle different weight access patterns (main_param vs direct access)
+            if hasattr(q_proj_weight, 'main_param'):
+                q_proj_weight.main_param.data.copy_(
+                    self._clip_q_proj_weight(q_proj_weight.main_param.data)
+                )
+            q_proj_weight.data.copy_(self._clip_q_proj_weight(q_proj_weight.data))
+
+            # Update k side weight, keep v side weight unchanged
+            kv_proj_weight = self.linear_kv_up_proj.weight
+
+            # Handle different weight access patterns
+            if hasattr(kv_proj_weight, 'main_param'):
+                kv_proj_weight.main_param.data.copy_(
+                    self._clip_kv_proj_weight(kv_proj_weight.main_param.data)
+                )
+            kv_proj_weight.data.copy_(self._clip_kv_proj_weight(kv_proj_weight.data))
+
+        # reset current_max_attn_logits
+        self.core_attention.current_max_attn_logits = None
+
+    def _clip_q_proj_weight(self, weight):
+        """Clip q_proj_weight"""
+        # Reshape to (n, a + b, -1)
+        weight_reshaped = weight.view(
+            self.num_attention_heads_per_partition,
+            self.config.qk_head_dim + self.config.qk_pos_emb_head_dim,
+            -1,
+        )
+
+        # Split into qk_head_dim and qk_pos_emb_head_dim parts: (n, a, -1) and (n, b, -1)
+        weight_q_nope = weight_reshaped[:, : self.config.qk_head_dim, :]
+        weight_q_pe = weight_reshaped[:, self.config.qk_head_dim :, :]
+
+        # Clipping
+        weight_q_nope.mul_(torch.pow(self.qk_clip_balancing_eta, self.config.qk_clip_alpha))
+        weight_q_pe.mul_(self.qk_clip_balancing_eta)
+
+        # Concatenate back and reshape to original shape
+        weight_q_updated = torch.cat([weight_q_nope, weight_q_pe], dim=1)
+        weight_q_updated = weight_q_updated.view(
+            self.num_attention_heads_per_partition
+            * (self.config.qk_head_dim + self.config.qk_pos_emb_head_dim),
+            -1,
+        )
+
+        return weight_q_updated
+
+    def _clip_kv_proj_weight(self, weight):
+        """Clip kv_proj_weight"""
+        # shape: (n, qk_head_dim + v_head_dim, kv_lora_rank)
+        weight_reshaped = weight.view(
+            self.num_attention_heads_per_partition,
+            self.config.qk_head_dim + self.config.v_head_dim,
+            -1,
+        )
+
+        # Split into qk_head_dim and v_head_dim parts: (n, a, -1) and (n, b, -1)
+        weight_k = weight_reshaped[:, : self.config.qk_head_dim, :]
+        weight_v = weight_reshaped[:, self.config.qk_head_dim :, :]
+
+        # Clipping
+        weight_k.mul_(torch.pow(self.qk_clip_balancing_eta, 1 - self.config.qk_clip_alpha))
+
+        # Concatenate back and reshape to original shape
+        weight_kv_updated = torch.cat([weight_k, weight_v], dim=1)
+        weight_kv_updated = weight_kv_updated.view(
+            self.num_attention_heads_per_partition
+            * (self.config.qk_head_dim + self.config.v_head_dim),
+            -1,
+        )
+
+        return weight_kv_updated
diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py
index ca44946571b..2edb652bfc6 100755
--- a/megatron/core/transformer/multi_token_prediction.py
+++ b/megatron/core/transformer/multi_token_prediction.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+import warnings
 from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import Callable, List, Optional, Union
@@ -13,18 +14,19 @@
 from megatron.core.fp8_utils import get_fp8_context
 from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider
 from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.pipeline_parallel.utils import is_vp_last_stage
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel import (
     gather_from_tensor_model_parallel_region,
     scatter_to_sequence_parallel_region,
 )
-from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.enums import AttnMaskType, LayerType
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.transformer_layer import get_transformer_layer_offset
 from megatron.core.utils import (
+    get_pg_rank,
     is_torch_min_version,
     make_tp_sharded_tensor_for_checkpoint,
     make_viewless_tensor,
@@ -53,7 +55,11 @@
 
 
 def tie_word_embeddings_state_dict(
-    sharded_state_dict: ShardedStateDict, word_emb_weight: Tensor, word_emb_weight_key: str
+    sharded_state_dict: ShardedStateDict,
+    word_emb_weight: Tensor,
+    word_emb_weight_key: str,
+    tp_group: torch.distributed.ProcessGroup,
+    dp_cp_group: torch.distributed.ProcessGroup,
 ) -> None:
     """tie the embedding of the mtp processing stage in a given sharded state dict.
 
@@ -61,13 +67,15 @@ def tie_word_embeddings_state_dict(
         sharded_state_dict (ShardedStateDict): state dict with the weight to tie.
         word_emb_weight (Tensor): weight of the word embedding.
         word_emb_weight_key (str): key of the word embedding in the sharded state dict.
+        tp_group (torch.distributed.ProcessGroup): The tensor parallel group
+        dp_cp_group (torch.distributed.ProcessGroup): The dp-cp comm group
 
     Returns: None, acts in-place
     """
     mtp_word_emb_replica_id = (
         1,  # copy of embedding in pre processing stage
         0,
-        parallel_state.get_data_parallel_rank(with_context_parallel=True),
+        get_pg_rank(dp_cp_group),
     )
     assert word_emb_weight_key in sharded_state_dict
     del sharded_state_dict[word_emb_weight_key]
@@ -76,11 +84,17 @@ def tie_word_embeddings_state_dict(
         key=word_emb_weight_key,
         replica_id=mtp_word_emb_replica_id,
         allow_shape_mismatch=True,
+        tp_group=tp_group,
+        dp_cp_group=dp_cp_group,
     )
 
 
 def tie_output_layer_state_dict(
-    sharded_state_dict: ShardedStateDict, output_layer_weight: Tensor, output_layer_weight_key: str
+    sharded_state_dict: ShardedStateDict,
+    output_layer_weight: Tensor,
+    output_layer_weight_key: str,
+    tp_group: torch.distributed.ProcessGroup,
+    dp_cp_group: torch.distributed.ProcessGroup,
 ) -> None:
     """tie the output layer of the mtp processing stage in a given sharded state dict.
 
@@ -88,13 +102,15 @@ def tie_output_layer_state_dict(
         sharded_state_dict (ShardedStateDict): state dict with the weight to tie.
         output_layer_weight (Tensor): weight of the output layer.
         output_layer_weight_key (str): key of the output layer in the sharded state dict.
+        tp_group (torch.distributed.ProcessGroup): The tensor parallel group
+        dp_cp_group (torch.distributed.ProcessGroup): The dp-cp comm group
 
     Returns: None, acts in-place
     """
     mtp_output_layer_replica_id = (
         1,  # copy of output layer in post processing stage
         0,
-        parallel_state.get_data_parallel_rank(with_context_parallel=True),
+        get_pg_rank(dp_cp_group),
     )
     assert output_layer_weight_key in sharded_state_dict
     del sharded_state_dict[output_layer_weight_key]
@@ -103,10 +119,12 @@ def tie_output_layer_state_dict(
         key=output_layer_weight_key,
         replica_id=mtp_output_layer_replica_id,
         allow_shape_mismatch=True,
+        tp_group=tp_group,
+        dp_cp_group=dp_cp_group,
     )
 
 
-def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
+def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None, packed_seq_params=None):
     """Roll the tensor input along the sequence dimension with Context Parallelism (CP) support.
 
     This function extends the original roll_tensor to support Context Parallelism, which allows
@@ -118,15 +136,24 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
     For CP>1: Splits tensor into chunks, performs rolling within each chunk, then exchanges
     boundary elements between adjacent CP ranks to maintain sequence continuity.
 
+    For packed sequences: Respects sequence boundaries when rolling to avoid mixing tokens
+    from different sequences.
+
     Args:
         tensor (Tensor): The input tensor to roll.
         shifts (int): The shift of the tensor (typically -1 for MTP).
         dims (int): The dimension to roll (typically -1 for sequence dimension).
         cp_group (ProcessGroup): The context parallelism process group. If None or size=1,
                                falls back to standard rolling behavior.
+        packed_seq_params (PackedSeqParams): Parameters for packed sequence processing.
+                                            If provided, respects sequence boundaries.
     Returns:
         tuple: (rolled_tensor, sum_of_rolled_tensor)
     """
+    # Handle packed sequences cases
+    if packed_seq_params is not None:
+        return _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group)
+
     # Standard rolling behavior when CP is not enabled (cp_group is None or size=1)
     if cp_group is None or cp_group.size() == 1:
         rolled_tensor = torch.roll(tensor, shifts=shifts, dims=dims)
@@ -195,6 +222,91 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
     return rolled_tensor, rolled_tensor.sum()
 
 
+def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=None):
+    """Roll tensor with packed sequence support.
+    This function handles rolling for packed sequences by respecting sequence boundaries
+    """
+
+    # Notice: This is a naive implementation to test the correctness,
+    # a better solution will only sync the boundary tokens once.
+    assert (
+        dims == -1 or dims == tensor.dim() - 1
+    ), "Packed sequence roll only supports the last dimension."
+    assert shifts == -1, "Packed sequence roll only supports a single-token left shift."
+    cu_seqlens = packed_seq_params.cu_seqlens_q
+    assert cu_seqlens is not None, "Packed sequence parameters must provide cu_seqlens_q."
+
+    rolled_tensor = tensor.clone()
+
+    cp_size = cp_group.size() if cp_group is not None else 1
+    if cp_size == 1:
+        # CP disabled: roll each packed sequence independently within its boundaries
+        for i in range(len(cu_seqlens) - 1):
+            start_idx = cu_seqlens[i]
+            end_idx = cu_seqlens[i + 1]
+            seq_slice = tensor[..., start_idx:end_idx]
+            rolled_seq = torch.roll(seq_slice, shifts=shifts, dims=dims)
+            # Zero out the last position(s) that would cross sequence boundaries
+            rolled_seq[..., shifts:] = 0
+            rolled_tensor[..., start_idx:end_idx] = rolled_seq
+        return rolled_tensor, rolled_tensor.sum()
+
+    # CP enabled: each rank owns two chunks per sequence (front and mirrored tail).
+    local_rank = torch.distributed.get_rank(group=cp_group)
+    global_ranks = torch.distributed.get_process_group_ranks(group=cp_group)
+    next_rank = global_ranks[(local_rank + 1) % cp_size]
+    prev_rank = global_ranks[(local_rank - 1) % cp_size]
+
+    # Iterate over each sequence individually
+    for i in range(len(cu_seqlens) - 1):
+        start_idx = cu_seqlens[i]
+        end_idx = cu_seqlens[i + 1]
+
+        # the idx has been multiplied by cp_size, need to divide it by cp_size to get the local idx
+        local_start_idx = start_idx // cp_size
+        local_end_idx = end_idx // cp_size
+        tensor_slice = rolled_tensor[..., local_start_idx:local_end_idx].clone()
+
+        # The following code is very similar as the code in roll_tensor function
+        local_chunks = tensor_slice.chunk(2, dim=dims)
+        rolled_chunks = [torch.roll(chunk, shifts=shifts, dims=dims) for chunk in local_chunks]
+
+        tensor_send_list = []
+        tensor_recv_list = []
+        for chunk in rolled_chunks:
+            boundary = chunk.select(dims, shifts).contiguous().clone()
+            tensor_send_list.append(boundary)
+            tensor_recv_list.append(torch.empty_like(boundary))
+
+        ops = []
+        if local_rank != 0:
+            ops.append(torch.distributed.isend(tensor=tensor_send_list[0], dst=prev_rank))
+            ops.append(torch.distributed.irecv(tensor=tensor_recv_list[1], src=prev_rank))
+        else:
+            tensor_recv_list[1].zero_()
+
+        if local_rank != cp_size - 1:
+            ops.append(torch.distributed.irecv(tensor=tensor_recv_list[0], src=next_rank))
+            ops.append(torch.distributed.isend(tensor=tensor_send_list[1], dst=next_rank))
+        else:
+            tensor_recv_list[0].copy_(tensor_send_list[1])
+
+        for op in ops:
+            op.wait()
+
+        index = [slice(None)] * rolled_chunks[0].dim()
+        index[dims] = shifts
+        for chunk, recv in zip(rolled_chunks, tensor_recv_list):
+            chunk[tuple(index)] = recv
+
+        seq_result = torch.cat(rolled_chunks, dim=dims)
+
+        # update the rolled tensor
+        rolled_tensor[..., local_start_idx:local_end_idx] = seq_result
+
+    return rolled_tensor, rolled_tensor.sum()
+
+
 class MTPLossLoggingHelper:
     """Helper class for logging MTP losses."""
 
@@ -205,8 +317,8 @@ def save_loss_to_tracker(
         loss: torch.Tensor,
         layer_number: int,
         num_layers: int,
-        reduce_group: torch.distributed.ProcessGroup = None,
-        avg_group: torch.distributed.ProcessGroup = None,
+        reduce_group: Optional[torch.distributed.ProcessGroup] = None,
+        avg_group: Optional[torch.distributed.ProcessGroup] = None,
     ):
         """Save the mtp loss for logging.
         Args:
@@ -332,25 +444,100 @@ def get_mtp_layer_spec_for_backend(
     return mtp_layer_spec
 
 
-def get_mtp_layer_offset(config: TransformerConfig) -> int:
+def mtp_on_this_rank(
+    config: TransformerConfig, ignore_virtual: Optional[bool] = True, vp_stage: Optional[int] = None
+) -> bool:
+    """
+    Check if there is MTP on the current rank.
+
+    Behavior:
+        - If a custom pipeline model parallel layout is provided in the config:
+            - If virtual pipeline parallelism is enabled (and `ignore_virtual` is False), checks
+              whether any MTP layers are present on this (pp_rank, vp_stage) pair.
+            - Otherwise, checks all virtual pipeline ranks of the current pipeline rank. Returns
+              True if any virtual sub-rank includes at least one MTP layer.
+        - If no custom layout is provided, assumes all MTP layers (if any) are placed on the last
+          pipeline stage. The function returns True only on the last pipeline stage.
+    """
+    mtp_on_this_rank = False
+    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
+    if config.pipeline_model_parallel_layout is not None:
+        # with custom PP layout, we support put MTP layers on any pipeline stage
+        layout = config.pipeline_model_parallel_layout.layout
+        if (
+            not ignore_virtual
+            and parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None
+        ):
+            assert vp_stage is not None, "vp_stage must be passed if virtual pipeline is enabled"
+            num_layers_to_build = layout[pp_rank][vp_stage].count(LayerType.mtp)
+            mtp_on_this_rank = num_layers_to_build > 0
+        else:
+            for vpp_rank in range(len(layout[pp_rank])):
+                num_layers_to_build = layout[pp_rank][vpp_rank].count(LayerType.mtp)
+                if num_layers_to_build > 0:
+                    mtp_on_this_rank = True
+                    break
+    else:
+        # without custom PP layout, we only support put all of MTP layers on the last pipeline stage
+        if config.mtp_num_layers is not None:
+            mtp_on_this_rank = parallel_state.is_pipeline_last_stage(
+                ignore_virtual=ignore_virtual, vp_stage=vp_stage
+            )
+        else:
+            mtp_on_this_rank = False
+    return mtp_on_this_rank
+
+
+def get_mtp_ranks(pp_ranks: List[int], config: TransformerConfig) -> List[int]:
+    """Get the ranks of the MTP layers."""
+    mtp_ranks = set()
+    if config.mtp_num_layers is None:
+        return []
+    if config.pipeline_model_parallel_layout is None:
+        return [pp_ranks[-1]]
+    layout = config.pipeline_model_parallel_layout.layout
+    for pp_rank in range(len(layout)):
+        for vpp_rank in range(len(layout[pp_rank])):
+            num_layers_to_build = layout[pp_rank][vpp_rank].count(LayerType.mtp)
+            if num_layers_to_build:
+                mtp_ranks.add(pp_ranks[pp_rank])
+    return list(mtp_ranks)
+
+
+def get_mtp_layer_offset(config: TransformerConfig, vp_stage: Optional[int] = None) -> int:
     """Get the offset of the MTP layer."""
-    # Currently, we only support put all of MTP layers on the last pipeline stage.
-    return 0
+    if config.pipeline_model_parallel_size > 1:
+        if config.pipeline_model_parallel_layout:
+            offset = config.pipeline_model_parallel_layout.get_layer_offset(
+                layer_type=LayerType.mtp, vp_stage=vp_stage
+            )
+        else:
+            offset = 0
+    else:
+        offset = 0
+    return offset
 
 
 def get_mtp_num_layers_to_build(
     config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
 ) -> int:
     """Get the number of MTP layers to build."""
-    # Currently, we only support put all of MTP layers on the last pipeline stage.
-    vp_size = config.virtual_pipeline_model_parallel_size
-    if pp_rank is None:
-        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
-    is_last_pp_stage = pp_rank == config.pipeline_model_parallel_size - 1
-    if is_vp_last_stage(vp_stage=vp_stage, vp_size=vp_size) and is_last_pp_stage:
-        return config.mtp_num_layers if config.mtp_num_layers else 0
+    if config.pipeline_model_parallel_layout is not None:
+        # If we have a custom PP layout, get the number of mtp layers in the layout array.
+        num_layers_to_build = config.pipeline_model_parallel_layout.get_num_layers_to_build(
+            layer_type=LayerType.mtp, vp_stage=vp_stage
+        )
+        assert num_layers_to_build == config.mtp_num_layers or num_layers_to_build == 0, (
+            f"Currently, we only support put all of MTP layers on the last pipeline stage, "
+            f"so the number of MTP layers to build ({num_layers_to_build}) must match "
+            f"mtp_num_layers ({config.mtp_num_layers}) or be 0."
+        )
     else:
-        return 0
+        if parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=vp_stage):
+            num_layers_to_build = config.mtp_num_layers if config.mtp_num_layers else 0
+        else:
+            num_layers_to_build = 0
+    return num_layers_to_build
 
 
 class MTPLossAutoScaler(torch.autograd.Function):
@@ -430,7 +617,7 @@ def __init__(
         super().__init__(config=config)
         self.sequence_parallel = config.sequence_parallel
         self.submodules = submodules
-        self.layer_number = layer_number
+        self.layer_number = layer_number + get_mtp_layer_offset(self.config, vp_stage)
         self.vp_stage = vp_stage
         self.cp_group = pg_collection.cp
 
@@ -472,8 +659,15 @@ def __init__(
             skip_bias_add=False,
             is_expert=False,
         )
+
+        diff_transformer_layer_offset = self.config.num_layers - get_transformer_layer_offset(
+            self.config, vp_stage
+        )
         self.transformer_layer = build_module(
-            self.submodules.transformer_layer, config=self.config, vp_stage=vp_stage
+            self.submodules.transformer_layer,
+            config=self.config,
+            vp_stage=vp_stage,
+            layer_number=self.layer_number + diff_transformer_layer_offset,
         )
 
         self.final_layernorm = build_module(
@@ -490,6 +684,7 @@ def _get_embeddings(
         position_ids: torch.Tensor,
         embedding: Callable,
         hidden_states: torch.Tensor,
+        packed_seq_params: Optional[PackedSeqParams] = None,
     ):
         """
         Preprocesses input data for the Multi-Token Prediction (MTP) layers.
@@ -504,10 +699,23 @@ def _get_embeddings(
                 from gpt model to compute the decoder input.
             hidden_states (torch.Tensor): hidden states tensor of shape [s, b, h] where s is the
                 sequence length, b is the batch size, and h is the hidden size.
+            packed_seq_params (PackedSeqParams): Parameters for packed sequence processing.
         """
         # Calc logits for the current Multi-Token Prediction (MTP) layers.
-        input_ids, _ = roll_tensor(input_ids, shifts=-1, dims=-1, cp_group=self.cp_group)
-        position_ids, _ = roll_tensor(position_ids, shifts=-1, dims=-1, cp_group=self.cp_group)
+        input_ids, _ = roll_tensor(
+            input_ids,
+            shifts=-1,
+            dims=-1,
+            cp_group=self.cp_group,
+            packed_seq_params=packed_seq_params,
+        )
+        position_ids, _ = roll_tensor(
+            position_ids,
+            shifts=-1,
+            dims=-1,
+            cp_group=self.cp_group,
+            packed_seq_params=packed_seq_params,
+        )
         # embedding
         decoder_input = embedding(input_ids=input_ids, position_ids=position_ids)
 
@@ -656,15 +864,15 @@ def forward(
         position_ids: Tensor,
         hidden_states: Tensor,
         attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
-        packed_seq_params: PackedSeqParams = None,
-        sequence_len_offset: Tensor = None,
+        context: Optional[Tensor] = None,
+        context_mask: Optional[Tensor] = None,
+        rotary_pos_emb: Optional[Tensor] = None,
+        rotary_pos_cos: Optional[Tensor] = None,
+        rotary_pos_sin: Optional[Tensor] = None,
+        attention_bias: Optional[Tensor] = None,
+        inference_params: Optional[InferenceParams] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        sequence_len_offset: Optional[Tensor] = None,
         embedding=None,
     ):
         """
@@ -690,15 +898,13 @@ def forward(
             [s, b, h], and optionally the updated context tensor if cross-attention is used.
         """
         assert context is None, f"multi token prediction + cross attention is not yet supported."
-        assert (
-            packed_seq_params is None
-        ), f"multi token prediction + sequence packing is not yet supported."
 
         input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings(
             input_ids=input_ids,
             position_ids=position_ids,
             embedding=embedding,
             hidden_states=hidden_states,
+            packed_seq_params=packed_seq_params,
         )
 
         if self.config.recompute_granularity == 'full' and self.training:
@@ -769,7 +975,7 @@ class MultiTokenPredictionBlockSubmodules:
             projection matrix, transformer block, shared output head).
     """
 
-    layer_specs: List[ModuleSpec] = None
+    layer_specs: Optional[List[ModuleSpec]] = None
 
 
 def _get_mtp_block_submodules(
@@ -825,7 +1031,7 @@ def __init__(
         config: TransformerConfig,
         spec: Union[TransformerBlockSubmodules, ModuleSpec],
         vp_stage: Optional[int] = None,
-        pg_collection: ProcessGroupCollection = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
     ):
         super().__init__(config=config)
         self.submodules = _get_mtp_block_submodules(config, spec)
@@ -874,16 +1080,16 @@ def forward(
         position_ids: Tensor,
         hidden_states: Tensor,
         attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
-        packed_seq_params: PackedSeqParams = None,
-        sequence_len_offset: Tensor = None,
-        extra_block_kwargs: dict = None,
+        context: Optional[Tensor] = None,
+        context_mask: Optional[Tensor] = None,
+        rotary_pos_emb: Optional[Tensor] = None,
+        rotary_pos_cos: Optional[Tensor] = None,
+        rotary_pos_sin: Optional[Tensor] = None,
+        attention_bias: Optional[Tensor] = None,
+        inference_params: Optional[InferenceParams] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        sequence_len_offset: Optional[Tensor] = None,
+        extra_block_kwargs: Optional[dict] = None,
         embedding=None,
     ) -> Tensor:
         """
@@ -899,7 +1105,7 @@ def forward(
             (Tensor): The mtp loss tensor of shape [b, s].
         """
         # get hidden states from previous mtp stages
-        offset = get_mtp_layer_offset(self.config)
+        offset = get_mtp_layer_offset(self.config, self.vp_stage)
         hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0))
         hidden_states = hidden_states_list[offset]
         for layer_number in range(len(self.layers)):
@@ -944,7 +1150,7 @@ def sharded_state_dict(
         sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
         layer_prefix = f'{prefix}layers.'
         for layer in self.layers:
-            offset = get_mtp_layer_offset(self.config)
+            offset = get_mtp_layer_offset(self.config, self.vp_stage)
             sharded_prefix = f'{layer_prefix}{layer.layer_number - 1 }.'
 
             state_dict_prefix = f'{layer_prefix}{layer.layer_number - 1 - offset}.'
diff --git a/megatron/core/transformer/pipeline_parallel_layer_layout.py b/megatron/core/transformer/pipeline_parallel_layer_layout.py
index 56467bf0e9d..7a8195e1bee 100644
--- a/megatron/core/transformer/pipeline_parallel_layer_layout.py
+++ b/megatron/core/transformer/pipeline_parallel_layer_layout.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import copy
 import logging
@@ -127,15 +127,28 @@ def validate_layer_layout(self, num_layers: int, mtp_num_layers: int):
             if LayerType.mtp in self.layout[pp_rank][-1]:
                 assert (
                     self.layout[pp_rank][-1].count(LayerType.mtp) == mtp_num_layers
-                ), "All of the MTP layers must be in the same stage"
-                assert (
-                    pp_rank == self.pipeline_model_parallel_size - 1
-                    and LayerType.loss in self.layout[pp_rank][-1]
-                ), "MTP layers must be in the last stage together with Loss stage."
+                ), "All of the MTP layers must be in the same one virtual pipeline stage"
+        for vpp_rank in range(self.virtual_pipeline_model_parallel_size - 1):
+            assert LayerType.mtp not in self.layout[0][vpp_rank], (
+                f"Currently we restrict that the MTP should not be in the first pp rank."
+                f"But got {self.layout[0]} for the first pp rank."
+            )
+        ## Detect MTP standalone usage.
+        mtp_standalone = False
+        for pp_rank in range(self.pipeline_model_parallel_size):
+            if (
+                LayerType.mtp in self.layout[pp_rank][-1]
+                and pp_rank != self.pipeline_model_parallel_size - 1
+            ):
+                mtp_standalone = True
+                break
+
         # TODO: remove them in the future once they are supported
         if self.flatten_layout.count(LayerType.encoder) > 0:
             raise NotImplementedError("Encoder layer is not supported for flexible pipeline layout")
 
+        return mtp_standalone
+
     def get_num_layers_to_build(
         self,
         layer_type: LayerType = LayerType.decoder,
diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py
index b3de8541734..09058084181 100644
--- a/megatron/core/transformer/spec_utils.py
+++ b/megatron/core/transformer/spec_utils.py
@@ -1,8 +1,11 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+import logging
 import types
 from dataclasses import dataclass, field
-from typing import Tuple, Union
+from typing import Any, Tuple, Union
+
+logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -24,7 +27,17 @@ class ModuleSpec:
 
     module: Union[Tuple, type]
     params: dict = field(default_factory=lambda: {})
-    submodules: type = None
+    submodules: object = None
+    metainfo: dict = field(default_factory=lambda: {})
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        """Builds an instance of the module from the spec.
+
+        Args:
+            *args: Positional arguments to be passed to the module init.
+            **kwargs: Keyword arguments to be passed to the module init.
+        """
+        return build_module(self, *args, **kwargs)
 
 
 def import_module(module_path: Tuple[str]):
@@ -37,12 +50,14 @@ def import_module(module_path: Tuple[str]):
     try:
         module = __import__(base_path, globals(), locals(), [name])
     except ImportError as e:
-        print(f"couldn't import module due to {e}")
+        logger.error(f"couldn't import module due to {e}")
         return None
     return vars(module)[name]
 
 
+# pylint: disable=missing-function-docstring
 def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs):
+    """Returns or imports the provided module."""
     # If a module clas is already provided return it as is
     if isinstance(spec_or_module, (type, types.FunctionType)):
         return spec_or_module
@@ -56,6 +71,13 @@ def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs):
 
 
 def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):
+    """Builds an instance of the module from the spec.
+
+    Args:
+        spec_or_module: The module spec or module class to build.
+        *args: Positional arguments to be passed to the module init.
+        **kwargs: Keyword arguments to be passed to the module init.
+    """
     # If the passed `spec_or_module` is
     # a `Function`, then return it as it is
     # NOTE: to support an already initialized module add the following condition
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 98460615ded..f4907722f59 100755
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import logging
 import os
 from contextlib import nullcontext
@@ -19,7 +19,7 @@
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.enums import LayerType
+from megatron.core.transformer.enums import CudaGraphScope, LayerType
 from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -235,7 +235,7 @@ class TransformerBlockSubmodules:
             or instance of the layer normalization to be applied.
     """
 
-    layer_specs: List[ModuleSpec] = None
+    layer_specs: Optional[List[ModuleSpec]] = None
     layer_norm: Optional[Union[ModuleSpec, torch.nn.Module]] = None
 
 
@@ -290,7 +290,7 @@ def __init__(
         post_layer_norm: bool = True,
         pre_process: bool = True,
         post_process: bool = True,
-        pg_collection: ProcessGroupCollection = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
         vp_stage: Optional[int] = None,
     ):
         super().__init__(config=config)
@@ -298,6 +298,7 @@ def __init__(
         if pg_collection is None:
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
         self.pg_collection = pg_collection
+        self.tp_group = pg_collection.tp
 
         pp_group = self.pg_collection.pp if hasattr(self.pg_collection, 'pp') else None
         pp_rank = get_pg_rank(pp_group)
@@ -390,7 +391,7 @@ def build_layer(layer_spec, layer_number):
         # @TODO: add back account_for_embedding_in_pipeline_split (see issue #293)
         # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline
         # self.post_process and self.post_layer_norm guide this behavior
-        if self.submodules.layer_norm and self.post_process and self.post_layer_norm:
+        if self.has_final_layernorm_in_this_stage():
             self.final_layernorm = build_module(
                 self.submodules.layer_norm,
                 config=self.config,
@@ -400,6 +401,62 @@ def build_layer(layer_spec, layer_number):
         else:
             self.final_layernorm = None  # Either this or nn.Identity
 
+        if self.config.inference_fuse_tp_communication:
+            self._setup_fused_tp_communication()
+
+    def has_final_layernorm_in_this_stage(self):
+        """
+        Check if this vpp stage contains the final layernorm.
+
+        Note:
+            Final layernorm now has been moved from the post-process stage to the last decoder
+            layer by using this function.
+            There will be a small numeric difference because of grad norm reduction when final
+            layernorm is placed in different pipeline stages in deterministic mode. It can still
+            be bitwise aligned by disabling grad norm clipping.
+        """
+        if self.config.mtp_num_layers is None:
+            # for model without MTPLayer, the final layernorm is set in the stage which does
+            # post_process
+            return self.submodules.layer_norm and self.post_process and self.post_layer_norm
+        else:
+            # for model with MTPLayer, the final layernorm is set in the stage which has the
+            # last layer of the decoder
+            has_final_layernorm_in_this_stage = False
+            for layer in self.layers:
+                if layer.layer_number == self.config.num_layers:
+                    has_final_layernorm_in_this_stage = True
+                    break
+            return (
+                self.submodules.layer_norm
+                and has_final_layernorm_in_this_stage
+                and self.post_layer_norm
+            )
+
+    def _setup_fused_tp_communication(self):
+        """Setup fused TP communication for all layers.
+        We have a fused reduce-scatter + add + layer-norm + all-gather operation.
+        We call this kernel from within row parallel linear layers.
+        But layer-norm needs the layer norm weights from the
+        successive column parallel linear layer.
+        This function is used to pass those weights to the respective layers.
+        """
+
+        for i in range(len(self.layers)):
+            current_layer = self.layers[i]
+
+            # Get next layer's QKV norm weights (None for last layer)
+            if i < len(self.layers) - 1:
+                next_qkv_norm_weights = self.layers[i + 1].get_qkv_layer_norm_weights()
+            else:
+                next_qkv_norm_weights = None
+
+            # Configure all fused TP communication settings in one call
+            current_layer.configure_fused_tp_inference(
+                skip_qkv_norm_and_all_gather=(i > 0),
+                fc2_next_layer_norm_weights=next_qkv_norm_weights,
+            )
+
     def _get_layer(self, layer_number: int):
         return self.layers[layer_number]
 
@@ -413,12 +470,18 @@ def _checkpointed_forward(
         attention_bias: Tensor,
         packed_seq_params: PackedSeqParams,
         use_inner_quantization_context: bool,
+        padding_mask: Optional[Tensor] = None,
     ):
         """Forward method with activation checkpointing."""
 
         def custom(start: int, end: int):
             def custom_forward(
-                hidden_states, attention_mask, context, context_mask, rotary_pos_emb
+                hidden_states,
+                attention_mask,
+                context,
+                context_mask,
+                rotary_pos_emb,
+                padding_mask=None,
             ):
                 for index in range(start, end):
                     layer = self._get_layer(index)
@@ -449,6 +512,7 @@ def custom_forward(
                             attention_bias=attention_bias,
                             inference_context=None,
                             packed_seq_params=packed_seq_params,
+                            padding_mask=padding_mask,
                         )
                 return hidden_states, context
 
@@ -468,6 +532,7 @@ def checkpoint_handler(forward_func):
                     context,
                     context_mask,
                     rotary_pos_emb,
+                    padding_mask,
                 )
             else:
                 return tensor_parallel.checkpoint(
@@ -478,6 +543,7 @@ def checkpoint_handler(forward_func):
                     context,
                     context_mask,
                     rotary_pos_emb,
+                    padding_mask,
                 )
 
         if self.config.recompute_method == 'uniform':
@@ -539,7 +605,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
                 kwargs.get('inference_context') is not None
                 or kwargs.get('inference_params') is not None
             )
-            and self.config.cuda_graph_scope == 'full_iteration'
+            and CudaGraphScope.full_iteration in self.config.cuda_graph_scope
         ):
             if kwargs['inference_context'].is_static_batching():
                 using_cuda_graph = kwargs['inference_context'].is_decode_only()
@@ -583,6 +649,7 @@ def forward(
         inference_context: Optional[BaseInferenceContext] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
         sequence_len_offset: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         dynamic_inference_decode_only: Optional[bool] = None,
@@ -765,6 +832,7 @@ def forward(
                     attention_bias=attention_bias,
                     packed_seq_params=packed_seq_params,
                     use_inner_quantization_context=use_inner_quantization_context,
+                    padding_mask=padding_mask,
                 )
             else:
                 for l_no, layer in enumerate(self.layers):
@@ -797,6 +865,7 @@ def forward(
                             inference_context=inference_context,
                             packed_seq_params=packed_seq_params,
                             sequence_len_offset=sequence_len_offset,
+                            padding_mask=padding_mask,
                         )
 
                     if (
@@ -854,6 +923,7 @@ def sharded_state_dict(
         elif isinstance(self.config.moe_layer_freq, list):
             non_homogeneous_layers = True
 
+        ######### FlagScale Begin #########
         # TODO: @aoyulong - This is a temporary solution to support single-file-per-tensor ckpt
         non_homogeneous_layers_env = os.getenv('FS_NON_HOMOGENEOUS_LAYERS', 'False').lower() in (
             'true',
@@ -862,6 +932,13 @@ def sharded_state_dict(
         )
         if non_homogeneous_layers_env:
             non_homogeneous_layers = True
+        ######### FlagScale Begin #########
+
+        if isinstance(self.config.linear_attention_freq, int):
+            if self.config.linear_attention_freq > 1:
+                non_homogeneous_layers = True
+        elif isinstance(self.config.linear_attention_freq, list):
+            non_homogeneous_layers = True
 
         if self.config.heterogeneous_block_specs:
             non_homogeneous_layers = True
@@ -907,7 +984,11 @@ def sharded_state_dict(
             if not module is self.layers:
                 sharded_state_dict.update(
                     sharded_state_dict_default(
-                        module, f'{prefix}{name}.', sharded_offsets, metadata
+                        module,
+                        f'{prefix}{name}.',
+                        sharded_offsets,
+                        metadata,
+                        tp_group=self.tp_group,
                     )
                 )
 
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index e2ababc3e5f..08fc5caef7f 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import warnings
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Callable, List, Literal, Optional, Tuple, Union
 
 import torch
@@ -9,7 +9,7 @@
 
 from megatron.core.enums import Fp4Recipe, Fp8Recipe
 from megatron.core.quantization.quant_config import RecipeConfig
-from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
 
 from ..fusions.fused_bias_geglu import quick_gelu
@@ -42,14 +42,22 @@ class TransformerConfig(ModelParallelConfig):
     # model architecture
     ####################
 
-    num_layers: int = 0
+    num_layers: int = field(default=0, metadata={"argparse_meta": {"default": None}})
     """Number of transformer layers in a transformer block."""
 
     mtp_num_layers: Optional[int] = None
-    """Number of Multi-Token Prediction (MTP) Layers."""
+    """Number of Multi-Token Prediction (MTP) Layers.
+    MTP extends the prediction scope to multiple future tokens at each position.
+    This MTP implementation sequentially predict additional tokens
+    by using D sequential modules to predict D additional tokens.
+    """
 
-    mtp_loss_scaling_factor: Optional[float] = None
-    """Weighting factor of Multi-Token Prediction (MTP) loss."""
+    mtp_loss_scaling_factor: Optional[float] = 0.1
+    """Weighting factor of Multi-Token Prediction (MTP) loss.
+    We compute the average of the MTP losses across all depths, 
+    and multiply it the scaling factor to obtain the overall MTP loss, 
+    which serves as an additional training objective.
+    """
 
     num_layers_in_first_pipeline_stage: Optional[int] = None
     """Number of transformer layers on first pipeline stage.
@@ -93,10 +101,10 @@ class TransformerConfig(ModelParallelConfig):
     """If set, the loss layer will be treated as a standard transformer
     layer in the context of partition and placement for pipeline parallelism."""
 
-    hidden_size: int = 0
+    hidden_size: int = field(default=0, metadata={"argparse_meta": {"default": None}})
     """Transformer hidden size."""
 
-    num_attention_heads: int = 0
+    num_attention_heads: int = field(default=0, metadata={"argparse_meta": {"default": None}})
     """Number of transformer attention heads."""
 
     attention_backend: AttnBackend = AttnBackend.auto
@@ -113,7 +121,9 @@ class TransformerConfig(ModelParallelConfig):
        Supports both TE FusedAttention and local unfused attention. Supports both a fixed offset and 
        and learnable offset."""
 
-    num_query_groups: Optional[int] = None
+    num_query_groups: Optional[int] = field(
+        default=None, metadata={"argparse_meta": {"default": 1}}
+    )
     """Number of query groups for group query attention. If None, normal attention is used."""
 
     ffn_hidden_size: Optional[int] = None
@@ -137,16 +147,22 @@ class TransformerConfig(ModelParallelConfig):
     apply_residual_connection_post_layernorm: bool = False
     """If True, uses the original BERT residule connection ordering."""
 
-    layernorm_epsilon: float = 1e-5
-    """Epsilon value for any LayerNorm operations."""
+    layernorm_epsilon: float = field(
+        default=1e-5, metadata={"argparse_meta": {"arg_names": ["--norm-epsilon"]}}
+    )
+    """Epsilon value for any LayerNorm/RMSNorm operations."""
 
-    layernorm_zero_centered_gamma: bool = False
+    layernorm_zero_centered_gamma: bool = field(
+        default=False, metadata={"argparse_meta": {"arg_names": ["--apply-layernorm-1p"]}}
+    )
     """If set to True, the LayerNorm is adjusted to center the gamma values around 0. This improves
     numerical stability."""
 
-    add_bias_linear: bool = True
-    """Include a bias term in all linear layers (QKV projections, after core attention, and two in
-    MLP layer)."""
+    add_bias_linear: bool = field(
+        default=True, metadata={"argparse_meta": {"arg_names": ["--disable-bias-linear"]}}
+    )
+    """Include/exclude a bias term in all linear layers (QKV projections, after core attention,
+    and two in MLP layer)."""
 
     add_qkv_bias: bool = False
     """Add a bias term only for QKV projections."""
@@ -186,12 +202,31 @@ class TransformerConfig(ModelParallelConfig):
     - An integer N: Represents a (N-1):1 ratio, one full attention layer after (N-1) SWA layers.
     - A list that defines a custom pattern, e.g.: [1,1,1,1,0,0,0,0], where 1 represents SWA. """
 
-    normalization: str = "LayerNorm"
+    normalization: Literal['LayerNorm', 'RMSNorm'] = "LayerNorm"
     """Which norm to use for normalization layers, valid options are `LayerNorm` and `RMSNorm`."""
 
     qk_layernorm: bool = False
     """Whether to apply `normalization` type of normalization to the query and key embeddings."""
 
+    qk_l2_norm: bool = False
+    """Whether to apply llama 4-style qk L2 norm."""
+
+    qk_clip: bool = False
+    """Whether to clip the query and key weights. Needed for Muon MLA Model training."""
+
+    qk_clip_alpha: float = 0.5
+    """The balancing alpha for qk-clip. Q = Q * (eta ** alpha)"""
+
+    qk_clip_threshold: float = 100
+    """The balancing threshold for qk-clip. eta = min(threshold / max_attention_logits, 1.0)"""
+
+    log_max_attention_logit: bool = False
+    """Whether to log the max attention logit across whole model. Decoupled from qk_clip,
+    defualts to False. Setting qk_clip will automatically log the max logit"""
+
+    attention_output_gate: bool = False
+    """Whether to apply output gate to the attention layers."""
+
     test_mode: bool = False
     """Whether to run real-time tests."""
 
@@ -209,12 +244,55 @@ class TransformerConfig(ModelParallelConfig):
     A list of integers: Defines a custom pattern where 1 means skip RoPE and 0 means apply RoPE.
     For example, [0,1,1,0] means: apply RoPE, skip RoPE, skip RoPE, apply RoPE."""
 
-    moe_deepep_num_sms: int = 20
-    """Number of SMs to use for DeepEP."""
+    ####################
+    # attention variant
+    ####################
+    experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa']] = None
+    """Type of attention variant to use. Currently support gated_delta_net and dsa."""
 
-    moe_hybridep_num_sms: int = 16
-    """Number of SMs to use for HybridEP. In pure NVL scenarios, 
-    16 SMs can generally achieve good bandwidth."""
+    ####################
+    # DSA
+    ####################
+    dsa_indexer_n_heads: Optional[int] = None
+    """Number of DSA indexer heads."""
+
+    dsa_indexer_head_dim: Optional[int] = None
+    """Dimension per DSA indexer head."""
+
+    dsa_indexer_topk: Optional[int] = None
+    """Number of top-k tokens to select in DSA indexer."""
+
+    dsa_indexer_loss_coeff: Optional[float] = None
+    """Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss."""
+
+    dsa_indexer_use_sparse_loss: bool = False
+    """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the
+    top-k indices."""
+
+    ####################
+    # linear attention
+    ####################
+    linear_attention_freq: Optional[Union[int, List[int]]] = None
+    """Frequency between LA (linear attention) layers 
+    and SDPA (scaled dot-product attention) layers.
+    Accepts either:
+    - An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer
+    - A list that defines a custom pattern, e.g.: [1,1,1,0,1,1,1,0,1,1,1,0]"""
+
+    linear_conv_kernel_dim: Optional[int] = 4
+    """Conv kernel dimension for the gated delta net."""
+
+    linear_key_head_dim: Optional[int] = 128
+    """Query and key head dimension for the gated delta net."""
+
+    linear_value_head_dim: Optional[int] = 128
+    """Value and gate head dimension for the gated delta net."""
+
+    linear_num_key_heads: Optional[int] = 16
+    """Number of query and key heads for the gated delta net."""
+
+    linear_num_value_heads: Optional[int] = 32
+    """Number of value and gate heads for the gated delta net."""
 
     ####################
     # initialization
@@ -243,7 +321,10 @@ class TransformerConfig(ModelParallelConfig):
     embedding_init_method_std: Optional[float] = None
     """
     Standard deviation of the zero mean normal for the default initialization method for the 
-    embedding layer. If None, will be set to init_method_std.
+    embedding layer. If None, will be set to init_method_std. Setting this to a value around
+    1.0 may avoid loss spikes in training. Setting this to any value will also skip applying
+    weight decay on embedding weights to avoid shrinkage towards zero.
+    See https://arxiv.org/abs/2312.16903 for more details.
     """
 
     init_model_with_meta_device: bool = False
@@ -257,7 +338,7 @@ class TransformerConfig(ModelParallelConfig):
     ####################
     apply_query_key_layer_scaling: bool = False
     """If true, scale Q * K^T by 1 / layer-number. This improve numeric stability when training with
-    fp16."""
+    fp16. Also sets `attention_softmax_in_fp32` to True."""
 
     attention_softmax_in_fp32: bool = True
     """If True, run attention masking and softmax in fp32. This should be True if
@@ -299,7 +380,7 @@ class TransformerConfig(ModelParallelConfig):
     ####################
     # activation recomputation
     ####################
-    recompute_granularity: Optional[str] = None
+    recompute_granularity: Optional[Literal['full', 'selective']] = None
     """Determines which type of activation recompute to use.  Megatron-core supports 'selective'
     activation checkpointing where the submodules set in --recompute-modules is checkpointed.
     The default is "core_attn" which is the memory intensive part of attention.
@@ -310,7 +391,7 @@ class TransformerConfig(ModelParallelConfig):
     If set, must be 'selective' or 'full'. 'selective' always uses all layers.
     """
 
-    recompute_method: Optional[str] = None
+    recompute_method: Optional[Literal['uniform', 'block']] = None
     """Determines which transformer layers will be recomputed. uniform will uniformly divide the
     total number of transformer layers in a transformer block and recompute the input activation of
     each divided chunk at the specified granularity.  block will recompute the input activations for
@@ -354,12 +435,16 @@ class TransformerConfig(ModelParallelConfig):
     ####################
     # fp8 related
     ####################
-    fp8: Optional[str] = None
+    fp8: Optional[Literal['e4m3', 'hybrid']] = field(
+        default=None, metadata={"argparse_meta": {"arg_names": ["--fp8-format"]}}
+    )
     """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined
     choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8
     activation and weight tensors and e5m2 for all FP8 output activation gradient tensors."""
 
-    fp8_recipe: Optional[str] = "delayed"
+    fp8_recipe: Optional[Literal['tensorwise', 'delayed', 'mxfp8', 'blockwise', 'custom']] = (
+        "delayed"
+    )
     """If set, enables the use of FP8 precision through Transformer Engine. There are 5 predefined
     choices (1) 'tensorwise' uses per tensor current scaling recipe, (2) 'delayed'
     uses delayed scaling recipe, 3) 'mxfp8' for Blackwell architecture only,
@@ -387,7 +472,7 @@ class TransformerConfig(ModelParallelConfig):
     fp8_amax_history_len: int = 1
     """The length of the amax history window used for scaling factor computation."""
 
-    fp8_amax_compute_algo: str = "most_recent"
+    fp8_amax_compute_algo: Literal['most_recent', 'max'] = "most_recent"
     """Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2
     predefined choices: `max` chooses the largest `amax` in the history window, while `most_recent`
     always chooses the most recently seen value.
@@ -421,18 +506,29 @@ class TransformerConfig(ModelParallelConfig):
     use_kitchen: bool = False
     """Use the kitchen extension for transformer quantization."""
 
+    use_kitchen_attention: bool = False
+    """Use the kitchen extension for attention (instead of TE's attention)."""
+
+    kitchen_attention_backend: Literal["sdpa", "fa"] = "sdpa"
+    """Which kitchen attention backend to use when use_kitchen_attention=True.
+    "sdpa" for KitchenDotProductAttention, "fa" for KitchenFlashAttention."""
+
     ####################
     # fp4 related
     ####################
-    fp4: Optional[str] = None
+    fp4: Optional[Literal['e2m1']] = field(
+        default=None, metadata={"argparse_meta": {"arg_names": ["--fp4-format"]}}
+    )
     """If set, enables the use of FP4 precision through Transformer Engine. Currently only 
     supports 'nvfp4' which uses NVFP4BlockScaling recipe (requires TE >= 2.7.0.dev0)."""
 
-    fp4_recipe: Optional[str] = "nvfp4"
+    fp4_recipe: Optional[Literal['nvfp4', 'custom']] = "nvfp4"
     """If set, enables the use of FP4 precision through Transformer Engine. Currently only
     'nvfp4' is supported which uses NVFP4BlockScaling recipe for Blackwell+ architecture."""
 
-    fp4_param: bool = False
+    fp4_param: bool = field(
+        default=False, metadata={"argparse_meta": {"arg_names": ["--fp4-param-gather"]}}
+    )
     """If set, keep the parameters in fp4 precision to save memory. This option must be used
     together with fp4 mode (i.e., TransformerConfig.fp4 is not None). Note that not all parameters
     will be converted to fp4; for example, biases will remain unchanged."""
@@ -462,9 +558,15 @@ class TransformerConfig(ModelParallelConfig):
     different orders to the hidden_states, causing minor numerical differences
     in the hidden_states gradient."""
 
+    moe_shared_expert_gate: bool = False
+    """Enable gate for shared expert. Only effective when 
+    moe-shared-expert-intermediate-size is set."""
+
     moe_shared_expert_overlap: bool = False
     """Enable overlapping between shared expert computations and dispatcher communications.
-    Without this, the shared experts execute before the router."""
+    Without this, the shared experts execute before the router. 
+    Only effective when moe-shared-expert-intermediate-size is set.
+    """
 
     moe_layer_freq: Union[int, List[int]] = 1
     """Frequency between MoE layers and Dense layers. Accepts either:
@@ -472,7 +574,7 @@ class TransformerConfig(ModelParallelConfig):
     - A list that defines a custom pattern, e.g.: [1,1,1,0,1,1,1,0,1,1,1,0]"""
 
     moe_ffn_hidden_size: Optional[int] = None
-    """MoE Feed-Forward Network hidden size"""
+    """MoE Feed-Forward Network hidden size. If not specified, defaults to the ffn_hidden_size."""
 
     moe_router_load_balancing_type: Union[str, List[str]] = "aux_loss"
     """The load balancing strategy for the router.
@@ -491,6 +593,9 @@ class TransformerConfig(ModelParallelConfig):
     moe_router_topk: int = 2
     """Number of experts to route to for each token."""
 
+    moe_enable_routing_replay: bool = False
+    """If True, enable the routing replay feature for MoE layers."""
+
     moe_router_topk_limited_devices: Optional[int] = None
     """Number of EP ranks to consider for each token in group-limited routing,
     DEPRECATED and replaced by moe_router_num_groups and moe_router_group_topk.
@@ -525,7 +630,7 @@ class TransformerConfig(ModelParallelConfig):
     """Number of selected groups for group-limited routing."""
 
     moe_router_pre_softmax: bool = False
-    """Enable pre-softmax(pre-sigmoid) routing for MoE, which means softmax is before the 
+    """Enable pre-softmax(pre-sigmoid) routing for MoE, which means softmax is before the
     top-k selection.
     By default, softmax is done after top-k."""
 
@@ -533,10 +638,10 @@ class TransformerConfig(ModelParallelConfig):
     """Scaling factor for routing score in top-k selection, only works when moe_router_pre_softmax
     enabled. Defaults to None, which means no scaling."""
 
-    moe_router_score_function: str = "softmax"
+    moe_router_score_function: Literal['softmax', 'sigmoid'] = "softmax"
     """Score function for MoE routing. Can be "softmax" or "sigmoid"."""
 
-    moe_router_dtype: Optional[str] = None
+    moe_router_dtype: Optional[Literal['fp32', 'fp64']] = None
     """Data type for routing and expert output weighted averaging. Using fp32 or fp64 can
     improve stability especially when the number of experts is large (e.g. finegrained-moe).
     None means no changes for dtype."""
@@ -582,14 +687,14 @@ class TransformerConfig(ModelParallelConfig):
     specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Note that this is
     currently unsupported so should remain False."""
 
-    moe_token_dispatcher_type: str = "allgather"
+    moe_token_dispatcher_type: Literal['allgather', 'alltoall', 'flex'] = "allgather"
     """The type of token dispatcher to use. The default is 'allgather'.
     Options are 'allgather','alltoall' and 'flex'."""
 
     moe_enable_deepep: bool = False
     """[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models."""
 
-    moe_flex_dispatcher_backend: str = "deepep"
+    moe_flex_dispatcher_backend: Literal['deepep', 'hybridep'] = "deepep"
     """[Experimental] The backend to use for flex token dispatcher. The default is "deepep".
     Options are "deepep" and "hybridep". Currently only "hybridep" backend supports 
     the MNNVL case."""
@@ -606,7 +711,7 @@ class TransformerConfig(ModelParallelConfig):
     the expert capacity length, effective only after the moe_expert_capacity_factor is set. The
     default setting is False."""
 
-    moe_token_drop_policy: str = "probs"
+    moe_token_drop_policy: Literal['probs', 'position'] = "probs"
     """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with
     the lowest probabilities will be dropped. If "position", tokens at the end of each batch will
     be dropped.
@@ -619,11 +724,23 @@ class TransformerConfig(ModelParallelConfig):
     """Fuse token rearrangement ops during token dispatching."""
 
     moe_router_fusion: bool = False
-    """Fuse ops in routing and aux loss calculation."""
+    """Enable fusion for MoE TopK routing and aux-loss computation. This is only
+    supported in TransformerEngine 2.7.0 and above.
+    """
 
     moe_apply_probs_on_input: bool = False
     """Apply probs on input of experts instead of applying after activation and glu."""
 
+    moe_latent_size: Optional[int] = None
+    """Latent projection dimension for MoE. If None, MoE latent projections are not used."""
+
+    moe_deepep_num_sms: int = 20
+    """Number of SMs to use for DeepEP."""
+
+    moe_hybridep_num_sms: int = 16
+    """Number of SMs to use for HybridEP. In pure NVL scenarios,
+    16 SMs can generally achieve good bandwidth."""
+
     ##################
     # Context Parallel
     ##################
@@ -653,11 +770,11 @@ class TransformerConfig(ModelParallelConfig):
     determines the scope of graph capture."""
 
     cuda_graph_use_single_mempool: bool = False
-    """When set to true, cudagraphs will be captured inside a single mempool, in which all
-    cudagraphs may only be used once per step. If false, cudagraphs may be reused across
-    microbatches. Enabling may reduce cudagraph memory overheads due to memory fragmentation,
-    however may greatly increase the number of cudagraphs created when the number of microbatches
-    is high."""
+    """[For `local` implementation only] When set to true, cudagraphs will be captured inside a
+    single mempool, in which all cudagraphs may only be used once per step. If false, cudagraphs may
+    be reused across microbatches. Enabling may reduce cudagraph memory overheads due to memory
+    fragmentation, however may greatly increase the number of cudagraphs created when the number of
+    microbatches is high."""
 
     cuda_graph_retain_backward_graph: bool = False
     """When set to true, cudagraph backward passes will be graph captured with 'retain_grad=True'
@@ -671,7 +788,7 @@ class TransformerConfig(ModelParallelConfig):
     """DEPRECATED and replaced by cuda_graph_impl.
     When set to true, TransformerLayer layers are swapped with user provided CUDA graphs."""
 
-    cuda_graph_impl: str = "none"
+    cuda_graph_impl: Literal['none', 'local', 'transformer_engine'] = "none"
     """Determines the CUDA graph capture implementation.
     "none": no CUDA graph.
     "local": capture the CUDA graph using MCore local implementation. Either partial CUDA graph
@@ -679,11 +796,12 @@ class TransformerConfig(ModelParallelConfig):
     excluding optimizer) is enabled.
     "transformer_engine": capture the CUDA graph using TE make_graphed_callables()."""
 
-    cuda_graph_scope: str = "full"
+    cuda_graph_scope: Union[str, CudaGraphScope, List[str], List[CudaGraphScope]] = "full"
     """Determines the CUDA graphs capturing scope.
-    When cuda_graph_impl is set to "transformer_engine", valid values are "full" and "attn".
-    "Full" scope captures a whole Transformer layer. "Attn" scope only captures operations in
-    TransformerLayer._forward_attention().
+    When cuda_graph_impl is set to "transformer_engine", valid values are "attn", "mlp", "moe",
+    "moe_router", "moe_preprocess", "mamba". "full" or an empty list means the full layer. "full"
+    is actually deprecated, but for backward compatibility, we still use "full" as the default
+    value. It will be transformed to an empty list in __post_init__.
     When cuda_graph_impl is set to "local", "full_iteration" can be specified as cuda_graph_scope
     to enable whole iteration CUDA graph. All other values enable layerwise CUDA graph."""
 
@@ -706,6 +824,13 @@ class TransformerConfig(ModelParallelConfig):
     flash_decode: bool = False
     """ Use the optimized flash decoding kernel during inference. """
 
+    batch_invariant_mode: bool = False
+    """If true, uses batch-invariant kernels that provide deterministic forward execution regardless
+       of batch size. This ensures bitwise identical results when the same inputs are processed
+       in different batch configurations. This will significantly affect speed of 
+       training and inference as the kernels are not full optimized.
+       Defaults to False."""
+
     use_te_activation_func: bool = False
     """Whether to use ffn activation functions implemented by TransformerEngine"""
 
@@ -718,12 +843,17 @@ class TransformerConfig(ModelParallelConfig):
     inference_sampling_seed: int = 42
     """ Random seed to use for sampling during inference. """
 
-    symmetric_ar_type: Optional[str] = None
-    """Type of symmetric all reduce to use"""
+    symmetric_ar_type: Optional[Literal['two_shot', "one_shot", "multimem_all_reduce"]] = None
+    """What type of symmetric all reduce to use. The default is None
+    which is no use of symmetric memory.
+    """
 
     use_inference_optimized_layers: bool = False
     """If True, use inference optimized transformer layers during inference."""
 
+    inference_fuse_tp_communication: bool = False
+    """ If true, uses a fused reduce-scatter-residual-norm-allgather kernel during inference. """
+
     mrope_section: Optional[List[int]] = None
     """ Multimodal rope section is for channel dimension of temporal, height and width
     in rope calculation. """
@@ -741,11 +871,13 @@ class TransformerConfig(ModelParallelConfig):
     """The number of groups used in Mamba layers."""
 
     mamba_num_heads: Optional[int] = None
-    """The number of heads used in Mamba layers. 
+    """The number of heads used in Mamba layers.
     If None, the number of heads will be hidden_size * expand // mamba_head_dim."""
 
-    use_mamba_mem_eff_path: bool = True
-    """If True, use the memory efficient path for Mamba layers."""
+    use_mamba_mem_eff_path: bool = field(
+        default=True, metadata={"argparse_meta": {"arg_names": ["--disable-mamba-mem-eff-path"]}}
+    )
+    """Controls usage of the memory efficient path for Mamba layers."""
 
     mlp_chunks_for_prefill: int = 1
     """The number of chunks along the sequence dimension to use for MLP computation
@@ -761,12 +893,15 @@ class TransformerConfig(ModelParallelConfig):
     # Quantization
     ####################
     quant_recipe: Optional[RecipeConfig] = None
-    """Configuration of any quantization to be applied to the model"""
+    """Configuration of any per-module quantization settings to be applied to the model"""
 
-    transformer_impl: str = "transformer_engine"
+    transformer_impl: Literal['local', 'transformer_engine', 'inference_optimized'] = (
+        "transformer_engine"
+    )
     """Transformer implementation to use.
     Options are 'transformer_engine' for Transformer Engine and 'local' for MCore."""
 
+    ######### FlagScale Begin #########
     ####################
     # PEFT
     ####################
@@ -787,11 +922,34 @@ class TransformerConfig(ModelParallelConfig):
     lora_out_init_method: Optional[str] = None
     """Lora b init method"""
 
-
     ####################
     # TE_FL
     ####################
     te_fl_prefer: Optional[str] = 'vendor'
+    ######### FlagScale End #########
+
+    #####################################
+    # Fine-grained Activation Offloading
+    #####################################
+    fine_grained_activation_offloading: bool = False
+    """If True, offload the input of the specified modules to the CPU.
+    Fine-grained activation offloading is a module-level offloading method
+    instead of a layer-level offloading method like cpu_offloading."""
+
+    offload_modules: Optional[list[str]] = field(default_factory=list)
+    """The submodules to offload its input.
+    choices: "attn_norm", "qkv_linear", "core_attn", "attn_proj",
+             "mlp_norm", "expert_fc1", "moe_act".
+    "attn_norm": offload the input of the normalization in the attention part.
+    "qkv_linear": offload the input of the qkv linear part.
+    "core_attn": offload the input of the core attention part.
+    "attn_proj": offload the input of the attn linear projection part.
+    "mlp_norm": offload the input of the normalization in the mlp part.
+    "expert_fc1": offload the input of the expert fc1 part.
+    "moe_act": offload the input of the moe act part.
+    """
+    min_offloaded_tensor_size: int = 1024 * 1024
+    """The minimum size of the tensor to be offloaded."""
 
     def __post_init__(self):
         """Python dataclass method that is used to modify attributes after initialization.
@@ -823,12 +981,55 @@ def __post_init__(self):
         if self.num_query_groups is None:
             self.num_query_groups = self.num_attention_heads
 
-        if self.num_query_groups % self.tensor_model_parallel_size != 0:
+        if (
+            self.num_query_groups % self.tensor_model_parallel_size != 0
+            and self.tensor_model_parallel_size % self.num_query_groups != 0
+        ):
             raise ValueError(
-                f"num_query_groups ({self.num_query_groups}) must be a multiple of "
+                f"num_query_groups ({self.num_query_groups}) must be a multiple or divisor of "
                 f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
             )
 
+        if self.experimental_attention_variant == "gated_delta_net":
+            assert (
+                self.linear_attention_freq is not None
+            ), f"linear_attention_freq must be set for linear gated_delta_net."
+
+            # Check required parameters
+            assert (
+                self.linear_conv_kernel_dim is not None
+            ), "linear_conv_kernel_dim must be set for gated delta net."
+            assert (
+                self.linear_key_head_dim is not None
+            ), "linear_key_head_dim must be set for gated delta net."
+            assert (
+                self.linear_value_head_dim is not None
+            ), "linear_value_head_dim must be set for gated delta net."
+            assert (
+                self.linear_num_key_heads is not None
+            ), "linear_num_key_heads must be set for gated delta net."
+            assert (
+                self.linear_num_value_heads is not None
+            ), "linear_num_value_heads must be set for gated delta net."
+            assert self.linear_num_value_heads % self.linear_num_key_heads == 0, (
+                f"linear_num_value_heads ({self.linear_num_value_heads}) must be a multiple of "
+                f"linear_num_key_heads ({self.linear_num_key_heads})."
+            )
+
+            # Check tensor parallelism compatibility
+            assert (
+                self.linear_num_key_heads % self.tensor_model_parallel_size == 0
+            ), "linear_num_key_heads must be a multiple of tensor_model_parallel_size."
+            assert (
+                self.linear_num_value_heads % self.tensor_model_parallel_size == 0
+            ), "linear_num_value_heads must be a multiple of tensor_model_parallel_size."
+
+            # Do not support yet, but coming soon.
+            assert self.context_parallel_size == 1, (
+                f"Gated delta net does not support context parallel for now,"
+                f" but got {self.context_parallel_size=}."
+            )
+
         if self.fp8:
             # cannot support first last layer bf16 with delayed scaling
             if self.first_last_layers_bf16 and self.fp8_recipe == Fp8Recipe.delayed:
@@ -1109,6 +1310,32 @@ def __post_init__(self):
             if "moe" not in self.recompute_modules:
                 self.recompute_modules.append("moe")
 
+        if self.fine_grained_activation_offloading:
+            assert (
+                not self.cpu_offloading
+            ), "fine_grained_activation_offloading cannot be enabled with cpu_offloading."
+            assert self.offload_modules is not None and len(self.offload_modules) > 0
+            allowed_modules = {
+                "core_attn",
+                "attn_proj",
+                "expert_fc1",
+                "moe_act",
+                "attn_norm",
+                "mlp_norm",
+                "qkv_linear",
+            }
+            invalid_modules = set(self.offload_modules) - allowed_modules
+            assert not invalid_modules, (
+                f'Invalid choices for offload_modules: {invalid_modules}. '
+                f'Allowed modules are: {allowed_modules}'
+            )
+            if "attn_proj" in self.offload_modules and "core_attn" not in self.offload_modules:
+                raise ValueError(
+                    "attn_proj cannot be set to offload_modules alone without core_attn "
+                    "because the input of attn_proj is the output of core_attn, "
+                    "which is needed in core_attn.backward()."
+                )
+
         if (
             self.num_layers_in_first_pipeline_stage is not None
             or self.num_layers_in_last_pipeline_stage is not None
@@ -1170,7 +1397,7 @@ def __post_init__(self):
                 self.virtual_pipeline_model_parallel_size = detected_vpp_size
 
             # Check whether the layout is valid.
-            self.pipeline_model_parallel_layout.validate_layer_layout(
+            self.mtp_standalone = self.pipeline_model_parallel_layout.validate_layer_layout(
                 num_layers=self.num_layers, mtp_num_layers=self.mtp_num_layers
             )
 
@@ -1220,6 +1447,13 @@ def __post_init__(self):
                 num_layers -= self.num_layers_in_last_pipeline_stage
                 pipeline_parallel_size -= 1
 
+            # Ensure you either have middle pp stages and layers or none of them.
+            if bool(num_layers) != bool(pipeline_parallel_size):
+                raise ValueError(
+                    f"Mismatch: {num_layers} middle layers remaining but {pipeline_parallel_size} "
+                    f"middle PP stages available."
+                )
+
             # Here pipeline_parallel_size is the number of middle PP stages. If there are middle
             # PP stages, check number of layers at middle stage is divisible by middle PP size.
             if pipeline_parallel_size and not num_layers % pipeline_parallel_size == 0:
@@ -1362,6 +1596,10 @@ def __post_init__(self):
                         "apply_rope_fusion is not available. Please install TE >= 1.4."
                     )
 
+        if self.fused_single_qkv_rope:
+            if self.attention_output_gate:
+                raise ValueError("fused_single_qkv_rope does not support gated attention for now.")
+
         if self.multi_latent_attention and self.rotary_interleaved:
             raise ValueError("rotary_interleaved does not work with multi_latent_attention.")
 
@@ -1491,30 +1729,144 @@ def __post_init__(self):
                     'use cuda_graph_impl=transformer_engine instead.'
                 )
                 self.cuda_graph_impl = "transformer_engine"
+
+        if self.cuda_graph_scope is None:
+            self.cuda_graph_scope = []
+        elif not isinstance(self.cuda_graph_scope, list):
+            if isinstance(self.cuda_graph_scope, CudaGraphScope):
+                self.cuda_graph_scope = [self.cuda_graph_scope]
+            else:
+                assert isinstance(self.cuda_graph_scope, str), (
+                    "cuda_graph_scope must be a string that can be converted to a list of "
+                    f"CudaGraphScope, got {self.cuda_graph_scope}."
+                )
+                self.cuda_graph_scope = self.cuda_graph_scope.split(',')
+        if all(isinstance(scope, str) for scope in self.cuda_graph_scope):
+            # Backward compatibility for "full" scope. Now we use an empty list instead.
+            if "full" in self.cuda_graph_scope:
+                assert self.cuda_graph_scope == [
+                    "full"
+                ], "full scope cannot be used with other scopes."
+                warnings.warn(
+                    "full scope is deprecated. "
+                    "Use empty cuda_graph_scope to capture the whole layer."
+                )
+                self.cuda_graph_scope = []
+            else:
+                self.cuda_graph_scope = [CudaGraphScope[scope] for scope in self.cuda_graph_scope]
+        assert all(
+            isinstance(scope, CudaGraphScope) for scope in self.cuda_graph_scope
+        ), f"cuda_graph_scope must be a list of CudaGraphScope, got {self.cuda_graph_scope}."
+
         if self.cuda_graph_impl != "none":
             assert self.cuda_graph_impl in [
                 "transformer_engine",
                 "local",
             ], f"Invalid cuda graph implementation: {self.cuda_graph_impl}"
+
             if self.cpu_offloading:
                 raise ValueError("CUDA graphs not supported with CPU offloading.")
-            if self.recompute_granularity:
+
+            if self.cuda_graph_impl == "local":
+                # local impl doesn't currently distinguish between moe_preproocess or moe_router
+                # so just set both if either is specified.
                 if (
-                    self.recompute_granularity != "selective"
-                    or self.cuda_graph_impl != "transformer_engine"
-                    or self.cuda_graph_scope != "attn"
+                    CudaGraphScope.moe_router in self.cuda_graph_scope
+                    or CudaGraphScope.moe_preprocess in self.cuda_graph_scope
                 ):
-                    raise ValueError("CUDA graphs not supported with activation recomputation.")
+                    if CudaGraphScope.moe_router not in self.cuda_graph_scope:
+                        self.cuda_graph_scope.append(CudaGraphScope.moe_router)
+                    if CudaGraphScope.moe_preprocess not in self.cuda_graph_scope:
+                        self.cuda_graph_scope.append(CudaGraphScope.moe_preprocess)
+
+            # Check cuda graph scopes
+            if self.cuda_graph_impl == "transformer_engine":
+                assert CudaGraphScope.full_iteration not in self.cuda_graph_scope, (
+                    "To use full iteration cuda graph, please use "
+                    "cuda_graph_impl=local instead of cuda_graph_impl=transformer_engine."
+                )
+            assert (
+                CudaGraphScope.moe not in self.cuda_graph_scope
+                or CudaGraphScope.moe_router not in self.cuda_graph_scope
+            ), 'cuda_graph_scope must not contain both moe and moe_router.'
+            if CudaGraphScope.moe_preprocess in self.cuda_graph_scope:
+                assert (
+                    CudaGraphScope.moe_router in self.cuda_graph_scope
+                ), 'moe_preprocess cuda graph is only supported with moe_router cuda graph.'
+            if self.num_moe_experts is None or self.num_moe_experts <= 1:
+                assert (
+                    CudaGraphScope.moe not in self.cuda_graph_scope
+                    and CudaGraphScope.moe_router not in self.cuda_graph_scope
+                ), 'moe cuda graph is only supported for MoE.'
+            else:
+                if self.moe_layer_freq == 1 or (
+                    isinstance(self.moe_layer_freq, list) and 0 not in self.moe_layer_freq
+                ):
+                    assert CudaGraphScope.mlp not in self.cuda_graph_scope, (
+                        'mlp cuda graph is only supported for dense layers, '
+                        'but not found in the model.'
+                    )
+                if (
+                    self.moe_expert_capacity_factor is None
+                    or not self.moe_pad_expert_input_to_capacity
+                ):
+                    assert (
+                        CudaGraphScope.moe not in self.cuda_graph_scope
+                    ), 'moe cuda graph is only supported with drop-padding MoE.'
+                    if self.moe_token_dispatcher_type == 'alltoall' and (
+                        self.moe_expert_capacity_factor is not None
+                        or self.moe_router_padding_for_fp8
+                    ):
+                        assert CudaGraphScope.moe_preprocess not in self.cuda_graph_scope, (
+                            'moe_preprocess cuda graph is not supported when there are '
+                            'DtoH copies and synchronizations in the preprocess step.'
+                        )
+
+            if self.recompute_granularity:
+                if self.recompute_granularity != "selective":
+                    assert self.cuda_graph_scope == [
+                        CudaGraphScope.full_iteration
+                    ], "full recompute is only supported with full iteration CUDA graph."
                 else:
-                    for module in self.recompute_modules:
-                        if module in ['core_attn', 'mla_up_proj']:
-                            raise ValueError(
-                                f'attn cuda graph is not supported with {module} recompute.'
-                            )
-                    if "layernorm" in self.recompute_modules:
-                        warnings.warn(
-                            "input_layernorm recompute is not supported with attention "
-                            "cudagraph. Will only recompute the pre_mlp_layernorm."
+                    # The recompute module should be inside or outside of the graph scope.
+                    # Recompute module coverring graph scope is not allowed.
+                    if (
+                        self.cuda_graph_impl == "transformer_engine"
+                        and "moe" in self.recompute_modules
+                    ):
+                        assert (
+                            CudaGraphScope.moe_router not in self.cuda_graph_scope
+                        ), "moe recompute is not supported with moe_router CUDA graph with: "
+                        "--cuda-graph-impl transformer_engine."
+
+                    # Graphed recompute module doesn't accept random number.
+                    if (
+                        not self.cuda_graph_scope
+                        or CudaGraphScope.full_iteration in self.cuda_graph_scope
+                    ):
+                        full_cudagraph = True
+                    else:
+                        full_cudagraph = False
+                    if self.attention_dropout != 0.0:
+                        assert (
+                            not full_cudagraph and CudaGraphScope.attn not in self.cuda_graph_scope
+                        ) or "core_attn" not in self.recompute_modules, (
+                            "attention dropout is not supported with graphed attention "
+                            "recomputation."
+                        )
+                    if self.hidden_dropout != 0.0:
+                        assert (
+                            (not full_cudagraph and CudaGraphScope.mlp not in self.cuda_graph_scope)
+                            or "mlp" not in self.recompute_modules
+                        ) and (
+                            (not full_cudagraph and CudaGraphScope.moe not in self.cuda_graph_scope)
+                            or "moe" not in self.recompute_modules
+                        ), "hidden dropout is not supported with graphed MLP/MoE recomputation."
+                    if self.moe_input_jitter_eps is not None:
+                        assert (
+                            not full_cudagraph and CudaGraphScope.moe not in self.cuda_graph_scope
+                        ) or "moe" not in self.recompute_modules, (
+                            "moe_input_jitter_eps is not supported with graphed moe recomputation."
                         )
 
         if self.moe_token_dispatcher_type in ["allgather"]:
@@ -1582,6 +1934,21 @@ def __post_init__(self):
             assert (
                 self.mtp_num_layers is None or self.mtp_num_layers == 1
             ), 'MTP layernum only supports 1 when enabling overlap_moe_expert_parallel_comm.'
+            if self.mtp_num_layers == 1:
+                assert self.pipeline_model_parallel_size > 1, (
+                    'Pipeline model parallel size must be larger than 1 '
+                    'when enabling overlap_moe_expert_parallel_comm with MTP layer.'
+                )
+
+            if self.cuda_graph_impl != "none":
+                assert (
+                    self.cuda_graph_impl == "transformer_engine"
+                    and CudaGraphScope.moe not in self.cuda_graph_scope
+                    and CudaGraphScope.mlp not in self.cuda_graph_scope
+                ), (
+                    'CUDA graph scope on moe and mlp is not '
+                    'supported with overlap_moe_expert_parallel_comm'
+                )
 
         # Check delay_wgrad_compute compatibility
         if self.delay_wgrad_compute:
@@ -1591,6 +1958,17 @@ def __post_init__(self):
             assert (
                 not self.moe_use_legacy_grouped_gemm
             ), 'delay_wgrad_compute is not supported with legacy groupedgemm implementation'
+            if self.cuda_graph_impl == "transformer_engine":
+                assert is_te_min_version("2.10.0"), (
+                    'TE version >= 2.10.0 is required for delay_wgrad_compute with '
+                    'partial cuda graph'
+                )
+
+        if self.ep_overlap_early_attn_memory_release:
+            assert self.overlap_moe_expert_parallel_comm, (
+                'overlap_moe_expert_parallel_comm must be enabled when enabling '
+                'ep_overlap_early_attn_memory_release'
+            )
 
         if self.context_parallel_size > 1 and self.cp_comm_type is not None:
             if isinstance(self.cp_comm_type, list):
@@ -1651,8 +2029,27 @@ def __post_init__(self):
             assert not self.add_qkv_bias
             assert not self.use_kitchen
 
+        ######### FlagScale Begin #########
         if self.moe_fb_overlap:
             self.delay_wgrad_compute = True
+        ######### FlagScale End #########
+
+        if self.experimental_attention_variant == "dsa":
+            assert (
+                self.context_parallel_size == 1
+            ), "Currently context parallelism is not supported by DSAttention!"
+            assert not self.apply_rope_fusion, "RoPE fusion is not supported for DSAttention"
+
+        if self.inference_fuse_tp_communication:
+            assert self.transformer_impl == "inference_optimized", (
+                "inference_fuse_tp_communication is only supported "
+                "for inference_optimized transformer implementation."
+            )
+
+        if self.batch_invariant_mode:
+            assert (
+                self.attention_backend == AttnBackend.flash
+            ), "Batch invariant mode only supports FlashAttention"
 
 
 @dataclass
@@ -1721,6 +2118,9 @@ def __post_init__(self):
         if self.multi_latent_attention and self.apply_rope_fusion and self.rope_type != "yarn":
             raise ValueError("apply_rope_fusion for MLA only works with YARN RoPE.")
 
+        if self.attention_output_gate:
+            raise NotImplementedError("Output gate is not supported for MLA yet.")
+
         if self.cache_mla_latents:
             assert (
                 self.apply_rope_fusion is False
diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py
index 5888e3e649f..09c0a0fcb94 100644
--- a/megatron/core/transformer/transformer_layer.py
+++ b/megatron/core/transformer/transformer_layer.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import functools
 import logging
 import warnings
 from abc import ABC
@@ -15,7 +16,8 @@
 from megatron.core.dist_checkpointing.utils import apply_prefix_mapping
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.enums import LayerType
+from megatron.core.transformer.cuda_graphs import is_graph_capturing
+from megatron.core.transformer.enums import CudaGraphScope, LayerType
 from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp
 from megatron.core.transformer.mlp import MLP
 from megatron.core.transformer.module import GraphableMegatronModule
@@ -338,13 +340,14 @@ def __init__(
         pg_collection: Optional[ProcessGroupCollection] = None,
         vp_stage: Optional[int] = None,
     ):
+        self.submodules_config = submodules
         super().__init__(config=config, vp_stage=vp_stage)
 
         if pg_collection is None:
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
         self.pg_collection = pg_collection
+        self.tp_group = pg_collection.tp
 
-        self.submodules_config = submodules
         self.layer_number = layer_number + get_transformer_layer_offset(
             self.config, vp_stage, get_pg_rank(pg_collection.pp)
         )
@@ -442,19 +445,64 @@ def __init__(
         # [Module 9: BiasDropoutFusion]
         self.mlp_bda = build_module(submodules.mlp_bda)
 
+        self.is_moe_layer = isinstance(self.mlp, MoELayer)
+
         self.recompute_input_layernorm = False
         self.recompute_pre_mlp_layernorm = False
         self.recompute_mlp = False
         if self.config.recompute_granularity == 'selective':
             if "layernorm" in self.config.recompute_modules:
-                if (
-                    not isinstance(self.input_layernorm, IdentityOp)
-                    and self.config.cuda_graph_impl == "none"
-                ):
+                if not isinstance(self.input_layernorm, IdentityOp):
                     self.recompute_input_layernorm = True
                     if self.config.fp8 or self.config.fp4:
                         self.self_attention.set_for_recompute_input_layernorm()
-                if not isinstance(self.pre_mlp_layernorm, IdentityOp):
+
+                def can_recompute_pre_mlp_layernorm_for_cudagraph():
+                    if (
+                        not self.is_moe_layer
+                        or CudaGraphScope.moe_router not in self.config.cuda_graph_scope
+                        or self.config.cuda_graph_impl == "local"
+                    ):
+                        # Not a MoE layer, or not capturing the router part.
+                        return True
+                    if (
+                        self.config.moe_shared_expert_intermediate_size is not None
+                        and self.config.moe_shared_expert_overlap
+                    ):
+                        # If shared expert overlap is used, we cannot make the pre-mlp layernorm
+                        # recomputation, because the shared expert takes the layernorm output as
+                        # input, and it is outside of the CUDA graph scope.
+                        log_single_rank(
+                            logger,
+                            logging.WARNING,
+                            "pre_mlp_layernorm recompute is not supported with moe router "
+                            "cudagraph + shared expert overlap. Disabling pre_mlp_layernorm "
+                            "recompute.",
+                        )
+                        return False
+                    if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope and (
+                        self.config.moe_token_dispatcher_type == "alltoall"
+                        or self.config.moe_latent_size
+                    ):
+                        # Only when capturing the preprocess part and using alltoall token
+                        # dispatcher or latent MoE can we make the pre-mlp layernorm recomputation.
+                        # Because in other cases the layernorm output returns directly as one of the
+                        # outputs of the cudagraph, which will be allocated a static buffer, thus
+                        # not able to be released.
+                        return True
+                    log_single_rank(
+                        logger,
+                        logging.WARNING,
+                        "pre_mlp_layernorm recompute is only supported with moe router + "
+                        "preprocess cudagraph will alltoall token dispatcher or latent MoE. "
+                        "Disabling pre_mlp_layernorm recompute.",
+                    )
+                    return False
+
+                if (
+                    not isinstance(self.pre_mlp_layernorm, IdentityOp)
+                    and can_recompute_pre_mlp_layernorm_for_cudagraph()
+                ):
                     self.recompute_pre_mlp_layernorm = True
                     if self.config.fp8 or self.config.fp4:
                         if isinstance(self.mlp, MoELayer):
@@ -466,8 +514,18 @@ def __init__(
 
                             set_save_original_input(self.mlp.linear_fc1)
             if "mlp" in self.config.recompute_modules:
-                if not isinstance(self.mlp, MoELayer):
+                if not self.is_moe_layer:
                     self.recompute_mlp = True
+        self.offload_attn_norm = (
+            self.config.fine_grained_activation_offloading
+            and "attn_norm" in self.config.offload_modules
+            and not isinstance(self.input_layernorm, IdentityOp)
+        )
+        self.offload_mlp_norm = (
+            self.config.fine_grained_activation_offloading
+            and "mlp_norm" in self.config.offload_modules
+            and not isinstance(self.pre_mlp_layernorm, IdentityOp)
+        )
 
         # @jcasper how should we handle nvfuser?
         # Set bias+dropout+add fusion grad_enable execution handler.
@@ -477,6 +535,27 @@ def __init__(
         # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
         self.bias_dropout_add_exec_handler = torch.enable_grad
 
+    def create_mcore_cudagraph_manager(self, config):
+        """Register the transformer layer for cudagraphs."""
+
+        from megatron.core.transformer.cuda_graphs import CudaGraphManager
+
+        # If full scope, just cudagraph the entire layer
+        if not self.config.cuda_graph_scope:
+            self.cudagraph_manager = CudaGraphManager(config)
+        elif (
+            CudaGraphScope.attn in self.config.cuda_graph_scope
+            and self.submodules_config.self_attention != IdentityOp
+        ):
+            self.cudagraph_manager = CudaGraphManager(config)
+        elif (
+            CudaGraphScope.mlp in self.config.cuda_graph_scope
+            and self.submodules_config.mlp != IdentityOp
+        ):
+            # Cudagraphing MoE layers are supposed handled by MoeTransforerLayer
+            assert not self.is_moe_layer
+            self.cudagraph_manager = CudaGraphManager(config)
+
     @staticmethod
     def _get_layer_offset(config: TransformerConfig):
         """
@@ -503,7 +582,11 @@ def forward(self, *args, **kwargs):
         # runners in the cuda graph manager
         kwargs.pop("dynamic_inference_decode_only", None)
         hidden_states, context = self._forward_attention(*args, **kwargs)
-        output = self._forward_mlp(hidden_states, kwargs.get("inference_context", None))
+        output = self._forward_mlp(
+            hidden_states,
+            kwargs.get("inference_context", None),
+            padding_mask=kwargs.get("padding_mask", None),
+        )
         return output, context
 
     def _forward_attention(
@@ -520,6 +603,7 @@ def _forward_attention(
         inference_context: Optional[Any] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
         sequence_len_offset: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
         *,
         inference_params: Optional[Any] = None,
     ):
@@ -550,6 +634,9 @@ def _forward_attention(
                 context (Tensor): Updated context tensor if cross-attention is used,
                 otherwise None.
         """
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            FineGrainedActivationOffloadingInterface as off_interface,
+        )
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
@@ -559,11 +646,22 @@ def _forward_attention(
         # Optional Input Layer norm
         if self.recompute_input_layernorm:
             self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            input_layernorm_output = self.input_layernorm_checkpoint.checkpoint(
-                self.input_layernorm, hidden_states
-            )
+            with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states:
+                input_layernorm_output = self.input_layernorm_checkpoint.checkpoint(
+                    self.input_layernorm, hidden_states
+                )
         else:
-            input_layernorm_output = self.input_layernorm(hidden_states)
+            with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states:
+                input_layernorm_output = self.input_layernorm(hidden_states)
+
+        using_fused_tp_inference_kernel = (not self.training) and (
+            self.config.inference_fuse_tp_communication
+        )
+
+        if using_fused_tp_inference_kernel:
+            # Set the residual for fused reduce-scatter + add + layer-norm + all-gather
+            # operation in attention's out_proj (linear_proj)
+            self._set_proj_residual(residual)
 
         # Self attention.
         nvtx_range_push(suffix="self_attention")
@@ -591,12 +689,25 @@ def _forward_attention(
         # TODO: could we move `bias_dropout_add_exec_handler` itself
         # inside the module provided in the `bias_dropout_add_spec` module?
         nvtx_range_push(suffix="self_attn_bda")
-        with self.bias_dropout_add_exec_handler():
-            hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
-                attention_output_with_bias, residual, self.hidden_dropout
-            )
+        if using_fused_tp_inference_kernel:
+            # In inference optimized transformer layer, there is no bias and dropout
+            # The remaining residual add is already handled inside the
+            # self attention module.
+            hidden_states = attention_output_with_bias[0]
+        else:
+            with self.bias_dropout_add_exec_handler():
+                hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
+                    attention_output_with_bias, residual, self.hidden_dropout
+                )
         nvtx_range_pop(suffix="self_attn_bda")
 
+        # Delay the offload of the attention norm until after the self_attn_bda has been computed
+        # because the residual is needed in the self_attn_bda.
+        if self.offload_attn_norm:
+            hidden_states = off_interface.group_commit(
+                hidden_states, name="attn_norm", forced_released_tensors=[residual]
+            )
+
         # Residual connection.
         residual = hidden_states
 
@@ -623,13 +734,35 @@ def _forward_attention(
 
         return hidden_states, context
 
-    def _forward_mlp(self, hidden_states, inference_context=None):
+    def _forward_pre_mlp_layernorm(self, hidden_states):
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            FineGrainedActivationOffloadingInterface as off_interface,
+        )
+
+        if self.recompute_pre_mlp_layernorm:
+            self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
+            with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states:
+                pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint(
+                    self.pre_mlp_layernorm, hidden_states
+                )
+        else:
+            with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states:
+                pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
+
+        return pre_mlp_layernorm_output
+
+    def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None):
         """
         Perform a forward pass through the feed-forward layer.
 
         Args:
             hidden_states (Tensor): Transformed hidden states before the MLP layernorm.
-
+                Shape [seq_length, batch_size, hidden_size].
+            inference_context: Inference context for optimizations.
+            padding_mask (Tensor, optional): Padding mask for MoE routing.
+                Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
+                Only used for MoE layers to exclude padding tokens from aux loss computations.
+                The MoELayer will internally transform this to [seq_length, bsz] format.
         Returns:
             output (Tensor): Transformed hidden states of shape [s, b, h].
         """
@@ -638,13 +771,7 @@ def _forward_mlp(self, hidden_states, inference_context=None):
         residual = hidden_states
 
         # Optional Layer norm post the cross-attention.
-        if self.recompute_pre_mlp_layernorm:
-            self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint(
-                self.pre_mlp_layernorm, hidden_states
-            )
-        else:
-            pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
+        pre_mlp_layernorm_output = self._forward_pre_mlp_layernorm(hidden_states)
 
         nvtx_range_push(suffix="mlp")
         # Potentially chunk the MLP computation during prefill to minimize the peak activation size
@@ -653,6 +780,11 @@ def _forward_mlp(self, hidden_states, inference_context=None):
             and inference_context is not None
             and not inference_context.is_decode_only()
             and not isinstance(self.mlp, IdentityOp)
+            and not self.config.transformer_impl == "inference_optimized"
+        )
+
+        using_fused_tp_inference_kernel = (not self.training) and (
+            self.config.inference_fuse_tp_communication
         )
 
         if self.recompute_mlp:
@@ -666,10 +798,13 @@ def _forward_mlp(self, hidden_states, inference_context=None):
                     tensor_parallel.random.get_cuda_rng_tracker,
                     self.pg_collection.tp,
                     pre_mlp_layernorm_output,
+                    padding_mask=padding_mask,
                 )
             else:
                 mlp_output_with_bias = tensor_parallel.checkpoint(
-                    self.mlp, False, pre_mlp_layernorm_output
+                    functools.partial(self.mlp, padding_mask=padding_mask),
+                    False,
+                    pre_mlp_layernorm_output,
                 )
         elif should_chunk_mlp_for_prefill:
             # Chunk input along sequence dimension
@@ -684,9 +819,51 @@ def _forward_mlp(self, hidden_states, inference_context=None):
             bias_chunks = [bias for _, bias in outputs if bias is not None]
             bias_output = torch.stack(bias_chunks, dim=0).sum(dim=0) if bias_chunks else None
             mlp_output_with_bias = (mlp_output, bias_output)
+        else:
+            if using_fused_tp_inference_kernel:
+                # Set the residual for fused reduce-scatter + add + layer-norm + all-gather
+                # operation in MLP's fc2.
+                self._set_fc2_residual(residual)
+            mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask)
 
+        nvtx_range_pop(suffix="mlp")
+
+        if (
+            self.is_moe_layer
+            and self.config.cuda_graph_impl == "transformer_engine"
+            and self.training
+            and is_graph_capturing()
+            and CudaGraphScope.moe_router in self.config.cuda_graph_scope
+        ):
+            if self.recompute_pre_mlp_layernorm:
+                # Register the recompute hooks to all the cudagraph output tensors, because some
+                # tensors are in parallel execution paths and they all need pre_mlp_layernorm to be
+                # recomputed in backward pass. For example, the router path and the shared expert
+                # path. So only register in one path is risky.
+                for tensor in mlp_output_with_bias:
+                    self.pre_mlp_norm_checkpoint.discard_output_and_register_recompute(tensor)
+            return list(mlp_output_with_bias) + [residual]
         else:
-            mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output)
+            return self._forward_post_mlp(mlp_output_with_bias, residual)
+
+    def _forward_post_mlp(self, mlp_output_with_bias, residual):
+        """
+        Perform operations after the MLP computation.
+
+        Args:
+            mlp_output_with_bias (Tensor): Output tensor of the MLP layer with bias.
+            residual (Tensor): Residual tensor.
+
+        Returns:
+            output (Tensor): Transformed hidden states of shape [s, b, h].
+        """
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            FineGrainedActivationOffloadingInterface as off_interface,
+        )
+
+        using_fused_tp_inference_kernel = (not self.training) and (
+            self.config.inference_fuse_tp_communication
+        )
 
         if self.recompute_pre_mlp_layernorm:
             # discard the output of the pre-mlp layernorm and register the recompute
@@ -694,16 +871,27 @@ def _forward_mlp(self, hidden_states, inference_context=None):
             self.pre_mlp_norm_checkpoint.discard_output_and_register_recompute(
                 mlp_output_with_bias[0]
             )
-        nvtx_range_pop(suffix="mlp")
 
         # TODO: could we move `bias_dropout_add_exec_handler` itself
         # inside the module provided in the `bias_dropout_add_spec` module?
         nvtx_range_push(suffix="mlp_bda")
-        with self.bias_dropout_add_exec_handler():
-            hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
-                mlp_output_with_bias, residual, self.hidden_dropout
-            )
+        if using_fused_tp_inference_kernel:
+            # In inference optimized transformer layer, there is no bias and dropout
+            # The remaining residual add is already handled inside the
+            # MLP module.
+            hidden_states = mlp_output_with_bias[0]
+        else:
+            with self.bias_dropout_add_exec_handler():
+                hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
+                    mlp_output_with_bias, residual, self.hidden_dropout
+                )
         nvtx_range_pop(suffix="mlp_bda")
+        # Delay the offload of the mlp norm until after the mlp_bda has been computed
+        # because the residual is needed in the mlp_bda.
+        if self.offload_mlp_norm:
+            hidden_states = off_interface.group_commit(
+                hidden_states, name="mlp_norm", forced_released_tensors=[residual]
+            )
 
         # Jit compiled function creates 'view' tensor. This tensor
         # potentially gets saved in the MPU checkpoint function context,
@@ -740,6 +928,66 @@ def sharded_state_dict(
             apply_prefix_mapping(sharded_state_dict, prefixed_map)
         return sharded_state_dict
 
+    def configure_fused_tp_inference(
+        self,
+        skip_qkv_norm_and_all_gather: bool = False,
+        fc2_next_layer_norm_weights: Optional[Tensor] = None,
+    ):
+        """
+        Configure settings for fused TP communication in inference mode.
+
+        Args:
+            skip_qkv_norm (bool): Whether to skip norm and all-gather for linear_qkv.
+            fc2_next_layer_norm_weights (Optional[Tensor]): Next layer's QKV norm weights
+                for current layer's MLP FC2.
+        """
+        self.self_attention.linear_qkv.skip_norm_and_all_gather = skip_qkv_norm_and_all_gather
+
+        # Use current layer's own MLP FC1 norm weights for attention's/mixer's out_proj
+        mlp_fc1_weights = self.get_mlp_layer_norm_weights()
+        self._set_proj_next_layer_norm_weights(mlp_fc1_weights)
+
+        self.mlp.linear_fc1.skip_norm_and_all_gather = True
+        # Use next layer's attention norm weights for current layer's MLP FC2
+        self._set_fc2_next_layer_norm_weights(fc2_next_layer_norm_weights)
+
+    def _set_proj_next_layer_norm_weights(self, weights: Tensor):
+        """Set next layer norm weights for attention/mixer's linear_proj."""
+        self.self_attention.linear_proj._set_next_layer_norm_weights(weights)
+
+    def _set_fc2_next_layer_norm_weights(self, weights: Optional[Tensor]):
+        """Set next layer norm weights for MLP FC2."""
+        if weights is None:
+            # Create dummy tensor for last layer (same shape as fc1 norm weights)
+            weights = torch.empty_like(self.get_mlp_layer_norm_weights())
+        self.mlp.linear_fc2._set_next_layer_norm_weights(weights)
+
+    def _set_proj_residual(self, residual: Tensor):
+        """Set residual for attention's/mixer's out_proj (linear_proj)."""
+        self.self_attention.linear_proj._set_residual(residual)
+
+    def _set_fc2_residual(self, residual: Tensor):
+        """Set residual for MLP FC2."""
+        self.mlp.linear_fc2._set_residual(residual)
+
+    def get_mlp_layer_norm_weights(self) -> Tensor:
+        """
+        Get the MLP FC1 layer norm weights.
+
+        Returns:
+            Tensor: The layer norm weight data.
+        """
+        return self.mlp.linear_fc1.layer_norm_weight.data
+
+    def get_qkv_layer_norm_weights(self) -> Tensor:
+        """
+        Get the QKV layer norm weights.
+
+        Returns:
+            Tensor: The layer norm weight data.
+        """
+        return self.self_attention.linear_qkv.layer_norm_weight.data
+
     def get_layer_static_inputs(self, seq_length, micro_batch_size):
         """
         Get the static inputs for the transformer layer. Besides the hidden_states that is
@@ -750,7 +998,9 @@ def get_layer_static_inputs(self, seq_length, micro_batch_size):
         """
         static_inputs = super().get_layer_static_inputs(seq_length, micro_batch_size)
 
-        if not isinstance(self.self_attention, IdentityOp):
+        if not isinstance(self.self_attention, IdentityOp) and (
+            not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope
+        ):
             slen_per_cp = seq_length // self.config.context_parallel_size
             static_inputs["attention_mask"] = (
                 ~(torch.tril(torch.ones((slen_per_cp, seq_length))).bool())
@@ -764,18 +1014,28 @@ def _get_submodules_under_cudagraphs(self):
         """
         Get the submodules that are covered by cudagraphs.
         """
-        if self.config.cuda_graph_scope == 'full':
-            submodules = [self]
-        else:
-            assert (
-                self.config.cuda_graph_scope == 'attn'
-            ), f"Invalid cuda_graph_scope {self.config.cuda_graph_scope}"
-            submodules = [
+        if not self.config.cuda_graph_scope:
+            return super()._get_submodules_under_cudagraphs()
+
+        submodules = []
+        if CudaGraphScope.attn in self.config.cuda_graph_scope:
+            submodules += [
                 self.input_layernorm,
                 self.self_attention,
                 self.pre_cross_attn_layernorm,
                 self.cross_attention,
             ]
+        if (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or (
+            self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope
+        ):
+            submodules += [self.pre_mlp_layernorm, self.mlp]
+        elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope:
+            submodules += [self.pre_mlp_layernorm, self.mlp.router]
+            if (
+                self.config.moe_shared_expert_intermediate_size is not None
+                and not self.config.moe_shared_expert_overlap
+            ):
+                submodules += [self.mlp.shared_experts]
         return submodules
 
     def _te_cuda_graph_capture(self, *args, **kwargs):
@@ -786,12 +1046,31 @@ def _te_cuda_graph_capture(self, *args, **kwargs):
            attribute can be set to control the scope of the CUDA graph.
         2. If context is None, it cannot be returned as output.
         """
-        hidden_states, context = self._forward_attention(*args, **kwargs)
-
-        if self.config.cuda_graph_scope == "full":
+        context = None
+        if not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope:
+            hidden_states, context = self._forward_attention(*args, **kwargs)
+        else:
+            if len(args) > 0:
+                hidden_states = args[0]
+            else:
+                hidden_states = kwargs.pop("hidden_states")
+
+        if (
+            not self.config.cuda_graph_scope
+            or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope)
+            or (
+                self.is_moe_layer
+                and (
+                    CudaGraphScope.moe in self.config.cuda_graph_scope
+                    or CudaGraphScope.moe_router in self.config.cuda_graph_scope
+                )
+            )
+        ):
             hidden_states = self._forward_mlp(hidden_states)
-        cuda_graph_outputs = [hidden_states]
-
+        if not isinstance(hidden_states, list) and not isinstance(hidden_states, tuple):
+            cuda_graph_outputs = [hidden_states]
+        else:
+            cuda_graph_outputs = list(hidden_states)
         if context is not None:
             cuda_graph_outputs.append(context)
         return tuple(cuda_graph_outputs)
@@ -803,6 +1082,11 @@ def _te_cuda_graph_replay(self, *args, **kwargs):
         However, CUDA graph accepts only Tensor inputs.
         Hence, `inference_context` and `packed_seq_params` are excluded from input list.
         """
+        context = None
+        if self.config.cuda_graph_scope and CudaGraphScope.attn not in self.config.cuda_graph_scope:
+            hidden_states, context = self._forward_attention(*args, **kwargs)
+            args = (hidden_states,)
+            kwargs = {}
 
         assert (kwargs.get('inference_context') is None) and (
             kwargs.get('packed_seq_params') is None
@@ -812,19 +1096,99 @@ def _te_cuda_graph_replay(self, *args, **kwargs):
             "For inference cuda graph, please use cuda_graph_impl=local instead."
         )
 
-        cuda_graph_output = super()._te_cuda_graph_replay(*args, **kwargs)
+        cuda_graph_output = list(super()._te_cuda_graph_replay(*args, **kwargs))
 
         if kwargs.get('context') is not None:
-            context = cuda_graph_output[-1]
-            cuda_graph_output = cuda_graph_output[:-1]
+            context = cuda_graph_output.pop()
+
+        if (
+            not self.config.cuda_graph_scope
+            or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope)
+            or (self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope)
+        ):
+            # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output.
+            assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output."
+            output = cuda_graph_output.pop()
+            assert (
+                not self.config.overlap_moe_expert_parallel_comm
+            ), "EP overlap must be \
+                disabled when CUDA graph captures the whole MLP/MoE part."
+        elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope:
+            # CUDA Graph partially captures the MoE.
+            # The rest of the layer should go to the normal pass.
+            shared_expert_output, routing_map = None, None
+            # residual is the last element in the CUDA graph output.
+            residual = cuda_graph_output.pop()
+            if (
+                self.config.moe_shared_expert_intermediate_size is not None
+                and not self.config.moe_shared_expert_overlap
+            ):
+                # The shared expert output is the last second element in the CUDA graph output.
+                shared_expert_output = cuda_graph_output.pop()
+
+            if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope:
+                # CUDA graph output is [hidden_states, probs] + attributes outputs.
+                (hidden_states, probs), attr_outputs = cuda_graph_output[:2], cuda_graph_output[2:]
+                valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs
+                assert len(attr_outputs) == len(
+                    valid_cudagraph_attrs
+                ), f"attr_outputs: {len(attr_outputs)} != {len(valid_cudagraph_attrs)}"
+                for i, attr_name in enumerate(valid_cudagraph_attrs):
+                    hier_attr_name = attr_name.split('.')
+                    attr = self.mlp.token_dispatcher
+                    for name in hier_attr_name[:-1]:
+                        attr = getattr(attr, name)
+                    setattr(attr, hier_attr_name[-1], attr_outputs[i])
+            else:
+                # CUDA graph output is [hidden_states, probs, routing_map].
+                assert len(cuda_graph_output) == 3, (
+                    "CUDA graph output should be [hidden_states, probs, routing_map], "
+                    f"but got {len(cuda_graph_output)} elements"
+                )
+                hidden_states, probs, routing_map = cuda_graph_output
+
+            # Resume the MoELayer forward pass from the end of the CUDA graph scope.
+            # The MoE layer will skip redundant computations when we pass in the calculated values
+            # through the keyword arguments. See MoELayer.forward docstring for more details.
+            nvtx_range_push(suffix="mlp")
+            self.mlp.cudagraph_tensor_store.set(
+                hidden_states=hidden_states,
+                probs=probs,
+                routing_map=routing_map,
+                shared_expert_output=shared_expert_output,
+            )
+            # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables
+            # and should be skipped here.
+            if self.config.overlap_moe_expert_parallel_comm:
+                probs, routing_map = self.mlp.route(hidden_states)
+                hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map)
+                nvtx_range_pop(suffix="mlp")
+                return residual, hidden_states, probs, shared_expert_output
+            mlp_output_with_bias = self.mlp(hidden_states)
+            self.mlp.cudagraph_tensor_store.clear()
+            nvtx_range_pop(suffix="mlp")
+
+            # If we early returned, layernorm recompute hooks were attached to the output buffer
+            # of the cudagraph, so disable the recompute hooks inside _forward_post_mlp
+            recompute_pre_mlp_layernorm = self.recompute_pre_mlp_layernorm
+            self.recompute_pre_mlp_layernorm = False
+            output = self._forward_post_mlp(mlp_output_with_bias, residual)
+            self.recompute_pre_mlp_layernorm = recompute_pre_mlp_layernorm
         else:
-            context = None
-        if self.config.cuda_graph_scope == "attn":
-            # CUDA Graph only covers the attention layer. Feed-forward
-            # layer still goes through the normal pass.
+            # If EP overlap is enabled, needs to return same outputs as submodule.attn
+            if self.config.overlap_moe_expert_parallel_comm:
+                assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output."
+                residual = cuda_graph_output.pop()
+                if not self.is_moe_layer:
+                    return residual, None, None, None
+                hidden_states = self.pre_mlp_layernorm(residual)
+                shared_expert_output = self.mlp.shared_experts_compute(hidden_states)
+                probs, routing_map = self.mlp.route(hidden_states)
+                hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map)
+                return residual, hidden_states, probs, shared_expert_output
+
+            # CUDA Graph does not capture the MLP/MoE part at all.
             output = self._forward_mlp(*cuda_graph_output)
-        else:
-            output = cuda_graph_output[0]
         return output, context
 
     def _get_te_cuda_graph_replay_args(self, *args, **kwargs):
@@ -897,7 +1261,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
                 (kwargs.get('inference_context') is not None)
                 or (kwargs.get('inference_params') is not None)
             )
-            and self.config.cuda_graph_scope != 'full_iteration'
+            and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope
         ):
             if kwargs['inference_context'].is_static_batching():
                 using_cuda_graph = kwargs['inference_context'].is_decode_only()
@@ -920,4 +1284,172 @@ def __call__(self, *args, **kwargs):
                 kwargs["dynamic_inference_decode_only"] = kwargs[
                     'inference_context'
                 ].is_decode_only()
+
         return super().__call__(*args, **kwargs)
+
+    def get_layer_norm_weights(self):
+        """
+        Get the weights of all layernorms (attention and MLP) in the transformer layer.
+        Returns:
+            List[Tensor]: A list of layernorm weight tensors.
+        """
+        return
+
+
+class MoETransformerLayer(TransformerLayer):
+    """
+    A Transformer layer specialized for Mixture-of-Experts (MoE) architectures.
+
+    Implements specific functionality to support CUDA graph capture for MoE layers.
+    Due to the dynamic nature of MoE, capturing the entire layer in a single CUDA graph
+    can be challenging. This class supports "partial" CUDA graphs by decomposing the
+    MLP forward pass into router, expert-compute, and post-process stages.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self.is_moe_layer = True
+        self.use_partial_cudagraphs = False
+        self.moe_layer_recompute = False
+        self.token_dispatcher_attrs = {}
+
+        super().__init__(*args, **kwargs)
+
+    def create_mcore_cudagraph_manager(self, config):
+        """
+        Initializes the CUDA graph manager(s) for the MoE layer.
+
+        Unlike the standard layer which typically uses a single manager, this method
+        can configure multiple graph managers if partial CUDA graphs are enabled via
+        `cuda_graph_scope`. This allows capturing the static parts of the MoE pass
+        while leaving the expert computation to execute eagerly.
+        """
+
+        from megatron.core.transformer.cuda_graphs import CudaGraphManager
+
+        if not self.config.cuda_graph_scope or CudaGraphScope.moe in self.config.cuda_graph_scope:
+            self.cudagraph_manager = CudaGraphManager(config)
+        elif (
+            CudaGraphScope.moe_router in self.config.cuda_graph_scope
+            or CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope
+        ):
+            # full MoE layer recompute with partial_cudagraphs. If not partial cudagraphs, MoE
+            # layer recompute is handled by the moe_layer.MoELayer class
+            self.moe_layer_recompute = (
+                self.config.recompute_granularity == 'selective'
+                and "moe" in self.config.recompute_modules
+                and self.config.cuda_graph_impl == "local"
+            )
+
+            self.use_partial_cudagraphs = True
+            self.cudagraph_manager_router = CudaGraphManager(
+                self.config, self, function_name="_forward_mlp_router"
+            )
+            self.cudagraph_manager_postprocess = CudaGraphManager(
+                self.config, self, function_name="_forward_mlp_postprocess"
+            )
+
+    def _forward_mlp_router(self, hidden_states, padding_mask=None):
+        """
+        Executes the router phase of the MoE block.
+
+        This includes the pre-MLP layernorm and the routing logic.
+        This method is isolated so it can be captured by `cudagraph_manager_router`.
+        """
+
+        residual = hidden_states
+        self.mlp.fwd_execution_map = "route"
+        pre_mlp_layernorm_output = self._forward_pre_mlp_layernorm(hidden_states)
+        router_outputs = self.mlp(
+            pre_mlp_layernorm_output, intermediate_tensors=(), padding_mask=padding_mask
+        )
+
+        for attr_name in self.mlp.token_dispatcher.cudagraph_attrs:
+            attr = getattr(self.mlp.token_dispatcher, attr_name)
+            if torch.is_tensor(attr):
+                if attr_name in self.token_dispatcher_attrs:
+                    self.token_dispatcher_attrs[attr_name].copy_(attr)
+                else:
+                    self.token_dispatcher_attrs[attr_name] = attr.detach()
+
+        return residual, *router_outputs
+
+    def _forward_mlp_expert_compute(self, hidden_states, probs):
+        """
+        Executes the actual computation of the experts.
+
+        This phase takes the routing information and inputs, dispatches them to the
+        appropriate experts, and computes the results. In partial graph modes, this
+        step runs eagerly between the router and postprocess graph replays.
+        """
+
+        for name, attr in self.token_dispatcher_attrs.items():
+            setattr(self.mlp.token_dispatcher, name, attr)
+
+        self.mlp.fwd_execution_map = "expert_compute"
+        return self.mlp(None, intermediate_tensors=(hidden_states, probs))
+
+    def _forward_mlp_postprocess(self, residual, output, shared_expert_output, mlp_bias):
+        """
+        Executes the post-processing phase of the MoE block.
+
+        Handles combining the expert outputs, applying biases, re-registering
+        activation recomputation hooks if necessary, and performing the final
+        Bias-Dropout-Add. This method is isolated so it can be captured by cudagraphs.
+
+        """
+
+        self.mlp.fwd_execution_map = "postprocess"
+        output = self.mlp(None, intermediate_tensors=(output, shared_expert_output))
+        return self._forward_post_mlp((output, mlp_bias), residual)
+
+    def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None):
+        """
+        Orchestrates the MLP forward pass, handling partial CUDA graph execution logic.
+
+        If `use_partial_cudagraphs` is True, this method stitches together the
+        router, expert_compute, and postprocess calls.
+        """
+
+        if inference_context is not None:
+            assert not self.use_partial_cudagraphs, (
+                "Partial cudagraphs for MoEs were detected during inference!"
+                "Please do not use --cuda-graph-scope moe_router moe_preprocess "
+                "alongside inference."
+            )
+
+        def _forward_mlp_partial_cudagraphs(
+            hidden_states, inference_context=None, padding_mask=None
+        ):
+            residual, hidden_states, probs, shared_expert_output = self._forward_mlp_router(
+                hidden_states, padding_mask=padding_mask
+            )
+            expert_output, mlp_bias = self._forward_mlp_expert_compute(hidden_states, probs)
+            return self._forward_mlp_postprocess(
+                residual, expert_output, shared_expert_output, mlp_bias
+            )
+
+        if self.use_partial_cudagraphs:
+            if self.moe_layer_recompute:
+                if self.config.fp8 or self.config.fp4:
+                    from megatron.core.extensions.transformer_engine import te_checkpoint
+
+                    return te_checkpoint(
+                        _forward_mlp_partial_cudagraphs,
+                        False,
+                        tensor_parallel.random.get_cuda_rng_tracker,
+                        parallel_state.get_tensor_model_parallel_group(),
+                        hidden_states,
+                        padding_mask=padding_mask,
+                    )
+                else:
+                    return tensor_parallel.checkpoint(
+                        functools.partial(
+                            _forward_mlp_partial_cudagraphs, padding_mask=padding_mask
+                        ),
+                        False,
+                        hidden_states,
+                    )
+            else:
+                return _forward_mlp_partial_cudagraphs(hidden_states, padding_mask=padding_mask)
+        else:
+            return super()._forward_mlp(hidden_states, padding_mask=padding_mask)
diff --git a/megatron/core/transformer/utils.py b/megatron/core/transformer/utils.py
index 373c06f0991..880c5309933 100644
--- a/megatron/core/transformer/utils.py
+++ b/megatron/core/transformer/utils.py
@@ -10,6 +10,8 @@
 from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedStateDict, StateDict
 from megatron.core.jit import jit_fuser
 from megatron.core.utils import (
+    get_pg_rank,
+    get_tensor_model_parallel_group_if_none,
     make_sharded_tensor_for_checkpoint,
     make_tp_sharded_tensor_for_checkpoint,
 )
@@ -76,6 +78,8 @@ def make_sharded_tensors_for_checkpoint(
     tensor_parallel_layers_axis_map: Optional[Dict[str, int]] = None,
     sharded_offsets: Iterable[Tuple[int, int, int]] = (),
     extra_state_suffix: str = '_extra_state',
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
+    dp_cp_group: Optional[torch.distributed.ProcessGroup] = None,
 ):
     """Wraps tensors from transformer layers with ShardedTensor or ShardedObject.
 
@@ -93,31 +97,52 @@ def make_sharded_tensors_for_checkpoint(
             applied (e.g. PP related), passed along to ShardedTensor
         extra_state_suffix (str, default = '_extra_state'): layers with this
             suffix will be wrapped with ShardedObject instead of ShardedTensor.
+        tp_group (Optional[torch.distributed.ProcessGroup], optional): tensor parallel group.
+            If None, defaults to parallel_state.get_tensor_model_parallel_group()
+        dp_cp_group (Optional[torch.distributed.ProcessGroup], optional): data parallel group
+            with context parallel. If None, defaults to
+            parallel_state.get_data_parallel_group(with_context_parallel=True)
 
     """
 
     if tensor_parallel_layers_axis_map is None:
         tensor_parallel_layers_axis_map = {}
 
+    if tp_group is None and dp_cp_group is None:
+        tp_group = get_tensor_model_parallel_group_if_none(tp_group)
+        dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+
     sharded_state_dict = {}
     for layer_name in state_dict.keys():
         tensor = state_dict[layer_name]
         layer_key = f'{prefix}{layer_name}'
 
         if layer_name.endswith(extra_state_suffix):
+            # Compute replica_id when groups are provided
+            replica_id = (0, get_pg_rank(tp_group), get_pg_rank(dp_cp_group))
+
             sharded_state_dict[layer_key] = make_sharded_object_for_checkpoint(
-                tensor, layer_key, sharded_offsets
+                tensor, layer_key, sharded_offsets, replica_id=replica_id
             )
 
         elif layer_name in tensor_parallel_layers_axis_map:
             tp_axis = tensor_parallel_layers_axis_map[layer_name]
             sharded_state_dict[layer_key] = make_tp_sharded_tensor_for_checkpoint(
-                tensor, layer_key, tp_axis, prepend_offsets=sharded_offsets
+                tensor,
+                layer_key,
+                tp_axis,
+                prepend_offsets=sharded_offsets,
+                tp_group=tp_group,
+                dp_cp_group=dp_cp_group,
             )
 
         else:
             sharded_state_dict[layer_key] = make_sharded_tensor_for_checkpoint(
-                tensor, layer_key, prepend_offsets=sharded_offsets
+                tensor,
+                layer_key,
+                prepend_offsets=sharded_offsets,
+                tp_group=tp_group,
+                dp_cp_group=dp_cp_group,
             )
 
     return sharded_state_dict
@@ -166,11 +191,27 @@ def _get_extra_state_offsets(
     return extra_state_shape, extra_state_offset
 
 
+def ensure_metadata_has_dp_cp_group(metadata: Optional[dict]) -> dict:
+    """Ensure `metadata` is a dict containing `dp_cp_group` entry.
+
+    If `metadata` is None, a new dict is returned with `dp_cp_group` set.
+    If `metadata` is a dict and missing `dp_cp_group`, it is updated in-place.
+    Otherwise, asserts that `dp_cp_group` exists.
+    """
+    if metadata is None:
+        return {'dp_cp_group': parallel_state.get_data_parallel_group(with_context_parallel=True)}
+    assert isinstance(metadata, dict), "metadata must be a dict with dp_cp_group as key"
+    if 'dp_cp_group' not in metadata:
+        metadata['dp_cp_group'] = parallel_state.get_data_parallel_group(with_context_parallel=True)
+    return metadata
+
+
 def sharded_state_dict_default(
     module: torch.nn.Module,
     prefix: str = '',
     sharded_offsets: Tuple[Tuple[int, int, int]] = (),
     metadata: Optional[dict] = None,
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
 ) -> ShardedStateDict:
     """Provides implementation for sharded_state_dict method for non-MegatronModules.
 
@@ -186,11 +227,16 @@ def sharded_state_dict_default(
         sharded_offsets (Tuple[Tuple[int, int, int]], optional): sharding already
             applied (e.g. PP related) by sup-modules. Passed along to ShardedTensor
         metadata (dict, optional): metadata passed to module sharded_state_dict method
+        tp_group (Optional[torch.distributed.ProcessGroup], optional): tensor parallel group.
+            If None, defaults to parallel_state.get_tensor_model_parallel_group()
 
     Returns:
         dict: dictionary of state dict keys mapped to ShardedTensors
     """
 
+    # Guard for cases metadata is not provided
+    metadata = ensure_metadata_has_dp_cp_group(metadata)
+
     if hasattr(module, 'sharded_state_dict'):
         module_sharded_sd = module.sharded_state_dict(
             prefix=prefix, sharded_offsets=sharded_offsets, metadata=metadata
@@ -198,7 +244,12 @@ def sharded_state_dict_default(
     else:
         module_sd = module.state_dict(prefix='', keep_vars=True)
         module_sharded_sd = make_sharded_tensors_for_checkpoint(
-            module_sd, prefix, {}, sharded_offsets
+            module_sd,
+            prefix,
+            {},
+            sharded_offsets,
+            tp_group=tp_group,
+            dp_cp_group=metadata['dp_cp_group'],
         )
     return module_sharded_sd
 
diff --git a/megatron/core/typed_torch.py b/megatron/core/typed_torch.py
new file mode 100644
index 00000000000..bcbf388facc
--- /dev/null
+++ b/megatron/core/typed_torch.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+"""Utilities for improved type hinting with torch interfaces."""
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Generic, ParamSpec, Protocol, TypeVar
+
+import torch
+
+P = ParamSpec('P')
+R_co = TypeVar('R_co', covariant=True)
+T = TypeVar('T')
+
+
+class _Module(Generic[P, R_co], Protocol):
+    """Protocol allowing us to unwrap `forward`."""
+
+    def forward(self, *args: P.args, **kwargs: P.kwargs) -> R_co:
+        """Forward method of the matching torch.nn.Module."""
+        ...
+
+
+def apply_module(m: _Module[P, R_co], *, check_subclass: bool = True) -> Callable[P, R_co]:
+    """Returns the provided module unchanged, but with correct type hints.
+
+    Args:
+      m: An instance of a subclass of `torch.nn.Module`.
+      check_subclass: If `True`, checks that `m` is a subclass of
+            `torch.nn.Module` and raises a `TypeError` if not.
+
+    Returns:
+      That module unchanged, but with correct type hints.
+    """
+    if check_subclass and not issubclass(type(m), torch.nn.Module):
+        raise TypeError(f'{type(m)} is not a subclass of torch.nn.Module')
+    return m  # type: ignore
+
+
+def not_none(value: T | None) -> T:
+    """Asserts that the provided value is not None and returns it.
+
+    Args:
+        value: An optional value.
+
+    Returns:
+        The provided value, guaranteed to be not None.
+    """
+    if value is None:
+        raise ValueError('Expected value to be not None')
+    return value
diff --git a/megatron/core/utils.py b/megatron/core/utils.py
index 5cac5804839..f71824b717b 100644
--- a/megatron/core/utils.py
+++ b/megatron/core/utils.py
@@ -29,8 +29,24 @@
 import numpy
 import torch
 
+try:
+    import torch.distributed._symmetric_memory as symm_mem
+
+    HAVE_TORCH_SYMM_MEM = True
+except ImportError:
+    HAVE_TORCH_SYMM_MEM = False
+
+try:
+    import triton  # pylint: disable=unused-import
+
+    HAVE_TRITON = True
+except ImportError:
+    HAVE_TRITON = False
+
 from megatron.core import config
+from megatron.core._rank_utils import log_single_rank
 from megatron.core.package_info import __version__ as mcore_version
+from megatron.core.packed_seq_params import PackedSeqParams
 
 try:
     from torch.distributed._tensor import DTensor
@@ -59,6 +75,15 @@
 
 logger = logging.getLogger(__name__)
 
+try:
+    # Register the TE CUDA kernels
+    import transformer_engine  # pylint: disable=unused-import
+
+    # Alias the PyTorch wrapper so we can call tex.* APIs
+    import transformer_engine_torch as tex
+except ImportError:
+    # TE isn’t installed or the torch wrapper is missing
+    tex = None
 
 try:
     _torch_version = PkgVersion(torch.__version__)
@@ -131,7 +156,9 @@ def validator(func: Callable, max_lifetime: int = 3) -> Callable:
             PkgVersion(introduced_with_version).minor + max_lifetime
             < PkgVersion(mcore_version).minor
         ):
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 "%s has reached end of life. Please migrate to a non-experimental function.",
                 func.__name__,
             )
@@ -196,7 +223,9 @@ def validator(cls: Callable, max_lifetime: int = 3) -> Callable:
             PkgVersion(introduced_with_version).minor + max_lifetime
             < PkgVersion(mcore_version).minor
         ):
-            logger.warning(
+            log_single_rank(
+                logger,
+                logging.WARNING,
                 "%s has reached end of life. Please migrate to a non-experimental function.",
                 cls.__name__,
             )
@@ -280,28 +309,6 @@ def __getattribute__(self, attr):
     return validator
 
 
-def get_torch_version():
-    """Get pytorch version from __version__; if not available use pip's. Use caching."""
-
-    if not HAVE_PACKAGING:
-        raise ImportError(
-            "packaging is not installed. Please install it with `pip install packaging`."
-        )
-
-    def get_torch_version_str():
-        import torch
-
-        if hasattr(torch, "__version__"):
-            return str(torch.__version__)
-        else:
-            return version("torch")
-
-    global _torch_version
-    if _torch_version is None:
-        _torch_version = PkgVersion(get_torch_version_str())
-    return _torch_version
-
-
 def get_te_version():
     """Get TE version from __version__; if not available use pip's. Use caching."""
     if not HAVE_PACKAGING:
@@ -456,15 +463,6 @@ def is_causal_conv1d_min_version(version, check_equality=True):
     return get_causal_conv1d_version() > PkgVersion(version)
 
 
-def check_mamba_sequence_packing_support() -> Tuple[bool, Optional[str]]:
-    """Checks whether `causal_conv1d` and `mamba_ssm` support sequence packing."""
-    if not is_causal_conv1d_min_version("1.5.3.post1"):
-        return False, "causal_conv1d >= 1.5.3.post1 is required"
-    elif not is_mamba_min_version("2.2.6.post3"):
-        return False, "mamba_ssm >= 2.2.6.post3 is required"
-    return True, None
-
-
 def ensure_divisibility(numerator, denominator):
     """Ensure that numerator is divisible by the denominator."""
     assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
@@ -494,6 +492,10 @@ def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_ini
     if not torch.distributed.is_initialized():
         return None
 
+    # if parallel_state is not initialized, pass `tp_group` thru
+    if not parallel_state.is_initialized():
+        return tp_group
+
     if tp_group is None:
         if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
             warnings.warn(
@@ -641,6 +643,65 @@ def get_tensor(self, tensor_shape, dtype, name, mem_alloc_context: Optional[Call
         return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
 
 
+class GlobalSymmetricMemoryBuffer:
+    """
+    Global symmetric memory buffer used in inference.
+    This buffer is used by mcore-inference's low-latency
+    NVLS all-gather and reduce-scatter collectives.
+    """
+
+    def __init__(self, size_in_mb, process_group):
+        if not HAVE_TORCH_SYMM_MEM or not HAVE_TRITON:
+            # This should be hit if the user is running an older
+            # version of torch, or if they do not have triton
+            # installed.
+            self.symm_buffer = None
+            self.symm_mem_hdl = None
+        else:
+            numel = int(size_in_mb * 1024 * 1024)  # size in bytes
+            try:
+                symm_mem.enable_symm_mem_for_group(process_group.group_name)
+                self.symm_buffer = symm_mem.empty(numel, dtype=torch.uint8, device='cuda')
+                self.symm_mem_hdl = symm_mem.rendezvous(self.symm_buffer, process_group)
+            except RuntimeError as e:
+                # If symmetric memory initialization fails, set buffer and handle to None
+                # This should happen if the process group is not contained within NVlink
+                self.symm_buffer = None
+                self.symm_mem_hdl = None
+
+    def _can_allocate(self, numel, dtype) -> bool:
+        """
+        Returns whether enough symmetric memory is available
+        for the given tensor shape and dtype.
+        """
+        if self.symm_mem_hdl is None:
+            return False
+        size_of_dtype = torch.tensor([], dtype=dtype).element_size()
+        required_len = numel * size_of_dtype
+        return required_len <= self.symm_buffer.numel()
+
+    def _allocate(self, numel, dtype) -> torch.Tensor:
+        """
+        Allocates a sub-tensor from the self.symm_buffer for the given numel and dtype"""
+        required_bytes = numel * torch.tensor([], dtype=dtype).element_size()
+        return self.symm_buffer[0:required_bytes].view(dtype).view(numel)
+
+    def maybe_get_tensor(self, tensor_shape, dtype):
+        """
+        Returns (potentially) a sub-tensor from the self.symm_buffer for the given shape.
+        If enough symmetric memory is not available, returns None.
+        """
+        if self.symm_mem_hdl is None:
+            return {"tensor": None, "handle": None}
+        numel = reduce(operator.mul, tensor_shape, 1)
+        if not self._can_allocate(numel, dtype):
+            return {"tensor": None, "handle": None}
+        return {
+            "tensor": self._allocate(numel, dtype).view(*tensor_shape),
+            "handle": self.symm_mem_hdl,
+        }
+
+
 def _kernel_make_viewless_tensor(inp, requires_grad):
     """Make a viewless tensor.
 
@@ -759,25 +820,6 @@ def scaled_init_method_normal(sigma, num_layers, multiplier=2.0):
     return functools.partial(torch.nn.init.normal_, mean=0.0, std=std)
 
 
-def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any):
-    """If torch distributed is initialized, write log on only one rank
-
-    Args:
-        logger (logging.Logger): The logger to write the logs
-
-        args (Tuple[Any]): All logging.Logger.log positional arguments
-
-        rank (int, optional): The rank to write on. Defaults to 0.
-
-        kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments
-    """
-    if torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == rank:
-            logger.log(*args, **kwargs)
-    else:
-        logger.log(*args, **kwargs)
-
-
 def log_on_each_pipeline_stage(
     logger: logging.Logger,
     *args: Any,
@@ -895,15 +937,37 @@ def make_tp_sharded_tensor_for_checkpoint(
     is sharded across TP group.
 
     Optionally, can provide offsets which prepend new dimensions to the tensor.
+
+    Args:
+        tensor: Tensor to shard
+        key: Key for the sharded tensor
+        tp_axis: Axis to shard across tensor parallel group (default: 0)
+        replica_id: Replica ID for the tensor (default: None)
+        prepend_offsets: Offsets to prepend to tensor dimensions (default: ())
+        **kwargs: Additional arguments. May include:
+            - tp_group: Tensor parallel group (default: None, falls back to parallel_state)
+            - dp_cp_group: Data parallel + context parallel group
+              (default: None, falls back to parallel_state)
     """
+    # Pop group parameters from kwargs
+    tp_group = kwargs.pop('tp_group', None)
+    dp_cp_group = kwargs.pop('dp_cp_group', None)
+
     prepend_axis_num = len(prepend_offsets)
 
     new_offsets = []
-    tp_rank = parallel_state.get_tensor_model_parallel_rank()
-    dp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
-    tp_size = parallel_state.get_tensor_model_parallel_world_size()
-    dp_size = parallel_state.get_data_parallel_world_size(with_context_parallel=True)
-    dp_replica_id = parallel_state.get_data_parallel_rank(with_context_parallel=True)
+
+    # Get groups with fallback to parallel_state
+    if tp_group is None and dp_cp_group is None:
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+
+    # Use local get_pg_rank and get_pg_size functions
+    tp_rank = get_pg_rank(tp_group)
+    dp_rank = get_pg_rank(dp_cp_group)
+    tp_size = get_pg_size(tp_group)
+    dp_size = get_pg_size(dp_cp_group)
+    dp_replica_id = get_pg_rank(dp_cp_group)
 
     new_offsets.append((tp_axis + prepend_axis_num, tp_rank, tp_size))
 
@@ -939,14 +1003,34 @@ def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_
     """Helper for instantiating a non-sharded ShardedTensor (replicated across TP and DP group).
 
     Optionally, can provide offsets which prepend new dimensions to the tensor.
+
+    Keyword Args:
+        tensor: Tensor to create sharded tensor for
+        key: Key for the sharded tensor
+        prepend_offsets: Offsets to prepend to tensor dimensions (default: ())
+        replica_id: Replica ID for the tensor (default: None)
+        **kwargs: Additional arguments. May include:
+            - tp_group: Tensor parallel group (default: None, falls back to parallel_state)
+            - dp_cp_group: Data parallel + context parallel group
+              (default: None, falls back to parallel_state)
     """
+    # Pop group parameters from kwargs
+    tp_group = kwargs.pop('tp_group', None)
+    dp_cp_group = kwargs.pop('dp_cp_group', None)
 
     prepend_axis_num = len(prepend_offsets)
 
     new_offsets = []
-    dp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
-    dp_size = parallel_state.get_data_parallel_world_size(with_context_parallel=True)
-    dp_replica_id = parallel_state.get_data_parallel_rank(with_context_parallel=True)
+
+    # Get groups with fallback to parallel_state
+    if tp_group is None and dp_cp_group is None:
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+
+    # Use local get_pg_rank and get_pg_size functions
+    dp_rank = get_pg_rank(dp_cp_group)
+    dp_size = get_pg_size(dp_cp_group)
+    dp_replica_id = get_pg_rank(dp_cp_group)
 
     if HAVE_DTENSOR and isinstance(tensor, DTensor):
         # FSDP2 sharding
@@ -955,7 +1039,7 @@ def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_
         new_offsets.append((prepend_axis_num, dp_rank, dp_size))
 
     if replica_id is None:
-        replica_id = (0, parallel_state.get_tensor_model_parallel_rank(), dp_replica_id)
+        replica_id = (0, get_pg_rank(tp_group), dp_replica_id)
 
     return ShardedTensor.from_rank_offsets(
         key,
@@ -1915,9 +1999,17 @@ def is_submodule(module, parent_module, strict=True):
 ########################
 
 
-def get_batch_on_this_cp_rank(batch: Dict[str, Any]):
+def get_batch_on_this_cp_rank(
+    batch: Dict[str, Any], cp_group: Optional[torch.distributed.ProcessGroup] = None
+):
     """Slice batch input along sequence dimension into multiple chunks,
     which are parallelized across GPUs in a context parallel group.
+
+    Args:
+        batch (Dict[str, Any]): Input batch tensors.
+        cp_group (Optional[torch.distributed.ProcessGroup]): Context-parallel process group.
+            If provided, uses this group's size and rank. Otherwise, falls back to
+            the current context-parallel settings from parallel_state.
     """
 
     # With causal masking, each token only attends to its prior tokens. Simply split
@@ -1926,12 +2018,18 @@ def get_batch_on_this_cp_rank(batch: Dict[str, Any]):
     # we split sequence into 2*CP ranks. Assuming CP=2, we then get 4 chunks, chunk_0
     # and chunk_3 are assigned to GPU0, chunk_1 and chunk_2 are assigned to GPU1, so
     # that we can get balanced workload among GPUs in a context parallel group.
-    cp_size = parallel_state.get_context_parallel_world_size()
-    if cp_size > 1:
+    # Determine CP topology either from provided group or from current context parallel state
+    if cp_group is not None:
+        cp_size = get_pg_size(cp_group)
+        cp_rank = get_pg_rank(cp_group)
+    else:
+        cp_size = parallel_state.get_context_parallel_world_size()
         cp_rank = parallel_state.get_context_parallel_rank()
+
+    if cp_size > 1:
         for key, val in batch.items():
             if val is not None:
-                seq_dim = 1 if key != "attention_mask" else 2
+                seq_dim = 1 if key != 'attention_mask' else 2
                 val = val.view(
                     *val.shape[0:seq_dim],
                     2 * cp_size,
@@ -1948,6 +2046,100 @@ def get_batch_on_this_cp_rank(batch: Dict[str, Any]):
     return batch
 
 
+def get_thd_batch_on_this_cp_rank(
+    batch: Dict[str, Any],
+    cu_seqlens: torch.Tensor,
+    cu_seqlens_padded: torch.Tensor,
+    max_seqlen: torch.Tensor,
+    cp_size: Optional[int] = None,
+    cp_rank: Optional[int] = None,
+):
+    """Slice each sub-sample in a packed sample batch input along
+    sequence dimension into multiple chunks, which are parallelized
+    across GPUs in a context parallel group.
+    """
+    packed_seq_params = PackedSeqParams(
+        qkv_format="thd",
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_kv=cu_seqlens,
+        cu_seqlens_q_padded=cu_seqlens_padded,
+        cu_seqlens_kv_padded=cu_seqlens_padded,
+        max_seqlen_q=int(max_seqlen[0].item()),
+        max_seqlen_kv=int(max_seqlen[0].item()),
+    )
+
+    cp_size = parallel_state.get_context_parallel_world_size() if cp_size is None else cp_size
+    cp_rank = parallel_state.get_context_parallel_rank() if cp_rank is None else cp_rank
+    if cp_size > 1:  # slice batch along sequence dimension for context parallelism
+        assert tex is not None and is_te_min_version("1.10.0"), (
+            "Please update Transformer Engine to >= 1.10 to use "
+            "Context Parallel with THD format data"
+        )
+        index = tex.thd_get_partitioned_indices(
+            cu_seqlens_padded, batch['tokens'].size(1), cp_size, cp_rank
+        )
+        for key, data in batch.items():
+            if key in {'attention_mask', 'cu_seqlens', 'cu_seqlens_padded', 'max_seqlen'}:
+                continue
+            batch[key] = data.index_select(1, index)
+
+    return batch, packed_seq_params
+
+
+################################
+### hybrid context parallel ###
+################################
+
+
+def get_batch_on_this_hybrid_cp_rank(
+    batch: Dict[str, Any],
+    local_cp_size: int,
+    cp_group: Optional[torch.distributed.ProcessGroup] = None,
+):
+    """Slice batch input along sequence dimension into multiple chunks,
+    which are parallelized across GPUs in a context parallel group.
+    """
+    assert local_cp_size is not None
+    if cp_group is None:
+        # Get the local cp group required for as defined by the HybridCPDataLoaderWrapper
+        if local_cp_size > 1:
+            cp_group = parallel_state.get_hybrid_data_context_parallel_groups(
+                group_size=local_cp_size
+            )
+    else:
+        # If cp group is provided, it must match the local cp size
+        # as defined by the HybridCPDataLoaderWrapper
+        assert cp_group.size() == local_cp_size
+
+    # Convert [seqlen] to [1, seqlen] similar to default collate_fn
+    # as hybrid_context_parallel dataloader wrapper does not go through default collate_fn
+    for key, data in batch.items():
+        if key in ['attention_mask']:
+            continue
+        batch[key] = torch.stack([data], 0)
+    sample_length = batch['tokens'].shape[1]
+    # TODO(pmannan): Take care of padding tokens here if not divisible by cp_size*2
+    # Create packed_seq_params for SBHD format with cp group information.
+    packed_seq_params = PackedSeqParams(
+        qkv_format="sbhd",
+        cu_seqlens_q=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        cu_seqlens_kv=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        cu_seqlens_q_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        cu_seqlens_kv_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        max_seqlen_q=sample_length,
+        max_seqlen_kv=sample_length,
+        local_cp_size=local_cp_size,
+        cp_group=cp_group,
+    )
+
+    if cp_group is not None and cp_group.size() > 1:
+        # When using hybrid_context_parallel, each sub-sample of a packed sample is
+        # required to be divisible by CP*DP*2 or CP*DP*TP*2 (if using sequence parallel)
+        batch = get_batch_on_this_cp_rank(batch, cp_group=cp_group)
+
+    return batch, packed_seq_params
+
+
 ######################
 ### NVTX profiling ###
 ######################
@@ -2103,16 +2295,6 @@ def unwrap_model(model, module_instances=None):
     return unwrapped_model
 
 
-def maybe_cat(a, b, dim=0, *, required=False):
-    """Concatenates `a` and `b` along `dim` if `a` and `b` exist."""
-    xs = [t for t in (a, b) if t is not None]
-    if not xs:
-        if required:
-            raise ValueError("both tensors are None")
-        return None
-    return xs[0] if len(xs) == 1 else torch.cat(xs, dim=dim)
-
-
 _ASYNC_IO_LOOP: asyncio.AbstractEventLoop | None = None
 
 
@@ -2131,6 +2313,11 @@ def get_asyncio_loop(loop: asyncio.AbstractEventLoop | None = None) -> asyncio.A
     return loop
 
 
+def is_using_quantization_scales(config):
+    """Returns whether the model is using quantization scales based on the config."""
+    return getattr(config, "fp8", False) or getattr(config, "fp4", False)
+
+
 _ASYNC_TASK_STATS = defaultdict(lambda: [0, 0.0])  # cnt, total_time
 
 
diff --git a/megatron/legacy/data/orqa_wiki_dataset.py b/megatron/legacy/data/orqa_wiki_dataset.py
index 57bcc5891d1..033b2351cee 100644
--- a/megatron/legacy/data/orqa_wiki_dataset.py
+++ b/megatron/legacy/data/orqa_wiki_dataset.py
@@ -20,8 +20,7 @@ def get_open_retrieval_wiki_dataset():
     dataset = OpenRetrievalEvidenceDataset('2018 Wikipedia from DPR codebase',
                                            'evidence',
                                            args.evidence_data_path,
-                                           tokenizer,
-                                           args.retriever_seq_length)
+                                           tokenizer)
     return dataset
 
 
diff --git a/megatron/legacy/model/enums.py b/megatron/legacy/model/enums.py
index bc4e4aa29a0..bab179d1a04 100644
--- a/megatron/legacy/model/enums.py
+++ b/megatron/legacy/model/enums.py
@@ -5,9 +5,6 @@
 class LayerType(enum.Enum):
     encoder = 1
     decoder = 2
-    retro_encoder = 3
-    retro_decoder = 4
-    retro_decoder_with_retriever = 5
  
 class AttnType(enum.Enum):
     self_attn = 1
diff --git a/megatron/legacy/model/gpt_model.py b/megatron/legacy/model/gpt_model.py
index 3a2b831ebe7..66fd0979c46 100644
--- a/megatron/legacy/model/gpt_model.py
+++ b/megatron/legacy/model/gpt_model.py
@@ -76,9 +76,6 @@ def set_input_tensor(self, input_tensor):
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, input_ids, position_ids, attention_mask,
-                retriever_input_ids=None,
-                retriever_position_ids=None,
-                retriever_attn_mask=None,
                 labels=None, tokentype_ids=None, inference_context=None, *, inference_params=None):
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
@@ -87,9 +84,6 @@ def forward(self, input_ids, position_ids, attention_mask,
             input_ids,
             position_ids,
             attention_mask,
-            retriever_input_ids=retriever_input_ids,
-            retriever_position_ids=retriever_position_ids,
-            retriever_attn_mask=retriever_attn_mask,
             inference_context=inference_context)
 
         if self.post_process:
diff --git a/megatron/legacy/model/language_model.py b/megatron/legacy/model/language_model.py
index b4e3c87c5e5..383230edb7f 100644
--- a/megatron/legacy/model/language_model.py
+++ b/megatron/legacy/model/language_model.py
@@ -360,7 +360,6 @@ def __init__(
         self.decoder_attn_mask_type = decoder_attn_mask_type
         self.add_pooler = add_pooler
         self.encoder_hidden_state = None
-        self.add_retriever = args.retro_add_retriever
         self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
 
         # Embeddings.
@@ -399,9 +398,7 @@ def __init__(
         if self.add_encoder:
             self.encoder = ParallelTransformer(
                 config,
-                model_type=(
-                    args.model_type if not args.retro_add_retriever else ModelType.retro_decoder
-                ),
+                model_type=args.model_type,
                 self_attn_mask_type=self.encoder_attn_mask_type,
                 pre_process=self.pre_process,
                 post_process=self.post_process,
@@ -479,9 +476,6 @@ def forward(
         dec_input_ids=None,
         dec_position_ids=None,
         dec_attn_mask=None,
-        retriever_input_ids=None,
-        retriever_position_ids=None,
-        retriever_attn_mask=None,
         enc_dec_attn_mask=None,
         tokentype_ids=None,
         inference_context=None,
@@ -502,14 +496,6 @@ def forward(
         else:
             encoder_input = None
 
-        # Retriever embedding.
-        if self.add_retriever and self.pre_process:
-            retriever_input = self.embedding(
-                retriever_input_ids, retriever_position_ids, tokentype_ids=tokentype_ids
-            )
-        else:
-            retriever_input = None
-
         # Rotary positional embeddings
         rotary_pos_emb = None
         if self.use_rotary_position_embeddings:
@@ -524,8 +510,6 @@ def forward(
                 encoder_output = self.encoder(
                     encoder_input,
                     enc_attn_mask,
-                    retriever_input=retriever_input,
-                    retriever_attn_mask=retriever_attn_mask,
                     inference_context=inference_context,
                     rotary_pos_emb=rotary_pos_emb,
                 )
diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py
index 2a662a55b16..ca3414eecdd 100644
--- a/megatron/legacy/model/transformer.py
+++ b/megatron/legacy/model/transformer.py
@@ -911,18 +911,6 @@ def __init__(self, config,
         # Normalize the attention output
         self.post_attention_norm = get_norm(config)
 
-        # Cross attention.
-        if self.layer_type in (LayerType.decoder,
-                               LayerType.retro_decoder,
-                               LayerType.retro_decoder_with_retriever,
-                               LayerType.retro_encoder):
-            self.inter_attention = ParallelAttention(
-                config,
-                layer_number,
-                attention_type=AttnType.cross_attn)
-            # Normalize the attention output.
-            self.post_inter_attention_norm = get_norm(config)
-
         # MLP
         if args.num_experts is not None:
             self.mlp = SwitchMLP(config)
@@ -936,25 +924,6 @@ def __init__(self, config,
         self.bias_dropout_add_exec_handler = \
                 nullcontext if use_nvfuser else torch.enable_grad
 
-        if args.retro_add_retriever:
-            self.retro_num_neighbors = args.retro_num_neighbors
-            self.retro_chunk_length = args.retro_chunk_length
-            self.retro_retrieved_length = \
-                args.retro_num_retrieved_chunks * args.retro_chunk_length
-
-        # Retriever (bi-directional transformer with cross attention)
-        if layer_type == LayerType.retro_decoder_with_retriever:
-            self.retriever = ParallelTransformer(
-                config=config,
-                model_type=ModelType.retro_encoder,
-                self_attn_mask_type=AttnMaskType.padding,
-                pre_process=True,
-                post_process=False,
-            )
-            self._retriever_key = 'retriever'
-        else:
-            self.retriever = None
-
     def default_decoder_cross_attention(self,
                                         encoder_output,
                                         enc_dec_attn_mask,
@@ -991,185 +960,8 @@ def default_decoder_cross_attention(self,
 
         return norm_input, norm_output
 
-    def retro_encoder_cross_attention(self,
-                                      retriever_output,
-                                      norm_input,
-                                      norm_output,
-                                      bias_dropout_add_func):
-        """Cross attention for Retro encoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-        """
-
-        ns, bs, d = norm_output.shape # [r, bs * l * k, d]
-
-        # Divide sequence dimension into chunks.
-        chunked_outputs = norm_output.reshape(self.retro_retrieved_length,
-                                              -1,
-                                              self.retro_num_neighbors,
-                                              d)
-        chunked_outputs_before_norm = \
-            norm_input.reshape(self.retro_retrieved_length, -1,
-                               self.retro_num_neighbors, d) # [r, bs*l, k, d]
-
-        # Per-chunk attention.
-        norm_inputs = []
-        norm_outputs = []
-        for k in range(self.retro_num_neighbors):
-
-            # Attention.
-            chunked_output = chunked_outputs[:,:,k].contiguous()
-            attention_output, attention_bias = \
-                self.inter_attention(
-                    chunked_output, # Q (neighbor embedding)
-                    None,
-                    encoder_output=retriever_output) # K, V (hidden act)
-
-            # Residual connection.
-            if self.apply_residual_connection_post_norm:
-                residual = chunked_output
-            else:
-                residual = chunked_outputs_before_norm[:,:,k]
-
-            # Re-enable torch grad to enable fused optimization.
-            with torch.enable_grad():
-                norm_input = bias_dropout_add_func(
-                    attention_output,
-                    None if attention_bias is None else attention_bias.expand_as(residual),
-                    residual,
-                    self.hidden_dropout)
-                norm_inputs.append(norm_input)
-
-            # Layer norm.
-            norm_output = self.post_inter_attention_norm(norm_input)
-            norm_outputs.append(norm_output)
-
-        # Concatenate layer norms.
-        # norm_input : [r, k * bs * l, d]
-        # norm_output : [r, k * bs * l, d]
-        norm_input = torch.stack(norm_inputs, dim=1).reshape(ns, bs, d)
-        norm_output = torch.stack(norm_outputs, dim=1).reshape(ns, bs, d)
-
-        return norm_input, norm_output
-
-    def retro_decoder_cross_attention(self,
-                                      retriever_input,
-                                      retriever_output,
-                                      retriever_attn_mask,
-                                      norm_input,
-                                      norm_output,
-                                      inference_context,
-                                      bias_dropout_add_func,
-                                      *,
-                                      inference_params=None):
-        """Cross attention for Retro decoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            m  : Number of tokens per chunk.
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-        """
-
-        inference_context = deprecate_inference_params(inference_context, inference_params)
-
-        ns, bs, d = norm_output.shape
-        l = int(np.ceil(ns / self.retro_chunk_length))
-
-        # Retrieve neighbors.
-        if self.layer_type == LayerType.retro_decoder_with_retriever:
-            first_ns = ns % self.retro_chunk_length
-            if first_ns > 0:
-                first_chunk, rest_chunk = \
-                    norm_output[:first_ns], norm_output[first_ns:]
-                first_chunk = torch.nn.functional.pad(
-                    first_chunk,
-                    (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns),
-                    'constant',
-                    0)
-                chunked_output = \
-                    torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d]
-            else:
-                chunked_output = norm_output # [l * m, bs, d]
-            chunked_output = chunked_output \
-                .reshape(l, self.retro_chunk_length, bs, d) \
-                .permute(1, 2, 0, 3) \
-                .reshape(self.retro_chunk_length, bs * l, d) \
-                .contiguous()
-
-            # Get Encoder Output
-            retriever_output = self.retriever(
-                hidden_states=retriever_input,
-                attention_mask=retriever_attn_mask,
-                retriever_output=chunked_output,
-                retriever_attn_mask=retriever_attn_mask,
-                inference_context=inference_context) # [r, k * bs * l , d]
-            retriever_output = retriever_output.reshape(
-                self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d]
-
-        # Chunks.
-        pad = (ns - 1) % self.retro_chunk_length
-        attending_chunks = norm_output[pad:]
-        padded_chunks = torch.nn.functional.pad(
-            attending_chunks,
-            (0, 0, 0, 0, 0, self.retro_chunk_length - 1),
-            'constant', 0)
-        padded_chunked_output = padded_chunks \
-            .reshape(l, self.retro_chunk_length, bs, d) \
-            .permute(1, 2, 0, 3)
-        padded_chunked_output = padded_chunked_output.reshape(
-            self.retro_chunk_length, bs * l, d).contiguous()
-
-        # Encoder output.
-        attention_output, attention_bias = \
-            self.inter_attention(padded_chunked_output,
-                                 None,
-                                 encoder_output=retriever_output)
-
-        # Residual connection.
-        if self.apply_residual_connection_post_norm:
-            residual = norm_output
-        else:
-            residual = norm_input
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-            norm_input = bias_dropout_add_func(
-                attention_output,
-                None if attention_bias is None else attention_bias.expand_as(attention_output),
-                torch.zeros_like(attention_output),
-                self.hidden_dropout)
-            norm_input = norm_input \
-                .reshape(self.retro_chunk_length, bs, l, d) \
-                .permute(2, 0, 1, 3) # [l, m, bs, d]
-            norm_input = norm_input.reshape(self.retro_chunk_length * l, bs, d)
-            norm_input = torch.nn.functional.pad(
-                norm_input,
-                (0, 0, 0, 0, pad, 0),
-                'constant', 0)[:ns] # [ns, b, d]
-            # TODO: better redesign with inference param
-            args = get_args()
-            norm_input = args.retro_attention_gate * norm_input + residual
-
-        # Layer norm post the decoder attention
-        norm_output = self.post_inter_attention_norm(norm_input)
-
-        return retriever_output, norm_input, norm_output
-
     def forward(self, hidden_states, attention_mask,
                 encoder_output=None, enc_dec_attn_mask=None,
-                retriever_input=None,
-                retriever_output=None,
-                retriever_attn_mask=None,
                 inference_context=None,
                 rotary_pos_emb=None,
                 *,
@@ -1177,15 +969,6 @@ def forward(self, hidden_states, attention_mask,
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
-        # Update the params in case the retro param changes during inference
-        # TODO: better redesign with inference param
-        args = get_args()
-        if args.retro_add_retriever:
-            self.retro_num_neighbors = args.retro_num_neighbors
-            self.retro_chunk_length = args.retro_chunk_length
-            self.retro_retrieved_length = \
-                args.retro_num_retrieved_chunks * args.retro_chunk_length
-
         # hidden_states: [s, b, h]
 
         # Layer norm at the beginning of the transformer layer.
@@ -1246,24 +1029,6 @@ def forward(self, hidden_states, attention_mask,
                     norm_input,
                     norm_output,
                     bias_dropout_add_func)
-        elif self.layer_type == LayerType.retro_encoder:
-            norm_input, norm_output = \
-                self.retro_encoder_cross_attention(
-                    retriever_output,
-                    norm_input,
-                    norm_output,
-                    bias_dropout_add_func)
-        elif self.layer_type in (LayerType.retro_decoder,
-                                 LayerType.retro_decoder_with_retriever):
-            retriever_output, norm_input, norm_output = \
-                self.retro_decoder_cross_attention(
-                    retriever_input,
-                    retriever_output,
-                    retriever_attn_mask,
-                    norm_input,
-                    norm_output,
-                    inference_context,
-                    bias_dropout_add_func)
         else:
             raise Exception("Unsupported layer type, '%s'." %
                             self.layer_type.name)
@@ -1305,10 +1070,7 @@ def forward(self, hidden_states, attention_mask,
                                               training=self.training)
             output = residual + self.drop_path(out)
 
-        if self.layer_type == LayerType.retro_decoder_with_retriever:
-            return output, retriever_output
-        else:
-            return output
+        return output
 
 
 class NoopTransformerLayer(MegatronModule):
@@ -1339,9 +1101,7 @@ def forward(self, hidden_states, attention_mask,
 
 def _get_num_layers(args, model_type, is_decoder=False):
     """Compute the number of transformer layers resident on the current rank."""
-    if model_type == ModelType.retro_encoder:
-        num_layers = args.retro_encoder_layers
-    elif mpu.get_pipeline_model_parallel_world_size() > 1:
+    if mpu.get_pipeline_model_parallel_world_size() > 1:
         assert args.num_layers == args.encoder_num_layers
         assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \
             'num_layers must be divisible by transformer_pipeline_model_parallel_size'
@@ -1364,22 +1124,6 @@ def _get_num_layers(args, model_type, is_decoder=False):
     return num_layers
 
 
-def _get_layer_type(model_type, default_layer_type, retro_layer_numbers,
-                    layer_number):
-    args = get_args()
-    if args.retro_add_retriever and layer_number in retro_layer_numbers:
-        if model_type == ModelType.retro_decoder:
-            return LayerType.retro_decoder_with_retriever \
-                if layer_number == retro_layer_numbers[0] \
-                   else LayerType.retro_decoder
-        elif model_type == ModelType.retro_encoder:
-            return LayerType.retro_encoder
-        else:
-            raise Exception("Unsupported model type, '%s'." % model_type)
-    else:
-        return default_layer_type
-
-
 class ParallelTransformer(MegatronModule):
     """Transformer class."""
 
@@ -1403,7 +1147,6 @@ def __init__(self, config,
         self.input_tensor = None
         self.drop_path_rate = drop_path_rate
         self.transformer_impl = args.transformer_impl
-        self.retro_add_retriever = args.retro_add_retriever
 
         # Store activation checkpoiting flag.
         self.recompute_granularity = config.recompute_granularity
@@ -1469,29 +1212,12 @@ def __init__(self, config,
             rate.item() for rate in
             torch.linspace(0, self.drop_path_rate, config.num_layers)]
 
-        self.retro_layer_numbers = None
-        if model_type == ModelType.retro_decoder:
-            retro_layer_start = 6 if config.num_layers <= 15 else 9
-            self.retro_layer_numbers = \
-                np.arange(retro_layer_start, args.num_layers + 1, 3).tolist()
-        if model_type == ModelType.retro_encoder:
-            self.retro_layer_numbers = [1]
-
-        # Transformer layers.
-        if args.retro_add_retriever:
-            assert self.recompute_granularity != 'full', \
-                "Full recompute not supported for Retro."
-            assert args.transformer_impl == 'local', \
-                "Transformer engine does not support Retro layers."
         def build_layer(layer_number):
             if args.transformer_impl == 'local':
-                current_layer_type = _get_layer_type(
-                    model_type, layer_type, self.retro_layer_numbers,
-                    layer_number)
                 return ParallelTransformerLayer(
                     config,
                     layer_number,
-                    layer_type=current_layer_type,
+                    layer_type=layer_type,
                     self_attn_mask_type=self_attn_mask_type,
                     drop_path_rate=self.drop_path_rates[layer_number - 1])
             else:
@@ -1575,17 +1301,6 @@ def build_layer(layer_number):
             self.layers = torch.nn.ModuleList(
                 [build_layer(i + 1 + offset) for i in range(self.num_layers)])
 
-            # Update dropout rate for Retro encoder.
-            if model_type == ModelType.retro_encoder:
-                for layer in self.layers:
-                    if layer.self_attention.use_flash_attn:
-                        layer.self_attention.core_attention_flash.dropout_p = \
-                            torch.nn.Dropout(args.retro_encoder_attention_dropout)
-                    else:
-                        layer.self_attention.core_attention.attention_dropout.p =\
-                            args.retro_encoder_attention_dropout
-                    layer.hidden_dropout = args.retro_encoder_hidden_dropout
-
         if self.post_process and self.post_norm:
             # Final layer norm before output.
             self.final_norm = get_norm(config)
@@ -1684,9 +1399,6 @@ def set_input_tensor(self, input_tensor):
 
     def forward(self, hidden_states, attention_mask,
                 encoder_output=None, enc_dec_attn_mask=None,
-                retriever_input=None,
-                retriever_output=None,
-                retriever_attn_mask=None,
                 inference_context=None,
                 rotary_pos_emb=None,
                 *,
@@ -1771,9 +1483,6 @@ def forward(self, hidden_states, attention_mask,
                             forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
                     else:
                         forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
-                        forward_kwargs['retriever_input'] = retriever_input
-                        forward_kwargs['retriever_output'] = retriever_output
-                        forward_kwargs['retriever_attn_mask'] = retriever_attn_mask
 
                     for index in range(self.num_layers):
                         layer = self._get_layer(index)
@@ -1783,14 +1492,6 @@ def forward(self, hidden_states, attention_mask,
                             attention_mask,
                             **forward_kwargs)
 
-                        # First Retro decoder layer returns both hidden_states
-                        # and retriever_output. Make retriever_output available
-                        # to subsequence Retro layers.
-                        if isinstance(hidden_states, tuple):
-                            assert len(hidden_states) == 2
-                            hidden_states, retriever_output = hidden_states
-                            forward_kwargs["retriever_output"] = retriever_output
-
                 # Skip counter update for eval and activation checkpointing
                 if torch.is_grad_enabled() and self.training:
                     self.microbatch_count += 1
diff --git a/megatron/plugin/distributed/finalize_model_grads.py b/megatron/plugin/distributed/finalize_model_grads.py
index 896cae65f8b..897120e0f16 100644
--- a/megatron/plugin/distributed/finalize_model_grads.py
+++ b/megatron/plugin/distributed/finalize_model_grads.py
@@ -34,6 +34,7 @@ def _allreduce_embedding_grad(
     pp_group: torch.distributed.ProcessGroup,
     weight_getter: Callable[[torch.nn.Module], Optional[torch.nn.Parameter]],
     skip_if_none: bool = True,
+    config: TransformerConfig = None,
 ):
     """Unified helper to all-reduce embedding parameters across pipeline stages.
 
@@ -48,7 +49,7 @@ def _allreduce_embedding_grad(
         skip_if_none (bool, optional): If True, quietly returns when the parameter or its
             gradient is ``None``. Defaults to True.
     """
-    
+
     logger.debug(f"Megatron-LM-FL Plugins: _allreduce_embedding_grad")
     embd_group_is_list = isinstance(embd_group, list)
     if (
@@ -63,6 +64,9 @@ def _allreduce_embedding_grad(
             model_module = model[0]
         elif is_pp_last_stage(pp_group):
             model_module = model[-1]
+        elif getattr(config, 'mtp_num_layers', None) is not None and config.mtp_num_layers > 0:
+            # Embedding for MTP layers is in the last virtual pipeline model parallel stage.
+            model_module = model[-1]
         else:  # We do not support an interleaved schedule for models with encoders yet.
             model_module = model[0]
 
diff --git a/megatron/plugin/models/common/language_module/language_module.py b/megatron/plugin/models/common/language_module/language_module.py
index 966262b91dc..32fd30f4081 100644
--- a/megatron/plugin/models/common/language_module/language_module.py
+++ b/megatron/plugin/models/common/language_module/language_module.py
@@ -26,12 +26,14 @@ def _is_in_embd_group(self):
     logger.debug(f"Megatron-LM-FL Plugins: _is_in_embd_group")
     if self.embd_group is None:
         return False
-    
+
     # Original logic: handle single process group
     if not isinstance(self.embd_group, list):
         if torch.distributed.get_rank() in torch.distributed.get_process_group_ranks(
             self.embd_group
         ):
+            if getattr(self, 'mtp_process', False):
+                return True
             if (
                 torch.distributed.get_rank()
                 == torch.distributed.get_process_group_ranks(self.embd_group)[0]
@@ -48,7 +50,8 @@ def _is_in_embd_group(self):
                 )
             else:
                 return True
-    
+        return False
+
     # FlagScale Begin
     else:
         if torch.distributed.get_rank() in torch.distributed.get_process_group_ranks(
@@ -116,7 +119,10 @@ def setup_embeddings_and_output_layer(self) -> None:
     ):
         self.shared_embedding_or_output_weight().shared_embedding = True
 
-    if (self.post_process or getattr(self, 'mtp_process', False)) and not self.pre_process:
+    if (
+        (self.post_process and self.share_embeddings_and_output_weights)
+        or getattr(self, 'mtp_process', False)
+    ) and not self.pre_process:
         assert not (
             is_vp_first_stage(self.vp_stage, self.vp_size) and is_pp_first_stage(self.pp_group)
         )
diff --git a/megatron/plugin/optimizer/clip_grads.py b/megatron/plugin/optimizer/clip_grads.py
index 4b44b7cd01a..9b5de6f77d7 100644
--- a/megatron/plugin/optimizer/clip_grads.py
+++ b/megatron/plugin/optimizer/clip_grads.py
@@ -158,8 +158,8 @@ def count_zeros_fp32(
     parameters: Union[List[torch.Tensor], torch.Tensor],
     grad_stats_parallel_group: torch.distributed.ProcessGroup,
     use_decoupled_grad: bool = False,
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
 ) -> float:
-    logger.debug(f"Megatron-LM-FL Plugins: count_zeros_fp32")
     """Counts the number of zeros in gradients associated with the passed-in list of
     parameters.
 
@@ -174,6 +174,8 @@ def count_zeros_fp32(
             default value is False.
     """
 
+    logger.debug(f"Megatron-LM-FL Plugins: count_zeros_fp32")
+
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]
 
@@ -196,7 +198,7 @@ def count_zeros_fp32(
         grad_attr = "decoupled_grad" if use_decoupled_grad else "grad"
         grad_not_none = hasattr(param, grad_attr) and getattr(param, grad_attr) is not None
         is_not_shared = param_is_not_shared(param)
-        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param, tp_group=tp_group)
         if grad_not_none and is_not_shared and is_not_tp_duplicate:
             grad_obj = getattr(param, grad_attr)
             data_parallel_group = get_data_parallel_group_if_dtensor(grad_obj, data_parallel_group)
diff --git a/megatron/plugin/optimizer/optimizer.py b/megatron/plugin/optimizer/optimizer.py
index a8b4296da77..f2cad52ad7a 100644
--- a/megatron/plugin/optimizer/optimizer.py
+++ b/megatron/plugin/optimizer/optimizer.py
@@ -54,7 +54,23 @@ def _unscale_main_grads_and_check_for_nan(self):
 @plugin_implementation("ChainedOptimizer", "load_state_dict")
 def load_state_dict(self, state_dict):
     logger.debug(f"Megatron-LM-FL Plugins: load_state_dict")
-    if self.convert_to_ep:  # convert tp/pp chained_optimizers to ep chained_optimizers
+
+    if not self.convert_to_ep: # megatron origin apply ep
+        # If there is only one optimizer, we read the state dict as a single optimizer.
+        if len(self.chained_optimizers) == 1:
+            self.chained_optimizers[0].load_state_dict(state_dict)
+            return
+        if len(self.chained_optimizers) != len(state_dict):
+            raise RuntimeError(
+                f'Expected {len(self.chained_optimizers)} entries'
+                f' in state dict, but got {len(state_dict)}.'
+            )
+        if isinstance(state_dict, dict):
+            state_dict = (v for k, v in sorted(state_dict.items()))
+        for optimizer, state in zip(self.chained_optimizers, state_dict):
+            optimizer.load_state_dict(state)
+        self._synchronize_steps()
+    else: # convert tp/pp chained_optimizers to ep chained_optimizers
         logger.info(
             "load_state_dict:convert tp/pp chained_optimizers to ep chained_optimizers!"
         )
@@ -82,18 +98,3 @@ def load_state_dict(self, state_dict):
             new_state_dict = (v for k, v in sorted(new_state_dict.items()))
         for optimizer, state in zip(self.chained_optimizers, new_state_dict):
             optimizer.load_state_dict(state)
-    else:  # megatron source apply ep
-        # If there is only one optimizer, we read the state dict as a single optimizer.
-        if len(self.chained_optimizers) == 1:
-            self.chained_optimizers[0].load_state_dict(state_dict)
-            return
-        if len(self.chained_optimizers) != len(state_dict):
-            raise RuntimeError(
-                f'Expected {len(self.chained_optimizers)} entries'
-                f' in state dict, but got {len(state_dict)}.'
-            )
-        if isinstance(state_dict, dict):
-            state_dict = (v for k, v in sorted(state_dict.items()))
-        for optimizer, state in zip(self.chained_optimizers, state_dict):
-            optimizer.load_state_dict(state)
-        self._synchronize_steps()
diff --git a/megatron/post_training/arguments.py b/megatron/post_training/arguments.py
index 528c8059878..1e988680142 100644
--- a/megatron/post_training/arguments.py
+++ b/megatron/post_training/arguments.py
@@ -2,7 +2,7 @@
 
 
 def add_modelopt_args(parser):
-    """Add additional arguments for using TensorRT Model Optimizer (modelopt) features."""
+    """Add additional arguments for using Model Optimizer (modelopt) features."""
     group = parser.add_argument_group(title="modelopt-generic")
 
     # Model and Checkpoint Compatibility
@@ -28,12 +28,12 @@ def add_modelopt_args(parser):
         action="store_true",
         help="Forcing local DotProductAttention; otherwise TEDotProductAttention is used.",
     )
-
     # Quantization
     group.add_argument(
         "--export-kv-cache-quant",
-        action="store_true",
-        help="Whether or not to perform KV-cache quantization.",
+        help="Type of KV cache quantization to perform.",
+        choices=["none", "fp8", "fp8_affine", "nvfp4", "nvfp4_affine", "nvfp4_rotate"],
+        default="none",
     )
     group.add_argument(
         "--export-real-quant-cfg",
@@ -46,47 +46,33 @@ def add_modelopt_args(parser):
         "--export-quant-cfg",
         type=str,
         default=None,
-        choices=[
-            "int8_sq",
-            "fp8",
-            "fp8_real_quant",
-            "fp8_blockwise",
-            "fp8_blockwise_real_quant",
-            "fp8_blockwise_32",
-            "int4_awq",
-            "w4a8_awq",
-            "nvfp4",
-            "None",
-        ],
-        help="Specify a quantization config from the supported choices.",
+        # TODO replace choices with mtq.config.choices after deprecating the shorter aliases
+        help="Specify a quantization config from mtq.config.choices.",
     )
-
     # Knowledge Distillation
     group.add_argument(
-        '--export-kd-cfg',
+        '--export-kd-teacher-load',
         type=str,
-        default=None,
-        help='Path to distillation configuration yaml file.',
+        help='Path to checkpoint to load as distillation teacher. (Enables distillation mode automatically)',
     )
-
     group.add_argument(
-        '--teacher-model-config',
+        '--export-kd-teacher-model-config',
         type=str,
         default=None,
         help='Path to teacher model config for distillation. If not provided, defaults to ${export_kd_teacher_load}/model_config.yaml.',
     )
-
     group.add_argument(
-        '--export-kd-teacher-load',
+        '--export-kd-teacher-ckpt-format',
         type=str,
-        help='Path to checkpoint to load as distillation teacher.',
+        default=None,
+        choices=['torch', 'torch_dist', 'torch_dcp'],
+        help="Checkpoint format of teacher model, if different from student's.",
     )
     group.add_argument(
-        '--export-kd-teacher-ckpt-format',
+        '--export-kd-cfg',
         type=str,
         default=None,
-        choices=['torch', 'torch_dist', 'zarr', 'torch_dcp'],
-        help="Checkpoint format of teacher model, if different from student's.",
+        help='Path to distillation configuration yaml file, in order to use non-default settings.',
     )
 
     # Finetuning
@@ -122,7 +108,7 @@ def add_modelopt_args(parser):
         action="store_true",
         help='Will be set automatically when loading a ModelOpt checkpoint.',
     )
-    
+
     # GPT-OSS YaRN RoPE support
     group.add_argument(
         '--enable-gpt-oss',
diff --git a/megatron/post_training/checkpointing.py b/megatron/post_training/checkpointing.py
index 745628ee0b7..47aa87b4967 100644
--- a/megatron/post_training/checkpointing.py
+++ b/megatron/post_training/checkpointing.py
@@ -13,6 +13,7 @@
 from megatron.training import get_args
 from megatron.training.checkpointing import _load_base_checkpoint, load_checkpoint
 from megatron.training.utils import print_rank_0, unwrap_model
+from .utils import print_distributed_quant_summary
 
 logger = logging.getLogger(__name__)
 
@@ -176,13 +177,8 @@ def _remove_prefix_state_dict_pre_hook(
         )
         model_state_dict = state_dict["model"]
         unwrapped_model[0].load_state_dict(model_state_dict, strict=False)
+        print_distributed_quant_summary(unwrapped_model[0])
     elif sharded_load_dir is not None and optimizer is None and opt_param_scheduler is None:
-
-        force_pre_mcore_014 = not is_torch_min_version("2.6a0")
-        if force_pre_mcore_014 and not args.dist_ckpt_save_pre_mcore_014:
-            logger.warning(f"PyTorch version {get_torch_version()} below 2.6 detected."
-                       f" Forcing dist_ckpt_save_pre_mcore_014 behavior.")
-
         sharded_state_dict_metadata = dist_checkpointing.load_content_metadata(sharded_load_dir)
         sharded_state_dict = unwrapped_model[0].sharded_state_dict(
             prefix=additional_sharded_prefix, metadata=sharded_state_dict_metadata
@@ -196,5 +192,6 @@ def _remove_prefix_state_dict_pre_hook(
             sharded_state_dict, sharded_load_dir, strict=args.dist_ckpt_strictness
         )
         unwrapped_model[0].load_state_dict(model_state_dict, strict=False)
+        print_distributed_quant_summary(unwrapped_model[0])
     else:
-        _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict, load_arg=load_arg)
+        _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict, load_arg=load_arg)
\ No newline at end of file
diff --git a/megatron/post_training/model_builder.py b/megatron/post_training/model_builder.py
index cb2654e7107..fea837c96c3 100644
--- a/megatron/post_training/model_builder.py
+++ b/megatron/post_training/model_builder.py
@@ -24,6 +24,8 @@
 from megatron.training import get_args, print_rank_0
 from megatron.training.arguments import core_transformer_config_from_args
 
+from megatron.post_training.utils import print_distributed_quant_summary
+
 
 def count_parameters_in_layer(model, layer_name):
     num_params = 0
@@ -45,9 +47,9 @@ def _add_load_convert_hooks(model: MCoreGPTModel):
 def _load_teacher_model_config(checkpoint_path: str) -> Namespace:
     """Reads teacher config from a file.
 
-    The config provided via --teacher-model-config should specify
-    (in NEMO format) any model architecture settings which differ from the main student model's.
-    This function will translate NEMO field names to MCore as needed.
+    The config provided, either in the teacher checkpoint dir or via `--export-kd-teacher-model-config`,
+    should specify (in NeMo yaml config format) any model architecture settings which differ from the main student model's.
+    This function will translate NeMo field names to MCore as needed.
     """
     required_teacher_fields = (
         "num_layers",
@@ -57,18 +59,22 @@ def _load_teacher_model_config(checkpoint_path: str) -> Namespace:
     )
 
     args = get_args()
-    config_path = os.path.join(checkpoint_path, "model_config.yaml") if args.teacher_model_config is None else args.teacher_model_config
+    if args.export_kd_teacher_model_config is not None:
+        config_path = args.export_kd_teacher_model_config
+    else:
+        config_path = os.path.join(checkpoint_path, "model_config.yaml")
     if not os.path.exists(config_path):
         raise FileNotFoundError(
-            "Teacher checkpoint dir must contain a NEMO-format yaml config named 'model_config.yaml'"
+            f"Teacher model-config file {config_path} not found.\n"
+            "Teacher checkpoint dir must contain a NeMo-format config named 'model_config.yaml'"
+            " or provide it via --export-kd-teacher-model-config."
         )
     with open(config_path) as f:
         config = yaml.safe_load(f)
 
-    missing_keys = [k for k in required_teacher_fields if k not in config]
-    if missing_keys:
+    if missing_keys := [k for k in required_teacher_fields if k not in config]:
         raise ValueError(
-            f"Teacher `model_config.yaml` file missing the following fields: {missing_keys}"
+            f"Teacher model config file ({config_path}) missing the following required fields: {missing_keys}"
         )
 
     if "encoder_seq_length" in config:
@@ -102,20 +108,39 @@ def _load_teacher_model_config(checkpoint_path: str) -> Namespace:
     return Namespace(**args_dict)
 
 
-def _teacher_provider(config: Namespace, model_kwargs: Dict[str, Any]) -> MCoreGPTModel:
-    """Teacher model factory (must be a non-local function to pickle)."""
+def _load_teacher_model(config, config_raw: Namespace, model_kwargs: Dict[str, Any]) -> MCoreGPTModel:
+    """Teacher model creator."""
     args = get_args()
 
-    # Convert to `TransformerConfig` here to avoid ModelOpt pickling issues (contains local functions)
-    config = core_transformer_config_from_args(config)
-
     if config.is_hybrid_model:
+        # These parameters are not part of the TransformerConfig and need to be passed separately.
+        if "hybrid_override_pattern" in config_raw:
+            model_kwargs["hybrid_override_pattern"] = config_raw.hybrid_override_pattern
+        if "hybrid_attention_ratio" in config_raw:
+            model_kwargs["hybrid_attention_ratio"] = config_raw.hybrid_attention_ratio
+        if "hybrid_mlp_ratio" in config_raw:
+            model_kwargs["hybrid_mlp_ratio"] = config_raw.hybrid_mlp_ratio
+
         teacher = MCoreMambaModel(config=config, **model_kwargs)
     else:
+        # GPT layer spec needs re-creation since it depends on number of model layers.
+        if config.heterogeneous_block_specs:
+            model_kwargs["transformer_layer_spec"] = get_gpt_heterogeneous_layer_spec(
+                config=config,
+                use_te=(args.transformer_impl == "transformer_engine"),
+            )
+        else:
+            model_kwargs["transformer_layer_spec"] = get_gpt_modelopt_spec(
+                config=config,
+                local_core_attention=False if config.context_parallel_size > 1 else args.export_force_local_attention,
+                remap_te_layernorm=args.export_te_mcore_model,
+                real_quant_cfg=args.export_real_quant_cfg,
+                use_arbitrary_attention_mask=False if config.context_parallel_size > 1 else True,
+            )
         teacher = MCoreGPTModel(config=config, **model_kwargs)
     _add_load_convert_hooks(teacher)
 
-    print_rank_0("Loading teacher {} checkpoint...".format("MCoreMambaModel" if config.is_hybrid_model else "MCoreGPTModel"))
+    print_rank_0(f"Loading teacher as {type(teacher).__name__} from {args.export_kd_teacher_load} ...")
     # [WAR]: load checkpoint will check checkpoint's saved args and rng state if not finetune.
     # To avoid error out on loading teacher's checkpoint, we temporarily set args.finetune to
     # True while loading the teacher checkpoint.
@@ -125,12 +150,19 @@ def _teacher_provider(config: Namespace, model_kwargs: Dict[str, Any]) -> MCoreG
         args.ckpt_format = args.export_kd_teacher_ckpt_format
     load_modelopt_checkpoint([teacher], load_arg='export_kd_teacher_load')
     args.finetune, args.ckpt_format = original_args_finetune, original_ckpt_format
-    print_rank_0("successfully loaded teacher...")
+    print_rank_0("...teacher loaded successfully.")
 
     return teacher
 
 
-def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, config=None) -> MCoreGPTModel | MCoreMambaModel:
+def modelopt_gpt_mamba_builder(
+    args,
+    pre_process,
+    post_process,
+    vp_stage=None,
+    config=None,
+    pg_collection=None,
+) -> MCoreGPTModel | MCoreMambaModel:
     """Builds the model.
 
     Args:
@@ -139,6 +171,9 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c
         post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True.
         vp_stage (int, optional): The virtual pipeline stage.
         config (TransformerConfig, optional): The configuration object.
+        pg_collection (ProcessGroupCollection, optional): Collection of process groups
+            used for tensor/context/pipeline/data parallelism. If provided, it will be
+            attached to the returned model for downstream routing/resharding utilities.
 
     Returns:
         MCoreGPTModel | MCoreMambaModel: The returned model
@@ -221,6 +256,7 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c
             "rotary_percent": args.rotary_percent,
             "rotary_base": args.rotary_base,
             "rope_scaling": args.use_rope_scaling,
+            "pg_collection": pg_collection,
         }
         model = MCoreGPTModel(config=config, **model_kwargs)
     elif args.export_model_type == "MambaModel" or args.is_hybrid_model:
@@ -244,6 +280,7 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c
             "position_embedding_type": args.position_embedding_type,
             "rotary_percent": args.rotary_percent,
             "rotary_base": args.rotary_base,
+            "pg_collection": pg_collection,
         }
 
         model = MCoreMambaModel(config=config, **model_kwargs)
@@ -285,19 +322,14 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c
                 args.virtual_pipeline_model_parallel_size is None
             ), "ModelOpt Distillation currently incompatible with interleaved pipeline schedule."
 
-        teacher_config = _load_teacher_model_config(args.export_kd_teacher_load)
+        teacher_config_raw = _load_teacher_model_config(args.export_kd_teacher_load)
+        teacher_config = core_transformer_config_from_args(teacher_config_raw)  # convert to TransformerConfig
+
         distill_cfg = mtd_mcore.setup_distillation_config(
-            args.export_kd_cfg, student_cfg=config, teacher_cfg=core_transformer_config_from_args(teacher_config)
+            args.export_kd_cfg, student_cfg=config, teacher_cfg=teacher_config
         )
-        if "hybrid_override_pattern" in teacher_config and args.is_hybrid_model:
-            model_kwargs["hybrid_override_pattern"] = teacher_config.hybrid_override_pattern
-        if "hybrid_attention_ratio" in teacher_config and args.is_hybrid_model:
-            model_kwargs["hybrid_attention_ratio"] = teacher_config.hybrid_attention_ratio
-        if "hybrid_mlp_ratio" in teacher_config and args.is_hybrid_model:
-            model_kwargs["hybrid_mlp_ratio"] = teacher_config.hybrid_mlp_ratio
-
         kd_config = {
-            "teacher_model": (_teacher_provider, [teacher_config, model_kwargs], {}),
+            "teacher_model": _load_teacher_model(teacher_config, teacher_config_raw, model_kwargs),
             "criterion": distill_cfg.criterion,
             "loss_balancer": distill_cfg.loss_balancer,
         }
@@ -308,5 +340,6 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c
         mtd_mcore.adjust_distillation_model_for_mcore(model, distill_cfg)
         # Also remove KD mode state to prevent issues with re-conversion after restore.
         mto.ModeloptStateManager(model).state_dict().pop()  # TODO(aanoosheh): remove once fixed in ModelOpt
-
+    
+    print_distributed_quant_summary(model)
     return model
diff --git a/megatron/post_training/non_loss_data_func.py b/megatron/post_training/non_loss_data_func.py
index 49c29b4912c..d5daf622372 100644
--- a/megatron/post_training/non_loss_data_func.py
+++ b/megatron/post_training/non_loss_data_func.py
@@ -40,7 +40,7 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7):
         total_steps += steps
         if torch.distributed.get_rank() == 0:
             al = actual_osl / steps
-            ar = al / (draft_steps + parallel_draft_step - 1)
+            ar = al / (draft_steps + parallel_draft_step)
             print(
                 "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format(
                     torch.distributed.get_rank(),
@@ -57,7 +57,7 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7):
             )
     if torch.distributed.get_rank() == 0:
         al = total_osl / total_steps
-        ar = al / (draft_steps + parallel_draft_step - 1)
+        ar = al / (draft_steps + parallel_draft_step)
         print(
             "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format(
                 torch.distributed.get_rank(),
diff --git a/megatron/post_training/utils.py b/megatron/post_training/utils.py
index 4bec8c96cf1..b24ba291127 100644
--- a/megatron/post_training/utils.py
+++ b/megatron/post_training/utils.py
@@ -1,9 +1,45 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
+import inspect
 import os
+
+import modelopt
+import modelopt.torch.quantization as mtq
 import torch
-from datasets import load_dataset
+from modelopt.torch.quantization.utils import is_quantized
+from packaging.version import Version
+
+from megatron.core import parallel_state
+from megatron.training.utils import unwrap_model
+
+
+def modelopt_version_higher_than(target_version: str):
+    """Check if Model-Optimizer is greater than this version."""
+    info = "rank {:3}/{:3} checking if nvidia-modelopt-{} is higher than {}".format(
+        torch.distributed.get_rank(),
+        torch.distributed.get_world_size(),
+        str(modelopt.__version__),
+        target_version,
+    )
+    print(info)
+    return Version(modelopt.__version__) > Version(target_version)
+
+def modelopt_version_at_least(target_version: str):
+    """Check if Model-Optimizer is greater or equal than this version."""
+    info = "rank {:3}/{:3} checking if nvidia-modelopt-{} is at least {}".format(
+        torch.distributed.get_rank(),
+        torch.distributed.get_world_size(),
+        str(modelopt.__version__),
+        target_version,
+    )
+    print(info)
+    return Version(modelopt.__version__) >= Version(target_version)
+
 
+def function_has_parameter(function, argument_name: str) -> bool:
+    """Check if a function has a specific argument."""
+    sig = inspect.signature(function)
+    return argument_name in sig.parameters
 
 def get_current_memory_info():
     """Get current memory usage."""
@@ -26,6 +62,7 @@ def report_current_memory_info():
 
 def get_mtbench_chat_data():
     """Return a MTBench dataset."""
+    from datasets import load_dataset
 
     def mtbench_to_oai_chat(example):
         """Convert MTBench data to OpenAI chat completion format."""
@@ -35,12 +72,15 @@ def mtbench_to_oai_chat(example):
         example["conversations"] = conversations
         return example
 
-    dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train", token=os.environ.get("HF_TOKEN", None))
+    dataset = load_dataset(
+        "HuggingFaceH4/mt_bench_prompts", split="train", token=os.environ.get("HF_TOKEN", None)
+    )
     return dataset.map(mtbench_to_oai_chat)
 
+
 def to_empty_if_meta(module: torch.nn.Module, *, device: torch.device, recurse=True):
     """Move tensors to device if not meta device; otherwise materialize with empty_like().
-   
+
     Args:
         module: The target module to apply this transformation.
         device: The desired device of the parameters
@@ -55,6 +95,34 @@ def _empty_like_if_meta(tensor: torch.Tensor, *, device: torch.device):
         else:
             return tensor.to(device)
 
-    module._apply(
-        lambda t: _empty_like_if_meta(t, device=device), recurse=recurse
-    )
+    module._apply(lambda t: _empty_like_if_meta(t, device=device), recurse=recurse)
+
+
+def print_distributed_quant_summary(model, msg=""):
+    from megatron.core import parallel_state
+    from megatron.training import print_rank_0
+    from megatron.training.utils import unwrap_model
+
+    unwrapped_model = unwrap_model(model)
+    if isinstance(unwrapped_model, list):
+        unwrapped_model = unwrapped_model[0]
+
+    if not is_quantized(unwrapped_model):
+        return
+
+    print_rank_0(f"{msg}\nQuantization summary of unwrapped model: {unwrapped_model}\n{'_'*80}")
+
+    if not torch.distributed.is_initialized():
+        mtq.print_quant_summary(unwrapped_model)
+        return
+
+    # Only print from unique TP ranks of [0, 1]
+    if parallel_state.get_data_parallel_rank(
+        with_context_parallel=True
+    ) == 0 and parallel_state.get_tensor_model_parallel_rank() in [0, 1]:
+        TP_rank = parallel_state.get_tensor_model_parallel_rank()
+        EP_rank = parallel_state.get_expert_model_parallel_rank()
+        PP_rank = parallel_state.get_pipeline_model_parallel_rank()
+        print(f"\nTP rank {TP_rank}, EP rank {EP_rank}, PP rank {PP_rank}")
+        print("_" * 80)
+        mtq.print_quant_summary(unwrapped_model)
diff --git a/megatron/rl/__init__.py b/megatron/rl/__init__.py
index d3ae2fefd16..08ae226bfe4 100644
--- a/megatron/rl/__init__.py
+++ b/megatron/rl/__init__.py
@@ -12,6 +12,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self, Type
 
+
 def import_class(class_path: str) -> Type:
     """Import a class from a string path.
 
diff --git a/megatron/rl/agent/api.py b/megatron/rl/agent/api.py
index fce7c3073ee..9568db3a54d 100644
--- a/megatron/rl/agent/api.py
+++ b/megatron/rl/agent/api.py
@@ -8,6 +8,8 @@
 import numpy as np
 from pydantic import BaseModel
 
+from megatron.core.utils import trace_async_exceptions
+
 from ..__init__ import Request, TypeLookupable
 from ..inference import (
     ChatInferenceInterface,
@@ -18,8 +20,6 @@
     ReturnsRaw,
 )
 
-from megatron.core.utils import trace_async_exceptions
-
 
 class AgentBaseModel(BaseModel, extra='allow'):
     pass
@@ -46,8 +46,8 @@ class GroupedRolloutRequest(Request):
 class Rollout(AgentBaseModel):
     """Data for language-based Rollout."""
 
-    trajectory: str
-    prompt_length: int | None = None
+    trajectory: list[str]
+    prompt_length: list[int] | None = None
     reward: float = None
     env_id: str | None = None
     problem_id: str | None = None
@@ -56,10 +56,10 @@ class Rollout(AgentBaseModel):
 class TokenRollout(AgentBaseModel):
     """Tokenized representation of a language-based Rollout."""
 
-    trajectory: list[int]
+    trajectory: list[list[int]]
     reward: list[float] | float
-    generation_mask: list[list[int]] | list[bool] | None = None
-    logprobs: list[float] | None = None
+    generation_mask: list[list[bool]] | None = None
+    logprobs: list[list[float]] | None = None
     env_id: str | None = None
     problem_id: str | None = None
 
@@ -67,8 +67,8 @@ class TokenRollout(AgentBaseModel):
 class ContrastiveRollout(AgentBaseModel):
     """Contrastive/Preference data for language-based Rollout."""
 
-    chosen_trajectory: str
-    rejected_trajectory: str
+    chosen_trajectory: list[str]
+    rejected_trajectory: list[str]
 
 
 class Head2HeadRolloutRequest(Request):
@@ -174,6 +174,11 @@ class GroupedRolloutGenerator(Agent, ABC):
     parallel_generation_tasks: int = 512
     buffer_size: int = 10
 
+    def __init__(self, *, parallel_generation_tasks: int | None = None, **kwargs):
+        super().__init__(**kwargs)
+        if parallel_generation_tasks is not None:
+            self.parallel_generation_tasks = parallel_generation_tasks
+
     @abstractmethod
     async def group_rollout(self, request: GroupedRolloutRequest) -> list[Rollout]: ...
 
diff --git a/megatron/rl/agent/huggingface_dataset_agent.py b/megatron/rl/agent/huggingface_dataset_agent.py
index 7b3754a3865..2b25b9eeef9 100644
--- a/megatron/rl/agent/huggingface_dataset_agent.py
+++ b/megatron/rl/agent/huggingface_dataset_agent.py
@@ -31,4 +31,3 @@ def load_hf_dataset(self):
             return load_dataset("arrow", data_dir=self.dataset_file, split=self.split)
         else:
             return load_dataset(self.hf_dataset_name, split=self.split)
-
diff --git a/megatron/rl/agent/reward_only_agent.py b/megatron/rl/agent/reward_only_agent.py
index 2e81674c74d..53b1f7407b2 100644
--- a/megatron/rl/agent/reward_only_agent.py
+++ b/megatron/rl/agent/reward_only_agent.py
@@ -104,16 +104,16 @@ async def rollout_from_response(
                 for x in range(len(response.token_ids))
             ]
             rollout = TokenRollout(
-                trajectory=response.token_ids,
+                trajectory=[response.token_ids],
                 reward=await self.get_reward(response_text, golden),
-                logprobs=logprobs,
-                generation_mask=generation_mask,
+                logprobs=[logprobs],
+                generation_mask=[generation_mask],
                 env_id=self.env_id,
                 problem_id=golden['problem_id'] if 'problem_id' in golden else None,
             )
         else:
             rollout = Rollout(
-                trajectory=raw_text,
+                trajectory=[raw_text],
                 reward=await self.get_reward(response_text, golden),
                 env_id=self.env_id,
                 problem_id=golden['problem_id'] if 'problem_id' in golden else None,
diff --git a/megatron/rl/agent/weighted_multi_task.py b/megatron/rl/agent/weighted_multi_task.py
index 8596ad6adcd..4690d9f1600 100644
--- a/megatron/rl/agent/weighted_multi_task.py
+++ b/megatron/rl/agent/weighted_multi_task.py
@@ -66,7 +66,9 @@ def __init__(self, agent_configs: list[AgentConfig]):
                 self.weights.append(config.weight / total_weight)
 
     @classmethod
-    def from_config(cls, config: list[dict[str, Any]]) -> 'WeightedMultiTask':
+    def from_config(
+        cls, config: list[dict[str, Any]], *, parallel_generation_tasks: int | None = None
+    ) -> 'WeightedMultiTask':
         """Create a WeightedMultiTask from a config list.
 
         Args:
@@ -82,13 +84,15 @@ def from_config(cls, config: list[dict[str, Any]]) -> 'WeightedMultiTask':
         for entry in config:
             if not all(k in entry for k in ['agent_type', 'agent_args', 'weight']):
                 raise ValueError(f"Missing required keys in config entry: {entry}")
+            agent_args = entry.get('agent_args', {})
+            agent_args['parallel_generation_tasks'] = parallel_generation_tasks
 
             # Import and instantiate the agent class
             agent_type = import_class(entry['agent_type'])
             agent_configs.append(
                 AgentConfig(
                     agent_type=agent_type,
-                    agent_args=entry['agent_args'],
+                    agent_args=agent_args,
                     weight=float(entry['weight']),
                     evaluation_only=entry.get('evaluation_only', False),
                 )
diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py
index 54acc112dd9..4e9364b3ae9 100644
--- a/megatron/rl/inference/megatron.py
+++ b/megatron/rl/inference/megatron.py
@@ -8,7 +8,6 @@
 from pydantic import PrivateAttr
 
 from megatron.core import parallel_state
-from megatron.core.utils import get_attr_wrapped_model
 from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext
 from megatron.core.inference.engines.abstract_engine import AbstractEngine
 from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine
@@ -25,13 +24,15 @@
     SimpleTextGenerationController,
 )
 from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage
 from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
 from megatron.core.transformer.module import MegatronModule
-from megatron.core.pipeline_parallel.utils import (
-    is_pp_first_stage,
-    is_pp_last_stage,
+from megatron.core.utils import (
+    get_attr_wrapped_model,
+    get_mamba_inference_state_config_from_model,
+    get_pg_size,
+    log_single_rank,
 )
-from megatron.core.utils import get_mamba_inference_state_config_from_model, log_single_rank, get_pg_size
 from megatron.training import get_wandb_writer
 from megatron.training.global_vars import get_args, get_tokenizer
 
@@ -79,8 +80,12 @@ def get_static_inference_engine(args: Namespace, model: MegatronModule) -> Abstr
     )
 
     inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
+    pg_collection = get_attr_wrapped_model(model, "pg_collection")
+    pp_group = pg_collection.pp
     text_generation_controller = SimpleTextGenerationController(
-        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+        inference_wrapped_model=inference_wrapped_model,
+        tokenizer=tokenizer,
+        pp_group=pp_group,
     )
     return MCoreEngine(
         text_generation_controller=text_generation_controller,
@@ -91,8 +96,12 @@ def get_static_inference_engine(args: Namespace, model: MegatronModule) -> Abstr
 
 
 ## This code is copied from tools/run_text_generation_server.py
-def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inference_logging_step_interval: int = 0,
-    metrics_writer = None) -> AbstractEngine:
+def get_dynamic_inference_engine(
+    args: Namespace,
+    model: MegatronModule,
+    inference_logging_step_interval: int = 0,
+    metrics_writer = None
+) -> AbstractEngine:
     """Get the relevant backend for running inference.
 
     This function will automatically choose the TRTLLMBackend when possible,
@@ -114,34 +123,38 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen
 
     mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
 
-    # DynamicInferenceContext must use the inference model's TP size, not the
-    # training TP size from global args. The inference model may have a custom
-    # ProcessGroupCollection with a different TP size.
+    # DynamicInferenceContext must use the inference model's TP / PP size, not the
+    # training TP / PP size from global args. The inference model may have a custom
+    # ProcessGroupCollection with a different TP / PP size.
     pg_collection = get_attr_wrapped_model(model, "pg_collection")
     tp_group = getattr(pg_collection, 'tp', None) if pg_collection is not None else None
     if tp_group is not None:
         inference_tp_size = get_pg_size(tp_group)
     else:
         inference_tp_size = args.tensor_model_parallel_size
+    pp_group = getattr(pg_collection, 'pp', None) if pg_collection is not None else None
+    if pp_group is not None:
+        inference_pp_size = get_pg_size(pp_group)
+    else:
+        inference_pp_size = args.pipeline_model_parallel_size
 
     # Inference context.
     inference_context = DynamicInferenceContext(
         params_dtype=args.params_dtype,
-        num_layers=args.num_layers // args.pipeline_model_parallel_size,
+        num_layers=args.num_layers // inference_pp_size,
         kv_channels=args.kv_channels,
         num_attention_heads=(
             args.num_query_groups if args.group_query_attention else args.num_attention_heads
         ),
         max_sequence_length=args.inference_max_seq_length,
         num_cuda_graphs=(
-            args.inference_dynamic_batching_num_cuda_graphs
-            if enable_cuda_graph
-            else None
+            args.inference_dynamic_batching_num_cuda_graphs if enable_cuda_graph else None
         ),
         block_size_tokens=args.inference_dynamic_batching_block_size,
         buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
+        max_requests=args.inference_dynamic_batching_max_requests,
         max_tokens=args.inference_dynamic_batching_max_tokens,
-        tensor_model_parallel_size=inference_tp_size,
+        pg_collection=pg_collection,  # TP/PP sizes are derived from the model's pg_collection.
         materialize_only_last_token_logits=True,
         mamba_inference_state_config=mamba_inference_state_config,
         cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents,
@@ -153,23 +166,29 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen
         cuda_graph_max_tokens=args.inference_dynamic_batching_cuda_graph_max_tokens,
         cuda_graph_mixed_prefill_count=args.inference_dynamic_batching_cuda_graph_mixed_prefill_count,
         metrics_writer=metrics_writer,
+        persist_cuda_graphs=args.rl_training_cuda_graphs,
+        offload_kv_cache=args.rl_offload_kv_cache_during_training
     )
 
-    inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context)
+    inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context, pg_collection=pg_collection)
 
     inference_wrapped_model.model_is_pipeline_parallel = not (
         is_pp_first_stage(pg_collection.pp) and is_pp_last_stage(pg_collection.pp)
     )
 
+    pp_group = getattr(pg_collection, "pp", None)
     text_generation_controller = SimpleTextGenerationController(
-        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+        inference_wrapped_model=inference_wrapped_model,
+        tokenizer=tokenizer,
+        pp_group=pp_group,
     )
 
     return DynamicInferenceEngine(
         controller=text_generation_controller,
         context=inference_context,
-        enable_cuda_graph=enable_cuda_graph,
         random_seed=args.seed,
+        track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events,
+        enable_chunked_prefill=not args.disable_chunked_prefill,
         inference_logging_step_interval=inference_logging_step_interval,
         pg_collection=pg_collection,
     )
@@ -195,6 +214,7 @@ async def base_generate(self, request: InferenceRequest):
         assert self._client is not None, "Client is not initialized"
 
         tokenizer = get_tokenizer()
+        args = get_args()
 
         sampling_params = SamplingParams(
             num_tokens_to_generate=None,
@@ -205,15 +225,13 @@ async def base_generate(self, request: InferenceRequest):
             termination_id=self._inference_engine.controller.tokenizer.eod,
             return_log_probs=True,
             skip_prompt_log_probs=True,
-            add_BOS=tokenizer.bos is not None,
+            add_BOS=(not args.rl_skip_bos_token and tokenizer.bos is not None),
         )
         requests = [
             self._client.add_request(prompt=prompt, sampling_params=sampling_params)
             for prompt in request.prompt
         ]
-        records = await asyncio.gather(
-            *requests
-        )
+        records = await asyncio.gather(*requests)
         responses = [record[-1] for record in records]
         return [
             InferenceResponse(
@@ -237,24 +255,37 @@ async def launch(cls, model: GPTModel, **kwargs):
                 logging.WARNING,
                 "WARNING: Tokenizer has no BOS token so prompt will not have BOS token",
             )
-        
+
         # Get inference logging configuration from args
-        inference_logging_step_interval = args.inference_wandb_logging_step_interval
-        
+        log_inference_wandb = args.inference_wandb_logging
+        inference_logging_step_interval = args.inference_logging_step_interval
+
         # Get metrics writer if logging is enabled and on the logging rank
         # Use the same rank convention as training (last rank logs)
         metrics_writer = None
-        if inference_logging_step_interval > 0 and args.rank == (args.world_size - 1):
+        if (
+            inference_logging_step_interval > 0
+            and log_inference_wandb
+            and args.rank == (args.world_size - 1)
+        ):
             metrics_writer = get_wandb_writer()
             if metrics_writer is None:
-                log_single_rank(logger, logging.WARNING, "WARNING: --rl-inference-logging-step-interval is set but no metrics writer "
-                           "wandb module is available. Inference logging will be disabled.")
-
-        inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(args, model, inference_logging_step_interval, metrics_writer)
-        await inference_engine.start_listening_to_data_parallel_coordinator(inference_coordinator_port=41521, launch_inference_coordinator=True)
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "WARNING: --rl-inference-logging-step-interval is set but no metrics writer "
+                    "wandb module is available. Inference logging will be disabled.",
+                )
+
+        inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(
+            args, model, inference_logging_step_interval, metrics_writer
+        )
+        dp_addr = await inference_engine.start_listening_to_data_parallel_coordinator(
+            inference_coordinator_port=41521, launch_inference_coordinator=True,
+        )
         if dist.get_rank() == 0:
             # TODO: We have to do this only on the rank 0 process, should be fixed in the future when we have support for multiple inference clients. !2278
-            client = InferenceClient(inference_coordinator_port=41521)
+            client = InferenceClient(inference_coordinator_address=dp_addr)
             await client.start()
         else:
             client = None
@@ -279,4 +310,5 @@ async def resume(self):
             self._client.unpause_engines()
         await self._inference_engine.running.wait()
 
+
 class MegatronChatLocal(ChatInferenceInterface, MegatronLocal): ...
diff --git a/megatron/rl/parallel_utils.py b/megatron/rl/parallel_utils.py
new file mode 100644
index 00000000000..9cab73daba9
--- /dev/null
+++ b/megatron/rl/parallel_utils.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+Utilities for building process groups for RL inference models with custom parallelism.
+"""
+
+from typing import Optional
+
+import torch.distributed as dist
+
+from megatron.core import mpu
+from megatron.core.hyper_comm_grid import HyperCommGrid
+from megatron.core.process_groups_config import ProcessGroupCollection
+
+
+def build_inference_pg_collection(
+    world_size: int,
+    tp_size: Optional[int] = None,
+    pp_size: Optional[int] = None,
+    cp_size: Optional[int] = None,
+    ep_size: Optional[int] = None,
+    expt_tp_size: Optional[int] = None,
+    use_tp_pp_dp_mapping: bool = False,
+) -> ProcessGroupCollection:
+    """
+    Build a ProcessGroupCollection for an RL inference model with custom parallelism.
+
+    Uses two HyperCommGrids matching the structure of mpu:
+    - decoder_grid: for dense/attention layers (tp, cp, dp, pp)
+    - expert_grid: for MoE expert layers (expt_tp, ep, expt_dp, pp)
+
+    Args:
+        world_size: Total world size (number of ranks).
+        tp_size: Tensor model parallel size. Defaults to training's TP size.
+        pp_size: Pipeline parallel size. Defaults to training's PP size.
+        cp_size: Context parallel size. Defaults to training's CP size.
+        ep_size: Expert parallel size. Defaults to training's EP size.
+        expt_tp_size: Expert tensor parallel size. Defaults to training's expert TP size.
+        use_tp_pp_dp_mapping: If True, use 'tp-pp-dp' order; otherwise 'tp-dp-pp'.
+
+    Returns:
+        ProcessGroupCollection configured for the inference model.
+    """
+    # Use current MPU values as defaults
+    if tp_size is None:
+        tp_size = mpu.get_tensor_model_parallel_world_size()
+    if cp_size is None:
+        cp_size = mpu.get_context_parallel_world_size()
+    if pp_size is None:
+        pp_size = mpu.get_pipeline_model_parallel_world_size()
+    if ep_size is None:
+        ep_size = mpu.get_expert_model_parallel_world_size()
+    if expt_tp_size is None:
+        expt_tp_size = mpu.get_expert_tensor_parallel_world_size()
+
+
+    # Compute DP size for dense layers (same formula as mpu)
+    # world = tp × cp × dp × pp
+    dp_size = world_size // (tp_size * cp_size * pp_size)
+    assert dp_size >= 1 and (tp_size * cp_size * dp_size * pp_size) == world_size, (
+        f"World size ({world_size}) must be divisible by tp*cp*pp ({tp_size * cp_size * pp_size})"
+    )
+
+    # Compute expert DP size (same formula as mpu)
+    # world = expt_tp × ep × expt_dp × pp
+    expt_dp_size = world_size // (expt_tp_size * ep_size * pp_size)
+    assert expt_dp_size >= 1 and (expt_tp_size * ep_size * expt_dp_size * pp_size) == world_size, (
+        f"World size ({world_size}) must be divisible by expt_tp*ep*pp ({expt_tp_size * ep_size * pp_size})"
+    )
+
+    rank = dist.get_rank()
+
+    # ====================
+    # Create decoder grid for dense/attention layers
+    # Matches mpu's decoder_rank_generator with ep=1
+    # ====================
+    if use_tp_pp_dp_mapping:
+        # Order: tp-cp-pp-dp
+        decoder_grid = HyperCommGrid(
+            [tp_size, cp_size, pp_size, dp_size],
+            ["tp", "cp", "pp", "dp"]
+        )
+    else:
+        # Order: tp-cp-dp-pp (default)
+        decoder_grid = HyperCommGrid(
+            [tp_size, cp_size, dp_size, pp_size],
+            ["tp", "cp", "dp", "pp"]
+        )
+
+    # Create dense layer groups from decoder_grid
+    tp_group = decoder_grid.create_pg("tp")
+    cp_group = decoder_grid.create_pg("cp")
+    pp_group = decoder_grid.create_pg("pp")
+    dp_group = decoder_grid.create_pg("dp")
+    mp_group = decoder_grid.create_pg(["tp", "pp"])
+    tp_cp_group = decoder_grid.create_pg(["tp", "cp"])
+    dp_cp_group = decoder_grid.create_pg(["cp", "dp"])
+    tp_dp_cp_group = decoder_grid.create_pg(["tp", "cp", "dp"])
+
+    # ====================
+    # Create expert grid for MoE expert layers
+    # Matches mpu's expert_decoder_rank_generator with cp=1
+    # ====================
+    if use_tp_pp_dp_mapping:
+        # Order: tp-ep-pp-dp
+        expert_grid = HyperCommGrid(
+            [expt_tp_size, ep_size, pp_size, expt_dp_size],
+            ["tp", "ep", "pp", "dp"]
+        )
+    else:
+        # Order: tp-ep-dp-pp (default)
+        expert_grid = HyperCommGrid(
+            [expt_tp_size, ep_size, expt_dp_size, pp_size],
+            ["tp", "ep", "dp", "pp"]
+        )
+
+    # Verify PP groups match between decoder and expert grids (required by mpu)
+    decoder_pp_enum = decoder_grid.get_rank_enum("pp")
+    expert_pp_enum = expert_grid.get_rank_enum("pp")
+    assert decoder_pp_enum == expert_pp_enum, (
+        f"PP groups must match between decoder and expert grids. "
+        f"Decoder: {decoder_pp_enum}, Expert: {expert_pp_enum}"
+    )
+
+    # Create expert layer groups from expert_grid
+    ep_group = expert_grid.create_pg("ep")
+    expt_tp_group = expert_grid.create_pg("tp")
+    expt_dp_group = expert_grid.create_pg("dp")
+    tp_ep_group = expert_grid.create_pg(["tp", "ep"])
+    tp_ep_pp_group = expert_grid.create_pg(["tp", "ep", "pp"])
+
+    # ====================
+    # Embedding groups (derived from PP groups)
+    # ====================
+    embd_group = None
+    pos_embd_group = None
+
+    pp_rank_enum = decoder_grid.get_rank_enum("pp")
+    for pp_ranks in pp_rank_enum:
+        # Embedding is on first and last PP stage
+        if len(pp_ranks) == 1:
+            embd_ranks = [pp_ranks[0]]
+        else:
+            embd_ranks = [pp_ranks[0], pp_ranks[-1]]
+        group = dist.new_group(ranks=embd_ranks)
+        if rank in embd_ranks:
+            embd_group = group
+
+        # Position embedding is only on first PP stage
+        pos_embd_ranks = [pp_ranks[0]]
+        group = dist.new_group(ranks=pos_embd_ranks)
+        if rank in pos_embd_ranks:
+            pos_embd_group = group
+
+    return ProcessGroupCollection(
+        tp=tp_group,
+        cp=cp_group,
+        pp=pp_group,
+        ep=ep_group,
+        embd=embd_group,
+        pos_embd=pos_embd_group,
+        dp=dp_group,
+        tp_cp=tp_cp_group,
+        mp=mp_group,
+        expt_tp=expt_tp_group,
+        expt_dp=expt_dp_group,
+        tp_ep=tp_ep_group,
+        tp_ep_pp=tp_ep_pp_group,
+        dp_cp=dp_cp_group,
+        tp_dp_cp=tp_dp_cp_group,
+    )
diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py
index c4630388c96..364a80db81e 100644
--- a/megatron/rl/rl_utils.py
+++ b/megatron/rl/rl_utils.py
@@ -2,17 +2,18 @@
 
 import gc
 
+import copy
+from functools import partial
 # Keep this to make the env registered.
 import itertools
-import logging
 import math
+import logging
 import pickle
 from collections import Counter, defaultdict
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
-from difflib import SequenceMatcher
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional 
 
 import numpy as np
 import torch
@@ -20,19 +21,36 @@
 import yaml
 from torch.utils.data import DataLoader, TensorDataset
 from torch.utils.tensorboard import SummaryWriter
-from wandb import wandb_run
 
 from megatron.core import mpu
 from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer
+from megatron.core.full_cuda_graph import FullCudaGraphWrapper
 from megatron.core.models.common.language_module.language_module import LanguageModule
-from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator
 from megatron.core.optimizer import MegatronOptimizer
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.parallel_state import get_tensor_model_parallel_src_rank
+from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.pipeline_parallel.utils import is_pp_last_stage, get_pp_last_rank
 from megatron.core.rerun_state_machine import RerunDataIterator
 from megatron.core.transformer.cuda_graphs import _CudagraphGlobalRecord
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.utils import toggle_cuda_graphs
-from megatron.core.utils import get_asyncio_loop
+from megatron.core.resharding.refit import swap_model_weights
+from megatron.core.inference.unified_memory import (
+    advise_managed_module_parameters_preferred_location,
+    prefetch_managed_module_parameters,
+)
+from megatron.core.utils import get_asyncio_loop, log_single_rank
+from megatron.rl.sequence_packing_utils import (
+    get_microbatch_dataloader,
+    pack_inference_logprobs,
+    compute_packed_inference_logprobs_stats,
+    pack_all_trajectories,
+    load_packed_data_by_index,
+    get_sequence_packing_tensorboard_metrics,
+    get_sequence_packing_log_info,
+    get_default_packed_seq_params,
+    update_microbatch_calculator,
+)
 from megatron.rl.agent.api import (
     EvaluationRequest,
     EvaluationResponse,
@@ -49,25 +67,155 @@
 from megatron.training.global_vars import (
     get_args,
     get_tensorboard_writer,
-    get_timers,
     get_tokenizer,
     get_wandb_writer,
 )
 from megatron.training.tokenizer.tokenizer import CustomTikTokenizer, _HuggingFaceTokenizer
-from megatron.training.utils import get_ltor_masks_and_position_ids, get_nvtx_range, print_rank_0
+from megatron.training.utils import (
+    get_ltor_masks_and_position_ids,
+    get_nvtx_range,
+    print_rank_0,
+    unwrap_model,
+)
+from megatron.core.utils import get_pg_rank, get_pg_size, get_attr_wrapped_model
+from megatron.core.process_groups_config import ProcessGroupCollection
+from wandb import wandb_run
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import (
+    is_batch_invariant_mode_enabled,
+)
+
+from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER
+if HAVE_TORCH_MEMORY_SAVER:
+    from torch_memory_saver import torch_memory_saver
 
 logger = logging.getLogger(__name__)
 
 # Global variable to store packing context for forward_step
 _GLOBAL_PACKING_CONTEXT = None
 
-GroupedRollouts = list[list[TokenRollout | Rollout]]
+
+def _maybe_prefetch_separate_inference_model_weights(model_core, *, to_cpu: bool) -> None:
+    """Prefetch RL *separate inference model* weights to CPU/GPU (UVM-only path).
+
+    Gated only by user args; this assumes the separate inference model was allocated with UVM when enabled.
+    """
+    args = get_args()
+    if not args.rl_offload_inference_model_weights_when_idle:
+        return
+    if args.rl_inference_model_unified_memory_level != 1:
+        return
+    device = -1 if to_cpu else int(torch.cuda.current_device())
+    # Note: include_buffers=False because buffers created with explicit device= in register_buffer()
+    # are not allocated via the UVM mempool and will fail UVM operations. Only parameters are UVM-allocated.
+    advise_managed_module_parameters_preferred_location(model_core, device=device, include_buffers=False)
+    nbytes = prefetch_managed_module_parameters(model_core, device=device, include_buffers=False)
+    # Ensure pages are resident before we enter CUDA-graph capture / inference, or before training continues.
+    torch.cuda.synchronize()
+
+    if to_cpu:
+        print_rank_0(f"[Rank 0] offloaded {nbytes / 1024**2:.2f} MB of separate RL inference model weights to CPU (other ranks may vary)")
+    else:
+        print_rank_0(f"[Rank 0] prefetched {nbytes / 1024**2:.2f} MB of separate RL inference model weights to GPU (other ranks may vary)")
+
+
+def verify_model_weights_swap(
+    train_model: LanguageModule,
+    inference_model: LanguageModule,
+    seq_len: int = 8,
+    batch_size: int = 2,
+    atol: float = 1e-4,
+    rtol: float = 1e-4,
+) -> None:
+    """Verify that the inference model produces the same forward pass outputs
+    as the training model after the weights have been swapped.
+
+    This function should be called after swap_model_weights to ensure the weight
+    transfer was successful. It runs a forward pass on both models and asserts
+    the outputs match.  This is meant for debugging purposes only.
+
+    Args:
+        train_model: The training model (source of weights).
+        inference_model: The inference model (target of weights).
+        seq_len: Sequence length for test input.
+        batch_size: Batch size for test input.
+        atol: Absolute tolerance for comparing outputs.
+        rtol: Relative tolerance for comparing outputs.
+
+    Raises:
+        AssertionError: If forward pass outputs do not match within tolerance.
+    """
+    args = get_args()
+
+    # Unwrap models to get the core module
+    train_lm = train_model[0] if isinstance(train_model, (list, tuple)) else train_model
+    inf_lm = inference_model[0] if isinstance(inference_model, (list, tuple)) else inference_model
+
+    train_core = unwrap_model(train_lm)
+    inf_core = unwrap_model(inf_lm)
+
+    actual_vocab_size = getattr(args, 'padded_vocab_size', 128256)
+    actual_seq_len = min(seq_len, getattr(args, 'seq_length', seq_len))
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+
+    # Generate deterministic test input - same across ALL ranks
+    torch.manual_seed(1234)
+    test_tokens = torch.randint(
+        low=0, high=actual_vocab_size, size=(batch_size, actual_seq_len),
+        device=device, dtype=torch.long
+    )
+    test_position_ids = (
+        torch.arange(actual_seq_len, device=device, dtype=torch.long)
+        .unsqueeze(0)
+        .expand(batch_size, -1)
+    )
+    test_attention_mask = torch.ones(
+        (batch_size, 1, actual_seq_len, actual_seq_len), device=device, dtype=torch.bool
+    )
+
+    # Save and restore training state
+    train_was_training = train_core.training
+    inf_was_training = inf_core.training
+
+    train_core.eval()
+    inf_core.eval()
+
+    try:
+        with torch.no_grad():
+            train_output = train_lm(
+                test_tokens, test_position_ids, test_attention_mask,
+                runtime_gather_output=True
+            )
+
+            inf_output = inf_lm(
+                test_tokens, test_position_ids, test_attention_mask,
+                runtime_gather_output=True
+            )
+
+        # Only check on ranks that have output (last PP stage)
+        if train_output is not None and inf_output is not None:
+            assert train_output.shape == inf_output.shape, (
+                f"Output shape mismatch: train={train_output.shape}, infer={inf_output.shape}"
+            )
+            
+            max_diff = (train_output - inf_output).abs().max().item()
+            assert torch.allclose(train_output, inf_output, atol=atol, rtol=rtol), (
+                f"Forward pass outputs do not match: max_diff={max_diff:.6e}, atol={atol}, rtol={rtol}"
+            )
+
+    finally:
+        # Restore training state
+        if train_was_training:
+            train_core.train()
+        if inf_was_training:
+            inf_core.train()
+
+Rollouts = list[TokenRollout | Rollout]
+GroupedRollouts = list[Rollouts]
 
 
 @dataclass(slots=True)
 class RolloutStats:
     mean_reward: float
-    mean_sim: None | float
     mean_length: float
     mean_length_std: float
     max_length: float
@@ -85,22 +233,23 @@ class RolloutStats:
     min_inf_prob: None | float
     max_inf_prob: None | float
     mean_inf_prob: None | float
+    num_turns: list[int] # num_turns per traj
 
 
 # Runtime state container for RL-specific data that shouldn't be checkpointed
 class RLRuntimeState:
-    """Container for seq packing runtime state that is rank-specific"""
+    """Container for runtime state that is not checkpointed, tracking state between rollout collections"""
 
     def __init__(self):
-        self.sequence_packing_plan = None
-        self.sequence_packing_metadata = None
         self.packing_context = None
+        self.last_collection_iteration = 0
         self.sequences_this_iteration_on_rank = 0
         self.latest_batch_num_sequences = 0
 
-    def reset_iteration_counters(self):
+    def reset_iteration_counters(self, iteration):
         """Reset per-iteration counters."""
         self.sequences_this_iteration_on_rank = 0
+        self.last_collection_iteration = iteration
 
     def increment_sequences(self, count):
         """Increment the sequence counter."""
@@ -117,163 +266,38 @@ def get_rl_runtime_state():
     return _rl_runtime_state
 
 
-def create_empty_bins(
-    num_empty_bins,
-    bin_size,
-    packed_trajs,
-    packed_position_ids,
-    packed_loss_mask,
-    packed_attention_mask,
-    tokenizer,
-):
-    """Create empty bins for padding to ensure all ranks have the same number of bins.
-
-    Args:
-        num_empty_bins: Number of empty bins to create
-        bin_size: Size of each bin
-        packed_trajs: Packed trajectories tensor (for dtype/device reference)
-        packed_position_ids: Packed position IDs tensor
-        packed_loss_mask: Packed loss mask tensor
-        packed_attention_mask: Packed attention mask tensor (can be None)
-        tokenizer: Tokenizer for pad token
-
-    Returns:
-        Tuple of (empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info_entries)
-    """
-    device = packed_trajs.device
-
-    # Create empty bins with proper shape
-    empty_bins = []
-    empty_position_ids_list = []
-    empty_loss_mask_list = []
-    empty_attention_mask_list = []
-    empty_packing_info_entries = []
-
-    for i in range(num_empty_bins):
-        # Trajectories filled with pad tokens
-        empty_bin = torch.full(
-            (1, bin_size), tokenizer.pad, dtype=packed_trajs.dtype, device=device
-        )
-        empty_bins.append(empty_bin)
-
-        # Zero position IDs
-        empty_pos_ids = torch.zeros(1, bin_size, dtype=packed_position_ids.dtype, device=device)
-        empty_position_ids_list.append(empty_pos_ids)
-
-        # Zero loss mask (so no loss contribution)
-        empty_loss = torch.zeros(1, bin_size, dtype=packed_loss_mask.dtype, device=device)
-        empty_loss_mask_list.append(empty_loss)
-
-        # Zero attention mask if needed
-        if packed_attention_mask is not None:
-            # Attention mask is always 4D: [num_bins, 1, bin_size, bin_size]
-            empty_attn = torch.zeros(
-                1, 1, bin_size, bin_size, dtype=packed_attention_mask.dtype, device=device
-            )
-            empty_attention_mask_list.append(empty_attn)
-
-        # Empty packing info entries
-        empty_packing_info_entries.append(
-            {
-                'bin_seq_indices': [],  # No sequences in empty bin
-                'seq_starts': [],  # No sequence starts
-            }
-        )
-
-    # Concatenate all empty bins
-    if num_empty_bins > 0:
-        empty_trajs = torch.cat(empty_bins, dim=0)
-        empty_position_ids = torch.cat(empty_position_ids_list, dim=0)
-        empty_loss_mask = torch.cat(empty_loss_mask_list, dim=0)
-        empty_attention_mask = (
-            torch.cat(empty_attention_mask_list, dim=0)
-            if packed_attention_mask is not None
-            else None
-        )
-    else:
-        empty_trajs = None
-        empty_position_ids = None
-        empty_loss_mask = None
-        empty_attention_mask = None
-
-    return (
-        empty_trajs,
-        empty_position_ids,
-        empty_loss_mask,
-        empty_attention_mask,
-        empty_packing_info_entries,
-    )
-
+def update_inference_logprobs_group_stats(
+    old_logprobs: torch.Tensor,
+    inference_logprobs: torch.Tensor,
+    mask: torch.Tensor,
+    group_stats: Any,
+) -> None:
+    """Update group statistics with inference/train logprobs comparison metrics.
 
-def pack_inference_logprobs(
-    inference_logprobs: List[torch.Tensor],
-    packing_info: Dict[str, Any],
-    generation_masks: torch.Tensor,
-    bin_size: int,
-) -> torch.Tensor:
-    """Pack inference logprobs into bins aligned with packed sequences.
+    This is the common statistics computation used by both packed and unpacked cases.
 
     Args:
-        inference_logprobs: List of inference logprobs tensors for each sequence
-        packing_info: Dictionary containing bin assignments and sequence positions
-        generation_masks: Tensor indicating which tokens were generated
-        bin_size: Size of each bin
-
-    Returns:
-        Packed inference logprobs tensor of shape [num_bins, bin_size - 1]
+        old_logprobs: Old logprobs tensor (train side)
+        inference_logprobs: Inference logprobs tensor (aligned to match old_logprobs shape)
+        mask: Boolean mask indicating valid positions for statistics
+        group_stats: Statistics object to update with computed metrics
     """
-    num_bins = len(packing_info['bin_seq_indices'])
+    n_elems = mask.sum()
+    if n_elems > 0:
+        ratios = (old_logprobs - inference_logprobs).exp()[mask]
+        abs_diffs = (old_logprobs.exp() - inference_logprobs.exp()).abs()[mask]
 
-    # Create packed inference logprobs tensor (logprobs are 1 token shorter than sequences)
-    packed_inference_logprobs = torch.zeros(
-        (num_bins, bin_size - 1), dtype=torch.float32, device='cpu'
-    )
-
-    # Create mapping from global sequence index to local bin index
-    # This is needed because seq_to_bin_idx uses global bin indices,
-    # but after distribution each rank only has a subset of bins
-    seq_to_local_bin = {}
-    for local_bin_idx, seq_indices in enumerate(packing_info['bin_seq_indices']):
-        for seq_idx in seq_indices:
-            seq_to_local_bin[seq_idx] = local_bin_idx
-
-    # Align and pack inference logprobs based on generation masks
-    for seq_idx in range(len(inference_logprobs)):
-        if seq_idx not in seq_to_local_bin:
-            continue  # Skip sequences not on this rank
-
-        local_bin_idx = seq_to_local_bin[seq_idx]
-
-        # Get the position of this sequence within the bin
-        seq_positions = packing_info['bin_seq_indices'][local_bin_idx]
-        seq_pos_in_bin = seq_positions.index(seq_idx)
-        seq_start = packing_info['seq_starts'][local_bin_idx][seq_pos_in_bin]
-
-        # Get generation mask for this sequence to find where generation starts
-        gen_mask = generation_masks[seq_idx]
-        # Find first generation token (accounting for the shift in get_logprobs)
-        first_gen_idx = gen_mask.int().argmax().item() - 1
-
-        # Get the inference logprobs for this sequence
-        if isinstance(inference_logprobs[seq_idx], torch.Tensor):
-            seq_inf_logprobs = inference_logprobs[seq_idx]
-        else:
-            continue  # Skip if no inference logprobs
-
-        # Calculate where to place inference logprobs in the packed tensor
-        # The inference logprobs start at the first generated token position
-        pack_start = seq_start + first_gen_idx
-        pack_end = min(
-            pack_start + len(seq_inf_logprobs), seq_start + packing_info['seq_lengths'][seq_idx] - 1
-        )
-        actual_len = pack_end - pack_start
-
-        if actual_len > 0 and pack_end <= bin_size - 1:
-            packed_inference_logprobs[local_bin_idx, pack_start:pack_end] = seq_inf_logprobs[
-                :actual_len
-            ]
+        group_stats.min_piold_to_inf_prob = ratios.min().item()
+        group_stats.max_piold_to_inf_prob = ratios.max().item()
+        group_stats.mean_piold_to_inf_prob = (ratios.sum() / n_elems).item()
+        group_stats.min_inf_train_prob_abs_diff = abs_diffs.min().item()
+        group_stats.max_inf_train_prob_abs_diff = abs_diffs.max().item()
+        group_stats.mean_inf_train_prob_abs_diff = (abs_diffs.sum() / n_elems).item()
 
-    return packed_inference_logprobs
+        inf_probs = inference_logprobs.exp()[mask]
+        group_stats.min_inf_prob = inf_probs.min().item()
+        group_stats.max_inf_prob = inf_probs.max().item()
+        group_stats.mean_inf_prob = inf_probs.mean().item()
 
 
 def align_unpacked_inference_logprobs(
@@ -326,244 +350,22 @@ def align_unpacked_inference_logprobs(
             pad_size = old_logprobs_for_data.shape[1] - truncated_mask.shape[1]
             truncated_mask = torch.nn.functional.pad(truncated_mask, (0, pad_size), value=False)
 
-    # Compute statistics
-    n_elems = truncated_mask.sum()
-
-    ratios = (old_logprobs_for_data - padded_inference_logprobs).exp()[truncated_mask]
-    abs_diffs = (old_logprobs_for_data.exp() - padded_inference_logprobs.exp()).abs()[
-        truncated_mask
-    ]
-
-    # Two probability values cannot be more than 1.0 apart
+    # Sanity check: Two probability values cannot be more than 1.0 apart
+    abs_diffs = (old_logprobs_for_data.exp() - padded_inference_logprobs.exp()).abs()[truncated_mask]
     assert all(abs_diffs <= 1.0)
 
-    # Update group statistics
-    group_stats.min_piold_to_inf_prob = ratios.min().item()
-    group_stats.max_piold_to_inf_prob = ratios.max().item()
-    group_stats.mean_piold_to_inf_prob = (ratios.sum() / n_elems).item()
-    group_stats.min_inf_train_prob_abs_diff = abs_diffs.min().item()
-    group_stats.max_inf_train_prob_abs_diff = abs_diffs.max().item()
-    group_stats.mean_inf_train_prob_abs_diff = (abs_diffs.sum() / n_elems).item()
-
-    # Compute inference probability statistics
-    inf_probs = padded_inference_logprobs.exp()[truncated_mask]
-    group_stats.min_inf_prob = inf_probs.min().item()
-    group_stats.max_inf_prob = inf_probs.max().item()
-    group_stats.mean_inf_prob = inf_probs.mean().item()
+    # Update group statistics using common helper
+    update_inference_logprobs_group_stats(
+        old_logprobs=old_logprobs_for_data,
+        inference_logprobs=padded_inference_logprobs,
+        mask=truncated_mask,
+        group_stats=group_stats,
+    )
 
     return padded_inference_logprobs
 
 
-class SequencePacker:
-    """Packs multiple sequences into bins to minimize padding and improve GPU utilization."""
-
-    def __init__(self, bin_size: int, pad_token: int, max_sequences_per_bin: int = 16):
-        self.bin_size = bin_size
-        self.pad_token = pad_token
-        self.max_sequences_per_bin = max_sequences_per_bin
-
-    def pack_sequences(
-        self, sequences: List[torch.Tensor], generation_masks: torch.Tensor = None
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
-        """Pack sequences into bins using a greedy first-fit algorithm."""
-        sequences_tensor = torch.stack(sequences)
-
-        seq_lengths = get_actual_sequence_lengths(sequences_tensor, self.pad_token)
-
-        # Trim sequences to actual lengths
-        sequences = [sequences_tensor[i, :length] for i, length in enumerate(seq_lengths)]
-
-        sorted_indices = sorted(range(len(sequences)), key=lambda i: seq_lengths[i], reverse=True)
-
-        args = get_args()
-        # Check that sequences can fit in bins
-        # TODO(jalbericiola): this should probably be moved to the arguments file
-        assert (
-            args.seq_length <= self.bin_size
-        ), f"seq_length ({args.seq_length}) must be <= bin_size ({self.bin_size})"
-
-        bins = []
-        bin_seq_indices = []  # Track which sequences are in each bin
-        current_bin = []
-        current_bin_indices = []
-        current_bin_length = 0
-
-        # Pack sequences into bins
-        sequences_per_bin = []
-        for idx in sorted_indices:
-            seq = sequences[idx]
-            seq_len = len(seq)
-
-            if (
-                current_bin_length + seq_len <= self.bin_size
-                and len(current_bin) < self.max_sequences_per_bin
-            ):
-                current_bin.append(seq)
-                current_bin_indices.append(idx)
-                current_bin_length += seq_len
-            else:
-                # Start a new bin
-                if current_bin:
-                    bins.append(current_bin)
-                    bin_seq_indices.append(current_bin_indices)
-                    sequences_per_bin.append(len(current_bin))
-                current_bin = [seq]
-                current_bin_indices = [idx]
-                current_bin_length = seq_len
-
-        # Don't forget the last bin
-        if current_bin:
-            bins.append(current_bin)
-            bin_seq_indices.append(current_bin_indices)
-            sequences_per_bin.append(len(current_bin))
-
-        # Create packed tensors
-        num_bins = len(bins)
-        device = sequences[0].device
-        dtype = sequences[0].dtype
-
-        # Log packing distribution
-        if sequences_per_bin:
-            avg_seqs_per_bin = sum(sequences_per_bin) / len(sequences_per_bin)
-            min_seqs = min(sequences_per_bin)
-            max_seqs = max(sequences_per_bin)
-            print_rank_0(
-                f"[SequencePacker] Packing distribution: {num_bins} bins, "
-                f"avg {avg_seqs_per_bin:.1f} seqs/bin, "
-                f"min {min_seqs}, max {max_seqs} seqs/bin "
-                f"(limit: {self.max_sequences_per_bin})"
-            )
-            # Store for later use
-            self.last_avg_seqs_per_bin = avg_seqs_per_bin
-
-        packed_sequences = torch.full(
-            (num_bins, self.bin_size), self.pad_token, dtype=dtype, device=device
-        )
-        position_ids = torch.zeros(
-            (num_bins, self.bin_size), dtype=torch.long, device=device, requires_grad=False
-        )
-        attention_mask = torch.zeros(
-            (num_bins, 1, self.bin_size, self.bin_size), dtype=torch.bool, device=device
-        )
-        loss_mask = torch.zeros((num_bins, self.bin_size), dtype=torch.float, device=device)
-
-        # TODO(jalbericiola): packing_info as a dataclass and not just a dict
-        # Track packing information for unpacking later
-        packing_info = {
-            'bin_seq_indices': bin_seq_indices,  # Which original sequences are in each bin
-            'seq_starts': [],  # Start position of each sequence within its bin
-            'seq_lengths': seq_lengths,  # Original sequence lengths
-            'seq_to_bin_idx': [None] * len(sequences),  # Map from sequence index to bin index
-        }
-
-        # Build seq_to_bin_idx mapping
-        for bin_idx, seq_indices in enumerate(bin_seq_indices):
-            for seq_idx in seq_indices:
-                packing_info['seq_to_bin_idx'][seq_idx] = bin_idx
-
-        # Fill bins
-        for bin_idx, (bin_seqs, seq_indices) in enumerate(zip(bins, bin_seq_indices)):
-            seq_starts = []
-            current_pos = 0
-
-            for seq_idx, seq in enumerate(bin_seqs):
-                start = current_pos
-                end = start + len(seq)
-                seq_starts.append(start)
-                current_pos = end
-
-                # Pack sequence
-                packed_sequences[bin_idx, start:end] = seq
-
-                # Position IDs reset for each sequence
-                position_ids[bin_idx, start:end] = torch.arange(
-                    len(seq), device=device, requires_grad=False
-                )
-
-                # Causal attention mask within each sequence
-                seq_len = end - start
-                attention_mask[bin_idx, 0, start:end, start:end] = torch.tril(
-                    torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)
-                )
-
-                # Loss mask (excluding padding)
-                loss_mask[bin_idx, start:end] = 1.0
-
-                # Apply generation mask if provided
-                if generation_masks is not None:
-                    orig_idx = seq_indices[seq_idx]
-                    gen_mask = generation_masks[orig_idx][
-                        : len(seq)
-                    ]  # Truncate to actual seq length
-                    loss_mask[bin_idx, start:end] *= gen_mask.float()
-
-            packing_info['seq_starts'].append(seq_starts)
-
-        # Add bin_start_positions - a dict mapping bin_idx to list of start positions for each sequence in that bin
-        bin_start_positions = {}
-        for bin_idx in range(num_bins):
-            bin_start_positions[bin_idx] = packing_info['seq_starts'][bin_idx]
-        packing_info['bin_start_positions'] = bin_start_positions
-
-        # Note: We'll store the actual padded length later when we know it
-        # (it depends on the original trajectories passed to pack_sequences)
-
-        # Invert attention mask, before inversion: (True = attend, False = mask)
-        attention_mask = ~attention_mask
-
-        return packed_sequences, position_ids, attention_mask, loss_mask, packing_info
-
-
-def create_packed_seq_params_for_bin(
-    packing_info: Dict[str, Any],
-    bin_idx: int,
-    bin_size: int,
-    device: torch.device,
-) -> Optional[PackedSeqParams]:
-    """Create PackedSeqParams for a single bin to enable proper attention masking in TE.
-
-    When using Transformer Engine with sequence packing, we need to provide cu_seqlens
-    (cumulative sequence lengths) so that TE knows the boundaries between sequences
-    within a packed bin. This prevents attention leakage between unrelated sequences.
-
-    Args:
-        packing_info: Dictionary containing packing metadata from SequencePacker
-        bin_idx: Index of the bin to create params for
-        bin_size: Size of the bin (padded sequence length)
-        device: Device to create tensors on
-
-    Returns:
-        PackedSeqParams with cu_seqlens set for proper attention masking (or None if empty)
-    """
-    seq_indices = packing_info['bin_seq_indices'][bin_idx]
-
-    # Handle empty bins (padding bins with no sequences)
-    if not seq_indices:
-        return None
-
-    # Get actual sequence lengths for sequences in this bin
-    seq_lengths_in_bin = [packing_info['seq_lengths'][idx] for idx in seq_indices]
-
-    # Build cumulative sequence lengths for actual sequences
-    # cu_seqlens should be [0, len(seq1), len(seq1)+len(seq2), ..., total_actual_len]
-    cu_seqlens_list = np.cumsum([0] + seq_lengths_in_bin)
-
-    cu_seqlens = torch.tensor(cu_seqlens_list, dtype=torch.int32, device=device)
-
-    max_seqlen = max(seq_lengths_in_bin) if seq_lengths_in_bin else bin_size
-
-    return PackedSeqParams(
-        qkv_format='thd',
-        cu_seqlens_q=cu_seqlens,
-        cu_seqlens_kv=cu_seqlens,
-        cu_seqlens_q_padded=None,
-        cu_seqlens_kv_padded=None,
-        max_seqlen_q=max_seqlen,
-        max_seqlen_kv=max_seqlen,
-    )
-
-
-def get_agent(args):
+def get_agent(args, parallel_generation_tasks: int | None = None):
     """Get an agent based on environment configuration.
 
     If args.langrl_env_config is provided, uses weighted environment selection.
@@ -572,7 +374,10 @@ def get_agent(args):
     with open(args.langrl_env_config, 'r') as f:
         config = yaml.safe_load(f)
 
-    return WeightedMultiTask.from_config(config)
+    return WeightedMultiTask.from_config(
+        config,
+        parallel_generation_tasks=parallel_generation_tasks,
+    )
 
 
 _INFERENCE_INTERFACE = None
@@ -620,16 +425,17 @@ def get_inference_interface(args, loop, model):
 def get_rollout_generator(args, inference_interface, n_prompts, samples_per_group):
     global _ROLLOUT_GENERATOR
     if not args.rl_partial_rollouts or _ROLLOUT_GENERATOR is None:
-        agent = get_agent(args)
+        agent = get_agent(args, parallel_generation_tasks=args.rl_parallel_generation_tasks)
         # Collect Rollouts
         request = GroupedRolloutRequest(
             num_groups=-1 if args.rl_partial_rollouts else n_prompts,
             rollouts_per_group=samples_per_group,
             inference_interface=inference_interface,
             generation_args={
-                'temperature': args.grpo_default_temperature,
-                'max_tokens': args.seq_length,
-                'top_p': args.grpo_default_top_p,
+                'temperature': args.rl_default_temperature,
+                'max_tokens': args.inference_max_seq_length,
+                'top_p': args.rl_default_top_p,
+                'top_k': args.rl_default_top_k,
             },
             filter_groups_with_same_reward=args.grpo_filter_groups_with_same_reward,
         )
@@ -638,12 +444,13 @@ def get_rollout_generator(args, inference_interface, n_prompts, samples_per_grou
 
 
 def get_environment_rollouts(
-    model: LanguageModule, optimizer: MegatronOptimizer, n_prompts: int, samples_per_group: int
+    model: LanguageModule, inference_model: LanguageModule, optimizer: MegatronOptimizer, n_prompts: int, samples_per_group: int
 ):
     """Sample environment rollouts from an LLM.
 
     Args:
         model: Model to sample from.
+        inference_model: Inference model to use for inference.
         n_prompts: Number of prompts to sample for across *all* data parallel workers.
         samples_per_group: Amount of trajectories per prompt.
 
@@ -653,18 +460,42 @@ def get_environment_rollouts(
     args = get_args()
     nvtx_range = get_nvtx_range()
 
+    if args.rl_offload_optimizer_during_inference:
+        with nvtx_range("offload-optimizer-state-and-grad-buffers-during-inference"):
+            model[0].offload_grad_buffers()
+            optimizer.offload_to_cpu()
+             
+    # If we have seperate training and inference models we to refit weights from the training model to the inference model.
+    if inference_model is not None:
+        # If the separate inference model weights were prefetched to CPU while idle, bring them
+        # back to GPU before refit/copy and before any CUDA-graph'd inference.
+        with nvtx_range("prefetch-inference-model-weights-to-gpu"):
+            inf_core = unwrap_model(inference_model[0])
+            _maybe_prefetch_separate_inference_model_weights(inf_core, to_cpu=False)
+        swap_model_weights(model, inference_model, args.refit_method)
+        if args.rl_verify_model_weights_swap:
+            verify_model_weights_swap(
+                train_model=model,
+                inference_model=inference_model,
+                atol=.1,
+                rtol=5e-4,
+            )
+    else:
+        inference_model = model
+
+    inference_pg_collection = get_attr_wrapped_model(inference_model[0], "pg_collection")
     assert (
-        n_prompts % mpu.get_expert_data_parallel_world_size() == 0
+        n_prompts % get_pg_size(inference_pg_collection.ep) == 0
     ), "n_prompts must be divisible by data_parallel_world_size"
 
     with nvtx_range("rollout-collection"):
         loop = get_asyncio_loop()
         with megatron_rl_inference_mode(
-            model,
+            inference_model,
             optimizer,
             args.cuda_graph_impl,
             args.rl_reset_cuda_graphs,
-            args.rl_offload_optimizer_during_inference,
+            False, # offload optimizer during rollout collection is handled above
             args.rl_offload_kv_cache_during_training,
             args.rl_remove_kv_cache_during_training,
         ) as inference_interface:
@@ -679,7 +510,11 @@ def get_environment_rollouts(
             rank = torch.distributed.get_rank()
             with nvtx_range("collect-rollouts"):
                 if rank == 0:
-                    print(f"Collecting rollouts on rank {rank}, Iteration {args.curr_iteration}...")
+                    log_single_rank(
+                        logger,
+                        logging.INFO,
+                        f"Collecting rollouts, Iteration {args.curr_iteration}...",
+                    )
                     rollouts = [
                         loop.run_until_complete(anext(rollout_generator)) for _ in range(n_prompts)
                     ]
@@ -698,9 +533,14 @@ def get_environment_rollouts(
             # Wait for Rollouts to be collected
             # TODO(jbarker): double check why this isn't causing rank 0 memory allocations
             torch.distributed.broadcast_object_list(rollouts, src=0)
-        print(f"Got rollouts on rank {rank}")
+        logger.debug(f"Got rollouts on rank {rank}")
+
+    if args.rl_offload_optimizer_during_inference:
+        with nvtx_range("restore-optimizer-state-and-grad-buffers-after-inference"):
+            model[0].restore_grad_buffers()
+            optimizer.restore_from_cpu()
 
-    if lang_rl_log_dir and rank == get_tensor_model_parallel_src_rank():
+    if lang_rl_log_dir and rank == get_pg_rank(inference_pg_collection.tp):
         with open(
             lang_rl_log_dir
             + f'/rollouts_rank{rank}_iteration{args.curr_iteration}_'
@@ -732,7 +572,8 @@ def selective_log_softmax(logits, index):
         `torch.Tensor`:
             Gathered log probabilities with the same shape as `index`.
     """
-    if logits.dtype in [torch.float32, torch.float64]:
+    use_bik_logsoftmax = is_batch_invariant_mode_enabled()
+    if logits.dtype in [torch.float32, torch.float64] and not use_bik_logsoftmax:
         selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
         # loop to reduce peak mem consumption
         logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
@@ -752,7 +593,7 @@ def selective_log_softmax(logits, index):
     return per_token_logps
 
 
-def get_logprobs(model, tokens, position_ids, attention_mask, no_grad=False, packed_seq_params=None, packed_seq_len=None):
+def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_packing=False, packed_seq_params=None):
     """Get sequence logprobs from their token ids.
 
     Args:
@@ -771,82 +612,92 @@ def get_logprobs(model, tokens, position_ids, attention_mask, no_grad=False, pac
         Logprobs of input sequences.
 
     """
+
+    args = get_args()
+    # Ensure packed_seq_params is always provided for CUDA graph signature consistency
+    if packed_seq_params is None and sequence_packing:
+        packed_seq_params = get_default_packed_seq_params(
+            seq_length=tokens.shape[1],
+            max_sequences_per_bin=args.rl_sequence_packing_max_sequences_per_bin,
+            device=tokens.device,
+        )
+
     nvtx_range = get_nvtx_range()
 
     with nvtx_range("get-logprobs", time=False):
-
         with nvtx_range("forward-pass", time=False):
             # TODO(vitalyk): use fp16/bf16 as a function argument. Do not use args.
-            args = get_args()
-            
-            # Handle THD format: slice off padding before forward, pad back after
-            original_seq_len = tokens.shape[1]
-            attention_mask_for_forward = attention_mask
-            if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
-                # Get the actual token count (excluding padding)
-                if packed_seq_len is not None:
-                    actual_len = packed_seq_len
-                else:
-                    actual_len = packed_seq_params.cu_seqlens_q[-1].item()
-                
-                if actual_len == 0:
-                    # No real tokens, skip packed path
-                    packed_seq_params = None
-                else:
-                    # Slice inputs to remove padding
-                    # dimension 0 is batch, with seq packing BS=1
-                    tokens = tokens[:, :actual_len]
-                    position_ids = position_ids[:, :actual_len]
-                    # attention_mask is not used with THD format (cu_seqlens handles it)
-                    attention_mask_for_forward = None
-            
+
+            attention_mask_for_forward = None
+
             # This is a hack to fix megatron's behaviour when flash-decode affects the training code flow.
             flash_decode = model.config.flash_decode
             model.config.flash_decode = False
+            fp32_output = not (args.fp16 or args.bf16)
             with torch.no_grad() if no_grad else nullcontext():
-                logits = model(
+                logits_or_hidden_states = model(
                     tokens,
                     position_ids,
                     attention_mask_for_forward,
                     packed_seq_params=packed_seq_params,
                     runtime_gather_output=True,
-                    fp32_output=not (args.fp16 or args.bf16),
+                    fp32_output=fp32_output,
                 )
             model.config.flash_decode = flash_decode
-            
-            # Pad logits back to original sequence length if we sliced
-            if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
-                if logits.shape[1] < original_seq_len:
-                    pad_len = original_seq_len - logits.shape[1]
-                    # Pad with zeros (these positions will be masked out anyway)
-                    logits = torch.nn.functional.pad(logits, (0, 0, 0, pad_len), value=0)
-                # Also need to restore tokens for the log_softmax below
-                tokens_for_softmax = torch.nn.functional.pad(
-                    tokens, (0, original_seq_len - tokens.shape[1]), value=0
-                )
-            else:
-                tokens_for_softmax = tokens
-            
-            # We do not need logprobs for the n+1 token.
-        with nvtx_range("log-softmax", time=False):
-            logprobs = selective_log_softmax(logits[:, :-1, :], tokens_for_softmax[:, 1:])
 
-    return logprobs
+        pg_collection = get_attr_wrapped_model(model, "pg_collection")
+        pp_group = pg_collection.pp
+
+        if not is_pp_last_stage(pp_group):
+            return logits_or_hidden_states
+        else:
+            logits = logits_or_hidden_states
+            with nvtx_range("log-softmax", time=False):
+                # We do not need logprobs for the n+1 token.
+                logprobs = selective_log_softmax(logits[:, :-1, :], tokens[:, 1:])
+            return logprobs
+
+
+def calculate_grpo_advantages(rewards: list[list[float]], num_turns: list[list[int]]) -> np.ndarray:
+    """Calculate GRPO advantages from rewards/num_turns.
+
+    For multiturn rollouts, the logic is a bit more involved.
+    # For training, we'll be turning each turn into a trajectory with the same reward
+    # within a trajectory, e.g. if [[a,b],[c,d,e]] trajectory has reward 1.0, we will
+    # get [a,b] with 1.0 and [c,d,e] with 1.0 when doing updates.
+    """
+
+    rewards = np.array(rewards)
+
+    num_turns = np.array(num_turns)
+    # Each outer dimension of num_turns is a group. Sum of those gives total num_turns per group.
+    # Let's use this to calculate advantage.
+    # mean/std should be repeated based on group lens
+    group_turns = num_turns.sum(axis=-1)
+    reward_means = rewards.mean(axis=1, keepdims=True).repeat(group_turns)
+    reward_stds = rewards.std(axis=1, keepdims=True).repeat(group_turns)
+
+    # rewards are originally [g, group_size]
+    # Making an assumption that all groups are of the same size!
+    # @vitalyk: this will go away when we start sending env-based sample reqs.
+    rewards = rewards.flatten().repeat(num_turns.flatten())
+
+    return ((rewards - reward_means) / (1e-4 + reward_stds)).tolist()
 
 
 def compute_group_stats(
-    rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer
+    rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, seq_len: int,
 ) -> RolloutStats:
     """Add group-based rollout stats for logging.
 
     Args:
         rollouts: Rollouts to generate the stats for. Each inner list is a group (as in GRPO group), i.e. all rollouts are for the same prompt.
         tokenizer: Tokenizer to tokenize the rollouts in case they are raw strings.
+        seq_len: Maximum sequence length.
 
     Returns:
        RolloutStats object containing all the stats.
     """
-    args = get_args()
     # TODO (rkirby) Maybe do some of this after the tensor building
     group_reward_means = []
     group_reward_stds = []
@@ -854,54 +705,45 @@ def compute_group_stats(
     group_length_stds = []
     group_length_maxs = []
     group_length_mins = []
-    group_rollout_similarities = []
+    rewards = []
+    num_turns = [] # num_turns per traj
     for group in rollouts:
         group_rewards = []
         group_lengths = []
+        group_num_turns = []
         for rollout in group:
+            group_num_turns.append(len(rollout.trajectory))
             if isinstance(rollout, TokenRollout):
-                lang_rl_log(
-                    f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} tokens] {tokenizer.detokenize(rollout.trajectory)}"
-                )
-                assert (len(rollout.trajectory) == args.seq_length) or (
-                    rollout.trajectory[-1] == tokenizer.eod
-                ), f"Rollout is not the correct length: {len(rollout.trajectory)} {rollout.trajectory[-1]}\n{tokenizer.detokenize(rollout.trajectory)}"
+                for turn_traj in rollout.trajectory:
+                    detokenized_traj = tokenizer.detokenize(turn_traj)
+                    lang_rl_log(
+                        f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} tokens] {detokenized_traj}"
+                    )
+                    # TODO(vitalyk): how does multiturn change EOD/EOT?
+                    assert (len(turn_traj) == seq_len) or (
+                        turn_traj[-1] == tokenizer.eod
+                    ), f"Rollout is not the correct length: {len(turn_traj)} {turn_traj[-1]}\n{detokenized_traj}"
             else:
                 lang_rl_log(
                     f"Rollout: [{rollout.env_id}] [{rollout.reward} : {len(rollout.trajectory)} chars] {rollout.trajectory}"
                 )
             group_rewards.append(rollout.reward)
-            group_lengths.append(len(rollout.trajectory))
-        if args.rl_calculate_intra_group_similarity:
-            # We can probably compute this outside, but in case we switch to different group sizes for different envs, let's keep it here.
-            combos = itertools.combinations(range(len(group)), 2)
-            # For every pair (excluding ourselves), check the sequence similarity and log.
-            # Use this to track the diversity of generated rollouts within a group.
-            intra_group_sim = np.mean(
-                list(
-                    map(
-                        lambda idx_pair: SequenceMatcher(
-                            None, group[idx_pair[0]].trajectory, group[idx_pair[1]].trajectory
-                        ).ratio(),
-                        combos,
-                    )
-                )
-            )
-            group_rollout_similarities.append(intra_group_sim)
-        else:
-            group_rollout_similarities = None
+            #TODO(vitalyk): What is the semantics behind traj length in multiturn? Should we take the last only? Average them instead of extending?
+            group_lengths.extend(len(t) for t in rollout.trajectory)
 
         group_length_maxs.append(max(group_lengths))
         group_length_mins.append(min(group_lengths))
         group_reward_means.append(np.mean(group_rewards))
         group_reward_stds.append(np.std(group_rewards))
+        rewards.append(group_rewards)
         group_length_means.append(np.mean(group_lengths))
         # https://arxiv.org/abs/2504.21233 reports that lens variants hurts.
         # Let's track this.
         group_length_stds.append(np.std(group_lengths))
+        num_turns.append(group_num_turns)
+
     stats = RolloutStats(
         mean_reward=np.mean(group_reward_means),
-        mean_sim=np.mean(group_rollout_similarities) if group_rollout_similarities else None,
         mean_length=np.mean(group_length_means),
         mean_length_std=np.mean(group_length_stds),
         max_length=np.max(group_length_maxs),
@@ -917,8 +759,9 @@ def compute_group_stats(
         min_inf_prob=None,
         max_inf_prob=None,
         mean_inf_prob=None,
-        rewards=None,  # We will fill those in later in prepare_data_for_update.
-        advantages=None,  # We will fill those in later in prepare_data_for_update.
+        rewards=[r for group in rewards for r in group],
+        advantages=calculate_grpo_advantages(rewards, num_turns),
+        num_turns=[nt for group in num_turns for nt in group],
     )
     return stats
 
@@ -974,7 +817,7 @@ def maybe_log_training_metrics(
                         advantages, 'advantages', 'Advantages'
                     ),
                     'nonzero_groups_ratio': np.count_nonzero(group_stats.advantages)
-                            / len(group_stats.advantages),
+                    / len(group_stats.advantages),
                     'min_piold_to_inf_prob': group_stats.min_piold_to_inf_prob,
                     'max_piold_to_inf_prob': group_stats.max_piold_to_inf_prob,
                     'mean_piold_to_inf_prob': group_stats.mean_piold_to_inf_prob,
@@ -989,11 +832,10 @@ def maybe_log_training_metrics(
                         columns=['Trajectories', 'Tokens', 'Rewards'],
                         rows=[
                             [
-                                (
-                                    tokenizer.detokenize(r.trajectory)
+                                [(tokenizer.detokenize(turn)
                                     if isinstance(r, TokenRollout)
-                                    else r.trajectory
-                                ),
+                                    else turn) for turn in r.trajectory
+                                ],
                                 r.trajectory,
                                 r.reward,
                             ]
@@ -1001,11 +843,6 @@ def maybe_log_training_metrics(
                         ],
                     ),
                 },
-                **(
-                    {'mean_intra_group_similarity': group_stats.mean_sim}
-                    if group_stats.mean_sim
-                    else {}
-                ),
             },
             step=current_iteration,
         )
@@ -1014,10 +851,9 @@ def maybe_log_training_metrics(
 
 
 def prepare_trajectories(
-    rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, seq_length: int
+    rollouts: Rollouts, tokenizer: MegatronLegacyTokenizer, seq_length: int, sequence_packing: bool, skip_bos_token: bool
 ):
     """Pad trajectories and extract the generation masks.
-
     Args:
         rollouts: Rollouts to extract trajectories from.
         tokenizer: Tokenizer to get the padding token and potentially tokenize.
@@ -1034,11 +870,14 @@ def prepare_trajectories(
 
     DEFAULT_PAD_TOKENS = ['<|finetune_right_pad_id|>']
 
+
     if isinstance(tokenizer, _HuggingFaceTokenizer):
         if not tokenizer.pad:
             for pad_token in DEFAULT_PAD_TOKENS:
                 if pad_token in tokenizer.vocab:
-                    print_rank_0(f"Updating tokenizer pad token to {pad_token}")
+                    log_single_rank(
+                        logger, logging.INFO, f"Updating tokenizer pad token to {pad_token}"
+                    )
                     tokenizer._tokenizer.pad_token_id = tokenizer.vocab[pad_token]
                     break
             else:
@@ -1047,26 +886,34 @@ def prepare_trajectories(
         assert "<SPECIAL_233>" in tokenizer.vocab, "Pad token is NOT in the tokenizer"
         tokenizer._pad_id = tokenizer.vocab["<SPECIAL_233>"]
 
-    print_rank_0(
-        f"Tokenizer vocab size: {tokenizer.vocab_size}\n"
-        f"Tokenizer PAD: '{tokenizer.detokenize([tokenizer.pad])} ({tokenizer.pad})'\n"
-        f"Tokenizer EOD: '{tokenizer.detokenize([tokenizer.eod])} ({tokenizer.eod})'"
+    log_single_rank(logger, logging.INFO, f"Tokenizer vocab size: {tokenizer.vocab_size}")
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"Tokenizer PAD: '{tokenizer.detokenize([tokenizer.pad])} ({tokenizer.pad})'",
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"Tokenizer EOD: '{tokenizer.detokenize([tokenizer.eod])} ({tokenizer.eod})'",
     )
 
     trajs = []
     generation_masks = []
     inference_logprobs = []
-    for group in rollouts:
-        for rollout in group:
-            generation_mask = rollout.generation_mask if isinstance(rollout, TokenRollout) else None
-
-            trajectory = (
-                rollout.trajectory.copy()
-                if isinstance(rollout, TokenRollout)
-                else tokenizer.tokenize(rollout.trajectory)
-            )
-            inf_logprobs = rollout.logprobs
-
+    for rollout in rollouts:
+        # traj, gen mask and logprobs are lists now.
+        # each list entry is a turn, single-turn environments just have a single-element list.
+        # We assume that all lengths of the structs above have the same lengths (number of turns).
+
+        all_turns_trajectories = (
+            copy.deepcopy(rollout.trajectory)
+            if isinstance(rollout, TokenRollout)
+            else tokenizer.tokenize(rollout.trajectory)
+        )
+        for turn_idx, trajectory in enumerate(all_turns_trajectories):
+            inf_logprobs = rollout.logprobs[turn_idx]
+            generation_mask = rollout.generation_mask[turn_idx] if isinstance(rollout, TokenRollout) else None
             length = len(trajectory)
             assert length <= seq_length, "Rollout too long, how did this happen?"
             if len(trajectory) < seq_length:
@@ -1088,47 +935,35 @@ def prepare_trajectories(
             else:
                 inference_logprobs.append(None)
 
-            env_id = rollout.env_id
-            env_id_counts[env_id] += 1
+        env_id_counts[rollout.env_id] += 1
 
-    print(
-        "Rollout counts:"
-        + "".join([f"\n\t{env_id}: {count}" for env_id, count in env_id_counts.items()])
-    )
+    if torch.distributed.is_initialized():
+        logger.info(f"[{dist.get_rank()}] Rollout counts:")
+        for env_id, count in env_id_counts.items():
+            logger.info(f"[{dist.get_rank()}] \t{env_id}: {count}")
 
     generation_masks = torch.tensor(generation_masks, dtype=torch.bool, device='cpu')
     trajs = torch.tensor(trajs, device='cpu')
 
-    args = get_args()
     # Only process if we have inference_logprobs
     if inference_logprobs and any(lp is not None for lp in inference_logprobs):
-        if args.rl_use_sequence_packing:
-            # For sequence packing, we need to pad all logprobs to the same size
-            padded_logprobs = []
-            for logprobs in inference_logprobs:
-                if logprobs is not None:
-                    if len(logprobs) < seq_length:
-                        # Pad with zeros (these positions will be masked anyway)
-                        padding_size = seq_length - len(logprobs)
-                        padded = torch.nn.functional.pad(logprobs, (0, padding_size), value=0.0)
-                        padded_logprobs.append(padded)
-                    else:
-                        padded_logprobs.append(logprobs)
-                else:
-                    # Create zero tensor for None logprobs
-                    padded_logprobs.append(torch.zeros(seq_length))
-            inference_logprobs = torch.stack(padded_logprobs)
-        else:
-            # For non-packing mode, keep as list of tensors (unpadded)
-            # This preserves the original behavior where each sequence can have different lengths
-            pass
+        # We need to pad all logprobs to the same size for sequence packing.
+        # For non-packing mode, keep as list of tensors (unpadded)
+        # This preserves the original behavior where each sequence can have different lengths
+        if sequence_packing:
+            inference_logprobs = _pad_nonnull_with_zeros(inference_logprobs, seq_length)
     else:
         inference_logprobs = None
 
     # Some sanity checks regarding the tokenization
-    assert (
-        tokenizer.bos is None or (trajs[:, 0] == tokenizer.bos).all()
-    ), "First token should be bos"
+    if not skip_bos_token:
+        assert (
+            tokenizer.bos is None or (trajs[:, 0] == tokenizer.bos).all()
+        ), "First token should be bos"
+    else:
+        assert (
+            tokenizer.bos is None or (trajs[:, 0] != tokenizer.bos).all()
+        ), "First token should not be bos"  
     assert (
         tokenizer.bos is None or (trajs[:, 1] != tokenizer.bos).all()
     ), "Second token should not be bos"
@@ -1142,111 +977,83 @@ def prepare_trajectories(
     return trajs, generation_masks, inference_logprobs
 
 
-def prepare_packed_trajectories(
-    all_rollouts: GroupedRollouts, tokenizer: MegatronLegacyTokenizer, args
-):
-    """Prepare trajectories for sequence packing mode with distributed processing.
-    Distributes trajectory preparation across ranks, then gathers results for packing.
-
-    Args:
-        all_rollouts: All rollouts to process.
-        tokenizer: Tokenizer to get the padding token and potentially tokenize.
-        args: Arguments containing seq_length and distributed settings.
+def logprobs_forward_step(data_iterator, model, is_correction, packing_context=None):
+    # Avoid self.training checks which will trigger cudagraph capture; this path reuses
+    # the forward pass from training after it has been captured on the 1st iteration.
+    model.eval()
 
-    Returns:
-        Trajectories, generation masks, and inference logprobs (all gathered from all ranks).
-    """
-    world_size = mpu.get_expert_data_parallel_world_size()
-    # For packing, distribute trajectory preparation across ranks
-    # Each rank prepares a portion, then we gather for packing
-    total_rollouts = len(all_rollouts)
-    rollouts_per_rank = total_rollouts // (world_size if world_size > 0 else 1)
-    rank = mpu.get_expert_data_parallel_rank()
-
-    # Each rank prepares its portion
-    start_idx = rank * rollouts_per_rank
-    end_idx = (
-        start_idx + rollouts_per_rank
-        if rank < mpu.get_expert_data_parallel_world_size() - 1
-        else total_rollouts
-    )
-    my_rollouts = all_rollouts[start_idx:end_idx]
+    if packing_context is not None:
+        # When using sequence packing, the data iterator returns a tuple with a single element, the bin index.
+        bin_tensor = next(data_iterator)[0]
+        #TODO(jalbericiola): change for named tuple
+        (b_trajs, _, _, _, b_posids, _, _, _, _, _, b_packed_seq_params) = (
+            load_packed_data_by_index(bin_tensor.item(), packing_context, is_correction)
+        )
+    else:
+        b_trajs, b_posids = next(data_iterator)
+        b_packed_seq_params = None
 
-    # Prepare this rank's portion
-    my_trajs, my_generation_masks, my_inference_logprobs = prepare_trajectories(
-        my_rollouts, tokenizer, args.seq_length
+    logprobs = (
+        get_logprobs(
+            model,
+            b_trajs.cuda(),
+            b_posids.cuda(),
+            no_grad=True,
+            sequence_packing=b_packed_seq_params is not None, 
+            packed_seq_params=b_packed_seq_params,
+        ),
+        None,
     )
+    model.train()
+    return logprobs
 
-    # Move to GPU for all_gather operation
-    # Note: prepare_trajectories already returns tensors, not lists
-
-    my_trajs = my_trajs.cuda()
-    my_generation_masks = my_generation_masks.cuda()
-    if my_inference_logprobs is not None:
-        my_inference_logprobs = my_inference_logprobs.cuda()
-
-    # All-gather trajectories from all ranks
-    # This is more efficient than having all ranks process all sequences
-    if world_size > 1:
-        # Gather all trajectories
-        trajs_list = [torch.empty_like(my_trajs) for _ in range(world_size)]
-        torch.distributed.all_gather(
-            trajs_list, my_trajs, group=mpu.get_expert_data_parallel_group()
-        )
-        trajs = torch.cat(trajs_list, dim=0)
 
-        # Gather all generation masks
-        masks_list = [torch.empty_like(my_generation_masks) for _ in range(world_size)]
-        torch.distributed.all_gather(
-            masks_list, my_generation_masks, group=mpu.get_expert_data_parallel_group()
+def _compute_logprobs_batch(
+    model,
+    data_loader,
+    forward_backward_func,
+    packing_context,
+    trajs_batch_size, # n_bins for seq packing, and batch_size for non seq packing
+    seq_length,
+    logprobs_batch_size,
+    decoder_seq_length,
+    dtype,
+    pp_group,
+    is_correction,
+):
+    """Compute logprobs for all batches in the data loader."""
+    logprobs_list = []
+    data_iterator = iter(data_loader)
+    for i in range(len(data_loader)):
+        output_tensor = forward_backward_func(
+            forward_step_func=partial(logprobs_forward_step, is_correction=is_correction, packing_context=packing_context),
+            data_iterator=data_iterator,
+            model=model,
+            num_microbatches=1,
+            seq_length=seq_length,
+            micro_batch_size=logprobs_batch_size,
+            decoder_seq_length=decoder_seq_length,
+            forward_only=True,
+            adjust_tensor_shapes_fn=None,
         )
-        generation_masks = torch.cat(masks_list, dim=0)
+        if is_pp_last_stage(pp_group):
+            logprobs_list.append(output_tensor[0].detach())
 
-        # Gather inference logprobs if present
-        if my_inference_logprobs is not None:
-            logprobs_list = [torch.empty_like(my_inference_logprobs) for _ in range(world_size)]
-            torch.distributed.all_gather(
-                logprobs_list, my_inference_logprobs, group=mpu.get_expert_data_parallel_group()
-            )
-            inference_logprobs = torch.cat(logprobs_list, dim=0)
-        else:
-            inference_logprobs = None
+    if is_pp_last_stage(pp_group):
+        logprobs = torch.concat(logprobs_list, dim=0)
+        assert logprobs.dtype == dtype
     else:
-        # Single process case (testing)
-        trajs = my_trajs
-        generation_masks = my_generation_masks
-        inference_logprobs = my_inference_logprobs
-
-    return trajs, generation_masks, inference_logprobs
-
-
-def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int) -> List[int]:
-    """Get actual sequence lengths for pre-padded sequences.
-
-    Args:
-        sequences: Tensor of shape [batch_size, seq_len] with pre-padded sequences
-        pad_token: The padding token ID
-
-    Returns:
-        List of actual sequence lengths (excluding padding)
-    """
-    if len(sequences.shape) != 2:
-        raise ValueError(f"Expected 2D tensor, got shape {sequences.shape}")
-
-    actual_lengths = []
-
-    # Find actual length of each sequence by locating where padding starts
-    for seq in sequences:
-        # Find the last non-padding token
-        non_pad_mask = seq != pad_token
-        if non_pad_mask.any():
-            # Get the position of the last non-padding token
-            actual_length = non_pad_mask.nonzero(as_tuple=True)[0][-1].item() + 1
-        else:
-            actual_length = 0  # All padding
-        actual_lengths.append(actual_length)
+        logprobs = torch.empty(
+            trajs_batch_size,
+            seq_length-1,
+            dtype=dtype,
+            device=torch.cuda.current_device(),
+        )
 
-    return actual_lengths
+    # Only PP>1 needs a broadcast from the last stage; for PP=1 the output is already local.
+    if get_pg_size(pp_group) > 1:
+        dist.broadcast(logprobs, src=get_pp_last_rank(pp_group), group=pp_group)
+    return logprobs.cpu()
 
 
 def prepare_data_for_update(
@@ -1254,6 +1061,8 @@ def prepare_data_for_update(
     ref_state_dict: Dict[str, Any],
     rollouts: GroupedRollouts,
     tokenizer: MegatronLegacyTokenizer,
+    sequence_packing: bool,
+    is_correction: bool,
 ) -> RerunDataIterator:
     """Extract data for the update from raw rollouts.
 
@@ -1262,442 +1071,96 @@ def prepare_data_for_update(
         ref_state_dict: Reference policy state dict.
         rollouts: Rollouts to extract the data from.
         tokenizer: Tokenizer to pad/tokenize data.
+        sequence_packing: Use sequence packing if True.
+        is_correction: Prepare data for IS correction if True.
 
     Returns:
         Cycled iterator over dataset batches. In GRPO we might want to go over the same data multiple times.
     """
     args = get_args()
-    timers = get_timers()
     wandb_writer = get_wandb_writer()
     tb_writer = get_tensorboard_writer()
     nvtx_range = get_nvtx_range()
+    runtime_state = get_rl_runtime_state()
+
+    if args.cuda_graph_impl != "none" and not args.rl_training_cuda_graphs:
+        lang_module = (
+            model[0].module.module if hasattr(model[0].module, "module") else model[0].module
+        )
+        toggle_cuda_graphs(lang_module, "none", reset_cuda_graphs=False)
+
     model = model[0]
+    dtype = torch.bfloat16 if args.bf16 else (torch.float16 if args.fp16 else torch.float32)
 
     with nvtx_range("prepare-data-for-update"):
         with nvtx_range("compute-group-stats"):
-            # These are computed on all rollouts for reporting purposes
-            group_stats = compute_group_stats(rollouts, tokenizer)
-            rewards = np.array([[rollout.reward for rollout in group] for group in rollouts])
-            group_stats.rewards = rewards.flatten().tolist()
-            group_stats.advantages = (
-                (
-                    (rewards - rewards.mean(axis=1, keepdims=True))
-                    / (1e-4 + rewards.std(axis=1, keepdims=True))
-                )
-                .flatten()
-                .tolist()
-            )
-
-        all_rollouts = rollouts
+            group_stats = compute_group_stats(rollouts, tokenizer, args.seq_length)
+            # TODO(vitalyk): why do we need global_advantages here? go inside packing
+            advantages = global_advantages = torch.tensor(group_stats.advantages, dtype=dtype).cuda()
 
         # Now split the rollouts across the data parallel ranks for training
         # This needs to be done at this point because we are about to calculate logprobs
-        if (expert_data_parallel_world_size := mpu.get_expert_data_parallel_world_size()) > 0:
-            data_split_size = len(rollouts) // expert_data_parallel_world_size
-            data_split_range = (
-                mpu.get_expert_data_parallel_rank() * data_split_size,
-                (mpu.get_expert_data_parallel_rank() + 1) * data_split_size,
-            )
-            rollouts = rollouts[data_split_range[0] : data_split_range[1]]
+        # Note :- For EP, do not use the expert data parallel group here. Always 
+        # use the regular data parallel group. 
 
-        # [g, group_size]
-        # Making an assumption that all groups are of the same size!
-        # For packing mode, use all rollouts to compute rewards
-        rollouts_for_rewards = all_rollouts if args.rl_use_sequence_packing else rollouts
-        rewards = torch.tensor(
-            [[rollout.reward for rollout in group] for group in rollouts_for_rewards], device='cpu'
-        )
+        # Use one group as an exampling for logging later.
+        example_group = rollouts[0]
 
-        # We flatten them for logging.
-        with nvtx_range("prepare_trajectories"):
-            if args.rl_use_sequence_packing:
-                trajs, generation_masks, inference_logprobs = prepare_packed_trajectories(
-                    all_rollouts, tokenizer, args
-                )
-            else:
-                trajs, generation_masks, inference_logprobs = prepare_trajectories(
-                    rollouts, tokenizer, args.seq_length
-                )
-        # Store reference to original data (no clone needed since we don't modify in-place)
-        original_trajs = trajs
-
-        # Sequence packing or standard processing
-        packing_context = {}  # Store all packing-related data
-
-        if args.rl_use_sequence_packing:
-            with nvtx_range("sequence_packing"):
-                timers('sequence-packing-overhead', log_level=1).start()
+        # Let's expand rollouts getting rid of the groups.
+        # We need this to correctly split the rollouts across dp groups.
+        # And we do not actually need them grouped in anything below anyways.
+        rollouts = [r for g in rollouts for r in g]
+        total_turns_sampled = len(rollouts)
 
-                bin_size = args.rl_sequence_packing_bin_size
+        # We might sample more than we consume in one step.
+        samples_ratio_per_step = args.global_batch_size / (args.grpo_prompts_per_step * args.grpo_group_size)
+        assert samples_ratio_per_step <= 1, "You cannot use more data than you sampled."
 
-                # Create packer with max sequences per bin limit to prevent extreme imbalance
-                max_sequences_per_bin = getattr(args, 'rl_sequence_packing_max_sequences_per_bin', 100)
-                packer = SequencePacker(
-                    bin_size=bin_size,
-                    pad_token=tokenizer.pad,
-                    max_sequences_per_bin=max_sequences_per_bin,
-                )
-                packing_context['packer'] = packer  # Store for reuse
-
-                # Convert trajectories to list for packing
-                traj_list = [trajs[i] for i in range(trajs.shape[0])]
-
-                # Pack sequences with generation masks
-                (
-                    packed_trajs,
-                    packed_position_ids,
-                    packed_attention_mask,
-                    packed_loss_mask,
-                    packing_info,
-                ) = packer.pack_sequences(traj_list, generation_masks)
-
-                rank = mpu.get_expert_data_parallel_rank()
-                # Debug: Check packing output
-                if rank == 0:
-                    seq_per_bin = [len(indices) for indices in packing_info['bin_seq_indices']]
-                    print(f"\nDEBUG: Initial packing output (before distribution):")
-                    print(f"  - Total bins created: {len(packing_info['bin_seq_indices'])}")
-                    print(f"  - Total sequences packed: {sum(seq_per_bin)}")
-                    print(
-                        f"  - Sequences per bin: min={min(seq_per_bin)}, max={max(seq_per_bin)}, avg={sum(seq_per_bin)/len(seq_per_bin):.1f}"
-                    )
-                    print(f"  - First 20 bins: {seq_per_bin[:20]}")
-
-                # Store packing info for later unpacking
-                # Also store the actual padded length in packing_info for the unpacker
-                packing_info['original_padded_length'] = original_trajs.shape[1]
-
-                # Distribute packed bins across data parallel ranks
-                num_bins = packed_trajs.shape[0]
-                world_size = mpu.get_expert_data_parallel_world_size()
-
-                # Choose distribution algorithm based on args.sequence_packing_algo
-                packing_algo = getattr(args, 'rl_sequence_packing_algo', 'fifo')
-
-                if packing_algo == 'round-robin':
-                    # Round-robin assignment: rank i gets bins [i, i+world_size, i+2*world_size, ...]
-                    my_bin_indices = list(range(rank, num_bins, world_size))
-                else:  # fifo (default)
-                    world_size = world_size if world_size > 0 else 1
-                    # FIFO assignment: divide bins sequentially across ranks
-                    bins_per_rank = num_bins // world_size
-                    extra_bins = num_bins % world_size
-
-                    # Calculate start and end indices for this rank
-                    if rank < extra_bins:
-                        # Ranks with extra bins
-                        start_idx = rank * (bins_per_rank + 1)
-                        end_idx = start_idx + bins_per_rank + 1
-                    else:
-                        # Ranks without extra bins
-                        start_idx = rank * bins_per_rank + extra_bins
-                        end_idx = start_idx + bins_per_rank
-
-                    my_bin_indices = list(range(start_idx, end_idx))
-
-                # Calculate the maximum bins any rank has (for synchronization)
-                max_bins_per_rank = (num_bins + world_size - 1) // world_size
-
-                # Extract this rank's bins
-                my_packed_trajs = []
-                my_packed_position_ids = []
-                my_packed_attention_mask = []
-                my_packed_loss_mask = []
-                my_packed_inference_logprobs = []
-                my_bin_seq_indices = []
-                my_seq_starts = {}
-
-                # Check if we have packed inference logprobs
-                has_packed_inference_logprobs = 'packed_inference_logprobs' in packing_context
-
-                for new_idx, old_idx in enumerate(my_bin_indices):
-                    my_packed_trajs.append(packed_trajs[old_idx])
-                    my_packed_position_ids.append(packed_position_ids[old_idx])
-                    if packed_attention_mask is not None:
-                        my_packed_attention_mask.append(packed_attention_mask[old_idx])
-                    my_packed_loss_mask.append(packed_loss_mask[old_idx])
-                    if has_packed_inference_logprobs:
-                        my_packed_inference_logprobs.append(
-                            packing_context['packed_inference_logprobs'][old_idx]
-                        )
-                    my_bin_seq_indices.append(packing_info['bin_seq_indices'][old_idx])
-                    my_seq_starts[new_idx] = packing_info['seq_starts'][old_idx]
-
-                # Stack the selected bins
-                packed_trajs = (
-                    torch.stack(my_packed_trajs)
-                    if my_packed_trajs
-                    else torch.empty(
-                        0,
-                        packed_trajs.shape[1],
-                        dtype=packed_trajs.dtype,
-                        device=packed_trajs.device,
-                    )
-                )
-                packed_position_ids = (
-                    torch.stack(my_packed_position_ids)
-                    if my_packed_position_ids
-                    else torch.empty(
-                        0,
-                        packed_position_ids.shape[1],
-                        dtype=packed_position_ids.dtype,
-                        device=packed_position_ids.device,
-                    )
-                )
-                packed_attention_mask = (
-                    torch.stack(my_packed_attention_mask) if my_packed_attention_mask else None
-                )
-                packed_loss_mask = (
-                    torch.stack(my_packed_loss_mask)
-                    if my_packed_loss_mask
-                    else torch.empty(
-                        0,
-                        packed_loss_mask.shape[1],
-                        dtype=packed_loss_mask.dtype,
-                        device=packed_loss_mask.device,
-                    )
-                )
-
-                # Stack the packed inference logprobs if available
-                if has_packed_inference_logprobs and my_packed_inference_logprobs:
-                    packed_inference_logprobs = torch.stack(my_packed_inference_logprobs)
-                    packing_context['packed_inference_logprobs'] = packed_inference_logprobs
-                elif has_packed_inference_logprobs:
-                    # Create empty tensor if no bins for this rank
-                    packed_inference_logprobs = torch.empty(
-                        0,
-                        bin_size - 1,
-                        dtype=packing_context['packed_inference_logprobs'].dtype,
-                        device=packing_context['packed_inference_logprobs'].device,
-                    )
-                    packing_context['packed_inference_logprobs'] = packed_inference_logprobs
-
-                # Debug: Check what we're extracting
-                if rank == 0:
-                    print(f"\nDEBUG: Rank 0 {packing_algo} bin assignment:")
-                    print(f"  - Total bins before distribution: {num_bins}")
-                    print(
-                        f"  - Bins assigned to rank 0: {my_bin_indices[:10]}... (showing first 10)"
-                    )
-                    print(f"  - Number of bins for this rank: {len(my_bin_indices)}")
-                    print(f"  - Length of my_bin_seq_indices: {len(my_bin_seq_indices)}")
-                    if len(my_bin_seq_indices) > 0:
-                        print(
-                            f"  - Sequences in first 5 bins: {[len(indices) for indices in my_bin_seq_indices[:5]]}"
-                        )
-
-                # Create updated packing info for this rank
-                packing_info = {
-                    'bin_seq_indices': my_bin_seq_indices,
-                    'seq_starts': my_seq_starts,
-                    'seq_lengths': packing_info['seq_lengths'],  # Keep all sequence lengths
-                    'seq_to_bin_idx': packing_info['seq_to_bin_idx'],  # Keep mapping
-                    'original_padded_length': packing_info['original_padded_length'],
-                }
-
-                # Add empty bins if this rank has fewer than max_bins_per_rank
-                current_bins = len(my_bin_indices)
-                if current_bins < max_bins_per_rank:
-                    num_empty_bins = max_bins_per_rank - current_bins
-
-                    # Create empty bins using the helper function
-                    bin_size = packed_trajs.shape[1]
-                    (
-                        empty_trajs,
-                        empty_position_ids,
-                        empty_loss_mask,
-                        empty_attention_mask,
-                        empty_packing_entries,
-                    ) = create_empty_bins(
-                        num_empty_bins,
-                        bin_size,
-                        packed_trajs,
-                        packed_position_ids,
-                        packed_loss_mask,
-                        packed_attention_mask,
-                        tokenizer,
-                    )
-
-                    # Append empty bins to packed tensors
-                    packed_trajs = torch.cat([packed_trajs, empty_trajs], dim=0)
-                    packed_position_ids = torch.cat(
-                        [packed_position_ids, empty_position_ids], dim=0
-                    )
-                    packed_loss_mask = torch.cat([packed_loss_mask, empty_loss_mask], dim=0)
-
-                    if packed_attention_mask is not None and empty_attention_mask is not None:
-                        packed_attention_mask = torch.cat(
-                            [packed_attention_mask, empty_attention_mask], dim=0
-                        )
-
-                    # Create empty inference logprobs if needed
-                    if has_packed_inference_logprobs:
-                        empty_inference_logprobs = torch.zeros(
-                            (num_empty_bins, bin_size - 1),
-                            dtype=packed_inference_logprobs.dtype,
-                            device=packed_inference_logprobs.device,
-                        )
-                        packed_inference_logprobs = torch.cat(
-                            [packed_inference_logprobs, empty_inference_logprobs], dim=0
-                        )
-                        packing_context['packed_inference_logprobs'] = packed_inference_logprobs
-
-                    # Add empty entries to packing_info
-                    for i, entry in enumerate(empty_packing_entries):
-                        bin_idx = current_bins + i
-                        packing_info['bin_seq_indices'].append(entry['bin_seq_indices'])
-                        packing_info['seq_starts'][bin_idx] = entry['seq_starts']
-
-                packing_context['packing_info'] = packing_info
-                packing_context['original_generation_masks'] = generation_masks
-                packing_context['original_trajs'] = original_trajs
-                # Move packed tensors to GPU once to avoid CPU-GPU transfers every iteration
-                packing_context['packed_trajs'] = packed_trajs.cuda()
-                packing_context['packed_position_ids'] = packed_position_ids.cuda()
-                packing_context['packed_attention_mask'] = (
-                    packed_attention_mask.cuda() if packed_attention_mask is not None else None
-                )
-                packing_context['packed_loss_mask'] = packed_loss_mask.cuda()
-
-                # Store the original padding positions for correct unpacking
-                # The loss_mask will be based on original_trajs, so we need to preserve that pattern
-                packing_context['original_padding_positions'] = original_trajs == tokenizer.pad
-
-                # Store my_bin_seq_indices for later use
-                packing_context['my_bin_seq_indices'] = my_bin_seq_indices
-
-                # Pre-compute all PackedSeqParams for all bins ONCE to avoid repeated
-                # tensor allocations that cause CUDA memory fragmentation and periodic spikes
-                cached_packed_seq_params = []
-                device = packing_context['packed_trajs'].device
-                for bin_idx in range(len(packing_context['packed_trajs'])):
-                    params = create_packed_seq_params_for_bin(
-                        packing_info=packing_info,
-                        bin_idx=bin_idx,
-                        bin_size=args.rl_sequence_packing_bin_size,
-                        device=device,
-                    )
-                    # Compute seq_len here (one-time .item() call during caching is fine)
-                    seq_len = params.cu_seqlens_q[-1].item() if params is not None else 0
-                    cached_packed_seq_params.append((params, seq_len))
-                packing_context['cached_packed_seq_params'] = cached_packed_seq_params
-
-                # Log packing efficiency (for this rank's bins)
-                total_tokens = sum(packing_info['seq_lengths'])  # All sequences
-                my_sequences = sum(len(indices) for indices in my_bin_seq_indices)
-                my_tokens = sum(
-                    packing_info['seq_lengths'][idx]
-                    for indices in my_bin_seq_indices
-                    for idx in indices
-                )
-                total_capacity = packed_trajs.shape[0] * packed_trajs.shape[1]
-                packing_efficiency = my_tokens / total_capacity if total_capacity > 0 else 0
-                avg_seq_length = total_tokens / len(packing_info['seq_lengths'])
-
-                # Store global average sequences per bin in packing context
-                if num_bins > 0:
-                    global_avg_seqs_per_bin = len(packing_info['seq_lengths']) / num_bins
-                else:
-                    global_avg_seqs_per_bin = 1  # Default to 1 if no bins
-                packing_context['global_avg_seqs_per_bin'] = global_avg_seqs_per_bin
-
-                print_rank_0(f"\n[Sequence Packing] Statistics:")
-                print_rank_0(f"  - Total sequences: {len(packing_info['seq_lengths'])}")
-                print_rank_0(f"  - Total bins: {num_bins}")
-                print_rank_0(f"  - Bin size: {packed_trajs.shape[1]} tokens")
-                print_rank_0(f"  - Average sequence length: {avg_seq_length:.1f} tokens")
-                print_rank_0(f"  - Average sequences per bin: {global_avg_seqs_per_bin:.1f}")
-                print_rank_0(
-                    f"  - This rank: {my_sequences} sequences in {packed_trajs.shape[0]} bins"
-                )
-                print_rank_0(
-                    f"  - Packing efficiency: {packing_efficiency:.1%} ({my_tokens:,} / {total_capacity:,} tokens)"
-                )
+        if (data_parallel_world_size := mpu.get_data_parallel_world_size()) > 0:
+            data_split_size = len(rollouts) // data_parallel_world_size
+            data_split_range = (
+                mpu.get_data_parallel_rank() * data_split_size,
+                (mpu.get_data_parallel_rank() + 1) * data_split_size,
+            )
+            # TODO(vitalyk): This has to be rewritten assuming we are multiturn now.
+            rollouts = rollouts[data_split_range[0] : data_split_range[1]]
+            local_num_turns = sum(group_stats.num_turns[data_split_range[0] : data_split_range[1]])
+            steps_before = sum(group_stats.num_turns[:data_split_range[0]])
+            advantages = advantages[steps_before:steps_before+local_num_turns]
+            # First we calculate them on a global level and then we split and recalculate on a local level.
+            # Sequence packing and reporting needs it global but non-packing wants it local.
 
-                # Add detailed per-rank sequence distribution analysis
-                if torch.distributed.is_initialized():
-                    # Gather sequence counts from all ranks
-                    seq_counts_per_bin = [len(indices) for indices in my_bin_seq_indices]
-                    non_empty_bins = [c for c in seq_counts_per_bin if c > 0]
-
-                    # Create tensor with rank statistics
-                    rank_stats = torch.tensor(
-                        [
-                            float(rank),
-                            float(len(my_bin_seq_indices)),  # total bins
-                            float(len(non_empty_bins)),  # non-empty bins
-                            float(my_sequences),  # total sequences
-                            (
-                                float(min(non_empty_bins)) if non_empty_bins else 0.0
-                            ),  # min sequences per bin
-                            (
-                                float(max(non_empty_bins)) if non_empty_bins else 0.0
-                            ),  # max sequences per bin
-                            (
-                                float(my_sequences / len(non_empty_bins)) if non_empty_bins else 0.0
-                            ),  # avg sequences per non-empty bin
-                        ],
-                        device='cuda',
-                    )
+        with nvtx_range("prepare_trajectories"):
+            trajs, generation_masks, inference_logprobs = prepare_trajectories(
+                rollouts, tokenizer, args.seq_length, sequence_packing, args.rl_skip_bos_token
+            )
 
-                    # Gather from all ranks
-                    world_size = mpu.get_data_parallel_world_size()
-                    all_rank_stats = [torch.zeros_like(rank_stats) for _ in range(world_size)]
-                    torch.distributed.all_gather(
-                        all_rank_stats, rank_stats, group=mpu.get_data_parallel_group()
+        packing_context = None
+        # Build trajectories based on sequence packing or standard processing
+        if sequence_packing:
+            with nvtx_range("sequence_packing", time=True):
+                runtime_state.packing_context = packing_context = pack_all_trajectories(
+                    trajs, 
+                    generation_masks, 
+                    inference_logprobs, 
+                    global_advantages, 
+                    args.seq_length, 
+                    args.rl_sequence_packing_max_sequences_per_bin,
+                    args.rl_sequence_packing_algo
                     )
-
-                    # Print detailed statistics for each rank
-                    if rank == 0:
-                        print(
-                            f"\n[Sequence Packing] Per-rank distribution ({packing_algo} algorithm):"
-                        )
-                        print(
-                            "  Rank | Total Bins | Non-empty | Sequences | Min/Bin | Max/Bin | Avg/Bin"
-                        )
-                        print(
-                            "  -----|------------|-----------|-----------|---------|---------|--------"
-                        )
-                        for stats in all_rank_stats:
-                            r = int(stats[0].item())
-                            total_bins = int(stats[1].item())
-                            non_empty = int(stats[2].item())
-                            sequences = int(stats[3].item())
-                            min_seq = int(stats[4].item())
-                            max_seq = int(stats[5].item())
-                            avg_seq = stats[6].item()
-                            print(
-                                f"   {r:3d} | {total_bins:10d} | {non_empty:9d} | {sequences:9d} | {min_seq:7d} | {max_seq:7d} | {avg_seq:6.1f}"
-                            )
-
-                        # Also show first few bins for rank 0 as example
-                        print(f"\n  Example (Rank 0 first 10 bins): {seq_counts_per_bin[:10]}")
-
-                        # Show the improvement from round-robin
-                        total_seqs_all_ranks = sum(int(stats[3].item()) for stats in all_rank_stats)
-                        avg_seqs_per_rank = total_seqs_all_ranks / world_size
-                        max_deviation = max(
-                            abs(int(stats[3].item()) - avg_seqs_per_rank)
-                            for stats in all_rank_stats
-                        )
-                        print(f"\n  Round-robin distribution quality:")
-                        print(f"  - Average sequences per rank: {avg_seqs_per_rank:.1f}")
-                        print(
-                            f"  - Max deviation from average: {max_deviation:.0f} sequences ({max_deviation/avg_seqs_per_rank*100:.1f}%)"
-                        )
-
-                # Update data for packed computation
-                trajs = packed_trajs
-                position_ids = packed_position_ids
-                attention_mask = packed_attention_mask
-
-                timers('sequence-packing-overhead').stop()
-
-        # Always compute standard masks for the original data (we'll need them later)
-        with nvtx_range("get_ltor_masks_and_position_ids"):
-            original_attention_mask, original_loss_mask, original_position_ids = (
-                get_ltor_masks_and_position_ids(
-                    original_trajs,
+    
+                compute_trajs = packing_context.packed_trajs
+                compute_position_ids = packing_context.packed_position_ids
+                # Use batch_size=1 for packed computation to enable proper attention masking
+                # via PackedSeqParams (TE needs cu_seqlens per bin)
+                dataset = TensorDataset(torch.arange(len(compute_trajs)))
+                data_loader = DataLoader(dataset, batch_size=1)
+                logprobs_batch_size = 1
+        else:
+            # Always compute standard masks for the original data (we'll need them later)
+            with nvtx_range("get_ltor_masks_and_position_ids"):
+                _, original_loss_mask, original_position_ids = get_ltor_masks_and_position_ids(
+                    trajs,
                     tokenizer.eod,
                     tokenizer.pad,
                     args.reset_position_ids,
@@ -1705,357 +1168,171 @@ def prepare_data_for_update(
                     eod_mask_loss=False,
                     pad_mask_loss=True,
                 )
-            )
-            original_loss_mask[~generation_masks] = 0.0
-
-        if not args.rl_use_sequence_packing:
-            # Use original masks if not packing
-            attention_mask = original_attention_mask
-            loss_mask = original_loss_mask
-            position_ids = original_position_ids
-
-        with torch.no_grad(), nvtx_range("compute_logprobs"):
-            timers('compute-logprobs', log_level=0).start()
-            # Before we can update the model, we need to get the logprobs for the \pi_{old} model.
-            # Use packed sequences if packing is enabled for performance benefits
-            if args.rl_use_sequence_packing and 'packed_trajs' in packing_context:
-                compute_trajs = packing_context['packed_trajs']
-                compute_position_ids = packing_context['packed_position_ids']
-                compute_attention_mask = packing_context['packed_attention_mask']
-                use_packed_computation = True
-            else:
-                compute_trajs = original_trajs
+                original_loss_mask[~generation_masks] = 0.0
+                compute_trajs = trajs
                 compute_position_ids = original_position_ids
-                compute_attention_mask = original_attention_mask
-                use_packed_computation = False
-
-        with nvtx_range("create-logprobs-dataloader"):
-            # Use batch_size=1 for packed computation to enable proper attention masking
-            # via PackedSeqParams (TE needs cu_seqlens per bin)
-            logprobs_batch_size = 1 if use_packed_computation else args.micro_batch_size
-            data_iter = DataLoader(
-                TensorDataset(compute_trajs, compute_position_ids), batch_size=logprobs_batch_size
-            )
-            old_logprobs = []
-
-            # Compute logprobs
-            for batch_idx, (b_trajs, b_posids) in enumerate(data_iter):
-                # Get attention mask slice
-                if compute_attention_mask is not None:
-                    start_idx = batch_idx * logprobs_batch_size
-                    end_idx = min(
-                        start_idx + logprobs_batch_size, compute_attention_mask.shape[0]
-                    )
-                    b_attn_mask = compute_attention_mask[start_idx:end_idx].cuda()
-                else:
-                    b_attn_mask = None
-
-                b_trajs = b_trajs.cuda()
-                b_posids = b_posids.cuda()
-
-                # Get cached packed_seq_params for proper attention masking in TE
-                b_packed_seq_params = None
-                b_packed_seq_len = 0
-                if use_packed_computation and 'cached_packed_seq_params' in packing_context:
-                    b_packed_seq_params, b_packed_seq_len = packing_context['cached_packed_seq_params'][batch_idx]
-
-                logprobs = get_logprobs(
-                    model, b_trajs, b_posids, b_attn_mask, no_grad=True,
-                    packed_seq_params=b_packed_seq_params,
-                    packed_seq_len=b_packed_seq_len
-                )
-                old_logprobs.append(logprobs.detach().cpu())
-
-            old_logprobs = torch.concat(old_logprobs, dim=0)
-
-            # Handle packed vs unpacked logprobs
-            if use_packed_computation and 'packing_info' in packing_context:
-                # Store packed logprobs on GPU for forward_step
-                packing_context['old_logprobs'] = old_logprobs.cuda()
-                # Keep old_logprobs as None for the data loading path
-                old_logprobs_for_data = None
-            else:
-                # In unpacked mode, we need to unpack if we computed on packed data
-                old_logprobs_for_data = old_logprobs
-
-            timers('compute-logprobs').stop()
-
-        # Inference logprobs 2 tokens shorter than old_logprobs.
-        # One token difference is because we remove the first one in get_logprobs(), the other one is eod padding, if I got it correct. The difference should be one token if we are cut by the sequence length.
-
-        if inference_logprobs is not None and not args.rl_use_sequence_packing:
-            inference_logprobs = align_unpacked_inference_logprobs(
-                inference_logprobs=inference_logprobs,
-                old_logprobs_for_data=old_logprobs_for_data,
-                generation_masks=generation_masks,
-                group_stats=group_stats,
-            )
-            # We run the above to fill in the inference/train side mismatch stats.
-            # We do the above for logging purposes.
-            # Nullify logprobs if not used in IS correction,
-            if not args.rl_inference_logprobs_is_correction:
-                inference_logprobs = None
-        elif not args.rl_use_sequence_packing:
-            # For sequence packing, inference_logprobs will be handled separately
-            inference_logprobs = None
-
-        # Handle packing of inference_logprobs for sequence packing mode
-        if (
-            args.rl_use_sequence_packing
-            and inference_logprobs is not None
-            and args.rl_inference_logprobs_is_correction
-        ):
-            with nvtx_range("pack-inference-logprobs"):
-                # Pack the inference logprobs using the helper function
-                packed_inference_logprobs = pack_inference_logprobs(
-                    inference_logprobs=inference_logprobs,
-                    packing_info=packing_context['packing_info'],
-                    generation_masks=generation_masks,
-                    bin_size=args.rl_sequence_packing_bin_size,
+                data_loader = DataLoader(
+                    TensorDataset(compute_trajs, compute_position_ids),
+                    batch_size=args.micro_batch_size,
                 )
+                logprobs_batch_size = args.micro_batch_size
 
-                # Store packed inference logprobs in packing context
-                packing_context['packed_inference_logprobs'] = packed_inference_logprobs.cuda()
-                packing_context['has_inference_logprobs'] = True
-
-        # TODO(vitalyk): add a test for prepare_data_for_update.
-
-        with torch.no_grad(), nvtx_range("compute_ref_logprobs"):
-            # We need to load the ref model state dict and compute the logprobs for the ref model
-            cur_st_dict = {
-                k: (v.cpu() if v is not None else v) for k, v in model.state_dict().items()
-            }
-            model.load_state_dict(ref_state_dict)
-            ref_logprobs = []
-
-            # Compute reference logprobs
-            for batch_idx, (b_trajs, b_posids) in enumerate(data_iter):
-                # Get attention mask slice
-                if compute_attention_mask is not None:
-                    start_idx = batch_idx * logprobs_batch_size
-                    end_idx = min(
-                        start_idx + logprobs_batch_size, compute_attention_mask.shape[0]
-                    )
-                    b_attn_mask = compute_attention_mask[start_idx:end_idx].cuda()
-                else:
-                    b_attn_mask = None
-
-                b_trajs = b_trajs.cuda()
-                b_posids = b_posids.cuda()
-
-                # Get cached packed_seq_params for proper attention masking in TE
-                b_packed_seq_params = None
-                b_packed_seq_len = 0
-                if use_packed_computation and 'cached_packed_seq_params' in packing_context:
-                    b_packed_seq_params, b_packed_seq_len = packing_context['cached_packed_seq_params'][batch_idx]
+        with torch.no_grad(), nvtx_range("compute_logprobs", time=True):
+            # Before we can update the model, we need to get the logprobs for the \pi_{old} model.
 
-                logprobs = get_logprobs(
-                    model, b_trajs, b_posids, b_attn_mask, no_grad=True,
-                    packed_seq_params=b_packed_seq_params,
-                    packed_seq_len=b_packed_seq_len
+            # Wrap forward_backward_func for Full iteration CUDA graph
+            forward_backward_func = get_forward_backward_func()
+            if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
+                forward_backward_func = FullCudaGraphWrapper(
+                    forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps
                 )
-                ref_logprobs.append(logprobs.detach().cpu())
-
-            ref_logprobs = torch.concat(ref_logprobs, dim=0)
-
-            # Handle packed vs unpacked logprobs
-            if use_packed_computation and 'packing_info' in packing_context:
-                # Store packed logprobs on GPU for forward_step
-                packing_context['ref_logprobs'] = ref_logprobs.cuda()
-                # Keep ref_logprobs as None for the data loading path
-                # since we won't use TensorDataset in packed mode
-                ref_logprobs_for_data = None
-            else:
-                # In unpacked mode, use the computed logprobs directly
-                ref_logprobs_for_data = ref_logprobs
-            # logprobs are [b, seq, h] now.
-            model.load_state_dict(cur_st_dict)
-
-        torch.cuda.synchronize()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        # model.train()
 
-        with nvtx_range("prepare_advantages"):
-            timers('prepare-advantages', log_level=0).start()
 
-            advantages = (rewards - rewards.mean(axis=1, keepdim=True)) / (
-                1e-4 + rewards.std(axis=1, keepdim=True)
+            dtype = (
+                torch.bfloat16 if args.bf16 else (torch.float16 if args.fp16 else torch.float32)
             )
 
-            # Flatten advantages for training and move to GPU
-            advantages = advantages.view(-1).cuda()
-
-            timers('prepare-advantages').stop()
-        with nvtx_range("create_dataloader"):
-            if args.rl_use_sequence_packing:
-                # Store packing context in runtime state for forward_step
-                runtime_state = get_rl_runtime_state()
-                runtime_state.packing_context = packing_context
-
-                packing_info = packing_context['packing_info']
-                packing_context['bin_advantages'] = []
-                for bin_idx, seq_indices in enumerate(packing_info['bin_seq_indices']):
-                    if seq_indices:
-                        packing_context['bin_advantages'].append(advantages[seq_indices])
-                    else:
-                        packing_context['bin_advantages'].append(
-                            torch.tensor([], dtype=advantages.dtype, device=advantages.device)
-                        )
-
-                num_bins_this_rank = len(packing_context['packed_trajs'])
-                bin_indices = torch.arange(num_bins_this_rank)
-
-                my_bin_seq_indices = packing_context.get('my_bin_seq_indices', [])
-
-                my_sequences = sum(len(indices) for indices in my_bin_seq_indices)
-
-                actual_seqs_per_bin_this_rank = (
-                    my_sequences / num_bins_this_rank if num_bins_this_rank > 0 else 1
+            pg_collection = get_attr_wrapped_model(model, "pg_collection")
+            pp_group = pg_collection.pp
+
+            with torch.no_grad(), nvtx_range("compute_old_logprobs", time=True):
+                old_logprobs = _compute_logprobs_batch(
+                    model=model,
+                    data_loader=data_loader,
+                    forward_backward_func=forward_backward_func,
+                    packing_context=packing_context,
+                    trajs_batch_size=len(compute_trajs),
+                    seq_length=args.seq_length,
+                    logprobs_batch_size=logprobs_batch_size,
+                    decoder_seq_length=args.decoder_seq_length,
+                    dtype=dtype,
+                    pp_group=pp_group,
+                    is_correction=args.rl_inference_logprobs_is_correction,
                 )
-                global_avg_seqs_per_bin = max(
-                    1, packing_context.get('global_avg_seqs_per_bin', actual_seqs_per_bin_this_rank)
-                )
-
-                target_sequences_per_step = args.global_batch_size
-                dp_world_size = max(1, mpu.get_data_parallel_world_size())
 
-                total_bins_needed = max(
-                    1, math.ceil(target_sequences_per_step / global_avg_seqs_per_bin)
+            with torch.no_grad(), nvtx_range("compute_ref_logprobs", time=True):
+                # We need to load the ref model state dict and compute the logprobs for the ref model
+                cur_st_dict = {
+                    k: (v.cpu() if v is not None else v) for k, v in model.state_dict().items()
+                }
+                model.load_state_dict(ref_state_dict)
+                ref_logprobs = _compute_logprobs_batch(
+                    model=model,
+                    data_loader=data_loader,
+                    forward_backward_func=forward_backward_func,
+                    packing_context=packing_context,
+                    trajs_batch_size=len(compute_trajs),
+                    seq_length=args.seq_length,
+                    logprobs_batch_size=logprobs_batch_size,
+                    decoder_seq_length=args.decoder_seq_length,
+                    dtype=dtype,
+                    pp_group=pp_group,
+                    is_correction=args.rl_inference_logprobs_is_correction,
                 )
 
-                # Ensure divisibility by dp_world_size
-                if total_bins_needed % dp_world_size != 0:
-                    total_bins_needed = ((total_bins_needed // dp_world_size) + 1) * dp_world_size
-
-                bins_per_rank_per_step = total_bins_needed // dp_world_size
-                bins_per_rank_per_step = min(bins_per_rank_per_step, num_bins_this_rank)
-
-                # Synchronize across ranks - all ranks must process same number of bins
-                bins_per_rank_tensor = torch.tensor(
-                    [bins_per_rank_per_step], dtype=torch.long, device='cuda'
-                )
-                if torch.distributed.is_initialized():
-                    torch.distributed.all_reduce(
-                        bins_per_rank_tensor,
-                        op=torch.distributed.ReduceOp.MIN,
-                        group=mpu.get_data_parallel_group(),
+                # logprobs are [b, seq, h] now.
+                model.load_state_dict(cur_st_dict)
+
+            torch.cuda.synchronize()
+            gc.collect()
+            torch.cuda.empty_cache()
+
+
+        if sequence_packing:
+            with nvtx_range("pack_logprobs", time=True):
+                # Store logprobs on gpu in packing context
+                # Since PackingContext is a dataclass, we add these as new attributes
+                packing_context.old_logprobs = old_logprobs.cuda()
+                packing_context.ref_logprobs = ref_logprobs.cuda()
+
+                if inference_logprobs is not None:
+                    # Pack the inference logprobs using the helper function
+                    # We do this for logging purposes even if is_correction is disabled
+                    packed_inference_logprobs = pack_inference_logprobs(
+                        inference_logprobs=packing_context.original_inference_logprobs,
+                        packing_info=packing_context.packing_info,
+                        generation_masks=packing_context.original_generation_masks,
+                        bin_size=args.seq_length,
                     )
-                bins_per_rank_per_step = int(bins_per_rank_tensor.item())
 
-                effective_global_batch_size = bins_per_rank_per_step * dp_world_size
-
-                total_steps = len(bin_indices) // bins_per_rank_per_step + (
-                    1 if len(bin_indices) % bins_per_rank_per_step else 0
-                )
-
-                # Store packing plan in runtime state for the training loop to use
-                runtime_state = get_rl_runtime_state()
-                runtime_state.sequence_packing_plan = {
-                    'bin_indices': bin_indices,
-                    'bins_per_rank_per_step': bins_per_rank_per_step,
-                    'total_steps': total_steps,
-                    'current_step': 0,
-                    'packing_context': packing_context,
-                }
-
-                runtime_state.sequence_packing_metadata = {
-                    'num_bins': num_bins_this_rank,
-                    'num_bins_this_rank': num_bins_this_rank,
-                    'num_sequences': len(packing_info['seq_lengths']),
-                    'avg_seqs_per_bin': global_avg_seqs_per_bin,
-                    'avg_seqs_per_bin_this_rank': actual_seqs_per_bin_this_rank,
-                }
-
-                if args.micro_batch_size != 1:
-                    print_rank_0(
-                        f"WARNING: micro_batch_size={args.micro_batch_size} but sequence packing expects 1. Using 1."
+                    # Compute statistics for logging using packed data
+                    compute_packed_inference_logprobs_stats(
+                        old_logprobs=old_logprobs,
+                        packed_inference_logprobs=packed_inference_logprobs,
+                        packed_loss_mask=packing_context.packed_loss_mask,
+                        group_stats=group_stats,
                     )
-                micro_batch_size = 1
-
-                from megatron.core.num_microbatches_calculator import (
-                    get_num_microbatches,
-                    reconfigure_num_microbatches_calculator,
-                )
 
-                old_num_microbatches = get_num_microbatches()
+                    # Store packed inference logprobs in packing context
+                    packing_context.packed_inference_logprobs = packed_inference_logprobs.cuda()
+                    # Only mark as having inference logprobs for IS correction if enabled
+                    packing_context.has_inference_logprobs = args.rl_inference_logprobs_is_correction
+            with nvtx_range("create_dataloader"):
+                # @vitalyk: This function also reconfigures the data loader to count the
+                # global_batch_size in the bins frame of reference.
+                # I think it will be a better design if we split the data loader creating and logic
+                # that reconfigures the microbatch calculator.
+
+                update_microbatch_calculator(
+                    samples_ratio_per_step=samples_ratio_per_step,
+                    num_bins_this_rank = len(packing_context.packed_trajs),
+                    bin_seq_indices = packing_context.packing_info.bin_seq_indices,
+                    global_batch_size=args.global_batch_size, 
+                    rampup_batch_size=args.rampup_batch_size, 
+                    micro_batch_size=args.micro_batch_size, 
+                    decrease_batch_size_if_needed=args.decrease_batch_size_if_needed,
+               )
+                loader = get_microbatch_dataloader(len(packing_context.packed_trajs), args.micro_batch_size)
+        else:
+            with nvtx_range("align_inference_logprobs", time=True):
+                if inference_logprobs is not None:
+                    inference_logprobs = align_unpacked_inference_logprobs(
+                        inference_logprobs=inference_logprobs,
+                        old_logprobs_for_data=old_logprobs,
+                        generation_masks=generation_masks,
+                        group_stats=group_stats,
+                    )
+                    # We run the above to fill in the inference/train side mismatch stats.
+                    # We do the above for logging purposes.
+                    # Nullify logprobs if not used in IS correction,
+                    if not args.rl_inference_logprobs_is_correction:
+                        inference_logprobs = None
+            with nvtx_range("create_dataloader"):
+                # Because of multiturn, our batch sizes for non-sequence packed trajectories are not fixed anymore.
+                # As in sequence packing above, we need to reconfigure it too.
+                runtime_state.packing_context = None
 
                 reconfigure_num_microbatches_calculator(
                     rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0,
-                    rampup_batch_size=args.rampup_batch_size,
-                    global_batch_size=effective_global_batch_size,
-                    micro_batch_size=micro_batch_size,
-                    data_parallel_size=dp_world_size,
+                    global_batch_size=math.ceil(samples_ratio_per_step*total_turns_sampled), 
+                    rampup_batch_size=args.rampup_batch_size, 
+                    micro_batch_size=args.micro_batch_size, 
                     decrease_batch_size_if_needed=args.decrease_batch_size_if_needed,
+                    data_parallel_size=mpu.get_data_parallel_world_size(),
                 )
 
-                new_num_microbatches = get_num_microbatches()
-
-                print_rank_0(f"\n[Sequence Packing] Multi-step training plan:")
-                print_rank_0(f"  - Target sequences per step: {target_sequences_per_step}")
-                print_rank_0(f"  - Bins per rank per step: {bins_per_rank_per_step}")
-                print_rank_0(
-                    f"  - Estimated sequences per step: ~{int(effective_global_batch_size * global_avg_seqs_per_bin)}"
-                )
-                print_rank_0(f"  - Total optimizer steps: {total_steps}")
-                print_rank_0(
-                    f"  - Microbatches per step: {new_num_microbatches} (was {old_num_microbatches})"
-                )
-
-                for step in range(min(3, total_steps)):
-                    start_idx = step * bins_per_rank_per_step
-                    end_idx = min(start_idx + bins_per_rank_per_step, num_bins_this_rank)
-                    step_bins = end_idx - start_idx
-
-                    actual_seqs = sum(
-                        len(my_bin_seq_indices[bin_idx])
-                        for bin_idx in range(start_idx, end_idx)
-                        if bin_idx < len(my_bin_seq_indices)
-                    )
-                    est_global_seqs = actual_seqs * dp_world_size
-                    print_rank_0(
-                        f"  - Step {step + 1}: {step_bins} bins, ~{est_global_seqs} sequences globally"
-                    )
-
-                if total_steps > 3:
-                    print_rank_0(f"  - ... ({total_steps - 3} more steps)")
-
-                start_idx = 0
-                end_idx = min(bins_per_rank_per_step, num_bins_this_rank)
-                step_bin_indices = bin_indices[start_idx:end_idx]
-                dataset = TensorDataset(step_bin_indices)
-                loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=lambda x: x[0])
-            else:
-                runtime_state = get_rl_runtime_state()
-                runtime_state.packing_context = None
                 dataset_tensors = [
-                    original_trajs,
+                    compute_trajs,
                     advantages,
-                    old_logprobs_for_data,
+                    old_logprobs,
                     original_loss_mask,
                     original_position_ids,
-                    ref_logprobs_for_data,
+                    ref_logprobs,
                 ]
-                if args.rl_inference_logprobs_is_correction:
-                    if inference_logprobs is not None:
-                        dataset_tensors.append(inference_logprobs)
-                    else:
-                        # Create dummy tensor matching the batch size only if correction is enabled
-                        dataset_tensors.append(torch.zeros_like(old_logprobs_for_data))
+                if is_correction and inference_logprobs is not None:
+                    dataset_tensors.append(inference_logprobs)
                 else:
-                    # If correction is not enabled, always append zeros
-                    dataset_tensors.append(torch.zeros_like(old_logprobs_for_data))
-
+                    dataset_tensors.append(torch.zeros_like(old_logprobs))
                 data = TensorDataset(*dataset_tensors)
                 loader = DataLoader(data, batch_size=args.micro_batch_size)
 
+
         with nvtx_range("log-wandb-tb"):
             maybe_log_training_metrics(
                 group_stats=group_stats,
                 current_iteration=args.curr_iteration,
                 tokenizer=tokenizer,
-                example_group=rollouts[0],
+                example_group=example_group,
                 wandb_writer=wandb_writer,
                 tb_writer=tb_writer,
             )
@@ -2063,99 +1340,65 @@ def prepare_data_for_update(
     return RerunDataIterator(itertools.cycle(loader))
 
 
-def get_rollout_data_iterator(
-    model: LanguageModule,
-    optimizer: MegatronOptimizer,
-    iteration: int,
-    ref_state_dict: Dict[str, torch.Tensor],
-) -> RerunDataIterator:
-
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    buffered_rollouts = get_environment_rollouts(
-        model, optimizer, args.grpo_prompts_per_step, args.grpo_group_size
-    )
-    buffered_rollouts = prepare_data_for_update(model, ref_state_dict, buffered_rollouts, tokenizer)
-
-    return buffered_rollouts
-
-
-def setup_grpo_data_iterator(
+def get_grpo_data_iterator(
     model: LanguageModule,
+    inference_model: LanguageModule | None,
     optimizer: MegatronOptimizer,
     iteration: int,
     ref_state_dict: Dict[str, torch.Tensor],
+    grpo_iterations: int,
+    grpo_prompts_per_step: int,
+    grpo_group_size: int,
+    global_batch_size: int,
+    sequence_packing: bool,
+    is_correction: bool,
     buffered_rollouts: RerunDataIterator | None = None,
 ) -> RerunDataIterator:
     """
-    Set up the data iterator for GRPO training.
+    Get the data iterator for GRPO training.
+
+    Depending on the sampling parameters either performs data collections or returns
+    the buffered_rollouts as is.
 
     Args:
         model: The language model
         optimizer: The Megatron optimizer
         iteration: Current training iteration
         ref_state_dict: Reference model state dict for GRPO
+        grpo_iterations: How many steps we reuse the sampled data for.
+        grpo_prompts_per_step: How many prompts we sample per data collection.
+        grpo_group_size: How many samples we do per prompt.
+        global_batch_size: Global batch size.
+        sequence_packing: Use sequence packing if True.
+        is_correction: Use IS correction if True.
         buffered_rollouts: Previously collected rollouts (if any)
 
     Returns:
         RerunDataIterator for the current training step
     """
-    args = get_args()
+    runtime_state = get_rl_runtime_state()
 
     # We collect new rollouts when we've gone over the collected data 'grpo_iterations' times.
+    global_batches_per_collection = (grpo_prompts_per_step * grpo_group_size) // global_batch_size 
     if (
-        iteration
-        % (args.grpo_iterations * ((args.grpo_samples_per_iteration) // args.global_batch_size))
-        == 0
+        buffered_rollouts is None or
+        iteration == runtime_state.last_collection_iteration + 
+        (grpo_iterations * global_batches_per_collection)
     ):
-        buffered_rollouts = get_rollout_data_iterator(model, optimizer, iteration, ref_state_dict)
-
-        # Reset packing step counter when new rollouts are collected
-        runtime_state = get_rl_runtime_state()
-        if runtime_state.sequence_packing_plan is not None:
-            runtime_state.sequence_packing_plan['current_step'] = 0
 
-    # Handle sequence packing: update the data loader for the current optimizer step
-    runtime_state = get_rl_runtime_state()
-    if runtime_state.sequence_packing_plan is not None:
-        plan = runtime_state.sequence_packing_plan
-        if plan['current_step'] < plan['total_steps']:
-            # Create loader for current chunk of bins
-            start_idx = plan['current_step'] * plan['bins_per_rank_per_step']
-            end_idx = min(start_idx + plan['bins_per_rank_per_step'], len(plan['bin_indices']))
-            step_bin_indices = plan['bin_indices'][start_idx:end_idx]
-
-            dataset = TensorDataset(step_bin_indices)
-            loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=lambda x: x[0])
-            train_data_iterator = RerunDataIterator(itertools.cycle(loader))
-
-            # Advance to next step for next iteration
-            plan['current_step'] += 1
-
-            # Log which bins we're processing
-            my_bin_seq_indices = plan['packing_context'].get('my_bin_seq_indices', [])
-            step_sequences = sum(
-                len(my_bin_seq_indices[bin_idx.item()])
-                for bin_idx in step_bin_indices
-                if bin_idx.item() < len(my_bin_seq_indices)
-            )
-            # Estimate global sequences for this step
-            est_global_sequences = step_sequences * mpu.get_data_parallel_world_size()
-            print_rank_0(
-                f"[Sequence Packing] Optimizer step {plan['current_step']}/{plan['total_steps']}: "
-                f"processing {len(step_bin_indices)} bins (~{est_global_sequences} sequences globally)"
+        buffered_rollouts = get_environment_rollouts(
+            model, inference_model, optimizer, grpo_prompts_per_step, grpo_group_size
+        )
+        buffered_rollouts = prepare_data_for_update(model=model, 
+            ref_state_dict=ref_state_dict, 
+            rollouts=buffered_rollouts,
+            tokenizer=get_tokenizer(),
+            sequence_packing=sequence_packing,
+            is_correction=is_correction,
             )
+        runtime_state.reset_iteration_counters(iteration)
 
-            runtime_state.reset_iteration_counters()
-
-        else:
-            print_rank_0(f"[Sequence Packing] All bins processed, waiting for new rollouts")
-            train_data_iterator = buffered_rollouts
-    else:
-        train_data_iterator = buffered_rollouts
-
-    return train_data_iterator
+    return buffered_rollouts
 
 
 def evaluate_and_print_results_rl(
@@ -2195,7 +1438,7 @@ def evaluate_and_print_results_rl(
 
             rank = torch.distributed.get_rank()
             if rank == 0:
-                print(f"Collecting evaluation results on rank {rank}...")
+                logger.info("Collecting evaluation results...")
                 agent = get_agent(args)
                 request = EvaluationRequest(
                     inference_interface=inference_interface,
@@ -2203,9 +1446,10 @@ def evaluate_and_print_results_rl(
                     validation=True,
                     rank_info=None,
                     generation_args={
-                        'temperature': args.grpo_default_temperature,
+                        'temperature': args.rl_default_temperature,
                         'max_tokens': args.seq_length,
-                        'top_p': args.grpo_default_top_p,
+                        'top_p': args.rl_default_top_p,
+                        'top_k': args.rl_default_top_k,
                     },
                 )
                 evaluation_responses = loop.run_until_complete(agent.run_evaluation(request))
@@ -2242,7 +1486,7 @@ def evaluate_and_print_results_rl(
                             except Exception as e:
                                 lang_rl_log(f"Error: {e}")
                                 lang_rl_log(f"Result: {result}")
-            print(
+            logger.info(
                 "Collected metrics:"
                 + "".join([f"\n\t{k} count: {len(v)}" for k, v in eval_metrics.items()])
             )
@@ -2255,7 +1499,7 @@ def evaluate_and_print_results_rl(
             wandb_writer = get_wandb_writer()
             if wandb_writer:
                 wandb_writer.log(eval_metrics, step=iteration)
-            print(
+            logger.info(
                 "Evaluation results:"
                 + "".join([f"\n\t{k}: {v:0.4f}" for k, v in eval_metrics.items()])
             )
@@ -2310,8 +1554,10 @@ def calculate_grpo_loss(
     """
     # Ensure shapes match before computation
     if current_logprobs.shape != old_logprobs.shape:
-        print_rank_0(
-            f"WARNING: Shape mismatch - current_logprobs: {current_logprobs.shape}, old_logprobs: {old_logprobs.shape}"
+        log_single_rank(
+            logger,
+            logging.WARNING,
+            f"WARNING: Shape mismatch - current_logprobs: {current_logprobs.shape}, old_logprobs: {old_logprobs.shape}",
         )
 
     ratios = (current_logprobs - old_logprobs).exp()
@@ -2390,12 +1636,17 @@ def megatron_rl_inference_mode(
     loop = get_asyncio_loop()
     nvtx_range = get_nvtx_range()
 
-    print(f"[{dist.get_rank()}:DP] Entering inference mode")
+    logger.debug(f"[{dist.get_rank()}] Entering inference mode")
 
     # If we get a lower precision wrapper, we go one object deeper.
     lang_module = model[0].module.module if hasattr(model[0].module, "module") else model[0].module
 
     lang_module.eval()
+    # If this is a separate RL inference model allocated with UVM, ensure weights are resident on GPU
+    # before any CUDA-graph capture/replay or inference.
+    with nvtx_range("prefetch-inference-model-weights-to-gpu"):
+        model_core = unwrap_model(model[0])
+        _maybe_prefetch_separate_inference_model_weights(model_core, to_cpu=False)
 
     rotary_module = getattr(lang_module, "rotary_pos_emb", None)
     # Vanilla RotaryEmbedding module has lru_cache decorator which breaks RL training
@@ -2407,22 +1658,23 @@ def megatron_rl_inference_mode(
     with torch.no_grad():
 
         if offload_optimizer_during_inference:
-            with nvtx_range("offload-optimizer-before-inference"):
+            with nvtx_range("offload-optimizer-state-and-grad-buffers-before-inference"):
+                model[0].offload_grad_buffers()
                 optimizer.offload_to_cpu()
 
         # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to.
-        if cuda_graph_impl != "none":
+        if cuda_graph_impl != "none" and not args.rl_training_cuda_graphs:
             toggle_cuda_graphs(lang_module, cuda_graph_impl, reset_cuda_graphs=reset_cuda_graphs)
 
         inference_interface = get_inference_interface(args, loop, model)
 
         with nvtx_range("onload-kv-cache-before-inference"):
             if offload_kv_cache_during_training:
-                assert (
-                    reset_cuda_graphs
-                ), "reset_cuda_graphs must be True when offloading kv cache during training"
-                print(
-                    f"[{dist.get_rank()}:DP] Restoring kv cache ({inference_interface._inference_engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU"
+                # Restore the KV cache by re-binding physical pages to a consistent virtual address
+                torch_memory_saver.resume("kv_cache")
+
+                logger.debug(
+                    f"[{dist.get_rank()}] Restoring kv cache ({inference_interface._inference_engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU"
                 )
                 kv_cache = inference_interface._inference_engine.context.memory_buffer
                 inference_interface._inference_engine.context.memory_buffer = kv_cache.cuda()
@@ -2443,7 +1695,7 @@ def megatron_rl_inference_mode(
 
         loop.run_until_complete(inference_interface.resume())
 
-        print(f"[{dist.get_rank()}:DP] Entered inference mode")
+        logger.debug(f"[{dist.get_rank()}] Entered inference mode")
         yield inference_interface
 
         with nvtx_range("suspend-engine"):
@@ -2452,19 +1704,26 @@ def megatron_rl_inference_mode(
         with nvtx_range("offload-kv-cache-after-inference"):
             if offload_kv_cache_during_training:
                 kv_cache = inference_interface._inference_engine.context.memory_buffer
-                print(
-                    f"[{dist.get_rank()}:DP] Offloading kv cache ({kv_cache.numel() * kv_cache.element_size() / 1024**3:.2f} GB) to CPU"
+                logger.debug(
+                    f"[{dist.get_rank()}] Offloading kv cache ({kv_cache.numel() * kv_cache.element_size() / 1024**3:.2f} GB) to CPU"
                 )
-                inference_interface._inference_engine.context.memory_buffer = kv_cache.cpu()
+                torch_memory_saver.pause("kv_cache")
+
             elif remove_kv_cache_during_training:
                 inference_interface._inference_engine.context.memory_buffer = None
 
         # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to.
-        if cuda_graph_impl != "none":
+        if cuda_graph_impl != "none" and not args.rl_training_cuda_graphs:
             toggle_cuda_graphs(lang_module, 'none', reset_cuda_graphs=reset_cuda_graphs)
 
+        # If this is a separate RL inference model, prefetch weights back to CPU so they don't consume
+        # GPU memory during training.
+        with nvtx_range("prefetch-inference-model-weights-to-cpu"):
+            _maybe_prefetch_separate_inference_model_weights(model_core, to_cpu=True)
+
         if offload_optimizer_during_inference:
-            with nvtx_range("onload-optimizer-after-inference"):
+            with nvtx_range("onload-optimizer-state-and-grad-buffers-after-inference"):
+                model[0].restore_grad_buffers()
                 optimizer.restore_from_cpu()
 
         lang_module.train()
@@ -2472,15 +1731,19 @@ def megatron_rl_inference_mode(
         if has_lru_cache:
             rotary_module.forward.cache_clear()
 
-        print(f"[{dist.get_rank()}:DP] Exiting inference mode")
+        logger.debug(f"[{dist.get_rank()}] Exiting inference mode")
+
 
 def rl_inference_interface_shutdown():
+    global _INFERENCE_INTERFACE
     if _INFERENCE_INTERFACE is not None:
         loop = get_asyncio_loop()
         loop.run_until_complete(_INFERENCE_INTERFACE.kill())
+        _INFERENCE_INTERFACE = None
     else:
         logger.warning("No inference interface to shutdown. This should not happen.")
 
+
 def get_iteration_sequence_count(args):
     """Get the total number of sequences processed in this iteration across all ranks."""
     runtime_state = get_rl_runtime_state()
@@ -2490,31 +1753,32 @@ def get_iteration_sequence_count(args):
     if torch.distributed.is_initialized():
         torch.distributed.all_reduce(sequences_tensor, group=mpu.get_data_parallel_group())
     return int(sequences_tensor.item())
+    
+def _pad_nonnull_with_zeros(data: list[Optional[torch.Tensor]], max_len: int) -> torch.Tensor:
+    """Pad each element of a list of tensors to the length required.
+    Args:
+        data: List of tensors to pad.
+        max_len: Maximum length to pad to. Must be higher or equal than the max len of the data tensors.
+    Returns:
+        A padded tensor which is a stacked list of padded input tensors.
 
+    """
+    if all([el is None for el in data]):
+        raise ValueError("At least one element of the data list should be not None.")
+    padded_data = []
+    for chunk in data:
+        if chunk is not None:
+            padding_size = max_len - len(chunk)
+            if padding_size > 0:
+                # Pad with zeros (these positions will be masked anyway)
+                padded = torch.nn.functional.pad(chunk, (0, padding_size), value=0.0)
+                padded_data.append(padded)
+            elif padding_size == 0:
+                padded_data.append(chunk)
+            else:
+                raise ValueError("One of the input tensors has larger length than padding max len.")
+        else:
+            # Create zero tensor for None logprobs
+            padded_data.append(torch.zeros(max_len))
+    return torch.stack(padded_data)
 
-def update_sequence_packing_metrics(args):
-    """Update bin tracking for sequence packing mode."""
-    if args.rl_use_sequence_packing:
-        bin_count = (
-            mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
-        )
-        args.consumed_train_bins += bin_count
-
-
-def get_sequence_packing_log_info(args):
-    """Get logging information for sequence packing mode."""
-    if args.consumed_train_bins > 0:
-        return f' consumed bins: {args.consumed_train_bins:12d} |'
-    return ''
-
-
-def get_sequence_packing_tensorboard_metrics(args):
-    """Get tensorboard metrics for sequence packing mode."""
-    metrics = {}
-    if args.consumed_train_bins > 0:
-        bin_batch_size = (
-            mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
-        )
-        metrics['bin-batch-size'] = bin_batch_size
-        metrics['consumed-bins'] = args.consumed_train_bins
-    return metrics
diff --git a/megatron/rl/sequence_packing_utils.py b/megatron/rl/sequence_packing_utils.py
new file mode 100644
index 00000000000..4d983764f77
--- /dev/null
+++ b/megatron/rl/sequence_packing_utils.py
@@ -0,0 +1,1169 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import torch
+import math
+import numpy as np
+from typing import List, Dict, Any, Tuple, Optional
+from torch.utils.data import DataLoader, TensorDataset
+from dataclasses import dataclass, field
+from megatron.core.utils import log_single_rank
+from megatron.training.global_vars import get_args, get_tokenizer
+from megatron.training.utils import get_nvtx_range
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core import mpu
+import logging
+import typing
+from megatron.core.num_microbatches_calculator import (
+        get_num_microbatches,
+        reconfigure_num_microbatches_calculator,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PackingInfo:
+    """Information about how sequences are packed into bins.
+    
+    Attributes:
+        bin_seq_indices: List where each element contains the global sequence indices in that bin
+        seq_starts: Dict mapping bin index to list of start positions for each sequence in that bin
+        seq_lengths: List of all original sequence lengths (indexed by global sequence index)
+        seq_to_bin_idx: List mapping each global sequence index to its bin index
+        packing_algo: Algorithm used for distributing bins ('fifo' or 'round-robin')
+    """
+    bin_seq_indices: List[List[int]]
+    seq_starts: Dict[int, List[int]]
+    seq_lengths: List[int]
+    seq_to_bin_idx: List[Optional[int]]
+    packing_algo: typing.Literal['fifo', 'round-robin']
+
+
+@dataclass
+class PackingContext:
+    """Context containing all information needed for sequence packing during training.
+    
+    Attributes:
+        bin_size: Maximum size of each bin (in tokens)
+        packer: 'SequencePacker' instance used for packing
+        packing_info: PackingInfo object with bin assignments and metadata
+        original_generation_masks: Generation masks for all sequences before packing
+        original_trajs: All trajectories before packing
+        packed_trajs: Packed trajectories tensor [num_bins, bin_size]
+        packed_position_ids: Position IDs for packed sequences [num_bins, bin_size]
+        packed_attention_mask: Attention mask for packed sequences [num_bins, 1, bin_size, bin_size]
+        packed_loss_mask: Loss mask for packed sequences [num_bins, bin_size]
+        original_inference_logprobs: Inference logprobs for all sequences before packing (optional)
+        bin_advantages: List of advantage tensors for each bin
+        cached_packed_seq_params: Pre-computed PackedSeqParams for each bin
+    """
+    bin_size: int
+    packer: 'SequencePacker'
+    packing_info: PackingInfo
+    original_generation_masks: torch.Tensor
+    original_trajs: torch.Tensor
+    packed_trajs: torch.Tensor
+    packed_position_ids: torch.Tensor
+    packed_attention_mask: torch.Tensor
+    packed_loss_mask: torch.Tensor
+    original_inference_logprobs: Optional[torch.Tensor] = None
+    bin_advantages: List[torch.Tensor] = field(default_factory=list)
+    cached_packed_seq_params: List[Optional[PackedSeqParams]] = field(default_factory=list)
+
+
+def load_packed_data_by_index(bin_idx: int, packing_context: PackingContext, logprobs_is_correction: bool):
+    """Load packed data by index.
+
+    Args:
+        bin_idx: Index of the bin to load.
+    """
+    # Get packing context (should always be available in packed mode)
+    idx = slice(bin_idx, bin_idx + 1)
+
+    # Get cached PackedSeqParams for proper attention masking in Transformer Engine
+    # These were pre-computed in prepare_data_for_update to avoid repeated tensor allocations
+    packed_seq_params = packing_context.cached_packed_seq_params[bin_idx]
+
+    # Extract packed data for this bin (already on GPU)
+    tokens = packing_context.packed_trajs[idx]
+    position_ids = packing_context.packed_position_ids[idx]
+
+    # Check if we have old_logprobs and ref_logprobs as attributes
+    # These are set after logprobs computation, so they may not exist during initial forward pass
+    old_logprobs = getattr(packing_context, 'old_logprobs', None)
+    if old_logprobs is not None:
+        old_logprobs = old_logprobs[idx]
+    
+    ref_logprobs = getattr(packing_context, 'ref_logprobs', None)
+    if ref_logprobs is not None:
+        ref_logprobs = ref_logprobs[idx]
+        
+    # Slice from position 1 because logprobs predict the next token, so they are
+    # shifted by 1 relative to the input tokens (logprobs has shape [batch, seq_len-1])
+    loss_mask = packing_context.packed_loss_mask[idx, 1:]
+
+    # Get sequence-level data for this bin
+    packing_info = packing_context.packing_info
+    seq_starts = packing_info.seq_starts[bin_idx]
+    seq_indices = packing_info.bin_seq_indices[bin_idx]
+
+    # Handle empty bins (used for padding to ensure all ranks have same iterations)
+    if not seq_indices:
+        seq_lengths = []
+        advantages = torch.tensor([], device='cuda')
+    else:
+        seq_lengths = [packing_info.seq_lengths[idx] for idx in seq_indices]
+        advantages = packing_context.bin_advantages[bin_idx]
+
+    # Extract packed inference_logprobs if available
+    packed_inference_logprobs = getattr(packing_context, 'packed_inference_logprobs', None)
+    if packed_inference_logprobs is not None and logprobs_is_correction:
+        inference_logprobs = packed_inference_logprobs[idx]
+    else:
+        inference_logprobs = None
+
+    return (
+        tokens,
+        advantages,
+        old_logprobs,
+        loss_mask,
+        position_ids,
+        ref_logprobs,
+        inference_logprobs,
+        seq_starts,
+        seq_lengths,
+        seq_indices,
+        packed_seq_params,
+    )
+
+
+def log_packing_efficiency(packing_context: PackingContext):
+    # Log packing efficiency (for this rank's bins)
+    packing_info = packing_context.packing_info
+    packed_trajs = packing_context.packed_trajs
+    my_bin_seq_indices = packing_info.bin_seq_indices
+    num_bins = len(packing_info.bin_seq_indices)
+    total_tokens = sum(packing_info.seq_lengths)  # All sequences
+    my_sequences = sum(len(indices) for indices in my_bin_seq_indices)
+    my_tokens = sum(
+        packing_info.seq_lengths[idx]
+        for indices in my_bin_seq_indices
+        for idx in indices
+    )
+    total_capacity = packed_trajs.shape[0] * packed_trajs.shape[1]
+    packing_efficiency = my_tokens / total_capacity if total_capacity > 0 else 0
+    avg_seq_length = total_tokens / len(packing_info.seq_lengths)
+    rank = mpu.get_data_parallel_rank()
+
+    log_single_rank(logger, logging.INFO, "[Sequence Packing] Statistics:")
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Total sequences: {len(packing_info.seq_lengths)}",
+    )
+    log_single_rank(
+        logger, logging.INFO, f"[Sequence Packing]  - Total bins: {num_bins}"
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Bin size: {packed_trajs.shape[1]} tokens",
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Average sequence length: {avg_seq_length:.1f} tokens",
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - This rank: {my_sequences} sequences in {packed_trajs.shape[0]} bins",
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Packing efficiency: {packing_efficiency:.1%} ({my_tokens:,} / {total_capacity:,} tokens)",
+    )
+
+    # Add detailed per-rank sequence distribution analysis
+    if torch.distributed.is_initialized():
+        # Gather sequence counts from all ranks
+        seq_counts_per_bin = [len(indices) for indices in my_bin_seq_indices]
+        non_empty_bins = [c for c in seq_counts_per_bin if c > 0]
+
+        # Create tensor with rank statistics
+        rank_stats = torch.tensor(
+            [
+                float(rank),
+                float(len(my_bin_seq_indices)),  # total bins
+                float(len(non_empty_bins)),  # non-empty bins
+                float(my_sequences),  # total sequences
+                (
+                    float(min(non_empty_bins)) if non_empty_bins else 0.0
+                ),  # min sequences per bin
+                (
+                    float(max(non_empty_bins)) if non_empty_bins else 0.0
+                ),  # max sequences per bin
+                (
+                    float(my_sequences / len(non_empty_bins)) if non_empty_bins else 0.0
+                ),  # avg sequences per non-empty bin
+            ],
+            device='cuda',
+        )
+
+        # Gather from all ranks
+        world_size = mpu.get_data_parallel_world_size()
+        all_rank_stats = [torch.zeros_like(rank_stats) for _ in range(world_size)]
+        torch.distributed.all_gather(
+            all_rank_stats, rank_stats, group=mpu.get_data_parallel_group()
+        )
+
+        # Print detailed statistics for each rank
+        if rank == 0:
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Sequence Packing] Per-rank distribution ({packing_info.packing_algo} algorithm):",
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                "[Sequence Packing]  Rank | Total Bins | Non-empty | Sequences | Min/Bin | Max/Bin | Avg/Bin",
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                "[Sequence Packing]  -----|------------|-----------|-----------|---------|---------|--------",
+            )
+            for stats in all_rank_stats:
+                r = int(stats[0].item())
+                total_bins = int(stats[1].item())
+                non_empty = int(stats[2].item())
+                sequences = int(stats[3].item())
+                min_seq = int(stats[4].item())
+                max_seq = int(stats[5].item())
+                avg_seq = stats[6].item()
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    f"[Sequence Packing]   {r:3d} | {total_bins:10d} | {non_empty:9d} | {sequences:9d} | {min_seq:7d} | {max_seq:7d} | {avg_seq:6.1f}",
+                )
+
+            # Also show first few bins for rank 0 as example
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Sequence Packing]  Example (Rank 0 first 10 bins): {seq_counts_per_bin[:10]}",
+            )
+
+            # Show the improvement from round-robin
+            total_seqs_all_ranks = sum(int(stats[3].item()) for stats in all_rank_stats)
+            avg_seqs_per_rank = total_seqs_all_ranks / world_size
+            max_deviation = max(
+                abs(int(stats[3].item()) - avg_seqs_per_rank)
+                for stats in all_rank_stats
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                "[Sequence Packing]  Round-robin distribution quality:",
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Sequence Packing]  - Average sequences per rank: {avg_seqs_per_rank:.1f}",
+            )
+            log_single_rank(
+                logger,
+                logging.INFO,
+                f"[Sequence Packing]  - Max deviation from average: {max_deviation:.0f} sequences ({max_deviation/avg_seqs_per_rank*100:.1f}%)",
+            )
+
+def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int) -> List[int]:
+    """Get actual sequence lengths for pre-padded sequences.
+
+    Args:
+        sequences: Tensor of shape [batch_size, seq_len] with pre-padded sequences
+        pad_token: The padding token ID
+
+    Returns:
+        List of actual sequence lengths (excluding padding)
+    """
+    if len(sequences.shape) != 2:
+        raise ValueError(f"Expected 2D tensor, got shape {sequences.shape}")
+
+    actual_lengths = []
+
+    # Find actual length of each sequence by locating where padding starts
+    for seq in sequences:
+        # Find the last non-padding token
+        non_pad_mask = seq != pad_token
+        if non_pad_mask.any():
+            # Get the position of the last non-padding token
+            actual_length = non_pad_mask.nonzero(as_tuple=True)[0][-1].item() + 1
+        else:
+            actual_length = 0  # All padding
+        actual_lengths.append(actual_length)
+
+    return actual_lengths
+
+
+def create_empty_bins(
+    num_empty_bins : int,
+    bin_size : int,
+    packed_trajs : torch.Tensor,
+    packed_position_ids : torch.Tensor,
+    packed_loss_mask : torch.Tensor,
+    packed_attention_mask : torch.Tensor,
+    tokenizer,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[Dict[str, Any]]]:
+    """Create empty bins for padding to ensure all ranks have the same number of bins.
+
+    Args:
+        num_empty_bins: Number of empty bins to create
+        bin_size: Size of each bin
+        packed_trajs: Packed trajectories tensor (for dtype/device reference)
+        packed_position_ids: Packed position IDs tensor (for dtype/device reference)
+        packed_loss_mask: Packed loss mask tensor (for dtype/device reference)
+        packed_attention_mask: Packed attention mask tensor (can be None)
+        tokenizer: Tokenizer for pad token
+
+    Returns:
+        Tuple of (empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info_entries)
+    """
+    device = packed_trajs.device
+
+    # Create empty bins with proper shape
+    empty_bins = []
+    empty_position_ids_list = []
+    empty_loss_mask_list = []
+    empty_attention_mask_list = []
+    empty_packing_info_entries = []
+
+    for i in range(num_empty_bins):
+        # Trajectories filled with pad tokens
+        empty_bin = torch.full(
+            (1, bin_size), tokenizer.pad, dtype=packed_trajs.dtype, device=device
+        )
+        empty_bins.append(empty_bin)
+
+        # Zero position IDs
+        empty_pos_ids = torch.zeros(1, bin_size, dtype=packed_position_ids.dtype, device=device)
+        empty_position_ids_list.append(empty_pos_ids)
+
+        # Zero loss mask (so no loss contribution)
+        empty_loss = torch.zeros(1, bin_size, dtype=packed_loss_mask.dtype, device=device)
+        empty_loss_mask_list.append(empty_loss)
+
+        # Zero attention mask if needed
+        if packed_attention_mask is not None:
+            # Attention mask is always 4D: [num_bins, 1, bin_size, bin_size]
+            empty_attn = torch.zeros(
+                1, 1, bin_size, bin_size, dtype=packed_attention_mask.dtype, device=device
+            )
+            empty_attention_mask_list.append(empty_attn)
+
+        # Empty packing info entries
+        empty_packing_info_entries.append(
+            {
+                'bin_seq_indices': [],  # No sequences in empty bin
+                'seq_starts': [],  # No sequence starts
+            }
+        )
+
+    # Concatenate all empty bins
+    if num_empty_bins > 0:
+        empty_trajs = torch.cat(empty_bins, dim=0)
+        empty_position_ids = torch.cat(empty_position_ids_list, dim=0)
+        empty_loss_mask = torch.cat(empty_loss_mask_list, dim=0)
+        empty_attention_mask = (
+            torch.cat(empty_attention_mask_list, dim=0)
+            if packed_attention_mask is not None
+            else None
+        )
+    else:
+        empty_trajs = None
+        empty_position_ids = None
+        empty_loss_mask = None
+        empty_attention_mask = None
+
+    return (
+        empty_trajs,
+        empty_position_ids,
+        empty_loss_mask,
+        empty_attention_mask,
+        empty_packing_info_entries,
+    )
+
+def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin: int, device: torch.device) -> PackedSeqParams:
+    """Create a default PackedSeqParams that acts as no-op for a single sequence.
+
+    This ensures CUDA graph signature consistency when packed_seq_params
+    would otherwise be None. A single sequence spanning the full length
+    means no actual packing boundaries
+
+    Args:
+        seq_length: The sequence length 
+        max_sequences_per_bin: Max sequences to pack in a bin.
+        device: Device to create tensors on.
+
+    Returns:
+        PackedSeqParams configured as a single unpacked sequence.
+    """
+
+    args = get_args()
+
+    # Pad to the maximum number of sequences in the bin for the attention kernel.
+    cu_seqlens = torch.full((max_sequences_per_bin,), seq_length, dtype=torch.int32, device=device)
+    cu_seqlens[0] = 0
+
+    return PackedSeqParams(
+        qkv_format='thd',
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_kv=cu_seqlens,
+        cu_seqlens_q_padded=None,
+        cu_seqlens_kv_padded=None,
+        max_seqlen_q=seq_length,
+        max_seqlen_kv=seq_length,
+    )
+
+def create_packed_seq_params(packing_context: PackingContext):
+    cached_packed_seq_params = []
+    packing_info = packing_context.packing_info
+    bin_size = packing_context.bin_size
+    max_sequences_per_bin = packing_context.packer.max_sequences_per_bin
+    device = packing_context.packed_trajs.device
+    for bin_idx in range(len(packing_context.packed_trajs)):
+        params = create_packed_seq_params_for_bin(
+            packing_info=packing_info,
+            bin_idx=bin_idx,
+            bin_size=bin_size,
+            max_sequences_per_bin=max_sequences_per_bin,
+            device=device,
+        )
+        cached_packed_seq_params.append(params)
+    return cached_packed_seq_params
+
+def create_packed_seq_params_for_bin(
+    packing_info: PackingInfo,
+    bin_idx: int,
+    bin_size: int,
+    max_sequences_per_bin: int,
+    device: torch.device
+) -> Optional[PackedSeqParams]:
+    """Create PackedSeqParams for a single bin to enable proper attention masking in TE.
+
+    When using Transformer Engine with sequence packing, we need to provide cu_seqlens
+    (cumulative sequence lengths) so that TE knows the boundaries between sequences
+    within a packed bin. This prevents attention leakage between unrelated sequences.
+
+    Args:
+        packing_info: PackingInfo object containing packing metadata from SequencePacker
+        bin_idx: Index of the bin to create params for
+        bin_size: Size of the bin (padded sequence length)
+        max_sequences_per_bin: Maximum number of sequences per bin
+        device: Device to create tensors on
+
+    Returns:
+        PackedSeqParams with cu_seqlens set for proper attention masking (or None if empty)
+    """
+    seq_indices = packing_info.bin_seq_indices[bin_idx]
+
+    # Handle empty bins (padding bins with no sequences)
+    if not seq_indices:
+        return None
+
+    # Get actual sequence lengths for sequences in this bin
+    seq_lengths_in_bin = [packing_info.seq_lengths[idx] for idx in seq_indices]
+
+    # Build cumulative sequence lengths for actual sequences
+    # cu_seqlens should be [0, len(seq1), len(seq1)+len(seq2), ..., total_actual_len]
+    cu_seqlens_list = np.append(np.cumsum([0] + seq_lengths_in_bin), bin_size)
+
+    cu_seqlens = torch.tensor(cu_seqlens_list, dtype=torch.int32, device=device)
+
+    # Pad cu_seqlens to bin_size by repeating the last value (creates zero-length ghost sequences)
+    # This ensures a fixed tensor size for CUDA graph compatibility
+    if len(cu_seqlens) < max_sequences_per_bin:
+        out = cu_seqlens.new_full((max_sequences_per_bin,), bin_size)
+        out[:len(cu_seqlens)] = cu_seqlens
+        cu_seqlens = out
+
+    max_seqlen = bin_size
+
+    return PackedSeqParams(
+        qkv_format='thd',
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_kv=cu_seqlens,
+        cu_seqlens_q_padded=None,
+        cu_seqlens_kv_padded=None,
+        max_seqlen_q=max_seqlen,
+        max_seqlen_kv=max_seqlen,
+    )
+
+
+def pack_inference_logprobs(
+    inference_logprobs: List[torch.Tensor],
+    packing_info: PackingInfo,
+    generation_masks: torch.Tensor,
+    bin_size: int,
+) -> torch.Tensor:
+    """Pack inference logprobs into bins aligned with packed sequences.
+
+    Args:
+        inference_logprobs: List of inference logprobs tensors for each sequence
+        packing_info: PackingInfo object containing bin assignments and sequence positions
+        generation_masks: Tensor indicating which tokens were generated
+        bin_size: Size of each bin
+
+    Returns:
+        Packed inference logprobs tensor of shape [num_bins, bin_size - 1]
+    """
+    num_bins = len(packing_info.bin_seq_indices)
+
+    # Create packed inference logprobs tensor (logprobs are 1 token shorter than sequences)
+    packed_inference_logprobs = torch.zeros(
+        (num_bins, bin_size - 1), dtype=torch.float32, device='cpu'
+    )
+
+    # Create mapping from global sequence index to local bin index
+    # This is needed because seq_to_bin_idx uses global bin indices,
+    # but after distribution each rank only has a subset of bins
+    seq_to_local_bin = {}
+    for local_bin_idx, seq_indices in enumerate(packing_info.bin_seq_indices):
+        for seq_idx in seq_indices:
+            seq_to_local_bin[seq_idx] = local_bin_idx
+
+    # Align and pack inference logprobs based on generation masks
+    for seq_idx in range(len(inference_logprobs)):
+        if seq_idx not in seq_to_local_bin:
+            continue  # Skip sequences not on this rank
+
+        local_bin_idx = seq_to_local_bin[seq_idx]
+
+        # Get the position of this sequence within the bin
+        seq_positions = packing_info.bin_seq_indices[local_bin_idx]
+        seq_pos_in_bin = seq_positions.index(seq_idx)
+        seq_start = packing_info.seq_starts[local_bin_idx][seq_pos_in_bin]
+
+        # Get generation mask for this sequence to find where generation starts
+        gen_mask = generation_masks[seq_idx]
+        # Find first generation token (accounting for the shift in get_logprobs)
+        first_gen_idx = gen_mask.int().argmax().item() - 1
+
+        # Get the inference logprobs for this sequence
+        if isinstance(inference_logprobs[seq_idx], torch.Tensor):
+            seq_inf_logprobs = inference_logprobs[seq_idx]
+        else:
+            continue  # Skip if no inference logprobs
+
+        # Calculate where to place inference logprobs in the packed tensor
+        # The inference logprobs start at the first generated token position
+        pack_start = seq_start + first_gen_idx
+        pack_end = min(
+            pack_start + len(seq_inf_logprobs), seq_start + packing_info.seq_lengths[seq_idx] - 1
+        )
+        actual_len = pack_end - pack_start
+
+        if actual_len > 0 and pack_end <= bin_size - 1:
+            packed_inference_logprobs[local_bin_idx, pack_start:pack_end] = seq_inf_logprobs[
+                :actual_len
+            ]
+
+    return packed_inference_logprobs
+
+
+def compute_packed_inference_logprobs_stats(
+    old_logprobs: torch.Tensor,
+    packed_inference_logprobs: torch.Tensor,
+    packed_loss_mask: torch.Tensor,
+    group_stats: Any,
+) -> None:
+    """Compute statistics for packed inference logprobs for logging purposes.
+
+    Compares packed inference logprobs with old logprobs using the packed loss mask
+    to identify valid positions. Updates group_stats with computed metrics.
+
+    Args:
+        old_logprobs: Old logprobs tensor in packed format [num_bins, seq_len-1]
+        packed_inference_logprobs: Packed inference logprobs [num_bins, seq_len-1]
+        packed_loss_mask: Loss mask indicating valid positions [num_bins, seq_len]
+        group_stats: Statistics object to update with computed metrics
+    """
+    # Lazy import to avoid circular dependency (rl_utils imports from this module)
+    from megatron.rl.rl_utils import update_inference_logprobs_group_stats
+
+    # Ensure all tensors are on the same device (CPU for stats computation)
+    old_logprobs = old_logprobs.cpu()
+    packed_inference_logprobs = packed_inference_logprobs.cpu()
+    packed_loss_mask = packed_loss_mask.cpu()
+
+    # Use packed_loss_mask to identify valid positions for stats (shift by 1 for logprobs)
+    mask = packed_loss_mask[:, 1:].bool()
+
+    # Ensure shapes match
+    if mask.shape != old_logprobs.shape:
+        return
+
+    # Update group statistics using common helper
+    update_inference_logprobs_group_stats(
+        old_logprobs=old_logprobs,
+        inference_logprobs=packed_inference_logprobs,
+        mask=mask,
+        group_stats=group_stats,
+    )
+
+
+class SequencePacker:
+    """Packs multiple sequences into bins to minimize padding and improve GPU utilization."""
+
+    def __init__(self, bin_size: int, pad_token: int, max_sequences_per_bin: int = 16):
+        self.bin_size = bin_size
+        self.pad_token = pad_token
+        self.max_sequences_per_bin = max_sequences_per_bin
+
+    def pack_sequences(
+        self, trajs: torch.Tensor, generation_masks: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, PackingInfo]:
+        """Pack sequences into bins using a greedy first-fit algorithm."""
+        # Convert trajectories to list for packing
+        sequences = [trajs[i] for i in range(trajs.shape[0])]
+
+        sequences_tensor = torch.stack(sequences)
+
+        seq_lengths = get_actual_sequence_lengths(sequences_tensor, self.pad_token)
+
+        # Trim sequences to actual lengths
+        sequences = [sequences_tensor[i, :length] for i, length in enumerate(seq_lengths)]
+
+        sorted_indices = sorted(range(len(sequences)), key=lambda i: seq_lengths[i], reverse=True)
+
+        bins = []
+        bin_seq_indices = []  # Track which sequences are in each bin
+        current_bin = []
+        current_bin_indices = []
+        current_bin_length = 0
+
+        # Pack sequences into bins
+        sequences_per_bin = []
+        for idx in sorted_indices:
+            seq = sequences[idx]
+            seq_len = len(seq)
+
+            if (
+                current_bin_length + seq_len <= self.bin_size
+                and len(current_bin) < self.max_sequences_per_bin
+            ):
+                current_bin.append(seq)
+                current_bin_indices.append(idx)
+                current_bin_length += seq_len
+            else:
+                # Start a new bin
+                if current_bin:
+                    bins.append(current_bin)
+                    bin_seq_indices.append(current_bin_indices)
+                    sequences_per_bin.append(len(current_bin))
+                current_bin = [seq]
+                current_bin_indices = [idx]
+                current_bin_length = seq_len
+
+        # Don't forget the last bin
+        if current_bin:
+            bins.append(current_bin)
+            bin_seq_indices.append(current_bin_indices)
+            sequences_per_bin.append(len(current_bin))
+
+        # Create packed tensors
+        num_bins = len(bins)
+        device = sequences[0].device
+        dtype = sequences[0].dtype
+
+        # Log packing distribution
+        if sequences_per_bin:
+            avg_seqs_per_bin = sum(sequences_per_bin) / len(sequences_per_bin)
+            min_seqs = min(sequences_per_bin)
+            max_seqs = max(sequences_per_bin)
+            log_single_rank(
+                logger,
+                logging.INFO,
+                (
+                    f"[SequencePacker] Packing distribution: {num_bins} bins, "
+                    f"avg {avg_seqs_per_bin:.1f} seqs/bin, "
+                    f"min {min_seqs}, max {max_seqs} seqs/bin "
+                    f"(limit: {self.max_sequences_per_bin})"
+                ),
+            )
+            # Store for later use
+            self.last_avg_seqs_per_bin = avg_seqs_per_bin
+
+        packed_sequences = torch.full(
+            (num_bins, self.bin_size), self.pad_token, dtype=dtype, device=device
+        )
+        position_ids = torch.zeros(
+            (num_bins, self.bin_size), dtype=torch.long, device=device, requires_grad=False
+        )
+        attention_mask = torch.zeros(
+            (num_bins, 1, self.bin_size, self.bin_size), dtype=torch.bool, device=device
+        )
+        loss_mask = torch.zeros((num_bins, self.bin_size), dtype=torch.float, device=device)
+
+        # Track packing information for unpacking later
+        seq_starts_dict: Dict[int, List[int]] = {}
+        seq_to_bin_idx: List[Optional[int]] = [None] * len(sequences)
+
+        # Build seq_to_bin_idx mapping
+        for bin_idx, seq_indices in enumerate(bin_seq_indices):
+            for seq_idx in seq_indices:
+                seq_to_bin_idx[seq_idx] = bin_idx
+
+        # Fill bins
+        for bin_idx, (bin_seqs, seq_indices) in enumerate(zip(bins, bin_seq_indices)):
+            seq_starts = []
+            current_pos = 0
+
+            for seq_idx, seq in enumerate(bin_seqs):
+                start = current_pos
+                end = start + len(seq)
+                seq_starts.append(start)
+                current_pos = end
+
+                # Pack sequence
+                packed_sequences[bin_idx, start:end] = seq
+
+                # Position IDs reset for each sequence
+                position_ids[bin_idx, start:end] = torch.arange(
+                    len(seq), device=device, requires_grad=False
+                )
+
+                # Causal attention mask within each sequence
+                seq_len = end - start
+                attention_mask[bin_idx, 0, start:end, start:end] = torch.tril(
+                    torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)
+                )
+
+                # Loss mask (excluding padding)
+                loss_mask[bin_idx, start:end] = 1.0
+
+                # Apply generation mask if provided
+                if generation_masks is not None:
+                    orig_idx = seq_indices[seq_idx]
+                    gen_mask = generation_masks[orig_idx][
+                        : len(seq)
+                    ]  # Truncate to actual seq length
+                    loss_mask[bin_idx, start:end] *= gen_mask.float()
+
+            seq_starts.append(current_pos)
+            seq_starts_dict[bin_idx] = seq_starts
+
+        # Note: We'll store the actual padded length later when we know it
+        # (it depends on the original trajectories passed to pack_sequences)
+
+        # Invert attention mask, before inversion: (True = attend, False = mask)
+        attention_mask.bitwise_not_()
+
+        # Create the PackingInfo dataclass
+        packing_info = PackingInfo(
+            bin_seq_indices=bin_seq_indices,
+            seq_starts=seq_starts_dict,
+            seq_lengths=seq_lengths,
+            seq_to_bin_idx=seq_to_bin_idx,
+            packing_algo='fifo'
+        )
+
+        seq_per_bin = [len(indices) for indices in packing_info.bin_seq_indices]
+        log_single_rank(
+            logger, logging.DEBUG, ("Initial packing output (before distribution):")
+        )
+        log_single_rank(
+            logger,
+            logging.DEBUG,
+            f"  - Total bins created: {len(packing_info.bin_seq_indices)}",
+        )
+        log_single_rank(
+            logger, logging.DEBUG, f"  - Total sequences packed: {sum(seq_per_bin)}"
+        )
+        log_single_rank(
+            logger,
+            logging.DEBUG,
+            f"  - Sequences per bin: min={min(seq_per_bin)}, max={max(seq_per_bin)}, avg={sum(seq_per_bin)/len(seq_per_bin):.1f}",
+        )
+        log_single_rank(logger, logging.DEBUG, f"  - First 20 bins: {seq_per_bin[:20]}")
+
+        return packed_sequences, position_ids, attention_mask, loss_mask, packing_info
+
+def distribute_packed_bins(
+    packed_trajs: torch.Tensor,
+    packed_position_ids: torch.Tensor,
+    packed_attention_mask: torch.Tensor,
+    packed_loss_mask: torch.Tensor,
+    packing_info: PackingInfo,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, PackingInfo]:
+    """Distribute packed bins across the data parallel ranks."""
+    rank = mpu.get_data_parallel_rank()
+    world_size = mpu.get_data_parallel_world_size()
+    tokenizer = get_tokenizer()
+
+    # Distribute packed bins across data parallel ranks
+    num_bins, bin_size = packed_trajs.shape
+    packing_algo = packing_info.packing_algo
+
+    if packing_algo == 'round-robin':
+        # Round-robin assignment: rank i gets bins [i, i+world_size, i+2*world_size, ...]
+        my_bin_indices = list(range(rank, num_bins, world_size))
+    else:  # fifo (default)
+        world_size = world_size if world_size > 0 else 1
+        # FIFO assignment: divide bins sequentially across ranks
+        bins_per_rank = num_bins // world_size
+        extra_bins = num_bins % world_size
+
+        # Calculate start and end indices for this rank
+        if rank < extra_bins:
+            # Ranks with extra bins
+            start_idx = rank * (bins_per_rank + 1)
+            end_idx = start_idx + bins_per_rank + 1
+        else:
+            # Ranks without extra bins
+            start_idx = rank * bins_per_rank + extra_bins
+            end_idx = start_idx + bins_per_rank
+
+        my_bin_indices = list(range(start_idx, end_idx))
+
+    # Calculate the maximum bins any rank has (for synchronization)
+    max_bins_per_rank = (num_bins + world_size - 1) // world_size
+
+    # Extract this rank's bins
+    my_packed_trajs = []
+    my_packed_position_ids = []
+    my_packed_attention_mask = []
+    my_packed_loss_mask = []
+    my_bin_seq_indices = []
+    my_seq_starts = {}
+
+
+    # Build the local data from the global indices
+    for new_idx, old_idx in enumerate(my_bin_indices):
+        my_packed_trajs.append(packed_trajs[old_idx])
+        my_packed_position_ids.append(packed_position_ids[old_idx])
+        if packed_attention_mask is not None:
+            my_packed_attention_mask.append(packed_attention_mask[old_idx])
+        my_packed_loss_mask.append(packed_loss_mask[old_idx])
+        my_bin_seq_indices.append(packing_info.bin_seq_indices[old_idx])
+        my_seq_starts[new_idx] = packing_info.seq_starts[old_idx]
+
+    # Stack the selected bins
+    packed_trajs = (
+        torch.stack(my_packed_trajs)
+        if my_packed_trajs
+        else torch.empty(
+            0,
+            packed_trajs.shape[1],
+            dtype=packed_trajs.dtype,
+            device=packed_trajs.device,
+        )
+    )
+    packed_position_ids = (
+        torch.stack(my_packed_position_ids)
+        if my_packed_position_ids
+        else torch.empty(
+            0,
+            packed_position_ids.shape[1],
+            dtype=packed_position_ids.dtype,
+            device=packed_position_ids.device,
+        )
+    )
+    packed_attention_mask = (
+        torch.stack(my_packed_attention_mask) if my_packed_attention_mask else None
+    )
+    packed_loss_mask = (
+        torch.stack(my_packed_loss_mask)
+        if my_packed_loss_mask
+        else torch.empty(
+            0,
+            packed_loss_mask.shape[1],
+            dtype=packed_loss_mask.dtype,
+            device=packed_loss_mask.device,
+        )
+    )
+
+    # Debug: Check what we're extracting
+    log_single_rank(logger, logging.DEBUG, (f"Rank 0 {packing_algo} bin assignment:"))
+    log_single_rank(
+        logger, logging.DEBUG, f"  - Total bins before distribution: {num_bins}"
+    )
+    log_single_rank(
+        logger,
+        logging.DEBUG,
+        f"  - Bins assigned to rank 0: {my_bin_indices[:10]}... (showing first 10)",
+    )
+    log_single_rank(
+        logger,
+        logging.DEBUG,
+        f"  - Number of bins for this rank: {len(my_bin_indices)}",
+    )
+    log_single_rank(
+        logger,
+        logging.DEBUG,
+        f"  - Length of my_bin_seq_indices: {len(my_bin_seq_indices)}",
+    )
+    if len(my_bin_seq_indices) > 0:
+        log_single_rank(
+            logger,
+            logging.DEBUG,
+            f"  - Sequences in first 5 bins: {[len(indices) for indices in my_bin_seq_indices[:5]]}",
+        )
+
+    # Create updated packing info for this rank
+    new_packing_info = PackingInfo(
+        bin_seq_indices=my_bin_seq_indices,
+        seq_starts=my_seq_starts,
+        seq_lengths=packing_info.seq_lengths,  # Keep all sequence lengths
+        seq_to_bin_idx=packing_info.seq_to_bin_idx,  # Keep mapping
+        packing_algo=packing_algo,
+    )
+
+    # Add empty bins if this rank has fewer than max_bins_per_rank
+    current_bins = len(my_bin_indices)
+    if current_bins < max_bins_per_rank:
+        num_empty_bins = max_bins_per_rank - current_bins
+
+        # Create empty bins using the helper function
+        (
+            empty_trajs,
+            empty_position_ids,
+            empty_loss_mask,
+            empty_attention_mask,
+            empty_packing_entries,
+        ) = create_empty_bins(
+            num_empty_bins,
+            bin_size,
+            packed_trajs,
+            packed_position_ids,
+            packed_loss_mask,
+            packed_attention_mask,
+            tokenizer,
+        )
+
+        # Append empty bins to packed tensors
+        packed_trajs = torch.cat([packed_trajs, empty_trajs], dim=0)
+        packed_position_ids = torch.cat(
+            [packed_position_ids, empty_position_ids], dim=0
+        )
+        packed_loss_mask = torch.cat([packed_loss_mask, empty_loss_mask], dim=0)
+
+        if packed_attention_mask is not None and empty_attention_mask is not None:
+            packed_attention_mask = torch.cat(
+                [packed_attention_mask, empty_attention_mask], dim=0
+            )
+
+        # Add empty entries to packing_info
+        for i, entry in enumerate(empty_packing_entries):
+            bin_idx = current_bins + i
+            new_packing_info.bin_seq_indices.append(entry['bin_seq_indices'])
+            new_packing_info.seq_starts[bin_idx] = entry['seq_starts']
+
+    return packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, new_packing_info
+
+
+def pack_all_trajectories(trajs, generation_masks, inference_logprobs, global_advantages, bin_size, max_sequences_per_bin, packing_algo):
+    tokenizer = get_tokenizer()
+    data_parallel_world_size = mpu.get_data_parallel_world_size()
+    data_parallel_group = mpu.get_data_parallel_group()
+    nvtx_range = get_nvtx_range()
+
+    with nvtx_range("regather_trajectories", time=True):
+        def _gather(data):
+            data = data.cuda()
+            data_list = [torch.empty_like(data) for _ in range(data_parallel_world_size)]
+            torch.distributed.all_gather(data_list, data, group=data_parallel_group)
+            return torch.cat(data_list, dim=0)
+
+        trajs = _gather(trajs)    
+        generation_masks = _gather(generation_masks) 
+        if inference_logprobs is not None:
+            inference_logprobs = _gather(inference_logprobs)
+
+    with nvtx_range("pack_sequences", time=True):
+        # Create packer with max sequences per bin limit to prevent extreme imbalance
+        packer = SequencePacker(
+            bin_size=bin_size,
+            pad_token=tokenizer.pad,
+            max_sequences_per_bin=max_sequences_per_bin,
+        )
+
+        # Pack sequences with generation masks
+        (
+            packed_trajs,
+            packed_position_ids,
+            packed_attention_mask,
+            packed_loss_mask,
+            packing_info,
+        ) = packer.pack_sequences(trajs, generation_masks)
+        packing_info.packing_algo = packing_algo
+
+        # Distribute packed bins across the data parallel ranks
+        (
+            packed_trajs,
+            packed_position_ids,
+            packed_attention_mask,
+            packed_loss_mask,
+            packing_info,
+        ) = distribute_packed_bins(
+            packed_trajs,
+            packed_position_ids,
+            packed_attention_mask,
+            packed_loss_mask,
+            packing_info,
+        )
+
+    # Create bin_advantages list
+    bin_advantages = []
+    for seq_indices in packing_info.bin_seq_indices:
+        if seq_indices:
+            bin_advantages.append(global_advantages[seq_indices])
+        else:
+            bin_advantages.append(
+                torch.tensor([], dtype=global_advantages.dtype, device=global_advantages.device)
+            )
+
+    # Pre-compute all PackedSeqParams for all bins ONCE to avoid repeated
+    # tensor allocations that cause CUDA memory fragmentation and periodic spikes
+    # Create a temporary packing context to pass to create_packed_seq_params
+    cached_packed_seq_params = [
+        create_packed_seq_params_for_bin(
+                packing_info=packing_info,
+                bin_idx=bin_idx,
+                bin_size=bin_size,
+                max_sequences_per_bin=max_sequences_per_bin,
+                device=packed_trajs.device,
+            ) for bin_idx in range(len(packed_trajs))
+    ]
+
+    # Create the final PackingContext
+    packing_context = PackingContext(
+        bin_size=bin_size,
+        packer=packer,
+        packing_info=packing_info,
+        original_generation_masks=generation_masks,
+        original_trajs=trajs,
+        packed_trajs=packed_trajs,
+        packed_position_ids=packed_position_ids,
+        packed_attention_mask=packed_attention_mask,
+        packed_loss_mask=packed_loss_mask,
+        original_inference_logprobs=inference_logprobs,
+        bin_advantages=bin_advantages,
+        cached_packed_seq_params=cached_packed_seq_params,
+    )
+
+    log_packing_efficiency(packing_context)
+
+    return packing_context
+
+def update_microbatch_calculator(
+    samples_ratio_per_step: float,
+    num_bins_this_rank: int,
+    bin_seq_indices: List[List[int]],
+    global_batch_size: int, 
+    rampup_batch_size: int, 
+    micro_batch_size: int, 
+    decrease_batch_size_if_needed: bool,
+):
+    """Return a data loader with seqpacked indices with microbatches in bins frame of reference.
+    Args:
+        samples_ratio_per_step: Fraction of sampled trajectories to use per iteration.
+        num_bins_this_rank: Amount of packing bins that belongs to current rank.
+        bin_seq_indices: Global seq indices in the bin, see PackingInfo.
+        global_batch_size: Current global batch size.
+        rampup_batch_size: Rampup batch size. See num_microbatches_calculator.py for more.
+        micro_batch_size: Micro batch size at init.
+        decrease_batch_size_if_needed: Scale down batch size. See num_microbatches_calculator.py for more.
+
+    As a side effect, we calculate the global batch size in the bins frame of reference.
+    In sequence packing, our batch dimension shrinks as we move some trajs onto free
+    space in sequence dimension. The resulting batch size is what we return here.
+    """
+
+    dp_world_size = mpu.get_data_parallel_world_size()
+
+    # Ceiling division means we will reuse some bins
+    # If we did floor we would leave some behind
+    local_bins_per_step = math.ceil(samples_ratio_per_step * num_bins_this_rank)
+
+    bins_bs = local_bins_per_step * dp_world_size
+
+    old_num_microbatches = get_num_microbatches()
+    reconfigure_num_microbatches_calculator(
+        rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0,
+        rampup_batch_size=rampup_batch_size,
+        global_batch_size=bins_bs,
+        micro_batch_size=micro_batch_size,
+        data_parallel_size=dp_world_size,
+        decrease_batch_size_if_needed=decrease_batch_size_if_needed,
+    )
+    new_num_microbatches = get_num_microbatches()
+
+    log_single_rank(
+        logger, logging.INFO, "[Sequence Packing] Multi-step training plan:"
+    )
+
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Bins per rank per step: {samples_ratio_per_step}*{num_bins_this_rank}={local_bins_per_step}",
+    )
+
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Target sequences per step: {global_batch_size}",
+    )
+    log_single_rank(
+        logger,
+        logging.INFO,
+        f"[Sequence Packing]  - Microbatches per step: {new_num_microbatches} (was {old_num_microbatches})",
+    )
+
+    # Opt steps only depends on how much we sample and how much we consume.
+    # We make sure this is an integer division, check validate_args in arguments.py for details.
+    opt_steps = int(1 / samples_ratio_per_step)
+    for step in range(min(3, opt_steps)):
+        start_idx = step * local_bins_per_step
+        end_idx = min(start_idx + local_bins_per_step, num_bins_this_rank)
+        step_bins = end_idx - start_idx
+
+        actual_seqs = sum(
+            len(bin_seq_indices[bin_idx])
+            for bin_idx in range(start_idx, end_idx)
+            if bin_idx < len(bin_seq_indices)
+        )
+        est_global_seqs = actual_seqs * dp_world_size
+        log_single_rank(
+            logger,
+            logging.INFO,
+            f"[Sequence Packing]  - Step {step + 1}: {step_bins} bins, ~{est_global_seqs} sequences globally",
+        )
+
+    if opt_steps > 3:
+        log_single_rank(logger, logging.INFO, f"  - ... ({opt_steps - 3} more steps)")
+
+def get_microbatch_dataloader(num_bins_this_rank, micro_batch_size):
+    bin_indices = torch.arange(num_bins_this_rank)
+    dataset = TensorDataset(bin_indices)
+    return DataLoader(dataset, batch_size=micro_batch_size, shuffle=False, collate_fn=lambda x: x[0])
+
+def get_sequence_packing_log_info(args):
+    """Get logging information for sequence packing mode."""
+    if args.consumed_train_bins > 0:
+        return f' consumed bins: {args.consumed_train_bins:12d} |'
+    return ''
+
+
+def get_sequence_packing_tensorboard_metrics(args):
+    """Get tensorboard metrics for sequence packing mode."""
+    metrics = {}
+    if args.consumed_train_bins > 0:
+        bin_batch_size = (
+            mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
+        )
+        metrics['bin-batch-size'] = bin_batch_size
+        metrics['consumed-bins'] = args.consumed_train_bins
+    return metrics
diff --git a/megatron/rl/server/inference/inference_interface_server.py b/megatron/rl/server/inference/inference_interface_server.py
index 4abdf85cfcb..ba595c3ca0e 100644
--- a/megatron/rl/server/inference/inference_interface_server.py
+++ b/megatron/rl/server/inference/inference_interface_server.py
@@ -93,6 +93,6 @@ async def suspend(self):
         if isinstance(self._inference_interface, InferenceServer):
             await self._inference_interface.suspend()
 
-    def resume(self):
+    async def resume(self):
         if isinstance(self._inference_interface, InferenceServer):
-            self._inference_interface.resume()
+            await self._inference_interface.resume()
diff --git a/megatron/training/__init__.py b/megatron/training/__init__.py
index 46cf5b5c9bc..3546dfd5761 100644
--- a/megatron/training/__init__.py
+++ b/megatron/training/__init__.py
@@ -11,7 +11,7 @@
 from .global_vars import get_adlr_autoresume
 from .global_vars import get_timers
 from .initialize  import initialize_megatron
-from .training import pretrain, get_model, get_train_valid_test_num_samples
+from .training import pretrain, get_model, get_train_valid_test_num_samples, set_startup_timestamps
 
 from .utils import (print_rank_0,
                     is_last_rank,
diff --git a/megatron/training/argument_utils.py b/megatron/training/argument_utils.py
new file mode 100644
index 00000000000..b9f7c7b22d1
--- /dev/null
+++ b/megatron/training/argument_utils.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import dataclasses
+import typing
+import types
+from typing import Any, Optional
+from argparse import ArgumentParser, _ArgumentGroup
+import inspect
+import itertools
+import builtins
+import ast
+import enum
+from dataclasses import Field, fields
+
+# TODO: support arg renames
+
+class TypeInferenceError(Exception):
+    """Custom exception type to be conditionally handled by ArgumentGroupFactory."""
+    pass
+
+class ArgumentGroupFactory:
+    """Utility that adds an argument group to an ArgumentParser based on the attributes of a dataclass.
+
+    This utility uses dataclass metadata including type annotations and docstrings to automatically
+        infer the type, default, and other argparse keyword arguments.
+
+    You can override or supplement the automatically inferred argparse kwargs for any 
+        dataclass field by providing an "argparse_meta" key in the field's metadata dict.
+        The value should be a dict of kwargs that will be passed to ArgumentParser.add_argument().
+        These metadata kwargs take precedence over the automatically inferred values.
+
+        Example:
+            @dataclass
+            class YourConfig:
+                your_attribute: int | str | None = field(
+                    default=None,
+                    metadata={
+                        "argparse_meta": {
+                            "arg_names": ["--your-arg-name1", "--your-arg-name2"],
+                            "type": str,
+                            "nargs": "+",
+                            "default": "foo",
+                        }
+                    },
+                )
+
+        In this example, inferring the type automatically would fail, as Unions are
+        not supported. However the metadata is present, so that takes precedence.
+        Any keyword arguments to `ArgumentParser.add_argument()` can be included in
+        the "argparse_meta" dict, as well as "arg_names" for the argument flag name.
+
+    This class can also be used as a base class and extended as needed to support dataclasses
+        that require some customized or additional handling.
+
+    Args:
+        src_cfg_class: The source dataclass type (not instance) whose fields will be 
+            converted into command-line arguments. Each field's type annotation determines 
+            the argument type, default values become argument defaults, and field-level 
+            docstrings are extracted to populate argument help text.
+        exclude: Optional list of attribute names from `src_cfg_class` to exclude from 
+            argument generation. Useful for omitting internal fields, computed properties,
+            or attributes that should be configured through other means. If None, all 
+            dataclass fields will be converted to command-line arguments. Default: None.
+    """
+
+    def __init__(self, src_cfg_class: type, exclude: Optional[list[str]] = None) -> None:
+        self.src_cfg_class = src_cfg_class
+        self.field_docstrings = self._get_field_docstrings(src_cfg_class)
+        self.exclude = set(exclude) if exclude is not None else set()
+
+    def _format_arg_name(self, config_attr_name: str, prefix: Optional[str] = None) -> str:
+        """Convert dataclass name into appropriate argparse flag name.
+
+        Args:
+            config_attr_name: dataclass attribute name
+            prefix: prefix string to add to the dataclass attribute name. e.g. 'no' for bool 
+                settings that are default True. A hyphen is added after the prefix. Default: None
+        """
+        arg_name = config_attr_name
+        if prefix:
+            arg_name = prefix + '_' + arg_name
+        arg_name = "--" + arg_name.replace("_", "-")
+        return arg_name
+
+    def _get_enum_kwargs(self, config_type: enum.EnumMeta) -> dict[str, Any]:
+        """Build kwargs for Enums.
+
+        With these settings, the user must provide a valid enum value, e.g.
+            'flash', for `AttnBackend.flash`.
+        """
+        def enum_type_handler(cli_arg):
+            return config_type[cli_arg]
+
+        return {"type": enum_type_handler, "choices": list(config_type)}
+
+    def _extract_type(self, config_type: type) -> dict[str, Any]:
+        """Determine the type, nargs, and choices settings for this argument.
+
+        Args:
+            config_type: attribute type from dataclass
+        """
+        origin = typing.get_origin(config_type)
+        type_tuple = typing.get_args(config_type)
+
+        if isinstance(config_type, type) and issubclass(config_type, enum.Enum):
+            return self._get_enum_kwargs(config_type)
+
+        # Primitive type
+        if origin is None:
+            return {"type": config_type}
+
+        if origin in [types.UnionType, typing.Union]:
+            # Handle Optional and Union
+            if type_tuple[1] == type(None): # Optional type. First element is value inside Optional[]
+                return self._extract_type(type_tuple[0])
+            else:
+                raise TypeInferenceError(f"Unions not supported by argparse: {config_type}")
+
+        elif origin is list:
+            if len(type_tuple) == 1:
+                kwargs = self._extract_type(type_tuple[0])
+                kwargs["nargs"] = "+"
+                return kwargs
+            else:
+                raise TypeInferenceError(f"Multi-type lists not supported by argparse: {config_type}")
+
+        elif origin is typing.Literal:
+            choices_types = [type(choice) for choice in type_tuple]
+            assert all([t == choices_types[0] for t in choices_types]), "Type of each choice in a Literal type should all be the same."
+            kwargs = {"type": choices_types[0], "choices": type_tuple}
+            return kwargs
+        else:
+            raise TypeInferenceError(f"Unsupported type: {config_type}")
+
+
+    def _build_argparse_kwargs_from_field(self, attribute: Field) -> dict[str, Any]:
+        """Assemble kwargs for add_argument().
+
+        Args:
+            attribute: dataclass attribute
+        """
+        argparse_kwargs = {}
+        argparse_kwargs["arg_names"] = [self._format_arg_name(attribute.name)]
+        argparse_kwargs["dest"] = attribute.name
+        argparse_kwargs["help"] = self.field_docstrings[attribute.name] if attribute.name in self.field_docstrings else ""
+
+        # dataclasses specifies that both should not be set
+        if isinstance(attribute.default, type(dataclasses.MISSING)):
+            # dataclasses specified default_factory must be a zero-argument callable
+            argparse_kwargs["default"] = attribute.default_factory()
+        else:
+            argparse_kwargs["default"] = attribute.default
+
+        attr_argparse_meta = None
+        if attribute.metadata != {} and "argparse_meta" in attribute.metadata:
+            # save metadata here, but update at the end so the metadata has highest precedence
+            attr_argparse_meta = attribute.metadata["argparse_meta"]
+
+
+        # if we cannot infer the argparse type, all of this logic may fail. we try to defer
+        # to the developer-specified metadata if present
+        try:
+            argparse_kwargs.update(self._extract_type(attribute.type))
+
+            # use store_true or store_false action for enable/disable flags, which doesn't accept a 'type'
+            if argparse_kwargs["type"] == bool:
+                argparse_kwargs["action"] = "store_true" if attribute.default == False else "store_false"
+                argparse_kwargs.pop("type")
+
+                # add '--no-*' and '--disable-*' prefix if this is a store_false argument
+                if argparse_kwargs["action"] == "store_false":
+                    argparse_kwargs["arg_names"] = [self._format_arg_name(attribute.name, prefix="no"), self._format_arg_name(attribute.name, prefix="disable")] 
+        except TypeInferenceError as e:
+            if attr_argparse_meta is not None:
+                print(
+                    f"WARNING: Inferring the appropriate argparse argument type from {self.src_cfg_class} "
+                    f"failed for {attribute.name}: {attribute.type}.\n"
+                    "Deferring to attribute metadata. If the metadata is incomplete, 'parser.add_argument()' may fail.\n"
+                    f"Original failure: {e}"
+                )
+            else:
+                raise e
+
+        # metadata provided by field takes precedence 
+        if attr_argparse_meta is not None:
+            argparse_kwargs.update(attr_argparse_meta)
+
+        return argparse_kwargs
+
+    def build_group(self, parser: ArgumentParser, title: Optional[str] = None) -> _ArgumentGroup:
+        """Entrypoint method that adds the argument group to the parser.
+
+        Args:
+            parser: The parser to add arguments to
+            title: Title for the argument group
+        """
+        arg_group = parser.add_argument_group(title=title, description=self.src_cfg_class.__doc__)
+        for attr in fields(self.src_cfg_class):
+            if attr.name in self.exclude or attr.init is False:
+                continue
+
+            add_arg_kwargs = self._build_argparse_kwargs_from_field(attr)
+
+            arg_names = add_arg_kwargs.pop("arg_names")
+            arg_group.add_argument(*arg_names, **add_arg_kwargs)
+
+        return arg_group
+
+    def _get_field_docstrings(self, src_cfg_class: type) -> dict[str, str]:
+        """Extract field-level docstrings from a dataclass by inspecting its AST.
+
+        Recurses on parent classes of `src_cfg_class`.
+
+        Args:
+            src_cfg_class: Dataclass to get docstrings from.
+        """
+        source = inspect.getsource(src_cfg_class)
+        tree = ast.parse(source)
+        root_node = tree.body[0]
+
+        assert isinstance(root_node, ast.ClassDef), "Provided object must be a class."
+
+        field_docstrings = {}
+
+        # Iterate over body of the dataclass using 2-width sliding window.
+        # When 'a' is an assignment expression and 'b' is a constant, the window is
+        # lined up with an attribute-docstring pair. The pair can be saved to our dict.
+        for a, b in itertools.pairwise(root_node.body):
+            a_cond = isinstance(a, ast.AnnAssign) and isinstance(a.target, ast.Name)
+            b_cond = isinstance(b, ast.Expr) and isinstance(b.value, ast.Constant)
+
+            if a_cond and b_cond:
+                # These should be guaranteed by typechecks above, but assert just in case
+                assert isinstance(a.target.id, str), "Dataclass attribute not in the expected format. Name is not a string."
+                assert isinstance(b.value.value, str), "Dataclass attribute docstring is not a string."
+
+                # Formatting
+                docstring = inspect.cleandoc(b.value.value)
+                docstring = ' '.join(docstring.split())
+
+                field_docstrings[a.target.id] = docstring
+
+        # recurse on parent class
+        base_classes = src_cfg_class.__bases__
+        if len(base_classes) > 0:
+            parent_class = base_classes[0]
+            if parent_class.__name__ not in builtins.__dict__:
+                field_docstrings.update(self._get_field_docstrings(base_classes[0]))
+
+        return field_docstrings
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 52b41791c72..46f3c28b1da 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -15,14 +15,10 @@
 from packaging.version import Version as PkgVersion
 
 from megatron.core.dist_checkpointing.validation import StrictHandling
-from megatron.core.models.retro.utils import (
-    get_config_path as get_retro_config_path,
-    get_gpt_data_dir as get_retro_data_dir,
-)
 from megatron.core.rerun_state_machine import RerunStateMachine
 from megatron.core.transformer import MLATransformerConfig, TransformerConfig
 from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
-from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 from megatron.core.transformer.heterogeneous.heterogeneous_config import (
     HeterogeneousTransformerConfig,
     MLPConfig,
@@ -34,7 +30,6 @@
 )
 from megatron.core.activations import squared_relu
 from megatron.core.fusions.fused_bias_geglu import quick_gelu
-from megatron.training.dist_signal_handler import SIGNAL_MAP
 from megatron.training.utils import (
     get_device_arch_version,
     update_use_dist_ckpt,
@@ -48,6 +43,8 @@
     load_quantization_recipe,
 )
 
+from megatron.training.argument_utils import ArgumentGroupFactory
+
 def add_megatron_arguments(parser: argparse.ArgumentParser):
     """"Add Megatron-LM arguments to the given parser."""
 
@@ -69,18 +66,17 @@ def add_megatron_arguments(parser: argparse.ArgumentParser):
     parser = _add_vision_args(parser)
     parser = _add_moe_args(parser)
     parser = _add_mla_args(parser)
+    parser = _add_experimental_attention_variant_args(parser)
     parser = _add_heterogeneous_args(parser)
     parser = _add_logging_args(parser)
     parser = _add_straggler_detector_args(parser)
     parser = _add_workload_inspector_server_args(parser)
     parser = _add_inference_args(parser)
     parser = _add_transformer_engine_args(parser)
-    parser = _add_retro_args(parser)
     parser = _add_experimental_args(parser)
     parser = _add_one_logger_args(parser)
     parser = _add_inprocess_restart_args(parser)
     parser = _add_ft_package_args(parser)
-    parser = _add_config_logger_args(parser)
     parser = _add_rerun_machine_args(parser)
     parser = _add_msc_args(parser)
     parser = _add_kitchen_quantization_arguments(parser)
@@ -121,7 +117,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     if not args.enable_msc:
         MultiStorageClientFeature.disable()
         assert MultiStorageClientFeature.is_enabled() is False
-        print('WARNING: The MSC feature is disabled.')
+        warn_rank_0('The MSC feature is disabled.')
 
     return args
 
@@ -162,7 +158,7 @@ def validate_model_config_args_from_heterogeneous_config(args):
     )
 
     n_kv_heads_in_group = [
-        config["attention"]["n_heads_in_group"] for config in hf_config_dict.block_configs 
+        config["attention"]["n_heads_in_group"] for config in hf_config_dict.block_configs
         if config["attention"]["n_heads_in_group"] is not None
     ]
     assert all(num == n_kv_heads_in_group[0] for num in n_kv_heads_in_group), "num query head must be consistent across all layers"
@@ -199,81 +195,6 @@ def validate_model_config_args_from_heterogeneous_config(args):
             f"Arguments differ from heterogeneous config: {incompatible_args_str}"
         )
 
-
-def load_retro_config(retro_project_dir):
-    '''Load Retro's config.json.'''
-
-    # Retro config path.
-    retro_config_path = get_retro_config_path(retro_project_dir)
-    assert os.path.exists(retro_config_path), \
-        "Retro project dir missing config.json."
-
-    # Load retro config.
-    with open(retro_config_path) as f:
-        retro_config = types.SimpleNamespace(**json.load(f))
-
-    return retro_config
-
-
-def load_retro_args(args):
-    """Load predefined args from Retro config (if applicable).
-
-    When using Retro (or GPT for comparison purposes), data arguments are
-    overridden by the saved config.json within the Retro project directory. This
-    is to ensure that the data used for pretraining is consistent with the data
-    that was preprocessed using the Retro preprocessing pipeline (see
-    `tools/retro/preprocess_data.py`).
-    """
-
-    # Return if no project directory is specified.
-    if args.retro_project_dir is None:
-        return
-
-    # Load retro config.
-    retro_config = load_retro_config(args.retro_project_dir)
-
-    # Retro data path is relative to project dir (via hard or soft links).
-    data_dir = get_retro_data_dir(args.retro_project_dir)
-    data_path = list(retro_config.retro_gpt_data_path)
-    if len(data_path) % 2 == 0:
-        for i in range(len(data_path) - 1, -1, -2):
-            data_path[i] = os.path.join(data_dir, data_path[i])
-    else:
-        assert len(data_path) == 1
-        data_path[0] = os.path.join(data_dir, data_path[0])
-
-    # Update args.
-    args.data_cache_path = retro_config.retro_gpt_data_cache_path
-    args.data_path = data_path if args.data_path is None else args.data_path
-    args.eval_interval = retro_config.retro_gpt_eval_interval
-    args.eval_iters = retro_config.retro_gpt_eval_iters
-    args.global_batch_size = retro_config.retro_gpt_global_batch_size
-    args.max_position_embeddings = retro_config.retro_gpt_seq_length
-    args.merge_file = os.path.join(
-        args.retro_project_dir,
-        retro_config.retro_gpt_merge_file,
-    ) if retro_config.retro_gpt_merge_file is not None else None
-    args.seed = retro_config.retro_gpt_seed
-    args.seq_length = retro_config.retro_gpt_seq_length
-    args.tokenizer_model = os.path.join(
-        args.retro_project_dir,
-        retro_config.retro_gpt_tokenizer_model,
-    ) if retro_config.retro_gpt_tokenizer_model is not None else None
-    args.tokenizer_type = retro_config.retro_gpt_tokenizer_type
-    args.train_samples = retro_config.retro_gpt_train_samples
-    args.vocab_file = os.path.join(
-        args.retro_project_dir,
-        retro_config.retro_gpt_vocab_file,
-    ) if retro_config.retro_gpt_vocab_file is not None else None
-
-    # Retro-specific args.
-    args.retro_block_size = retro_config.retro_block_size
-    args.retro_chunk_length = retro_config.retro_gpt_chunk_length
-    args.retro_neighbor_dirs = retro_config.retro_neighbor_dirs
-    args.retro_split_preprocessing = retro_config.retro_gpt_split
-    args.retro_bert_tokenizer_type = retro_config.retro_bert_tokenizer_type
-    args.retro_bert_vocab_file = retro_config.retro_bert_vocab_file
-
 def _eval_pattern(pattern):
     """ Validate and evaluate a string containing a Python list expression """
     assert isinstance(pattern, str)
@@ -319,7 +240,7 @@ def moe_freq_type(x):
       This allows defining arbitrary patterns of expert and dense layers.
       The pattern length must match the total number of transformer layers.
       Examples:
-          "([0]+[1]*23)": 1 dense layer followed by 23 experts layers
+          "([0]+[1]*23)": 1 dense layer followed by 23 expert layers
           "([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.
     """
     if isinstance(x, int):
@@ -332,6 +253,31 @@ def moe_freq_type(x):
         # it's a single int but in str
         return int(x)
 
+def la_freq_type(x):
+    """Frequency between LA (linear attention) layers and SDPA (scaled dot-product attention) layers.
+
+    Accepts either:
+    - An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer
+    - A string "N": Same as above, but provided as a string
+    - A string containing a Python list expression that defines a custom pattern, e.g.:
+      "([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0]
+      where 1 indicates an LA layer and 0 indicates a SDPA layer.
+      This allows defining arbitrary patterns of LA and SDPA layers.
+      The pattern length must match the total number of transformer layers.
+      Examples:
+          "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers
+          "([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice.
+    """
+    if x is None or isinstance(x, int):
+        return x
+    assert isinstance(x, str)
+    if '[' in x:
+        # it's a custom pattern
+        return _eval_pattern(x)
+    else:
+        # it's a single int but in str
+        return int(x)
+
 def tuple_type(x):
     """
     Convert a string to a tuple of integers.
@@ -359,9 +305,6 @@ def validate_args(args, defaults={}):
     # validate model config args from heterogeneous config (if provided).
     validate_model_config_args_from_heterogeneous_config(args)
 
-    # Load saved args from Retro (if applicable).
-    load_retro_args(args)
-
     # Set args.use_dist_ckpt from args.ckpt_format.
     if args.use_legacy_models:
         assert args.ckpt_format == "torch", \
@@ -384,6 +327,14 @@ def validate_args(args, defaults={}):
     total_model_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size * args.context_parallel_size
     args.data_parallel_size = args.world_size // total_model_size
 
+    # Assert that `torch_memory_saver` is installed if offloading KV cache during RL.
+    if args.rl_offload_kv_cache_during_training:
+        try:
+            from torch_memory_saver import torch_memory_saver
+        except ImportError:
+            raise AssertionError("To use offload-kv-cache-during-training, `torch_memory_saver` must be installed. See https://github.com/fzyzcjy/torch_memory_saver.")
+        assert not args.inference_dynamic_batching_unified_memory_level, "The KV cache should not be instantiated in unified memory when it is offloaded during training."
+
     # Batch size checks if running RL.
     if args.perform_rl_step:
         assert not (args.rl_remove_kv_cache_during_training and args.rl_offload_kv_cache_during_training), \
@@ -392,6 +343,27 @@ def validate_args(args, defaults={}):
         assert not (args.rl_partial_rollouts and args.rl_remove_kv_cache_during_training), \
             "Cannot use both partial-rollouts and remove-kv-cache-during-training"
 
+        assert not (
+            args.rl_offload_inference_model_weights_when_idle
+            and args.rl_inference_model_unified_memory_level != 1
+        ), (
+            "--rl-offload-inference-model-weights-when-idle requires "
+            "--rl-inference-model-unified-memory-level=1."
+        )
+
+        # When using different EP sizes for inference and training (EP refit), the legacy
+        # GroupedMLP is not supported. Only SequentialMLP or TEGroupedMLP can be used.
+        if (
+            args.rl_inference_expert_model_parallel_size is not None
+            and args.rl_inference_expert_model_parallel_size != args.expert_model_parallel_size
+        ):
+            assert not args.moe_use_legacy_grouped_gemm, (
+                "Legacy GroupedMLP (--moe-use-legacy-grouped-gemm) is not supported when using "
+                "different expert parallelism sizes for inference and training. "
+                "Use SequentialMLP (default when --moe-grouped-gemm is not set) or "
+                "TEGroupedMLP (--moe-grouped-gemm without --moe-use-legacy-grouped-gemm)."
+            )
+
         args.grpo_samples_per_iteration = args.grpo_prompts_per_step * args.grpo_group_size
         num_generated_samples_per_inference_iteration = (
             args.grpo_samples_per_iteration * args.grpo_iterations)
@@ -412,20 +384,19 @@ def validate_args(args, defaults={}):
             assert args.save_interval % num_training_iterations_per_inference_iteration == 0, \
                 f"save_interval should be divisible by number of global batches per inference iteration."
         if args.rl_use_sequence_packing:
-            assert args.seq_length <= args.rl_sequence_packing_bin_size, \
-                f"rl_sequence_packing_bin_size should be larger than or equal to seq_length"
-
-    if args.rank == 0:
-        print('using world size: {}, data-parallel size: {}, '
-              'context-parallel size: {}, '
-              'hierarchical context-parallel sizes: {}, '
-              'tensor-model-parallel size: {}, '
-              'pipeline-model-parallel size: {}'.format(
-                  args.world_size, args.data_parallel_size,
-                  args.context_parallel_size,
-                  args.hierarchical_context_parallel_sizes,
-                  args.tensor_model_parallel_size,
-                  args.pipeline_model_parallel_size), flush=True)
+            assert args.micro_batch_size == 1, \
+                "micro_batch_size must be 1 when using sequence packing. To increase compute per micro batch increase the sequence length."
+
+    print_rank_0('using world size: {}, data-parallel size: {}, '
+                 'context-parallel size: {}, '
+                 'hierarchical context-parallel sizes: {}, '
+                 'tensor-model-parallel size: {}, '
+                 'pipeline-model-parallel size: {}'.format(
+                     args.world_size, args.data_parallel_size,
+                     args.context_parallel_size,
+                     args.hierarchical_context_parallel_sizes,
+                     args.tensor_model_parallel_size,
+                     args.pipeline_model_parallel_size))
 
     # Checks.
 
@@ -451,9 +422,8 @@ def validate_args(args, defaults={}):
     del args.model_parallel_size
 
     if args.checkpoint_activations:
-        if args.rank == 0:
-            print('--checkpoint-activations is no longer valid, use --recompute-activations, '
-                  'or, for more control, --recompute-granularity and --recompute-method.')
+        print_rank_0('--checkpoint-activations is no longer valid, use --recompute-activations, '
+                     'or, for more control, --recompute-granularity and --recompute-method.')
         exit()
     del args.checkpoint_activations
 
@@ -489,19 +459,16 @@ def validate_args(args, defaults={}):
         # arguments that are passed to the program. We check this by
         # ensuring the arg is set to None.
         if getattr(args, key, None) is not None:
-            if args.rank == 0:
-                print('WARNING: overriding default arguments for {key}:{v} \
-                       with {key}:{v2}'.format(key=key, v=defaults[key],
-                                               v2=getattr(args, key)),
-                                               flush=True)
+            warn_rank_0('Overriding default arguments for {key}:{v} '
+                        'with {key}:{v2}'.format(key=key, v=defaults[key],
+                                                 v2=getattr(args, key)))
         else:
             setattr(args, key, defaults[key])
 
     if args.data_path is not None and args.split is None:
         legacy_default_split_value = '969, 30, 1'
-        if args.rank == 0:
-            print('WARNING: Please specify --split when using --data-path. Using legacy default value '
-                  f'of "{legacy_default_split_value}"')
+        warn_rank_0('Please specify --split when using --data-path. Using legacy default value '
+                    f'of "{legacy_default_split_value}"')
         args.split = legacy_default_split_value
 
     use_data_path = (args.data_path is not None) or (args.data_args_path is not None)
@@ -518,14 +485,18 @@ def validate_args(args, defaults={}):
                    for elt in [args.train_data_path, args.valid_data_path, args.test_data_path]) is False or \
             args.per_split_data_args_path is None
 
+    if args.phase_transition_iterations:
+        args.phase_transition_iterations = sorted(
+            int(x.strip()) for x in args.phase_transition_iterations.split(",")
+        )
+        assert args.rampup_batch_size is None, "multi-phase training does not support batch size ramp-up"
+
     # Batch size.
     assert args.micro_batch_size is not None
     assert args.micro_batch_size > 0
     if args.global_batch_size is None:
         args.global_batch_size = args.micro_batch_size * args.data_parallel_size
-        if args.rank == 0:
-            print('setting global batch size to {}'.format(
-                args.global_batch_size), flush=True)
+        print_rank_0('setting global batch size to {}'.format(args.global_batch_size))
     assert args.global_batch_size > 0
 
     # Uneven virtual pipeline parallelism
@@ -598,7 +569,7 @@ def validate_args(args, defaults={}):
 
                 assert num_layers % args.transformer_pipeline_model_parallel_size == 0, \
                     'Number of layers should be divisible by the pipeline-model-parallel size'
-    
+
     if args.virtual_pipeline_model_parallel_size is not None:
         if args.overlap_p2p_comm:
             assert args.pipeline_model_parallel_size > 1, \
@@ -619,8 +590,9 @@ def validate_args(args, defaults={}):
                 'since non-interleaved schedule does not support overlapping p2p communication '
                 'and aligned param AG')
 
-    if args.rank == 0:
-        print(f"Number of virtual stages per pipeline stage: {args.virtual_pipeline_model_parallel_size}")
+    print_rank_0(
+        f"Number of virtual stages per pipeline stage: {args.virtual_pipeline_model_parallel_size}"
+    )
 
     if args.overlap_param_gather:
         assert args.use_distributed_optimizer or args.use_megatron_fsdp, \
@@ -658,7 +630,7 @@ def validate_args(args, defaults={}):
                 args.rank,
             )
         if args.fp4_param and not is_te_min_version("2.7.0.dev0"):
-            raise ValueError("--fp4-param requires Transformer Engine >= 2.7.0.dev0.")   
+            raise ValueError("--fp4-param requires Transformer Engine >= 2.7.0.dev0.")
 
     if args.overlap_param_gather_with_optimizer_step:
         assert args.use_distributed_optimizer, \
@@ -691,7 +663,7 @@ def validate_args(args, defaults={}):
     # FP4 param requires FP4 mode
     if args.fp4_param and not args.fp4:
         raise ValueError("--fp4-param-gather must be used together with --fp4-format.")
-    
+
     # FP4 requires TE >= 2.7.0.dev0
     if args.fp4 and not is_te_min_version("2.7.0.dev0"):
         raise ValueError("--fp4-format requires Transformer Engine >= 2.7.0.dev0 for NVFP4BlockScaling support.")
@@ -717,6 +689,13 @@ def validate_args(args, defaults={}):
 
         assert args.ckpt_format == "fsdp_dtensor", \
             "Megatron FSDP only supports fsdp_dtensor checkpoint format"
+        
+    if args.fsdp_manual_registration:
+        assert args.use_megatron_fsdp, "FSDP manual registration is only supported with Megatron FSDP"
+        assert args.nccl_ub, "FSDP manual registration is only supported with nccl-ub option"
+
+        if args.use_megatron_fsdp:
+            args.reuse_grad_buf_for_mxfp8_param_ag = False
 
     # Parameters dtype.
     args.params_dtype = torch.float
@@ -727,9 +706,8 @@ def validate_args(args, defaults={}):
         # where NaNs in grads / loss are signal to the loss scaler.
         if not args.loss_scale:
             args.check_for_nan_in_loss_and_grad = False
-            if args.rank == 0:
-                print('WARNING: Setting args.check_for_nan_in_loss_and_grad to False since '
-                      'dynamic loss scaling is being used')
+            warn_rank_0('Setting args.check_for_nan_in_loss_and_grad to False since '
+                        'dynamic loss scaling is being used')
     if args.bf16:
         assert not args.fp16
         args.params_dtype = torch.bfloat16
@@ -743,10 +721,8 @@ def validate_args(args, defaults={}):
             args.accumulate_allreduce_grads_in_fp32 = False
         elif not args.accumulate_allreduce_grads_in_fp32 and args.main_grads_dtype == torch.float32:
             args.accumulate_allreduce_grads_in_fp32 = True
-            if args.rank == 0:
-                print('accumulate and all-reduce gradients in fp32 for '
-                      'bfloat16 data type.', flush=True)
-    if args.cuda_graph_impl == "local" and args.cuda_graph_scope=="full_iteration":
+            print_rank_0('accumulate and all-reduce gradients in fp32 for bfloat16 data type.')
+    if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
         if not args.inference_dynamic_batching:
             assert not args.check_for_nan_in_loss_and_grad, \
             "--no-check-for-nan-in-loss-and-grad should be set with full_iteration CUDA graph"
@@ -754,9 +730,7 @@ def validate_args(args, defaults={}):
             assert args.fp8 is None, \
             "fp8 is not supported with inference dynamic batching and full_iteration CUDA graph"
 
-    if args.rank == 0:
-        print('using {} for parameters ...'.format(args.params_dtype),
-              flush=True)
+    print_rank_0('using {} for parameters ...'.format(args.params_dtype))
 
     if args.dataloader_type is None:
         args.dataloader_type = 'single'
@@ -866,6 +840,8 @@ def validate_args(args, defaults={}):
         if args.save_retain_interval is not None:
             assert args.save_retain_interval > 0
             assert args.save_retain_interval % args.save_interval == 0
+    if args.log_memory_interval is not None:
+        assert args.log_memory_interval % args.log_interval == 0
     # Mixed precision checks.
     if args.fp16_lm_cross_entropy:
         assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
@@ -874,10 +850,20 @@ def validate_args(args, defaults={}):
             'residual connection in fp32 only supported when using fp16 or bf16.'
 
     if args.moe_grouped_gemm:
-        assert args.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
         dc = torch.cuda.get_device_capability()
         assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels."
 
+    if args.no_weight_decay_cond_type is not None:
+        print_rank_0(
+            'WARNING: --no-weight-decay-cond-type is deprecated. Please use --apply-wd-to-qk-layernorm instead.',
+            args.rank,
+        )
+        if args.no_weight_decay_cond_type == "apply_wd_to_qk_layernorm":
+            args.apply_wd_to_qk_layernorm = True
+        else:
+            raise ValueError(f"Invalid no_weight_decay_cond_type: {args.no_weight_decay_cond_type}")
+        args.no_weight_decay_cond_type = None
+
     if args.weight_decay_incr_style == 'constant':
         assert args.start_weight_decay is None
         assert args.end_weight_decay is None
@@ -890,10 +876,9 @@ def validate_args(args, defaults={}):
     # Persistent fused layer norm.
     if not is_torch_min_version("1.11.0a0"):
         args.no_persist_layer_norm = True
-        if args.rank == 0:
-            print('Persistent fused layer norm kernel is supported from '
-                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
-                  'Defaulting to no_persist_layer_norm=True')
+        print_rank_0('Persistent fused layer norm kernel is supported from '
+                     'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+                     'Defaulting to no_persist_layer_norm=True')
 
     # Activation recomputing.
     if args.distribute_saved_activations:
@@ -930,6 +915,13 @@ def validate_args(args, defaults={}):
     if args.tp_comm_overlap:
         assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
 
+    if args.hybrid_context_parallel:
+        assert not args.pipeline_model_parallel_size > 1, 'Hybrid context parallelism not supported with pipeline parallelism'
+        assert not args.enable_cuda_graph, 'Hybrid context parallelism not supported with CUDA Graph'
+        assert not args.use_megatron_fsdp, 'Hybrid context parallelism not supported with Megatron FSDP'
+        assert args.dataloader_type == 'single', 'Hybrid context parallelism only supported with single dataloader type'
+        assert args.calculate_per_token_loss, 'Hybrid context parallelism must be used with --calculate-per-token-loss'
+
     # disable async_tensor_model_parallel_allreduce when
     # model parallel memory optimization is enabled
     if (args.tensor_model_parallel_size > 1 or args.context_parallel_size > 1) \
@@ -977,20 +969,18 @@ def validate_args(args, defaults={}):
     if args.add_bias_linear:
         args.add_qkv_bias = True
 
-    # Retro checks.
-    if args.retro_add_retriever:
-
-        # Train samples should be auto-loaded.
-        assert args.train_samples is not None, \
-            "args.train_samples should be auto-loaded from the retro config."
-
-        # Sequence parallelism unsupported.
-        assert not args.sequence_parallel, \
-            "retro currently does not support sequence parallelism."
+    if args.qk_clip:
+        assert is_te_min_version("2.9.0"), \
+            '--qk-clip is only supported with TE >= 2.9.0.'
+        assert 0.0 < args.qk_clip_alpha < 1.0, \
+            '--qk-clip-alpha must be between 0.0 and 1.0 when using --qk-clip.'
+        assert args.qk_clip_threshold > 0, \
+            '--qk-clip-threshold must be greater than 0 when using --qk-clip.'
 
-        # Pipeline parallelism unsupported.
-        assert args.pipeline_model_parallel_size == 1, \
-            "retro currently does not support pipeline parallelism."
+    # decoupled log max attention logit check
+    if args.log_max_attention_logit:
+        assert is_te_min_version("2.9.0"), \
+            '--log-max-attention-logit is only supported with TE >= 2.9.0.'
 
     if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
         assert not args.use_legacy_models, \
@@ -1025,7 +1015,7 @@ def validate_args(args, defaults={}):
         args.num_experts = None
     if args.num_experts is not None and args.moe_ffn_hidden_size is None:
         args.moe_ffn_hidden_size = args.ffn_hidden_size
-        print("Warning: moe_ffn_hidden_size is not set, using ffn_hidden_size for MoE instead.")
+        warn_rank_0("moe_ffn_hidden_size is not set, using ffn_hidden_size for MoE instead.")
 
     # Context parallel
     if args.context_parallel_size > 1:
@@ -1036,8 +1026,6 @@ def validate_args(args, defaults={}):
         assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism"
         assert args.num_experts % args.expert_model_parallel_size == 0, \
             "Number of experts should be a multiple of expert model parallel_size."
-        assert not args.fp16, \
-            "Expert parallelism is not supported with fp16 training."
 
     # MoE router check
     if isinstance(args.moe_router_load_balancing_type, list) and len(args.moe_router_load_balancing_type) == 1:
@@ -1147,6 +1135,14 @@ def validate_args(args, defaults={}):
     if args.load_main_params_from_ckpt:
         assert args.no_load_optim, '--load-main-params-from-ckpt must be used with --no-load-optim.'
 
+    if args.use_dist_ckpt and args.async_save:
+        if not args.use_persistent_ckpt_worker:
+            warn_rank_0(
+                '--async-save is not supported without --use-persistent-ckpt-worker. '
+                'Disabling --async-save.'
+            )
+            args.async_save = False
+
     # Inference args
     if args.inference_batch_times_seqlen_threshold > -1:
         assert args.pipeline_model_parallel_size > 1, \
@@ -1159,15 +1155,36 @@ def validate_args(args, defaults={}):
         assert args.inference_dynamic_batching_buffer_size_gb is not None
         assert args.inference_dynamic_batching_block_size % 256 == 0, "block size should be a multiple of 256"
 
+    if args.cuda_graph_impl == "local" and args.expert_model_parallel_size > 1:
+       assert args.moe_pad_experts_for_cuda_graph_inference, \
+        "--moe-pad-experts-for-cuda-graph-inference must be set when using CUDA graphs with expert parallelism"
+
     # MoE upcycling check
     if args.moe_use_upcycling:
         assert args.save is not None, "When using upcycling, the --save option must be specified."
         if not args.no_load_optim:
             args.no_load_optim = True
-            print('Warning: disabling --no-load-optim for upcycling.')
+            warn_rank_0('enabling --no-load-optim for upcycling.')
         if not args.no_load_rng:
             args.no_load_rng = True
-            print('Warning: disabling --no-load-rng for upcycling.')
+            warn_rank_0('enabling --no-load-rng for upcycling.')
+
+    # --skip-train checks.
+    if args.skip_train and not args.no_load_optim:
+        args.no_load_optim = True
+        warn_rank_0('enabling --no-load-optim when skipping training.')
+
+    # Muon optimizer check
+    if 'muon' in args.optimizer:
+
+        # TODO: remove these checks once we support them
+        assert not args.overlap_grad_reduce, "Muon optimizer does not support overlap grad reduce for now."
+        assert not args.overlap_param_gather, "Muon optimizer does not support overlap param gather for now."
+
+        assert not args.use_distributed_optimizer, "Muon optimizer does not support distributed optimizer for now."
+        assert not args.use_torch_fsdp2, "Muon optimizer does not support Torch-FSDP2 for now."
+        assert not args.use_megatron_fsdp, "Muon optimizer does not support Megatron-FSDP for now."
+        assert args.ckpt_format in ["torch", "torch_dist"], "Muon optimizer supports torch and torch_dist checkpoint format."
 
     # Optimizer CPU offload check
     if args.optimizer_cpu_offload:
@@ -1186,7 +1203,7 @@ def validate_args(args, defaults={}):
         assert args.replication_jump is not None, "--replication requires the value of --replication-jump!"
         assert args.non_persistent_ckpt_type == "local", f"--replication requires args.non_persistent_ckpt_type == 'local', but got: {args.non_persistent_ckpt_type}"
     elif args.replication_jump:
-        print("Warning: --replication-jump was specified despite not using replication. Ignoring.")
+        warn_rank_0("--replication-jump was specified despite not using replication. Ignoring.")
         args.replication_jump = None
 
     if args.delay_wgrad_compute:
@@ -1202,6 +1219,13 @@ def validate_args(args, defaults={}):
                 "when enabling delay_wgrad_compute"
             )
 
+    if args.fine_grained_activation_offloading:
+        assert args.transformer_impl == 'transformer_engine', \
+            "Fine-grained activation offloading is only supported with transformer_engine implementation"
+        if is_te_min_version("2.10.0"):
+            assert os.getenv("NVTE_CPU_OFFLOAD_V1", "0") == "1", \
+                "For fine-grained activation offloading with TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 should be set to 1 to avoid offloading weights."
+
     if args.mtp_num_layers:
         assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)."
         assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", (
@@ -1214,16 +1238,46 @@ def validate_args(args, defaults={}):
 
     # CUDA Graphs
     if args.cuda_graph_impl != "none":
-        if args.transformer_impl == 'transformer_engine' and not args.te_rng_tracker:
+        if (
+            "transformer_engine" in (args.transformer_impl, args.cuda_graph_impl)
+            and not args.te_rng_tracker
+        ):
             args.te_rng_tracker = True
             warn_rank_0("te_rng_tracker is not enabled, enabling it for CUDA graphs.", args.rank)
-        assert "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", ""), (
-            "expandable_segments:True may not be safe when using CUDA Graphs with some specific parallel settings. "
-            "The training may crash with illegal memory access."
+        if args.cuda_graph_impl == "transformer_engine":
+            assert (
+                "expandable_segments:True" not in os.getenv("PYTORCH_CUDA_ALLOC_CONF", "")
+                or os.getenv("NCCL_GRAPH_REGISTER", "") == "0"
+            ), (
+                "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using "
+                "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True."
+            )
+    if args.cuda_graph_scope == "full" or (
+        isinstance(args.cuda_graph_scope, list) and "full" in args.cuda_graph_scope
+    ):
+        if isinstance(args.cuda_graph_scope, list):
+            assert args.cuda_graph_scope == ["full"], "full scope cannot be used with other scopes."
+        args.cuda_graph_scope = []
+        warn_rank_0(
+            'full scope is deprecated. Use empty cuda_graph_scope to capture the whole layer.'
         )
-        assert (
-            args.recompute_granularity != 'full'
-        ), 'recompute_granularity must not be full when CUDA Graphs are enabled.'
+    
+    if args.multi_latent_attention:
+        assert not args.group_query_attention, "Group query attention is mutually exclusive with multi latent attention."
+
+    # MoE latent projections
+    if args.moe_latent_size is not None:
+        assert args.moe_latent_size > 0, "MoE latent projection dimension has to be greater than zero."
+        assert args.num_experts is not None, "MoE latent projections are applicable only for MoE models."
+        assert not args.use_legacy_models, "MoE latent projections are only supported for mcore models."
+        assert not args.moe_use_legacy_grouped_gemm, "MoE latent projection is not supported yet with legacy grouped GEMM."
+
+    if args.tiktoken_special_tokens and not args.tokenizer_special_tokens:
+        warn_rank_0(
+            "--tiktoken-special-tokens argument is deprecated and will be removed soon. "
+            "Use --tokenizer-special-tokens instead."
+        )
+        args.tokenizer_special_tokens = args.tiktoken_special_tokens
 
     # Print arguments.
     _print_args("arguments", args)
@@ -1233,17 +1287,16 @@ def validate_args(args, defaults={}):
 
 def _print_args(title, args):
     """Print arguments."""
-    if args.rank == 0:
-        print(f'------------------------ {title} ------------------------',
-              flush=True)
+    from megatron.training.utils import is_rank0
+    if is_rank0():
+        print(f'------------------------ {title} ------------------------', flush=True)
         str_list = []
         for arg in vars(args):
             dots = '.' * (48 - len(arg))
             str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
         for arg in sorted(str_list, key=lambda x: x.lower()):
             print(arg, flush=True)
-        print(f'-------------------- end of {title} ---------------------',
-              flush=True)
+        print(f'-------------------- end of {title} ---------------------', flush=True)
 
 
 def _check_arg_is_not_none(args, arg):
@@ -1257,7 +1310,7 @@ def core_transformer_config_from_args(args, config_class=None):
 
     if args.multi_latent_attention:
         config_class = MLATransformerConfig
-    
+
     if args.heterogeneous_layers_config_path is not None:
         assert not args.multi_latent_attention, "Multi latent attention with heterogeneous layers is not supported."
         config_class = HeterogeneousTransformerConfig
@@ -1268,8 +1321,6 @@ def core_transformer_config_from_args(args, config_class=None):
         if hasattr(args, f.name):
             kw_args[f.name] = getattr(args, f.name)
     kw_args['persist_layer_norm'] = not args.no_persist_layer_norm
-    kw_args['layernorm_zero_centered_gamma'] = args.apply_layernorm_1p
-    kw_args['layernorm_epsilon'] = args.norm_epsilon
     kw_args['deallocate_pipeline_outputs'] = True
     kw_args['pipeline_dtype'] = args.params_dtype
     kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm
@@ -1324,6 +1375,17 @@ def core_transformer_config_from_args(args, config_class=None):
         kw_args['use_kitchen'] = True
         kw_args['quant_recipe'] = kitchen_quantization_recipe_config(args.kitchen_recipe_number)
 
+    kw_args['moe_latent_size'] = args.moe_latent_size
+
+    if args.te_precision_config_file:
+        assert not 'quant_recipe' in kw_args, "Quantization recipe already configured."
+        # TODO(kwyss): Prohibit fp8_params or fp4_params with this flexibility
+        kw_args['quant_recipe'] = load_quantization_recipe(args.te_precision_config_file)
+
+    if hasattr(args, "use_kitchen_attention"):
+        kw_args['use_kitchen_attention'] = args.use_kitchen_attention
+    if hasattr(args, "kitchen_attention_backend"):
+        kw_args['kitchen_attention_backend'] = args.kitchen_attention_backend
 
     # Return config.
     return config_class(**kw_args)
@@ -1332,71 +1394,15 @@ def core_transformer_config_from_args(args, config_class=None):
 def _add_transformer_engine_args(parser):
     group = parser.add_argument_group(title='Transformer-Engine')
 
-    group.add_argument('--fp8-format', default=None,
-                       choices=['e4m3', 'hybrid'],
-                       help='Which fp8 format scheme to use for FP8 tensors in the forward and backward pass',
-                       dest='fp8')
-    # per tensor current scaling recipe selection
-    group.add_argument('--fp8-recipe', default='delayed',
-                       choices=['tensorwise', 'delayed', 'mxfp8', 'blockwise', 'custom'],
-                       help='Which fp8 recipe to use for FP8 tensors in the forward and backward pass',
-                       dest='fp8_recipe')
-    group.add_argument('--fp8-quantizer-factory', default=None,
-                       help='Python import path to a callable quantizer factory, '
-                            'e.g., package.module.quantizer_factory.',
-                       dest='fp8_quantizer_factory')
     # delayed scaling only configs
-    group.add_argument('--fp8-margin', type=int, default=0,
-                       help='Scaling margin for fp8',
-                       dest='fp8_margin')
-    group.add_argument('--fp8-interval', type=int, default=1,
-                       help='DEPRECATED. This flag is ignored. Scaling update interval for fp8',
-                       dest='fp8_interval')
-    group.add_argument('--fp8-amax-history-len', type=int, default=1,
-                       help='Number of steps for which amax history is recorded per tensor',
-                       dest='fp8_amax_history_len')
-    group.add_argument('--fp8-amax-compute-algo', default='most_recent',
-                       choices=['most_recent', 'max'],
-                       help='Algorithm for computing amax from history',
-                       dest='fp8_amax_compute_algo')
-    group.add_argument('--no-fp8-wgrad', action='store_false',
-                       help='Execute wgrad in higher precision even for FP8 runs',
-                       dest='fp8_wgrad')
-    group.add_argument('--transformer-impl', default='transformer_engine',
-                       choices=['local', 'transformer_engine', 'inference_optimized'],
-                       help='Which Transformer implementation to use.')
     group.add_argument('--fp8-param-gather', action='store_true',
                        help='Keep the compute param in fp8 (do not use any other intermediate '
                             'dtype) and perform the param all-gather in fp8.')
-    group.add_argument('--first-last-layers-bf16', action='store_true',
-                       help='Construct first and last layers in bf16 when doing FP8 training.')
-    group.add_argument('--num-layers-at-start-in-bf16', type=int, default=1,
-                       help='Number of layers at start to construct in bf16 when --first-last-layers-bf16 is enabled.')
-    group.add_argument('--num-layers-at-end-in-bf16', type=int, default=1,
-                       help='Number of layers at end to construct in bf16 when --first-last-layers-bf16 is enabled.')
-    
+
     # FP4 related arguments
-    group.add_argument('--fp4-format', default=None,
-                       choices=['e2m1'],
-                       help='Which nvfp4 format scheme to use for FP4 tensors in the forward and backward pass',
-                       dest='fp4')
-    group.add_argument('--fp4-recipe', default='nvfp4',
-                       choices=['nvfp4', 'custom'],
-                       help='Which fp4 recipe to use for FP4 tensors in the forward and backward pass',
-                       dest='fp4_recipe')
-    group.add_argument('--fp4-quantizer-factory', default=None,
-                       help='Python import path to a callable quantizer factory, '
-                            'e.g., package.module.quantizer_factory.',
-                       dest='fp4_quantizer_factory')
-    group.add_argument('--fp4-param-gather', action='store_true',
-                       help='Keep the compute param in fp4 (do not use any other intermediate '
-                            'dtype) and perform the param all-gather in fp4.',
-                       dest='fp4_param')
-    group.add_argument('--te-rng-tracker', action='store_true', default=False,
-                       help='Use the Transformer Engine version of the random number generator. '
-                            'Required for CUDA graphs support.')
-    group.add_argument('--inference-rng-tracker', action='store_true', default=False,
-                       help='Use a random number generator configured for inference.')
+    group.add_argument('--te-precision-config-file', default=None,
+                       help='Configuration file to select per-module precision overrides. '
+                       'See TransformerEngineMixedPrecision.md')
     return parser
 
 def _add_inference_args(parser):
@@ -1421,32 +1427,20 @@ def _add_inference_args(parser):
                        choices=["megatron", "huggingface"],
                        help='Select either Megatron or Huggingface as the '
                        'Bert embedder.')
-    group.add_argument('--flash-decode', default=False, action="store_true",
-                       help='Whether to use the flash decoding kernel.')
-    group.add_argument('--enable-cuda-graph', default=False, action="store_true",
-                       help='Deprecated. Use --cuda-graph-impl=local instead. '
-                       'Use local implementation of CUDA graph capture and replay. '
-                       '--cuda-graph-scope=\"full_iteration\" enables whole iteration CUDA graph. ')
-    group.add_argument("--cuda-graph-warmup-steps", type=int, default=3,
-                       help="Number of CUDA graph warmup steps")
-    group.add_argument('--external-cuda-graph', action='store_true',
-                       help='Deprecated. Use --cuda-graph-impl=transformer_engine instead. '
-                       'Use TE make_graphed_callables() to capture the CUDA graph.')
-    group.add_argument('--cuda-graph-impl', type=str, default='none',
-                       choices=['none', 'local', 'transformer_engine'],
-                       help='Determines the CUDA graph capture implementation. '
-                       '"none": no CUDA graph. '
-                       '"local": capture the CUDA graph using MCore local implementation. --cuda-graph-scope=\"full_iteration\" enables whole iteration CUDA graph. '
-                       '"transformer_engine": capture the CUDA graph using TE make_graphed_callables().')
-    group.add_argument('--cuda-graph-scope', type=str, default='full',
-                       choices=['full', 'attn', 'full_iteration'],
-                       help='Determines the CUDA graphs capturing scope. Valid values are '
-                       '\"full\", \"attn\" and \"full_iteration\". \"Full\" scope captures a whole '
-                       'Transformer layer. \"Attn\" scope only captures operations in '
-                       'TransformerLayer._forward_attention(). \"ful_iteration\" scope captures a '
-                       'whole iteration. '
-                       'full_iteration scope is only supported with --cuda-graph-impl=local, '
-                       'attn scope is only supported with --cuda-graph-impl=transformer_engine.')
+    group.add_argument('--cuda-graph-scope', nargs='+', type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope, default=[],
+                       help='Determines the CUDA graphs capturing scope. '
+                       'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". '
+                       '"attn": captures operations in TransformerLayer._forward_attention(). '
+                       '"mlp": captures operations in TransformerLayer._forward_mlp() for a dense layer. '
+                       '"moe": captures operations in TransformerLayer._forward_mlp() for a MoE layer. '
+                       '"moe_router": captures operations in TransformerLayer._forward_mlp() up to MoELayer.router(), '
+                       'including the shared experts if they are not overlapped with EP comm. '
+                       '"moe_preprocess": captures operations in MoELayer.preprocess(). Must be used together with "moe_router". '
+                       '"mamba": captures the mamba layer. '
+                       '"full_iteration": captures a whole iteration. '
+                       'full_iteration scope is only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. '
+                       'If not specified, the default scope is to capture the whole Transformer layer. '
+                       'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.')
     group.add_argument('--use-legacy-static-engine', action='store_true', default=False,
                        help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)',
                        dest='use_legacy_static_engine')
@@ -1472,11 +1466,23 @@ def _add_inference_args(parser):
                        'If the UVM level is 0, then only GPU memory is used and '
                        'the total memory equals `buffer_size_gb`. If the UVM '
                        'level is 1, then additional memory is utilized on the '
-                       'CPU and the total memory equals `2 * buffer_size_gb`.')
+                       'CPU and the total memory equals `buffer_size_gb + '
+                       'paused_buffer_size_gb`.')
+    group.add_argument('--inference-dynamic-batching-paused-buffer-size-gb',
+                       type=float, default=None,
+                       help='Amount of memory reserved for paused requests in '
+                       'the dynamic inference context. Active requests are '
+                       'paused when there are not enough active blocks available '
+                       'to continue generating a request.')
     group.add_argument('--inference-dynamic-batching-block-size',
                        type=int, default=256,
                        help='KV cache block size. '
                        'It should be a multiple of 256')
+    group.add_argument('--inference-dynamic-batching-max-requests',
+                       type=int, default=None,
+                       help='Override the inference context\'s `max_requests`. '
+                       'By default, `max_requests` is set to the number of '
+                       'blocks in the context\'s memory buffer.')
     group.add_argument('--inference-dynamic-batching-max-tokens',
                        type=int, default=None,
                        help='Override the inference context\'s default `max_tokens`.')
@@ -1496,114 +1502,133 @@ def _add_inference_args(parser):
                        action='store_true', default=False,
                        help='Only use cuda graphs for decode-only steps, not prefill and mixed steps.')
     group.add_argument('--inference-dynamic-batching-unified-memory-level',
-                       type=int, default=1, choices=[0, 1],
+                       type=int, default=0, choices=[0, 1],
                        help='Set unified memory usage within the dynamic '
                        'inference context. The levels are: 0) no unified memory, '
                        '1) allocate `memory_buffer` in unified memory. '
                        'Eventually, additional levels will be included to '
                        'control other tensors within the context.')
-    group.add_argument('--symmetric-ar-type', type=str, default=None,
-                       choices=['two_shot', "one_shot", "multimem_all_reduce", None],
-                       help='What type of symmetric all reduce to use. The default is none which is no use of symetric memory')
     group.add_argument('--nccl-all-reduce-for-prefill',
                        action='store_true', default=False,
                        help='When using symmeric all reduce kernels this will use regular nccl kernels for prefill. This can be more effecient when prefill is large as the nccl kernels can be more bandwith optimized')
-    group.add_argument('--mlp-chunks-for-prefill', type=int, default=1,
-                       help='Number of chunks along sequence dimension for MLP '
-                       'computation during prefill')
-    group.add_argument('--disable-chunked-prefill', default=False, action="store_true",
-                       help='Disable chunked prefill (chunked prefill is enabled by default).')  
+    # TODO(ksanthanam): Clean this up in future PR
+    group.add_argument('--enable-chunked-prefill', dest='disable_chunked_prefill',
+                       action='store_false', default=True,
+                       help="Enable chunked prefill (disabled by default)")
+    group.add_argument('--disable-chunked-prefill', dest='disable_chunked_prefill',
+                       action='store_true', help=argparse.SUPPRESS)
     group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens',
                        type=int, default=16384,
                        help='Maximum number of tokens to capture in a cuda graph.')
     group.add_argument('--inference-dynamic-batching-cuda-graph-mixed-prefill-count',
                        type=int, default=16,
                        help='Number of mixed prefill requests to capture in a cuda graph.')
-    group.add_argument('--inference-wandb-logging-step-interval', type=int, default=0,
-                       help='Step interval for logging inference metrics to wandb. '
-                            'Default to 0 to disable inference wandb logging.')
+    group.add_argument('--inference-logging-step-interval', type=int, default=0,
+                       help='Step interval for logging inference metrics. '
+                            'Default to 0 to disable inference logging.')
+    group.add_argument('--inference-wandb-logging', action=argparse.BooleanOptionalAction,
+                       required=False, default=False, help='Enable inference wandb logging.')
     group.add_argument("--inference-coordinator-port", type=int, default=12346,
                        help="This port will be used to setup the inference coordinator on node-0")
     return parser
 
 
-def _add_retro_args(parser):
-    group = parser.add_argument_group(title='retro')
-
-    group.add_argument('--retro-project-dir', default=None,
-                       help='Retro project directory, which contains the '
-                       'preprocessed data for pretraining. This directory '
-                       'is built during preprocessing (see '
-                       'tools/retro/README.md), and contains subdirectories '
-                       'for the chunk database and pretraining neighbors.')
-    group.add_argument('--retro-add-retriever',
-                       action='store_true', default=False,
-                       help='Add a retriever to the transformer, for use in '
-                       'pretraining a Retro model.')
-    group.add_argument('--retro-cyclic-train-iters', type=int, default=None,
-                       help='Set number of training iterations for cyclic '
-                       'Retro training.')
-    group.add_argument('--retro-encoder-layers', type=int, default=2,
-                       help='Number of layers to use for the retrieval '
-                       'encoder.')
-    group.add_argument('--retro-encoder-hidden-dropout',
-                       type=float, default=0.1, help='Hidden dropout for '
-                       'retrieval encoder.')
-    group.add_argument('--retro-encoder-attention-dropout',
-                       type=float, default=0.1, help='Attention dropout for '
-                       'retrieval encoder.')
-    group.add_argument("--retro-num-neighbors", type=int, default=2,
-                       help='Number of neighbors to retrieve during '
-                       'pretraining.')
-    group.add_argument("--retro-num-retrieved-chunks", type=int, default=2,
-                       help='Number of chunks to retrieve from the retrieval '
-                       'database.')
-    group.add_argument("--retro-attention-gate", type=float, default=1,
-                       help="Gated cross attention.")
-    group.add_argument("--retro-no-verify-neighbor-count", action="store_false",
-                       dest="retro_verify_neighbor_count",
-                       help="Skip verifying that len(GPT dataset) == len(saved "
-                       "neighbors).")
-
-    # Enforce argument naming convention.
-    for action in group._group_actions:
-        prefix = action.dest.split("_")[0]
-        assert prefix == "retro", \
-            "Retro args must be prefixed with '--retro-*', for consistent " \
-            "styling. Please fix '%s'." % ", ".join(action.option_strings)
-
-    return parser
-
-
 def _add_network_size_args(parser):
+    exclude = [
+        # cannot provide callables over CLI
+        "timers",
+        "finalize_model_grads_func",
+        "grad_scale_func",
+        "no_sync_func",
+        "grad_sync_func",
+        "param_sync_func",
+        "_cpu_offloading_context",
+        "init_method",
+        "output_layer_init_method",
+        "embedding_init_method",
+        "activation_func",
+        # types affect docstring
+        "pipeline_model_parallel_layout",
+        "window_size",
+        "window_attn_skip_freq",
+        "no_rope_freq",
+        "moe_layer_freq",
+        "linear_attention_freq",
+        "moe_router_load_balancing_type",
+        "moe_aux_loss_coeff",
+        "cp_comm_type",
+        "cuda_graph_scope",
+        # no CLI argument exists for these
+        "virtual_pipeline_model_parallel_size",
+        "params_dtype",
+        "enable_autocast",
+        "autocast_dtype",
+        "num_microbatches_with_partial_activation_checkpoints",
+        "tp_comm_overlap_disable_qkv",
+        "tp_comm_overlap_disable_fc1",
+        "pipeline_dtype",
+        "variable_seq_lengths",
+        "batch_p2p_comm",
+        "batch_p2p_sync",
+        "deallocate_pipeline_outputs",
+        "cpu_offloading",
+        "cpu_offloading_activations",
+        "cpu_offloading_weights",
+        "cpu_offloading_double_buffering",
+        "num_layers_in_first_pipeline_stage",
+        "num_layers_in_last_pipeline_stage",
+        "softmax_scale",
+        "gated_linear_unit",
+        "bias_activation_fusion",
+        "activation_func_fp8_input_store",
+        "test_mode",
+        "memory_efficient_layer_norm",
+        "fused_single_qkv_rope",
+        "fp8_dot_product_attention",
+        "fp8_multi_head_attention",
+        "tp_only_amax_red",
+        "use_kitchen",
+        "moe_token_dropping",
+        "cuda_graph_use_single_mempool",
+        "cuda_graph_retain_backward_graph",
+        "disable_parameter_transpose_cache",
+        "inference_sampling_seed",
+        "use_inference_optimized_layers",
+        "heterogeneous_block_specs",
+        "hetereogenous_dist_checkpoint",
+        "quant_recipe",
+        # deprecated and no CLI arg exists
+        "tp_comm_atomic_ag",
+        "tp_comm_atomic_rs",
+        "moe_router_topk_limited_devices",
+        # already generated by another config
+        "inference_rng_tracker",
+        "use_te_rng_tracker",
+        "log_max_attention_logit",
+        "barrier_with_L1_time",
+        # args uses same var with a different name
+        "num_moe_experts",
+        "fp8_param",
+        # incompatible defaults in dataclass
+        "gradient_accumulation_fusion",
+        "overlap_p2p_comm",
+        "attention_softmax_in_fp32",
+        "masked_softmax_fusion",
+        "persist_layer_norm",
+        "bias_dropout_fusion",
+        "apply_rope_fusion",
+    ]
+    transformer_factory = ArgumentGroupFactory(TransformerConfig, exclude=exclude)
+    transformer_group = transformer_factory.build_group(parser, "transformer configuration")
+
     group = parser.add_argument_group(title='network size')
 
-    group.add_argument('--num-layers', type=int, default=None,
-                       help='Number of transformer layers.')
     group.add_argument('--encoder-num-layers', type=int, default=None,
                        help='Number of encoder transformer layers.')
     group.add_argument('--decoder-num-layers', type=int, default=None,
                        help='Number of decoder transformer layers.')
-    group.add_argument('--hidden-size', type=int, default=None,
-                       help='Transformer hidden size.')
-    group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. '
-                       'This is set to 4*hidden-size if not provided')
-    group.add_argument('--num-attention-heads', type=int, default=None,
-                       help='Number of transformer attention heads.')
-    group.add_argument('--attention-backend', type=lambda attn_backend: AttnBackend[attn_backend], default=AttnBackend.auto, choices = list(AttnBackend), help='Attention backend to use (flash,fused,unfused,local,auto). Defaults to auto')
-    group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head '
-                       'attention. This is set to '
-                       '   args.hidden_size // args.num_attention_heads '
-                       'if not provided.')
     group.add_argument('--group-query-attention', action='store_true',
                           help='Use group-query attention.')
-    group.add_argument('--num-query-groups', type=int, default=1)
-    group.add_argument('--softmax-type', type=str, default='vanilla',
-                       choices=['learnable', 'vanilla', 'off-by-one'],
-                       help='Type of softmax to use for the attention. Supports both a fixed offset and '
-                       'learnable offset.')
     group.add_argument('--window-size', type=tuple_type, default=None,
                        help='Window size for window attention. If not provided, '
                             'window attention will be disabled.')
@@ -1631,8 +1656,6 @@ def _add_network_size_args(parser):
                        help='Base to use for rotary positional embeddings, default 10000')
     group.add_argument('--rotary-percent', type=float, default=1.0,
                        help='Percent of rotary dimension to use, default 100%%')
-    group.add_argument('--rotary-interleaved', action='store_true',
-                          help='Use interleaved rotary embedding.')
     group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None,
                        help='Sequence length interpolation factor for rotary embeddings.')
     group.add_argument('--use-rope-scaling', action='store_true',
@@ -1652,23 +1675,9 @@ def _add_network_size_args(parser):
                        action='store_false',
                        help='Disable position embedding. Deprecated: use --position-embedding-type',
                        dest='add_position_embedding')
-    group.add_argument('--mrope-section', nargs='+', type=int, default=None,
-                       help='Multimodal rope section is for channel dimension, empty by default.')
     group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                        help='Pad the vocab size to be divisible by this value.'
                        'This is added for computational efficieny reasons.')
-    group.add_argument('--normalization', default='LayerNorm',
-                       choices=['LayerNorm', 'RMSNorm'],
-                       help='Which normalization technique to use.')
-    group.add_argument('--norm-epsilon', type=float, default=1e-5,
-                       help='Epsilon for layer norm and RMS norm.')
-    group.add_argument('--apply-layernorm-1p', action='store_true',
-                       help='Adjust LayerNorm weights such that they are centered '
-                       'around zero. This improves numerical stability.')
-    group.add_argument('--apply-residual-connection-post-layernorm',
-                       action='store_true',
-                       help='If set, use original BERT residula connection '
-                       'ordering.')
     group.add_argument('--openai-gelu', action='store_true',
                        help='Use OpenAIs GeLU implementation. This option'
                        'should not be used unless for backward compatibility'
@@ -1679,12 +1688,6 @@ def _add_network_size_args(parser):
                        help='Use gated linear units and SiLU activation instead of default gelu')
     group.add_argument('--quick-geglu', action='store_true',
                        help='Use quick geglu activation instead of default gelu')
-    group.add_argument('--activation-func-clamp-value', type=float, default=None,
-                       help='Clamp the output of the linear_fc1 in the activation function. Only used when '
-                            'activation_func is quick_gelu.')
-    group.add_argument('--glu-linear-offset', type=float, default=0.0,
-                       help='Offset term in the GLU activation function: activation_func(x[0]) * (x[1] + offset). '
-                            'Only used when gated_linear_unit is True')
     group.add_argument('--onnx-safe', type=bool, required=False,
                        help='Use workarounds for known problems with '
                        'Torch ONNX exporter')
@@ -1693,31 +1696,14 @@ def _add_network_size_args(parser):
                        dest='bert_binary_head')
     group.add_argument('--untie-embeddings-and-output-weights', action='store_true',
                        help='Untie embeddings and output weights.')
-    group.add_argument('--multi-latent-attention', action='store_true',
-                       help='Use multi-latent attention for model.')
-    group.add_argument('--mtp-num-layers', type=int, default=None,
-                       help='Number of Multi-Token Prediction (MTP) Layers.'
-                       'MTP extends the prediction scope to multiple future tokens at each position.'
-                       'This MTP implementation sequentially predict additional tokens '
-                       'by using D sequential modules to predict D additional tokens.')
-    group.add_argument('--mtp-loss-scaling-factor', type=float, default=0.1,
-                       help='Scaling factor of Multi-Token Prediction (MTP) loss. '
-                       'We compute the average of the MTP losses across all depths, '
-                       'and multiply it the scaling factor to obtain the overall MTP loss, '
-                       'which serves as an additional training objective.')
     return parser
 
-
 def _add_straggler_detector_args(parser):
-    group = parser.add_argument_group(title='straggler')
-    group.add_argument('--log-straggler', action='store_true',
-                       help='If set, tracks and logs straggler per GPU.')
-    group.add_argument('--disable-straggler-on-startup', action='store_true',
-                       help='If set, StragglerDetector is disabled on startup.')
-    group.add_argument('--straggler-ctrlr-port', type=int, default=65535,
-                       help='Port number to toggle StragglerDetector on/off at runtime')
-    group.add_argument('--straggler-minmax-count', type=int, default=1,
-                       help='Number of ranks to report with high/low estimated throughput')
+    from megatron.training.resilience_config import StragglerDetectionConfig
+
+    straggler_factory = ArgumentGroupFactory(StragglerDetectionConfig)
+    group = straggler_factory.build_group(parser, "straggler")
+
     return parser
 
 def _add_workload_inspector_server_args(parser):
@@ -1812,112 +1798,30 @@ def _add_ft_package_args(parser):
     group.add_argument('--calc-ft-timeouts', action='store_true',
                        help='If set, FT package will try to automatically compute the timeouts. '
                        'Note: This feature is for Nvidia internal use only.')
+    group.add_argument('--ft-num-warmup-iters', type=int, default=5,
+                       help='Number of warmup iterations before monitoring step section and '
+                       'out-of-section timeouts. The first N iterations are excluded from '
+                       'timeout monitoring as they can be significantly slower than steady-state. '
+                       'Default: 5. Note: This feature is for Nvidia internal use only.')
     return parser
 
 
-def _add_config_logger_args(parser):
-    group = parser.add_argument_group(title='config logger')
-    group.add_argument('--config-logger-dir', type=str, default='',
-                       help='If set, will dump all configs to --config-logger-dir',
-                       dest='config_logger_dir')
-    return parser
+def _add_logging_args(parser):
+    from megatron.training.training_config import LoggerConfig
 
+    log_factory = ArgumentGroupFactory(LoggerConfig, exclude = ["log_throughput_to_tensorboard", "throughput_window_size", "memory_keys", "log_l2_norm_grad_to_tensorboard", "log_runtime_to_tensorboard", "runtime_time_unit", "filter_warnings", "modules_to_filter", "set_level_for_all_loggers", "save_config_filepath"])
+    group = log_factory.build_group(parser, title="logging")
 
-def _add_logging_args(parser):
-    group = parser.add_argument_group(title='logging')
-
-    group.add_argument('--log-params-norm', action='store_true',
-                       help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-num-zeros-in-grad', action='store_true',
-                       help='If set, calculate and log the number of zeros in gradient.')
-    group.add_argument('--log-throughput', action='store_true',
-                       help='If set, calculate and log throughput per GPU.')
-    group.add_argument('--log-progress', action='store_true',
-                       help='If set, log progress (in terms of number of processed tokens and '
-                       'number of floating-point operations) to progress.txt file in checkpoint '
-                       'directory.')
-    group.add_argument('--timing-log-level', type=int,
-                       default=0, choices=range(0,3),
-                       help='Granularity level to measure and report timing. '
-                       '   0: report only iteration time and make sure timing '
-                       '      does not introduce extra overhead.'
-                       '   1: report timing for operations that are executed '
-                       '      very limited times (basically once) during '
-                       '      each iteration (such as gradient all-reduce) '
-                       '   2: report timing for operations that migh be '
-                       '      executed numerous times during each iteration. '
-                       'Note that setting the level to 1 or 2 might '
-                       'cause increase in iteration time.')
-    group.add_argument('--log-energy', action='store_true',
-                       help='If set, log energy consumption (in Joules)')
-    group.add_argument('--no-barrier-with-level-1-timing', action='store_false',
-                       help='If not set, use barrier with level 1 time '
-                       'measurements. Note that this is up to the user '
-                       'to make sure calling barrier with their timers '
-                       'will not result in hangs. This can happen if for '
-                       'example the user adds a level 1 timer that is not '
-                       'called by all ranks.',
-                       dest='barrier_with_L1_time')
-    group.add_argument('--timing-log-option', type=str, default='minmax',
-                       choices=['max', 'minmax', 'all'],
-                       help='Options for logging timing:'
-                       '  max: report the max timing across all ranks'
-                       '  minmax: report min and max timings across all ranks'
-                       '  all: report timings of all ranks.')
-    group.add_argument('--tensorboard-log-interval', type=int, default=1,
-                       help='Report to tensorboard interval.')
-    group.add_argument('--tensorboard-queue-size', type=int, default=1000,
-                       help='Size of the tensorboard queue for pending events '
-                       'and summaries before one of the "add" calls forces a '
-                       'flush to disk.')
-    group.add_argument('--log-timers-to-tensorboard', action='store_true',
-                       help='If set, write timers to tensorboard.')
-    group.add_argument('--no-log-loss-scale-to-tensorboard',
-                       action='store_false',
-                       help='Disable loss-scale logging to tensorboard.',
-                       dest='log_loss_scale_to_tensorboard')
-    group.add_argument('--log-validation-ppl-to-tensorboard',
-                       action='store_true',
-                       help='If set, write validation perplexity to '
-                       'tensorboard.')
-    group.add_argument('--log-memory-to-tensorboard',
-                       action='store_true',
-                       help='Enable memory logging to tensorboard.')
-    group.add_argument('--log-world-size-to-tensorboard',
-                       action='store_true',
-                       help='Enable world size logging to tensorboard.')
-    group.add_argument('--wandb-project', type=str, default='',
-                       help='The wandb project name. Ignore wandb by default.')
-    group.add_argument('--wandb-entity', type=str, default='',
-                       help='The wandb entity name. It is useful when '
-                       'there are multiple sub-projects in a project. '
-                       'https://community.wandb.ai/t/how-do-i-decide-which-account-private-or-team-to-upload-the-run-to/5704 '
-                       'Ignore wandb by default.')    
-    group.add_argument('--wandb-exp-name', type=str, default='',
-                       help='The wandb experiment name.')
-    group.add_argument('--wandb-save-dir', type=str, default='',
-                       help='Path to save the wandb results locally.')
-    group.add_argument('--logging-level', type=int, default=None,
-                       help='Set default logging level')
     return parser
 
 
 def _add_regularization_args(parser):
     group = parser.add_argument_group(title='regularization')
 
-    group.add_argument('--attention-dropout', type=float, default=0.1,
-                       help='Post attention dropout probability.')
-    group.add_argument('--hidden-dropout', type=float, default=0.1,
-                       help='Dropout probability for hidden state transformer.')
     group.add_argument('--weight-decay', type=float, default=0.01,
                        help='Weight decay coefficient for L2 regularization.')
-    group.add_argument('--start-weight-decay', type=float,
-                       help='Initial weight decay coefficient for L2 regularization.')
-    group.add_argument('--end-weight-decay', type=float,
-                       help='End of run weight decay coefficient for L2 regularization.')
-    group.add_argument('--weight-decay-incr-style', type=str, default='constant',
-                       choices=['constant', 'linear', 'cosine'],
-                       help='Weight decay increment function.')
+    group.add_argument('--apply-wd-to-qk-layernorm', action='store_true',
+                       help='Apply weight decay to qk layernorm as a special case.')
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
     group.add_argument('--adam-beta1', type=float, default=0.9,
@@ -1931,6 +1835,33 @@ def _add_regularization_args(parser):
                        'numerical stability')
     group.add_argument('--sgd-momentum', type=float, default=0.9,
                        help='Momentum factor for sgd')
+    group.add_argument('--muon-momentum', type=float, default=0.9,
+                       help='Momentum factor for Muon optimizer')
+    group.add_argument('--muon-no-split-qkv', action='store_false', default=True,
+                       dest='muon_split_qkv',
+                       help='Whether to split QKV parameters for Muon optimizer')
+    group.add_argument('--muon-use-nesterov', action='store_true',
+                       help='Whether to use Nesterov-style momentum in the internal SGD')
+    group.add_argument('--muon-scale-mode', type=str, default='spectral',
+                       choices=['spectral', 'unit_rms_norm', 'shape_scaling'],
+                       help='Scale mode for Muon optimizer')
+    group.add_argument('--muon-fp32-matmul-prec', type=str, default='medium',
+                       choices=['low', 'medium', 'high'],
+                       help='FP32 matmul precision for Newton-Schulz iteration')
+    group.add_argument('--muon-num-ns-steps', type=int, default=5,
+                       help='Number of Newton-Schulz steps for Muon optimizer')
+    group.add_argument('--muon-tp-mode', type=str, default='blockwise',
+                       choices=['blockwise', 'duplicated', 'distributed'],
+                       help='How to perform NS calculation for tensor model parallel weights')
+    group.add_argument('--muon-extra-scale-factor', type=float, default=1.0,
+                       help='Additional scale factor for the muon update')
+
+    group.add_argument('--no-weight-decay-cond-type', type=str, choices=['apply_wd_to_qk_layernorm'],
+                       help='Type of no weight decay condition. Choices: '
+                       'None (default): apply weight decay to 1D weights and biases.'
+                       '"apply_wd_to_qk_layernorm": additionally apply weight decay to '
+                       'qk layernorm as a special case.'
+                       'DEPRECATED. Please use --apply-wd-to-qk-layernorm instead. ')
     return parser
 
 
@@ -1940,7 +1871,7 @@ def _add_rl_args(parser):
                        help="Use the RL training step.")
     group.add_argument('--rl-prompts-per-eval', type=int, default=32,
                        help='Number of prompts to evaluate for for each RL task.'
-                        'This evaluation can be very expensive when using environments' 
+                        'This evaluation can be very expensive when using environments'
                         'that evaluate pass@k so we default to a lower number.')
     # TODO(rkirby): allow for "complete" evaluation when --rl-prompts-per-eval is set to -1
     group.add_argument('--grpo-prompts-per-step', type=int, default=32,
@@ -1961,10 +1892,6 @@ def _add_rl_args(parser):
                        help="Entropy term weight in GRPO loss.")
     group.add_argument('--grpo-filter-groups-with-same-reward', action='store_true',
                        help="Filter groups with same reward.")
-    group.add_argument('--grpo-default-temperature', type=float, default=1.0,
-                       help="Default temperature for model inference.")
-    group.add_argument('--grpo-default-top-p', type=float, default=0,
-                       help="Default top-p for model inference.")
     group.add_argument('--langrl-inference-server-type', type=str,
                        choices=['inplace_megatron', 'inplace_megatron_chat'], default='inplace_megatron',
                        help="Type of inference server to use.")
@@ -1973,6 +1900,12 @@ def _add_rl_args(parser):
     group.add_argument('--langrl-external-server', action=argparse.BooleanOptionalAction, required=False, default=False)
     group.add_argument('--langrl-env-config', type=str, default=None,
                        help="Path to YAML config file for RL environment configuration.")
+    group.add_argument('--rl-default-temperature', type=float, default=1.0,
+                       help="Default temperature for model inference.")
+    group.add_argument('--rl-default-top-p', type=float, default=0,
+                       help="Default top-p for model inference.")
+    group.add_argument('--rl-default-top-k', type=int, default=-1,
+                       help="Default top-k for model inference.")
     group.add_argument('--rl-offload-optimizer-during-inference', action='store_true',
                        help='Offload optimizer state to CPU during inference/rollout to save GPU memory')
     group.add_argument('--rl-offload-kv-cache-during-training', action=argparse.BooleanOptionalAction, default=False,
@@ -1987,210 +1920,113 @@ def _add_rl_args(parser):
                        help='If set, use inference logprobs in importance sampling correction of the loss.')
     group.add_argument('--rl-importance-sampling-truncation-coef', type=float, default=None,
                        help="If --inference-logprobs-is-correction is on and this coefficient is set, apply truncation for the IS correction at GRPO loss.")
-    group.add_argument('--rl-calculate-intra-group-similarity', action=argparse.BooleanOptionalAction, default=False,
-                       help='If set, calculate the intra-group similarity of rollouts.')
-    group.add_argument('--rl-use-sequence-packing', action='store_true',
+    group.add_argument('--rl-use-sequence-packing', action=argparse.BooleanOptionalAction, type=bool, default=False,
                        help='Enable sequence packing')
-    group.add_argument('--rl-sequence-packing-bin-size', type=int, default=8192,
-                       help='Override bin size for sequence packing.')
+    group.add_argument('--rl-sequence-packing-max-sequences-per-bin', type=int, default=50,
+                       help='Maximum number of sequences that can be packed into a single bin. ')
     group.add_argument('--rl-sequence-packing-algo', type=str, default='fifo',
                        choices=['fifo', 'round-robin'],
                        help='Algorithm for distributing packed bins across ranks. '
                             'fifo: first-in-first-out sequential distribution, '
                             'round-robin: distribute bins cyclically across ranks for better load balancing')
+    group.add_argument('--rl-training-cuda-graphs', action=argparse.BooleanOptionalAction, type=bool,
+                       default=False,
+                       help='If set, do not call `delete_cuda_graphs` or `toggle_cuda_graphs` when the inference engine is suspended.')
+    group.add_argument('--rl-inference-tensor-model-parallel-size', type=int, default=None,
+                       help='Degree of tensor model parallelism for inference for RL.')     
+    group.add_argument(
+        '--rl-inference-pipeline-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of pipeline model parallelism for inference for RL.',
+    )
+    group.add_argument(
+        '--rl-inference-expert-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of expert model parallelism for inference for RL.',
+    )
+    group.add_argument(
+        '--rl-inference-expert-tensor-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of expert tensor model parallelism for inference for RL. '
+             'For MoE models, this controls the TP size for expert layers specifically. '
+             'Defaults to training expert_tensor_parallel_size if not specified.',
+    )
+    group.add_argument(
+        '--rl-inference-model-unified-memory-level',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=(
+            'Allocate the separate RL inference model parameters from a unified virtual memory (UVM) '
+            'CUDA mempool. Level 0 disables UVM (default). Level 1 enables UVM allocation so the '
+            'inference model weights can be prefetched to CPU when idle while keeping CUDA-graph-safe '
+            'device pointers.'
+        ),
+    )
+    group.add_argument(
+        '--rl-offload-inference-model-weights-when-idle',
+        action=argparse.BooleanOptionalAction,
+        required=False,
+        default=False,
+        help=(
+            'When using a separate RL inference model with UVM-enabled parameters, prefetch its weights '
+            'to CPU when not doing rollout inference, and prefetch back to GPU right before inference. '
+            'Requires --rl-inference-model-unified-memory-level=1.'
+        ),
+    )
+    group.add_argument('--refit-method', type=str, default='gloo',
+                       choices=['nccl', 'gloo', 'nvshmem'],
+                       help=('Method to refit the model weights between training and inference models during RL. '
+                             'nccl: use NCCLCopyService to refit using NCCL; '
+                             'gloo: use GlooCopyService over CPU; '
+                             'nvshmem: use NVSHMEMCopyService to refit using the NVSHMEM.'))
+    group.add_argument('--rl-verify-model-weights-swap', action=argparse.BooleanOptionalAction, default=False,
+                       help='If set, verify that the model weights were correctly transferred by comparing forward pass outputs on'
+                       'the first swap of model weights.')
+
+    group.add_argument('--rl-parallel-generation-tasks', type=int, default=512,
+                        help='Number of parallel generation tasks for RL inference.')
+    group.add_argument('--rl-skip-bos-token', action=argparse.BooleanOptionalAction, type=bool, default=False,
+                        help='Skip BOS token at the beginning of the sequences. Default is False.')
     return parser
 
 def _add_training_args(parser):
-    group = parser.add_argument_group(title='training')
+    from megatron.training.training_config import TrainingConfig
+    from megatron.training.common_config import ProfilingConfig
+
+    prof_factory = ArgumentGroupFactory(ProfilingConfig, exclude=["record_shapes", "nvtx_ranges"])
+    prof_group = prof_factory.build_group(parser, "profiling")
+
+    train_factory = ArgumentGroupFactory(TrainingConfig)
+    group = train_factory.build_group(parser, "training")
 
-    group.add_argument('--micro-batch-size', type=int, default=None,
-                       help='Batch size per model instance (local batch size). '
-                       'Global batch size is local batch size times data '
-                       'parallel size times number of micro batches.')
     group.add_argument('--batch-size', type=int, default=None,
                        help='Old batch size parameter, do not use. '
                        'Use --micro-batch-size instead')
-    group.add_argument('--global-batch-size', type=int, default=None,
-                       help='Training batch size. If set, it should be a '
-                       'multiple of micro-batch-size times data-parallel-size. '
-                       'If this value is None, then '
-                       'use micro-batch-size * data-parallel-size as the '
-                       'global batch size. This choice will result in 1 for '
-                       'number of micro-batches.')
-    group.add_argument('--rampup-batch-size', nargs='*', default=None,
-                       help='Batch size ramp up with the following values:'
-                       '  --rampup-batch-size <start batch size> '
-                       '                      <batch size incerement> '
-                       '                      <ramp-up samples> '
-                       'For example:'
-                       '   --rampup-batch-size 16 8 300000 \\ '
-                       '   --global-batch-size 1024'
-                       'will start with global batch size 16 and over '
-                       ' (1024 - 16) / 8 = 126 intervals will increase'
-                       'the batch size linearly to 1024. In each interval'
-                       'we will use approximately 300000 / 126 = 2380 samples.')
-    group.add_argument('--decrease-batch-size-if-needed', action='store_true', default=False,
-                       help='If set, decrease batch size if microbatch_size * dp_size'
-                       'does not divide batch_size. Useful for KSO (Keep Soldiering On)'
-                       'to continue making progress if number of healthy GPUs (and'
-                       'corresponding dp_size) does not support current batch_size.'
-                       'Old batch_size will be restored if training is re-started with'
-                       'dp_size that divides batch_size // microbatch_size.')
     group.add_argument('--recompute-activations', action='store_true',
                        help='recompute activation to allow for training '
                        'with larger models, sequences, and batch sizes.')
-    group.add_argument('--recompute-granularity', type=str, default=None,
-                       choices=['full', 'selective'],
-                       help='Checkpoint activations to allow for training '
-                       'with larger models, sequences, and batch sizes. '
-                       'It is supported at two granularities 1) full: '
-                       'whole transformer layer is recomputed, '
-                       '2) selective: submodules set in --recompute-modules '
-                       'are recomputed, default is core_attn.')
     group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false',
                        help='Check for NaNs in loss and grad',
                        dest='check_for_nan_in_loss_and_grad')
-    group.add_argument('--check-for-spiky-loss', action='store_true',
-                       help='Check for spiky loss',
-                       dest='check_for_spiky_loss')
     group.add_argument('--check-for-large-grads', action='store_true',
                        help='Check for unexpectedly large grads',
                        dest='check_for_large_grads')
-    group.add_argument('--distribute-saved-activations',
-                       action='store_true',
-                       help='If set, distribute recomputed activations '
-                       'across model parallel group.')
-    group.add_argument('--recompute-method', type=str, default=None,
-                       choices=['uniform', 'block'],
-                       help='1) uniform: uniformly divide the total number of '
-                       'Transformer layers and recompute the input activation of '
-                       'each divided chunk at specified granularity, '
-                       '2) recompute the input activations of only a set number of '
-                       'individual Transformer layers per pipeline stage and do the '
-                       'rest without any recomputing at specified granularity'
-                       'default) do not apply activations recompute to any layers')
-    group.add_argument('--recompute-num-layers', type=int, default=None,
-                       help='1) uniform: the number of Transformer layers in each '
-                       'uniformly divided recompute unit, '
-                       '2) block: the number of individual Transformer layers '
-                       'to recompute within each pipeline stage.')
-    group.add_argument('--recompute-modules', nargs='*', type=str, default=None,
-                       help='The submodules to recompute. '
-                       'choices: "core_attn", "moe_act", "layernorm", "mla_up_proj", '
-                       '         "mlp", "moe", "shared_experts". '
-                       'default: ["core_attn"].'
-                       '"core_attn": recompute the core attention part of the transformer layer. '
-                       '"moe_act": recompute the MoE MLP activation function. '
-                       '"layernorm": recompute the input_layernorm and pre_mlp_layernorm. '
-                       '"mla_up_proj": recompute the MLA up projection and RoPE applying parts.'
-                       '"mlp": recompute the dense MLP layer.'
-                       '"moe": recompute the MoE layer.'
-                       '"shared_experts": recompute the shared experts in the MoE layer.'
-                       '"moe_act", "layernorm", and "mla_up_proj" use output-discarding checkpointing, '
-                       '"core_attn", "mlp", "moe", and "shared_experts" use normal checkpointing.')
-    group.add_argument('--cpu-offloading-num-layers', type=int, default=0,
-                       help='The number of Transformer layers to offload to CPU.')
-    group.add_argument('--no-clone-scatter-output-in-embedding', action='store_false',
-                       help='If not set, clone the output of the scatter in embedding layer to GC original tensor.',
-                       dest='clone_scatter_output_in_embedding')
-    group.add_argument('--profile', action='store_true',
-                       help='Enable nsys profiling. When using this option, nsys '
-                       'options should be specified in commandline. An example '
-                       'nsys commandline is `nsys profile -s none -t nvtx,cuda '
-                       '-o <path/to/output_file> --force-overwrite true '
-                       '--capture-range=cudaProfilerApi '
-                       '--capture-range-end=stop`.')
-    group.add_argument('--profile-step-start', type=int, default=10,
-                       help='Global step to start profiling.')
-    group.add_argument('--profile-step-end', type=int, default=12,
-                       help='Global step to stop profiling.')
-    group.add_argument('--iterations-to-skip', nargs='+', type=int, default=[],
-                       help='List of iterations to skip, empty by default.')
     group.add_argument('--result-rejected-tracker-filename', type=str, default=None,
                        help='Optional name of file tracking `result_rejected` events.')
     group.add_argument('--disable-gloo-process-groups', action='store_false',
                        dest='enable_gloo_process_groups',
                        help='Disables creation and usage of Gloo process groups.')
-    group.add_argument('--use-pytorch-profiler', action='store_true',
-                       help='Use the built-in pytorch profiler. '
-                       'Useful if you wish to view profiles in tensorboard.',
-                       dest='use_pytorch_profiler')
-    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
-                       help='Global ranks to profile.')
-    group.add_argument('--record-memory-history', action="store_true", default=False,
-                       help='Record memory history in last rank.')
-    group.add_argument('--memory-snapshot-path', type=str, default="snapshot.pickle",
-                       help='Specifies where to dump the memory history pickle.')
-    group.add_argument('--tp-comm-overlap', action='store_true', help='Enables the '
-                       ' overlap of Tensor parallel communication and GEMM kernels.')
     group.add_argument('--tp-comm-overlap-cfg', type=str, default=None,
                        help='Config file when tp_comm_overlap is enabled.')
-    group.add_argument('--disable-tp-comm-overlap-ag', action='store_false',
-                       help=('Disables the All-Gather overlap with GEMM by '
-                             'pipelining the GEMM and All-Gather.'),
-                       dest='tp_comm_overlap_ag')
-    group.add_argument('--disable-tp-comm-overlap-rs', action='store_false',
-                       help=('Disables the Reduce-Scatter overlap with GEMM by '
-                             'pipelining the GEMM and Reduce-Scatter.'),
-                       dest='tp_comm_overlap_rs')
-    group.add_argument('--tp-comm-overlap-rs-dgrad', action='store_true',
-                       help = 'Enables the Reduce-Scatter overlap with dgrad GEMM.',
-                       dest='tp_comm_overlap_rs_dgrad')
-    group.add_argument('--disable-tp-comm-bulk-dgrad', action='store_false',
-                       help='Disables the All-Gather overlap with bprop activation gradient GEMM.',
-                       dest='tp_comm_bulk_dgrad')
-    group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false',
-                       help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.',
-                       dest='tp_comm_bulk_wgrad')
-    group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str,
-                       choices=['nccl', 'mpi', 'gloo'],
-                       help='Set the bootstrapping backend of Tensor parallel communications.')
-    group.add_argument('--use-cpu-initialization', action='store_true',
-                       default=None,
-                       help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.')
-    group.add_argument('--empty-unused-memory-level', default=0, type=int,
-                       choices=[0, 1, 2],
-                       help='Call torch.cuda.empty_cache() each iteration '
-                       '(training and eval), to reduce fragmentation.'
-                       '0=off, 1=moderate, 2=aggressive.')
-    group.add_argument('--deterministic-mode', action='store_true',
-                       help='Choose code that has deterministic execution. This usually '
-                       'means slower execution, but is good for debugging and testing.')
-    group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None,
-                       help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.')
-    group.add_argument('--calculate-per-token-loss', action='store_true',
-                       help=('Scale cross entropy loss by the number of non-padded tokens in the '
-                             'global batch, versus the default behavior of assuming all tokens are non-padded.'))
-    group.add_argument('--train-sync-interval', type=int, default=None,
-                       help='Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU.')
 
     # deprecated
     group.add_argument('--checkpoint-activations', action='store_true',
                        help='Checkpoint activation to allow for training '
                        'with larger models, sequences, and batch sizes.')
-    group.add_argument('--train-iters', type=int, default=None,
-                       help='Total number of iterations to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-samples', type=int, default=None,
-                       help='Total number of samples to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Report loss and timing interval.')
-    group.add_argument('--exit-interval', type=int, default=None,
-                       help='Exit the program after the iteration is divisible '
-                       'by this value.')
-    group.add_argument('--exit-duration-in-mins', type=int, default=None,
-                       help='Exit the program after this many minutes.')
-    group.add_argument('--exit-signal-handler', action='store_true',
-                       help='Dynamically save the checkpoint and shutdown the '
-                       'training if signal is received')
-    group.add_argument('--exit-signal', type=str, default='SIGTERM',
-                       choices=list(SIGNAL_MAP.keys()),
-                       help='Signal to use for exit signal handler. If not specified, defaults to SIGTERM.')
-    group.add_argument('--tensorboard-dir', type=str, default=None,
-                       help='Write TensorBoard logs to this directory.')
     group.add_argument('--no-masked-softmax-fusion',
                        action='store_false',
                        help='Disable fusion of query_key_value scaling, '
@@ -2203,8 +2039,6 @@ def _add_training_args(parser):
                        help='Disable bias and swiglu fusion, the fusion is '
                        'available only when using megatron-core.',
                        dest='bias_swiglu_fusion')
-    group.add_argument('--use-fused-weighted-squared-relu', action='store_true',
-                       help='Use fused weighted squared relu when using MoE.')
     group.add_argument('--no-bias-dropout-fusion', action='store_false',
                        help='Disable bias and dropout fusion.',
                        dest='bias_dropout_fusion')
@@ -2216,23 +2050,11 @@ def _add_training_args(parser):
                       choices=['rope', 'yarn'],
                       help='Type of rope to use. Note that MLA takes yarn by default, '
                       'and common attention takes rope by default.')
-    group.add_argument('--cross-entropy-loss-fusion', action='store_true',
-                       help='Enabled fusion of cross entropy loss calculation.',
-                       dest='cross_entropy_loss_fusion')
-    group.add_argument('--cross-entropy-fusion-impl', type=str, default='native',
-                       choices=['native', 'te'],
-                       help='Implementation of cross entropy loss calculation.')
     group.add_argument('--use-flash-attn', action='store_true',
                        help='use FlashAttention implementation of attention. '
                        'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--disable-bias-linear', action='store_false',
-                       help='Disable bias in the linear layers',
-                       dest='add_bias_linear')
-    group.add_argument('--add-qkv-bias', action='store_true',
-                       help='Enable bias only in the QKV linear layers',
-                       dest='add_qkv_bias')
     group.add_argument('--optimizer', type=str, default='adam',
-                       choices=['adam', 'sgd'],
+                       choices=['adam', 'sgd', 'muon', 'dist_muon'],
                        help='Optimizer function')
     group.add_argument('--optimizer-cpu-offload', action='store_true',
                        help='Offload optimizer state to CPU')
@@ -2253,17 +2075,11 @@ def _add_training_args(parser):
     group.add_argument('--dataloader-type', type=str, default=None,
                        choices=['single', 'cyclic', 'external'],
                        help='Single pass vs multiple pass data loader')
-    group.add_argument('--no-async-tensor-model-parallel-allreduce',
-                       action='store_false',
-                       help='DEPRECATED. This flag is ignored.',
-                       dest='async_tensor_model_parallel_allreduce')
     group.add_argument('--no-persist-layer-norm', action='store_true',
                        help='Disable using persistent fused layer norm kernel. '
                        'This kernel supports only a set of hidden sizes. Please '
                        'check persist_ln_hidden_sizes if your hidden '
                        'size is supported.')
-    group.add_argument('--sequence-parallel', action='store_true',
-                       help='Enable sequence parallel optimization.')
     group.add_argument('--no-gradient-accumulation-fusion',
                        action='store_false',
                        help='Disable fusing gradient accumulation to weight '
@@ -2276,80 +2092,29 @@ def _add_training_args(parser):
                        '--use-legacy-models to not use core models.')
     group.add_argument('--use-legacy-models', action='store_true',
                        help='Use the legacy Megatron models, not Megatron-Core models.')
-    group.add_argument('--manual-gc', action='store_true',
-                       help='Disable the threshold-based default garbage '
-                       'collector and trigger the garbage collection manually. '
-                       'Manual garbage collection helps to align the timing of '
-                       'the collection across ranks which mitigates the impact '
-                       'of CPU-associated jitters. When the manual gc is enabled, '
-                       'garbage collection is performed only at the start and the '
-                       'end of the validation routine by default.')
-    group.add_argument('--manual-gc-interval', type=int, default=0,
-                       help='Training step interval to trigger manual garbage '
-                       'collection. When the value is set to 0, garbage '
-                       'collection is not triggered between training steps.')
-    group.add_argument('--no-manual-gc-eval', action='store_false',
-                       help='When using manual garbage collection, disable '
-                       'garbage collection at the start and the end of each '
-                       'evaluation run.', dest='manual_gc_eval')
-    group.add_argument('--disable-tp-comm-split-ag', action='store_false',
-                       help='Disables the All-Gather overlap with fprop GEMM.',
-                       dest='tp_comm_split_ag')
-    group.add_argument('--disable-tp-comm-split-rs', action='store_false',
-                       help='Disables the Reduce-Scatter overlap with fprop GEMM.',
-                       dest='tp_comm_split_rs')
-    group.add_argument('--pipeline-model-parallel-comm-backend', type=str, default=None,
-                       choices=['nccl', 'ucc'],
-                       help='Select a communicator backend for pipeline parallel communication. '
-                       'If None, the default backend will be used.')
     group.add_argument('--high-priority-stream-groups', nargs='*', type=str, default=[],
                        help='The communicator group names to use high priority streams.')
-    group.add_argument('--use-te-activation-func', action='store_true',
-                       help='Use activation function kernel from Transformer Engine in MLP module.')
+    group.add_argument('--disable-jit-fuser', action='store_true',
+                       help='Disable the JIT fuser.')
 
     return parser
 
 
 def _add_rerun_machine_args(parser):
-    group = parser.add_argument_group(title='rerun engine')
-
-    group.add_argument('--error-injection-rate', type=int, default=0,
-                       help='Rate at which to inject unexpected results, '
-                       'e.g. 1000 means once every 1000 result validations')
-    group.add_argument('--error-injection-type', type=str, default='transient_error',
-                       choices=['correct_result', 'transient_error', 'persistent_error'],
-                       help='Type of error to inject. ')
-    group.add_argument('--rerun-mode', type=str, default='validate_results',
-                       choices=['disabled', 'validate_results', 'report_stats'],
-                       help='Use re-run engine to validate results (default) '
-                       'or to emit stats on variability of computations due to '
-                       'non-deterministic algorithms.')
+    from megatron.training.resilience_config import RerunStateMachineConfig
+
+    rerun_factory = ArgumentGroupFactory(RerunStateMachineConfig, exclude=["check_for_nan_in_loss"])
+    group = rerun_factory.build_group(parser, "rerun engine")
 
     return parser
 
 
 def _add_initialization_args(parser):
-    group = parser.add_argument_group(title='initialization')
-
-    group.add_argument('--seed', type=int, default=1234,
-                       help='Random seed used for python, numpy, '
-                       'pytorch, and cuda.')
-    group.add_argument('--data-parallel-random-init', action='store_true',
-                       help='Enable random initialization of params '
-                       'across data parallel ranks')
-    group.add_argument('--init-method-std', type=float, default=0.02,
-                       help='Standard deviation of the zero mean normal '
-                       'distribution used for weight initialization.')
-    group.add_argument('--embedding-init-method-std', type=float, default=None,
-                       help='Standard deviation of the zero mean normal '
-                       'distribution used for embedding weight initialization. '
-                       'If unset, embeddings will be initialized the same way '
-                       'as other weights. Setting this to a value around 1.0 '
-                       'may avoid loss spikes in training. Setting this to any '
-                       'value will also skip applying weight decay on embedding '
-                       'weights to avoid shrinkage towards zero. See '
-                       'https://arxiv.org/abs/2312.16903 for more details.'
-                       )
+    from megatron.training.common_config import RNGConfig
+
+    rng_factory = ArgumentGroupFactory(RNGConfig)
+    group = rng_factory.build_group(parser, "RNG and initialization")
+
     group.add_argument('--init-method-xavier-uniform', action='store_true',
                        help='Enable Xavier uniform parameter initialization')
 
@@ -2357,59 +2122,21 @@ def _add_initialization_args(parser):
 
 
 def _add_learning_rate_args(parser):
-    group = parser.add_argument_group(title='learning rate')
+    from megatron.training.training_config import SchedulerConfig
+
+    sched_factory = ArgumentGroupFactory(SchedulerConfig, exclude=["no_weight_decay_cond_type"])
+    group = sched_factory.build_group(parser, title="learning rate and weight decay")
 
     group.add_argument('--lr', type=float, default=None,
                        help='Initial learning rate. Depending on decay style '
                        'and initial warmup, the learning rate at each '
                        'iteration would be different.')
-    group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine', 'inverse-square-root', 'WSD'],
-                       help='Learning rate decay function.')
-    group.add_argument('--lr-wsd-decay-style', type=str, default='exponential',
-                       choices=['exponential', 'linear', 'cosine', 'minus_sqrt'],
-                       help='Decay style for the annealing phase of WSD'),
-    group.add_argument('--lr-decay-iters', type=int, default=None,
-                       help='number of iterations to decay learning rate over,'
-                       ' If None defaults to `--train-iters`')
-    group.add_argument('--lr-decay-samples', type=int, default=None,
-                       help='number of samples to decay learning rate over,'
-                       ' If None defaults to `--train-samples`')
-    group.add_argument('--lr-wsd-decay-samples', type=int, default=None,
-                       help='number of samples for the annealing phase in the wsd schedule')
-    group.add_argument('--lr-wsd-decay-iters', type=int, default=None,
-                       help='number of iterations for the annealing phase in the wsd schedule')
-    group.add_argument('--lr-warmup-fraction', type=float, default=None,
-                       help='fraction of lr-warmup-(iters/samples) to use '
-                       'for warmup (as a float)')
-    group.add_argument('--lr-warmup-iters', type=int, default=0,
-                       help='number of iterations to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-samples', type=int, default=0,
-                       help='number of samples to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-init', type=float, default=0.0,
-                       help='Initial value for learning rate warmup. The '
-                       'scheduler starts warmup from this value.')
     group.add_argument('--warmup', type=int, default=None,
                        help='Old lr warmup argument, do not use. Use one of the'
                        '--lr-warmup-* arguments above')
     group.add_argument('--min-lr', type=float, default=0.0,
                        help='Minimum value for learning rate. The scheduler'
                        'clip values below this threshold.')
-    group.add_argument('--override-opt_param-scheduler', '--override-opt-param-scheduler',
-                       action='store_true',
-                       help='Reset the values of the scheduler (learning rate,'
-                       'warmup iterations, minimum learning rate, maximum '
-                       'number of iterations, and decay style from input '
-                       'arguments and ignore values from checkpoints. Note'
-                       'that all the above values will be reset.')
-    group.add_argument('--use-checkpoint-opt_param-scheduler', '--use-checkpoint-opt-param-scheduler',
-                       action='store_true',
-                       help='Use checkpoint to set the values of the scheduler '
-                       '(learning rate, warmup iterations, minimum learning '
-                       'rate, maximum number of iterations, and decay style '
-                       'from checkpoint and ignore input arguments.')
     group.add_argument('--decoupled-lr', type=float, default=None,
                        help='Separate learning rate for the input and output layer')
     group.add_argument('--decoupled-min-lr', type=float, default=None,
@@ -2420,146 +2147,34 @@ def _add_learning_rate_args(parser):
 
 
 def _add_checkpointing_args(parser):
-    group = parser.add_argument_group(title='checkpointing')
-
-    group.add_argument('--save', type=str, default=None,
-                       help='Output directory to save checkpoints to.')
-    group.add_argument('--save-interval', '--persistent-save-interval', type=int, default=None,
-                       help='Number of iterations between persistent checkpoint saves.')
-    group.add_argument('--save-retain-interval', type=int, default=None,
-                       help='Number of iterations between retained checkpoints (other'
-                       'checkpoints _except the last checkpoint_ are automatically deleted).')
+    from megatron.training.training_config import CheckpointConfig
+
+    ckpt_factory = ArgumentGroupFactory(CheckpointConfig, exclude=["most_recent_k", "save_tokenizer_assets", "save_optim", "save_rng", "load_optim", "load_rng"])
+    group = ckpt_factory.build_group(parser, "checkpointing")
+
     group.add_argument('--no-save-optim', action='store_true', default=None,
                        help='Do not save current optimizer.')
     group.add_argument('--no-save-rng', action='store_true', default=None,
                        help='Do not save current rng state.')
-    group.add_argument('--load', type=str, default=None,
-                       help='Directory containing a model checkpoint.')
     group.add_argument('--no-load-optim', action='store_true', default=None,
                        help='Do not load optimizer when loading checkpoint.')
-    group.add_argument('--load-main-params-from-ckpt', action='store_true', default=None,
-                       help='Load main parameters from checkpoint directly.')
     group.add_argument('--no-load-rng', action='store_true', default=None,
                        help='Do not load rng state when loading checkpoint.')
-    group.add_argument('--no-strict-fsdp-dtensor-load', action='store_false', dest='strict_fsdp_dtensor_load',
-                       help='Do not strict loading for fsdp_dtensor checkpoint format.')
-    group.add_argument('--non-persistent-save-interval', type=int, default=None,
-                       help='Number of iterations between non-persistent saves.')
-    group.add_argument('--non-persistent-ckpt-type', type=str, default=None,
-                       choices=['global', 'local', 'in_memory', None],
-                       help='Type of non-persistent model checkpoints. '
-                           '"global" - Saved as a standard checkpoint (e.g., on Lustre) with old checkpoints being removed. '
-                           '"local" - Each rank saves a portion of the checkpoint locally (e.g., on SSD/ramdisk). '
-                           'None - No non-persistent checkpointing (default option).')
-    group.add_argument('--non-persistent-global-ckpt-dir', type=str, default=None,
-                       help='Directory containing global non-persistent model checkpoints.')
-    group.add_argument('--non-persistent-local-ckpt-dir', type=str, default=None,
-                       help='Directory containing local non-persistent model checkpoints.')
-    group.add_argument('--non-persistent-local-ckpt-algo', type=str, default='fully_parallel',
-                       choices=['fully_parallel', 'atomic'],
-                       help='Algorithm for local non-persistent checkpointing.')
-    group.add_argument('--finetune', action='store_true',
-                       help='Load model for finetuning. Do not load optimizer '
-                       'or rng state from checkpoint and set iteration to 0. '
-                       'Assumed when loading a release checkpoint.')
-    group.add_argument('--pretrained-checkpoint', type=str, default=None,
-                       help='Directory containing a pretrained model checkpoint for finetuning.')
-    group.add_argument('--ckpt-step', type=int, default=None,
-                       help='Checkpoint step to load model from.')
-    group.add_argument('--no-initialization', action='store_false',
-                       help='Do not perform initialization when building model, '
-                       'can reduce startup time when definitely loading from a '
-                       'checkpoint',
-                       dest='perform_initialization')
-    group.add_argument('--use-checkpoint-args', action='store_true',
-                       help='Override model-related command-line arguments with arguments from checkpoint')
-    group.add_argument('--use-mp-args-from-checkpoint-args', action='store_true',
-                       help='Copy model parallelism command-line arguments from checkpoint')
-    group.add_argument('--no-use-tokenizer-model-from-checkpoint-args', action='store_false',
-                       dest='use_tokenizer_model_from_checkpoint_args',
-                       help='If set, do not use tokenizer model path from checkpoint')
-    group.add_argument('--exit-on-missing-checkpoint', action='store_true',
-                       help="If '--load' is set, but checkpoint is not found "
-                       "(e.g., path typo), then exit instead of random "
-                       "initialization.")
     group.add_argument('--use-dist-ckpt', action='store_true',
                        dest='use_dist_ckpt_deprecated',
                        help='Deprecated: see --ckpt-format.')
-    group.add_argument('--use-persistent-ckpt-worker', action='store_true',
-                       help='Enables a persitent checkpoint worker for async save')
-
-    group.add_argument('--auto-detect-ckpt-format', action='store_true',
-                       help='Determine if the checkpoint format is in legacy or distributed format.'
-                            ' If False, expects distributed checkpoint iff args.ckpt_format != "torch".'
-                            ' Might slow down loading a bit (double rank0 ckpt load).')
     group.add_argument('--dist-ckpt-format',
                        dest='dist_ckpt_format_deprecated',
                        help='Deprecated: see --ckpt-format.')
-    group.add_argument('--ckpt-format', default='torch_dist',
-                       choices=['torch', 'torch_dist', 'zarr', 'torch_dcp', 'fsdp_dtensor'],
-                       help='Checkpoint format to use. torch is the format used by torch.save/load.'
-                       ' torch_dist is a megatron built-in distributed checkpointing format.'
-                       ' torch_dcp is the torch.distributed.checkpoint format.'
-                       ' fsdp_dtensor is a torch DCP native, Megatron FSDP training-specific checkpoint format.')
-    group.add_argument('--ckpt-convert-format', default=None,
-                       choices=['torch', 'torch_dist', 'zarr'],
-                       help='Checkpoint format for conversion.')
-    group.add_argument('--ckpt-convert-save', default=None,
-                       help='Save directory for converted checkpoint.')
-    group.add_argument('--ckpt-convert-update-legacy-dist-opt-format', action='store_true',
-                       help='When loading a checkpoint, update the legacy format '
-                       'for the distributed optimizer, which previously used a '
-                       'merged param/grad buffer and a different bucket mapping. '
-                       'The legacy format was deprecated on Feb 13, 2024.')
     group.add_argument('--ckpt-fully-parallel-save', action='store_true',
                        dest='ckpt_fully_parallel_save_deprecated',
                        help='Deprecated: see --no-ckpt-fully-parallel-save.')
-    group.add_argument('--no-ckpt-fully-parallel-save', action='store_false',
-                       dest='ckpt_fully_parallel_save',
-                       help='Disable applying full save parallelization across DP for'
-                            ' distributed checkpoints. Depending on ckpt format'
-                            ' might decrease the number of files in the checkpoint.'
-                            ' Makes DistributedOptimizer checkpoint non-reshardable.')
-    group.add_argument('--async-save', action='store_true', default=None,
-                       help='Apply async checkpointing save. Currently works only with'
-                            '`torch_dist` distributed checkpoint format.')
-    group.add_argument('--ckpt-fully-parallel-load', action='store_true',
-                       help='Apply full load parallelization across DP for'
-                            ' distributed checkpoints.')
-    group.add_argument('--ckpt-assume-constant-structure', action='store_true',
-                       help='If the model and optimizer state dict structure is'
-                            'constant throughout a *single training job*, it allows for'
-                            'different checkpointing performance optimizations.')
-    group.add_argument('--dist-ckpt-strictness', type=str, default='assume_ok_unexpected',
-                       choices=[e.value for e in StrictHandling],
-                       help='Determine handling of key mismatch during checkpoint load.'
-                            ' Check StrictHandling docs for flags meaning.'
-                            ' NOTE: This flag controls only distributed checkpoint'
-                            ' load from storage, not loading state dict into the model.')
-    group.add_argument('--dist-ckpt-save-pre-mcore-014', action='store_true',
-                       help='Revert checkpointing simplifications introduced in Megatron-Core'
-                            ' v0.14. This option affects only checkpoint saving format and will'
-                            ' be removed soon (checkpoint load format is determined based on'
-                            ' checkpoint metadata).')
-    group.add_argument('--dist-ckpt-optim-fully-reshardable', action='store_true',
-                       help='Make optimizer distributed checkpoint fully reshardable (TP/PP/EP/DP)'
-                            ' as opposed to plain DP reshardability.')
-    group.add_argument('--distrib-optim-fully-reshardable-mem-efficient', action='store_true',
-                       help='During distributed optimizer checkpoint save and load tries to use as'
-                            ' little memory as possible by using Gloo (instead of NCCL) and only one'
-                            ' rank for saving. Turn on only if experiencing host or device memory'
-                            ' issues. Has affect only with `--dist-ckpt-optim-fully-reshardable`'
-                            ' flag.')
     return parser
 
 
 def _add_mixed_precision_args(parser):
     group = parser.add_argument_group(title='mixed precision')
 
-    group.add_argument('--fp16', action='store_true',
-                       help='Run model in fp16 mode.')
-    group.add_argument('--bf16', action='store_true',
-                       help='Run model in bfloat16 mode.')
     group.add_argument('--grad-reduce-in-bf16', action='store_true',
                        help='Reduce gradients in bfloat16.')
     group.add_argument('--loss-scale', type=float, default=None,
@@ -2574,11 +2189,6 @@ def _add_mixed_precision_args(parser):
                        help='Window over which to raise/lower dynamic scale.')
     group.add_argument('--hysteresis', type=int, default=2,
                        help='hysteresis for dynamic loss scaling')
-    group.add_argument('--fp32-residual-connection', action='store_true',
-                       help='Move residual connections to fp32.')
-    group.add_argument('--apply-query-key-layer-scaling', action='store_true',
-                       help='Scale Q * K^T by 1 / layer-number. '
-                       'Useful for fp16 training. Also sets `attention_softmax_in_fp32` to True.')
     group.add_argument('--attention-softmax-in-fp32', action='store_true',
                        help='Run attention masking and softmax in fp32.')
     group.add_argument('--accumulate-allreduce-grads-in-fp32',
@@ -2587,9 +2197,6 @@ def _add_mixed_precision_args(parser):
     group.add_argument('--fp16-lm-cross-entropy', action='store_true',
                        help='Move the cross entropy unreduced loss calculation'
                        'for lm head to fp16.')
-    group.add_argument('--disable-bf16-reduced-precision-matmul', action='store_true',
-                       help='If True, sets torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction=False to '
-                       'prevent matmul from using reduced precision accumulation when using BF16.')
     group.add_argument('--reuse-grad-buf-for-mxfp8-param-ag', action='store_true',
                        help='If True, reuse the grad buffer for MXFP8 parameter all-gather.')
 
@@ -2599,10 +2206,6 @@ def _add_mixed_precision_args(parser):
 def _add_distributed_args(parser):
     group = parser.add_argument_group(title='distributed')
 
-    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
-                       help='Degree of tensor model parallelism.')
-    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
-                       help='Degree of pipeline model parallelism.')
     group.add_argument('--decoder-first-pipeline-num-layers',
                        type=int, default=None,
                        help=('The number of transformer layers on the first pipeline stage of the decoder. '
@@ -2626,15 +2229,9 @@ def _add_distributed_args(parser):
                        help='Number of layers per virtual pipeline stage')
     group.add_argument('--num-virtual-stages-per-pipeline-rank', type=int, default=None,
                        help='Number of virtual pipeline stages per pipeline parallelism rank')
-    group.add_argument('--microbatch-group-size-per-virtual-pipeline-stage', type=int, default=None,
-                       help='Number of contiguous microbatches per virtual pipeline stage',
-                       dest='microbatch_group_size_per_vp_stage')
     group.add_argument('--no-overlap-p2p-communication', action='store_false',
                        help='overlap pipeline parallel communication with forward and backward chunks in 1F1B',
                        dest='overlap_p2p_comm')
-    group.add_argument('--overlap-p2p-communication-warmup-flush', action='store_true',
-                       default=False, help='if set, overlap pipeline parallel communication in warmup and flush',
-                       dest='overlap_p2p_comm_warmup_flush')
     group.add_argument('--distributed-backend', default='nccl',
                        choices=['nccl', 'gloo'],
                        help='Which backend to use for distributed training.')
@@ -2645,13 +2242,6 @@ def _add_distributed_args(parser):
                             'This timeout is applied to all process groups after initialization.')
     group.add_argument('--overlap-grad-reduce', action='store_true',
                        default=False, help='If set, overlap DDP grad reduce.')
-    group.add_argument('--defer-embedding-wgrad-compute', action='store_true',
-                       default=False, help='If set, defers the vocabulary projection linear layer weight'
-                       'gradient compute to pipeline flush.', dest='defer_embedding_wgrad_compute')
-    group.add_argument('--wgrad-deferral-limit', type=int, default=0, help='Number of micro-batches for which'
-                       'weight gradient computation of vocabulary projection is deferred, defaults to 0 which'
-                       'means all the micro-batches are deferred. Invalid if `defer-embedding-wgrad-compute`'
-                       'is not set')
     group.add_argument('--no-align-grad-reduce', action='store_false',
                        help='If not set, all PP stages will launch gradient reduces simultaneously. '
                        'Otherwise, each PP stage will independently launch as needed.',
@@ -2682,10 +2272,6 @@ def _add_distributed_args(parser):
     group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
                        help='If not set, use scatter/gather to optimize communication of tensors in pipeline.',
                        dest='scatter_gather_tensors_in_pipeline')
-    group.add_argument('--use-ring-exchange-p2p', action='store_true',
-                       default=False, help='If set, use custom-built ring exchange '
-                       'for p2p communications. Note that this option will require '
-                       'a custom built image that support ring-exchange p2p.')
     group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
@@ -2694,21 +2280,18 @@ def _add_distributed_args(parser):
                        'complete it instead. Also turns on '
                        '--use-cpu-initialization flag. This is for '
                        'external DDP manager.' )
-    group.add_argument('--account-for-embedding-in-pipeline-split', action='store_true',
-                       default=False, help='If set, *input* embedding layer will be treated as a standard transformer'
-                       'layer in the context of partition and placement for pipeline parallelism.')
-    group.add_argument('--account-for-loss-in-pipeline-split', action='store_true',
-                       default=False, help='If set, loss layer will be treated as a standard transformer'
-                       'layer in the context of partition and placement for pipeline parallelism.')
     group.add_argument('--use-distributed-optimizer', action='store_true',
                        help='Use distributed optimizer.')
-    group.add_argument('--use-nccl-ub', action='store_true', dest='nccl_ub', 
+    group.add_argument('--use-nccl-ub', action='store_true', dest='nccl_ub',
                        help='Use the userbuffer registration for DP/FSDP communication buffers.'
                        'This option will reduce GPU SM usage for the DP/FSDP communication,'
                        'which is improving the performance of the overlapped computation.')
     group.add_argument('--disable-symmetric-registration', action='store_true', dest='disable_symmetric_registration',
                        default=False, help='Disable symmetric (window) registration for NCCL userbuffer registration.'
                        'This option will force to use conventional (local) userbuffer registration when use-nccl-ub is set.')
+    group.add_argument('--fsdp-manual-registration', action='store_true', dest='fsdp_manual_registration',
+                       default=False, help='Manually register the FSDP communication buffers to NCCL user buffer.'
+                       'This option is only effective when use-megatron-fsdp and use-nccl-ub is set.')
     group.add_argument('--use-sharp', action='store_true', 
                        help='Required to enable SHARP communication.')
     group.add_argument('--sharp-enabled-group', type=str, default=None,
@@ -2716,12 +2299,21 @@ def _add_distributed_args(parser):
                        help='IB SHARP can be enabled from only one communication group. '
                        'By default, it is enabled from dp group. '
                        'Available options: [dp, dp_replica]')
+    group.add_argument('--create-all-gather-group', action='store_true',
+                   help='Create a separate process group for all-gather operations '
+                   'to overlap reduce-scatter and all-gather operations.')
     group.add_argument('--use-megatron-fsdp', action='store_true',
                        help='Use the Megatron FSDP code path in DDP.')
-    group.add_argument('--init-model-with-meta-device', action='store_true')
     group.add_argument('--data-parallel-sharding-strategy', type=str, default='no_shard',
                        choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params'],
                        help='Sharding strategy of data parallelism.')
+    group.add_argument('--outer-dp-sharding-strategy', type=str, default='no_shard',
+                       choices=['no_shard', 'optim'],
+                       help='Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode. '
+                            'Valid values are "no_shard" (DP Replication) and "optim" (Optimizer State Hybrid Sharding). '
+                            'The "optim" option is only supported when --data-parallel-sharding-strategy is "optim_grads_params". '
+                            'This option is only effective when Hybrid FSDP is enabled (i.e., when dp_outer_dim is not None). '
+                            'Default: "no_shard".')
     group.add_argument('--no-gradient-reduce-div-fusion', action='store_false', dest='gradient_reduce_div_fusion',
                        help='If not set, fuse the division in gradient reduce.')
     group.add_argument('--fsdp-double-buffer', action='store_true',
@@ -2746,20 +2338,12 @@ def _add_distributed_args(parser):
     group.add_argument('--torch-fsdp2-no-reshard-after-forward', action='store_false', dest='torch_fsdp2_reshard_after_forward',
                        help='Whether to reshard weights after forward pass when using PyTorch FSDP2. '
                        'Set to enable FSDP ZeRO-2.')
-    group.add_argument('--context-parallel-size', type=int, default=1,
-                       help='Degree of context parallelism.')
     group.add_argument('--cp-comm-type', nargs='+', type=str, default=["p2p"],
                        help='Inter-gpu communication type for context parallelism: '
                        'p2p, a2a, allgather or a2a+p2p. If a single string is provided, '
                        'all layers will share the same communication type. Users can also '
                        'specify separated types for each layer like '
                        '--cp-comm-type p2p p2p a2a a2a a2a+p2p a2a+p2p')
-    group.add_argument('--hierarchical-context-parallel-sizes', nargs='+', type=int, default=None,
-                       help='Degrees of the hierarchical context parallelism. Users should '
-                       'provide a list to specify the sizes for different levels. '
-                       '--hierarchical-context-parallel-sizes 2 4 indicates every two adjacent gpus '
-                       'forms the first level of cp groups and the cp ranks with the same odevity '
-                       'forms the second level of cp groups.')
     group.add_argument('--nccl-communicator-config-path', type=str, default=None,
                        help='Path to the yaml file with NCCL communicator '
                        'configurations. The number of min/max thread groups and thread '
@@ -2768,16 +2352,6 @@ def _add_distributed_args(parser):
     group.add_argument('--use-tp-pp-dp-mapping', action='store_true', default=False,
                         help='If set, distributed ranks initialize order is changed '
                         'from tp-cp-ep-dp-pp to tp-cp-ep-pp-dp.')
-    group.add_argument('--replication', action='store_true', default=False,
-                       help="If set, replication of local checkpoints is enabled. "
-                       "Needs to be enabled on all ranks.")
-    group.add_argument('--replication-jump', default=None, type=int,
-                       help="Specifies `J`, the spacing between ranks storing replicas of a given rank's data. "
-                       "Replicas for rank `n` may be on ranks `n+J`, `n+2J`, ..., or `n-J`, `n-2J`, etc. "
-                       "This flag has an effect only if --replication is used. "
-                       "and must be consistent across all ranks.")
-    group.add_argument('--replication-factor', default=2, type=int,
-                       help="Number of machines storing the replica of a given rank's data.")
     group.add_argument('--fake-process-group', action='store_true', default=False,
                        help='If set, initialize with fake distributed process group and all distributed communication operations will be skipped. \
                        This is quite useful for profiling memory usage of distributed training with just one GPU. \
@@ -2786,20 +2360,10 @@ def _add_distributed_args(parser):
 
 
 def _add_validation_args(parser):
-    group = parser.add_argument_group(title='validation')
-
-    group.add_argument('--full-validation', action='store_true', help='If set, each time validation occurs it uses the full validation dataset(s). This currently only works for GPT datasets!')
-    group.add_argument('--multiple-validation-sets', action='store_true', help='If set, multiple datasets listed in the validation split are evaluated independently with a separate loss for each dataset in the list. This argument requires that no weights are included in the list')
-    group.add_argument('--eval-iters', type=int, default=100,
-                       help='Number of iterations to run for evaluation'
-                       'validation/test for.')
-    group.add_argument('--eval-interval', type=int, default=1000,
-                       help='Interval between running evaluation on '
-                       'validation set.')
-    group.add_argument("--test-mode", action="store_true", help='Run all real-time test alongside the experiment.')
-    group.add_argument('--skip-train', action='store_true',
-                       default=False, help='If set, bypass the training loop, '
-                       'optionally do evaluation for validation/test, and exit.')
+    from megatron.training.training_config import ValidationConfig
+
+    val_factory = ArgumentGroupFactory(ValidationConfig)
+    group = val_factory.build_group(parser, "validation")
 
     return parser
 
@@ -2838,6 +2402,11 @@ def _add_tokenizer_args(parser):
                        help='Sentencepiece tokenizer model.')
     group.add_argument('--tokenizer-metadata', type=str, default=None,
                        help='Path to tokenizer metadata in json format.')
+    group.add_argument('--tokenizer-special-tokens', type=str, nargs='+', default=None,
+                       help='List of special tokens. For TikTokenizer needs to have '
+                            '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]')
+    group.add_argument('--legacy-tokenizer', action='store_true', default=False,
+                       help='To use Megatron-LM legacy tokenizer system.')
     group.add_argument('--tiktoken-pattern', type=str, default=None,
                        help='Which tiktoken pattern to use. Options: [v1, v2]')
     group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
@@ -2845,9 +2414,13 @@ def _add_tokenizer_args(parser):
     group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
                        help='List of tiktoken special tokens, needs to have '
                             '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]')
-    group.add_argument('--legacy-tokenizer', action='store_true', default=False,
-                       help='To use legacy tokenizer system.')
-    group.add_argument("--trust-remote-code", action="store_true",
+    group.add_argument('--tokenizer-sentencepiece-legacy', action='store_true', default=False,
+                       help='SentencePiece tokenizer wrapper legacy behavior. Allows special tokens usage.')
+    group.add_argument('--tokenizer-hf-use-fast', action='store_true', default=False,
+                       help='Whether to use fast HuggingFace tokenizer.')
+    group.add_argument('--tokenizer-hf-include-special-tokens', action='store_true', default=False,
+                       help='Converting text to ids will include special for HuggingFace tokenizer.')
+    group.add_argument("--trust-remote-code", action="store_true", default=False,
                        help='Whether or not to allow PreTrainedTokenizer to execute remote code')
     return parser
 
@@ -2863,6 +2436,10 @@ def _add_data_args(parser):
                        '(3) a list of prefixes e.g. prefix1 prefix2. '
                        'For (3), weights are inferred from the lengths of the contributing datasets. '
                        'This argument is exclusive to the other independent --*-data-path arguments.')
+    group.add_argument('--phase-transition-iterations', type=str, default=None,
+                       help='Comma-separated list of iterations where phase '
+                       'transitions occur. Requires fixed global batch size across phases. '
+                       'Does not support batch size ramp-up.')
     group.add_argument('--split', type=str, default=None,
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
@@ -2888,6 +2465,12 @@ def _add_data_args(parser):
                        'we pass in a file path from which we read those arguments. '
                        'This is useful when the list of data is too big. Format is a '
                        'json file with `train`, `valid, `test` keys')
+    group.add_argument('--per-dataset-sequences-path', default=None,
+                       help='Path to a json file with the sequences per dataset. Check the tools/build_sequences_per_dataset.py script to build this file.')
+    group.add_argument('--dataloader-fast-cache-load', action='store_true',
+                       help='Option to use the fast cache loading path when building the datasets. Requires all the dataset caches to be built and stored in --data-cache-path.')
+    group.add_argument('--dataloader-defer-npy-index-mmap', action='store_true',
+                       help='Defer the mmap of the dataset indexes (.npy files) until the first access. Requires all the dataset caches to be built and stored in --data-cache-path.')
     group.add_argument('--data-cache-path', default=None,
                        help='Path to a directory to hold cached index files.')
     group.add_argument('--no-mmap-bin-files', action='store_false',
@@ -2903,9 +2486,6 @@ def _add_data_args(parser):
                        'This should be exclusive of --seq-length')
     group.add_argument('--decoder-seq-length', type=int, default=None,
                        help="Maximum decoder sequence length to process.")
-    group.add_argument('--retriever-seq-length', type=int, default=256,
-                       help='Maximum sequence length for the biencoder model '
-                       'for retriever')
     group.add_argument('--sample-rate', type=float, default=1.0,
                        help='sample rate for training data. Supposed to be 0 '
                             ' < sample_rate < 1')
@@ -3103,21 +2683,11 @@ def _add_vision_args(parser):
     group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30,
                        help='warmup teacher temperaure epochs')
 
-    # regularization arguments
-    group.add_argument('--qk-layernorm', action='store_true',
-                       help='Whether to layer normalize the q and k attention embeddings.')
-    group.add_argument('--qk-l2-norm', action='store_true',
-                       help='Use llama 4 qk l2 norm')
-
     return parser
 
 def _add_moe_args(parser):
     group = parser.add_argument_group(title="moe")
     # General arguments
-    group.add_argument('--expert-model-parallel-size', type=int, default=1,
-                       help='Degree of expert model parallelism.')
-    group.add_argument('--expert-tensor-parallel-size', type=int, default=None,
-                       help='Degree of expert model parallelism. Default is None, which will be set to the value of --tensor-model-paralle-size.')
     group.add_argument('--num-experts', type=int, default=None,
                        help='Number of Experts in MoE (None means no MoE)')
     group.add_argument('--moe-layer-freq', type=moe_freq_type, default=1,
@@ -3126,34 +2696,8 @@ def _add_moe_args(parser):
                             '- A string containing a Python list expression that defines a custom pattern, e.g.: '
                             '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
                             'where 1 indicates an expert layer and 0 indicates a dense layer. '
-                            'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 experts layers, '
+                            'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 expert layers, '
                             '"([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.')
-    group.add_argument('--moe-ffn-hidden-size', type=int, default=None,
-                       help='The hidden size of each expert\'s feed-forward network (ffn). '
-                       'If not specified, defaults to the ffn_hidden_size.')
-    group.add_argument('--moe-shared-expert-intermediate-size', type=int, default=None,
-                       help='Shared expert total ffn hidden size. '
-                       'It should be equal to "num_shared_experts * ffn_size_of_each_shared_expert" if there are multiple shared experts. '
-                       'None means no shared expert. '
-                       'By default, the shared experts execute before the router. However, when '
-                       '--moe-shared-expert-overlap or --overlap-moe-expert-parallel-comm is set, '
-                       'the shared experts execute after the router, before the routed experts. '
-                       'This makes the gradients from the router and the shared experts added in '
-                       'different orders to the hidden_states, causing minor numerical differences '
-                       'in the hidden_states gradient.')
-    group.add_argument('--moe-shared-expert-overlap', action='store_true',
-                       help='Enable overlapping between shared expert computations and dispatcher communications. '
-                       'Without this, the shared experts execute before the router. '
-                       'Only effective when moe-shared-expert-intermediate-size is set.')
-    group.add_argument('--moe-grouped-gemm', action='store_true',
-                       help='When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine.')
-    group.add_argument('--moe-use-legacy-grouped-gemm', action='store_true',
-                       help='Use legacy GroupedMLP rather than TEGroupedMLP. Note: The legacy one will be deprecated soon.')
-    group.add_argument('--moe-layer-recompute', action='store_true',
-                       help='Enable checkpointing for moe_layer, should be used when memory is not sufficient. '
-                       'Deprecated. Use "--recompute-granularity selective --recompute-modules moe" instead.')
-    group.add_argument('--moe-extended-tp', action='store_true',
-                       help='Deprecated. Use --expert-tensor-parallel-size instead.')
     group.add_argument('--moe-use-upcycling', action='store_true',
                        help='Load a checkpoint of a dense model, convert it into an MoE model, and save the converted model to the path specified by --save. '
                        'Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.')
@@ -3162,89 +2706,10 @@ def _add_moe_args(parser):
                        choices=['aux_loss', 'seq_aux_loss', 'global_aux_loss', 'sinkhorn', 'none'],
                        default='aux_loss',
                        help='Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss".')
-    group.add_argument('--moe-router-dtype', type=str,
-                       choices=['fp32', 'fp64'],
-                       default=None,
-                       help='Data type for routing computation and expert output weighted averaging. '
-                            'Fp32/fp64 enhances numerical stability, especially with numerous experts. '
-                            'The perf impact should be negligible when used with permute fusion. '
-                            'None means no changes for dtype.')
-    group.add_argument('--moe-router-fusion', action='store_true',
-                       help='Enable fusion for MoE TopK routing and aux-loss computation. This is only supported in TransformerEngine 2.7.0 and above.')
-    group.add_argument('--moe-router-score-function', type=str,
-                       choices=['softmax', 'sigmoid'],
-                       default='softmax',
-                       help='Score function for MoE TopK routing. Can be "softmax" or "sigmoid".')
-    group.add_argument('--moe-router-topk', type=int, default=2,
-                       help='Number of experts to route to for each token. The default is 2.')
-    group.add_argument('--moe-router-pre-softmax', action='store_true',
-                       help='Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k.')
-    group.add_argument('--moe-router-num-groups', type=int, default=None,
-                       help='Number of groups to divide experts into for group-limited routing. When using group-limited routing: 1) Experts are divided into equal-sized groups, 2) For each token, a subset of groups are selected based on routing scores (sum of top-2 expert scores within each group), 3) From these selected groups, moe_router_topk experts are chosen.'
-                       'Two common use cases: 1) Device-limited routing: Set equal to expert parallel size (EP) to limit each token to experts on a subset of devices (See DeepSeek-V2: https://arxiv.org/pdf/2405.04434) 2) Node-limited routing: Set equal to number of nodes in EP group to limit each token to experts on a subset of nodes (See DeepSeek-V3: https://arxiv.org/pdf/2412.19437)')
-    group.add_argument('--moe-router-group-topk', type=int, default=None,
-                       help='Number of selected groups for group-limited routing.')
-    group.add_argument('--moe-router-topk-scaling-factor', type=float, default=None,
-                       help='Scaling factor for routing score in top-k selection, only works when --moe-router-pre-softmax enabled. Defaults to None, which means no scaling.')
-    group.add_argument('--moe-router-enable-expert-bias', action='store_true',
-                       help='TopK routing with dynamic expert bias in the aux-loss-free load balancing strategy. '
-                       'The routing decision is based on the sum of the routing scores and the expert bias. '
-                       'See https://arxiv.org/abs/2408.15664 for details.')
-    group.add_argument('--moe-router-bias-update-rate', type=float, default=1e-3,
-                       help='Expert bias update rate in the aux-loss-free load balancing strategy. '
-                       'The expert bias is updated based on the number of assigned tokens to each expert in a global batch, '
-                       'where the bias is increased for the experts with less assigned tokens and decreased for the experts with more assigned tokens. '
-                       'The default value 1e-3 is same as that used in DeepSeekV3.')
-    group.add_argument('--moe-router-force-load-balancing', action='store_true',
-                       help='[Experimental] Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only!')
-    group.add_argument('--moe-router-padding-for-quantization', action='store_true',
-                       help='Pad the routing_map to make sure the number of tokens each expert received '
-                       'is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for '
-                       'dropless training with FP8/FP4 precision when num_local_experts > 1. This is a more '
-                       'efficient way to pad for FP8/FP4 which eliminates the explicit padding in the '
-                       'GroupedMLP layer.')
-    group.add_argument('--moe-router-padding-for-fp8', action='store_true',
-                       help='[Compatibility alias for --moe-router-padding-for-quantization] '
-                       'Enabling this will also enable --moe-router-padding-for-quantization.')
     group.add_argument('--moe-aux-loss-coeff', type=float, nargs='+', default=0.0,
                        help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.')
-    group.add_argument('--moe-z-loss-coeff', type=float, default=None,
-                       help='Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended.')
-    group.add_argument('--moe-input-jitter-eps', type=float, default=None,
-                       help='Add noise to the input tensor by applying jitter with a specified epsilon value.')
-    group.add_argument('--moe-per-layer-logging', action='store_true',
-                       help='Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.')
     # Token dispatcher arguments
-    group.add_argument('--moe-token-dispatcher-type', type=str,
-                       choices=['allgather', 'alltoall', 'flex'],
-                       default='allgather',
-                       help="The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather', 'alltoall'. We recommend using 'alltoall' when applying expert parallelism. For more information, please refer to the documentation in core/moe/README.")
-    group.add_argument('--moe-enable-deepep', action='store_true',
-                       help='DEPRECATED: Please use --moe-flex-dispatcher-backend=deepep instead.')
-    group.add_argument('--moe-flex-dispatcher-backend', type=str,
-                       choices=['deepep', 'hybridep'],
-                       default='deepep',
-                       help='The backend to use for flex token dispatcher. The default is "deepep". Options are "deepep" and "hybridep".')
-    group.add_argument('--moe-deepep-num-sms', type=int, default=20,
-                       help='Number of SMs to use for DeepEP.')
-    group.add_argument('--moe-hybridep-num-sms', type=int, default=16,
-                       help='Number of SMs to use for HybridEP.')
-    group.add_argument('--moe-permute-fusion', action='store_true',
-                       help='Fuse token rearrangement ops during token dispatching.')
-    # Token dropping arguments
-    group.add_argument('--moe-expert-capacity-factor', type=float, default=None,
-                       help='The capacity factor for each expert, None means no token will be dropped.')
-    group.add_argument('--moe-pad-expert-input-to-capacity', action='store_true',
-                       help='Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set.')
-    group.add_argument('--moe-token-drop-policy', type=str, default='probs', choices=['probs', 'position'],
-                       help='The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped.')
-    group.add_argument('--moe-apply-probs-on-input', action='store_true',
-                       help='Apply probs before mlp activation for moe routing.')
     # MoE communication overlap arguments
-    group.add_argument('--overlap-moe-expert-parallel-comm', action='store_true',
-                       help='Overlap the EP A2A communication by batch-level overlapping in 1f1b stage.')
-    group.add_argument('--delay-wgrad-compute', action='store_true',
-                       help='Delay the wgrad compute for batch-level overlapping')
 
     group.add_argument('--moe-upcycling-granularity', type=int, default=1,
                        help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. '
@@ -3278,11 +2743,25 @@ def _add_mla_args(parser):
 
     return parser
 
+def _add_experimental_attention_variant_args(parser):
+    group = parser.add_argument_group(title="experimental_attention_variant")
+    # Linear attention
+    group.add_argument('--linear-attention-freq', type=la_freq_type, default=None,
+                       help='Frequency between LA (linear attention) layers and'
+                            ' SDPA (scaled dot-product attention) layers. Accepts either: '
+                            '- An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer '
+                            '- A string containing a Python list expression that defines a custom pattern, e.g.: '
+                            '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
+                            'where 1 indicates an LA layer and 0 indicates a SDPA layer. '
+                            'Examples: "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers, '
+                            '"([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice.')
+    return parser
+
 def _add_heterogeneous_args(parser):
     """
-    Heterogeneous models refer to transformer architectures where individual layers can differ 
+    Heterogeneous models refer to transformer architectures where individual layers can differ
     in configuration. Specifically:
-        - Attention or MLP layers can be replaced with either a linear layer or a no-op 
+        - Attention or MLP layers can be replaced with either a linear layer or a no-op
         - MLP intermediate dimensions can vary between layers
     We use the format of the HuggingFace config files in llama nemotron models to define the architecture.
     For example, https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1/resolve/main/config.json
@@ -3354,20 +2833,6 @@ def _add_experimental_args(parser):
                             'hybrid ratio arguments, then the number of each type'
                             'of layer in the override pattern must match number in'
                             'the overidden pattern')
-    group.add_argument('--mamba-state-dim', type=int, default=128,
-                       help='State dimension for Mamba layers.')
-    group.add_argument('--mamba-head-dim', type=int, default=64,
-                       help='Head dimension for Mamba layers.')
-    group.add_argument('--mamba-num-groups', type=int, default=8,
-                       help='Number of groups for Mamba layers.')
-    group.add_argument('--mamba-num-heads', type=int, default=None,
-                       help='Number of heads for Mamba layers.'
-                       'If not set, then the number of heads will be '
-                       '--hidden-size * expand // --mamba-head-dim')
-    group.add_argument('--is-hybrid-model', default=False, action="store_true",
-                       help='Indicates whether the model is a hybrid model.')
-    group.add_argument('--disable-mamba-mem-eff-path', default=False, action="store_true",
-                       help='Disable Mamba efficient path.')
     group.add_argument('--yaml-cfg', type=str, default=None,
                        help = 'Config file to add additional arguments')
 
@@ -3405,13 +2870,12 @@ def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser):
     If kitchen isn't available, nothing to do here, return unchanged parser
     """
     try:
-        from megatron.core.extensions.kitchen import KitchenSpecProvider
+        from megatron.core.extensions.kitchen import KitchenSpecProvider, HAVE_KITCHEN
 
-        have_kitchen = True
     except (ImportError, ModuleNotFoundError):
-        have_kitchen = False
+        HAVE_KITCHEN = False
 
-    if have_kitchen:
+    if HAVE_KITCHEN:
         group = parser.add_argument_group(title="kitchen")
         recipe_or_config_group = group.add_mutually_exclusive_group(required=False)
         recipe_or_config_group.add_argument(
@@ -3425,13 +2889,14 @@ def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser):
             '--kitchen-recipe-number',
             type=int,
             default=None,
-            help="Use a default kitchen recipe for all layers as defined by QAT_PARAMS index",
+            help="Use a default kitchen recipe for all linear layers as defined by QAT_PARAMS index. "
+            "The argument has no effect on attention layers.",
         )
     return parser
 
 def _add_sft_args(parser):
     group = parser.add_argument_group(title='sft')
     group.add_argument('--sft', action="store_true", help='Megatron SFT training')
-    group.add_argument('--sft-tokenizer-prompt-format', type=str, default="nemotron-h-aligned", 
+    group.add_argument('--sft-tokenizer-prompt-format', type=str, default="nemotron-h-aligned",
                        help='SFT prompt format.')
     return parser
diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py
index eb23e7cc092..a3d307f1e30 100644
--- a/megatron/training/checkpointing.py
+++ b/megatron/training/checkpointing.py
@@ -10,6 +10,7 @@
 import threading
 import types
 from argparse import Namespace
+from datetime import datetime
 from enum import Enum, auto
 from logging import getLogger
 from pathlib import Path
@@ -17,6 +18,7 @@
 
 import numpy as np
 import torch
+from typing import Optional, Union, List, Dict, Any
 from torch.distributed.checkpoint import FileSystemReader, default_planner
 
 from megatron.core import dist_checkpointing, mpu, tensor_parallel
@@ -28,11 +30,13 @@
 )
 from megatron.core.msc_utils import MultiStorageClientFeature, open_file
 from megatron.core.num_microbatches_calculator import update_num_microbatches
+from megatron.core.utils import get_pg_rank, get_pg_size
 from megatron.core.optimizer import DistributedOptimizer
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.utils import get_torch_version, is_torch_min_version
 
 from ..core.dist_checkpointing.serialization import get_default_save_sharded_strategy
+from ..core.dist_checkpointing.utils import _clean_metadata_for_serialization
 from . import ft_integration, wandb_utils
 from .async_utils import is_empty_async_queue, schedule_async_save
 from .global_vars import get_args
@@ -55,11 +59,13 @@
 # [ModelOpt]: Import
 try:
     from modelopt.torch.opt.plugins import save_modelopt_state, save_sharded_modelopt_state
+    from megatron.post_training.utils import print_distributed_quant_summary
     has_nvidia_modelopt = True
 except Exception:
     has_nvidia_modelopt = False
 
 _CHECKPOINT_VERSION = None
+_LOADED_ITERATION = None
 
 logger = getLogger(__name__)
 _NON_PERSISTENT_CKPT_SUBDIR = 'non_persistent'
@@ -77,6 +83,22 @@ def get_checkpoint_version():
     return _CHECKPOINT_VERSION
 
 
+def set_loaded_iteration(value):
+    """Set the iteration that was loaded from checkpoint.
+
+    This is stored separately from args to avoid polluting the checkpoint
+    with runtime state (args is saved in checkpoints).
+    """
+    global _LOADED_ITERATION
+    _LOADED_ITERATION = value
+
+
+def get_loaded_iteration():
+    """Get the iteration that was loaded from checkpoint, or None if no checkpoint was loaded."""
+    global _LOADED_ITERATION
+    return _LOADED_ITERATION
+
+
 def check_checkpoint_args(checkpoint_args):
     """Ensure fixed arguments for a model are the same for the input
     arguments and the one retrieved from checkpoint."""
@@ -109,6 +131,8 @@ def _compare(arg_name, old_arg_name=None, default=None):
         _compare('tokenizer_type')
     if args.data_parallel_random_init:
         _compare('data_parallel_random_init')
+    if args.phase_transition_iterations:
+        _compare('global_batch_size')
     if get_checkpoint_version() < 3.0:
         _compare('tensor_model_parallel_size',
                  old_arg_name='model_parallel_size')
@@ -309,7 +333,7 @@ def read_metadata(tracker_filename):
     return max_iter, release
 
 
-def get_rng_state(ckpt_format: str):
+def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessGroup, pp_group: torch.distributed.ProcessGroup) -> Union[List[Dict[str, Any]], ShardedObject]:
     """Collect rng state across data parallel ranks."""
     args = get_args()
     rng_state = {
@@ -332,12 +356,31 @@ def get_rng_state(ckpt_format: str):
         rng_state_list = [rng_state]
 
     if ckpt_format == "torch_dist":
-        pp_rank = mpu.get_pipeline_model_parallel_rank()
-        pp_size = mpu.get_pipeline_model_parallel_world_size()
-        tp_rank = mpu.get_tensor_model_parallel_rank()
-        tp_size = mpu.get_tensor_model_parallel_world_size()
-        rng_state_list = ShardedObject('rng_state', rng_state_list, (pp_size, tp_size), (pp_rank, tp_rank),
-                                       replica_id=mpu.get_data_parallel_rank(with_context_parallel=True))
+        pp_rank = get_pg_rank(pp_group)
+        pp_size = get_pg_size(pp_group)
+        tp_rank = get_pg_rank(tp_group)
+        tp_size = get_pg_size(tp_group)
+        ep_size = mpu.get_expert_model_parallel_world_size()
+
+        if ep_size > 1:
+            # Shard RNG by PP, TP, DP when using expert parallelism.
+            dp_rank = mpu.get_data_parallel_rank(with_context_parallel=True)
+            dp_size = mpu.get_data_parallel_world_size(with_context_parallel=True)
+            rng_state_list = ShardedObject(
+                'rng_state',
+                rng_state_list,
+                (pp_size, tp_size, dp_size),
+                (pp_rank, tp_rank, dp_rank),
+                replica_id=0,
+            )
+        else:
+            rng_state_list = ShardedObject(
+                'rng_state',
+                rng_state_list,
+                (pp_size, tp_size),
+                (pp_rank, tp_rank),
+                replica_id=mpu.get_data_parallel_rank(with_context_parallel=True),
+            )
     elif ckpt_format == "fsdp_dtensor":
         pp_rank = mpu.get_pipeline_model_parallel_rank()
         tp_rank = mpu.get_tensor_model_parallel_rank()
@@ -354,7 +397,8 @@ class CheckpointType(Enum):
     TORCH_DCP = auto()
     FSDP_DTENSOR = auto()
 
-def _build_sharded_state_dict_metadata(args: Namespace) -> dict:
+
+def _build_sharded_state_dict_metadata(args: Namespace, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None) -> dict:
     """Builds metadata used for sharded_state_dict versioning.
 
     The whole content metadata is passed to ``shared_state_dict`` model and optimizer methods
@@ -364,39 +408,69 @@ def _build_sharded_state_dict_metadata(args: Namespace) -> dict:
     In particular, a simple integer (or SemVer) versioning flag (e.g. `metadata['version'] = 3.4`)
     is discouraged, because the metadata serves for all models and optimizers and it's practically
     impossible to enforce a linearly increasing versioning for this whole space.
+
+    Args:
+        args: Arguments namespace
+        dp_cp_group: Data parallel + context parallel group (default: None, falls back to mpu API)
     """
     metadata = {}
 
     if args.use_distributed_optimizer and args.ckpt_format == "fsdp_dtensor":
         metadata['distrib_optim_sharding_type'] = 'fsdp_dtensor'
 
-    # Force pre-mcore 0.14 behavior for PyTorch versions below 2.6a0
-    force_pre_mcore_014 = not is_torch_min_version("2.6a0")
-    if force_pre_mcore_014 and not args.dist_ckpt_save_pre_mcore_014:
-        logger.warning(f"PyTorch version {get_torch_version()} below 2.6 detected."
-                       f" Forcing dist_ckpt_save_pre_mcore_014 behavior.")
-
-    if args.dist_ckpt_save_pre_mcore_014 or force_pre_mcore_014:
-        if args.use_distributed_optimizer and args.ckpt_format != "fsdp_dtensor":
-            if args.ckpt_fully_parallel_save:
-                metadata['distrib_optim_sharding_type'] = 'fully_sharded_model_space'
-            else:
-                metadata['distrib_optim_sharding_type'] = 'dp_zero_gather_scatter'
-    else:
-        if args.use_distributed_optimizer and args.ckpt_format != "fsdp_dtensor":
-            if args.dist_ckpt_optim_fully_reshardable:
-                metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
-                metadata['distrib_optim_fully_reshardable_mem_efficient'] = args.distrib_optim_fully_reshardable_mem_efficient
-            else:
-                metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
+    if args.use_distributed_optimizer and args.ckpt_format != "fsdp_dtensor":
+        if args.dist_ckpt_optim_fully_reshardable:
+            metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
+            metadata['distrib_optim_fully_reshardable_mem_efficient'] = args.distrib_optim_fully_reshardable_mem_efficient
+        else:
+            metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
 
     metadata['singleton_local_shards'] = False
     metadata['chained_optim_avoid_prefix'] = True
+    # Add dp_cp_group to metadata. If not provided, fallback to global parallel state.
+    if dp_cp_group is None:
+        dp_cp_group = mpu.get_data_parallel_group(with_context_parallel=True)
+    metadata['dp_cp_group'] = dp_cp_group
     return metadata
 
+
+def save_grads(save_dir, state_dict, iteration, grad_label):
+    """Persist state_dict of grads onto disk. In case of wgrads, this collection should
+    be performed before the grads are cleared but after they are reduced.
+
+    NOTE: wgrads for non-expert layers will be duplicated if using expert parallelism, but
+    this can be handled in postprocessing."""
+
+    print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving {grad_label} "
+                 f"from iteration {iteration:7d}")
+
+    if mpu.get_expert_data_parallel_rank() == 0:
+        # Create saving directory.
+        ep_rank = mpu.get_expert_model_parallel_rank()
+        pp_rank = mpu.get_pipeline_model_parallel_rank()
+        tp_rank = mpu.get_tensor_model_parallel_rank()
+        assert save_dir is not None
+        assert iteration is not None
+        save_dir = os.path.join(save_dir, grad_label, f"iter_{iteration:07d}")
+        os.makedirs(save_dir, exist_ok=True)
+
+        # Save state_dict.
+        checkpoint_name = f"mp_rank_{tp_rank:02d}"
+        if mpu.get_pipeline_model_parallel_world_size() > 1:
+            checkpoint_name += f"_{pp_rank:03d}"
+        if mpu.get_expert_model_parallel_world_size() > 1:
+            checkpoint_name += f"_{ep_rank:03d}"
+        full_save_path = os.path.join(save_dir, f"{checkpoint_name}.pth")
+        # Convert back to dict (e.g., from collections.defaultdict) for easy loading later.
+        torch.save(dict(state_dict), full_save_path)
+
+    print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saved {grad_label} "
+                 f"from iteration {iteration:7d}")
+
+
 def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far,
                     checkpointing_context=None, pipeline_rank=None, expert_rank=None, tensor_rank=None, pipeline_parallel=None, expert_parallel=None, non_persistent_ckpt=False,
-                    train_data_iterator=None, preprocess_common_state_dict_fn = None, release=False):
+                    train_data_iterator=None, preprocess_common_state_dict_fn = None, release=False, tp_group: Optional[torch.distributed.ProcessGroup] = None, pp_group: Optional[torch.distributed.ProcessGroup] = None, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None):
     """Save a model, optimizer and optionally dataloader checkpoint.
 
     Checkpointing context is used to persist some checkpointing state
@@ -410,6 +484,9 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
 
     Dataloader checkpoint is only saved if the dataloader supports it. Currently this applies only
     to the Megatron Energon dataloader (multimodal) and not the built-in Megatron dataloader (text-only).
+
+    Args:
+        dp_cp_group: Data parallel + context parallel group (default: None, falls back to mpu API)
     """
     start_ckpt = time()
     args = get_args()
@@ -449,11 +526,14 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
             raise NotImplementedError(f"Please use local or global non-persistent checkpoints (got: {args.non_persistent_ckpt_type})")
 
     ckpt_format = args.ckpt_format if ckpt_type == CheckpointType.GLOBAL else 'torch'
-    print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format(
-        iteration, save_dir, ckpt_format))
+    print_rank_0(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving checkpoint "
+                 f"at iteration {iteration:7d} to {save_dir} in {ckpt_format} format")
 
     # Collect rng state across data parallel ranks.
-    rng_state = get_rng_state(args.ckpt_format)
+    if tp_group is None and pp_group is None:
+        tp_group = mpu.get_tensor_model_parallel_group()
+        pp_group = mpu.get_pipeline_model_parallel_group()
+    rng_state = get_rng_state(args.ckpt_format, tp_group, pp_group)
 
     # Collect rerun state across all ranks
     rerun_state_machine = get_rerun_state_machine()
@@ -482,6 +562,14 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
         if not optimizer.is_stub_optimizer:
             optimizer.save_parameter_state(optim_checkpoint_name)
 
+    # LayerWiseDistributedOptimizer save optimizer state to file on different ranks
+    if getattr(args, "optimizer", "adam").startswith("dist_") and args.ckpt_format == 'torch':
+        dp_rank = mpu.get_data_parallel_rank()
+        optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt")
+        ensure_directory_exists(optim_checkpoint_name)
+        if not optimizer.is_stub_optimizer:
+            optimizer.save_state_dict_to_file(optim_checkpoint_name)
+
     async_save_request = None
     if args.async_save:
         if ckpt_type == CheckpointType.LEGACY:
@@ -496,7 +584,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
             or mpu.get_expert_data_parallel_rank() == 0 \
             or ckpt_type != CheckpointType.LEGACY:
         if ckpt_type != CheckpointType.LEGACY:
-            sharded_sd_metadata = _build_sharded_state_dict_metadata(args)
+            sharded_sd_metadata = _build_sharded_state_dict_metadata(args, dp_cp_group=dp_cp_group)
             if args.use_distributed_optimizer:
                 print_rank_0(f'Storing distributed optimizer sharded state of type'
                              f' {sharded_sd_metadata["distrib_optim_sharding_type"]}')
@@ -548,7 +636,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                                                          async_sharded_save=args.async_save,
                                                          validate_access_integrity=validate_sharding_integrity,
                                                          preprocess_common_before_consistancy_check=preprocess_common_state_dict_fn,
-                                                         content_metadata=sharded_sd_metadata)
+                                                         content_metadata=_clean_metadata_for_serialization(sharded_sd_metadata))
             # [ModelOpt]: save sharded modelopt_state
             if has_nvidia_modelopt:
                 save_sharded_modelopt_state(model, checkpoint_name, (args.ckpt_format, 1))
@@ -619,8 +707,8 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
 
         if ckpt_type == CheckpointType.LOCAL:
             def iter_finalize_fn():
-                print_rank_0('  successfully saved local checkpoint from iteration {:7d}'
-                             .format(iteration))
+                print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully "
+                             f"saved local checkpoint from iteration {iteration:7d}")
                 if args.log_progress and args.async_save:
                     append_to_progress_log(f'Saved async local checkpoint\tIteration: {iteration}',
                                            barrier=False)
@@ -636,9 +724,10 @@ def iter_finalize_fn():
                     f.write("release" if release else str(iteration))
                 tensor_rank_to_print = (tensor_rank if tensor_rank is not None else mpu.get_tensor_model_parallel_rank()) + 1
                 pipeline_rank_to_print = (pipeline_rank if pipeline_rank is not None else mpu.get_pipeline_model_parallel_rank()) + 1
-                print_rank_0(f'  successfully saved checkpoint from iteration {int(iteration):7d} to {args.save} '
-                             f'[ t {tensor_rank_to_print}/{mpu.get_tensor_model_parallel_world_size()}, '
-                             f'p {pipeline_rank_to_print}/{mpu.get_pipeline_model_parallel_world_size()} ]')
+                print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully saved "
+                             f"checkpoint from iteration {int(iteration):7d} to {args.save} "
+                             f"[ t {tensor_rank_to_print}/{mpu.get_tensor_model_parallel_world_size()}, "
+                             f"p {pipeline_rank_to_print}/{mpu.get_pipeline_model_parallel_world_size()} ]")
                 if args.log_progress and args.async_save:
                     append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}',
                                            barrier=False)
@@ -648,8 +737,9 @@ def delete_checkpoint(args, iteration_to_delete):
                                                           return_base_dir=True)
                     try:
                         shutil.rmtree(checkpoint_name)  # TODO: Make this work with MSC remote paths?
-                        print_rank_0(f'  successfully deleted checkpoint from iteration {iteration_to_delete:7d} '
-                                     f'at {args.save}')
+                        print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully "
+                                     f"deleted checkpoint from iteration {iteration_to_delete:7d} "
+                                     f"at {args.save}")
                         if args.log_progress:
                             append_to_progress_log(f'Deleted checkpoint\tIteration: {iteration_to_delete}', barrier=False)
                     except Exception as e:
@@ -700,8 +790,8 @@ def wandb_finalize_fn():
 
     if args.async_save:
         schedule_async_save(async_save_request)
-        print_rank_0('  scheduled an async checkpoint save at iteration {:7d} to {}' \
-                     .format(iteration, save_dir))
+        print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] scheduled "
+                     f"an async checkpoint save at iteration {iteration:7d} to {save_dir}")
 
     # Wait so everyone is done (not necessary)
     if torch.distributed.is_initialized():
@@ -764,7 +854,8 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path)
         return
 
     dp_rank = mpu.get_data_parallel_rank()
-    print(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}")
+    if dp_rank == 0:
+        print(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}")
     train_dataloader_state_dict = train_iterator.iterable.save_state()
     data_state_save_path = get_checkpoint_name(
         dataloader_save_path, iteration,
@@ -809,7 +900,13 @@ def generate_state_dict(
             key = f"model{i}"
 
         if args.ckpt_format == "torch_dist":
-            model_sd = model[i].sharded_state_dict(**(model_sd_kwargs or {}))
+            model_sd = model[i].sharded_state_dict(
+                **(model_sd_kwargs or {
+                    "metadata": {
+                        "dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)
+                    }
+                })
+            )
         else:   # torch, torch_dcp, fsdp_dtensor
             model_sd = model[i].state_dict_for_save_checkpoint()
 
@@ -818,10 +915,16 @@ def generate_state_dict(
     # Optimizer stuff.
     if not args.no_save_optim:
         if optimizer is not None and not optimizer.is_stub_optimizer:
-            optimizer_sd = None
 
             if args.ckpt_format == "torch_dist":
-                optimizer_sd = optimizer.sharded_state_dict(state_dict, **(optim_sd_kwargs or {}))
+                optimizer_sd = optimizer.sharded_state_dict(
+                    state_dict,
+                    **(optim_sd_kwargs or {
+                        "metadata": {
+                            "dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)
+                        }
+                    })
+                )
             elif args.ckpt_format == "fsdp_dtensor":
                 if optim_sd_kwargs is None:
                     optim_sd_kwargs = {}
@@ -1085,6 +1188,10 @@ def _load_base_checkpoint(
     if getattr(args, "ckpt_step", None):
         iteration = args.ckpt_step
 
+    # Record the iteration loaded (stored separately from args to avoid
+    # polluting checkpoints, since args is saved in checkpoints).
+    set_loaded_iteration(iteration)
+
     if non_persistent_iteration != -1:  # there is a non-persistent checkpoint
         if non_persistent_iteration >= iteration:
             return _load_non_persistent_base_checkpoint(
@@ -1341,11 +1448,14 @@ def _set_arg(arg_name, old_arg_name=None, force=False):
     _set_arg('heterogeneous_layers_config_path', force=True)
     _set_arg('heterogeneous_layers_config_encoded_json', force=True)
 
+    # MoE latent projection.
+    _set_arg('moe_latent_size', force=True)
+
     # Tokenizer args.
-    _set_arg('tokenizer_type', force=True)
     # Using checkpoint version might not always be safe (e.g., if running on different cluster).
     if args.use_tokenizer_model_from_checkpoint_args:
         _set_arg('tokenizer_model', force=True)
+        _set_arg('tokenizer_type', force=True)
     _set_arg('tiktoken_pattern', force=True)
     _set_arg('padded_vocab_size')
 
@@ -1367,7 +1477,7 @@ def _set_arg(arg_name, old_arg_name=None, force=False):
 
 
 def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', strict=True,
-                    checkpointing_context=None, skip_load_to_model_and_opt=False):
+                    checkpointing_context=None, skip_load_to_model_and_opt=False, tp_group: Optional[torch.distributed.ProcessGroup] = None, pp_group: Optional[torch.distributed.ProcessGroup] = None, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None):
     """Load a model checkpoint and return the iteration.
     strict (bool): whether to strictly enforce that the keys in
         :attr:`state_dict` of the checkpoint match the names of
@@ -1375,6 +1485,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
     skip_load_to_model_and_opt (bool): whether to call `load_state_dict`
         for :attr:`model` and :attr:`optimizer`. In case of running FSDP2 with mcore distributed
         checkpointing, the tensors are already loaded in-place by `_load_base_checkpoint`.
+    dp_cp_group: Data parallel + context parallel group (default: None, falls back to mpu API)
     """
     args = get_args()
     load_dir = getattr(args, load_arg)
@@ -1424,13 +1535,13 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
             ckpt_args = state_dict.get("args")
 
         if not hasattr(ckpt_args, "tensor_model_parallel_size"):
-            print_rank_0("WARNING: TP size not found in checkpoint args, using 0 as default.")
+            print_rank_0("WARNING: TP size not found in checkpoint args, using 1 as default.")
         if not hasattr(ckpt_args, "pipeline_model_parallel_size"):
-            print_rank_0("WARNING: PP size not found in checkpoint args, using 0 as default.")
+            print_rank_0("WARNING: PP size not found in checkpoint args, using 1 as default.")
 
         ckpt_tp_pp = (
-            getattr(ckpt_args, "tensor_model_parallel_size", 0),
-            getattr(ckpt_args, "pipeline_model_parallel_size", 0),
+            getattr(ckpt_args, "tensor_model_parallel_size", 1),
+            getattr(ckpt_args, "pipeline_model_parallel_size", 1),
         )
         run_tp_pp = (
             args.tensor_model_parallel_size,
@@ -1448,7 +1559,10 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         # Determine if RNG state will be loaded
         if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng
                 and not getattr(ckpt_args, 'no_save_rng', False)):
-            gen_sd_rng_state = get_rng_state(args.ckpt_format)  # we can load the rng state
+            if tp_group is None and pp_group is None:
+                tp_group = mpu.get_tensor_model_parallel_group()
+                pp_group = mpu.get_pipeline_model_parallel_group()
+            gen_sd_rng_state = get_rng_state(args.ckpt_format, tp_group, pp_group)  # we can load the rng state
         else:
             ignore_rng_state = True
             gen_sd_rng_state = None
@@ -1460,6 +1574,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         else:
             sharded_sd_metadata = dist_checkpointing.load_content_metadata(preloaded_state_dict=state_dict)
         print_rank_0(f'sharded_state_dict metadata loaded from the checkpoint: {sharded_sd_metadata}')
+
         # Determine if optimizer state will be loaded
         if (not release and not args.finetune and not args.no_load_optim
                 and not getattr(ckpt_args, 'no_save_optim', False)):
@@ -1493,6 +1608,15 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
             gen_sd_optim = None
             gen_sd_opt_param_scheduler = None
 
+        if dp_cp_group is None:
+            dp_cp_group = mpu.get_data_parallel_group(with_context_parallel=True)
+
+        # dist_checkpointing.load_content_metadata(...) may return None.
+        # Ensure we have a dict before updating to avoid NoneType AttributeError.
+        if sharded_sd_metadata is None:
+            sharded_sd_metadata = {}
+        sharded_sd_metadata["dp_cp_group"] = dp_cp_group
+
         optim_sd_kwargs = dict(metadata=sharded_sd_metadata, is_loading=True)
         model_sd_kwargs = dict(metadata=sharded_sd_metadata)
 
@@ -1534,12 +1658,15 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
     elif args.ckpt_format == "torch_dcp":
         model_sd = model[0].state_dict()
         optimizer_sd = optimizer.state_dict(is_loading=True)
+        if tp_group is None and pp_group is None:
+            tp_group = mpu.get_tensor_model_parallel_group()
+            pp_group = mpu.get_pipeline_model_parallel_group()
         sharded_state_dict = {
             "model": model_sd,
             "optimizer": optimizer_sd,
             "args": None,
             "iteration": 1,
-            "rng_state": get_rng_state(args.ckpt_format),
+            "rng_state": get_rng_state(args.ckpt_format, tp_group, pp_group),
             "checkpoint_version": None,
             "opt_param_scheduler": opt_param_scheduler.state_dict(),
             "num_floating_point_operations_so_far": 0,
@@ -1562,7 +1689,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
                     data_iterator=None, ckpt_format=ckpt_format, force=True,
                 )
             if not args.no_load_rng:
-                gen_sd_rng_state = get_rng_state(args.ckpt_format)
+                gen_sd_rng_state = get_rng_state(args.ckpt_format, tp_group, pp_group)
             if not args.no_load_optim:
                 gen_sd_optim = optimizer
                 gen_sd_opt_param_scheduler = opt_param_scheduler
@@ -1639,7 +1766,6 @@ def load_model_state_dict(module, state_dict, strict: bool):
                 load_return = module.load_state_dict(state_dict, strict=False)
                 print(f"load_return: {load_return}")
     # Model.
-    strict = False if args.retro_add_retriever else strict
     if not skip_load_to_model_and_opt:
         if len(ddp_model) == 1:
             load_model_state_dict(ddp_model[0], state_dict['model'], strict)
@@ -1659,7 +1785,12 @@ def load_model_state_dict(module, state_dict, strict: bool):
     if not release and not args.finetune and not args.no_load_optim:
         try:
             # Load state dict.
-            if not skip_load_to_model_and_opt and optimizer is not None and not optimizer.is_stub_optimizer:
+            if getattr(args, "optimizer", "adam").startswith("dist_") and args.ckpt_format == 'torch':
+                # LayerWiseDistributedOptimizer load optimizer state from file on different ranks
+                dp_rank = mpu.get_data_parallel_rank()
+                optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt")
+                optimizer.load_state_dict_from_file(optim_checkpoint_name)
+            elif not skip_load_to_model_and_opt and optimizer is not None and not optimizer.is_stub_optimizer:
                 optimizer.load_state_dict(state_dict['optimizer'])
 
             # Load distributed optimizer's custom parameter state.
@@ -1704,11 +1835,13 @@ def load_model_state_dict(module, state_dict, strict: bool):
             if 'rerun_state_machine' in state_dict:
                 get_rerun_state_machine().load_state_dict(state_dict['rerun_state_machine'])
         except Exception as e:
-            print(f"Unable to restore RerunMachine from checkpoint: {e}. Skipping.")
+            print_rank_0(f"Unable to restore RerunMachine from checkpoint: {e}. Skipping.")
 
     # rng states.
     if not release and not args.finetune and not args.no_load_rng and not ignore_rng_state:
         try:
+            cuda_rng_tracker = tensor_parallel.get_cuda_rng_tracker()
+            graph_safe_rng = tensor_parallel.is_graph_safe_cuda_rng_tracker(cuda_rng_tracker)
             if 'rng_state' in state_dict:
                 if args.ckpt_format == "fsdp_dtensor":
                     # FSDP DTensor checkpoints store rng_state in a different format.
@@ -1717,7 +1850,7 @@ def load_model_state_dict(module, state_dict, strict: bool):
                     if f"({pp_rank}, {tp_rank})" in state_dict['rng_state']:
                         rng_state = state_dict['rng_state'][f"({pp_rank}, {tp_rank})"]
                     else:
-                        print("WARNING: RNG state not found for current TP/PP rank")
+                        print_rank_0("WARNING: RNG state not found for current TP/PP rank")
                         rng_state = next(iter(state_dict['rng_state'].values()))
                 else:
                     rng_state = state_dict['rng_state']
@@ -1734,8 +1867,10 @@ def load_model_state_dict(module, state_dict, strict: bool):
                 # Check for empty states array
                 if not rng_state['rng_tracker_states']:
                     raise KeyError
-                tensor_parallel.get_cuda_rng_tracker().set_states(
-                    rng_state['rng_tracker_states'])
+                rng_tracker_states = {
+                    k: tensor_parallel.convert_cuda_rng_state(v, to_graphable=graph_safe_rng)
+                    for k, v in rng_state['rng_tracker_states'].items()
+                }
             else:  # backward compatability
                 random.setstate(state_dict['random_rng_state'])
                 np.random.set_state(state_dict['np_rng_state'])
@@ -1744,8 +1879,11 @@ def load_model_state_dict(module, state_dict, strict: bool):
                 # Check for empty states array
                 if not state_dict['rng_tracker_states']:
                     raise KeyError
-                tensor_parallel.get_cuda_rng_tracker().set_states(
-                    state_dict['rng_tracker_states'])
+                rng_tracker_states = {
+                    k: tensor_parallel.convert_cuda_rng_state(v, to_graphable=graph_safe_rng)
+                    for k, v in state_dict['rng_tracker_states'].items()
+                }
+            cuda_rng_tracker.set_states(rng_tracker_states)
         except KeyError:
             print_rank_0('Unable to load rng state from checkpoint {}. '
                          'Specify --no-load-rng or --finetune to prevent '
@@ -1761,7 +1899,10 @@ def load_model_state_dict(module, state_dict, strict: bool):
                  f'[ t {mpu.get_tensor_model_parallel_rank() + 1}/{mpu.get_tensor_model_parallel_world_size()}, '
                  f'p {mpu.get_pipeline_model_parallel_rank() + 1}/{mpu.get_pipeline_model_parallel_world_size()} ] '
                  f'at iteration {iteration}')
-
+                 
+    if has_nvidia_modelopt:
+        print_distributed_quant_summary(model, msg="After loading checkpoint")
+        
     # Additional callback for wandb (last rank)
     if not torch.distributed.is_initialized() \
        or is_last_rank():
diff --git a/megatron/training/common_config.py b/megatron/training/common_config.py
new file mode 100644
index 00000000000..d1096e91154
--- /dev/null
+++ b/megatron/training/common_config.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+from dataclasses import dataclass, field
+
+@dataclass(kw_only=True)
+class RNGConfig:
+    """Configuration settings for random number generation."""
+
+    seed: int = 1234
+    """Random seed used for python, numpy, pytorch, and cuda."""
+
+    te_rng_tracker: bool = False
+    """Use the Transformer Engine version of the random number generator.
+    Required for CUDA graphs support."""
+
+    inference_rng_tracker: bool = False
+    """Use a random number generator configured for inference."""
+
+    data_parallel_random_init: bool = False
+    """Enable random initialization of params across data parallel ranks"""
+
+
+@dataclass(kw_only=True)
+class ProfilingConfig:
+    """Configuration settings for profiling the training process."""
+
+    use_nsys_profiler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--profile"], "dest": "profile"}})
+    """Enable nsys profiling. When using this option, nsys options should be specified in
+    commandline. An example nsys commandline is
+    `nsys profile -s none -t nvtx,cuda -o <path/to/output_file> --force-overwrite true
+    --capture-range=cudaProfilerApi --capture-range-end=stop`.
+    """
+
+    profile_step_start: int = 10
+    """Global step to start profiling."""
+
+    profile_step_end: int = 12
+    """Global step to stop profiling."""
+
+    use_pytorch_profiler: bool = False
+    """Use the built-in pytorch profiler. Useful if you wish to view profiles in tensorboard."""
+
+    profile_ranks: list[int] = field(default_factory=lambda: [0])
+    """Global ranks to profile."""
+
+    record_memory_history: bool = False
+    """Record memory history in last rank."""
+
+    memory_snapshot_path: str = "snapshot.pickle"
+    """Specifies where to dump the memory history pickle."""
+
+    record_shapes: bool = False
+    """Record shapes of tensors."""
+
+    nvtx_ranges: bool = False
+    """Enable NVTX range annotations for profiling. When enabled, inserts NVTX markers
+    to categorize execution in profiler output."""
diff --git a/megatron/training/datasets/data_samplers.py b/megatron/training/datasets/data_samplers.py
index 1e7f47510d1..ca4cc1b36a3 100644
--- a/megatron/training/datasets/data_samplers.py
+++ b/megatron/training/datasets/data_samplers.py
@@ -39,14 +39,22 @@ def build_pretraining_data_loader(dataset, consumed_samples):
             data_parallel_size=mpu.get_data_parallel_world_size(),
         )
     elif args.dataloader_type == 'single':
-        # Megatron sampler
-        batch_sampler = MegatronPretrainingSampler(
-            total_samples=len(dataset),
-            consumed_samples=consumed_samples,
-            micro_batch_size=args.micro_batch_size,
-            data_parallel_rank=mpu.get_data_parallel_rank(),
-            data_parallel_size=mpu.get_data_parallel_world_size(),
-        )
+        if args.hybrid_context_parallel:
+            batch_sampler = HybridCPMegatronPretrainingSampler(
+                total_samples=len(dataset),
+                consumed_samples=consumed_samples,
+                micro_batch_size=args.micro_batch_size,
+                global_batch_size=args.global_batch_size,
+                data_parallel_rank=mpu.get_data_parallel_rank(),
+                data_parallel_size=mpu.get_data_parallel_world_size())
+        else:
+            # Megatron sampler
+            batch_sampler = MegatronPretrainingSampler(
+                total_samples=len(dataset),
+                consumed_samples=consumed_samples,
+                micro_batch_size=args.micro_batch_size,
+                data_parallel_rank=mpu.get_data_parallel_rank(),
+                data_parallel_size=mpu.get_data_parallel_world_size())
     elif args.dataloader_type == 'cyclic':
         batch_sampler = MegatronPretrainingRandomSampler(
             dataset,
@@ -71,6 +79,10 @@ def worker_init_fn(_):
         worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None
     )
     # Torch dataloader.
+    if args.hybrid_context_parallel:
+        extra_kwargs = {"collate_fn": lambda x: x,}
+    else:
+        extra_kwargs = {}
     return torch.utils.data.DataLoader(
         dataset,
         batch_sampler=batch_sampler,
@@ -78,9 +90,9 @@ def worker_init_fn(_):
         pin_memory=True,
         persistent_workers=True if args.num_workers > 0 else False,
         worker_init_fn=maybe_worker_init_fn,
+        **extra_kwargs,
     )
 
-
 class MegatronPretrainingSampler:
     """
     Sampler for Megatron pretraining dataloaders that divides data samples across
@@ -150,6 +162,49 @@ def __iter__(self):
             start_idx, end_idx = self.get_start_end_idx()
             yield batch[start_idx:end_idx]
 
+class HybridCPMegatronPretrainingSampler(MegatronPretrainingSampler):
+    """
+    Data sampler for hybrid context parallel (Hybrid CP) format.
+    This data sampler pulls in the entire global batch at once across all data parallel ranks.
+    This helps provide the Hybrid CP Dataloader Wrapper to schedule and load balance sub-samples
+    of the entire global batch.
+    """
+
+    def __init__(self, total_samples, consumed_samples, micro_batch_size, global_batch_size,
+                 data_parallel_rank, data_parallel_size, drop_last=True):
+        super().__init__(total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size, drop_last)
+        self.global_batch_size = global_batch_size
+        self.data_parallel_size = data_parallel_size
+        self.num_micro_batches = self.global_batch_size // self.micro_batch_times_data_parallel_size
+
+    def __len__(self):
+        return self.total_samples
+
+    def get_start_end_idx_global_batch(self):
+        start_idx = [self.data_parallel_rank * self.micro_batch_size + i * self.micro_batch_size * self.data_parallel_size for i in range(self.num_micro_batches)]
+        end_idx = [start_idx[i] + self.micro_batch_size for i in range(self.num_micro_batches)]
+        return start_idx, end_idx
+
+    def __iter__(self):
+        batch = []
+        # Last batch will be dropped if drop_last is not set False
+        for idx in range(self.consumed_samples, self.total_samples):
+            batch.append(idx)
+            if len(batch) == self.micro_batch_times_data_parallel_size * self.num_micro_batches:
+                start_idx, end_idx = self.get_start_end_idx_global_batch()
+                global_batch_idx = []
+                for i in range(self.num_micro_batches):
+                    global_batch_idx.extend(batch[start_idx[i]:end_idx[i]])
+                yield global_batch_idx
+                batch = []
+
+        # Check the last partial batch and see drop_last is set
+        if len(batch) > 0 and not self.drop_last:
+            start_idx, end_idx = self.get_start_end_idx_global_batch()
+            global_batch_idx = []
+            for i in range(self.num_micro_batches):
+                global_batch_idx.extend(batch[start_idx[i]:end_idx[i]])
+            yield global_batch_idx
 
 class RandomSeedDataset(Dataset):
     """
diff --git a/megatron/training/datasets/sft_dataset.py b/megatron/training/datasets/sft_dataset.py
index e4d8a6faf24..b313dafb0ec 100644
--- a/megatron/training/datasets/sft_dataset.py
+++ b/megatron/training/datasets/sft_dataset.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
+import atexit, json
+from collections import Counter
 from typing import Any, Dict, Optional
 
 import numpy as np
@@ -25,6 +27,9 @@ class SFTLowLevelDataset:
                 {"role": "user", "content": "something1"},
                 {"role": "assistant", "content": "something2"},
             ]
+            A jsonl line can contain multiple conversations packed together into on list. Each
+            conversation starts with the system role, and conversations can have multiple turns
+            of the user and assistant roles.
     """
 
     def __init__(self, dataset_path: str) -> None:
@@ -68,79 +73,124 @@ def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> LowL
     def __len__(self) -> int:
         return self.num_samples
 
+    def _split_conversations(self, merged_conversations):
+        split_conversations = []
+        current = []
+        for msg in merged_conversations:
+            # Whenever we see a new system message, start a new conversation
+            if msg["role"] == "system":
+                if current:  # If previously accumulating a conversation, then store it
+                    split_conversations.append(current)
+                current = [msg]  # Then start the new conversation
+            else:
+                current.append(msg) # Continue accumulating the current conversation
+        if current:  # Store any remaining conversation
+            split_conversations.append(current)
+        return split_conversations
+
     def __getitem__(self, idx: int) -> Dict[str, Any]:
 
         tokenizer = self.config.tokenizer
-        max_seq_len = self.config.sequence_length
-
-        conversation_list = self.dataset[int(self.indices[idx % len(self.indices)])]
-        tokens, target = tokenizer.tokenize_conversation(
-            conversation_list, return_target=True, add_generation_prompt=False
-        )
-
-        force_eod_length = int(tokenizer.force_eod)
-
-        if len(tokens) > max_seq_len - force_eod_length:
-            tokens = tokens[: max_seq_len - force_eod_length]
-            target = target[: max_seq_len - force_eod_length]
-
-        # padding
-        num_tokens = len(tokens) + force_eod_length
-        padding_len = max_seq_len - num_tokens
-        assert padding_len >= 0
-        filler = [tokenizer.eod] * force_eod_length + [tokenizer.pad] * (padding_len + 1)
-
-        tokens = np.array(tokens.tolist() + filler, dtype=np.int64)
-        target = np.array(target.tolist() + filler, dtype=np.int64)
-
-        tokens = torch.tensor(tokens)
-        target = torch.tensor(target)
-
-        tokens = tokens[:-1].contiguous()
-        target = target[1:].contiguous()
-
-        loss_mask, position_ids, attention_mask = self._get_ltor_masks_and_position_ids(
-            max_seq_len, target, tokenizer.pad
-        )
-
-        if self.config.create_attention_mask:
-            ret = {
-                'tokens': tokens,
-                'labels': target,
-                'attention_mask': attention_mask,
-                'loss_mask': loss_mask,
-                'position_ids': position_ids,
-            }
-        else:
-            ret = {
-                'tokens': tokens,
-                'labels': target,
-                'loss_mask': loss_mask,
-                'position_ids': position_ids,
-            }
-
-        return ret
-
-    def _get_ltor_masks_and_position_ids(self, max_seq_len, target, pad_token):
-        """Build masks and position id for left to right model for SFT"""
-
-        assert not self.config.reset_position_ids and not self.config.reset_attention_mask
+        pack_length = self.config.sequence_length
+
+        merged_conversations = self.dataset[int(self.indices[idx % len(self.indices)])]
+        split_conversations = self._split_conversations(merged_conversations)
+
+        def extend_with_padding(tokens, targets, positions, pad_len):
+            tokens.extend([pad] * pad_len)
+            targets.extend([pad] * pad_len)
+            positions.extend(range(positions[-1]+1, positions[-1]+1+pad_len))
+
+        pack_tokens = []
+        pack_targets = []
+        pack_positions = []
+        cu_seqlens = [0]
+        eod = tokenizer.eod
+        pad = tokenizer.pad
+        # TODO(duncan): Track number of convs dropped and/or truncated and amount of end-padding
+        for conversation in split_conversations:
+
+            tokens, targets = tokenizer.tokenize_conversation(
+                conversation, return_target=True, add_generation_prompt=False
+            )
 
-        # Position ids.
-        position_ids = torch.arange(max_seq_len, dtype=torch.long)
+            tokens_list = tokens.tolist()
+            targets_list = targets.tolist()
+
+            # Add EOD, unless it's already present
+            if tokens_list[-1] != eod:
+                tokens_list.append(eod)
+                targets_list.append(eod)
+
+            pack_tokens.extend(tokens_list)
+            pack_targets.extend(targets_list)
+
+            assert not self.config.reset_position_ids
+            pack_positions.extend(range(len(tokens_list)))
+
+            if self.config.context_parallel_size > 1:
+                pad_granularity = self.config.context_parallel_size * 2
+                mod_token_count = len(pack_tokens) % pad_granularity
+                if mod_token_count != 0:
+                    pad_len = pad_granularity - mod_token_count
+                    extend_with_padding(pack_tokens, pack_targets, pack_positions, pad_len)
+
+            # TODO(duncan): Consider also padding to multiple of number of tokens here. This might
+            # be needed for efficiency (and potentially set via command-line argument).
+
+            cu_seqlens.append(len(pack_tokens))
+
+            # Handle any necessary truncation
+            if len(pack_tokens) >= pack_length + 1:  # +1 here to account for later alignment
+                # Truncate on the right
+                max_body = pack_length
+                pack_tokens = pack_tokens[:max_body]
+                pack_targets = pack_targets[:max_body]
+                pack_tokens.extend(pad)
+                pack_targets.extend(pad)
+                pack_positions = pack_positions[:pack_length+1]
+                # Note len({pack_tokens, pack_targets, pack_positions}) should be pack_length + 1
+                cu_seqlens[-1] = len(pack_tokens) - 1
+                break
+
+        # Handle any necessary padding
+        if len(pack_tokens) < pack_length + 1:  # +1 here to account for later alignment
+            pad_len = pack_length + 1 - len(pack_tokens)
+            extend_with_padding(pack_tokens, pack_targets, pack_positions, pad_len)
+            # Note len({pack_tokens, pack_targets, pack_positions}) should be pack_length + 1
+            cu_seqlens[-1] = len(pack_tokens) - 1
+
+        assert len(pack_tokens) == pack_length + 1
+        assert len(pack_targets) == pack_length + 1
+        assert len(pack_positions) == pack_length + 1
+
+        # Align and convert to tensors
+        input_ids    = torch.tensor(pack_tokens[:-1],  dtype=torch.int64)
+        labels       = torch.tensor(pack_targets[1:], dtype=torch.int64)
+        position_ids = torch.tensor(pack_positions[:-1], dtype=torch.int64)
 
         # Loss mask.
-        loss_mask = torch.ones(max_seq_len, dtype=torch.float)
-        loss_mask[target == pad_token] = 0.0  # mask paddings
-        loss_mask[target == IGNORE_INDEX] = 0.0  # mask prompts
-
-        if self.config.create_attention_mask:
-            attention_mask = torch.tril(
-                torch.ones((seq_length, seq_length), device=data.device)
-            ).unsqueeze(0)
-            # Convert attention mask to binary:
-            attention_mask = attention_mask < 0.5
-        else:
-            attention_mask = None
-
-        return loss_mask, position_ids, attention_mask
+        loss_mask = torch.ones(pack_length, dtype=torch.float32)
+        loss_mask[labels == pad] = 0.0  # Mask paddings
+        loss_mask[labels == IGNORE_INDEX] = 0.0  # mask prompts
+
+        # TODO(duncan): Optionally create an attention mask
+        assert not self.config.create_attention_mask and not self.config.reset_attention_mask
+        # attention_mask = None
+
+        assert len(cu_seqlens) >= 2
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32)
+        # Calculating max_seqlen here, rather than incrementally above, because of possible
+        # effects of truncation and padding
+        adjacent_diffs = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = adjacent_diffs.max()  # max_seqlen is a 0-D tensor
+
+        return {
+            'tokens': input_ids,
+            'labels': labels,
+            # 'attention_mask': attention_mask,  # PyTorch collate cannot handle NoneType
+            'loss_mask': loss_mask,
+            'position_ids': position_ids,
+            'cu_seqlens': cu_seqlens,
+            'max_seqlen': max_seqlen,
+        }
diff --git a/megatron/training/dgrad_logging.py b/megatron/training/dgrad_logging.py
new file mode 100644
index 00000000000..c046b4709fb
--- /dev/null
+++ b/megatron/training/dgrad_logging.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+
+"""dgrad logging using backward hooks."""
+
+from collections import defaultdict
+import torch
+import torch.nn as nn
+
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+
+from .checkpointing import save_grads
+from .utils import unwrap_model
+
+
+def _get_linear_types():
+    """Build tuple of linear layer types to capture gradients from."""
+    types = [nn.Linear, nn.Embedding, ColumnParallelLinear, RowParallelLinear]
+
+    # Add Transformer Engine layers if available.
+    try:
+        from megatron.core.extensions.transformer_engine import (
+            TELinear,
+            TEColumnParallelLinear,
+            TERowParallelLinear,
+            TELayerNormColumnParallelLinear,
+        )
+        types.extend([TELinear, TEColumnParallelLinear, TERowParallelLinear,
+                      TELayerNormColumnParallelLinear])
+    except ImportError:
+        pass
+
+    try:
+        from megatron.core.extensions.transformer_engine import (
+            TEGroupedLinear,
+            TEColumnParallelGroupedLinear,
+            TERowParallelGroupedLinear,
+        )
+        if TEGroupedLinear is not None:
+            types.extend([TEGroupedLinear, TEColumnParallelGroupedLinear,
+                          TERowParallelGroupedLinear])
+    except ImportError:
+        pass
+
+    return tuple(types)
+
+
+LINEAR_TYPES = _get_linear_types()
+
+
+class DataGradLogger:
+    """Captures and saves gradients from all linear layers using backward hooks.
+    
+    NOTE: Right now, we only save the dgrads for the last microbatch in a batch on DP replica 0.
+    The code below would need to be extended to save dgrads for all microbatches in a batch."""
+
+    def __init__(self, save_dir: str):
+        self._save_dir = save_dir
+        self._dgrads_state_dict = defaultdict(dict)
+        self._hooks = []
+
+    def _make_hook(self, model_chunk_name: str, module_name: str):
+        """Create a backward hook for a named module."""
+        def hook(_, grad_input, grad_output):
+            for idx, grad in enumerate(grad_output):
+                if grad is not None:
+                    grad_name = f"{module_name}/output{idx}"
+                    self._dgrads_state_dict[model_chunk_name][grad_name] = grad.detach().cpu()
+            for idx, grad in enumerate(grad_input):
+                if grad is not None:
+                    grad_name = f"{module_name}/input{idx}"
+                    self._dgrads_state_dict[model_chunk_name][grad_name] = grad.detach().cpu()
+        return hook
+
+    def save(self, iteration: int):
+        """Save captured gradients to disk and clear the buffer."""
+        if not self._dgrads_state_dict:
+            return
+        save_grads(self._save_dir, self._dgrads_state_dict, iteration, "dgrads")
+        self._dgrads_state_dict.clear()
+
+    def register_hooks(self, model: torch.nn.Module):
+        """Find and register hooks on all linear layers."""
+        assert len(self._hooks) == 0
+        for model_chunk_id, model_chunk in enumerate(model):
+            unwrapped_model_chunk = unwrap_model(model_chunk)
+            for module_name, module in unwrapped_model_chunk.named_modules():
+                if isinstance(module, LINEAR_TYPES):
+                    model_chunk_name = f"model_chunk{model_chunk_id}"
+                    handle = module.register_full_backward_hook(
+                        self._make_hook(model_chunk_name, module_name)
+                    )
+                    self._hooks.append(handle)
+
+    def remove_hooks(self):
+        """Remove all registered hooks."""
+        for handle in self._hooks:
+            handle.remove()
+        self._hooks.clear()
+
+
+_LOGGER = None
+
+
+def enable_dgrad_logging(model: torch.nn.Module, save_dir: str):
+    """Enable dgrad logging on a model."""
+    global _LOGGER
+    if _LOGGER is None:
+        _LOGGER = DataGradLogger(save_dir)
+    _LOGGER.register_hooks(model)
+
+
+def disable_dgrad_logging():
+    """Disable dgrad logging on a model."""
+    global _LOGGER
+    assert _LOGGER is not None
+    _LOGGER.remove_hooks()
+
+
+def save_dgrads(iteration: int):
+    """Save dgrads to disk."""
+    global _LOGGER
+    assert _LOGGER is not None
+    _LOGGER.save(iteration)
diff --git a/megatron/training/dist_signal_handler.py b/megatron/training/dist_signal_handler.py
index f1f3725c8a9..0ecd706fdc7 100644
--- a/megatron/training/dist_signal_handler.py
+++ b/megatron/training/dist_signal_handler.py
@@ -3,13 +3,6 @@
 
 import torch
 
-SIGNAL_MAP = {
-    'SIGTERM': signal.SIGTERM,
-    'SIGINT': signal.SIGINT,
-    'SIGUSR1': signal.SIGUSR1,
-    'SIGUSR2': signal.SIGUSR2
-}
-
 def get_world_size():
     if torch.distributed.is_available() and torch.distributed.is_initialized():
         world_size = torch.distributed.get_world_size()
@@ -55,8 +48,8 @@ def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
 
 
 class DistributedSignalHandler:
-    def __init__(self, sig: str = 'SIGTERM'):
-        self.sig = SIGNAL_MAP.get(sig, signal.SIGTERM)
+    def __init__(self, sig: signal.Signals = signal.SIGTERM):
+        self.sig = sig
 
     def signals_received(self):
         all_received = all_gather_item(
diff --git a/megatron/training/ft_integration.py b/megatron/training/ft_integration.py
index e10e3da995c..f3532e75639 100644
--- a/megatron/training/ft_integration.py
+++ b/megatron/training/ft_integration.py
@@ -45,6 +45,7 @@
 
 import torch
 
+from . import arguments
 from . import global_vars
 from .utils import is_rank0, print_rank_0
 
@@ -59,7 +60,7 @@
 _seen_tr_iters_cnt = 0
 _curr_eval_iter_idx = 0
 
-_NUM_WARMUP_ITERS = 1
+_NUM_WARMUP_ITERS = 1  # Will be set by --ft-num-warmup-iters (default: 5)
 _MIN_ITERS_FOR_STEP_TIMEOUT_UPDATE = 16
 
 
@@ -72,25 +73,22 @@ def get_rank_monitor_client() -> Optional[Any]:
     return _GLOBAL_RANK_MONITOR_CLIENT
 
 
-def setup(args: argparse.Namespace) -> None:
-    """Initialize fault tolerance
-
-    Args:
-        args (argparse.Namespace): parsed Megatron-LM command line arguments
+def setup() -> None:
+    """Initialize fault tolerance before initialize_megatron"""
+    args = arguments.parse_args(ignore_unknown_args=True)
+    if not args.enable_ft_package:
+        return
 
-    Raises:
-        ValueError: if invalid config is provided
-    """
+    # Initialize fault tolerance
     from nvidia_resiliency_ext.fault_tolerance import RankMonitorClient
 
-    print_rank_0(f"FT: initializing...")
+    if os.environ.get("RANK") == "0":
+        print("FT: initializing...", flush=True)
 
     checkpoint_dir = args.save
     if not checkpoint_dir:
         raise ValueError("checkpointing save dir must be set to enable fault tolerance")
-    if is_rank0() and not os.path.exists(checkpoint_dir):
-        # MLM checkpoint dir will be needed for saving FT state.
-        # it can happen before the checkpointing, so create it in advance
+    if not os.path.exists(checkpoint_dir):
         os.makedirs(checkpoint_dir, exist_ok=True)
 
     cli = RankMonitorClient()
@@ -107,9 +105,13 @@ def setup(args: argparse.Namespace) -> None:
     global _is_calculating_timeouts
     _is_calculating_timeouts = args.calc_ft_timeouts
 
-    cli.init_workload_monitoring()
+    global _NUM_WARMUP_ITERS
+    _NUM_WARMUP_ITERS = args.ft_num_warmup_iters
+
+    cli.init_workload_monitoring(num_warmup_iters=_NUM_WARMUP_ITERS)
     _load_state_if_exists()
-    print_rank_0(f"FT: initialized. Timeouts={cli.section_timeouts}")
+    if os.environ.get("RANK") == "0":
+        print(f"FT: initialized. Timeouts={cli.section_timeouts}", flush=True)
 
     cli.start_section("setup")
     global _is_setup_section_open
diff --git a/megatron/training/global_vars.py b/megatron/training/global_vars.py
index 62dbf701c1f..76e8df7cee3 100644
--- a/megatron/training/global_vars.py
+++ b/megatron/training/global_vars.py
@@ -9,6 +9,7 @@
 from megatron.core import Timers
 from megatron.core.config import set_experimental_flag
 from megatron.core.energy_monitor import EnergyMonitor
+from megatron.core.jit import disable_jit_fuser
 from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator, unset_num_microbatches_calculator
 from megatron.training.dist_signal_handler import DistributedSignalHandler
 from megatron.training.tokenizer import build_tokenizer
@@ -112,6 +113,9 @@ def set_global_variables(args, build_tokenizer=True):
     if args.exit_signal_handler:
         _set_signal_handler(args.exit_signal)
 
+    if args.disable_jit_fuser:
+        disable_jit_fuser()
+
 
 def unset_global_variables():
     """Unset global vars.
@@ -245,13 +249,13 @@ def _set_adlr_autoresume(args):
     _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
 
     if args.adlr_autoresume:
-        if args.rank == 0:
-            print('enabling autoresume ...', flush=True)
+        from megatron.training.utils import print_rank_0
+        print_rank_0('enabling autoresume ...')
         sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
         try:
             from userlib.auto_resume import AutoResume
         except ImportError:
-            print('ADLR autoresume is not available, exiting ...')
+            print_rank_0('ADLR autoresume is not available, exiting ...')
             sys.exit()
 
         _GLOBAL_ADLR_AUTORESUME = AutoResume
diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py
index 96594b5194d..c150ac3d5ca 100644
--- a/megatron/training/initialize.py
+++ b/megatron/training/initialize.py
@@ -22,14 +22,17 @@
     RerunMode,
     initialize_rerun_state_machine,
 )
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import enable_batch_invariant_mode
 from megatron.core.utils import get_te_version, is_te_min_version, is_torch_min_version
 from megatron.legacy import fused_kernels
 from megatron.training import get_adlr_autoresume, get_args, get_tensorboard_writer
+from megatron.training.utils import print_rank_0, warn_rank_0
 from megatron.training import inprocess_restart
 from megatron.training.arguments import parse_args, validate_args
 from megatron.training.async_utils import init_persistent_async_worker
 from megatron.training.checkpointing import load_args_from_checkpoint
 from megatron.training.global_vars import set_global_variables
+from megatron.training.utils import is_rank0
 from megatron.training.yaml_arguments import validate_yaml
 
 logger = logging.getLogger(__name__)
@@ -71,12 +74,13 @@ def initialize_megatron(
         args.exit_on_missing_checkpoint = True
 
     if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
-        assert args.load is not None, "--use-checkpoint-args requires --load argument"
+        assert args.load is not None or args.pretrained_checkpoint is not None, "--use-checkpoint-args requires --load or --pretrained-checkpoint argument"
         assert args.non_persistent_ckpt_type != "local", (
             "--use-checkpoint-args is not supported with --non_persistent_ckpt_type=local. "
             "Two-stage checkpoint loading is not implemented, and all arguments must be defined "
             "before initializing LocalCheckpointManager."
         )
+        load_args_from_checkpoint(args, load_arg='pretrained_checkpoint')
         load_args_from_checkpoint(args)
 
     if args.async_save and args.use_persistent_ckpt_worker:
@@ -113,6 +117,10 @@ def state_restore_func(state_dict):
         ),
         result_rejected_tracker_filename=args.result_rejected_tracker_filename,
     )
+    
+    if args.batch_invariant_mode:
+        print_rank_0("Enabling batch invariant mode globally")
+        enable_batch_invariant_mode()
 
     # torch.distributed initialization
     def finish_mpu_init():
@@ -121,8 +129,7 @@ def finish_mpu_init():
         _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, store)
 
         # Random seeds for reproducibility.
-        if args.rank == 0:
-            print("> setting random seeds to {} ...".format(args.seed))
+        print_rank_0("> setting random seeds to {} ...".format(args.seed))
         _set_random_seed(
             args.seed,
             args.data_parallel_random_init,
@@ -205,13 +212,10 @@ def _compile_dependencies():
     )
     # Print a warning.
     if not ((args.fp16 or args.bf16) and custom_kernel_constraint and args.masked_softmax_fusion):
-        if args.rank == 0:
-            print(
-                "WARNING: constraints for invoking optimized"
-                " fused softmax kernel are not met. We default"
-                " back to unfused kernel invocations.",
-                flush=True,
-            )
+        warn_rank_0(
+            "Constraints for invoking optimized fused softmax kernel are not met. "
+            "We default back to unfused kernel invocations."
+        )
 
     # Always build on rank zero first.
     if torch.distributed.get_rank() == 0:
@@ -315,18 +319,13 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s
     device_count = torch.cuda.device_count()
     if torch.distributed.is_initialized():
 
-        if args.rank == 0:
-            print(
-                "torch distributed is already initialized, " "skipping initialization ...",
-                flush=True,
-            )
+        print_rank_0("torch distributed is already initialized, skipping initialization ...")
         args.rank = torch.distributed.get_rank()
         args.world_size = torch.distributed.get_world_size()
 
     else:
 
-        if args.rank == 0:
-            print("> initializing torch distributed ...", flush=True)
+        print_rank_0("> initializing torch distributed ...")
         # Manually set the device ids.
         if device_count > 0:
             torch.cuda.set_device(args.local_rank)
@@ -370,6 +369,7 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s
                 use_sharp=args.use_sharp,
                 context_parallel_size=args.context_parallel_size,
                 hierarchical_context_parallel_sizes=args.hierarchical_context_parallel_sizes,
+                hybrid_context_parallel=args.hybrid_context_parallel,
                 expert_model_parallel_size=args.expert_model_parallel_size,
                 num_distributed_optimizer_instances=args.num_distributed_optimizer_instances,
                 expert_tensor_parallel_size=args.expert_tensor_parallel_size,
@@ -381,16 +381,16 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s
                 create_gloo_process_groups=args.enable_gloo_process_groups,
                 high_priority_stream_groups=args.high_priority_stream_groups,
                 sharp_enabled_group=args.sharp_enabled_group,
+                create_all_gather_group=args.create_all_gather_group,
+            )
+            print_rank_0(
+                f"> initialized tensor model parallel with size "
+                f"{mpu.get_tensor_model_parallel_world_size()}"
+            )
+            print_rank_0(
+                f"> initialized pipeline model parallel with size "
+                f"{mpu.get_pipeline_model_parallel_world_size()}"
             )
-            if args.rank == 0:
-                print(
-                    f"> initialized tensor model parallel with size "
-                    f"{mpu.get_tensor_model_parallel_world_size()}"
-                )
-                print(
-                    f"> initialized pipeline model parallel with size "
-                    f"{mpu.get_pipeline_model_parallel_world_size()}"
-                )
 
 
 def _init_autoresume():
@@ -542,5 +542,6 @@ def setup_logging() -> None:
         logging_level = args.logging_level
 
     if logging_level is not None:
-        logger.info(f'Setting logging level to {logging_level}')
+        if is_rank0():
+            logger.info(f'Setting logging level to {logging_level}')
         logging.getLogger().setLevel(logging_level)
diff --git a/megatron/training/one_logger_utils.py b/megatron/training/one_logger_utils.py
index ea41ba18af0..fd18c569d1d 100644
--- a/megatron/training/one_logger_utils.py
+++ b/megatron/training/one_logger_utils.py
@@ -297,8 +297,7 @@ def on_pretrain_start():
                 'one_logger_utils_version': _one_logger_utils_version,
             })
 
-def track_config_flags(train_iters, skip_train, do_train, do_valid, do_test,
-                           dataloader_type, retro_project_dir, retro_cyclic_train_iters):
+def track_config_flags(train_iters, skip_train, do_train, do_valid, do_test, dataloader_type):
     """Track flags about train/validation/test enablement
 
     Args:
@@ -308,16 +307,10 @@ def track_config_flags(train_iters, skip_train, do_train, do_valid, do_test,
         do_valid (bool): flags to do validation
         do_test (bool): flags to do test
         dataloader_type (str): dataloader type
-        retro_project_dir (str): Retro project directory
-        retro_cyclic_train_iters (int): iteration number for cyclic retro training
     """
     one_logger = get_one_logger()
     if one_logger:
         with one_logger.get_context_manager():
-            # Update train_iters for cyclic loader
-            if dataloader_type == 'cyclic' and retro_project_dir:
-                assert retro_cyclic_train_iters is not None
-                train_iters = retro_cyclic_train_iters
             # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built.
             train_enabled = train_iters and (not skip_train) and do_train and train_iters > 0
             one_logger.log_metrics({
diff --git a/megatron/training/resilience_config.py b/megatron/training/resilience_config.py
new file mode 100644
index 00000000000..dd0bd716521
--- /dev/null
+++ b/megatron/training/resilience_config.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+from dataclasses import dataclass
+from typing import Literal
+
+@dataclass(kw_only=True)
+class RerunStateMachineConfig:
+    """Configuration for the rerun state machine used for result validation or stats."""
+
+    error_injection_rate: int = 0
+    """Rate at which to inject unexpected results, e.g. 1000 means
+    once every 1000 result validations"""
+
+    error_injection_type: Literal["correct_result", "transient_error", "persistent_error"] = "transient_error"
+    """Type of error to inject. """
+
+    rerun_mode: Literal["disabled", "validate_results", "report_stats"] = "validate_results"
+    """Use re-run engine to validate results (default) or to emit stats
+    on variability of computations due to non-deterministic algorithms."""
+
+    check_for_nan_in_loss: bool = True
+    """Check for NaN in the loss."""
+
+    check_for_spiky_loss: bool = False
+    """Check for spiky loss."""
+
+
+@dataclass(kw_only=True)
+class StragglerDetectionConfig:
+    """Configuration settings for detecting and logging GPU stragglers."""
+
+    log_straggler: bool = False
+    """If set, tracks and logs straggler per GPU."""
+
+    straggler_ctrlr_port: int = 65535
+    """Port number to toggle StragglerDetector on/off at runtime"""
+
+    straggler_minmax_count: int = 1
+    """Number of ranks to report with high/low estimated throughput"""
+
+    disable_straggler_on_startup: bool = False
+    """If set, StragglerDetector is disabled on startup."""
+
diff --git a/megatron/training/theoretical_memory_usage.py b/megatron/training/theoretical_memory_usage.py
index 8737015dfa4..7d4043b62d7 100644
--- a/megatron/training/theoretical_memory_usage.py
+++ b/megatron/training/theoretical_memory_usage.py
@@ -56,20 +56,23 @@ def compute_weight_and_optimizer_memory(args, verbose=False):
         mtp_num_moe_layers = 0
         mtp_num_dense_layers = 0
 
+    # RMSNorm does not have bias, but LayerNorm has.
+    norm_size = 1 if args.normalization == "RMSNorm" else 2
+
     if args.multi_latent_attention:
         assert not args.group_query_attention
         if args.q_lora_rank is None:
             q_term = args.hidden_size * args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim)
         else:
             ## q lora + rope + q norm
-            q_term = args.q_lora_rank * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) + 1) 
+            q_term = args.q_lora_rank * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) + norm_size) 
         
         self_attn_term = (
             q_term
 
             ## kv lora + rope + kv norm
             + args.kv_lora_rank
-            * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim) + 1)
+            * (args.hidden_size + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim) + norm_size)
             + args.hidden_size * args.qk_pos_emb_head_dim
 
             ## o proj
@@ -96,7 +99,7 @@ def compute_weight_and_optimizer_memory(args, verbose=False):
             # Dense MoE MLP.
             (args.ffn_hidden_size * gated_linear_multiplier)
             # Transformer layernorms.
-            + (2)
+            + norm_size
         )
         + self_attn_term
     )
@@ -109,12 +112,12 @@ def compute_weight_and_optimizer_memory(args, verbose=False):
             # Shared MoE MLP.
             + (shared_expert_ffn_hidden_size * gated_linear_multiplier)
             # Transformer layernorms.
-            + (2)
+            + norm_size
         )
         + self_attn_term
     )
     embedding_size = args.hidden_size * args.padded_vocab_size
-    final_layernorm = 2 * args.hidden_size
+    final_layernorm = norm_size * args.hidden_size
     if args.untie_embeddings_and_output_weights:
         num_parameters_in_embedding_layers = 2 * embedding_size
     else:
diff --git a/megatron/training/tokenizer/sft_tokenizer.py b/megatron/training/tokenizer/sft_tokenizer.py
index f525352e892..274c6f6c944 100644
--- a/megatron/training/tokenizer/sft_tokenizer.py
+++ b/megatron/training/tokenizer/sft_tokenizer.py
@@ -160,11 +160,6 @@ def get_special_tokens(self):
         """Get special tokens."""
         return self._tokenizer.get_added_vocab()
 
-    @property
-    def force_eod(self):
-        """To force an EOD at the end of every data sample in SFT."""
-        return self._prompt_format == "nemotron-h-aligned"
-
     @property
     def pad(self):
         """Pad token ID."""
diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py
index b1aad6819b4..33340a5e978 100644
--- a/megatron/training/tokenizer/tokenizer.py
+++ b/megatron/training/tokenizer/tokenizer.py
@@ -20,8 +20,8 @@
 
 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""
-    if args.rank == 0:
-        print('> building {} tokenizer ...'.format(args.tokenizer_type), flush=True)
+    from megatron.training.utils import print_rank_0
+    print_rank_0('> building {} tokenizer ...'.format(args.tokenizer_type))
 
     # Select and instantiate the tokenizer.
     if args.tokenizer_type == 'BertWordPieceLowerCase':
@@ -63,7 +63,7 @@ def build_tokenizer(args, **kwargs):
             pattern=pattern,
             vocab_size=args.vocab_size,
             num_special_tokens=args.tiktoken_num_special_tokens,
-            special_tokens=args.tiktoken_special_tokens,
+            special_tokens=args.tokenizer_special_tokens,
         )
     elif args.tokenizer_type == 'NullTokenizer':
         assert args.vocab_size is not None
diff --git a/megatron/training/training.py b/megatron/training/training.py
index 429bfb1d899..d61b9357a37 100644
--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -1,7 +1,37 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 """Pretrain utilities."""
+import time
+# The earliest we can measure the start time.
+_TRAIN_START_TIME = time.time()
+
+# Startup timestamps for tracking program initialization phases
+_STARTUP_TIMESTAMPS = {
+    'program_start': None,  # Set by entry script before imports
+    'main_entry': None,     # Set by entry script at start of __main__
+    'pretrain_entry': None, # Set at top of pretrain()
+}
+
+
+def set_startup_timestamps(program_start=None, main_entry=None):
+    """Set startup timestamps from the entry script.
+
+    Call this after imports but before calling pretrain() to register
+    the program start time and main entry time.
+
+    Args:
+        program_start: Timestamp captured at very start of program, before any imports.
+        main_entry: Timestamp captured right after entering __main__ block.
+    """
+    global _TRAIN_START_TIME, _STARTUP_TIMESTAMPS
+    if program_start is not None:
+        _TRAIN_START_TIME = program_start
+        _STARTUP_TIMESTAMPS['program_start'] = program_start
+    if main_entry is not None:
+        _STARTUP_TIMESTAMPS['main_entry'] = main_entry
 
+
+from collections import defaultdict
 import copy
 import dataclasses
 from datetime import datetime, timedelta
@@ -12,20 +42,21 @@
 import math
 import os
 import sys
-from typing import Any, Optional
+from contextlib import nullcontext
+from typing import Any, Optional, Dict
 
 import torch.distributed
 
 from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer
+from megatron.core.optimizer_param_scheduler import get_canonical_lr_for_logging
 from .log_handler import CustomHandler
 
 # Make default logging level INFO, but filter out all log messages not from MCore.
 logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO)
 from .theoretical_memory_usage import report_theoretical_memory
-import time
 
-# The earliest we can measure the start time.
-_TRAIN_START_TIME = time.time()
+_LEGACY_TRAIN_START_TIME = time.time() # NOTE(asolergi-nv): Legacy timestamp
+
 import torch
 
 try:
@@ -33,6 +64,7 @@
     has_rl_utils = True
 except ImportError:
     has_rl_utils = False
+from megatron.rl.parallel_utils import build_inference_pg_collection
 try:
     from modelopt.torch.distill.plugins.megatron import (
         get_tensor_shapes_adjust_fn_for_distillation,
@@ -49,8 +81,12 @@
 
 
 from megatron.core import mpu, tensor_parallel
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    is_linear_attention_variant,
+)
 from megatron.core.utils import (
     check_param_hashes_across_dp_replicas,
+    get_attr_wrapped_model,
     get_model_config,
     get_pg_size,
     get_pg_rank,
@@ -64,17 +100,22 @@
     is_vp_first_stage,
     is_vp_last_stage,
 )
+from megatron.core.optimizer import get_standard_config_overrides
 from megatron.training.checkpointing import load_checkpoint
-from megatron.training.checkpointing import save_checkpoint
+from megatron.training.checkpointing import save_checkpoint, save_grads
 from megatron.training.checkpointing import checkpoint_exists
+from megatron.training.checkpointing import get_loaded_iteration
 from megatron.core.full_cuda_graph import FullCudaGraphWrapper
 from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.module import Float16Module
 from megatron.core.distributed import DistributedDataParallelConfig, TorchFullyShardedDataParallelConfig
 from megatron.core.distributed import DistributedDataParallel as DDP
 from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel as megatron_FSDP
 from megatron.core.optimizer.optimizer import param_group_identifier_keys
 
+from megatron.core.optimizer.qk_clip import clip_qk
+
 try:
     from megatron.core.distributed import TorchFullyShardedDataParallel as torch_FSDP
 
@@ -85,6 +126,7 @@
 from megatron.core.distributed import finalize_model_grads
 from megatron.core.enums import ModelType
 from megatron.core.optimizer import get_megatron_optimizer, AdamOptimizerConfig, SGDOptimizerConfig, OptimizerConfig, ParamKey
+from megatron.core.optimizer.muon import get_megatron_muon_optimizer
 from megatron.core.rerun_state_machine import (
     get_rerun_state_machine,
     destroy_rerun_state_machine,
@@ -96,15 +138,20 @@
 from megatron.training.initialize import set_jit_fusion_options
 from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank
 from megatron.training.datasets.data_samplers import build_pretraining_data_loader
+from megatron.core.datasets.data_schedule import HybridCPDataLoaderWrapper
 from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
 from megatron.core.transformer.moe import upcycling_utils
-from megatron.core.transformer.moe.moe_utils import track_moe_metrics
+from megatron.core.transformer.moe.moe_utils import track_moe_metrics, clear_aux_losses_tracker
+from megatron.core.transformer.experimental_attention_variant.dsa import DSAIndexerLossLoggingHelper
 from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper
 from megatron.core.parallel_state import (
     destroy_global_memory_buffer,
+    destroy_global_symmetric_memory_buffer,
     destroy_model_parallel,
     update_pg_timeout
 )
+from megatron.core.inference.unified_memory import create_unified_mempool
+from megatron.core.resharding.refit import swap_model_weights
 
 from megatron.core.pipeline_parallel import get_forward_backward_func
 from megatron.core.num_microbatches_calculator import (
@@ -142,6 +189,7 @@
     get_energy_monitor,
 )
 from . import one_logger_utils
+from .dgrad_logging import enable_dgrad_logging, disable_dgrad_logging, save_dgrads
 
 from . import ft_integration
 
@@ -154,17 +202,21 @@ def destroy_global_state():
     destroy_global_vars()
     destroy_num_microbatches_calculator()
     destroy_global_memory_buffer()
+    destroy_global_symmetric_memory_buffer()
     destroy_model_parallel()
     destroy_rerun_state_machine()
 
 
-def print_datetime(string):
-    """Note that this call will sync across all ranks."""
+def print_datetime(string, override_timestamp=None):
+    """Note that this call will sync across all ranks. Use override_timestamp if provided;
+       otherwise use current timestamp."""
     torch.distributed.barrier()
-    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    if override_timestamp is None:
+        time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+    else:
+        time_str = datetime.fromtimestamp(override_timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')
     print_rank_0(f'[{string}] datetime: {time_str} ')
 
-
 def num_floating_point_operations(args, batch_size):
     def calculate_layer_counts():
         """Calculate the number of attention, Mamba, and MLP layers."""
@@ -187,11 +239,19 @@ def mlp_layer_flops(batch_size, seq_len, hidden_size, expansion=4.0, swiglu=Fals
         return 4 * expansion * scale_factor * batch_size * seq_len * hidden_size**2
 
     def moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size,
-                        shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu=False):
+                        shared_expert_ffn_hidden_size, num_experts_routed_to,
+                        moe_latent_size=None, swiglu=False):
         """Calculate FLOPs for an MoE layer."""
         scale_factor = 3.0 / 2.0 if swiglu else 1.0
-        routed_flops = (4 * batch_size * seq_len * hidden_size *
-                        moe_ffn_hidden_size * num_experts_routed_to * scale_factor)
+        if moe_latent_size is None:
+            routed_flops = (4 * batch_size * seq_len * hidden_size *
+                            moe_ffn_hidden_size * num_experts_routed_to * scale_factor)
+        else:
+            # Routed experts run on moe_latent_size.
+            routed_flops = (4 * batch_size * seq_len * moe_latent_size *
+                            moe_ffn_hidden_size * num_experts_routed_to * scale_factor)
+            # Up proj and down proj.
+            routed_flops += (4 * batch_size * seq_len * hidden_size * moe_latent_size)
         shared_flops = 4 * batch_size * seq_len * hidden_size * shared_expert_ffn_hidden_size * scale_factor
         return routed_flops + shared_flops
 
@@ -239,6 +299,7 @@ def hybrid_flops(batch_size, seq_len, hidden_size,
                      num_attn_heads=32, gqa=True,
                      gqa_groups=8, kv_channels=None,
                      mlp_expansion=4.0, swiglu=False,
+                     moe_latent_size=None,
                      moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1,
                      vocab_size=256000):
         """Calculate total FLOPs for the hybrid model."""
@@ -251,7 +312,8 @@ def hybrid_flops(batch_size, seq_len, hidden_size,
                                                      mamba_state_dim, mamba_head_dim,
                                                      mamba_num_groups, mamba_num_heads) +
                 num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size,
-                                                 shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu) +
+                                                 shared_expert_ffn_hidden_size, num_experts_routed_to,
+                                                 moe_latent_size, swiglu) +
                 (2 * batch_size * seq_len * hidden_size * vocab_size)  # logits computation
         )
         return flops_fwd * 3
@@ -259,9 +321,6 @@ def hybrid_flops(batch_size, seq_len, hidden_size,
     def transformer_flops():
         """Calculate FLOPs for a standard Transformer model."""
         # TODO(helenn/dnarayanan): Refactor this to reuse the helper methods.
-        # Attention projection size.
-        query_projection_size = args.kv_channels * args.num_attention_heads
-        query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size
         # Group Query Attention.
         if not args.group_query_attention:
             args.num_query_groups = args.num_attention_heads
@@ -311,18 +370,15 @@ def transformer_flops():
             if args.moe_shared_expert_intermediate_size is None
             else args.moe_shared_expert_intermediate_size
         )
-        # SwiGLU.
-        gated_linear_multiplier = 3 / 2 if args.swiglu else 1
 
-        # The 12x term below comes from the following factors; for more details, see
-        # "APPENDIX: FLOATING-POINT OPERATIONS" in https://arxiv.org/abs/2104.04473.
         # - 3x: Each GEMM in the model needs to be performed 3 times (forward pass,
         #       backward wgrad [weight gradient], backward dgrad [data gradient]).
-        # - 2x: GEMMs of a particular size are stacked twice in the standard Transformer model
-        #       architectures implemented in this codebase (e.g., h->ffn_h GEMM and ffn_h->h GEMM
-        #       in MLP layer).
+        forward_backward_expansion_factor = 3
         # - 2x: A GEMM of a m*n tensor with a n*k tensor requires 2mnk floating-point operations.
-        expansion_factor = 3 * 2 * 2
+        fma_expansion_factor = 2
+        # - 3x (SwiGLU enabled): h->2*ffn_h GEMM and ffn_h->h GEMM are stacked.
+        # - 2x (SwiGLU disabled): h->ffn_h GEMM and ffn_h->h GEMM are stacked.
+        ffn_expansion_factor = 3 if args.swiglu else 2
 
         if args.multi_latent_attention:
             assert not args.group_query_attention
@@ -352,10 +408,9 @@ def transformer_flops():
                     + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim)
                     + 1
                 )
-            self_attn_term = (
-                3
-                * 2  # fwd(1) + bwd(2) *FMA
-                * num_layers
+            standard_self_attn_term = (
+                forward_backward_expansion_factor
+                * fma_expansion_factor
                 * (
                     ## q lora + rope + q norm
                     q_term
@@ -372,53 +427,136 @@ def transformer_flops():
                     ## core attn
                     + args.seq_length
                     * (args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim))
-                    / 2
+                    / 2  # causal mask (only half of the mask is non-zero)
                     + args.seq_length * args.num_attention_heads * args.v_head_dim / 2
                 )
             )
 
         else:
             ## MHA or GQA
-            self_attn_term = (
-                expansion_factor
-                * num_layers
-                * args.hidden_size
-                * args.hidden_size
+            query_projection_size = args.kv_channels * args.num_attention_heads
+            key_projection_size = args.kv_channels * args.num_query_groups
+            value_projection_size = args.kv_channels * args.num_query_groups
+            gate_projection_size = query_projection_size if args.attention_output_gate else 0
+            standard_self_attn_term = (
+                forward_backward_expansion_factor
+                * fma_expansion_factor
                 * (
-                    (
-                        1
-                        + (args.num_query_groups / args.num_attention_heads)
-                        # # Only half of the attention matrix is non-zero and needs to be multiplied with V.
-                        + (args.seq_length / args.hidden_size / 2)
+                    ## qkv proj
+                    args.hidden_size
+                    * (
+                        query_projection_size
+                        + key_projection_size
+                        + value_projection_size
+                        + gate_projection_size
                     )
-                    * query_projection_to_hidden_size_ratio
+                    ## core attention
+                    + query_projection_size
+                    * args.seq_length
+                    / 2  # causal mask (only half of the mask is non-zero)
+                    * 2  # QK^T and (QK^T)V
+                    ## out proj
+                    + query_projection_size
+                    * args.hidden_size
                 )
             )
 
+        if is_linear_attention_variant(args.experimental_attention_variant):
+            # Calculate number of dense and MoE Transformer MLPs.
+            if isinstance(args.linear_attention_freq, int):
+                linear_attention_pattern = [
+                    # [1,1,...,1,0,1,1,...,1,0,...]
+                    0 if ((i + 1) % args.linear_attention_freq == 0)
+                    else 1 for i in range(num_layers)
+                ]
+            elif isinstance(args.linear_attention_freq, list):
+                linear_attention_pattern = args.linear_attention_freq
+                assert len(linear_attention_pattern) == num_layers, (
+                    f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, "
+                    f"expected {num_layers}, "
+                    f"current linear attention pattern: {args.linear_attention_freq}"
+                )
+            elif args.linear_attention_freq is None:
+                # This should be caught by config validation, but raise here as a safety check
+                raise ValueError(
+                    f"Linear attention type {args.experimental_attention_variant} is specified "
+                    "but linear_attention_freq is None. "
+                    "Please set linear_attention_freq to specify the LA/SDPA layer pattern."
+                )
+            else:
+                raise ValueError(
+                    f"Invalid linear_attention_freq: {type(args.linear_attention_freq)},"
+                    f" {args.linear_attention_freq}"
+                )
+            num_linear_attention_layers = sum(linear_attention_pattern)
+            num_standard_attention_layers = num_layers - num_linear_attention_layers
+
+            if args.experimental_attention_variant == "gated_delta_net":
+                # Calculate the FLOPs for the gated delta net attention.
+                qk_head_dim = args.linear_key_head_dim
+                v_head_dim = args.linear_value_head_dim
+                num_qk_heads = args.linear_num_key_heads
+                num_v_heads = args.linear_num_value_heads
+                qk_dim = qk_head_dim * num_qk_heads
+                v_dim = v_head_dim * num_v_heads
+                linear_self_attn_term = (
+                    forward_backward_expansion_factor
+                    * fma_expansion_factor
+                    * (
+                        ## in proj
+                        args.hidden_size
+                        * (2 * qk_dim + 2 * v_dim + 2 * num_v_heads)
+                        ## conv1d
+                        + args.linear_conv_kernel_dim
+                        * (2 * qk_dim + v_dim)
+                        ## gated delta rule
+                        + num_v_heads
+                        * (v_head_dim ** 2)
+                        * 4  # KK^T, VK^T, S(a(I-bKK^T)), and SQ
+                        ## out proj
+                        + args.hidden_size
+                        * v_dim
+                    )
+                )
+            else:
+                raise ValueError(
+                    "Invalid experimental_attention_variant: "
+                    f"{args.experimental_attention_variant}"
+                )
+        else:
+            num_linear_attention_layers = 0
+            linear_self_attn_term = 0
+            num_standard_attention_layers = num_layers
+
+        self_attn_term = (
+            linear_self_attn_term * num_linear_attention_layers
+            + standard_self_attn_term * num_standard_attention_layers
+        )
+
         total_floating_point_operations = (
             batch_size
             * args.seq_length
             * (
                 # MLP
-                expansion_factor
-                * num_layers
+                forward_backward_expansion_factor
+                * fma_expansion_factor
                 * args.hidden_size
                 * (
                     # dense layer (deepseek v2, v3 style)
-                    (args.ffn_hidden_size * gated_linear_multiplier)
-                    * (num_dense_layers / num_layers)
+                    (args.ffn_hidden_size * ffn_expansion_factor)
+                    * num_dense_layers
                     # routed experts
-                    + (moe_ffn_hidden_size * num_experts_routed_to * gated_linear_multiplier)
-                    * (num_moe_layers / num_layers)
+                    + (moe_ffn_hidden_size * num_experts_routed_to * ffn_expansion_factor)
+                    * num_moe_layers
                     # Shared Experts.
-                    + (shared_expert_ffn_hidden_size * gated_linear_multiplier)
-                    * (num_moe_layers / num_layers)
+                    + (shared_expert_ffn_hidden_size * ffn_expansion_factor)
+                    * num_moe_layers
                 )
                 # Self Attention
                 + self_attn_term
                 # MTP norms and proj
-                + 3
-                * 2
+                + forward_backward_expansion_factor
+                * fma_expansion_factor
                 * mtp_num_layers
                 * (
                     # MTP eh norm + final nrom
@@ -427,7 +565,11 @@ def transformer_flops():
                     + 2 * args.hidden_size * args.hidden_size
                 )
                 # Logit.
-                + 3 * 2 * args.hidden_size * args.padded_vocab_size * (mtp_num_layers + 1)
+                + forward_backward_expansion_factor
+                * fma_expansion_factor
+                * args.hidden_size
+                * args.padded_vocab_size
+                * (mtp_num_layers + 1)  # MTP + final logit
             )
         )
         return total_floating_point_operations
@@ -456,6 +598,7 @@ def transformer_flops():
             kv_channels=args.kv_channels,
             mlp_expansion=args.ffn_hidden_size / args.hidden_size,
             swiglu=args.swiglu,
+            moe_latent_size=args.moe_latent_size,
             moe_ffn_hidden_size=(args.moe_ffn_hidden_size if args.moe_ffn_hidden_size is not None
                                  else args.ffn_hidden_size),
             shared_expert_ffn_hidden_size=(0 if args.moe_shared_expert_intermediate_size is None
@@ -606,11 +749,21 @@ def pretrain(
         inprocess_call_wrapper: an optional instance of inprocess.CallWrapper,
             it is automatically injected when in-process restart is in use
     """
+    # Capture timestamp right at top of pretrain, before initialize_megatron
+    global _STARTUP_TIMESTAMPS
+    _STARTUP_TIMESTAMPS['pretrain_entry'] = time.time()
 
     if inprocess_call_wrapper is not None:
         iteration = inprocess_call_wrapper.iteration
         store = torch.distributed.PrefixStore(str(iteration), store)
 
+    timestamp_after_inprocess_setup = time.time()
+
+    # Early fault tolerance setup - must be done before initialize_megatron
+    # to enable monitoring of the initialization process
+    ft_integration.setup()
+    timestamp_after_in_job_setup = time.time()
+
     # Initalize and get arguments, timers, and Tensorboard writer.
     initialize_megatron(
         extra_args_provider=extra_args_provider,
@@ -620,36 +773,91 @@ def pretrain(
         store=store,
     )
 
+    timestamp_after_initialize_megatron = time.time()
+
     args = get_args()
     timers = get_timers()
 
+    if args.fine_grained_activation_offloading:
+        from megatron.core.pipeline_parallel.utils import (
+            set_ideal_affinity_for_current_gpu
+        )
+        set_ideal_affinity_for_current_gpu()
+
+
     if args.log_progress:
         append_to_progress_log("Starting job")
 
-    # Initialize fault tolerance
-    # NOTE: ft_integration functions other than `setup` are no-op if the FT is not initialized
-    if args.enable_ft_package:
-        ft_integration.setup(args)
-        ft_integration.maybe_setup_simulated_fault()
-
     # Set pytorch JIT layer fusion options and warmup JIT functions.
     set_jit_fusion_options()
 
-    # Adjust the startup time so it reflects the largest value.
+    timestamp_after_set_jit_fusion_options = time.time()
+
+    # Adjust the startup time so it reflects the global minimum.
     # This will be closer to what scheduler will see (outside of
-    # image ... launches.
-    global _TRAIN_START_TIME
-    start_time_tensor = torch.tensor([_TRAIN_START_TIME], dtype=torch.double, device='cuda')
+    # image ... launches).
+    program_start = _STARTUP_TIMESTAMPS.get('program_start')
+    main_entry = _STARTUP_TIMESTAMPS.get('main_entry')
+    pretrain_entry = _STARTUP_TIMESTAMPS.get('pretrain_entry')
+
+    # Initialize program_start_global with a fallback value in case set_startup_timestamps() wasn't called
+    program_start_global = _TRAIN_START_TIME
+    if _STARTUP_TIMESTAMPS['program_start'] is not None:
+        program_start_global = torch.tensor([_STARTUP_TIMESTAMPS['program_start']], dtype=torch.double, device='cuda')
+        torch.distributed.all_reduce(program_start_global, op=torch.distributed.ReduceOp.MIN)
+        program_start_global = program_start_global.item()
+    set_startup_timestamps(program_start=program_start_global)
+
+    global _LEGACY_TRAIN_START_TIME
+    start_time_tensor = torch.tensor([_LEGACY_TRAIN_START_TIME], dtype=torch.double, device='cuda')
     torch.distributed.all_reduce(start_time_tensor, op=torch.distributed.ReduceOp.MIN)
-    _TRAIN_START_TIME = start_time_tensor.item()
+    _LEGACY_TRAIN_START_TIME = start_time_tensor.item()
+
+    # Capture megatron init end time (matches original time.time() placement)
+    megatron_init_end = time.time()
 
     app_metrics = {}
-    app_metrics['app_start_time'] = round(_TRAIN_START_TIME * 1000.0)
-    app_metrics['app_model_init_start_time'] = round(_TRAIN_START_TIME * 1000.0)
+    app_metrics['app_start_time'] = round(program_start_global * 1000.0)
+    app_metrics['app_model_init_start_time'] = round(program_start_global * 1000.0)
 
+    # Print basic megatron init time (using global min start)
+    # NOTE(asolergi-nv): This is not entirely accurate, but we keep it for backwards compatibility.
     print_rank_0(
-        'time to initialize megatron (seconds): {:.3f}'.format(time.time() - _TRAIN_START_TIME)
+        'time to initialize megatron (seconds): {:.3f}'.format(megatron_init_end - _LEGACY_TRAIN_START_TIME)
     )
+
+    # Note, not entirely accurate as rank 0 might not be the first or last to hit these timestamps
+    print_datetime('after in-process setup and before initialize_megatron', timestamp_after_inprocess_setup)
+    print_datetime('after in-job setup and before initialize_megatron', timestamp_after_in_job_setup)
+
+    if program_start is not None and main_entry is not None and pretrain_entry is not None:
+        # Inject startup deltas into timers
+        startup_timers = {
+            'startup-program-entry-spread': program_start - program_start_global, # Local program start timestamp vs the global earliest program start timestamp
+            'startup-library-setup': main_entry - program_start, # Local library imports
+            'startup-program-setup': pretrain_entry - main_entry, # Local __main__ entry to pretrain entry
+            'startup-in-process-setup': timestamp_after_inprocess_setup - pretrain_entry, # Local in-process setup
+            'startup-in-job-setup': timestamp_after_in_job_setup - timestamp_after_inprocess_setup, # Local in-job setup
+            'startup-initialize-megatron': timestamp_after_initialize_megatron - timestamp_after_in_job_setup, # Local initialize megatron
+            'startup-set-jit-fusion-options': timestamp_after_set_jit_fusion_options - timestamp_after_initialize_megatron, # Local set JIT fusion options
+            'all-reduce-start-timestamps-tensor': megatron_init_end - timestamp_after_set_jit_fusion_options, # 2x All-reduce, first collective call
+            'startup-megatron-init-local': megatron_init_end - pretrain_entry, # Local megatron init
+            'startup-megatron-init-global': megatron_init_end - program_start_global, # Local megatron init vs the global earliest program start timestamp
+        }
+        for name, delta in startup_timers.items():
+            timers(name, log_level=0).set_elapsed(delta)
+        timers.log(list(startup_timers.keys()), barrier=True)
+
+        # Print rank 0's absolute timestamps
+        startup_timestamps = {
+            'before library-setup': program_start,
+            'after library-setup': main_entry,
+            'before megatron-init': pretrain_entry,
+        }
+        for name, ts in startup_timestamps.items():
+            ts_str = datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S.%f')
+            print_rank_0(f'[{name}] datetime: {ts_str}')
+
     print_datetime('after megatron is initialized')
     app_metrics['app_model_init_finish_time'] = one_logger_utils.get_timestamp_in_ms()
 
@@ -700,6 +908,70 @@ def pretrain(
     print_datetime('after model, optimizer, and learning rate ' 'scheduler are built')
     config = get_model_config(model[0])
 
+    # Build a separate inference model for RL if requested.
+    inference_model = None
+    if args.perform_rl_step:
+        if (
+            args.rl_inference_tensor_model_parallel_size is not None
+            or args.rl_inference_pipeline_model_parallel_size is not None
+            or args.rl_inference_expert_model_parallel_size is not None
+            or args.rl_inference_expert_tensor_model_parallel_size is not None
+        ):
+            print_rank_0(
+                "Building separate RL inference model with custom parallelism: "
+                f"TP={args.rl_inference_tensor_model_parallel_size}, "
+                f"PP={args.rl_inference_pipeline_model_parallel_size}, "
+                f"EP={args.rl_inference_expert_model_parallel_size}, "
+                f"ExptTP={args.rl_inference_expert_tensor_model_parallel_size}"
+            )
+            inference_pg_collection = build_inference_pg_collection(
+                args.world_size,
+                tp_size=args.rl_inference_tensor_model_parallel_size,
+                pp_size=args.rl_inference_pipeline_model_parallel_size,
+                ep_size=args.rl_inference_expert_model_parallel_size,
+                expt_tp_size=args.rl_inference_expert_tensor_model_parallel_size,
+                use_tp_pp_dp_mapping=args.use_tp_pp_dp_mapping,
+            )
+
+            # Build an isolated inference config so training config remains unchanged
+            inference_config = copy.deepcopy(config)
+            if args.rl_inference_tensor_model_parallel_size is not None:
+                inference_config.tensor_model_parallel_size = args.rl_inference_tensor_model_parallel_size
+            if args.rl_inference_pipeline_model_parallel_size is not None:
+                inference_config.pipeline_model_parallel_size = (
+                    args.rl_inference_pipeline_model_parallel_size
+                )
+            if args.rl_inference_expert_model_parallel_size is not None:
+                inference_config.expert_model_parallel_size = (
+                    args.rl_inference_expert_model_parallel_size
+                )
+            if args.rl_inference_expert_tensor_model_parallel_size is not None:
+                inference_config.expert_tensor_parallel_size = (
+                    args.rl_inference_expert_tensor_model_parallel_size
+                )
+
+            # Optionally allocate the RL inference model weights from a unified virtual memory (UVM)
+            # mempool so we can prefetch weights to CPU when idle while keeping CUDA-graph-safe pointers.
+            uvm_mempool = None
+            uvm_level = args.rl_inference_model_unified_memory_level
+            if uvm_level and uvm_level > 0:
+                uvm_mempool = create_unified_mempool()
+
+            mempool_ctx = (
+                torch.cuda.use_mem_pool(uvm_mempool) if uvm_mempool is not None else nullcontext()
+            )
+            with mempool_ctx:
+                inference_model = get_model(
+                    model_provider,
+                    model_type,
+                    wrap_with_ddp=False,
+                    pg_collection=inference_pg_collection,
+                    config=inference_config,
+                )
+            inference_model[0].eval()
+
+
+
     # Data stuff.
     app_metrics['app_build_dataiters_start_time'] = one_logger_utils.get_timestamp_in_ms()
     timers('train/valid/test-data-iterators-setup', log_level=0).start(barrier=True)
@@ -737,8 +1009,6 @@ def pretrain(
         args.do_valid,
         args.do_test,
         args.dataloader_type,
-        args.retro_project_dir,
-        args.retro_cyclic_train_iters,
     )
 
     # Print setup timing.
@@ -756,11 +1026,6 @@ def pretrain(
     if not args.skip_train:
         print_rank_0('training ...')
 
-        if args.dataloader_type == 'cyclic' and args.retro_project_dir:
-            assert args.retro_cyclic_train_iters is not None
-            args.train_iters = args.retro_cyclic_train_iters
-            print_rank_0("retro cyclic train iters : %d" % args.train_iters)
-
         iteration = 0
         if args.do_train and args.train_iters > 0:
             iteration, num_floating_point_operations_so_far = train(
@@ -774,6 +1039,7 @@ def pretrain(
                 config,
                 checkpointing_context,
                 non_loss_data_func,
+                inference_model,
             )
 
         print_datetime('after training is done')
@@ -802,8 +1068,18 @@ def pretrain(
     if args.do_valid:
         prefix = f'iteration {iteration} on validation set'
         if getattr(args, 'perform_rl_step', False):
+            rl_eval_model = model
+            if inference_model is not None:
+                inf_core = unwrap_model(inference_model[0])
+                # If separate inference and training models, swap training weights
+                # back to the inference model for RL evaluation.
+                rl_utils._maybe_prefetch_separate_inference_model_weights(inf_core, to_cpu=False)
+                swap_model_weights(model, inference_model, args.refit_method)
+                rl_eval_model = inference_model
             rl_utils.evaluate_and_print_results_rl(
-                valid_data_iterator, model, optimizer,
+                valid_data_iterator,
+                rl_eval_model,
+                optimizer,
                 iteration, write_to_tensorboard=not args.skip_train
             )
         else:
@@ -900,7 +1176,6 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
             # For distillation ckpts without ModelOpt state
             args.modelopt_enabled = True
 
-
     # Build model.
     def build_model():
         if (
@@ -939,6 +1214,7 @@ def build_model():
             model.model_type = model_type
         return model
 
+
     if args.init_model_with_meta_device:
         with torch.device('meta'):
             model = build_model()
@@ -988,12 +1264,8 @@ def build_model():
 
     # Materialize tensors on meta device (GPU allocation) if not using FSDP2 and not using Megatron FSDP.
     if args.init_model_with_meta_device and not args.use_torch_fsdp2 and not args.use_megatron_fsdp:
-        #for model_module in model:
         model = [to_empty_if_meta_device(model_module, device=torch.device("cuda")) for model_module in model]
 
-
-
-
     # Before TE2.x: The model_module.bfloat16()/model_module.half() above will call the inplace
     #               copy of TE's Float8Tensor, which will write an unwanted value (amax calculated
     #               from the current fp8 param) to its amax_history. The below function will correct
@@ -1034,8 +1306,6 @@ def build_model():
             kwargs['pad_buckets_for_high_nccl_busbw'] = args.ddp_pad_buckets_for_high_nccl_busbw
             kwargs['reduce_scatter_with_fp32_accumulation'] = args.ddp_reduce_scatter_with_fp32_accumulation
             kwargs['average_in_collective'] = args.ddp_average_in_collective
-            if args.use_megatron_fsdp and args.use_precision_aware_optimizer:
-                kwargs["preserve_fp32_weights"] = False
             ddp_config = DistributedDataParallelConfig(**kwargs)
 
             # In the Megatron FSDP and DDP use path, we need to initialize the bucket size.
@@ -1050,8 +1320,13 @@ def build_model():
             # Set bucket_size to infinity if overlap_grad_reduce is False.
             if not ddp_config.overlap_grad_reduce:
                 ddp_config.bucket_size = None
-
-        with torch.cuda.stream(torch.cuda.Stream()):
+        # Setup stream for ddp initialization. The side-stream may be necessary for cuda graph
+        #  capture support with DDP, but we sync it with the current stream to avoid races.
+        ddp_stream = torch.cuda.Stream()
+        # Wait for the default stream to complete before starting ddp_stream
+        ddp_stream.wait_stream(torch.cuda.current_stream())
+        # Make ddp_stream start after whatever the default stream already queued
+        with torch.cuda.stream(ddp_stream):
             model = [
                 DP(
                     config=config,
@@ -1064,6 +1339,9 @@ def build_model():
                 )
                 for (model_chunk_idx, model_chunk) in enumerate(model)
             ]
+        # End of setup_stream
+        # Critical: ensure side-stream work completes before touching params on default stream
+        torch.cuda.current_stream().wait_stream(ddp_stream)
 
         # Broadcast params from data parallel src rank to other data parallel ranks.
         if args.data_parallel_random_init:
@@ -1133,7 +1411,9 @@ def get_megatron_optimizer_config(args: Any) -> OptimizerConfig:
     """Return a Megatron optimizer config object from Megatron's arguments."""
 
     config = None
-    if args.optimizer == 'adam':
+    if args.optimizer == 'adam' or 'muon' in args.optimizer:
+        # TODO(deyuf): Muon needs both adam + muon but get() only receive one config
+        # So for now we keep using adam config that's back compat with old way
         kwargs = {}
         for f in dataclasses.fields(AdamOptimizerConfig):
             if hasattr(args, f.name):
@@ -1148,17 +1428,9 @@ def get_megatron_optimizer_config(args: Any) -> OptimizerConfig:
     else:
         raise ValueError("Invalid optimizer type!")
 
-    # Construct the appropriate config_overrides object.
-    # TODO: add more logic here as needed down the road.
-    if args.decoupled_lr is not None:
-        decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter")
-        decoupled_optimizer_config = copy.deepcopy(config)
-        decoupled_optimizer_config.lr = args.decoupled_lr
-        if args.decoupled_min_lr is not None:
-            decoupled_optimizer_config.min_lr = args.decoupled_min_lr
-        config_overrides = {decoupled_param_key: decoupled_optimizer_config}
-    else:
-        config_overrides = None
+    # Construct the appropriate config_overrides object. This default handles many cases, but
+    #  can be added to as needed by the user, or replaced entirely with a custom override.
+    config_overrides = get_standard_config_overrides(config=config)
 
     return config, config_overrides
 
@@ -1173,24 +1445,38 @@ def setup_model_and_optimizer(
     timers = get_timers()
     one_logger = get_one_logger()
 
-    model = get_model(model_provider_func, model_type)
+    wrap_with_ddp = not args.skip_train
+    model = get_model(model_provider_func, model_type, wrap_with_ddp=wrap_with_ddp)
     unwrapped_model = unwrap_model(model)
 
     one_logger and one_logger.log_metrics({"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()})
-    config, config_overrides = get_megatron_optimizer_config(args)
-    config.timers = timers
+    if args.skip_train:
+        optimizer, opt_param_scheduler = None, None
+    else:
+        config, config_overrides = get_megatron_optimizer_config(args)
+        config.timers = timers
+
+        if 'muon' not in config.optimizer:
+            # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings
+            # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903
+            # default_skip_embedding_weight_decay=args.embedding_init_method_std is not None,
+            optimizer = get_megatron_optimizer(
+                config,
+                model,
+                config_overrides=config_overrides,
+                use_gloo_process_groups=args.enable_gloo_process_groups,
+                dump_param_to_param_group_map=args.dump_param_to_param_group_map,
+            )
+        else:
+            optimizer = get_megatron_muon_optimizer(
+                config,
+                model,
+                config_overrides=config_overrides,
+                use_gloo_process_groups=args.enable_gloo_process_groups,
+                layer_wise_distributed_optimizer='dist' in config.optimizer,
+            )
+        opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
 
-    # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings
-    # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903
-    # default_skip_embedding_weight_decay=args.embedding_init_method_std is not None,
-    optimizer = get_megatron_optimizer(
-        config,
-        model,
-        config_overrides=config_overrides,
-        use_gloo_process_groups=args.enable_gloo_process_groups,
-        dump_param_to_param_group_map=args.dump_param_to_param_group_map,
-    )
-    opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
     one_logger and one_logger.log_metrics({"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()})
 
     if args.moe_use_upcycling:
@@ -1209,7 +1495,7 @@ def setup_model_and_optimizer(
         # set dense model related args in to global args before getting dense model
         args.num_experts = None
         args.expert_model_parallel_size = 1
-        args.ffn_hidden_size = moe_ffn_hidden_size * args.moe_upcycling_granularity 
+        args.ffn_hidden_size = moe_ffn_hidden_size * args.moe_upcycling_granularity
 
         # get dense model
         dense_model_for_upcycling = get_model(model_provider_func, model_type)
@@ -1314,16 +1600,22 @@ def dummy_train_step(data_iterator):
             batch = get_batch_on_this_cp_rank(batch)
 
 
-def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func):
+def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func, iteration=None):
     """Single training step."""
     args = get_args()
     timers = get_timers()
 
     rerun_state_machine = get_rerun_state_machine()
+    save_dgrads_in_this_iteration = (args.save_dgrads_interval is not None and
+                                     (iteration + 1) % args.save_dgrads_interval == 0)
+    save_wgrads_in_this_iteration = (args.save_wgrads_interval is not None and
+                                     (iteration + 1) % args.save_wgrads_interval == 0)
     while rerun_state_machine.should_run_forward_backward(data_iterator):
         # Set grad to zero.
         for model_chunk in model:
             model_chunk.zero_grad_buffer()
+            # If saving main_grads in this iteration, then all-reduce instead of reduce-scatter.
+            model_chunk.force_all_reduce = save_wgrads_in_this_iteration
         optimizer.zero_grad()
 
         if has_nvidia_modelopt:
@@ -1340,12 +1632,23 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
         # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap,
         # we need to call the _copy_main_params_to_param_buffer() after the grad buffer
         # is zeroed by zero_grad_buffer() because param and grad buffer are shared.
+        #
+        # However, we should skip this on the first iteration when forward_pre_hook is disabled,
+        # because:
+        # 1. The first iteration's params are already in param.data (from init or checkpoint).
+        # 2. Without forward_pre_hook, finish_param_sync() won't be called to zero the grad buffer,
+        #    so the main grads will be polluted by the main params.
         if args.reuse_grad_buf_for_mxfp8_param_ag and args.overlap_param_gather:
-            for optim_instance in optimizer.chained_optimizers:
-                if isinstance(optim_instance, DistributedOptimizer):
-                    optim_instance._copy_main_params_to_param_buffer()
+            # Check if forward_pre_hook is enabled by checking if hooks are registered.
+            forward_pre_hook_enabled = len(model[0].remove_forward_pre_hook_handles) > 0
+            if forward_pre_hook_enabled:
+                for optim_instance in optimizer.chained_optimizers:
+                    if isinstance(optim_instance, DistributedOptimizer):
+                        optim_instance._copy_main_params_to_param_buffer()
 
         # Forward pass.
+        if save_dgrads_in_this_iteration:
+            enable_dgrad_logging(model, args.save)
         losses_reduced = forward_backward_func(
             forward_step_func=forward_step_func,
             data_iterator=data_iterator,
@@ -1356,10 +1659,34 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
             decoder_seq_length=args.decoder_seq_length,
             forward_only=False,
             adjust_tensor_shapes_fn=adjust_tensor_shapes_fn,
+            force_all_reduce=save_wgrads_in_this_iteration,
         )
+        if save_dgrads_in_this_iteration:
+            save_dgrads(iteration + 1)
+            disable_dgrad_logging()
+
+        # Reset force_all_reduce field.
+        for model_chunk in model:
+            model_chunk.force_all_reduce = False
+
+    # Checkpoint main_grads.
+    if save_wgrads_in_this_iteration:
+        # Collect state_dict of wgrads (each param's .main_grad field).
+        state_dict = defaultdict(dict)
+        for model_chunk_id, model_chunk in enumerate(model):
+            model_chunk_name = f"model_chunk{model_chunk_id}"
+            unwrapped_model_chunk = unwrap_model(model_chunk)
+            for param_name, param in unwrapped_model_chunk.named_parameters():
+                if getattr(param, "main_grad", None) is not None:
+                    main_grad_on_cpu = param.main_grad.cpu()
+                    state_dict[model_chunk_name][param_name] = main_grad_on_cpu
+
+        # iteration is 0-indexed, move to 1-indexed for checkpoint name and logging.
+        save_grads(args.save, state_dict, iteration + 1, "wgrads")
+
     should_checkpoint, should_exit, exit_code = rerun_state_machine.should_checkpoint_and_exit()
     if should_exit:
-        return {}, True, should_checkpoint, should_exit, exit_code, None, None
+        return {}, True, should_checkpoint, should_exit, exit_code, None, None, 0
 
     # Empty unused memory.
     if args.empty_unused_memory_level >= 1:
@@ -1374,6 +1701,13 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
 
     timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time)
     update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
+
+    # get max attention logit for logging and run clip_qk()
+    # Part of MuonClip Optimizer step
+    log_max_attention_logit = 0
+    if args.qk_clip or args.log_max_attention_logit:
+        log_max_attention_logit = clip_qk(model, log_max_only=not args.qk_clip)
+
     timers('optimizer').stop()
 
     # when freezing sub-models we may have a mixture of successful and unsucessful ranks,
@@ -1409,28 +1743,14 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
         for key in losses_reduced[0].keys():
             val = [x[key].view(-1) for x in losses_reduced]
             if val[0].numel() == 2:
-                if args.sft:
-                    # in mcore the normalization happens on micro batch instead of global
-                    val = torch.vstack(val)
-                    val = val[:, 0] / val[:, 1]
-                    val = val.mean()
-                    torch.distributed.all_reduce(
-                        val,
-                        group=mpu.get_data_parallel_group(with_context_parallel=True)
-                    )
-                    val /= torch.distributed.get_world_size(
-                        group=mpu.get_data_parallel_group(with_context_parallel=True)
-                    )
-                    loss_reduced[key] = val
-                else:
-                    # there is one dict per microbatch. in new reporting, we average
-                    # over the total number of tokens across the global batch.
-                    val = torch.vstack(val).sum(dim=0)
-                    torch.distributed.all_reduce(
-                        val,
-                        group=mpu.get_data_parallel_group(with_context_parallel=True)
-                    )
-                    loss_reduced[key] = val[0] / val[1]
+                # there is one dict per microbatch. in new reporting, we average
+                # over the total number of tokens across the global batch.
+                val = torch.vstack(val).sum(dim=0)
+                torch.distributed.all_reduce(
+                    val,
+                    group=mpu.get_data_parallel_group(with_context_parallel=True)
+                )
+                loss_reduced[key] = val[0] / val[1]
             elif val[0].numel() == 1:
                 # legacy behavior, we average over the number of microbatches
                 val = torch.cat(val).mean()
@@ -1445,14 +1765,15 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
             exit_code,
             grad_norm,
             num_zeros_in_grad,
+            log_max_attention_logit,
         )
-    return {}, skipped_iter, should_checkpoint, should_exit, exit_code, grad_norm, num_zeros_in_grad
+    return {}, skipped_iter, should_checkpoint, should_exit, exit_code, grad_norm, num_zeros_in_grad, log_max_attention_logit
 
 
 def training_log(
     loss_dict,
     total_loss_dict,
-    learning_rate,
+    learning_rate: float | None,
     iteration,
     loss_scale,
     report_memory_flag,
@@ -1460,6 +1781,9 @@ def training_log(
     grad_norm,
     params_norm,
     num_zeros_in_grad,
+    max_attention_logit,
+    pg_collection=None,
+    is_first_iteration=False,
 ):
     """Log training information such as losses, timing, ...."""
     args = get_args()
@@ -1469,6 +1793,9 @@ def training_log(
     one_logger = get_one_logger()
     energy_monitor = get_energy_monitor()
 
+    # On first iteration, log stats but don't reset accumulators so normal interval stats remain accurate.
+    should_reset = not is_first_iteration
+
     # Advanced, skipped, and Nan iterations.
     advanced_iters_key = 'advanced iterations'
     skipped_iters_key = 'skipped iterations'
@@ -1547,15 +1874,16 @@ def training_log(
     total_iterations = total_loss_dict[advanced_iters_key] + total_loss_dict[skipped_iters_key]
 
     # learning rate will be None on ranks without trainable params, so we must gather across mp ranks
-    learning_rate = reduce_max_stat_across_model_parallel_group(learning_rate)
+    learning_rate: float | None = reduce_max_stat_across_model_parallel_group(learning_rate)
     # Tensorboard values.
     if writer and (iteration % args.tensorboard_log_interval == 0):
         if wandb_writer:
             wandb_writer.log({'samples vs steps': args.consumed_train_samples}, iteration)
-        writer.add_scalar('learning-rate', learning_rate, iteration)
-        writer.add_scalar('learning-rate vs samples', learning_rate, args.consumed_train_samples)
-        if wandb_writer:
-            wandb_writer.log({'learning-rate': learning_rate}, iteration)
+        if learning_rate is not None:
+            writer.add_scalar('learning-rate', learning_rate, iteration)
+            writer.add_scalar('learning-rate vs samples', learning_rate, args.consumed_train_samples)
+            if wandb_writer:
+                wandb_writer.log({'learning-rate': learning_rate}, iteration)
         if args.skipped_train_samples > 0:
             writer.add_scalar('skipped-train-samples', args.skipped_train_samples, iteration)
             if wandb_writer:
@@ -1620,6 +1948,12 @@ def training_log(
                 "mem-max-allocated-bytes", mem_stats["allocated_bytes.all.peak"], iteration
             )
             writer.add_scalar("mem-allocated-count", mem_stats["allocation.all.current"], iteration)
+        if args.log_max_attention_logit:
+            writer.add_scalar('max_attention_logit', max_attention_logit, iteration)
+            if wandb_writer:
+                wandb_writer.log({'max_attention_logit': max_attention_logit}, iteration)
+
+    # Log MoE metrics.
     if args.num_experts is not None:
         moe_loss_scale = 1 / get_num_microbatches()
         track_names = []
@@ -1649,13 +1983,29 @@ def training_log(
             num_layers=layers,
             moe_layer_freq=args.moe_layer_freq,
             mtp_num_layers=args.mtp_num_layers,
+            pg_collection=pg_collection,
         )
+
+    # Log MTP metrics.
     if args.mtp_num_layers is not None:
         mtp_loss_scale = 1 / get_num_microbatches()
         MTPLossLoggingHelper.track_mtp_metrics(
             mtp_loss_scale, iteration, writer, wandb_writer, total_loss_dict
         )
-    if iteration % args.log_interval == 0:
+
+    # Track sparse attention indexer loss.
+    if args.dsa_indexer_loss_coeff is not None and args.dsa_indexer_loss_coeff > 0:
+        indexer_loss_scale = 1 / get_num_microbatches()
+        DSAIndexerLossLoggingHelper.track_indexer_metrics(
+            loss_scale=indexer_loss_scale,
+            iteration=iteration,
+            writer=writer,
+            wandb_writer=wandb_writer,
+            total_loss_dict=total_loss_dict,
+        )
+
+    # Dump memory snapshot and print metrics to stdout.
+    if iteration % args.log_interval == 0 or is_first_iteration:
         if args.record_memory_history and (is_last_rank() or torch.distributed.get_backend() == 'fake'):
             snapshot = torch.cuda.memory._snapshot()
             from pickle import dump
@@ -1663,7 +2013,7 @@ def training_log(
             with open(args.memory_snapshot_path, 'wb') as f:
                 dump(snapshot, f)
 
-        elapsed_time = timers('interval-time').elapsed(barrier=True)
+        elapsed_time = timers('interval-time').elapsed(barrier=True, reset=should_reset)
         elapsed_time_per_iteration = elapsed_time / total_iterations
 
         throughput = num_floating_point_operations(args, batch_size) / (
@@ -1672,12 +2022,15 @@ def training_log(
 
         one_logger_utils.track_e2e_metrics(args.log_throughput, throughput)
 
-        if args.log_timers_to_tensorboard:
+        # We log to stdout after the first iteration (controlled by `is_first_iteration`)
+        # to document initialization overhead. Log statistics to TensorBoard and
+        # WandB according to the regular schedule.
+        if args.log_timers_to_tensorboard and not is_first_iteration:
             if writer:
                 writer.add_scalar('iteration-time', elapsed_time_per_iteration, iteration)
             if wandb_writer:
                 wandb_writer.log({'iteration-time': elapsed_time_per_iteration}, iteration)
-        log_string = f" [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]"
+        log_string = f" [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}]"
         log_string += ' iteration {:8d}/{:8d} |'.format(iteration, args.train_iters)
         log_string += ' consumed samples: {:12d} |'.format(args.consumed_train_samples)
         if has_rl_utils and args.rl_use_sequence_packing:
@@ -1715,7 +2068,8 @@ def training_log(
                 )
                 if avg > 0.0:
                     log_string += ' {}: {:.6E} |'.format(key, avg)
-                total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda')
+                if should_reset:
+                    total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda')
         log_string += f' loss scale: {loss_scale:.1f} |'
         if grad_norm is not None:
             log_string += f' grad norm: {grad_norm:.3f} |'
@@ -1727,25 +2081,32 @@ def training_log(
             total_loss_dict[skipped_iters_key]
         )
         log_string += ' number of nan iterations: {:3d} |'.format(total_loss_dict[nan_iters_key])
-        total_loss_dict[advanced_iters_key] = 0
-        total_loss_dict[skipped_iters_key] = 0
-        total_loss_dict[nan_iters_key] = 0
+        if should_reset:
+            total_loss_dict[advanced_iters_key] = 0
+            total_loss_dict[skipped_iters_key] = 0
+            total_loss_dict[nan_iters_key] = 0
         print_rank_last(log_string)
+        reported_memory_in_this_iteration = False
         if report_memory_flag:
             # Report memory after optimizer state has been initialized.
             if torch.distributed.get_rank() == 0:
                 num_microbatches = get_num_microbatches()
                 report_theoretical_memory(args, num_microbatches=num_microbatches, verbose=True)
             report_memory(f'(after {iteration} iterations)')
-            if iteration > 1:
+            reported_memory_in_this_iteration = True
+            loaded_iteration = max(get_loaded_iteration() or 0, 0)
+            if iteration > (loaded_iteration + 1):
                 # Make sure the memory after the second iteration is reported to include optimizer state memory.
                 report_memory_flag = False
-        # Write timers to wandb, don't reset the counts
+        if args.log_memory_interval is not None and iteration % args.log_memory_interval == 0 and \
+            not reported_memory_in_this_iteration:
+            report_memory(f'(after {iteration} iterations)')
+        # Write timers to wandb, don't reset the counts.
         if args.log_timers_to_tensorboard:
             timers.write(timers_to_log, writer, iteration, normalizer=args.log_interval, reset=False)
             timers.write(timers_to_log, wandb_writer, iteration, normalizer=args.log_interval, reset=False)
         # Log timers to stdout
-        timers.log(timers_to_log, normalizer=args.log_interval)
+        timers.log(timers_to_log, normalizer=args.log_interval, reset=should_reset)
 
     return report_memory_flag
 
@@ -1795,6 +2156,15 @@ def disable_forward_pre_hook(model_chunks, param_sync=True):
         model_chunk.disable_forward_pre_hook(param_sync=param_sync)
 
 
+def force_param_sync(model_chunks: list[DDP]) -> None:
+    for model_chunk in model_chunks:
+        assert isinstance(model_chunk, DDP)
+        model_chunk.start_param_sync(force_sync=True)
+
+# Only report memory for first 3 checkpoint saves.
+num_checkpoints_memory_reported = 0
+MAX_NUM_CHECKPOINTS_MEMORY_REPORTED = 3
+
 def save_checkpoint_and_time(
     iteration,
     model,
@@ -1820,7 +2190,15 @@ def save_checkpoint_and_time(
     # Log E2E metrics before save-checkpoint
     one_logger_utils.track_e2e_metrics()
     if should_disable_forward_pre_hook(args):
-        disable_forward_pre_hook(model)
+        force_param_sync(model)
+
+    global num_checkpoints_memory_reported, MAX_NUM_CHECKPOINTS_MEMORY_REPORTED
+    should_report_memory = num_checkpoints_memory_reported < MAX_NUM_CHECKPOINTS_MEMORY_REPORTED
+
+    if should_report_memory:
+        # Track memory before checkpoint save.
+        report_memory(f"(before save_checkpoint for iteration {iteration})")
+    # Save checkpoint.
     save_checkpoint(
         iteration,
         model,
@@ -1832,13 +2210,16 @@ def save_checkpoint_and_time(
         train_data_iterator=train_data_iterator,
         preprocess_common_state_dict_fn=preprocess_common_state_dict,
     )
+    if should_report_memory:
+        # Track memory after checkpoint save.
+        report_memory(f"(after save_checkpoint for iteration {iteration})")
+    num_checkpoints_memory_reported += 1
+
     if args.fp8:
         # Run garbage collection after checkpoint saving to free memory from
         # dequantized bf16 tensors that were temporarily created during fp8
         # model checkpoint saving.
         gc.collect()
-    if should_disable_forward_pre_hook(args):
-        enable_forward_pre_hook(model)
     timers(timer_key).stop(barrier=True)
     timers.log([timer_key])
 
@@ -1875,6 +2256,7 @@ def post_training_step_callbacks(
 
     # Straggler detector.
     if iteration % args.log_interval == 0 and args.log_straggler:
+        # Use FLOPs accumulated since last log event and then reset the counter
         stimer.report(num_floating_point_operations_since_last_log_event, args.log_interval)
         num_floating_point_operations_since_last_log_event = 0.0
 
@@ -1916,6 +2298,9 @@ def post_training_step_callbacks(
         if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0:
             gc.collect()
 
+    # Return updated FLOPs accumulator so caller can persist the reset
+    return num_floating_point_operations_since_last_log_event
+
 
 def checkpoint_and_decide_exit(
     model,
@@ -2005,7 +2390,13 @@ def checkpoint_and_decide_exit(
             return True
 
     # Exit based on iterations.
-    if args.exit_interval and iteration % args.exit_interval == 0:
+    if (
+        args.exit_interval
+        and iteration % args.exit_interval == 0
+    ) or (
+        args.phase_transition_iterations
+        and iteration in args.phase_transition_iterations
+    ):
         if args.save and not saved_checkpoint:
             save_checkpoint_and_time(
                 iteration,
@@ -2034,6 +2425,7 @@ def train(
     config,
     checkpointing_context,
     non_loss_data_func,
+    inference_model=None,
 ):
     """Training function: run train_step desired number of times, run validation, checkpoint."""
     args = get_args()
@@ -2101,6 +2493,9 @@ def train(
     energy_monitor = get_energy_monitor()
     one_logger = get_one_logger()
 
+    if args.hybrid_context_parallel:
+        train_data_iterator = iter(HybridCPDataLoaderWrapper(train_data_iterator, config))
+
     if args.run_workload_inspector_server:
         try:
             from workload_inspector.utils.webserver import run_server
@@ -2119,6 +2514,8 @@ def train(
     for model_module in model:
         model_module.train()
 
+    model_pg_collection = get_attr_wrapped_model(model[0], "pg_collection")
+
     # Tracking loss.
     total_loss_dict = {}
 
@@ -2177,6 +2574,7 @@ def train(
     pre_hook_enabled = False
     should_exit = False
     exit_code = 0
+    is_first_iteration = True
 
     if args.manual_gc:
         # Disable the default garbage collector and perform the collection manually.
@@ -2207,7 +2605,7 @@ def train(
     eval_iterations = 0
     # Wrap forward_backward_func for Full iteration CUDA graph
     forward_backward_func = get_forward_backward_func()
-    if args.cuda_graph_impl == "local" and args.cuda_graph_scope=="full_iteration":
+    if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
         forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps)
 
     def get_e2e_base_metrics():
@@ -2340,12 +2738,13 @@ def get_e2e_base_metrics():
         # Capture CUDA Graphs.
         if (
             args.cuda_graph_impl == "transformer_engine"
-            and iteration == args.cuda_graph_warmup_steps
+            and not cuda_graph_helper.graphs_created()
+            and iteration - start_iteration == args.cuda_graph_warmup_steps
         ):
-            if iteration > start_iteration and should_disable_forward_pre_hook(args):
+            if args.cuda_graph_warmup_steps > 0 and should_disable_forward_pre_hook(args):
                 disable_forward_pre_hook(model, param_sync=False)
             cuda_graph_helper.create_cudagraphs()
-            if iteration > start_iteration and should_disable_forward_pre_hook(args):
+            if args.cuda_graph_warmup_steps > 0 and should_disable_forward_pre_hook(args):
                 enable_forward_pre_hook(model)
                 cuda_graph_helper.cuda_graph_set_manual_hooks()
 
@@ -2369,8 +2768,15 @@ def get_e2e_base_metrics():
 
         if getattr(args, 'perform_rl_step', False):
             with torch.no_grad():
-                train_data_iterator = rl_utils.setup_grpo_data_iterator(
-                    model, optimizer, iteration, ref_state_dict, buffered_rollouts
+                train_data_iterator = rl_utils.get_grpo_data_iterator(
+                    model, inference_model, optimizer, iteration, ref_state_dict,
+                    grpo_iterations=args.grpo_iterations,
+                    grpo_prompts_per_step=args.grpo_prompts_per_step,
+                    grpo_group_size=args.grpo_group_size,
+                    global_batch_size=args.global_batch_size,
+                    sequence_packing=args.rl_use_sequence_packing,
+                    buffered_rollouts=buffered_rollouts,
+                    is_correction=args.rl_inference_logprobs_is_correction,
                 )
                 # Buffered rollouts are used as a state container for setups when
                 # we use previously-generated data for an update.
@@ -2385,8 +2791,9 @@ def get_e2e_base_metrics():
             exit_code,
             grad_norm,
             num_zeros_in_grad,
+            max_attention_logit,
         ) = train_step(
-            forward_step_func, train_data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func
+            forward_step_func, train_data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func, iteration=iteration
         )
         ft_integration.on_training_step_end()
         if should_checkpoint:
@@ -2422,16 +2829,36 @@ def get_e2e_base_metrics():
                     # Set the manual hooks here since it's not set right after the capturing.
                     if (
                         args.cuda_graph_impl == "transformer_engine"
-                        and iteration == args.cuda_graph_warmup_steps
+                        and args.cuda_graph_warmup_steps == 0
                     ):
+                        assert (
+                            cuda_graph_helper.graphs_created()
+                        ), "CUDA Graphs should have been created."
                         cuda_graph_helper.cuda_graph_set_manual_hooks()
 
         iteration += 1
 
+        # If requested, manually register FSDP communication buffers after a short warmup.
+        if (
+            getattr(args, "fsdp_manual_registration", False)
+            and getattr(args, "use_megatron_fsdp", False)
+            and iteration ==  start_iteration + 1
+        ):
+            for model_chunk in model:
+                if isinstance(model_chunk, megatron_FSDP) and getattr(
+                    model_chunk.ddp_config, "fsdp_manual_registration", False
+                ):
+                    pad_buf = getattr(model_chunk, "param_and_grad_buffer", None)
+                    if pad_buf is not None:
+                        pad_buf.manual_buffer_registration()
+
         if getattr(args, 'perform_rl_step', False) and args.rl_use_sequence_packing:
             iteration_sequences = rl_utils.get_iteration_sequence_count(args)
             # Track bins separately for packed mode
-            rl_utils.update_sequence_packing_metrics(args)
+            bin_count = (
+                mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
+            )
+            args.consumed_train_bins += bin_count
         else:
             batch_size = (
                 mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches()
@@ -2465,12 +2892,7 @@ def get_e2e_base_metrics():
 
         if args.log_params_norm:
             params_norm = calc_params_l2_norm(model)
-        learning_rate = None
-        for param_group in optimizer.param_groups:
-            if len(param_group['params']) == 0:
-                continue
-            if param_group['default_config']:
-                learning_rate = param_group['lr']
+        learning_rate = get_canonical_lr_for_logging(optimizer.param_groups)
         report_memory_flag = training_log(
             loss_dict,
             total_loss_dict,
@@ -2482,7 +2904,11 @@ def get_e2e_base_metrics():
             grad_norm,
             params_norm,
             num_zeros_in_grad,
+            max_attention_logit,
+            pg_collection=model_pg_collection,
+            is_first_iteration=is_first_iteration,
         )
+        is_first_iteration = False
 
         # Evaluation.
         if args.eval_interval and iteration % args.eval_interval == 0 and args.do_valid:
@@ -2498,8 +2924,23 @@ def get_e2e_base_metrics():
             prefix = f'iteration {iteration}'
             timers('eval-time', log_level=0).start(barrier=True)
             if getattr(args, 'perform_rl_step', False):
-                rl_utils.evaluate_and_print_results_rl(valid_data_iterator, model, optimizer,
-                                       iteration, write_to_tensorboard=True)
+                rl_eval_model = model
+                # If separate inference and training models, swap training weights
+                # back to the inference model for RL evaluation.
+                if inference_model is not None:
+                    inf_core = unwrap_model(inference_model[0])
+                    rl_utils._maybe_prefetch_separate_inference_model_weights(
+                        inf_core, to_cpu=False
+                    )
+                    swap_model_weights(model, inference_model, args.refit_method)
+                    rl_eval_model = inference_model
+                rl_utils.evaluate_and_print_results_rl(
+                    valid_data_iterator,
+                    rl_eval_model,
+                    optimizer,
+                    iteration,
+                    write_to_tensorboard=True,
+                )
             else:
                 evaluate_and_print_results(prefix, forward_step_func,
                                        valid_data_iterator, model,
@@ -2521,10 +2962,13 @@ def get_e2e_base_metrics():
             timers('interval-time', log_level=0).start(barrier=True)
             if args.log_energy:
                 energy_monitor.resume()
+            if args.num_experts is not None:
+                clear_aux_losses_tracker()
 
         # Miscellaneous post-training-step functions (e.g., FT heartbeats, GC).
-        # Some of these only happen at specific iterations.
-        post_training_step_callbacks(
+        # Some of these only happen at specific iterations. Capture updated FLOPs accumulator
+        # (it is reset inside the callback after logging).
+        num_floating_point_operations_since_last_log_event = post_training_step_callbacks(
             model,
             optimizer,
             opt_param_scheduler,
@@ -2547,6 +2991,10 @@ def get_e2e_base_metrics():
         if should_exit:
             break
 
+    # Destroy CUDA Graphs.
+    if args.cuda_graph_impl == "transformer_engine" and cuda_graph_helper.graphs_created():
+        cuda_graph_helper.delete_cuda_graphs()
+
     one_logger_utils.track_e2e_metrics()
 
     # Flush TensorBoard, WandB writers and one-logger.
@@ -2563,13 +3011,11 @@ def get_e2e_base_metrics():
     # a persistent async worker if persistent ckpt worker is enabled
     maybe_finalize_async_save(blocking=True, terminate=True)
     ft_integration.on_checkpointing_end(is_async_finalization=True)
-    if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None:
-        ft_integration.get_rank_monitor_client().shutdown_workload_monitoring()
 
     if args.log_energy:
         energy_monitor.lap()
         total_energy = energy_monitor.get_total()
-        print_rank_0(f"Total training energy (GPU): {total_energy / 1e6} MJ")
+        print_rank_0(f"Total training energy (GPU): {total_energy / 1e6:.3f} MJ")
         energy_monitor.shutdown()
 
     # If any exit conditions (signal handler, duration, iterations) have been reached, exit.
@@ -2622,7 +3068,7 @@ def evaluate(
     eval_batch_size = args.global_batch_size
     eval_num_microbatches = eval_batch_size // (args.micro_batch_size * args.data_parallel_size)
     forward_backward_func = get_forward_backward_func()
-    if args.cuda_graph_impl == "local" and args.cuda_graph_scope=="full_iteration":
+    if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
         forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps)
 
     if eval_iters is None:
@@ -2770,7 +3216,7 @@ def evaluate_and_print_results(
         eval_iters = [args.eval_iters]
     else:
         eval_iters = args.eval_iters
-        
+
     if args.full_validation:
         assert len(eval_iters) == len(data_iterators)
 
@@ -2786,7 +3232,7 @@ def evaluate_and_print_results(
         eval_iters = [args.eval_iters]
     else:
         eval_iters = args.eval_iters
-    
+
     for index, (iterator, iterations) in enumerate(zip(data_iterators, eval_iters)):
         suffix = ""
         if args.multiple_validation_sets:
@@ -2854,11 +3300,25 @@ def get_train_valid_test_num_samples():
     if args.full_validation:
         eval_samples = None
     else:
-        eval_iters = (args.train_iters // args.eval_interval + 1) * args.eval_iters
+        if args.skip_train:
+            eval_iters = args.eval_iters
+        else:
+            assert args.train_iters is not None
+            eval_iters = (args.train_iters // args.eval_interval + 1) * args.eval_iters
         eval_samples = eval_iters * args.global_batch_size
-    test_iters = args.eval_iters
+    test_samples = args.eval_iters * args.global_batch_size
+
+    # Get train_samples in current phase.
+    if args.phase_transition_iterations:
+        phase_transition_samples = [0] + [t * args.global_batch_size for t in args.phase_transition_iterations] + [args.train_samples]
+        current_sample = args.iteration * args.global_batch_size
+        last_transition_sample = max(s for s in phase_transition_samples if s <= current_sample)
+        next_transition_sample = min(s for s in phase_transition_samples if s > current_sample)
+        train_samples_in_current_phase = next_transition_sample - last_transition_sample
+    else:
+        train_samples_in_current_phase = train_samples
 
-    return (train_samples, eval_samples, test_iters * args.global_batch_size)
+    return (train_samples_in_current_phase, eval_samples, test_samples)
 
 
 def build_train_valid_test_datasets(build_train_valid_test_datasets_provider, train_valid_test_num_samples=None):
@@ -2886,6 +3346,7 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
         assert (
             args.train_samples is None
         ), 'Only backward compatiblity support for iteration-based training'
+
         args.consumed_train_samples = args.iteration * args.global_batch_size
     if args.iteration > 0 and args.consumed_valid_samples == 0:
         if args.train_samples is None:
@@ -2893,6 +3354,13 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
                 (args.iteration // args.eval_interval) * args.eval_iters * args.global_batch_size
             )
 
+    # Get consumed train samples in this phase.
+    if args.phase_transition_iterations:
+        last_transition = max(iteration for iteration in (0, *args.phase_transition_iterations) if iteration <= args.iteration)
+        consumed_train_samples_in_current_phase = (args.iteration - last_transition) * args.global_batch_size
+    else:
+        consumed_train_samples_in_current_phase = args.consumed_train_samples
+
     # Rely on distributed-aware core datasets, temporary
     is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False)
 
@@ -2908,10 +3376,15 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
             do_train = args.train_iters > 0
             do_valid = (args.full_validation or args.eval_iters > 0)
             do_test = (args.full_validation or args.eval_iters > 0)
+
         else:
+            # Build datasets.
             train_ds, valid_ds, test_ds = build_train_valid_test_datasets(build_train_valid_test_datasets_provider)
             valid_ds = [valid_ds] if not isinstance(valid_ds, list) else valid_ds
-            train_dataloader = build_pretraining_data_loader(train_ds, args.consumed_train_samples)
+            if args.skip_train:
+                train_dataloader = None
+            else:
+                train_dataloader = build_pretraining_data_loader(train_ds, consumed_train_samples_in_current_phase)
             valid_dataloaders = []
             for valid_d in valid_ds:
                 if args.skip_train or args.full_validation:
@@ -2925,7 +3398,7 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
             if not args.multiple_validation_sets:
                 assert len(valid_dataloaders) == 1
             test_dataloader = build_pretraining_data_loader(test_ds, 0)
-            do_train = train_dataloader is not None and args.train_iters > 0
+            do_train = train_dataloader is not None and (args.skip_train or args.train_iters > 0)
             do_valid = valid_dataloaders is not None and (args.full_validation or args.eval_iters > 0)
             do_test = test_dataloader is not None and (args.full_validation or args.eval_iters > 0)
 
@@ -2982,7 +3455,7 @@ def _get_iterator(dataloader_type, dataloader):
 
     if valid_dataloaders is not None:
         # when using full validation, we need to override eval iters with the correct
-        # number of iterations on tp rank 0 so that it can be distributed to the other 
+        # number of iterations on tp rank 0 so that it can be distributed to the other
         # ranks later
         if args.full_validation:
             if args.multiple_validation_sets:
diff --git a/megatron/training/training_config.py b/megatron/training/training_config.py
new file mode 100644
index 00000000000..27c3f384c2f
--- /dev/null
+++ b/megatron/training/training_config.py
@@ -0,0 +1,498 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+from dataclasses import dataclass, field
+import signal
+from typing import Literal
+
+@dataclass(kw_only=True)
+class TrainingConfig:
+    """Configuration settings related to the training loop."""
+
+    micro_batch_size: int | None = None
+    """Batch size per model instance (local batch size). Global batch size is local batch size times
+    data parallel size times number of micro batches."""
+
+    global_batch_size: int | None = None
+    """Training batch size. If set, it should be a multiple of micro-batch-size times
+    data-parallel-size. If this value is None, then use micro-batch-size * data-parallel-size
+    as the global batch size. This choice will result in 1 for number of micro-batches."""
+
+    rampup_batch_size: list[int] | None = field(default=None, metadata={"argparse_meta": {"nargs": 3}})
+    """Batch size ramp up with the following values: <start batch size>, <batch size increment>,
+    <ramp-up samples>
+    For example:
+        rampup-batch-size = [16, 8, 300000]
+        global-batch-size 1024
+    will start with global batch size 16 and over (1024 - 16) / 8 = 126 intervals will increase
+    the batch size linearly to 1024. In each interval we will use approximately
+    300000 / 126 = 2380 samples.
+    """
+
+    decrease_batch_size_if_needed: bool = False
+    """If set, decrease batch size if microbatch_size * dp_size does not 
+    divide batch_size. Old batch_size will be restored if training is re-started 
+    with dp_size that divides batch_size // microbatch_size."""
+
+    empty_unused_memory_level: Literal[0, 1, 2] = 0
+    """Call torch.cuda.empty_cache() each iteration (training and eval), to reduce fragmentation.
+    0=off, 1=moderate, 2=aggressive.
+    """
+
+    check_weight_hash_across_dp_replicas_interval: int | None = None
+    """Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked."""
+
+    train_sync_interval: int | None = None
+    """Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU."""
+
+    train_iters: int | None = None
+    """Total number of iterations to train over all training runs.
+    Note that either train_iters or train_samples should be provided.
+    """
+
+    train_samples: int | None = None
+    """Total number of samples to train over all training runs.
+    Note that either train_iters or train_samples should be provided."""
+
+    exit_interval: int | None = None
+    """Exit the program after the iteration is divisible by this value."""
+
+    exit_duration_in_mins: int | None = None
+    """Exit the program after this many minutes."""
+
+    exit_signal_handler: bool = False
+    """Dynamically save the checkpoint and shutdown the training if SIGTERM is received"""
+
+    exit_signal: signal.Signals = signal.SIGTERM
+    """Signal for the signal handler to detect."""
+
+    exit_signal_handler_for_dataloader: bool = False
+    """Use signal handler for dataloader workers"""
+
+    manual_gc: bool = False
+    """Disable the threshold-based default garbage collector and trigger the garbage collection
+    manually. Manual garbage collection helps to align the timing of the collection across ranks
+    which mitigates the impact of CPU-associated jitters. When the manual gc is enabled, garbage
+    collection is performed only at the start and the end of the validation routine by default."""
+
+    manual_gc_interval: int = 0
+    """Training step interval to trigger manual garbage collection. Values > 0 will trigger garbage
+    collections between training steps.
+    """
+
+    manual_gc_eval: bool = True
+    """When using manual garbage collection, this controls garbage collection at the start and the
+    end of each evaluation run.
+    """
+
+    iterations_to_skip: list[int] = field(default_factory=list)
+    """List of iterations to skip during training, empty by default."""
+
+
+@dataclass(kw_only=True)
+class ValidationConfig:
+    """Configuration settings related to validation during or after model training."""
+
+    eval_iters: int | None = 100
+    """Number of iterations to run for evaluation. Used for both validation and test. If not set,
+    evaluation will not run."""
+
+    eval_interval: int | None = None
+    """Interval between running evaluation on validation set. If not set, evaluation will not run
+    during training.
+    """
+
+    skip_train: bool = False
+    """If set, bypass the training loop, perform evaluation for validation/test, and exit."""
+
+    test_mode: bool = False
+    """Run all real-time test alongside the experiment."""
+
+    full_validation: bool = False
+    """If set, each time validation occurs it uses the full validation dataset(s). This currently only works for GPT datasets!"""
+
+    multiple_validation_sets: bool = False
+    """If set, multiple datasets listed in the validation split are evaluated independently with a
+       separate loss for each dataset in the list. This argument requires that no weights are 
+       included in the list.
+    """
+
+
+@dataclass(kw_only=True)
+class SchedulerConfig:
+    """Configuration settings for the learning rate scheduler and weight decay."""
+
+    # ---------------- Learning rate config. ----------------
+    lr_decay_style: Literal["constant", "linear", "cosine", "inverse-square-root", "WSD"] = "linear"
+    """Learning rate decay function."""
+
+    lr_wsd_decay_style: Literal["exponential", "linear", "cosine", "minus_sqrt"] = "exponential"
+    """Decay style for the annealing phase of WSD"""
+
+    lr_decay_iters: int | None = None
+    """number of iterations to decay learning rate over, If None defaults to train iters"""
+
+    lr_decay_samples: int | None = None
+    """number of samples to decay learning rate over, If None defaults to train samples"""
+
+    lr_wsd_decay_iters: int | None = None
+    """number of iterations for the annealing phase in the wsd schedule"""
+
+    lr_wsd_decay_samples: int | None = None
+    """number of samples for the annealing phase in the wsd schedule"""
+
+    lr_warmup_fraction: float | None = None
+    """fraction of lr-warmup-(iters/samples) to use for warmup (as a float)"""
+
+    lr_warmup_iters: int = 0
+    """number of iterations to linearly warmup learning rate over."""
+
+    lr_warmup_samples: int = 0
+    """number of samples to linearly warmup learning rate over."""
+
+    lr_warmup_init: float = 0.0
+    """Initial value for learning rate warmup. The scheduler starts warmup from this value."""
+
+    lr_decay_steps: int | None = field(init=False, default=None)
+    """number of samples to decay learning rate over. Calculated at runtime from 
+    lr_decay_iters or lr_decay_samples.
+    """
+
+    lr_warmup_steps: int | None = field(init=False, default=None)
+    """number of samples to warmup learning rate over. Calculated at runtime from
+    lr_warmup_fraction, lr_warmup_iters, or lr_warmup_samples.
+    """
+    
+    override_opt_param_scheduler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--override-opt_param-scheduler", "--override-opt-param-scheduler"]}})
+    """Reset the values of the scheduler (learning rate, warmup iterations, minimum learning rate,
+    maximum number of iterations, and decay style) from input arguments and ignore values from
+    checkpoints. Note that all the above values will be reset."""
+
+    use_checkpoint_opt_param_scheduler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--use-checkpoint-opt_param-scheduler", "--use-checkpoint-opt-param-scheduler"]}})
+    """Use checkpoint to set the values of the scheduler (learning rate, warmup iterations,
+    minimum learning rate, maximum number of iterations, and decay style) from checkpoint
+    and ignore input arguments."""
+
+    # ---------------- Regularization config. ----------------
+
+    start_weight_decay: float | None = None
+    """Initial weight decay coefficient for L2 regularization."""
+
+    end_weight_decay: float | None = None
+    """End of run weight decay coefficient for L2 regularization."""
+
+    weight_decay_incr_style: Literal["constant", "linear", "cosine"] = "constant"
+    """Weight decay increment function."""
+
+    no_weight_decay_cond_type: Literal["qwen3_next"] | None = None
+    """Type of no weight decay condition. Choices:
+    None (default): param no weight decay if and only if it is 1D; or it is bias;
+    or it is embedding and embedding_init_method_std is not None.
+    "qwen3_next": In addition to the default rules, apply weight decay to qk layernorm as a special case."""
+
+    wd_incr_steps: int | None = field(init=False, default=None)
+    """Number of samples to increment weight decay over. Calculated at runtime."""
+
+    wsd_decay_steps: int | None = field(init=False, default=None)
+    """Number of samples to decay WSD weight decay. Calculated at runtime."""
+
+
+@dataclass(kw_only=True)
+class LoggerConfig:
+    """Configuration settings for logging, including TensorBoard and WandB."""
+
+    log_interval: int = 100
+    """Report loss and timing interval."""
+
+    log_params_norm: bool = False
+    """If set, calculate and log parameters norm."""
+
+    log_throughput: bool = False
+    """If set, calculate and log throughput per GPU."""
+
+    log_throughput_to_tensorboard: bool = False
+    """Enable throughput logging to tensorboard."""
+
+    throughput_window_size: int = 100
+    """Number of batches to use for a rolling average of throughput."""
+
+    log_progress: bool = False
+    """If set, log progress (in terms of number of processed tokens and number of floating-point operations)
+    to progress.txt file in checkpoint directory.
+    """
+
+    timing_log_level: Literal[0, 1, 2] = 0
+    """Granularity level to measure and report timing.
+    0: report only iteration time and make sure timing does not introduce extra overhead.
+    1: report timing for operations that are executed very limited times (basically once) during each iteration
+        (such as gradient all-reduce)
+    2: report timing for operations that migh be executed numerous times during each iteration.
+    Note that setting the level to 1 or 2 might cause increase in iteration time.
+    """
+
+    timing_log_option: Literal["max", "minmax", "all"] = "minmax"
+    """Options for logging timing:
+    max: report the max timing across all ranks
+    minmax: report min and max timings across all ranks
+    all: report timings of all ranks.
+    """
+
+    tensorboard_dir: str | None = None
+    """Write TensorBoard logs to this directory."""
+
+    tensorboard_log_interval: int = 1
+    """Report to tensorboard interval."""
+
+    tensorboard_queue_size: int = 1000
+    """Size of the tensorboard queue for pending events and summaries
+    before one of the 'add' calls forces a flush to disk.
+    """
+
+    log_timers_to_tensorboard: bool = False
+    """If set, write timers to tensorboard."""
+
+    log_loss_scale_to_tensorboard: bool = True
+    """Disable loss-scale logging to tensorboard."""
+
+    log_validation_ppl_to_tensorboard: bool = False
+    """If set, write validation perplexity to tensorboard."""
+
+    log_memory_to_tensorboard: bool = False
+    """Enable memory logging to tensorboard."""
+
+    memory_keys: dict[str, str] | None = None
+    """Names of memory statistics to log from `torch.cuda.memory_stats()`"""
+
+    log_memory_interval: int | None = None
+    """Report memory interval."""
+
+    log_device_memory_used: bool = False
+    """Log device memory used (as reported by nvidia-smi)."""
+
+    log_l2_norm_grad_to_tensorboard: bool = False
+    """Enable gradients logging to tensorboard."""
+
+    log_num_zeros_in_grad: bool = False
+    """If set, calculate and log the number of zeros in gradient."""
+
+    log_max_attention_logit: bool = False
+    """Enable max attention logit logging to tensorboard."""
+
+    log_runtime_to_tensorboard: bool = False
+    """Enable runtime metrics logging to tensorboard."""
+
+    runtime_time_unit: str = "hours"
+    """Time unit to use for time logging. """
+
+    barrier_with_L1_time: bool = field(default=True, metadata={"argparse_meta": {"arg_names": ["--no-barrier-with-level-1-timing"]}})
+    """If not disabled, use barrier with level 1 time measurements. Note that this is up to the user to
+    make sure calling barrier with their timers will not result in hangs. This can happen if for
+    example the user adds a level 1 timer that is not called by all ranks.
+    """
+
+    log_world_size_to_tensorboard: bool = False
+    """Enable world size logging to tensorboard."""
+
+    wandb_project: str | None = None
+    """The wandb project name. Ignore wandb by default."""
+
+    wandb_exp_name: str | None = None
+    """The wandb experiment name."""
+
+    wandb_save_dir: str | None = None
+    """Path to save the wandb results locally."""
+
+    wandb_entity: str | None = None
+    """The wandb entity name. It is useful when there are multiple sub-projects in a project."""
+
+    logging_level: int | None = None
+    """Set default logging level"""
+
+    filter_warnings: bool = True
+    """Filter out warning messages"""
+
+    modules_to_filter: list[str] | None = None
+    """List of modules to filter out from the logs"""
+
+    set_level_for_all_loggers: bool = False
+    """Set the logging level for all loggers. If False, only level for NeMo loggers will be set."""
+
+    log_energy: bool = False
+    """If set, log energy consumption (in Joules)."""
+
+    save_config_filepath: str | None = None
+    """If set, save the task configuration (ConfigContainer) to this file."""
+
+
+@dataclass(kw_only=True)
+class CheckpointConfig:
+    """Configuration settings for model checkpointing (saving and loading)."""
+
+    save: str | None = None
+    """Output directory to save checkpoints to."""
+
+    save_interval: int | None = field(default=None, metadata={"argparse_meta": {"arg_names": ["--save-interval", "--persistent-save-interval"]}})
+    """Number of iterations between persistent checkpoint saves."""
+
+    save_wgrads_interval: int | None = None
+    """Number of iterations between wgrad (main_grad) saves."""
+
+    save_dgrads_interval: int | None = None
+    """Number of iterations between dgrad saves."""
+
+    save_retain_interval: int | None = None
+    """Number of iterations between retained checkpoints
+    (other checkpoints except the last checkpoint are automatically deleted).
+    """
+
+    most_recent_k: int | None = -1
+    """Number of latest checkpoint to be saved."""
+
+    save_optim: bool = True
+    """Do not save current optimizer."""
+
+    save_rng: bool = True
+    """Do not save current rng state."""
+
+    load: str | None = None
+    """Directory containing a model checkpoint."""
+
+    load_optim: bool = True
+    """Do not load optimizer when loading checkpoint."""
+
+    load_main_params_from_ckpt: bool = False
+    """Load main parameters from checkpoint. When loading a model from a checkpoint without loading
+    the optimizer, the model parameters are updated but for fp16 optimizer with main parameters,
+    the main parameters need to also be updated.
+    """
+
+    load_rng: bool = True
+    """Do not load rng state when loading checkpoint."""
+
+    non_persistent_save_interval: int | None = None
+    """Number of iterations between non-persistent saves."""
+
+    non_persistent_ckpt_type: Literal["global", "local", "in_memory"] | None = None
+    """Type of non-persistent model checkpoints.
+    "global" - Saved as a standard checkpoint (e.g., on Lustre) with old checkpoints being removed.
+    "local" - [TBD] Each rank saves a portion of the checkpoint locally (e.g., on SSD/ramdisk).
+    "in_memory" - [TBD] A special kind of local checkpoint that avoids serialization.
+    None - No non-persistent checkpointing (default option)."""
+
+    non_persistent_global_ckpt_dir: str | None = None
+    """Directory containing global non-persistent model checkpoints."""
+
+    non_persistent_local_ckpt_dir: str | None = None
+    """Directory containing local non-persistent model checkpoints."""
+
+    non_persistent_local_ckpt_algo: Literal["fully_parallel", "atomic"] = "fully_parallel"
+    """Algorithm for local non-persistent checkpointing."""
+
+    finetune: bool = False
+    """Load model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0.
+    Assumed when loading a release checkpoint."""
+
+    pretrained_checkpoint: str | None = None
+    """Directory containing a pretrained model checkpoint for finetuning."""
+
+    ckpt_step: int | None = None
+    """Checkpoint step to load model from."""
+
+    use_checkpoint_args: bool = False
+    """Override model-related command-line arguments with arguments from checkpoint"""
+
+    use_mp_args_from_checkpoint_args: bool = False
+    """Copy model parallelism command-line arguments from checkpoint"""
+
+    use_tokenizer_model_from_checkpoint_args: bool = True
+    """If set, do not use tokenizer model path from checkpoint"""
+
+    exit_on_missing_checkpoint: bool = False
+    """If 'load' is set, but checkpoint is not found (e.g., path typo), then exit instead of random initialization."""
+
+    ckpt_format: Literal["torch", "torch_dist", "torch_dcp", "fsdp_dtensor"] = "torch_dist"
+    """ Checkpoint format to use. torch is the format used by torch.save/load.
+    torch_dist is a megatron built-in distributed checkpointing format.
+    torch_dcp is the torch.distributed.checkpoint format.
+    fsdp_dtensor is a torch DCP native, Megatron FSDP training-specific checkpoint format.
+    """
+
+    auto_detect_ckpt_format: bool = False
+    """Determine if the checkpoint format is in legacy or distributed format. If False,
+    expects distributed checkpoint iff args.ckpt_format != "torch". Might slow down 
+    loading a bit (double rank0 ckpt load).
+    """
+
+    ckpt_convert_format: Literal["torch", "torch_dist"] | None = None
+    """Checkpoint format for conversion."""
+
+    ckpt_convert_save: str | None = None
+    """Save directory for converted checkpoint."""
+
+    ckpt_convert_update_legacy_dist_opt_format: bool = False
+    """When loading a checkpoint, update the legacy format for the distributed optimizer,
+    which previously used a merged param/grad buffer and a different bucket mapping.
+    The legacy format was deprecated on Feb 13, 2024.
+    """
+
+    ckpt_fully_parallel_save: bool = True
+    """Disable applying full save parallelization across DP for distributed checkpoints.
+    Depending on ckpt format might decrease the number of files in the checkpoint.
+    Makes DistributedOptimizer checkpoint non-reshardable."""
+
+    async_save: bool = False
+    """Apply async checkpointing save. Currently works only with `torch_dist` distributed checkpoint format."""
+
+    use_persistent_ckpt_worker: bool = False
+    """Use a persistent background worker for async checkpoint saves. When enabled, creates a dedicated
+    worker thread/process for handling async saves. When disabled, uses temporal workers that are
+    created and destroyed for each save operation."""
+
+    ckpt_fully_parallel_load: bool = False
+    """Apply full load parallelization across DP for distributed checkpoints."""
+
+    ckpt_assume_constant_structure: bool = False
+    """Assume the checkpoint structure is constant across saves to enable optimizations."""
+
+    strict_fsdp_dtensor_load: bool = True
+    """Whether to enforce strict loading for FSDP DTensor checkpoints. When False, allows partial loading."""
+
+    dist_ckpt_strictness: Literal[
+        "assume_ok_unexpected",
+        "log_unexpected",
+        "log_all",
+        "raise_unexpected",
+        "raise_all",
+        "return_unexpected",
+        "return_all",
+        "ignore_all",
+    ] = "assume_ok_unexpected"
+    """Determine handling of key mismatch during checkpoint load. Check StrictHandling docs for flags meaning.
+    NOTE: This flag controls only distributed checkpoint load from storage, not loading state dict into the model."""
+
+    dist_ckpt_save_pre_mcore_014: bool = False
+    """Revert checkpointing simplifications introduced in Megatron-Core v0.14.
+    This option affects only checkpoint saving format and will be removed soon
+    (checkpoint load format is determined based on checkpoint metadata)."""
+
+    dist_ckpt_optim_fully_reshardable: bool = False
+    """Make optimizer distributed checkpoint fully reshardable (TP/PP/EP/DP) as opposed to plain DP reshardability."""
+
+    distrib_optim_fully_reshardable_mem_efficient: bool = False
+    """During distributed optimizer checkpoint save and load tries to use as little memory as possible
+    by using Gloo (instead of NCCL) and only one rank for saving. Turn on only if experiencing host or device memory
+    issues. Has affect only with `dist_ckpt_optim_fully_reshardable` flag."""
+
+    save_tokenizer_assets: bool = True
+    """Save tokenizer files to checkpoint directory. When enabled, saves all tokenizer artifacts
+    (vocab files, special tokens, tokenizer config) to make checkpoints self-contained and portable.
+    Set to False for performance-sensitive scenarios where tokenizer files are not needed."""
+
+    replication: bool = False
+    """If set, replication of local checkpoints is enabled. Needs to be enabled on all ranks."""
+
+    replication_jump: int | None = None
+    """Specifies `J`, the spacing between ranks storing replicas of a given rank's data. Replicas
+    for rank `n` may be on ranks `n+J`, `n+2J`, ..., or `n-J`, `n-2J`, etc. This flag has an
+    effect only if --replication is used. and must be consistent across all ranks."""
+
+    replication_factor: int = 2
+    """Number of machines storing the replica of a given rank's data."""
diff --git a/megatron/training/utils.py b/megatron/training/utils.py
index cc4560a7e3a..01897c15e1a 100644
--- a/megatron/training/utils.py
+++ b/megatron/training/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 """General utilities."""
 import json
@@ -12,6 +12,7 @@
 import torch
 
 from megatron.core.msc_utils import MultiStorageClientFeature, open_file
+from megatron.core._rank_utils import safe_get_rank as _safe_get_rank
 
 try:
     from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_l2norm
@@ -80,10 +81,21 @@ def calc_params_l2_norm(model, force_create_fp32_copy=False):
                 continue
             assert is_not_tp_duplicate
             if not getattr(param, 'allreduce', True):
-                # TODO: Implement memory optimization for MoE parameters.
                 assert param_is_not_shared(param)
                 param = to_local_if_dtensor(param)
-                moe_params_data.append(param.data.float() if args.bf16 else param.data)
+                if args.bf16:
+                    if not force_create_fp32_copy and hasattr(param, 'main_param'):
+                        if getattr(param, 'main_param_sharded', False):
+                            if param.main_param is not None:
+                                sharded_params_data.append(param.main_param)
+                        else:
+                            moe_params_data.append(param.main_param)
+                    else:
+                        # Fallback to original logic of making a fp32 copy of the
+                        # parameter if `.main_param` attribute is not available.
+                        moe_params_data.append(param.data.float())
+                else:
+                    moe_params_data.append(param.data)
             else:
                 if param_is_not_shared(param):
                     param = to_local_if_dtensor(param)
@@ -228,7 +240,7 @@ def average_losses_across_data_parallel_group(losses):
     return averaged_losses
 
 
-def reduce_max_stat_across_model_parallel_group(stat: float) -> float:
+def reduce_max_stat_across_model_parallel_group(stat: float) -> float | None:
     """
     Ranks without an optimizer will have no grad_norm or num_zeros_in_grad stats.
     We need to ensure the logging and writer rank has those values.
@@ -243,6 +255,7 @@ def reduce_max_stat_across_model_parallel_group(stat: float) -> float:
         stat, op=torch.distributed.ReduceOp.MAX, group=mpu.get_model_parallel_group()
     )
     if stat.item() == -1.0:
+        # No rank has a valid stat, so return None to indicate that it is None across all ranks.
         return None
     else:
         return stat.item()
@@ -265,12 +278,15 @@ def logical_and_across_model_parallel_group(input: bool) -> bool:
 
 def report_memory(name):
     """Simple GPU memory report."""
+    args = get_args()
     mega_bytes = 1024.0 * 1024.0
     string = name + ' memory (MB)'
-    string += ' | allocated: {}'.format(torch.cuda.memory_allocated() / mega_bytes)
-    string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() / mega_bytes)
-    string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
-    string += ' | max reserved: {}'.format(torch.cuda.max_memory_reserved() / mega_bytes)
+    string += f" | allocated: {torch.cuda.memory_allocated() / mega_bytes:.2f}"
+    string += f" | max allocated: {torch.cuda.max_memory_allocated() / mega_bytes:.2f}"
+    string += f" | reserved: {torch.cuda.memory_reserved() / mega_bytes:.2f}"
+    string += f" | max reserved: {torch.cuda.max_memory_reserved() / mega_bytes:.2f}"
+    if args.log_device_memory_used:
+        string += f" | total device memory used: {torch.cuda.device_memory_used() / mega_bytes:.2f}"
     if mpu.get_data_parallel_rank() == 0:
         print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
 
@@ -380,11 +396,9 @@ def print_rank_0(message, rank=None):
     if rank is not None:
         if rank == 0:
             print(message, flush=True)
-    elif torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == 0:
-            print(message, flush=True)
     else:
-        print(message, flush=True)
+        if _safe_get_rank() == 0:
+            print(message, flush=True)
 
 
 def warn_rank_0(message, rank=None):
@@ -392,20 +406,20 @@ def warn_rank_0(message, rank=None):
     if rank is not None:
         if rank == 0:
             warnings.warn(message)
-    elif torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == 0:
-            warnings.warn(message)
     else:
-        warnings.warn(message)
+        if _safe_get_rank() == 0:
+            warnings.warn(message)
 
 
 def is_rank0():
-    """Returns true if called in the rank0, false otherwise"""
-    return torch.distributed.is_initialized() and torch.distributed.get_rank() == 0
+    """Returns true if called in the rank0, false otherwise."""
+    return _safe_get_rank() == 0
 
 
 def is_last_rank():
-    return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1)
+    """Returns true if called on last rank, false otherwise."""
+    assert torch.distributed.is_initialized()
+    return _safe_get_rank() == (torch.distributed.get_world_size() - 1)
 
 
 def print_rank_last(message):
@@ -500,7 +514,7 @@ def get_blend_and_blend_per_split(args):
     return blend, blend_per_split
 
 
-def get_batch_on_this_tp_rank(data_iterator):
+def get_batch_on_this_tp_rank(data_iterator, mtp_on_this_rank: bool = False):
 
     args = get_args()
 
@@ -526,68 +540,169 @@ def _broadcast(item):
                 else data["attention_mask"].cuda(non_blocking=True)
             ),
             'position_ids': data["position_ids"].cuda(non_blocking=True),
+            'cu_seqlens': (
+                None
+                if "cu_seqlens" not in data
+                else data["cu_seqlens"].cuda(non_blocking=True)
+            ),
+            'max_seqlen': (
+                None
+                if "max_seqlen" not in data
+                else data["max_seqlen"].cuda(non_blocking=True)
+            ),
+            'local_cp_size': (
+                None
+                if "local_cp_size" not in data
+                else data["local_cp_size"].cuda(non_blocking=True)
+            ),
         }
 
-        if args.pipeline_model_parallel_size == 1:
+        def _broadcast_cu_seqlens(cu_seqlens):
+            dev = torch.cuda.current_device()
+            n = 0 if cu_seqlens is None else int(cu_seqlens.numel())
+            n_tensor = torch.tensor(n, dtype=torch.int64, device=dev)
+            _broadcast(n_tensor)
+
+            if n == 0:
+                buf = torch.empty(0, dtype=torch.int32, device=dev)
+            else:
+                assert isinstance(cu_seqlens, torch.Tensor)
+                assert cu_seqlens.dtype == torch.int32
+                assert cu_seqlens.shape[0] == 1, "micro-batch-size must be 1 for packing"
+                buf = cu_seqlens.to(device=dev, non_blocking=True).contiguous()
+            _broadcast(buf)
+
+        if args.hybrid_context_parallel:
+            seq_len = torch.tensor(batch['tokens'].shape[0], dtype=torch.int32, device=torch.cuda.current_device())
+            _broadcast(seq_len)
+            
+        if args.pipeline_model_parallel_size == 1 or mtp_on_this_rank:
             _broadcast(batch['tokens'])
             _broadcast(batch['labels'])
             _broadcast(batch['loss_mask'])
             _broadcast(batch['attention_mask'])
             _broadcast(batch['position_ids'])
+            _broadcast_cu_seqlens(batch['cu_seqlens'])
+            _broadcast(batch['max_seqlen'])
+            _broadcast(batch['local_cp_size'])
 
         elif mpu.is_pipeline_first_stage():
             _broadcast(batch['tokens'])
             _broadcast(batch['attention_mask'])
             _broadcast(batch['position_ids'])
+            _broadcast_cu_seqlens(batch['cu_seqlens'])
+            _broadcast(batch['max_seqlen'])
 
         elif mpu.is_pipeline_last_stage():
             # Multi-Token Prediction (MTP) layers need tokens and position_ids to calculate embedding.
             # Currently the Multi-Token Prediction (MTP) layers is fixed on the last stage, so we need
             # to broadcast tokens and position_ids to all of the tensor parallel ranks on the last stage.
-            if args.mtp_num_layers is not None:
-                _broadcast(batch['tokens'])
-                _broadcast(batch['position_ids'])
             _broadcast(batch['labels'])
             _broadcast(batch['loss_mask'])
             _broadcast(batch['attention_mask'])
 
-    else:
+        def _broadcast_cu_seqlens(cu_seqlens):
+            dev = torch.cuda.current_device()
+
+            n = 0 if cu_seqlens is None else int(cu_seqlens.numel())
+            n_tensor = torch.tensor(n, dtype=torch.int64, device=dev)
+            _broadcast(n_tensor)
+
+            if n == 0:
+                buf = torch.empty(0, dtype=torch.int32, device=dev)
+            else:
+                assert isinstance(cu_seqlens, torch.Tensor)
+                assert cu_seqlens.dtype == torch.int32
+                assert cu_seqlens.shape[0] == 1, "micro-batch-size must be 1 for packing"
+                buf = cu_seqlens.to(device=dev, non_blocking=True).contiguous()
+            _broadcast(buf)
 
+        _broadcast_cu_seqlens(batch['cu_seqlens'])
+        _broadcast(batch['max_seqlen'])
+
+    else:
+        if args.hybrid_context_parallel:
+            seq_len = torch.tensor(0, dtype=torch.int32, device=torch.cuda.current_device())
+            _broadcast(seq_len)
+            shape = (seq_len.item())
+        else:
+            shape = (args.micro_batch_size, args.seq_length)
+            
         tokens = torch.empty(
-            (args.micro_batch_size, args.seq_length),
+            shape,
             dtype=torch.int64,
             device=torch.cuda.current_device(),
         )
         labels = torch.empty(
-            (args.micro_batch_size, args.seq_length),
+            shape,
             dtype=torch.int64,
             device=torch.cuda.current_device(),
         )
         loss_mask = torch.empty(
-            (args.micro_batch_size, args.seq_length),
+            shape,
             dtype=torch.float32,
             device=torch.cuda.current_device(),
         )
         if args.create_attention_mask_in_dataloader:
+            shape_attention_mask = (args.micro_batch_size, 1, args.seq_length, args.seq_length) if not args.hybrid_context_parallel else (1, 1, shape[0], shape[0])
             attention_mask = torch.empty(
-                (args.micro_batch_size, 1, args.seq_length, args.seq_length),
+                shape_attention_mask,
                 dtype=torch.bool,
                 device=torch.cuda.current_device(),
             )
         else:
             attention_mask = None
         position_ids = torch.empty(
-            (args.micro_batch_size, args.seq_length),
+            shape,
             dtype=torch.int64,
             device=torch.cuda.current_device(),
         )
+        cu_seqlens = None
+        if args.sft:
+            max_seqlen = torch.empty(
+                1,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+        else:
+            max_seqlen = None
+
+        cu_seqlens = None
+        max_seqlen = torch.empty(
+            1,
+            dtype=torch.int32,
+            device=torch.cuda.current_device(),
+        ) if args.hybrid_context_parallel else None
+        local_cp_size = torch.empty(
+            1,
+            dtype=torch.int32,
+            device=torch.cuda.current_device(),
+        ) if args.hybrid_context_parallel else None
+
+        def _broadcast_cu_seqlens():
+            dev = torch.cuda.current_device()
+
+            n = torch.empty((), dtype=torch.int64, device=dev)
+            _broadcast(n)
+            n = int(n.item())
 
-        if args.pipeline_model_parallel_size == 1:
+            if n == 0:
+                cu_seqlens = torch.empty(0, dtype=torch.int32, device=dev)
+            else:
+                cu_seqlens = torch.empty((args.micro_batch_size, n), dtype=torch.int32, device=dev)
+            _broadcast(cu_seqlens)
+
+            return cu_seqlens if n > 0 else None
+
+        if args.pipeline_model_parallel_size == 1 or mtp_on_this_rank:
             _broadcast(tokens)
             _broadcast(labels)
             _broadcast(loss_mask)
             _broadcast(attention_mask)
             _broadcast(position_ids)
+            cu_seqlens = _broadcast_cu_seqlens()
+            _broadcast(max_seqlen)
+            _broadcast(local_cp_size)
 
         elif mpu.is_pipeline_first_stage():
             labels = None
@@ -596,28 +711,49 @@ def _broadcast(item):
             _broadcast(tokens)
             _broadcast(attention_mask)
             _broadcast(position_ids)
+            cu_seqlens = _broadcast_cu_seqlens()
+            _broadcast(max_seqlen)
 
         elif mpu.is_pipeline_last_stage():
             # Multi-Token Prediction (MTP) layers need tokens and position_ids to calculate embedding.
             # Currently the Multi-Token Prediction (MTP) layers is fixed on the last stage, so we need
             # to broadcast tokens and position_ids to all of the tensor parallel ranks on the last stage.
-            if args.mtp_num_layers is not None:
-                _broadcast(tokens)
-                _broadcast(position_ids)
-            else:
-                tokens = None
-                position_ids = None
+            tokens = None
+            position_ids = None
+            cu_seqlens = None
+            max_seqlen = None
 
             _broadcast(labels)
             _broadcast(loss_mask)
             _broadcast(attention_mask)
 
+        def _broadcast_cu_seqlens():
+            dev = torch.cuda.current_device()
+
+            n = torch.empty((), dtype=torch.int64, device=dev)
+            _broadcast(n)
+            n = int(n.item())
+
+            if n == 0:
+                cu_seqlens = torch.empty(0, dtype=torch.int32, device=dev)
+            else:
+                cu_seqlens = torch.empty((args.micro_batch_size, n), dtype=torch.int32, device=dev)
+            _broadcast(cu_seqlens)
+
+            return cu_seqlens if n > 0 else None
+
+        cu_seqlens = _broadcast_cu_seqlens()
+        _broadcast(max_seqlen)
+
         batch = {
             'tokens': tokens,
             'labels': labels,
             'loss_mask': loss_mask,
             'attention_mask': attention_mask,
             'position_ids': position_ids,
+            'cu_seqlens': cu_seqlens,
+            'max_seqlen': max_seqlen,
+            'local_cp_size': local_cp_size,
         }
 
     return batch
diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py
index 405d7b70fad..70ccac4402c 100644
--- a/megatron/training/yaml_arguments.py
+++ b/megatron/training/yaml_arguments.py
@@ -6,13 +6,13 @@
 import dataclasses
 import json
 import os
+import re
 import torch
 import types
+import yaml
 
 from itertools import chain, starmap
 from types import SimpleNamespace
-import yaml, re, os
-from types import SimpleNamespace
 
 import torch.nn.functional as F
 
@@ -318,23 +318,6 @@ def validate_yaml(args, defaults={}):
             raise RuntimeError(
                 "Using async gradient all reduce requires setting the environment "
                 "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
-
-    # Retro checks.
-    if getattr(args, 'retro_add_retriever', False):
-        raise Exception("Retro untested for yaml args. See arguments.py.")
-
-        # Sequence parallelism unsupported.
-        assert not args.sequence_parallel, \
-            "retro currently does not support sequence parallelism."
-
-        # Pipeline parallelism unsupported.
-        assert args.pipeline_model_parallel_size == 1, \
-            "retro currently does not support pipeline parallelism."
-
-    #TODO: Retro args loading not tested
-    # Load retro args (used by both Retro & GPT).
-    if getattr(args, 'retro_project_dir', None) is not None:
-        raise Exception("Retro untested for yaml args. See arguments.py.")
     
     # MoE Spec check
     if args.language_model.num_moe_experts is not None:
diff --git a/pretrain_bert.py b/pretrain_bert.py
index 401c32b4cb9..7cda429a849 100644
--- a/pretrain_bert.py
+++ b/pretrain_bert.py
@@ -28,13 +28,14 @@
 from megatron.core.tokenizers import MegatronTokenizer
 
 
-def model_provider(pre_process=True, post_process=True, vp_stage=None):
+def model_provider(pre_process=True, post_process=True, vp_stage=None, config=None, pg_collection=None):
     """Build the model."""
 
     print_rank_0('building BERT model ...')
 
     args = get_args()
-    config = core_transformer_config_from_args(args)
+    if config is None:
+        config = core_transformer_config_from_args(args)
     num_tokentypes = 2 if args.bert_binary_head else 0
 
     if args.use_legacy_models:
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index 6b602d33243..8eff08d24b2 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -1,7 +1,21 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 """Pretrain and SFT GPT."""
 
+# Capture the true program start time BEFORE any heavy imports.
+import time
+_PROGRAM_START_TIME = time.time()
+
+import json
+
+# Suppress warnings on all ranks but rank 0.
+import os
+import warnings
+rank = int(os.environ.get('RANK', 0))
+if rank != 0:
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+
 from functools import partial
 from typing import List, Optional, Tuple
 
@@ -14,10 +28,20 @@
 from megatron.core.enums import ModelType
 from megatron.core.models.gpt import GPTModel
 from megatron.core.rerun_state_machine import get_rerun_state_machine
+from megatron.core.utils import get_attr_wrapped_model, get_thd_batch_on_this_cp_rank, get_batch_on_this_hybrid_cp_rank, StragglerDetector
 from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer
-from megatron.core.utils import StragglerDetector, get_attr_wrapped_model
-from megatron.training import get_args, get_timers, get_tokenizer, inprocess_restart, pretrain, print_rank_0
+from megatron.training import (
+    get_args,
+    get_timers,
+    get_tokenizer,
+    inprocess_restart,
+    pretrain,
+    print_rank_0,
+    set_startup_timestamps,
+)
 from megatron.training.datasets.sft_dataset import SFTDataset
+from megatron.core.transformer.multi_token_prediction import mtp_on_this_rank, get_mtp_ranks
+from megatron.training.arguments import core_transformer_config_from_args
 from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig
 from megatron.training.utils import (
     get_batch_on_this_cp_rank,
@@ -38,19 +62,39 @@
 stimer = StragglerDetector()
 
 
-def get_batch(data_iterator, vp_stage=None):
+def get_batch(data_iterator, vp_stage: Optional[int] = None):
     """Generate a batch."""
+    args = get_args()
+    config = core_transformer_config_from_args(args)
     # TODO: this is pretty hacky, find a better way
-    if not is_first_or_last_pipeline_stage(vp_stage):
-        return None, None, None, None, None
+    if not is_first_or_last_pipeline_stage(vp_stage) and (
+    (not mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage))):
+        return None, None, None, None, None, None
 
     # get batches based on the TP rank you are on
-    batch = get_batch_on_this_tp_rank(data_iterator)
-
-    # slice batch along sequence dimension for context parallelism
-    batch = get_batch_on_this_cp_rank(batch)
+    batch = get_batch_on_this_tp_rank(
+        data_iterator,
+        mtp_on_this_rank=mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage)
+        )
 
-    return batch.values()
+    cu_seqlens = batch.pop('cu_seqlens', None)
+    cu_seqlens_padded = batch.pop('cu_seqlens_padded', None)
+    max_seqlen = batch.pop('max_seqlen', None)
+    local_cp_size = batch.pop('local_cp_size', None)
+    if local_cp_size is not None:
+        local_cp_size = int(local_cp_size.item())
+
+    if cu_seqlens is None and local_cp_size is None:
+        # slice batch along sequence dimension for context parallelism
+        batch = get_batch_on_this_cp_rank(batch)  # The implementation of this function is in MCore
+        packed_seq_params = None
+    elif local_cp_size is None:  # Packed THD format
+        assert max_seqlen.dim() == 1
+        batch, packed_seq_params = get_thd_batch_on_this_cp_rank(batch, cu_seqlens, cu_seqlens_padded, max_seqlen)
+    else: # Hybrid CP format
+        batch, packed_seq_params = get_batch_on_this_hybrid_cp_rank(batch, local_cp_size)
+    
+    return (*batch.values(), packed_seq_params)
 
 
 # define spiky loss as a loss that's 10x the max loss observed
@@ -135,7 +179,7 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa
     global stimer
     with stimer(bdata=True):
         vp_stage = get_attr_wrapped_model(model, "vp_stage")
-        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator, vp_stage)
+        tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(data_iterator, vp_stage)
     timers('batch-generator').stop()
 
     with stimer:
@@ -151,7 +195,7 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa
                 return schedule_plan, partial(loss_func, loss_mask, model=model)
             else:
                 output_tensor = model(
-                    tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask
+                    tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask, packed_seq_params=packed_seq_params
                 )
 
     # [ModelOpt]: model is needed to access ModelOpt distillation losses
@@ -159,7 +203,12 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa
 
 
 def is_dataset_built_on_rank(vp_stage=None):
-    return is_first_or_last_pipeline_stage(vp_stage) and parallel_state.get_tensor_model_parallel_rank() == 0
+    args = get_args()
+    config = core_transformer_config_from_args(args)
+    return (
+        is_first_or_last_pipeline_stage(vp_stage)
+        or mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage)
+    ) and parallel_state.get_tensor_model_parallel_rank() == 0
 
 
 def core_gpt_dataset_config_from_args(args):
@@ -173,6 +222,11 @@ def core_gpt_dataset_config_from_args(args):
     blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]
     blend, blend_per_split = get_blend_and_blend_per_split(args)
 
+    sequences_per_dataset = None
+    if args.per_dataset_sequences_path is not None:
+        with open(args.per_dataset_sequences_path, "r") as f:
+            sequences_per_dataset = json.load(f)
+
     data_args = {
         "random_seed": args.seed,
         "sequence_length": args.seq_length,
@@ -192,6 +246,13 @@ def core_gpt_dataset_config_from_args(args):
         "object_storage_cache_path": args.object_storage_cache_path,
         "mid_level_dataset_surplus": args.mid_level_dataset_surplus,
         "allow_ambiguous_pad_tokens": args.allow_ambiguous_pad_tokens,
+        "fast_cache_load": args.dataloader_fast_cache_load,
+        "sequences_per_dataset": sequences_per_dataset,
+        "defer_npy_index_mmap": args.dataloader_defer_npy_index_mmap,
+        "context_parallel_size": args.context_parallel_size,
+        "data_parallel_size": args.data_parallel_size,
+        "sequence_parallel_size": args.tensor_model_parallel_size*args.sequence_parallel,
+        "hybrid_context_parallel": args.hybrid_context_parallel,
     }
 
     # add FIM args to the config
@@ -240,6 +301,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
 
     print_rank_0("> building train, validation, and test datasets for GPT ...")
 
+    is_dataset_built = partial(is_dataset_built_on_rank, vp_stage=vp_stage)
     train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
         dataset_type, train_val_test_num_samples, partial(is_dataset_built_on_rank, vp_stage=vp_stage), config
     ).build()
@@ -249,7 +311,27 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
     return train_ds, valid_ds, test_ds
 
 
+def get_embedding_ranks(pp_ranks: List[int]):
+    """Get the embedding ranks."""
+    embedding_ranks = [pp_ranks[0]]
+    if len(pp_ranks) > 1:
+        args = get_args()
+        if not args.untie_embeddings_and_output_weights:
+            embedding_ranks.append(pp_ranks[-1])
+        config = core_transformer_config_from_args(args)
+        mtp_ranks = get_mtp_ranks(pp_ranks, config)
+        embedding_ranks.extend(mtp_ranks)
+    embedding_ranks = list(set(embedding_ranks))
+    embedding_ranks = sorted(embedding_ranks)
+    return embedding_ranks
+
+
 if __name__ == "__main__":
+    # Timestamp right after entering __main__ block (after all imports/library setup)
+    _MAIN_ENTRY_TIME = time.time()
+
+    # Register startup timestamps for timing report in pretrain()
+    set_startup_timestamps(program_start=_PROGRAM_START_TIME, main_entry=_MAIN_ENTRY_TIME)
 
     # Temporary for transition to core datasets
     train_valid_test_datasets_provider.is_distributed = True
@@ -265,4 +347,5 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
         args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
         extra_args_provider=add_modelopt_args if has_nvidia_modelopt else None,
         store=store,
+        get_embedding_ranks=get_embedding_ranks,
     )
diff --git a/pretrain_mamba.py b/pretrain_mamba.py
index 45b646a6cc0..e1379be63e9 100644
--- a/pretrain_mamba.py
+++ b/pretrain_mamba.py
@@ -1,6 +1,20 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 """Pretrain and SFT Mamba."""
 
+# Capture the true program start time BEFORE any heavy imports.
+import time
+_PROGRAM_START_TIME = time.time()
+
+import json
+
+# Suppress warnings on all ranks but rank 0.
+import os
+import warnings
+rank = int(os.environ.get('RANK', 0))
+if rank != 0:
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+
 from functools import partial
 from typing import List, Optional, Tuple
 
@@ -11,11 +25,24 @@
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
 from megatron.core.enums import ModelType
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.parallel_state import (
+    get_context_parallel_rank,
+    get_context_parallel_world_size,
+)
 from megatron.core.models.mamba import MambaModel
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer
-from megatron.core.utils import StragglerDetector, get_attr_wrapped_model
-from megatron.training import get_args, get_timers, get_tokenizer, inprocess_restart, pretrain, print_rank_0
+from megatron.core.utils import get_attr_wrapped_model, is_te_min_version, StragglerDetector
+from megatron.training import (
+    get_args,
+    get_timers,
+    get_tokenizer,
+    inprocess_restart,
+    pretrain,
+    print_rank_0,
+    set_startup_timestamps,
+)
 from megatron.training.datasets.sft_dataset import SFTDataset
 from megatron.training.utils import (
     get_batch_on_this_cp_rank,
@@ -32,21 +59,90 @@
 except ImportError:
     has_nvidia_modelopt = False
 
+try:
+    # Register the TE CUDA kernels
+    import transformer_engine  # pylint: disable=unused-import
+
+    # Alias the PyTorch wrapper so we can call tex.* APIs
+    import transformer_engine_torch as tex
+except ImportError:
+    # TE isn’t installed or the torch wrapper is missing
+    tex = None
+
 stimer = StragglerDetector()
 
 
 def get_batch(data_iterator, vp_stage=None):
     """Generate a batch."""
 
-    # TODO: this is pretty hacky, find a better way
-    if not is_first_or_last_pipeline_stage(vp_stage):
-        return None, None, None, None, None
+    empty_batch = {
+        'tokens': None,
+        'labels': None,
+        'loss_mask': None,
+        'attention_mask': None,
+        'position_ids': None,
+        'cu_seqlens': None,
+        'max_seqlen': None,
+    }
+
+    # TODO(duncan): Is there a more efficient way to access is_packed_sequence here?
+    is_packed_sequence = get_args().sft  # SFT always uses packed sequence
+    if not is_first_or_last_pipeline_stage(vp_stage) and not is_packed_sequence:
+        return empty_batch.values()
 
-    # get batches based on the TP rank you are on
     batch = get_batch_on_this_tp_rank(data_iterator)
-
-    # slice batch along sequence dimension for context parallelism
-    batch = get_batch_on_this_cp_rank(batch)
+    
+    cu_seqlens = batch['cu_seqlens']
+    # Unused at the moment
+    cu_seqlens_padded = batch.pop('cu_seqlens_padded', None)
+    # Support for Hybrid Context Parallel (Unused in this script)
+    local_cp_size = batch.pop('local_cp_size', None)
+
+    if cu_seqlens is not None:
+        assert (
+            cu_seqlens.dim() == 2 and cu_seqlens.shape[0] == 1
+        ), "micro-batch-size must be 1 for packing"
+        cu_seqlens = cu_seqlens[0]
+        batch['cu_seqlens'] = cu_seqlens
+
+        max_seqlen = batch['max_seqlen']
+        assert max_seqlen.dim() == 1
+        # TODO(duncan): can this be kept as a 0-D tensor?
+        batch['max_seqlen'] = int(max_seqlen[0].item())
+
+    if mpu.is_pipeline_first_stage(ignore_virtual=(vp_stage is None), vp_stage=vp_stage):
+        total_tokens = batch['tokens'].size(1)
+    elif mpu.is_pipeline_last_stage(ignore_virtual=(vp_stage is None), vp_stage=vp_stage):
+        total_tokens = batch['labels'].size(1)
+    else:  # packed sequence
+        empty_batch['cu_seqlens'] = cu_seqlens
+        empty_batch['max_seqlen'] = max_seqlen
+        return empty_batch.values()
+
+    if cu_seqlens is None:
+        # slice batch along sequence dimension for context parallelism
+        batch = get_batch_on_this_cp_rank(batch)  # The implementation of this function is in MCore
+    else:  # Packed THD format
+        cp_size = get_context_parallel_world_size()
+        if cp_size > 1:  # slice batch along sequence dimension for context parallelism
+            assert tex is not None and is_te_min_version("1.10.0"), (
+                "Please update Transformer Engine to >= 1.10 to use "
+                "Context Parallel with THD format data"
+            )
+            cp_rank = get_context_parallel_rank()
+            index = tex.thd_get_partitioned_indices(
+                cu_seqlens,
+                total_tokens,
+                cp_size,
+                cp_rank,
+            )
+            for key, data in batch.items():
+                if key in {'attention_mask', 'cu_seqlens', 'max_seqlen'}:
+                    continue
+                if data is not None:
+                    # On first PP rank, labels and loss_mask can be None.
+                    # On last PP rank, tokens and position_ids can be None.
+                    batch[key] = data.index_select(1, index)
 
     return batch.values()
 
@@ -123,22 +219,57 @@ def forward_step(data_iterator, model: MambaModel):
 
     # Get the batch.
     timers('batch-generator', log_level=2).start()
+
     global stimer
+
     with stimer(bdata=True):
         vp_stage = get_attr_wrapped_model(model, "vp_stage")
-        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator, vp_stage)
+        (
+            tokens,
+            labels,
+            loss_mask,
+            attention_mask,
+            position_ids,
+            cu_seqlens,
+            max_seqlen,
+        ) = get_batch(data_iterator, vp_stage)
+
+    if cu_seqlens is None:
+        packed_seq_params = None
+    else:
+        # TODO(duncan): This class seems overly complex for what needs to be conveyed
+        packed_seq_params = PackedSeqParams(
+            qkv_format="thd",
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_kv=cu_seqlens,
+            cu_seqlens_q_padded=None,
+            cu_seqlens_kv_padded=None,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_kv=max_seqlen,
+        )
+
     timers('batch-generator').stop()
 
     with stimer:
-        output_tensor = model(tokens, position_ids, attention_mask,
-                              labels=labels)
+        output_tensor = model(
+            tokens,
+            position_ids,
+            attention_mask,
+            labels=labels,
+            packed_seq_params=packed_seq_params,
+        )
 
     # [ModelOpt]: model is needed to access ModelOpt distillation losses
     return output_tensor, partial(loss_func, loss_mask, model=model)
 
 
-def is_dataset_built_on_rank(vp_stage=None):
-    return is_first_or_last_pipeline_stage(vp_stage) and mpu.get_tensor_model_parallel_rank() == 0
+def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False):
+    if mpu.get_tensor_model_parallel_rank() != 0:
+        return False
+    elif is_packed_sequence:
+        return True
+    else:
+        return is_first_or_last_pipeline_stage(vp_stage)
 
 
 def core_gpt_dataset_config_from_args(args):
@@ -152,6 +283,11 @@ def core_gpt_dataset_config_from_args(args):
     blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]
     blend, blend_per_split = get_blend_and_blend_per_split(args)
 
+    sequences_per_dataset = None
+    if args.per_dataset_sequences_path is not None:
+        with open(args.per_dataset_sequences_path, "r") as f:
+            sequences_per_dataset = json.load(f)
+
     return GPTDatasetConfig(
         random_seed=args.seed,
         sequence_length=args.seq_length,
@@ -169,6 +305,10 @@ def core_gpt_dataset_config_from_args(args):
         object_storage_cache_path=args.object_storage_cache_path,
         mid_level_dataset_surplus=args.mid_level_dataset_surplus,
         allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens,
+        fast_cache_load=args.dataloader_fast_cache_load,
+        sequences_per_dataset=sequences_per_dataset,
+        defer_npy_index_mmap=args.dataloader_defer_npy_index_mmap,
+        context_parallel_size=args.context_parallel_size,
     )
 
 
@@ -181,8 +321,10 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
     args = get_args()
     config = core_gpt_dataset_config_from_args(args)
 
+    is_packed_sequence = False
     if args.sft:
         dataset_type = SFTDataset
+        is_packed_sequence = True  # SFT always uses packed sequence
     else:
         if args.mock_data:
             dataset_type = MockGPTDataset
@@ -194,7 +336,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
     train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
         dataset_type,
         train_val_test_num_samples,
-        partial(is_dataset_built_on_rank, vp_stage=vp_stage),
+        partial(is_dataset_built_on_rank, vp_stage=vp_stage, is_packed_sequence=is_packed_sequence),
         config
     ).build()
 
@@ -204,6 +346,11 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
 
 
 if __name__ == "__main__":
+    # Timestamp right after entering __main__ block (after all imports/library setup)
+    _MAIN_ENTRY_TIME = time.time()
+
+    # Register startup timestamps for timing report in pretrain()
+    set_startup_timestamps(program_start=_PROGRAM_START_TIME, main_entry=_MAIN_ENTRY_TIME)
 
     # Temporary for transition to core datasets
     train_valid_test_datasets_provider.is_distributed = True
diff --git a/pretrain_retro.py b/pretrain_retro.py
deleted file mode 100644
index 63abbac5e39..00000000000
--- a/pretrain_retro.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain Retro."""
-
-from functools import partial
-import torch
-
-from megatron.training import get_args
-from megatron.training import get_tokenizer
-from megatron.training import get_timers
-from megatron.training import print_rank_0
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core import tensor_parallel
-from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets
-from megatron.core.datasets.retro.query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig
-from megatron.core.enums import ModelType
-from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel
-from megatron.core.models.retro.utils import get_all_true_mask
-from megatron.core.tokenizers import MegatronTokenizer
-from megatron.training import pretrain
-from megatron.training.utils import get_ltor_masks_and_position_ids
-from pretrain_gpt import (
-    is_dataset_built_on_rank,
-    loss_func,
-    model_provider as default_model_provider,
-    train_valid_test_datasets_provider as gpt_train_valid_test_datasets_provider,
-)
-
-
-def get_retro_config():
-    return core_transformer_config_from_args(get_args(), RetroConfig)
-
-
-def core_model_provider(pre_process=True, post_process=True):
-    """Build the model using Megatron-Core."""
-
-    args = get_args()
-    config = get_retro_config()
-
-    # NOTE: Experimental customization feature
-    if args.spec is not None:
-        block_spec = import_module(args.spec)()
-    else:
-        block_spec = get_retro_decoder_block_spec(config, use_transformer_engine=True)
-
-    print_rank_0('building GPT model ...')
-    model = RetroModel(
-        config=config,
-        transformer_layer_spec=block_spec,
-        vocab_size=args.padded_vocab_size,
-        max_sequence_length=args.max_position_embeddings,
-        pre_process=pre_process,
-        post_process=post_process,
-        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
-        parallel_output=True,
-        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
-        position_embedding_type=args.position_embedding_type,
-        rotary_percent=args.rotary_percent
-    )
-    return model
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model.
-
-    Select between two different model classes:
-      1. Default model (uses megatron.legacy.models/gpt_model.py).
-      2. Core model (uses megatron/core/models/retro/model.py).
-    """
-
-    args = get_args()
-    if not args.use_legacy_models and args.retro_add_retriever:
-        provider = core_model_provider
-    else:
-        provider = default_model_provider
-    model = provider(pre_process=pre_process, post_process=post_process)
-    return model
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-
-    args = get_args()
-
-    if args.legacy_tokenizer:
-        tokenizer = get_tokenizer()
-    else:
-        tokenizer = build_tokenizer(args)
-        
-    config = get_retro_config()
-
-    # Items and their type.
-    keys = ['text']
-    if args.retro_add_retriever:
-        keys.append('neighbor_tokens')
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    if args.retro_add_retriever:
-        # note: [bs * l * k, r]
-        # note: 2x == neighbor, continuation
-        neighbor_tokens = data_b['neighbor_tokens'] \
-            .view(-1, config.retro_retrieved_length).long()
-        _, _, neighbor_position_ids = get_ltor_masks_and_position_ids(
-            neighbor_tokens,
-            tokenizer.eod,
-            args.reset_position_ids,
-            args.reset_attention_mask,
-            args.eod_mask_loss)
-        neighbor_attention_mask = get_all_true_mask(
-            (1, 1, config.retro_retrieved_length, config.retro_retrieved_length),
-            neighbor_tokens.device)
-        return tokens, labels, loss_mask, attention_mask, position_ids, \
-               neighbor_tokens, neighbor_attention_mask, neighbor_position_ids
-
-    else:
-        return tokens, labels, loss_mask, attention_mask, position_ids
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator').start()
-    if args.retro_add_retriever:
-        tokens, labels, loss_mask, attention_mask, position_ids, \
-            neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \
-                get_batch(data_iterator)
-    else:
-        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-            data_iterator)
-        neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \
-            None, None, None
-    timers('batch-generator').stop()
-
-    # Model call.
-    if args.use_legacy_models:
-        forward_kwargs = {
-            "retriever_input_ids" : neighbor_tokens,
-            "retriever_position_ids" : neighbor_position_ids,
-            "retriever_attn_mask" : neighbor_attention_mask,
-        }
-    else:
-        if args.retro_add_retriever:
-            forward_kwargs = {
-                "context_input_ids" : neighbor_tokens,
-                "context_position_ids" : neighbor_position_ids,
-                "context_mask" : neighbor_attention_mask,
-            }
-        else:
-            forward_kwargs = {}
- 
-    output_tensor = model(tokens, position_ids, attention_mask,
-                          labels=labels, **forward_kwargs)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_valid_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    if args.legacy_tokenizer:
-        tokenizer = get_tokenizer()
-    else:
-        tokenizer = build_tokenizer(args)
-
-    # Dataset config.
-    retro_config = get_retro_config()
-    data_config = MultiSplitGPTDatasetConfig(
-        random_seed=args.seed,
-        sequence_length=args.seq_length,
-        blend=get_blend_from_list(args.data_path),
-        blend_per_split=[
-            get_blend_from_list(args.train_data_path),
-            get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
-        ],
-        split=args.split,
-        split_preprocessing=retro_config.retro_split_preprocessing,
-        path_to_cache=args.data_cache_path,
-        return_document_ids=False,
-        tokenizer=tokenizer,
-        reset_position_ids=args.reset_position_ids,
-        reset_attention_mask=args.reset_attention_mask,
-        eod_mask_loss=args.eod_mask_loss,
-        mid_level_dataset_surplus=args.mid_level_dataset_surplus,
-        allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens,
-    )
-
-    # GPT datasets.
-    print_rank_0(" > multi-split gpt datasets.")
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        MultiSplitGPTDataset,
-        train_valid_test_num_samples,
-        is_dataset_built_on_rank,
-        data_config,
-    ).build()
-
-    gpt_datasets = {
-        "train" : (train_ds, train_valid_test_num_samples[0]),
-        "valid" : (valid_ds, train_valid_test_num_samples[1]),
-        "test"  : (test_ds, train_valid_test_num_samples[2]),
-    }
-
-    # Retro datasets.
-    if args.retro_add_retriever:
-        return get_retro_datasets(
-            config=retro_config,
-            gpt_datasets=gpt_datasets,
-            sample_length=args.seq_length,
-            eod_token_id=get_tokenizer().eod,
-        )
-
-    # Multi-split GPT datasets.
-    else:
-        return (
-            gpt_datasets["train"][0],
-            gpt_datasets["valid"][0],
-            gpt_datasets["test"][0],
-        )
-
-
-if __name__ == "__main__":
-
-    # Temporary for transition to core datasets.
-    train_valid_test_datasets_provider.is_distributed = True
-
-    pretrain(train_valid_test_datasets_provider,
-             model_provider,
-             ModelType.retro_decoder,
-             forward_step,
-             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
diff --git a/pretrain_t5.py b/pretrain_t5.py
index e74e7d8809e..2badae9d5ee 100644
--- a/pretrain_t5.py
+++ b/pretrain_t5.py
@@ -66,7 +66,12 @@
 
 
 def model_provider(
-    pre_process=True, post_process=True, add_encoder=True, add_decoder=True
+    pre_process=True,
+    post_process=True,
+    add_encoder=True,
+    add_decoder=True,
+    config=None,
+    pg_collection=None,
 ) -> Union[megatron.legacy.model.T5Model, T5Model]:
     """Builds the model.
 
@@ -83,7 +88,8 @@ def model_provider(
 
     args = get_args()
     
-    config = core_transformer_config_from_args(args)
+    if config is None:
+        config = core_transformer_config_from_args(args)
     if args.use_legacy_models:
         model = megatron.legacy.model.T5Model(
             config=config,
diff --git a/pretrain_vlm.py b/pretrain_vlm.py
index 524931d2727..9da1afa669f 100644
--- a/pretrain_vlm.py
+++ b/pretrain_vlm.py
@@ -43,7 +43,13 @@
 
 
 def model_provider(
-    pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
+    pre_process=True,
+    post_process=True,
+    add_encoder=True,
+    add_decoder=True,
+    parallel_output=True,
+    config=None,
+    pg_collection=None,
 ) -> LLaVAModel:
     """Builds the model.
 
@@ -100,7 +106,10 @@ def model_provider(
     args.max_position_embeddings = max(args.max_position_embeddings, args.decoder_seq_length)
 
     print_rank_0('building a multimodal model ...')
-    language_transformer_config = core_transformer_config_from_args(get_args())
+    if config is None:
+        language_transformer_config = core_transformer_config_from_args(get_args())
+    else:
+        language_transformer_config = config
     if args.decoder_num_layers is not None:
         language_transformer_config.num_layers = args.decoder_num_layers
     else:
diff --git a/pyproject.toml b/pyproject.toml
index b7ca0287dd9..e082e983c56 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 
 [build-system]
-requires = ["setuptools>=80.0.0", "pybind11", "packaging>=24.2"]
+requires = ["setuptools<80.0.0", "pybind11", "packaging>=24.2"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
@@ -20,7 +20,7 @@ dynamic = ["version", "readme"]
 description = "Megatron Core - a library for efficient and scalable training of transformer based models"
 requires-python = ">=3.10"
 license = { text = "Apache 2.0" }
-dependencies = ["torch", "numpy", "packaging>=24.2"]
+dependencies = ["torch>=2.6.0", "numpy", "packaging>=24.2"]
 authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
 maintainers = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
 keywords = [
@@ -68,7 +68,7 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]
 
 dev = [
     "nvidia-modelopt[torch]; sys_platform != 'darwin'",
-    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
     "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
@@ -78,14 +78,16 @@ dev = [
     "opentelemetry-api~=1.33.1",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
+    "flash-linear-attention~=0.4.0",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
-    "flashinfer-python",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
     "fastapi~=0.50",                                          # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
     "datasets",
+    "emerging_optimizers",
 ]
 
 lts = [
@@ -100,11 +102,12 @@ lts = [
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
-    "flashinfer-python",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
     "fastapi~=0.50",                      # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
     "datasets",
+    "emerging_optimizers",
 ]
 
 [dependency-groups]
@@ -132,13 +135,13 @@ docs = [
     "nvidia-sphinx-theme", # Our NVIDIA theme
 ]
 build = [
-    "setuptools<80.0.0",
+    "setuptools<80.0.0,>=77.0.0",
     "packaging>=24.2",
     "hatchling",
     "pybind11",
     "Cython>=3.0.0",
     "torch",
-    "nvidia-mathdx",     # for TE
+    "nvidia-mathdx",              # for TE
 ]
 linting = [
     "ruff~=0.9.0",
@@ -172,12 +175,13 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
+
 flash_mla = [
     { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" },
 ]
-# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
+transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "5671fd3675906cda1ade26c24a65d3dedd88eb89" }
 nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" }
-emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "fb1add873e7851ec34b48581ea1b15761b73d189" }
+emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "v0.1.0" }
 
 [tool.isort]
 profile = "black"                                                          # black-compatible
diff --git a/scripts/check_api_backwards_compatibility.py b/scripts/check_api_backwards_compatibility.py
index 4977b806433..3c66f00b619 100644
--- a/scripts/check_api_backwards_compatibility.py
+++ b/scripts/check_api_backwards_compatibility.py
@@ -46,13 +46,22 @@
 # Decorators that exempt objects from compatibility checks
 EXEMPT_DECORATORS = ['internal_api', 'deprecated', 'experimental_api']
 
-# Breakage kinds to ignore (not actual API signature changes)
+# Breakage kinds to ignore globally (not actual API signature changes)
 # AttributeChangedValueBreakage: Changing constant values (e.g., VERSION = "1.0" -> "2.0")
 #   is not a breaking API change - the constant still exists with the same name
 IGNORED_BREAKAGE_KINDS = [
     'AttributeChangedValueBreakage',
 ]
 
+# Breakage kinds to ignore only for __init__ methods
+# ParameterMovedBreakage: Reordering parameters in __init__ is generally safe because:
+#   - Config dataclasses should always be initialized with keyword arguments
+#   - Adding fields to parent dataclasses shifts child __init__ params (inheritance artifact)
+#   - Nobody should call Config(4096, 32, ...) with positional args
+IGNORED_FOR_INIT_METHODS = [
+    'ParameterMovedBreakage',
+]
+
 
 def has_exempt_decorator(obj: Object) -> bool:
     """Check if a Griffe object has any exempt decorator.
@@ -217,6 +226,7 @@ def should_skip_change(change, filtered_paths: set) -> bool:
     
     A change is skipped if:
     - The change kind is in IGNORED_BREAKAGE_KINDS (not a signature change)
+    - The change kind is in IGNORED_FOR_INIT_METHODS and affects an __init__ method
     - The changed object itself is in filtered_paths (exact match)
     - The changed object is a child of an exempt object (prefix match)
     
@@ -227,7 +237,7 @@ def should_skip_change(change, filtered_paths: set) -> bool:
     Returns:
         bool: True if the change should be skipped (filtered out)
     """
-    # Check if this breakage kind should be ignored (not a signature change)
+    # Check if this breakage kind should be ignored globally (not a signature change)
     change_kind = type(change).__name__
     if change_kind in IGNORED_BREAKAGE_KINDS:
         return True
@@ -240,6 +250,12 @@ def should_skip_change(change, filtered_paths: set) -> bool:
     # e.g., "Class.__init__(param)" -> "Class.__init__"
     clean_path = path.split('(')[0] if '(' in path else path
     
+    # Check if this is a breakage kind we ignore for __init__ methods
+    # Config dataclasses should use keyword args, so parameter reordering is safe
+    if change_kind in IGNORED_FOR_INIT_METHODS:
+        if '.__init__' in clean_path:
+            return True
+    
     # Check exact match
     if clean_path in filtered_paths or path in filtered_paths:
         return True
diff --git a/tests/functional_tests/python_test_utils/compute_golden_statistics.py b/tests/functional_tests/python_test_utils/compute_golden_statistics.py
new file mode 100644
index 00000000000..d4863fa9476
--- /dev/null
+++ b/tests/functional_tests/python_test_utils/compute_golden_statistics.py
@@ -0,0 +1,836 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""
+Compute statistical bounds for golden values from multiple test runs.
+
+This script aggregates results from multiple parallel runs of a functional test
+and computes statistics (min, max, mean, std) for each metric at each step.
+The output can be used to determine appropriate tolerances for test validation.
+
+Usage:
+    # Step 1: Run batch tests (from megatron-rl directory):
+    ./tests/functional_tests/shell_test_utils/run_batch_ci_tests.sh \\
+        test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_correctness_and_throughput.sh 10
+
+    # Step 2: Wait for jobs to complete, then compute statistics:
+    python tests/functional_tests/python_test_utils/compute_golden_statistics.py \\
+        --results-dir batch_test_logs_gpt_grpo_*/ \\
+        --output golden_values_stats.json \\
+        --recommend-tolerances
+
+    # The script parses .out log files to find where each run wrote its results.
+    # Each .out file should contain: "This test wrote results into /opt/megatron-lm/runs/<uuid>"
+    # The container path /opt/megatron-lm maps to the workspace root on the host.
+
+    # Or specify individual JSON files directly:
+    python compute_golden_statistics.py \\
+        --result-files runs/abc123/golden_values.json runs/def456/golden_values.json \\
+        --output golden_values_stats.json
+"""
+
+import argparse
+import glob
+import json
+import logging
+import math
+import os
+import sys
+from pathlib import Path
+from statistics import mean, median, stdev
+from typing import Any, Dict, List, Optional, Tuple
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def find_result_json_files(results_dir: str, workspace_root: Optional[str] = None) -> List[str]:
+    """
+    Find all result JSON files from a batch test run.
+
+    The batch test infrastructure (run_batch_ci_tests.sh) writes .out log files
+    to the results directory. Each .out file contains a line like:
+        "This test wrote results into /opt/megatron-lm/runs/<uuid>"
+
+    The container path /opt/megatron-lm maps to the workspace root on the host.
+    This function parses the .out files to find where the JSON results are.
+
+    Args:
+        results_dir: Path to batch_test_logs_* directory containing .out files
+        workspace_root: Root of the megatron workspace (defaults to cwd)
+    """
+    result_files = []
+    results_path = Path(results_dir)
+
+    if not results_path.exists():
+        logger.error(f"Results directory not found: {results_dir}")
+        return []
+
+    if workspace_root is None:
+        # Try to find workspace root by looking for common markers
+        workspace_root = os.getcwd()
+
+    # Find all .out files from batch test runs
+    out_files = list(results_path.glob("*.out"))
+
+    if not out_files:
+        logger.warning(f"No .out files found in {results_dir}")
+        # Fall back to searching for JSON files directly
+        return _find_json_files_directly(results_dir)
+
+    logger.info(f"Found {len(out_files)} .out files to parse")
+
+    for out_file in out_files:
+        json_path = _extract_result_path_from_log(out_file, workspace_root)
+        if json_path and os.path.exists(json_path):
+            result_files.append(json_path)
+        elif json_path:
+            logger.warning(f"Result file not found: {json_path} (from {out_file.name})")
+
+    return result_files
+
+
+def _extract_result_path_from_log(out_file: Path, workspace_root: str) -> Optional[str]:
+    """
+    Parse a .out log file to find the result JSON path.
+
+    Looks for the line: "This test wrote results into /opt/megatron-lm/runs/<uuid>"
+    and converts the container path to the host path.
+    """
+    try:
+        with open(out_file, 'r', errors='ignore') as f:
+            content = f.read()
+    except IOError as e:
+        logger.warning(f"Failed to read {out_file}: {e}")
+        return None
+
+    # Look for the output path marker
+    marker = "This test wrote results into "
+    for line in content.split('\n'):
+        if marker in line:
+            # Extract the path after the marker
+            idx = line.find(marker)
+            output_path = line[idx + len(marker) :].strip()
+
+            # Convert container path to host path
+            # /opt/megatron-lm/... -> <workspace_root>/...
+            if output_path.startswith("/opt/megatron-lm/"):
+                host_path = output_path.replace("/opt/megatron-lm/", "")
+                output_path = os.path.join(workspace_root, host_path)
+
+            # Find JSON result files in this directory (search recursively)
+            output_dir = Path(output_path)
+            if output_dir.exists() and output_dir.is_dir():
+                # Look for result JSON files with various naming patterns
+                # Search recursively since files may be in subdirectories (e.g., 1/, 2/)
+                patterns = [
+                    "**/golden_values*.json",
+                    "**/generations*.json",
+                    "**/test_results*.json",
+                ]
+
+                for pattern in patterns:
+                    json_files = list(output_dir.glob(pattern))
+                    if json_files:
+                        # Return the first match
+                        logger.debug(f"Found result file: {json_files[0]}")
+                        return str(json_files[0])
+
+                # Fallback: any JSON file in subdirectories
+                json_files = list(output_dir.glob("**/*.json"))
+                if json_files:
+                    logger.debug(f"Found result file (fallback): {json_files[0]}")
+                    return str(json_files[0])
+
+            logger.debug(f"Output directory not found or empty: {output_path}")
+            return None
+
+    logger.debug(f"No output path marker found in {out_file.name}")
+    return None
+
+
+def _find_json_files_directly(results_dir: str) -> List[str]:
+    """
+    Fallback: search for JSON files directly in the results directory.
+
+    This is used when .out files don't contain the expected markers.
+    """
+    result_files = []
+    results_path = Path(results_dir)
+
+    # Look for golden_values*.json files in subdirectories
+    patterns = ["**/golden_values*.json", "**/test_results*.json", "**/*_output.json"]
+
+    for pattern in patterns:
+        matches = list(results_path.glob(pattern))
+        result_files.extend([str(p) for p in matches])
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_files = []
+    for f in result_files:
+        if f not in seen:
+            seen.add(f)
+            unique_files.append(f)
+
+    return unique_files
+
+
+def load_result_file(filepath: str) -> Optional[Dict[str, Any]]:
+    """Load a single result JSON file."""
+    try:
+        with open(filepath, 'r') as f:
+            content = f.read()
+
+        data = json.loads(content)
+
+        # Handle JSONL format (single line)
+        if isinstance(data, str):
+            data = json.loads(data)
+
+        return data
+    except (json.JSONDecodeError, IOError) as e:
+        logger.warning(f"Failed to load {filepath}: {e}")
+        return None
+
+
+def _detect_result_format(data: Dict[str, Any]) -> str:
+    """
+    Detect whether the result file is from a training test or inference test.
+
+    Returns:
+        "training" - TensorBoard metrics format: {"metric_name": {"values": {...}}}
+        "inference" - Generation output format: {"request_id": {"latency": ..., ...}}
+        "unknown" - Unrecognized format
+    """
+    if not data:
+        return "unknown"
+
+    # Check first key's value structure
+    first_key = next(iter(data.keys()))
+    first_value = data[first_key]
+
+    if isinstance(first_value, dict):
+        if 'values' in first_value:
+            return "training"
+        if 'latency' in first_value or 'generated_text' in first_value:
+            return "inference"
+
+    return "unknown"
+
+
+def _is_valid_numeric(value) -> bool:
+    """Check if a value is a valid (non-NaN) numeric value."""
+    if isinstance(value, str):
+        try:
+            value = float(value)
+        except ValueError:
+            return False
+
+    if isinstance(value, (int, float)):
+        return not math.isnan(value)
+
+    return False
+
+
+def _to_float(value) -> Optional[float]:
+    """Convert value to float, returning None for invalid/NaN values."""
+    if isinstance(value, str):
+        try:
+            value = float(value)
+        except ValueError:
+            return None
+
+    if isinstance(value, (int, float)):
+        if math.isnan(value):
+            return None
+        return float(value)
+
+    return None
+
+
+def _aggregate_training_results(
+    data: Dict[str, Any], aggregated: Dict[str, Dict[str, List[float]]], run_index: int
+) -> None:
+    """Aggregate results from training test format."""
+    for metric_name, metric_data in data.items():
+        if not isinstance(metric_data, dict) or 'values' not in metric_data:
+            continue
+
+        if metric_name not in aggregated:
+            aggregated[metric_name] = {}
+
+        values = metric_data['values']
+        for step, value in values.items():
+            # Skip non-numeric or NaN values
+            float_val = _to_float(value)
+            if float_val is None:
+                continue
+
+            if step not in aggregated[metric_name]:
+                aggregated[metric_name][step] = []
+
+            aggregated[metric_name][step].append(float_val)
+
+        # For metrics that use median-based comparison in the test (iteration-time,
+        # mem-allocated-bytes, mem-max-allocated-bytes), also store all values from
+        # this run so we can compute per-run medians later.
+        # IMPORTANT: Store values in step order to match the test's index-based slicing.
+        if metric_name in ['iteration-time', 'mem-allocated-bytes', 'mem-max-allocated-bytes']:
+            all_values_key = f"_all_values_run_{run_index}"
+            if all_values_key not in aggregated[metric_name]:
+                aggregated[metric_name][all_values_key] = []
+
+            # Sort by step number to ensure consistent ordering for index-based slicing
+            sorted_steps = sorted(
+                values.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')
+            )
+            for step in sorted_steps:
+                float_val = _to_float(values[step])
+                if float_val is None:
+                    continue
+                aggregated[metric_name][all_values_key].append(
+                    float_val
+                )  # Just the value, not tuple
+
+
+def _aggregate_inference_results(
+    data: Dict[str, Any], aggregated: Dict[str, Dict[str, List[float]]], run_index: int
+) -> None:
+    """
+    Aggregate results from inference test format.
+
+    Extracts metrics like latency, step_count, and logprob statistics
+    from generation outputs.
+    """
+    # Metrics to extract per request
+    latencies = []
+    step_counts = []
+    prompt_logprob_means = []
+    generated_logprob_means = []
+
+    for request_id, request_data in data.items():
+        if not isinstance(request_data, dict):
+            continue
+
+        # Extract latency
+        if 'latency' in request_data:
+            latencies.append(float(request_data['latency']))
+
+        # Extract step count
+        if 'step_count' in request_data:
+            step_counts.append(float(request_data['step_count']))
+
+        # Extract mean of prompt logprobs (as a consistency metric)
+        if 'prompt_logprobs' in request_data and request_data['prompt_logprobs']:
+            logprobs = request_data['prompt_logprobs']
+            if isinstance(logprobs, list) and len(logprobs) > 0:
+                prompt_logprob_means.append(sum(logprobs) / len(logprobs))
+
+        # Extract mean of generated logprobs
+        if 'generated_log_probs' in request_data and request_data['generated_log_probs']:
+            logprobs = request_data['generated_log_probs']
+            if isinstance(logprobs, list) and len(logprobs) > 0:
+                generated_logprob_means.append(sum(logprobs) / len(logprobs))
+
+    # Store aggregated metrics using run_index as the "step"
+    run_key = str(run_index)
+
+    if latencies:
+        if 'latency' not in aggregated:
+            aggregated['latency'] = {}
+        if 'mean' not in aggregated['latency']:
+            aggregated['latency']['mean'] = []
+        aggregated['latency']['mean'].append(sum(latencies) / len(latencies))
+
+        if 'total' not in aggregated['latency']:
+            aggregated['latency']['total'] = []
+        aggregated['latency']['total'].append(sum(latencies))
+
+    if step_counts:
+        if 'step_count' not in aggregated:
+            aggregated['step_count'] = {}
+        if 'mean' not in aggregated['step_count']:
+            aggregated['step_count']['mean'] = []
+        aggregated['step_count']['mean'].append(sum(step_counts) / len(step_counts))
+
+    if prompt_logprob_means:
+        if 'prompt_logprob_mean' not in aggregated:
+            aggregated['prompt_logprob_mean'] = {}
+        if 'mean' not in aggregated['prompt_logprob_mean']:
+            aggregated['prompt_logprob_mean']['mean'] = []
+        aggregated['prompt_logprob_mean']['mean'].append(
+            sum(prompt_logprob_means) / len(prompt_logprob_means)
+        )
+
+    if generated_logprob_means:
+        if 'generated_logprob_mean' not in aggregated:
+            aggregated['generated_logprob_mean'] = {}
+        if 'mean' not in aggregated['generated_logprob_mean']:
+            aggregated['generated_logprob_mean']['mean'] = []
+        aggregated['generated_logprob_mean']['mean'].append(
+            sum(generated_logprob_means) / len(generated_logprob_means)
+        )
+
+
+def aggregate_results(result_files: List[str]) -> Dict[str, Dict[str, List[float]]]:
+    """
+    Aggregate results from multiple JSON files.
+
+    Supports both training test format (TensorBoard metrics) and
+    inference test format (generation outputs).
+
+    Returns:
+        Dict mapping metric_name -> step/key -> list of values across all runs
+    """
+    aggregated: Dict[str, Dict[str, List[float]]] = {}
+    loaded_count = 0
+    detected_format = None
+
+    for idx, filepath in enumerate(result_files):
+        data = load_result_file(filepath)
+        if data is None:
+            continue
+
+        loaded_count += 1
+
+        # Detect format from first file
+        file_format = _detect_result_format(data)
+        if detected_format is None:
+            detected_format = file_format
+            logger.info(f"Detected result format: {file_format}")
+
+        if file_format == "training":
+            _aggregate_training_results(data, aggregated, idx)
+        elif file_format == "inference":
+            _aggregate_inference_results(data, aggregated, idx)
+        else:
+            logger.warning(f"Unknown format in {filepath}, skipping")
+
+    logger.info(f"Successfully loaded {loaded_count} of {len(result_files)} result files")
+    return aggregated
+
+
+def compute_statistics(aggregated: Dict[str, Dict[str, List[float]]]) -> Dict[str, Any]:
+    """
+    Compute statistics for each metric at each step.
+
+    Returns:
+        Dict with structure:
+        {
+            "metric_name": {
+                "num_samples": N,
+                "values": {
+                    "step": {
+                        "min": ...,
+                        "max": ...,
+                        "mean": ...,
+                        "std": ...,
+                        "samples": [...]  # original values
+                    }
+                }
+            }
+        }
+    """
+    stats: Dict[str, Any] = {}
+
+    for metric_name, step_values in aggregated.items():
+        # Determine number of samples (should be consistent across steps)
+        # Skip internal keys used for median calculations
+        regular_steps = {k: v for k, v in step_values.items() if not k.startswith("_")}
+        sample_counts = [len(vals) for vals in regular_steps.values()]
+        num_samples = max(sample_counts) if sample_counts else 0
+
+        metric_stats = {"num_samples": num_samples, "values": {}}
+
+        for step, values in regular_steps.items():
+            if len(values) == 0:
+                continue
+
+            step_stats = {
+                "min": min(values),
+                "max": max(values),
+                "mean": mean(values),
+                "std": stdev(values) if len(values) > 1 else 0.0,
+                "count": len(values),
+            }
+
+            # Include original samples for debugging
+            step_stats["samples"] = values
+
+            metric_stats["values"][step] = step_stats
+
+        stats[metric_name] = metric_stats
+
+    return stats
+
+
+def compute_recommended_tolerances(
+    stats: Dict[str, Any],
+    aggregated: Dict[str, Dict[str, List[float]]],
+    confidence_multiplier: float = 3.0,
+    start_step: int = 1,
+) -> Dict[str, Dict[str, float]]:
+    """
+    Compute recommended tolerances for each metric based on observed variance.
+
+    For metrics that use median-based comparison in the test (iteration-time,
+    mem-allocated-bytes, mem-max-allocated-bytes), computes variance of per-run
+    medians rather than per-step variance.
+
+    Args:
+        stats: Output from compute_statistics()
+        aggregated: Raw aggregated data (needed for median calculations)
+        confidence_multiplier: Number of standard deviations for bounds (default 3.0 for ~99.7% coverage)
+        start_step: First step to include in tolerance calculation (skips warmup steps)
+
+    Returns:
+        Dict mapping metric_name -> {
+            "relative_tolerance": recommended relative tolerance,
+            "absolute_tolerance": recommended absolute tolerance (for near-zero values),
+            "max_observed_relative_variance": max(|value - mean| / |mean|) across all samples
+        }
+    """
+    tolerances = {}
+
+    # Metrics that use median-based comparison in the test (iteration-time)
+    median_based_metrics = ['iteration-time']
+    # Metrics that use max-based comparison in the test (memory)
+    max_based_metrics = ['mem-allocated-bytes', 'mem-max-allocated-bytes']
+
+    for metric_name, metric_data in stats.items():
+        max_relative_variance = 0.0
+        max_absolute_variance = 0.0
+        steps_included = 0
+
+        # For median-based metrics, compute variance of per-run medians
+        if metric_name in median_based_metrics and metric_name in aggregated:
+            run_medians = []
+
+            # Find all run data keys
+            for key in aggregated[metric_name].keys():
+                if key.startswith("_all_values_run_"):
+                    run_data = aggregated[metric_name][key]
+                    # Use index-based slicing to match test behavior:
+                    # [start_step:] skips the first `start_step` items
+                    filtered_values = run_data[start_step:]
+
+                    if filtered_values:
+                        run_median = median(filtered_values)
+                        run_medians.append(run_median)
+
+            if run_medians:
+                median_mean = mean(run_medians)
+
+                # Compute relative variance of medians
+                if abs(median_mean) > 1e-9:
+                    for m in run_medians:
+                        rel_var = abs(m - median_mean) / abs(median_mean)
+                        max_relative_variance = max(max_relative_variance, rel_var)
+                else:
+                    for m in run_medians:
+                        max_absolute_variance = max(max_absolute_variance, abs(m))
+
+                steps_included = len(run_medians)
+
+                logger.debug(
+                    f"{metric_name}: computed variance from {len(run_medians)} run medians, "
+                    f"mean={median_mean:.4f}, max_rel_var={max_relative_variance:.4%}"
+                )
+
+        # For max-based metrics (memory), compute variance of per-run max values
+        elif metric_name in max_based_metrics and metric_name in aggregated:
+            run_maxes = []
+
+            # Find all run data keys
+            for key in aggregated[metric_name].keys():
+                if key.startswith("_all_values_run_"):
+                    run_data = aggregated[metric_name][key]
+                    # Skip first value (warmup), take max of rest
+                    filtered_values = run_data[1:] if len(run_data) > 1 else run_data
+
+                    if filtered_values:
+                        run_max = max(filtered_values)
+                        run_maxes.append(run_max)
+
+            if run_maxes:
+                max_mean = mean(run_maxes)
+
+                # Compute relative variance of max values
+                if abs(max_mean) > 1e-9:
+                    for m in run_maxes:
+                        rel_var = abs(m - max_mean) / abs(max_mean)
+                        max_relative_variance = max(max_relative_variance, rel_var)
+                else:
+                    for m in run_maxes:
+                        max_absolute_variance = max(max_absolute_variance, abs(m))
+
+                steps_included = len(run_maxes)
+
+                logger.debug(
+                    f"{metric_name}: computed variance from {len(run_maxes)} run maxes, "
+                    f"mean={max_mean:.4f}, max_rel_var={max_relative_variance:.4%}"
+                )
+        else:
+            # Standard per-step variance calculation for other metrics
+            for step, step_stats in metric_data["values"].items():
+                # Skip warmup steps - try to parse step as int, skip if < start_step
+                try:
+                    step_num = int(step)
+                    if step_num < start_step:
+                        continue
+                except (ValueError, TypeError):
+                    # Non-numeric step key (e.g., "mean" for inference metrics) - include it
+                    pass
+
+                steps_included += 1
+                mean_val = step_stats["mean"]
+
+                # Compute observed relative variance
+                if abs(mean_val) > 1e-9:
+                    # For non-zero means, compute relative variance
+                    for sample in step_stats["samples"]:
+                        rel_var = abs(sample - mean_val) / abs(mean_val)
+                        max_relative_variance = max(max_relative_variance, rel_var)
+                else:
+                    # For near-zero means, track absolute variance
+                    for sample in step_stats["samples"]:
+                        max_absolute_variance = max(max_absolute_variance, abs(sample))
+
+        # Recommend tolerance with safety margin
+        # Use observed variance * confidence_multiplier, with a minimum of 0.1%
+        recommended_relative = max(max_relative_variance * confidence_multiplier, 0.001)
+
+        # Round to reasonable precision
+        recommended_relative = round(recommended_relative, 4)
+
+        tolerances[metric_name] = {
+            "relative_tolerance": recommended_relative,
+            "absolute_tolerance": max(max_absolute_variance * confidence_multiplier, 1e-6),
+            "max_observed_relative_variance": round(max_relative_variance, 6),
+            "max_observed_absolute_variance": round(max_absolute_variance, 6),
+            "steps_included": steps_included,
+        }
+
+    return tolerances
+
+
+def format_summary(stats: Dict[str, Any], tolerances: Dict[str, Dict[str, float]]) -> str:
+    """Format a human-readable summary of the statistics."""
+    lines = []
+    lines.append("=" * 70)
+    lines.append("Golden Values Statistics Summary")
+    lines.append("=" * 70)
+
+    for metric_name in sorted(stats.keys()):
+        metric_data = stats[metric_name]
+        tol = tolerances.get(metric_name, {})
+
+        lines.append(f"\n{metric_name}:")
+        lines.append(f"  Samples: {metric_data['num_samples']}")
+        lines.append(f"  Steps: {len(metric_data['values'])}")
+
+        if tol:
+            lines.append(
+                f"  Max observed relative variance: {tol.get('max_observed_relative_variance', 'N/A'):.4%}"
+            )
+            lines.append(
+                f"  Recommended relative tolerance: {tol.get('relative_tolerance', 'N/A'):.2%}"
+            )
+            lines.append(
+                f"  Recommended absolute tolerance: {tol.get('absolute_tolerance', 'N/A'):.2e}"
+            )
+
+        # Show a few example steps
+        values = metric_data["values"]
+        example_steps = list(values.keys())[:3]
+        if example_steps:
+            lines.append("  Example steps:")
+            for step in example_steps:
+                s = values[step]
+                lines.append(
+                    f"    Step {step}: mean={s['mean']:.6g}, std={s['std']:.6g}, "
+                    f"range=[{s['min']:.6g}, {s['max']:.6g}]"
+                )
+
+    lines.append("\n" + "=" * 70)
+    return "\n".join(lines)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compute statistical bounds for golden values from multiple test runs.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument(
+        "--results-dir",
+        type=str,
+        help="Directory containing batch test results (searches for JSON files)",
+    )
+    input_group.add_argument(
+        "--result-files",
+        type=str,
+        nargs="+",
+        help="Explicit list of result JSON files to aggregate",
+    )
+
+    parser.add_argument(
+        "--output", "-o", type=str, required=True, help="Output path for statistics JSON file"
+    )
+
+    parser.add_argument(
+        "--recommend-tolerances",
+        action="store_true",
+        help="Compute and display recommended tolerances based on observed variance",
+    )
+
+    parser.add_argument(
+        "--confidence-multiplier",
+        type=float,
+        default=1.5,
+        help="Multiplier for observed max variance when computing recommended tolerance. "
+        "Example: if max observed variance is 5%% and multiplier is 1.5, recommended tolerance is 7.5%%. "
+        "Use higher values (2-3) for more safety margin. Default: 1.5",
+    )
+
+    parser.add_argument(
+        "--min-samples",
+        type=int,
+        default=2,
+        help="Minimum number of samples required to compute statistics (default: 2)",
+    )
+
+    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
+
+    parser.add_argument(
+        "--workspace-root",
+        type=str,
+        default=None,
+        help="Root of the megatron workspace (where runs/ directory is located). "
+        "Defaults to current working directory.",
+    )
+
+    parser.add_argument(
+        "--start-step",
+        type=int,
+        default=0,
+        help="Number of initial steps to skip (index-based, matching test behavior). "
+        "Uses Python slicing [start_step:] so --start-step 10 skips first 10 items. "
+        "Default: 0 (include all). Set to match THROUGHPUT_TEST_PARAMS.--start_step from model_config.yaml.",
+    )
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Find or use result files
+    if args.results_dir:
+        result_files = find_result_json_files(args.results_dir, args.workspace_root)
+        if not result_files:
+            logger.error(f"No result JSON files found in {args.results_dir}")
+            logger.info("Make sure the batch tests have completed and results are available.")
+            logger.info(
+                "The script looks for .out files and parses them to find the result JSON paths."
+            )
+            logger.info(
+                "Each .out file should contain: 'This test wrote results into /opt/megatron-lm/runs/<uuid>'"
+            )
+            sys.exit(1)
+        logger.info(f"Found {len(result_files)} result files from {args.results_dir}")
+    else:
+        result_files = args.result_files
+        # Verify files exist
+        for f in result_files:
+            if not os.path.exists(f):
+                logger.error(f"Result file not found: {f}")
+                sys.exit(1)
+
+    if args.verbose:
+        for f in result_files:
+            logger.debug(f"  - {f}")
+
+    # Aggregate results
+    aggregated = aggregate_results(result_files)
+
+    if not aggregated:
+        logger.error("No valid results found to aggregate")
+        sys.exit(1)
+
+    # Check minimum samples
+    for metric_name, step_values in aggregated.items():
+        for step, values in step_values.items():
+            if len(values) < args.min_samples:
+                logger.warning(
+                    f"{metric_name} step {step}: only {len(values)} samples "
+                    f"(minimum {args.min_samples} recommended)"
+                )
+
+    # Compute statistics
+    stats = compute_statistics(aggregated)
+
+    # Compute recommended tolerances (excluding warmup steps)
+    if args.start_step > 1:
+        logger.info(f"Excluding steps < {args.start_step} from tolerance calculation (warmup)")
+    tolerances = compute_recommended_tolerances(
+        stats, aggregated, args.confidence_multiplier, start_step=args.start_step
+    )
+
+    # Build output
+    output = {
+        "metadata": {
+            "num_runs": len(result_files),
+            "result_files": result_files,
+            "confidence_multiplier": args.confidence_multiplier,
+            "start_step": args.start_step,
+        },
+        "statistics": stats,
+        "recommended_tolerances": tolerances,
+    }
+
+    # Write output
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, 'w') as f:
+        json.dump(output, f, indent=2)
+
+    logger.info(f"Statistics written to {args.output}")
+
+    # Print summary
+    if args.recommend_tolerances or args.verbose:
+        print(format_summary(stats, tolerances))
+
+        print("\nRecommended tolerance settings:")
+        print("-" * 50)
+        # Training test metrics
+        training_metrics = [
+            "lm-loss",
+            "lm loss",
+            "iteration-time",
+            "mem-allocated-bytes",
+            "mem-max-allocated-bytes",
+        ]
+        # Inference test metrics
+        inference_metrics = [
+            "latency",
+            "step_count",
+            "prompt_logprob_mean",
+            "generated_logprob_mean",
+        ]
+
+        for metric_name in training_metrics + inference_metrics:
+            if metric_name in tolerances:
+                tol = tolerances[metric_name]
+                var_name = metric_name.upper().replace('-', '_').replace(' ', '_')
+                print(
+                    f"{var_name}_RELATIVE_TOLERANCE = "
+                    f"{tol['relative_tolerance']}  # {tol['relative_tolerance']:.2%}"
+                )
+                print(f"{var_name}_ABSOLUTE_TOLERANCE = " f"{tol['absolute_tolerance']:.2e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/functional_tests/python_test_utils/test_grpo_training_loop.py b/tests/functional_tests/python_test_utils/test_grpo_training_loop.py
index 12e5da3fbad..6faca9b11b3 100644
--- a/tests/functional_tests/python_test_utils/test_grpo_training_loop.py
+++ b/tests/functional_tests/python_test_utils/test_grpo_training_loop.py
@@ -2,14 +2,93 @@
 
 import json
 import logging
-import math
 from statistics import median
+from typing import Any, Dict, List, Tuple
+
+import yaml
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Tolerance settings for all metrics.
+# These tolerances account for hardware variance (different GPU silicon,
+# driver versions, CUDA/cuDNN differences) while still catching real regressions.
+# Tolerances can be tuned using compute_golden_statistics.py to analyze variance
+# across multiple runs on different hardware.
+
+# LM Loss tolerances
+LM_LOSS_RELATIVE_TOLERANCE = 0.01  # 1% relative tolerance
+LM_LOSS_ABSOLUTE_TOLERANCE = 1e-6  # For values near zero
+
+# Iteration time tolerances (performance metric, higher variance expected)
+ITERATION_TIME_RELATIVE_TOLERANCE = 0.15  # 15% relative tolerance
+
+# Memory allocation tolerances
+MEM_ALLOCATED_BYTES_RELATIVE_TOLERANCE = 0.10  # 10% relative tolerance
+MEM_MAX_ALLOCATED_BYTES_RELATIVE_TOLERANCE = 0.10  # 10% relative tolerance
+
+
+def validate_with_tolerance(
+    golden_values: Dict[str, Any],
+    current_values: Dict[str, Any],
+    relative_tolerance: float,
+    absolute_tolerance: float = 1e-9,
+    metric_name: str = "metric",
+) -> Tuple[bool, List[str]]:
+    """
+    Validate that current values are within tolerance of golden values.
+
+    Args:
+        golden_values: Dict mapping step -> expected value
+        current_values: Dict mapping step -> actual value
+        relative_tolerance: Maximum allowed relative difference (e.g., 0.01 for 1%)
+        absolute_tolerance: Tolerance for values near zero
+        metric_name: Name of metric for error messages
+
+    Returns:
+        Tuple of (passed: bool, mismatches: List[str])
+    """
+    mismatches = []
+
+    for step, golden_val in golden_values.items():
+        if step not in current_values:
+            mismatches.append(f"Step {step}: missing in current run")
+            continue
 
-def test_grpo_training_loop(golden_values_path: str, test_values_path: str) -> None:
+        current_val = current_values[step]
+
+        # Handle the case where golden value is zero or near-zero
+        if golden_val == 0 or abs(golden_val) < absolute_tolerance:
+            if abs(current_val) > absolute_tolerance:
+                mismatches.append(f"Step {step}: expected ~0, got {current_val}")
+        else:
+            # Calculate relative difference
+            rel_diff = abs(current_val - golden_val) / abs(golden_val)
+            if rel_diff > relative_tolerance:
+                mismatches.append(
+                    f"Step {step}: {current_val} differs from golden {golden_val} "
+                    f"by {rel_diff:.4%} (tolerance: {relative_tolerance:.2%})"
+                )
+
+    # Check for extra steps in current that aren't in golden
+    extra_steps = set(current_values.keys()) - set(golden_values.keys())
+    if extra_steps:
+        logger.info(f"{metric_name}: Ignoring extra steps in current run: {extra_steps}")
+
+    return len(mismatches) == 0, mismatches
+
+
+def test_grpo_training_loop(
+    golden_values_path: str, test_values_path: str, model_config_path: str
+) -> None:
+    with open(model_config_path, 'r') as f:
+        model_config = yaml.safe_load(f)
+        metrics = model_config["METRICS"]
+        if "THROUGHPUT_TEST_PARAMS" in model_config:
+            throughput_test_params = model_config["THROUGHPUT_TEST_PARAMS"]
+            start_step = throughput_test_params["--start_step"]
+        else:
+            start_step = 1
 
     with open(golden_values_path, 'r') as f1, open(test_values_path, 'r') as f2:
         golden_values_content = f1.read()
@@ -26,7 +105,13 @@ def test_grpo_training_loop(golden_values_path: str, test_values_path: str) -> N
         # Handle JSONL output, assume only one line in this case.
         output_current = json.loads(output_current)
 
-    assert set(output_groundtruth.keys()).issuperset(
+    # Allow current run to have extra metrics not in golden values
+    # (only compare metrics defined in golden values)
+    extra_in_current = set(output_current.keys()) - set(output_groundtruth.keys())
+    if extra_in_current:
+        logger.info(f"Ignoring extra metrics in current run: {extra_in_current}")
+
+    assert set(output_groundtruth.keys()).issubset(
         set(output_current.keys())
     ), f"Some IDs from groundtruth are missing in current: {output_groundtruth.keys()} vs {output_current.keys()}"
     if set(output_groundtruth.keys()) != set(output_current.keys()):
@@ -35,24 +120,99 @@ def test_grpo_training_loop(golden_values_path: str, test_values_path: str) -> N
         )
     assert len(output_groundtruth) > 0, "No test performed for output"
 
-    if "iteration-time" in output_groundtruth.keys():
+    if "iteration-time" in metrics and "iteration-time" in output_current:
 
         # First warmup iteration is excluded from iteration-time statistics.
         iteration_time_sampled = median(
-            [l for l in output_current["iteration-time"]['values'].values()][1:]
+            [l for l in output_current["iteration-time"]['values'].values()][start_step:]
         )
         iteration_time_golden = median(
-            [l for l in output_groundtruth["iteration-time"]['values'].values()][1:]
+            [l for l in output_groundtruth["iteration-time"]['values'].values()][start_step:]
         )
 
-        # 10% is empirically observed to be within hardware variance.
-        assert (
-            0.9 * iteration_time_golden <= iteration_time_sampled <= 1.2 * iteration_time_golden
-        ), (
-            f"Iteration time {iteration_time_sampled} ms not within 10% below or 20% above "
-            f"golden value ~{iteration_time_golden} ms. "
+        lower_bound = (1 - ITERATION_TIME_RELATIVE_TOLERANCE) * iteration_time_golden
+        upper_bound = (1 + ITERATION_TIME_RELATIVE_TOLERANCE) * iteration_time_golden
+        assert lower_bound <= iteration_time_sampled <= upper_bound, (
+            f"Iteration time {iteration_time_sampled} ms not within "
+            f"{ITERATION_TIME_RELATIVE_TOLERANCE:.0%} of golden value ~{iteration_time_golden} ms. "
             f"Sampled: {output_current['iteration-time']} ms. "
             f"Please update golden values in the functional tests if this is expected."
         )
 
         output_groundtruth.pop('iteration-time')
+
+    if "lm-loss" in metrics and "lm-loss" in output_current:
+
+        # Validate lm-loss values with tolerance to account for hardware variance.
+        # Previously required exact matching, but this caused flaky failures due to
+        # floating-point differences across different GPU hardware.
+        golden_lm_loss_values = output_groundtruth["lm-loss"]['values']
+        current_lm_loss_values = output_current["lm-loss"]['values']
+
+        passed, mismatches = validate_with_tolerance(
+            golden_lm_loss_values,
+            current_lm_loss_values,
+            relative_tolerance=LM_LOSS_RELATIVE_TOLERANCE,
+            absolute_tolerance=LM_LOSS_ABSOLUTE_TOLERANCE,
+            metric_name="lm-loss",
+        )
+
+        if not passed:
+            error_msg = (
+                f"LM loss values outside tolerance ({LM_LOSS_RELATIVE_TOLERANCE:.1%}):\n"
+                + "\n".join(f"  - {m}" for m in mismatches)
+                + f"\n\nGolden: {golden_lm_loss_values}\n"
+                + f"Current: {current_lm_loss_values}\n"
+                + "Please update golden values in the functional tests if this is expected."
+            )
+            assert False, error_msg
+
+        output_groundtruth.pop('lm-loss')
+
+    if "mem-allocated-bytes" in metrics and "mem-allocated-bytes" in output_current:
+
+        # Use max instead of median - we care about worst-case memory usage
+        # Skip first step (warmup) which may have different memory characteristics
+        current_values = [l for l in output_current["mem-allocated-bytes"]['values'].values()][1:]
+        golden_values = [l for l in output_groundtruth["mem-allocated-bytes"]['values'].values()][
+            1:
+        ]
+
+        mem_allocated_bytes_sampled = max(current_values)
+        mem_allocated_bytes_golden = max(golden_values)
+
+        upper_bound = (1 + MEM_ALLOCATED_BYTES_RELATIVE_TOLERANCE) * mem_allocated_bytes_golden
+        assert mem_allocated_bytes_sampled <= upper_bound, (
+            f"Max mem allocated bytes {mem_allocated_bytes_sampled} bytes exceeds "
+            f"{MEM_ALLOCATED_BYTES_RELATIVE_TOLERANCE:.0%} above golden max {mem_allocated_bytes_golden} bytes. "
+            f"Upper bound: {upper_bound} bytes. "
+            f"Please update golden values in the functional tests if this is expected."
+        )
+
+        output_groundtruth.pop('mem-allocated-bytes')
+
+    if "mem-max-allocated-bytes" in metrics and "mem-max-allocated-bytes" in output_current:
+
+        # Use max - we care that peak memory doesn't exceed the golden peak
+        # Skip first step (warmup) which may have different memory characteristics
+        current_values = [l for l in output_current["mem-max-allocated-bytes"]['values'].values()][
+            1:
+        ]
+        golden_values = [
+            l for l in output_groundtruth["mem-max-allocated-bytes"]['values'].values()
+        ][1:]
+
+        mem_max_allocated_bytes_sampled = max(current_values)
+        mem_max_allocated_bytes_golden = max(golden_values)
+
+        upper_bound = (
+            1 + MEM_MAX_ALLOCATED_BYTES_RELATIVE_TOLERANCE
+        ) * mem_max_allocated_bytes_golden
+        assert mem_max_allocated_bytes_sampled <= upper_bound, (
+            f"Max mem-max-allocated bytes {mem_max_allocated_bytes_sampled} bytes exceeds "
+            f"{MEM_MAX_ALLOCATED_BYTES_RELATIVE_TOLERANCE:.0%} above golden max {mem_max_allocated_bytes_golden} bytes. "
+            f"Upper bound: {upper_bound} bytes. "
+            f"Please update golden values in the functional tests if this is expected."
+        )
+
+        output_groundtruth.pop('mem-max-allocated-bytes')
diff --git a/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py b/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py
index ae57db10e55..346b464b79d 100644
--- a/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py
+++ b/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py
@@ -8,6 +8,32 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+_NON_REQUEST_TOP_LEVEL_KEYS = {
+    # System-level metrics
+    "throughput",
+    # Peak memory metrics (added by inference scripts; optionally checked if present in golden values)
+    "mem-max-allocated-bytes",
+}
+
+
+def _median_as_float(value):
+    """Convert scalar or list metric to a single float (median).
+
+    For list metrics (e.g., per-request throughput), treat the first element as
+    warmup if length > 1, matching existing throughput behavior.
+    """
+    if isinstance(value, list):
+        assert len(value) > 0, "Metric list is empty."
+        values = [float(v) for v in value]
+        if len(values) > 1:
+            values = values[1:]
+        return float(median(values))
+    return float(value)
+
+
+def _bytes_to_gib(num_bytes: float) -> float:
+    return float(num_bytes) / (1024.0**3)
+
 
 def test_inference_pipeline(golden_values_path: str, test_values_path: str) -> None:
 
@@ -26,12 +52,17 @@ def test_inference_pipeline(golden_values_path: str, test_values_path: str) -> N
         # Handle JSONL output, assume only one line in this case.
         output_current = json.loads(output_current)
 
-    assert set(output_groundtruth.keys()).issuperset(
-        set(output_current.keys())
-    ), f"Some IDs from groundtruth are missing in current: {output_groundtruth.keys()} vs {output_current.keys()}"
-    if set(output_groundtruth.keys()) != set(output_current.keys()):
+    groundtruth_request_ids = set(output_groundtruth.keys()) - _NON_REQUEST_TOP_LEVEL_KEYS
+    current_request_ids = set(output_current.keys()) - _NON_REQUEST_TOP_LEVEL_KEYS
+
+    assert groundtruth_request_ids.issuperset(current_request_ids), (
+        "Some request IDs from groundtruth are missing in current or current has unexpected IDs: "
+        f"{sorted(groundtruth_request_ids)} vs {sorted(current_request_ids)}"
+    )
+    if groundtruth_request_ids != current_request_ids:
         logger.warning(
-            f"Some IDs from groundtruth are missing in output, only the subset of ids in groundtruth will be tested: {output_groundtruth.keys()} vs {output_current.keys()}"
+            "Some request IDs from groundtruth are missing in output; only the subset of ids in groundtruth will be tested: "
+            f"{sorted(groundtruth_request_ids)} vs {sorted(current_request_ids)}"
         )
     assert len(output_groundtruth) > 0, "No test performed for output"
 
@@ -54,6 +85,35 @@ def test_inference_pipeline(golden_values_path: str, test_values_path: str) -> N
 
         output_groundtruth.pop('throughput')
 
+    # Peak memory regression checks (optional: only if present in golden values).
+    if "mem-max-allocated-bytes" in output_groundtruth:
+        assert "mem-max-allocated-bytes" in output_current, (
+            f"Golden values include mem-max-allocated-bytes but current output does not. "
+            "Ensure the inference script records memory metrics to the output JSON."
+        )
+        sampled = _median_as_float(output_current["mem-max-allocated-bytes"])
+        golden = _median_as_float(output_groundtruth["mem-max-allocated-bytes"])
+        assert golden > 0, f"Golden mem_max_allocated_bytes must be > 0, got {golden}."
+
+        low = 0.95 * golden
+        high = 1.05 * golden
+
+        if sampled < low:
+            raise AssertionError(
+                f"Memory is too low for mem-max-allocated-bytes: "
+                f"expected within 5% of {golden:.0f} bytes ({_bytes_to_gib(golden):.3f} GiB) "
+                f"but got {sampled:.0f} bytes ({_bytes_to_gib(sampled):.3f} GiB). "
+                "This is >5% lower than expected; please update golden values in the functional tests."
+            )
+        if sampled > high:
+            raise AssertionError(
+                f"Memory is too high for mem-max-allocated-bytes: "
+                f"expected within ±5% of {golden:.0f} bytes ({_bytes_to_gib(golden):.3f} GiB) "
+                f"but got {sampled:.0f} bytes ({_bytes_to_gib(sampled):.3f} GiB). "
+                "This is >5% higher than expected; this is likely a regression."
+            )
+        output_groundtruth.pop("mem-max-allocated-bytes")
+
     for request_id, groundtruth_results in output_groundtruth.items():
         current_results = output_current[request_id]
 
diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh
index 1d0e77a3477..72fd187d19d 100644
--- a/tests/functional_tests/shell_test_utils/_run_training.sh
+++ b/tests/functional_tests/shell_test_utils/_run_training.sh
@@ -159,7 +159,7 @@ MASTER_PORT=${MASTER_PORT:-6000}
 NUM_NODES=${NUM_NODES:-${SLURM_NNODES:-1}}
 GPUS_PER_NODE=${GPUS_PER_NODE:-8}
 NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID:-0}}
-LAST_RANK=7
+LAST_RANK=$((GPUS_PER_NODE - 1)) 
 export LOG_DIR=$OUTPUT_PATH/logs/$REPEAT
 mkdir -p $LOG_DIR
 
@@ -170,7 +170,7 @@ DISTRIBUTED_ARGS=(
     --master_port $MASTER_PORT
     --node_rank $NODE_RANK
     --log-dir $LOG_DIR
-    --tee "0:3,7:3"
+    --tee "0:3,$LAST_RANK:3"
     --redirects "3"
 )
 
diff --git a/tests/functional_tests/shell_test_utils/run_batch_ci_tests.sh b/tests/functional_tests/shell_test_utils/run_batch_ci_tests.sh
new file mode 100755
index 00000000000..9c99726555c
--- /dev/null
+++ b/tests/functional_tests/shell_test_utils/run_batch_ci_tests.sh
@@ -0,0 +1,255 @@
+#!/bin/bash
+#
+# Script to submit batch jobs to run test scripts across different compute nodes
+#
+# Usage:
+#   ./run_batch_ci_tests.sh <test_script> [num_jobs] [partition]
+#
+# Arguments:
+#   test_script  - Path to test script in test_cases/ (required)
+#   num_jobs     - Number of jobs to submit (default: 10)
+#   partition    - Slurm partition to use (default: interactive)
+#
+# Examples:
+#   ./run_batch_ci_tests.sh test_cases/moe/gpt_grpo_tp4tp2_pp1_ep4ep2_dp8_throughputtest.sh
+#   ./run_batch_ci_tests.sh test_cases/gpt/gpt3_mcore_te_tp2_pp2.sh 5
+#   ./run_batch_ci_tests.sh test_cases/bert/bert_mcore_tp2_pp2.sh 10 batch_block1
+#
+# To list available test scripts:
+#   ./run_batch_ci_tests.sh --list
+#   ./run_batch_ci_tests.sh --list moe      # List only moe tests
+#   ./run_batch_ci_tests.sh --list gpt      # List only gpt tests
+#
+
+set -e
+
+# Function to list available test scripts
+list_tests() {
+    local filter="${1:-}"
+    echo "Available test scripts in test_cases/:"
+    echo
+    if [ -n "$filter" ]; then
+        # List tests in specific subdirectory
+        if [ -d "test_cases/$filter" ]; then
+            find "test_cases/$filter" -name "*.sh" -type f | sort
+        else
+            echo "No test_cases/$filter directory found."
+            echo "Available subdirectories:"
+            ls -d test_cases/*/ 2>/dev/null | sed 's|test_cases/||g; s|/||g' | xargs -I {} echo "  {}"
+            exit 1
+        fi
+    else
+        # List all tests grouped by subdirectory
+        for dir in test_cases/*/; do
+            if [ -d "$dir" ]; then
+                subdir=$(basename "$dir")
+                echo "=== $subdir ==="
+                find "$dir" -name "*.sh" -type f | sort | sed 's|^|  |'
+                echo
+            fi
+        done
+    fi
+    exit 0
+}
+
+# Handle --list option
+if [ "${1:-}" = "--list" ]; then
+    list_tests "${2:-}"
+fi
+
+# Configuration (same as start_ci_interactive.sh)
+export DATASET_DIR=/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_mcore/mcore_ci
+export TGT_IMAGE=gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev:main
+export ACCOUNT=llmservice_fm_text
+
+# The test script to run inside the container (first argument, required)
+TEST_SCRIPT="${1:-}"
+
+if [ -z "$TEST_SCRIPT" ]; then
+    echo "ERROR: Test script path is required"
+    echo
+    echo "Usage: $0 <test_script> [num_jobs] [partition]"
+    echo
+    echo "Run '$0 --list' to see available test scripts"
+    exit 1
+fi
+
+# Number of jobs to submit (second argument, default 10)
+NUM_JOBS=${2:-10}
+
+# Partition (third argument, default to same as interactive - change if needed)
+# Common batch partition names: batch, batch_block1, dgx_batch, etc.
+export PARTITION=${3:-interactive}
+
+# Verify test script exists
+if [ ! -f "$TEST_SCRIPT" ]; then
+    echo "ERROR: Test script not found: $TEST_SCRIPT"
+    echo "Make sure you run this from the megatron-rl directory"
+    echo
+    echo "Run '$0 --list' to see available test scripts"
+    exit 1
+fi
+
+# Extract test name from script path for job naming
+# e.g., "test_cases/moe/gpt_grpo_tp4tp2_pp1_ep4ep2_dp8_throughputtest.sh" -> "gpt_grpo_tp4tp2_pp1_ep4ep2_dp8_throughputtest"
+TEST_NAME=$(basename "$TEST_SCRIPT" .sh)
+
+# Output directory for logs (include test name for clarity)
+LOG_DIR="$(pwd)/batch_test_logs_${TEST_NAME}_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$LOG_DIR"
+
+# Container mounts
+CONTAINER_MOUNTS="$DATASET_DIR:/mnt/artifacts,$(pwd):/opt/megatron-lm"
+
+echo "============================================="
+echo "Batch CI Test Submission"
+echo "============================================="
+echo "Test Script:  $TEST_SCRIPT"
+echo "Test Name:    $TEST_NAME"
+echo "Partition:    $PARTITION"
+echo "Account:      $ACCOUNT"
+echo "Image:        $TGT_IMAGE"
+echo "Dataset Dir:  $DATASET_DIR"
+echo "Num Jobs:     $NUM_JOBS"
+echo "Log Dir:      $LOG_DIR"
+echo "============================================="
+echo
+
+# Submit jobs
+# Truncate test name if too long for job name (max ~64 chars typically)
+SHORT_TEST_NAME="${TEST_NAME:0:50}"
+
+for i in $(seq 1 $NUM_JOBS); do
+    JOB_NAME="${SHORT_TEST_NAME}_run_${i}"
+    
+    sbatch \
+        --job-name="$JOB_NAME" \
+        --partition="$PARTITION" \
+        --account="$ACCOUNT" \
+        --nodes=1 \
+        --gpus-per-task=8 \
+        --time=1:00:00 \
+        --exclusive \
+        --output="$LOG_DIR/${JOB_NAME}_%j.out" \
+        --error="$LOG_DIR/${JOB_NAME}_%j.err" \
+        --export=ALL \
+        --wrap="srun \
+            --container-image=$TGT_IMAGE \
+            --container-workdir=/opt/megatron-lm \
+            --container-mounts=$CONTAINER_MOUNTS \
+            --no-container-mount-home \
+            bash -c 'cd /opt/megatron-lm && time bash $TEST_SCRIPT'"
+    
+    echo "Submitted job $i: $JOB_NAME"
+done
+
+echo
+echo "============================================="
+echo "All $NUM_JOBS jobs submitted!"
+echo "Monitor with: squeue -u \$USER"
+echo "Logs will be written to: $LOG_DIR"
+echo "============================================="
+
+# Create a helper script to check results
+cat > "$LOG_DIR/check_results.sh" << 'CHECKEOF'
+#!/bin/bash
+# Check the results of all batch test runs
+
+LOG_DIR="$(dirname "$0")"
+echo "Checking results in: $LOG_DIR"
+echo
+
+total=0
+passed=0
+failed=0
+pending=0
+
+# Match any .out file that ends with _run_N_JOBID.out pattern
+for outfile in "$LOG_DIR"/*_run_*.out; do
+    if [ -f "$outfile" ]; then
+        total=$((total + 1))
+        jobname=$(basename "$outfile" .out)
+        
+        # Check if file is empty (job still running or not started)
+        if [ ! -s "$outfile" ]; then
+            echo "PENDING: $jobname (no output yet)"
+            pending=$((pending + 1))
+            continue
+        fi
+        
+        # Check for success: look for "This test wrote results into" which indicates completion
+        if grep -q "This test wrote results into" "$outfile" 2>/dev/null; then
+            # Check for errors/failures
+            if grep -Ei "FAILED|AssertionError|Exception:|Traceback" "$outfile" 2>/dev/null | grep -v "grep" > /dev/null; then
+                echo "FAILED:  $jobname"
+                failed=$((failed + 1))
+            else
+                # Extract timing info
+                timing=$(grep -E "^real\s" "$outfile" 2>/dev/null | head -1 || echo "")
+                echo "PASSED:  $jobname $timing"
+                passed=$((passed + 1))
+            fi
+        else
+            # Job might still be running or crashed early
+            if grep -qi "error\|failed\|exception\|traceback" "$outfile" 2>/dev/null; then
+                echo "FAILED:  $jobname (error in output)"
+                failed=$((failed + 1))
+            else
+                echo "RUNNING: $jobname (incomplete output)"
+                pending=$((pending + 1))
+            fi
+        fi
+    fi
+done
+
+echo
+echo "============================================="
+echo "Summary:"
+echo "  Passed:  $passed"
+echo "  Failed:  $failed"
+echo "  Pending: $pending"
+echo "  Total:   $total"
+echo "============================================="
+
+if [ $failed -gt 0 ]; then
+    exit 1
+elif [ $pending -gt 0 ]; then
+    exit 2
+else
+    exit 0
+fi
+CHECKEOF
+chmod +x "$LOG_DIR/check_results.sh"
+
+# Create a script to show node info for each job
+cat > "$LOG_DIR/show_nodes.sh" << 'NODEEOF'
+#!/bin/bash
+# Show which node each job ran on
+
+LOG_DIR="$(dirname "$0")"
+echo "Node assignments for batch tests:"
+echo
+
+# Match any .out file that ends with _run_N_JOBID.out pattern
+for outfile in "$LOG_DIR"/*_run_*.out; do
+    if [ -f "$outfile" ]; then
+        jobname=$(basename "$outfile" .out)
+        jobid=$(echo "$outfile" | grep -oP '\d+(?=\.out)')
+        
+        # Try to get node from sacct or from output file
+        node=$(sacct -j "$jobid" --format=NodeList --noheader 2>/dev/null | head -1 | tr -d ' ')
+        if [ -z "$node" ]; then
+            node="unknown"
+        fi
+        
+        echo "$jobname (job $jobid): $node"
+    fi
+done
+NODEEOF
+chmod +x "$LOG_DIR/show_nodes.sh"
+
+echo "After jobs complete:"
+echo "  - Run '$LOG_DIR/check_results.sh' to check results"
+echo "  - Run '$LOG_DIR/show_nodes.sh' to see which nodes were used"
+echo
+echo "To run other tests, use: $0 --list to see available test scripts"
diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh
index d2c55838565..3d47e591749 100644
--- a/tests/functional_tests/shell_test_utils/run_ci_test.sh
+++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh
@@ -8,6 +8,9 @@ ulimit -Sn $(ulimit -Hn)
 # Increase soft limit for number of processes to match hard limit
 ulimit -Su $(ulimit -Hu)
 
+# Set umask to 0002 to allow group read/write permissions
+umask 0002
+
 set +x
 for ARGUMENT in "$@"; do
     # Split on first = only, preserving any subsequent = signs in the value
@@ -51,6 +54,10 @@ set -exo pipefail
 # Extract settings from params file
 TEST_TYPE=$(cat $TRAINING_PARAMS_PATH |
     /usr/local/bin/yq '.TEST_TYPE')
+ENABLE_LIGHTWEIGHT_MODE=$(cat $TRAINING_PARAMS_PATH |
+    /usr/local/bin/yq '.ENV_VARS.ENABLE_LIGHTWEIGHT_MODE // "false"')
+N_REPEAT=$(cat $TRAINING_PARAMS_PATH |
+    /usr/local/bin/yq '.ENV_VARS.N_REPEAT // "'$N_REPEAT'"')
 MODE=$(cat $TRAINING_PARAMS_PATH |
     /usr/local/bin/yq '.MODE // "pretraining"')
 
@@ -67,6 +74,7 @@ mkdir -p $CHECKPOINT_SAVE_PATH
 mkdir -p $CHECKPOINT_LOAD_PATH || true
 _CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH
 _CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH
+_TENSORBOARD_PATH=$TENSORBOARD_PATH
 
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
 ROOT_DIR=$(realpath $SCRIPT_DIR/../../../)
@@ -124,11 +132,19 @@ SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH |
 
 export RECORD_CHECKPOINTS=${RECORD_CHECKPOINTS:-"false"}
 
+NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID:-0}}
+
 for i in $(seq 1 $N_REPEAT); do
+    # Move TB logs into a repeat-specific directory
+    DIR=$(dirname "$_TENSORBOARD_PATH")
+    FILE=$(basename "$_TENSORBOARD_PATH")
+    export TENSORBOARD_PATH=$DIR/$i/$FILE
+    mkdir -p $(dirname $TENSORBOARD_PATH)
+
     if [[ $i -gt 1 ]]; then
-        rm -rf $CHECKPOINT_SAVE_PATH/*
-        rm -rf /tmp/checkpoints/*
-        rm -rf $TENSORBOARD_PATH/*
+        rm -rf $CHECKPOINT_SAVE_PATH/* || true
+        rm -rf /tmp/checkpoints/* || true   
+        rm -rf $TENSORBOARD_PATH/* || true
     fi
 
     # First run never loads from a checkpoint
@@ -195,15 +211,18 @@ for i in $(seq 1 $N_REPEAT); do
         echo "No frozen checkpoint found. Will skip second run."
 
         export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
-        rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
+        if [[ $NODE_RANK -eq 0 ]]; then
+            rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
+        fi
         echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
         break
     fi
 
     if [[ "$TEST_TYPE" == "ckpt-resume" && "$TRAINING_EXIT_CODE" -eq 0 ]]; then
         export CHECKPOINT_LOAD_PATH=$CHECKPOINT_SAVE_PATH
-
-        rm -rf "$CHECKPOINT_LOAD_PATH/iter_$(printf "%07d\n" "$TRAIN_ITERS")"
+        if [[ $NODE_RANK -eq 0 ]]; then
+            rm -rf "$CHECKPOINT_LOAD_PATH/iter_$(printf "%07d\n" "$TRAIN_ITERS")"
+        fi
         echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_LOAD_PATH/latest_checkpointed_iteration.txt
 
         export RUN_NUMBER=2
@@ -220,7 +239,9 @@ for i in $(seq 1 $N_REPEAT); do
         bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh || TRAINING_EXIT_CODE=$?
 
         export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
-        rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
+        if [[ $NODE_RANK -eq 0 ]]; then
+            rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
+        fi
         echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
     fi
 
@@ -308,7 +329,7 @@ for i in $(seq 1 $N_REPEAT); do
             if [[ "$TEST_TYPE" == "frozen-start" ]]; then
                 uv run --no-sync pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py \
                     --golden-values-path $GOLDEN_VALUES_PATH \
-                    --test-values-path $TENSORBOARD_PATH \
+                    --test-values-path $INFERENCE_OUTPUT_PATH \
                     --model-config-path ${TRAINING_PARAMS_PATH} \
                     $ALLOW_NONDETERMINISTIC_ALGO_ARG
             fi
diff --git a/tests/functional_tests/shell_test_utils/start_interactive_job.sh b/tests/functional_tests/shell_test_utils/start_interactive_job.sh
index 13067e7c0ea..cd0b16f93df 100644
--- a/tests/functional_tests/shell_test_utils/start_interactive_job.sh
+++ b/tests/functional_tests/shell_test_utils/start_interactive_job.sh
@@ -87,6 +87,7 @@ SRUN_CMD="srun \
     --container-image=$IMAGE \
     --container-workdir=/opt/megatron-lm \
     --container-mounts=$CONTAINER_MOUNTS \
+    --no-container-mount-home \
     --nodes=1 \
     $(if [ "$NO_GPUS_PER_TASK" = "FALSE" ]; then echo "--gpus-per-task=8"; fi) \
     --time=$TIME \
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
index df02cb774f4..b9b1236875c 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
@@ -24,36 +24,36 @@
             "18": 10.44272,
             "19": 10.43057,
             "20": 10.44534,
-            "21": 10.41778,
-            "22": 10.38667,
-            "23": 10.39322,
-            "24": 10.37847,
-            "25": 10.35474,
-            "26": 10.35955,
-            "27": 10.34527,
-            "28": 10.33539,
-            "29": 10.25416,
-            "30": 10.23011,
-            "31": 10.14092,
-            "32": 10.13601,
-            "33": 10.13944,
-            "34": 10.11377,
-            "35": 10.0888,
-            "36": 10.09247,
-            "37": 10.06836,
-            "38": 10.04664,
-            "39": 9.97584,
-            "40": 9.93781,
-            "41": 9.90867,
-            "42": 9.84873,
-            "43": 9.8577,
-            "44": 9.79259,
-            "45": 9.8035,
-            "46": 9.7029,
-            "47": 9.73432,
+            "21": 10.41771,
+            "22": 10.38656,
+            "23": 10.39328,
+            "24": 10.37849,
+            "25": 10.35466,
+            "26": 10.35965,
+            "27": 10.34523,
+            "28": 10.33556,
+            "29": 10.25418,
+            "30": 10.23008,
+            "31": 10.14093,
+            "32": 10.13603,
+            "33": 10.13936,
+            "34": 10.11381,
+            "35": 10.08888,
+            "36": 10.09238,
+            "37": 10.06851,
+            "38": 10.0466,
+            "39": 9.97582,
+            "40": 9.93764,
+            "41": 9.90872,
+            "42": 9.84882,
+            "43": 9.85772,
+            "44": 9.7925,
+            "45": 9.80329,
+            "46": 9.70285,
+            "47": 9.73423,
             "48": 9.70106,
-            "49": 9.69981,
-            "50": 9.70258
+            "49": 9.69966,
+            "50": 9.70252
         }
     },
     "num-zeros": {
@@ -80,37 +80,37 @@
             "17": 2409.0,
             "18": 2345.0,
             "19": 2374.0,
-            "20": 2739.0,
-            "21": 2030.0,
-            "22": 2819.0,
-            "23": 2763.0,
-            "24": 2731.0,
-            "25": 2429.0,
-            "26": 2817.0,
-            "27": 2944.0,
-            "28": 2741.0,
-            "29": 2639.0,
-            "30": 2723.0,
-            "31": 2158.0,
-            "32": 2242.0,
-            "33": 2046.0,
-            "34": 2139.0,
-            "35": 2492.0,
-            "36": 2641.0,
-            "37": 2853.0,
-            "38": 2705.0,
-            "39": 2807.0,
-            "40": 3333.0,
-            "41": 1762.0,
-            "42": 1410.0,
-            "43": 1558.0,
-            "44": 2384.0,
-            "45": 3170.0,
-            "46": 2664.0,
-            "47": 2641.0,
-            "48": 3490.0,
-            "49": 2928.0,
-            "50": 2487.0
+            "20": 2743.0,
+            "21": 2039.0,
+            "22": 2925.0,
+            "23": 2630.0,
+            "24": 2821.0,
+            "25": 2366.0,
+            "26": 2633.0,
+            "27": 2921.0,
+            "28": 2760.0,
+            "29": 2635.0,
+            "30": 2614.0,
+            "31": 2073.0,
+            "32": 2275.0,
+            "33": 2130.0,
+            "34": 2185.0,
+            "35": 2312.0,
+            "36": 2789.0,
+            "37": 2937.0,
+            "38": 2652.0,
+            "39": 2929.0,
+            "40": 3348.0,
+            "41": 1812.0,
+            "42": 1441.0,
+            "43": 1726.0,
+            "44": 2437.0,
+            "45": 3263.0,
+            "46": 2813.0,
+            "47": 2668.0,
+            "48": 3411.0,
+            "49": 3174.0,
+            "50": 2441.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3404871168.0,
-            "2": 3404871168.0,
-            "3": 3404871168.0,
-            "4": 3404871168.0,
-            "5": 3404871168.0,
-            "6": 3404871168.0,
-            "7": 3404871168.0,
-            "8": 3404871168.0,
-            "9": 3404871168.0,
-            "10": 3404871168.0,
-            "11": 3404871168.0,
-            "12": 3404871168.0,
-            "13": 3404871168.0,
-            "14": 3404871168.0,
-            "15": 3404871168.0,
-            "16": 3404871168.0,
-            "17": 3404871168.0,
-            "18": 3404871168.0,
-            "19": 3404871168.0,
-            "20": 3404871168.0,
-            "21": 3404871168.0,
-            "22": 3404871168.0,
-            "23": 3404871168.0,
-            "24": 3404871168.0,
-            "25": 3404871168.0,
-            "26": 3404871168.0,
-            "27": 3404871168.0,
-            "28": 3404871168.0,
-            "29": 3404871168.0,
-            "30": 3404871168.0,
-            "31": 3404871168.0,
-            "32": 3404871168.0,
-            "33": 3404871168.0,
-            "34": 3404871168.0,
-            "35": 3404871168.0,
-            "36": 3404871168.0,
-            "37": 3404871168.0,
-            "38": 3404871168.0,
-            "39": 3404871168.0,
-            "40": 3404871168.0,
-            "41": 3404871168.0,
-            "42": 3404871168.0,
-            "43": 3404871168.0,
-            "44": 3404871168.0,
-            "45": 3404871168.0,
-            "46": 3404871168.0,
-            "47": 3404871168.0,
-            "48": 3404871168.0,
-            "49": 3404871168.0,
-            "50": 3404871168.0
+            "1": 3405920768.0,
+            "2": 3405920768.0,
+            "3": 3405920768.0,
+            "4": 3405920768.0,
+            "5": 3405920768.0,
+            "6": 3405920768.0,
+            "7": 3405920768.0,
+            "8": 3405920768.0,
+            "9": 3405920768.0,
+            "10": 3405920768.0,
+            "11": 3405920768.0,
+            "12": 3405920768.0,
+            "13": 3405920768.0,
+            "14": 3405920768.0,
+            "15": 3405920768.0,
+            "16": 3405920768.0,
+            "17": 3405920768.0,
+            "18": 3405920768.0,
+            "19": 3405920768.0,
+            "20": 3405920768.0,
+            "21": 3405920768.0,
+            "22": 3405920768.0,
+            "23": 3405920768.0,
+            "24": 3405920768.0,
+            "25": 3405920768.0,
+            "26": 3405920768.0,
+            "27": 3405920768.0,
+            "28": 3405920768.0,
+            "29": 3405920768.0,
+            "30": 3405920768.0,
+            "31": 3405920768.0,
+            "32": 3405920768.0,
+            "33": 3405920768.0,
+            "34": 3405920768.0,
+            "35": 3405920768.0,
+            "36": 3405920768.0,
+            "37": 3405920768.0,
+            "38": 3405920768.0,
+            "39": 3405920768.0,
+            "40": 3405920768.0,
+            "41": 3405920768.0,
+            "42": 3405920768.0,
+            "43": 3405920768.0,
+            "44": 3405920768.0,
+            "45": 3405920768.0,
+            "46": 3405920768.0,
+            "47": 3405920768.0,
+            "48": 3405920768.0,
+            "49": 3405920768.0,
+            "50": 3405920768.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4194526208.0,
-            "2": 5660965888.0,
-            "3": 5660965888.0,
-            "4": 5660965888.0,
-            "5": 5660965888.0,
-            "6": 5660965888.0,
-            "7": 5660965888.0,
-            "8": 5660965888.0,
-            "9": 5660965888.0,
-            "10": 5660965888.0,
-            "11": 5660965888.0,
-            "12": 5660965888.0,
-            "13": 5660965888.0,
-            "14": 5660965888.0,
-            "15": 5660965888.0,
-            "16": 5660965888.0,
-            "17": 5660965888.0,
-            "18": 5660965888.0,
-            "19": 5660965888.0,
-            "20": 5660965888.0,
-            "21": 5660965888.0,
-            "22": 5660965888.0,
-            "23": 5660965888.0,
-            "24": 5660965888.0,
-            "25": 5660965888.0,
-            "26": 5660965888.0,
-            "27": 5660965888.0,
-            "28": 5660965888.0,
-            "29": 5660965888.0,
-            "30": 5660965888.0,
-            "31": 5660965888.0,
-            "32": 5660965888.0,
-            "33": 5660965888.0,
-            "34": 5660965888.0,
-            "35": 5660965888.0,
-            "36": 5660965888.0,
-            "37": 5660965888.0,
-            "38": 5660965888.0,
-            "39": 5660965888.0,
-            "40": 5660965888.0,
-            "41": 5660965888.0,
-            "42": 5660965888.0,
-            "43": 5660965888.0,
-            "44": 5660965888.0,
-            "45": 5660965888.0,
-            "46": 5660965888.0,
-            "47": 5660965888.0,
-            "48": 5660965888.0,
-            "49": 5660965888.0,
-            "50": 5660965888.0
+            "1": 4195575808.0,
+            "2": 5662015488.0,
+            "3": 5662015488.0,
+            "4": 5662015488.0,
+            "5": 5662015488.0,
+            "6": 5662015488.0,
+            "7": 5662015488.0,
+            "8": 5662015488.0,
+            "9": 5662015488.0,
+            "10": 5662015488.0,
+            "11": 5662015488.0,
+            "12": 5662015488.0,
+            "13": 5662015488.0,
+            "14": 5662015488.0,
+            "15": 5662015488.0,
+            "16": 5662015488.0,
+            "17": 5662015488.0,
+            "18": 5662015488.0,
+            "19": 5662015488.0,
+            "20": 5662015488.0,
+            "21": 5662015488.0,
+            "22": 5662015488.0,
+            "23": 5662015488.0,
+            "24": 5662015488.0,
+            "25": 5662015488.0,
+            "26": 5662015488.0,
+            "27": 5662015488.0,
+            "28": 5662015488.0,
+            "29": 5662015488.0,
+            "30": 5662015488.0,
+            "31": 5662015488.0,
+            "32": 5662015488.0,
+            "33": 5662015488.0,
+            "34": 5662015488.0,
+            "35": 5662015488.0,
+            "36": 5662015488.0,
+            "37": 5662015488.0,
+            "38": 5662015488.0,
+            "39": 5662015488.0,
+            "40": 5662015488.0,
+            "41": 5662015488.0,
+            "42": 5662015488.0,
+            "43": 5662015488.0,
+            "44": 5662015488.0,
+            "45": 5662015488.0,
+            "46": 5662015488.0,
+            "47": 5662015488.0,
+            "48": 5662015488.0,
+            "49": 5662015488.0,
+            "50": 5662015488.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.44279,
-            "2": 0.55345,
-            "3": 0.53909,
-            "4": 0.52187,
-            "5": 0.52958,
-            "6": 0.5241,
-            "7": 0.5353,
-            "8": 0.51946,
-            "9": 0.52732,
-            "10": 0.52759,
-            "11": 0.51849,
-            "12": 0.52326,
-            "13": 0.52472,
-            "14": 0.52577,
-            "15": 0.51817,
-            "16": 0.51922,
-            "17": 0.51686,
-            "18": 0.5248,
-            "19": 0.51945,
-            "20": 0.74697,
-            "21": 0.51544,
-            "22": 0.52412,
-            "23": 0.66206,
-            "24": 0.51781,
-            "25": 0.52429,
-            "26": 0.52068,
-            "27": 0.62432,
-            "28": 0.52016,
-            "29": 0.52217,
-            "30": 0.51949,
-            "31": 0.69033,
-            "32": 0.52127,
-            "33": 0.52602,
-            "34": 0.6403,
-            "35": 0.51723,
-            "36": 0.52445,
-            "37": 0.51746,
-            "38": 0.52296,
-            "39": 0.52159,
-            "40": 0.6718,
-            "41": 0.58171,
-            "42": 0.7393,
-            "43": 0.54277,
-            "44": 0.81615,
-            "45": 0.52284,
-            "46": 0.71947,
-            "47": 0.52219,
-            "48": 0.51866,
-            "49": 0.51764,
-            "50": 0.51841
+            "1": 9.33953,
+            "2": 0.53319,
+            "3": 0.47492,
+            "4": 0.43971,
+            "5": 0.43812,
+            "6": 0.43852,
+            "7": 0.4386,
+            "8": 0.43696,
+            "9": 0.4374,
+            "10": 0.43581,
+            "11": 0.71474,
+            "12": 0.44321,
+            "13": 0.73975,
+            "14": 0.44195,
+            "15": 0.43796,
+            "16": 0.43687,
+            "17": 0.43648,
+            "18": 0.43733,
+            "19": 0.43826,
+            "20": 0.44179,
+            "21": 1.02916,
+            "22": 0.7107,
+            "23": 0.70393,
+            "24": 0.904,
+            "25": 0.43822,
+            "26": 0.43864,
+            "27": 0.46131,
+            "28": 0.44753,
+            "29": 0.43372,
+            "30": 0.43644,
+            "31": 0.45145,
+            "32": 0.44608,
+            "33": 0.43714,
+            "34": 0.43395,
+            "35": 0.43358,
+            "36": 0.43471,
+            "37": 0.43343,
+            "38": 0.43378,
+            "39": 0.43774,
+            "40": 0.43399,
+            "41": 0.43662,
+            "42": 0.43501,
+            "43": 0.43703,
+            "44": 0.44084,
+            "45": 0.43443,
+            "46": 0.43652,
+            "47": 0.84278,
+            "48": 0.44024,
+            "49": 0.4409,
+            "50": 0.43833
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json
index 0d85e13b23b..30fa7e80d5a 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json
@@ -21,39 +21,39 @@
             "15": 10.52714,
             "16": 10.50594,
             "17": 10.5009,
-            "18": 10.51023,
-            "19": 10.493,
-            "20": 10.48862,
-            "21": 10.47473,
-            "22": 10.42799,
-            "23": 10.42684,
-            "24": 10.4036,
-            "25": 10.39991,
-            "26": 10.38461,
-            "27": 10.38216,
-            "28": 10.36877,
-            "29": 10.32192,
-            "30": 10.2204,
-            "31": 10.17094,
-            "32": 10.12605,
-            "33": 10.10628,
-            "34": 10.09438,
-            "35": 10.07042,
-            "36": 10.07481,
-            "37": 10.03644,
-            "38": 10.01812,
-            "39": 9.96852,
-            "40": 9.93082,
-            "41": 9.87316,
-            "42": 9.81842,
-            "43": 9.8156,
-            "44": 9.73841,
-            "45": 9.7628,
-            "46": 9.67691,
-            "47": 9.68688,
+            "18": 10.51024,
+            "19": 10.49283,
+            "20": 10.48852,
+            "21": 10.47463,
+            "22": 10.42802,
+            "23": 10.42674,
+            "24": 10.40359,
+            "25": 10.39998,
+            "26": 10.38464,
+            "27": 10.38236,
+            "28": 10.36891,
+            "29": 10.32202,
+            "30": 10.22049,
+            "31": 10.17103,
+            "32": 10.12583,
+            "33": 10.10622,
+            "34": 10.09458,
+            "35": 10.07043,
+            "36": 10.07484,
+            "37": 10.03646,
+            "38": 10.0182,
+            "39": 9.9686,
+            "40": 9.93086,
+            "41": 9.87312,
+            "42": 9.8185,
+            "43": 9.81546,
+            "44": 9.73852,
+            "45": 9.76279,
+            "46": 9.67679,
+            "47": 9.68692,
             "48": 9.66292,
             "49": 9.67587,
-            "50": 9.67446
+            "50": 9.67447
         }
     },
     "num-zeros": {
@@ -78,39 +78,39 @@
             "15": 2607.0,
             "16": 2411.0,
             "17": 2529.0,
-            "18": 2418.0,
-            "19": 2363.0,
-            "20": 2323.0,
-            "21": 2401.0,
-            "22": 2588.0,
-            "23": 2338.0,
-            "24": 2305.0,
-            "25": 2702.0,
-            "26": 2370.0,
-            "27": 2462.0,
-            "28": 2407.0,
-            "29": 2240.0,
-            "30": 2850.0,
-            "31": 2882.0,
-            "32": 2837.0,
-            "33": 2645.0,
-            "34": 2874.0,
-            "35": 2913.0,
-            "36": 3000.0,
-            "37": 3122.0,
-            "38": 2680.0,
-            "39": 2216.0,
-            "40": 2211.0,
-            "41": 3456.0,
-            "42": 3624.0,
-            "43": 3364.0,
-            "44": 4026.0,
-            "45": 4145.0,
-            "46": 2924.0,
-            "47": 1942.0,
-            "48": 3363.0,
-            "49": 3532.0,
-            "50": 3710.0
+            "18": 2392.0,
+            "19": 2417.0,
+            "20": 2269.0,
+            "21": 2382.0,
+            "22": 2652.0,
+            "23": 2420.0,
+            "24": 2251.0,
+            "25": 2616.0,
+            "26": 2433.0,
+            "27": 2470.0,
+            "28": 2335.0,
+            "29": 2270.0,
+            "30": 2689.0,
+            "31": 2960.0,
+            "32": 2808.0,
+            "33": 2659.0,
+            "34": 2932.0,
+            "35": 2926.0,
+            "36": 3103.0,
+            "37": 3227.0,
+            "38": 2634.0,
+            "39": 2132.0,
+            "40": 2236.0,
+            "41": 3589.0,
+            "42": 3470.0,
+            "43": 3467.0,
+            "44": 4038.0,
+            "45": 4173.0,
+            "46": 2993.0,
+            "47": 1996.0,
+            "48": 3318.0,
+            "49": 3662.0,
+            "50": 3572.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2061524480.0,
-            "2": 2061524480.0,
-            "3": 2061524480.0,
-            "4": 2061524480.0,
-            "5": 2061524480.0,
-            "6": 2061524480.0,
-            "7": 2061524480.0,
-            "8": 2061524480.0,
-            "9": 2061524480.0,
-            "10": 2061524480.0,
-            "11": 2061524480.0,
-            "12": 2061524480.0,
-            "13": 2061524480.0,
-            "14": 2061524480.0,
-            "15": 2061524480.0,
-            "16": 2061524480.0,
-            "17": 2061524480.0,
-            "18": 2061524480.0,
-            "19": 2061524480.0,
-            "20": 2061524480.0,
-            "21": 2061524480.0,
-            "22": 2061524480.0,
-            "23": 2061524480.0,
-            "24": 2061524480.0,
-            "25": 2061524480.0,
-            "26": 2061524480.0,
-            "27": 2061524480.0,
-            "28": 2061524480.0,
-            "29": 2061524480.0,
-            "30": 2061524480.0,
-            "31": 2061524480.0,
-            "32": 2061524480.0,
-            "33": 2061524480.0,
-            "34": 2061524480.0,
-            "35": 2061524480.0,
-            "36": 2061524480.0,
-            "37": 2061524480.0,
-            "38": 2061524480.0,
-            "39": 2061524480.0,
-            "40": 2061524480.0,
-            "41": 2061524480.0,
-            "42": 2061524480.0,
-            "43": 2061524480.0,
-            "44": 2061524480.0,
-            "45": 2061524480.0,
-            "46": 2061524480.0,
-            "47": 2061524480.0,
-            "48": 2061524480.0,
-            "49": 2061524480.0,
-            "50": 2061524480.0
+            "1": 2062574080.0,
+            "2": 2062574080.0,
+            "3": 2062574080.0,
+            "4": 2062574080.0,
+            "5": 2062574080.0,
+            "6": 2062574080.0,
+            "7": 2062574080.0,
+            "8": 2062574080.0,
+            "9": 2062574080.0,
+            "10": 2062574080.0,
+            "11": 2062574080.0,
+            "12": 2062574080.0,
+            "13": 2062574080.0,
+            "14": 2062574080.0,
+            "15": 2062574080.0,
+            "16": 2062574080.0,
+            "17": 2062574080.0,
+            "18": 2062574080.0,
+            "19": 2062574080.0,
+            "20": 2062574080.0,
+            "21": 2062574080.0,
+            "22": 2062574080.0,
+            "23": 2062574080.0,
+            "24": 2062574080.0,
+            "25": 2062574080.0,
+            "26": 2062574080.0,
+            "27": 2062574080.0,
+            "28": 2062574080.0,
+            "29": 2062574080.0,
+            "30": 2062574080.0,
+            "31": 2062574080.0,
+            "32": 2062574080.0,
+            "33": 2062574080.0,
+            "34": 2062574080.0,
+            "35": 2062574080.0,
+            "36": 2062574080.0,
+            "37": 2062574080.0,
+            "38": 2062574080.0,
+            "39": 2062574080.0,
+            "40": 2062574080.0,
+            "41": 2062574080.0,
+            "42": 2062574080.0,
+            "43": 2062574080.0,
+            "44": 2062574080.0,
+            "45": 2062574080.0,
+            "46": 2062574080.0,
+            "47": 2062574080.0,
+            "48": 2062574080.0,
+            "49": 2062574080.0,
+            "50": 2062574080.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4385424896.0,
-            "2": 5245672960.0,
-            "3": 5245672960.0,
-            "4": 5245672960.0,
-            "5": 5245672960.0,
-            "6": 5245672960.0,
-            "7": 5245672960.0,
-            "8": 5245672960.0,
-            "9": 5245672960.0,
-            "10": 5245672960.0,
-            "11": 5245672960.0,
-            "12": 5245672960.0,
-            "13": 5245672960.0,
-            "14": 5245672960.0,
-            "15": 5245672960.0,
-            "16": 5245672960.0,
-            "17": 5245672960.0,
-            "18": 5245672960.0,
-            "19": 5245672960.0,
-            "20": 5245672960.0,
-            "21": 5245672960.0,
-            "22": 5245672960.0,
-            "23": 5245672960.0,
-            "24": 5245672960.0,
-            "25": 5245672960.0,
-            "26": 5245672960.0,
-            "27": 5245672960.0,
-            "28": 5245672960.0,
-            "29": 5245672960.0,
-            "30": 5245672960.0,
-            "31": 5245672960.0,
-            "32": 5245672960.0,
-            "33": 5245672960.0,
-            "34": 5245672960.0,
-            "35": 5245672960.0,
-            "36": 5245672960.0,
-            "37": 5245672960.0,
-            "38": 5245672960.0,
-            "39": 5245672960.0,
-            "40": 5245672960.0,
-            "41": 5245672960.0,
-            "42": 5245672960.0,
-            "43": 5245672960.0,
-            "44": 5245672960.0,
-            "45": 5245672960.0,
-            "46": 5245672960.0,
-            "47": 5245672960.0,
-            "48": 5245672960.0,
-            "49": 5245672960.0,
-            "50": 5245672960.0
+            "1": 4386474496.0,
+            "2": 5246722560.0,
+            "3": 5246722560.0,
+            "4": 5246722560.0,
+            "5": 5246722560.0,
+            "6": 5246722560.0,
+            "7": 5246722560.0,
+            "8": 5246722560.0,
+            "9": 5246722560.0,
+            "10": 5246722560.0,
+            "11": 5246722560.0,
+            "12": 5246722560.0,
+            "13": 5246722560.0,
+            "14": 5246722560.0,
+            "15": 5246722560.0,
+            "16": 5246722560.0,
+            "17": 5246722560.0,
+            "18": 5246722560.0,
+            "19": 5246722560.0,
+            "20": 5246722560.0,
+            "21": 5246722560.0,
+            "22": 5246722560.0,
+            "23": 5246722560.0,
+            "24": 5246722560.0,
+            "25": 5246722560.0,
+            "26": 5246722560.0,
+            "27": 5246722560.0,
+            "28": 5246722560.0,
+            "29": 5246722560.0,
+            "30": 5246722560.0,
+            "31": 5246722560.0,
+            "32": 5246722560.0,
+            "33": 5246722560.0,
+            "34": 5246722560.0,
+            "35": 5246722560.0,
+            "36": 5246722560.0,
+            "37": 5246722560.0,
+            "38": 5246722560.0,
+            "39": 5246722560.0,
+            "40": 5246722560.0,
+            "41": 5246722560.0,
+            "42": 5246722560.0,
+            "43": 5246722560.0,
+            "44": 5246722560.0,
+            "45": 5246722560.0,
+            "46": 5246722560.0,
+            "47": 5246722560.0,
+            "48": 5246722560.0,
+            "49": 5246722560.0,
+            "50": 5246722560.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 14.48983,
-            "2": 0.782,
-            "3": 0.71913,
-            "4": 0.71541,
-            "5": 0.71528,
-            "6": 0.7219,
-            "7": 0.72729,
-            "8": 0.72714,
-            "9": 0.7634,
-            "10": 0.71523,
-            "11": 0.72303,
-            "12": 1.34179,
-            "13": 0.93338,
-            "14": 0.72484,
-            "15": 0.70784,
-            "16": 0.72443,
-            "17": 0.72151,
-            "18": 0.71102,
-            "19": 1.13624,
-            "20": 1.56469,
-            "21": 1.66622,
-            "22": 0.9574,
-            "23": 0.69921,
-            "24": 0.70477,
-            "25": 0.73932,
-            "26": 0.74798,
-            "27": 0.72633,
-            "28": 0.72782,
-            "29": 0.73646,
-            "30": 0.73665,
-            "31": 0.74301,
-            "32": 0.73363,
-            "33": 0.71952,
-            "34": 0.7406,
-            "35": 0.71103,
-            "36": 0.70026,
-            "37": 0.71087,
-            "38": 0.88272,
-            "39": 0.71279,
-            "40": 0.92123,
-            "41": 1.20193,
-            "42": 0.72924,
-            "43": 0.70749,
-            "44": 0.72158,
-            "45": 0.71169,
-            "46": 1.23637,
-            "47": 1.13432,
-            "48": 1.26896,
-            "49": 1.13682,
-            "50": 1.21366
+            "1": 12.53778,
+            "2": 0.64042,
+            "3": 0.57704,
+            "4": 0.56942,
+            "5": 0.55857,
+            "6": 1.5214,
+            "7": 0.8799,
+            "8": 0.58802,
+            "9": 0.58845,
+            "10": 0.91566,
+            "11": 1.66597,
+            "12": 1.31669,
+            "13": 0.9054,
+            "14": 0.55959,
+            "15": 0.55349,
+            "16": 0.56731,
+            "17": 0.54994,
+            "18": 0.56124,
+            "19": 0.54032,
+            "20": 0.54467,
+            "21": 0.56577,
+            "22": 0.59073,
+            "23": 0.55848,
+            "24": 0.5515,
+            "25": 0.56783,
+            "26": 0.58223,
+            "27": 0.56278,
+            "28": 0.55385,
+            "29": 0.54473,
+            "30": 0.54779,
+            "31": 0.54239,
+            "32": 0.53324,
+            "33": 0.54812,
+            "34": 0.57008,
+            "35": 0.56814,
+            "36": 0.55146,
+            "37": 0.56138,
+            "38": 0.80574,
+            "39": 0.5919,
+            "40": 0.83084,
+            "41": 0.9006,
+            "42": 0.82734,
+            "43": 0.98233,
+            "44": 1.08635,
+            "45": 1.33415,
+            "46": 1.29362,
+            "47": 1.03481,
+            "48": 1.02838,
+            "49": 0.56104,
+            "50": 0.57748
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json
index 1352649be85..7a21f7ae2f9 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json
@@ -25,35 +25,35 @@
             "19": 10.44113,
             "20": 10.45448,
             "21": 10.43454,
-            "22": 10.40592,
-            "23": 10.39961,
-            "24": 10.37579,
-            "25": 10.38182,
-            "26": 10.35147,
+            "22": 10.40591,
+            "23": 10.39975,
+            "24": 10.37583,
+            "25": 10.38168,
+            "26": 10.3515,
             "27": 10.35388,
-            "28": 10.34937,
-            "29": 10.28711,
-            "30": 10.21159,
-            "31": 10.1726,
-            "32": 10.13421,
-            "33": 10.14744,
-            "34": 10.10737,
-            "35": 10.10581,
-            "36": 10.08735,
+            "28": 10.34965,
+            "29": 10.28701,
+            "30": 10.21143,
+            "31": 10.17272,
+            "32": 10.13416,
+            "33": 10.14725,
+            "34": 10.10738,
+            "35": 10.10592,
+            "36": 10.08739,
             "37": 10.08157,
-            "38": 10.07233,
-            "39": 10.00094,
-            "40": 9.98143,
-            "41": 9.92541,
-            "42": 9.87527,
-            "43": 9.88711,
-            "44": 9.80642,
-            "45": 9.82325,
-            "46": 9.73785,
-            "47": 9.74817,
-            "48": 9.71609,
-            "49": 9.74484,
-            "50": 9.72982
+            "38": 10.07245,
+            "39": 10.00093,
+            "40": 9.98138,
+            "41": 9.92543,
+            "42": 9.87534,
+            "43": 9.88716,
+            "44": 9.80646,
+            "45": 9.82342,
+            "46": 9.73786,
+            "47": 9.74811,
+            "48": 9.71614,
+            "49": 9.74493,
+            "50": 9.73
         }
     },
     "num-zeros": {
@@ -82,35 +82,35 @@
             "19": 2547.0,
             "20": 2850.0,
             "21": 1990.0,
-            "22": 2884.0,
-            "23": 2857.0,
-            "24": 2685.0,
-            "25": 2514.0,
-            "26": 2958.0,
-            "27": 2673.0,
-            "28": 2723.0,
-            "29": 2571.0,
-            "30": 2858.0,
-            "31": 2157.0,
-            "32": 2357.0,
-            "33": 2242.0,
-            "34": 2464.0,
-            "35": 2544.0,
-            "36": 2933.0,
-            "37": 3293.0,
-            "38": 2730.0,
-            "39": 2795.0,
-            "40": 3310.0,
-            "41": 1816.0,
-            "42": 1467.0,
-            "43": 1817.0,
-            "44": 2633.0,
-            "45": 3576.0,
-            "46": 3015.0,
-            "47": 2805.0,
-            "48": 3071.0,
-            "49": 2974.0,
-            "50": 2267.0
+            "22": 2964.0,
+            "23": 2695.0,
+            "24": 2772.0,
+            "25": 2524.0,
+            "26": 2977.0,
+            "27": 2627.0,
+            "28": 2776.0,
+            "29": 2514.0,
+            "30": 2843.0,
+            "31": 2070.0,
+            "32": 2362.0,
+            "33": 2211.0,
+            "34": 2574.0,
+            "35": 2499.0,
+            "36": 2943.0,
+            "37": 3347.0,
+            "38": 2628.0,
+            "39": 2781.0,
+            "40": 3335.0,
+            "41": 1800.0,
+            "42": 1598.0,
+            "43": 1719.0,
+            "44": 2631.0,
+            "45": 3492.0,
+            "46": 2988.0,
+            "47": 2784.0,
+            "48": 2951.0,
+            "49": 2907.0,
+            "50": 2113.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1784014336.0,
-            "2": 1784014336.0,
-            "3": 1784014336.0,
-            "4": 1784014336.0,
-            "5": 1784014336.0,
-            "6": 1784014336.0,
-            "7": 1784014336.0,
-            "8": 1784014336.0,
-            "9": 1784014336.0,
-            "10": 1784014336.0,
-            "11": 1784014336.0,
-            "12": 1784014336.0,
-            "13": 1784014336.0,
-            "14": 1784014336.0,
-            "15": 1784014336.0,
-            "16": 1784014336.0,
-            "17": 1784014336.0,
-            "18": 1784014336.0,
-            "19": 1784014336.0,
-            "20": 1784014336.0,
-            "21": 1784014336.0,
-            "22": 1784014336.0,
-            "23": 1784014336.0,
-            "24": 1784014336.0,
-            "25": 1784014336.0,
-            "26": 1784014336.0,
-            "27": 1784014336.0,
-            "28": 1784014336.0,
-            "29": 1784014336.0,
-            "30": 1784014336.0,
-            "31": 1784014336.0,
-            "32": 1784014336.0,
-            "33": 1784014336.0,
-            "34": 1784014336.0,
-            "35": 1784014336.0,
-            "36": 1784014336.0,
-            "37": 1784014336.0,
-            "38": 1784014336.0,
-            "39": 1784014336.0,
-            "40": 1784014336.0,
-            "41": 1784014336.0,
-            "42": 1784014336.0,
-            "43": 1784014336.0,
-            "44": 1784014336.0,
-            "45": 1784014336.0,
-            "46": 1784014336.0,
-            "47": 1784014336.0,
-            "48": 1784014336.0,
-            "49": 1784014336.0,
-            "50": 1784014336.0
+            "1": 1785063936.0,
+            "2": 1785063936.0,
+            "3": 1785063936.0,
+            "4": 1785063936.0,
+            "5": 1785063936.0,
+            "6": 1785063936.0,
+            "7": 1785063936.0,
+            "8": 1785063936.0,
+            "9": 1785063936.0,
+            "10": 1785063936.0,
+            "11": 1785063936.0,
+            "12": 1785063936.0,
+            "13": 1785063936.0,
+            "14": 1785063936.0,
+            "15": 1785063936.0,
+            "16": 1785063936.0,
+            "17": 1785063936.0,
+            "18": 1785063936.0,
+            "19": 1785063936.0,
+            "20": 1785063936.0,
+            "21": 1785063936.0,
+            "22": 1785063936.0,
+            "23": 1785063936.0,
+            "24": 1785063936.0,
+            "25": 1785063936.0,
+            "26": 1785063936.0,
+            "27": 1785063936.0,
+            "28": 1785063936.0,
+            "29": 1785063936.0,
+            "30": 1785063936.0,
+            "31": 1785063936.0,
+            "32": 1785063936.0,
+            "33": 1785063936.0,
+            "34": 1785063936.0,
+            "35": 1785063936.0,
+            "36": 1785063936.0,
+            "37": 1785063936.0,
+            "38": 1785063936.0,
+            "39": 1785063936.0,
+            "40": 1785063936.0,
+            "41": 1785063936.0,
+            "42": 1785063936.0,
+            "43": 1785063936.0,
+            "44": 1785063936.0,
+            "45": 1785063936.0,
+            "46": 1785063936.0,
+            "47": 1785063936.0,
+            "48": 1785063936.0,
+            "49": 1785063936.0,
+            "50": 1785063936.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2365860864.0,
-            "2": 3108323328.0,
-            "3": 3108323328.0,
-            "4": 3108323328.0,
-            "5": 3108323328.0,
-            "6": 3108842496.0,
-            "7": 3108842496.0,
-            "8": 3108842496.0,
-            "9": 3108842496.0,
-            "10": 3108842496.0,
-            "11": 3108842496.0,
-            "12": 3108842496.0,
-            "13": 3108842496.0,
-            "14": 3108842496.0,
-            "15": 3108842496.0,
-            "16": 3108842496.0,
-            "17": 3108842496.0,
-            "18": 3108842496.0,
-            "19": 3108842496.0,
-            "20": 3108842496.0,
-            "21": 3108842496.0,
-            "22": 3108842496.0,
-            "23": 3108842496.0,
-            "24": 3108842496.0,
-            "25": 3108842496.0,
-            "26": 3108842496.0,
-            "27": 3108842496.0,
-            "28": 3108842496.0,
-            "29": 3108842496.0,
-            "30": 3108842496.0,
-            "31": 3108842496.0,
-            "32": 3108842496.0,
-            "33": 3108842496.0,
-            "34": 3108842496.0,
-            "35": 3108842496.0,
-            "36": 3108842496.0,
-            "37": 3108842496.0,
-            "38": 3108842496.0,
-            "39": 3108842496.0,
-            "40": 3108842496.0,
-            "41": 3108842496.0,
-            "42": 3108842496.0,
-            "43": 3108842496.0,
-            "44": 3108842496.0,
-            "45": 3108842496.0,
-            "46": 3108842496.0,
-            "47": 3108842496.0,
-            "48": 3108842496.0,
-            "49": 3108842496.0,
-            "50": 3108842496.0
+            "1": 2366910464.0,
+            "2": 3109894144.0,
+            "3": 3109894144.0,
+            "4": 3109894144.0,
+            "5": 3109894144.0,
+            "6": 3109894144.0,
+            "7": 3109894144.0,
+            "8": 3109894144.0,
+            "9": 3109894144.0,
+            "10": 3109894144.0,
+            "11": 3109894144.0,
+            "12": 3109894144.0,
+            "13": 3109894144.0,
+            "14": 3109894144.0,
+            "15": 3109897216.0,
+            "16": 3109897216.0,
+            "17": 3109897216.0,
+            "18": 3109897216.0,
+            "19": 3109897216.0,
+            "20": 3109897216.0,
+            "21": 3109897216.0,
+            "22": 3109897216.0,
+            "23": 3109897216.0,
+            "24": 3109897216.0,
+            "25": 3109897216.0,
+            "26": 3109897216.0,
+            "27": 3109897216.0,
+            "28": 3109897216.0,
+            "29": 3109897216.0,
+            "30": 3109897216.0,
+            "31": 3109897216.0,
+            "32": 3109897216.0,
+            "33": 3109897216.0,
+            "34": 3109897216.0,
+            "35": 3109897216.0,
+            "36": 3109897216.0,
+            "37": 3109897216.0,
+            "38": 3109897216.0,
+            "39": 3109897216.0,
+            "40": 3109897216.0,
+            "41": 3109897216.0,
+            "42": 3109897216.0,
+            "43": 3109897216.0,
+            "44": 3109897216.0,
+            "45": 3109897216.0,
+            "46": 3109897216.0,
+            "47": 3109897216.0,
+            "48": 3109897216.0,
+            "49": 3109897216.0,
+            "50": 3109897216.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.98661,
-            "2": 1.05916,
-            "3": 1.01721,
-            "4": 1.02611,
-            "5": 1.02779,
-            "6": 1.11252,
-            "7": 1.0176,
-            "8": 1.02427,
-            "9": 1.02561,
-            "10": 1.01845,
-            "11": 1.02419,
-            "12": 1.01745,
-            "13": 1.01224,
-            "14": 1.02388,
-            "15": 1.03687,
-            "16": 1.01886,
-            "17": 1.01708,
-            "18": 1.01143,
-            "19": 1.01902,
-            "20": 1.49878,
-            "21": 1.47537,
-            "22": 1.01801,
-            "23": 1.05158,
-            "24": 1.03481,
-            "25": 1.01773,
-            "26": 1.01186,
-            "27": 1.02203,
-            "28": 1.01824,
-            "29": 1.01865,
-            "30": 1.02165,
-            "31": 1.0184,
-            "32": 1.02106,
-            "33": 1.04655,
-            "34": 1.03129,
-            "35": 1.01893,
-            "36": 1.02153,
-            "37": 1.02154,
-            "38": 1.0213,
-            "39": 1.14846,
-            "40": 1.02149,
-            "41": 1.01905,
-            "42": 1.02038,
-            "43": 1.03126,
-            "44": 1.04155,
-            "45": 1.01649,
-            "46": 1.01742,
-            "47": 1.02406,
-            "48": 1.27122,
-            "49": 1.15085,
-            "50": 1.10861
+            "1": 10.5121,
+            "2": 1.00958,
+            "3": 0.92732,
+            "4": 0.90421,
+            "5": 0.90504,
+            "6": 0.89943,
+            "7": 0.90319,
+            "8": 1.1748,
+            "9": 1.95208,
+            "10": 0.92148,
+            "11": 0.91859,
+            "12": 0.92137,
+            "13": 0.92531,
+            "14": 1.25591,
+            "15": 0.92418,
+            "16": 0.91961,
+            "17": 0.90838,
+            "18": 0.90766,
+            "19": 0.90747,
+            "20": 0.9061,
+            "21": 0.93723,
+            "22": 0.90644,
+            "23": 0.91067,
+            "24": 1.66749,
+            "25": 0.91188,
+            "26": 0.91194,
+            "27": 0.988,
+            "28": 0.92516,
+            "29": 0.91117,
+            "30": 1.435,
+            "31": 0.89868,
+            "32": 0.90735,
+            "33": 1.29737,
+            "34": 1.32235,
+            "35": 0.91506,
+            "36": 0.91851,
+            "37": 0.92715,
+            "38": 0.92769,
+            "39": 0.92632,
+            "40": 1.26827,
+            "41": 1.07193,
+            "42": 1.07217,
+            "43": 0.98674,
+            "44": 1.07179,
+            "45": 1.09756,
+            "46": 1.10568,
+            "47": 0.92215,
+            "48": 0.92051,
+            "49": 0.92335,
+            "50": 0.92251
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json
index bf20b2b00e3..d034c6bf7d8 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json
@@ -44,15 +44,15 @@
             "38": 10.07257,
             "39": 10.0013,
             "40": 9.9816,
-            "41": 9.92549,
-            "42": 9.87529,
-            "43": 9.88742,
-            "44": 9.80641,
-            "45": 9.82342,
-            "46": 9.73815,
-            "47": 9.74831,
-            "48": 9.71619,
-            "49": 9.74504,
+            "41": 9.92551,
+            "42": 9.87537,
+            "43": 9.88725,
+            "44": 9.80659,
+            "45": 9.82349,
+            "46": 9.73821,
+            "47": 9.74829,
+            "48": 9.71628,
+            "49": 9.74489,
             "50": 9.73004
         }
     },
@@ -100,17 +100,17 @@
             "37": 3305.0,
             "38": 2682.0,
             "39": 2805.0,
-            "40": 3425.0,
-            "41": 1812.0,
-            "42": 1481.0,
-            "43": 1726.0,
-            "44": 2575.0,
-            "45": 3438.0,
-            "46": 2960.0,
-            "47": 2792.0,
-            "48": 3107.0,
-            "49": 2854.0,
-            "50": 2145.0
+            "40": 3430.0,
+            "41": 1767.0,
+            "42": 1516.0,
+            "43": 1798.0,
+            "44": 2790.0,
+            "45": 3578.0,
+            "46": 3016.0,
+            "47": 2890.0,
+            "48": 3065.0,
+            "49": 2914.0,
+            "50": 2208.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1767237120.0,
-            "2": 1767237120.0,
-            "3": 1767237120.0,
-            "4": 1767237120.0,
-            "5": 1767237120.0,
-            "6": 1767237120.0,
-            "7": 1767237120.0,
-            "8": 1767237120.0,
-            "9": 1767237120.0,
-            "10": 1767237120.0,
-            "11": 1767237120.0,
-            "12": 1767237120.0,
-            "13": 1767237120.0,
-            "14": 1767237120.0,
-            "15": 1767237120.0,
-            "16": 1767237120.0,
-            "17": 1767237120.0,
-            "18": 1767237120.0,
-            "19": 1767237120.0,
-            "20": 1767237120.0,
-            "21": 1767237120.0,
-            "22": 1767237120.0,
-            "23": 1767237120.0,
-            "24": 1767237120.0,
-            "25": 1767237120.0,
-            "26": 1767237120.0,
-            "27": 1767237120.0,
-            "28": 1767237120.0,
-            "29": 1767237120.0,
-            "30": 1767237120.0,
-            "31": 1767237120.0,
-            "32": 1767237120.0,
-            "33": 1767237120.0,
-            "34": 1767237120.0,
-            "35": 1767237120.0,
-            "36": 1767237120.0,
-            "37": 1767237120.0,
-            "38": 1767237120.0,
-            "39": 1767237120.0,
-            "40": 1767237120.0,
-            "41": 1767237120.0,
-            "42": 1767237120.0,
-            "43": 1767237120.0,
-            "44": 1767237120.0,
-            "45": 1767237120.0,
-            "46": 1767237120.0,
-            "47": 1767237120.0,
-            "48": 1767237120.0,
-            "49": 1767237120.0,
-            "50": 1767237120.0
+            "1": 1768285696.0,
+            "2": 1768285696.0,
+            "3": 1768285696.0,
+            "4": 1768285696.0,
+            "5": 1768285696.0,
+            "6": 1768285696.0,
+            "7": 1768285696.0,
+            "8": 1768285696.0,
+            "9": 1768285696.0,
+            "10": 1768285696.0,
+            "11": 1768285696.0,
+            "12": 1768285696.0,
+            "13": 1768285696.0,
+            "14": 1768285696.0,
+            "15": 1768285696.0,
+            "16": 1768285696.0,
+            "17": 1768285696.0,
+            "18": 1768285696.0,
+            "19": 1768285696.0,
+            "20": 1768285696.0,
+            "21": 1768285696.0,
+            "22": 1768285696.0,
+            "23": 1768285696.0,
+            "24": 1768285696.0,
+            "25": 1768285696.0,
+            "26": 1768285696.0,
+            "27": 1768285696.0,
+            "28": 1768285696.0,
+            "29": 1768285696.0,
+            "30": 1768285696.0,
+            "31": 1768285696.0,
+            "32": 1768285696.0,
+            "33": 1768285696.0,
+            "34": 1768285696.0,
+            "35": 1768285696.0,
+            "36": 1768285696.0,
+            "37": 1768285696.0,
+            "38": 1768285696.0,
+            "39": 1768285696.0,
+            "40": 1768285696.0,
+            "41": 1768285696.0,
+            "42": 1768285696.0,
+            "43": 1768285696.0,
+            "44": 1768285696.0,
+            "45": 1768285696.0,
+            "46": 1768285696.0,
+            "47": 1768285696.0,
+            "48": 1768285696.0,
+            "49": 1768285696.0,
+            "50": 1768285696.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2336500736.0,
-            "2": 3079487488.0,
-            "3": 3079487488.0,
-            "4": 3079487488.0,
-            "5": 3079487488.0,
-            "6": 3079487488.0,
-            "7": 3079487488.0,
-            "8": 3079487488.0,
-            "9": 3079487488.0,
-            "10": 3079487488.0,
-            "11": 3079487488.0,
-            "12": 3079487488.0,
-            "13": 3079487488.0,
-            "14": 3079487488.0,
-            "15": 3079487488.0,
-            "16": 3079487488.0,
-            "17": 3079487488.0,
-            "18": 3079487488.0,
-            "19": 3079487488.0,
-            "20": 3079487488.0,
-            "21": 3079487488.0,
-            "22": 3079487488.0,
-            "23": 3079487488.0,
-            "24": 3079487488.0,
-            "25": 3079487488.0,
-            "26": 3079487488.0,
-            "27": 3079487488.0,
-            "28": 3079487488.0,
-            "29": 3079487488.0,
-            "30": 3079487488.0,
-            "31": 3079487488.0,
-            "32": 3079487488.0,
-            "33": 3079487488.0,
-            "34": 3079487488.0,
-            "35": 3079487488.0,
-            "36": 3079487488.0,
-            "37": 3079487488.0,
-            "38": 3079487488.0,
-            "39": 3079487488.0,
-            "40": 3079487488.0,
-            "41": 3079487488.0,
-            "42": 3079487488.0,
-            "43": 3079487488.0,
-            "44": 3079487488.0,
-            "45": 3079487488.0,
-            "46": 3079487488.0,
-            "47": 3079487488.0,
-            "48": 3079487488.0,
-            "49": 3079487488.0,
-            "50": 3079487488.0
+            "1": 2337549312.0,
+            "2": 3080536064.0,
+            "3": 3082107392.0,
+            "4": 3082107392.0,
+            "5": 3082107392.0,
+            "6": 3082107392.0,
+            "7": 3082107392.0,
+            "8": 3082107392.0,
+            "9": 3082107392.0,
+            "10": 3082107392.0,
+            "11": 3082107392.0,
+            "12": 3082107392.0,
+            "13": 3082107392.0,
+            "14": 3082107392.0,
+            "15": 3082107392.0,
+            "16": 3082108928.0,
+            "17": 3082108928.0,
+            "18": 3082108928.0,
+            "19": 3082108928.0,
+            "20": 3082108928.0,
+            "21": 3082108928.0,
+            "22": 3082108928.0,
+            "23": 3082108928.0,
+            "24": 3082108928.0,
+            "25": 3082108928.0,
+            "26": 3082108928.0,
+            "27": 3082108928.0,
+            "28": 3082108928.0,
+            "29": 3082108928.0,
+            "30": 3082108928.0,
+            "31": 3082108928.0,
+            "32": 3082108928.0,
+            "33": 3082108928.0,
+            "34": 3082108928.0,
+            "35": 3082108928.0,
+            "36": 3082108928.0,
+            "37": 3082108928.0,
+            "38": 3082108928.0,
+            "39": 3082108928.0,
+            "40": 3082108928.0,
+            "41": 3082108928.0,
+            "42": 3082108928.0,
+            "43": 3082108928.0,
+            "44": 3082108928.0,
+            "45": 3082108928.0,
+            "46": 3082108928.0,
+            "47": 3082108928.0,
+            "48": 3082108928.0,
+            "49": 3082108928.0,
+            "50": 3082108928.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.68301,
-            "2": 0.87796,
-            "3": 0.84756,
-            "4": 0.85513,
-            "5": 0.85643,
-            "6": 0.85366,
-            "7": 0.8468,
-            "8": 0.84974,
-            "9": 0.84989,
-            "10": 0.8464,
-            "11": 0.84369,
-            "12": 0.84972,
-            "13": 0.84311,
-            "14": 0.85648,
-            "15": 1.1084,
-            "16": 0.8827,
-            "17": 0.87952,
-            "18": 0.88554,
-            "19": 0.82673,
-            "20": 0.82222,
-            "21": 1.06414,
-            "22": 1.09134,
-            "23": 1.02591,
-            "24": 0.82601,
-            "25": 0.82277,
-            "26": 0.81844,
-            "27": 0.82627,
-            "28": 0.82854,
-            "29": 0.82653,
-            "30": 0.82247,
-            "31": 0.82906,
-            "32": 0.82363,
-            "33": 0.82944,
-            "34": 0.82401,
-            "35": 0.82902,
-            "36": 0.83537,
-            "37": 0.8265,
-            "38": 0.82728,
-            "39": 0.82087,
-            "40": 0.82525,
-            "41": 0.82691,
-            "42": 1.14473,
-            "43": 0.97566,
-            "44": 0.82343,
-            "45": 0.82956,
-            "46": 0.82572,
-            "47": 0.83635,
-            "48": 0.94255,
-            "49": 0.99753,
-            "50": 1.10127
+            "1": 10.51798,
+            "2": 0.89864,
+            "3": 0.7978,
+            "4": 0.74774,
+            "5": 0.73987,
+            "6": 0.74277,
+            "7": 0.76779,
+            "8": 0.74313,
+            "9": 1.58315,
+            "10": 0.73453,
+            "11": 0.73215,
+            "12": 0.72957,
+            "13": 0.72967,
+            "14": 0.73868,
+            "15": 0.73216,
+            "16": 1.10392,
+            "17": 0.73363,
+            "18": 0.73647,
+            "19": 0.76464,
+            "20": 0.73565,
+            "21": 0.72858,
+            "22": 0.72652,
+            "23": 0.72858,
+            "24": 0.74508,
+            "25": 0.74166,
+            "26": 0.7704,
+            "27": 1.15428,
+            "28": 1.146,
+            "29": 0.73283,
+            "30": 0.73304,
+            "31": 0.73237,
+            "32": 0.7343,
+            "33": 0.73304,
+            "34": 0.72879,
+            "35": 0.73286,
+            "36": 1.74169,
+            "37": 1.10377,
+            "38": 0.73148,
+            "39": 0.73227,
+            "40": 0.73028,
+            "41": 0.73026,
+            "42": 1.15127,
+            "43": 1.11655,
+            "44": 0.73185,
+            "45": 1.17599,
+            "46": 1.07292,
+            "47": 0.72983,
+            "48": 0.72804,
+            "49": 0.73205,
+            "50": 0.72929
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
index dc5d31f8f8b..4302b8e40ca 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
@@ -25,85 +25,85 @@
             "19": 10.44113,
             "20": 10.45448,
             "21": 10.43454,
-            "22": 10.40592,
-            "23": 10.39961,
-            "24": 10.37579,
-            "25": 10.38182,
-            "26": 10.35147,
+            "22": 10.40591,
+            "23": 10.39975,
+            "24": 10.37583,
+            "25": 10.38168,
+            "26": 10.3515,
             "27": 10.35388,
-            "28": 10.34937,
-            "29": 10.28711,
-            "30": 10.21159,
-            "31": 10.1726,
-            "32": 10.13421,
-            "33": 10.14744,
-            "34": 10.10737,
-            "35": 10.10581,
-            "36": 10.08735,
+            "28": 10.34965,
+            "29": 10.28701,
+            "30": 10.21143,
+            "31": 10.17272,
+            "32": 10.13416,
+            "33": 10.14725,
+            "34": 10.10738,
+            "35": 10.10592,
+            "36": 10.08739,
             "37": 10.08157,
-            "38": 10.07233,
-            "39": 10.00094,
-            "40": 9.98143,
-            "41": 9.92541,
-            "42": 9.87527,
-            "43": 9.88711,
-            "44": 9.80642,
-            "45": 9.82325,
-            "46": 9.73785,
-            "47": 9.74817,
-            "48": 9.71609,
-            "49": 9.74484,
-            "50": 9.72982,
-            "51": 9.71485,
-            "52": 9.66475,
-            "53": 9.60919,
-            "54": 9.62705,
-            "55": 9.61012,
-            "56": 9.617,
-            "57": 9.56786,
-            "58": 9.52731,
-            "59": 9.51668,
-            "60": 9.51865,
+            "38": 10.07245,
+            "39": 10.00093,
+            "40": 9.98138,
+            "41": 9.92543,
+            "42": 9.87534,
+            "43": 9.88716,
+            "44": 9.80646,
+            "45": 9.82342,
+            "46": 9.73786,
+            "47": 9.74811,
+            "48": 9.71614,
+            "49": 9.74493,
+            "50": 9.73,
+            "51": 9.71492,
+            "52": 9.66464,
+            "53": 9.60912,
+            "54": 9.62726,
+            "55": 9.6101,
+            "56": 9.61721,
+            "57": 9.56794,
+            "58": 9.52741,
+            "59": 9.51674,
+            "60": 9.51863,
             "61": 9.53132,
-            "62": 9.45016,
-            "63": 9.45725,
-            "64": 9.43435,
-            "65": 9.45801,
-            "66": 9.4368,
-            "67": 9.3968,
-            "68": 9.36474,
-            "69": 9.4095,
-            "70": 9.376,
-            "71": 9.41716,
-            "72": 9.42574,
-            "73": 9.37581,
-            "74": 9.41547,
-            "75": 9.37891,
-            "76": 9.28017,
-            "77": 9.32205,
-            "78": 9.35754,
-            "79": 9.32162,
-            "80": 9.31486,
-            "81": 9.2678,
-            "82": 9.34178,
-            "83": 9.32145,
-            "84": 9.24785,
-            "85": 9.35023,
-            "86": 9.22392,
-            "87": 9.3062,
-            "88": 9.29891,
-            "89": 9.22716,
-            "90": 9.28483,
-            "91": 9.23109,
-            "92": 9.27463,
-            "93": 9.19241,
-            "94": 9.23984,
-            "95": 9.28006,
-            "96": 9.17526,
-            "97": 9.21894,
-            "98": 9.17192,
-            "99": 9.16446,
-            "100": 9.14816
+            "62": 9.45018,
+            "63": 9.4572,
+            "64": 9.43437,
+            "65": 9.45816,
+            "66": 9.43669,
+            "67": 9.39678,
+            "68": 9.36478,
+            "69": 9.40956,
+            "70": 9.37595,
+            "71": 9.41738,
+            "72": 9.42564,
+            "73": 9.37611,
+            "74": 9.41543,
+            "75": 9.3788,
+            "76": 9.28012,
+            "77": 9.32212,
+            "78": 9.35744,
+            "79": 9.3215,
+            "80": 9.31497,
+            "81": 9.26785,
+            "82": 9.34183,
+            "83": 9.32151,
+            "84": 9.24796,
+            "85": 9.35033,
+            "86": 9.224,
+            "87": 9.30611,
+            "88": 9.29894,
+            "89": 9.22704,
+            "90": 9.28479,
+            "91": 9.2311,
+            "92": 9.27474,
+            "93": 9.19219,
+            "94": 9.23969,
+            "95": 9.28,
+            "96": 9.17525,
+            "97": 9.21888,
+            "98": 9.1721,
+            "99": 9.16455,
+            "100": 9.1482
         }
     },
     "num-zeros": {
@@ -132,85 +132,85 @@
             "19": 2547.0,
             "20": 2850.0,
             "21": 1990.0,
-            "22": 2884.0,
-            "23": 2857.0,
-            "24": 2685.0,
-            "25": 2514.0,
-            "26": 2958.0,
-            "27": 2673.0,
-            "28": 2723.0,
-            "29": 2571.0,
-            "30": 2858.0,
-            "31": 2157.0,
-            "32": 2357.0,
-            "33": 2242.0,
-            "34": 2464.0,
-            "35": 2544.0,
-            "36": 2933.0,
-            "37": 3293.0,
-            "38": 2730.0,
-            "39": 2795.0,
-            "40": 3310.0,
-            "41": 1816.0,
-            "42": 1467.0,
-            "43": 1817.0,
-            "44": 2633.0,
-            "45": 3576.0,
-            "46": 3015.0,
-            "47": 2805.0,
-            "48": 3071.0,
-            "49": 2974.0,
-            "50": 2267.0,
-            "51": 1923.0,
-            "52": 2515.0,
-            "53": 3615.0,
-            "54": 3426.0,
-            "55": 3436.0,
-            "56": 4411.0,
-            "57": 4095.0,
-            "58": 4308.0,
-            "59": 1687.0,
-            "60": 2431.0,
-            "61": 2151.0,
-            "62": 3986.0,
-            "63": 3558.0,
-            "64": 4286.0,
-            "65": 3052.0,
-            "66": 1720.0,
-            "67": 1910.0,
-            "68": 4193.0,
-            "69": 4347.0,
-            "70": 4596.0,
-            "71": 2078.0,
-            "72": 4406.0,
-            "73": 4062.0,
-            "74": 3358.0,
-            "75": 4606.0,
-            "76": 2187.0,
-            "77": 4854.0,
-            "78": 4098.0,
-            "79": 2652.0,
-            "80": 3776.0,
-            "81": 3550.0,
-            "82": 3031.0,
-            "83": 5345.0,
-            "84": 4396.0,
-            "85": 4354.0,
-            "86": 3332.0,
-            "87": 4815.0,
-            "88": 3303.0,
-            "89": 4611.0,
-            "90": 4346.0,
-            "91": 4361.0,
-            "92": 3502.0,
-            "93": 5624.0,
-            "94": 3733.0,
-            "95": 4728.0,
-            "96": 3534.0,
-            "97": 3873.0,
-            "98": 4525.0,
-            "99": 4329.0,
-            "100": 3365.0
+            "22": 2964.0,
+            "23": 2695.0,
+            "24": 2772.0,
+            "25": 2524.0,
+            "26": 2977.0,
+            "27": 2627.0,
+            "28": 2776.0,
+            "29": 2514.0,
+            "30": 2843.0,
+            "31": 2070.0,
+            "32": 2362.0,
+            "33": 2211.0,
+            "34": 2574.0,
+            "35": 2499.0,
+            "36": 2943.0,
+            "37": 3347.0,
+            "38": 2628.0,
+            "39": 2781.0,
+            "40": 3335.0,
+            "41": 1800.0,
+            "42": 1598.0,
+            "43": 1719.0,
+            "44": 2631.0,
+            "45": 3492.0,
+            "46": 2988.0,
+            "47": 2784.0,
+            "48": 2951.0,
+            "49": 2907.0,
+            "50": 2113.0,
+            "51": 1961.0,
+            "52": 2445.0,
+            "53": 3654.0,
+            "54": 3489.0,
+            "55": 3419.0,
+            "56": 4364.0,
+            "57": 4145.0,
+            "58": 4155.0,
+            "59": 1699.0,
+            "60": 2358.0,
+            "61": 2070.0,
+            "62": 4094.0,
+            "63": 3516.0,
+            "64": 4287.0,
+            "65": 2891.0,
+            "66": 1733.0,
+            "67": 1914.0,
+            "68": 4420.0,
+            "69": 4479.0,
+            "70": 4656.0,
+            "71": 2135.0,
+            "72": 4476.0,
+            "73": 4048.0,
+            "74": 3199.0,
+            "75": 4735.0,
+            "76": 2218.0,
+            "77": 4952.0,
+            "78": 4158.0,
+            "79": 2657.0,
+            "80": 3846.0,
+            "81": 3472.0,
+            "82": 2979.0,
+            "83": 5364.0,
+            "84": 4430.0,
+            "85": 4249.0,
+            "86": 3509.0,
+            "87": 4817.0,
+            "88": 3434.0,
+            "89": 4711.0,
+            "90": 4448.0,
+            "91": 4374.0,
+            "92": 3507.0,
+            "93": 5549.0,
+            "94": 3635.0,
+            "95": 4540.0,
+            "96": 3659.0,
+            "97": 3756.0,
+            "98": 4513.0,
+            "99": 4491.0,
+            "100": 3445.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1784014336.0,
-            "2": 1784014336.0,
-            "3": 1784014336.0,
-            "4": 1784014336.0,
-            "5": 1784014336.0,
-            "6": 1784014336.0,
-            "7": 1784014336.0,
-            "8": 1784014336.0,
-            "9": 1784014336.0,
-            "10": 1784014336.0,
-            "11": 1784014336.0,
-            "12": 1784014336.0,
-            "13": 1784014336.0,
-            "14": 1784014336.0,
-            "15": 1784014336.0,
-            "16": 1784014336.0,
-            "17": 1784014336.0,
-            "18": 1784014336.0,
-            "19": 1784014336.0,
-            "20": 1784014336.0,
-            "21": 1784014336.0,
-            "22": 1784014336.0,
-            "23": 1784014336.0,
-            "24": 1784014336.0,
-            "25": 1784014336.0,
-            "26": 1784014336.0,
-            "27": 1784014336.0,
-            "28": 1784014336.0,
-            "29": 1784014336.0,
-            "30": 1784014336.0,
-            "31": 1784014336.0,
-            "32": 1784014336.0,
-            "33": 1784014336.0,
-            "34": 1784014336.0,
-            "35": 1784014336.0,
-            "36": 1784014336.0,
-            "37": 1784014336.0,
-            "38": 1784014336.0,
-            "39": 1784014336.0,
-            "40": 1784014336.0,
-            "41": 1784014336.0,
-            "42": 1784014336.0,
-            "43": 1784014336.0,
-            "44": 1784014336.0,
-            "45": 1784014336.0,
-            "46": 1784014336.0,
-            "47": 1784014336.0,
-            "48": 1784014336.0,
-            "49": 1784014336.0,
-            "50": 1784014336.0,
-            "51": 1784014336.0,
-            "52": 1784014336.0,
-            "53": 1784014336.0,
-            "54": 1784014336.0,
-            "55": 1784014336.0,
-            "56": 1784014336.0,
-            "57": 1784014336.0,
-            "58": 1784014336.0,
-            "59": 1784014336.0,
-            "60": 1784014336.0,
-            "61": 1784014336.0,
-            "62": 1784014336.0,
-            "63": 1784014336.0,
-            "64": 1784014336.0,
-            "65": 1784014336.0,
-            "66": 1784014336.0,
-            "67": 1784014336.0,
-            "68": 1784014336.0,
-            "69": 1784014336.0,
-            "70": 1784014336.0,
-            "71": 1784014336.0,
-            "72": 1784014336.0,
-            "73": 1784014336.0,
-            "74": 1784014336.0,
-            "75": 1784014336.0,
-            "76": 1784014336.0,
-            "77": 1784014336.0,
-            "78": 1784014336.0,
-            "79": 1784014336.0,
-            "80": 1784014336.0,
-            "81": 1784014336.0,
-            "82": 1784014336.0,
-            "83": 1784014336.0,
-            "84": 1784014336.0,
-            "85": 1784014336.0,
-            "86": 1784014336.0,
-            "87": 1784014336.0,
-            "88": 1784014336.0,
-            "89": 1784014336.0,
-            "90": 1784014336.0,
-            "91": 1784014336.0,
-            "92": 1784014336.0,
-            "93": 1784014336.0,
-            "94": 1784014336.0,
-            "95": 1784014336.0,
-            "96": 1784014336.0,
-            "97": 1784014336.0,
-            "98": 1784014336.0,
-            "99": 1784014336.0,
-            "100": 1784014336.0
+            "1": 1785063936.0,
+            "2": 1785063936.0,
+            "3": 1785063936.0,
+            "4": 1785063936.0,
+            "5": 1785063936.0,
+            "6": 1785063936.0,
+            "7": 1785063936.0,
+            "8": 1785063936.0,
+            "9": 1785063936.0,
+            "10": 1785063936.0,
+            "11": 1785063936.0,
+            "12": 1785063936.0,
+            "13": 1785063936.0,
+            "14": 1785063936.0,
+            "15": 1785063936.0,
+            "16": 1785063936.0,
+            "17": 1785063936.0,
+            "18": 1785063936.0,
+            "19": 1785063936.0,
+            "20": 1785063936.0,
+            "21": 1785063936.0,
+            "22": 1785063936.0,
+            "23": 1785063936.0,
+            "24": 1785063936.0,
+            "25": 1785063936.0,
+            "26": 1785063936.0,
+            "27": 1785063936.0,
+            "28": 1785588224.0,
+            "29": 1785063936.0,
+            "30": 1785063936.0,
+            "31": 1785063936.0,
+            "32": 1785063936.0,
+            "33": 1785063936.0,
+            "34": 1785063936.0,
+            "35": 1785063936.0,
+            "36": 1785063936.0,
+            "37": 1785063936.0,
+            "38": 1785063936.0,
+            "39": 1785063936.0,
+            "40": 1785063936.0,
+            "41": 1785063936.0,
+            "42": 1785063936.0,
+            "43": 1785063936.0,
+            "44": 1785063936.0,
+            "45": 1785063936.0,
+            "46": 1785063936.0,
+            "47": 1785063936.0,
+            "48": 1785063936.0,
+            "49": 1785063936.0,
+            "50": 1785063936.0,
+            "51": 1785063936.0,
+            "52": 1785063936.0,
+            "53": 1785063936.0,
+            "54": 1785063936.0,
+            "55": 1785063936.0,
+            "56": 1785063936.0,
+            "57": 1785063936.0,
+            "58": 1785063936.0,
+            "59": 1785063936.0,
+            "60": 1785063936.0,
+            "61": 1785063936.0,
+            "62": 1785063936.0,
+            "63": 1785063936.0,
+            "64": 1785063936.0,
+            "65": 1785063936.0,
+            "66": 1785063936.0,
+            "67": 1785063936.0,
+            "68": 1785063936.0,
+            "69": 1785063936.0,
+            "70": 1785063936.0,
+            "71": 1785063936.0,
+            "72": 1785063936.0,
+            "73": 1785063936.0,
+            "74": 1785063936.0,
+            "75": 1785063936.0,
+            "76": 1785063936.0,
+            "77": 1785063936.0,
+            "78": 1785063936.0,
+            "79": 1785063936.0,
+            "80": 1785063936.0,
+            "81": 1785063936.0,
+            "82": 1785063936.0,
+            "83": 1785063936.0,
+            "84": 1785063936.0,
+            "85": 1785063936.0,
+            "86": 1785063936.0,
+            "87": 1785063936.0,
+            "88": 1785063936.0,
+            "89": 1785063936.0,
+            "90": 1785063936.0,
+            "91": 1785063936.0,
+            "92": 1785063936.0,
+            "93": 1785063936.0,
+            "94": 1785063936.0,
+            "95": 1785063936.0,
+            "96": 1785063936.0,
+            "97": 1785063936.0,
+            "98": 1785063936.0,
+            "99": 1785063936.0,
+            "100": 1785063936.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2365860864.0,
-            "2": 3108323328.0,
-            "3": 3108323328.0,
-            "4": 3108323328.0,
-            "5": 3108323328.0,
-            "6": 3108323328.0,
-            "7": 3108323328.0,
-            "8": 3108323328.0,
-            "9": 3108323328.0,
-            "10": 3108323328.0,
-            "11": 3108323328.0,
-            "12": 3108323328.0,
-            "13": 3108323328.0,
-            "14": 3108323328.0,
-            "15": 3108323328.0,
-            "16": 3108323328.0,
-            "17": 3108323328.0,
-            "18": 3108323328.0,
-            "19": 3108323328.0,
-            "20": 3108323328.0,
-            "21": 3108323328.0,
-            "22": 3108323328.0,
-            "23": 3108323328.0,
-            "24": 3108323328.0,
-            "25": 3108323328.0,
-            "26": 3108323328.0,
-            "27": 3108323328.0,
-            "28": 3108323328.0,
-            "29": 3108323328.0,
-            "30": 3108323328.0,
-            "31": 3108323328.0,
-            "32": 3108323328.0,
-            "33": 3108323328.0,
-            "34": 3108323328.0,
-            "35": 3108323328.0,
-            "36": 3108323328.0,
-            "37": 3108323328.0,
-            "38": 3108323328.0,
-            "39": 3108323328.0,
-            "40": 3108323328.0,
-            "41": 3108323328.0,
-            "42": 3108323328.0,
-            "43": 3108323328.0,
-            "44": 3108323328.0,
-            "45": 3108323328.0,
-            "46": 3108323328.0,
-            "47": 3108323328.0,
-            "48": 3108323328.0,
-            "49": 3108323328.0,
-            "50": 3108323328.0,
-            "51": 3108323328.0,
-            "52": 3108323328.0,
-            "53": 3108323328.0,
-            "54": 3108323328.0,
-            "55": 3108323328.0,
-            "56": 3108323328.0,
-            "57": 3108842496.0,
-            "58": 3108842496.0,
-            "59": 3108842496.0,
-            "60": 3108842496.0,
-            "61": 3108842496.0,
-            "62": 3108842496.0,
-            "63": 3108842496.0,
-            "64": 3108842496.0,
-            "65": 3108842496.0,
-            "66": 3108842496.0,
-            "67": 3108842496.0,
-            "68": 3108842496.0,
-            "69": 3108842496.0,
-            "70": 3108842496.0,
-            "71": 3108842496.0,
-            "72": 3108842496.0,
-            "73": 3108842496.0,
-            "74": 3108842496.0,
-            "75": 3108844544.0,
-            "76": 3108844544.0,
-            "77": 3108844544.0,
-            "78": 3108844544.0,
-            "79": 3108844544.0,
-            "80": 3108844544.0,
-            "81": 3108844544.0,
-            "82": 3108844544.0,
-            "83": 3108844544.0,
-            "84": 3108844544.0,
-            "85": 3108844544.0,
-            "86": 3108844544.0,
-            "87": 3108844544.0,
-            "88": 3108844544.0,
-            "89": 3108844544.0,
-            "90": 3108844544.0,
-            "91": 3108844544.0,
-            "92": 3108844544.0,
-            "93": 3108844544.0,
-            "94": 3108844544.0,
-            "95": 3108844544.0,
-            "96": 3108844544.0,
-            "97": 3108844544.0,
-            "98": 3108844544.0,
-            "99": 3108844544.0,
-            "100": 3108844544.0
+            "1": 2366910464.0,
+            "2": 3109372928.0,
+            "3": 3109372928.0,
+            "4": 3109372928.0,
+            "5": 3109372928.0,
+            "6": 3109892608.0,
+            "7": 3109892608.0,
+            "8": 3111465472.0,
+            "9": 3111465472.0,
+            "10": 3111465472.0,
+            "11": 3111465472.0,
+            "12": 3111470080.0,
+            "13": 3111470080.0,
+            "14": 3111470080.0,
+            "15": 3111470080.0,
+            "16": 3111470080.0,
+            "17": 3111470080.0,
+            "18": 3111470080.0,
+            "19": 3111470080.0,
+            "20": 3111470080.0,
+            "21": 3111470080.0,
+            "22": 3111470080.0,
+            "23": 3111470080.0,
+            "24": 3111470080.0,
+            "25": 3111470080.0,
+            "26": 3111470080.0,
+            "27": 3111470080.0,
+            "28": 3111470080.0,
+            "29": 3111470080.0,
+            "30": 3111470080.0,
+            "31": 3111470080.0,
+            "32": 3111470080.0,
+            "33": 3111470080.0,
+            "34": 3111470080.0,
+            "35": 3111470080.0,
+            "36": 3111988224.0,
+            "37": 3111988224.0,
+            "38": 3111988224.0,
+            "39": 3111988224.0,
+            "40": 3111988224.0,
+            "41": 3111988224.0,
+            "42": 3111988224.0,
+            "43": 3111988224.0,
+            "44": 3111988224.0,
+            "45": 3111988224.0,
+            "46": 3111988224.0,
+            "47": 3111988224.0,
+            "48": 3111988224.0,
+            "49": 3111988224.0,
+            "50": 3111988224.0,
+            "51": 3111988224.0,
+            "52": 3111988224.0,
+            "53": 3111988224.0,
+            "54": 3111988224.0,
+            "55": 3111988224.0,
+            "56": 3111988224.0,
+            "57": 3111988224.0,
+            "58": 3111988224.0,
+            "59": 3111988224.0,
+            "60": 3111988224.0,
+            "61": 3111988224.0,
+            "62": 3111988224.0,
+            "63": 3111988224.0,
+            "64": 3111988224.0,
+            "65": 3111988224.0,
+            "66": 3111988224.0,
+            "67": 3111988224.0,
+            "68": 3111988224.0,
+            "69": 3111988224.0,
+            "70": 3111988224.0,
+            "71": 3111988224.0,
+            "72": 3111988224.0,
+            "73": 3111988224.0,
+            "74": 3111988224.0,
+            "75": 3111988224.0,
+            "76": 3111988224.0,
+            "77": 3111988224.0,
+            "78": 3111988224.0,
+            "79": 3111988224.0,
+            "80": 3111988224.0,
+            "81": 3111988224.0,
+            "82": 3111988224.0,
+            "83": 3111988224.0,
+            "84": 3111988224.0,
+            "85": 3111988224.0,
+            "86": 3111988224.0,
+            "87": 3111988224.0,
+            "88": 3111988224.0,
+            "89": 3111988224.0,
+            "90": 3111988224.0,
+            "91": 3111988224.0,
+            "92": 3111988224.0,
+            "93": 3111988224.0,
+            "94": 3111988224.0,
+            "95": 3111988224.0,
+            "96": 3111988224.0,
+            "97": 3111988224.0,
+            "98": 3111988224.0,
+            "99": 3111988224.0,
+            "100": 3111988224.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.84806,
-            "2": 1.03522,
-            "3": 1.00793,
-            "4": 1.00939,
-            "5": 1.00929,
-            "6": 1.01517,
-            "7": 1.01009,
-            "8": 1.01561,
-            "9": 1.02131,
-            "10": 1.01787,
-            "11": 1.01149,
-            "12": 1.0128,
-            "13": 1.01358,
-            "14": 1.01768,
-            "15": 1.23565,
-            "16": 1.01096,
-            "17": 1.19479,
-            "18": 1.01674,
-            "19": 1.01808,
-            "20": 1.23016,
-            "21": 1.01908,
-            "22": 1.11536,
-            "23": 1.0888,
-            "24": 1.02965,
-            "25": 1.03972,
-            "26": 1.00766,
-            "27": 1.00981,
-            "28": 1.01339,
-            "29": 1.01801,
-            "30": 1.01655,
-            "31": 1.01796,
-            "32": 1.01286,
-            "33": 1.01823,
-            "34": 1.00604,
-            "35": 1.01493,
-            "36": 1.01106,
-            "37": 1.00783,
-            "38": 1.01573,
-            "39": 1.01525,
-            "40": 1.09842,
-            "41": 1.39919,
-            "42": 1.22658,
-            "43": 1.00841,
-            "44": 0.99932,
-            "45": 1.00156,
-            "46": 1.18473,
-            "47": 1.01528,
-            "48": 1.00768,
-            "49": 1.00498,
-            "50": 0.9957,
-            "51": 1.29149,
-            "52": 1.10051,
-            "53": 1.00264,
-            "54": 1.00531,
-            "55": 1.30558,
-            "56": 0.99836,
-            "57": 1.00645,
-            "58": 1.00413,
-            "59": 1.00106,
-            "60": 1.00076,
-            "61": 1.32205,
-            "62": 1.00795,
-            "63": 1.2523,
-            "64": 1.01369,
-            "65": 1.01151,
-            "66": 1.01484,
-            "67": 1.00831,
-            "68": 1.01849,
-            "69": 1.01821,
-            "70": 1.01316,
-            "71": 1.01068,
-            "72": 1.01792,
-            "73": 1.47417,
-            "74": 1.01143,
-            "75": 1.14077,
-            "76": 1.01286,
-            "77": 1.08819,
-            "78": 1.01005,
-            "79": 1.0069,
-            "80": 1.01196,
-            "81": 1.0882,
-            "82": 1.00417,
-            "83": 1.29479,
-            "84": 1.0044,
-            "85": 1.0103,
-            "86": 1.00862,
-            "87": 1.01863,
-            "88": 1.2549,
-            "89": 1.0075,
-            "90": 1.00874,
-            "91": 1.0111,
-            "92": 1.01049,
-            "93": 1.01084,
-            "94": 1.01043,
-            "95": 1.01246,
-            "96": 1.01317,
-            "97": 1.09821,
-            "98": 1.01406,
-            "99": 1.00578,
-            "100": 1.09442
+            "1": 11.18542,
+            "2": 0.99156,
+            "3": 0.93327,
+            "4": 0.90681,
+            "5": 0.90504,
+            "6": 0.90415,
+            "7": 0.90281,
+            "8": 1.14692,
+            "9": 1.44306,
+            "10": 0.89873,
+            "11": 0.90113,
+            "12": 0.89984,
+            "13": 1.24688,
+            "14": 0.90399,
+            "15": 0.90327,
+            "16": 0.89945,
+            "17": 0.90194,
+            "18": 0.89984,
+            "19": 0.89878,
+            "20": 0.89865,
+            "21": 0.90167,
+            "22": 0.90176,
+            "23": 0.90423,
+            "24": 2.02738,
+            "25": 0.90411,
+            "26": 0.90354,
+            "27": 0.90203,
+            "28": 1.26668,
+            "29": 0.89854,
+            "30": 1.45828,
+            "31": 0.90574,
+            "32": 0.90137,
+            "33": 1.70784,
+            "34": 0.89924,
+            "35": 0.90059,
+            "36": 0.90525,
+            "37": 0.90801,
+            "38": 0.90691,
+            "39": 0.9048,
+            "40": 1.47233,
+            "41": 0.91116,
+            "42": 1.22468,
+            "43": 1.0011,
+            "44": 1.22804,
+            "45": 1.12037,
+            "46": 1.00115,
+            "47": 0.91003,
+            "48": 0.91208,
+            "49": 0.91545,
+            "50": 0.91,
+            "51": 0.91471,
+            "52": 0.91238,
+            "53": 0.90865,
+            "54": 0.91588,
+            "55": 0.91889,
+            "56": 0.91882,
+            "57": 0.92072,
+            "58": 0.9202,
+            "59": 0.92355,
+            "60": 0.92097,
+            "61": 0.91924,
+            "62": 0.91496,
+            "63": 0.91648,
+            "64": 0.91615,
+            "65": 0.91333,
+            "66": 0.91743,
+            "67": 0.9094,
+            "68": 0.91122,
+            "69": 0.90894,
+            "70": 0.91968,
+            "71": 0.92199,
+            "72": 0.91976,
+            "73": 0.92156,
+            "74": 0.91995,
+            "75": 0.90852,
+            "76": 0.90983,
+            "77": 1.19595,
+            "78": 0.9092,
+            "79": 1.16564,
+            "80": 1.06882,
+            "81": 0.90637,
+            "82": 0.90812,
+            "83": 0.91,
+            "84": 0.90847,
+            "85": 0.88526,
+            "86": 0.87691,
+            "87": 0.88881,
+            "88": 0.87995,
+            "89": 0.9042,
+            "90": 0.90269,
+            "91": 0.90587,
+            "92": 0.90035,
+            "93": 0.89985,
+            "94": 0.90093,
+            "95": 0.90088,
+            "96": 0.89612,
+            "97": 0.89401,
+            "98": 0.89773,
+            "99": 0.90081,
+            "100": 0.8988
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..a5b9c2f1ab2
--- /dev/null
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.71492,
+            "52": 9.66464,
+            "53": 9.60912,
+            "54": 9.62726,
+            "55": 9.6101,
+            "56": 9.61721,
+            "57": 9.56794,
+            "58": 9.52741,
+            "59": 9.51674,
+            "60": 9.51863,
+            "61": 9.53132,
+            "62": 9.45018,
+            "63": 9.4572,
+            "64": 9.43437,
+            "65": 9.45816,
+            "66": 9.43669,
+            "67": 9.39678,
+            "68": 9.36478,
+            "69": 9.40956,
+            "70": 9.37595,
+            "71": 9.41738,
+            "72": 9.42564,
+            "73": 9.37611,
+            "74": 9.41543,
+            "75": 9.3788,
+            "76": 9.28012,
+            "77": 9.32212,
+            "78": 9.35744,
+            "79": 9.3215,
+            "80": 9.31497,
+            "81": 9.26785,
+            "82": 9.34183,
+            "83": 9.32151,
+            "84": 9.24796,
+            "85": 9.35033,
+            "86": 9.224,
+            "87": 9.30611,
+            "88": 9.29894,
+            "89": 9.22704,
+            "90": 9.28479,
+            "91": 9.2311,
+            "92": 9.27474,
+            "93": 9.19219,
+            "94": 9.23969,
+            "95": 9.28,
+            "96": 9.17525,
+            "97": 9.21888,
+            "98": 9.1721,
+            "99": 9.16455,
+            "100": 9.1482
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1961.0,
+            "52": 2445.0,
+            "53": 3654.0,
+            "54": 3489.0,
+            "55": 3419.0,
+            "56": 4364.0,
+            "57": 4145.0,
+            "58": 4155.0,
+            "59": 1699.0,
+            "60": 2358.0,
+            "61": 2070.0,
+            "62": 4094.0,
+            "63": 3516.0,
+            "64": 4287.0,
+            "65": 2891.0,
+            "66": 1733.0,
+            "67": 1914.0,
+            "68": 4420.0,
+            "69": 4479.0,
+            "70": 4656.0,
+            "71": 2135.0,
+            "72": 4476.0,
+            "73": 4048.0,
+            "74": 3199.0,
+            "75": 4735.0,
+            "76": 2218.0,
+            "77": 4952.0,
+            "78": 4158.0,
+            "79": 2657.0,
+            "80": 3846.0,
+            "81": 3472.0,
+            "82": 2979.0,
+            "83": 5364.0,
+            "84": 4430.0,
+            "85": 4249.0,
+            "86": 3509.0,
+            "87": 4817.0,
+            "88": 3434.0,
+            "89": 4711.0,
+            "90": 4448.0,
+            "91": 4374.0,
+            "92": 3507.0,
+            "93": 5549.0,
+            "94": 3635.0,
+            "95": 4540.0,
+            "96": 3659.0,
+            "97": 3756.0,
+            "98": 4513.0,
+            "99": 4491.0,
+            "100": 3445.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1786112512.0,
+            "52": 1786112512.0,
+            "53": 1786112512.0,
+            "54": 1786112512.0,
+            "55": 1786112512.0,
+            "56": 1786112512.0,
+            "57": 1786112512.0,
+            "58": 1786112512.0,
+            "59": 1786112512.0,
+            "60": 1786112512.0,
+            "61": 1786112512.0,
+            "62": 1786112512.0,
+            "63": 1786112512.0,
+            "64": 1786112512.0,
+            "65": 1786112512.0,
+            "66": 1786112512.0,
+            "67": 1786112512.0,
+            "68": 1786112512.0,
+            "69": 1786112512.0,
+            "70": 1786112512.0,
+            "71": 1786112512.0,
+            "72": 1786112512.0,
+            "73": 1786112512.0,
+            "74": 1786112512.0,
+            "75": 1786112512.0,
+            "76": 1786112512.0,
+            "77": 1786112512.0,
+            "78": 1786112512.0,
+            "79": 1786112512.0,
+            "80": 1786112512.0,
+            "81": 1786112512.0,
+            "82": 1786112512.0,
+            "83": 1786112512.0,
+            "84": 1786112512.0,
+            "85": 1786112512.0,
+            "86": 1786112512.0,
+            "87": 1786112512.0,
+            "88": 1786112512.0,
+            "89": 1786112512.0,
+            "90": 1786112512.0,
+            "91": 1786112512.0,
+            "92": 1786112512.0,
+            "93": 1786112512.0,
+            "94": 1786112512.0,
+            "95": 1786112512.0,
+            "96": 1786112512.0,
+            "97": 1786112512.0,
+            "98": 1786112512.0,
+            "99": 1786112512.0,
+            "100": 1786112512.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3110419456.0,
+            "52": 3110421504.0,
+            "53": 3110421504.0,
+            "54": 3110421504.0,
+            "55": 3110421504.0,
+            "56": 3110421504.0,
+            "57": 3110421504.0,
+            "58": 3110421504.0,
+            "59": 3110421504.0,
+            "60": 3110421504.0,
+            "61": 3110421504.0,
+            "62": 3110421504.0,
+            "63": 3110421504.0,
+            "64": 3110421504.0,
+            "65": 3110421504.0,
+            "66": 3110421504.0,
+            "67": 3110421504.0,
+            "68": 3110421504.0,
+            "69": 3110421504.0,
+            "70": 3110421504.0,
+            "71": 3110421504.0,
+            "72": 3110421504.0,
+            "73": 3110421504.0,
+            "74": 3110421504.0,
+            "75": 3110421504.0,
+            "76": 3110421504.0,
+            "77": 3110421504.0,
+            "78": 3110421504.0,
+            "79": 3110421504.0,
+            "80": 3110421504.0,
+            "81": 3110421504.0,
+            "82": 3110421504.0,
+            "83": 3110421504.0,
+            "84": 3110421504.0,
+            "85": 3110421504.0,
+            "86": 3110421504.0,
+            "87": 3110421504.0,
+            "88": 3110421504.0,
+            "89": 3110421504.0,
+            "90": 3110421504.0,
+            "91": 3110421504.0,
+            "92": 3110421504.0,
+            "93": 3110421504.0,
+            "94": 3110421504.0,
+            "95": 3110421504.0,
+            "96": 3110421504.0,
+            "97": 3110421504.0,
+            "98": 3110421504.0,
+            "99": 3110421504.0,
+            "100": 3110421504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.75043,
+            "52": 1.0039,
+            "53": 0.95516,
+            "54": 0.91159,
+            "55": 0.90836,
+            "56": 0.94785,
+            "57": 1.16936,
+            "58": 1.19663,
+            "59": 1.28755,
+            "60": 0.88429,
+            "61": 0.8835,
+            "62": 0.91894,
+            "63": 0.88317,
+            "64": 0.89119,
+            "65": 0.88844,
+            "66": 1.26569,
+            "67": 0.88764,
+            "68": 0.88401,
+            "69": 0.89243,
+            "70": 0.8883,
+            "71": 0.89113,
+            "72": 0.91101,
+            "73": 0.89072,
+            "74": 2.04797,
+            "75": 0.90184,
+            "76": 0.93408,
+            "77": 1.2869,
+            "78": 0.95072,
+            "79": 0.96458,
+            "80": 0.90559,
+            "81": 0.95787,
+            "82": 0.90855,
+            "83": 1.71942,
+            "84": 0.94521,
+            "85": 0.88307,
+            "86": 0.88152,
+            "87": 0.89039,
+            "88": 0.88803,
+            "89": 0.90894,
+            "90": 0.89894,
+            "91": 1.05886,
+            "92": 1.19588,
+            "93": 1.37335,
+            "94": 0.8898,
+            "95": 1.07004,
+            "96": 0.88806,
+            "97": 0.89083,
+            "98": 0.90547,
+            "99": 0.94317,
+            "100": 0.90081
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json
index 27a34e32198..fe766022589 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json
@@ -44,66 +44,66 @@
             "38": 10.07257,
             "39": 10.0013,
             "40": 9.9816,
-            "41": 9.92549,
-            "42": 9.87529,
-            "43": 9.88742,
-            "44": 9.80641,
-            "45": 9.82342,
-            "46": 9.73815,
-            "47": 9.74831,
-            "48": 9.71619,
-            "49": 9.74504,
+            "41": 9.92551,
+            "42": 9.87537,
+            "43": 9.88725,
+            "44": 9.80659,
+            "45": 9.82349,
+            "46": 9.73821,
+            "47": 9.74829,
+            "48": 9.71628,
+            "49": 9.74489,
             "50": 9.73004,
-            "51": 9.71503,
-            "52": 9.66484,
-            "53": 9.60935,
-            "54": 9.62735,
-            "55": 9.61036,
-            "56": 9.61745,
+            "51": 9.71501,
+            "52": 9.66488,
+            "53": 9.60917,
+            "54": 9.62733,
+            "55": 9.61022,
+            "56": 9.61723,
             "57": 9.56794,
-            "58": 9.52742,
-            "59": 9.51685,
-            "60": 9.51873,
-            "61": 9.53147,
-            "62": 9.45024,
-            "63": 9.45733,
-            "64": 9.43455,
-            "65": 9.4582,
-            "66": 9.43694,
-            "67": 9.39693,
-            "68": 9.36491,
-            "69": 9.40957,
-            "70": 9.37605,
-            "71": 9.41735,
-            "72": 9.42581,
-            "73": 9.37614,
-            "74": 9.41544,
-            "75": 9.37897,
-            "76": 9.28015,
-            "77": 9.32215,
-            "78": 9.35752,
-            "79": 9.32154,
-            "80": 9.31496,
-            "81": 9.26776,
-            "82": 9.34189,
-            "83": 9.32163,
-            "84": 9.24791,
-            "85": 9.35021,
-            "86": 9.22383,
-            "87": 9.30627,
-            "88": 9.29884,
+            "58": 9.52733,
+            "59": 9.51677,
+            "60": 9.5188,
+            "61": 9.53149,
+            "62": 9.45031,
+            "63": 9.45717,
+            "64": 9.43441,
+            "65": 9.45812,
+            "66": 9.43672,
+            "67": 9.39687,
+            "68": 9.36469,
+            "69": 9.40964,
+            "70": 9.37606,
+            "71": 9.41737,
+            "72": 9.42585,
+            "73": 9.37601,
+            "74": 9.4154,
+            "75": 9.37896,
+            "76": 9.28004,
+            "77": 9.32212,
+            "78": 9.35755,
+            "79": 9.3216,
+            "80": 9.31491,
+            "81": 9.26783,
+            "82": 9.342,
+            "83": 9.32159,
+            "84": 9.24786,
+            "85": 9.35018,
+            "86": 9.22384,
+            "87": 9.30618,
+            "88": 9.29905,
             "89": 9.22708,
-            "90": 9.28475,
-            "91": 9.23116,
-            "92": 9.27477,
-            "93": 9.1922,
-            "94": 9.23984,
-            "95": 9.27996,
-            "96": 9.17534,
-            "97": 9.21892,
-            "98": 9.1719,
-            "99": 9.1646,
-            "100": 9.14809
+            "90": 9.28498,
+            "91": 9.23123,
+            "92": 9.27487,
+            "93": 9.19233,
+            "94": 9.23985,
+            "95": 9.28002,
+            "96": 9.17532,
+            "97": 9.21898,
+            "98": 9.17203,
+            "99": 9.16444,
+            "100": 9.14821
         }
     },
     "num-zeros": {
@@ -150,67 +150,67 @@
             "37": 3305.0,
             "38": 2682.0,
             "39": 2805.0,
-            "40": 3425.0,
-            "41": 1812.0,
-            "42": 1481.0,
-            "43": 1726.0,
-            "44": 2575.0,
-            "45": 3438.0,
-            "46": 2960.0,
-            "47": 2792.0,
-            "48": 3107.0,
-            "49": 2854.0,
-            "50": 2145.0,
-            "51": 1964.0,
-            "52": 2437.0,
-            "53": 3823.0,
-            "54": 3427.0,
-            "55": 3392.0,
-            "56": 4421.0,
-            "57": 4003.0,
-            "58": 4224.0,
-            "59": 1816.0,
-            "60": 2520.0,
-            "61": 2106.0,
-            "62": 4011.0,
-            "63": 3637.0,
-            "64": 4375.0,
-            "65": 3080.0,
-            "66": 1753.0,
-            "67": 1913.0,
-            "68": 4407.0,
-            "69": 4475.0,
-            "70": 4419.0,
-            "71": 2152.0,
-            "72": 4399.0,
-            "73": 4134.0,
-            "74": 3315.0,
-            "75": 4815.0,
-            "76": 2322.0,
-            "77": 5019.0,
-            "78": 4171.0,
-            "79": 2788.0,
-            "80": 3831.0,
-            "81": 3411.0,
-            "82": 3004.0,
-            "83": 5145.0,
-            "84": 4399.0,
-            "85": 4295.0,
-            "86": 3410.0,
-            "87": 4880.0,
-            "88": 3350.0,
-            "89": 4659.0,
-            "90": 4370.0,
-            "91": 4273.0,
-            "92": 3325.0,
-            "93": 5509.0,
-            "94": 3804.0,
-            "95": 4711.0,
-            "96": 3631.0,
-            "97": 3774.0,
-            "98": 4477.0,
-            "99": 4459.0,
-            "100": 3220.0
+            "40": 3430.0,
+            "41": 1767.0,
+            "42": 1516.0,
+            "43": 1798.0,
+            "44": 2790.0,
+            "45": 3578.0,
+            "46": 3016.0,
+            "47": 2890.0,
+            "48": 3065.0,
+            "49": 2914.0,
+            "50": 2208.0,
+            "51": 1900.0,
+            "52": 2483.0,
+            "53": 3763.0,
+            "54": 3478.0,
+            "55": 3412.0,
+            "56": 4400.0,
+            "57": 4019.0,
+            "58": 4253.0,
+            "59": 1805.0,
+            "60": 2457.0,
+            "61": 2045.0,
+            "62": 3994.0,
+            "63": 3650.0,
+            "64": 4466.0,
+            "65": 2968.0,
+            "66": 1837.0,
+            "67": 1961.0,
+            "68": 4347.0,
+            "69": 4441.0,
+            "70": 4452.0,
+            "71": 2131.0,
+            "72": 4523.0,
+            "73": 4105.0,
+            "74": 3300.0,
+            "75": 4651.0,
+            "76": 2216.0,
+            "77": 4932.0,
+            "78": 4218.0,
+            "79": 2784.0,
+            "80": 3824.0,
+            "81": 3472.0,
+            "82": 2976.0,
+            "83": 5282.0,
+            "84": 4464.0,
+            "85": 4344.0,
+            "86": 3460.0,
+            "87": 4774.0,
+            "88": 3426.0,
+            "89": 4600.0,
+            "90": 4360.0,
+            "91": 4283.0,
+            "92": 3362.0,
+            "93": 5633.0,
+            "94": 3676.0,
+            "95": 4610.0,
+            "96": 3449.0,
+            "97": 3751.0,
+            "98": 4524.0,
+            "99": 4399.0,
+            "100": 3295.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1767237120.0,
-            "2": 1767237120.0,
-            "3": 1767237120.0,
-            "4": 1767237120.0,
-            "5": 1767237120.0,
-            "6": 1767237120.0,
-            "7": 1767237120.0,
-            "8": 1767237120.0,
-            "9": 1767237120.0,
-            "10": 1767237120.0,
-            "11": 1767237120.0,
-            "12": 1767237120.0,
-            "13": 1767237120.0,
-            "14": 1767237120.0,
-            "15": 1767237120.0,
-            "16": 1767237120.0,
-            "17": 1767237120.0,
-            "18": 1767237120.0,
-            "19": 1767237120.0,
-            "20": 1767237120.0,
-            "21": 1767237120.0,
-            "22": 1767237120.0,
-            "23": 1767237120.0,
-            "24": 1767237120.0,
-            "25": 1767237120.0,
-            "26": 1767237120.0,
-            "27": 1767237120.0,
-            "28": 1767237120.0,
-            "29": 1767237120.0,
-            "30": 1767237120.0,
-            "31": 1767237120.0,
-            "32": 1767237120.0,
-            "33": 1767237120.0,
-            "34": 1767237120.0,
-            "35": 1767237120.0,
-            "36": 1767237120.0,
-            "37": 1767237120.0,
-            "38": 1767237120.0,
-            "39": 1767237120.0,
-            "40": 1767237120.0,
-            "41": 1767237120.0,
-            "42": 1767237120.0,
-            "43": 1767237120.0,
-            "44": 1767237120.0,
-            "45": 1767237120.0,
-            "46": 1767237120.0,
-            "47": 1767237120.0,
-            "48": 1767237120.0,
-            "49": 1767237120.0,
-            "50": 1767237120.0,
-            "51": 1767237120.0,
-            "52": 1767237120.0,
-            "53": 1767237120.0,
-            "54": 1767237120.0,
-            "55": 1767237120.0,
-            "56": 1767237120.0,
-            "57": 1767237120.0,
-            "58": 1767237120.0,
-            "59": 1767237120.0,
-            "60": 1767237120.0,
-            "61": 1767237120.0,
-            "62": 1767237120.0,
-            "63": 1767237120.0,
-            "64": 1767237120.0,
-            "65": 1767237120.0,
-            "66": 1767237120.0,
-            "67": 1767237120.0,
-            "68": 1767237120.0,
-            "69": 1767237120.0,
-            "70": 1767237120.0,
-            "71": 1767237120.0,
-            "72": 1767237120.0,
-            "73": 1767237120.0,
-            "74": 1767237120.0,
-            "75": 1767237120.0,
-            "76": 1767237120.0,
-            "77": 1767237120.0,
-            "78": 1767237120.0,
-            "79": 1767237120.0,
-            "80": 1767237120.0,
-            "81": 1767237120.0,
-            "82": 1767237120.0,
-            "83": 1767237120.0,
-            "84": 1767237120.0,
-            "85": 1767237120.0,
-            "86": 1767237120.0,
-            "87": 1767237120.0,
-            "88": 1767237120.0,
-            "89": 1767237120.0,
-            "90": 1767237120.0,
-            "91": 1767237120.0,
-            "92": 1767237120.0,
-            "93": 1767237120.0,
-            "94": 1767237120.0,
-            "95": 1767237120.0,
-            "96": 1767237120.0,
-            "97": 1767237120.0,
-            "98": 1767237120.0,
-            "99": 1767237120.0,
-            "100": 1767237120.0
+            "1": 1768285696.0,
+            "2": 1768285696.0,
+            "3": 1768285696.0,
+            "4": 1768285696.0,
+            "5": 1768285696.0,
+            "6": 1768285696.0,
+            "7": 1768285696.0,
+            "8": 1768285696.0,
+            "9": 1768285696.0,
+            "10": 1768285696.0,
+            "11": 1768285696.0,
+            "12": 1768285696.0,
+            "13": 1768285696.0,
+            "14": 1768285696.0,
+            "15": 1768285696.0,
+            "16": 1768285696.0,
+            "17": 1768285696.0,
+            "18": 1768285696.0,
+            "19": 1768285696.0,
+            "20": 1768285696.0,
+            "21": 1768285696.0,
+            "22": 1768285696.0,
+            "23": 1768285696.0,
+            "24": 1768285696.0,
+            "25": 1768285696.0,
+            "26": 1768285696.0,
+            "27": 1768285696.0,
+            "28": 1768285696.0,
+            "29": 1768285696.0,
+            "30": 1768285696.0,
+            "31": 1768285696.0,
+            "32": 1768285696.0,
+            "33": 1768285696.0,
+            "34": 1768285696.0,
+            "35": 1768285696.0,
+            "36": 1768285696.0,
+            "37": 1768285696.0,
+            "38": 1768285696.0,
+            "39": 1768285696.0,
+            "40": 1768285696.0,
+            "41": 1768285696.0,
+            "42": 1768285696.0,
+            "43": 1768285696.0,
+            "44": 1768285696.0,
+            "45": 1768285696.0,
+            "46": 1768285696.0,
+            "47": 1768285696.0,
+            "48": 1768285696.0,
+            "49": 1768285696.0,
+            "50": 1768285696.0,
+            "51": 1768285696.0,
+            "52": 1768285696.0,
+            "53": 1768285696.0,
+            "54": 1768285696.0,
+            "55": 1768285696.0,
+            "56": 1768285696.0,
+            "57": 1768285696.0,
+            "58": 1768285696.0,
+            "59": 1768285696.0,
+            "60": 1768285696.0,
+            "61": 1768285696.0,
+            "62": 1768285696.0,
+            "63": 1768285696.0,
+            "64": 1768285696.0,
+            "65": 1768285696.0,
+            "66": 1768285696.0,
+            "67": 1768285696.0,
+            "68": 1768285696.0,
+            "69": 1768285696.0,
+            "70": 1768285696.0,
+            "71": 1768285696.0,
+            "72": 1768285696.0,
+            "73": 1768285696.0,
+            "74": 1769334272.0,
+            "75": 1768285696.0,
+            "76": 1768285696.0,
+            "77": 1768285696.0,
+            "78": 1768285696.0,
+            "79": 1768285696.0,
+            "80": 1768285696.0,
+            "81": 1768285696.0,
+            "82": 1768285696.0,
+            "83": 1768285696.0,
+            "84": 1768285696.0,
+            "85": 1768285696.0,
+            "86": 1768285696.0,
+            "87": 1768285696.0,
+            "88": 1768285696.0,
+            "89": 1768285696.0,
+            "90": 1768285696.0,
+            "91": 1768285696.0,
+            "92": 1768285696.0,
+            "93": 1768285696.0,
+            "94": 1768285696.0,
+            "95": 1768285696.0,
+            "96": 1768285696.0,
+            "97": 1768285696.0,
+            "98": 1768285696.0,
+            "99": 1768285696.0,
+            "100": 1768285696.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2336500736.0,
-            "2": 3079487488.0,
-            "3": 3079487488.0,
-            "4": 3079487488.0,
-            "5": 3079487488.0,
-            "6": 3079487488.0,
-            "7": 3079487488.0,
-            "8": 3079487488.0,
-            "9": 3079487488.0,
-            "10": 3079487488.0,
-            "11": 3079487488.0,
-            "12": 3079487488.0,
-            "13": 3079487488.0,
-            "14": 3079487488.0,
-            "15": 3079487488.0,
-            "16": 3079487488.0,
-            "17": 3079487488.0,
-            "18": 3079487488.0,
-            "19": 3079487488.0,
-            "20": 3079487488.0,
-            "21": 3079487488.0,
-            "22": 3079487488.0,
-            "23": 3079487488.0,
-            "24": 3079487488.0,
-            "25": 3079487488.0,
-            "26": 3079487488.0,
-            "27": 3079487488.0,
-            "28": 3079487488.0,
-            "29": 3079487488.0,
-            "30": 3079487488.0,
-            "31": 3079487488.0,
-            "32": 3079487488.0,
-            "33": 3079487488.0,
-            "34": 3079487488.0,
-            "35": 3079487488.0,
-            "36": 3079487488.0,
-            "37": 3079487488.0,
-            "38": 3079487488.0,
-            "39": 3079487488.0,
-            "40": 3079487488.0,
-            "41": 3079487488.0,
-            "42": 3079487488.0,
-            "43": 3079487488.0,
-            "44": 3079487488.0,
-            "45": 3079487488.0,
-            "46": 3079487488.0,
-            "47": 3079487488.0,
-            "48": 3079487488.0,
-            "49": 3079487488.0,
-            "50": 3079487488.0,
-            "51": 3079487488.0,
-            "52": 3079487488.0,
-            "53": 3079487488.0,
-            "54": 3079487488.0,
-            "55": 3079487488.0,
-            "56": 3079487488.0,
-            "57": 3079487488.0,
-            "58": 3079487488.0,
-            "59": 3079487488.0,
-            "60": 3079487488.0,
-            "61": 3079487488.0,
-            "62": 3079487488.0,
-            "63": 3079487488.0,
-            "64": 3079487488.0,
-            "65": 3079487488.0,
-            "66": 3079487488.0,
-            "67": 3079487488.0,
-            "68": 3079487488.0,
-            "69": 3079487488.0,
-            "70": 3079487488.0,
-            "71": 3079487488.0,
-            "72": 3079487488.0,
-            "73": 3079487488.0,
-            "74": 3079487488.0,
-            "75": 3079487488.0,
-            "76": 3079487488.0,
-            "77": 3079487488.0,
-            "78": 3079487488.0,
-            "79": 3079487488.0,
-            "80": 3079487488.0,
-            "81": 3079487488.0,
-            "82": 3079487488.0,
-            "83": 3079487488.0,
-            "84": 3079487488.0,
-            "85": 3079487488.0,
-            "86": 3079487488.0,
-            "87": 3079487488.0,
-            "88": 3079487488.0,
-            "89": 3079487488.0,
-            "90": 3079487488.0,
-            "91": 3079487488.0,
-            "92": 3079487488.0,
-            "93": 3079487488.0,
-            "94": 3079487488.0,
-            "95": 3079487488.0,
-            "96": 3079487488.0,
-            "97": 3079487488.0,
-            "98": 3079487488.0,
-            "99": 3079487488.0,
-            "100": 3079487488.0
+            "1": 2337549312.0,
+            "2": 3080536064.0,
+            "3": 3080536064.0,
+            "4": 3080536064.0,
+            "5": 3080536064.0,
+            "6": 3080536064.0,
+            "7": 3080536064.0,
+            "8": 3080536064.0,
+            "9": 3080536064.0,
+            "10": 3080536064.0,
+            "11": 3080536064.0,
+            "12": 3080536064.0,
+            "13": 3080536064.0,
+            "14": 3080536064.0,
+            "15": 3080536064.0,
+            "16": 3080536064.0,
+            "17": 3080536064.0,
+            "18": 3080536064.0,
+            "19": 3080536064.0,
+            "20": 3080536064.0,
+            "21": 3080536064.0,
+            "22": 3080536064.0,
+            "23": 3082107392.0,
+            "24": 3082107392.0,
+            "25": 3082107392.0,
+            "26": 3082107392.0,
+            "27": 3082107392.0,
+            "28": 3082107392.0,
+            "29": 3082107392.0,
+            "30": 3082107392.0,
+            "31": 3082107392.0,
+            "32": 3082107392.0,
+            "33": 3082107392.0,
+            "34": 3082107392.0,
+            "35": 3082107392.0,
+            "36": 3082107392.0,
+            "37": 3082107392.0,
+            "38": 3082107392.0,
+            "39": 3082107392.0,
+            "40": 3082107392.0,
+            "41": 3082107392.0,
+            "42": 3082107392.0,
+            "43": 3082107392.0,
+            "44": 3082107392.0,
+            "45": 3082107392.0,
+            "46": 3082107392.0,
+            "47": 3082107392.0,
+            "48": 3082107392.0,
+            "49": 3082107392.0,
+            "50": 3082107392.0,
+            "51": 3082107392.0,
+            "52": 3082107392.0,
+            "53": 3082107392.0,
+            "54": 3082107392.0,
+            "55": 3082107392.0,
+            "56": 3082107392.0,
+            "57": 3082107392.0,
+            "58": 3082107392.0,
+            "59": 3082107392.0,
+            "60": 3082107392.0,
+            "61": 3082107392.0,
+            "62": 3082107392.0,
+            "63": 3082107392.0,
+            "64": 3082107392.0,
+            "65": 3082107392.0,
+            "66": 3082107392.0,
+            "67": 3082107392.0,
+            "68": 3082107392.0,
+            "69": 3082107392.0,
+            "70": 3082107392.0,
+            "71": 3082107392.0,
+            "72": 3082107392.0,
+            "73": 3082107392.0,
+            "74": 3082108928.0,
+            "75": 3082108928.0,
+            "76": 3082108928.0,
+            "77": 3082108928.0,
+            "78": 3082108928.0,
+            "79": 3082108928.0,
+            "80": 3082108928.0,
+            "81": 3082108928.0,
+            "82": 3082108928.0,
+            "83": 3082108928.0,
+            "84": 3082108928.0,
+            "85": 3082108928.0,
+            "86": 3082108928.0,
+            "87": 3082108928.0,
+            "88": 3082108928.0,
+            "89": 3082108928.0,
+            "90": 3082108928.0,
+            "91": 3082108928.0,
+            "92": 3082108928.0,
+            "93": 3082108928.0,
+            "94": 3082108928.0,
+            "95": 3082108928.0,
+            "96": 3082108928.0,
+            "97": 3082108928.0,
+            "98": 3082108928.0,
+            "99": 3082108928.0,
+            "100": 3082108928.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.74907,
-            "2": 0.85881,
-            "3": 0.84325,
-            "4": 0.84358,
-            "5": 0.84379,
-            "6": 0.84251,
-            "7": 0.84123,
-            "8": 0.8499,
-            "9": 0.8999,
-            "10": 0.92522,
-            "11": 0.94116,
-            "12": 0.85793,
-            "13": 0.84568,
-            "14": 0.84264,
-            "15": 0.84084,
-            "16": 0.84084,
-            "17": 0.83843,
-            "18": 0.8412,
-            "19": 0.84178,
-            "20": 1.1044,
-            "21": 1.21871,
-            "22": 1.25946,
-            "23": 0.85008,
-            "24": 0.91404,
-            "25": 0.84787,
-            "26": 0.84792,
-            "27": 0.85174,
-            "28": 0.84996,
-            "29": 0.84337,
-            "30": 0.84498,
-            "31": 0.8486,
-            "32": 0.84203,
-            "33": 0.84451,
-            "34": 0.85648,
-            "35": 0.83537,
-            "36": 0.84205,
-            "37": 0.83563,
-            "38": 0.84541,
-            "39": 0.84231,
-            "40": 0.84639,
-            "41": 0.84365,
-            "42": 0.84512,
-            "43": 0.84437,
-            "44": 0.84299,
-            "45": 0.85866,
-            "46": 0.84237,
-            "47": 0.84617,
-            "48": 1.18328,
-            "49": 0.88875,
-            "50": 0.96388,
-            "51": 0.98149,
-            "52": 0.89905,
-            "53": 0.84382,
-            "54": 0.85382,
-            "55": 0.84338,
-            "56": 0.84282,
-            "57": 0.92404,
-            "58": 0.84627,
-            "59": 0.83811,
-            "60": 0.83802,
-            "61": 0.85109,
-            "62": 0.83231,
-            "63": 0.83505,
-            "64": 1.15842,
-            "65": 1.1324,
-            "66": 0.83972,
-            "67": 0.82896,
-            "68": 0.82596,
-            "69": 0.83118,
-            "70": 0.84229,
-            "71": 0.8328,
-            "72": 0.82924,
-            "73": 0.83555,
-            "74": 0.83422,
-            "75": 0.90796,
-            "76": 0.85077,
-            "77": 1.07568,
-            "78": 1.30938,
-            "79": 1.12037,
-            "80": 0.82751,
-            "81": 0.83544,
-            "82": 0.88688,
-            "83": 1.16362,
-            "84": 0.83207,
-            "85": 0.83917,
-            "86": 1.14681,
-            "87": 1.17025,
-            "88": 0.82985,
-            "89": 0.82492,
-            "90": 0.90586,
-            "91": 0.83299,
-            "92": 0.83139,
-            "93": 0.83405,
-            "94": 0.83756,
-            "95": 0.83351,
-            "96": 0.83063,
-            "97": 0.83499,
-            "98": 0.84617,
-            "99": 0.83623,
-            "100": 0.84014
+            "1": 10.24286,
+            "2": 0.82679,
+            "3": 0.79409,
+            "4": 0.76435,
+            "5": 0.77118,
+            "6": 0.74558,
+            "7": 0.74667,
+            "8": 0.77701,
+            "9": 1.97605,
+            "10": 0.75455,
+            "11": 0.74398,
+            "12": 0.74114,
+            "13": 0.7501,
+            "14": 0.74704,
+            "15": 0.74029,
+            "16": 1.1307,
+            "17": 0.73862,
+            "18": 0.73445,
+            "19": 0.73384,
+            "20": 0.73927,
+            "21": 0.74153,
+            "22": 0.73755,
+            "23": 0.76958,
+            "24": 0.7377,
+            "25": 0.73987,
+            "26": 0.77483,
+            "27": 1.30185,
+            "28": 0.76,
+            "29": 0.75644,
+            "30": 0.77716,
+            "31": 0.83125,
+            "32": 0.80226,
+            "33": 0.74041,
+            "34": 0.74334,
+            "35": 1.17386,
+            "36": 1.53868,
+            "37": 0.77003,
+            "38": 0.76358,
+            "39": 0.77015,
+            "40": 0.77216,
+            "41": 0.76865,
+            "42": 1.214,
+            "43": 1.04802,
+            "44": 0.758,
+            "45": 1.27424,
+            "46": 1.12734,
+            "47": 0.7573,
+            "48": 0.74875,
+            "49": 0.74989,
+            "50": 0.75416,
+            "51": 0.75904,
+            "52": 0.75338,
+            "53": 0.75124,
+            "54": 0.73937,
+            "55": 0.74096,
+            "56": 0.75129,
+            "57": 0.75097,
+            "58": 0.74724,
+            "59": 0.74661,
+            "60": 0.74245,
+            "61": 0.74378,
+            "62": 0.74491,
+            "63": 0.74147,
+            "64": 0.74756,
+            "65": 0.74511,
+            "66": 0.74967,
+            "67": 0.7462,
+            "68": 0.74176,
+            "69": 0.74258,
+            "70": 0.74323,
+            "71": 0.74412,
+            "72": 0.74522,
+            "73": 0.74053,
+            "74": 0.74312,
+            "75": 0.74157,
+            "76": 1.12862,
+            "77": 0.74522,
+            "78": 1.08987,
+            "79": 0.94746,
+            "80": 0.877,
+            "81": 0.74472,
+            "82": 0.74142,
+            "83": 0.74342,
+            "84": 0.7418,
+            "85": 0.74017,
+            "86": 0.7399,
+            "87": 0.73594,
+            "88": 0.73916,
+            "89": 0.73537,
+            "90": 0.75037,
+            "91": 0.7341,
+            "92": 0.73469,
+            "93": 0.7333,
+            "94": 0.73221,
+            "95": 0.73055,
+            "96": 0.73133,
+            "97": 0.73591,
+            "98": 0.74108,
+            "99": 0.74467,
+            "100": 0.73711
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..de97d194787
--- /dev/null
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.71501,
+            "52": 9.66488,
+            "53": 9.60917,
+            "54": 9.62733,
+            "55": 9.61022,
+            "56": 9.61723,
+            "57": 9.56794,
+            "58": 9.52733,
+            "59": 9.51677,
+            "60": 9.5188,
+            "61": 9.53149,
+            "62": 9.45031,
+            "63": 9.45717,
+            "64": 9.43441,
+            "65": 9.45812,
+            "66": 9.43672,
+            "67": 9.39687,
+            "68": 9.36469,
+            "69": 9.40964,
+            "70": 9.37606,
+            "71": 9.41737,
+            "72": 9.42585,
+            "73": 9.37601,
+            "74": 9.4154,
+            "75": 9.37896,
+            "76": 9.28004,
+            "77": 9.32212,
+            "78": 9.35755,
+            "79": 9.3216,
+            "80": 9.31491,
+            "81": 9.26783,
+            "82": 9.342,
+            "83": 9.32159,
+            "84": 9.24786,
+            "85": 9.35018,
+            "86": 9.22384,
+            "87": 9.30618,
+            "88": 9.29905,
+            "89": 9.22708,
+            "90": 9.28498,
+            "91": 9.23123,
+            "92": 9.27487,
+            "93": 9.19233,
+            "94": 9.23985,
+            "95": 9.28002,
+            "96": 9.17532,
+            "97": 9.21898,
+            "98": 9.17203,
+            "99": 9.16444,
+            "100": 9.14821
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1900.0,
+            "52": 2483.0,
+            "53": 3763.0,
+            "54": 3478.0,
+            "55": 3412.0,
+            "56": 4400.0,
+            "57": 4019.0,
+            "58": 4253.0,
+            "59": 1805.0,
+            "60": 2457.0,
+            "61": 2045.0,
+            "62": 3994.0,
+            "63": 3650.0,
+            "64": 4466.0,
+            "65": 2968.0,
+            "66": 1837.0,
+            "67": 1961.0,
+            "68": 4347.0,
+            "69": 4441.0,
+            "70": 4452.0,
+            "71": 2131.0,
+            "72": 4523.0,
+            "73": 4105.0,
+            "74": 3300.0,
+            "75": 4651.0,
+            "76": 2216.0,
+            "77": 4932.0,
+            "78": 4218.0,
+            "79": 2784.0,
+            "80": 3824.0,
+            "81": 3472.0,
+            "82": 2976.0,
+            "83": 5282.0,
+            "84": 4464.0,
+            "85": 4344.0,
+            "86": 3460.0,
+            "87": 4774.0,
+            "88": 3426.0,
+            "89": 4600.0,
+            "90": 4360.0,
+            "91": 4283.0,
+            "92": 3362.0,
+            "93": 5633.0,
+            "94": 3676.0,
+            "95": 4610.0,
+            "96": 3449.0,
+            "97": 3751.0,
+            "98": 4524.0,
+            "99": 4399.0,
+            "100": 3295.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1769334272.0,
+            "52": 1769334272.0,
+            "53": 1769334272.0,
+            "54": 1769334272.0,
+            "55": 1769334272.0,
+            "56": 1769334272.0,
+            "57": 1769334272.0,
+            "58": 1769334272.0,
+            "59": 1769334272.0,
+            "60": 1769334272.0,
+            "61": 1769334272.0,
+            "62": 1769334272.0,
+            "63": 1769334272.0,
+            "64": 1769334272.0,
+            "65": 1769334272.0,
+            "66": 1769334272.0,
+            "67": 1769334272.0,
+            "68": 1769334272.0,
+            "69": 1769334272.0,
+            "70": 1769334272.0,
+            "71": 1769334272.0,
+            "72": 1769334272.0,
+            "73": 1769334272.0,
+            "74": 1769334272.0,
+            "75": 1769334272.0,
+            "76": 1769334272.0,
+            "77": 1769334272.0,
+            "78": 1769334272.0,
+            "79": 1769334272.0,
+            "80": 1769334272.0,
+            "81": 1769334272.0,
+            "82": 1769334272.0,
+            "83": 1769334272.0,
+            "84": 1769334272.0,
+            "85": 1769334272.0,
+            "86": 1769334272.0,
+            "87": 1769334272.0,
+            "88": 1769334272.0,
+            "89": 1769334272.0,
+            "90": 1769334272.0,
+            "91": 1769334272.0,
+            "92": 1769334272.0,
+            "93": 1769334272.0,
+            "94": 1769334272.0,
+            "95": 1769334272.0,
+            "96": 1769334272.0,
+            "97": 1769334272.0,
+            "98": 1769334272.0,
+            "99": 1769334272.0,
+            "100": 1769334272.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3081058304.0,
+            "52": 3081060352.0,
+            "53": 3081060352.0,
+            "54": 3081060352.0,
+            "55": 3081060352.0,
+            "56": 3081060352.0,
+            "57": 3081060352.0,
+            "58": 3081060352.0,
+            "59": 3081060352.0,
+            "60": 3081060352.0,
+            "61": 3081060352.0,
+            "62": 3081060352.0,
+            "63": 3081060352.0,
+            "64": 3081060352.0,
+            "65": 3081060352.0,
+            "66": 3081060352.0,
+            "67": 3081060352.0,
+            "68": 3081060352.0,
+            "69": 3081060352.0,
+            "70": 3081060352.0,
+            "71": 3081060352.0,
+            "72": 3081060352.0,
+            "73": 3081060352.0,
+            "74": 3081060352.0,
+            "75": 3081060352.0,
+            "76": 3081060352.0,
+            "77": 3081060352.0,
+            "78": 3081060352.0,
+            "79": 3081060352.0,
+            "80": 3081060352.0,
+            "81": 3081060352.0,
+            "82": 3081060352.0,
+            "83": 3081060352.0,
+            "84": 3081060352.0,
+            "85": 3081060352.0,
+            "86": 3081060352.0,
+            "87": 3081060352.0,
+            "88": 3081060352.0,
+            "89": 3081060352.0,
+            "90": 3081060352.0,
+            "91": 3081060352.0,
+            "92": 3081060352.0,
+            "93": 3081060352.0,
+            "94": 3081060352.0,
+            "95": 3081060352.0,
+            "96": 3081060352.0,
+            "97": 3081060352.0,
+            "98": 3081060352.0,
+            "99": 3081060352.0,
+            "100": 3081060352.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.15551,
+            "52": 0.8598,
+            "53": 0.74904,
+            "54": 0.7512,
+            "55": 0.75011,
+            "56": 0.7593,
+            "57": 1.36317,
+            "58": 1.3678,
+            "59": 0.75114,
+            "60": 0.74624,
+            "61": 0.74824,
+            "62": 0.75285,
+            "63": 0.75097,
+            "64": 0.7539,
+            "65": 1.11179,
+            "66": 0.7482,
+            "67": 0.75224,
+            "68": 0.75225,
+            "69": 0.73791,
+            "70": 0.74141,
+            "71": 0.74372,
+            "72": 0.74097,
+            "73": 1.17879,
+            "74": 1.13369,
+            "75": 0.75135,
+            "76": 0.74737,
+            "77": 0.7455,
+            "78": 0.74472,
+            "79": 1.10005,
+            "80": 0.74804,
+            "81": 0.75235,
+            "82": 2.07286,
+            "83": 0.74595,
+            "84": 0.75659,
+            "85": 0.74796,
+            "86": 0.73902,
+            "87": 0.73952,
+            "88": 0.73743,
+            "89": 0.74161,
+            "90": 0.94861,
+            "91": 0.94405,
+            "92": 1.05613,
+            "93": 1.27634,
+            "94": 0.80928,
+            "95": 0.77886,
+            "96": 1.11223,
+            "97": 0.73925,
+            "98": 0.773,
+            "99": 0.74424,
+            "100": 0.78256
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
index 88adf60a26e..bc0ee3bcb1e 100644
--- a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
@@ -12,48 +12,48 @@
             "6": 10.41563,
             "7": 10.42859,
             "8": 10.42079,
-            "9": 10.43014,
-            "10": 10.40859,
-            "11": 10.43501,
-            "12": 10.4025,
-            "13": 10.42274,
-            "14": 10.41249,
-            "15": 10.40948,
-            "16": 10.40806,
-            "17": 10.3892,
-            "18": 10.38857,
-            "19": 10.37147,
-            "20": 10.40453,
-            "21": 10.36615,
-            "22": 10.34963,
-            "23": 10.35388,
-            "24": 10.30136,
-            "25": 10.31117,
-            "26": 10.30241,
-            "27": 10.2821,
-            "28": 10.27928,
-            "29": 10.23928,
-            "30": 10.14742,
-            "31": 10.10532,
-            "32": 10.09426,
-            "33": 10.09032,
-            "34": 10.06437,
-            "35": 10.04643,
-            "36": 10.03306,
-            "37": 10.00505,
-            "38": 10.00274,
-            "39": 9.91418,
-            "40": 9.91103,
-            "41": 9.86562,
-            "42": 9.78095,
-            "43": 9.79496,
-            "44": 9.73077,
-            "45": 9.7428,
-            "46": 9.63829,
-            "47": 9.6868,
-            "48": 9.637,
-            "49": 9.6554,
-            "50": 9.65776
+            "9": 10.43013,
+            "10": 10.4087,
+            "11": 10.43493,
+            "12": 10.40244,
+            "13": 10.42282,
+            "14": 10.41239,
+            "15": 10.40952,
+            "16": 10.40789,
+            "17": 10.38944,
+            "18": 10.38859,
+            "19": 10.37154,
+            "20": 10.40445,
+            "21": 10.36609,
+            "22": 10.34962,
+            "23": 10.354,
+            "24": 10.30131,
+            "25": 10.3111,
+            "26": 10.30252,
+            "27": 10.28202,
+            "28": 10.27924,
+            "29": 10.23941,
+            "30": 10.14739,
+            "31": 10.10547,
+            "32": 10.09424,
+            "33": 10.09034,
+            "34": 10.0645,
+            "35": 10.04644,
+            "36": 10.03308,
+            "37": 10.00522,
+            "38": 10.00297,
+            "39": 9.91428,
+            "40": 9.91112,
+            "41": 9.86566,
+            "42": 9.78083,
+            "43": 9.79476,
+            "44": 9.73084,
+            "45": 9.74269,
+            "46": 9.63796,
+            "47": 9.68694,
+            "48": 9.63705,
+            "49": 9.65524,
+            "50": 9.65788
         }
     },
     "num-zeros": {
@@ -69,48 +69,48 @@
             "6": 2985.0,
             "7": 3208.0,
             "8": 3314.0,
-            "9": 3134.0,
-            "10": 3124.0,
-            "11": 3913.0,
-            "12": 3008.0,
-            "13": 3108.0,
-            "14": 3652.0,
-            "15": 3267.0,
-            "16": 3662.0,
-            "17": 3680.0,
-            "18": 3708.0,
-            "19": 3375.0,
-            "20": 3449.0,
-            "21": 3115.0,
-            "22": 3545.0,
-            "23": 3516.0,
-            "24": 3789.0,
-            "25": 3570.0,
-            "26": 3719.0,
-            "27": 2808.0,
-            "28": 3823.0,
-            "29": 3626.0,
-            "30": 4136.0,
-            "31": 2541.0,
-            "32": 3945.0,
-            "33": 3501.0,
-            "34": 3795.0,
-            "35": 3652.0,
-            "36": 4269.0,
-            "37": 4152.0,
-            "38": 3787.0,
-            "39": 3873.0,
-            "40": 4661.0,
-            "41": 2846.0,
-            "42": 1556.0,
-            "43": 2809.0,
-            "44": 4030.0,
-            "45": 4724.0,
-            "46": 4587.0,
-            "47": 3120.0,
-            "48": 4366.0,
-            "49": 3839.0,
-            "50": 3146.0
+            "9": 3210.0,
+            "10": 3297.0,
+            "11": 2833.0,
+            "12": 2982.0,
+            "13": 3178.0,
+            "14": 3705.0,
+            "15": 3252.0,
+            "16": 3615.0,
+            "17": 3789.0,
+            "18": 3620.0,
+            "19": 3327.0,
+            "20": 3539.0,
+            "21": 3129.0,
+            "22": 3597.0,
+            "23": 3595.0,
+            "24": 2781.0,
+            "25": 3585.0,
+            "26": 3607.0,
+            "27": 4015.0,
+            "28": 3836.0,
+            "29": 3716.0,
+            "30": 4150.0,
+            "31": 3472.0,
+            "32": 3024.0,
+            "33": 3553.0,
+            "34": 3793.0,
+            "35": 3757.0,
+            "36": 4205.0,
+            "37": 4221.0,
+            "38": 3819.0,
+            "39": 3866.0,
+            "40": 3554.0,
+            "41": 2883.0,
+            "42": 2592.0,
+            "43": 2856.0,
+            "44": 3173.0,
+            "45": 4948.0,
+            "46": 4572.0,
+            "47": 4077.0,
+            "48": 4355.0,
+            "49": 3885.0,
+            "50": 3266.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1661765632.0,
-            "2": 1661765632.0,
-            "3": 1661765632.0,
-            "4": 1661765632.0,
-            "5": 1661765632.0,
-            "6": 1661765632.0,
-            "7": 1661765632.0,
-            "8": 1661765632.0,
-            "9": 1661765632.0,
-            "10": 1661765632.0,
-            "11": 1661765632.0,
-            "12": 1661765632.0,
-            "13": 1661765632.0,
-            "14": 1661765632.0,
-            "15": 1661765632.0,
-            "16": 1661765632.0,
-            "17": 1661765632.0,
-            "18": 1661765632.0,
-            "19": 1661765632.0,
-            "20": 1661765632.0,
-            "21": 1661765632.0,
-            "22": 1661765632.0,
-            "23": 1661765632.0,
-            "24": 1661765632.0,
-            "25": 1661765632.0,
-            "26": 1661765632.0,
-            "27": 1661765632.0,
-            "28": 1661765632.0,
-            "29": 1661765632.0,
-            "30": 1661765632.0,
-            "31": 1661765632.0,
-            "32": 1661765632.0,
-            "33": 1661765632.0,
-            "34": 1661765632.0,
-            "35": 1661765632.0,
-            "36": 1661765632.0,
-            "37": 1661765632.0,
-            "38": 1661765632.0,
-            "39": 1661765632.0,
-            "40": 1661765632.0,
-            "41": 1661765632.0,
-            "42": 1661765632.0,
-            "43": 1661765632.0,
-            "44": 1661765632.0,
-            "45": 1661765632.0,
-            "46": 1661765632.0,
-            "47": 1661765632.0,
-            "48": 1661765632.0,
-            "49": 1661765632.0,
-            "50": 1661765632.0
+            "1": 1662815232.0,
+            "2": 1662815232.0,
+            "3": 1662815232.0,
+            "4": 1662815232.0,
+            "5": 1662815232.0,
+            "6": 1662815232.0,
+            "7": 1662815232.0,
+            "8": 1662815232.0,
+            "9": 1662815232.0,
+            "10": 1662815232.0,
+            "11": 1662815232.0,
+            "12": 1662815232.0,
+            "13": 1662815232.0,
+            "14": 1662815232.0,
+            "15": 1662815232.0,
+            "16": 1662815232.0,
+            "17": 1662815232.0,
+            "18": 1662815232.0,
+            "19": 1662815232.0,
+            "20": 1662815232.0,
+            "21": 1662815232.0,
+            "22": 1662815232.0,
+            "23": 1662815232.0,
+            "24": 1662815232.0,
+            "25": 1662815232.0,
+            "26": 1662815232.0,
+            "27": 1662815232.0,
+            "28": 1662815232.0,
+            "29": 1662815232.0,
+            "30": 1662815232.0,
+            "31": 1662815232.0,
+            "32": 1662815232.0,
+            "33": 1662815232.0,
+            "34": 1662815232.0,
+            "35": 1662815232.0,
+            "36": 1662815232.0,
+            "37": 1662815232.0,
+            "38": 1662815232.0,
+            "39": 1662815232.0,
+            "40": 1662815232.0,
+            "41": 1662815232.0,
+            "42": 1662815232.0,
+            "43": 1662815232.0,
+            "44": 1662815232.0,
+            "45": 1662815232.0,
+            "46": 1662815232.0,
+            "47": 1662815232.0,
+            "48": 1662815232.0,
+            "49": 1662815232.0,
+            "50": 1662815232.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2506479104.0,
-            "2": 3205449216.0,
-            "3": 3205449216.0,
-            "4": 3205449216.0,
-            "5": 3205449216.0,
-            "6": 3205449216.0,
-            "7": 3205449216.0,
-            "8": 3205449216.0,
-            "9": 3205449216.0,
-            "10": 3205449216.0,
-            "11": 3205449216.0,
-            "12": 3205449216.0,
-            "13": 3205449216.0,
-            "14": 3205449216.0,
-            "15": 3205449216.0,
-            "16": 3205449216.0,
-            "17": 3205449216.0,
-            "18": 3205449216.0,
-            "19": 3205449216.0,
-            "20": 3205449216.0,
-            "21": 3205449216.0,
-            "22": 3205449216.0,
-            "23": 3205449216.0,
-            "24": 3205449216.0,
-            "25": 3205449216.0,
-            "26": 3205449216.0,
-            "27": 3205449216.0,
-            "28": 3205449216.0,
-            "29": 3205449216.0,
-            "30": 3205449216.0,
-            "31": 3205449216.0,
-            "32": 3205449216.0,
-            "33": 3205449216.0,
-            "34": 3205449216.0,
-            "35": 3205449216.0,
-            "36": 3205449216.0,
-            "37": 3205449216.0,
-            "38": 3205449216.0,
-            "39": 3205449216.0,
-            "40": 3205449216.0,
-            "41": 3205449216.0,
-            "42": 3205449216.0,
-            "43": 3205449216.0,
-            "44": 3205449216.0,
-            "45": 3205449216.0,
-            "46": 3205449216.0,
-            "47": 3205449216.0,
-            "48": 3205449216.0,
-            "49": 3205449216.0,
-            "50": 3205449216.0
+            "1": 2507528704.0,
+            "2": 3206498816.0,
+            "3": 3206498816.0,
+            "4": 3206498816.0,
+            "5": 3206498816.0,
+            "6": 3206498816.0,
+            "7": 3206498816.0,
+            "8": 3206498816.0,
+            "9": 3206498816.0,
+            "10": 3206498816.0,
+            "11": 3206498816.0,
+            "12": 3206498816.0,
+            "13": 3206498816.0,
+            "14": 3206498816.0,
+            "15": 3206498816.0,
+            "16": 3206498816.0,
+            "17": 3206498816.0,
+            "18": 3206498816.0,
+            "19": 3206498816.0,
+            "20": 3206498816.0,
+            "21": 3206498816.0,
+            "22": 3206498816.0,
+            "23": 3206498816.0,
+            "24": 3206498816.0,
+            "25": 3206498816.0,
+            "26": 3206498816.0,
+            "27": 3206498816.0,
+            "28": 3206498816.0,
+            "29": 3206498816.0,
+            "30": 3206498816.0,
+            "31": 3206498816.0,
+            "32": 3206498816.0,
+            "33": 3206498816.0,
+            "34": 3206498816.0,
+            "35": 3206498816.0,
+            "36": 3206498816.0,
+            "37": 3206498816.0,
+            "38": 3206498816.0,
+            "39": 3206498816.0,
+            "40": 3206498816.0,
+            "41": 3206498816.0,
+            "42": 3206498816.0,
+            "43": 3206498816.0,
+            "44": 3206498816.0,
+            "45": 3206498816.0,
+            "46": 3206498816.0,
+            "47": 3206498816.0,
+            "48": 3206498816.0,
+            "49": 3206498816.0,
+            "50": 3206498816.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.29331,
-            "2": 1.82828,
-            "3": 1.75745,
-            "4": 1.75149,
-            "5": 1.76912,
-            "6": 1.75888,
-            "7": 1.75313,
-            "8": 1.75423,
-            "9": 1.74482,
-            "10": 1.84387,
-            "11": 2.01499,
-            "12": 1.74448,
-            "13": 1.75425,
-            "14": 2.09351,
-            "15": 1.77765,
-            "16": 1.76841,
-            "17": 1.75495,
-            "18": 2.05727,
-            "19": 1.77481,
-            "20": 2.11285,
-            "21": 1.77659,
-            "22": 1.75669,
-            "23": 1.75872,
-            "24": 2.1065,
-            "25": 2.02543,
-            "26": 1.84773,
-            "27": 1.76632,
-            "28": 1.76482,
-            "29": 1.75732,
-            "30": 1.75335,
-            "31": 1.75453,
-            "32": 1.80627,
-            "33": 1.757,
-            "34": 1.75719,
-            "35": 1.75478,
-            "36": 1.76009,
-            "37": 1.75602,
-            "38": 1.75806,
-            "39": 1.75609,
-            "40": 1.75247,
-            "41": 1.75179,
-            "42": 1.75873,
-            "43": 1.77534,
-            "44": 1.80833,
-            "45": 1.74663,
-            "46": 1.75048,
-            "47": 1.7473,
-            "48": 1.75253,
-            "49": 1.76783,
-            "50": 1.75365
+            "1": 10.8403,
+            "2": 1.75656,
+            "3": 1.70317,
+            "4": 1.66346,
+            "5": 1.6703,
+            "6": 1.66753,
+            "7": 2.21547,
+            "8": 1.68918,
+            "9": 1.77005,
+            "10": 1.75261,
+            "11": 1.77153,
+            "12": 1.65933,
+            "13": 1.65337,
+            "14": 2.37845,
+            "15": 2.04839,
+            "16": 2.07092,
+            "17": 1.67053,
+            "18": 1.6729,
+            "19": 1.65463,
+            "20": 1.67298,
+            "21": 1.66273,
+            "22": 1.64743,
+            "23": 1.64351,
+            "24": 1.63695,
+            "25": 1.66076,
+            "26": 1.66885,
+            "27": 1.64423,
+            "28": 1.64773,
+            "29": 1.64565,
+            "30": 1.64171,
+            "31": 1.63705,
+            "32": 1.64216,
+            "33": 1.64504,
+            "34": 1.64255,
+            "35": 1.64762,
+            "36": 1.64913,
+            "37": 1.63831,
+            "38": 1.65213,
+            "39": 1.66065,
+            "40": 1.63954,
+            "41": 1.63964,
+            "42": 1.64408,
+            "43": 1.64113,
+            "44": 1.65016,
+            "45": 1.63618,
+            "46": 1.65229,
+            "47": 1.64761,
+            "48": 1.76963,
+            "49": 1.62535,
+            "50": 1.63142
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml
index ab5558fa7d2..af341b0f670 100644
--- a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml
@@ -45,6 +45,7 @@ MODEL_ARGS:
   --log-params-norm: true
   --log-validation-ppl-to-tensorboard: true
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --attention-backend: unfused
   --exit-interval: 20000
diff --git a/tests/functional_tests/test_cases/common/moe_perf/__main__.py b/tests/functional_tests/test_cases/common/moe_perf/__main__.py
new file mode 100644
index 00000000000..ace44c7ca4f
--- /dev/null
+++ b/tests/functional_tests/test_cases/common/moe_perf/__main__.py
@@ -0,0 +1,420 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+"""GPU performance regression tests for the MoE layer."""
+
+from __future__ import annotations
+
+import gc
+import json
+import os
+import statistics
+from contextlib import nullcontext
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, Mapping, Optional, Sequence, cast
+
+import pytest  # type: ignore[import]
+import torch
+
+from megatron.core.config import set_experimental_flag
+from megatron.core.fp8_utils import get_fp8_context
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
+from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP, HAVE_HYBRIDEP
+from megatron.core.transformer.moe.moe_layer import MoELayer
+from megatron.core.transformer.moe.moe_utils import RandomSTE
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import is_te_min_version
+from megatron.training.initialize import _set_random_seed
+from tests.unit_tests.test_utilities import Utils
+
+from .test_cases import PERFORMANCE_CASES, MoEPerformanceCase
+
+# NOTE: Performance regression threshold
+DEFAULT_MAX_REGRESSION_RATIO = 1.02
+DEFAULT_MAX_VARIANCE_RATIO = 0.02  # The std/mean should be less than 2%
+WARMUP_ITERS = 5
+MEASURE_ITERS = 20
+
+
+BASELINES_PATH = Path(__file__).resolve().parent / "baseline.json"
+UPDATE_BASELINES_ENV = "MEGATRON_UPDATE_PERF_BASELINES"
+
+
+def _build_transformer_config(case: MoEPerformanceCase) -> TransformerConfig:
+    model = case.model
+    config_kwargs = dict(
+        num_layers=1,
+        hidden_size=model.hidden_size,
+        moe_ffn_hidden_size=model.moe_ffn_hidden_size,
+        num_attention_heads=model.num_attention_heads,
+        # MoE Arguments
+        num_moe_experts=model.num_experts,
+        moe_router_topk=model.router_topk,
+        moe_router_load_balancing_type="aux_loss",
+        moe_aux_loss_coeff=1.0,
+        moe_token_dispatcher_type=case.token_dispatcher,
+        moe_flex_dispatcher_backend=case.moe_flex_dispatcher_backend,
+        use_cpu_initialization=True,
+        add_bias_linear=False,
+        # Router Arguments
+        moe_router_num_groups=model.moe_router_num_groups,
+        moe_router_group_topk=model.moe_router_group_topk,
+        moe_router_score_function=model.moe_router_score_function,
+        moe_router_dtype=model.moe_router_dtype,
+        moe_router_enable_expert_bias=model.moe_router_enable_expert_bias,
+        # Parallelism Arguments
+        sequence_parallel=case.tensor_model_parallel_size > 1,
+        tensor_model_parallel_size=case.tensor_model_parallel_size,
+        pipeline_model_parallel_size=case.pipeline_model_parallel_size,
+        expert_model_parallel_size=case.expert_model_parallel_size,
+        expert_tensor_parallel_size=case.expert_tensor_parallel_size,
+        context_parallel_size=case.context_parallel_size,
+        params_dtype=case.input_dtype,
+        bf16=True,
+        fp8=case.fp8,
+        moe_permute_fusion=case.moe_permute_fusion,
+        moe_router_fusion=case.moe_router_fusion,
+        moe_router_force_load_balancing=case.moe_router_force_load_balancing,
+    )
+
+    if case.fp8:
+        config_kwargs.update(
+            dict(fp8="hybrid", fp8_margin=0, fp8_interval=1, fp8_recipe="blockwise")
+        )
+
+    return TransformerConfig(**config_kwargs)
+
+
+# NOTE: Only TE backend is covered in this test.
+def _resolve_moe_submodules(case: MoEPerformanceCase):
+    layer_spec = get_gpt_layer_with_transformer_engine_spec(
+        num_experts=case.model.num_experts, moe_grouped_gemm=True
+    )
+    return layer_spec.submodules.mlp.submodules
+
+
+def _load_baselines() -> Dict[str, Dict[str, float]]:
+    if not BASELINES_PATH.exists():
+        return {}
+    with BASELINES_PATH.open("r", encoding="utf-8") as fh:
+        return json.load(fh)
+
+
+def _persist_baselines(data: Dict[str, Dict[str, float]]) -> None:
+    BASELINES_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with BASELINES_PATH.open("w", encoding="utf-8") as fh:
+        json.dump(data, fh, indent=2, sort_keys=True)
+        fh.write("\n")
+
+
+def _serialize_metrics(metrics: Dict[str, float]) -> Dict[str, float]:
+    forward_ms = metrics["forward_ms"]
+    backward_ms = metrics["backward_ms"]
+    return {
+        "forward_ms": forward_ms,
+        "backward_ms": backward_ms,
+        "max_allocated_bytes": metrics["max_allocated_bytes"],
+        "max_regression_ratio": DEFAULT_MAX_REGRESSION_RATIO,
+    }
+
+
+def _assert_within_baseline(
+    case_name: str, metrics: Mapping[str, Any], baselines: Dict[str, Dict[str, float]]
+):
+    baseline = baselines.get(case_name)
+    if baseline is None:
+        pytest.fail(
+            f"Missing baseline data for {case_name}. Set {UPDATE_BASELINES_ENV}=1 to record."
+        )
+
+    max_ratio = baseline.get("max_regression_ratio", DEFAULT_MAX_REGRESSION_RATIO)
+
+    def _limit(metric_name: str) -> float:
+        baseline_value = baseline.get(metric_name)
+        if baseline_value is None:
+            return float("inf")
+        ratio_limit = baseline_value * max_ratio
+        return ratio_limit
+
+    fwd_limit = _limit("forward_ms")
+    bwd_limit = _limit("backward_ms")
+    mem_limit = _limit("max_allocated_bytes")
+
+    forward_ms = cast(float, metrics["forward_ms"])
+    backward_ms = cast(float, metrics["backward_ms"])
+    max_allocated_bytes = cast(float, metrics["max_allocated_bytes"])
+
+    forward_std_ms = cast(float, metrics.get("forward_std_ms", 0.0))
+    backward_std_ms = cast(float, metrics.get("backward_std_ms", 0.0))
+    forward_timings = cast(Sequence[float], metrics.get("forward_timings", ()))
+    backward_timings = cast(Sequence[float], metrics.get("backward_timings", ()))
+
+    assert (
+        forward_ms <= fwd_limit
+    ), f"Forward pass for {case_name} regressed: {forward_ms:.3f} ms (limit {fwd_limit:.3f} ms)."
+    assert (
+        backward_ms <= bwd_limit
+    ), f"Backward pass for {case_name} regressed: {backward_ms:.3f} ms (limit {bwd_limit:.3f} ms)."
+
+    if forward_ms > 0.0:
+        assert forward_std_ms / forward_ms <= DEFAULT_MAX_VARIANCE_RATIO, (
+            "Forward pass for "
+            f"{case_name} has high variance: {forward_std_ms:.3f} ms "
+            f"(limit {DEFAULT_MAX_VARIANCE_RATIO:.3f} of {forward_ms:.3f} ms). "
+            f"The full timings are {list(forward_timings)}."
+        )
+    if backward_ms > 0.0:
+        assert backward_std_ms / backward_ms <= DEFAULT_MAX_VARIANCE_RATIO, (
+            "Backward pass for "
+            f"{case_name} has high variance: {backward_std_ms:.3f} ms "
+            f"(limit {DEFAULT_MAX_VARIANCE_RATIO:.3f} of {backward_ms:.3f} ms). "
+            f"The full timings are {list(backward_timings)}."
+        )
+    assert max_allocated_bytes <= mem_limit, (
+        "Max allocated memory for "
+        f"{case_name} regressed: {max_allocated_bytes / (1024 ** 2):.3f} MiB "
+        f"(limit {mem_limit / (1024 ** 2):.3f} MiB)."
+    )
+
+
+def _benchmark_moe_layer(layer: MoELayer, case: MoEPerformanceCase):
+    torch.cuda.synchronize()
+    set_experimental_flag(True)
+
+    forward_timings = []
+    backward_timings = []
+    max_allocated_bytes = []
+
+    generator = torch.Generator(device="cuda").manual_seed(1234)
+    model = case.model
+
+    if case.manual_gc:
+        torch.cuda.empty_cache()
+        gc.disable()
+        gc.collect()
+
+    # NOTE: Using the same input tensor for all iterations to prevent different routing results,
+    # which may lead to different kernels and library load/compile overhead.
+    input_tensor = torch.randn(
+        model.seq_length,
+        model.micro_batch_size,
+        model.hidden_size,
+        device="cuda",
+        dtype=case.input_dtype,
+        generator=generator,
+    )
+    input_tensor.requires_grad_(True)
+    for iteration in range(WARMUP_ITERS + MEASURE_ITERS):
+        if RandomSTE.generator is not None:
+            RandomSTE.generator.manual_seed(RandomSTE.generator.initial_seed())
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            torch.distributed.barrier()
+        torch.cuda.nvtx.range_push(f"({case.name}) iteration {iteration}")
+        # Use a long CUDA kernel to hide the router launch overhead
+        with torch.cuda.nvtx.range("(dummy GEMM)"):
+            dummy_tensor = torch.randn(8192, 8192, device="cuda")
+            torch.matmul(dummy_tensor, dummy_tensor)
+            del dummy_tensor
+        input_tensor.grad = None
+        layer.zero_grad(set_to_none=True)
+        torch.cuda.reset_peak_memory_stats()
+
+        fwd_start = torch.cuda.Event(enable_timing=True)
+        fwd_end = torch.cuda.Event(enable_timing=True)
+        bwd_start = torch.cuda.Event(enable_timing=True)
+        bwd_end = torch.cuda.Event(enable_timing=True)
+
+        context = get_fp8_context(layer.config) if case.fp8 else nullcontext()
+        with context:
+            fwd_start.record()
+            output, _ = layer(input_tensor)
+            fwd_end.record()
+
+            backward_grad = torch.randn_like(output)
+            bwd_start.record()
+            output.backward(backward_grad)
+            bwd_end.record()
+
+        torch.cuda.nvtx.range_pop()
+        torch.cuda.synchronize()
+
+        if iteration >= WARMUP_ITERS:
+            forward_timings.append(fwd_start.elapsed_time(fwd_end))
+            backward_timings.append(bwd_start.elapsed_time(bwd_end))
+            max_allocated_bytes.append(torch.cuda.max_memory_allocated())
+
+    # Exclude the top 3 values from timings lists to avoid outliers
+    forward_timings_sorted = sorted(forward_timings)[:-3]
+    backward_timings_sorted = sorted(backward_timings)[:-3]
+    forward_ms = statistics.mean(forward_timings)
+    backward_ms = statistics.mean(backward_timings)
+    max_allocated_bytes = statistics.mean(max_allocated_bytes)
+
+    if case.manual_gc:
+        gc.collect()
+        gc.enable()
+
+    if Utils.rank == 0:
+        print(f"({case.name}) forward times {forward_timings}")
+    return {
+        "forward_ms": forward_ms,
+        "backward_ms": backward_ms,
+        "forward_std_ms": statistics.pstdev(forward_timings) if len(forward_timings) > 1 else 0.0,
+        "backward_std_ms": (
+            statistics.pstdev(backward_timings) if len(backward_timings) > 1 else 0.0
+        ),
+        "max_allocated_bytes": max_allocated_bytes,
+        "forward_timings": forward_timings,
+        "backward_timings": backward_timings,
+    }
+
+
+def _maybe_update_baseline(
+    case: MoEPerformanceCase, metrics: Dict[str, float], baselines: Dict[str, Dict[str, float]]
+):
+    forward_ms = metrics["forward_ms"]
+    backward_ms = metrics["backward_ms"]
+    forward_std_ms = metrics["forward_std_ms"]
+    backward_std_ms = metrics["backward_std_ms"]
+    assert forward_std_ms / forward_ms <= DEFAULT_MAX_VARIANCE_RATIO, (
+        "Forward pass for "
+        f"{case.name} has high variance: {forward_std_ms:.3f} ms "
+        f"(limit {DEFAULT_MAX_VARIANCE_RATIO:.3f} of {forward_ms:.3f} ms)."
+    )
+    assert backward_std_ms / backward_ms <= DEFAULT_MAX_VARIANCE_RATIO, (
+        "Backward pass for "
+        f"{case.name} has high variance: {backward_std_ms:.3f} ms "
+        f"(limit {DEFAULT_MAX_VARIANCE_RATIO:.3f} of {backward_ms:.3f} ms)."
+    )
+    baselines[case.name] = _serialize_metrics(metrics)
+    _persist_baselines(baselines)
+
+
+def _prepare_moe_layer(case: MoEPerformanceCase) -> MoELayer:
+    config = _build_transformer_config(case)
+    submodules = _resolve_moe_submodules(case)
+    layer = MoELayer(config=config, submodules=submodules).cuda().to(dtype=torch.bfloat16)
+
+    layer.train()
+    return layer
+
+
+def _check_env():
+    NCCL_MAX_NCHANNELS = os.environ.get("NCCL_MAX_NCHANNELS")
+    if NCCL_MAX_NCHANNELS is not None:
+        pytest.fail(
+            f"NCCL_MAX_NCHANNELS is set to {NCCL_MAX_NCHANNELS}, this may lead to performance regression"
+        )
+
+
+def _check_dependencies(case: MoEPerformanceCase):
+    if case.token_dispatcher == "flex":
+        if case.moe_flex_dispatcher_backend == "deepep":
+            if not HAVE_DEEP_EP:
+                pytest.skip("DeepEP is not available")
+        elif case.moe_flex_dispatcher_backend == "hybridep":
+            if not HAVE_HYBRIDEP:
+                pytest.skip("HybridEP is not available")
+
+
+@pytest.mark.flaky(reruns=10)
+@pytest.mark.internal
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason="CUDA is required for MoE performance benchmarking"
+)
+@pytest.mark.parametrize("perf_case", PERFORMANCE_CASES, ids=lambda c: c.name)
+def test_moe_layer_performance(perf_case: MoEPerformanceCase, debug_mode: bool = False):
+    _check_env()
+    _check_dependencies(perf_case)
+    if not perf_case.is_current_platform():
+        pytest.skip(
+            "GPU platform mismatch: "
+            f"expected '{perf_case.gpu_platform}', "
+            f"found '{torch.cuda.get_device_name(torch.cuda.current_device())}'."
+        )
+
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=perf_case.tensor_model_parallel_size,
+        pipeline_model_parallel_size=perf_case.pipeline_model_parallel_size,
+        expert_model_parallel_size=perf_case.expert_model_parallel_size,
+        context_parallel_size=perf_case.context_parallel_size,
+        expert_tensor_parallel_size=perf_case.expert_tensor_parallel_size,
+    )
+
+    try:
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+        torch.cuda.reset_peak_memory_stats()
+        layer = _prepare_moe_layer(perf_case)
+        with torch.cuda.nvtx.range(f"({perf_case.name})"):
+            metrics = _benchmark_moe_layer(layer, perf_case)
+
+        summary = (
+            f"MoE layer performance ({perf_case.name}): forward {metrics['forward_ms']:.3f} ms "
+            f"(σ={metrics['forward_std_ms']:.3f}), backward {metrics['backward_ms']:.3f} ms "
+            f"(σ={metrics['backward_std_ms']:.3f}), max mem {metrics['max_allocated_bytes'] / (1024 ** 2):.3f} MiB"
+        )
+        if Utils.rank == 0:
+            print(summary)
+
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        # Don't check performance if profiling is enabled
+        baseline_failed = False
+        baseline_failure_message = ""
+
+        # Only rank 0 checks the baseline
+        if Utils.rank == 0 and not debug_mode:
+            baselines = _load_baselines()
+            try:
+                if os.getenv(UPDATE_BASELINES_ENV) == "1":
+                    _maybe_update_baseline(perf_case, metrics, baselines)
+                else:
+                    _assert_within_baseline(perf_case.name, metrics, baselines)
+            except AssertionError as exc:
+                baseline_failed = True
+                baseline_failure_message = str(exc)
+
+        failure_tensor = torch.tensor(
+            [1 if baseline_failed else 0],
+            device=torch.device("cuda", torch.cuda.current_device()),
+            dtype=torch.int32,
+        )
+        torch.distributed.all_reduce(failure_tensor, op=torch.distributed.ReduceOp.MAX)
+        baseline_failed = bool(failure_tensor.item())
+
+        if baseline_failed:
+            if Utils.rank != 0:
+                baseline_failure_message = "Baseline regression detected on rank 0."
+                pytest.fail(baseline_failure_message, pytrace=False)
+            else:
+                pytest.fail(baseline_failure_message, pytrace=True)
+
+    finally:
+        Utils.destroy_model_parallel()
+        torch.cuda.empty_cache()
+
+
+# Main entry for local performance testing
+# Commands to run with nsys profiling:
+# nsys profile --sample=none --cpuctxsw=none -t cuda,nvtx \
+#         -f true -x true \
+#         --cuda-graph-trace=node \
+#         --capture-range=cudaProfilerApi \
+#         --capture-range-end=stop \
+#         -o output \
+#         uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 --nnodes=1 -m tests.functional_tests.test_cases.common.moe_perf
+# Commands to run with pytest:
+# export MEGATRON_UPDATE_PERF_BASELINES=0 # set to 1 to update baseline perf numbers
+# uv run --no-sync python -m torch.distributed.run --nproc_per_node=8 --nnodes=1 -m tests.functional_tests.test_cases.common.moe_perf
+if __name__ == "__main__":
+    pytest.main(["-x", "-v", "-s", __file__])  # -xvs
+    # torch.cuda.cudart().cudaProfilerStart()
+    # torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
+    # for case in PERFORMANCE_CASES:
+    #     if case.name == "mixtral_a2a_tp1ep8_fp8":
+    #         test_moe_layer_performance(case, debug_mode=True)
+    # torch.cuda.cudart().cudaProfilerStop()
+    # torch.distributed.destroy_process_group()
diff --git a/tests/functional_tests/test_cases/common/moe_perf/baseline.json b/tests/functional_tests/test_cases/common/moe_perf/baseline.json
new file mode 100644
index 00000000000..af8af9f9482
--- /dev/null
+++ b/tests/functional_tests/test_cases/common/moe_perf/baseline.json
@@ -0,0 +1,74 @@
+{
+  "deepseek_a2a_tp1ep8_bf16": {
+    "backward_ms": 10.482670497894286,
+    "forward_ms": 7.119169592857361,
+    "max_allocated_bytes": 3442750464,
+    "max_regression_ratio": 1.02
+  },
+  "deepseek_a2a_tp1ep8_fp8": {
+    "backward_ms": 10.126460886001587,
+    "forward_ms": 7.609272027015686,
+    "max_allocated_bytes": 4412930560,
+    "max_regression_ratio": 1.02
+  },
+  "deepseek_deepep_tp1ep8_bf16": {
+    "backward_ms": 9.055137538909912,
+    "forward_ms": 5.681718397140503,
+    "max_allocated_bytes": 2445288448,
+    "max_regression_ratio": 1.02
+  },
+  "deepseek_deepep_tp1ep8_fp8": {
+    "backward_ms": 8.60211353302002,
+    "forward_ms": 6.034772801399231,
+    "max_allocated_bytes": 3707410944,
+    "max_regression_ratio": 1.02
+  },
+  "deepseek_hybridep_tp1ep8_bf16": {
+    "backward_ms": 8.795001602172851,
+    "forward_ms": 5.390828824043274,
+    "max_allocated_bytes": 2424369664,
+    "max_regression_ratio": 1.02
+  },
+  "deepseek_hybridep_tp1ep8_fp8": {
+    "backward_ms": 7.6874864339828495,
+    "forward_ms": 5.142886424064637,
+    "max_allocated_bytes": 2652078464,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_a2a_tp1ep8_bf16": {
+    "backward_ms": 7.684332823753357,
+    "forward_ms": 4.32688798904419,
+    "max_allocated_bytes": 2890180198.4,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_a2a_tp1ep8_fp8": {
+    "backward_ms": 6.618246412277221,
+    "forward_ms": 3.7876319885253906,
+    "max_allocated_bytes": 3095914086.4,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_deepep_tp1ep8_bf16": {
+    "backward_ms": 7.854356813430786,
+    "forward_ms": 4.635550403594971,
+    "max_allocated_bytes": 2890295808,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_deepep_tp1ep8_fp8": {
+    "backward_ms": 6.82720000743866,
+    "forward_ms": 4.101150441169739,
+    "max_allocated_bytes": 3095448064,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_hybridep_tp1ep8_bf16": {
+    "backward_ms": 7.740782427787781,
+    "forward_ms": 4.342604804039001,
+    "max_allocated_bytes": 2889559552,
+    "max_regression_ratio": 1.02
+  },
+  "mixtral_hybridep_tp1ep8_fp8": {
+    "backward_ms": 6.57167682647705,
+    "forward_ms": 3.7007392168045046,
+    "max_allocated_bytes": 3095269376,
+    "max_regression_ratio": 1.02
+  }
+}
diff --git a/tests/functional_tests/test_cases/common/moe_perf/test_cases.py b/tests/functional_tests/test_cases/common/moe_perf/test_cases.py
new file mode 100644
index 00000000000..8d8535ca513
--- /dev/null
+++ b/tests/functional_tests/test_cases/common/moe_perf/test_cases.py
@@ -0,0 +1,204 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+from dataclasses import dataclass
+from typing import Iterable, Optional
+
+import torch
+
+
+@dataclass(frozen=True)
+class MoEModelConfig:
+    seq_length: int
+    micro_batch_size: int
+    hidden_size: int
+    moe_ffn_hidden_size: int
+    num_experts: int
+    router_topk: int
+    num_attention_heads: int = 8
+    moe_shared_expert_intermediate_size: Optional[int] = None
+
+    # Router related
+    moe_router_load_balancing_type: str = "aux_loss"
+    moe_router_num_groups: Optional[int] = None
+    moe_router_group_topk: Optional[int] = None
+    moe_router_score_function: str = "softmax"
+    moe_router_dtype: str = "fp32"
+    moe_router_enable_expert_bias: bool = False
+
+
+@dataclass(frozen=True)
+class MoEPerformanceCase:
+    """Describes a single MoE performance configuration to exercise."""
+
+    name: str
+    model: MoEModelConfig
+
+    # Token dispatcher related
+    token_dispatcher: str
+    moe_flex_dispatcher_backend: str = "deepep"
+
+    # FP8 related
+    fp8: Optional[str] = None
+    fp8_recipe: Optional[str] = None
+
+    # Tested GPU platform
+    gpu_platform: str = "H100"
+
+    # Parallelism related
+    tensor_model_parallel_size: int = 1
+    pipeline_model_parallel_size: int = 1
+    expert_model_parallel_size: int = 1
+    context_parallel_size: int = 1
+    expert_tensor_parallel_size: int = 1
+
+    # kernel fusion related
+    moe_permute_fusion: bool = True
+    moe_router_fusion: bool = True
+
+    # Performance stability related
+    moe_router_force_load_balancing: bool = True
+    manual_gc: bool = True
+
+    @property
+    def input_dtype(self) -> torch.dtype:
+        return torch.bfloat16
+
+    def is_current_platform(self) -> bool:
+        if self.gpu_platform is None:
+            return True
+        device_name = torch.cuda.get_device_name(torch.cuda.current_device())
+        return self.gpu_platform.lower() in device_name.lower()
+
+
+MIXTRAL_PROXY = MoEModelConfig(
+    seq_length=4096,
+    micro_batch_size=1,
+    hidden_size=4096,
+    moe_ffn_hidden_size=14336,
+    num_experts=8,
+    router_topk=2,
+    moe_router_load_balancing_type="aux_loss",
+)
+
+DEEPSEEK_PROXY = MoEModelConfig(
+    seq_length=4096,
+    micro_batch_size=1,
+    hidden_size=7168,
+    moe_ffn_hidden_size=2048,
+    num_experts=32,
+    router_topk=8,
+    moe_router_load_balancing_type="seq_aux_loss",
+    moe_router_num_groups=8,
+    moe_router_group_topk=4,
+    moe_router_score_function="sigmoid",
+    moe_router_dtype="fp32",
+    moe_router_enable_expert_bias=True,
+    moe_shared_expert_intermediate_size=2048,
+)
+
+
+PERFORMANCE_CASES: Iterable[MoEPerformanceCase] = (
+    MoEPerformanceCase(
+        name="mixtral_a2a_tp1ep8_fp8",
+        token_dispatcher="alltoall",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="mixtral_deepep_tp1ep8_fp8",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="deepep",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="mixtral_hybridep_tp1ep8_fp8",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="hybridep",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="deepseek_a2a_tp1ep8_fp8",
+        token_dispatcher="alltoall",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="deepseek_hybridep_tp1ep8_fp8",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="hybridep",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="deepseek_deepep_tp1ep8_fp8",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="deepep",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+        fp8="e4m3",
+        fp8_recipe="blockwise",
+    ),
+    MoEPerformanceCase(
+        name="mixtral_a2a_tp1ep8_bf16",
+        token_dispatcher="alltoall",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+    MoEPerformanceCase(
+        name="mixtral_deepep_tp1ep8_bf16",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="deepep",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+    MoEPerformanceCase(
+        name="mixtral_hybridep_tp1ep8_bf16",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="hybridep",
+        model=MIXTRAL_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+    MoEPerformanceCase(
+        name="deepseek_a2a_tp1ep8_bf16",
+        token_dispatcher="alltoall",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+    MoEPerformanceCase(
+        name="deepseek_deepep_tp1ep8_bf16",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="deepep",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+    MoEPerformanceCase(
+        name="deepseek_hybridep_tp1ep8_bf16",
+        token_dispatcher="flex",
+        moe_flex_dispatcher_backend="hybridep",
+        model=DEEPSEEK_PROXY,
+        tensor_model_parallel_size=1,
+        expert_model_parallel_size=8,
+    ),
+)
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml
index 44f9de33775..692e3882e02 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml
@@ -84,6 +84,7 @@ MODEL_ARGS:
   --log-interval: 100
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   # Add mixed precision args
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
index 8b437ba75e7..4c05b0ba87f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
@@ -70,7 +70,8 @@ MODEL_ARGS:
   # Add checkpointing args
   --load: ${CHECKPOINT_LOAD_PATH}
   --save: ${CHECKPOINT_SAVE_PATH}
-  --save-interval: 1000
+  --save-interval: 5000
+  --save-retain-interval: 2500
   # Add initialization args
   --init-method-std: 0.0134
   # Add logging args
@@ -83,6 +84,7 @@ MODEL_ARGS:
   --log-interval: 100
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   # Add mixed precision args
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..0174aaf4684
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,162 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 12.59654,
+            "2": 12.60484,
+            "3": 12.59799,
+            "4": 12.59687,
+            "5": 12.59285,
+            "6": 12.59259,
+            "7": 12.58011,
+            "8": 12.54308,
+            "9": 12.51049,
+            "10": 12.49679,
+            "11": 12.32875,
+            "12": 12.29944,
+            "13": 12.2346,
+            "14": 12.23325,
+            "15": 11.81699,
+            "16": 11.80131,
+            "17": 11.76433,
+            "18": 11.73986,
+            "19": 11.6089,
+            "20": 11.50642,
+            "21": 11.26938,
+            "22": 11.37967,
+            "23": 11.288,
+            "24": 11.16331,
+            "25": 10.99891
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 521037632.0,
+            "2": 521666368.0,
+            "3": 520934816.0,
+            "4": 521227264.0,
+            "5": 520996064.0,
+            "6": 521371840.0,
+            "7": 521420352.0,
+            "8": 521057344.0,
+            "9": 521461504.0,
+            "10": 521178624.0,
+            "11": 522279104.0,
+            "12": 521439616.0,
+            "13": 521475712.0,
+            "14": 522445376.0,
+            "15": 521592960.0,
+            "16": 521416448.0,
+            "17": 521026496.0,
+            "18": 521277760.0,
+            "19": 521154656.0,
+            "20": 521134784.0,
+            "21": 522907648.0,
+            "22": 521590304.0,
+            "23": 521352384.0,
+            "24": 521424640.0,
+            "25": 523543808.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 24540168192.0,
+            "2": 24540168192.0,
+            "3": 24540168192.0,
+            "4": 24540168192.0,
+            "5": 24540168192.0,
+            "6": 24540168192.0,
+            "7": 24540168192.0,
+            "8": 24540168192.0,
+            "9": 24540168192.0,
+            "10": 24540168192.0,
+            "11": 24540168192.0,
+            "12": 24540168192.0,
+            "13": 24540168192.0,
+            "14": 24540168192.0,
+            "15": 24540168192.0,
+            "16": 24540168192.0,
+            "17": 24540168192.0,
+            "18": 24540168192.0,
+            "19": 24540168192.0,
+            "20": 24540168192.0,
+            "21": 24540168192.0,
+            "22": 24540168192.0,
+            "23": 24540168192.0,
+            "24": 24540168192.0,
+            "25": 24540168192.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 52729765888.0,
+            "2": 60518424576.0,
+            "3": 60518424576.0,
+            "4": 60518424576.0,
+            "5": 60518424576.0,
+            "6": 60518424576.0,
+            "7": 60518424576.0,
+            "8": 60518424576.0,
+            "9": 60518424576.0,
+            "10": 60518424576.0,
+            "11": 60518424576.0,
+            "12": 60518424576.0,
+            "13": 60518424576.0,
+            "14": 60518424576.0,
+            "15": 60518424576.0,
+            "16": 60518424576.0,
+            "17": 60518424576.0,
+            "18": 60518424576.0,
+            "19": 60518424576.0,
+            "20": 60518424576.0,
+            "21": 60518424576.0,
+            "22": 60518424576.0,
+            "23": 60518424576.0,
+            "24": 60518424576.0,
+            "25": 60518424576.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.93489,
+            "3": "nan",
+            "4": 0.83885,
+            "5": "nan",
+            "6": 0.86101,
+            "7": "nan",
+            "8": 0.82617,
+            "9": "nan",
+            "10": 0.8264,
+            "11": "nan",
+            "12": 0.82456,
+            "13": "nan",
+            "14": 0.82414,
+            "15": "nan",
+            "16": 0.82604,
+            "17": "nan",
+            "18": 0.83002,
+            "19": "nan",
+            "20": 0.8234,
+            "21": "nan",
+            "22": 0.82298,
+            "23": "nan",
+            "24": 0.82311,
+            "25": "nan"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json
index 478f889b21c..2ed3bf0784f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json
@@ -88,7 +88,7 @@
             "18": 24540168192.0,
             "19": 24540168192.0,
             "20": 24540168192.0,
-            "21": 24540389376.0,
+            "21": 24540168192.0,
             "22": 24540168192.0,
             "23": 24540168192.0,
             "24": 24540168192.0,
@@ -100,7 +100,7 @@
         "end_step": 25,
         "step_interval": 1,
         "values": {
-            "1": 52730810368.0,
+            "1": 52730814464.0,
             "2": 60518424576.0,
             "3": 60518424576.0,
             "4": 60518424576.0,
@@ -133,29 +133,29 @@
         "step_interval": 1,
         "values": {
             "1": "nan",
-            "2": 10.03336,
+            "2": 11.06832,
             "3": "nan",
-            "4": 1.18525,
+            "4": 1.16152,
             "5": "nan",
-            "6": 1.18158,
+            "6": 1.15069,
             "7": "nan",
-            "8": 1.18536,
+            "8": 1.15402,
             "9": "nan",
-            "10": 1.18428,
+            "10": 1.15412,
             "11": "nan",
-            "12": 1.18625,
+            "12": 1.15321,
             "13": "nan",
-            "14": 1.18256,
+            "14": 1.15624,
             "15": "nan",
-            "16": 1.18023,
+            "16": 1.1571,
             "17": "nan",
-            "18": 1.18227,
+            "18": 1.15577,
             "19": "nan",
-            "20": 1.18284,
+            "20": 1.15939,
             "21": "nan",
-            "22": 1.18238,
+            "22": 1.15675,
             "23": "nan",
-            "24": 1.18151,
+            "24": 1.15533,
             "25": "nan"
         }
     }
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml
index c2a26a070fb..52b7c68d384 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml
@@ -53,7 +53,6 @@ MODEL_ARGS:
   --no-gradient-accumulation-fusion: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..ca51cd1bcb3
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,162 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 12.61164,
+            "2": 12.60596,
+            "3": 12.60278,
+            "4": 12.59692,
+            "5": 12.5956,
+            "6": 12.59777,
+            "7": 12.58051,
+            "8": 12.53845,
+            "9": 12.51222,
+            "10": 12.49859,
+            "11": 12.32384,
+            "12": 12.29418,
+            "13": 12.23141,
+            "14": 12.22824,
+            "15": 11.82221,
+            "16": 11.80412,
+            "17": 11.76119,
+            "18": 11.73708,
+            "19": 11.61309,
+            "20": 11.50147,
+            "21": 11.26475,
+            "22": 11.37638,
+            "23": 11.28398,
+            "24": 11.1565,
+            "25": 10.99865
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 523049152.0,
+            "2": 523677792.0,
+            "3": 522947712.0,
+            "4": 523241632.0,
+            "5": 523021120.0,
+            "6": 523374368.0,
+            "7": 523437888.0,
+            "8": 523083584.0,
+            "9": 523470432.0,
+            "10": 523196128.0,
+            "11": 524297728.0,
+            "12": 523455584.0,
+            "13": 523501312.0,
+            "14": 524479392.0,
+            "15": 523634048.0,
+            "16": 523462624.0,
+            "17": 523079392.0,
+            "18": 523360448.0,
+            "19": 523209952.0,
+            "20": 523228480.0,
+            "21": 524938432.0,
+            "22": 523660512.0,
+            "23": 523415872.0,
+            "24": 523485056.0,
+            "25": 525638592.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 20663463936.0,
+            "2": 20663463936.0,
+            "3": 20663463936.0,
+            "4": 20663463936.0,
+            "5": 20663463936.0,
+            "6": 20663463936.0,
+            "7": 20663463936.0,
+            "8": 20663463936.0,
+            "9": 20663463936.0,
+            "10": 20663463936.0,
+            "11": 20663463936.0,
+            "12": 20663463936.0,
+            "13": 20663463936.0,
+            "14": 20663463936.0,
+            "15": 20663463936.0,
+            "16": 20663463936.0,
+            "17": 20663463936.0,
+            "18": 20663463936.0,
+            "19": 20663463936.0,
+            "20": 20663463936.0,
+            "21": 20663463936.0,
+            "22": 20663463936.0,
+            "23": 20663463936.0,
+            "24": 20663463936.0,
+            "25": 20663463936.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": 51363229696.0,
+            "2": 58217480192.0,
+            "3": 58217480192.0,
+            "4": 58217480192.0,
+            "5": 58217480192.0,
+            "6": 58217480192.0,
+            "7": 58217480192.0,
+            "8": 58217480192.0,
+            "9": 58217480192.0,
+            "10": 58217480192.0,
+            "11": 58217480192.0,
+            "12": 58217480192.0,
+            "13": 58217480192.0,
+            "14": 58217480192.0,
+            "15": 58217480192.0,
+            "16": 58217480192.0,
+            "17": 58217480192.0,
+            "18": 58217480192.0,
+            "19": 58217480192.0,
+            "20": 58217480192.0,
+            "21": 58217480192.0,
+            "22": 58217480192.0,
+            "23": 58217480192.0,
+            "24": 58217480192.0,
+            "25": 58217480192.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 25,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.88247,
+            "3": "nan",
+            "4": 0.98359,
+            "5": "nan",
+            "6": 0.91373,
+            "7": "nan",
+            "8": 1.07044,
+            "9": "nan",
+            "10": 0.91309,
+            "11": "nan",
+            "12": 0.91579,
+            "13": "nan",
+            "14": 0.90609,
+            "15": "nan",
+            "16": 0.90906,
+            "17": "nan",
+            "18": 0.91134,
+            "19": "nan",
+            "20": 0.90623,
+            "21": "nan",
+            "22": 0.91236,
+            "23": "nan",
+            "24": 0.9145,
+            "25": "nan"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json
index 0847af86737..a05cc0a0778 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json
@@ -100,7 +100,7 @@
         "end_step": 25,
         "step_interval": 1,
         "values": {
-            "1": 50289545216.0,
+            "1": 50289487872.0,
             "2": 57143791616.0,
             "3": 57143791616.0,
             "4": 57143791616.0,
@@ -133,29 +133,29 @@
         "step_interval": 1,
         "values": {
             "1": "nan",
-            "2": 6.11084,
+            "2": 5.99154,
             "3": "nan",
-            "4": 1.11678,
+            "4": 1.10664,
             "5": "nan",
-            "6": 1.11532,
+            "6": 1.10108,
             "7": "nan",
-            "8": 1.11539,
+            "8": 1.09852,
             "9": "nan",
-            "10": 1.1161,
+            "10": 1.10395,
             "11": "nan",
-            "12": 1.11723,
+            "12": 1.13133,
             "13": "nan",
-            "14": 1.11756,
+            "14": 1.1009,
             "15": "nan",
-            "16": 1.11596,
+            "16": 1.10173,
             "17": "nan",
-            "18": 1.11605,
+            "18": 1.10058,
             "19": "nan",
-            "20": 1.11783,
+            "20": 1.10006,
             "21": "nan",
-            "22": 1.11636,
+            "22": 1.10081,
             "23": "nan",
-            "24": 1.11585,
+            "24": 1.09852,
             "25": "nan"
         }
     }
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml
index 9cd921e9833..cd590ff1554 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml
@@ -57,7 +57,6 @@ MODEL_ARGS:
   --save-interval: 10000
   --eval-interval: 1000
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}
@@ -81,9 +80,9 @@ MODEL_ARGS:
   --error-injection-type: persistent_error
 AFTER_SCRIPT: |
   check_log() { if [[ -z $(grep -r $1 "$2" $LOG_DIR) ]]; then exit 1; else echo OK; fi }
-  check_log -F "WARNING:megatron.core.rerun_state_machine:Result validation enabled"
-  check_log -F "WARNING:megatron.core.rerun_state_machine:Injecting error type Persistent error"
-  check_log -F "WARNING:megatron.core.rerun_state_machine:First rerun: unexpected result is reproducible within the tolerance"
-  check_log -F "WARNING:megatron.core.rerun_state_machine:Saving a checkpoint and exiting now. Please resume the job from the checkpoint to rerun the last iteration and establish a diagnostic"
+  check_log -F "Result validation enabled"
+  check_log -F "Injecting error type Persistent error"
+  check_log -F "First rerun: unexpected result is reproducible within the tolerance"
+  check_log -F "Saving a checkpoint and exiting now. Please resume the job from the checkpoint to rerun the last iteration and establish a diagnostic"
   EXIT_CODE=0
-TEST_TYPE: regular
\ No newline at end of file
+TEST_TYPE: regular
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..d501eb20ca1
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/golden_values_dev_dgx_h100.json
@@ -0,0 +1,42 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 1,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86791
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 1,
+        "step_interval": 1,
+        "values": {
+            "1": 152866448.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 1,
+        "step_interval": 1,
+        "values": {
+            "1": 67277201408.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 1,
+        "step_interval": 1,
+        "values": {
+            "1": 67277205504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 1,
+        "step_interval": 1,
+        "values": {
+            "1": 14.45281
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml
index 582c9523f73..aff4e8a6f3d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml
@@ -85,6 +85,7 @@ BASE_MODEL_ARGS: &BASE_MODEL_ARGS
   --eval-interval: 1000
   --ckpt-format: torch_dist
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
+  --dist-ckpt-optim-fully-reshardable: true
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2
   # data settings
@@ -101,8 +102,6 @@ BASE_MODEL_ARGS: &BASE_MODEL_ARGS
   --log-validation-ppl-to-tensorboard: true
   --log-timers-to-tensorboard: true
   --log-memory-to-tensorboard: true
-  # Needed for easy access to optimizer states
-  --dist-ckpt-save-pre-mcore-014: true
   # rerun settings
   --rerun-mode: validate_results
 MODEL_ARGS:
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml
index 59a57e2212b..b091c0ff9f6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml
@@ -57,7 +57,6 @@ MODEL_ARGS:
   --save-interval: 10000
   --eval-interval: 1000
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}
@@ -86,4 +85,4 @@ AFTER_SCRIPT: |
   check_log -F "WARNING:megatron.core.rerun_state_machine:Injecting error type Transient error"
   check_log -E "ERROR:megatron\.core\.rerun_state_machine:Rank [0-9]+, node ([0-9a-z]|\-)+, device [0-9]+: Possible transient error!!"
   EXIT_CODE=0
-TEST_TYPE: regular
\ No newline at end of file
+TEST_TYPE: regular
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..bac18297ae6
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77536,
+            "2": 10.78444,
+            "3": 10.78593,
+            "4": 10.7484,
+            "5": 10.81554,
+            "6": 10.82691,
+            "7": 10.78469,
+            "8": 10.77764,
+            "9": 10.78351,
+            "10": 10.74241,
+            "11": 10.83031,
+            "12": 10.80335,
+            "13": 10.81653,
+            "14": 10.82186,
+            "15": 10.74223,
+            "16": 10.75087,
+            "17": 10.71888,
+            "18": 10.74308,
+            "19": 10.7407,
+            "20": 10.63713,
+            "21": 10.6277,
+            "22": 10.48435,
+            "23": 10.65701,
+            "24": 10.52682,
+            "25": 10.47546,
+            "26": 10.54091,
+            "27": 10.55554,
+            "28": 10.52147,
+            "29": 10.53465,
+            "30": 10.30892,
+            "31": 10.06663,
+            "32": 10.41746,
+            "33": 10.42487,
+            "34": 10.1739,
+            "35": 10.22475,
+            "36": 10.18282,
+            "37": 10.29689,
+            "38": 10.14801,
+            "39": 10.36934,
+            "40": 10.04004,
+            "41": 10.10752,
+            "42": 10.18198,
+            "43": 9.79649,
+            "44": 9.91071,
+            "45": 9.79715,
+            "46": 9.79411,
+            "47": 10.11365,
+            "48": 9.82516,
+            "49": 9.50416,
+            "50": 9.88698
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1625.0,
+            "2": 1666.0,
+            "3": 1695.0,
+            "4": 1746.0,
+            "5": 1977.0,
+            "6": 1839.0,
+            "7": 1894.0,
+            "8": 1665.0,
+            "9": 1929.0,
+            "10": 1436.0,
+            "11": 1794.0,
+            "12": 1845.0,
+            "13": 1976.0,
+            "14": 1931.0,
+            "15": 1971.0,
+            "16": 2095.0,
+            "17": 1805.0,
+            "18": 1764.0,
+            "19": 1753.0,
+            "20": 1693.0,
+            "21": 1872.0,
+            "22": 1669.0,
+            "23": 2113.0,
+            "24": 1589.0,
+            "25": 1679.0,
+            "26": 1667.0,
+            "27": 1779.0,
+            "28": 2025.0,
+            "29": 1940.0,
+            "30": 1885.0,
+            "31": 1623.0,
+            "32": 1978.0,
+            "33": 2203.0,
+            "34": 1947.0,
+            "35": 2040.0,
+            "36": 2002.0,
+            "37": 2346.0,
+            "38": 2100.0,
+            "39": 2479.0,
+            "40": 2258.0,
+            "41": 2347.0,
+            "42": 2331.0,
+            "43": 2125.0,
+            "44": 2126.0,
+            "45": 2130.0,
+            "46": 2342.0,
+            "47": 2550.0,
+            "48": 2401.0,
+            "49": 2216.0,
+            "50": 2456.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 581489664.0,
+            "2": 581489664.0,
+            "3": 581489664.0,
+            "4": 581489664.0,
+            "5": 581489664.0,
+            "6": 581489664.0,
+            "7": 581489664.0,
+            "8": 581489664.0,
+            "9": 581489664.0,
+            "10": 581489664.0,
+            "11": 581489664.0,
+            "12": 581489664.0,
+            "13": 581489664.0,
+            "14": 581489664.0,
+            "15": 581489664.0,
+            "16": 581489664.0,
+            "17": 581489664.0,
+            "18": 581489664.0,
+            "19": 581489664.0,
+            "20": 581489664.0,
+            "21": 581489664.0,
+            "22": 581489664.0,
+            "23": 581489664.0,
+            "24": 581489664.0,
+            "25": 581489664.0,
+            "26": 581489664.0,
+            "27": 581489664.0,
+            "28": 581489664.0,
+            "29": 581489664.0,
+            "30": 581489664.0,
+            "31": 581489664.0,
+            "32": 581489664.0,
+            "33": 581489664.0,
+            "34": 581489664.0,
+            "35": 581489664.0,
+            "36": 581489664.0,
+            "37": 581489664.0,
+            "38": 581489664.0,
+            "39": 581489664.0,
+            "40": 581489664.0,
+            "41": 581489664.0,
+            "42": 581489664.0,
+            "43": 581489664.0,
+            "44": 581489664.0,
+            "45": 581489664.0,
+            "46": 581489664.0,
+            "47": 581489664.0,
+            "48": 581489664.0,
+            "49": 581489664.0,
+            "50": 581489664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4605814272.0,
+            "2": 4702430720.0,
+            "3": 4702430720.0,
+            "4": 4702430720.0,
+            "5": 4702430720.0,
+            "6": 4702430720.0,
+            "7": 4702430720.0,
+            "8": 4702430720.0,
+            "9": 4702430720.0,
+            "10": 4702430720.0,
+            "11": 4702430720.0,
+            "12": 4702430720.0,
+            "13": 4702430720.0,
+            "14": 4702430720.0,
+            "15": 4702430720.0,
+            "16": 4702430720.0,
+            "17": 4702430720.0,
+            "18": 4702430720.0,
+            "19": 4702430720.0,
+            "20": 4702430720.0,
+            "21": 4702430720.0,
+            "22": 4702430720.0,
+            "23": 4702430720.0,
+            "24": 4702430720.0,
+            "25": 4702430720.0,
+            "26": 4702430720.0,
+            "27": 4702430720.0,
+            "28": 4702430720.0,
+            "29": 4702430720.0,
+            "30": 4702430720.0,
+            "31": 4702430720.0,
+            "32": 4702430720.0,
+            "33": 4702430720.0,
+            "34": 4702430720.0,
+            "35": 4702430720.0,
+            "36": 4702430720.0,
+            "37": 4702430720.0,
+            "38": 4702430720.0,
+            "39": 4702430720.0,
+            "40": 4702430720.0,
+            "41": 4702430720.0,
+            "42": 4702430720.0,
+            "43": 4702430720.0,
+            "44": 4702430720.0,
+            "45": 4702430720.0,
+            "46": 4702430720.0,
+            "47": 4702430720.0,
+            "48": 4702430720.0,
+            "49": 4702430720.0,
+            "50": 4702430720.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.46491,
+            "3": 0.11014,
+            "4": 0.09722,
+            "5": 0.09703,
+            "6": 0.09705,
+            "7": 0.09593,
+            "8": 0.09584,
+            "9": 0.09505,
+            "10": 0.0949,
+            "11": 0.09504,
+            "12": 0.09589,
+            "13": 0.09506,
+            "14": 0.09425,
+            "15": 0.09404,
+            "16": 0.09465,
+            "17": 0.09237,
+            "18": 0.09201,
+            "19": 0.09159,
+            "20": 0.09124,
+            "21": 0.09092,
+            "22": 0.09028,
+            "23": 0.08966,
+            "24": 0.08893,
+            "25": 0.09042,
+            "26": 0.09055,
+            "27": 0.08889,
+            "28": 0.08857,
+            "29": 0.0884,
+            "30": 0.08807,
+            "31": 0.08777,
+            "32": 0.08747,
+            "33": 0.0876,
+            "34": 0.08733,
+            "35": 0.0886,
+            "36": 0.08828,
+            "37": 0.08789,
+            "38": 0.08768,
+            "39": 0.08819,
+            "40": 0.08922,
+            "41": 0.08797,
+            "42": 0.0876,
+            "43": 0.0868,
+            "44": 0.08693,
+            "45": 0.08661,
+            "46": 0.08657,
+            "47": 0.08769,
+            "48": 0.08644,
+            "49": 0.08681,
+            "50": 0.08702
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json
index cd90888e65d..036b53dabb1 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 6.95394,
-            "2": 0.0878,
-            "3": 0.06953,
-            "4": 0.07916,
-            "5": 0.06775,
-            "6": 0.07681,
-            "7": 0.06695,
-            "8": 0.0786,
-            "9": 0.0664,
-            "10": 0.08059,
-            "11": 0.06554,
-            "12": 0.07501,
-            "13": 0.06663,
-            "14": 0.06608,
-            "15": 0.06585,
-            "16": 0.06738,
-            "17": 0.067,
-            "18": 0.06553,
-            "19": 0.06755,
-            "20": 0.06723,
-            "21": 0.06559,
-            "22": 0.0664,
-            "23": 0.06722,
-            "24": 0.06553,
-            "25": 0.06829,
-            "26": 0.06873,
-            "27": 0.06733,
-            "28": 0.06731,
-            "29": 0.06824,
-            "30": 0.06696,
-            "31": 0.06661,
-            "32": 0.06587,
-            "33": 0.06588,
-            "34": 0.06564,
-            "35": 0.06761,
-            "36": 0.06655,
-            "37": 0.06712,
-            "38": 0.06601,
-            "39": 0.06661,
-            "40": 0.06632,
-            "41": 0.0691,
-            "42": 0.06551,
-            "43": 0.06839,
-            "44": 0.06528,
-            "45": 0.06744,
-            "46": 0.0675,
-            "47": 0.06698,
-            "48": 0.0649,
-            "49": 0.06596,
-            "50": 0.06581
+            "1": 6.80579,
+            "2": 0.08104,
+            "3": 0.07547,
+            "4": 0.05731,
+            "5": 0.06226,
+            "6": 0.05988,
+            "7": 0.06566,
+            "8": 0.06635,
+            "9": 0.06593,
+            "10": 0.06639,
+            "11": 0.06591,
+            "12": 0.06568,
+            "13": 0.06504,
+            "14": 0.06232,
+            "15": 0.06162,
+            "16": 0.05614,
+            "17": 0.06083,
+            "18": 0.05789,
+            "19": 0.05867,
+            "20": 0.05574,
+            "21": 0.06043,
+            "22": 0.05778,
+            "23": 0.06166,
+            "24": 0.05671,
+            "25": 0.05765,
+            "26": 0.05638,
+            "27": 0.05601,
+            "28": 0.05637,
+            "29": 0.05497,
+            "30": 0.05757,
+            "31": 0.05556,
+            "32": 0.05715,
+            "33": 0.05761,
+            "34": 0.05779,
+            "35": 0.05996,
+            "36": 0.05761,
+            "37": 0.06454,
+            "38": 0.0575,
+            "39": 0.05802,
+            "40": 0.05752,
+            "41": 0.05904,
+            "42": 0.05622,
+            "43": 0.0555,
+            "44": 0.05785,
+            "45": 0.0578,
+            "46": 0.05758,
+            "47": 0.05729,
+            "48": 0.05652,
+            "49": 0.05619,
+            "50": 0.05705
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..8bcd3aa91d5
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77472,
+            "2": 10.7834,
+            "3": 10.783,
+            "4": 10.74952,
+            "5": 10.8207,
+            "6": 10.8234,
+            "7": 10.79076,
+            "8": 10.78002,
+            "9": 10.78621,
+            "10": 10.74365,
+            "11": 10.8322,
+            "12": 10.80441,
+            "13": 10.8213,
+            "14": 10.82574,
+            "15": 10.74146,
+            "16": 10.75035,
+            "17": 10.72535,
+            "18": 10.74231,
+            "19": 10.7445,
+            "20": 10.63706,
+            "21": 10.63104,
+            "22": 10.48032,
+            "23": 10.65993,
+            "24": 10.5253,
+            "25": 10.47539,
+            "26": 10.54133,
+            "27": 10.5547,
+            "28": 10.521,
+            "29": 10.53614,
+            "30": 10.30519,
+            "31": 10.06487,
+            "32": 10.41559,
+            "33": 10.42241,
+            "34": 10.1741,
+            "35": 10.22337,
+            "36": 10.18522,
+            "37": 10.30398,
+            "38": 10.14967,
+            "39": 10.37031,
+            "40": 10.04015,
+            "41": 10.10913,
+            "42": 10.17951,
+            "43": 9.79734,
+            "44": 9.90801,
+            "45": 9.79837,
+            "46": 9.79661,
+            "47": 10.12063,
+            "48": 9.82076,
+            "49": 9.50507,
+            "50": 9.88047
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1603.0,
+            "2": 1689.0,
+            "3": 1616.0,
+            "4": 1774.0,
+            "5": 2059.0,
+            "6": 1983.0,
+            "7": 2102.0,
+            "8": 1640.0,
+            "9": 1877.0,
+            "10": 1435.0,
+            "11": 1981.0,
+            "12": 1898.0,
+            "13": 1949.0,
+            "14": 1797.0,
+            "15": 1923.0,
+            "16": 1993.0,
+            "17": 1804.0,
+            "18": 1793.0,
+            "19": 1808.0,
+            "20": 1658.0,
+            "21": 1881.0,
+            "22": 1744.0,
+            "23": 2029.0,
+            "24": 1621.0,
+            "25": 1550.0,
+            "26": 1686.0,
+            "27": 1794.0,
+            "28": 1927.0,
+            "29": 1974.0,
+            "30": 1884.0,
+            "31": 1610.0,
+            "32": 1934.0,
+            "33": 2098.0,
+            "34": 1840.0,
+            "35": 2033.0,
+            "36": 2052.0,
+            "37": 2302.0,
+            "38": 2119.0,
+            "39": 2421.0,
+            "40": 2242.0,
+            "41": 2339.0,
+            "42": 2362.0,
+            "43": 2065.0,
+            "44": 2186.0,
+            "45": 2266.0,
+            "46": 2378.0,
+            "47": 2504.0,
+            "48": 2503.0,
+            "49": 2303.0,
+            "50": 2494.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 581489664.0,
+            "2": 581489664.0,
+            "3": 581489664.0,
+            "4": 581489664.0,
+            "5": 581489664.0,
+            "6": 581489664.0,
+            "7": 581489664.0,
+            "8": 581489664.0,
+            "9": 581489664.0,
+            "10": 581489664.0,
+            "11": 581489664.0,
+            "12": 581489664.0,
+            "13": 581489664.0,
+            "14": 581489664.0,
+            "15": 581489664.0,
+            "16": 581489664.0,
+            "17": 581489664.0,
+            "18": 581489664.0,
+            "19": 581489664.0,
+            "20": 581489664.0,
+            "21": 581489664.0,
+            "22": 581489664.0,
+            "23": 581489664.0,
+            "24": 581489664.0,
+            "25": 581489664.0,
+            "26": 581489664.0,
+            "27": 581489664.0,
+            "28": 581489664.0,
+            "29": 581489664.0,
+            "30": 581489664.0,
+            "31": 581489664.0,
+            "32": 581489664.0,
+            "33": 581489664.0,
+            "34": 581489664.0,
+            "35": 581489664.0,
+            "36": 581489664.0,
+            "37": 581489664.0,
+            "38": 581489664.0,
+            "39": 581489664.0,
+            "40": 581489664.0,
+            "41": 581489664.0,
+            "42": 581489664.0,
+            "43": 581489664.0,
+            "44": 581489664.0,
+            "45": 581489664.0,
+            "46": 581489664.0,
+            "47": 581489664.0,
+            "48": 581489664.0,
+            "49": 581489664.0,
+            "50": 581489664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4605814272.0,
+            "2": 4702430720.0,
+            "3": 4702430720.0,
+            "4": 4702430720.0,
+            "5": 4702430720.0,
+            "6": 4702430720.0,
+            "7": 4702430720.0,
+            "8": 4702430720.0,
+            "9": 4702430720.0,
+            "10": 4702430720.0,
+            "11": 4702430720.0,
+            "12": 4702430720.0,
+            "13": 4702430720.0,
+            "14": 4702430720.0,
+            "15": 4702430720.0,
+            "16": 4702430720.0,
+            "17": 4702430720.0,
+            "18": 4702430720.0,
+            "19": 4702430720.0,
+            "20": 4702430720.0,
+            "21": 4702430720.0,
+            "22": 4702430720.0,
+            "23": 4702430720.0,
+            "24": 4702430720.0,
+            "25": 4702430720.0,
+            "26": 4702430720.0,
+            "27": 4702430720.0,
+            "28": 4702430720.0,
+            "29": 4702430720.0,
+            "30": 4702430720.0,
+            "31": 4702430720.0,
+            "32": 4702430720.0,
+            "33": 4702430720.0,
+            "34": 4702430720.0,
+            "35": 4702430720.0,
+            "36": 4702430720.0,
+            "37": 4702430720.0,
+            "38": 4702430720.0,
+            "39": 4702430720.0,
+            "40": 4702430720.0,
+            "41": 4702430720.0,
+            "42": 4702430720.0,
+            "43": 4702430720.0,
+            "44": 4702430720.0,
+            "45": 4702430720.0,
+            "46": 4702430720.0,
+            "47": 4702430720.0,
+            "48": 4702430720.0,
+            "49": 4702430720.0,
+            "50": 4702430720.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.34829,
+            "3": 0.10511,
+            "4": 0.09797,
+            "5": 0.09705,
+            "6": 0.09665,
+            "7": 0.09616,
+            "8": 0.09616,
+            "9": 0.0968,
+            "10": 0.09708,
+            "11": 0.09598,
+            "12": 0.09533,
+            "13": 0.09549,
+            "14": 0.09665,
+            "15": 0.09303,
+            "16": 0.0963,
+            "17": 0.10058,
+            "18": 0.09955,
+            "19": 0.10067,
+            "20": 0.10221,
+            "21": 0.09941,
+            "22": 0.09872,
+            "23": 0.09975,
+            "24": 0.10322,
+            "25": 0.09837,
+            "26": 0.09834,
+            "27": 0.09843,
+            "28": 0.09692,
+            "29": 0.09907,
+            "30": 0.09889,
+            "31": 0.10064,
+            "32": 0.09748,
+            "33": 0.09927,
+            "34": 0.09831,
+            "35": 0.09862,
+            "36": 0.09852,
+            "37": 0.09869,
+            "38": 0.09941,
+            "39": 0.09945,
+            "40": 0.10014,
+            "41": 0.09934,
+            "42": 0.10081,
+            "43": 0.10148,
+            "44": 0.09766,
+            "45": 0.09746,
+            "46": 0.09842,
+            "47": 0.09924,
+            "48": 0.09864,
+            "49": 0.09829,
+            "50": 0.09685
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json
index db410897813..5718cc22850 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 581488640.0,
-            "2": 581488640.0,
-            "3": 581488640.0,
-            "4": 581488640.0,
-            "5": 581488640.0,
-            "6": 581488640.0,
-            "7": 581488640.0,
-            "8": 581488640.0,
-            "9": 581488640.0,
-            "10": 581488640.0,
-            "11": 581488640.0,
-            "12": 581488640.0,
-            "13": 581488640.0,
-            "14": 581488640.0,
-            "15": 581488640.0,
-            "16": 581488640.0,
-            "17": 581488640.0,
-            "18": 581488640.0,
-            "19": 581488640.0,
-            "20": 581488640.0,
-            "21": 581488640.0,
-            "22": 581488640.0,
-            "23": 581488640.0,
-            "24": 581488640.0,
-            "25": 581488640.0,
-            "26": 581488640.0,
-            "27": 581488640.0,
-            "28": 581488640.0,
-            "29": 581488640.0,
-            "30": 581488640.0,
-            "31": 581488640.0,
-            "32": 581488640.0,
-            "33": 581488640.0,
-            "34": 581488640.0,
-            "35": 581488640.0,
-            "36": 581488640.0,
-            "37": 581488640.0,
-            "38": 581488640.0,
-            "39": 581488640.0,
-            "40": 581488640.0,
-            "41": 581488640.0,
-            "42": 581488640.0,
-            "43": 581488640.0,
-            "44": 581488640.0,
-            "45": 581488640.0,
-            "46": 581488640.0,
-            "47": 581488640.0,
-            "48": 581488640.0,
-            "49": 581488640.0,
-            "50": 581488640.0
+            "1": 581489664.0,
+            "2": 581489664.0,
+            "3": 581489664.0,
+            "4": 581489664.0,
+            "5": 581489664.0,
+            "6": 581489664.0,
+            "7": 581489664.0,
+            "8": 581489664.0,
+            "9": 581489664.0,
+            "10": 581489664.0,
+            "11": 581489664.0,
+            "12": 581489664.0,
+            "13": 581489664.0,
+            "14": 581489664.0,
+            "15": 581489664.0,
+            "16": 581489664.0,
+            "17": 581489664.0,
+            "18": 581489664.0,
+            "19": 581489664.0,
+            "20": 581489664.0,
+            "21": 581489664.0,
+            "22": 581489664.0,
+            "23": 581489664.0,
+            "24": 581489664.0,
+            "25": 581489664.0,
+            "26": 581489664.0,
+            "27": 581489664.0,
+            "28": 581489664.0,
+            "29": 581489664.0,
+            "30": 581489664.0,
+            "31": 581489664.0,
+            "32": 581489664.0,
+            "33": 581489664.0,
+            "34": 581489664.0,
+            "35": 581489664.0,
+            "36": 581489664.0,
+            "37": 581489664.0,
+            "38": 581489664.0,
+            "39": 581489664.0,
+            "40": 581489664.0,
+            "41": 581489664.0,
+            "42": 581489664.0,
+            "43": 581489664.0,
+            "44": 581489664.0,
+            "45": 581489664.0,
+            "46": 581489664.0,
+            "47": 581489664.0,
+            "48": 581489664.0,
+            "49": 581489664.0,
+            "50": 581489664.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4605813248.0,
-            "2": 4702429696.0,
-            "3": 4702429696.0,
-            "4": 4702429696.0,
-            "5": 4702429696.0,
-            "6": 4702429696.0,
-            "7": 4702429696.0,
-            "8": 4702429696.0,
-            "9": 4702429696.0,
-            "10": 4702429696.0,
-            "11": 4702429696.0,
-            "12": 4702429696.0,
-            "13": 4702429696.0,
-            "14": 4702429696.0,
-            "15": 4702429696.0,
-            "16": 4702429696.0,
-            "17": 4702429696.0,
-            "18": 4702429696.0,
-            "19": 4702429696.0,
-            "20": 4702429696.0,
-            "21": 4702429696.0,
-            "22": 4702429696.0,
-            "23": 4702429696.0,
-            "24": 4702429696.0,
-            "25": 4702429696.0,
-            "26": 4702429696.0,
-            "27": 4702429696.0,
-            "28": 4702429696.0,
-            "29": 4702429696.0,
-            "30": 4702429696.0,
-            "31": 4702429696.0,
-            "32": 4702429696.0,
-            "33": 4702429696.0,
-            "34": 4702429696.0,
-            "35": 4702429696.0,
-            "36": 4702429696.0,
-            "37": 4702429696.0,
-            "38": 4702429696.0,
-            "39": 4702429696.0,
-            "40": 4702429696.0,
-            "41": 4702429696.0,
-            "42": 4702429696.0,
-            "43": 4702429696.0,
-            "44": 4702429696.0,
-            "45": 4702429696.0,
-            "46": 4702429696.0,
-            "47": 4702429696.0,
-            "48": 4702429696.0,
-            "49": 4702429696.0,
-            "50": 4702429696.0
+            "1": 4605814272.0,
+            "2": 4702430720.0,
+            "3": 4702430720.0,
+            "4": 4702430720.0,
+            "5": 4702430720.0,
+            "6": 4702430720.0,
+            "7": 4702430720.0,
+            "8": 4702430720.0,
+            "9": 4702430720.0,
+            "10": 4702430720.0,
+            "11": 4702430720.0,
+            "12": 4702430720.0,
+            "13": 4702430720.0,
+            "14": 4702430720.0,
+            "15": 4702430720.0,
+            "16": 4702430720.0,
+            "17": 4702430720.0,
+            "18": 4702430720.0,
+            "19": 4702430720.0,
+            "20": 4702430720.0,
+            "21": 4702430720.0,
+            "22": 4702430720.0,
+            "23": 4702430720.0,
+            "24": 4702430720.0,
+            "25": 4702430720.0,
+            "26": 4702430720.0,
+            "27": 4702430720.0,
+            "28": 4702430720.0,
+            "29": 4702430720.0,
+            "30": 4702430720.0,
+            "31": 4702430720.0,
+            "32": 4702430720.0,
+            "33": 4702430720.0,
+            "34": 4702430720.0,
+            "35": 4702430720.0,
+            "36": 4702430720.0,
+            "37": 4702430720.0,
+            "38": 4702430720.0,
+            "39": 4702430720.0,
+            "40": 4702430720.0,
+            "41": 4702430720.0,
+            "42": 4702430720.0,
+            "43": 4702430720.0,
+            "44": 4702430720.0,
+            "45": 4702430720.0,
+            "46": 4702430720.0,
+            "47": 4702430720.0,
+            "48": 4702430720.0,
+            "49": 4702430720.0,
+            "50": 4702430720.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 6.7331,
-            "2": 0.09599,
-            "3": 0.08799,
-            "4": 0.08582,
-            "5": 0.08478,
-            "6": 0.08513,
-            "7": 0.07688,
-            "8": 0.07429,
-            "9": 0.07778,
-            "10": 0.07515,
-            "11": 0.07987,
-            "12": 0.07525,
-            "13": 0.07727,
-            "14": 0.07535,
-            "15": 0.07896,
-            "16": 0.07509,
-            "17": 0.07751,
-            "18": 0.076,
-            "19": 0.07647,
-            "20": 0.07502,
-            "21": 0.07467,
-            "22": 0.07544,
-            "23": 0.0742,
-            "24": 0.07536,
-            "25": 0.07588,
-            "26": 0.07381,
-            "27": 0.07407,
-            "28": 0.075,
-            "29": 0.07424,
-            "30": 0.07454,
-            "31": 0.07482,
-            "32": 0.07526,
-            "33": 0.07493,
-            "34": 0.07437,
-            "35": 0.07447,
-            "36": 0.07482,
-            "37": 0.07454,
-            "38": 0.07501,
-            "39": 0.07495,
-            "40": 0.07481,
-            "41": 0.07433,
-            "42": 0.07467,
-            "43": 0.0754,
-            "44": 0.07543,
-            "45": 0.07498,
-            "46": 0.07457,
-            "47": 0.07378,
-            "48": 0.07477,
-            "49": 0.07465,
-            "50": 0.07444
+            "1": 8.63401,
+            "2": 0.09023,
+            "3": 0.07348,
+            "4": 0.05746,
+            "5": 0.05663,
+            "6": 0.05755,
+            "7": 0.0574,
+            "8": 0.05838,
+            "9": 0.05585,
+            "10": 0.05739,
+            "11": 0.05576,
+            "12": 0.0561,
+            "13": 0.05582,
+            "14": 0.05815,
+            "15": 0.05615,
+            "16": 0.05649,
+            "17": 0.05732,
+            "18": 0.05614,
+            "19": 0.05614,
+            "20": 0.0565,
+            "21": 0.05624,
+            "22": 0.05712,
+            "23": 0.05601,
+            "24": 0.05772,
+            "25": 0.05612,
+            "26": 0.05714,
+            "27": 0.05571,
+            "28": 0.05803,
+            "29": 0.0562,
+            "30": 0.05628,
+            "31": 0.05602,
+            "32": 0.05667,
+            "33": 0.05631,
+            "34": 0.05631,
+            "35": 0.05623,
+            "36": 0.0565,
+            "37": 0.05737,
+            "38": 0.05733,
+            "39": 0.05988,
+            "40": 0.05739,
+            "41": 0.05719,
+            "42": 0.05699,
+            "43": 0.05608,
+            "44": 0.05867,
+            "45": 0.05838,
+            "46": 0.05842,
+            "47": 0.05635,
+            "48": 0.05732,
+            "49": 0.0569,
+            "50": 0.05736
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json
index dd30f7144c7..5e28e46bf28 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4.57734,
-            "2": 0.12447,
-            "3": 0.1105,
-            "4": 0.11652,
-            "5": 0.11171,
-            "6": 0.10268,
-            "7": 0.0964,
-            "8": 0.09397,
-            "9": 0.09475,
-            "10": 0.09372,
-            "11": 0.09325,
-            "12": 0.09309,
-            "13": 0.09305,
-            "14": 0.09354,
-            "15": 0.09324,
-            "16": 0.09342,
-            "17": 0.09327,
-            "18": 0.09347,
-            "19": 0.09283,
-            "20": 0.09308,
-            "21": 0.09266,
-            "22": 0.09487,
-            "23": 0.09318,
-            "24": 0.09338,
-            "25": 0.09306,
-            "26": 0.09374,
-            "27": 0.09386,
-            "28": 0.09412,
-            "29": 0.09395,
-            "30": 0.09393,
-            "31": 0.09439,
-            "32": 0.09481,
-            "33": 0.09338,
-            "34": 0.09466,
-            "35": 0.0936,
-            "36": 0.09463,
-            "37": 0.09316,
-            "38": 0.09572,
-            "39": 0.09295,
-            "40": 0.09592,
-            "41": 0.09322,
-            "42": 0.09468,
-            "43": 0.09488,
-            "44": 0.09323,
-            "45": 0.09265,
-            "46": 0.09574,
-            "47": 0.09267,
-            "48": 0.09592,
-            "49": 0.09356,
-            "50": 0.09502
+            "1": 3.16333,
+            "2": 0.12429,
+            "3": 0.10327,
+            "4": 0.09373,
+            "5": 0.09355,
+            "6": 0.0921,
+            "7": 0.09247,
+            "8": 0.09175,
+            "9": 0.08988,
+            "10": 0.09206,
+            "11": 0.0907,
+            "12": 0.09062,
+            "13": 0.09067,
+            "14": 0.09178,
+            "15": 0.09006,
+            "16": 0.09058,
+            "17": 0.09113,
+            "18": 0.08975,
+            "19": 0.08958,
+            "20": 0.08974,
+            "21": 0.0895,
+            "22": 0.08967,
+            "23": 0.08965,
+            "24": 0.08985,
+            "25": 0.08964,
+            "26": 0.09069,
+            "27": 0.08964,
+            "28": 0.08972,
+            "29": 0.08977,
+            "30": 0.08994,
+            "31": 0.0898,
+            "32": 0.08953,
+            "33": 0.09044,
+            "34": 0.09062,
+            "35": 0.09102,
+            "36": 0.09102,
+            "37": 0.09125,
+            "38": 0.09035,
+            "39": 0.09141,
+            "40": 0.09069,
+            "41": 0.0916,
+            "42": 0.09094,
+            "43": 0.09103,
+            "44": 0.09176,
+            "45": 0.09169,
+            "46": 0.09186,
+            "47": 0.09119,
+            "48": 0.09112,
+            "49": 0.09072,
+            "50": 0.09246
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml
index c7b46ff9b8d..fc92d226b6d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..eb5a06ac1fc
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77447,
+            "2": 10.78365,
+            "3": 10.78346,
+            "4": 10.74822,
+            "5": 10.81983,
+            "6": 10.82303,
+            "7": 10.79055,
+            "8": 10.77956,
+            "9": 10.78595,
+            "10": 10.74453,
+            "11": 10.83267,
+            "12": 10.80426,
+            "13": 10.82082,
+            "14": 10.82567,
+            "15": 10.74206,
+            "16": 10.74904,
+            "17": 10.7252,
+            "18": 10.74176,
+            "19": 10.74412,
+            "20": 10.63678,
+            "21": 10.63055,
+            "22": 10.47962,
+            "23": 10.65976,
+            "24": 10.52477,
+            "25": 10.47552,
+            "26": 10.54117,
+            "27": 10.55491,
+            "28": 10.52139,
+            "29": 10.536,
+            "30": 10.3053,
+            "31": 10.0644,
+            "32": 10.41569,
+            "33": 10.42199,
+            "34": 10.17393,
+            "35": 10.22403,
+            "36": 10.18498,
+            "37": 10.30417,
+            "38": 10.14995,
+            "39": 10.37042,
+            "40": 10.03994,
+            "41": 10.10953,
+            "42": 10.17937,
+            "43": 9.79747,
+            "44": 9.90812,
+            "45": 9.79809,
+            "46": 9.7966,
+            "47": 10.12109,
+            "48": 9.82083,
+            "49": 9.50495,
+            "50": 9.88025,
+            "51": 9.83614,
+            "52": 9.72315,
+            "53": 10.05318,
+            "54": 9.93747,
+            "55": 9.87384,
+            "56": 9.60449,
+            "57": 9.4523,
+            "58": 9.8188,
+            "59": 9.5772,
+            "60": 9.48534,
+            "61": 9.68548,
+            "62": 9.97906,
+            "63": 9.36419,
+            "64": 9.76203,
+            "65": 8.94097,
+            "66": 9.69475,
+            "67": 9.36656,
+            "68": 9.77745,
+            "69": 9.79001,
+            "70": 9.72374,
+            "71": 9.62037,
+            "72": 9.57423,
+            "73": 9.48575,
+            "74": 8.92729,
+            "75": 9.41651,
+            "76": 9.07747,
+            "77": 10.05444,
+            "78": 9.71914,
+            "79": 9.37306,
+            "80": 9.40003,
+            "81": 9.47844,
+            "82": 9.69867,
+            "83": 9.31155,
+            "84": 9.41457,
+            "85": 9.61163,
+            "86": 9.07418,
+            "87": 9.5939,
+            "88": 9.74928,
+            "89": 9.5985,
+            "90": 9.82761,
+            "91": 9.33631,
+            "92": 9.35805,
+            "93": 9.08552,
+            "94": 8.82786,
+            "95": 9.5303,
+            "96": 9.52663,
+            "97": 9.30483,
+            "98": 9.67007,
+            "99": 8.89606,
+            "100": 9.40702
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1531.0,
+            "2": 1722.0,
+            "3": 1589.0,
+            "4": 1870.0,
+            "5": 1992.0,
+            "6": 1894.0,
+            "7": 1954.0,
+            "8": 1697.0,
+            "9": 1855.0,
+            "10": 1477.0,
+            "11": 1889.0,
+            "12": 1848.0,
+            "13": 1973.0,
+            "14": 1877.0,
+            "15": 2015.0,
+            "16": 1943.0,
+            "17": 1772.0,
+            "18": 1764.0,
+            "19": 1782.0,
+            "20": 1678.0,
+            "21": 1906.0,
+            "22": 1738.0,
+            "23": 2057.0,
+            "24": 1597.0,
+            "25": 1567.0,
+            "26": 1762.0,
+            "27": 1932.0,
+            "28": 1987.0,
+            "29": 1936.0,
+            "30": 1965.0,
+            "31": 1554.0,
+            "32": 1846.0,
+            "33": 2148.0,
+            "34": 1872.0,
+            "35": 1985.0,
+            "36": 1906.0,
+            "37": 2245.0,
+            "38": 2119.0,
+            "39": 2495.0,
+            "40": 2274.0,
+            "41": 2236.0,
+            "42": 2318.0,
+            "43": 2068.0,
+            "44": 2120.0,
+            "45": 2265.0,
+            "46": 2447.0,
+            "47": 2584.0,
+            "48": 2296.0,
+            "49": 2252.0,
+            "50": 2568.0,
+            "51": 2650.0,
+            "52": 2700.0,
+            "53": 2863.0,
+            "54": 2676.0,
+            "55": 2390.0,
+            "56": 2753.0,
+            "57": 2430.0,
+            "58": 2919.0,
+            "59": 2831.0,
+            "60": 2428.0,
+            "61": 2932.0,
+            "62": 2724.0,
+            "63": 2579.0,
+            "64": 2987.0,
+            "65": 2506.0,
+            "66": 2886.0,
+            "67": 2871.0,
+            "68": 2870.0,
+            "69": 3001.0,
+            "70": 3294.0,
+            "71": 3043.0,
+            "72": 2614.0,
+            "73": 3054.0,
+            "74": 2024.0,
+            "75": 2507.0,
+            "76": 3020.0,
+            "77": 3253.0,
+            "78": 3230.0,
+            "79": 3210.0,
+            "80": 3252.0,
+            "81": 3614.0,
+            "82": 3395.0,
+            "83": 2919.0,
+            "84": 3296.0,
+            "85": 3320.0,
+            "86": 2865.0,
+            "87": 3931.0,
+            "88": 3240.0,
+            "89": 3428.0,
+            "90": 3127.0,
+            "91": 2815.0,
+            "92": 3098.0,
+            "93": 2796.0,
+            "94": 3324.0,
+            "95": 3428.0,
+            "96": 3541.0,
+            "97": 3216.0,
+            "98": 3705.0,
+            "99": 3184.0,
+            "100": 3073.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 581489664.0,
+            "2": 581489664.0,
+            "3": 581489664.0,
+            "4": 581489664.0,
+            "5": 581489664.0,
+            "6": 581489664.0,
+            "7": 581489664.0,
+            "8": 581489664.0,
+            "9": 581489664.0,
+            "10": 581489664.0,
+            "11": 581489664.0,
+            "12": 581489664.0,
+            "13": 581489664.0,
+            "14": 581489664.0,
+            "15": 581489664.0,
+            "16": 581489664.0,
+            "17": 581489664.0,
+            "18": 581489664.0,
+            "19": 581489664.0,
+            "20": 581489664.0,
+            "21": 581489664.0,
+            "22": 581489664.0,
+            "23": 581489664.0,
+            "24": 581489664.0,
+            "25": 581489664.0,
+            "26": 581489664.0,
+            "27": 581489664.0,
+            "28": 581489664.0,
+            "29": 581489664.0,
+            "30": 581489664.0,
+            "31": 581489664.0,
+            "32": 581489664.0,
+            "33": 581489664.0,
+            "34": 581489664.0,
+            "35": 581489664.0,
+            "36": 581489664.0,
+            "37": 581489664.0,
+            "38": 581489664.0,
+            "39": 581489664.0,
+            "40": 581489664.0,
+            "41": 581489664.0,
+            "42": 581489664.0,
+            "43": 581489664.0,
+            "44": 581489664.0,
+            "45": 581489664.0,
+            "46": 581489664.0,
+            "47": 581489664.0,
+            "48": 581489664.0,
+            "49": 581489664.0,
+            "50": 581489664.0,
+            "51": 581489664.0,
+            "52": 581489664.0,
+            "53": 581489664.0,
+            "54": 581489664.0,
+            "55": 581489664.0,
+            "56": 581489664.0,
+            "57": 581489664.0,
+            "58": 581489664.0,
+            "59": 581489664.0,
+            "60": 581489664.0,
+            "61": 581489664.0,
+            "62": 581489664.0,
+            "63": 581489664.0,
+            "64": 581489664.0,
+            "65": 581489664.0,
+            "66": 581489664.0,
+            "67": 581489664.0,
+            "68": 581489664.0,
+            "69": 581489664.0,
+            "70": 581489664.0,
+            "71": 581489664.0,
+            "72": 581489664.0,
+            "73": 581489664.0,
+            "74": 581489664.0,
+            "75": 581489664.0,
+            "76": 581489664.0,
+            "77": 581489664.0,
+            "78": 581489664.0,
+            "79": 581489664.0,
+            "80": 581489664.0,
+            "81": 581489664.0,
+            "82": 581489664.0,
+            "83": 581489664.0,
+            "84": 581489664.0,
+            "85": 581489664.0,
+            "86": 581489664.0,
+            "87": 581489664.0,
+            "88": 581489664.0,
+            "89": 581489664.0,
+            "90": 581489664.0,
+            "91": 581489664.0,
+            "92": 581489664.0,
+            "93": 581489664.0,
+            "94": 581489664.0,
+            "95": 581489664.0,
+            "96": 581489664.0,
+            "97": 581489664.0,
+            "98": 581489664.0,
+            "99": 581489664.0,
+            "100": 581489664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2644459008.0,
+            "2": 2741075456.0,
+            "3": 2741075456.0,
+            "4": 2741075456.0,
+            "5": 2741075456.0,
+            "6": 2741075456.0,
+            "7": 2741075456.0,
+            "8": 2741075456.0,
+            "9": 2741075456.0,
+            "10": 2741075456.0,
+            "11": 2741075456.0,
+            "12": 2741075456.0,
+            "13": 2741075456.0,
+            "14": 2741075456.0,
+            "15": 2741075456.0,
+            "16": 2741075456.0,
+            "17": 2741075456.0,
+            "18": 2741075456.0,
+            "19": 2741075456.0,
+            "20": 2741075456.0,
+            "21": 2741075456.0,
+            "22": 2741075456.0,
+            "23": 2741075456.0,
+            "24": 2741075456.0,
+            "25": 2741075456.0,
+            "26": 2741075456.0,
+            "27": 2741075456.0,
+            "28": 2741075456.0,
+            "29": 2741075456.0,
+            "30": 2741075456.0,
+            "31": 2741075456.0,
+            "32": 2741075456.0,
+            "33": 2741075456.0,
+            "34": 2741075456.0,
+            "35": 2741075456.0,
+            "36": 2741075456.0,
+            "37": 2741075456.0,
+            "38": 2741075456.0,
+            "39": 2741075456.0,
+            "40": 2741075456.0,
+            "41": 2741075456.0,
+            "42": 2741075456.0,
+            "43": 2741075456.0,
+            "44": 2741075456.0,
+            "45": 2741075456.0,
+            "46": 2741075456.0,
+            "47": 2741075456.0,
+            "48": 2741075456.0,
+            "49": 2741075456.0,
+            "50": 2741075456.0,
+            "51": 2741075456.0,
+            "52": 2741075456.0,
+            "53": 2741075456.0,
+            "54": 2741075456.0,
+            "55": 2741075456.0,
+            "56": 2741075456.0,
+            "57": 2741075456.0,
+            "58": 2741075456.0,
+            "59": 2741075456.0,
+            "60": 2741075456.0,
+            "61": 2741075456.0,
+            "62": 2741075456.0,
+            "63": 2741075456.0,
+            "64": 2741075456.0,
+            "65": 2741075456.0,
+            "66": 2741075456.0,
+            "67": 2741075456.0,
+            "68": 2741075456.0,
+            "69": 2741075456.0,
+            "70": 2741075456.0,
+            "71": 2741075456.0,
+            "72": 2741075456.0,
+            "73": 2741075456.0,
+            "74": 2741075456.0,
+            "75": 2741075456.0,
+            "76": 2741075456.0,
+            "77": 2741075456.0,
+            "78": 2741075456.0,
+            "79": 2741075456.0,
+            "80": 2741075456.0,
+            "81": 2741075456.0,
+            "82": 2741075456.0,
+            "83": 2741075456.0,
+            "84": 2741075456.0,
+            "85": 2741075456.0,
+            "86": 2741075456.0,
+            "87": 2741075456.0,
+            "88": 2741075456.0,
+            "89": 2741075456.0,
+            "90": 2741075456.0,
+            "91": 2741075456.0,
+            "92": 2741075456.0,
+            "93": 2741075456.0,
+            "94": 2741075456.0,
+            "95": 2741075456.0,
+            "96": 2741075456.0,
+            "97": 2741075456.0,
+            "98": 2741075456.0,
+            "99": 2741075456.0,
+            "100": 2741075456.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.38769,
+            "3": 0.09884,
+            "4": 0.08514,
+            "5": 0.08435,
+            "6": 0.08412,
+            "7": 0.08558,
+            "8": 0.08425,
+            "9": 0.08436,
+            "10": 0.08457,
+            "11": 0.08469,
+            "12": 0.0848,
+            "13": 0.08487,
+            "14": 0.08571,
+            "15": 0.08487,
+            "16": 0.08529,
+            "17": 0.08559,
+            "18": 0.0898,
+            "19": 0.08482,
+            "20": 0.08509,
+            "21": 0.08527,
+            "22": 0.08597,
+            "23": 0.08592,
+            "24": 0.08654,
+            "25": 0.08608,
+            "26": 0.08574,
+            "27": 0.08542,
+            "28": 0.0856,
+            "29": 0.08581,
+            "30": 0.08539,
+            "31": 0.08675,
+            "32": 0.08679,
+            "33": 0.08699,
+            "34": 0.08599,
+            "35": 0.08568,
+            "36": 0.08528,
+            "37": 0.08527,
+            "38": 0.08526,
+            "39": 0.08614,
+            "40": 0.08507,
+            "41": 0.08552,
+            "42": 0.08586,
+            "43": 0.08568,
+            "44": 0.0866,
+            "45": 0.08692,
+            "46": 0.08614,
+            "47": 0.0859,
+            "48": 0.0863,
+            "49": 0.08723,
+            "50": 0.08703,
+            "51": 0.09195,
+            "52": 0.0775,
+            "53": 0.07822,
+            "54": 0.07813,
+            "55": 0.0784,
+            "56": 0.07871,
+            "57": 0.07816,
+            "58": 0.07787,
+            "59": 0.07958,
+            "60": 0.07893,
+            "61": 0.07873,
+            "62": 0.07887,
+            "63": 0.07945,
+            "64": 0.07879,
+            "65": 0.08059,
+            "66": 0.08041,
+            "67": 0.08127,
+            "68": 0.07996,
+            "69": 0.0799,
+            "70": 0.07821,
+            "71": 0.07712,
+            "72": 0.07745,
+            "73": 0.07774,
+            "74": 0.07859,
+            "75": 0.07741,
+            "76": 0.07753,
+            "77": 0.07725,
+            "78": 0.07676,
+            "79": 0.07838,
+            "80": 0.07786,
+            "81": 0.07743,
+            "82": 0.07732,
+            "83": 0.0773,
+            "84": 0.07664,
+            "85": 0.07753,
+            "86": 0.07826,
+            "87": 0.07764,
+            "88": 0.07681,
+            "89": 0.07911,
+            "90": 0.07799,
+            "91": 0.07796,
+            "92": 0.07853,
+            "93": 0.07736,
+            "94": 0.07777,
+            "95": 0.07791,
+            "96": 0.07723,
+            "97": 0.07753,
+            "98": 0.07789,
+            "99": 0.07782,
+            "100": 0.07733
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..5c26c56ee5a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83614,
+            "52": 9.72315,
+            "53": 10.05318,
+            "54": 9.93747,
+            "55": 9.87384,
+            "56": 9.60449,
+            "57": 9.4523,
+            "58": 9.8188,
+            "59": 9.5772,
+            "60": 9.48534,
+            "61": 9.68548,
+            "62": 9.97906,
+            "63": 9.36419,
+            "64": 9.76203,
+            "65": 8.94097,
+            "66": 9.69475,
+            "67": 9.36656,
+            "68": 9.77745,
+            "69": 9.79001,
+            "70": 9.72374,
+            "71": 9.62037,
+            "72": 9.57423,
+            "73": 9.48575,
+            "74": 8.92729,
+            "75": 9.41651,
+            "76": 9.07747,
+            "77": 10.05444,
+            "78": 9.71914,
+            "79": 9.37306,
+            "80": 9.40003,
+            "81": 9.47844,
+            "82": 9.69867,
+            "83": 9.31155,
+            "84": 9.41457,
+            "85": 9.61163,
+            "86": 9.07418,
+            "87": 9.5939,
+            "88": 9.74928,
+            "89": 9.5985,
+            "90": 9.82761,
+            "91": 9.33631,
+            "92": 9.35805,
+            "93": 9.08552,
+            "94": 8.82786,
+            "95": 9.5303,
+            "96": 9.52663,
+            "97": 9.30483,
+            "98": 9.67007,
+            "99": 8.89606,
+            "100": 9.40702
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2650.0,
+            "52": 2700.0,
+            "53": 2863.0,
+            "54": 2676.0,
+            "55": 2390.0,
+            "56": 2753.0,
+            "57": 2430.0,
+            "58": 2919.0,
+            "59": 2831.0,
+            "60": 2428.0,
+            "61": 2932.0,
+            "62": 2724.0,
+            "63": 2579.0,
+            "64": 2987.0,
+            "65": 2506.0,
+            "66": 2886.0,
+            "67": 2871.0,
+            "68": 2870.0,
+            "69": 3001.0,
+            "70": 3294.0,
+            "71": 3043.0,
+            "72": 2614.0,
+            "73": 3054.0,
+            "74": 2024.0,
+            "75": 2507.0,
+            "76": 3020.0,
+            "77": 3253.0,
+            "78": 3230.0,
+            "79": 3210.0,
+            "80": 3252.0,
+            "81": 3614.0,
+            "82": 3395.0,
+            "83": 2919.0,
+            "84": 3296.0,
+            "85": 3320.0,
+            "86": 2865.0,
+            "87": 3931.0,
+            "88": 3240.0,
+            "89": 3428.0,
+            "90": 3127.0,
+            "91": 2815.0,
+            "92": 3098.0,
+            "93": 2796.0,
+            "94": 3324.0,
+            "95": 3428.0,
+            "96": 3541.0,
+            "97": 3216.0,
+            "98": 3705.0,
+            "99": 3184.0,
+            "100": 3073.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 581489664.0,
+            "52": 581489664.0,
+            "53": 581489664.0,
+            "54": 581489664.0,
+            "55": 581489664.0,
+            "56": 581489664.0,
+            "57": 581489664.0,
+            "58": 581489664.0,
+            "59": 581489664.0,
+            "60": 581489664.0,
+            "61": 581489664.0,
+            "62": 581489664.0,
+            "63": 581489664.0,
+            "64": 581489664.0,
+            "65": 581489664.0,
+            "66": 581489664.0,
+            "67": 581489664.0,
+            "68": 581489664.0,
+            "69": 581489664.0,
+            "70": 581489664.0,
+            "71": 581489664.0,
+            "72": 581489664.0,
+            "73": 581489664.0,
+            "74": 581489664.0,
+            "75": 581489664.0,
+            "76": 581489664.0,
+            "77": 581489664.0,
+            "78": 581489664.0,
+            "79": 581489664.0,
+            "80": 581489664.0,
+            "81": 581489664.0,
+            "82": 581489664.0,
+            "83": 581489664.0,
+            "84": 581489664.0,
+            "85": 581489664.0,
+            "86": 581489664.0,
+            "87": 581489664.0,
+            "88": 581489664.0,
+            "89": 581489664.0,
+            "90": 581489664.0,
+            "91": 581489664.0,
+            "92": 581489664.0,
+            "93": 581489664.0,
+            "94": 581489664.0,
+            "95": 581489664.0,
+            "96": 581489664.0,
+            "97": 581489664.0,
+            "98": 581489664.0,
+            "99": 581489664.0,
+            "100": 581489664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2708568576.0,
+            "52": 2742124032.0,
+            "53": 2742124032.0,
+            "54": 2742124032.0,
+            "55": 2742124032.0,
+            "56": 2742124032.0,
+            "57": 2742124032.0,
+            "58": 2742124032.0,
+            "59": 2742124032.0,
+            "60": 2742124032.0,
+            "61": 2742124032.0,
+            "62": 2742124032.0,
+            "63": 2742124032.0,
+            "64": 2742124032.0,
+            "65": 2742124032.0,
+            "66": 2742124032.0,
+            "67": 2742124032.0,
+            "68": 2742124032.0,
+            "69": 2742124032.0,
+            "70": 2742124032.0,
+            "71": 2742124032.0,
+            "72": 2742124032.0,
+            "73": 2742124032.0,
+            "74": 2742124032.0,
+            "75": 2742124032.0,
+            "76": 2742124032.0,
+            "77": 2742124032.0,
+            "78": 2742124032.0,
+            "79": 2742124032.0,
+            "80": 2742124032.0,
+            "81": 2742124032.0,
+            "82": 2742124032.0,
+            "83": 2742124032.0,
+            "84": 2742124032.0,
+            "85": 2742124032.0,
+            "86": 2742124032.0,
+            "87": 2742124032.0,
+            "88": 2742124032.0,
+            "89": 2742124032.0,
+            "90": 2742124032.0,
+            "91": 2742124032.0,
+            "92": 2742124032.0,
+            "93": 2742124032.0,
+            "94": 2742124032.0,
+            "95": 2742124032.0,
+            "96": 2742124032.0,
+            "97": 2742124032.0,
+            "98": 2742124032.0,
+            "99": 2742124032.0,
+            "100": 2742124032.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.1716,
+            "53": 0.09643,
+            "54": 0.08435,
+            "55": 0.08492,
+            "56": 0.08409,
+            "57": 0.08624,
+            "58": 0.08522,
+            "59": 0.08521,
+            "60": 0.08445,
+            "61": 0.08447,
+            "62": 0.08412,
+            "63": 0.08534,
+            "64": 0.08529,
+            "65": 0.08566,
+            "66": 0.08409,
+            "67": 0.08468,
+            "68": 0.08268,
+            "69": 0.08161,
+            "70": 0.08416,
+            "71": 0.08383,
+            "72": 0.08425,
+            "73": 0.08363,
+            "74": 0.08451,
+            "75": 0.08423,
+            "76": 0.08453,
+            "77": 0.08475,
+            "78": 0.08435,
+            "79": 0.0844,
+            "80": 0.08466,
+            "81": 0.08777,
+            "82": 0.08524,
+            "83": 0.08559,
+            "84": 0.08524,
+            "85": 0.08501,
+            "86": 0.08518,
+            "87": 0.08503,
+            "88": 0.08555,
+            "89": 0.0855,
+            "90": 0.08584,
+            "91": 0.08419,
+            "92": 0.08467,
+            "93": 0.08514,
+            "94": 0.08518,
+            "95": 0.08444,
+            "96": 0.08484,
+            "97": 0.08521,
+            "98": 0.08697,
+            "99": 0.08772,
+            "100": 0.08544
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
index 686e980d509..131bcbe928e 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 581488640.0,
-            "2": 581488640.0,
-            "3": 581488640.0,
-            "4": 581488640.0,
-            "5": 581488640.0,
-            "6": 581488640.0,
-            "7": 581488640.0,
-            "8": 581488640.0,
-            "9": 581488640.0,
-            "10": 581488640.0,
-            "11": 581488640.0,
-            "12": 581488640.0,
-            "13": 581488640.0,
-            "14": 581488640.0,
-            "15": 581488640.0,
-            "16": 581488640.0,
-            "17": 581488640.0,
-            "18": 581488640.0,
-            "19": 581488640.0,
-            "20": 581488640.0,
-            "21": 581488640.0,
-            "22": 581488640.0,
-            "23": 581488640.0,
-            "24": 581488640.0,
-            "25": 581488640.0,
-            "26": 581488640.0,
-            "27": 581488640.0,
-            "28": 581488640.0,
-            "29": 581488640.0,
-            "30": 581488640.0,
-            "31": 581488640.0,
-            "32": 581488640.0,
-            "33": 581488640.0,
-            "34": 581488640.0,
-            "35": 581488640.0,
-            "36": 581488640.0,
-            "37": 581488640.0,
-            "38": 581488640.0,
-            "39": 581488640.0,
-            "40": 581488640.0,
-            "41": 581488640.0,
-            "42": 581488640.0,
-            "43": 581488640.0,
-            "44": 581488640.0,
-            "45": 581488640.0,
-            "46": 581488640.0,
-            "47": 581488640.0,
-            "48": 581488640.0,
-            "49": 581488640.0,
-            "50": 581488640.0,
-            "51": 581488640.0,
-            "52": 581488640.0,
-            "53": 581488640.0,
-            "54": 581488640.0,
-            "55": 581488640.0,
-            "56": 581488640.0,
-            "57": 581488640.0,
-            "58": 581488640.0,
-            "59": 581488640.0,
-            "60": 581488640.0,
-            "61": 581488640.0,
-            "62": 581488640.0,
-            "63": 581488640.0,
-            "64": 581488640.0,
-            "65": 581488640.0,
-            "66": 581488640.0,
-            "67": 581488640.0,
-            "68": 581488640.0,
-            "69": 581488640.0,
-            "70": 581488640.0,
-            "71": 581488640.0,
-            "72": 581488640.0,
-            "73": 581488640.0,
-            "74": 581488640.0,
-            "75": 581488640.0,
-            "76": 581488640.0,
-            "77": 581488640.0,
-            "78": 581488640.0,
-            "79": 581488640.0,
-            "80": 581488640.0,
-            "81": 581488640.0,
-            "82": 581488640.0,
-            "83": 581488640.0,
-            "84": 581488640.0,
-            "85": 581488640.0,
-            "86": 581488640.0,
-            "87": 581488640.0,
-            "88": 581488640.0,
-            "89": 581488640.0,
-            "90": 581488640.0,
-            "91": 581488640.0,
-            "92": 581488640.0,
-            "93": 581488640.0,
-            "94": 581488640.0,
-            "95": 581488640.0,
-            "96": 581488640.0,
-            "97": 581488640.0,
-            "98": 581488640.0,
-            "99": 581488640.0,
-            "100": 581488640.0
+            "1": 581489664.0,
+            "2": 581489664.0,
+            "3": 581489664.0,
+            "4": 581489664.0,
+            "5": 581489664.0,
+            "6": 581489664.0,
+            "7": 581489664.0,
+            "8": 581489664.0,
+            "9": 581489664.0,
+            "10": 581489664.0,
+            "11": 581489664.0,
+            "12": 581489664.0,
+            "13": 581489664.0,
+            "14": 581489664.0,
+            "15": 581489664.0,
+            "16": 581489664.0,
+            "17": 581489664.0,
+            "18": 581489664.0,
+            "19": 581489664.0,
+            "20": 581489664.0,
+            "21": 581489664.0,
+            "22": 581489664.0,
+            "23": 581489664.0,
+            "24": 581489664.0,
+            "25": 581489664.0,
+            "26": 581489664.0,
+            "27": 581489664.0,
+            "28": 581489664.0,
+            "29": 581489664.0,
+            "30": 581489664.0,
+            "31": 581489664.0,
+            "32": 581489664.0,
+            "33": 581489664.0,
+            "34": 581489664.0,
+            "35": 581489664.0,
+            "36": 581489664.0,
+            "37": 581489664.0,
+            "38": 581489664.0,
+            "39": 581489664.0,
+            "40": 581489664.0,
+            "41": 581489664.0,
+            "42": 581489664.0,
+            "43": 581489664.0,
+            "44": 581489664.0,
+            "45": 581489664.0,
+            "46": 581489664.0,
+            "47": 581489664.0,
+            "48": 581489664.0,
+            "49": 581489664.0,
+            "50": 581489664.0,
+            "51": 581489664.0,
+            "52": 581489664.0,
+            "53": 581489664.0,
+            "54": 581489664.0,
+            "55": 581489664.0,
+            "56": 581489664.0,
+            "57": 581489664.0,
+            "58": 581489664.0,
+            "59": 581489664.0,
+            "60": 581489664.0,
+            "61": 581489664.0,
+            "62": 581489664.0,
+            "63": 581489664.0,
+            "64": 581489664.0,
+            "65": 581489664.0,
+            "66": 581489664.0,
+            "67": 581489664.0,
+            "68": 581489664.0,
+            "69": 581489664.0,
+            "70": 581489664.0,
+            "71": 581489664.0,
+            "72": 581489664.0,
+            "73": 581489664.0,
+            "74": 581489664.0,
+            "75": 581489664.0,
+            "76": 581489664.0,
+            "77": 581489664.0,
+            "78": 581489664.0,
+            "79": 581489664.0,
+            "80": 581489664.0,
+            "81": 581489664.0,
+            "82": 581489664.0,
+            "83": 581489664.0,
+            "84": 581489664.0,
+            "85": 581489664.0,
+            "86": 581489664.0,
+            "87": 581489664.0,
+            "88": 581489664.0,
+            "89": 581489664.0,
+            "90": 581489664.0,
+            "91": 581489664.0,
+            "92": 581489664.0,
+            "93": 581489664.0,
+            "94": 581489664.0,
+            "95": 581489664.0,
+            "96": 581489664.0,
+            "97": 581489664.0,
+            "98": 581489664.0,
+            "99": 581489664.0,
+            "100": 581489664.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2594126336.0,
-            "2": 2690742784.0,
-            "3": 2690742784.0,
-            "4": 2690742784.0,
-            "5": 2690742784.0,
-            "6": 2690742784.0,
-            "7": 2690742784.0,
-            "8": 2690742784.0,
-            "9": 2690742784.0,
-            "10": 2690742784.0,
-            "11": 2690742784.0,
-            "12": 2690742784.0,
-            "13": 2690742784.0,
-            "14": 2690742784.0,
-            "15": 2690742784.0,
-            "16": 2690742784.0,
-            "17": 2690742784.0,
-            "18": 2690742784.0,
-            "19": 2690742784.0,
-            "20": 2690742784.0,
-            "21": 2690742784.0,
-            "22": 2690742784.0,
-            "23": 2690742784.0,
-            "24": 2690742784.0,
-            "25": 2690742784.0,
-            "26": 2690742784.0,
-            "27": 2690742784.0,
-            "28": 2690742784.0,
-            "29": 2690742784.0,
-            "30": 2690742784.0,
-            "31": 2690742784.0,
-            "32": 2690742784.0,
-            "33": 2690742784.0,
-            "34": 2690742784.0,
-            "35": 2690742784.0,
-            "36": 2690742784.0,
-            "37": 2690742784.0,
-            "38": 2690742784.0,
-            "39": 2690742784.0,
-            "40": 2690742784.0,
-            "41": 2690742784.0,
-            "42": 2690742784.0,
-            "43": 2690742784.0,
-            "44": 2690742784.0,
-            "45": 2690742784.0,
-            "46": 2690742784.0,
-            "47": 2690742784.0,
-            "48": 2690742784.0,
-            "49": 2690742784.0,
-            "50": 2690742784.0,
-            "51": 2690742784.0,
-            "52": 2690742784.0,
-            "53": 2690742784.0,
-            "54": 2690742784.0,
-            "55": 2690742784.0,
-            "56": 2690742784.0,
-            "57": 2690742784.0,
-            "58": 2690742784.0,
-            "59": 2690742784.0,
-            "60": 2690742784.0,
-            "61": 2690742784.0,
-            "62": 2690742784.0,
-            "63": 2690742784.0,
-            "64": 2690742784.0,
-            "65": 2690742784.0,
-            "66": 2690742784.0,
-            "67": 2690742784.0,
-            "68": 2690742784.0,
-            "69": 2690742784.0,
-            "70": 2690742784.0,
-            "71": 2690742784.0,
-            "72": 2690742784.0,
-            "73": 2690742784.0,
-            "74": 2690742784.0,
-            "75": 2690742784.0,
-            "76": 2690742784.0,
-            "77": 2690742784.0,
-            "78": 2690742784.0,
-            "79": 2690742784.0,
-            "80": 2690742784.0,
-            "81": 2690742784.0,
-            "82": 2690742784.0,
-            "83": 2690742784.0,
-            "84": 2690742784.0,
-            "85": 2690742784.0,
-            "86": 2690742784.0,
-            "87": 2690742784.0,
-            "88": 2690742784.0,
-            "89": 2690742784.0,
-            "90": 2690742784.0,
-            "91": 2690742784.0,
-            "92": 2690742784.0,
-            "93": 2690742784.0,
-            "94": 2690742784.0,
-            "95": 2690742784.0,
-            "96": 2690742784.0,
-            "97": 2690742784.0,
-            "98": 2690742784.0,
-            "99": 2690742784.0,
-            "100": 2690742784.0
+            "1": 2594127360.0,
+            "2": 2690743808.0,
+            "3": 2690743808.0,
+            "4": 2690743808.0,
+            "5": 2690743808.0,
+            "6": 2690743808.0,
+            "7": 2690743808.0,
+            "8": 2690743808.0,
+            "9": 2690743808.0,
+            "10": 2690743808.0,
+            "11": 2690743808.0,
+            "12": 2690743808.0,
+            "13": 2690743808.0,
+            "14": 2690743808.0,
+            "15": 2690743808.0,
+            "16": 2690743808.0,
+            "17": 2690743808.0,
+            "18": 2690743808.0,
+            "19": 2690743808.0,
+            "20": 2690743808.0,
+            "21": 2690743808.0,
+            "22": 2690743808.0,
+            "23": 2690743808.0,
+            "24": 2690743808.0,
+            "25": 2690743808.0,
+            "26": 2690743808.0,
+            "27": 2690743808.0,
+            "28": 2690743808.0,
+            "29": 2690743808.0,
+            "30": 2690743808.0,
+            "31": 2690743808.0,
+            "32": 2690743808.0,
+            "33": 2690743808.0,
+            "34": 2690743808.0,
+            "35": 2690743808.0,
+            "36": 2690743808.0,
+            "37": 2690743808.0,
+            "38": 2690743808.0,
+            "39": 2690743808.0,
+            "40": 2690743808.0,
+            "41": 2690743808.0,
+            "42": 2690743808.0,
+            "43": 2690743808.0,
+            "44": 2690743808.0,
+            "45": 2690743808.0,
+            "46": 2690743808.0,
+            "47": 2690743808.0,
+            "48": 2690743808.0,
+            "49": 2690743808.0,
+            "50": 2690743808.0,
+            "51": 2690743808.0,
+            "52": 2690743808.0,
+            "53": 2690743808.0,
+            "54": 2690743808.0,
+            "55": 2690743808.0,
+            "56": 2690743808.0,
+            "57": 2690743808.0,
+            "58": 2690743808.0,
+            "59": 2690743808.0,
+            "60": 2690743808.0,
+            "61": 2690743808.0,
+            "62": 2690743808.0,
+            "63": 2690743808.0,
+            "64": 2690743808.0,
+            "65": 2690743808.0,
+            "66": 2690743808.0,
+            "67": 2690743808.0,
+            "68": 2690743808.0,
+            "69": 2690743808.0,
+            "70": 2690743808.0,
+            "71": 2690743808.0,
+            "72": 2690743808.0,
+            "73": 2690743808.0,
+            "74": 2690743808.0,
+            "75": 2690743808.0,
+            "76": 2690743808.0,
+            "77": 2690743808.0,
+            "78": 2690743808.0,
+            "79": 2690743808.0,
+            "80": 2690743808.0,
+            "81": 2690743808.0,
+            "82": 2690743808.0,
+            "83": 2690743808.0,
+            "84": 2690743808.0,
+            "85": 2690743808.0,
+            "86": 2690743808.0,
+            "87": 2690743808.0,
+            "88": 2690743808.0,
+            "89": 2690743808.0,
+            "90": 2690743808.0,
+            "91": 2690743808.0,
+            "92": 2690743808.0,
+            "93": 2690743808.0,
+            "94": 2690743808.0,
+            "95": 2690743808.0,
+            "96": 2690743808.0,
+            "97": 2690743808.0,
+            "98": 2690743808.0,
+            "99": 2690743808.0,
+            "100": 2690743808.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.46673,
-            "2": 0.07879,
-            "3": 0.06822,
-            "4": 0.06744,
-            "5": 0.06664,
-            "6": 0.06786,
-            "7": 0.06766,
-            "8": 0.06659,
-            "9": 0.06797,
-            "10": 0.07184,
-            "11": 0.07288,
-            "12": 0.07188,
-            "13": 0.07026,
-            "14": 0.06821,
-            "15": 0.06667,
-            "16": 0.06656,
-            "17": 0.06764,
-            "18": 0.06816,
-            "19": 0.06695,
-            "20": 0.06832,
-            "21": 0.06808,
-            "22": 0.06822,
-            "23": 0.06838,
-            "24": 0.06731,
-            "25": 0.06857,
-            "26": 0.06706,
-            "27": 0.06819,
-            "28": 0.06784,
-            "29": 0.06785,
-            "30": 0.06735,
-            "31": 0.0685,
-            "32": 0.07005,
-            "33": 0.07122,
-            "34": 0.07241,
-            "35": 0.07067,
-            "36": 0.06981,
-            "37": 0.06934,
-            "38": 0.06771,
-            "39": 0.06805,
-            "40": 0.06824,
-            "41": 0.06831,
-            "42": 0.06733,
-            "43": 0.06819,
-            "44": 0.06816,
-            "45": 0.06847,
-            "46": 0.0674,
-            "47": 0.06856,
-            "48": 0.07158,
-            "49": 0.07079,
-            "50": 0.0717,
-            "51": 0.08179,
-            "52": 0.07272,
-            "53": 0.06939,
-            "54": 0.06631,
-            "55": 0.07046,
-            "56": 0.09852,
-            "57": 0.06464,
-            "58": 0.06466,
-            "59": 0.06537,
-            "60": 0.06301,
-            "61": 0.06361,
-            "62": 0.06551,
-            "63": 0.06563,
-            "64": 0.0749,
-            "65": 0.0748,
-            "66": 0.07507,
-            "67": 0.07552,
-            "68": 0.07573,
-            "69": 0.07066,
-            "70": 0.0658,
-            "71": 0.0647,
-            "72": 0.06444,
-            "73": 0.06462,
-            "74": 0.06543,
-            "75": 0.06609,
-            "76": 0.06503,
-            "77": 0.06499,
-            "78": 0.0644,
-            "79": 0.06439,
-            "80": 0.06417,
-            "81": 0.06401,
-            "82": 0.06575,
-            "83": 0.06494,
-            "84": 0.06442,
-            "85": 0.06396,
-            "86": 0.06422,
-            "87": 0.06484,
-            "88": 0.06512,
-            "89": 0.06426,
-            "90": 0.06481,
-            "91": 0.06476,
-            "92": 0.06383,
-            "93": 0.06456,
-            "94": 0.06292,
-            "95": 0.0638,
-            "96": 0.06392,
-            "97": 0.06356,
-            "98": 0.06355,
-            "99": 0.06439,
-            "100": 0.06428
+            "1": 6.85919,
+            "2": 0.0831,
+            "3": 0.08065,
+            "4": 0.05861,
+            "5": 0.04976,
+            "6": 0.05045,
+            "7": 0.04972,
+            "8": 0.04911,
+            "9": 0.04965,
+            "10": 0.04942,
+            "11": 0.04916,
+            "12": 0.04915,
+            "13": 0.04939,
+            "14": 0.04993,
+            "15": 0.04987,
+            "16": 0.04906,
+            "17": 0.05015,
+            "18": 0.04924,
+            "19": 0.05168,
+            "20": 0.04963,
+            "21": 0.05051,
+            "22": 0.04948,
+            "23": 0.05006,
+            "24": 0.04939,
+            "25": 0.05019,
+            "26": 0.04951,
+            "27": 0.05048,
+            "28": 0.04917,
+            "29": 0.05015,
+            "30": 0.04921,
+            "31": 0.04969,
+            "32": 0.04894,
+            "33": 0.04941,
+            "34": 0.04938,
+            "35": 0.04927,
+            "36": 0.04942,
+            "37": 0.04944,
+            "38": 0.04973,
+            "39": 0.04957,
+            "40": 0.05016,
+            "41": 0.04968,
+            "42": 0.05042,
+            "43": 0.0523,
+            "44": 0.04956,
+            "45": 0.04948,
+            "46": 0.05093,
+            "47": 0.0493,
+            "48": 0.0498,
+            "49": 0.05177,
+            "50": 0.05032,
+            "51": 0.05749,
+            "52": 0.05013,
+            "53": 0.0512,
+            "54": 0.04935,
+            "55": 0.04891,
+            "56": 0.04976,
+            "57": 0.04984,
+            "58": 0.04964,
+            "59": 0.05274,
+            "60": 0.04962,
+            "61": 0.05096,
+            "62": 0.04934,
+            "63": 0.04971,
+            "64": 0.0503,
+            "65": 0.05028,
+            "66": 0.04991,
+            "67": 0.04926,
+            "68": 0.04848,
+            "69": 0.0493,
+            "70": 0.04943,
+            "71": 0.04943,
+            "72": 0.04852,
+            "73": 0.04928,
+            "74": 0.04895,
+            "75": 0.04995,
+            "76": 0.04877,
+            "77": 0.0492,
+            "78": 0.04886,
+            "79": 0.04938,
+            "80": 0.04894,
+            "81": 0.04892,
+            "82": 0.05016,
+            "83": 0.04964,
+            "84": 0.04956,
+            "85": 0.04881,
+            "86": 0.04999,
+            "87": 0.04908,
+            "88": 0.04838,
+            "89": 0.04957,
+            "90": 0.04882,
+            "91": 0.04993,
+            "92": 0.05004,
+            "93": 0.05003,
+            "94": 0.04961,
+            "95": 0.05132,
+            "96": 0.05071,
+            "97": 0.04952,
+            "98": 0.04851,
+            "99": 0.05027,
+            "100": 0.04988
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..4519bd52155
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85447,
+            "52": 9.73936,
+            "53": 10.07426,
+            "54": 9.96915,
+            "55": 9.88574,
+            "56": 9.62437,
+            "57": 9.4823,
+            "58": 9.83483,
+            "59": 9.58732,
+            "60": 9.50245,
+            "61": 9.69343,
+            "62": 9.98806,
+            "63": 9.39103,
+            "64": 9.78021,
+            "65": 8.94515,
+            "66": 9.70494,
+            "67": 9.37251,
+            "68": 9.78329,
+            "69": 9.79058,
+            "70": 9.74454,
+            "71": 9.62301,
+            "72": 9.58458,
+            "73": 9.50513,
+            "74": 8.94312,
+            "75": 9.42524,
+            "76": 9.07601,
+            "77": 10.06353,
+            "78": 9.72308,
+            "79": 9.37502,
+            "80": 9.40453,
+            "81": 9.47794,
+            "82": 9.69667,
+            "83": 9.3072,
+            "84": 9.41526,
+            "85": 9.61293,
+            "86": 9.07195,
+            "87": 9.5884,
+            "88": 9.74762,
+            "89": 9.59982,
+            "90": 9.81672,
+            "91": 9.3379,
+            "92": 9.35605,
+            "93": 9.07425,
+            "94": 8.8351,
+            "95": 9.5184,
+            "96": 9.52391,
+            "97": 9.30923,
+            "98": 9.66743,
+            "99": 8.88419,
+            "100": 9.39924
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2598.0,
+            "52": 2547.0,
+            "53": 2957.0,
+            "54": 2750.0,
+            "55": 2372.0,
+            "56": 2569.0,
+            "57": 2395.0,
+            "58": 2901.0,
+            "59": 2741.0,
+            "60": 2430.0,
+            "61": 2868.0,
+            "62": 2651.0,
+            "63": 2507.0,
+            "64": 3014.0,
+            "65": 2683.0,
+            "66": 2935.0,
+            "67": 2783.0,
+            "68": 2725.0,
+            "69": 2788.0,
+            "70": 3152.0,
+            "71": 3026.0,
+            "72": 2415.0,
+            "73": 3122.0,
+            "74": 1967.0,
+            "75": 2581.0,
+            "76": 3010.0,
+            "77": 3294.0,
+            "78": 3166.0,
+            "79": 3150.0,
+            "80": 3246.0,
+            "81": 3566.0,
+            "82": 3285.0,
+            "83": 2817.0,
+            "84": 3269.0,
+            "85": 3425.0,
+            "86": 2819.0,
+            "87": 3577.0,
+            "88": 3004.0,
+            "89": 3323.0,
+            "90": 3023.0,
+            "91": 2661.0,
+            "92": 3066.0,
+            "93": 2691.0,
+            "94": 3305.0,
+            "95": 3403.0,
+            "96": 3377.0,
+            "97": 3242.0,
+            "98": 3697.0,
+            "99": 3112.0,
+            "100": 3199.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 581489664.0,
+            "52": 581489664.0,
+            "53": 581489664.0,
+            "54": 581489664.0,
+            "55": 581489664.0,
+            "56": 581489664.0,
+            "57": 581489664.0,
+            "58": 581489664.0,
+            "59": 581489664.0,
+            "60": 581489664.0,
+            "61": 581489664.0,
+            "62": 581489664.0,
+            "63": 581489664.0,
+            "64": 581489664.0,
+            "65": 581489664.0,
+            "66": 581489664.0,
+            "67": 581489664.0,
+            "68": 581489664.0,
+            "69": 581489664.0,
+            "70": 581489664.0,
+            "71": 581489664.0,
+            "72": 581489664.0,
+            "73": 581489664.0,
+            "74": 581489664.0,
+            "75": 581489664.0,
+            "76": 581489664.0,
+            "77": 581489664.0,
+            "78": 581489664.0,
+            "79": 581489664.0,
+            "80": 581489664.0,
+            "81": 581489664.0,
+            "82": 581489664.0,
+            "83": 581489664.0,
+            "84": 581489664.0,
+            "85": 581489664.0,
+            "86": 581489664.0,
+            "87": 581489664.0,
+            "88": 581489664.0,
+            "89": 581489664.0,
+            "90": 581489664.0,
+            "91": 581489664.0,
+            "92": 581489664.0,
+            "93": 581489664.0,
+            "94": 581489664.0,
+            "95": 581489664.0,
+            "96": 581489664.0,
+            "97": 581489664.0,
+            "98": 581489664.0,
+            "99": 581489664.0,
+            "100": 581489664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2658236928.0,
+            "52": 2691792384.0,
+            "53": 2691792384.0,
+            "54": 2691792384.0,
+            "55": 2691792384.0,
+            "56": 2691792384.0,
+            "57": 2691792384.0,
+            "58": 2691792384.0,
+            "59": 2691792384.0,
+            "60": 2691792384.0,
+            "61": 2691792384.0,
+            "62": 2691792384.0,
+            "63": 2691792384.0,
+            "64": 2691792384.0,
+            "65": 2691792384.0,
+            "66": 2691792384.0,
+            "67": 2691792384.0,
+            "68": 2691792384.0,
+            "69": 2691792384.0,
+            "70": 2691792384.0,
+            "71": 2691792384.0,
+            "72": 2691792384.0,
+            "73": 2691792384.0,
+            "74": 2691792384.0,
+            "75": 2691792384.0,
+            "76": 2691792384.0,
+            "77": 2691792384.0,
+            "78": 2691792384.0,
+            "79": 2691792384.0,
+            "80": 2691792384.0,
+            "81": 2691792384.0,
+            "82": 2691792384.0,
+            "83": 2691792384.0,
+            "84": 2691792384.0,
+            "85": 2691792384.0,
+            "86": 2691792384.0,
+            "87": 2691792384.0,
+            "88": 2691792384.0,
+            "89": 2691792384.0,
+            "90": 2691792384.0,
+            "91": 2691792384.0,
+            "92": 2691792384.0,
+            "93": 2691792384.0,
+            "94": 2691792384.0,
+            "95": 2691792384.0,
+            "96": 2691792384.0,
+            "97": 2691792384.0,
+            "98": 2691792384.0,
+            "99": 2691792384.0,
+            "100": 2691792384.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6.24535,
+            "52": 0.08446,
+            "53": 0.05106,
+            "54": 0.05053,
+            "55": 0.05025,
+            "56": 0.06328,
+            "57": 0.05006,
+            "58": 0.04939,
+            "59": 0.04895,
+            "60": 0.05032,
+            "61": 0.05024,
+            "62": 0.04926,
+            "63": 0.051,
+            "64": 0.04994,
+            "65": 0.0516,
+            "66": 0.05582,
+            "67": 0.05024,
+            "68": 0.04967,
+            "69": 0.04945,
+            "70": 0.05103,
+            "71": 0.04971,
+            "72": 0.0494,
+            "73": 0.05144,
+            "74": 0.0497,
+            "75": 0.05084,
+            "76": 0.05125,
+            "77": 0.05002,
+            "78": 0.04992,
+            "79": 0.05192,
+            "80": 0.05131,
+            "81": 0.05007,
+            "82": 0.05145,
+            "83": 0.05065,
+            "84": 0.05098,
+            "85": 0.05005,
+            "86": 0.05133,
+            "87": 0.05031,
+            "88": 0.05145,
+            "89": 0.05038,
+            "90": 0.49172,
+            "91": 0.05261,
+            "92": 0.05313,
+            "93": 0.05042,
+            "94": 0.05061,
+            "95": 0.05207,
+            "96": 0.04992,
+            "97": 0.04998,
+            "98": 0.05103,
+            "99": 0.05004,
+            "100": 0.05054
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
index 9dad9972e22..b6e4891b3bb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 4.10688,
-            "2": 0.11397,
-            "3": 0.08797,
-            "4": 0.08663,
-            "5": 0.08687,
-            "6": 0.08702,
-            "7": 0.08653,
-            "8": 0.08674,
-            "9": 0.08696,
-            "10": 0.08678,
-            "11": 0.08635,
-            "12": 0.08637,
-            "13": 0.08738,
-            "14": 0.08674,
-            "15": 0.08706,
-            "16": 0.08684,
-            "17": 0.08681,
-            "18": 0.08601,
-            "19": 0.08591,
-            "20": 0.08645,
-            "21": 0.08634,
-            "22": 0.08598,
-            "23": 0.08618,
-            "24": 0.08622,
-            "25": 0.08632,
-            "26": 0.08621,
-            "27": 0.08644,
-            "28": 0.08581,
-            "29": 0.08622,
-            "30": 0.08652,
-            "31": 0.08679,
-            "32": 0.08526,
-            "33": 0.08525,
-            "34": 0.08525,
-            "35": 0.08519,
-            "36": 0.08535,
-            "37": 0.08568,
-            "38": 0.0852,
-            "39": 0.08521,
-            "40": 0.08523,
-            "41": 0.08535,
-            "42": 0.08486,
-            "43": 0.08614,
-            "44": 0.08491,
-            "45": 0.08554,
-            "46": 0.08508,
-            "47": 0.08524,
-            "48": 0.08608,
-            "49": 0.08565,
-            "50": 0.08559,
-            "51": 0.10342,
-            "52": 0.09048,
-            "53": 0.08707,
-            "54": 0.08719,
-            "55": 0.08631,
-            "56": 0.11667,
-            "57": 0.08592,
-            "58": 0.08517,
-            "59": 0.08612,
-            "60": 0.08514,
-            "61": 0.0855,
-            "62": 0.08527,
-            "63": 0.08586,
-            "64": 0.08556,
-            "65": 0.08633,
-            "66": 0.08532,
-            "67": 0.08593,
-            "68": 0.08563,
-            "69": 0.08537,
-            "70": 0.08538,
-            "71": 0.08507,
-            "72": 0.08593,
-            "73": 0.08623,
-            "74": 0.08561,
-            "75": 0.08536,
-            "76": 0.08551,
-            "77": 0.08526,
-            "78": 0.0859,
-            "79": 0.08518,
-            "80": 0.08601,
-            "81": 0.08574,
-            "82": 0.08618,
-            "83": 0.08532,
-            "84": 0.08505,
-            "85": 0.08545,
-            "86": 0.08554,
-            "87": 0.08542,
-            "88": 0.08575,
-            "89": 0.0861,
-            "90": 0.08516,
-            "91": 0.08552,
-            "92": 0.08581,
-            "93": 0.08558,
-            "94": 0.08577,
-            "95": 0.08708,
-            "96": 0.08574,
-            "97": 0.08543,
-            "98": 0.0855,
-            "99": 0.08537,
-            "100": 0.08541
+            "1": 3.22526,
+            "2": 0.19893,
+            "3": 0.09313,
+            "4": 0.08045,
+            "5": 0.08171,
+            "6": 0.08058,
+            "7": 0.08022,
+            "8": 0.07981,
+            "9": 0.0808,
+            "10": 0.08068,
+            "11": 0.08073,
+            "12": 0.08318,
+            "13": 0.08514,
+            "14": 0.08404,
+            "15": 0.08382,
+            "16": 0.08982,
+            "17": 0.08387,
+            "18": 0.08342,
+            "19": 0.08359,
+            "20": 0.07926,
+            "21": 0.08037,
+            "22": 0.08041,
+            "23": 0.08187,
+            "24": 0.08232,
+            "25": 0.08012,
+            "26": 0.08081,
+            "27": 0.08072,
+            "28": 0.08454,
+            "29": 0.08003,
+            "30": 0.07895,
+            "31": 0.08312,
+            "32": 0.08109,
+            "33": 0.08106,
+            "34": 0.07905,
+            "35": 0.08145,
+            "36": 0.08345,
+            "37": 0.07972,
+            "38": 0.07895,
+            "39": 0.0795,
+            "40": 0.07971,
+            "41": 0.08032,
+            "42": 0.07938,
+            "43": 0.0806,
+            "44": 0.07956,
+            "45": 0.07918,
+            "46": 0.07961,
+            "47": 0.07937,
+            "48": 0.08049,
+            "49": 0.07875,
+            "50": 0.07866,
+            "51": 0.08212,
+            "52": 0.07853,
+            "53": 0.07869,
+            "54": 0.07753,
+            "55": 0.0774,
+            "56": 0.07699,
+            "57": 0.07754,
+            "58": 0.07721,
+            "59": 0.07784,
+            "60": 0.07727,
+            "61": 0.07709,
+            "62": 0.07721,
+            "63": 0.07751,
+            "64": 0.07763,
+            "65": 0.07813,
+            "66": 0.07898,
+            "67": 0.07875,
+            "68": 0.07868,
+            "69": 0.0789,
+            "70": 0.07834,
+            "71": 0.07782,
+            "72": 0.07816,
+            "73": 0.0785,
+            "74": 0.0787,
+            "75": 0.07812,
+            "76": 0.07812,
+            "77": 0.07845,
+            "78": 0.07888,
+            "79": 0.07811,
+            "80": 0.07836,
+            "81": 0.07854,
+            "82": 0.07902,
+            "83": 0.07769,
+            "84": 0.07776,
+            "85": 0.07749,
+            "86": 0.07824,
+            "87": 0.07761,
+            "88": 0.07812,
+            "89": 0.07814,
+            "90": 0.07827,
+            "91": 0.07825,
+            "92": 0.07856,
+            "93": 0.07779,
+            "94": 0.07786,
+            "95": 0.07734,
+            "96": 0.07776,
+            "97": 0.07809,
+            "98": 0.07855,
+            "99": 0.07768,
+            "100": 0.08111
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..c941dc70aab
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83459,
+            "52": 9.73231,
+            "53": 10.04881,
+            "54": 9.93895,
+            "55": 9.86297,
+            "56": 9.613,
+            "57": 9.46964,
+            "58": 9.81136,
+            "59": 9.57107,
+            "60": 9.48153,
+            "61": 9.67881,
+            "62": 9.96579,
+            "63": 9.35276,
+            "64": 9.75644,
+            "65": 8.93769,
+            "66": 9.68152,
+            "67": 9.35669,
+            "68": 9.76806,
+            "69": 9.7739,
+            "70": 9.71012,
+            "71": 9.60009,
+            "72": 9.56796,
+            "73": 9.47739,
+            "74": 8.93177,
+            "75": 9.40721,
+            "76": 9.06847,
+            "77": 10.0464,
+            "78": 9.70984,
+            "79": 9.35731,
+            "80": 9.38978,
+            "81": 9.4662,
+            "82": 9.68056,
+            "83": 9.29144,
+            "84": 9.40194,
+            "85": 9.59734,
+            "86": 9.06207,
+            "87": 9.57921,
+            "88": 9.73262,
+            "89": 9.58838,
+            "90": 9.80354,
+            "91": 9.31991,
+            "92": 9.35013,
+            "93": 9.06378,
+            "94": 8.81909,
+            "95": 9.50572,
+            "96": 9.51068,
+            "97": 9.29244,
+            "98": 9.65579,
+            "99": 8.87401,
+            "100": 9.38837
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2452.0,
+            "52": 2576.0,
+            "53": 2914.0,
+            "54": 2741.0,
+            "55": 2408.0,
+            "56": 2650.0,
+            "57": 2264.0,
+            "58": 2853.0,
+            "59": 2757.0,
+            "60": 2509.0,
+            "61": 3076.0,
+            "62": 2709.0,
+            "63": 2563.0,
+            "64": 3041.0,
+            "65": 2687.0,
+            "66": 3089.0,
+            "67": 2767.0,
+            "68": 2930.0,
+            "69": 2911.0,
+            "70": 3286.0,
+            "71": 3105.0,
+            "72": 2507.0,
+            "73": 3063.0,
+            "74": 2022.0,
+            "75": 2763.0,
+            "76": 3002.0,
+            "77": 3382.0,
+            "78": 3470.0,
+            "79": 3109.0,
+            "80": 3357.0,
+            "81": 3798.0,
+            "82": 3348.0,
+            "83": 2763.0,
+            "84": 3271.0,
+            "85": 3245.0,
+            "86": 2587.0,
+            "87": 3650.0,
+            "88": 3103.0,
+            "89": 3471.0,
+            "90": 3086.0,
+            "91": 3050.0,
+            "92": 3368.0,
+            "93": 2828.0,
+            "94": 3495.0,
+            "95": 3424.0,
+            "96": 3559.0,
+            "97": 3289.0,
+            "98": 3727.0,
+            "99": 3275.0,
+            "100": 3401.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 552128512.0,
+            "52": 552128512.0,
+            "53": 552128512.0,
+            "54": 552128512.0,
+            "55": 552128512.0,
+            "56": 552128512.0,
+            "57": 552128512.0,
+            "58": 552128512.0,
+            "59": 552128512.0,
+            "60": 552128512.0,
+            "61": 552128512.0,
+            "62": 552128512.0,
+            "63": 552128512.0,
+            "64": 552128512.0,
+            "65": 552128512.0,
+            "66": 552128512.0,
+            "67": 552128512.0,
+            "68": 552128512.0,
+            "69": 552128512.0,
+            "70": 552128512.0,
+            "71": 552128512.0,
+            "72": 552128512.0,
+            "73": 552128512.0,
+            "74": 552128512.0,
+            "75": 552128512.0,
+            "76": 552128512.0,
+            "77": 552128512.0,
+            "78": 552128512.0,
+            "79": 552128512.0,
+            "80": 552128512.0,
+            "81": 552128512.0,
+            "82": 552128512.0,
+            "83": 552128512.0,
+            "84": 552128512.0,
+            "85": 552128512.0,
+            "86": 552128512.0,
+            "87": 552128512.0,
+            "88": 552128512.0,
+            "89": 552128512.0,
+            "90": 552128512.0,
+            "91": 552128512.0,
+            "92": 552128512.0,
+            "93": 552128512.0,
+            "94": 552128512.0,
+            "95": 552128512.0,
+            "96": 552128512.0,
+            "97": 552128512.0,
+            "98": 552128512.0,
+            "99": 552128512.0,
+            "100": 552128512.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2679207424.0,
+            "52": 2712762880.0,
+            "53": 2712762880.0,
+            "54": 2712762880.0,
+            "55": 2712762880.0,
+            "56": 2712762880.0,
+            "57": 2712762880.0,
+            "58": 2712762880.0,
+            "59": 2712762880.0,
+            "60": 2712762880.0,
+            "61": 2712762880.0,
+            "62": 2712762880.0,
+            "63": 2712762880.0,
+            "64": 2712762880.0,
+            "65": 2712762880.0,
+            "66": 2712762880.0,
+            "67": 2712762880.0,
+            "68": 2712762880.0,
+            "69": 2712762880.0,
+            "70": 2712762880.0,
+            "71": 2712762880.0,
+            "72": 2712762880.0,
+            "73": 2712762880.0,
+            "74": 2712762880.0,
+            "75": 2712762880.0,
+            "76": 2712762880.0,
+            "77": 2712762880.0,
+            "78": 2712762880.0,
+            "79": 2712762880.0,
+            "80": 2712762880.0,
+            "81": 2712762880.0,
+            "82": 2712762880.0,
+            "83": 2712762880.0,
+            "84": 2712762880.0,
+            "85": 2712762880.0,
+            "86": 2712762880.0,
+            "87": 2712762880.0,
+            "88": 2712762880.0,
+            "89": 2712762880.0,
+            "90": 2712762880.0,
+            "91": 2712762880.0,
+            "92": 2712762880.0,
+            "93": 2712762880.0,
+            "94": 2712762880.0,
+            "95": 2712762880.0,
+            "96": 2712762880.0,
+            "97": 2712762880.0,
+            "98": 2712762880.0,
+            "99": 2712762880.0,
+            "100": 2712762880.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.18495,
+            "52": 0.6276,
+            "53": 0.08049,
+            "54": 0.07972,
+            "55": 0.08135,
+            "56": 0.07856,
+            "57": 0.08351,
+            "58": 0.07967,
+            "59": 0.08019,
+            "60": 0.0792,
+            "61": 0.07924,
+            "62": 0.07905,
+            "63": 0.08021,
+            "64": 0.07964,
+            "65": 0.07981,
+            "66": 0.07892,
+            "67": 0.07984,
+            "68": 0.07904,
+            "69": 0.07969,
+            "70": 0.07923,
+            "71": 0.07928,
+            "72": 0.07969,
+            "73": 0.07956,
+            "74": 0.08002,
+            "75": 0.07918,
+            "76": 0.07955,
+            "77": 0.07938,
+            "78": 0.08006,
+            "79": 0.07935,
+            "80": 0.07959,
+            "81": 0.08018,
+            "82": 0.07963,
+            "83": 0.07952,
+            "84": 0.07938,
+            "85": 0.07915,
+            "86": 0.07965,
+            "87": 0.07999,
+            "88": 0.07951,
+            "89": 0.08006,
+            "90": 0.0794,
+            "91": 0.07948,
+            "92": 0.07896,
+            "93": 0.07977,
+            "94": 0.07916,
+            "95": 0.07921,
+            "96": 0.07884,
+            "97": 0.0796,
+            "98": 0.07923,
+            "99": 0.07955,
+            "100": 0.07931
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..a084bf35662
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77447,
+            "2": 10.78365,
+            "3": 10.78346,
+            "4": 10.74822,
+            "5": 10.81983,
+            "6": 10.82303,
+            "7": 10.79055,
+            "8": 10.77956,
+            "9": 10.78595,
+            "10": 10.74453,
+            "11": 10.83267,
+            "12": 10.80426,
+            "13": 10.82082,
+            "14": 10.82568,
+            "15": 10.74205,
+            "16": 10.74901,
+            "17": 10.72521,
+            "18": 10.74178,
+            "19": 10.74415,
+            "20": 10.63672,
+            "21": 10.63053,
+            "22": 10.47964,
+            "23": 10.65979,
+            "24": 10.52478,
+            "25": 10.47552,
+            "26": 10.54115,
+            "27": 10.55498,
+            "28": 10.52138,
+            "29": 10.53601,
+            "30": 10.3053,
+            "31": 10.06443,
+            "32": 10.41576,
+            "33": 10.42199,
+            "34": 10.17396,
+            "35": 10.22407,
+            "36": 10.18503,
+            "37": 10.30413,
+            "38": 10.14998,
+            "39": 10.37038,
+            "40": 10.03991,
+            "41": 10.1095,
+            "42": 10.17936,
+            "43": 9.79751,
+            "44": 9.90816,
+            "45": 9.79806,
+            "46": 9.79659,
+            "47": 10.1211,
+            "48": 9.82086,
+            "49": 9.50494,
+            "50": 9.88025,
+            "51": 9.83617,
+            "52": 9.72317,
+            "53": 10.05321,
+            "54": 9.93744,
+            "55": 9.87386,
+            "56": 9.60451,
+            "57": 9.45231,
+            "58": 9.81883,
+            "59": 9.57722,
+            "60": 9.48536,
+            "61": 9.68547,
+            "62": 9.97907,
+            "63": 9.36417,
+            "64": 9.76205,
+            "65": 8.94102,
+            "66": 9.69479,
+            "67": 9.36657,
+            "68": 9.77743,
+            "69": 9.78996,
+            "70": 9.72377,
+            "71": 9.62042,
+            "72": 9.57421,
+            "73": 9.48574,
+            "74": 8.92728,
+            "75": 9.41652,
+            "76": 9.07749,
+            "77": 10.05445,
+            "78": 9.71913,
+            "79": 9.37304,
+            "80": 9.40003,
+            "81": 9.47846,
+            "82": 9.69869,
+            "83": 9.31156,
+            "84": 9.41458,
+            "85": 9.61162,
+            "86": 9.07419,
+            "87": 9.59392,
+            "88": 9.74925,
+            "89": 9.59851,
+            "90": 9.82763,
+            "91": 9.33629,
+            "92": 9.35804,
+            "93": 9.08549,
+            "94": 8.8279,
+            "95": 9.53033,
+            "96": 9.52662,
+            "97": 9.30484,
+            "98": 9.67007,
+            "99": 8.89604,
+            "100": 9.407
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1531.0,
+            "2": 1722.0,
+            "3": 1589.0,
+            "4": 1870.0,
+            "5": 1992.0,
+            "6": 1894.0,
+            "7": 1954.0,
+            "8": 1697.0,
+            "9": 1855.0,
+            "10": 1477.0,
+            "11": 1889.0,
+            "12": 1848.0,
+            "13": 1885.0,
+            "14": 1934.0,
+            "15": 1984.0,
+            "16": 1934.0,
+            "17": 1820.0,
+            "18": 1643.0,
+            "19": 1735.0,
+            "20": 1682.0,
+            "21": 1974.0,
+            "22": 1733.0,
+            "23": 1932.0,
+            "24": 1650.0,
+            "25": 1603.0,
+            "26": 1762.0,
+            "27": 1846.0,
+            "28": 1899.0,
+            "29": 2020.0,
+            "30": 1941.0,
+            "31": 1620.0,
+            "32": 1902.0,
+            "33": 2053.0,
+            "34": 1891.0,
+            "35": 1988.0,
+            "36": 1990.0,
+            "37": 2382.0,
+            "38": 2143.0,
+            "39": 2445.0,
+            "40": 2284.0,
+            "41": 2265.0,
+            "42": 2272.0,
+            "43": 2112.0,
+            "44": 2088.0,
+            "45": 2332.0,
+            "46": 2345.0,
+            "47": 2550.0,
+            "48": 2419.0,
+            "49": 2250.0,
+            "50": 2509.0,
+            "51": 2708.0,
+            "52": 2707.0,
+            "53": 2812.0,
+            "54": 2620.0,
+            "55": 2399.0,
+            "56": 2790.0,
+            "57": 2301.0,
+            "58": 3008.0,
+            "59": 2863.0,
+            "60": 2465.0,
+            "61": 2808.0,
+            "62": 2607.0,
+            "63": 2442.0,
+            "64": 2977.0,
+            "65": 2646.0,
+            "66": 3061.0,
+            "67": 2818.0,
+            "68": 2891.0,
+            "69": 3036.0,
+            "70": 3160.0,
+            "71": 3064.0,
+            "72": 2618.0,
+            "73": 2978.0,
+            "74": 2000.0,
+            "75": 2580.0,
+            "76": 2967.0,
+            "77": 3281.0,
+            "78": 3131.0,
+            "79": 3108.0,
+            "80": 3217.0,
+            "81": 3614.0,
+            "82": 3411.0,
+            "83": 2834.0,
+            "84": 3191.0,
+            "85": 3306.0,
+            "86": 2806.0,
+            "87": 3808.0,
+            "88": 3237.0,
+            "89": 3425.0,
+            "90": 3202.0,
+            "91": 2829.0,
+            "92": 3105.0,
+            "93": 2882.0,
+            "94": 3303.0,
+            "95": 3310.0,
+            "96": 3499.0,
+            "97": 3211.0,
+            "98": 3741.0,
+            "99": 3167.0,
+            "100": 3049.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1260800512.0,
+            "2": 1260800512.0,
+            "3": 1260800512.0,
+            "4": 1260800512.0,
+            "5": 1260800512.0,
+            "6": 1260800512.0,
+            "7": 1260800512.0,
+            "8": 1260800512.0,
+            "9": 1260800512.0,
+            "10": 1260800512.0,
+            "11": 1260800512.0,
+            "12": 1260800512.0,
+            "13": 1260800512.0,
+            "14": 1260800512.0,
+            "15": 1260800512.0,
+            "16": 1260800512.0,
+            "17": 1260800512.0,
+            "18": 1260800512.0,
+            "19": 1260800512.0,
+            "20": 1260800512.0,
+            "21": 1260800512.0,
+            "22": 1260800512.0,
+            "23": 1260800512.0,
+            "24": 1260800512.0,
+            "25": 1260800512.0,
+            "26": 1260800512.0,
+            "27": 1260800512.0,
+            "28": 1260800512.0,
+            "29": 1260800512.0,
+            "30": 1260800512.0,
+            "31": 1260800512.0,
+            "32": 1260800512.0,
+            "33": 1260800512.0,
+            "34": 1260800512.0,
+            "35": 1260800512.0,
+            "36": 1260800512.0,
+            "37": 1260800512.0,
+            "38": 1260800512.0,
+            "39": 1260800512.0,
+            "40": 1260800512.0,
+            "41": 1260800512.0,
+            "42": 1260800512.0,
+            "43": 1260800512.0,
+            "44": 1260800512.0,
+            "45": 1260800512.0,
+            "46": 1260800512.0,
+            "47": 1260800512.0,
+            "48": 1260800512.0,
+            "49": 1260800512.0,
+            "50": 1260800512.0,
+            "51": 1260800512.0,
+            "52": 1260800512.0,
+            "53": 1260800512.0,
+            "54": 1260800512.0,
+            "55": 1260800512.0,
+            "56": 1260800512.0,
+            "57": 1260800512.0,
+            "58": 1260800512.0,
+            "59": 1260800512.0,
+            "60": 1260800512.0,
+            "61": 1260800512.0,
+            "62": 1260800512.0,
+            "63": 1260800512.0,
+            "64": 1260800512.0,
+            "65": 1260800512.0,
+            "66": 1260800512.0,
+            "67": 1260800512.0,
+            "68": 1260800512.0,
+            "69": 1260800512.0,
+            "70": 1260800512.0,
+            "71": 1260800512.0,
+            "72": 1260800512.0,
+            "73": 1260800512.0,
+            "74": 1260800512.0,
+            "75": 1260800512.0,
+            "76": 1260800512.0,
+            "77": 1260800512.0,
+            "78": 1260800512.0,
+            "79": 1260800512.0,
+            "80": 1260800512.0,
+            "81": 1260800512.0,
+            "82": 1260800512.0,
+            "83": 1260800512.0,
+            "84": 1260800512.0,
+            "85": 1260800512.0,
+            "86": 1260800512.0,
+            "87": 1260800512.0,
+            "88": 1260800512.0,
+            "89": 1260800512.0,
+            "90": 1260800512.0,
+            "91": 1260800512.0,
+            "92": 1260800512.0,
+            "93": 1260800512.0,
+            "94": 1260800512.0,
+            "95": 1260800512.0,
+            "96": 1260800512.0,
+            "97": 1260800512.0,
+            "98": 1260800512.0,
+            "99": 1260800512.0,
+            "100": 1260800512.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2013853696.0,
+            "2": 2562382848.0,
+            "3": 2562382848.0,
+            "4": 2562382848.0,
+            "5": 2562382848.0,
+            "6": 2562382848.0,
+            "7": 2562382848.0,
+            "8": 2562382848.0,
+            "9": 2562382848.0,
+            "10": 2562382848.0,
+            "11": 2562382848.0,
+            "12": 2562382848.0,
+            "13": 2562382848.0,
+            "14": 2562382848.0,
+            "15": 2562382848.0,
+            "16": 2562382848.0,
+            "17": 2562382848.0,
+            "18": 2562382848.0,
+            "19": 2562382848.0,
+            "20": 2562382848.0,
+            "21": 2562382848.0,
+            "22": 2562382848.0,
+            "23": 2562382848.0,
+            "24": 2562382848.0,
+            "25": 2562382848.0,
+            "26": 2562382848.0,
+            "27": 2562382848.0,
+            "28": 2562382848.0,
+            "29": 2562382848.0,
+            "30": 2562382848.0,
+            "31": 2562382848.0,
+            "32": 2562382848.0,
+            "33": 2562382848.0,
+            "34": 2562382848.0,
+            "35": 2562382848.0,
+            "36": 2562382848.0,
+            "37": 2562382848.0,
+            "38": 2562382848.0,
+            "39": 2562382848.0,
+            "40": 2562382848.0,
+            "41": 2562382848.0,
+            "42": 2562382848.0,
+            "43": 2562382848.0,
+            "44": 2562382848.0,
+            "45": 2562382848.0,
+            "46": 2562382848.0,
+            "47": 2562382848.0,
+            "48": 2562382848.0,
+            "49": 2562382848.0,
+            "50": 2562382848.0,
+            "51": 2562382848.0,
+            "52": 2562382848.0,
+            "53": 2562382848.0,
+            "54": 2562382848.0,
+            "55": 2562382848.0,
+            "56": 2562382848.0,
+            "57": 2562382848.0,
+            "58": 2562382848.0,
+            "59": 2562382848.0,
+            "60": 2562382848.0,
+            "61": 2562382848.0,
+            "62": 2562382848.0,
+            "63": 2562382848.0,
+            "64": 2562382848.0,
+            "65": 2562382848.0,
+            "66": 2562382848.0,
+            "67": 2562382848.0,
+            "68": 2562382848.0,
+            "69": 2562382848.0,
+            "70": 2562382848.0,
+            "71": 2562382848.0,
+            "72": 2562382848.0,
+            "73": 2562382848.0,
+            "74": 2562382848.0,
+            "75": 2562382848.0,
+            "76": 2562382848.0,
+            "77": 2562382848.0,
+            "78": 2562382848.0,
+            "79": 2562382848.0,
+            "80": 2562382848.0,
+            "81": 2562382848.0,
+            "82": 2562382848.0,
+            "83": 2562382848.0,
+            "84": 2562382848.0,
+            "85": 2562382848.0,
+            "86": 2562382848.0,
+            "87": 2562382848.0,
+            "88": 2562382848.0,
+            "89": 2562382848.0,
+            "90": 2562382848.0,
+            "91": 2562382848.0,
+            "92": 2562382848.0,
+            "93": 2562382848.0,
+            "94": 2562382848.0,
+            "95": 2562382848.0,
+            "96": 2562382848.0,
+            "97": 2562382848.0,
+            "98": 2562382848.0,
+            "99": 2562382848.0,
+            "100": 2562382848.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.02256,
+            "3": 0.13455,
+            "4": 0.12293,
+            "5": 0.12302,
+            "6": 0.1233,
+            "7": 0.12328,
+            "8": 0.12248,
+            "9": 0.12446,
+            "10": 0.12285,
+            "11": 0.12255,
+            "12": 0.12296,
+            "13": 0.12411,
+            "14": 0.12369,
+            "15": 0.12438,
+            "16": 0.12387,
+            "17": 0.12481,
+            "18": 0.12591,
+            "19": 0.12445,
+            "20": 0.12257,
+            "21": 0.12141,
+            "22": 0.12289,
+            "23": 0.12296,
+            "24": 0.12246,
+            "25": 0.12246,
+            "26": 0.12219,
+            "27": 0.12283,
+            "28": 0.12209,
+            "29": 0.12164,
+            "30": 0.12236,
+            "31": 0.1236,
+            "32": 0.12251,
+            "33": 0.12372,
+            "34": 0.12054,
+            "35": 0.12166,
+            "36": 0.12052,
+            "37": 0.12268,
+            "38": 0.12181,
+            "39": 0.12231,
+            "40": 0.1195,
+            "41": 0.12001,
+            "42": 0.12145,
+            "43": 0.12238,
+            "44": 0.12054,
+            "45": 0.11842,
+            "46": 0.11812,
+            "47": 0.11785,
+            "48": 0.11631,
+            "49": 0.11798,
+            "50": 0.11707,
+            "51": 0.12234,
+            "52": 0.11424,
+            "53": 0.11577,
+            "54": 0.11058,
+            "55": 0.11651,
+            "56": 0.12356,
+            "57": 0.12837,
+            "58": 0.1238,
+            "59": 0.13093,
+            "60": 0.14556,
+            "61": 0.1747,
+            "62": 0.14486,
+            "63": 0.15679,
+            "64": 0.14116,
+            "65": 0.13574,
+            "66": 0.16023,
+            "67": 0.14862,
+            "68": 0.14163,
+            "69": 0.14244,
+            "70": 0.13512,
+            "71": 0.1407,
+            "72": 0.14689,
+            "73": 0.13238,
+            "74": 0.1279,
+            "75": 0.12535,
+            "76": 0.12172,
+            "77": 0.12314,
+            "78": 0.12089,
+            "79": 0.11925,
+            "80": 0.11854,
+            "81": 0.11618,
+            "82": 0.11706,
+            "83": 0.11632,
+            "84": 0.11839,
+            "85": 0.11763,
+            "86": 0.11977,
+            "87": 0.12013,
+            "88": 0.11954,
+            "89": 0.11859,
+            "90": 0.11546,
+            "91": 0.11494,
+            "92": 0.1154,
+            "93": 0.11866,
+            "94": 0.25826,
+            "95": 0.11359,
+            "96": 0.11427,
+            "97": 0.11526,
+            "98": 0.11269,
+            "99": 0.11239,
+            "100": 0.11374
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..9853cec1655
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83617,
+            "52": 9.72317,
+            "53": 10.05321,
+            "54": 9.93744,
+            "55": 9.87386,
+            "56": 9.60451,
+            "57": 9.45231,
+            "58": 9.81883,
+            "59": 9.57722,
+            "60": 9.48536,
+            "61": 9.68547,
+            "62": 9.97907,
+            "63": 9.36417,
+            "64": 9.76205,
+            "65": 8.94102,
+            "66": 9.69479,
+            "67": 9.36657,
+            "68": 9.77743,
+            "69": 9.78996,
+            "70": 9.72377,
+            "71": 9.62042,
+            "72": 9.57421,
+            "73": 9.48574,
+            "74": 8.92728,
+            "75": 9.41652,
+            "76": 9.07749,
+            "77": 10.05445,
+            "78": 9.71913,
+            "79": 9.37304,
+            "80": 9.40003,
+            "81": 9.47846,
+            "82": 9.69869,
+            "83": 9.31156,
+            "84": 9.41458,
+            "85": 9.61162,
+            "86": 9.07419,
+            "87": 9.59392,
+            "88": 9.74925,
+            "89": 9.59851,
+            "90": 9.82763,
+            "91": 9.33629,
+            "92": 9.35804,
+            "93": 9.08549,
+            "94": 8.8279,
+            "95": 9.53033,
+            "96": 9.52662,
+            "97": 9.30484,
+            "98": 9.67007,
+            "99": 8.89604,
+            "100": 9.407
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2708.0,
+            "52": 2707.0,
+            "53": 2812.0,
+            "54": 2620.0,
+            "55": 2399.0,
+            "56": 2790.0,
+            "57": 2301.0,
+            "58": 3008.0,
+            "59": 2863.0,
+            "60": 2465.0,
+            "61": 2808.0,
+            "62": 2607.0,
+            "63": 2442.0,
+            "64": 2977.0,
+            "65": 2646.0,
+            "66": 3061.0,
+            "67": 2818.0,
+            "68": 2891.0,
+            "69": 3036.0,
+            "70": 3160.0,
+            "71": 3064.0,
+            "72": 2618.0,
+            "73": 2978.0,
+            "74": 2000.0,
+            "75": 2580.0,
+            "76": 2967.0,
+            "77": 3281.0,
+            "78": 3131.0,
+            "79": 3108.0,
+            "80": 3217.0,
+            "81": 3614.0,
+            "82": 3411.0,
+            "83": 2834.0,
+            "84": 3191.0,
+            "85": 3306.0,
+            "86": 2806.0,
+            "87": 3808.0,
+            "88": 3237.0,
+            "89": 3425.0,
+            "90": 3202.0,
+            "91": 2829.0,
+            "92": 3105.0,
+            "93": 2882.0,
+            "94": 3303.0,
+            "95": 3310.0,
+            "96": 3499.0,
+            "97": 3211.0,
+            "98": 3741.0,
+            "99": 3167.0,
+            "100": 3049.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1261849088.0,
+            "52": 1261849088.0,
+            "53": 1261849088.0,
+            "54": 1261849088.0,
+            "55": 1261849088.0,
+            "56": 1261849088.0,
+            "57": 1261849088.0,
+            "58": 1261849088.0,
+            "59": 1261849088.0,
+            "60": 1261849088.0,
+            "61": 1261849088.0,
+            "62": 1261849088.0,
+            "63": 1261849088.0,
+            "64": 1261849088.0,
+            "65": 1261849088.0,
+            "66": 1261849088.0,
+            "67": 1261849088.0,
+            "68": 1261849088.0,
+            "69": 1261849088.0,
+            "70": 1261849088.0,
+            "71": 1261849088.0,
+            "72": 1261849088.0,
+            "73": 1261849088.0,
+            "74": 1261849088.0,
+            "75": 1261849088.0,
+            "76": 1261849088.0,
+            "77": 1261849088.0,
+            "78": 1261849088.0,
+            "79": 1261849088.0,
+            "80": 1261849088.0,
+            "81": 1261849088.0,
+            "82": 1261849088.0,
+            "83": 1261849088.0,
+            "84": 1261849088.0,
+            "85": 1261849088.0,
+            "86": 1261849088.0,
+            "87": 1261849088.0,
+            "88": 1261849088.0,
+            "89": 1261849088.0,
+            "90": 1261849088.0,
+            "91": 1261849088.0,
+            "92": 1261849088.0,
+            "93": 1261849088.0,
+            "94": 1261849088.0,
+            "95": 1261849088.0,
+            "96": 1261849088.0,
+            "97": 1261849088.0,
+            "98": 1261849088.0,
+            "99": 1261849088.0,
+            "100": 1261849088.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2530924544.0,
+            "52": 2564480000.0,
+            "53": 2564480000.0,
+            "54": 2564480000.0,
+            "55": 2564480000.0,
+            "56": 2564480000.0,
+            "57": 2564480000.0,
+            "58": 2564480000.0,
+            "59": 2564480000.0,
+            "60": 2564480000.0,
+            "61": 2564480000.0,
+            "62": 2564480000.0,
+            "63": 2564480000.0,
+            "64": 2564480000.0,
+            "65": 2564480000.0,
+            "66": 2564480000.0,
+            "67": 2564480000.0,
+            "68": 2564480000.0,
+            "69": 2564480000.0,
+            "70": 2564480000.0,
+            "71": 2564480000.0,
+            "72": 2564480000.0,
+            "73": 2564480000.0,
+            "74": 2564480000.0,
+            "75": 2564480000.0,
+            "76": 2564480000.0,
+            "77": 2564480000.0,
+            "78": 2564480000.0,
+            "79": 2564480000.0,
+            "80": 2564480000.0,
+            "81": 2564480000.0,
+            "82": 2564480000.0,
+            "83": 2564480000.0,
+            "84": 2564480000.0,
+            "85": 2564480000.0,
+            "86": 2564480000.0,
+            "87": 2564480000.0,
+            "88": 2564480000.0,
+            "89": 2564480000.0,
+            "90": 2564480000.0,
+            "91": 2564480000.0,
+            "92": 2564480000.0,
+            "93": 2564480000.0,
+            "94": 2564480000.0,
+            "95": 2564480000.0,
+            "96": 2564480000.0,
+            "97": 2564480000.0,
+            "98": 2564480000.0,
+            "99": 2564480000.0,
+            "100": 2564480000.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 1.73474,
+            "53": 0.13329,
+            "54": 0.12193,
+            "55": 0.12308,
+            "56": 0.12634,
+            "57": 0.12745,
+            "58": 0.12425,
+            "59": 0.12199,
+            "60": 0.12359,
+            "61": 0.11982,
+            "62": 0.12161,
+            "63": 0.11993,
+            "64": 0.12221,
+            "65": 0.12364,
+            "66": 0.12245,
+            "67": 0.12126,
+            "68": 0.12211,
+            "69": 0.11961,
+            "70": 0.12166,
+            "71": 0.11999,
+            "72": 0.12512,
+            "73": 0.12157,
+            "74": 0.11996,
+            "75": 0.12183,
+            "76": 0.11982,
+            "77": 0.1205,
+            "78": 0.12225,
+            "79": 0.12245,
+            "80": 0.12222,
+            "81": 0.12087,
+            "82": 0.11834,
+            "83": 0.11849,
+            "84": 0.11754,
+            "85": 0.1168,
+            "86": 0.11739,
+            "87": 0.11786,
+            "88": 0.1178,
+            "89": 0.11801,
+            "90": 0.11757,
+            "91": 0.11668,
+            "92": 0.11659,
+            "93": 0.11656,
+            "94": 0.11741,
+            "95": 0.11613,
+            "96": 0.11801,
+            "97": 0.11735,
+            "98": 0.1168,
+            "99": 0.11724,
+            "100": 0.1175
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json
index df5117f4d8f..05b11c3c8ee 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1261848064.0,
-            "2": 1261848064.0,
-            "3": 1261848064.0,
-            "4": 1261848064.0,
-            "5": 1261848064.0,
-            "6": 1261848064.0,
-            "7": 1261848064.0,
-            "8": 1261848064.0,
-            "9": 1261848064.0,
-            "10": 1261848064.0,
-            "11": 1261848064.0,
-            "12": 1261848064.0,
-            "13": 1261848064.0,
-            "14": 1261848064.0,
-            "15": 1261848064.0,
-            "16": 1261848064.0,
-            "17": 1261848064.0,
-            "18": 1261848064.0,
-            "19": 1261848064.0,
-            "20": 1261848064.0,
-            "21": 1261848064.0,
-            "22": 1261848064.0,
-            "23": 1261848064.0,
-            "24": 1261848064.0,
-            "25": 1261848064.0,
-            "26": 1261848064.0,
-            "27": 1261848064.0,
-            "28": 1261848064.0,
-            "29": 1261848064.0,
-            "30": 1261848064.0,
-            "31": 1261848064.0,
-            "32": 1261848064.0,
-            "33": 1261848064.0,
-            "34": 1261848064.0,
-            "35": 1261848064.0,
-            "36": 1261848064.0,
-            "37": 1261848064.0,
-            "38": 1261848064.0,
-            "39": 1261848064.0,
-            "40": 1261848064.0,
-            "41": 1261848064.0,
-            "42": 1261848064.0,
-            "43": 1261848064.0,
-            "44": 1261848064.0,
-            "45": 1261848064.0,
-            "46": 1261848064.0,
-            "47": 1261848064.0,
-            "48": 1261848064.0,
-            "49": 1261848064.0,
-            "50": 1261848064.0,
-            "51": 1261848064.0,
-            "52": 1261848064.0,
-            "53": 1261848064.0,
-            "54": 1261848064.0,
-            "55": 1261848064.0,
-            "56": 1261848064.0,
-            "57": 1261848064.0,
-            "58": 1261848064.0,
-            "59": 1261848064.0,
-            "60": 1261848064.0,
-            "61": 1261848064.0,
-            "62": 1261848064.0,
-            "63": 1261848064.0,
-            "64": 1261848064.0,
-            "65": 1261848064.0,
-            "66": 1261848064.0,
-            "67": 1261848064.0,
-            "68": 1261848064.0,
-            "69": 1261848064.0,
-            "70": 1261848064.0,
-            "71": 1261848064.0,
-            "72": 1261848064.0,
-            "73": 1261848064.0,
-            "74": 1261848064.0,
-            "75": 1261848064.0,
-            "76": 1261848064.0,
-            "77": 1261848064.0,
-            "78": 1261848064.0,
-            "79": 1261848064.0,
-            "80": 1261848064.0,
-            "81": 1261848064.0,
-            "82": 1261848064.0,
-            "83": 1261848064.0,
-            "84": 1261848064.0,
-            "85": 1261848064.0,
-            "86": 1261848064.0,
-            "87": 1261848064.0,
-            "88": 1261848064.0,
-            "89": 1261848064.0,
-            "90": 1261848064.0,
-            "91": 1261848064.0,
-            "92": 1261848064.0,
-            "93": 1261848064.0,
-            "94": 1261848064.0,
-            "95": 1261848064.0,
-            "96": 1261848064.0,
-            "97": 1261848064.0,
-            "98": 1261848064.0,
-            "99": 1261848064.0,
-            "100": 1261848064.0
+            "1": 1261849088.0,
+            "2": 1261849088.0,
+            "3": 1261849088.0,
+            "4": 1261849088.0,
+            "5": 1261849088.0,
+            "6": 1261849088.0,
+            "7": 1261849088.0,
+            "8": 1261849088.0,
+            "9": 1261849088.0,
+            "10": 1261849088.0,
+            "11": 1261849088.0,
+            "12": 1261849088.0,
+            "13": 1261849088.0,
+            "14": 1261849088.0,
+            "15": 1261849088.0,
+            "16": 1261849088.0,
+            "17": 1261849088.0,
+            "18": 1261849088.0,
+            "19": 1261849088.0,
+            "20": 1261849088.0,
+            "21": 1261849088.0,
+            "22": 1261849088.0,
+            "23": 1261849088.0,
+            "24": 1261849088.0,
+            "25": 1261849088.0,
+            "26": 1261849088.0,
+            "27": 1261849088.0,
+            "28": 1261849088.0,
+            "29": 1261849088.0,
+            "30": 1261849088.0,
+            "31": 1261849088.0,
+            "32": 1261849088.0,
+            "33": 1261849088.0,
+            "34": 1261849088.0,
+            "35": 1261849088.0,
+            "36": 1261849088.0,
+            "37": 1261849088.0,
+            "38": 1261849088.0,
+            "39": 1261849088.0,
+            "40": 1261849088.0,
+            "41": 1261849088.0,
+            "42": 1261849088.0,
+            "43": 1261849088.0,
+            "44": 1261849088.0,
+            "45": 1261849088.0,
+            "46": 1261849088.0,
+            "47": 1261849088.0,
+            "48": 1261849088.0,
+            "49": 1261849088.0,
+            "50": 1261849088.0,
+            "51": 1261849088.0,
+            "52": 1261849088.0,
+            "53": 1261849088.0,
+            "54": 1261849088.0,
+            "55": 1261849088.0,
+            "56": 1261849088.0,
+            "57": 1261849088.0,
+            "58": 1261849088.0,
+            "59": 1261849088.0,
+            "60": 1261849088.0,
+            "61": 1261849088.0,
+            "62": 1261849088.0,
+            "63": 1261849088.0,
+            "64": 1261849088.0,
+            "65": 1261849088.0,
+            "66": 1261849088.0,
+            "67": 1261849088.0,
+            "68": 1261849088.0,
+            "69": 1261849088.0,
+            "70": 1261849088.0,
+            "71": 1261849088.0,
+            "72": 1261849088.0,
+            "73": 1261849088.0,
+            "74": 1261849088.0,
+            "75": 1261849088.0,
+            "76": 1261849088.0,
+            "77": 1261849088.0,
+            "78": 1261849088.0,
+            "79": 1261849088.0,
+            "80": 1261849088.0,
+            "81": 1261849088.0,
+            "82": 1261849088.0,
+            "83": 1261849088.0,
+            "84": 1261849088.0,
+            "85": 1261849088.0,
+            "86": 1261849088.0,
+            "87": 1261849088.0,
+            "88": 1261849088.0,
+            "89": 1261849088.0,
+            "90": 1261849088.0,
+            "91": 1261849088.0,
+            "92": 1261849088.0,
+            "93": 1261849088.0,
+            "94": 1261849088.0,
+            "95": 1261849088.0,
+            "96": 1261849088.0,
+            "97": 1261849088.0,
+            "98": 1261849088.0,
+            "99": 1261849088.0,
+            "100": 1261849088.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2013852672.0,
-            "2": 2563430400.0,
-            "3": 2563430400.0,
-            "4": 2563430400.0,
-            "5": 2563430400.0,
-            "6": 2563430400.0,
-            "7": 2563430400.0,
-            "8": 2563430400.0,
-            "9": 2563430400.0,
-            "10": 2563430400.0,
-            "11": 2563430400.0,
-            "12": 2563430400.0,
-            "13": 2563430400.0,
-            "14": 2563430400.0,
-            "15": 2563430400.0,
-            "16": 2563430400.0,
-            "17": 2563430400.0,
-            "18": 2563430400.0,
-            "19": 2563430400.0,
-            "20": 2563430400.0,
-            "21": 2563430400.0,
-            "22": 2563430400.0,
-            "23": 2563430400.0,
-            "24": 2563430400.0,
-            "25": 2563430400.0,
-            "26": 2563430400.0,
-            "27": 2563430400.0,
-            "28": 2563430400.0,
-            "29": 2563430400.0,
-            "30": 2563430400.0,
-            "31": 2563430400.0,
-            "32": 2563430400.0,
-            "33": 2563430400.0,
-            "34": 2563430400.0,
-            "35": 2563430400.0,
-            "36": 2563430400.0,
-            "37": 2563430400.0,
-            "38": 2563430400.0,
-            "39": 2563430400.0,
-            "40": 2563430400.0,
-            "41": 2563430400.0,
-            "42": 2563430400.0,
-            "43": 2563430400.0,
-            "44": 2563430400.0,
-            "45": 2563430400.0,
-            "46": 2563430400.0,
-            "47": 2563430400.0,
-            "48": 2563430400.0,
-            "49": 2563430400.0,
-            "50": 2563430400.0,
-            "51": 2563430400.0,
-            "52": 2563430400.0,
-            "53": 2563430400.0,
-            "54": 2563430400.0,
-            "55": 2563430400.0,
-            "56": 2563430400.0,
-            "57": 2563430400.0,
-            "58": 2563430400.0,
-            "59": 2563430400.0,
-            "60": 2563430400.0,
-            "61": 2563430400.0,
-            "62": 2563430400.0,
-            "63": 2563430400.0,
-            "64": 2563430400.0,
-            "65": 2563430400.0,
-            "66": 2563430400.0,
-            "67": 2563430400.0,
-            "68": 2563430400.0,
-            "69": 2563430400.0,
-            "70": 2563430400.0,
-            "71": 2563430400.0,
-            "72": 2563430400.0,
-            "73": 2563430400.0,
-            "74": 2563430400.0,
-            "75": 2563430400.0,
-            "76": 2563430400.0,
-            "77": 2563430400.0,
-            "78": 2563430400.0,
-            "79": 2563430400.0,
-            "80": 2563430400.0,
-            "81": 2563430400.0,
-            "82": 2563430400.0,
-            "83": 2563430400.0,
-            "84": 2563430400.0,
-            "85": 2563430400.0,
-            "86": 2563430400.0,
-            "87": 2563430400.0,
-            "88": 2563430400.0,
-            "89": 2563430400.0,
-            "90": 2563430400.0,
-            "91": 2563430400.0,
-            "92": 2563430400.0,
-            "93": 2563430400.0,
-            "94": 2563430400.0,
-            "95": 2563430400.0,
-            "96": 2563430400.0,
-            "97": 2563430400.0,
-            "98": 2563430400.0,
-            "99": 2563430400.0,
-            "100": 2563430400.0
+            "1": 2013853696.0,
+            "2": 2563431424.0,
+            "3": 2563431424.0,
+            "4": 2563431424.0,
+            "5": 2563431424.0,
+            "6": 2563431424.0,
+            "7": 2563431424.0,
+            "8": 2563431424.0,
+            "9": 2563431424.0,
+            "10": 2563431424.0,
+            "11": 2563431424.0,
+            "12": 2563431424.0,
+            "13": 2563431424.0,
+            "14": 2563431424.0,
+            "15": 2563431424.0,
+            "16": 2563431424.0,
+            "17": 2563431424.0,
+            "18": 2563431424.0,
+            "19": 2563431424.0,
+            "20": 2563431424.0,
+            "21": 2563431424.0,
+            "22": 2563431424.0,
+            "23": 2563431424.0,
+            "24": 2563431424.0,
+            "25": 2563431424.0,
+            "26": 2563431424.0,
+            "27": 2563431424.0,
+            "28": 2563431424.0,
+            "29": 2563431424.0,
+            "30": 2563431424.0,
+            "31": 2563431424.0,
+            "32": 2563431424.0,
+            "33": 2563431424.0,
+            "34": 2563431424.0,
+            "35": 2563431424.0,
+            "36": 2563431424.0,
+            "37": 2563431424.0,
+            "38": 2563431424.0,
+            "39": 2563431424.0,
+            "40": 2563431424.0,
+            "41": 2563431424.0,
+            "42": 2563431424.0,
+            "43": 2563431424.0,
+            "44": 2563431424.0,
+            "45": 2563431424.0,
+            "46": 2563431424.0,
+            "47": 2563431424.0,
+            "48": 2563431424.0,
+            "49": 2563431424.0,
+            "50": 2563431424.0,
+            "51": 2563431424.0,
+            "52": 2563431424.0,
+            "53": 2563431424.0,
+            "54": 2563431424.0,
+            "55": 2563431424.0,
+            "56": 2563431424.0,
+            "57": 2563431424.0,
+            "58": 2563431424.0,
+            "59": 2563431424.0,
+            "60": 2563431424.0,
+            "61": 2563431424.0,
+            "62": 2563431424.0,
+            "63": 2563431424.0,
+            "64": 2563431424.0,
+            "65": 2563431424.0,
+            "66": 2563431424.0,
+            "67": 2563431424.0,
+            "68": 2563431424.0,
+            "69": 2563431424.0,
+            "70": 2563431424.0,
+            "71": 2563431424.0,
+            "72": 2563431424.0,
+            "73": 2563431424.0,
+            "74": 2563431424.0,
+            "75": 2563431424.0,
+            "76": 2563431424.0,
+            "77": 2563431424.0,
+            "78": 2563431424.0,
+            "79": 2563431424.0,
+            "80": 2563431424.0,
+            "81": 2563431424.0,
+            "82": 2563431424.0,
+            "83": 2563431424.0,
+            "84": 2563431424.0,
+            "85": 2563431424.0,
+            "86": 2563431424.0,
+            "87": 2563431424.0,
+            "88": 2563431424.0,
+            "89": 2563431424.0,
+            "90": 2563431424.0,
+            "91": 2563431424.0,
+            "92": 2563431424.0,
+            "93": 2563431424.0,
+            "94": 2563431424.0,
+            "95": 2563431424.0,
+            "96": 2563431424.0,
+            "97": 2563431424.0,
+            "98": 2563431424.0,
+            "99": 2563431424.0,
+            "100": 2563431424.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.36321,
-            "2": 0.1218,
-            "3": 0.11132,
-            "4": 0.10707,
-            "5": 0.0969,
-            "6": 0.09387,
-            "7": 0.09166,
-            "8": 0.09482,
-            "9": 0.09368,
-            "10": 0.09371,
-            "11": 0.0914,
-            "12": 0.09315,
-            "13": 0.09323,
-            "14": 0.09407,
-            "15": 0.09341,
-            "16": 0.09525,
-            "17": 0.09338,
-            "18": 0.09247,
-            "19": 0.09648,
-            "20": 0.09425,
-            "21": 0.09329,
-            "22": 0.09356,
-            "23": 0.09379,
-            "24": 0.09405,
-            "25": 0.0935,
-            "26": 0.09238,
-            "27": 0.09612,
-            "28": 0.09315,
-            "29": 0.09297,
-            "30": 0.09342,
-            "31": 0.09294,
-            "32": 0.09287,
-            "33": 0.09256,
-            "34": 0.09461,
-            "35": 0.09355,
-            "36": 0.09517,
-            "37": 0.09434,
-            "38": 0.0956,
-            "39": 0.09435,
-            "40": 0.09568,
-            "41": 0.09615,
-            "42": 0.09244,
-            "43": 0.09364,
-            "44": 0.09376,
-            "45": 0.09258,
-            "46": 0.09268,
-            "47": 0.09255,
-            "48": 0.09424,
-            "49": 0.09573,
-            "50": 0.09436,
-            "51": 0.0945,
-            "52": 0.09894,
-            "53": 0.09918,
-            "54": 0.09823,
-            "55": 0.09863,
-            "56": 0.09834,
-            "57": 0.09709,
-            "58": 0.09303,
-            "59": 0.09404,
-            "60": 0.09192,
-            "61": 0.09198,
-            "62": 0.09274,
-            "63": 0.09166,
-            "64": 0.09147,
-            "65": 0.09327,
-            "66": 0.11015,
-            "67": 0.09684,
-            "68": 0.09642,
-            "69": 0.09562,
-            "70": 0.0924,
-            "71": 0.09384,
-            "72": 0.09189,
-            "73": 0.09372,
-            "74": 0.09193,
-            "75": 0.09409,
-            "76": 0.09252,
-            "77": 0.09275,
-            "78": 0.09475,
-            "79": 0.0945,
-            "80": 0.10107,
-            "81": 0.09197,
-            "82": 0.09204,
-            "83": 0.09353,
-            "84": 0.09326,
-            "85": 0.09194,
-            "86": 0.1029,
-            "87": 0.09285,
-            "88": 0.09168,
-            "89": 0.09478,
-            "90": 0.09254,
-            "91": 0.0921,
-            "92": 0.09246,
-            "93": 0.09207,
-            "94": 0.09324,
-            "95": 0.09431,
-            "96": 0.09195,
-            "97": 0.09285,
-            "98": 0.09175,
-            "99": 0.09153,
-            "100": 0.11457
+            "1": 4.95057,
+            "2": 0.11272,
+            "3": 0.10006,
+            "4": 0.0754,
+            "5": 0.07446,
+            "6": 0.07499,
+            "7": 0.07451,
+            "8": 0.07507,
+            "9": 0.07406,
+            "10": 0.07462,
+            "11": 0.07387,
+            "12": 0.07421,
+            "13": 0.07426,
+            "14": 0.075,
+            "15": 0.07429,
+            "16": 0.07394,
+            "17": 0.07476,
+            "18": 0.07498,
+            "19": 0.07455,
+            "20": 0.07456,
+            "21": 0.07463,
+            "22": 0.07473,
+            "23": 0.07475,
+            "24": 0.0743,
+            "25": 0.07447,
+            "26": 0.07414,
+            "27": 0.07438,
+            "28": 0.07665,
+            "29": 0.07618,
+            "30": 0.07525,
+            "31": 0.07718,
+            "32": 0.07452,
+            "33": 0.07632,
+            "34": 0.07594,
+            "35": 0.0752,
+            "36": 0.07788,
+            "37": 0.07472,
+            "38": 0.07514,
+            "39": 0.07557,
+            "40": 0.07528,
+            "41": 0.07668,
+            "42": 0.07829,
+            "43": 0.07561,
+            "44": 0.07525,
+            "45": 0.07522,
+            "46": 0.08858,
+            "47": 0.09212,
+            "48": 0.07649,
+            "49": 0.07761,
+            "50": 0.07534,
+            "51": 0.0797,
+            "52": 0.07601,
+            "53": 0.07588,
+            "54": 0.07564,
+            "55": 0.07643,
+            "56": 0.07613,
+            "57": 0.07562,
+            "58": 0.07558,
+            "59": 0.07588,
+            "60": 0.07563,
+            "61": 0.07585,
+            "62": 0.07578,
+            "63": 0.07559,
+            "64": 0.07502,
+            "65": 0.07586,
+            "66": 0.07503,
+            "67": 0.0755,
+            "68": 0.07448,
+            "69": 0.07531,
+            "70": 0.07481,
+            "71": 0.07524,
+            "72": 0.07712,
+            "73": 0.07539,
+            "74": 0.07566,
+            "75": 0.07497,
+            "76": 0.07458,
+            "77": 0.07476,
+            "78": 0.07547,
+            "79": 0.07542,
+            "80": 0.07549,
+            "81": 0.07589,
+            "82": 0.07548,
+            "83": 0.07513,
+            "84": 0.07494,
+            "85": 0.07468,
+            "86": 0.07522,
+            "87": 0.07487,
+            "88": 0.07533,
+            "89": 0.07545,
+            "90": 0.07496,
+            "91": 0.07533,
+            "92": 0.07435,
+            "93": 0.07549,
+            "94": 0.07465,
+            "95": 0.07523,
+            "96": 0.07531,
+            "97": 0.07697,
+            "98": 0.0768,
+            "99": 0.07605,
+            "100": 0.07588
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..2a8709b9af2
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8545,
+            "52": 9.7393,
+            "53": 10.07426,
+            "54": 9.96913,
+            "55": 9.88574,
+            "56": 9.62438,
+            "57": 9.48229,
+            "58": 9.83484,
+            "59": 9.58731,
+            "60": 9.50243,
+            "61": 9.6934,
+            "62": 9.988,
+            "63": 9.39105,
+            "64": 9.78022,
+            "65": 8.94516,
+            "66": 9.70492,
+            "67": 9.37249,
+            "68": 9.78328,
+            "69": 9.79057,
+            "70": 9.74451,
+            "71": 9.62298,
+            "72": 9.58457,
+            "73": 9.50511,
+            "74": 8.94308,
+            "75": 9.42524,
+            "76": 9.07602,
+            "77": 10.06352,
+            "78": 9.72307,
+            "79": 9.37497,
+            "80": 9.40454,
+            "81": 9.4779,
+            "82": 9.69669,
+            "83": 9.30714,
+            "84": 9.41525,
+            "85": 9.61295,
+            "86": 9.07198,
+            "87": 9.58834,
+            "88": 9.7476,
+            "89": 9.59984,
+            "90": 9.81672,
+            "91": 9.33791,
+            "92": 9.35608,
+            "93": 9.07423,
+            "94": 8.83511,
+            "95": 9.51841,
+            "96": 9.52391,
+            "97": 9.30922,
+            "98": 9.66746,
+            "99": 8.88421,
+            "100": 9.39923
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2543.0,
+            "52": 2613.0,
+            "53": 2945.0,
+            "54": 2713.0,
+            "55": 2503.0,
+            "56": 2692.0,
+            "57": 2338.0,
+            "58": 2961.0,
+            "59": 2620.0,
+            "60": 2367.0,
+            "61": 2909.0,
+            "62": 2728.0,
+            "63": 2399.0,
+            "64": 2909.0,
+            "65": 2605.0,
+            "66": 2983.0,
+            "67": 2793.0,
+            "68": 2663.0,
+            "69": 2833.0,
+            "70": 3135.0,
+            "71": 2997.0,
+            "72": 2464.0,
+            "73": 3088.0,
+            "74": 1970.0,
+            "75": 2556.0,
+            "76": 3064.0,
+            "77": 3231.0,
+            "78": 3097.0,
+            "79": 3035.0,
+            "80": 3301.0,
+            "81": 3599.0,
+            "82": 3215.0,
+            "83": 2757.0,
+            "84": 3130.0,
+            "85": 3380.0,
+            "86": 2742.0,
+            "87": 3723.0,
+            "88": 3066.0,
+            "89": 3264.0,
+            "90": 3198.0,
+            "91": 2718.0,
+            "92": 3070.0,
+            "93": 2624.0,
+            "94": 3301.0,
+            "95": 3431.0,
+            "96": 3358.0,
+            "97": 3142.0,
+            "98": 3704.0,
+            "99": 3107.0,
+            "100": 3089.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1261849088.0,
+            "52": 1261849088.0,
+            "53": 1261849088.0,
+            "54": 1261849088.0,
+            "55": 1261849088.0,
+            "56": 1261849088.0,
+            "57": 1261849088.0,
+            "58": 1261849088.0,
+            "59": 1261849088.0,
+            "60": 1261849088.0,
+            "61": 1261849088.0,
+            "62": 1261849088.0,
+            "63": 1261849088.0,
+            "64": 1261849088.0,
+            "65": 1261849088.0,
+            "66": 1261849088.0,
+            "67": 1261849088.0,
+            "68": 1261849088.0,
+            "69": 1261849088.0,
+            "70": 1261849088.0,
+            "71": 1261849088.0,
+            "72": 1261849088.0,
+            "73": 1261849088.0,
+            "74": 1261849088.0,
+            "75": 1261849088.0,
+            "76": 1261849088.0,
+            "77": 1261849088.0,
+            "78": 1261849088.0,
+            "79": 1261849088.0,
+            "80": 1261849088.0,
+            "81": 1261849088.0,
+            "82": 1261849088.0,
+            "83": 1261849088.0,
+            "84": 1261849088.0,
+            "85": 1261849088.0,
+            "86": 1261849088.0,
+            "87": 1261849088.0,
+            "88": 1261849088.0,
+            "89": 1261849088.0,
+            "90": 1261849088.0,
+            "91": 1261849088.0,
+            "92": 1261849088.0,
+            "93": 1261849088.0,
+            "94": 1261849088.0,
+            "95": 1261849088.0,
+            "96": 1261849088.0,
+            "97": 1261849088.0,
+            "98": 1261849088.0,
+            "99": 1261849088.0,
+            "100": 1261849088.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2530924544.0,
+            "52": 2564480000.0,
+            "53": 2564480000.0,
+            "54": 2564480000.0,
+            "55": 2564480000.0,
+            "56": 2564480000.0,
+            "57": 2564480000.0,
+            "58": 2564480000.0,
+            "59": 2564480000.0,
+            "60": 2564480000.0,
+            "61": 2564480000.0,
+            "62": 2564480000.0,
+            "63": 2564480000.0,
+            "64": 2564480000.0,
+            "65": 2564480000.0,
+            "66": 2564480000.0,
+            "67": 2564480000.0,
+            "68": 2564480000.0,
+            "69": 2564480000.0,
+            "70": 2564480000.0,
+            "71": 2564480000.0,
+            "72": 2564480000.0,
+            "73": 2564480000.0,
+            "74": 2564480000.0,
+            "75": 2564480000.0,
+            "76": 2564480000.0,
+            "77": 2564480000.0,
+            "78": 2564480000.0,
+            "79": 2564480000.0,
+            "80": 2564480000.0,
+            "81": 2564480000.0,
+            "82": 2564480000.0,
+            "83": 2564480000.0,
+            "84": 2564480000.0,
+            "85": 2564480000.0,
+            "86": 2564480000.0,
+            "87": 2564480000.0,
+            "88": 2564480000.0,
+            "89": 2564480000.0,
+            "90": 2564480000.0,
+            "91": 2564480000.0,
+            "92": 2564480000.0,
+            "93": 2564480000.0,
+            "94": 2564480000.0,
+            "95": 2564480000.0,
+            "96": 2564480000.0,
+            "97": 2564480000.0,
+            "98": 2564480000.0,
+            "99": 2564480000.0,
+            "100": 2564480000.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.87745,
+            "52": 0.09791,
+            "53": 0.07996,
+            "54": 0.07698,
+            "55": 0.07921,
+            "56": 0.07768,
+            "57": 0.07938,
+            "58": 0.077,
+            "59": 0.0799,
+            "60": 0.07696,
+            "61": 0.07996,
+            "62": 0.07691,
+            "63": 0.08005,
+            "64": 0.0814,
+            "65": 0.07853,
+            "66": 0.07696,
+            "67": 0.07866,
+            "68": 0.07694,
+            "69": 0.07801,
+            "70": 0.07717,
+            "71": 0.07878,
+            "72": 0.07724,
+            "73": 0.18173,
+            "74": 0.09573,
+            "75": 0.07905,
+            "76": 0.0777,
+            "77": 0.07736,
+            "78": 0.08065,
+            "79": 0.07839,
+            "80": 0.08069,
+            "81": 0.0787,
+            "82": 0.07798,
+            "83": 0.08482,
+            "84": 0.07927,
+            "85": 0.08138,
+            "86": 0.08293,
+            "87": 0.08143,
+            "88": 0.07796,
+            "89": 0.07668,
+            "90": 0.07901,
+            "91": 0.07807,
+            "92": 0.0798,
+            "93": 0.0768,
+            "94": 0.07634,
+            "95": 0.07708,
+            "96": 0.07653,
+            "97": 0.0783,
+            "98": 0.07633,
+            "99": 0.07617,
+            "100": 0.07786
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json
index bd7ca46935f..3d5c6f6dc4b 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 3.43734,
-            "2": 0.14648,
-            "3": 0.12542,
-            "4": 0.12603,
-            "5": 0.12388,
-            "6": 0.12524,
-            "7": 0.12279,
-            "8": 0.1239,
-            "9": 0.12244,
-            "10": 0.12336,
-            "11": 0.12345,
-            "12": 0.12322,
-            "13": 0.12318,
-            "14": 0.12381,
-            "15": 0.12343,
-            "16": 0.12319,
-            "17": 0.12276,
-            "18": 0.12324,
-            "19": 0.12355,
-            "20": 0.12315,
-            "21": 0.12294,
-            "22": 0.12326,
-            "23": 0.12303,
-            "24": 0.12294,
-            "25": 0.12286,
-            "26": 0.12388,
-            "27": 0.12341,
-            "28": 0.12301,
-            "29": 0.12267,
-            "30": 0.12315,
-            "31": 0.12371,
-            "32": 0.12359,
-            "33": 0.12298,
-            "34": 0.12283,
-            "35": 0.12266,
-            "36": 0.12356,
-            "37": 0.12377,
-            "38": 0.12388,
-            "39": 0.12525,
-            "40": 0.12501,
-            "41": 0.12357,
-            "42": 0.12376,
-            "43": 0.12304,
-            "44": 0.12342,
-            "45": 0.12284,
-            "46": 0.12332,
-            "47": 0.12324,
-            "48": 0.12279,
-            "49": 0.12276,
-            "50": 0.12391,
-            "51": 0.12862,
-            "52": 0.12214,
-            "53": 0.12006,
-            "54": 0.12101,
-            "55": 0.12062,
-            "56": 0.12088,
-            "57": 0.121,
-            "58": 0.12034,
-            "59": 0.12049,
-            "60": 0.12066,
-            "61": 0.11974,
-            "62": 0.11979,
-            "63": 0.12196,
-            "64": 0.12149,
-            "65": 0.12119,
-            "66": 0.12067,
-            "67": 0.12079,
-            "68": 0.12104,
-            "69": 0.12025,
-            "70": 0.12059,
-            "71": 0.12069,
-            "72": 0.12102,
-            "73": 0.12115,
-            "74": 0.1208,
-            "75": 0.12051,
-            "76": 0.12011,
-            "77": 0.11958,
-            "78": 0.12095,
-            "79": 0.11983,
-            "80": 0.12106,
-            "81": 0.1203,
-            "82": 0.12062,
-            "83": 0.12021,
-            "84": 0.12036,
-            "85": 0.12053,
-            "86": 0.12119,
-            "87": 0.12057,
-            "88": 0.12092,
-            "89": 0.12271,
-            "90": 0.12095,
-            "91": 0.1204,
-            "92": 0.12052,
-            "93": 0.12075,
-            "94": 0.12025,
-            "95": 0.12129,
-            "96": 0.12087,
-            "97": 0.12098,
-            "98": 0.12136,
-            "99": 0.12046,
-            "100": 0.12064
+            "1": 2.58038,
+            "2": 0.24481,
+            "3": 0.14335,
+            "4": 0.12008,
+            "5": 0.11519,
+            "6": 0.11576,
+            "7": 0.11592,
+            "8": 0.11621,
+            "9": 0.11509,
+            "10": 0.11622,
+            "11": 0.11438,
+            "12": 0.12519,
+            "13": 0.11661,
+            "14": 0.11675,
+            "15": 0.11585,
+            "16": 0.11602,
+            "17": 0.11511,
+            "18": 0.11563,
+            "19": 0.1151,
+            "20": 0.11612,
+            "21": 0.11576,
+            "22": 0.11985,
+            "23": 0.11629,
+            "24": 0.11712,
+            "25": 0.11544,
+            "26": 0.11643,
+            "27": 0.1158,
+            "28": 0.1159,
+            "29": 0.11547,
+            "30": 0.11692,
+            "31": 0.11579,
+            "32": 0.11621,
+            "33": 0.11916,
+            "34": 0.11636,
+            "35": 0.11562,
+            "36": 0.11659,
+            "37": 0.11547,
+            "38": 0.11647,
+            "39": 0.1158,
+            "40": 0.11627,
+            "41": 0.11596,
+            "42": 0.11632,
+            "43": 0.11615,
+            "44": 0.11641,
+            "45": 0.11517,
+            "46": 0.117,
+            "47": 0.11569,
+            "48": 0.11641,
+            "49": 0.1153,
+            "50": 0.11761,
+            "51": 0.12112,
+            "52": 0.11688,
+            "53": 0.11745,
+            "54": 0.11527,
+            "55": 0.1155,
+            "56": 0.11515,
+            "57": 0.1278,
+            "58": 0.11901,
+            "59": 0.11522,
+            "60": 0.11514,
+            "61": 0.11577,
+            "62": 0.1152,
+            "63": 0.11508,
+            "64": 0.11441,
+            "65": 0.11536,
+            "66": 0.11387,
+            "67": 0.11491,
+            "68": 0.11494,
+            "69": 0.11516,
+            "70": 0.11427,
+            "71": 0.11457,
+            "72": 0.11443,
+            "73": 0.11522,
+            "74": 0.1147,
+            "75": 0.11473,
+            "76": 0.11408,
+            "77": 0.11464,
+            "78": 0.11499,
+            "79": 0.11494,
+            "80": 0.11435,
+            "81": 0.11479,
+            "82": 0.11427,
+            "83": 0.11504,
+            "84": 0.11412,
+            "85": 0.11455,
+            "86": 0.11473,
+            "87": 0.11484,
+            "88": 0.1137,
+            "89": 0.11543,
+            "90": 0.11349,
+            "91": 0.11471,
+            "92": 0.114,
+            "93": 0.11498,
+            "94": 0.11434,
+            "95": 0.11497,
+            "96": 0.11416,
+            "97": 0.11454,
+            "98": 0.1143,
+            "99": 0.1145,
+            "100": 0.11459
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..4402397bfe1
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83457,
+            "52": 9.73232,
+            "53": 10.0488,
+            "54": 9.93895,
+            "55": 9.863,
+            "56": 9.613,
+            "57": 9.46966,
+            "58": 9.81135,
+            "59": 9.57107,
+            "60": 9.48155,
+            "61": 9.6788,
+            "62": 9.96581,
+            "63": 9.35273,
+            "64": 9.75648,
+            "65": 8.93771,
+            "66": 9.68153,
+            "67": 9.35671,
+            "68": 9.76807,
+            "69": 9.7739,
+            "70": 9.71016,
+            "71": 9.60009,
+            "72": 9.56793,
+            "73": 9.4774,
+            "74": 8.93177,
+            "75": 9.4072,
+            "76": 9.06849,
+            "77": 10.0464,
+            "78": 9.70988,
+            "79": 9.35733,
+            "80": 9.38975,
+            "81": 9.4662,
+            "82": 9.68058,
+            "83": 9.2914,
+            "84": 9.40191,
+            "85": 9.59735,
+            "86": 9.06209,
+            "87": 9.57922,
+            "88": 9.73259,
+            "89": 9.58836,
+            "90": 9.80354,
+            "91": 9.31991,
+            "92": 9.35011,
+            "93": 9.06382,
+            "94": 8.81909,
+            "95": 9.50568,
+            "96": 9.51071,
+            "97": 9.29241,
+            "98": 9.65578,
+            "99": 8.87401,
+            "100": 9.38833
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2508.0,
+            "52": 2495.0,
+            "53": 2856.0,
+            "54": 2692.0,
+            "55": 2482.0,
+            "56": 2614.0,
+            "57": 2283.0,
+            "58": 2894.0,
+            "59": 2659.0,
+            "60": 2561.0,
+            "61": 3006.0,
+            "62": 2671.0,
+            "63": 2488.0,
+            "64": 3092.0,
+            "65": 2622.0,
+            "66": 3108.0,
+            "67": 2741.0,
+            "68": 2942.0,
+            "69": 2983.0,
+            "70": 3347.0,
+            "71": 3034.0,
+            "72": 2438.0,
+            "73": 3075.0,
+            "74": 1931.0,
+            "75": 2722.0,
+            "76": 2960.0,
+            "77": 3387.0,
+            "78": 3268.0,
+            "79": 3079.0,
+            "80": 3404.0,
+            "81": 3674.0,
+            "82": 3192.0,
+            "83": 2791.0,
+            "84": 3224.0,
+            "85": 3237.0,
+            "86": 2646.0,
+            "87": 3840.0,
+            "88": 3114.0,
+            "89": 3410.0,
+            "90": 3184.0,
+            "91": 3073.0,
+            "92": 3396.0,
+            "93": 2711.0,
+            "94": 3530.0,
+            "95": 3387.0,
+            "96": 3530.0,
+            "97": 3277.0,
+            "98": 3775.0,
+            "99": 3421.0,
+            "100": 3350.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1232487936.0,
+            "52": 1232487936.0,
+            "53": 1232487936.0,
+            "54": 1232487936.0,
+            "55": 1232487936.0,
+            "56": 1232487936.0,
+            "57": 1232487936.0,
+            "58": 1232487936.0,
+            "59": 1232487936.0,
+            "60": 1232487936.0,
+            "61": 1232487936.0,
+            "62": 1232487936.0,
+            "63": 1232487936.0,
+            "64": 1232487936.0,
+            "65": 1232487936.0,
+            "66": 1232487936.0,
+            "67": 1232487936.0,
+            "68": 1232487936.0,
+            "69": 1232487936.0,
+            "70": 1232487936.0,
+            "71": 1232487936.0,
+            "72": 1232487936.0,
+            "73": 1232487936.0,
+            "74": 1232487936.0,
+            "75": 1232487936.0,
+            "76": 1232487936.0,
+            "77": 1232487936.0,
+            "78": 1232487936.0,
+            "79": 1232487936.0,
+            "80": 1232487936.0,
+            "81": 1232487936.0,
+            "82": 1232487936.0,
+            "83": 1232487936.0,
+            "84": 1232487936.0,
+            "85": 1232487936.0,
+            "86": 1232487936.0,
+            "87": 1232487936.0,
+            "88": 1232487936.0,
+            "89": 1232487936.0,
+            "90": 1232487936.0,
+            "91": 1232487936.0,
+            "92": 1232487936.0,
+            "93": 1232487936.0,
+            "94": 1232487936.0,
+            "95": 1232487936.0,
+            "96": 1232487936.0,
+            "97": 1232487936.0,
+            "98": 1232487936.0,
+            "99": 1232487936.0,
+            "100": 1232487936.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2501563392.0,
+            "52": 2535118848.0,
+            "53": 2535118848.0,
+            "54": 2535118848.0,
+            "55": 2535118848.0,
+            "56": 2535118848.0,
+            "57": 2535118848.0,
+            "58": 2535118848.0,
+            "59": 2535118848.0,
+            "60": 2535118848.0,
+            "61": 2535118848.0,
+            "62": 2535118848.0,
+            "63": 2535118848.0,
+            "64": 2535118848.0,
+            "65": 2535118848.0,
+            "66": 2535118848.0,
+            "67": 2535118848.0,
+            "68": 2535118848.0,
+            "69": 2535118848.0,
+            "70": 2535118848.0,
+            "71": 2535118848.0,
+            "72": 2535118848.0,
+            "73": 2535118848.0,
+            "74": 2535118848.0,
+            "75": 2535118848.0,
+            "76": 2535118848.0,
+            "77": 2535118848.0,
+            "78": 2535118848.0,
+            "79": 2535118848.0,
+            "80": 2535118848.0,
+            "81": 2535118848.0,
+            "82": 2535118848.0,
+            "83": 2535118848.0,
+            "84": 2535118848.0,
+            "85": 2535118848.0,
+            "86": 2535118848.0,
+            "87": 2535118848.0,
+            "88": 2535118848.0,
+            "89": 2535118848.0,
+            "90": 2535118848.0,
+            "91": 2535118848.0,
+            "92": 2535118848.0,
+            "93": 2535118848.0,
+            "94": 2535118848.0,
+            "95": 2535118848.0,
+            "96": 2535118848.0,
+            "97": 2535118848.0,
+            "98": 2535118848.0,
+            "99": 2535118848.0,
+            "100": 2535118848.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4.25367,
+            "52": 0.13205,
+            "53": 0.11484,
+            "54": 0.11811,
+            "55": 0.11596,
+            "56": 0.11581,
+            "57": 0.11498,
+            "58": 0.11563,
+            "59": 0.11477,
+            "60": 0.11575,
+            "61": 0.11498,
+            "62": 0.11551,
+            "63": 0.11663,
+            "64": 0.11428,
+            "65": 0.11448,
+            "66": 0.11417,
+            "67": 0.11362,
+            "68": 0.11442,
+            "69": 0.11406,
+            "70": 0.11487,
+            "71": 0.11375,
+            "72": 0.11459,
+            "73": 0.11365,
+            "74": 0.11414,
+            "75": 0.11435,
+            "76": 0.11545,
+            "77": 0.11362,
+            "78": 0.11443,
+            "79": 0.11286,
+            "80": 0.11385,
+            "81": 0.11272,
+            "82": 0.11354,
+            "83": 0.11294,
+            "84": 0.11396,
+            "85": 0.11272,
+            "86": 0.11396,
+            "87": 0.11339,
+            "88": 0.11475,
+            "89": 0.11779,
+            "90": 0.11386,
+            "91": 0.11507,
+            "92": 0.11404,
+            "93": 0.11335,
+            "94": 0.11449,
+            "95": 0.11323,
+            "96": 0.11451,
+            "97": 0.11365,
+            "98": 0.11398,
+            "99": 0.11453,
+            "100": 0.11417
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml
index bb0708b11ef..8a471f2238b 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml
index 4aeea406fb9..0310dbf2a64 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..684fd4831fe
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77012,
+            "2": 10.78244,
+            "3": 10.77833,
+            "4": 10.75145,
+            "5": 10.80955,
+            "6": 10.8223,
+            "7": 10.80193,
+            "8": 10.78868,
+            "9": 10.79503,
+            "10": 10.71341,
+            "11": 10.85003,
+            "12": 10.80071,
+            "13": 10.8263,
+            "14": 10.84293,
+            "15": 10.7559,
+            "16": 10.75248,
+            "17": 10.70854,
+            "18": 10.74761,
+            "19": 10.74709,
+            "20": 10.64388,
+            "21": 10.60456,
+            "22": 10.43295,
+            "23": 10.66573,
+            "24": 10.50049,
+            "25": 10.43605,
+            "26": 10.51463,
+            "27": 10.54136,
+            "28": 10.51359,
+            "29": 10.53716,
+            "30": 10.25964,
+            "31": 9.97634,
+            "32": 10.39958,
+            "33": 10.38607,
+            "34": 10.11016,
+            "35": 10.1741,
+            "36": 10.11553,
+            "37": 10.26008,
+            "38": 10.07462,
+            "39": 10.32873,
+            "40": 9.96852,
+            "41": 10.05099,
+            "42": 10.12726,
+            "43": 9.70798,
+            "44": 9.83287,
+            "45": 9.70538,
+            "46": 9.7134,
+            "47": 10.05872,
+            "48": 9.74565,
+            "49": 9.40522,
+            "50": 9.80891,
+            "51": 9.76757,
+            "52": 9.64732,
+            "53": 9.995,
+            "54": 9.88603,
+            "55": 9.81763,
+            "56": 9.53914,
+            "57": 9.38192,
+            "58": 9.75896,
+            "59": 9.52106,
+            "60": 9.42443,
+            "61": 9.63665,
+            "62": 9.92974,
+            "63": 9.29595,
+            "64": 9.70631,
+            "65": 8.88066,
+            "66": 9.64072,
+            "67": 9.32146,
+            "68": 9.73692,
+            "69": 9.75346,
+            "70": 9.68289,
+            "71": 9.58117,
+            "72": 9.52491,
+            "73": 9.44094,
+            "74": 8.86077,
+            "75": 9.36671,
+            "76": 9.01691,
+            "77": 10.02224,
+            "78": 9.68354,
+            "79": 9.33325,
+            "80": 9.3582,
+            "81": 9.43786,
+            "82": 9.66102,
+            "83": 9.26223,
+            "84": 9.37189,
+            "85": 9.56652,
+            "86": 9.04493,
+            "87": 9.5575,
+            "88": 9.70541,
+            "89": 9.55092,
+            "90": 9.79196,
+            "91": 9.29173,
+            "92": 9.31225,
+            "93": 9.0433,
+            "94": 8.78683,
+            "95": 9.49525,
+            "96": 9.48391,
+            "97": 9.25966,
+            "98": 9.62611,
+            "99": 8.85031,
+            "100": 9.36043
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1660.0,
+            "2": 1892.0,
+            "3": 1805.0,
+            "4": 1861.0,
+            "5": 2134.0,
+            "6": 1964.0,
+            "7": 2077.0,
+            "8": 1755.0,
+            "9": 1942.0,
+            "10": 1516.0,
+            "11": 1981.0,
+            "12": 1962.0,
+            "13": 2092.0,
+            "14": 1940.0,
+            "15": 2030.0,
+            "16": 1975.0,
+            "17": 2081.0,
+            "18": 1925.0,
+            "19": 1890.0,
+            "20": 1806.0,
+            "21": 1992.0,
+            "22": 1833.0,
+            "23": 2082.0,
+            "24": 1806.0,
+            "25": 1834.0,
+            "26": 1935.0,
+            "27": 1987.0,
+            "28": 2157.0,
+            "29": 2045.0,
+            "30": 1959.0,
+            "31": 1733.0,
+            "32": 2011.0,
+            "33": 2149.0,
+            "34": 2014.0,
+            "35": 2131.0,
+            "36": 2027.0,
+            "37": 2337.0,
+            "38": 2210.0,
+            "39": 2454.0,
+            "40": 2335.0,
+            "41": 2379.0,
+            "42": 2359.0,
+            "43": 2101.0,
+            "44": 2280.0,
+            "45": 2138.0,
+            "46": 2297.0,
+            "47": 2454.0,
+            "48": 2586.0,
+            "49": 2213.0,
+            "50": 2414.0,
+            "51": 2613.0,
+            "52": 2647.0,
+            "53": 2908.0,
+            "54": 2580.0,
+            "55": 2486.0,
+            "56": 2687.0,
+            "57": 2577.0,
+            "58": 2824.0,
+            "59": 2720.0,
+            "60": 2410.0,
+            "61": 2744.0,
+            "62": 2536.0,
+            "63": 2652.0,
+            "64": 2918.0,
+            "65": 2742.0,
+            "66": 2927.0,
+            "67": 2920.0,
+            "68": 2652.0,
+            "69": 3019.0,
+            "70": 2996.0,
+            "71": 2835.0,
+            "72": 2664.0,
+            "73": 3211.0,
+            "74": 2311.0,
+            "75": 2658.0,
+            "76": 3155.0,
+            "77": 3051.0,
+            "78": 3073.0,
+            "79": 3116.0,
+            "80": 3191.0,
+            "81": 3237.0,
+            "82": 3218.0,
+            "83": 2689.0,
+            "84": 3294.0,
+            "85": 3209.0,
+            "86": 2558.0,
+            "87": 3462.0,
+            "88": 3287.0,
+            "89": 3201.0,
+            "90": 3331.0,
+            "91": 3183.0,
+            "92": 3201.0,
+            "93": 2942.0,
+            "94": 3274.0,
+            "95": 3132.0,
+            "96": 3200.0,
+            "97": 3054.0,
+            "98": 3544.0,
+            "99": 3387.0,
+            "100": 3192.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 917459968.0,
+            "2": 917459968.0,
+            "3": 917459968.0,
+            "4": 917459968.0,
+            "5": 917459968.0,
+            "6": 917459968.0,
+            "7": 917459968.0,
+            "8": 917459968.0,
+            "9": 917459968.0,
+            "10": 917459968.0,
+            "11": 917459968.0,
+            "12": 917459968.0,
+            "13": 917459968.0,
+            "14": 917459968.0,
+            "15": 917459968.0,
+            "16": 917459968.0,
+            "17": 917459968.0,
+            "18": 917459968.0,
+            "19": 917459968.0,
+            "20": 917459968.0,
+            "21": 917459968.0,
+            "22": 917459968.0,
+            "23": 917459968.0,
+            "24": 917459968.0,
+            "25": 917459968.0,
+            "26": 917459968.0,
+            "27": 917459968.0,
+            "28": 917459968.0,
+            "29": 917459968.0,
+            "30": 917459968.0,
+            "31": 917459968.0,
+            "32": 917459968.0,
+            "33": 917459968.0,
+            "34": 917459968.0,
+            "35": 917459968.0,
+            "36": 917459968.0,
+            "37": 917459968.0,
+            "38": 917459968.0,
+            "39": 917459968.0,
+            "40": 917459968.0,
+            "41": 917459968.0,
+            "42": 917459968.0,
+            "43": 917459968.0,
+            "44": 917459968.0,
+            "45": 917459968.0,
+            "46": 917459968.0,
+            "47": 917459968.0,
+            "48": 917459968.0,
+            "49": 917459968.0,
+            "50": 917459968.0,
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2262889472.0,
+            "2": 2621306880.0,
+            "3": 2621306880.0,
+            "4": 2621306880.0,
+            "5": 2621306880.0,
+            "6": 2621306880.0,
+            "7": 2621306880.0,
+            "8": 2621306880.0,
+            "9": 2621306880.0,
+            "10": 2621306880.0,
+            "11": 2621306880.0,
+            "12": 2621306880.0,
+            "13": 2621306880.0,
+            "14": 2621306880.0,
+            "15": 2621306880.0,
+            "16": 2621306880.0,
+            "17": 2621306880.0,
+            "18": 2621306880.0,
+            "19": 2621306880.0,
+            "20": 2621306880.0,
+            "21": 2621306880.0,
+            "22": 2621306880.0,
+            "23": 2621306880.0,
+            "24": 2621306880.0,
+            "25": 2621306880.0,
+            "26": 2621306880.0,
+            "27": 2621306880.0,
+            "28": 2621306880.0,
+            "29": 2621306880.0,
+            "30": 2621306880.0,
+            "31": 2621306880.0,
+            "32": 2621306880.0,
+            "33": 2621306880.0,
+            "34": 2621306880.0,
+            "35": 2621306880.0,
+            "36": 2621306880.0,
+            "37": 2621306880.0,
+            "38": 2621306880.0,
+            "39": 2621306880.0,
+            "40": 2621306880.0,
+            "41": 2621306880.0,
+            "42": 2621306880.0,
+            "43": 2621306880.0,
+            "44": 2621306880.0,
+            "45": 2621306880.0,
+            "46": 2621306880.0,
+            "47": 2621306880.0,
+            "48": 2621306880.0,
+            "49": 2621306880.0,
+            "50": 2621306880.0,
+            "51": 2621306880.0,
+            "52": 2621306880.0,
+            "53": 2621306880.0,
+            "54": 2621306880.0,
+            "55": 2621306880.0,
+            "56": 2621306880.0,
+            "57": 2621306880.0,
+            "58": 2621306880.0,
+            "59": 2621306880.0,
+            "60": 2621306880.0,
+            "61": 2621306880.0,
+            "62": 2621306880.0,
+            "63": 2621306880.0,
+            "64": 2621306880.0,
+            "65": 2621306880.0,
+            "66": 2621306880.0,
+            "67": 2621306880.0,
+            "68": 2621306880.0,
+            "69": 2621306880.0,
+            "70": 2621306880.0,
+            "71": 2621306880.0,
+            "72": 2621306880.0,
+            "73": 2621306880.0,
+            "74": 2621306880.0,
+            "75": 2621306880.0,
+            "76": 2621306880.0,
+            "77": 2621306880.0,
+            "78": 2621306880.0,
+            "79": 2621306880.0,
+            "80": 2621306880.0,
+            "81": 2621306880.0,
+            "82": 2621306880.0,
+            "83": 2621306880.0,
+            "84": 2621306880.0,
+            "85": 2621306880.0,
+            "86": 2621306880.0,
+            "87": 2621306880.0,
+            "88": 2621306880.0,
+            "89": 2621306880.0,
+            "90": 2621306880.0,
+            "91": 2621306880.0,
+            "92": 2621306880.0,
+            "93": 2621306880.0,
+            "94": 2621306880.0,
+            "95": 2621306880.0,
+            "96": 2621306880.0,
+            "97": 2621306880.0,
+            "98": 2621306880.0,
+            "99": 2621306880.0,
+            "100": 2621306880.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.22064,
+            "3": 0.13024,
+            "4": 0.11768,
+            "5": 0.11875,
+            "6": 0.11742,
+            "7": 0.11821,
+            "8": 0.11878,
+            "9": 0.11922,
+            "10": 0.11834,
+            "11": 0.11707,
+            "12": 0.1171,
+            "13": 0.11874,
+            "14": 0.12245,
+            "15": 0.11821,
+            "16": 0.1177,
+            "17": 0.11857,
+            "18": 0.11778,
+            "19": 0.1187,
+            "20": 0.11835,
+            "21": 0.12351,
+            "22": 0.11771,
+            "23": 0.11773,
+            "24": 0.11819,
+            "25": 0.11705,
+            "26": 0.12602,
+            "27": 0.12585,
+            "28": 0.12677,
+            "29": 0.12752,
+            "30": 0.12847,
+            "31": 0.12883,
+            "32": 0.12586,
+            "33": 0.12437,
+            "34": 0.12277,
+            "35": 0.12212,
+            "36": 0.12255,
+            "37": 0.12084,
+            "38": 0.12104,
+            "39": 0.12124,
+            "40": 0.12086,
+            "41": 0.12101,
+            "42": 0.11969,
+            "43": 0.1197,
+            "44": 0.11956,
+            "45": 0.11977,
+            "46": 0.11865,
+            "47": 0.11795,
+            "48": 0.11928,
+            "49": 0.11794,
+            "50": 0.11851,
+            "51": 0.12726,
+            "52": 0.11929,
+            "53": 0.11813,
+            "54": 0.11818,
+            "55": 0.11748,
+            "56": 0.11707,
+            "57": 0.11896,
+            "58": 0.11832,
+            "59": 0.11799,
+            "60": 0.11784,
+            "61": 0.11888,
+            "62": 0.11879,
+            "63": 0.11819,
+            "64": 0.1185,
+            "65": 0.11926,
+            "66": 0.11924,
+            "67": 0.11982,
+            "68": 0.11873,
+            "69": 0.11986,
+            "70": 0.11895,
+            "71": 0.11964,
+            "72": 0.11906,
+            "73": 0.12017,
+            "74": 0.11976,
+            "75": 0.11759,
+            "76": 0.11921,
+            "77": 0.11907,
+            "78": 0.11823,
+            "79": 0.11867,
+            "80": 0.11934,
+            "81": 0.11888,
+            "82": 0.11988,
+            "83": 0.1213,
+            "84": 0.11913,
+            "85": 0.12002,
+            "86": 0.12046,
+            "87": 0.11952,
+            "88": 0.11819,
+            "89": 0.11901,
+            "90": 0.11918,
+            "91": 0.11919,
+            "92": 0.11824,
+            "93": 0.12018,
+            "94": 0.11929,
+            "95": 0.11974,
+            "96": 0.11767,
+            "97": 0.11845,
+            "98": 0.11695,
+            "99": 0.11892,
+            "100": 0.11948
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..f92f1c4672e
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.76757,
+            "52": 9.64732,
+            "53": 9.995,
+            "54": 9.88603,
+            "55": 9.81763,
+            "56": 9.53914,
+            "57": 9.38192,
+            "58": 9.75896,
+            "59": 9.52106,
+            "60": 9.42443,
+            "61": 9.63665,
+            "62": 9.92974,
+            "63": 9.29595,
+            "64": 9.70631,
+            "65": 8.88066,
+            "66": 9.64072,
+            "67": 9.32146,
+            "68": 9.73692,
+            "69": 9.75346,
+            "70": 9.68289,
+            "71": 9.58117,
+            "72": 9.52491,
+            "73": 9.44094,
+            "74": 8.86077,
+            "75": 9.36671,
+            "76": 9.01691,
+            "77": 10.02224,
+            "78": 9.68354,
+            "79": 9.33325,
+            "80": 9.3582,
+            "81": 9.43786,
+            "82": 9.66102,
+            "83": 9.26223,
+            "84": 9.37189,
+            "85": 9.56652,
+            "86": 9.04493,
+            "87": 9.5575,
+            "88": 9.70541,
+            "89": 9.55092,
+            "90": 9.79196,
+            "91": 9.29173,
+            "92": 9.31225,
+            "93": 9.0433,
+            "94": 8.78683,
+            "95": 9.49525,
+            "96": 9.48391,
+            "97": 9.25966,
+            "98": 9.62611,
+            "99": 8.85031,
+            "100": 9.36043
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2613.0,
+            "52": 2647.0,
+            "53": 2908.0,
+            "54": 2580.0,
+            "55": 2486.0,
+            "56": 2687.0,
+            "57": 2577.0,
+            "58": 2824.0,
+            "59": 2720.0,
+            "60": 2410.0,
+            "61": 2744.0,
+            "62": 2536.0,
+            "63": 2652.0,
+            "64": 2918.0,
+            "65": 2742.0,
+            "66": 2927.0,
+            "67": 2920.0,
+            "68": 2652.0,
+            "69": 3019.0,
+            "70": 2996.0,
+            "71": 2835.0,
+            "72": 2664.0,
+            "73": 3211.0,
+            "74": 2311.0,
+            "75": 2658.0,
+            "76": 3155.0,
+            "77": 3051.0,
+            "78": 3073.0,
+            "79": 3116.0,
+            "80": 3191.0,
+            "81": 3237.0,
+            "82": 3218.0,
+            "83": 2689.0,
+            "84": 3294.0,
+            "85": 3209.0,
+            "86": 2558.0,
+            "87": 3462.0,
+            "88": 3287.0,
+            "89": 3201.0,
+            "90": 3331.0,
+            "91": 3183.0,
+            "92": 3201.0,
+            "93": 2942.0,
+            "94": 3274.0,
+            "95": 3132.0,
+            "96": 3200.0,
+            "97": 3054.0,
+            "98": 3544.0,
+            "99": 3387.0,
+            "100": 3192.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2622354432.0,
+            "52": 2622355456.0,
+            "53": 2622355456.0,
+            "54": 2622355456.0,
+            "55": 2622355456.0,
+            "56": 2622355456.0,
+            "57": 2622355456.0,
+            "58": 2622355456.0,
+            "59": 2622355456.0,
+            "60": 2622355456.0,
+            "61": 2622355456.0,
+            "62": 2622355456.0,
+            "63": 2622355456.0,
+            "64": 2622355456.0,
+            "65": 2622355456.0,
+            "66": 2622355456.0,
+            "67": 2622355456.0,
+            "68": 2622355456.0,
+            "69": 2622355456.0,
+            "70": 2622355456.0,
+            "71": 2622355456.0,
+            "72": 2622355456.0,
+            "73": 2622355456.0,
+            "74": 2622355456.0,
+            "75": 2622355456.0,
+            "76": 2622355456.0,
+            "77": 2622355456.0,
+            "78": 2622355456.0,
+            "79": 2622355456.0,
+            "80": 2622355456.0,
+            "81": 2622355456.0,
+            "82": 2622355456.0,
+            "83": 2622355456.0,
+            "84": 2622355456.0,
+            "85": 2622355456.0,
+            "86": 2622355456.0,
+            "87": 2622355456.0,
+            "88": 2622355456.0,
+            "89": 2622355456.0,
+            "90": 2622355456.0,
+            "91": 2622355456.0,
+            "92": 2622355456.0,
+            "93": 2622355456.0,
+            "94": 2622355456.0,
+            "95": 2622355456.0,
+            "96": 2622355456.0,
+            "97": 2622355456.0,
+            "98": 2622355456.0,
+            "99": 2622355456.0,
+            "100": 2622355456.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.13245,
+            "53": 0.1297,
+            "54": 0.11767,
+            "55": 0.11927,
+            "56": 0.12061,
+            "57": 0.12305,
+            "58": 0.12043,
+            "59": 0.11822,
+            "60": 0.11725,
+            "61": 0.11813,
+            "62": 0.11746,
+            "63": 0.11736,
+            "64": 0.11897,
+            "65": 0.12036,
+            "66": 0.11746,
+            "67": 0.11937,
+            "68": 0.11862,
+            "69": 0.11914,
+            "70": 0.11949,
+            "71": 0.11638,
+            "72": 0.11794,
+            "73": 0.11866,
+            "74": 0.11751,
+            "75": 0.11637,
+            "76": 0.11834,
+            "77": 0.11768,
+            "78": 0.11854,
+            "79": 0.11727,
+            "80": 0.11732,
+            "81": 0.11811,
+            "82": 0.11878,
+            "83": 0.11805,
+            "84": 0.11921,
+            "85": 0.11932,
+            "86": 0.11908,
+            "87": 0.12476,
+            "88": 0.12628,
+            "89": 0.12876,
+            "90": 0.12617,
+            "91": 0.12743,
+            "92": 0.12783,
+            "93": 0.12812,
+            "94": 0.12493,
+            "95": 0.12584,
+            "96": 0.12791,
+            "97": 0.12455,
+            "98": 0.1269,
+            "99": 0.12715,
+            "100": 0.12714
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json
index b0474f2f8ec..1f743e8c2e8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 921653248.0,
-            "2": 921653248.0,
-            "3": 921653248.0,
-            "4": 921653248.0,
-            "5": 921653248.0,
-            "6": 921653248.0,
-            "7": 921653248.0,
-            "8": 921653248.0,
-            "9": 921653248.0,
-            "10": 921653248.0,
-            "11": 921653248.0,
-            "12": 921653248.0,
-            "13": 921653248.0,
-            "14": 921653248.0,
-            "15": 921653248.0,
-            "16": 921653248.0,
-            "17": 921653248.0,
-            "18": 921653248.0,
-            "19": 921653248.0,
-            "20": 921653248.0,
-            "21": 921653248.0,
-            "22": 921653248.0,
-            "23": 921653248.0,
-            "24": 921653248.0,
-            "25": 921653248.0,
-            "26": 921653248.0,
-            "27": 921653248.0,
-            "28": 921653248.0,
-            "29": 921653248.0,
-            "30": 921653248.0,
-            "31": 921653248.0,
-            "32": 921653248.0,
-            "33": 921653248.0,
-            "34": 921653248.0,
-            "35": 921653248.0,
-            "36": 921653248.0,
-            "37": 921653248.0,
-            "38": 921653248.0,
-            "39": 921653248.0,
-            "40": 921653248.0,
-            "41": 921653248.0,
-            "42": 921653248.0,
-            "43": 921653248.0,
-            "44": 921653248.0,
-            "45": 921653248.0,
-            "46": 921653248.0,
-            "47": 921653248.0,
-            "48": 921653248.0,
-            "49": 921653248.0,
-            "50": 921653248.0,
-            "51": 921653248.0,
-            "52": 921653248.0,
-            "53": 921653248.0,
-            "54": 921653248.0,
-            "55": 921653248.0,
-            "56": 921653248.0,
-            "57": 921653248.0,
-            "58": 921653248.0,
-            "59": 921653248.0,
-            "60": 921653248.0,
-            "61": 921653248.0,
-            "62": 921653248.0,
-            "63": 921653248.0,
-            "64": 921653248.0,
-            "65": 921653248.0,
-            "66": 921653248.0,
-            "67": 921653248.0,
-            "68": 921653248.0,
-            "69": 921653248.0,
-            "70": 921653248.0,
-            "71": 921653248.0,
-            "72": 921653248.0,
-            "73": 921653248.0,
-            "74": 921653248.0,
-            "75": 921653248.0,
-            "76": 921653248.0,
-            "77": 921653248.0,
-            "78": 921653248.0,
-            "79": 921653248.0,
-            "80": 921653248.0,
-            "81": 921653248.0,
-            "82": 921653248.0,
-            "83": 921653248.0,
-            "84": 921653248.0,
-            "85": 921653248.0,
-            "86": 921653248.0,
-            "87": 921653248.0,
-            "88": 921653248.0,
-            "89": 921653248.0,
-            "90": 921653248.0,
-            "91": 921653248.0,
-            "92": 921653248.0,
-            "93": 921653248.0,
-            "94": 921653248.0,
-            "95": 921653248.0,
-            "96": 921653248.0,
-            "97": 921653248.0,
-            "98": 921653248.0,
-            "99": 921653248.0,
-            "100": 921653248.0
+            "1": 917459968.0,
+            "2": 917459968.0,
+            "3": 917459968.0,
+            "4": 917459968.0,
+            "5": 917459968.0,
+            "6": 917459968.0,
+            "7": 917459968.0,
+            "8": 917459968.0,
+            "9": 917459968.0,
+            "10": 917459968.0,
+            "11": 917459968.0,
+            "12": 917459968.0,
+            "13": 917459968.0,
+            "14": 917459968.0,
+            "15": 917459968.0,
+            "16": 917459968.0,
+            "17": 917459968.0,
+            "18": 917459968.0,
+            "19": 917459968.0,
+            "20": 917459968.0,
+            "21": 917459968.0,
+            "22": 917459968.0,
+            "23": 917459968.0,
+            "24": 917459968.0,
+            "25": 917459968.0,
+            "26": 917459968.0,
+            "27": 917459968.0,
+            "28": 917459968.0,
+            "29": 917459968.0,
+            "30": 917459968.0,
+            "31": 917459968.0,
+            "32": 917459968.0,
+            "33": 917459968.0,
+            "34": 917459968.0,
+            "35": 917459968.0,
+            "36": 917459968.0,
+            "37": 917459968.0,
+            "38": 917459968.0,
+            "39": 917459968.0,
+            "40": 917459968.0,
+            "41": 917459968.0,
+            "42": 917459968.0,
+            "43": 917459968.0,
+            "44": 917459968.0,
+            "45": 917459968.0,
+            "46": 917459968.0,
+            "47": 917459968.0,
+            "48": 917459968.0,
+            "49": 917459968.0,
+            "50": 917459968.0,
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2237722624.0,
-            "2": 2600334336.0,
-            "3": 2600334336.0,
-            "4": 2600334336.0,
-            "5": 2600334336.0,
-            "6": 2600334336.0,
-            "7": 2600334336.0,
-            "8": 2600334336.0,
-            "9": 2600334336.0,
-            "10": 2600334336.0,
-            "11": 2600334336.0,
-            "12": 2600334336.0,
-            "13": 2600334336.0,
-            "14": 2600334336.0,
-            "15": 2600334336.0,
-            "16": 2600334336.0,
-            "17": 2600334336.0,
-            "18": 2600334336.0,
-            "19": 2600334336.0,
-            "20": 2600334336.0,
-            "21": 2600334336.0,
-            "22": 2600334336.0,
-            "23": 2600334336.0,
-            "24": 2600334336.0,
-            "25": 2600334336.0,
-            "26": 2600334336.0,
-            "27": 2600334336.0,
-            "28": 2600334336.0,
-            "29": 2600334336.0,
-            "30": 2600334336.0,
-            "31": 2600334336.0,
-            "32": 2600334336.0,
-            "33": 2600334336.0,
-            "34": 2600334336.0,
-            "35": 2600334336.0,
-            "36": 2600334336.0,
-            "37": 2600334336.0,
-            "38": 2600334336.0,
-            "39": 2600334336.0,
-            "40": 2600334336.0,
-            "41": 2600334336.0,
-            "42": 2600334336.0,
-            "43": 2600334336.0,
-            "44": 2600334336.0,
-            "45": 2600334336.0,
-            "46": 2600334336.0,
-            "47": 2600334336.0,
-            "48": 2600334336.0,
-            "49": 2600334336.0,
-            "50": 2600334336.0,
-            "51": 2600334336.0,
-            "52": 2600334336.0,
-            "53": 2600334336.0,
-            "54": 2600334336.0,
-            "55": 2600334336.0,
-            "56": 2600334336.0,
-            "57": 2600334336.0,
-            "58": 2600334336.0,
-            "59": 2600334336.0,
-            "60": 2600334336.0,
-            "61": 2600334336.0,
-            "62": 2600334336.0,
-            "63": 2600334336.0,
-            "64": 2600334336.0,
-            "65": 2600334336.0,
-            "66": 2600334336.0,
-            "67": 2600334336.0,
-            "68": 2600334336.0,
-            "69": 2600334336.0,
-            "70": 2600334336.0,
-            "71": 2600334336.0,
-            "72": 2600334336.0,
-            "73": 2600334336.0,
-            "74": 2600334336.0,
-            "75": 2600334336.0,
-            "76": 2600334336.0,
-            "77": 2600334336.0,
-            "78": 2600334336.0,
-            "79": 2600334336.0,
-            "80": 2600334336.0,
-            "81": 2600334336.0,
-            "82": 2600334336.0,
-            "83": 2600334336.0,
-            "84": 2600334336.0,
-            "85": 2600334336.0,
-            "86": 2600334336.0,
-            "87": 2600334336.0,
-            "88": 2600334336.0,
-            "89": 2600334336.0,
-            "90": 2600334336.0,
-            "91": 2600334336.0,
-            "92": 2600334336.0,
-            "93": 2600334336.0,
-            "94": 2600334336.0,
-            "95": 2600334336.0,
-            "96": 2600334336.0,
-            "97": 2600334336.0,
-            "98": 2600334336.0,
-            "99": 2600334336.0,
-            "100": 2600334336.0
+            "1": 2236675072.0,
+            "2": 2596141056.0,
+            "3": 2596141056.0,
+            "4": 2596141056.0,
+            "5": 2596141056.0,
+            "6": 2596141056.0,
+            "7": 2596141056.0,
+            "8": 2596141056.0,
+            "9": 2596141056.0,
+            "10": 2596141056.0,
+            "11": 2596141056.0,
+            "12": 2596141056.0,
+            "13": 2596141056.0,
+            "14": 2596141056.0,
+            "15": 2596141056.0,
+            "16": 2596141056.0,
+            "17": 2596141056.0,
+            "18": 2596141056.0,
+            "19": 2596141056.0,
+            "20": 2596141056.0,
+            "21": 2596141056.0,
+            "22": 2596141056.0,
+            "23": 2596141056.0,
+            "24": 2596141056.0,
+            "25": 2596141056.0,
+            "26": 2596141056.0,
+            "27": 2596141056.0,
+            "28": 2596141056.0,
+            "29": 2596141056.0,
+            "30": 2596141056.0,
+            "31": 2596141056.0,
+            "32": 2596141056.0,
+            "33": 2596141056.0,
+            "34": 2596141056.0,
+            "35": 2596141056.0,
+            "36": 2596141056.0,
+            "37": 2596141056.0,
+            "38": 2596141056.0,
+            "39": 2596141056.0,
+            "40": 2596141056.0,
+            "41": 2596141056.0,
+            "42": 2596141056.0,
+            "43": 2596141056.0,
+            "44": 2596141056.0,
+            "45": 2596141056.0,
+            "46": 2596141056.0,
+            "47": 2596141056.0,
+            "48": 2596141056.0,
+            "49": 2596141056.0,
+            "50": 2596141056.0,
+            "51": 2596141056.0,
+            "52": 2596141056.0,
+            "53": 2596141056.0,
+            "54": 2596141056.0,
+            "55": 2596141056.0,
+            "56": 2596141056.0,
+            "57": 2596141056.0,
+            "58": 2596141056.0,
+            "59": 2596141056.0,
+            "60": 2596141056.0,
+            "61": 2596141056.0,
+            "62": 2596141056.0,
+            "63": 2596141056.0,
+            "64": 2596141056.0,
+            "65": 2596141056.0,
+            "66": 2596141056.0,
+            "67": 2596141056.0,
+            "68": 2596141056.0,
+            "69": 2596141056.0,
+            "70": 2596141056.0,
+            "71": 2596141056.0,
+            "72": 2596141056.0,
+            "73": 2596141056.0,
+            "74": 2596141056.0,
+            "75": 2596141056.0,
+            "76": 2596141056.0,
+            "77": 2596141056.0,
+            "78": 2596141056.0,
+            "79": 2596141056.0,
+            "80": 2596141056.0,
+            "81": 2596141056.0,
+            "82": 2596141056.0,
+            "83": 2596141056.0,
+            "84": 2596141056.0,
+            "85": 2596141056.0,
+            "86": 2596141056.0,
+            "87": 2596141056.0,
+            "88": 2596141056.0,
+            "89": 2596141056.0,
+            "90": 2596141056.0,
+            "91": 2596141056.0,
+            "92": 2596141056.0,
+            "93": 2596141056.0,
+            "94": 2596141056.0,
+            "95": 2596141056.0,
+            "96": 2596141056.0,
+            "97": 2596141056.0,
+            "98": 2596141056.0,
+            "99": 2596141056.0,
+            "100": 2596141056.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.39748,
-            "2": 0.11699,
-            "3": 0.10324,
-            "4": 0.10602,
-            "5": 0.10273,
-            "6": 0.10169,
-            "7": 0.10402,
-            "8": 0.10582,
-            "9": 0.10893,
-            "10": 0.10156,
-            "11": 0.10006,
-            "12": 0.10034,
-            "13": 0.10111,
-            "14": 0.10835,
-            "15": 0.10198,
-            "16": 0.10295,
-            "17": 0.10379,
-            "18": 0.10096,
-            "19": 0.10678,
-            "20": 0.10208,
-            "21": 0.10213,
-            "22": 0.10179,
-            "23": 0.10357,
-            "24": 0.10282,
-            "25": 0.09979,
-            "26": 0.10143,
-            "27": 0.10197,
-            "28": 0.10127,
-            "29": 0.10116,
-            "30": 0.10243,
-            "31": 0.10107,
-            "32": 0.10147,
-            "33": 0.10181,
-            "34": 0.1038,
-            "35": 0.10095,
-            "36": 0.09889,
-            "37": 0.09992,
-            "38": 0.10001,
-            "39": 0.10006,
-            "40": 0.10004,
-            "41": 0.09886,
-            "42": 0.09836,
-            "43": 0.09974,
-            "44": 0.10016,
-            "45": 0.10004,
-            "46": 0.09945,
-            "47": 0.0989,
-            "48": 0.09882,
-            "49": 0.09906,
-            "50": 0.09893,
-            "51": 0.10108,
-            "52": 0.10571,
-            "53": 0.10114,
-            "54": 0.09935,
-            "55": 0.09893,
-            "56": 0.09871,
-            "57": 0.10568,
-            "58": 0.09952,
-            "59": 0.10185,
-            "60": 0.09937,
-            "61": 0.09902,
-            "62": 0.10469,
-            "63": 0.10029,
-            "64": 0.09881,
-            "65": 0.09927,
-            "66": 0.09932,
-            "67": 0.10538,
-            "68": 0.09988,
-            "69": 0.10144,
-            "70": 0.09918,
-            "71": 0.10686,
-            "72": 0.09922,
-            "73": 0.09936,
-            "74": 0.09915,
-            "75": 0.09862,
-            "76": 0.1068,
-            "77": 0.09885,
-            "78": 0.09998,
-            "79": 0.1002,
-            "80": 0.09911,
-            "81": 0.10038,
-            "82": 0.09931,
-            "83": 0.09871,
-            "84": 0.09987,
-            "85": 0.09983,
-            "86": 0.10014,
-            "87": 0.0994,
-            "88": 0.09924,
-            "89": 0.10058,
-            "90": 0.10033,
-            "91": 0.10009,
-            "92": 0.10037,
-            "93": 0.09877,
-            "94": 0.09968,
-            "95": 0.10011,
-            "96": 0.09929,
-            "97": 0.09969,
-            "98": 0.09929,
-            "99": 0.10037,
-            "100": 0.10155
+            "1": 7.66848,
+            "2": 0.11896,
+            "3": 0.09977,
+            "4": 0.07967,
+            "5": 0.07964,
+            "6": 0.07997,
+            "7": 0.08012,
+            "8": 0.07951,
+            "9": 0.08093,
+            "10": 0.07978,
+            "11": 0.07959,
+            "12": 0.0801,
+            "13": 0.08014,
+            "14": 0.08001,
+            "15": 0.08005,
+            "16": 0.0803,
+            "17": 0.0801,
+            "18": 0.07861,
+            "19": 0.07885,
+            "20": 0.07921,
+            "21": 0.07891,
+            "22": 0.07852,
+            "23": 0.07915,
+            "24": 0.07938,
+            "25": 0.08,
+            "26": 0.0813,
+            "27": 0.07978,
+            "28": 0.07899,
+            "29": 0.0798,
+            "30": 0.08028,
+            "31": 0.07891,
+            "32": 0.07911,
+            "33": 0.07848,
+            "34": 0.07925,
+            "35": 0.07821,
+            "36": 0.07899,
+            "37": 0.07887,
+            "38": 0.07866,
+            "39": 0.07853,
+            "40": 0.08169,
+            "41": 0.07849,
+            "42": 0.07836,
+            "43": 0.0786,
+            "44": 0.07878,
+            "45": 0.07828,
+            "46": 0.07805,
+            "47": 0.07784,
+            "48": 0.07807,
+            "49": 0.0787,
+            "50": 0.0789,
+            "51": 0.09689,
+            "52": 0.08417,
+            "53": 0.08482,
+            "54": 0.08198,
+            "55": 0.07942,
+            "56": 0.07871,
+            "57": 0.07976,
+            "58": 0.07956,
+            "59": 0.08,
+            "60": 0.0792,
+            "61": 0.07836,
+            "62": 0.07989,
+            "63": 0.0809,
+            "64": 0.08148,
+            "65": 0.08043,
+            "66": 0.07986,
+            "67": 0.08023,
+            "68": 0.07899,
+            "69": 0.07929,
+            "70": 0.08168,
+            "71": 0.08127,
+            "72": 0.0786,
+            "73": 0.07921,
+            "74": 0.07909,
+            "75": 0.0791,
+            "76": 0.07958,
+            "77": 0.07852,
+            "78": 0.07999,
+            "79": 0.07999,
+            "80": 0.08194,
+            "81": 0.07923,
+            "82": 0.07928,
+            "83": 0.07876,
+            "84": 0.07871,
+            "85": 0.08021,
+            "86": 0.07922,
+            "87": 0.07979,
+            "88": 0.0797,
+            "89": 0.08029,
+            "90": 0.15516,
+            "91": 0.11731,
+            "92": 0.11011,
+            "93": 0.14646,
+            "94": 0.08003,
+            "95": 0.08107,
+            "96": 0.07984,
+            "97": 0.07889,
+            "98": 0.07881,
+            "99": 0.07894,
+            "100": 0.07813
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..01ab2714529
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.80381,
+            "52": 9.68202,
+            "53": 10.02345,
+            "54": 9.91634,
+            "55": 9.82456,
+            "56": 9.56974,
+            "57": 9.42672,
+            "58": 9.78081,
+            "59": 9.53243,
+            "60": 9.44593,
+            "61": 9.64254,
+            "62": 9.94293,
+            "63": 9.31764,
+            "64": 9.72548,
+            "65": 8.88739,
+            "66": 9.65691,
+            "67": 9.31749,
+            "68": 9.73495,
+            "69": 9.74866,
+            "70": 9.69625,
+            "71": 9.57689,
+            "72": 9.52422,
+            "73": 9.45595,
+            "74": 8.88269,
+            "75": 9.37584,
+            "76": 9.01136,
+            "77": 10.02287,
+            "78": 9.67963,
+            "79": 9.33172,
+            "80": 9.35826,
+            "81": 9.43394,
+            "82": 9.65054,
+            "83": 9.25503,
+            "84": 9.3714,
+            "85": 9.5623,
+            "86": 9.03489,
+            "87": 9.54614,
+            "88": 9.69785,
+            "89": 9.54656,
+            "90": 9.77624,
+            "91": 9.2884,
+            "92": 9.30662,
+            "93": 9.02647,
+            "94": 8.78837,
+            "95": 9.48027,
+            "96": 9.47974,
+            "97": 9.25611,
+            "98": 9.61949,
+            "99": 8.83824,
+            "100": 9.35135
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2514.0,
+            "52": 2513.0,
+            "53": 2894.0,
+            "54": 2656.0,
+            "55": 2348.0,
+            "56": 2506.0,
+            "57": 2501.0,
+            "58": 2770.0,
+            "59": 2681.0,
+            "60": 2434.0,
+            "61": 2776.0,
+            "62": 2596.0,
+            "63": 2617.0,
+            "64": 3012.0,
+            "65": 2657.0,
+            "66": 2947.0,
+            "67": 3089.0,
+            "68": 2818.0,
+            "69": 2909.0,
+            "70": 3025.0,
+            "71": 2924.0,
+            "72": 2702.0,
+            "73": 2947.0,
+            "74": 2306.0,
+            "75": 2791.0,
+            "76": 3093.0,
+            "77": 3107.0,
+            "78": 3134.0,
+            "79": 3205.0,
+            "80": 3123.0,
+            "81": 3290.0,
+            "82": 3172.0,
+            "83": 2719.0,
+            "84": 3328.0,
+            "85": 3255.0,
+            "86": 2546.0,
+            "87": 3472.0,
+            "88": 3068.0,
+            "89": 2953.0,
+            "90": 3300.0,
+            "91": 3154.0,
+            "92": 3061.0,
+            "93": 2889.0,
+            "94": 3535.0,
+            "95": 3078.0,
+            "96": 3181.0,
+            "97": 3135.0,
+            "98": 3569.0,
+            "99": 3319.0,
+            "100": 3223.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2596140032.0,
+            "52": 2596141056.0,
+            "53": 2596141056.0,
+            "54": 2596141056.0,
+            "55": 2596141056.0,
+            "56": 2596141056.0,
+            "57": 2596141056.0,
+            "58": 2596141056.0,
+            "59": 2596141056.0,
+            "60": 2596141056.0,
+            "61": 2596141056.0,
+            "62": 2596141056.0,
+            "63": 2596141056.0,
+            "64": 2596141056.0,
+            "65": 2596141056.0,
+            "66": 2596141056.0,
+            "67": 2596141056.0,
+            "68": 2596141056.0,
+            "69": 2596141056.0,
+            "70": 2596141056.0,
+            "71": 2596141056.0,
+            "72": 2596141056.0,
+            "73": 2596141056.0,
+            "74": 2596141056.0,
+            "75": 2596141056.0,
+            "76": 2596141056.0,
+            "77": 2596141056.0,
+            "78": 2596141056.0,
+            "79": 2596141056.0,
+            "80": 2596141056.0,
+            "81": 2596141056.0,
+            "82": 2596141056.0,
+            "83": 2596141056.0,
+            "84": 2596141056.0,
+            "85": 2596141056.0,
+            "86": 2596141056.0,
+            "87": 2596141056.0,
+            "88": 2596141056.0,
+            "89": 2596141056.0,
+            "90": 2596141056.0,
+            "91": 2596141056.0,
+            "92": 2596141056.0,
+            "93": 2596141056.0,
+            "94": 2596141056.0,
+            "95": 2596141056.0,
+            "96": 2596141056.0,
+            "97": 2596141056.0,
+            "98": 2596141056.0,
+            "99": 2596141056.0,
+            "100": 2596141056.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.16514,
+            "52": 0.11315,
+            "53": 0.08114,
+            "54": 0.08317,
+            "55": 0.08019,
+            "56": 0.08314,
+            "57": 0.08621,
+            "58": 0.08016,
+            "59": 0.07921,
+            "60": 0.08005,
+            "61": 0.08103,
+            "62": 0.10234,
+            "63": 0.1001,
+            "64": 0.4876,
+            "65": 0.08127,
+            "66": 0.079,
+            "67": 0.07859,
+            "68": 0.08085,
+            "69": 0.07943,
+            "70": 0.07842,
+            "71": 0.07899,
+            "72": 0.07958,
+            "73": 0.07925,
+            "74": 0.08017,
+            "75": 0.07902,
+            "76": 0.08039,
+            "77": 0.07802,
+            "78": 0.07857,
+            "79": 0.07907,
+            "80": 0.07806,
+            "81": 0.07858,
+            "82": 0.08046,
+            "83": 0.07775,
+            "84": 0.07777,
+            "85": 0.07752,
+            "86": 0.07844,
+            "87": 0.07834,
+            "88": 0.07837,
+            "89": 0.07893,
+            "90": 0.07826,
+            "91": 0.07839,
+            "92": 0.07815,
+            "93": 0.07767,
+            "94": 0.0784,
+            "95": 0.07785,
+            "96": 0.07909,
+            "97": 0.07789,
+            "98": 0.0771,
+            "99": 0.07799,
+            "100": 0.08104
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..d7fe14d5d09
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.7692,
+            "2": 10.78173,
+            "3": 10.77785,
+            "4": 10.75155,
+            "5": 10.80909,
+            "6": 10.8218,
+            "7": 10.80242,
+            "8": 10.78781,
+            "9": 10.7948,
+            "10": 10.713,
+            "11": 10.85088,
+            "12": 10.80067,
+            "13": 10.82614,
+            "14": 10.84338,
+            "15": 10.75514,
+            "16": 10.75194,
+            "17": 10.70801,
+            "18": 10.74736,
+            "19": 10.74723,
+            "20": 10.64347,
+            "21": 10.60434,
+            "22": 10.43223,
+            "23": 10.66534,
+            "24": 10.50025,
+            "25": 10.43523,
+            "26": 10.51418,
+            "27": 10.5415,
+            "28": 10.51383,
+            "29": 10.53731,
+            "30": 10.25937,
+            "31": 9.97666,
+            "32": 10.39972,
+            "33": 10.38587,
+            "34": 10.11012,
+            "35": 10.17419,
+            "36": 10.11601,
+            "37": 10.26042,
+            "38": 10.0751,
+            "39": 10.32912,
+            "40": 9.9687,
+            "41": 10.05131,
+            "42": 10.12745,
+            "43": 9.70822,
+            "44": 9.83332,
+            "45": 9.70556,
+            "46": 9.7136,
+            "47": 10.05915,
+            "48": 9.7456,
+            "49": 9.40552,
+            "50": 9.80892,
+            "51": 9.76773,
+            "52": 9.64757,
+            "53": 9.99521,
+            "54": 9.88624,
+            "55": 9.81783,
+            "56": 9.53944,
+            "57": 9.38198,
+            "58": 9.75913,
+            "59": 9.52125,
+            "60": 9.42463,
+            "61": 9.63669,
+            "62": 9.93001,
+            "63": 9.29627,
+            "64": 9.70638,
+            "65": 8.88076,
+            "66": 9.64079,
+            "67": 9.32154,
+            "68": 9.737,
+            "69": 9.75369,
+            "70": 9.68294,
+            "71": 9.58129,
+            "72": 9.52492,
+            "73": 9.44113,
+            "74": 8.86077,
+            "75": 9.3667,
+            "76": 9.01682,
+            "77": 10.0224,
+            "78": 9.68369,
+            "79": 9.33323,
+            "80": 9.35819,
+            "81": 9.43805,
+            "82": 9.66108,
+            "83": 9.26227,
+            "84": 9.37195,
+            "85": 9.56661,
+            "86": 9.04515,
+            "87": 9.55767,
+            "88": 9.70545,
+            "89": 9.55104,
+            "90": 9.79186,
+            "91": 9.29174,
+            "92": 9.31247,
+            "93": 9.04313,
+            "94": 8.7869,
+            "95": 9.49543,
+            "96": 9.48418,
+            "97": 9.25973,
+            "98": 9.62635,
+            "99": 8.85054,
+            "100": 9.36076
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1750.0,
+            "2": 1874.0,
+            "3": 1769.0,
+            "4": 1936.0,
+            "5": 2122.0,
+            "6": 2095.0,
+            "7": 2027.0,
+            "8": 1845.0,
+            "9": 2127.0,
+            "10": 1456.0,
+            "11": 1996.0,
+            "12": 1715.0,
+            "13": 2108.0,
+            "14": 1919.0,
+            "15": 2047.0,
+            "16": 1932.0,
+            "17": 2016.0,
+            "18": 1872.0,
+            "19": 1921.0,
+            "20": 1768.0,
+            "21": 1953.0,
+            "22": 1836.0,
+            "23": 2100.0,
+            "24": 1817.0,
+            "25": 1809.0,
+            "26": 1841.0,
+            "27": 2005.0,
+            "28": 2109.0,
+            "29": 2055.0,
+            "30": 1949.0,
+            "31": 1736.0,
+            "32": 2070.0,
+            "33": 2162.0,
+            "34": 1964.0,
+            "35": 2007.0,
+            "36": 2021.0,
+            "37": 2425.0,
+            "38": 2329.0,
+            "39": 2430.0,
+            "40": 2340.0,
+            "41": 2324.0,
+            "42": 2289.0,
+            "43": 2097.0,
+            "44": 2349.0,
+            "45": 2282.0,
+            "46": 2442.0,
+            "47": 2459.0,
+            "48": 2414.0,
+            "49": 2282.0,
+            "50": 2385.0,
+            "51": 2647.0,
+            "52": 2648.0,
+            "53": 2878.0,
+            "54": 2654.0,
+            "55": 2580.0,
+            "56": 2658.0,
+            "57": 2547.0,
+            "58": 2739.0,
+            "59": 2779.0,
+            "60": 2349.0,
+            "61": 2741.0,
+            "62": 2617.0,
+            "63": 2512.0,
+            "64": 2800.0,
+            "65": 2697.0,
+            "66": 2966.0,
+            "67": 2952.0,
+            "68": 2833.0,
+            "69": 3029.0,
+            "70": 2977.0,
+            "71": 2813.0,
+            "72": 2664.0,
+            "73": 3085.0,
+            "74": 2292.0,
+            "75": 2810.0,
+            "76": 3025.0,
+            "77": 3025.0,
+            "78": 3037.0,
+            "79": 3181.0,
+            "80": 3234.0,
+            "81": 3273.0,
+            "82": 3294.0,
+            "83": 2707.0,
+            "84": 3332.0,
+            "85": 3336.0,
+            "86": 2585.0,
+            "87": 3448.0,
+            "88": 3239.0,
+            "89": 3137.0,
+            "90": 3341.0,
+            "91": 3188.0,
+            "92": 3246.0,
+            "93": 2823.0,
+            "94": 3358.0,
+            "95": 3202.0,
+            "96": 3118.0,
+            "97": 3163.0,
+            "98": 3645.0,
+            "99": 3345.0,
+            "100": 3201.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 917459968.0,
+            "2": 917459968.0,
+            "3": 917459968.0,
+            "4": 917459968.0,
+            "5": 917459968.0,
+            "6": 917459968.0,
+            "7": 917459968.0,
+            "8": 917459968.0,
+            "9": 917459968.0,
+            "10": 917459968.0,
+            "11": 917459968.0,
+            "12": 917459968.0,
+            "13": 917459968.0,
+            "14": 917459968.0,
+            "15": 917459968.0,
+            "16": 917459968.0,
+            "17": 917459968.0,
+            "18": 917459968.0,
+            "19": 917459968.0,
+            "20": 917459968.0,
+            "21": 917459968.0,
+            "22": 917459968.0,
+            "23": 917459968.0,
+            "24": 917459968.0,
+            "25": 917459968.0,
+            "26": 917459968.0,
+            "27": 917459968.0,
+            "28": 917459968.0,
+            "29": 917459968.0,
+            "30": 917459968.0,
+            "31": 917459968.0,
+            "32": 917459968.0,
+            "33": 917459968.0,
+            "34": 917459968.0,
+            "35": 917459968.0,
+            "36": 917459968.0,
+            "37": 917459968.0,
+            "38": 917459968.0,
+            "39": 917459968.0,
+            "40": 917459968.0,
+            "41": 917459968.0,
+            "42": 917459968.0,
+            "43": 917459968.0,
+            "44": 917459968.0,
+            "45": 917459968.0,
+            "46": 917459968.0,
+            "47": 917459968.0,
+            "48": 917459968.0,
+            "49": 917459968.0,
+            "50": 917459968.0,
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2266035200.0,
+            "2": 2624452608.0,
+            "3": 2624452608.0,
+            "4": 2624452608.0,
+            "5": 2624452608.0,
+            "6": 2624452608.0,
+            "7": 2624452608.0,
+            "8": 2624452608.0,
+            "9": 2624452608.0,
+            "10": 2624452608.0,
+            "11": 2624452608.0,
+            "12": 2624452608.0,
+            "13": 2624452608.0,
+            "14": 2624452608.0,
+            "15": 2624452608.0,
+            "16": 2624452608.0,
+            "17": 2624452608.0,
+            "18": 2624452608.0,
+            "19": 2624452608.0,
+            "20": 2624452608.0,
+            "21": 2624452608.0,
+            "22": 2624452608.0,
+            "23": 2624452608.0,
+            "24": 2624452608.0,
+            "25": 2624452608.0,
+            "26": 2624452608.0,
+            "27": 2624452608.0,
+            "28": 2624452608.0,
+            "29": 2624452608.0,
+            "30": 2624452608.0,
+            "31": 2624452608.0,
+            "32": 2624452608.0,
+            "33": 2624452608.0,
+            "34": 2624452608.0,
+            "35": 2624452608.0,
+            "36": 2624452608.0,
+            "37": 2624452608.0,
+            "38": 2624452608.0,
+            "39": 2624452608.0,
+            "40": 2624452608.0,
+            "41": 2624452608.0,
+            "42": 2624452608.0,
+            "43": 2624452608.0,
+            "44": 2624452608.0,
+            "45": 2624452608.0,
+            "46": 2624452608.0,
+            "47": 2624452608.0,
+            "48": 2624452608.0,
+            "49": 2624452608.0,
+            "50": 2624452608.0,
+            "51": 2624452608.0,
+            "52": 2624452608.0,
+            "53": 2624452608.0,
+            "54": 2624452608.0,
+            "55": 2624452608.0,
+            "56": 2624452608.0,
+            "57": 2624452608.0,
+            "58": 2624452608.0,
+            "59": 2624452608.0,
+            "60": 2624452608.0,
+            "61": 2624452608.0,
+            "62": 2624452608.0,
+            "63": 2624452608.0,
+            "64": 2624452608.0,
+            "65": 2624452608.0,
+            "66": 2624452608.0,
+            "67": 2624452608.0,
+            "68": 2624452608.0,
+            "69": 2624452608.0,
+            "70": 2624452608.0,
+            "71": 2624452608.0,
+            "72": 2624452608.0,
+            "73": 2624452608.0,
+            "74": 2624452608.0,
+            "75": 2624452608.0,
+            "76": 2624452608.0,
+            "77": 2624452608.0,
+            "78": 2624452608.0,
+            "79": 2624452608.0,
+            "80": 2624452608.0,
+            "81": 2624452608.0,
+            "82": 2624452608.0,
+            "83": 2624452608.0,
+            "84": 2624452608.0,
+            "85": 2624452608.0,
+            "86": 2624452608.0,
+            "87": 2624452608.0,
+            "88": 2624452608.0,
+            "89": 2624452608.0,
+            "90": 2624452608.0,
+            "91": 2624452608.0,
+            "92": 2624452608.0,
+            "93": 2624452608.0,
+            "94": 2624452608.0,
+            "95": 2624452608.0,
+            "96": 2624452608.0,
+            "97": 2624452608.0,
+            "98": 2624452608.0,
+            "99": 2624452608.0,
+            "100": 2624452608.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.44626,
+            "3": 0.14544,
+            "4": 0.14069,
+            "5": 0.13132,
+            "6": 0.13447,
+            "7": 0.13519,
+            "8": 0.13562,
+            "9": 0.13513,
+            "10": 0.13387,
+            "11": 0.13378,
+            "12": 0.13575,
+            "13": 0.13462,
+            "14": 0.13384,
+            "15": 0.13412,
+            "16": 0.13347,
+            "17": 0.13555,
+            "18": 0.13515,
+            "19": 0.13443,
+            "20": 0.14433,
+            "21": 0.14638,
+            "22": 0.14561,
+            "23": 0.13968,
+            "24": 0.13694,
+            "25": 0.14479,
+            "26": 0.14038,
+            "27": 0.1473,
+            "28": 0.14099,
+            "29": 0.13829,
+            "30": 0.13782,
+            "31": 0.13746,
+            "32": 0.13573,
+            "33": 0.13325,
+            "34": 0.14385,
+            "35": 0.14367,
+            "36": 0.14113,
+            "37": 0.1394,
+            "38": 0.136,
+            "39": 0.13678,
+            "40": 0.13539,
+            "41": 0.1364,
+            "42": 0.13593,
+            "43": 0.13738,
+            "44": 0.13238,
+            "45": 0.13667,
+            "46": 0.14472,
+            "47": 0.1358,
+            "48": 0.13697,
+            "49": 0.13391,
+            "50": 0.13536,
+            "51": 0.16637,
+            "52": 0.15213,
+            "53": 0.14685,
+            "54": 0.14134,
+            "55": 0.14007,
+            "56": 0.13524,
+            "57": 0.13779,
+            "58": 0.13841,
+            "59": 0.13821,
+            "60": 0.13687,
+            "61": 0.13663,
+            "62": 0.13401,
+            "63": 0.13389,
+            "64": 0.13289,
+            "65": 0.13362,
+            "66": 0.13754,
+            "67": 0.13473,
+            "68": 0.13402,
+            "69": 0.13491,
+            "70": 0.13536,
+            "71": 0.13258,
+            "72": 0.13482,
+            "73": 0.13371,
+            "74": 0.13507,
+            "75": 0.13595,
+            "76": 0.13613,
+            "77": 0.13395,
+            "78": 0.13252,
+            "79": 0.13394,
+            "80": 0.13329,
+            "81": 0.13388,
+            "82": 0.13407,
+            "83": 0.13522,
+            "84": 0.13579,
+            "85": 0.13452,
+            "86": 0.13422,
+            "87": 0.13388,
+            "88": 0.1343,
+            "89": 0.13546,
+            "90": 0.13522,
+            "91": 0.13458,
+            "92": 0.1341,
+            "93": 0.13519,
+            "94": 0.13534,
+            "95": 0.13521,
+            "96": 0.13886,
+            "97": 0.13832,
+            "98": 0.14048,
+            "99": 0.14022,
+            "100": 0.13732
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..13e04822669
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.76773,
+            "52": 9.64757,
+            "53": 9.99521,
+            "54": 9.88624,
+            "55": 9.81783,
+            "56": 9.53944,
+            "57": 9.38198,
+            "58": 9.75913,
+            "59": 9.52125,
+            "60": 9.42463,
+            "61": 9.63669,
+            "62": 9.93001,
+            "63": 9.29627,
+            "64": 9.70638,
+            "65": 8.88076,
+            "66": 9.64079,
+            "67": 9.32154,
+            "68": 9.737,
+            "69": 9.75369,
+            "70": 9.68294,
+            "71": 9.58129,
+            "72": 9.52492,
+            "73": 9.44113,
+            "74": 8.86077,
+            "75": 9.3667,
+            "76": 9.01682,
+            "77": 10.0224,
+            "78": 9.68369,
+            "79": 9.33323,
+            "80": 9.35819,
+            "81": 9.43805,
+            "82": 9.66108,
+            "83": 9.26227,
+            "84": 9.37195,
+            "85": 9.56661,
+            "86": 9.04515,
+            "87": 9.55767,
+            "88": 9.70545,
+            "89": 9.55104,
+            "90": 9.79186,
+            "91": 9.29174,
+            "92": 9.31247,
+            "93": 9.04313,
+            "94": 8.7869,
+            "95": 9.49543,
+            "96": 9.48418,
+            "97": 9.25973,
+            "98": 9.62635,
+            "99": 8.85054,
+            "100": 9.36076
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2647.0,
+            "52": 2648.0,
+            "53": 2878.0,
+            "54": 2654.0,
+            "55": 2580.0,
+            "56": 2658.0,
+            "57": 2547.0,
+            "58": 2739.0,
+            "59": 2779.0,
+            "60": 2349.0,
+            "61": 2741.0,
+            "62": 2617.0,
+            "63": 2512.0,
+            "64": 2800.0,
+            "65": 2697.0,
+            "66": 2966.0,
+            "67": 2952.0,
+            "68": 2833.0,
+            "69": 3029.0,
+            "70": 2977.0,
+            "71": 2813.0,
+            "72": 2664.0,
+            "73": 3085.0,
+            "74": 2292.0,
+            "75": 2810.0,
+            "76": 3025.0,
+            "77": 3025.0,
+            "78": 3037.0,
+            "79": 3181.0,
+            "80": 3234.0,
+            "81": 3273.0,
+            "82": 3294.0,
+            "83": 2707.0,
+            "84": 3332.0,
+            "85": 3336.0,
+            "86": 2585.0,
+            "87": 3448.0,
+            "88": 3239.0,
+            "89": 3137.0,
+            "90": 3341.0,
+            "91": 3188.0,
+            "92": 3246.0,
+            "93": 2823.0,
+            "94": 3358.0,
+            "95": 3202.0,
+            "96": 3118.0,
+            "97": 3163.0,
+            "98": 3645.0,
+            "99": 3345.0,
+            "100": 3201.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2625500160.0,
+            "52": 2625501184.0,
+            "53": 2625501184.0,
+            "54": 2625501184.0,
+            "55": 2625501184.0,
+            "56": 2625501184.0,
+            "57": 2625501184.0,
+            "58": 2625501184.0,
+            "59": 2625501184.0,
+            "60": 2625501184.0,
+            "61": 2625501184.0,
+            "62": 2625501184.0,
+            "63": 2625501184.0,
+            "64": 2625501184.0,
+            "65": 2625501184.0,
+            "66": 2625501184.0,
+            "67": 2625501184.0,
+            "68": 2625501184.0,
+            "69": 2625501184.0,
+            "70": 2625501184.0,
+            "71": 2625501184.0,
+            "72": 2625501184.0,
+            "73": 2625501184.0,
+            "74": 2625501184.0,
+            "75": 2625501184.0,
+            "76": 2625501184.0,
+            "77": 2625501184.0,
+            "78": 2625501184.0,
+            "79": 2625501184.0,
+            "80": 2625501184.0,
+            "81": 2625501184.0,
+            "82": 2625501184.0,
+            "83": 2625501184.0,
+            "84": 2625501184.0,
+            "85": 2625501184.0,
+            "86": 2625501184.0,
+            "87": 2625501184.0,
+            "88": 2625501184.0,
+            "89": 2625501184.0,
+            "90": 2625501184.0,
+            "91": 2625501184.0,
+            "92": 2625501184.0,
+            "93": 2625501184.0,
+            "94": 2625501184.0,
+            "95": 2625501184.0,
+            "96": 2625501184.0,
+            "97": 2625501184.0,
+            "98": 2625501184.0,
+            "99": 2625501184.0,
+            "100": 2625501184.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.60445,
+            "53": 0.14686,
+            "54": 0.13325,
+            "55": 0.13174,
+            "56": 0.13234,
+            "57": 0.13268,
+            "58": 0.13337,
+            "59": 0.13324,
+            "60": 0.13107,
+            "61": 0.13206,
+            "62": 0.1329,
+            "63": 0.13379,
+            "64": 0.1348,
+            "65": 0.13602,
+            "66": 0.13298,
+            "67": 0.13465,
+            "68": 0.13495,
+            "69": 0.13454,
+            "70": 0.13536,
+            "71": 0.13494,
+            "72": 0.13541,
+            "73": 0.13997,
+            "74": 0.1423,
+            "75": 0.13785,
+            "76": 0.14664,
+            "77": 0.16548,
+            "78": 0.17704,
+            "79": 0.15011,
+            "80": 0.14471,
+            "81": 0.13952,
+            "82": 0.13892,
+            "83": 0.13568,
+            "84": 0.13463,
+            "85": 0.13878,
+            "86": 0.13867,
+            "87": 0.13899,
+            "88": 0.13819,
+            "89": 0.13945,
+            "90": 0.13964,
+            "91": 0.13862,
+            "92": 0.13655,
+            "93": 0.13587,
+            "94": 0.13572,
+            "95": 0.1357,
+            "96": 0.13598,
+            "97": 0.13642,
+            "98": 0.13742,
+            "99": 0.13474,
+            "100": 0.13647
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json
index 8655a61eb9b..b1c227e9ae3 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 921653248.0,
-            "2": 921653248.0,
-            "3": 921653248.0,
-            "4": 921653248.0,
-            "5": 921653248.0,
-            "6": 921653248.0,
-            "7": 921653248.0,
-            "8": 921653248.0,
-            "9": 921653248.0,
-            "10": 921653248.0,
-            "11": 921653248.0,
-            "12": 921653248.0,
-            "13": 921653248.0,
-            "14": 921653248.0,
-            "15": 921653248.0,
-            "16": 921653248.0,
-            "17": 921653248.0,
-            "18": 921653248.0,
-            "19": 921653248.0,
-            "20": 921653248.0,
-            "21": 921653248.0,
-            "22": 921653248.0,
-            "23": 921653248.0,
-            "24": 921653248.0,
-            "25": 921653248.0,
-            "26": 921653248.0,
-            "27": 921653248.0,
-            "28": 921653248.0,
-            "29": 921653248.0,
-            "30": 921653248.0,
-            "31": 921653248.0,
-            "32": 921653248.0,
-            "33": 921653248.0,
-            "34": 921653248.0,
-            "35": 921653248.0,
-            "36": 921653248.0,
-            "37": 921653248.0,
-            "38": 921653248.0,
-            "39": 921653248.0,
-            "40": 921653248.0,
-            "41": 921653248.0,
-            "42": 921653248.0,
-            "43": 921653248.0,
-            "44": 921653248.0,
-            "45": 921653248.0,
-            "46": 921653248.0,
-            "47": 921653248.0,
-            "48": 921653248.0,
-            "49": 921653248.0,
-            "50": 921653248.0,
-            "51": 921653248.0,
-            "52": 921653248.0,
-            "53": 921653248.0,
-            "54": 921653248.0,
-            "55": 921653248.0,
-            "56": 921653248.0,
-            "57": 921653248.0,
-            "58": 921653248.0,
-            "59": 921653248.0,
-            "60": 921653248.0,
-            "61": 921653248.0,
-            "62": 921653248.0,
-            "63": 921653248.0,
-            "64": 921653248.0,
-            "65": 921653248.0,
-            "66": 921653248.0,
-            "67": 921653248.0,
-            "68": 921653248.0,
-            "69": 921653248.0,
-            "70": 921653248.0,
-            "71": 921653248.0,
-            "72": 921653248.0,
-            "73": 921653248.0,
-            "74": 921653248.0,
-            "75": 921653248.0,
-            "76": 921653248.0,
-            "77": 921653248.0,
-            "78": 921653248.0,
-            "79": 921653248.0,
-            "80": 921653248.0,
-            "81": 921653248.0,
-            "82": 921653248.0,
-            "83": 921653248.0,
-            "84": 921653248.0,
-            "85": 921653248.0,
-            "86": 921653248.0,
-            "87": 921653248.0,
-            "88": 921653248.0,
-            "89": 921653248.0,
-            "90": 921653248.0,
-            "91": 921653248.0,
-            "92": 921653248.0,
-            "93": 921653248.0,
-            "94": 921653248.0,
-            "95": 921653248.0,
-            "96": 921653248.0,
-            "97": 921653248.0,
-            "98": 921653248.0,
-            "99": 921653248.0,
-            "100": 921653248.0
+            "1": 917459968.0,
+            "2": 917459968.0,
+            "3": 917459968.0,
+            "4": 917459968.0,
+            "5": 917459968.0,
+            "6": 917459968.0,
+            "7": 917459968.0,
+            "8": 917459968.0,
+            "9": 917459968.0,
+            "10": 917459968.0,
+            "11": 917459968.0,
+            "12": 917459968.0,
+            "13": 917459968.0,
+            "14": 917459968.0,
+            "15": 917459968.0,
+            "16": 917459968.0,
+            "17": 917459968.0,
+            "18": 917459968.0,
+            "19": 917459968.0,
+            "20": 917459968.0,
+            "21": 917459968.0,
+            "22": 917459968.0,
+            "23": 917459968.0,
+            "24": 917459968.0,
+            "25": 917459968.0,
+            "26": 917459968.0,
+            "27": 917459968.0,
+            "28": 917459968.0,
+            "29": 917459968.0,
+            "30": 917459968.0,
+            "31": 917459968.0,
+            "32": 917459968.0,
+            "33": 917459968.0,
+            "34": 917459968.0,
+            "35": 917459968.0,
+            "36": 917459968.0,
+            "37": 917459968.0,
+            "38": 917459968.0,
+            "39": 917459968.0,
+            "40": 917459968.0,
+            "41": 917459968.0,
+            "42": 917459968.0,
+            "43": 917459968.0,
+            "44": 917459968.0,
+            "45": 917459968.0,
+            "46": 917459968.0,
+            "47": 917459968.0,
+            "48": 917459968.0,
+            "49": 917459968.0,
+            "50": 917459968.0,
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2240868352.0,
-            "2": 2603480064.0,
-            "3": 2603480064.0,
-            "4": 2603480064.0,
-            "5": 2603480064.0,
-            "6": 2603480064.0,
-            "7": 2603480064.0,
-            "8": 2603480064.0,
-            "9": 2603480064.0,
-            "10": 2603480064.0,
-            "11": 2603480064.0,
-            "12": 2603480064.0,
-            "13": 2603480064.0,
-            "14": 2603480064.0,
-            "15": 2603480064.0,
-            "16": 2603480064.0,
-            "17": 2603480064.0,
-            "18": 2603480064.0,
-            "19": 2603480064.0,
-            "20": 2603480064.0,
-            "21": 2603480064.0,
-            "22": 2603480064.0,
-            "23": 2603480064.0,
-            "24": 2603480064.0,
-            "25": 2603480064.0,
-            "26": 2603480064.0,
-            "27": 2603480064.0,
-            "28": 2603480064.0,
-            "29": 2603480064.0,
-            "30": 2603480064.0,
-            "31": 2603480064.0,
-            "32": 2603480064.0,
-            "33": 2603480064.0,
-            "34": 2603480064.0,
-            "35": 2603480064.0,
-            "36": 2603480064.0,
-            "37": 2603480064.0,
-            "38": 2603480064.0,
-            "39": 2603480064.0,
-            "40": 2603480064.0,
-            "41": 2603480064.0,
-            "42": 2603480064.0,
-            "43": 2603480064.0,
-            "44": 2603480064.0,
-            "45": 2603480064.0,
-            "46": 2603480064.0,
-            "47": 2603480064.0,
-            "48": 2603480064.0,
-            "49": 2603480064.0,
-            "50": 2603480064.0,
-            "51": 2603480064.0,
-            "52": 2603480064.0,
-            "53": 2603480064.0,
-            "54": 2603480064.0,
-            "55": 2603480064.0,
-            "56": 2603480064.0,
-            "57": 2603480064.0,
-            "58": 2603480064.0,
-            "59": 2603480064.0,
-            "60": 2603480064.0,
-            "61": 2603480064.0,
-            "62": 2603480064.0,
-            "63": 2603480064.0,
-            "64": 2603480064.0,
-            "65": 2603480064.0,
-            "66": 2603480064.0,
-            "67": 2603480064.0,
-            "68": 2603480064.0,
-            "69": 2603480064.0,
-            "70": 2603480064.0,
-            "71": 2603480064.0,
-            "72": 2603480064.0,
-            "73": 2603480064.0,
-            "74": 2603480064.0,
-            "75": 2603480064.0,
-            "76": 2603480064.0,
-            "77": 2603480064.0,
-            "78": 2603480064.0,
-            "79": 2603480064.0,
-            "80": 2603480064.0,
-            "81": 2603480064.0,
-            "82": 2603480064.0,
-            "83": 2603480064.0,
-            "84": 2603480064.0,
-            "85": 2603480064.0,
-            "86": 2603480064.0,
-            "87": 2603480064.0,
-            "88": 2603480064.0,
-            "89": 2603480064.0,
-            "90": 2603480064.0,
-            "91": 2603480064.0,
-            "92": 2603480064.0,
-            "93": 2603480064.0,
-            "94": 2603480064.0,
-            "95": 2603480064.0,
-            "96": 2603480064.0,
-            "97": 2603480064.0,
-            "98": 2603480064.0,
-            "99": 2603480064.0,
-            "100": 2603480064.0
+            "1": 2239820800.0,
+            "2": 2599286784.0,
+            "3": 2599286784.0,
+            "4": 2599286784.0,
+            "5": 2599286784.0,
+            "6": 2599286784.0,
+            "7": 2599286784.0,
+            "8": 2599286784.0,
+            "9": 2599286784.0,
+            "10": 2599286784.0,
+            "11": 2599286784.0,
+            "12": 2599286784.0,
+            "13": 2599286784.0,
+            "14": 2599286784.0,
+            "15": 2599286784.0,
+            "16": 2599286784.0,
+            "17": 2599286784.0,
+            "18": 2599286784.0,
+            "19": 2599286784.0,
+            "20": 2599286784.0,
+            "21": 2599286784.0,
+            "22": 2599286784.0,
+            "23": 2599286784.0,
+            "24": 2599286784.0,
+            "25": 2599286784.0,
+            "26": 2599286784.0,
+            "27": 2599286784.0,
+            "28": 2599286784.0,
+            "29": 2599286784.0,
+            "30": 2599286784.0,
+            "31": 2599286784.0,
+            "32": 2599286784.0,
+            "33": 2599286784.0,
+            "34": 2599286784.0,
+            "35": 2599286784.0,
+            "36": 2599286784.0,
+            "37": 2599286784.0,
+            "38": 2599286784.0,
+            "39": 2599286784.0,
+            "40": 2599286784.0,
+            "41": 2599286784.0,
+            "42": 2599286784.0,
+            "43": 2599286784.0,
+            "44": 2599286784.0,
+            "45": 2599286784.0,
+            "46": 2599286784.0,
+            "47": 2599286784.0,
+            "48": 2599286784.0,
+            "49": 2599286784.0,
+            "50": 2599286784.0,
+            "51": 2599286784.0,
+            "52": 2599286784.0,
+            "53": 2599286784.0,
+            "54": 2599286784.0,
+            "55": 2599286784.0,
+            "56": 2599286784.0,
+            "57": 2599286784.0,
+            "58": 2599286784.0,
+            "59": 2599286784.0,
+            "60": 2599286784.0,
+            "61": 2599286784.0,
+            "62": 2599286784.0,
+            "63": 2599286784.0,
+            "64": 2599286784.0,
+            "65": 2599286784.0,
+            "66": 2599286784.0,
+            "67": 2599286784.0,
+            "68": 2599286784.0,
+            "69": 2599286784.0,
+            "70": 2599286784.0,
+            "71": 2599286784.0,
+            "72": 2599286784.0,
+            "73": 2599286784.0,
+            "74": 2599286784.0,
+            "75": 2599286784.0,
+            "76": 2599286784.0,
+            "77": 2599286784.0,
+            "78": 2599286784.0,
+            "79": 2599286784.0,
+            "80": 2599286784.0,
+            "81": 2599286784.0,
+            "82": 2599286784.0,
+            "83": 2599286784.0,
+            "84": 2599286784.0,
+            "85": 2599286784.0,
+            "86": 2599286784.0,
+            "87": 2599286784.0,
+            "88": 2599286784.0,
+            "89": 2599286784.0,
+            "90": 2599286784.0,
+            "91": 2599286784.0,
+            "92": 2599286784.0,
+            "93": 2599286784.0,
+            "94": 2599286784.0,
+            "95": 2599286784.0,
+            "96": 2599286784.0,
+            "97": 2599286784.0,
+            "98": 2599286784.0,
+            "99": 2599286784.0,
+            "100": 2599286784.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.33977,
-            "2": 0.14663,
-            "3": 0.12463,
-            "4": 0.11901,
-            "5": 0.118,
-            "6": 0.11842,
-            "7": 0.11849,
-            "8": 0.11649,
-            "9": 0.11703,
-            "10": 0.11655,
-            "11": 0.11646,
-            "12": 0.11802,
-            "13": 0.11742,
-            "14": 0.1167,
-            "15": 0.11429,
-            "16": 0.11654,
-            "17": 0.11533,
-            "18": 0.11853,
-            "19": 0.1171,
-            "20": 0.11735,
-            "21": 0.11515,
-            "22": 0.11632,
-            "23": 0.11865,
-            "24": 0.11706,
-            "25": 0.11644,
-            "26": 0.11684,
-            "27": 0.11688,
-            "28": 0.11839,
-            "29": 0.11706,
-            "30": 0.11761,
-            "31": 0.11696,
-            "32": 0.11567,
-            "33": 0.1149,
-            "34": 0.11395,
-            "35": 0.11367,
-            "36": 0.11567,
-            "37": 0.11646,
-            "38": 0.11392,
-            "39": 0.11516,
-            "40": 0.11529,
-            "41": 0.11559,
-            "42": 0.11519,
-            "43": 0.11808,
-            "44": 0.11599,
-            "45": 0.11605,
-            "46": 0.11502,
-            "47": 0.11651,
-            "48": 0.11713,
-            "49": 0.11667,
-            "50": 0.11432,
-            "51": 0.12857,
-            "52": 0.12187,
-            "53": 0.11684,
-            "54": 0.11222,
-            "55": 0.11538,
-            "56": 0.11241,
-            "57": 0.11229,
-            "58": 0.11087,
-            "59": 0.11183,
-            "60": 0.11124,
-            "61": 0.11009,
-            "62": 0.11052,
-            "63": 0.11585,
-            "64": 0.11262,
-            "65": 0.11148,
-            "66": 0.11248,
-            "67": 0.11274,
-            "68": 0.11394,
-            "69": 0.11397,
-            "70": 0.11233,
-            "71": 0.11354,
-            "72": 0.11589,
-            "73": 0.11373,
-            "74": 0.11483,
-            "75": 0.11512,
-            "76": 0.11378,
-            "77": 0.11431,
-            "78": 0.11374,
-            "79": 0.11521,
-            "80": 0.11486,
-            "81": 0.11364,
-            "82": 0.11419,
-            "83": 0.11439,
-            "84": 0.11589,
-            "85": 0.11422,
-            "86": 0.11458,
-            "87": 0.11184,
-            "88": 0.11418,
-            "89": 0.11264,
-            "90": 0.11169,
-            "91": 0.11452,
-            "92": 0.11215,
-            "93": 0.11431,
-            "94": 0.11145,
-            "95": 0.11129,
-            "96": 0.11113,
-            "97": 0.11365,
-            "98": 0.11127,
-            "99": 0.11136,
-            "100": 0.11229
+            "1": 7.15273,
+            "2": 0.12761,
+            "3": 0.108,
+            "4": 0.08804,
+            "5": 0.08914,
+            "6": 0.0872,
+            "7": 0.0865,
+            "8": 0.09025,
+            "9": 0.09224,
+            "10": 0.08785,
+            "11": 0.08842,
+            "12": 0.08678,
+            "13": 0.08768,
+            "14": 0.08732,
+            "15": 0.08754,
+            "16": 0.08689,
+            "17": 0.08745,
+            "18": 0.08749,
+            "19": 0.08681,
+            "20": 0.08755,
+            "21": 0.08798,
+            "22": 0.08687,
+            "23": 0.0869,
+            "24": 0.08666,
+            "25": 0.08694,
+            "26": 0.08728,
+            "27": 0.08672,
+            "28": 0.09131,
+            "29": 0.09876,
+            "30": 0.09345,
+            "31": 0.0871,
+            "32": 0.08745,
+            "33": 0.0868,
+            "34": 0.08664,
+            "35": 0.08688,
+            "36": 0.08685,
+            "37": 0.08807,
+            "38": 0.08807,
+            "39": 0.09095,
+            "40": 0.08728,
+            "41": 0.08918,
+            "42": 0.0874,
+            "43": 0.08812,
+            "44": 0.08765,
+            "45": 0.08765,
+            "46": 0.08695,
+            "47": 0.08967,
+            "48": 0.08734,
+            "49": 0.08707,
+            "50": 0.08818,
+            "51": 0.09801,
+            "52": 0.09366,
+            "53": 0.09478,
+            "54": 0.09027,
+            "55": 0.08632,
+            "56": 0.0857,
+            "57": 0.08636,
+            "58": 0.08585,
+            "59": 0.08632,
+            "60": 0.08559,
+            "61": 0.08575,
+            "62": 0.08716,
+            "63": 0.08612,
+            "64": 0.08569,
+            "65": 0.0876,
+            "66": 0.08587,
+            "67": 0.0862,
+            "68": 0.08594,
+            "69": 0.0858,
+            "70": 0.08668,
+            "71": 0.08553,
+            "72": 0.08961,
+            "73": 0.09562,
+            "74": 0.09156,
+            "75": 0.0901,
+            "76": 0.08615,
+            "77": 0.08562,
+            "78": 0.08664,
+            "79": 0.08569,
+            "80": 0.08621,
+            "81": 0.08562,
+            "82": 0.08601,
+            "83": 0.08551,
+            "84": 0.08569,
+            "85": 0.08622,
+            "86": 0.08639,
+            "87": 0.08581,
+            "88": 0.08569,
+            "89": 0.08624,
+            "90": 0.086,
+            "91": 0.08602,
+            "92": 0.08575,
+            "93": 0.08626,
+            "94": 0.0869,
+            "95": 0.0867,
+            "96": 0.0872,
+            "97": 0.08727,
+            "98": 0.08652,
+            "99": 0.0867,
+            "100": 0.08593
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..57da3647845
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.80375,
+            "52": 9.68218,
+            "53": 10.02348,
+            "54": 9.91595,
+            "55": 9.82442,
+            "56": 9.56994,
+            "57": 9.42628,
+            "58": 9.78075,
+            "59": 9.53254,
+            "60": 9.44561,
+            "61": 9.64249,
+            "62": 9.94298,
+            "63": 9.31745,
+            "64": 9.7256,
+            "65": 8.88735,
+            "66": 9.65711,
+            "67": 9.31747,
+            "68": 9.73506,
+            "69": 9.74863,
+            "70": 9.69601,
+            "71": 9.57682,
+            "72": 9.52425,
+            "73": 9.4558,
+            "74": 8.8826,
+            "75": 9.37563,
+            "76": 9.01106,
+            "77": 10.02278,
+            "78": 9.6796,
+            "79": 9.33171,
+            "80": 9.35836,
+            "81": 9.43399,
+            "82": 9.65055,
+            "83": 9.2551,
+            "84": 9.37131,
+            "85": 9.56237,
+            "86": 9.0351,
+            "87": 9.54617,
+            "88": 9.69806,
+            "89": 9.54657,
+            "90": 9.77627,
+            "91": 9.28858,
+            "92": 9.30652,
+            "93": 9.02646,
+            "94": 8.7883,
+            "95": 9.48041,
+            "96": 9.47962,
+            "97": 9.25545,
+            "98": 9.61947,
+            "99": 8.83854,
+            "100": 9.35116
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2508.0,
+            "52": 2483.0,
+            "53": 2959.0,
+            "54": 2554.0,
+            "55": 2408.0,
+            "56": 2452.0,
+            "57": 2528.0,
+            "58": 2594.0,
+            "59": 2750.0,
+            "60": 2563.0,
+            "61": 2794.0,
+            "62": 2495.0,
+            "63": 2493.0,
+            "64": 2965.0,
+            "65": 2569.0,
+            "66": 2877.0,
+            "67": 2969.0,
+            "68": 2803.0,
+            "69": 2944.0,
+            "70": 3001.0,
+            "71": 2867.0,
+            "72": 2714.0,
+            "73": 3017.0,
+            "74": 2281.0,
+            "75": 2774.0,
+            "76": 2983.0,
+            "77": 2955.0,
+            "78": 3148.0,
+            "79": 3076.0,
+            "80": 2992.0,
+            "81": 3255.0,
+            "82": 3212.0,
+            "83": 2809.0,
+            "84": 3266.0,
+            "85": 3188.0,
+            "86": 2616.0,
+            "87": 3492.0,
+            "88": 3130.0,
+            "89": 3020.0,
+            "90": 3238.0,
+            "91": 3106.0,
+            "92": 3183.0,
+            "93": 2960.0,
+            "94": 3492.0,
+            "95": 3112.0,
+            "96": 3256.0,
+            "97": 3055.0,
+            "98": 3558.0,
+            "99": 3196.0,
+            "100": 3109.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 917459968.0,
+            "52": 917459968.0,
+            "53": 917459968.0,
+            "54": 917459968.0,
+            "55": 917459968.0,
+            "56": 917459968.0,
+            "57": 917459968.0,
+            "58": 917459968.0,
+            "59": 917459968.0,
+            "60": 917459968.0,
+            "61": 917459968.0,
+            "62": 917459968.0,
+            "63": 917459968.0,
+            "64": 917459968.0,
+            "65": 917459968.0,
+            "66": 917459968.0,
+            "67": 917459968.0,
+            "68": 917459968.0,
+            "69": 917459968.0,
+            "70": 917459968.0,
+            "71": 917459968.0,
+            "72": 917459968.0,
+            "73": 917459968.0,
+            "74": 917459968.0,
+            "75": 917459968.0,
+            "76": 917459968.0,
+            "77": 917459968.0,
+            "78": 917459968.0,
+            "79": 917459968.0,
+            "80": 917459968.0,
+            "81": 917459968.0,
+            "82": 917459968.0,
+            "83": 917459968.0,
+            "84": 917459968.0,
+            "85": 917459968.0,
+            "86": 917459968.0,
+            "87": 917459968.0,
+            "88": 917459968.0,
+            "89": 917459968.0,
+            "90": 917459968.0,
+            "91": 917459968.0,
+            "92": 917459968.0,
+            "93": 917459968.0,
+            "94": 917459968.0,
+            "95": 917459968.0,
+            "96": 917459968.0,
+            "97": 917459968.0,
+            "98": 917459968.0,
+            "99": 917459968.0,
+            "100": 917459968.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2599285760.0,
+            "52": 2599286784.0,
+            "53": 2599286784.0,
+            "54": 2599286784.0,
+            "55": 2599286784.0,
+            "56": 2599286784.0,
+            "57": 2599286784.0,
+            "58": 2599286784.0,
+            "59": 2599286784.0,
+            "60": 2599286784.0,
+            "61": 2599286784.0,
+            "62": 2599286784.0,
+            "63": 2599286784.0,
+            "64": 2599286784.0,
+            "65": 2599286784.0,
+            "66": 2599286784.0,
+            "67": 2599286784.0,
+            "68": 2599286784.0,
+            "69": 2599286784.0,
+            "70": 2599286784.0,
+            "71": 2599286784.0,
+            "72": 2599286784.0,
+            "73": 2599286784.0,
+            "74": 2599286784.0,
+            "75": 2599286784.0,
+            "76": 2599286784.0,
+            "77": 2599286784.0,
+            "78": 2599286784.0,
+            "79": 2599286784.0,
+            "80": 2599286784.0,
+            "81": 2599286784.0,
+            "82": 2599286784.0,
+            "83": 2599286784.0,
+            "84": 2599286784.0,
+            "85": 2599286784.0,
+            "86": 2599286784.0,
+            "87": 2599286784.0,
+            "88": 2599286784.0,
+            "89": 2599286784.0,
+            "90": 2599286784.0,
+            "91": 2599286784.0,
+            "92": 2599286784.0,
+            "93": 2599286784.0,
+            "94": 2599286784.0,
+            "95": 2599286784.0,
+            "96": 2599286784.0,
+            "97": 2599286784.0,
+            "98": 2599286784.0,
+            "99": 2599286784.0,
+            "100": 2599286784.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.45,
+            "52": 0.1176,
+            "53": 0.08802,
+            "54": 0.08699,
+            "55": 0.08722,
+            "56": 0.08722,
+            "57": 0.09047,
+            "58": 0.08702,
+            "59": 0.08774,
+            "60": 0.08696,
+            "61": 0.08697,
+            "62": 0.08669,
+            "63": 0.08744,
+            "64": 0.08973,
+            "65": 0.08942,
+            "66": 0.08847,
+            "67": 0.0878,
+            "68": 0.0868,
+            "69": 0.08686,
+            "70": 0.08743,
+            "71": 0.08699,
+            "72": 0.08754,
+            "73": 0.08641,
+            "74": 0.08819,
+            "75": 0.08738,
+            "76": 0.50165,
+            "77": 0.08865,
+            "78": 0.08729,
+            "79": 0.0866,
+            "80": 0.08763,
+            "81": 0.08755,
+            "82": 0.08768,
+            "83": 0.0877,
+            "84": 0.08704,
+            "85": 0.08686,
+            "86": 0.0893,
+            "87": 0.08757,
+            "88": 0.08695,
+            "89": 0.08918,
+            "90": 0.08715,
+            "91": 0.08682,
+            "92": 0.08819,
+            "93": 0.08755,
+            "94": 0.08919,
+            "95": 0.08702,
+            "96": 0.0863,
+            "97": 0.08852,
+            "98": 0.08865,
+            "99": 0.08679,
+            "100": 0.08757
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json
index 48aee8d379f..80a7902517d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.51973,
-            "2": 0.20593,
-            "3": 0.14945,
-            "4": 0.14775,
-            "5": 0.14785,
-            "6": 0.14767,
-            "7": 0.14754,
-            "8": 0.14649,
-            "9": 0.14636,
-            "10": 0.14713,
-            "11": 0.14628,
-            "12": 0.14658,
-            "13": 0.14581,
-            "14": 0.14652,
-            "15": 0.14657,
-            "16": 0.14585,
-            "17": 0.14783,
-            "18": 0.1469,
-            "19": 0.14603,
-            "20": 0.14662,
-            "21": 0.14635,
-            "22": 0.1461,
-            "23": 0.14688,
-            "24": 0.14579,
-            "25": 0.14587,
-            "26": 0.14836,
-            "27": 0.14598,
-            "28": 0.1458,
-            "29": 0.14604,
-            "30": 0.14624,
-            "31": 0.14719,
-            "32": 0.14625,
-            "33": 0.14582,
-            "34": 0.14603,
-            "35": 0.14619,
-            "36": 0.14587,
-            "37": 0.14585,
-            "38": 0.14625,
-            "39": 0.14572,
-            "40": 0.14629,
-            "41": 0.14561,
-            "42": 0.14587,
-            "43": 0.14672,
-            "44": 0.14572,
-            "45": 0.14618,
-            "46": 0.14622,
-            "47": 0.14572,
-            "48": 0.14538,
-            "49": 0.14571,
-            "50": 0.1457,
-            "51": 0.1553,
-            "52": 0.14793,
-            "53": 0.14797,
-            "54": 0.14774,
-            "55": 0.14702,
-            "56": 0.15765,
-            "57": 0.1544,
-            "58": 0.15368,
-            "59": 0.15399,
-            "60": 0.15366,
-            "61": 0.15362,
-            "62": 0.15351,
-            "63": 0.15339,
-            "64": 0.15353,
-            "65": 0.15154,
-            "66": 0.14531,
-            "67": 0.14661,
-            "68": 0.14599,
-            "69": 0.14546,
-            "70": 0.14633,
-            "71": 0.14568,
-            "72": 0.1461,
-            "73": 0.14601,
-            "74": 0.1469,
-            "75": 0.14561,
-            "76": 0.14575,
-            "77": 0.14581,
-            "78": 0.14634,
-            "79": 0.14619,
-            "80": 0.14627,
-            "81": 0.146,
-            "82": 0.14559,
-            "83": 0.14618,
-            "84": 0.14683,
-            "85": 0.14582,
-            "86": 0.1462,
-            "87": 0.14574,
-            "88": 0.14574,
-            "89": 0.14516,
-            "90": 0.14556,
-            "91": 0.146,
-            "92": 0.14702,
-            "93": 0.14541,
-            "94": 0.14625,
-            "95": 0.14586,
-            "96": 0.1455,
-            "97": 0.14559,
-            "98": 0.14614,
-            "99": 0.15005,
-            "100": 0.14598
+            "1": 6.65648,
+            "2": 0.19179,
+            "3": 0.15416,
+            "4": 0.14165,
+            "5": 0.14069,
+            "6": 0.14005,
+            "7": 0.14441,
+            "8": 0.14847,
+            "9": 0.14867,
+            "10": 0.15034,
+            "11": 0.14788,
+            "12": 0.14812,
+            "13": 0.14762,
+            "14": 0.14827,
+            "15": 0.14673,
+            "16": 0.14725,
+            "17": 0.14727,
+            "18": 0.14703,
+            "19": 0.14722,
+            "20": 0.14733,
+            "21": 0.14692,
+            "22": 0.14653,
+            "23": 0.14777,
+            "24": 0.14694,
+            "25": 0.14763,
+            "26": 0.1471,
+            "27": 0.14674,
+            "28": 0.14635,
+            "29": 0.14703,
+            "30": 0.14621,
+            "31": 0.14691,
+            "32": 0.14767,
+            "33": 0.14672,
+            "34": 0.14669,
+            "35": 0.14593,
+            "36": 0.14589,
+            "37": 0.14687,
+            "38": 0.14638,
+            "39": 0.14701,
+            "40": 0.14657,
+            "41": 0.14668,
+            "42": 0.14663,
+            "43": 0.14455,
+            "44": 0.13873,
+            "45": 0.13973,
+            "46": 0.13942,
+            "47": 0.13835,
+            "48": 0.13884,
+            "49": 0.13842,
+            "50": 0.13788,
+            "51": 0.14634,
+            "52": 0.14143,
+            "53": 0.13935,
+            "54": 0.14449,
+            "55": 0.13995,
+            "56": 0.14005,
+            "57": 0.13884,
+            "58": 0.13823,
+            "59": 0.13958,
+            "60": 0.13806,
+            "61": 0.13998,
+            "62": 0.1391,
+            "63": 0.13808,
+            "64": 0.1378,
+            "65": 0.13831,
+            "66": 0.13766,
+            "67": 0.13871,
+            "68": 0.13842,
+            "69": 0.13825,
+            "70": 0.14322,
+            "71": 0.13773,
+            "72": 0.13739,
+            "73": 0.1379,
+            "74": 0.13895,
+            "75": 0.14238,
+            "76": 0.14002,
+            "77": 0.13711,
+            "78": 0.13768,
+            "79": 0.13786,
+            "80": 0.13681,
+            "81": 0.13744,
+            "82": 0.13817,
+            "83": 0.13649,
+            "84": 0.13687,
+            "85": 0.13779,
+            "86": 0.14075,
+            "87": 0.13645,
+            "88": 0.1389,
+            "89": 0.13781,
+            "90": 0.13671,
+            "91": 0.13682,
+            "92": 0.13637,
+            "93": 0.13642,
+            "94": 0.13696,
+            "95": 0.13741,
+            "96": 0.1363,
+            "97": 0.13656,
+            "98": 0.13634,
+            "99": 0.13708,
+            "100": 0.14224
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..2287a0ab752
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.79157,
+            "52": 9.68731,
+            "53": 10.02181,
+            "54": 9.90398,
+            "55": 9.82389,
+            "56": 9.57081,
+            "57": 9.40818,
+            "58": 9.77678,
+            "59": 9.52729,
+            "60": 9.44284,
+            "61": 9.64071,
+            "62": 9.94046,
+            "63": 9.31099,
+            "64": 9.72506,
+            "65": 8.8916,
+            "66": 9.6525,
+            "67": 9.31718,
+            "68": 9.73957,
+            "69": 9.74304,
+            "70": 9.67942,
+            "71": 9.56228,
+            "72": 9.53149,
+            "73": 9.44531,
+            "74": 8.88431,
+            "75": 9.3677,
+            "76": 9.02482,
+            "77": 10.01647,
+            "78": 9.6813,
+            "79": 9.32719,
+            "80": 9.3577,
+            "81": 9.43335,
+            "82": 9.64804,
+            "83": 9.25573,
+            "84": 9.36738,
+            "85": 9.56091,
+            "86": 9.03567,
+            "87": 9.54622,
+            "88": 9.70041,
+            "89": 9.54992,
+            "90": 9.77126,
+            "91": 9.28801,
+            "92": 9.31055,
+            "93": 9.03195,
+            "94": 8.78121,
+            "95": 9.48115,
+            "96": 9.4759,
+            "97": 9.2489,
+            "98": 9.61705,
+            "99": 8.8368,
+            "100": 9.35043
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2482.0,
+            "52": 2570.0,
+            "53": 2835.0,
+            "54": 2589.0,
+            "55": 2450.0,
+            "56": 2744.0,
+            "57": 2429.0,
+            "58": 2684.0,
+            "59": 2748.0,
+            "60": 2464.0,
+            "61": 2995.0,
+            "62": 2518.0,
+            "63": 2570.0,
+            "64": 2843.0,
+            "65": 2648.0,
+            "66": 2842.0,
+            "67": 2954.0,
+            "68": 2833.0,
+            "69": 3027.0,
+            "70": 2993.0,
+            "71": 3010.0,
+            "72": 2597.0,
+            "73": 3002.0,
+            "74": 2325.0,
+            "75": 2882.0,
+            "76": 3143.0,
+            "77": 3062.0,
+            "78": 3272.0,
+            "79": 3303.0,
+            "80": 3280.0,
+            "81": 3517.0,
+            "82": 3283.0,
+            "83": 2834.0,
+            "84": 3365.0,
+            "85": 3288.0,
+            "86": 2562.0,
+            "87": 3493.0,
+            "88": 3388.0,
+            "89": 3102.0,
+            "90": 3230.0,
+            "91": 3154.0,
+            "92": 3263.0,
+            "93": 2967.0,
+            "94": 3520.0,
+            "95": 3175.0,
+            "96": 3317.0,
+            "97": 2999.0,
+            "98": 3549.0,
+            "99": 3248.0,
+            "100": 3227.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 888098816.0,
+            "52": 888098816.0,
+            "53": 888098816.0,
+            "54": 888098816.0,
+            "55": 888098816.0,
+            "56": 888098816.0,
+            "57": 888098816.0,
+            "58": 888098816.0,
+            "59": 888098816.0,
+            "60": 888098816.0,
+            "61": 888098816.0,
+            "62": 888098816.0,
+            "63": 888098816.0,
+            "64": 888098816.0,
+            "65": 888098816.0,
+            "66": 888098816.0,
+            "67": 888098816.0,
+            "68": 888098816.0,
+            "69": 888098816.0,
+            "70": 888098816.0,
+            "71": 888098816.0,
+            "72": 888098816.0,
+            "73": 888098816.0,
+            "74": 888098816.0,
+            "75": 888098816.0,
+            "76": 888098816.0,
+            "77": 888098816.0,
+            "78": 888098816.0,
+            "79": 888098816.0,
+            "80": 888098816.0,
+            "81": 888098816.0,
+            "82": 888098816.0,
+            "83": 888098816.0,
+            "84": 888098816.0,
+            "85": 888098816.0,
+            "86": 888098816.0,
+            "87": 888098816.0,
+            "88": 888098816.0,
+            "89": 888098816.0,
+            "90": 888098816.0,
+            "91": 888098816.0,
+            "92": 888098816.0,
+            "93": 888098816.0,
+            "94": 888098816.0,
+            "95": 888098816.0,
+            "96": 888098816.0,
+            "97": 888098816.0,
+            "98": 888098816.0,
+            "99": 888098816.0,
+            "100": 888098816.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2595090432.0,
+            "52": 2595091456.0,
+            "53": 2595091456.0,
+            "54": 2595091456.0,
+            "55": 2595091456.0,
+            "56": 2595091456.0,
+            "57": 2595091456.0,
+            "58": 2595091456.0,
+            "59": 2595091456.0,
+            "60": 2595091456.0,
+            "61": 2595091456.0,
+            "62": 2595091456.0,
+            "63": 2595091456.0,
+            "64": 2595091456.0,
+            "65": 2595091456.0,
+            "66": 2595091456.0,
+            "67": 2595091456.0,
+            "68": 2595091456.0,
+            "69": 2595091456.0,
+            "70": 2595091456.0,
+            "71": 2595091456.0,
+            "72": 2595091456.0,
+            "73": 2595091456.0,
+            "74": 2595091456.0,
+            "75": 2595091456.0,
+            "76": 2595091456.0,
+            "77": 2595091456.0,
+            "78": 2595091456.0,
+            "79": 2595091456.0,
+            "80": 2595091456.0,
+            "81": 2595091456.0,
+            "82": 2595091456.0,
+            "83": 2595091456.0,
+            "84": 2595091456.0,
+            "85": 2595091456.0,
+            "86": 2595091456.0,
+            "87": 2595091456.0,
+            "88": 2595091456.0,
+            "89": 2595091456.0,
+            "90": 2595091456.0,
+            "91": 2595091456.0,
+            "92": 2595091456.0,
+            "93": 2595091456.0,
+            "94": 2595091456.0,
+            "95": 2595091456.0,
+            "96": 2595091456.0,
+            "97": 2595091456.0,
+            "98": 2595091456.0,
+            "99": 2595091456.0,
+            "100": 2595091456.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.7416,
+            "52": 0.17157,
+            "53": 0.14456,
+            "54": 0.14361,
+            "55": 0.14299,
+            "56": 0.14258,
+            "57": 0.14257,
+            "58": 0.14319,
+            "59": 0.14348,
+            "60": 0.1429,
+            "61": 0.14295,
+            "62": 0.1431,
+            "63": 0.1419,
+            "64": 0.14379,
+            "65": 0.59005,
+            "66": 0.15082,
+            "67": 0.14226,
+            "68": 0.14098,
+            "69": 0.14096,
+            "70": 0.1413,
+            "71": 0.14073,
+            "72": 0.14094,
+            "73": 0.14097,
+            "74": 0.14117,
+            "75": 0.14054,
+            "76": 0.14081,
+            "77": 0.14153,
+            "78": 0.59387,
+            "79": 0.14301,
+            "80": 0.14139,
+            "81": 0.14173,
+            "82": 0.1418,
+            "83": 0.14133,
+            "84": 0.14096,
+            "85": 0.14024,
+            "86": 0.14063,
+            "87": 0.14049,
+            "88": 0.14117,
+            "89": 0.14144,
+            "90": 0.14055,
+            "91": 0.14175,
+            "92": 0.14246,
+            "93": 0.14114,
+            "94": 0.14391,
+            "95": 0.14119,
+            "96": 0.14114,
+            "97": 0.14158,
+            "98": 0.1408,
+            "99": 0.14214,
+            "100": 0.14462
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml
index c75d099790f..37b5f5d7471 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml
index ffabf9583f6..7ddf65f4ca8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml
@@ -46,7 +46,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml
index d91e9be4f54..9a3947b5e71 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --attention-backend: unfused
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..63c74381364
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83582,
+            "2": 10.83571,
+            "3": 10.83523,
+            "4": 10.79949,
+            "5": 10.84909,
+            "6": 10.86563,
+            "7": 10.82789,
+            "8": 10.8363,
+            "9": 10.83997,
+            "10": 10.79865,
+            "11": 10.8677,
+            "12": 10.84994,
+            "13": 10.85915,
+            "14": 10.86874,
+            "15": 10.80173,
+            "16": 10.79183,
+            "17": 10.77353,
+            "18": 10.78739,
+            "19": 10.78983,
+            "20": 10.68446,
+            "21": 10.6784,
+            "22": 10.5257,
+            "23": 10.70726,
+            "24": 10.56551,
+            "25": 10.51602,
+            "26": 10.58017,
+            "27": 10.58981,
+            "28": 10.54551,
+            "29": 10.57726,
+            "30": 10.34051,
+            "31": 10.07051,
+            "32": 10.44503,
+            "33": 10.44293,
+            "34": 10.19391,
+            "35": 10.24261,
+            "36": 10.19236,
+            "37": 10.32969,
+            "38": 10.16551,
+            "39": 10.38729,
+            "40": 10.05174,
+            "41": 10.12191,
+            "42": 10.19259,
+            "43": 9.8069,
+            "44": 9.92475,
+            "45": 9.80639,
+            "46": 9.80145,
+            "47": 10.12104,
+            "48": 9.83127,
+            "49": 9.50404,
+            "50": 9.87954,
+            "51": 9.83807,
+            "52": 9.72058,
+            "53": 10.0568,
+            "54": 9.95032,
+            "55": 9.88328,
+            "56": 9.60431,
+            "57": 9.45518,
+            "58": 9.81927,
+            "59": 9.58262,
+            "60": 9.48844,
+            "61": 9.68577,
+            "62": 9.97779,
+            "63": 9.36765,
+            "64": 9.75913,
+            "65": 8.9376,
+            "66": 9.69257,
+            "67": 9.36621,
+            "68": 9.78303,
+            "69": 9.79318,
+            "70": 9.72699,
+            "71": 9.62875,
+            "72": 9.58004,
+            "73": 9.487,
+            "74": 8.92041,
+            "75": 9.41128,
+            "76": 9.07564,
+            "77": 10.05848,
+            "78": 9.72184,
+            "79": 9.3732,
+            "80": 9.40079,
+            "81": 9.4792,
+            "82": 9.69754,
+            "83": 9.31037,
+            "84": 9.41777,
+            "85": 9.61194,
+            "86": 9.07155,
+            "87": 9.59661,
+            "88": 9.74709,
+            "89": 9.59667,
+            "90": 9.82915,
+            "91": 9.33725,
+            "92": 9.3564,
+            "93": 9.08552,
+            "94": 8.82807,
+            "95": 9.52842,
+            "96": 9.52611,
+            "97": 9.30632,
+            "98": 9.66808,
+            "99": 8.89461,
+            "100": 9.40666
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1536.0,
+            "2": 1592.0,
+            "3": 1551.0,
+            "4": 1769.0,
+            "5": 1824.0,
+            "6": 1800.0,
+            "7": 1734.0,
+            "8": 1619.0,
+            "9": 1829.0,
+            "10": 1355.0,
+            "11": 1911.0,
+            "12": 1721.0,
+            "13": 1913.0,
+            "14": 1708.0,
+            "15": 1919.0,
+            "16": 1938.0,
+            "17": 1740.0,
+            "18": 1676.0,
+            "19": 1743.0,
+            "20": 1535.0,
+            "21": 1797.0,
+            "22": 1661.0,
+            "23": 1887.0,
+            "24": 1666.0,
+            "25": 1633.0,
+            "26": 1676.0,
+            "27": 1740.0,
+            "28": 1991.0,
+            "29": 1918.0,
+            "30": 1806.0,
+            "31": 1588.0,
+            "32": 1863.0,
+            "33": 2126.0,
+            "34": 1812.0,
+            "35": 1976.0,
+            "36": 1875.0,
+            "37": 2301.0,
+            "38": 2131.0,
+            "39": 2351.0,
+            "40": 2130.0,
+            "41": 2391.0,
+            "42": 2255.0,
+            "43": 1975.0,
+            "44": 2138.0,
+            "45": 2208.0,
+            "46": 2364.0,
+            "47": 2564.0,
+            "48": 2337.0,
+            "49": 2142.0,
+            "50": 2423.0,
+            "51": 2546.0,
+            "52": 2590.0,
+            "53": 2879.0,
+            "54": 2697.0,
+            "55": 2316.0,
+            "56": 2549.0,
+            "57": 2261.0,
+            "58": 2904.0,
+            "59": 2740.0,
+            "60": 2434.0,
+            "61": 2801.0,
+            "62": 2663.0,
+            "63": 2502.0,
+            "64": 2948.0,
+            "65": 2644.0,
+            "66": 2961.0,
+            "67": 2813.0,
+            "68": 2686.0,
+            "69": 2912.0,
+            "70": 3096.0,
+            "71": 2854.0,
+            "72": 2454.0,
+            "73": 3081.0,
+            "74": 1933.0,
+            "75": 2465.0,
+            "76": 3012.0,
+            "77": 3163.0,
+            "78": 2997.0,
+            "79": 3089.0,
+            "80": 3187.0,
+            "81": 3500.0,
+            "82": 3339.0,
+            "83": 2705.0,
+            "84": 3205.0,
+            "85": 3033.0,
+            "86": 2818.0,
+            "87": 3671.0,
+            "88": 3190.0,
+            "89": 3336.0,
+            "90": 3320.0,
+            "91": 2698.0,
+            "92": 3072.0,
+            "93": 2750.0,
+            "94": 3397.0,
+            "95": 3317.0,
+            "96": 3290.0,
+            "97": 3116.0,
+            "98": 3732.0,
+            "99": 3049.0,
+            "100": 2974.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 745146880.0,
+            "2": 745146880.0,
+            "3": 745146880.0,
+            "4": 745146880.0,
+            "5": 745146880.0,
+            "6": 745146880.0,
+            "7": 745146880.0,
+            "8": 745146880.0,
+            "9": 745146880.0,
+            "10": 745146880.0,
+            "11": 745146880.0,
+            "12": 745146880.0,
+            "13": 745146880.0,
+            "14": 745146880.0,
+            "15": 745146880.0,
+            "16": 745146880.0,
+            "17": 745146880.0,
+            "18": 745146880.0,
+            "19": 745146880.0,
+            "20": 745146880.0,
+            "21": 745146880.0,
+            "22": 745146880.0,
+            "23": 745146880.0,
+            "24": 745146880.0,
+            "25": 745146880.0,
+            "26": 745146880.0,
+            "27": 745146880.0,
+            "28": 745146880.0,
+            "29": 745146880.0,
+            "30": 745146880.0,
+            "31": 745146880.0,
+            "32": 745146880.0,
+            "33": 745146880.0,
+            "34": 745146880.0,
+            "35": 745146880.0,
+            "36": 745146880.0,
+            "37": 745146880.0,
+            "38": 745146880.0,
+            "39": 745146880.0,
+            "40": 745146880.0,
+            "41": 745146880.0,
+            "42": 745146880.0,
+            "43": 745146880.0,
+            "44": 745146880.0,
+            "45": 745146880.0,
+            "46": 745146880.0,
+            "47": 745146880.0,
+            "48": 745146880.0,
+            "49": 745146880.0,
+            "50": 745146880.0,
+            "51": 745146880.0,
+            "52": 745146880.0,
+            "53": 745146880.0,
+            "54": 745146880.0,
+            "55": 745146880.0,
+            "56": 745146880.0,
+            "57": 745146880.0,
+            "58": 745146880.0,
+            "59": 745146880.0,
+            "60": 745146880.0,
+            "61": 745146880.0,
+            "62": 745146880.0,
+            "63": 745146880.0,
+            "64": 745146880.0,
+            "65": 745146880.0,
+            "66": 745146880.0,
+            "67": 745146880.0,
+            "68": 745146880.0,
+            "69": 745146880.0,
+            "70": 745146880.0,
+            "71": 745146880.0,
+            "72": 745146880.0,
+            "73": 745146880.0,
+            "74": 745146880.0,
+            "75": 745146880.0,
+            "76": 745146880.0,
+            "77": 745146880.0,
+            "78": 745146880.0,
+            "79": 745146880.0,
+            "80": 745146880.0,
+            "81": 745146880.0,
+            "82": 745146880.0,
+            "83": 745146880.0,
+            "84": 745146880.0,
+            "85": 745146880.0,
+            "86": 745146880.0,
+            "87": 745146880.0,
+            "88": 745146880.0,
+            "89": 745146880.0,
+            "90": 745146880.0,
+            "91": 745146880.0,
+            "92": 745146880.0,
+            "93": 745146880.0,
+            "94": 745146880.0,
+            "95": 745146880.0,
+            "96": 745146880.0,
+            "97": 745146880.0,
+            "98": 745146880.0,
+            "99": 745146880.0,
+            "100": 745146880.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1939785728.0,
+            "2": 2222434304.0,
+            "3": 2222434304.0,
+            "4": 2222434304.0,
+            "5": 2222434304.0,
+            "6": 2222434304.0,
+            "7": 2222434304.0,
+            "8": 2222434304.0,
+            "9": 2222434304.0,
+            "10": 2222434304.0,
+            "11": 2222434304.0,
+            "12": 2222434304.0,
+            "13": 2222434304.0,
+            "14": 2222434304.0,
+            "15": 2222434304.0,
+            "16": 2222434304.0,
+            "17": 2222434304.0,
+            "18": 2222434304.0,
+            "19": 2222434304.0,
+            "20": 2222434304.0,
+            "21": 2222434304.0,
+            "22": 2222434304.0,
+            "23": 2222434304.0,
+            "24": 2222434304.0,
+            "25": 2222434304.0,
+            "26": 2222434304.0,
+            "27": 2222434304.0,
+            "28": 2222434304.0,
+            "29": 2222434304.0,
+            "30": 2222434304.0,
+            "31": 2222434304.0,
+            "32": 2222434304.0,
+            "33": 2222434304.0,
+            "34": 2222434304.0,
+            "35": 2222434304.0,
+            "36": 2222434304.0,
+            "37": 2222434304.0,
+            "38": 2222434304.0,
+            "39": 2222434304.0,
+            "40": 2222434304.0,
+            "41": 2222434304.0,
+            "42": 2222434304.0,
+            "43": 2222434304.0,
+            "44": 2222434304.0,
+            "45": 2222434304.0,
+            "46": 2222434304.0,
+            "47": 2222434304.0,
+            "48": 2222434304.0,
+            "49": 2222434304.0,
+            "50": 2222434304.0,
+            "51": 2222434304.0,
+            "52": 2222434304.0,
+            "53": 2222434304.0,
+            "54": 2222434304.0,
+            "55": 2222434304.0,
+            "56": 2222434304.0,
+            "57": 2222434304.0,
+            "58": 2222434304.0,
+            "59": 2222434304.0,
+            "60": 2222434304.0,
+            "61": 2222434304.0,
+            "62": 2222434304.0,
+            "63": 2222434304.0,
+            "64": 2222434304.0,
+            "65": 2222434304.0,
+            "66": 2222434304.0,
+            "67": 2222434304.0,
+            "68": 2222434304.0,
+            "69": 2222434304.0,
+            "70": 2222434304.0,
+            "71": 2222434304.0,
+            "72": 2222434304.0,
+            "73": 2222434304.0,
+            "74": 2222434304.0,
+            "75": 2222434304.0,
+            "76": 2222434304.0,
+            "77": 2222434304.0,
+            "78": 2222434304.0,
+            "79": 2222434304.0,
+            "80": 2222434304.0,
+            "81": 2222434304.0,
+            "82": 2222434304.0,
+            "83": 2222434304.0,
+            "84": 2222434304.0,
+            "85": 2222434304.0,
+            "86": 2222434304.0,
+            "87": 2222434304.0,
+            "88": 2222434304.0,
+            "89": 2222434304.0,
+            "90": 2222434304.0,
+            "91": 2222434304.0,
+            "92": 2222434304.0,
+            "93": 2222434304.0,
+            "94": 2222434304.0,
+            "95": 2222434304.0,
+            "96": 2222434304.0,
+            "97": 2222434304.0,
+            "98": 2222434304.0,
+            "99": 2222434304.0,
+            "100": 2222434304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.20838,
+            "3": 0.13042,
+            "4": 0.11826,
+            "5": 0.11718,
+            "6": 0.11797,
+            "7": 0.1177,
+            "8": 0.11717,
+            "9": 0.11846,
+            "10": 0.11778,
+            "11": 0.11712,
+            "12": 0.11866,
+            "13": 0.12004,
+            "14": 0.11788,
+            "15": 0.11787,
+            "16": 0.1181,
+            "17": 0.11903,
+            "18": 0.11843,
+            "19": 0.11754,
+            "20": 0.11834,
+            "21": 0.11897,
+            "22": 0.12726,
+            "23": 0.13834,
+            "24": 0.15039,
+            "25": 0.14107,
+            "26": 0.14586,
+            "27": 0.16343,
+            "28": 0.2297,
+            "29": 0.26681,
+            "30": 0.19748,
+            "31": 0.2586,
+            "32": 0.12068,
+            "33": 0.11944,
+            "34": 0.11896,
+            "35": 0.11984,
+            "36": 0.11823,
+            "37": 0.11997,
+            "38": 0.11949,
+            "39": 0.11877,
+            "40": 0.11898,
+            "41": 0.11996,
+            "42": 0.11893,
+            "43": 0.12547,
+            "44": 0.13195,
+            "45": 0.12144,
+            "46": 0.11997,
+            "47": 0.12005,
+            "48": 0.11855,
+            "49": 0.11944,
+            "50": 0.11842,
+            "51": 0.14635,
+            "52": 0.12016,
+            "53": 0.11762,
+            "54": 0.11802,
+            "55": 0.1184,
+            "56": 0.11774,
+            "57": 0.12181,
+            "58": 0.11784,
+            "59": 0.11936,
+            "60": 0.11831,
+            "61": 0.11819,
+            "62": 0.11807,
+            "63": 0.11828,
+            "64": 0.11663,
+            "65": 0.11901,
+            "66": 0.1168,
+            "67": 0.1167,
+            "68": 0.12002,
+            "69": 0.12016,
+            "70": 0.1186,
+            "71": 0.11772,
+            "72": 0.1189,
+            "73": 0.11915,
+            "74": 0.11908,
+            "75": 0.11898,
+            "76": 0.11863,
+            "77": 0.11869,
+            "78": 0.11971,
+            "79": 0.11843,
+            "80": 0.1198,
+            "81": 0.12003,
+            "82": 0.11885,
+            "83": 0.11905,
+            "84": 0.12002,
+            "85": 0.1192,
+            "86": 0.11872,
+            "87": 0.11777,
+            "88": 0.11801,
+            "89": 0.11864,
+            "90": 0.11769,
+            "91": 0.11692,
+            "92": 0.12015,
+            "93": 0.12072,
+            "94": 0.11802,
+            "95": 0.11798,
+            "96": 0.12278,
+            "97": 0.11941,
+            "98": 0.1174,
+            "99": 0.11816,
+            "100": 0.12102
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..12556b60c96
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83807,
+            "52": 9.72058,
+            "53": 10.0568,
+            "54": 9.95032,
+            "55": 9.88328,
+            "56": 9.60431,
+            "57": 9.45518,
+            "58": 9.81927,
+            "59": 9.58262,
+            "60": 9.48844,
+            "61": 9.68577,
+            "62": 9.97779,
+            "63": 9.36765,
+            "64": 9.75913,
+            "65": 8.9376,
+            "66": 9.69257,
+            "67": 9.36621,
+            "68": 9.78303,
+            "69": 9.79318,
+            "70": 9.72699,
+            "71": 9.62875,
+            "72": 9.58004,
+            "73": 9.487,
+            "74": 8.92041,
+            "75": 9.41128,
+            "76": 9.07564,
+            "77": 10.05848,
+            "78": 9.72184,
+            "79": 9.3732,
+            "80": 9.40079,
+            "81": 9.4792,
+            "82": 9.69754,
+            "83": 9.31037,
+            "84": 9.41777,
+            "85": 9.61194,
+            "86": 9.07155,
+            "87": 9.59661,
+            "88": 9.74709,
+            "89": 9.59667,
+            "90": 9.82915,
+            "91": 9.33725,
+            "92": 9.3564,
+            "93": 9.08552,
+            "94": 8.82807,
+            "95": 9.52842,
+            "96": 9.52611,
+            "97": 9.30632,
+            "98": 9.66808,
+            "99": 8.89461,
+            "100": 9.40666
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2546.0,
+            "52": 2590.0,
+            "53": 2879.0,
+            "54": 2697.0,
+            "55": 2316.0,
+            "56": 2549.0,
+            "57": 2261.0,
+            "58": 2904.0,
+            "59": 2740.0,
+            "60": 2434.0,
+            "61": 2801.0,
+            "62": 2663.0,
+            "63": 2502.0,
+            "64": 2948.0,
+            "65": 2644.0,
+            "66": 2961.0,
+            "67": 2813.0,
+            "68": 2686.0,
+            "69": 2912.0,
+            "70": 3096.0,
+            "71": 2854.0,
+            "72": 2454.0,
+            "73": 3081.0,
+            "74": 1933.0,
+            "75": 2465.0,
+            "76": 3012.0,
+            "77": 3163.0,
+            "78": 2997.0,
+            "79": 3089.0,
+            "80": 3187.0,
+            "81": 3500.0,
+            "82": 3339.0,
+            "83": 2705.0,
+            "84": 3205.0,
+            "85": 3033.0,
+            "86": 2818.0,
+            "87": 3671.0,
+            "88": 3190.0,
+            "89": 3336.0,
+            "90": 3320.0,
+            "91": 2698.0,
+            "92": 3072.0,
+            "93": 2750.0,
+            "94": 3397.0,
+            "95": 3317.0,
+            "96": 3290.0,
+            "97": 3116.0,
+            "98": 3732.0,
+            "99": 3049.0,
+            "100": 2974.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746195456.0,
+            "52": 746195456.0,
+            "53": 746195456.0,
+            "54": 746195456.0,
+            "55": 746195456.0,
+            "56": 746195456.0,
+            "57": 746195456.0,
+            "58": 746195456.0,
+            "59": 746195456.0,
+            "60": 746195456.0,
+            "61": 746195456.0,
+            "62": 746195456.0,
+            "63": 746195456.0,
+            "64": 746195456.0,
+            "65": 746195456.0,
+            "66": 746195456.0,
+            "67": 746195456.0,
+            "68": 746195456.0,
+            "69": 746195456.0,
+            "70": 746195456.0,
+            "71": 746195456.0,
+            "72": 746195456.0,
+            "73": 746195456.0,
+            "74": 746195456.0,
+            "75": 746195456.0,
+            "76": 746195456.0,
+            "77": 746195456.0,
+            "78": 746195456.0,
+            "79": 746195456.0,
+            "80": 746195456.0,
+            "81": 746195456.0,
+            "82": 746195456.0,
+            "83": 746195456.0,
+            "84": 746195456.0,
+            "85": 746195456.0,
+            "86": 746195456.0,
+            "87": 746195456.0,
+            "88": 746195456.0,
+            "89": 746195456.0,
+            "90": 746195456.0,
+            "91": 746195456.0,
+            "92": 746195456.0,
+            "93": 746195456.0,
+            "94": 746195456.0,
+            "95": 746195456.0,
+            "96": 746195456.0,
+            "97": 746195456.0,
+            "98": 746195456.0,
+            "99": 746195456.0,
+            "100": 746195456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2223482880.0,
+            "52": 2223483904.0,
+            "53": 2223483904.0,
+            "54": 2223483904.0,
+            "55": 2223483904.0,
+            "56": 2223483904.0,
+            "57": 2223483904.0,
+            "58": 2223483904.0,
+            "59": 2223483904.0,
+            "60": 2223483904.0,
+            "61": 2223483904.0,
+            "62": 2223483904.0,
+            "63": 2223483904.0,
+            "64": 2223483904.0,
+            "65": 2223483904.0,
+            "66": 2223483904.0,
+            "67": 2223483904.0,
+            "68": 2223483904.0,
+            "69": 2223483904.0,
+            "70": 2223483904.0,
+            "71": 2223483904.0,
+            "72": 2223483904.0,
+            "73": 2223483904.0,
+            "74": 2223483904.0,
+            "75": 2223483904.0,
+            "76": 2223483904.0,
+            "77": 2223483904.0,
+            "78": 2223483904.0,
+            "79": 2223483904.0,
+            "80": 2223483904.0,
+            "81": 2223483904.0,
+            "82": 2223483904.0,
+            "83": 2223483904.0,
+            "84": 2223483904.0,
+            "85": 2223483904.0,
+            "86": 2223483904.0,
+            "87": 2223483904.0,
+            "88": 2223483904.0,
+            "89": 2223483904.0,
+            "90": 2223483904.0,
+            "91": 2223483904.0,
+            "92": 2223483904.0,
+            "93": 2223483904.0,
+            "94": 2223483904.0,
+            "95": 2223483904.0,
+            "96": 2223483904.0,
+            "97": 2223483904.0,
+            "98": 2223483904.0,
+            "99": 2223483904.0,
+            "100": 2223483904.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 1.94458,
+            "53": 0.13487,
+            "54": 0.12133,
+            "55": 0.12128,
+            "56": 0.12059,
+            "57": 0.11937,
+            "58": 0.11813,
+            "59": 0.11931,
+            "60": 0.12225,
+            "61": 0.1198,
+            "62": 0.1197,
+            "63": 0.12083,
+            "64": 0.12132,
+            "65": 0.12067,
+            "66": 0.12047,
+            "67": 0.12065,
+            "68": 0.12005,
+            "69": 0.12047,
+            "70": 0.11977,
+            "71": 0.1205,
+            "72": 0.11909,
+            "73": 0.11956,
+            "74": 0.12277,
+            "75": 0.11982,
+            "76": 0.12087,
+            "77": 0.12003,
+            "78": 0.12188,
+            "79": 0.12094,
+            "80": 0.12076,
+            "81": 0.12072,
+            "82": 0.12053,
+            "83": 0.11961,
+            "84": 0.12306,
+            "85": 0.12275,
+            "86": 0.11989,
+            "87": 0.11996,
+            "88": 0.1294,
+            "89": 0.12077,
+            "90": 0.1204,
+            "91": 0.12138,
+            "92": 0.11998,
+            "93": 0.12202,
+            "94": 0.12092,
+            "95": 0.11985,
+            "96": 0.11995,
+            "97": 0.12124,
+            "98": 0.12243,
+            "99": 0.12016,
+            "100": 0.12049
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json
index 72743900cff..5b2aa3ce19c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 746194432.0,
-            "2": 746194432.0,
-            "3": 746194432.0,
-            "4": 746194432.0,
-            "5": 746194432.0,
-            "6": 746194432.0,
-            "7": 746194432.0,
-            "8": 746194432.0,
-            "9": 746194432.0,
-            "10": 746194432.0,
-            "11": 746194432.0,
-            "12": 746194432.0,
-            "13": 746194432.0,
-            "14": 746194432.0,
-            "15": 746194432.0,
-            "16": 746194432.0,
-            "17": 746194432.0,
-            "18": 746194432.0,
-            "19": 746194432.0,
-            "20": 746194432.0,
-            "21": 746194432.0,
-            "22": 746194432.0,
-            "23": 746194432.0,
-            "24": 746194432.0,
-            "25": 746194432.0,
-            "26": 746194432.0,
-            "27": 746194432.0,
-            "28": 746194432.0,
-            "29": 746194432.0,
-            "30": 746194432.0,
-            "31": 746194432.0,
-            "32": 746194432.0,
-            "33": 746194432.0,
-            "34": 746194432.0,
-            "35": 746194432.0,
-            "36": 746194432.0,
-            "37": 746194432.0,
-            "38": 746194432.0,
-            "39": 746194432.0,
-            "40": 746194432.0,
-            "41": 746194432.0,
-            "42": 746194432.0,
-            "43": 746194432.0,
-            "44": 746194432.0,
-            "45": 746194432.0,
-            "46": 746194432.0,
-            "47": 746194432.0,
-            "48": 746194432.0,
-            "49": 746194432.0,
-            "50": 746194432.0,
-            "51": 746194432.0,
-            "52": 746194432.0,
-            "53": 746194432.0,
-            "54": 746194432.0,
-            "55": 746194432.0,
-            "56": 746194432.0,
-            "57": 746194432.0,
-            "58": 746194432.0,
-            "59": 746194432.0,
-            "60": 746194432.0,
-            "61": 746194432.0,
-            "62": 746194432.0,
-            "63": 746194432.0,
-            "64": 746194432.0,
-            "65": 746194432.0,
-            "66": 746194432.0,
-            "67": 746194432.0,
-            "68": 746194432.0,
-            "69": 746194432.0,
-            "70": 746194432.0,
-            "71": 746194432.0,
-            "72": 746194432.0,
-            "73": 746194432.0,
-            "74": 746194432.0,
-            "75": 746194432.0,
-            "76": 746194432.0,
-            "77": 746194432.0,
-            "78": 746194432.0,
-            "79": 746194432.0,
-            "80": 746194432.0,
-            "81": 746194432.0,
-            "82": 746194432.0,
-            "83": 746194432.0,
-            "84": 746194432.0,
-            "85": 746194432.0,
-            "86": 746194432.0,
-            "87": 746194432.0,
-            "88": 746194432.0,
-            "89": 746194432.0,
-            "90": 746194432.0,
-            "91": 746194432.0,
-            "92": 746194432.0,
-            "93": 746194432.0,
-            "94": 746194432.0,
-            "95": 746194432.0,
-            "96": 746194432.0,
-            "97": 746194432.0,
-            "98": 746194432.0,
-            "99": 746194432.0,
-            "100": 746194432.0
+            "1": 747244032.0,
+            "2": 747244032.0,
+            "3": 747244032.0,
+            "4": 747244032.0,
+            "5": 747244032.0,
+            "6": 747244032.0,
+            "7": 747244032.0,
+            "8": 747244032.0,
+            "9": 747244032.0,
+            "10": 747244032.0,
+            "11": 747244032.0,
+            "12": 747244032.0,
+            "13": 747244032.0,
+            "14": 747244032.0,
+            "15": 747244032.0,
+            "16": 747244032.0,
+            "17": 747244032.0,
+            "18": 747244032.0,
+            "19": 747244032.0,
+            "20": 747244032.0,
+            "21": 747244032.0,
+            "22": 747244032.0,
+            "23": 747244032.0,
+            "24": 747244032.0,
+            "25": 747244032.0,
+            "26": 747244032.0,
+            "27": 747244032.0,
+            "28": 747244032.0,
+            "29": 747244032.0,
+            "30": 747244032.0,
+            "31": 747244032.0,
+            "32": 747244032.0,
+            "33": 747244032.0,
+            "34": 747244032.0,
+            "35": 747244032.0,
+            "36": 747244032.0,
+            "37": 747244032.0,
+            "38": 747244032.0,
+            "39": 747244032.0,
+            "40": 747244032.0,
+            "41": 747244032.0,
+            "42": 747244032.0,
+            "43": 747244032.0,
+            "44": 747244032.0,
+            "45": 747244032.0,
+            "46": 747244032.0,
+            "47": 747244032.0,
+            "48": 747244032.0,
+            "49": 747244032.0,
+            "50": 747244032.0,
+            "51": 747244032.0,
+            "52": 747244032.0,
+            "53": 747244032.0,
+            "54": 747244032.0,
+            "55": 747244032.0,
+            "56": 747244032.0,
+            "57": 747244032.0,
+            "58": 747244032.0,
+            "59": 747244032.0,
+            "60": 747244032.0,
+            "61": 747244032.0,
+            "62": 747244032.0,
+            "63": 747244032.0,
+            "64": 747244032.0,
+            "65": 747244032.0,
+            "66": 747244032.0,
+            "67": 747244032.0,
+            "68": 747244032.0,
+            "69": 747244032.0,
+            "70": 747244032.0,
+            "71": 747244032.0,
+            "72": 747244032.0,
+            "73": 747244032.0,
+            "74": 747244032.0,
+            "75": 747244032.0,
+            "76": 747244032.0,
+            "77": 747244032.0,
+            "78": 747244032.0,
+            "79": 747244032.0,
+            "80": 747244032.0,
+            "81": 747244032.0,
+            "82": 747244032.0,
+            "83": 747244032.0,
+            "84": 747244032.0,
+            "85": 747244032.0,
+            "86": 747244032.0,
+            "87": 747244032.0,
+            "88": 747244032.0,
+            "89": 747244032.0,
+            "90": 747244032.0,
+            "91": 747244032.0,
+            "92": 747244032.0,
+            "93": 747244032.0,
+            "94": 747244032.0,
+            "95": 747244032.0,
+            "96": 747244032.0,
+            "97": 747244032.0,
+            "98": 747244032.0,
+            "99": 747244032.0,
+            "100": 747244032.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1926153216.0,
-            "2": 2209851392.0,
-            "3": 2209851392.0,
-            "4": 2209851392.0,
-            "5": 2209851392.0,
-            "6": 2209851392.0,
-            "7": 2209851392.0,
-            "8": 2209851392.0,
-            "9": 2209851392.0,
-            "10": 2209851392.0,
-            "11": 2209851392.0,
-            "12": 2209851392.0,
-            "13": 2209851392.0,
-            "14": 2209851392.0,
-            "15": 2209851392.0,
-            "16": 2209851392.0,
-            "17": 2209851392.0,
-            "18": 2209851392.0,
-            "19": 2209851392.0,
-            "20": 2209851392.0,
-            "21": 2209851392.0,
-            "22": 2209851392.0,
-            "23": 2209851392.0,
-            "24": 2209851392.0,
-            "25": 2209851392.0,
-            "26": 2209851392.0,
-            "27": 2209851392.0,
-            "28": 2209851392.0,
-            "29": 2209851392.0,
-            "30": 2209851392.0,
-            "31": 2209851392.0,
-            "32": 2209851392.0,
-            "33": 2209851392.0,
-            "34": 2209851392.0,
-            "35": 2209851392.0,
-            "36": 2209851392.0,
-            "37": 2209851392.0,
-            "38": 2209851392.0,
-            "39": 2209851392.0,
-            "40": 2209851392.0,
-            "41": 2209851392.0,
-            "42": 2209851392.0,
-            "43": 2209851392.0,
-            "44": 2209851392.0,
-            "45": 2209851392.0,
-            "46": 2209851392.0,
-            "47": 2209851392.0,
-            "48": 2209851392.0,
-            "49": 2209851392.0,
-            "50": 2209851392.0,
-            "51": 2209851392.0,
-            "52": 2209851392.0,
-            "53": 2209851392.0,
-            "54": 2209851392.0,
-            "55": 2209851392.0,
-            "56": 2209851392.0,
-            "57": 2209851392.0,
-            "58": 2209851392.0,
-            "59": 2209851392.0,
-            "60": 2209851392.0,
-            "61": 2209851392.0,
-            "62": 2209851392.0,
-            "63": 2209851392.0,
-            "64": 2209851392.0,
-            "65": 2209851392.0,
-            "66": 2209851392.0,
-            "67": 2209851392.0,
-            "68": 2209851392.0,
-            "69": 2209851392.0,
-            "70": 2209851392.0,
-            "71": 2209851392.0,
-            "72": 2209851392.0,
-            "73": 2209851392.0,
-            "74": 2209851392.0,
-            "75": 2209851392.0,
-            "76": 2209851392.0,
-            "77": 2209851392.0,
-            "78": 2209851392.0,
-            "79": 2209851392.0,
-            "80": 2209851392.0,
-            "81": 2209851392.0,
-            "82": 2209851392.0,
-            "83": 2209851392.0,
-            "84": 2209851392.0,
-            "85": 2209851392.0,
-            "86": 2209851392.0,
-            "87": 2209851392.0,
-            "88": 2209851392.0,
-            "89": 2209851392.0,
-            "90": 2209851392.0,
-            "91": 2209851392.0,
-            "92": 2209851392.0,
-            "93": 2209851392.0,
-            "94": 2209851392.0,
-            "95": 2209851392.0,
-            "96": 2209851392.0,
-            "97": 2209851392.0,
-            "98": 2209851392.0,
-            "99": 2209851392.0,
-            "100": 2209851392.0
+            "1": 1927202816.0,
+            "2": 2211948544.0,
+            "3": 2211948544.0,
+            "4": 2211948544.0,
+            "5": 2211948544.0,
+            "6": 2211948544.0,
+            "7": 2211948544.0,
+            "8": 2211948544.0,
+            "9": 2211948544.0,
+            "10": 2211948544.0,
+            "11": 2211948544.0,
+            "12": 2211948544.0,
+            "13": 2211948544.0,
+            "14": 2211948544.0,
+            "15": 2211948544.0,
+            "16": 2211948544.0,
+            "17": 2211948544.0,
+            "18": 2211948544.0,
+            "19": 2211948544.0,
+            "20": 2211948544.0,
+            "21": 2211948544.0,
+            "22": 2211948544.0,
+            "23": 2211948544.0,
+            "24": 2211948544.0,
+            "25": 2211948544.0,
+            "26": 2211948544.0,
+            "27": 2211948544.0,
+            "28": 2211948544.0,
+            "29": 2211948544.0,
+            "30": 2211948544.0,
+            "31": 2211948544.0,
+            "32": 2211948544.0,
+            "33": 2211948544.0,
+            "34": 2211948544.0,
+            "35": 2211948544.0,
+            "36": 2211948544.0,
+            "37": 2211948544.0,
+            "38": 2211948544.0,
+            "39": 2211948544.0,
+            "40": 2211948544.0,
+            "41": 2211948544.0,
+            "42": 2211948544.0,
+            "43": 2211948544.0,
+            "44": 2211948544.0,
+            "45": 2211948544.0,
+            "46": 2211948544.0,
+            "47": 2211948544.0,
+            "48": 2211948544.0,
+            "49": 2211948544.0,
+            "50": 2211948544.0,
+            "51": 2211948544.0,
+            "52": 2211948544.0,
+            "53": 2211948544.0,
+            "54": 2211948544.0,
+            "55": 2211948544.0,
+            "56": 2211948544.0,
+            "57": 2211948544.0,
+            "58": 2211948544.0,
+            "59": 2211948544.0,
+            "60": 2211948544.0,
+            "61": 2211948544.0,
+            "62": 2211948544.0,
+            "63": 2211948544.0,
+            "64": 2211948544.0,
+            "65": 2211948544.0,
+            "66": 2211948544.0,
+            "67": 2211948544.0,
+            "68": 2211948544.0,
+            "69": 2211948544.0,
+            "70": 2211948544.0,
+            "71": 2211948544.0,
+            "72": 2211948544.0,
+            "73": 2211948544.0,
+            "74": 2211948544.0,
+            "75": 2211948544.0,
+            "76": 2211948544.0,
+            "77": 2211948544.0,
+            "78": 2211948544.0,
+            "79": 2211948544.0,
+            "80": 2211948544.0,
+            "81": 2211948544.0,
+            "82": 2211948544.0,
+            "83": 2211948544.0,
+            "84": 2211948544.0,
+            "85": 2211948544.0,
+            "86": 2211948544.0,
+            "87": 2211948544.0,
+            "88": 2211948544.0,
+            "89": 2211948544.0,
+            "90": 2211948544.0,
+            "91": 2211948544.0,
+            "92": 2211948544.0,
+            "93": 2211948544.0,
+            "94": 2211948544.0,
+            "95": 2211948544.0,
+            "96": 2211948544.0,
+            "97": 2211948544.0,
+            "98": 2211948544.0,
+            "99": 2211948544.0,
+            "100": 2211948544.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.71973,
-            "2": 0.14026,
-            "3": 0.11862,
-            "4": 0.10675,
-            "5": 0.10706,
-            "6": 0.10639,
-            "7": 0.10733,
-            "8": 0.10668,
-            "9": 0.10876,
-            "10": 0.10818,
-            "11": 0.10917,
-            "12": 0.1083,
-            "13": 0.10781,
-            "14": 0.10774,
-            "15": 0.10649,
-            "16": 0.10734,
-            "17": 0.10691,
-            "18": 0.10561,
-            "19": 0.10658,
-            "20": 0.10698,
-            "21": 0.10786,
-            "22": 0.10799,
-            "23": 0.10759,
-            "24": 0.10883,
-            "25": 0.10795,
-            "26": 0.10754,
-            "27": 0.10823,
-            "28": 0.10763,
-            "29": 0.10845,
-            "30": 0.10831,
-            "31": 0.10745,
-            "32": 0.10718,
-            "33": 0.10787,
-            "34": 0.10797,
-            "35": 0.1082,
-            "36": 0.10752,
-            "37": 0.10829,
-            "38": 0.10875,
-            "39": 0.10866,
-            "40": 0.1088,
-            "41": 0.10879,
-            "42": 0.10749,
-            "43": 0.10899,
-            "44": 0.10725,
-            "45": 0.10697,
-            "46": 0.10761,
-            "47": 0.10683,
-            "48": 0.10976,
-            "49": 0.10965,
-            "50": 0.10766,
-            "51": 0.123,
-            "52": 0.11396,
-            "53": 0.10816,
-            "54": 0.10864,
-            "55": 0.12449,
-            "56": 0.1076,
-            "57": 0.10895,
-            "58": 0.10793,
-            "59": 0.10902,
-            "60": 0.10551,
-            "61": 0.10575,
-            "62": 0.10761,
-            "63": 0.10614,
-            "64": 0.10584,
-            "65": 0.10699,
-            "66": 0.1077,
-            "67": 0.10786,
-            "68": 0.10744,
-            "69": 0.10671,
-            "70": 0.10786,
-            "71": 0.10765,
-            "72": 0.10586,
-            "73": 0.10669,
-            "74": 0.10611,
-            "75": 0.10692,
-            "76": 0.10782,
-            "77": 0.10601,
-            "78": 0.10616,
-            "79": 0.10555,
-            "80": 0.10728,
-            "81": 0.10656,
-            "82": 0.10848,
-            "83": 0.10786,
-            "84": 0.10935,
-            "85": 0.11246,
-            "86": 0.11271,
-            "87": 0.10885,
-            "88": 0.10616,
-            "89": 0.10731,
-            "90": 0.10705,
-            "91": 0.10547,
-            "92": 0.10622,
-            "93": 0.10619,
-            "94": 0.10678,
-            "95": 0.10769,
-            "96": 0.10574,
-            "97": 0.10691,
-            "98": 0.10682,
-            "99": 0.10685,
-            "100": 0.10542
+            "1": 8.42141,
+            "2": 0.12821,
+            "3": 0.10969,
+            "4": 0.08528,
+            "5": 0.08609,
+            "6": 0.08514,
+            "7": 0.08511,
+            "8": 0.08614,
+            "9": 0.0853,
+            "10": 0.08556,
+            "11": 0.08506,
+            "12": 0.08648,
+            "13": 0.08513,
+            "14": 0.08524,
+            "15": 0.08502,
+            "16": 0.08679,
+            "17": 0.08617,
+            "18": 0.08799,
+            "19": 0.08587,
+            "20": 0.08552,
+            "21": 0.08665,
+            "22": 0.08551,
+            "23": 0.08517,
+            "24": 0.08535,
+            "25": 0.08579,
+            "26": 0.08526,
+            "27": 0.08602,
+            "28": 0.08519,
+            "29": 0.08544,
+            "30": 0.08512,
+            "31": 0.0856,
+            "32": 0.08591,
+            "33": 0.08561,
+            "34": 0.08518,
+            "35": 0.08492,
+            "36": 0.08517,
+            "37": 0.08548,
+            "38": 0.08494,
+            "39": 0.08594,
+            "40": 0.08522,
+            "41": 0.08599,
+            "42": 0.0854,
+            "43": 0.08536,
+            "44": 0.0855,
+            "45": 0.08648,
+            "46": 0.088,
+            "47": 0.08639,
+            "48": 0.08682,
+            "49": 0.08646,
+            "50": 0.08529,
+            "51": 0.09801,
+            "52": 0.08949,
+            "53": 0.08726,
+            "54": 0.08702,
+            "55": 0.08687,
+            "56": 0.08692,
+            "57": 0.08726,
+            "58": 0.0871,
+            "59": 0.08762,
+            "60": 0.08729,
+            "61": 0.08712,
+            "62": 0.0868,
+            "63": 0.08725,
+            "64": 0.08676,
+            "65": 0.08718,
+            "66": 0.08682,
+            "67": 0.08754,
+            "68": 0.08695,
+            "69": 0.08788,
+            "70": 0.08724,
+            "71": 0.08705,
+            "72": 0.08759,
+            "73": 0.08826,
+            "74": 0.0871,
+            "75": 0.08684,
+            "76": 0.08689,
+            "77": 0.08656,
+            "78": 0.08667,
+            "79": 0.08705,
+            "80": 0.08727,
+            "81": 0.0879,
+            "82": 0.08956,
+            "83": 0.08661,
+            "84": 0.08671,
+            "85": 0.08761,
+            "86": 0.08652,
+            "87": 0.08663,
+            "88": 0.08663,
+            "89": 0.08687,
+            "90": 0.08718,
+            "91": 0.0868,
+            "92": 0.08665,
+            "93": 0.08695,
+            "94": 0.08685,
+            "95": 0.08671,
+            "96": 0.08669,
+            "97": 0.08742,
+            "98": 0.08628,
+            "99": 0.08628,
+            "100": 0.08651
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..17f5f4ed8eb
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85075,
+            "52": 9.73181,
+            "53": 10.06388,
+            "54": 9.95432,
+            "55": 9.87204,
+            "56": 9.61823,
+            "57": 9.47467,
+            "58": 9.82802,
+            "59": 9.57962,
+            "60": 9.49074,
+            "61": 9.68473,
+            "62": 9.99245,
+            "63": 9.38364,
+            "64": 9.77766,
+            "65": 8.94008,
+            "66": 9.70099,
+            "67": 9.3605,
+            "68": 9.77766,
+            "69": 9.78865,
+            "70": 9.73813,
+            "71": 9.61811,
+            "72": 9.58068,
+            "73": 9.4964,
+            "74": 8.93812,
+            "75": 9.42081,
+            "76": 9.07416,
+            "77": 10.06077,
+            "78": 9.71952,
+            "79": 9.37088,
+            "80": 9.39874,
+            "81": 9.47802,
+            "82": 9.69299,
+            "83": 9.30276,
+            "84": 9.41548,
+            "85": 9.60883,
+            "86": 9.07461,
+            "87": 9.58826,
+            "88": 9.74392,
+            "89": 9.5951,
+            "90": 9.81217,
+            "91": 9.33796,
+            "92": 9.3534,
+            "93": 9.07315,
+            "94": 8.83127,
+            "95": 9.51524,
+            "96": 9.52183,
+            "97": 9.31012,
+            "98": 9.66532,
+            "99": 8.88179,
+            "100": 9.39375
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2509.0,
+            "52": 2528.0,
+            "53": 2851.0,
+            "54": 2747.0,
+            "55": 2333.0,
+            "56": 2724.0,
+            "57": 2315.0,
+            "58": 2754.0,
+            "59": 2774.0,
+            "60": 2336.0,
+            "61": 2912.0,
+            "62": 2415.0,
+            "63": 2341.0,
+            "64": 2837.0,
+            "65": 2661.0,
+            "66": 3000.0,
+            "67": 2779.0,
+            "68": 2691.0,
+            "69": 2793.0,
+            "70": 3183.0,
+            "71": 2962.0,
+            "72": 2393.0,
+            "73": 2997.0,
+            "74": 1935.0,
+            "75": 2463.0,
+            "76": 3065.0,
+            "77": 3184.0,
+            "78": 3154.0,
+            "79": 3127.0,
+            "80": 3286.0,
+            "81": 3386.0,
+            "82": 3128.0,
+            "83": 2608.0,
+            "84": 3079.0,
+            "85": 3260.0,
+            "86": 2687.0,
+            "87": 3591.0,
+            "88": 3035.0,
+            "89": 3165.0,
+            "90": 3166.0,
+            "91": 2690.0,
+            "92": 2897.0,
+            "93": 2630.0,
+            "94": 3348.0,
+            "95": 3349.0,
+            "96": 3288.0,
+            "97": 3055.0,
+            "98": 3516.0,
+            "99": 3035.0,
+            "100": 3109.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746195456.0,
+            "52": 746195456.0,
+            "53": 746195456.0,
+            "54": 746195456.0,
+            "55": 746195456.0,
+            "56": 746195456.0,
+            "57": 746195456.0,
+            "58": 746195456.0,
+            "59": 746195456.0,
+            "60": 746195456.0,
+            "61": 746195456.0,
+            "62": 746195456.0,
+            "63": 746195456.0,
+            "64": 746195456.0,
+            "65": 746195456.0,
+            "66": 746195456.0,
+            "67": 746195456.0,
+            "68": 746195456.0,
+            "69": 746195456.0,
+            "70": 746195456.0,
+            "71": 746195456.0,
+            "72": 746195456.0,
+            "73": 746195456.0,
+            "74": 746195456.0,
+            "75": 746195456.0,
+            "76": 746195456.0,
+            "77": 746195456.0,
+            "78": 746195456.0,
+            "79": 746195456.0,
+            "80": 746195456.0,
+            "81": 746195456.0,
+            "82": 746195456.0,
+            "83": 746195456.0,
+            "84": 746195456.0,
+            "85": 746195456.0,
+            "86": 746195456.0,
+            "87": 746195456.0,
+            "88": 746195456.0,
+            "89": 746195456.0,
+            "90": 746195456.0,
+            "91": 746195456.0,
+            "92": 746195456.0,
+            "93": 746195456.0,
+            "94": 746195456.0,
+            "95": 746195456.0,
+            "96": 746195456.0,
+            "97": 746195456.0,
+            "98": 746195456.0,
+            "99": 746195456.0,
+            "100": 746195456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2210899968.0,
+            "52": 2210900992.0,
+            "53": 2210900992.0,
+            "54": 2210900992.0,
+            "55": 2210900992.0,
+            "56": 2210900992.0,
+            "57": 2210900992.0,
+            "58": 2210900992.0,
+            "59": 2210900992.0,
+            "60": 2210900992.0,
+            "61": 2210900992.0,
+            "62": 2210900992.0,
+            "63": 2210900992.0,
+            "64": 2210900992.0,
+            "65": 2210900992.0,
+            "66": 2210900992.0,
+            "67": 2210900992.0,
+            "68": 2210900992.0,
+            "69": 2210900992.0,
+            "70": 2210900992.0,
+            "71": 2210900992.0,
+            "72": 2210900992.0,
+            "73": 2210900992.0,
+            "74": 2210900992.0,
+            "75": 2210900992.0,
+            "76": 2210900992.0,
+            "77": 2210900992.0,
+            "78": 2210900992.0,
+            "79": 2210900992.0,
+            "80": 2210900992.0,
+            "81": 2210900992.0,
+            "82": 2210900992.0,
+            "83": 2210900992.0,
+            "84": 2210900992.0,
+            "85": 2210900992.0,
+            "86": 2210900992.0,
+            "87": 2210900992.0,
+            "88": 2210900992.0,
+            "89": 2210900992.0,
+            "90": 2210900992.0,
+            "91": 2210900992.0,
+            "92": 2210900992.0,
+            "93": 2210900992.0,
+            "94": 2210900992.0,
+            "95": 2210900992.0,
+            "96": 2210900992.0,
+            "97": 2210900992.0,
+            "98": 2210900992.0,
+            "99": 2210900992.0,
+            "100": 2210900992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.15802,
+            "52": 0.13009,
+            "53": 0.08915,
+            "54": 0.089,
+            "55": 0.08861,
+            "56": 0.08871,
+            "57": 0.08895,
+            "58": 0.08939,
+            "59": 0.08862,
+            "60": 0.08875,
+            "61": 0.08835,
+            "62": 0.09029,
+            "63": 0.09034,
+            "64": 0.08922,
+            "65": 0.08953,
+            "66": 0.09166,
+            "67": 0.08868,
+            "68": 0.08954,
+            "69": 0.08916,
+            "70": 0.08982,
+            "71": 0.08837,
+            "72": 0.0903,
+            "73": 0.08971,
+            "74": 0.09129,
+            "75": 0.09221,
+            "76": 0.08837,
+            "77": 0.0912,
+            "78": 0.08894,
+            "79": 0.08857,
+            "80": 0.089,
+            "81": 0.0893,
+            "82": 0.08924,
+            "83": 0.08842,
+            "84": 0.08918,
+            "85": 0.08897,
+            "86": 0.08832,
+            "87": 0.08827,
+            "88": 0.08998,
+            "89": 0.08959,
+            "90": 0.08882,
+            "91": 0.08911,
+            "92": 0.08926,
+            "93": 0.08845,
+            "94": 0.08884,
+            "95": 0.08981,
+            "96": 0.08858,
+            "97": 0.09088,
+            "98": 0.09007,
+            "99": 0.08931,
+            "100": 0.09003
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..a9a12874e97
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83582,
+            "2": 10.83571,
+            "3": 10.83523,
+            "4": 10.79949,
+            "5": 10.84909,
+            "6": 10.86563,
+            "7": 10.82789,
+            "8": 10.8363,
+            "9": 10.83997,
+            "10": 10.79865,
+            "11": 10.8677,
+            "12": 10.84994,
+            "13": 10.85915,
+            "14": 10.86874,
+            "15": 10.80173,
+            "16": 10.79183,
+            "17": 10.77353,
+            "18": 10.78739,
+            "19": 10.78983,
+            "20": 10.68446,
+            "21": 10.6784,
+            "22": 10.5257,
+            "23": 10.70726,
+            "24": 10.56551,
+            "25": 10.51602,
+            "26": 10.58017,
+            "27": 10.58981,
+            "28": 10.54551,
+            "29": 10.57726,
+            "30": 10.34051,
+            "31": 10.07051,
+            "32": 10.44503,
+            "33": 10.44293,
+            "34": 10.19391,
+            "35": 10.24261,
+            "36": 10.19236,
+            "37": 10.32969,
+            "38": 10.16551,
+            "39": 10.38729,
+            "40": 10.05174,
+            "41": 10.12191,
+            "42": 10.19259,
+            "43": 9.8069,
+            "44": 9.92475,
+            "45": 9.80639,
+            "46": 9.80145,
+            "47": 10.12104,
+            "48": 9.83127,
+            "49": 9.50404,
+            "50": 9.87954,
+            "51": 9.83807,
+            "52": 9.72058,
+            "53": 10.0568,
+            "54": 9.95032,
+            "55": 9.88328,
+            "56": 9.60431,
+            "57": 9.45518,
+            "58": 9.81927,
+            "59": 9.58262,
+            "60": 9.48844,
+            "61": 9.68577,
+            "62": 9.97779,
+            "63": 9.36765,
+            "64": 9.75913,
+            "65": 8.9376,
+            "66": 9.69257,
+            "67": 9.36621,
+            "68": 9.78303,
+            "69": 9.79318,
+            "70": 9.72699,
+            "71": 9.62875,
+            "72": 9.58004,
+            "73": 9.487,
+            "74": 8.92041,
+            "75": 9.41128,
+            "76": 9.07564,
+            "77": 10.05848,
+            "78": 9.72184,
+            "79": 9.3732,
+            "80": 9.40079,
+            "81": 9.4792,
+            "82": 9.69754,
+            "83": 9.31037,
+            "84": 9.41777,
+            "85": 9.61194,
+            "86": 9.07155,
+            "87": 9.59661,
+            "88": 9.74709,
+            "89": 9.59667,
+            "90": 9.82915,
+            "91": 9.33725,
+            "92": 9.3564,
+            "93": 9.08552,
+            "94": 8.82807,
+            "95": 9.52842,
+            "96": 9.52611,
+            "97": 9.30632,
+            "98": 9.66808,
+            "99": 8.89461,
+            "100": 9.40666
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1536.0,
+            "2": 1592.0,
+            "3": 1551.0,
+            "4": 1769.0,
+            "5": 1824.0,
+            "6": 1800.0,
+            "7": 1734.0,
+            "8": 1619.0,
+            "9": 1829.0,
+            "10": 1355.0,
+            "11": 1911.0,
+            "12": 1721.0,
+            "13": 1913.0,
+            "14": 1708.0,
+            "15": 1919.0,
+            "16": 1938.0,
+            "17": 1740.0,
+            "18": 1676.0,
+            "19": 1743.0,
+            "20": 1535.0,
+            "21": 1797.0,
+            "22": 1661.0,
+            "23": 1887.0,
+            "24": 1666.0,
+            "25": 1633.0,
+            "26": 1676.0,
+            "27": 1740.0,
+            "28": 1991.0,
+            "29": 1918.0,
+            "30": 1806.0,
+            "31": 1588.0,
+            "32": 1863.0,
+            "33": 2126.0,
+            "34": 1812.0,
+            "35": 1976.0,
+            "36": 1875.0,
+            "37": 2301.0,
+            "38": 2131.0,
+            "39": 2351.0,
+            "40": 2130.0,
+            "41": 2391.0,
+            "42": 2255.0,
+            "43": 1975.0,
+            "44": 2138.0,
+            "45": 2208.0,
+            "46": 2364.0,
+            "47": 2564.0,
+            "48": 2337.0,
+            "49": 2142.0,
+            "50": 2423.0,
+            "51": 2546.0,
+            "52": 2590.0,
+            "53": 2879.0,
+            "54": 2697.0,
+            "55": 2316.0,
+            "56": 2549.0,
+            "57": 2261.0,
+            "58": 2904.0,
+            "59": 2740.0,
+            "60": 2434.0,
+            "61": 2801.0,
+            "62": 2663.0,
+            "63": 2502.0,
+            "64": 2948.0,
+            "65": 2644.0,
+            "66": 2961.0,
+            "67": 2813.0,
+            "68": 2686.0,
+            "69": 2912.0,
+            "70": 3096.0,
+            "71": 2854.0,
+            "72": 2454.0,
+            "73": 3081.0,
+            "74": 1933.0,
+            "75": 2465.0,
+            "76": 3012.0,
+            "77": 3163.0,
+            "78": 2997.0,
+            "79": 3089.0,
+            "80": 3187.0,
+            "81": 3500.0,
+            "82": 3339.0,
+            "83": 2705.0,
+            "84": 3205.0,
+            "85": 3033.0,
+            "86": 2818.0,
+            "87": 3671.0,
+            "88": 3190.0,
+            "89": 3336.0,
+            "90": 3320.0,
+            "91": 2698.0,
+            "92": 3072.0,
+            "93": 2750.0,
+            "94": 3397.0,
+            "95": 3317.0,
+            "96": 3290.0,
+            "97": 3116.0,
+            "98": 3732.0,
+            "99": 3049.0,
+            "100": 2974.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 745146880.0,
+            "2": 745146880.0,
+            "3": 745146880.0,
+            "4": 745146880.0,
+            "5": 745146880.0,
+            "6": 745146880.0,
+            "7": 745146880.0,
+            "8": 745146880.0,
+            "9": 745146880.0,
+            "10": 745146880.0,
+            "11": 745146880.0,
+            "12": 745146880.0,
+            "13": 745146880.0,
+            "14": 745146880.0,
+            "15": 745146880.0,
+            "16": 745146880.0,
+            "17": 745146880.0,
+            "18": 745146880.0,
+            "19": 745146880.0,
+            "20": 745146880.0,
+            "21": 745146880.0,
+            "22": 745146880.0,
+            "23": 745146880.0,
+            "24": 745146880.0,
+            "25": 745146880.0,
+            "26": 745146880.0,
+            "27": 745146880.0,
+            "28": 745146880.0,
+            "29": 745146880.0,
+            "30": 745146880.0,
+            "31": 745146880.0,
+            "32": 745146880.0,
+            "33": 745146880.0,
+            "34": 745146880.0,
+            "35": 745146880.0,
+            "36": 745146880.0,
+            "37": 745146880.0,
+            "38": 745146880.0,
+            "39": 745146880.0,
+            "40": 745146880.0,
+            "41": 745146880.0,
+            "42": 745146880.0,
+            "43": 745146880.0,
+            "44": 745146880.0,
+            "45": 745146880.0,
+            "46": 745146880.0,
+            "47": 745146880.0,
+            "48": 745146880.0,
+            "49": 745146880.0,
+            "50": 745146880.0,
+            "51": 745146880.0,
+            "52": 745146880.0,
+            "53": 745146880.0,
+            "54": 745146880.0,
+            "55": 745146880.0,
+            "56": 745146880.0,
+            "57": 745146880.0,
+            "58": 745146880.0,
+            "59": 745146880.0,
+            "60": 745146880.0,
+            "61": 745146880.0,
+            "62": 745146880.0,
+            "63": 745146880.0,
+            "64": 745146880.0,
+            "65": 745146880.0,
+            "66": 745146880.0,
+            "67": 745146880.0,
+            "68": 745146880.0,
+            "69": 745146880.0,
+            "70": 745146880.0,
+            "71": 745146880.0,
+            "72": 745146880.0,
+            "73": 745146880.0,
+            "74": 745146880.0,
+            "75": 745146880.0,
+            "76": 745146880.0,
+            "77": 745146880.0,
+            "78": 745146880.0,
+            "79": 745146880.0,
+            "80": 745146880.0,
+            "81": 745146880.0,
+            "82": 745146880.0,
+            "83": 745146880.0,
+            "84": 745146880.0,
+            "85": 745146880.0,
+            "86": 745146880.0,
+            "87": 745146880.0,
+            "88": 745146880.0,
+            "89": 745146880.0,
+            "90": 745146880.0,
+            "91": 745146880.0,
+            "92": 745146880.0,
+            "93": 745146880.0,
+            "94": 745146880.0,
+            "95": 745146880.0,
+            "96": 745146880.0,
+            "97": 745146880.0,
+            "98": 745146880.0,
+            "99": 745146880.0,
+            "100": 745146880.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1939785728.0,
+            "2": 2222434304.0,
+            "3": 2222434304.0,
+            "4": 2222434304.0,
+            "5": 2222434304.0,
+            "6": 2222434304.0,
+            "7": 2222434304.0,
+            "8": 2222434304.0,
+            "9": 2222434304.0,
+            "10": 2222434304.0,
+            "11": 2222434304.0,
+            "12": 2222434304.0,
+            "13": 2222434304.0,
+            "14": 2222434304.0,
+            "15": 2222434304.0,
+            "16": 2222434304.0,
+            "17": 2222434304.0,
+            "18": 2222434304.0,
+            "19": 2222434304.0,
+            "20": 2222434304.0,
+            "21": 2222434304.0,
+            "22": 2222434304.0,
+            "23": 2222434304.0,
+            "24": 2222434304.0,
+            "25": 2222434304.0,
+            "26": 2222434304.0,
+            "27": 2222434304.0,
+            "28": 2222434304.0,
+            "29": 2222434304.0,
+            "30": 2222434304.0,
+            "31": 2222434304.0,
+            "32": 2222434304.0,
+            "33": 2222434304.0,
+            "34": 2222434304.0,
+            "35": 2222434304.0,
+            "36": 2222434304.0,
+            "37": 2222434304.0,
+            "38": 2222434304.0,
+            "39": 2222434304.0,
+            "40": 2222434304.0,
+            "41": 2222434304.0,
+            "42": 2222434304.0,
+            "43": 2222434304.0,
+            "44": 2222434304.0,
+            "45": 2222434304.0,
+            "46": 2222434304.0,
+            "47": 2222434304.0,
+            "48": 2222434304.0,
+            "49": 2222434304.0,
+            "50": 2222434304.0,
+            "51": 2222434304.0,
+            "52": 2222434304.0,
+            "53": 2222434304.0,
+            "54": 2222434304.0,
+            "55": 2222434304.0,
+            "56": 2222434304.0,
+            "57": 2222434304.0,
+            "58": 2222434304.0,
+            "59": 2222434304.0,
+            "60": 2222434304.0,
+            "61": 2222434304.0,
+            "62": 2222434304.0,
+            "63": 2222434304.0,
+            "64": 2222434304.0,
+            "65": 2222434304.0,
+            "66": 2222434304.0,
+            "67": 2222434304.0,
+            "68": 2222434304.0,
+            "69": 2222434304.0,
+            "70": 2222434304.0,
+            "71": 2222434304.0,
+            "72": 2222434304.0,
+            "73": 2222434304.0,
+            "74": 2222434304.0,
+            "75": 2222434304.0,
+            "76": 2222434304.0,
+            "77": 2222434304.0,
+            "78": 2222434304.0,
+            "79": 2222434304.0,
+            "80": 2222434304.0,
+            "81": 2222434304.0,
+            "82": 2222434304.0,
+            "83": 2222434304.0,
+            "84": 2222434304.0,
+            "85": 2222434304.0,
+            "86": 2222434304.0,
+            "87": 2222434304.0,
+            "88": 2222434304.0,
+            "89": 2222434304.0,
+            "90": 2222434304.0,
+            "91": 2222434304.0,
+            "92": 2222434304.0,
+            "93": 2222434304.0,
+            "94": 2222434304.0,
+            "95": 2222434304.0,
+            "96": 2222434304.0,
+            "97": 2222434304.0,
+            "98": 2222434304.0,
+            "99": 2222434304.0,
+            "100": 2222434304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.22807,
+            "3": 0.13601,
+            "4": 0.12128,
+            "5": 0.1198,
+            "6": 0.1228,
+            "7": 0.12056,
+            "8": 0.11886,
+            "9": 0.11944,
+            "10": 0.11995,
+            "11": 0.11935,
+            "12": 0.11905,
+            "13": 0.11975,
+            "14": 0.12242,
+            "15": 0.12061,
+            "16": 0.12046,
+            "17": 0.1208,
+            "18": 0.12205,
+            "19": 0.12427,
+            "20": 0.12315,
+            "21": 0.11965,
+            "22": 0.12231,
+            "23": 0.12286,
+            "24": 0.12394,
+            "25": 0.12377,
+            "26": 0.12221,
+            "27": 0.11936,
+            "28": 0.11894,
+            "29": 0.11945,
+            "30": 0.12192,
+            "31": 0.12571,
+            "32": 0.12346,
+            "33": 0.12413,
+            "34": 0.12225,
+            "35": 0.12328,
+            "36": 0.12241,
+            "37": 0.12432,
+            "38": 0.12195,
+            "39": 0.12262,
+            "40": 0.12198,
+            "41": 0.12396,
+            "42": 0.12194,
+            "43": 0.12435,
+            "44": 0.12108,
+            "45": 0.12326,
+            "46": 0.1218,
+            "47": 0.12308,
+            "48": 0.12384,
+            "49": 0.12795,
+            "50": 0.12572,
+            "51": 0.13502,
+            "52": 0.13106,
+            "53": 0.14515,
+            "54": 0.12597,
+            "55": 0.1249,
+            "56": 0.12535,
+            "57": 0.12569,
+            "58": 0.12489,
+            "59": 0.12862,
+            "60": 0.12778,
+            "61": 0.12731,
+            "62": 0.12786,
+            "63": 0.13022,
+            "64": 0.12789,
+            "65": 0.12838,
+            "66": 0.12571,
+            "67": 0.12651,
+            "68": 0.12592,
+            "69": 0.12663,
+            "70": 0.12691,
+            "71": 0.12636,
+            "72": 0.12638,
+            "73": 0.12671,
+            "74": 0.12637,
+            "75": 0.12602,
+            "76": 0.12598,
+            "77": 0.12554,
+            "78": 0.12553,
+            "79": 0.12501,
+            "80": 0.13898,
+            "81": 0.14589,
+            "82": 0.14718,
+            "83": 0.14665,
+            "84": 0.16017,
+            "85": 0.14231,
+            "86": 0.15628,
+            "87": 0.14055,
+            "88": 0.13961,
+            "89": 0.14878,
+            "90": 0.14486,
+            "91": 0.1432,
+            "92": 0.14946,
+            "93": 0.14581,
+            "94": 0.1623,
+            "95": 0.15638,
+            "96": 0.12895,
+            "97": 0.12907,
+            "98": 0.12824,
+            "99": 0.12741,
+            "100": 0.12543
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..8cb40558f2c
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83807,
+            "52": 9.72058,
+            "53": 10.0568,
+            "54": 9.95032,
+            "55": 9.88328,
+            "56": 9.60431,
+            "57": 9.45518,
+            "58": 9.81927,
+            "59": 9.58262,
+            "60": 9.48844,
+            "61": 9.68577,
+            "62": 9.97779,
+            "63": 9.36765,
+            "64": 9.75913,
+            "65": 8.9376,
+            "66": 9.69257,
+            "67": 9.36621,
+            "68": 9.78303,
+            "69": 9.79318,
+            "70": 9.72699,
+            "71": 9.62875,
+            "72": 9.58004,
+            "73": 9.487,
+            "74": 8.92041,
+            "75": 9.41128,
+            "76": 9.07564,
+            "77": 10.05848,
+            "78": 9.72184,
+            "79": 9.3732,
+            "80": 9.40079,
+            "81": 9.4792,
+            "82": 9.69754,
+            "83": 9.31037,
+            "84": 9.41777,
+            "85": 9.61194,
+            "86": 9.07155,
+            "87": 9.59661,
+            "88": 9.74709,
+            "89": 9.59667,
+            "90": 9.82915,
+            "91": 9.33725,
+            "92": 9.3564,
+            "93": 9.08552,
+            "94": 8.82807,
+            "95": 9.52842,
+            "96": 9.52611,
+            "97": 9.30632,
+            "98": 9.66808,
+            "99": 8.89461,
+            "100": 9.40666
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2546.0,
+            "52": 2590.0,
+            "53": 2879.0,
+            "54": 2697.0,
+            "55": 2316.0,
+            "56": 2549.0,
+            "57": 2261.0,
+            "58": 2904.0,
+            "59": 2740.0,
+            "60": 2434.0,
+            "61": 2801.0,
+            "62": 2663.0,
+            "63": 2502.0,
+            "64": 2948.0,
+            "65": 2644.0,
+            "66": 2961.0,
+            "67": 2813.0,
+            "68": 2686.0,
+            "69": 2912.0,
+            "70": 3096.0,
+            "71": 2854.0,
+            "72": 2454.0,
+            "73": 3081.0,
+            "74": 1933.0,
+            "75": 2465.0,
+            "76": 3012.0,
+            "77": 3163.0,
+            "78": 2997.0,
+            "79": 3089.0,
+            "80": 3187.0,
+            "81": 3500.0,
+            "82": 3339.0,
+            "83": 2705.0,
+            "84": 3205.0,
+            "85": 3033.0,
+            "86": 2818.0,
+            "87": 3671.0,
+            "88": 3190.0,
+            "89": 3336.0,
+            "90": 3320.0,
+            "91": 2698.0,
+            "92": 3072.0,
+            "93": 2750.0,
+            "94": 3397.0,
+            "95": 3317.0,
+            "96": 3290.0,
+            "97": 3116.0,
+            "98": 3732.0,
+            "99": 3049.0,
+            "100": 2974.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746195456.0,
+            "52": 746195456.0,
+            "53": 746195456.0,
+            "54": 746195456.0,
+            "55": 746195456.0,
+            "56": 746195456.0,
+            "57": 746195456.0,
+            "58": 746195456.0,
+            "59": 746195456.0,
+            "60": 746195456.0,
+            "61": 746195456.0,
+            "62": 746195456.0,
+            "63": 746195456.0,
+            "64": 746195456.0,
+            "65": 746195456.0,
+            "66": 746195456.0,
+            "67": 746195456.0,
+            "68": 746195456.0,
+            "69": 746195456.0,
+            "70": 746195456.0,
+            "71": 746195456.0,
+            "72": 746195456.0,
+            "73": 746195456.0,
+            "74": 746195456.0,
+            "75": 746195456.0,
+            "76": 746195456.0,
+            "77": 746195456.0,
+            "78": 746195456.0,
+            "79": 746195456.0,
+            "80": 746195456.0,
+            "81": 746195456.0,
+            "82": 746195456.0,
+            "83": 746195456.0,
+            "84": 746195456.0,
+            "85": 746195456.0,
+            "86": 746195456.0,
+            "87": 746195456.0,
+            "88": 746195456.0,
+            "89": 746195456.0,
+            "90": 746195456.0,
+            "91": 746195456.0,
+            "92": 746195456.0,
+            "93": 746195456.0,
+            "94": 746195456.0,
+            "95": 746195456.0,
+            "96": 746195456.0,
+            "97": 746195456.0,
+            "98": 746195456.0,
+            "99": 746195456.0,
+            "100": 746195456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2223482880.0,
+            "52": 2223483904.0,
+            "53": 2223483904.0,
+            "54": 2223483904.0,
+            "55": 2223483904.0,
+            "56": 2223483904.0,
+            "57": 2223483904.0,
+            "58": 2223483904.0,
+            "59": 2223483904.0,
+            "60": 2223483904.0,
+            "61": 2223483904.0,
+            "62": 2223483904.0,
+            "63": 2223483904.0,
+            "64": 2223483904.0,
+            "65": 2223483904.0,
+            "66": 2223483904.0,
+            "67": 2223483904.0,
+            "68": 2223483904.0,
+            "69": 2223483904.0,
+            "70": 2223483904.0,
+            "71": 2223483904.0,
+            "72": 2223483904.0,
+            "73": 2223483904.0,
+            "74": 2223483904.0,
+            "75": 2223483904.0,
+            "76": 2223483904.0,
+            "77": 2223483904.0,
+            "78": 2223483904.0,
+            "79": 2223483904.0,
+            "80": 2223483904.0,
+            "81": 2223483904.0,
+            "82": 2223483904.0,
+            "83": 2223483904.0,
+            "84": 2223483904.0,
+            "85": 2223483904.0,
+            "86": 2223483904.0,
+            "87": 2223483904.0,
+            "88": 2223483904.0,
+            "89": 2223483904.0,
+            "90": 2223483904.0,
+            "91": 2223483904.0,
+            "92": 2223483904.0,
+            "93": 2223483904.0,
+            "94": 2223483904.0,
+            "95": 2223483904.0,
+            "96": 2223483904.0,
+            "97": 2223483904.0,
+            "98": 2223483904.0,
+            "99": 2223483904.0,
+            "100": 2223483904.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.08357,
+            "53": 0.13321,
+            "54": 0.11949,
+            "55": 0.11861,
+            "56": 0.11817,
+            "57": 0.12088,
+            "58": 0.11937,
+            "59": 0.11893,
+            "60": 0.11961,
+            "61": 0.11894,
+            "62": 0.11953,
+            "63": 0.11978,
+            "64": 0.11983,
+            "65": 0.12255,
+            "66": 0.12188,
+            "67": 0.12135,
+            "68": 0.11972,
+            "69": 0.11963,
+            "70": 0.11929,
+            "71": 0.11924,
+            "72": 0.12023,
+            "73": 0.12093,
+            "74": 0.12082,
+            "75": 0.11862,
+            "76": 0.11797,
+            "77": 0.11862,
+            "78": 0.12219,
+            "79": 0.12137,
+            "80": 0.11873,
+            "81": 0.11752,
+            "82": 0.1208,
+            "83": 0.11974,
+            "84": 0.1182,
+            "85": 0.11721,
+            "86": 0.11748,
+            "87": 0.11944,
+            "88": 0.11934,
+            "89": 0.11847,
+            "90": 0.11837,
+            "91": 0.11938,
+            "92": 0.11761,
+            "93": 0.11737,
+            "94": 0.12142,
+            "95": 0.12574,
+            "96": 0.12197,
+            "97": 0.12384,
+            "98": 0.12251,
+            "99": 0.13032,
+            "100": 0.12305
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json
index 50639a30816..4fffaabca8a 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 746194432.0,
-            "2": 746194432.0,
-            "3": 746194432.0,
-            "4": 746194432.0,
-            "5": 746194432.0,
-            "6": 746194432.0,
-            "7": 746194432.0,
-            "8": 746194432.0,
-            "9": 746194432.0,
-            "10": 746194432.0,
-            "11": 746194432.0,
-            "12": 746194432.0,
-            "13": 746194432.0,
-            "14": 746194432.0,
-            "15": 746194432.0,
-            "16": 746194432.0,
-            "17": 746194432.0,
-            "18": 746194432.0,
-            "19": 746194432.0,
-            "20": 746194432.0,
-            "21": 746194432.0,
-            "22": 746194432.0,
-            "23": 746194432.0,
-            "24": 746194432.0,
-            "25": 746194432.0,
-            "26": 746194432.0,
-            "27": 746194432.0,
-            "28": 746194432.0,
-            "29": 746194432.0,
-            "30": 746194432.0,
-            "31": 746194432.0,
-            "32": 746194432.0,
-            "33": 746194432.0,
-            "34": 746194432.0,
-            "35": 746194432.0,
-            "36": 746194432.0,
-            "37": 746194432.0,
-            "38": 746194432.0,
-            "39": 746194432.0,
-            "40": 746194432.0,
-            "41": 746194432.0,
-            "42": 746194432.0,
-            "43": 746194432.0,
-            "44": 746194432.0,
-            "45": 746194432.0,
-            "46": 746194432.0,
-            "47": 746194432.0,
-            "48": 746194432.0,
-            "49": 746194432.0,
-            "50": 746194432.0,
-            "51": 746194432.0,
-            "52": 746194432.0,
-            "53": 746194432.0,
-            "54": 746194432.0,
-            "55": 746194432.0,
-            "56": 746194432.0,
-            "57": 746194432.0,
-            "58": 746194432.0,
-            "59": 746194432.0,
-            "60": 746194432.0,
-            "61": 746194432.0,
-            "62": 746194432.0,
-            "63": 746194432.0,
-            "64": 746194432.0,
-            "65": 746194432.0,
-            "66": 746194432.0,
-            "67": 746194432.0,
-            "68": 746194432.0,
-            "69": 746194432.0,
-            "70": 746194432.0,
-            "71": 746194432.0,
-            "72": 746194432.0,
-            "73": 746194432.0,
-            "74": 746194432.0,
-            "75": 746194432.0,
-            "76": 746194432.0,
-            "77": 746194432.0,
-            "78": 746194432.0,
-            "79": 746194432.0,
-            "80": 746194432.0,
-            "81": 746194432.0,
-            "82": 746194432.0,
-            "83": 746194432.0,
-            "84": 746194432.0,
-            "85": 746194432.0,
-            "86": 746194432.0,
-            "87": 746194432.0,
-            "88": 746194432.0,
-            "89": 746194432.0,
-            "90": 746194432.0,
-            "91": 746194432.0,
-            "92": 746194432.0,
-            "93": 746194432.0,
-            "94": 746194432.0,
-            "95": 746194432.0,
-            "96": 746194432.0,
-            "97": 746194432.0,
-            "98": 746194432.0,
-            "99": 746194432.0,
-            "100": 746194432.0
+            "1": 747244032.0,
+            "2": 747244032.0,
+            "3": 747244032.0,
+            "4": 747244032.0,
+            "5": 747244032.0,
+            "6": 747244032.0,
+            "7": 747244032.0,
+            "8": 747244032.0,
+            "9": 747244032.0,
+            "10": 747244032.0,
+            "11": 747244032.0,
+            "12": 747244032.0,
+            "13": 747244032.0,
+            "14": 747244032.0,
+            "15": 747244032.0,
+            "16": 747244032.0,
+            "17": 747244032.0,
+            "18": 747244032.0,
+            "19": 747244032.0,
+            "20": 747244032.0,
+            "21": 747244032.0,
+            "22": 747244032.0,
+            "23": 747244032.0,
+            "24": 747244032.0,
+            "25": 747244032.0,
+            "26": 747244032.0,
+            "27": 747244032.0,
+            "28": 747244032.0,
+            "29": 747244032.0,
+            "30": 747244032.0,
+            "31": 747244032.0,
+            "32": 747244032.0,
+            "33": 747244032.0,
+            "34": 747244032.0,
+            "35": 747244032.0,
+            "36": 747244032.0,
+            "37": 747244032.0,
+            "38": 747244032.0,
+            "39": 747244032.0,
+            "40": 747244032.0,
+            "41": 747244032.0,
+            "42": 747244032.0,
+            "43": 747244032.0,
+            "44": 747244032.0,
+            "45": 747244032.0,
+            "46": 747244032.0,
+            "47": 747244032.0,
+            "48": 747244032.0,
+            "49": 747244032.0,
+            "50": 747244032.0,
+            "51": 747244032.0,
+            "52": 747244032.0,
+            "53": 747244032.0,
+            "54": 747244032.0,
+            "55": 747244032.0,
+            "56": 747244032.0,
+            "57": 747244032.0,
+            "58": 747244032.0,
+            "59": 747244032.0,
+            "60": 747244032.0,
+            "61": 747244032.0,
+            "62": 747244032.0,
+            "63": 747244032.0,
+            "64": 747244032.0,
+            "65": 747244032.0,
+            "66": 747244032.0,
+            "67": 747244032.0,
+            "68": 747244032.0,
+            "69": 747244032.0,
+            "70": 747244032.0,
+            "71": 747244032.0,
+            "72": 747244032.0,
+            "73": 747244032.0,
+            "74": 747244032.0,
+            "75": 747244032.0,
+            "76": 747244032.0,
+            "77": 747244032.0,
+            "78": 747244032.0,
+            "79": 747244032.0,
+            "80": 747244032.0,
+            "81": 747244032.0,
+            "82": 747244032.0,
+            "83": 747244032.0,
+            "84": 747244032.0,
+            "85": 747244032.0,
+            "86": 747244032.0,
+            "87": 747244032.0,
+            "88": 747244032.0,
+            "89": 747244032.0,
+            "90": 747244032.0,
+            "91": 747244032.0,
+            "92": 747244032.0,
+            "93": 747244032.0,
+            "94": 747244032.0,
+            "95": 747244032.0,
+            "96": 747244032.0,
+            "97": 747244032.0,
+            "98": 747244032.0,
+            "99": 747244032.0,
+            "100": 747244032.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1926153216.0,
-            "2": 2209851392.0,
-            "3": 2209851392.0,
-            "4": 2209851392.0,
-            "5": 2209851392.0,
-            "6": 2209851392.0,
-            "7": 2209851392.0,
-            "8": 2209851392.0,
-            "9": 2209851392.0,
-            "10": 2209851392.0,
-            "11": 2209851392.0,
-            "12": 2209851392.0,
-            "13": 2209851392.0,
-            "14": 2209851392.0,
-            "15": 2209851392.0,
-            "16": 2209851392.0,
-            "17": 2209851392.0,
-            "18": 2209851392.0,
-            "19": 2209851392.0,
-            "20": 2209851392.0,
-            "21": 2209851392.0,
-            "22": 2209851392.0,
-            "23": 2209851392.0,
-            "24": 2209851392.0,
-            "25": 2209851392.0,
-            "26": 2209851392.0,
-            "27": 2209851392.0,
-            "28": 2209851392.0,
-            "29": 2209851392.0,
-            "30": 2209851392.0,
-            "31": 2209851392.0,
-            "32": 2209851392.0,
-            "33": 2209851392.0,
-            "34": 2209851392.0,
-            "35": 2209851392.0,
-            "36": 2209851392.0,
-            "37": 2209851392.0,
-            "38": 2209851392.0,
-            "39": 2209851392.0,
-            "40": 2209851392.0,
-            "41": 2209851392.0,
-            "42": 2209851392.0,
-            "43": 2209851392.0,
-            "44": 2209851392.0,
-            "45": 2209851392.0,
-            "46": 2209851392.0,
-            "47": 2209851392.0,
-            "48": 2209851392.0,
-            "49": 2209851392.0,
-            "50": 2209851392.0,
-            "51": 2209851392.0,
-            "52": 2209851392.0,
-            "53": 2209851392.0,
-            "54": 2209851392.0,
-            "55": 2209851392.0,
-            "56": 2209851392.0,
-            "57": 2209851392.0,
-            "58": 2209851392.0,
-            "59": 2209851392.0,
-            "60": 2209851392.0,
-            "61": 2209851392.0,
-            "62": 2209851392.0,
-            "63": 2209851392.0,
-            "64": 2209851392.0,
-            "65": 2209851392.0,
-            "66": 2209851392.0,
-            "67": 2209851392.0,
-            "68": 2209851392.0,
-            "69": 2209851392.0,
-            "70": 2209851392.0,
-            "71": 2209851392.0,
-            "72": 2209851392.0,
-            "73": 2209851392.0,
-            "74": 2209851392.0,
-            "75": 2209851392.0,
-            "76": 2209851392.0,
-            "77": 2209851392.0,
-            "78": 2209851392.0,
-            "79": 2209851392.0,
-            "80": 2209851392.0,
-            "81": 2209851392.0,
-            "82": 2209851392.0,
-            "83": 2209851392.0,
-            "84": 2209851392.0,
-            "85": 2209851392.0,
-            "86": 2209851392.0,
-            "87": 2209851392.0,
-            "88": 2209851392.0,
-            "89": 2209851392.0,
-            "90": 2209851392.0,
-            "91": 2209851392.0,
-            "92": 2209851392.0,
-            "93": 2209851392.0,
-            "94": 2209851392.0,
-            "95": 2209851392.0,
-            "96": 2209851392.0,
-            "97": 2209851392.0,
-            "98": 2209851392.0,
-            "99": 2209851392.0,
-            "100": 2209851392.0
+            "1": 1927202816.0,
+            "2": 2211948544.0,
+            "3": 2211948544.0,
+            "4": 2211948544.0,
+            "5": 2211948544.0,
+            "6": 2211948544.0,
+            "7": 2211948544.0,
+            "8": 2211948544.0,
+            "9": 2211948544.0,
+            "10": 2211948544.0,
+            "11": 2211948544.0,
+            "12": 2211948544.0,
+            "13": 2211948544.0,
+            "14": 2211948544.0,
+            "15": 2211948544.0,
+            "16": 2211948544.0,
+            "17": 2211948544.0,
+            "18": 2211948544.0,
+            "19": 2211948544.0,
+            "20": 2211948544.0,
+            "21": 2211948544.0,
+            "22": 2211948544.0,
+            "23": 2211948544.0,
+            "24": 2211948544.0,
+            "25": 2211948544.0,
+            "26": 2211948544.0,
+            "27": 2211948544.0,
+            "28": 2211948544.0,
+            "29": 2211948544.0,
+            "30": 2211948544.0,
+            "31": 2211948544.0,
+            "32": 2211948544.0,
+            "33": 2211948544.0,
+            "34": 2211948544.0,
+            "35": 2211948544.0,
+            "36": 2211948544.0,
+            "37": 2211948544.0,
+            "38": 2211948544.0,
+            "39": 2211948544.0,
+            "40": 2211948544.0,
+            "41": 2211948544.0,
+            "42": 2211948544.0,
+            "43": 2211948544.0,
+            "44": 2211948544.0,
+            "45": 2211948544.0,
+            "46": 2211948544.0,
+            "47": 2211948544.0,
+            "48": 2211948544.0,
+            "49": 2211948544.0,
+            "50": 2211948544.0,
+            "51": 2211948544.0,
+            "52": 2211948544.0,
+            "53": 2211948544.0,
+            "54": 2211948544.0,
+            "55": 2211948544.0,
+            "56": 2211948544.0,
+            "57": 2211948544.0,
+            "58": 2211948544.0,
+            "59": 2211948544.0,
+            "60": 2211948544.0,
+            "61": 2211948544.0,
+            "62": 2211948544.0,
+            "63": 2211948544.0,
+            "64": 2211948544.0,
+            "65": 2211948544.0,
+            "66": 2211948544.0,
+            "67": 2211948544.0,
+            "68": 2211948544.0,
+            "69": 2211948544.0,
+            "70": 2211948544.0,
+            "71": 2211948544.0,
+            "72": 2211948544.0,
+            "73": 2211948544.0,
+            "74": 2211948544.0,
+            "75": 2211948544.0,
+            "76": 2211948544.0,
+            "77": 2211948544.0,
+            "78": 2211948544.0,
+            "79": 2211948544.0,
+            "80": 2211948544.0,
+            "81": 2211948544.0,
+            "82": 2211948544.0,
+            "83": 2211948544.0,
+            "84": 2211948544.0,
+            "85": 2211948544.0,
+            "86": 2211948544.0,
+            "87": 2211948544.0,
+            "88": 2211948544.0,
+            "89": 2211948544.0,
+            "90": 2211948544.0,
+            "91": 2211948544.0,
+            "92": 2211948544.0,
+            "93": 2211948544.0,
+            "94": 2211948544.0,
+            "95": 2211948544.0,
+            "96": 2211948544.0,
+            "97": 2211948544.0,
+            "98": 2211948544.0,
+            "99": 2211948544.0,
+            "100": 2211948544.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.88983,
-            "2": 0.12288,
-            "3": 0.10944,
-            "4": 0.10822,
-            "5": 0.10919,
-            "6": 0.10835,
-            "7": 0.11035,
-            "8": 0.10879,
-            "9": 0.11001,
-            "10": 0.11009,
-            "11": 0.10945,
-            "12": 0.10868,
-            "13": 0.1086,
-            "14": 0.10899,
-            "15": 0.10852,
-            "16": 0.10822,
-            "17": 0.10818,
-            "18": 0.10877,
-            "19": 0.10888,
-            "20": 0.10828,
-            "21": 0.109,
-            "22": 0.108,
-            "23": 0.10722,
-            "24": 0.10731,
-            "25": 0.1075,
-            "26": 0.10744,
-            "27": 0.10843,
-            "28": 0.10831,
-            "29": 0.10841,
-            "30": 0.10718,
-            "31": 0.10837,
-            "32": 0.10773,
-            "33": 0.10792,
-            "34": 0.10698,
-            "35": 0.10976,
-            "36": 0.10758,
-            "37": 0.10825,
-            "38": 0.10781,
-            "39": 0.10912,
-            "40": 0.10847,
-            "41": 0.10786,
-            "42": 0.10767,
-            "43": 0.10761,
-            "44": 0.1076,
-            "45": 0.1078,
-            "46": 0.10992,
-            "47": 0.1061,
-            "48": 0.10654,
-            "49": 0.10566,
-            "50": 0.1066,
-            "51": 0.11234,
-            "52": 0.11065,
-            "53": 0.10795,
-            "54": 0.10668,
-            "55": 0.10678,
-            "56": 0.10889,
-            "57": 0.10802,
-            "58": 0.12482,
-            "59": 0.10666,
-            "60": 0.10637,
-            "61": 0.10776,
-            "62": 0.10743,
-            "63": 0.10782,
-            "64": 0.10634,
-            "65": 0.10744,
-            "66": 0.10859,
-            "67": 0.10949,
-            "68": 0.1075,
-            "69": 0.10803,
-            "70": 0.10688,
-            "71": 0.10797,
-            "72": 0.10752,
-            "73": 0.10816,
-            "74": 0.10734,
-            "75": 0.10832,
-            "76": 0.10815,
-            "77": 0.10868,
-            "78": 0.10839,
-            "79": 0.1074,
-            "80": 0.10866,
-            "81": 0.11122,
-            "82": 0.11035,
-            "83": 0.1101,
-            "84": 0.1122,
-            "85": 0.10866,
-            "86": 0.10915,
-            "87": 0.10842,
-            "88": 0.10723,
-            "89": 0.10849,
-            "90": 0.10814,
-            "91": 0.10833,
-            "92": 0.10719,
-            "93": 0.10725,
-            "94": 0.10754,
-            "95": 0.10758,
-            "96": 0.1082,
-            "97": 0.10768,
-            "98": 0.10708,
-            "99": 0.10785,
-            "100": 0.10841
+            "1": 8.07511,
+            "2": 0.14681,
+            "3": 0.10596,
+            "4": 0.08711,
+            "5": 0.0876,
+            "6": 0.08568,
+            "7": 0.08664,
+            "8": 0.08587,
+            "9": 0.08577,
+            "10": 0.08621,
+            "11": 0.08632,
+            "12": 0.08547,
+            "13": 0.08657,
+            "14": 0.086,
+            "15": 0.08713,
+            "16": 0.08626,
+            "17": 0.0867,
+            "18": 0.08636,
+            "19": 0.08698,
+            "20": 0.08625,
+            "21": 0.08785,
+            "22": 0.08871,
+            "23": 0.08659,
+            "24": 0.08847,
+            "25": 0.09629,
+            "26": 0.09476,
+            "27": 0.08553,
+            "28": 0.08477,
+            "29": 0.08431,
+            "30": 0.08434,
+            "31": 0.08557,
+            "32": 0.08544,
+            "33": 0.08488,
+            "34": 0.08582,
+            "35": 0.08395,
+            "36": 0.08398,
+            "37": 0.08559,
+            "38": 0.08441,
+            "39": 0.08418,
+            "40": 0.08528,
+            "41": 0.0861,
+            "42": 0.08685,
+            "43": 0.08626,
+            "44": 0.08751,
+            "45": 0.08791,
+            "46": 0.087,
+            "47": 0.08684,
+            "48": 0.08803,
+            "49": 0.08859,
+            "50": 0.09019,
+            "51": 0.10254,
+            "52": 0.09302,
+            "53": 0.10544,
+            "54": 0.08758,
+            "55": 0.0856,
+            "56": 0.08575,
+            "57": 0.08685,
+            "58": 0.08631,
+            "59": 0.08389,
+            "60": 0.08441,
+            "61": 0.08423,
+            "62": 0.08509,
+            "63": 0.08726,
+            "64": 0.08594,
+            "65": 0.08568,
+            "66": 0.08392,
+            "67": 0.08415,
+            "68": 0.0849,
+            "69": 0.08418,
+            "70": 0.08396,
+            "71": 0.08448,
+            "72": 0.08498,
+            "73": 0.08408,
+            "74": 0.08475,
+            "75": 0.08328,
+            "76": 0.08384,
+            "77": 0.08424,
+            "78": 0.08463,
+            "79": 0.0841,
+            "80": 0.08431,
+            "81": 0.08441,
+            "82": 0.0848,
+            "83": 0.08442,
+            "84": 0.08437,
+            "85": 0.08486,
+            "86": 0.08464,
+            "87": 0.0837,
+            "88": 0.0844,
+            "89": 0.08503,
+            "90": 0.08351,
+            "91": 0.0839,
+            "92": 0.08423,
+            "93": 0.08472,
+            "94": 0.08463,
+            "95": 0.08455,
+            "96": 0.08373,
+            "97": 0.08396,
+            "98": 0.08358,
+            "99": 0.08466,
+            "100": 0.08402
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..b60cbfef0c0
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85075,
+            "52": 9.73181,
+            "53": 10.06388,
+            "54": 9.95432,
+            "55": 9.87204,
+            "56": 9.61823,
+            "57": 9.47467,
+            "58": 9.82802,
+            "59": 9.57962,
+            "60": 9.49074,
+            "61": 9.68473,
+            "62": 9.99245,
+            "63": 9.38364,
+            "64": 9.77766,
+            "65": 8.94008,
+            "66": 9.70099,
+            "67": 9.3605,
+            "68": 9.77766,
+            "69": 9.78865,
+            "70": 9.73813,
+            "71": 9.61811,
+            "72": 9.58068,
+            "73": 9.4964,
+            "74": 8.93812,
+            "75": 9.42081,
+            "76": 9.07416,
+            "77": 10.06077,
+            "78": 9.71952,
+            "79": 9.37088,
+            "80": 9.39874,
+            "81": 9.47802,
+            "82": 9.69299,
+            "83": 9.30276,
+            "84": 9.41548,
+            "85": 9.60883,
+            "86": 9.07461,
+            "87": 9.58826,
+            "88": 9.74392,
+            "89": 9.5951,
+            "90": 9.81217,
+            "91": 9.33796,
+            "92": 9.3534,
+            "93": 9.07315,
+            "94": 8.83127,
+            "95": 9.51524,
+            "96": 9.52183,
+            "97": 9.31012,
+            "98": 9.66532,
+            "99": 8.88179,
+            "100": 9.39375
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2509.0,
+            "52": 2528.0,
+            "53": 2851.0,
+            "54": 2747.0,
+            "55": 2333.0,
+            "56": 2724.0,
+            "57": 2315.0,
+            "58": 2754.0,
+            "59": 2774.0,
+            "60": 2336.0,
+            "61": 2912.0,
+            "62": 2415.0,
+            "63": 2341.0,
+            "64": 2837.0,
+            "65": 2661.0,
+            "66": 3000.0,
+            "67": 2779.0,
+            "68": 2691.0,
+            "69": 2793.0,
+            "70": 3183.0,
+            "71": 2962.0,
+            "72": 2393.0,
+            "73": 2997.0,
+            "74": 1935.0,
+            "75": 2463.0,
+            "76": 3065.0,
+            "77": 3184.0,
+            "78": 3154.0,
+            "79": 3127.0,
+            "80": 3286.0,
+            "81": 3386.0,
+            "82": 3128.0,
+            "83": 2608.0,
+            "84": 3079.0,
+            "85": 3260.0,
+            "86": 2687.0,
+            "87": 3591.0,
+            "88": 3035.0,
+            "89": 3165.0,
+            "90": 3166.0,
+            "91": 2690.0,
+            "92": 2897.0,
+            "93": 2630.0,
+            "94": 3348.0,
+            "95": 3349.0,
+            "96": 3288.0,
+            "97": 3055.0,
+            "98": 3516.0,
+            "99": 3035.0,
+            "100": 3109.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746195456.0,
+            "52": 746195456.0,
+            "53": 746195456.0,
+            "54": 746195456.0,
+            "55": 746195456.0,
+            "56": 746195456.0,
+            "57": 746195456.0,
+            "58": 746195456.0,
+            "59": 746195456.0,
+            "60": 746195456.0,
+            "61": 746195456.0,
+            "62": 746195456.0,
+            "63": 746195456.0,
+            "64": 746195456.0,
+            "65": 746195456.0,
+            "66": 746195456.0,
+            "67": 746195456.0,
+            "68": 746195456.0,
+            "69": 746195456.0,
+            "70": 746195456.0,
+            "71": 746195456.0,
+            "72": 746195456.0,
+            "73": 746195456.0,
+            "74": 746195456.0,
+            "75": 746195456.0,
+            "76": 746195456.0,
+            "77": 746195456.0,
+            "78": 746195456.0,
+            "79": 746195456.0,
+            "80": 746195456.0,
+            "81": 746195456.0,
+            "82": 746195456.0,
+            "83": 746195456.0,
+            "84": 746195456.0,
+            "85": 746195456.0,
+            "86": 746195456.0,
+            "87": 746195456.0,
+            "88": 746195456.0,
+            "89": 746195456.0,
+            "90": 746195456.0,
+            "91": 746195456.0,
+            "92": 746195456.0,
+            "93": 746195456.0,
+            "94": 746195456.0,
+            "95": 746195456.0,
+            "96": 746195456.0,
+            "97": 746195456.0,
+            "98": 746195456.0,
+            "99": 746195456.0,
+            "100": 746195456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2210899968.0,
+            "52": 2210900992.0,
+            "53": 2210900992.0,
+            "54": 2210900992.0,
+            "55": 2210900992.0,
+            "56": 2210900992.0,
+            "57": 2210900992.0,
+            "58": 2210900992.0,
+            "59": 2210900992.0,
+            "60": 2210900992.0,
+            "61": 2210900992.0,
+            "62": 2210900992.0,
+            "63": 2210900992.0,
+            "64": 2210900992.0,
+            "65": 2210900992.0,
+            "66": 2210900992.0,
+            "67": 2210900992.0,
+            "68": 2210900992.0,
+            "69": 2210900992.0,
+            "70": 2210900992.0,
+            "71": 2210900992.0,
+            "72": 2210900992.0,
+            "73": 2210900992.0,
+            "74": 2210900992.0,
+            "75": 2210900992.0,
+            "76": 2210900992.0,
+            "77": 2210900992.0,
+            "78": 2210900992.0,
+            "79": 2210900992.0,
+            "80": 2210900992.0,
+            "81": 2210900992.0,
+            "82": 2210900992.0,
+            "83": 2210900992.0,
+            "84": 2210900992.0,
+            "85": 2210900992.0,
+            "86": 2210900992.0,
+            "87": 2210900992.0,
+            "88": 2210900992.0,
+            "89": 2210900992.0,
+            "90": 2210900992.0,
+            "91": 2210900992.0,
+            "92": 2210900992.0,
+            "93": 2210900992.0,
+            "94": 2210900992.0,
+            "95": 2210900992.0,
+            "96": 2210900992.0,
+            "97": 2210900992.0,
+            "98": 2210900992.0,
+            "99": 2210900992.0,
+            "100": 2210900992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.5499,
+            "52": 0.12372,
+            "53": 0.09645,
+            "54": 0.09114,
+            "55": 0.08966,
+            "56": 0.09034,
+            "57": 0.08956,
+            "58": 0.09056,
+            "59": 0.09042,
+            "60": 0.0897,
+            "61": 0.09016,
+            "62": 0.09046,
+            "63": 0.08857,
+            "64": 0.08779,
+            "65": 0.08907,
+            "66": 0.08837,
+            "67": 0.08806,
+            "68": 0.08776,
+            "69": 0.08756,
+            "70": 0.08787,
+            "71": 0.08828,
+            "72": 0.08894,
+            "73": 0.08812,
+            "74": 0.08757,
+            "75": 0.08963,
+            "76": 0.09209,
+            "77": 0.0916,
+            "78": 0.09224,
+            "79": 0.09091,
+            "80": 0.08695,
+            "81": 0.0874,
+            "82": 0.08839,
+            "83": 0.08746,
+            "84": 0.09295,
+            "85": 0.09,
+            "86": 0.09021,
+            "87": 0.09075,
+            "88": 0.08904,
+            "89": 0.08839,
+            "90": 0.08875,
+            "91": 0.08852,
+            "92": 0.08796,
+            "93": 0.08905,
+            "94": 0.08832,
+            "95": 0.08897,
+            "96": 0.08836,
+            "97": 0.08869,
+            "98": 0.08858,
+            "99": 0.08878,
+            "100": 0.08832
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json
index 1246b8727ef..6e5f31a169a 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.43574,
-            "2": 0.18308,
-            "3": 0.16294,
-            "4": 0.15632,
-            "5": 0.15517,
-            "6": 0.15061,
-            "7": 0.15109,
-            "8": 0.1538,
-            "9": 0.15077,
-            "10": 0.15142,
-            "11": 0.15024,
-            "12": 0.15039,
-            "13": 0.14987,
-            "14": 0.15044,
-            "15": 0.1495,
-            "16": 0.15003,
-            "17": 0.14988,
-            "18": 0.1497,
-            "19": 0.15459,
-            "20": 0.15076,
-            "21": 0.1498,
-            "22": 0.15044,
-            "23": 0.15051,
-            "24": 0.15062,
-            "25": 0.14953,
-            "26": 0.15047,
-            "27": 0.14851,
-            "28": 0.14802,
-            "29": 0.14861,
-            "30": 0.1485,
-            "31": 0.1498,
-            "32": 0.14871,
-            "33": 0.1485,
-            "34": 0.14707,
-            "35": 0.14796,
-            "36": 0.14719,
-            "37": 0.15012,
-            "38": 0.14804,
-            "39": 0.1487,
-            "40": 0.14779,
-            "41": 0.14844,
-            "42": 0.1496,
-            "43": 0.15014,
-            "44": 0.14977,
-            "45": 0.1478,
-            "46": 0.14891,
-            "47": 0.14844,
-            "48": 0.1488,
-            "49": 0.14931,
-            "50": 0.14761,
-            "51": 0.15888,
-            "52": 0.1517,
-            "53": 0.14904,
-            "54": 0.17961,
-            "55": 0.14804,
-            "56": 0.1496,
-            "57": 0.1487,
-            "58": 0.14801,
-            "59": 0.14729,
-            "60": 0.14749,
-            "61": 0.14745,
-            "62": 0.1471,
-            "63": 0.14817,
-            "64": 0.1497,
-            "65": 0.14753,
-            "66": 0.14753,
-            "67": 0.14859,
-            "68": 0.14714,
-            "69": 0.14776,
-            "70": 0.14847,
-            "71": 0.14829,
-            "72": 0.14858,
-            "73": 0.14828,
-            "74": 0.14783,
-            "75": 0.14793,
-            "76": 0.14768,
-            "77": 0.14752,
-            "78": 0.14931,
-            "79": 0.15045,
-            "80": 0.14813,
-            "81": 0.1489,
-            "82": 0.1475,
-            "83": 0.14844,
-            "84": 0.1489,
-            "85": 0.14809,
-            "86": 0.14835,
-            "87": 0.14718,
-            "88": 0.14876,
-            "89": 0.14859,
-            "90": 0.1479,
-            "91": 0.14803,
-            "92": 0.14798,
-            "93": 0.14876,
-            "94": 0.14705,
-            "95": 0.14837,
-            "96": 0.14805,
-            "97": 0.14837,
-            "98": 0.14721,
-            "99": 0.14843,
-            "100": 0.14828
+            "1": 3.59409,
+            "2": 0.17465,
+            "3": 0.16266,
+            "4": 0.1495,
+            "5": 0.14527,
+            "6": 0.14428,
+            "7": 0.14381,
+            "8": 0.14313,
+            "9": 0.14427,
+            "10": 0.14389,
+            "11": 0.1443,
+            "12": 0.14275,
+            "13": 0.1429,
+            "14": 0.14279,
+            "15": 0.14378,
+            "16": 0.14358,
+            "17": 0.14299,
+            "18": 0.14217,
+            "19": 0.14256,
+            "20": 0.14345,
+            "21": 0.14367,
+            "22": 0.14305,
+            "23": 0.14257,
+            "24": 0.14186,
+            "25": 0.1423,
+            "26": 0.14156,
+            "27": 0.14279,
+            "28": 0.14152,
+            "29": 0.14248,
+            "30": 0.14222,
+            "31": 0.14276,
+            "32": 0.14268,
+            "33": 0.14313,
+            "34": 0.14133,
+            "35": 0.14312,
+            "36": 0.14147,
+            "37": 0.14217,
+            "38": 0.14071,
+            "39": 0.14226,
+            "40": 0.14163,
+            "41": 0.14393,
+            "42": 0.14189,
+            "43": 0.14266,
+            "44": 0.14185,
+            "45": 0.1438,
+            "46": 0.14173,
+            "47": 0.14272,
+            "48": 0.14379,
+            "49": 0.14245,
+            "50": 0.1422,
+            "51": 0.1491,
+            "52": 0.16902,
+            "53": 0.14276,
+            "54": 0.14121,
+            "55": 0.14203,
+            "56": 0.14111,
+            "57": 0.14215,
+            "58": 0.14121,
+            "59": 0.14274,
+            "60": 0.14079,
+            "61": 0.14212,
+            "62": 0.14078,
+            "63": 0.14277,
+            "64": 0.14264,
+            "65": 0.14256,
+            "66": 0.14207,
+            "67": 0.14426,
+            "68": 0.14138,
+            "69": 0.14293,
+            "70": 0.1423,
+            "71": 0.14265,
+            "72": 0.14181,
+            "73": 0.14253,
+            "74": 0.14239,
+            "75": 0.1436,
+            "76": 0.14184,
+            "77": 0.14185,
+            "78": 0.14261,
+            "79": 0.14322,
+            "80": 0.14295,
+            "81": 0.14304,
+            "82": 0.14307,
+            "83": 0.14253,
+            "84": 0.14179,
+            "85": 0.14257,
+            "86": 0.14198,
+            "87": 0.15027,
+            "88": 0.14143,
+            "89": 0.14408,
+            "90": 0.14207,
+            "91": 0.14351,
+            "92": 0.14216,
+            "93": 0.14223,
+            "94": 0.14137,
+            "95": 0.14285,
+            "96": 0.14202,
+            "97": 0.14246,
+            "98": 0.1411,
+            "99": 0.14199,
+            "100": 0.14181
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..34c3b02116b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84055,
+            "52": 9.73438,
+            "53": 10.05482,
+            "54": 9.94058,
+            "55": 9.87124,
+            "56": 9.61045,
+            "57": 9.46116,
+            "58": 9.81654,
+            "59": 9.57887,
+            "60": 9.48507,
+            "61": 9.68515,
+            "62": 9.97438,
+            "63": 9.36298,
+            "64": 9.76793,
+            "65": 8.93913,
+            "66": 9.68918,
+            "67": 9.36638,
+            "68": 9.77507,
+            "69": 9.78344,
+            "70": 9.72196,
+            "71": 9.60806,
+            "72": 9.57714,
+            "73": 9.48934,
+            "74": 8.94008,
+            "75": 9.40867,
+            "76": 9.08075,
+            "77": 10.05717,
+            "78": 9.72281,
+            "79": 9.36465,
+            "80": 9.39746,
+            "81": 9.47553,
+            "82": 9.6886,
+            "83": 9.30263,
+            "84": 9.41008,
+            "85": 9.60793,
+            "86": 9.07115,
+            "87": 9.58676,
+            "88": 9.74129,
+            "89": 9.5986,
+            "90": 9.81041,
+            "91": 9.33113,
+            "92": 9.35502,
+            "93": 9.07481,
+            "94": 8.82745,
+            "95": 9.51149,
+            "96": 9.51876,
+            "97": 9.30173,
+            "98": 9.66726,
+            "99": 8.88087,
+            "100": 9.39727
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2574.0,
+            "52": 2457.0,
+            "53": 2905.0,
+            "54": 2609.0,
+            "55": 2220.0,
+            "56": 2663.0,
+            "57": 2258.0,
+            "58": 2898.0,
+            "59": 2676.0,
+            "60": 2397.0,
+            "61": 3048.0,
+            "62": 2533.0,
+            "63": 2370.0,
+            "64": 2975.0,
+            "65": 2591.0,
+            "66": 3065.0,
+            "67": 2732.0,
+            "68": 2870.0,
+            "69": 2955.0,
+            "70": 3112.0,
+            "71": 2989.0,
+            "72": 2451.0,
+            "73": 2881.0,
+            "74": 1859.0,
+            "75": 2649.0,
+            "76": 3026.0,
+            "77": 3316.0,
+            "78": 3212.0,
+            "79": 3183.0,
+            "80": 3262.0,
+            "81": 3669.0,
+            "82": 3187.0,
+            "83": 2798.0,
+            "84": 3209.0,
+            "85": 3309.0,
+            "86": 2738.0,
+            "87": 3804.0,
+            "88": 2989.0,
+            "89": 3327.0,
+            "90": 3031.0,
+            "91": 2720.0,
+            "92": 2972.0,
+            "93": 2719.0,
+            "94": 3387.0,
+            "95": 3321.0,
+            "96": 3342.0,
+            "97": 3191.0,
+            "98": 3533.0,
+            "99": 3214.0,
+            "100": 3318.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 716834304.0,
+            "52": 716834304.0,
+            "53": 716834304.0,
+            "54": 716834304.0,
+            "55": 716834304.0,
+            "56": 716834304.0,
+            "57": 716834304.0,
+            "58": 716834304.0,
+            "59": 716834304.0,
+            "60": 716834304.0,
+            "61": 716834304.0,
+            "62": 716834304.0,
+            "63": 716834304.0,
+            "64": 716834304.0,
+            "65": 716834304.0,
+            "66": 716834304.0,
+            "67": 716834304.0,
+            "68": 716834304.0,
+            "69": 716834304.0,
+            "70": 716834304.0,
+            "71": 716834304.0,
+            "72": 716834304.0,
+            "73": 716834304.0,
+            "74": 716834304.0,
+            "75": 716834304.0,
+            "76": 716834304.0,
+            "77": 716834304.0,
+            "78": 716834304.0,
+            "79": 716834304.0,
+            "80": 716834304.0,
+            "81": 716834304.0,
+            "82": 716834304.0,
+            "83": 716834304.0,
+            "84": 716834304.0,
+            "85": 716834304.0,
+            "86": 716834304.0,
+            "87": 716834304.0,
+            "88": 716834304.0,
+            "89": 716834304.0,
+            "90": 716834304.0,
+            "91": 716834304.0,
+            "92": 716834304.0,
+            "93": 716834304.0,
+            "94": 716834304.0,
+            "95": 716834304.0,
+            "96": 716834304.0,
+            "97": 716834304.0,
+            "98": 716834304.0,
+            "99": 716834304.0,
+            "100": 716834304.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2194121728.0,
+            "52": 2194122752.0,
+            "53": 2194122752.0,
+            "54": 2194122752.0,
+            "55": 2194122752.0,
+            "56": 2194122752.0,
+            "57": 2194122752.0,
+            "58": 2194122752.0,
+            "59": 2194122752.0,
+            "60": 2194122752.0,
+            "61": 2194122752.0,
+            "62": 2194122752.0,
+            "63": 2194122752.0,
+            "64": 2194122752.0,
+            "65": 2194122752.0,
+            "66": 2194122752.0,
+            "67": 2194122752.0,
+            "68": 2194122752.0,
+            "69": 2194122752.0,
+            "70": 2194122752.0,
+            "71": 2194122752.0,
+            "72": 2194122752.0,
+            "73": 2194122752.0,
+            "74": 2194122752.0,
+            "75": 2194122752.0,
+            "76": 2194122752.0,
+            "77": 2194122752.0,
+            "78": 2194122752.0,
+            "79": 2194122752.0,
+            "80": 2194122752.0,
+            "81": 2194122752.0,
+            "82": 2194122752.0,
+            "83": 2194122752.0,
+            "84": 2194122752.0,
+            "85": 2194122752.0,
+            "86": 2194122752.0,
+            "87": 2194122752.0,
+            "88": 2194122752.0,
+            "89": 2194122752.0,
+            "90": 2194122752.0,
+            "91": 2194122752.0,
+            "92": 2194122752.0,
+            "93": 2194122752.0,
+            "94": 2194122752.0,
+            "95": 2194122752.0,
+            "96": 2194122752.0,
+            "97": 2194122752.0,
+            "98": 2194122752.0,
+            "99": 2194122752.0,
+            "100": 2194122752.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.8238,
+            "52": 0.63078,
+            "53": 0.15101,
+            "54": 0.14953,
+            "55": 0.15024,
+            "56": 0.14932,
+            "57": 0.15011,
+            "58": 0.15001,
+            "59": 0.15206,
+            "60": 0.14938,
+            "61": 0.1487,
+            "62": 0.14818,
+            "63": 0.14803,
+            "64": 0.15056,
+            "65": 0.14975,
+            "66": 0.14796,
+            "67": 0.14853,
+            "68": 0.14679,
+            "69": 0.14809,
+            "70": 0.14665,
+            "71": 0.14693,
+            "72": 0.1481,
+            "73": 0.14536,
+            "74": 0.14342,
+            "75": 0.14313,
+            "76": 0.14287,
+            "77": 0.14085,
+            "78": 0.14168,
+            "79": 0.14286,
+            "80": 0.14201,
+            "81": 0.14225,
+            "82": 0.14262,
+            "83": 0.14349,
+            "84": 0.14179,
+            "85": 0.14222,
+            "86": 0.14195,
+            "87": 0.14171,
+            "88": 0.14105,
+            "89": 0.14252,
+            "90": 0.14411,
+            "91": 0.1446,
+            "92": 0.14295,
+            "93": 0.14308,
+            "94": 0.14176,
+            "95": 0.14267,
+            "96": 0.14302,
+            "97": 0.14305,
+            "98": 0.14273,
+            "99": 0.14183,
+            "100": 0.14202
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..1fc5ef869c5
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.74992,
+            "2": 10.77613,
+            "3": 10.75714,
+            "4": 10.72305,
+            "5": 10.80036,
+            "6": 10.821,
+            "7": 10.77176,
+            "8": 10.7988,
+            "9": 10.77447,
+            "10": 10.70645,
+            "11": 10.8328,
+            "12": 10.81872,
+            "13": 10.83078,
+            "14": 10.83381,
+            "15": 10.76396,
+            "16": 10.76573,
+            "17": 10.71925,
+            "18": 10.76797,
+            "19": 10.75316,
+            "20": 10.70911,
+            "21": 10.69217,
+            "22": 10.56534,
+            "23": 10.70907,
+            "24": 10.6159,
+            "25": 10.55058,
+            "26": 10.62591,
+            "27": 10.64705,
+            "28": 10.63623,
+            "29": 10.65641,
+            "30": 10.43675,
+            "31": 10.21912,
+            "32": 10.5512,
+            "33": 10.53381,
+            "34": 10.31821,
+            "35": 10.36833,
+            "36": 10.3562,
+            "37": 10.46302,
+            "38": 10.33833,
+            "39": 10.50306,
+            "40": 10.23446,
+            "41": 10.27335,
+            "42": 10.3295,
+            "43": 9.97414,
+            "44": 10.1075,
+            "45": 9.98853,
+            "46": 9.95474,
+            "47": 10.2514,
+            "48": 10.01228,
+            "49": 9.70796,
+            "50": 10.05505,
+            "51": 9.9812,
+            "52": 9.89198,
+            "53": 10.19208,
+            "54": 10.09574,
+            "55": 10.00506,
+            "56": 9.78714,
+            "57": 9.64607,
+            "58": 9.9862,
+            "59": 9.72684,
+            "60": 9.67172,
+            "61": 9.80984,
+            "62": 10.11126,
+            "63": 9.54877,
+            "64": 9.90929,
+            "65": 9.08735,
+            "66": 9.84659,
+            "67": 9.48264,
+            "68": 9.89439,
+            "69": 9.87695,
+            "70": 9.82469,
+            "71": 9.72751,
+            "72": 9.72911,
+            "73": 9.62051,
+            "74": 9.11601,
+            "75": 9.55057,
+            "76": 9.21504,
+            "77": 10.14893,
+            "78": 9.8138,
+            "79": 9.47515,
+            "80": 9.51582,
+            "81": 9.58685,
+            "82": 9.79026,
+            "83": 9.45587,
+            "84": 9.50503,
+            "85": 9.71387,
+            "86": 9.17463,
+            "87": 9.66601,
+            "88": 9.84354,
+            "89": 9.70734,
+            "90": 9.8955,
+            "91": 9.48652,
+            "92": 9.47023,
+            "93": 9.21481,
+            "94": 8.94327,
+            "95": 9.6154,
+            "96": 9.63634,
+            "97": 9.37644,
+            "98": 9.74975,
+            "99": 9.01753,
+            "100": 9.50515
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2656.0,
+            "2": 2663.0,
+            "3": 2673.0,
+            "4": 2426.0,
+            "5": 2931.0,
+            "6": 3062.0,
+            "7": 2591.0,
+            "8": 2693.0,
+            "9": 2713.0,
+            "10": 2502.0,
+            "11": 2904.0,
+            "12": 2792.0,
+            "13": 2979.0,
+            "14": 3000.0,
+            "15": 2952.0,
+            "16": 2860.0,
+            "17": 2717.0,
+            "18": 2802.0,
+            "19": 2868.0,
+            "20": 2620.0,
+            "21": 2792.0,
+            "22": 2532.0,
+            "23": 2701.0,
+            "24": 2580.0,
+            "25": 2466.0,
+            "26": 2839.0,
+            "27": 2703.0,
+            "28": 2719.0,
+            "29": 2971.0,
+            "30": 2755.0,
+            "31": 2448.0,
+            "32": 2670.0,
+            "33": 2791.0,
+            "34": 2439.0,
+            "35": 2662.0,
+            "36": 2496.0,
+            "37": 2806.0,
+            "38": 2697.0,
+            "39": 2786.0,
+            "40": 2539.0,
+            "41": 2605.0,
+            "42": 2640.0,
+            "43": 2324.0,
+            "44": 2548.0,
+            "45": 2291.0,
+            "46": 2437.0,
+            "47": 2605.0,
+            "48": 2395.0,
+            "49": 2478.0,
+            "50": 2633.0,
+            "51": 2676.0,
+            "52": 2581.0,
+            "53": 2898.0,
+            "54": 2849.0,
+            "55": 2548.0,
+            "56": 2661.0,
+            "57": 2510.0,
+            "58": 2758.0,
+            "59": 2650.0,
+            "60": 2242.0,
+            "61": 2628.0,
+            "62": 2899.0,
+            "63": 2605.0,
+            "64": 2939.0,
+            "65": 2572.0,
+            "66": 2896.0,
+            "67": 2640.0,
+            "68": 2709.0,
+            "69": 2889.0,
+            "70": 3012.0,
+            "71": 2978.0,
+            "72": 2536.0,
+            "73": 2964.0,
+            "74": 2163.0,
+            "75": 2603.0,
+            "76": 2974.0,
+            "77": 3007.0,
+            "78": 3138.0,
+            "79": 3197.0,
+            "80": 2984.0,
+            "81": 3280.0,
+            "82": 3341.0,
+            "83": 2757.0,
+            "84": 3399.0,
+            "85": 3320.0,
+            "86": 2882.0,
+            "87": 3407.0,
+            "88": 3278.0,
+            "89": 3336.0,
+            "90": 3322.0,
+            "91": 2472.0,
+            "92": 3061.0,
+            "93": 2911.0,
+            "94": 3005.0,
+            "95": 2984.0,
+            "96": 2991.0,
+            "97": 3178.0,
+            "98": 3343.0,
+            "99": 2929.0,
+            "100": 2588.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 745732608.0,
+            "2": 745732608.0,
+            "3": 745732608.0,
+            "4": 745732608.0,
+            "5": 745732608.0,
+            "6": 745732608.0,
+            "7": 745732608.0,
+            "8": 745732608.0,
+            "9": 745732608.0,
+            "10": 745732608.0,
+            "11": 745732608.0,
+            "12": 745732608.0,
+            "13": 745732608.0,
+            "14": 745732608.0,
+            "15": 745732608.0,
+            "16": 745732608.0,
+            "17": 745732608.0,
+            "18": 745732608.0,
+            "19": 745732608.0,
+            "20": 745732608.0,
+            "21": 745732608.0,
+            "22": 745732608.0,
+            "23": 745732608.0,
+            "24": 745732608.0,
+            "25": 745732608.0,
+            "26": 745732608.0,
+            "27": 745732608.0,
+            "28": 745732608.0,
+            "29": 745732608.0,
+            "30": 745732608.0,
+            "31": 745732608.0,
+            "32": 745732608.0,
+            "33": 745732608.0,
+            "34": 745732608.0,
+            "35": 745732608.0,
+            "36": 745732608.0,
+            "37": 745732608.0,
+            "38": 745732608.0,
+            "39": 745732608.0,
+            "40": 745732608.0,
+            "41": 745732608.0,
+            "42": 745732608.0,
+            "43": 745732608.0,
+            "44": 745732608.0,
+            "45": 745732608.0,
+            "46": 745732608.0,
+            "47": 745732608.0,
+            "48": 745732608.0,
+            "49": 745732608.0,
+            "50": 745732608.0,
+            "51": 745732608.0,
+            "52": 745732608.0,
+            "53": 745732608.0,
+            "54": 745732608.0,
+            "55": 745732608.0,
+            "56": 745732608.0,
+            "57": 745732608.0,
+            "58": 745732608.0,
+            "59": 745732608.0,
+            "60": 745732608.0,
+            "61": 745732608.0,
+            "62": 745732608.0,
+            "63": 745732608.0,
+            "64": 745732608.0,
+            "65": 745732608.0,
+            "66": 745732608.0,
+            "67": 745732608.0,
+            "68": 745732608.0,
+            "69": 745732608.0,
+            "70": 745732608.0,
+            "71": 745732608.0,
+            "72": 745732608.0,
+            "73": 745732608.0,
+            "74": 745732608.0,
+            "75": 745732608.0,
+            "76": 745732608.0,
+            "77": 745732608.0,
+            "78": 745732608.0,
+            "79": 745732608.0,
+            "80": 745732608.0,
+            "81": 745732608.0,
+            "82": 745732608.0,
+            "83": 745732608.0,
+            "84": 745732608.0,
+            "85": 745732608.0,
+            "86": 745732608.0,
+            "87": 745732608.0,
+            "88": 745732608.0,
+            "89": 745732608.0,
+            "90": 745732608.0,
+            "91": 745732608.0,
+            "92": 745732608.0,
+            "93": 745732608.0,
+            "94": 745732608.0,
+            "95": 745732608.0,
+            "96": 745732608.0,
+            "97": 745732608.0,
+            "98": 745732608.0,
+            "99": 745732608.0,
+            "100": 745732608.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1940442112.0,
+            "2": 2223151104.0,
+            "3": 2223151104.0,
+            "4": 2223151104.0,
+            "5": 2223151104.0,
+            "6": 2223151104.0,
+            "7": 2223151104.0,
+            "8": 2223151104.0,
+            "9": 2223151104.0,
+            "10": 2223151104.0,
+            "11": 2223151104.0,
+            "12": 2223151104.0,
+            "13": 2223151104.0,
+            "14": 2223151104.0,
+            "15": 2223151104.0,
+            "16": 2223151104.0,
+            "17": 2223151104.0,
+            "18": 2223151104.0,
+            "19": 2223151104.0,
+            "20": 2223151104.0,
+            "21": 2223151104.0,
+            "22": 2223151104.0,
+            "23": 2223151104.0,
+            "24": 2223151104.0,
+            "25": 2223151104.0,
+            "26": 2223151104.0,
+            "27": 2223151104.0,
+            "28": 2223151104.0,
+            "29": 2223151104.0,
+            "30": 2223151104.0,
+            "31": 2223151104.0,
+            "32": 2223151104.0,
+            "33": 2223151104.0,
+            "34": 2223151104.0,
+            "35": 2223151104.0,
+            "36": 2223151104.0,
+            "37": 2223151104.0,
+            "38": 2223151104.0,
+            "39": 2223151104.0,
+            "40": 2223151104.0,
+            "41": 2223151104.0,
+            "42": 2223151104.0,
+            "43": 2223151104.0,
+            "44": 2223151104.0,
+            "45": 2223151104.0,
+            "46": 2223151104.0,
+            "47": 2223151104.0,
+            "48": 2223151104.0,
+            "49": 2223151104.0,
+            "50": 2223151104.0,
+            "51": 2223151104.0,
+            "52": 2223151104.0,
+            "53": 2223151104.0,
+            "54": 2223151104.0,
+            "55": 2223151104.0,
+            "56": 2223151104.0,
+            "57": 2223151104.0,
+            "58": 2223151104.0,
+            "59": 2223151104.0,
+            "60": 2223151104.0,
+            "61": 2223151104.0,
+            "62": 2223151104.0,
+            "63": 2223151104.0,
+            "64": 2223151104.0,
+            "65": 2223151104.0,
+            "66": 2223151104.0,
+            "67": 2223151104.0,
+            "68": 2223151104.0,
+            "69": 2223151104.0,
+            "70": 2223151104.0,
+            "71": 2223151104.0,
+            "72": 2223151104.0,
+            "73": 2223151104.0,
+            "74": 2223151104.0,
+            "75": 2223151104.0,
+            "76": 2223151104.0,
+            "77": 2223151104.0,
+            "78": 2223151104.0,
+            "79": 2223151104.0,
+            "80": 2223151104.0,
+            "81": 2223151104.0,
+            "82": 2223151104.0,
+            "83": 2223151104.0,
+            "84": 2223151104.0,
+            "85": 2223151104.0,
+            "86": 2223151104.0,
+            "87": 2223151104.0,
+            "88": 2223151104.0,
+            "89": 2223151104.0,
+            "90": 2223151104.0,
+            "91": 2223151104.0,
+            "92": 2223151104.0,
+            "93": 2223151104.0,
+            "94": 2223151104.0,
+            "95": 2223151104.0,
+            "96": 2223151104.0,
+            "97": 2223151104.0,
+            "98": 2223151104.0,
+            "99": 2223151104.0,
+            "100": 2223151104.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.5568,
+            "3": 0.14788,
+            "4": 0.13602,
+            "5": 0.13596,
+            "6": 0.136,
+            "7": 0.13621,
+            "8": 0.13502,
+            "9": 0.13408,
+            "10": 0.23083,
+            "11": 0.14377,
+            "12": 0.14332,
+            "13": 0.15453,
+            "14": 0.15537,
+            "15": 0.15549,
+            "16": 0.15444,
+            "17": 0.15453,
+            "18": 0.15178,
+            "19": 0.21432,
+            "20": 0.15336,
+            "21": 0.1534,
+            "22": 0.15483,
+            "23": 0.15395,
+            "24": 0.15469,
+            "25": 0.15447,
+            "26": 0.15509,
+            "27": 0.1545,
+            "28": 0.15527,
+            "29": 0.15593,
+            "30": 0.15688,
+            "31": 0.15659,
+            "32": 0.15629,
+            "33": 0.15533,
+            "34": 0.155,
+            "35": 0.15519,
+            "36": 0.15784,
+            "37": 0.15943,
+            "38": 0.15552,
+            "39": 0.15486,
+            "40": 0.15539,
+            "41": 0.15618,
+            "42": 0.15569,
+            "43": 0.15725,
+            "44": 0.15522,
+            "45": 0.1553,
+            "46": 0.15719,
+            "47": 0.15571,
+            "48": 0.15568,
+            "49": 0.15362,
+            "50": 0.15495,
+            "51": 0.18287,
+            "52": 0.16115,
+            "53": 0.15739,
+            "54": 0.15665,
+            "55": 0.15684,
+            "56": 0.15658,
+            "57": 0.15631,
+            "58": 0.22153,
+            "59": 0.15604,
+            "60": 0.15313,
+            "61": 0.15485,
+            "62": 0.15518,
+            "63": 0.15719,
+            "64": 0.15757,
+            "65": 0.15904,
+            "66": 0.15846,
+            "67": 0.15846,
+            "68": 0.15754,
+            "69": 0.15779,
+            "70": 0.1589,
+            "71": 0.16037,
+            "72": 0.15778,
+            "73": 0.15771,
+            "74": 0.155,
+            "75": 0.15611,
+            "76": 0.15702,
+            "77": 0.15564,
+            "78": 0.15892,
+            "79": 0.15669,
+            "80": 0.15768,
+            "81": 0.15805,
+            "82": 0.15778,
+            "83": 0.15674,
+            "84": 0.15715,
+            "85": 0.15834,
+            "86": 0.15763,
+            "87": 0.15855,
+            "88": 0.15589,
+            "89": 0.15616,
+            "90": 0.15639,
+            "91": 0.15722,
+            "92": 0.15788,
+            "93": 0.15597,
+            "94": 0.15817,
+            "95": 0.15819,
+            "96": 0.15869,
+            "97": 0.15875,
+            "98": 0.15993,
+            "99": 0.16297,
+            "100": 0.16682
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..2cf6e26ff95
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.9812,
+            "52": 9.89198,
+            "53": 10.19208,
+            "54": 10.09574,
+            "55": 10.00506,
+            "56": 9.78714,
+            "57": 9.64607,
+            "58": 9.9862,
+            "59": 9.72684,
+            "60": 9.67172,
+            "61": 9.80984,
+            "62": 10.11126,
+            "63": 9.54877,
+            "64": 9.90929,
+            "65": 9.08735,
+            "66": 9.84659,
+            "67": 9.48264,
+            "68": 9.89439,
+            "69": 9.87695,
+            "70": 9.82469,
+            "71": 9.72751,
+            "72": 9.72911,
+            "73": 9.62051,
+            "74": 9.11601,
+            "75": 9.55057,
+            "76": 9.21504,
+            "77": 10.14893,
+            "78": 9.8138,
+            "79": 9.47515,
+            "80": 9.51582,
+            "81": 9.58685,
+            "82": 9.79026,
+            "83": 9.45587,
+            "84": 9.50503,
+            "85": 9.71387,
+            "86": 9.17463,
+            "87": 9.66601,
+            "88": 9.84354,
+            "89": 9.70734,
+            "90": 9.8955,
+            "91": 9.48652,
+            "92": 9.47023,
+            "93": 9.21481,
+            "94": 8.94327,
+            "95": 9.6154,
+            "96": 9.63634,
+            "97": 9.37644,
+            "98": 9.74975,
+            "99": 9.01753,
+            "100": 9.50515
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2676.0,
+            "52": 2581.0,
+            "53": 2898.0,
+            "54": 2849.0,
+            "55": 2548.0,
+            "56": 2661.0,
+            "57": 2510.0,
+            "58": 2758.0,
+            "59": 2650.0,
+            "60": 2242.0,
+            "61": 2628.0,
+            "62": 2899.0,
+            "63": 2605.0,
+            "64": 2939.0,
+            "65": 2572.0,
+            "66": 2896.0,
+            "67": 2640.0,
+            "68": 2709.0,
+            "69": 2889.0,
+            "70": 3012.0,
+            "71": 2978.0,
+            "72": 2536.0,
+            "73": 2964.0,
+            "74": 2163.0,
+            "75": 2603.0,
+            "76": 2974.0,
+            "77": 3007.0,
+            "78": 3138.0,
+            "79": 3197.0,
+            "80": 2984.0,
+            "81": 3280.0,
+            "82": 3341.0,
+            "83": 2757.0,
+            "84": 3399.0,
+            "85": 3320.0,
+            "86": 2882.0,
+            "87": 3407.0,
+            "88": 3278.0,
+            "89": 3336.0,
+            "90": 3322.0,
+            "91": 2472.0,
+            "92": 3061.0,
+            "93": 2911.0,
+            "94": 3005.0,
+            "95": 2984.0,
+            "96": 2991.0,
+            "97": 3178.0,
+            "98": 3343.0,
+            "99": 2929.0,
+            "100": 2588.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 744880640.0,
+            "52": 744880640.0,
+            "53": 744880640.0,
+            "54": 744880640.0,
+            "55": 744880640.0,
+            "56": 744880640.0,
+            "57": 744880640.0,
+            "58": 744880640.0,
+            "59": 744880640.0,
+            "60": 744880640.0,
+            "61": 744880640.0,
+            "62": 744880640.0,
+            "63": 744880640.0,
+            "64": 744880640.0,
+            "65": 744880640.0,
+            "66": 744880640.0,
+            "67": 744880640.0,
+            "68": 744880640.0,
+            "69": 744880640.0,
+            "70": 744880640.0,
+            "71": 744880640.0,
+            "72": 744880640.0,
+            "73": 744880640.0,
+            "74": 744880640.0,
+            "75": 744880640.0,
+            "76": 744880640.0,
+            "77": 744880640.0,
+            "78": 744880640.0,
+            "79": 744880640.0,
+            "80": 744880640.0,
+            "81": 744880640.0,
+            "82": 744880640.0,
+            "83": 744880640.0,
+            "84": 744880640.0,
+            "85": 744880640.0,
+            "86": 744880640.0,
+            "87": 744880640.0,
+            "88": 744880640.0,
+            "89": 744880640.0,
+            "90": 744880640.0,
+            "91": 744880640.0,
+            "92": 744880640.0,
+            "93": 744880640.0,
+            "94": 744880640.0,
+            "95": 744880640.0,
+            "96": 744880640.0,
+            "97": 744880640.0,
+            "98": 744880640.0,
+            "99": 744880640.0,
+            "100": 744880640.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2222430208.0,
+            "52": 2222431232.0,
+            "53": 2222431232.0,
+            "54": 2222431232.0,
+            "55": 2222431232.0,
+            "56": 2222431232.0,
+            "57": 2222431232.0,
+            "58": 2222431232.0,
+            "59": 2222431232.0,
+            "60": 2222431232.0,
+            "61": 2222431232.0,
+            "62": 2222431232.0,
+            "63": 2222431232.0,
+            "64": 2222431232.0,
+            "65": 2222431232.0,
+            "66": 2222431232.0,
+            "67": 2222431232.0,
+            "68": 2222431232.0,
+            "69": 2222431232.0,
+            "70": 2222431232.0,
+            "71": 2222431232.0,
+            "72": 2222431232.0,
+            "73": 2222431232.0,
+            "74": 2222431232.0,
+            "75": 2222431232.0,
+            "76": 2222431232.0,
+            "77": 2222431232.0,
+            "78": 2222431232.0,
+            "79": 2222431232.0,
+            "80": 2222431232.0,
+            "81": 2222431232.0,
+            "82": 2222431232.0,
+            "83": 2222431232.0,
+            "84": 2222431232.0,
+            "85": 2222431232.0,
+            "86": 2222431232.0,
+            "87": 2222431232.0,
+            "88": 2222431232.0,
+            "89": 2222431232.0,
+            "90": 2222431232.0,
+            "91": 2222431232.0,
+            "92": 2222431232.0,
+            "93": 2222431232.0,
+            "94": 2222431232.0,
+            "95": 2222431232.0,
+            "96": 2222431232.0,
+            "97": 2222431232.0,
+            "98": 2222431232.0,
+            "99": 2222431232.0,
+            "100": 2222431232.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.21538,
+            "53": 0.14615,
+            "54": 0.13599,
+            "55": 0.13518,
+            "56": 0.13401,
+            "57": 0.13944,
+            "58": 0.13509,
+            "59": 0.1377,
+            "60": 0.13698,
+            "61": 0.137,
+            "62": 0.13756,
+            "63": 0.14119,
+            "64": 0.13937,
+            "65": 0.13725,
+            "66": 0.13667,
+            "67": 0.13894,
+            "68": 0.13705,
+            "69": 0.1375,
+            "70": 0.13655,
+            "71": 0.13624,
+            "72": 0.13743,
+            "73": 0.13786,
+            "74": 0.13678,
+            "75": 0.13803,
+            "76": 0.13591,
+            "77": 0.13654,
+            "78": 0.13783,
+            "79": 0.13724,
+            "80": 0.13943,
+            "81": 0.13808,
+            "82": 0.13899,
+            "83": 0.13956,
+            "84": 0.14004,
+            "85": 0.14504,
+            "86": 0.14078,
+            "87": 0.14075,
+            "88": 0.14222,
+            "89": 0.14283,
+            "90": 0.14178,
+            "91": 0.14143,
+            "92": 0.14178,
+            "93": 0.14108,
+            "94": 0.14248,
+            "95": 0.14123,
+            "96": 0.14274,
+            "97": 0.14429,
+            "98": 0.14312,
+            "99": 0.14121,
+            "100": 0.14248
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json
index 245c396be68..42889e09b26 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 745731584.0,
-            "2": 745731584.0,
-            "3": 745731584.0,
-            "4": 745731584.0,
-            "5": 745731584.0,
-            "6": 745731584.0,
-            "7": 745731584.0,
-            "8": 745731584.0,
-            "9": 745731584.0,
-            "10": 745731584.0,
-            "11": 745731584.0,
-            "12": 745731584.0,
-            "13": 745731584.0,
-            "14": 745731584.0,
-            "15": 745731584.0,
-            "16": 745731584.0,
-            "17": 745731584.0,
-            "18": 745731584.0,
-            "19": 745731584.0,
-            "20": 745731584.0,
-            "21": 745731584.0,
-            "22": 745731584.0,
-            "23": 745731584.0,
-            "24": 745731584.0,
-            "25": 745731584.0,
-            "26": 745731584.0,
-            "27": 745731584.0,
-            "28": 745731584.0,
-            "29": 745731584.0,
-            "30": 745731584.0,
-            "31": 745731584.0,
-            "32": 745731584.0,
-            "33": 745731584.0,
-            "34": 745731584.0,
-            "35": 745731584.0,
-            "36": 745731584.0,
-            "37": 745731584.0,
-            "38": 745731584.0,
-            "39": 745731584.0,
-            "40": 745731584.0,
-            "41": 745731584.0,
-            "42": 745731584.0,
-            "43": 745731584.0,
-            "44": 745731584.0,
-            "45": 745731584.0,
-            "46": 745731584.0,
-            "47": 745731584.0,
-            "48": 745731584.0,
-            "49": 745731584.0,
-            "50": 745731584.0,
-            "51": 745731584.0,
-            "52": 745731584.0,
-            "53": 745731584.0,
-            "54": 745731584.0,
-            "55": 745731584.0,
-            "56": 745731584.0,
-            "57": 745731584.0,
-            "58": 745731584.0,
-            "59": 745731584.0,
-            "60": 745731584.0,
-            "61": 745731584.0,
-            "62": 745731584.0,
-            "63": 745731584.0,
-            "64": 745731584.0,
-            "65": 745731584.0,
-            "66": 745731584.0,
-            "67": 745731584.0,
-            "68": 745731584.0,
-            "69": 745731584.0,
-            "70": 745731584.0,
-            "71": 745731584.0,
-            "72": 745731584.0,
-            "73": 745731584.0,
-            "74": 745731584.0,
-            "75": 745731584.0,
-            "76": 745731584.0,
-            "77": 745731584.0,
-            "78": 745731584.0,
-            "79": 745731584.0,
-            "80": 745731584.0,
-            "81": 745731584.0,
-            "82": 745731584.0,
-            "83": 745731584.0,
-            "84": 745731584.0,
-            "85": 745731584.0,
-            "86": 745731584.0,
-            "87": 745731584.0,
-            "88": 745731584.0,
-            "89": 745731584.0,
-            "90": 745731584.0,
-            "91": 745731584.0,
-            "92": 745731584.0,
-            "93": 745731584.0,
-            "94": 745731584.0,
-            "95": 745731584.0,
-            "96": 745731584.0,
-            "97": 745731584.0,
-            "98": 745731584.0,
-            "99": 745731584.0,
-            "100": 745731584.0
+            "1": 744815104.0,
+            "2": 744815104.0,
+            "3": 744815104.0,
+            "4": 744815104.0,
+            "5": 744815104.0,
+            "6": 744815104.0,
+            "7": 744815104.0,
+            "8": 744815104.0,
+            "9": 744815104.0,
+            "10": 744815104.0,
+            "11": 744815104.0,
+            "12": 744815104.0,
+            "13": 744815104.0,
+            "14": 744815104.0,
+            "15": 744815104.0,
+            "16": 744815104.0,
+            "17": 744815104.0,
+            "18": 744815104.0,
+            "19": 744815104.0,
+            "20": 744815104.0,
+            "21": 744815104.0,
+            "22": 744815104.0,
+            "23": 744815104.0,
+            "24": 744815104.0,
+            "25": 744815104.0,
+            "26": 744815104.0,
+            "27": 744815104.0,
+            "28": 744815104.0,
+            "29": 744815104.0,
+            "30": 744815104.0,
+            "31": 744815104.0,
+            "32": 744815104.0,
+            "33": 744815104.0,
+            "34": 744815104.0,
+            "35": 744815104.0,
+            "36": 744815104.0,
+            "37": 744815104.0,
+            "38": 744815104.0,
+            "39": 744815104.0,
+            "40": 744815104.0,
+            "41": 744815104.0,
+            "42": 744815104.0,
+            "43": 744815104.0,
+            "44": 744815104.0,
+            "45": 744815104.0,
+            "46": 744815104.0,
+            "47": 744815104.0,
+            "48": 744815104.0,
+            "49": 744815104.0,
+            "50": 744815104.0,
+            "51": 744815104.0,
+            "52": 744815104.0,
+            "53": 744815104.0,
+            "54": 744815104.0,
+            "55": 744815104.0,
+            "56": 744815104.0,
+            "57": 744815104.0,
+            "58": 744815104.0,
+            "59": 744815104.0,
+            "60": 744815104.0,
+            "61": 744815104.0,
+            "62": 744815104.0,
+            "63": 744815104.0,
+            "64": 744815104.0,
+            "65": 744815104.0,
+            "66": 744815104.0,
+            "67": 744815104.0,
+            "68": 744815104.0,
+            "69": 744815104.0,
+            "70": 744815104.0,
+            "71": 744815104.0,
+            "72": 744815104.0,
+            "73": 744815104.0,
+            "74": 744815104.0,
+            "75": 744815104.0,
+            "76": 744815104.0,
+            "77": 744815104.0,
+            "78": 744815104.0,
+            "79": 744815104.0,
+            "80": 744815104.0,
+            "81": 744815104.0,
+            "82": 744815104.0,
+            "83": 744815104.0,
+            "84": 744815104.0,
+            "85": 744815104.0,
+            "86": 744815104.0,
+            "87": 744815104.0,
+            "88": 744815104.0,
+            "89": 744815104.0,
+            "90": 744815104.0,
+            "91": 744815104.0,
+            "92": 744815104.0,
+            "93": 744815104.0,
+            "94": 744815104.0,
+            "95": 744815104.0,
+            "96": 744815104.0,
+            "97": 744815104.0,
+            "98": 744815104.0,
+            "99": 744815104.0,
+            "100": 744815104.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1928906752.0,
-            "2": 2210568192.0,
-            "3": 2210568192.0,
-            "4": 2210568192.0,
-            "5": 2210568192.0,
-            "6": 2210568192.0,
-            "7": 2210568192.0,
-            "8": 2210568192.0,
-            "9": 2210568192.0,
-            "10": 2210568192.0,
-            "11": 2210568192.0,
-            "12": 2210568192.0,
-            "13": 2210568192.0,
-            "14": 2210568192.0,
-            "15": 2210568192.0,
-            "16": 2210568192.0,
-            "17": 2210568192.0,
-            "18": 2210568192.0,
-            "19": 2210568192.0,
-            "20": 2210568192.0,
-            "21": 2210568192.0,
-            "22": 2210568192.0,
-            "23": 2210568192.0,
-            "24": 2210568192.0,
-            "25": 2210568192.0,
-            "26": 2210568192.0,
-            "27": 2210568192.0,
-            "28": 2210568192.0,
-            "29": 2210568192.0,
-            "30": 2210568192.0,
-            "31": 2210568192.0,
-            "32": 2210568192.0,
-            "33": 2210568192.0,
-            "34": 2210568192.0,
-            "35": 2210568192.0,
-            "36": 2210568192.0,
-            "37": 2210568192.0,
-            "38": 2210568192.0,
-            "39": 2210568192.0,
-            "40": 2210568192.0,
-            "41": 2210568192.0,
-            "42": 2210568192.0,
-            "43": 2210568192.0,
-            "44": 2210568192.0,
-            "45": 2210568192.0,
-            "46": 2210568192.0,
-            "47": 2210568192.0,
-            "48": 2210568192.0,
-            "49": 2210568192.0,
-            "50": 2210568192.0,
-            "51": 2210568192.0,
-            "52": 2210568192.0,
-            "53": 2210568192.0,
-            "54": 2210568192.0,
-            "55": 2210568192.0,
-            "56": 2210568192.0,
-            "57": 2210568192.0,
-            "58": 2210568192.0,
-            "59": 2210568192.0,
-            "60": 2210568192.0,
-            "61": 2210568192.0,
-            "62": 2210568192.0,
-            "63": 2210568192.0,
-            "64": 2210568192.0,
-            "65": 2210568192.0,
-            "66": 2210568192.0,
-            "67": 2210568192.0,
-            "68": 2210568192.0,
-            "69": 2210568192.0,
-            "70": 2210568192.0,
-            "71": 2210568192.0,
-            "72": 2210568192.0,
-            "73": 2210568192.0,
-            "74": 2210568192.0,
-            "75": 2210568192.0,
-            "76": 2210568192.0,
-            "77": 2210568192.0,
-            "78": 2210568192.0,
-            "79": 2210568192.0,
-            "80": 2210568192.0,
-            "81": 2210568192.0,
-            "82": 2210568192.0,
-            "83": 2210568192.0,
-            "84": 2210568192.0,
-            "85": 2210568192.0,
-            "86": 2210568192.0,
-            "87": 2210568192.0,
-            "88": 2210568192.0,
-            "89": 2210568192.0,
-            "90": 2210568192.0,
-            "91": 2210568192.0,
-            "92": 2210568192.0,
-            "93": 2210568192.0,
-            "94": 2210568192.0,
-            "95": 2210568192.0,
-            "96": 2210568192.0,
-            "97": 2210568192.0,
-            "98": 2210568192.0,
-            "99": 2210568192.0,
-            "100": 2210568192.0
+            "1": 1928907776.0,
+            "2": 2210305536.0,
+            "3": 2210305536.0,
+            "4": 2210305536.0,
+            "5": 2210305536.0,
+            "6": 2210305536.0,
+            "7": 2210305536.0,
+            "8": 2210305536.0,
+            "9": 2210305536.0,
+            "10": 2210305536.0,
+            "11": 2210305536.0,
+            "12": 2210305536.0,
+            "13": 2210305536.0,
+            "14": 2210305536.0,
+            "15": 2210305536.0,
+            "16": 2210305536.0,
+            "17": 2210305536.0,
+            "18": 2210305536.0,
+            "19": 2210305536.0,
+            "20": 2210305536.0,
+            "21": 2210305536.0,
+            "22": 2210305536.0,
+            "23": 2210305536.0,
+            "24": 2210305536.0,
+            "25": 2210305536.0,
+            "26": 2210305536.0,
+            "27": 2210305536.0,
+            "28": 2210305536.0,
+            "29": 2210305536.0,
+            "30": 2210305536.0,
+            "31": 2210305536.0,
+            "32": 2210305536.0,
+            "33": 2210305536.0,
+            "34": 2210305536.0,
+            "35": 2210305536.0,
+            "36": 2210305536.0,
+            "37": 2210305536.0,
+            "38": 2210305536.0,
+            "39": 2210305536.0,
+            "40": 2210305536.0,
+            "41": 2210305536.0,
+            "42": 2210305536.0,
+            "43": 2210305536.0,
+            "44": 2210305536.0,
+            "45": 2210305536.0,
+            "46": 2210305536.0,
+            "47": 2210305536.0,
+            "48": 2210305536.0,
+            "49": 2210305536.0,
+            "50": 2210305536.0,
+            "51": 2210305536.0,
+            "52": 2210305536.0,
+            "53": 2210305536.0,
+            "54": 2210305536.0,
+            "55": 2210305536.0,
+            "56": 2210305536.0,
+            "57": 2210305536.0,
+            "58": 2210305536.0,
+            "59": 2210305536.0,
+            "60": 2210305536.0,
+            "61": 2210305536.0,
+            "62": 2210305536.0,
+            "63": 2210305536.0,
+            "64": 2210305536.0,
+            "65": 2210305536.0,
+            "66": 2210305536.0,
+            "67": 2210305536.0,
+            "68": 2210305536.0,
+            "69": 2210305536.0,
+            "70": 2210305536.0,
+            "71": 2210305536.0,
+            "72": 2210305536.0,
+            "73": 2210305536.0,
+            "74": 2210305536.0,
+            "75": 2210305536.0,
+            "76": 2210305536.0,
+            "77": 2210305536.0,
+            "78": 2210305536.0,
+            "79": 2210305536.0,
+            "80": 2210305536.0,
+            "81": 2210305536.0,
+            "82": 2210305536.0,
+            "83": 2210305536.0,
+            "84": 2210305536.0,
+            "85": 2210305536.0,
+            "86": 2210305536.0,
+            "87": 2210305536.0,
+            "88": 2210305536.0,
+            "89": 2210305536.0,
+            "90": 2210305536.0,
+            "91": 2210305536.0,
+            "92": 2210305536.0,
+            "93": 2210305536.0,
+            "94": 2210305536.0,
+            "95": 2210305536.0,
+            "96": 2210305536.0,
+            "97": 2210305536.0,
+            "98": 2210305536.0,
+            "99": 2210305536.0,
+            "100": 2210305536.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 15.33061,
-            "2": 0.15156,
-            "3": 0.12174,
-            "4": 0.12197,
-            "5": 0.12023,
-            "6": 0.11997,
-            "7": 0.11882,
-            "8": 0.11859,
-            "9": 0.11967,
-            "10": 0.11724,
-            "11": 0.11735,
-            "12": 0.11593,
-            "13": 0.11661,
-            "14": 0.11794,
-            "15": 0.11649,
-            "16": 0.11682,
-            "17": 0.11623,
-            "18": 0.11719,
-            "19": 0.11753,
-            "20": 0.11581,
-            "21": 0.11757,
-            "22": 0.11628,
-            "23": 0.11692,
-            "24": 0.1163,
-            "25": 0.1167,
-            "26": 0.11646,
-            "27": 0.11803,
-            "28": 0.11984,
-            "29": 0.11941,
-            "30": 0.11857,
-            "31": 0.11687,
-            "32": 0.11515,
-            "33": 0.11754,
-            "34": 0.11591,
-            "35": 0.11819,
-            "36": 0.11754,
-            "37": 0.11694,
-            "38": 0.11726,
-            "39": 0.11761,
-            "40": 0.11745,
-            "41": 0.11768,
-            "42": 0.11775,
-            "43": 0.11661,
-            "44": 0.11724,
-            "45": 0.1189,
-            "46": 0.11964,
-            "47": 0.11985,
-            "48": 0.12086,
-            "49": 0.11855,
-            "50": 0.11941,
-            "51": 0.13155,
-            "52": 0.12627,
-            "53": 0.12132,
-            "54": 0.12027,
-            "55": 0.12076,
-            "56": 0.14178,
-            "57": 0.12294,
-            "58": 0.12155,
-            "59": 0.11843,
-            "60": 0.11687,
-            "61": 0.11827,
-            "62": 0.11957,
-            "63": 0.11945,
-            "64": 0.11781,
-            "65": 0.12041,
-            "66": 0.11949,
-            "67": 0.12059,
-            "68": 0.11821,
-            "69": 0.11858,
-            "70": 0.11799,
-            "71": 0.12009,
-            "72": 0.12095,
-            "73": 0.11845,
-            "74": 0.11834,
-            "75": 0.11893,
-            "76": 0.1214,
-            "77": 0.1195,
-            "78": 0.11933,
-            "79": 0.11885,
-            "80": 0.11948,
-            "81": 0.12097,
-            "82": 0.12,
-            "83": 0.11954,
-            "84": 0.11693,
-            "85": 0.1175,
-            "86": 0.11941,
-            "87": 0.11723,
-            "88": 0.11941,
-            "89": 0.11804,
-            "90": 0.11751,
-            "91": 0.11952,
-            "92": 0.11778,
-            "93": 0.11924,
-            "94": 0.11755,
-            "95": 0.11789,
-            "96": 0.11673,
-            "97": 0.11967,
-            "98": 0.11752,
-            "99": 0.11926,
-            "100": 0.11806
+            "1": 38.50475,
+            "2": 0.14031,
+            "3": 0.11652,
+            "4": 0.09549,
+            "5": 0.09354,
+            "6": 0.09569,
+            "7": 0.09409,
+            "8": 0.09473,
+            "9": 0.09388,
+            "10": 0.09459,
+            "11": 0.09596,
+            "12": 0.09466,
+            "13": 0.09509,
+            "14": 0.09586,
+            "15": 0.09314,
+            "16": 0.09368,
+            "17": 0.09468,
+            "18": 0.09494,
+            "19": 0.09289,
+            "20": 0.09427,
+            "21": 0.09599,
+            "22": 0.09701,
+            "23": 0.09665,
+            "24": 0.09712,
+            "25": 0.09542,
+            "26": 0.09515,
+            "27": 0.09642,
+            "28": 0.09519,
+            "29": 0.09691,
+            "30": 0.09651,
+            "31": 0.09742,
+            "32": 0.09503,
+            "33": 0.09471,
+            "34": 0.09424,
+            "35": 0.09574,
+            "36": 0.09438,
+            "37": 0.09509,
+            "38": 0.09428,
+            "39": 0.09484,
+            "40": 0.09459,
+            "41": 0.0951,
+            "42": 0.09671,
+            "43": 0.09633,
+            "44": 0.09511,
+            "45": 0.09592,
+            "46": 0.09579,
+            "47": 0.09614,
+            "48": 0.09464,
+            "49": 0.0958,
+            "50": 0.09782,
+            "51": 0.10564,
+            "52": 0.09373,
+            "53": 0.09475,
+            "54": 0.09323,
+            "55": 0.09237,
+            "56": 0.09293,
+            "57": 0.09228,
+            "58": 0.0948,
+            "59": 0.09906,
+            "60": 0.10026,
+            "61": 0.09961,
+            "62": 0.09923,
+            "63": 0.09889,
+            "64": 0.09888,
+            "65": 0.09925,
+            "66": 0.1,
+            "67": 0.09782,
+            "68": 0.09891,
+            "69": 0.09132,
+            "70": 0.09102,
+            "71": 0.091,
+            "72": 0.09368,
+            "73": 0.09219,
+            "74": 0.09374,
+            "75": 0.09232,
+            "76": 0.09428,
+            "77": 0.09256,
+            "78": 0.09623,
+            "79": 0.09624,
+            "80": 0.09622,
+            "81": 0.09668,
+            "82": 0.09651,
+            "83": 0.10042,
+            "84": 0.09998,
+            "85": 0.10102,
+            "86": 0.09975,
+            "87": 0.09955,
+            "88": 0.10135,
+            "89": 0.10038,
+            "90": 0.09933,
+            "91": 0.10071,
+            "92": 0.09992,
+            "93": 0.10054,
+            "94": 0.09927,
+            "95": 0.0998,
+            "96": 0.101,
+            "97": 0.09268,
+            "98": 0.09188,
+            "99": 0.09185,
+            "100": 0.09107
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..eca47cac99b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.00084,
+            "52": 9.89672,
+            "53": 10.19876,
+            "54": 10.09066,
+            "55": 10.00567,
+            "56": 9.77199,
+            "57": 9.64533,
+            "58": 9.98587,
+            "59": 9.72608,
+            "60": 9.6777,
+            "61": 9.8157,
+            "62": 10.092,
+            "63": 9.54758,
+            "64": 9.90438,
+            "65": 9.09492,
+            "66": 9.84068,
+            "67": 9.48471,
+            "68": 9.88996,
+            "69": 9.87691,
+            "70": 9.85294,
+            "71": 9.73278,
+            "72": 9.72558,
+            "73": 9.63706,
+            "74": 9.12334,
+            "75": 9.55335,
+            "76": 9.21765,
+            "77": 10.15202,
+            "78": 9.81465,
+            "79": 9.47558,
+            "80": 9.52073,
+            "81": 9.5872,
+            "82": 9.79125,
+            "83": 9.44848,
+            "84": 9.49585,
+            "85": 9.72189,
+            "86": 9.18037,
+            "87": 9.66127,
+            "88": 9.84359,
+            "89": 9.71651,
+            "90": 9.88102,
+            "91": 9.48434,
+            "92": 9.4705,
+            "93": 9.20911,
+            "94": 8.95382,
+            "95": 9.60554,
+            "96": 9.63976,
+            "97": 9.38762,
+            "98": 9.7573,
+            "99": 9.0159,
+            "100": 9.49925
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2575.0,
+            "52": 2621.0,
+            "53": 2891.0,
+            "54": 2655.0,
+            "55": 2559.0,
+            "56": 2566.0,
+            "57": 2471.0,
+            "58": 2767.0,
+            "59": 2529.0,
+            "60": 2289.0,
+            "61": 2642.0,
+            "62": 2820.0,
+            "63": 2654.0,
+            "64": 3020.0,
+            "65": 2687.0,
+            "66": 2884.0,
+            "67": 2666.0,
+            "68": 2720.0,
+            "69": 2738.0,
+            "70": 3004.0,
+            "71": 2816.0,
+            "72": 2537.0,
+            "73": 2826.0,
+            "74": 2192.0,
+            "75": 2647.0,
+            "76": 3048.0,
+            "77": 3019.0,
+            "78": 3134.0,
+            "79": 3092.0,
+            "80": 3054.0,
+            "81": 3298.0,
+            "82": 3350.0,
+            "83": 2597.0,
+            "84": 3436.0,
+            "85": 3350.0,
+            "86": 2993.0,
+            "87": 3509.0,
+            "88": 3403.0,
+            "89": 3490.0,
+            "90": 3368.0,
+            "91": 2461.0,
+            "92": 2803.0,
+            "93": 2933.0,
+            "94": 2888.0,
+            "95": 3138.0,
+            "96": 3047.0,
+            "97": 3016.0,
+            "98": 3382.0,
+            "99": 2995.0,
+            "100": 2490.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 745929216.0,
+            "52": 745929216.0,
+            "53": 745929216.0,
+            "54": 745929216.0,
+            "55": 745929216.0,
+            "56": 745929216.0,
+            "57": 745929216.0,
+            "58": 745929216.0,
+            "59": 745929216.0,
+            "60": 745929216.0,
+            "61": 745929216.0,
+            "62": 745929216.0,
+            "63": 745929216.0,
+            "64": 745929216.0,
+            "65": 745929216.0,
+            "66": 745929216.0,
+            "67": 745929216.0,
+            "68": 745929216.0,
+            "69": 745929216.0,
+            "70": 745929216.0,
+            "71": 745929216.0,
+            "72": 745929216.0,
+            "73": 745929216.0,
+            "74": 745929216.0,
+            "75": 745929216.0,
+            "76": 745929216.0,
+            "77": 745929216.0,
+            "78": 745929216.0,
+            "79": 745929216.0,
+            "80": 745929216.0,
+            "81": 745929216.0,
+            "82": 745929216.0,
+            "83": 745929216.0,
+            "84": 745929216.0,
+            "85": 745929216.0,
+            "86": 745929216.0,
+            "87": 745929216.0,
+            "88": 745929216.0,
+            "89": 745929216.0,
+            "90": 745929216.0,
+            "91": 745929216.0,
+            "92": 745929216.0,
+            "93": 745929216.0,
+            "94": 745929216.0,
+            "95": 745929216.0,
+            "96": 745929216.0,
+            "97": 745929216.0,
+            "98": 745929216.0,
+            "99": 745929216.0,
+            "100": 745929216.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2209847296.0,
+            "52": 2209848320.0,
+            "53": 2209848320.0,
+            "54": 2209848320.0,
+            "55": 2209848320.0,
+            "56": 2209848320.0,
+            "57": 2209848320.0,
+            "58": 2209848320.0,
+            "59": 2209848320.0,
+            "60": 2209848320.0,
+            "61": 2209848320.0,
+            "62": 2209848320.0,
+            "63": 2209848320.0,
+            "64": 2209848320.0,
+            "65": 2209848320.0,
+            "66": 2209848320.0,
+            "67": 2209848320.0,
+            "68": 2209848320.0,
+            "69": 2209848320.0,
+            "70": 2209848320.0,
+            "71": 2209848320.0,
+            "72": 2209848320.0,
+            "73": 2209848320.0,
+            "74": 2209848320.0,
+            "75": 2209848320.0,
+            "76": 2209848320.0,
+            "77": 2209848320.0,
+            "78": 2209848320.0,
+            "79": 2209848320.0,
+            "80": 2209848320.0,
+            "81": 2209848320.0,
+            "82": 2209848320.0,
+            "83": 2209848320.0,
+            "84": 2209848320.0,
+            "85": 2209848320.0,
+            "86": 2209848320.0,
+            "87": 2209848320.0,
+            "88": 2209848320.0,
+            "89": 2209848320.0,
+            "90": 2209848320.0,
+            "91": 2209848320.0,
+            "92": 2209848320.0,
+            "93": 2209848320.0,
+            "94": 2209848320.0,
+            "95": 2209848320.0,
+            "96": 2209848320.0,
+            "97": 2209848320.0,
+            "98": 2209848320.0,
+            "99": 2209848320.0,
+            "100": 2209848320.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 37.2947,
+            "52": 0.14072,
+            "53": 0.09482,
+            "54": 0.09404,
+            "55": 0.09449,
+            "56": 0.09381,
+            "57": 0.09346,
+            "58": 0.09378,
+            "59": 0.095,
+            "60": 0.09392,
+            "61": 0.09499,
+            "62": 0.09499,
+            "63": 0.09735,
+            "64": 0.10206,
+            "65": 0.09653,
+            "66": 0.09566,
+            "67": 0.09553,
+            "68": 0.09405,
+            "69": 0.09463,
+            "70": 0.09396,
+            "71": 0.09424,
+            "72": 0.0967,
+            "73": 0.09895,
+            "74": 0.09633,
+            "75": 0.0965,
+            "76": 0.09665,
+            "77": 0.10127,
+            "78": 0.10066,
+            "79": 0.10529,
+            "80": 0.10669,
+            "81": 0.10018,
+            "82": 0.09658,
+            "83": 0.09504,
+            "84": 0.0941,
+            "85": 0.09377,
+            "86": 0.09642,
+            "87": 0.09327,
+            "88": 0.09416,
+            "89": 0.09453,
+            "90": 0.09434,
+            "91": 0.09472,
+            "92": 0.09416,
+            "93": 0.09427,
+            "94": 0.09459,
+            "95": 0.09437,
+            "96": 0.09352,
+            "97": 0.09986,
+            "98": 0.09365,
+            "99": 0.09441,
+            "100": 0.094
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..34725e2965a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.97513,
+            "2": 10.97995,
+            "3": 10.98066,
+            "4": 10.99791,
+            "5": 10.96412,
+            "6": 10.95966,
+            "7": 10.97622,
+            "8": 10.97531,
+            "9": 10.97506,
+            "10": 10.97665,
+            "11": 10.92846,
+            "12": 10.9494,
+            "13": 10.94009,
+            "14": 10.93747,
+            "15": 10.92917,
+            "16": 10.91904,
+            "17": 10.90495,
+            "18": 10.89425,
+            "19": 10.89215,
+            "20": 10.81808,
+            "21": 10.7816,
+            "22": 10.70813,
+            "23": 10.7819,
+            "24": 10.69774,
+            "25": 10.66245,
+            "26": 10.69992,
+            "27": 10.68419,
+            "28": 10.62061,
+            "29": 10.62277,
+            "30": 10.45367,
+            "31": 10.24899,
+            "32": 10.52222,
+            "33": 10.51211,
+            "34": 10.30154,
+            "35": 10.34384,
+            "36": 10.30677,
+            "37": 10.38891,
+            "38": 10.24857,
+            "39": 10.44177,
+            "40": 10.16246,
+            "41": 10.20434,
+            "42": 10.26319,
+            "43": 9.9082,
+            "44": 10.01995,
+            "45": 9.91152,
+            "46": 9.886,
+            "47": 10.18408,
+            "48": 9.9033,
+            "49": 9.59959,
+            "50": 9.96198,
+            "51": 9.90259,
+            "52": 9.79281,
+            "53": 10.11536,
+            "54": 9.99216,
+            "55": 9.91665,
+            "56": 9.66015,
+            "57": 9.52038,
+            "58": 9.87094,
+            "59": 9.6209,
+            "60": 9.54952,
+            "61": 9.70012,
+            "62": 10.00629,
+            "63": 9.42168,
+            "64": 9.79893,
+            "65": 8.97548,
+            "66": 9.73165,
+            "67": 9.38933,
+            "68": 9.80066,
+            "69": 9.81152,
+            "70": 9.76761,
+            "71": 9.63356,
+            "72": 9.59892,
+            "73": 9.51708,
+            "74": 8.96512,
+            "75": 9.43589,
+            "76": 9.11207,
+            "77": 10.06881,
+            "78": 9.72515,
+            "79": 9.39985,
+            "80": 9.41154,
+            "81": 9.50094,
+            "82": 9.69861,
+            "83": 9.33578,
+            "84": 9.4341,
+            "85": 9.63907,
+            "86": 9.06166,
+            "87": 9.60563,
+            "88": 9.77626,
+            "89": 9.6243,
+            "90": 9.82766,
+            "91": 9.35869,
+            "92": 9.38066,
+            "93": 9.09681,
+            "94": 8.83995,
+            "95": 9.52751,
+            "96": 9.53562,
+            "97": 9.32689,
+            "98": 9.69354,
+            "99": 8.88933,
+            "100": 9.42104
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 22726972.0,
+            "2": 22924386.0,
+            "3": 22597036.0,
+            "4": 23219218.0,
+            "5": 22714492.0,
+            "6": 23021698.0,
+            "7": 22771376.0,
+            "8": 22926820.0,
+            "9": 22841276.0,
+            "10": 22918392.0,
+            "11": 22500620.0,
+            "12": 22459672.0,
+            "13": 22917468.0,
+            "14": 22388398.0,
+            "15": 22822252.0,
+            "16": 22830612.0,
+            "17": 22820228.0,
+            "18": 22582844.0,
+            "19": 22618412.0,
+            "20": 22693594.0,
+            "21": 22739320.0,
+            "22": 22800076.0,
+            "23": 22539112.0,
+            "24": 22770966.0,
+            "25": 22819404.0,
+            "26": 22548188.0,
+            "27": 22468652.0,
+            "28": 22453560.0,
+            "29": 22530344.0,
+            "30": 22630776.0,
+            "31": 22955664.0,
+            "32": 22585020.0,
+            "33": 22558760.0,
+            "34": 22835536.0,
+            "35": 22787790.0,
+            "36": 22589526.0,
+            "37": 22497640.0,
+            "38": 22896056.0,
+            "39": 22802282.0,
+            "40": 22657698.0,
+            "41": 22659592.0,
+            "42": 22666980.0,
+            "43": 22976392.0,
+            "44": 22747128.0,
+            "45": 22674364.0,
+            "46": 22883920.0,
+            "47": 22634300.0,
+            "48": 22928164.0,
+            "49": 22728710.0,
+            "50": 22904340.0,
+            "51": 22791436.0,
+            "52": 22748292.0,
+            "53": 22924772.0,
+            "54": 22840284.0,
+            "55": 22517880.0,
+            "56": 22877730.0,
+            "57": 23113080.0,
+            "58": 22845568.0,
+            "59": 22716022.0,
+            "60": 22743056.0,
+            "61": 22724434.0,
+            "62": 22672316.0,
+            "63": 22846416.0,
+            "64": 22823178.0,
+            "65": 23061654.0,
+            "66": 22729712.0,
+            "67": 22908434.0,
+            "68": 22610444.0,
+            "69": 22584604.0,
+            "70": 22828526.0,
+            "71": 22748442.0,
+            "72": 22655052.0,
+            "73": 22740588.0,
+            "74": 23048316.0,
+            "75": 23054664.0,
+            "76": 22901072.0,
+            "77": 22272198.0,
+            "78": 22789244.0,
+            "79": 22743700.0,
+            "80": 22706576.0,
+            "81": 22890704.0,
+            "82": 22778282.0,
+            "83": 22840256.0,
+            "84": 23010368.0,
+            "85": 22711796.0,
+            "86": 23103236.0,
+            "87": 22735120.0,
+            "88": 22636998.0,
+            "89": 22498612.0,
+            "90": 22972652.0,
+            "91": 22767776.0,
+            "92": 22809424.0,
+            "93": 22658980.0,
+            "94": 22911920.0,
+            "95": 23047890.0,
+            "96": 22828804.0,
+            "97": 22608196.0,
+            "98": 22762820.0,
+            "99": 22906714.0,
+            "100": 23016048.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 748541440.0,
+            "2": 748541440.0,
+            "3": 748541440.0,
+            "4": 748541440.0,
+            "5": 748541440.0,
+            "6": 748541440.0,
+            "7": 748541440.0,
+            "8": 748541440.0,
+            "9": 748541440.0,
+            "10": 748541440.0,
+            "11": 748541440.0,
+            "12": 748541440.0,
+            "13": 748541440.0,
+            "14": 748541440.0,
+            "15": 748541440.0,
+            "16": 748541440.0,
+            "17": 748541440.0,
+            "18": 748541440.0,
+            "19": 748541440.0,
+            "20": 748541440.0,
+            "21": 748541440.0,
+            "22": 748541440.0,
+            "23": 748541440.0,
+            "24": 748541440.0,
+            "25": 748541440.0,
+            "26": 748541440.0,
+            "27": 748541440.0,
+            "28": 748541440.0,
+            "29": 748541440.0,
+            "30": 748541440.0,
+            "31": 748541440.0,
+            "32": 748541440.0,
+            "33": 748541440.0,
+            "34": 748541440.0,
+            "35": 748541440.0,
+            "36": 748541440.0,
+            "37": 748541440.0,
+            "38": 748541440.0,
+            "39": 748541440.0,
+            "40": 748541440.0,
+            "41": 748541440.0,
+            "42": 748541440.0,
+            "43": 748541440.0,
+            "44": 748541440.0,
+            "45": 748541440.0,
+            "46": 748541440.0,
+            "47": 748541440.0,
+            "48": 748541440.0,
+            "49": 748541440.0,
+            "50": 748541440.0,
+            "51": 748541440.0,
+            "52": 748541440.0,
+            "53": 748541440.0,
+            "54": 748541440.0,
+            "55": 748541440.0,
+            "56": 748541440.0,
+            "57": 748541440.0,
+            "58": 748541440.0,
+            "59": 748541440.0,
+            "60": 748541440.0,
+            "61": 748541440.0,
+            "62": 748541440.0,
+            "63": 748541440.0,
+            "64": 748541440.0,
+            "65": 748541440.0,
+            "66": 748541440.0,
+            "67": 748541440.0,
+            "68": 748541440.0,
+            "69": 748541440.0,
+            "70": 748541440.0,
+            "71": 748541440.0,
+            "72": 748541440.0,
+            "73": 748541440.0,
+            "74": 748541440.0,
+            "75": 748541440.0,
+            "76": 748541440.0,
+            "77": 748541440.0,
+            "78": 748541440.0,
+            "79": 748541440.0,
+            "80": 748541440.0,
+            "81": 748541440.0,
+            "82": 748541440.0,
+            "83": 748541440.0,
+            "84": 748541440.0,
+            "85": 748541440.0,
+            "86": 748541440.0,
+            "87": 748541440.0,
+            "88": 748541440.0,
+            "89": 748541440.0,
+            "90": 748541440.0,
+            "91": 748541440.0,
+            "92": 748541440.0,
+            "93": 748541440.0,
+            "94": 748541440.0,
+            "95": 748541440.0,
+            "96": 748541440.0,
+            "97": 748541440.0,
+            "98": 748541440.0,
+            "99": 748541440.0,
+            "100": 748541440.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1939923968.0,
+            "2": 2224781312.0,
+            "3": 2224781312.0,
+            "4": 2224781312.0,
+            "5": 2224781312.0,
+            "6": 2224781312.0,
+            "7": 2224781312.0,
+            "8": 2224781312.0,
+            "9": 2224781312.0,
+            "10": 2224781312.0,
+            "11": 2224781312.0,
+            "12": 2224781312.0,
+            "13": 2224781312.0,
+            "14": 2224781312.0,
+            "15": 2224781312.0,
+            "16": 2224781312.0,
+            "17": 2224781312.0,
+            "18": 2224781312.0,
+            "19": 2224781312.0,
+            "20": 2224781312.0,
+            "21": 2224781312.0,
+            "22": 2224781312.0,
+            "23": 2224781312.0,
+            "24": 2224781312.0,
+            "25": 2224781312.0,
+            "26": 2224781312.0,
+            "27": 2224781312.0,
+            "28": 2224781312.0,
+            "29": 2224781312.0,
+            "30": 2224781312.0,
+            "31": 2224781312.0,
+            "32": 2224781312.0,
+            "33": 2224781312.0,
+            "34": 2224781312.0,
+            "35": 2224781312.0,
+            "36": 2224781312.0,
+            "37": 2224781312.0,
+            "38": 2224781312.0,
+            "39": 2224781312.0,
+            "40": 2224781312.0,
+            "41": 2224781312.0,
+            "42": 2224781312.0,
+            "43": 2224781312.0,
+            "44": 2224781312.0,
+            "45": 2224781312.0,
+            "46": 2224781312.0,
+            "47": 2224781312.0,
+            "48": 2224781312.0,
+            "49": 2224781312.0,
+            "50": 2224781312.0,
+            "51": 2224781312.0,
+            "52": 2224781312.0,
+            "53": 2224781312.0,
+            "54": 2224781312.0,
+            "55": 2224781312.0,
+            "56": 2224781312.0,
+            "57": 2224781312.0,
+            "58": 2224781312.0,
+            "59": 2224781312.0,
+            "60": 2224781312.0,
+            "61": 2224781312.0,
+            "62": 2224781312.0,
+            "63": 2224781312.0,
+            "64": 2224781312.0,
+            "65": 2224781312.0,
+            "66": 2224781312.0,
+            "67": 2224781312.0,
+            "68": 2224781312.0,
+            "69": 2224781312.0,
+            "70": 2224781312.0,
+            "71": 2224781312.0,
+            "72": 2224781312.0,
+            "73": 2224781312.0,
+            "74": 2224781312.0,
+            "75": 2224781312.0,
+            "76": 2224781312.0,
+            "77": 2224781312.0,
+            "78": 2224781312.0,
+            "79": 2224781312.0,
+            "80": 2224781312.0,
+            "81": 2224781312.0,
+            "82": 2224781312.0,
+            "83": 2224781312.0,
+            "84": 2224781312.0,
+            "85": 2224781312.0,
+            "86": 2224781312.0,
+            "87": 2224781312.0,
+            "88": 2224781312.0,
+            "89": 2224781312.0,
+            "90": 2224781312.0,
+            "91": 2224781312.0,
+            "92": 2224781312.0,
+            "93": 2224781312.0,
+            "94": 2224781312.0,
+            "95": 2224781312.0,
+            "96": 2224781312.0,
+            "97": 2224781312.0,
+            "98": 2224781312.0,
+            "99": 2224781312.0,
+            "100": 2224781312.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.33774,
+            "3": 0.14705,
+            "4": 0.13192,
+            "5": 0.13017,
+            "6": 0.1292,
+            "7": 0.13024,
+            "8": 0.13032,
+            "9": 0.12858,
+            "10": 0.12948,
+            "11": 0.12888,
+            "12": 0.12888,
+            "13": 0.12811,
+            "14": 0.12943,
+            "15": 0.12948,
+            "16": 0.1295,
+            "17": 0.13022,
+            "18": 0.12847,
+            "19": 0.12992,
+            "20": 0.1308,
+            "21": 0.12844,
+            "22": 0.13063,
+            "23": 0.13033,
+            "24": 0.13003,
+            "25": 0.12935,
+            "26": 0.13016,
+            "27": 0.12989,
+            "28": 0.12947,
+            "29": 0.12857,
+            "30": 0.12949,
+            "31": 0.12997,
+            "32": 0.12843,
+            "33": 0.1291,
+            "34": 0.12894,
+            "35": 0.13061,
+            "36": 0.12974,
+            "37": 0.12939,
+            "38": 0.13039,
+            "39": 0.13034,
+            "40": 0.13069,
+            "41": 0.13259,
+            "42": 0.13109,
+            "43": 0.13211,
+            "44": 0.1299,
+            "45": 0.1295,
+            "46": 0.13001,
+            "47": 0.13037,
+            "48": 0.13043,
+            "49": 0.13012,
+            "50": 0.12915,
+            "51": 0.14665,
+            "52": 0.12869,
+            "53": 0.12717,
+            "54": 0.12709,
+            "55": 0.12611,
+            "56": 0.12645,
+            "57": 0.12711,
+            "58": 0.12728,
+            "59": 0.1269,
+            "60": 0.12701,
+            "61": 0.1281,
+            "62": 0.12781,
+            "63": 0.12842,
+            "64": 0.12745,
+            "65": 0.12897,
+            "66": 0.12786,
+            "67": 0.12983,
+            "68": 0.13068,
+            "69": 0.1284,
+            "70": 0.12896,
+            "71": 0.1288,
+            "72": 0.13026,
+            "73": 0.13011,
+            "74": 0.12891,
+            "75": 0.12798,
+            "76": 0.12866,
+            "77": 0.12994,
+            "78": 0.12957,
+            "79": 0.12765,
+            "80": 0.12884,
+            "81": 0.12898,
+            "82": 0.12927,
+            "83": 0.12848,
+            "84": 0.12845,
+            "85": 0.12849,
+            "86": 0.12983,
+            "87": 0.1303,
+            "88": 0.12961,
+            "89": 0.13093,
+            "90": 0.12951,
+            "91": 0.12818,
+            "92": 0.12902,
+            "93": 0.12967,
+            "94": 0.13419,
+            "95": 0.14029,
+            "96": 0.1429,
+            "97": 0.14018,
+            "98": 0.13632,
+            "99": 0.14128,
+            "100": 0.14034
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..dd354f801de
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.90259,
+            "52": 9.79281,
+            "53": 10.11536,
+            "54": 9.99216,
+            "55": 9.91665,
+            "56": 9.66015,
+            "57": 9.52038,
+            "58": 9.87094,
+            "59": 9.6209,
+            "60": 9.54952,
+            "61": 9.70012,
+            "62": 10.00629,
+            "63": 9.42168,
+            "64": 9.79893,
+            "65": 8.97548,
+            "66": 9.73165,
+            "67": 9.38933,
+            "68": 9.80066,
+            "69": 9.81152,
+            "70": 9.76761,
+            "71": 9.63356,
+            "72": 9.59892,
+            "73": 9.51708,
+            "74": 8.96512,
+            "75": 9.43589,
+            "76": 9.11207,
+            "77": 10.06881,
+            "78": 9.72515,
+            "79": 9.39985,
+            "80": 9.41154,
+            "81": 9.50094,
+            "82": 9.69861,
+            "83": 9.33578,
+            "84": 9.4341,
+            "85": 9.63907,
+            "86": 9.06166,
+            "87": 9.60563,
+            "88": 9.77626,
+            "89": 9.6243,
+            "90": 9.82766,
+            "91": 9.35869,
+            "92": 9.38066,
+            "93": 9.09681,
+            "94": 8.83995,
+            "95": 9.52751,
+            "96": 9.53562,
+            "97": 9.32689,
+            "98": 9.69354,
+            "99": 8.88933,
+            "100": 9.42104
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 22791436.0,
+            "52": 22748292.0,
+            "53": 22924772.0,
+            "54": 22840284.0,
+            "55": 22517880.0,
+            "56": 22877730.0,
+            "57": 23113080.0,
+            "58": 22845568.0,
+            "59": 22716022.0,
+            "60": 22743056.0,
+            "61": 22724434.0,
+            "62": 22672316.0,
+            "63": 22846416.0,
+            "64": 22823178.0,
+            "65": 23061654.0,
+            "66": 22729712.0,
+            "67": 22908434.0,
+            "68": 22610444.0,
+            "69": 22584604.0,
+            "70": 22828526.0,
+            "71": 22748442.0,
+            "72": 22655052.0,
+            "73": 22740588.0,
+            "74": 23048316.0,
+            "75": 23054664.0,
+            "76": 22901072.0,
+            "77": 22272198.0,
+            "78": 22789244.0,
+            "79": 22743700.0,
+            "80": 22706576.0,
+            "81": 22890704.0,
+            "82": 22778282.0,
+            "83": 22840256.0,
+            "84": 23010368.0,
+            "85": 22711796.0,
+            "86": 23103236.0,
+            "87": 22735120.0,
+            "88": 22636998.0,
+            "89": 22498612.0,
+            "90": 22972652.0,
+            "91": 22767776.0,
+            "92": 22809424.0,
+            "93": 22658980.0,
+            "94": 22911920.0,
+            "95": 23047890.0,
+            "96": 22828804.0,
+            "97": 22608196.0,
+            "98": 22762820.0,
+            "99": 22906714.0,
+            "100": 23016048.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746444288.0,
+            "52": 746444288.0,
+            "53": 746444288.0,
+            "54": 746444288.0,
+            "55": 746444288.0,
+            "56": 746444288.0,
+            "57": 746444288.0,
+            "58": 746444288.0,
+            "59": 746444288.0,
+            "60": 746444288.0,
+            "61": 746444288.0,
+            "62": 746444288.0,
+            "63": 746444288.0,
+            "64": 746444288.0,
+            "65": 746444288.0,
+            "66": 746444288.0,
+            "67": 746444288.0,
+            "68": 746444288.0,
+            "69": 746444288.0,
+            "70": 746444288.0,
+            "71": 746444288.0,
+            "72": 746444288.0,
+            "73": 746444288.0,
+            "74": 746444288.0,
+            "75": 746444288.0,
+            "76": 746444288.0,
+            "77": 746444288.0,
+            "78": 746444288.0,
+            "79": 746444288.0,
+            "80": 746444288.0,
+            "81": 746444288.0,
+            "82": 746444288.0,
+            "83": 746444288.0,
+            "84": 746444288.0,
+            "85": 746444288.0,
+            "86": 746444288.0,
+            "87": 746444288.0,
+            "88": 746444288.0,
+            "89": 746444288.0,
+            "90": 746444288.0,
+            "91": 746444288.0,
+            "92": 746444288.0,
+            "93": 746444288.0,
+            "94": 746444288.0,
+            "95": 746444288.0,
+            "96": 746444288.0,
+            "97": 746444288.0,
+            "98": 746444288.0,
+            "99": 746444288.0,
+            "100": 746444288.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2223731712.0,
+            "52": 2223732736.0,
+            "53": 2223732736.0,
+            "54": 2223732736.0,
+            "55": 2223732736.0,
+            "56": 2223732736.0,
+            "57": 2223732736.0,
+            "58": 2223732736.0,
+            "59": 2223732736.0,
+            "60": 2223732736.0,
+            "61": 2223732736.0,
+            "62": 2223732736.0,
+            "63": 2223732736.0,
+            "64": 2223732736.0,
+            "65": 2223732736.0,
+            "66": 2223732736.0,
+            "67": 2223732736.0,
+            "68": 2223732736.0,
+            "69": 2223732736.0,
+            "70": 2223732736.0,
+            "71": 2223732736.0,
+            "72": 2223732736.0,
+            "73": 2223732736.0,
+            "74": 2223732736.0,
+            "75": 2223732736.0,
+            "76": 2223732736.0,
+            "77": 2223732736.0,
+            "78": 2223732736.0,
+            "79": 2223732736.0,
+            "80": 2223732736.0,
+            "81": 2223732736.0,
+            "82": 2223732736.0,
+            "83": 2223732736.0,
+            "84": 2223732736.0,
+            "85": 2223732736.0,
+            "86": 2223732736.0,
+            "87": 2223732736.0,
+            "88": 2223732736.0,
+            "89": 2223732736.0,
+            "90": 2223732736.0,
+            "91": 2223732736.0,
+            "92": 2223732736.0,
+            "93": 2223732736.0,
+            "94": 2223732736.0,
+            "95": 2223732736.0,
+            "96": 2223732736.0,
+            "97": 2223732736.0,
+            "98": 2223732736.0,
+            "99": 2223732736.0,
+            "100": 2223732736.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.28424,
+            "53": 0.15724,
+            "54": 0.14436,
+            "55": 0.14133,
+            "56": 0.14939,
+            "57": 0.15152,
+            "58": 0.16555,
+            "59": 0.19478,
+            "60": 0.13288,
+            "61": 0.13086,
+            "62": 0.13088,
+            "63": 0.13074,
+            "64": 0.1303,
+            "65": 0.13189,
+            "66": 0.13138,
+            "67": 0.12968,
+            "68": 0.13118,
+            "69": 0.13064,
+            "70": 0.12931,
+            "71": 0.12915,
+            "72": 0.12915,
+            "73": 0.13375,
+            "74": 0.13641,
+            "75": 0.13586,
+            "76": 0.13551,
+            "77": 0.13604,
+            "78": 0.13931,
+            "79": 0.13798,
+            "80": 0.13724,
+            "81": 0.13702,
+            "82": 0.13663,
+            "83": 0.1357,
+            "84": 0.13618,
+            "85": 0.13577,
+            "86": 0.13569,
+            "87": 0.13635,
+            "88": 0.13659,
+            "89": 0.13724,
+            "90": 0.13599,
+            "91": 0.13637,
+            "92": 0.13565,
+            "93": 0.13693,
+            "94": 0.13576,
+            "95": 0.13566,
+            "96": 0.13579,
+            "97": 0.13592,
+            "98": 0.13631,
+            "99": 0.13476,
+            "100": 0.13606
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json
index d3d593b49c2..4943a180a1f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 746443264.0,
-            "2": 746443264.0,
-            "3": 746443264.0,
-            "4": 746443264.0,
-            "5": 746443264.0,
-            "6": 746443264.0,
-            "7": 746443264.0,
-            "8": 746443264.0,
-            "9": 746443264.0,
-            "10": 746443264.0,
-            "11": 746443264.0,
-            "12": 746443264.0,
-            "13": 746443264.0,
-            "14": 746443264.0,
-            "15": 746443264.0,
-            "16": 746443264.0,
-            "17": 746443264.0,
-            "18": 746443264.0,
-            "19": 746443264.0,
-            "20": 746443264.0,
-            "21": 746443264.0,
-            "22": 746443264.0,
-            "23": 746443264.0,
-            "24": 746443264.0,
-            "25": 746443264.0,
-            "26": 746443264.0,
-            "27": 746443264.0,
-            "28": 746443264.0,
-            "29": 746443264.0,
-            "30": 746443264.0,
-            "31": 746443264.0,
-            "32": 746443264.0,
-            "33": 746443264.0,
-            "34": 746443264.0,
-            "35": 746443264.0,
-            "36": 746443264.0,
-            "37": 746443264.0,
-            "38": 746443264.0,
-            "39": 746443264.0,
-            "40": 746443264.0,
-            "41": 746443264.0,
-            "42": 746443264.0,
-            "43": 746443264.0,
-            "44": 746443264.0,
-            "45": 746443264.0,
-            "46": 746443264.0,
-            "47": 746443264.0,
-            "48": 746443264.0,
-            "49": 746443264.0,
-            "50": 746443264.0,
-            "51": 746443264.0,
-            "52": 746443264.0,
-            "53": 746443264.0,
-            "54": 746443264.0,
-            "55": 746443264.0,
-            "56": 746443264.0,
-            "57": 746443264.0,
-            "58": 746443264.0,
-            "59": 746443264.0,
-            "60": 746443264.0,
-            "61": 746443264.0,
-            "62": 746443264.0,
-            "63": 746443264.0,
-            "64": 746443264.0,
-            "65": 746443264.0,
-            "66": 746443264.0,
-            "67": 746443264.0,
-            "68": 746443264.0,
-            "69": 746443264.0,
-            "70": 746443264.0,
-            "71": 746443264.0,
-            "72": 746443264.0,
-            "73": 746443264.0,
-            "74": 746443264.0,
-            "75": 746443264.0,
-            "76": 746443264.0,
-            "77": 746443264.0,
-            "78": 746443264.0,
-            "79": 746443264.0,
-            "80": 746443264.0,
-            "81": 746443264.0,
-            "82": 746443264.0,
-            "83": 746443264.0,
-            "84": 746443264.0,
-            "85": 746443264.0,
-            "86": 746443264.0,
-            "87": 746443264.0,
-            "88": 746443264.0,
-            "89": 746443264.0,
-            "90": 746443264.0,
-            "91": 746443264.0,
-            "92": 746443264.0,
-            "93": 746443264.0,
-            "94": 746443264.0,
-            "95": 746443264.0,
-            "96": 746443264.0,
-            "97": 746443264.0,
-            "98": 746443264.0,
-            "99": 746443264.0,
-            "100": 746443264.0
+            "1": 747492864.0,
+            "2": 747492864.0,
+            "3": 747492864.0,
+            "4": 747492864.0,
+            "5": 747492864.0,
+            "6": 747492864.0,
+            "7": 747492864.0,
+            "8": 747492864.0,
+            "9": 747492864.0,
+            "10": 747492864.0,
+            "11": 747492864.0,
+            "12": 747492864.0,
+            "13": 747492864.0,
+            "14": 747492864.0,
+            "15": 747492864.0,
+            "16": 747492864.0,
+            "17": 747492864.0,
+            "18": 747492864.0,
+            "19": 747492864.0,
+            "20": 747492864.0,
+            "21": 747492864.0,
+            "22": 747492864.0,
+            "23": 747492864.0,
+            "24": 747492864.0,
+            "25": 747492864.0,
+            "26": 747492864.0,
+            "27": 747492864.0,
+            "28": 747492864.0,
+            "29": 747492864.0,
+            "30": 747492864.0,
+            "31": 747492864.0,
+            "32": 747492864.0,
+            "33": 747492864.0,
+            "34": 747492864.0,
+            "35": 747492864.0,
+            "36": 747492864.0,
+            "37": 747492864.0,
+            "38": 747492864.0,
+            "39": 747492864.0,
+            "40": 747492864.0,
+            "41": 747492864.0,
+            "42": 747492864.0,
+            "43": 747492864.0,
+            "44": 747492864.0,
+            "45": 747492864.0,
+            "46": 747492864.0,
+            "47": 747492864.0,
+            "48": 747492864.0,
+            "49": 747492864.0,
+            "50": 747492864.0,
+            "51": 747492864.0,
+            "52": 747492864.0,
+            "53": 747492864.0,
+            "54": 747492864.0,
+            "55": 747492864.0,
+            "56": 747492864.0,
+            "57": 747492864.0,
+            "58": 747492864.0,
+            "59": 747492864.0,
+            "60": 747492864.0,
+            "61": 747492864.0,
+            "62": 747492864.0,
+            "63": 747492864.0,
+            "64": 747492864.0,
+            "65": 747492864.0,
+            "66": 747492864.0,
+            "67": 747492864.0,
+            "68": 747492864.0,
+            "69": 747492864.0,
+            "70": 747492864.0,
+            "71": 747492864.0,
+            "72": 747492864.0,
+            "73": 747492864.0,
+            "74": 747492864.0,
+            "75": 747492864.0,
+            "76": 747492864.0,
+            "77": 747492864.0,
+            "78": 747492864.0,
+            "79": 747492864.0,
+            "80": 747492864.0,
+            "81": 747492864.0,
+            "82": 747492864.0,
+            "83": 747492864.0,
+            "84": 747492864.0,
+            "85": 747492864.0,
+            "86": 747492864.0,
+            "87": 747492864.0,
+            "88": 747492864.0,
+            "89": 747492864.0,
+            "90": 747492864.0,
+            "91": 747492864.0,
+            "92": 747492864.0,
+            "93": 747492864.0,
+            "94": 747492864.0,
+            "95": 747492864.0,
+            "96": 747492864.0,
+            "97": 747492864.0,
+            "98": 747492864.0,
+            "99": 747492864.0,
+            "100": 747492864.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1926291456.0,
-            "2": 2210100224.0,
-            "3": 2210100224.0,
-            "4": 2210100224.0,
-            "5": 2210100224.0,
-            "6": 2210100224.0,
-            "7": 2210100224.0,
-            "8": 2210100224.0,
-            "9": 2210100224.0,
-            "10": 2210100224.0,
-            "11": 2210100224.0,
-            "12": 2210100224.0,
-            "13": 2210100224.0,
-            "14": 2210100224.0,
-            "15": 2210100224.0,
-            "16": 2210100224.0,
-            "17": 2210100224.0,
-            "18": 2210100224.0,
-            "19": 2210100224.0,
-            "20": 2210100224.0,
-            "21": 2210100224.0,
-            "22": 2210100224.0,
-            "23": 2210100224.0,
-            "24": 2210100224.0,
-            "25": 2210100224.0,
-            "26": 2210100224.0,
-            "27": 2210100224.0,
-            "28": 2210100224.0,
-            "29": 2210100224.0,
-            "30": 2210100224.0,
-            "31": 2210100224.0,
-            "32": 2210100224.0,
-            "33": 2210100224.0,
-            "34": 2210100224.0,
-            "35": 2210100224.0,
-            "36": 2210100224.0,
-            "37": 2210100224.0,
-            "38": 2210100224.0,
-            "39": 2210100224.0,
-            "40": 2210100224.0,
-            "41": 2210100224.0,
-            "42": 2210100224.0,
-            "43": 2210100224.0,
-            "44": 2210100224.0,
-            "45": 2210100224.0,
-            "46": 2210100224.0,
-            "47": 2210100224.0,
-            "48": 2210100224.0,
-            "49": 2210100224.0,
-            "50": 2210100224.0,
-            "51": 2210100224.0,
-            "52": 2210100224.0,
-            "53": 2210100224.0,
-            "54": 2210100224.0,
-            "55": 2210100224.0,
-            "56": 2210100224.0,
-            "57": 2210100224.0,
-            "58": 2210100224.0,
-            "59": 2210100224.0,
-            "60": 2210100224.0,
-            "61": 2210100224.0,
-            "62": 2210100224.0,
-            "63": 2210100224.0,
-            "64": 2210100224.0,
-            "65": 2210100224.0,
-            "66": 2210100224.0,
-            "67": 2210100224.0,
-            "68": 2210100224.0,
-            "69": 2210100224.0,
-            "70": 2210100224.0,
-            "71": 2210100224.0,
-            "72": 2210100224.0,
-            "73": 2210100224.0,
-            "74": 2210100224.0,
-            "75": 2210100224.0,
-            "76": 2210100224.0,
-            "77": 2210100224.0,
-            "78": 2210100224.0,
-            "79": 2210100224.0,
-            "80": 2210100224.0,
-            "81": 2210100224.0,
-            "82": 2210100224.0,
-            "83": 2210100224.0,
-            "84": 2210100224.0,
-            "85": 2210100224.0,
-            "86": 2210100224.0,
-            "87": 2210100224.0,
-            "88": 2210100224.0,
-            "89": 2210100224.0,
-            "90": 2210100224.0,
-            "91": 2210100224.0,
-            "92": 2210100224.0,
-            "93": 2210100224.0,
-            "94": 2210100224.0,
-            "95": 2210100224.0,
-            "96": 2210100224.0,
-            "97": 2210100224.0,
-            "98": 2210100224.0,
-            "99": 2210100224.0,
-            "100": 2210100224.0
+            "1": 1927341056.0,
+            "2": 2212197376.0,
+            "3": 2212197376.0,
+            "4": 2212197376.0,
+            "5": 2212197376.0,
+            "6": 2212197376.0,
+            "7": 2212197376.0,
+            "8": 2212197376.0,
+            "9": 2212197376.0,
+            "10": 2212197376.0,
+            "11": 2212197376.0,
+            "12": 2212197376.0,
+            "13": 2212197376.0,
+            "14": 2212197376.0,
+            "15": 2212197376.0,
+            "16": 2212197376.0,
+            "17": 2212197376.0,
+            "18": 2212197376.0,
+            "19": 2212197376.0,
+            "20": 2212197376.0,
+            "21": 2212197376.0,
+            "22": 2212197376.0,
+            "23": 2212197376.0,
+            "24": 2212197376.0,
+            "25": 2212197376.0,
+            "26": 2212197376.0,
+            "27": 2212197376.0,
+            "28": 2212197376.0,
+            "29": 2212197376.0,
+            "30": 2212197376.0,
+            "31": 2212197376.0,
+            "32": 2212197376.0,
+            "33": 2212197376.0,
+            "34": 2212197376.0,
+            "35": 2212197376.0,
+            "36": 2212197376.0,
+            "37": 2212197376.0,
+            "38": 2212197376.0,
+            "39": 2212197376.0,
+            "40": 2212197376.0,
+            "41": 2212197376.0,
+            "42": 2212197376.0,
+            "43": 2212197376.0,
+            "44": 2212197376.0,
+            "45": 2212197376.0,
+            "46": 2212197376.0,
+            "47": 2212197376.0,
+            "48": 2212197376.0,
+            "49": 2212197376.0,
+            "50": 2212197376.0,
+            "51": 2212197376.0,
+            "52": 2212197376.0,
+            "53": 2212197376.0,
+            "54": 2212197376.0,
+            "55": 2212197376.0,
+            "56": 2212197376.0,
+            "57": 2212197376.0,
+            "58": 2212197376.0,
+            "59": 2212197376.0,
+            "60": 2212197376.0,
+            "61": 2212197376.0,
+            "62": 2212197376.0,
+            "63": 2212197376.0,
+            "64": 2212197376.0,
+            "65": 2212197376.0,
+            "66": 2212197376.0,
+            "67": 2212197376.0,
+            "68": 2212197376.0,
+            "69": 2212197376.0,
+            "70": 2212197376.0,
+            "71": 2212197376.0,
+            "72": 2212197376.0,
+            "73": 2212197376.0,
+            "74": 2212197376.0,
+            "75": 2212197376.0,
+            "76": 2212197376.0,
+            "77": 2212197376.0,
+            "78": 2212197376.0,
+            "79": 2212197376.0,
+            "80": 2212197376.0,
+            "81": 2212197376.0,
+            "82": 2212197376.0,
+            "83": 2212197376.0,
+            "84": 2212197376.0,
+            "85": 2212197376.0,
+            "86": 2212197376.0,
+            "87": 2212197376.0,
+            "88": 2212197376.0,
+            "89": 2212197376.0,
+            "90": 2212197376.0,
+            "91": 2212197376.0,
+            "92": 2212197376.0,
+            "93": 2212197376.0,
+            "94": 2212197376.0,
+            "95": 2212197376.0,
+            "96": 2212197376.0,
+            "97": 2212197376.0,
+            "98": 2212197376.0,
+            "99": 2212197376.0,
+            "100": 2212197376.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 14.49723,
-            "2": 0.13917,
-            "3": 0.12323,
-            "4": 0.12243,
-            "5": 0.12247,
-            "6": 0.12126,
-            "7": 0.12098,
-            "8": 0.1227,
-            "9": 0.12232,
-            "10": 0.12216,
-            "11": 0.12203,
-            "12": 0.12472,
-            "13": 0.11919,
-            "14": 0.12363,
-            "15": 0.11934,
-            "16": 0.12078,
-            "17": 0.1214,
-            "18": 0.12382,
-            "19": 0.11938,
-            "20": 0.11818,
-            "21": 0.1195,
-            "22": 0.1193,
-            "23": 0.11729,
-            "24": 0.11671,
-            "25": 0.11812,
-            "26": 0.11788,
-            "27": 0.11835,
-            "28": 0.11687,
-            "29": 0.11683,
-            "30": 0.1185,
-            "31": 0.11738,
-            "32": 0.11696,
-            "33": 0.11541,
-            "34": 0.11482,
-            "35": 0.11307,
-            "36": 0.11445,
-            "37": 0.11503,
-            "38": 0.11448,
-            "39": 0.11562,
-            "40": 0.11468,
-            "41": 0.11341,
-            "42": 0.11368,
-            "43": 0.11604,
-            "44": 0.11649,
-            "45": 0.11581,
-            "46": 0.11637,
-            "47": 0.11699,
-            "48": 0.11661,
-            "49": 0.11522,
-            "50": 0.11451,
-            "51": 0.12299,
-            "52": 0.11449,
-            "53": 0.11137,
-            "54": 0.11274,
-            "55": 0.1121,
-            "56": 0.11212,
-            "57": 0.11573,
-            "58": 0.11206,
-            "59": 0.11388,
-            "60": 0.11369,
-            "61": 0.11208,
-            "62": 0.11287,
-            "63": 0.11238,
-            "64": 0.11193,
-            "65": 0.11205,
-            "66": 0.11482,
-            "67": 0.1131,
-            "68": 0.11433,
-            "69": 0.11257,
-            "70": 0.1116,
-            "71": 0.11365,
-            "72": 0.11214,
-            "73": 0.11376,
-            "74": 0.11389,
-            "75": 0.11397,
-            "76": 0.11359,
-            "77": 0.11346,
-            "78": 0.11235,
-            "79": 0.11282,
-            "80": 0.11301,
-            "81": 0.11347,
-            "82": 0.11356,
-            "83": 0.11321,
-            "84": 0.11412,
-            "85": 0.11256,
-            "86": 0.11555,
-            "87": 0.11224,
-            "88": 0.11344,
-            "89": 0.11351,
-            "90": 0.11218,
-            "91": 0.11235,
-            "92": 0.11417,
-            "93": 0.11691,
-            "94": 0.11326,
-            "95": 0.11519,
-            "96": 0.11321,
-            "97": 0.11272,
-            "98": 0.11268,
-            "99": 0.11187,
-            "100": 0.11371
+            "1": 9.78643,
+            "2": 0.13398,
+            "3": 0.11557,
+            "4": 0.09095,
+            "5": 0.09137,
+            "6": 0.09276,
+            "7": 0.09034,
+            "8": 0.09082,
+            "9": 0.09002,
+            "10": 0.09121,
+            "11": 0.08989,
+            "12": 0.0895,
+            "13": 0.09015,
+            "14": 0.09012,
+            "15": 0.0903,
+            "16": 0.09019,
+            "17": 0.0907,
+            "18": 0.09055,
+            "19": 0.08988,
+            "20": 0.08984,
+            "21": 0.08951,
+            "22": 0.0913,
+            "23": 0.08972,
+            "24": 0.08995,
+            "25": 0.09008,
+            "26": 0.08931,
+            "27": 0.09055,
+            "28": 0.08926,
+            "29": 0.09028,
+            "30": 0.09142,
+            "31": 0.09085,
+            "32": 0.09027,
+            "33": 0.09061,
+            "34": 0.08998,
+            "35": 0.09113,
+            "36": 0.09039,
+            "37": 0.08973,
+            "38": 0.09065,
+            "39": 0.08993,
+            "40": 0.09112,
+            "41": 0.10695,
+            "42": 0.11371,
+            "43": 0.09964,
+            "44": 0.09076,
+            "45": 0.0899,
+            "46": 0.09204,
+            "47": 0.0904,
+            "48": 0.08998,
+            "49": 0.09097,
+            "50": 0.08971,
+            "51": 0.10825,
+            "52": 0.097,
+            "53": 0.09456,
+            "54": 0.09109,
+            "55": 0.09071,
+            "56": 0.09099,
+            "57": 0.09129,
+            "58": 0.09159,
+            "59": 0.09138,
+            "60": 0.09089,
+            "61": 0.09092,
+            "62": 0.09153,
+            "63": 0.09208,
+            "64": 0.09107,
+            "65": 0.0918,
+            "66": 0.09116,
+            "67": 0.09075,
+            "68": 0.09166,
+            "69": 0.0948,
+            "70": 0.09166,
+            "71": 0.09195,
+            "72": 0.09271,
+            "73": 0.09226,
+            "74": 0.09271,
+            "75": 0.09216,
+            "76": 0.09129,
+            "77": 0.09221,
+            "78": 0.09252,
+            "79": 0.09161,
+            "80": 0.09144,
+            "81": 0.09112,
+            "82": 0.09152,
+            "83": 0.09106,
+            "84": 0.09137,
+            "85": 0.09127,
+            "86": 0.09136,
+            "87": 0.09077,
+            "88": 0.09362,
+            "89": 0.09244,
+            "90": 0.09162,
+            "91": 0.09114,
+            "92": 0.09065,
+            "93": 0.0913,
+            "94": 0.09071,
+            "95": 0.09096,
+            "96": 0.09066,
+            "97": 0.09585,
+            "98": 0.09148,
+            "99": 0.09232,
+            "100": 0.09229
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..2c197fd4e6b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8866,
+            "52": 9.78429,
+            "53": 10.10842,
+            "54": 9.97368,
+            "55": 9.89803,
+            "56": 9.65427,
+            "57": 9.52013,
+            "58": 9.87297,
+            "59": 9.6132,
+            "60": 9.54967,
+            "61": 9.70681,
+            "62": 9.98533,
+            "63": 9.41357,
+            "64": 9.80966,
+            "65": 8.97052,
+            "66": 9.72773,
+            "67": 9.39183,
+            "68": 9.8084,
+            "69": 9.82052,
+            "70": 9.76655,
+            "71": 9.63414,
+            "72": 9.60485,
+            "73": 9.52299,
+            "74": 8.9718,
+            "75": 9.42321,
+            "76": 9.10113,
+            "77": 10.0716,
+            "78": 9.74266,
+            "79": 9.40343,
+            "80": 9.41333,
+            "81": 9.49931,
+            "82": 9.70236,
+            "83": 9.33436,
+            "84": 9.43774,
+            "85": 9.63924,
+            "86": 9.07931,
+            "87": 9.60447,
+            "88": 9.7824,
+            "89": 9.62386,
+            "90": 9.84241,
+            "91": 9.35506,
+            "92": 9.38398,
+            "93": 9.09747,
+            "94": 8.8471,
+            "95": 9.5314,
+            "96": 9.54263,
+            "97": 9.32886,
+            "98": 9.6926,
+            "99": 8.89976,
+            "100": 9.43124
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 22791108.0,
+            "52": 22748190.0,
+            "53": 22924900.0,
+            "54": 22840164.0,
+            "55": 22518344.0,
+            "56": 22877680.0,
+            "57": 23113944.0,
+            "58": 22846268.0,
+            "59": 22716084.0,
+            "60": 22742984.0,
+            "61": 22724584.0,
+            "62": 22672944.0,
+            "63": 22846388.0,
+            "64": 22823650.0,
+            "65": 23061058.0,
+            "66": 22729266.0,
+            "67": 22908888.0,
+            "68": 22610020.0,
+            "69": 22583826.0,
+            "70": 22829374.0,
+            "71": 22748240.0,
+            "72": 22654480.0,
+            "73": 22741180.0,
+            "74": 23047914.0,
+            "75": 23054396.0,
+            "76": 22900788.0,
+            "77": 22271588.0,
+            "78": 22789024.0,
+            "79": 22743632.0,
+            "80": 22706696.0,
+            "81": 22891372.0,
+            "82": 22777860.0,
+            "83": 22840532.0,
+            "84": 23010386.0,
+            "85": 22711212.0,
+            "86": 23103006.0,
+            "87": 22734564.0,
+            "88": 22637848.0,
+            "89": 22497850.0,
+            "90": 22972712.0,
+            "91": 22767188.0,
+            "92": 22808834.0,
+            "93": 22659304.0,
+            "94": 22911552.0,
+            "95": 23047794.0,
+            "96": 22829386.0,
+            "97": 22608168.0,
+            "98": 22762756.0,
+            "99": 22905900.0,
+            "100": 23015488.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 746444288.0,
+            "52": 746444288.0,
+            "53": 746444288.0,
+            "54": 746444288.0,
+            "55": 746444288.0,
+            "56": 746444288.0,
+            "57": 746444288.0,
+            "58": 746444288.0,
+            "59": 746444288.0,
+            "60": 746444288.0,
+            "61": 746444288.0,
+            "62": 746444288.0,
+            "63": 746444288.0,
+            "64": 746444288.0,
+            "65": 746444288.0,
+            "66": 746444288.0,
+            "67": 746444288.0,
+            "68": 746444288.0,
+            "69": 746444288.0,
+            "70": 746444288.0,
+            "71": 746444288.0,
+            "72": 746444288.0,
+            "73": 746444288.0,
+            "74": 746444288.0,
+            "75": 746444288.0,
+            "76": 746444288.0,
+            "77": 746444288.0,
+            "78": 746444288.0,
+            "79": 746444288.0,
+            "80": 746444288.0,
+            "81": 746444288.0,
+            "82": 746444288.0,
+            "83": 746444288.0,
+            "84": 746444288.0,
+            "85": 746444288.0,
+            "86": 746444288.0,
+            "87": 746444288.0,
+            "88": 746444288.0,
+            "89": 746444288.0,
+            "90": 746444288.0,
+            "91": 746444288.0,
+            "92": 746444288.0,
+            "93": 746444288.0,
+            "94": 746444288.0,
+            "95": 746444288.0,
+            "96": 746444288.0,
+            "97": 746444288.0,
+            "98": 746444288.0,
+            "99": 746444288.0,
+            "100": 746444288.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2211148800.0,
+            "52": 2211149824.0,
+            "53": 2211149824.0,
+            "54": 2211149824.0,
+            "55": 2211149824.0,
+            "56": 2211149824.0,
+            "57": 2211149824.0,
+            "58": 2211149824.0,
+            "59": 2211149824.0,
+            "60": 2211149824.0,
+            "61": 2211149824.0,
+            "62": 2211149824.0,
+            "63": 2211149824.0,
+            "64": 2211149824.0,
+            "65": 2211149824.0,
+            "66": 2211149824.0,
+            "67": 2211149824.0,
+            "68": 2211149824.0,
+            "69": 2211149824.0,
+            "70": 2211149824.0,
+            "71": 2211149824.0,
+            "72": 2211149824.0,
+            "73": 2211149824.0,
+            "74": 2211149824.0,
+            "75": 2211149824.0,
+            "76": 2211149824.0,
+            "77": 2211149824.0,
+            "78": 2211149824.0,
+            "79": 2211149824.0,
+            "80": 2211149824.0,
+            "81": 2211149824.0,
+            "82": 2211149824.0,
+            "83": 2211149824.0,
+            "84": 2211149824.0,
+            "85": 2211149824.0,
+            "86": 2211149824.0,
+            "87": 2211149824.0,
+            "88": 2211149824.0,
+            "89": 2211149824.0,
+            "90": 2211149824.0,
+            "91": 2211149824.0,
+            "92": 2211149824.0,
+            "93": 2211149824.0,
+            "94": 2211149824.0,
+            "95": 2211149824.0,
+            "96": 2211149824.0,
+            "97": 2211149824.0,
+            "98": 2211149824.0,
+            "99": 2211149824.0,
+            "100": 2211149824.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.06828,
+            "52": 0.13754,
+            "53": 0.09299,
+            "54": 0.0937,
+            "55": 0.09396,
+            "56": 0.09244,
+            "57": 0.09247,
+            "58": 0.09209,
+            "59": 0.09263,
+            "60": 0.09275,
+            "61": 0.09238,
+            "62": 0.09116,
+            "63": 0.0965,
+            "64": 0.09261,
+            "65": 0.09256,
+            "66": 0.09274,
+            "67": 0.09252,
+            "68": 0.09299,
+            "69": 0.09249,
+            "70": 0.09223,
+            "71": 0.09259,
+            "72": 0.09409,
+            "73": 0.09265,
+            "74": 0.09487,
+            "75": 0.0923,
+            "76": 0.09244,
+            "77": 0.09219,
+            "78": 0.0922,
+            "79": 0.09407,
+            "80": 0.09255,
+            "81": 0.09438,
+            "82": 0.09241,
+            "83": 0.09253,
+            "84": 0.09203,
+            "85": 0.09473,
+            "86": 0.09291,
+            "87": 0.0919,
+            "88": 0.0924,
+            "89": 0.09178,
+            "90": 0.09274,
+            "91": 0.09205,
+            "92": 0.09276,
+            "93": 0.09224,
+            "94": 0.09252,
+            "95": 0.09076,
+            "96": 0.09167,
+            "97": 0.09167,
+            "98": 0.0936,
+            "99": 0.09222,
+            "100": 0.09183
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json
index 307cec2659c..80b22797395 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.1537,
-            "2": 0.18498,
-            "3": 0.16024,
-            "4": 0.16059,
-            "5": 0.16002,
-            "6": 0.16103,
-            "7": 0.1591,
-            "8": 0.15912,
-            "9": 0.15909,
-            "10": 0.1574,
-            "11": 0.15721,
-            "12": 0.15764,
-            "13": 0.16009,
-            "14": 0.16035,
-            "15": 0.15973,
-            "16": 0.15641,
-            "17": 0.15673,
-            "18": 0.1565,
-            "19": 0.15684,
-            "20": 0.15713,
-            "21": 0.15762,
-            "22": 0.15859,
-            "23": 0.15877,
-            "24": 0.15973,
-            "25": 0.15946,
-            "26": 0.15909,
-            "27": 0.15855,
-            "28": 0.15876,
-            "29": 0.15921,
-            "30": 0.16148,
-            "31": 0.15991,
-            "32": 0.1576,
-            "33": 0.15829,
-            "34": 0.15886,
-            "35": 0.15948,
-            "36": 0.15819,
-            "37": 0.15886,
-            "38": 0.15896,
-            "39": 0.16029,
-            "40": 0.15802,
-            "41": 0.16038,
-            "42": 0.15965,
-            "43": 0.15985,
-            "44": 0.15882,
-            "45": 0.16056,
-            "46": 0.1592,
-            "47": 0.20747,
-            "48": 0.16124,
-            "49": 0.16012,
-            "50": 0.15759,
-            "51": 0.16615,
-            "52": 0.15685,
-            "53": 0.15965,
-            "54": 0.15787,
-            "55": 0.15762,
-            "56": 0.15748,
-            "57": 0.15807,
-            "58": 0.15831,
-            "59": 0.15671,
-            "60": 0.15765,
-            "61": 0.15997,
-            "62": 0.15756,
-            "63": 0.15822,
-            "64": 0.15898,
-            "65": 0.15778,
-            "66": 0.15853,
-            "67": 0.15855,
-            "68": 0.15784,
-            "69": 0.15777,
-            "70": 0.15791,
-            "71": 0.15907,
-            "72": 0.15986,
-            "73": 0.15727,
-            "74": 0.15842,
-            "75": 0.15738,
-            "76": 0.15786,
-            "77": 0.15749,
-            "78": 0.15761,
-            "79": 0.15838,
-            "80": 0.15955,
-            "81": 0.15796,
-            "82": 0.15816,
-            "83": 0.15953,
-            "84": 0.15849,
-            "85": 0.15905,
-            "86": 0.15852,
-            "87": 0.15827,
-            "88": 0.15773,
-            "89": 0.15778,
-            "90": 0.15679,
-            "91": 0.1583,
-            "92": 0.15749,
-            "93": 0.15843,
-            "94": 0.15878,
-            "95": 0.15805,
-            "96": 0.1588,
-            "97": 0.15983,
-            "98": 0.16098,
-            "99": 0.16131,
-            "100": 0.15935
+            "1": 5.03932,
+            "2": 0.18621,
+            "3": 0.17196,
+            "4": 0.15545,
+            "5": 0.1504,
+            "6": 0.15031,
+            "7": 0.14857,
+            "8": 0.14917,
+            "9": 0.1495,
+            "10": 0.14924,
+            "11": 0.14939,
+            "12": 0.14861,
+            "13": 0.14915,
+            "14": 0.14919,
+            "15": 0.14909,
+            "16": 0.14904,
+            "17": 0.14933,
+            "18": 0.14874,
+            "19": 0.14902,
+            "20": 0.14813,
+            "21": 0.14885,
+            "22": 0.14872,
+            "23": 0.14993,
+            "24": 0.14895,
+            "25": 0.14768,
+            "26": 0.14781,
+            "27": 0.14754,
+            "28": 0.14775,
+            "29": 0.15216,
+            "30": 0.15461,
+            "31": 0.1541,
+            "32": 0.14739,
+            "33": 0.14626,
+            "34": 0.14619,
+            "35": 0.14604,
+            "36": 0.14567,
+            "37": 0.14566,
+            "38": 0.14678,
+            "39": 0.14625,
+            "40": 0.14515,
+            "41": 0.1459,
+            "42": 0.14526,
+            "43": 0.14647,
+            "44": 0.14562,
+            "45": 0.14545,
+            "46": 0.14621,
+            "47": 0.14567,
+            "48": 0.14603,
+            "49": 0.14558,
+            "50": 0.14505,
+            "51": 0.16204,
+            "52": 0.15073,
+            "53": 0.15152,
+            "54": 0.15093,
+            "55": 0.15055,
+            "56": 0.15091,
+            "57": 0.15302,
+            "58": 0.15142,
+            "59": 0.15079,
+            "60": 0.15185,
+            "61": 0.14979,
+            "62": 0.15038,
+            "63": 0.15098,
+            "64": 0.1503,
+            "65": 0.15057,
+            "66": 0.15088,
+            "67": 0.15024,
+            "68": 0.15134,
+            "69": 0.15072,
+            "70": 0.15092,
+            "71": 0.15108,
+            "72": 0.15129,
+            "73": 0.15025,
+            "74": 0.15185,
+            "75": 0.15148,
+            "76": 0.15102,
+            "77": 0.15066,
+            "78": 0.15069,
+            "79": 0.1514,
+            "80": 0.15055,
+            "81": 0.15068,
+            "82": 0.15079,
+            "83": 0.15141,
+            "84": 0.15081,
+            "85": 0.15116,
+            "86": 0.15171,
+            "87": 0.15012,
+            "88": 0.15018,
+            "89": 0.1509,
+            "90": 0.15033,
+            "91": 0.15134,
+            "92": 0.15061,
+            "93": 0.1505,
+            "94": 0.15109,
+            "95": 0.1506,
+            "96": 0.15188,
+            "97": 0.15182,
+            "98": 0.15154,
+            "99": 0.15201,
+            "100": 0.15117
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..6b3ff627828
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86793,
+            "52": 9.76274,
+            "53": 10.10895,
+            "54": 9.95538,
+            "55": 9.8756,
+            "56": 9.64751,
+            "57": 9.48989,
+            "58": 9.85502,
+            "59": 9.59457,
+            "60": 9.52968,
+            "61": 9.69589,
+            "62": 10.01676,
+            "63": 9.38778,
+            "64": 9.80211,
+            "65": 8.95119,
+            "66": 9.72857,
+            "67": 9.37577,
+            "68": 9.80463,
+            "69": 9.81,
+            "70": 9.7662,
+            "71": 9.63135,
+            "72": 9.5784,
+            "73": 9.52148,
+            "74": 8.94976,
+            "75": 9.43087,
+            "76": 9.08489,
+            "77": 10.089,
+            "78": 9.72754,
+            "79": 9.37612,
+            "80": 9.40849,
+            "81": 9.49766,
+            "82": 9.71298,
+            "83": 9.33332,
+            "84": 9.43928,
+            "85": 9.63373,
+            "86": 9.07038,
+            "87": 9.61245,
+            "88": 9.78304,
+            "89": 9.60878,
+            "90": 9.85164,
+            "91": 9.34542,
+            "92": 9.38281,
+            "93": 9.07319,
+            "94": 8.81684,
+            "95": 9.51809,
+            "96": 9.54033,
+            "97": 9.34061,
+            "98": 9.70134,
+            "99": 8.88786,
+            "100": 9.43285
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 22791326.0,
+            "52": 22749392.0,
+            "53": 22925970.0,
+            "54": 22839434.0,
+            "55": 22518416.0,
+            "56": 22877660.0,
+            "57": 23113304.0,
+            "58": 22845008.0,
+            "59": 22715512.0,
+            "60": 22743058.0,
+            "61": 22723950.0,
+            "62": 22673248.0,
+            "63": 22846074.0,
+            "64": 22823228.0,
+            "65": 23060212.0,
+            "66": 22729902.0,
+            "67": 22907278.0,
+            "68": 22610092.0,
+            "69": 22584360.0,
+            "70": 22829348.0,
+            "71": 22749420.0,
+            "72": 22655446.0,
+            "73": 22740974.0,
+            "74": 23048296.0,
+            "75": 23053922.0,
+            "76": 22901008.0,
+            "77": 22272806.0,
+            "78": 22789370.0,
+            "79": 22743288.0,
+            "80": 22706236.0,
+            "81": 22890976.0,
+            "82": 22777092.0,
+            "83": 22839240.0,
+            "84": 23010352.0,
+            "85": 22712004.0,
+            "86": 23103740.0,
+            "87": 22734788.0,
+            "88": 22637620.0,
+            "89": 22499200.0,
+            "90": 22972420.0,
+            "91": 22766428.0,
+            "92": 22808890.0,
+            "93": 22659888.0,
+            "94": 22910970.0,
+            "95": 23048514.0,
+            "96": 22829470.0,
+            "97": 22608826.0,
+            "98": 22763528.0,
+            "99": 22905754.0,
+            "100": 23016268.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 717083136.0,
+            "52": 717083136.0,
+            "53": 717083136.0,
+            "54": 717083136.0,
+            "55": 717083136.0,
+            "56": 717083136.0,
+            "57": 717083136.0,
+            "58": 717083136.0,
+            "59": 717083136.0,
+            "60": 717083136.0,
+            "61": 717083136.0,
+            "62": 717083136.0,
+            "63": 717083136.0,
+            "64": 717083136.0,
+            "65": 717083136.0,
+            "66": 717083136.0,
+            "67": 717083136.0,
+            "68": 717083136.0,
+            "69": 717083136.0,
+            "70": 717083136.0,
+            "71": 717083136.0,
+            "72": 717083136.0,
+            "73": 717083136.0,
+            "74": 717083136.0,
+            "75": 717083136.0,
+            "76": 717083136.0,
+            "77": 717083136.0,
+            "78": 717083136.0,
+            "79": 717083136.0,
+            "80": 717083136.0,
+            "81": 717083136.0,
+            "82": 717083136.0,
+            "83": 717083136.0,
+            "84": 717083136.0,
+            "85": 717083136.0,
+            "86": 717083136.0,
+            "87": 717083136.0,
+            "88": 717083136.0,
+            "89": 717083136.0,
+            "90": 717083136.0,
+            "91": 717083136.0,
+            "92": 717083136.0,
+            "93": 717083136.0,
+            "94": 717083136.0,
+            "95": 717083136.0,
+            "96": 717083136.0,
+            "97": 717083136.0,
+            "98": 717083136.0,
+            "99": 717083136.0,
+            "100": 717083136.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2194370560.0,
+            "52": 2194371584.0,
+            "53": 2194371584.0,
+            "54": 2194371584.0,
+            "55": 2194371584.0,
+            "56": 2194371584.0,
+            "57": 2194371584.0,
+            "58": 2194371584.0,
+            "59": 2194371584.0,
+            "60": 2194371584.0,
+            "61": 2194371584.0,
+            "62": 2194371584.0,
+            "63": 2194371584.0,
+            "64": 2194371584.0,
+            "65": 2194371584.0,
+            "66": 2194371584.0,
+            "67": 2194371584.0,
+            "68": 2194371584.0,
+            "69": 2194371584.0,
+            "70": 2194371584.0,
+            "71": 2194371584.0,
+            "72": 2194371584.0,
+            "73": 2194371584.0,
+            "74": 2194371584.0,
+            "75": 2194371584.0,
+            "76": 2194371584.0,
+            "77": 2194371584.0,
+            "78": 2194371584.0,
+            "79": 2194371584.0,
+            "80": 2194371584.0,
+            "81": 2194371584.0,
+            "82": 2194371584.0,
+            "83": 2194371584.0,
+            "84": 2194371584.0,
+            "85": 2194371584.0,
+            "86": 2194371584.0,
+            "87": 2194371584.0,
+            "88": 2194371584.0,
+            "89": 2194371584.0,
+            "90": 2194371584.0,
+            "91": 2194371584.0,
+            "92": 2194371584.0,
+            "93": 2194371584.0,
+            "94": 2194371584.0,
+            "95": 2194371584.0,
+            "96": 2194371584.0,
+            "97": 2194371584.0,
+            "98": 2194371584.0,
+            "99": 2194371584.0,
+            "100": 2194371584.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.88691,
+            "52": 0.18475,
+            "53": 0.15645,
+            "54": 0.15149,
+            "55": 0.15178,
+            "56": 0.15436,
+            "57": 0.15089,
+            "58": 0.15055,
+            "59": 0.15075,
+            "60": 0.1517,
+            "61": 0.15028,
+            "62": 0.14804,
+            "63": 0.14921,
+            "64": 0.15,
+            "65": 0.14973,
+            "66": 0.15168,
+            "67": 0.15493,
+            "68": 0.15271,
+            "69": 0.15341,
+            "70": 0.15423,
+            "71": 0.15432,
+            "72": 0.15491,
+            "73": 0.1552,
+            "74": 0.15454,
+            "75": 0.15427,
+            "76": 0.15393,
+            "77": 0.15383,
+            "78": 0.15459,
+            "79": 0.15484,
+            "80": 0.1534,
+            "81": 0.15504,
+            "82": 0.15286,
+            "83": 0.15444,
+            "84": 0.15427,
+            "85": 0.15522,
+            "86": 0.15438,
+            "87": 0.15378,
+            "88": 0.15395,
+            "89": 0.15338,
+            "90": 0.1542,
+            "91": 0.15415,
+            "92": 0.15382,
+            "93": 0.15529,
+            "94": 0.15411,
+            "95": 0.15301,
+            "96": 0.15392,
+            "97": 0.15398,
+            "98": 0.15485,
+            "99": 0.15384,
+            "100": 0.15373
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml
index a3a1a458739..ea6f2520553 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..2610b7fe2f4
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82558,
+            "2": 10.83322,
+            "3": 10.82737,
+            "4": 10.79588,
+            "5": 10.85708,
+            "6": 10.86392,
+            "7": 10.8269,
+            "8": 10.82589,
+            "9": 10.83705,
+            "10": 10.79716,
+            "11": 10.87851,
+            "12": 10.85794,
+            "13": 10.8537,
+            "14": 10.87547,
+            "15": 10.79179,
+            "16": 10.80303,
+            "17": 10.7745,
+            "18": 10.804,
+            "19": 10.79363,
+            "20": 10.69591,
+            "21": 10.68551,
+            "22": 10.53149,
+            "23": 10.70658,
+            "24": 10.57317,
+            "25": 10.51546,
+            "26": 10.59072,
+            "27": 10.60736,
+            "28": 10.57024,
+            "29": 10.58904,
+            "30": 10.34679,
+            "31": 10.07734,
+            "32": 10.46319,
+            "33": 10.45704,
+            "34": 10.19923,
+            "35": 10.25593,
+            "36": 10.21246,
+            "37": 10.34688,
+            "38": 10.18009,
+            "39": 10.408,
+            "40": 10.07603,
+            "41": 10.12932,
+            "42": 10.21134,
+            "43": 9.81692,
+            "44": 9.94028,
+            "45": 9.81699,
+            "46": 9.80606,
+            "47": 10.12475,
+            "48": 9.8405,
+            "49": 9.50971,
+            "50": 9.88934
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1691.0,
+            "2": 1553.0,
+            "3": 1673.0,
+            "4": 1760.0,
+            "5": 1852.0,
+            "6": 1861.0,
+            "7": 1907.0,
+            "8": 1712.0,
+            "9": 1919.0,
+            "10": 1427.0,
+            "11": 1965.0,
+            "12": 1742.0,
+            "13": 1946.0,
+            "14": 1903.0,
+            "15": 1851.0,
+            "16": 1804.0,
+            "17": 1778.0,
+            "18": 1702.0,
+            "19": 1703.0,
+            "20": 1706.0,
+            "21": 1916.0,
+            "22": 1698.0,
+            "23": 2009.0,
+            "24": 1606.0,
+            "25": 1625.0,
+            "26": 1722.0,
+            "27": 1784.0,
+            "28": 1981.0,
+            "29": 1919.0,
+            "30": 1948.0,
+            "31": 1503.0,
+            "32": 1904.0,
+            "33": 2058.0,
+            "34": 1737.0,
+            "35": 1916.0,
+            "36": 1980.0,
+            "37": 2263.0,
+            "38": 2121.0,
+            "39": 2277.0,
+            "40": 2021.0,
+            "41": 2202.0,
+            "42": 2340.0,
+            "43": 1973.0,
+            "44": 2006.0,
+            "45": 2128.0,
+            "46": 2132.0,
+            "47": 2438.0,
+            "48": 2286.0,
+            "49": 2215.0,
+            "50": 2337.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3866814976.0,
+            "2": 4148526592.0,
+            "3": 4148526592.0,
+            "4": 4148526592.0,
+            "5": 4148526592.0,
+            "6": 4148526592.0,
+            "7": 4148526592.0,
+            "8": 4148526592.0,
+            "9": 4148526592.0,
+            "10": 4148526592.0,
+            "11": 4148526592.0,
+            "12": 4148526592.0,
+            "13": 4148526592.0,
+            "14": 4148526592.0,
+            "15": 4148526592.0,
+            "16": 4148526592.0,
+            "17": 4148526592.0,
+            "18": 4148526592.0,
+            "19": 4148526592.0,
+            "20": 4148526592.0,
+            "21": 4148526592.0,
+            "22": 4148526592.0,
+            "23": 4148526592.0,
+            "24": 4148526592.0,
+            "25": 4148526592.0,
+            "26": 4148526592.0,
+            "27": 4148526592.0,
+            "28": 4148526592.0,
+            "29": 4148526592.0,
+            "30": 4148526592.0,
+            "31": 4148526592.0,
+            "32": 4148526592.0,
+            "33": 4148526592.0,
+            "34": 4148526592.0,
+            "35": 4148526592.0,
+            "36": 4148526592.0,
+            "37": 4148526592.0,
+            "38": 4148526592.0,
+            "39": 4148526592.0,
+            "40": 4148526592.0,
+            "41": 4148526592.0,
+            "42": 4148526592.0,
+            "43": 4148526592.0,
+            "44": 4148526592.0,
+            "45": 4148526592.0,
+            "46": 4148526592.0,
+            "47": 4148526592.0,
+            "48": 4148526592.0,
+            "49": 4148526592.0,
+            "50": 4148526592.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.94258,
+            "3": 0.12978,
+            "4": 0.11688,
+            "5": 0.11937,
+            "6": 0.12093,
+            "7": 0.12307,
+            "8": 0.13062,
+            "9": 0.12926,
+            "10": 0.1228,
+            "11": 0.12859,
+            "12": 0.12404,
+            "13": 0.12912,
+            "14": 0.12318,
+            "15": 0.12609,
+            "16": 0.13327,
+            "17": 0.12859,
+            "18": 0.12957,
+            "19": 0.12658,
+            "20": 0.12929,
+            "21": 0.12937,
+            "22": 0.1298,
+            "23": 0.12888,
+            "24": 0.12917,
+            "25": 0.1285,
+            "26": 0.12864,
+            "27": 0.13061,
+            "28": 0.1272,
+            "29": 0.12953,
+            "30": 0.12693,
+            "31": 0.13141,
+            "32": 0.12786,
+            "33": 0.12815,
+            "34": 0.12937,
+            "35": 0.12957,
+            "36": 0.12737,
+            "37": 0.1313,
+            "38": 0.12977,
+            "39": 0.12805,
+            "40": 0.1298,
+            "41": 0.1296,
+            "42": 0.13074,
+            "43": 0.12955,
+            "44": 0.13171,
+            "45": 0.13055,
+            "46": 0.13271,
+            "47": 0.13004,
+            "48": 0.12873,
+            "49": 0.13129,
+            "50": 0.12858
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json
index b5d55ac433c..1c87eb73023 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759681536.0,
-            "2": 759681536.0,
-            "3": 759681536.0,
-            "4": 759681536.0,
-            "5": 759681536.0,
-            "6": 759681536.0,
-            "7": 759681536.0,
-            "8": 759681536.0,
-            "9": 759681536.0,
-            "10": 759681536.0,
-            "11": 759681536.0,
-            "12": 759681536.0,
-            "13": 759681536.0,
-            "14": 759681536.0,
-            "15": 759681536.0,
-            "16": 759681536.0,
-            "17": 759681536.0,
-            "18": 759681536.0,
-            "19": 759681536.0,
-            "20": 759681536.0,
-            "21": 759681536.0,
-            "22": 759681536.0,
-            "23": 759681536.0,
-            "24": 759681536.0,
-            "25": 759681536.0,
-            "26": 759681536.0,
-            "27": 759681536.0,
-            "28": 759681536.0,
-            "29": 759681536.0,
-            "30": 759681536.0,
-            "31": 759681536.0,
-            "32": 759681536.0,
-            "33": 759681536.0,
-            "34": 759681536.0,
-            "35": 759681536.0,
-            "36": 759681536.0,
-            "37": 759681536.0,
-            "38": 759681536.0,
-            "39": 759681536.0,
-            "40": 759681536.0,
-            "41": 759681536.0,
-            "42": 759681536.0,
-            "43": 759681536.0,
-            "44": 759681536.0,
-            "45": 759681536.0,
-            "46": 759681536.0,
-            "47": 759681536.0,
-            "48": 759681536.0,
-            "49": 759681536.0,
-            "50": 759681536.0
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3866813952.0,
-            "2": 4148525568.0,
-            "3": 4148525568.0,
-            "4": 4148525568.0,
-            "5": 4148525568.0,
-            "6": 4148525568.0,
-            "7": 4148525568.0,
-            "8": 4148525568.0,
-            "9": 4148525568.0,
-            "10": 4148525568.0,
-            "11": 4148525568.0,
-            "12": 4148525568.0,
-            "13": 4148525568.0,
-            "14": 4148525568.0,
-            "15": 4148525568.0,
-            "16": 4148525568.0,
-            "17": 4148525568.0,
-            "18": 4148525568.0,
-            "19": 4148525568.0,
-            "20": 4148525568.0,
-            "21": 4148525568.0,
-            "22": 4148525568.0,
-            "23": 4148525568.0,
-            "24": 4148525568.0,
-            "25": 4148525568.0,
-            "26": 4148525568.0,
-            "27": 4148525568.0,
-            "28": 4148525568.0,
-            "29": 4148525568.0,
-            "30": 4148525568.0,
-            "31": 4148525568.0,
-            "32": 4148525568.0,
-            "33": 4148525568.0,
-            "34": 4148525568.0,
-            "35": 4148525568.0,
-            "36": 4148525568.0,
-            "37": 4148525568.0,
-            "38": 4148525568.0,
-            "39": 4148525568.0,
-            "40": 4148525568.0,
-            "41": 4148525568.0,
-            "42": 4148525568.0,
-            "43": 4148525568.0,
-            "44": 4148525568.0,
-            "45": 4148525568.0,
-            "46": 4148525568.0,
-            "47": 4148525568.0,
-            "48": 4148525568.0,
-            "49": 4148525568.0,
-            "50": 4148525568.0
+            "1": 3866814976.0,
+            "2": 4148526592.0,
+            "3": 4148526592.0,
+            "4": 4148526592.0,
+            "5": 4148526592.0,
+            "6": 4148526592.0,
+            "7": 4148526592.0,
+            "8": 4148526592.0,
+            "9": 4148526592.0,
+            "10": 4148526592.0,
+            "11": 4148526592.0,
+            "12": 4148526592.0,
+            "13": 4148526592.0,
+            "14": 4148526592.0,
+            "15": 4148526592.0,
+            "16": 4148526592.0,
+            "17": 4148526592.0,
+            "18": 4148526592.0,
+            "19": 4148526592.0,
+            "20": 4148526592.0,
+            "21": 4148526592.0,
+            "22": 4148526592.0,
+            "23": 4148526592.0,
+            "24": 4148526592.0,
+            "25": 4148526592.0,
+            "26": 4148526592.0,
+            "27": 4148526592.0,
+            "28": 4148526592.0,
+            "29": 4148526592.0,
+            "30": 4148526592.0,
+            "31": 4148526592.0,
+            "32": 4148526592.0,
+            "33": 4148526592.0,
+            "34": 4148526592.0,
+            "35": 4148526592.0,
+            "36": 4148526592.0,
+            "37": 4148526592.0,
+            "38": 4148526592.0,
+            "39": 4148526592.0,
+            "40": 4148526592.0,
+            "41": 4148526592.0,
+            "42": 4148526592.0,
+            "43": 4148526592.0,
+            "44": 4148526592.0,
+            "45": 4148526592.0,
+            "46": 4148526592.0,
+            "47": 4148526592.0,
+            "48": 4148526592.0,
+            "49": 4148526592.0,
+            "50": 4148526592.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 12.80183,
-            "2": 0.14507,
-            "3": 0.13423,
-            "4": 0.12539,
-            "5": 0.12233,
-            "6": 0.12325,
-            "7": 0.12437,
-            "8": 0.12453,
-            "9": 0.12348,
-            "10": 0.12305,
-            "11": 0.12491,
-            "12": 0.12346,
-            "13": 0.1234,
-            "14": 0.12145,
-            "15": 0.12227,
-            "16": 0.12254,
-            "17": 0.12422,
-            "18": 0.12237,
-            "19": 0.12342,
-            "20": 0.1219,
-            "21": 0.1212,
-            "22": 0.12243,
-            "23": 0.11962,
-            "24": 0.1224,
-            "25": 0.12155,
-            "26": 0.12253,
-            "27": 0.12095,
-            "28": 0.12035,
-            "29": 0.12115,
-            "30": 0.11898,
-            "31": 0.12063,
-            "32": 0.1189,
-            "33": 0.12106,
-            "34": 0.11766,
-            "35": 0.11962,
-            "36": 0.12112,
-            "37": 0.11847,
-            "38": 0.11727,
-            "39": 0.11905,
-            "40": 0.11887,
-            "41": 0.11948,
-            "42": 0.11832,
-            "43": 0.11858,
-            "44": 0.1186,
-            "45": 0.12057,
-            "46": 0.1186,
-            "47": 0.12097,
-            "48": 0.11934,
-            "49": 0.11972,
-            "50": 0.12006
+            "1": 9.85525,
+            "2": 0.11909,
+            "3": 0.10687,
+            "4": 0.08766,
+            "5": 0.08696,
+            "6": 0.08852,
+            "7": 0.08705,
+            "8": 0.0866,
+            "9": 0.08968,
+            "10": 0.09051,
+            "11": 0.08988,
+            "12": 0.08985,
+            "13": 0.09145,
+            "14": 0.09034,
+            "15": 0.09081,
+            "16": 0.09029,
+            "17": 0.09013,
+            "18": 0.09023,
+            "19": 0.09004,
+            "20": 0.09017,
+            "21": 0.08987,
+            "22": 0.09048,
+            "23": 0.09047,
+            "24": 0.08991,
+            "25": 0.09343,
+            "26": 0.0901,
+            "27": 0.08989,
+            "28": 0.09443,
+            "29": 0.09097,
+            "30": 0.09106,
+            "31": 0.0927,
+            "32": 0.08602,
+            "33": 0.08691,
+            "34": 0.08755,
+            "35": 0.08733,
+            "36": 0.08692,
+            "37": 0.08659,
+            "38": 0.08868,
+            "39": 0.08692,
+            "40": 0.08731,
+            "41": 0.08817,
+            "42": 0.08696,
+            "43": 0.08838,
+            "44": 0.08859,
+            "45": 0.08767,
+            "46": 0.0873,
+            "47": 0.08882,
+            "48": 0.08631,
+            "49": 0.08619,
+            "50": 0.0861
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json
index 4bf73c8b005..a98babc2900 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.25479,
-            "2": 0.18004,
-            "3": 0.15444,
-            "4": 0.15284,
-            "5": 0.15391,
-            "6": 0.14333,
-            "7": 0.14244,
-            "8": 0.13997,
-            "9": 0.14112,
-            "10": 0.13863,
-            "11": 0.13707,
-            "12": 0.13575,
-            "13": 0.13558,
-            "14": 0.13535,
-            "15": 0.13556,
-            "16": 0.13648,
-            "17": 0.13495,
-            "18": 0.1343,
-            "19": 0.13442,
-            "20": 0.13441,
-            "21": 0.1344,
-            "22": 0.13478,
-            "23": 0.13473,
-            "24": 0.13476,
-            "25": 0.13536,
-            "26": 0.13345,
-            "27": 0.1342,
-            "28": 0.13421,
-            "29": 0.13479,
-            "30": 0.13378,
-            "31": 0.13418,
-            "32": 0.13411,
-            "33": 0.13351,
-            "34": 0.13374,
-            "35": 0.13406,
-            "36": 0.13396,
-            "37": 0.13435,
-            "38": 0.13356,
-            "39": 0.13367,
-            "40": 0.13361,
-            "41": 0.13454,
-            "42": 0.13463,
-            "43": 0.13524,
-            "44": 0.13356,
-            "45": 0.13403,
-            "46": 0.1347,
-            "47": 0.13379,
-            "48": 0.1343,
-            "49": 0.13391,
-            "50": 0.13371
+            "1": 5.08022,
+            "2": 0.18501,
+            "3": 0.16189,
+            "4": 0.1446,
+            "5": 0.14506,
+            "6": 0.1419,
+            "7": 0.14224,
+            "8": 0.14228,
+            "9": 0.14173,
+            "10": 0.14459,
+            "11": 0.14301,
+            "12": 0.14363,
+            "13": 0.14381,
+            "14": 0.143,
+            "15": 0.14252,
+            "16": 0.14227,
+            "17": 0.14143,
+            "18": 0.1425,
+            "19": 0.14097,
+            "20": 0.14109,
+            "21": 0.1415,
+            "22": 0.14165,
+            "23": 0.142,
+            "24": 0.14241,
+            "25": 0.1412,
+            "26": 0.14126,
+            "27": 0.14207,
+            "28": 0.14045,
+            "29": 0.14206,
+            "30": 0.14192,
+            "31": 0.14255,
+            "32": 0.14132,
+            "33": 0.14178,
+            "34": 0.14151,
+            "35": 0.14117,
+            "36": 0.14088,
+            "37": 0.14137,
+            "38": 0.14111,
+            "39": 0.13997,
+            "40": 0.14118,
+            "41": 0.14179,
+            "42": 0.14063,
+            "43": 0.14381,
+            "44": 0.14122,
+            "45": 0.14142,
+            "46": 0.14112,
+            "47": 0.14094,
+            "48": 0.14134,
+            "49": 0.14094,
+            "50": 0.14002
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml
index 767283cf2a1..fff31764409 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml
@@ -46,7 +46,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..10988c85257
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82558,
+            "2": 10.83322,
+            "3": 10.82737,
+            "4": 10.79588,
+            "5": 10.85708,
+            "6": 10.86392,
+            "7": 10.8269,
+            "8": 10.82588,
+            "9": 10.83699,
+            "10": 10.79719,
+            "11": 10.87851,
+            "12": 10.85797,
+            "13": 10.85368,
+            "14": 10.87548,
+            "15": 10.79177,
+            "16": 10.80301,
+            "17": 10.7745,
+            "18": 10.80399,
+            "19": 10.79365,
+            "20": 10.69588,
+            "21": 10.6855,
+            "22": 10.53152,
+            "23": 10.70658,
+            "24": 10.57319,
+            "25": 10.51545,
+            "26": 10.59076,
+            "27": 10.60738,
+            "28": 10.57025,
+            "29": 10.58904,
+            "30": 10.34674,
+            "31": 10.07736,
+            "32": 10.46317,
+            "33": 10.45705,
+            "34": 10.19923,
+            "35": 10.25593,
+            "36": 10.21246,
+            "37": 10.34689,
+            "38": 10.18008,
+            "39": 10.40796,
+            "40": 10.07602,
+            "41": 10.12935,
+            "42": 10.21132,
+            "43": 9.81692,
+            "44": 9.94027,
+            "45": 9.817,
+            "46": 9.80608,
+            "47": 10.12473,
+            "48": 9.84047,
+            "49": 9.50975,
+            "50": 9.88932
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1691.0,
+            "2": 1553.0,
+            "3": 1673.0,
+            "4": 1760.0,
+            "5": 1852.0,
+            "6": 1861.0,
+            "7": 1852.0,
+            "8": 1755.0,
+            "9": 1952.0,
+            "10": 1427.0,
+            "11": 1857.0,
+            "12": 1820.0,
+            "13": 1948.0,
+            "14": 1828.0,
+            "15": 1913.0,
+            "16": 1881.0,
+            "17": 1770.0,
+            "18": 1683.0,
+            "19": 1784.0,
+            "20": 1714.0,
+            "21": 1969.0,
+            "22": 1701.0,
+            "23": 1972.0,
+            "24": 1545.0,
+            "25": 1537.0,
+            "26": 1650.0,
+            "27": 1770.0,
+            "28": 1889.0,
+            "29": 1946.0,
+            "30": 2031.0,
+            "31": 1511.0,
+            "32": 1848.0,
+            "33": 2009.0,
+            "34": 1749.0,
+            "35": 1978.0,
+            "36": 1926.0,
+            "37": 2358.0,
+            "38": 2036.0,
+            "39": 2202.0,
+            "40": 2015.0,
+            "41": 2184.0,
+            "42": 2304.0,
+            "43": 2079.0,
+            "44": 2042.0,
+            "45": 2082.0,
+            "46": 2206.0,
+            "47": 2417.0,
+            "48": 2284.0,
+            "49": 2231.0,
+            "50": 2430.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 552193536.0,
+            "2": 552193536.0,
+            "3": 553242112.0,
+            "4": 553242112.0,
+            "5": 552193536.0,
+            "6": 553242112.0,
+            "7": 553242112.0,
+            "8": 553242112.0,
+            "9": 553242112.0,
+            "10": 553242112.0,
+            "11": 553242112.0,
+            "12": 552193536.0,
+            "13": 552193536.0,
+            "14": 552193536.0,
+            "15": 552193536.0,
+            "16": 553242112.0,
+            "17": 553242112.0,
+            "18": 552193536.0,
+            "19": 553242112.0,
+            "20": 553242112.0,
+            "21": 553242112.0,
+            "22": 552193536.0,
+            "23": 553242112.0,
+            "24": 553242112.0,
+            "25": 553242112.0,
+            "26": 553242112.0,
+            "27": 553242112.0,
+            "28": 553242112.0,
+            "29": 553242112.0,
+            "30": 553242112.0,
+            "31": 552193536.0,
+            "32": 552193536.0,
+            "33": 553242112.0,
+            "34": 553242112.0,
+            "35": 552193536.0,
+            "36": 553242112.0,
+            "37": 552193536.0,
+            "38": 552193536.0,
+            "39": 552193536.0,
+            "40": 552193536.0,
+            "41": 552193536.0,
+            "42": 552193536.0,
+            "43": 552193536.0,
+            "44": 552193536.0,
+            "45": 552193536.0,
+            "46": 552193536.0,
+            "47": 552193536.0,
+            "48": 552193536.0,
+            "49": 552193536.0,
+            "50": 553242112.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3798208000.0,
+            "2": 3942086144.0,
+            "3": 3942086144.0,
+            "4": 3942086144.0,
+            "5": 3942086144.0,
+            "6": 3942086144.0,
+            "7": 3942086144.0,
+            "8": 3942086144.0,
+            "9": 3942086144.0,
+            "10": 3942086144.0,
+            "11": 3942086144.0,
+            "12": 3942086144.0,
+            "13": 3942086144.0,
+            "14": 3942086144.0,
+            "15": 3942086144.0,
+            "16": 3942086144.0,
+            "17": 3942086144.0,
+            "18": 3942086144.0,
+            "19": 3942086144.0,
+            "20": 3942086144.0,
+            "21": 3942086144.0,
+            "22": 3942086144.0,
+            "23": 3942086144.0,
+            "24": 3942086144.0,
+            "25": 3942086144.0,
+            "26": 3942086144.0,
+            "27": 3942086144.0,
+            "28": 3942086144.0,
+            "29": 3942086144.0,
+            "30": 3942086144.0,
+            "31": 3942086144.0,
+            "32": 3942086144.0,
+            "33": 3942086144.0,
+            "34": 3942086144.0,
+            "35": 3942086144.0,
+            "36": 3942086144.0,
+            "37": 3942086144.0,
+            "38": 3942086144.0,
+            "39": 3942086144.0,
+            "40": 3942086144.0,
+            "41": 3942086144.0,
+            "42": 3942086144.0,
+            "43": 3942086144.0,
+            "44": 3942086144.0,
+            "45": 3942086144.0,
+            "46": 3942086144.0,
+            "47": 3942086144.0,
+            "48": 3942086144.0,
+            "49": 3942086144.0,
+            "50": 3942086144.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.84171,
+            "3": 0.13294,
+            "4": 0.11994,
+            "5": 0.11682,
+            "6": 0.11799,
+            "7": 0.12021,
+            "8": 0.11949,
+            "9": 0.1195,
+            "10": 0.12086,
+            "11": 0.21563,
+            "12": 0.12013,
+            "13": 0.1204,
+            "14": 0.1188,
+            "15": 0.1192,
+            "16": 0.11917,
+            "17": 0.11999,
+            "18": 0.12006,
+            "19": 0.11965,
+            "20": 0.12016,
+            "21": 0.21525,
+            "22": 0.11978,
+            "23": 0.12009,
+            "24": 0.12004,
+            "25": 0.12129,
+            "26": 0.12041,
+            "27": 0.12075,
+            "28": 0.12015,
+            "29": 0.1204,
+            "30": 0.12048,
+            "31": 0.21709,
+            "32": 0.12108,
+            "33": 0.11972,
+            "34": 0.12,
+            "35": 0.11969,
+            "36": 0.11944,
+            "37": 0.11946,
+            "38": 0.12056,
+            "39": 0.12045,
+            "40": 0.12052,
+            "41": 0.21777,
+            "42": 0.12063,
+            "43": 0.12165,
+            "44": 0.1204,
+            "45": 0.12036,
+            "46": 0.12154,
+            "47": 0.12043,
+            "48": 0.12145,
+            "49": 0.12079,
+            "50": 0.12035
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json
index 5e069163f6c..ea2f72181ea 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 552054272.0,
-            "2": 552054272.0,
-            "3": 552054272.0,
-            "4": 552054272.0,
-            "5": 552054272.0,
-            "6": 552054272.0,
-            "7": 552054272.0,
-            "8": 552054272.0,
-            "9": 552054272.0,
-            "10": 552054272.0,
-            "11": 552054272.0,
-            "12": 552054272.0,
-            "13": 552054272.0,
-            "14": 552054272.0,
-            "15": 552054272.0,
-            "16": 552054272.0,
-            "17": 552054272.0,
-            "18": 552054272.0,
-            "19": 552054272.0,
-            "20": 552054272.0,
-            "21": 552054272.0,
-            "22": 552054272.0,
-            "23": 552054272.0,
-            "24": 552054272.0,
-            "25": 552054272.0,
-            "26": 552054272.0,
-            "27": 552054272.0,
-            "28": 552054272.0,
-            "29": 552054272.0,
-            "30": 552054272.0,
-            "31": 552054272.0,
-            "32": 552054272.0,
-            "33": 552054272.0,
-            "34": 552054272.0,
-            "35": 552054272.0,
-            "36": 552054272.0,
-            "37": 552054272.0,
-            "38": 552054272.0,
-            "39": 552054272.0,
-            "40": 552054272.0,
-            "41": 552054272.0,
-            "42": 552054272.0,
-            "43": 552054272.0,
-            "44": 552054272.0,
-            "45": 552054272.0,
-            "46": 552054272.0,
-            "47": 552054272.0,
-            "48": 552054272.0,
-            "49": 552054272.0,
-            "50": 552054272.0
+            "1": 553245184.0,
+            "2": 553245184.0,
+            "3": 553245184.0,
+            "4": 553245184.0,
+            "5": 553245184.0,
+            "6": 553245184.0,
+            "7": 553245184.0,
+            "8": 553245184.0,
+            "9": 553245184.0,
+            "10": 553245184.0,
+            "11": 553245184.0,
+            "12": 553245184.0,
+            "13": 553245184.0,
+            "14": 553245184.0,
+            "15": 553245184.0,
+            "16": 553245184.0,
+            "17": 553245184.0,
+            "18": 553245184.0,
+            "19": 553245184.0,
+            "20": 553245184.0,
+            "21": 553245184.0,
+            "22": 553245184.0,
+            "23": 553245184.0,
+            "24": 553245184.0,
+            "25": 553245184.0,
+            "26": 553245184.0,
+            "27": 553245184.0,
+            "28": 553245184.0,
+            "29": 553245184.0,
+            "30": 553245184.0,
+            "31": 553245184.0,
+            "32": 553245184.0,
+            "33": 553245184.0,
+            "34": 553245184.0,
+            "35": 553245184.0,
+            "36": 553245184.0,
+            "37": 553245184.0,
+            "38": 553245184.0,
+            "39": 553245184.0,
+            "40": 553245184.0,
+            "41": 553245184.0,
+            "42": 553245184.0,
+            "43": 553245184.0,
+            "44": 553245184.0,
+            "45": 553245184.0,
+            "46": 553245184.0,
+            "47": 553245184.0,
+            "48": 553245184.0,
+            "49": 553245184.0,
+            "50": 553245184.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3798206976.0,
-            "2": 3940899328.0,
-            "3": 3940899328.0,
-            "4": 3940899328.0,
-            "5": 3940899328.0,
-            "6": 3940899328.0,
-            "7": 3940899328.0,
-            "8": 3940899328.0,
-            "9": 3940899328.0,
-            "10": 3940899328.0,
-            "11": 3940899328.0,
-            "12": 3940899328.0,
-            "13": 3940899328.0,
-            "14": 3940899328.0,
-            "15": 3940899328.0,
-            "16": 3940899328.0,
-            "17": 3940899328.0,
-            "18": 3940899328.0,
-            "19": 3940899328.0,
-            "20": 3940899328.0,
-            "21": 3940899328.0,
-            "22": 3940899328.0,
-            "23": 3940899328.0,
-            "24": 3940899328.0,
-            "25": 3940899328.0,
-            "26": 3940899328.0,
-            "27": 3940899328.0,
-            "28": 3940899328.0,
-            "29": 3940899328.0,
-            "30": 3940899328.0,
-            "31": 3940899328.0,
-            "32": 3940899328.0,
-            "33": 3940899328.0,
-            "34": 3940899328.0,
-            "35": 3940899328.0,
-            "36": 3940899328.0,
-            "37": 3940899328.0,
-            "38": 3940899328.0,
-            "39": 3940899328.0,
-            "40": 3940899328.0,
-            "41": 3940899328.0,
-            "42": 3940899328.0,
-            "43": 3940899328.0,
-            "44": 3940899328.0,
-            "45": 3940899328.0,
-            "46": 3940899328.0,
-            "47": 3940899328.0,
-            "48": 3940899328.0,
-            "49": 3940899328.0,
-            "50": 3940899328.0
+            "1": 3798208000.0,
+            "2": 3943137792.0,
+            "3": 3943137792.0,
+            "4": 3943137792.0,
+            "5": 3943137792.0,
+            "6": 3943137792.0,
+            "7": 3943137792.0,
+            "8": 3943137792.0,
+            "9": 3943137792.0,
+            "10": 3943137792.0,
+            "11": 3943137792.0,
+            "12": 3943137792.0,
+            "13": 3943137792.0,
+            "14": 3943137792.0,
+            "15": 3943137792.0,
+            "16": 3943137792.0,
+            "17": 3943137792.0,
+            "18": 3943137792.0,
+            "19": 3943137792.0,
+            "20": 3943137792.0,
+            "21": 3943137792.0,
+            "22": 3943137792.0,
+            "23": 3943137792.0,
+            "24": 3943137792.0,
+            "25": 3943137792.0,
+            "26": 3943137792.0,
+            "27": 3943137792.0,
+            "28": 3943137792.0,
+            "29": 3943137792.0,
+            "30": 3943137792.0,
+            "31": 3943137792.0,
+            "32": 3943137792.0,
+            "33": 3943137792.0,
+            "34": 3943137792.0,
+            "35": 3943137792.0,
+            "36": 3943137792.0,
+            "37": 3943137792.0,
+            "38": 3943137792.0,
+            "39": 3943137792.0,
+            "40": 3943137792.0,
+            "41": 3943137792.0,
+            "42": 3943137792.0,
+            "43": 3943137792.0,
+            "44": 3943137792.0,
+            "45": 3943137792.0,
+            "46": 3943137792.0,
+            "47": 3943137792.0,
+            "48": 3943137792.0,
+            "49": 3943137792.0,
+            "50": 3943137792.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 15.65845,
-            "2": 0.14332,
-            "3": 0.12833,
-            "4": 0.12525,
-            "5": 0.12451,
-            "6": 0.12488,
-            "7": 0.12455,
-            "8": 0.12623,
-            "9": 0.1249,
-            "10": 0.127,
-            "11": 0.29256,
-            "12": 0.12446,
-            "13": 0.12388,
-            "14": 0.12448,
-            "15": 0.12475,
-            "16": 0.12507,
-            "17": 0.12682,
-            "18": 0.12473,
-            "19": 0.12569,
-            "20": 0.12441,
-            "21": 0.28384,
-            "22": 0.12554,
-            "23": 0.12552,
-            "24": 0.12663,
-            "25": 0.12441,
-            "26": 0.12547,
-            "27": 0.12485,
-            "28": 0.12492,
-            "29": 0.12419,
-            "30": 0.12518,
-            "31": 0.28416,
-            "32": 0.12399,
-            "33": 0.12692,
-            "34": 0.12606,
-            "35": 0.12537,
-            "36": 0.12614,
-            "37": 0.12484,
-            "38": 0.12464,
-            "39": 0.12396,
-            "40": 0.1239,
-            "41": 0.28831,
-            "42": 0.12609,
-            "43": 0.12537,
-            "44": 0.12484,
-            "45": 0.12567,
-            "46": 0.12791,
-            "47": 0.12281,
-            "48": 0.124,
-            "49": 0.12486,
-            "50": 0.12585
+            "1": 13.97343,
+            "2": 0.13214,
+            "3": 0.11635,
+            "4": 0.09459,
+            "5": 0.0948,
+            "6": 0.09321,
+            "7": 0.09394,
+            "8": 0.09525,
+            "9": 0.09364,
+            "10": 0.09321,
+            "11": 0.22069,
+            "12": 0.09263,
+            "13": 0.09317,
+            "14": 0.09315,
+            "15": 0.09254,
+            "16": 0.09554,
+            "17": 0.09332,
+            "18": 0.09352,
+            "19": 0.09438,
+            "20": 0.09298,
+            "21": 0.22042,
+            "22": 0.09282,
+            "23": 0.09311,
+            "24": 0.09404,
+            "25": 0.09412,
+            "26": 0.09311,
+            "27": 0.09293,
+            "28": 0.09243,
+            "29": 0.09294,
+            "30": 0.09541,
+            "31": 0.22042,
+            "32": 0.09422,
+            "33": 0.09281,
+            "34": 0.09264,
+            "35": 0.09337,
+            "36": 0.09247,
+            "37": 0.09252,
+            "38": 0.09352,
+            "39": 0.09297,
+            "40": 0.09265,
+            "41": 0.22109,
+            "42": 0.09577,
+            "43": 0.09321,
+            "44": 0.0937,
+            "45": 0.09442,
+            "46": 0.09283,
+            "47": 0.09255,
+            "48": 0.09325,
+            "49": 0.09296,
+            "50": 0.09323
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json
index 35ef87a5085..36d7ec97749 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 522452480.0,
-            "2": 522452480.0,
-            "3": 522452480.0,
-            "4": 522452480.0,
-            "5": 522452480.0,
-            "6": 522452480.0,
-            "7": 522452480.0,
-            "8": 522452480.0,
-            "9": 523501056.0,
-            "10": 522452480.0,
-            "11": 522452480.0,
-            "12": 523501056.0,
-            "13": 522452480.0,
-            "14": 522452480.0,
-            "15": 522452480.0,
-            "16": 522452480.0,
-            "17": 522452480.0,
-            "18": 522452480.0,
-            "19": 523501056.0,
-            "20": 523501056.0,
-            "21": 522452480.0,
-            "22": 522452480.0,
-            "23": 522452480.0,
-            "24": 523501056.0,
-            "25": 522452480.0,
-            "26": 522452480.0,
-            "27": 522452480.0,
-            "28": 522452480.0,
-            "29": 523501056.0,
-            "30": 522452480.0,
-            "31": 522452480.0,
-            "32": 522452480.0,
-            "33": 522452480.0,
-            "34": 522452480.0,
-            "35": 522452480.0,
-            "36": 522452480.0,
-            "37": 522452480.0,
-            "38": 522452480.0,
-            "39": 522452480.0,
-            "40": 522452480.0,
-            "41": 523371008.0,
-            "42": 522452480.0,
-            "43": 522452480.0,
-            "44": 522452480.0,
-            "45": 522452480.0,
-            "46": 523501056.0,
-            "47": 522452480.0,
-            "48": 522452480.0,
-            "49": 523501056.0,
-            "50": 522452480.0
+            "1": 522966528.0,
+            "2": 522966528.0,
+            "3": 522966528.0,
+            "4": 522966528.0,
+            "5": 522966528.0,
+            "6": 522966528.0,
+            "7": 522966528.0,
+            "8": 522966528.0,
+            "9": 522966528.0,
+            "10": 522966528.0,
+            "11": 522966528.0,
+            "12": 522966528.0,
+            "13": 522966528.0,
+            "14": 522966528.0,
+            "15": 522966528.0,
+            "16": 522966528.0,
+            "17": 522966528.0,
+            "18": 522966528.0,
+            "19": 522966528.0,
+            "20": 522966528.0,
+            "21": 522966528.0,
+            "22": 522966528.0,
+            "23": 522966528.0,
+            "24": 522966528.0,
+            "25": 522966528.0,
+            "26": 522966528.0,
+            "27": 522966528.0,
+            "28": 522966528.0,
+            "29": 522966528.0,
+            "30": 522966528.0,
+            "31": 522966528.0,
+            "32": 522966528.0,
+            "33": 522966528.0,
+            "34": 522966528.0,
+            "35": 522966528.0,
+            "36": 522966528.0,
+            "37": 522966528.0,
+            "38": 522966528.0,
+            "39": 522966528.0,
+            "40": 522966528.0,
+            "41": 522966528.0,
+            "42": 522966528.0,
+            "43": 522966528.0,
+            "44": 522966528.0,
+            "45": 522966528.0,
+            "46": 522966528.0,
+            "47": 522966528.0,
+            "48": 522966528.0,
+            "49": 522966528.0,
+            "50": 522966528.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -176,55 +176,55 @@
         "step_interval": 1,
         "values": {
             "1": 3768846848.0,
-            "2": 3913263616.0,
-            "3": 3913263616.0,
-            "4": 3913263616.0,
-            "5": 3913263616.0,
-            "6": 3913263616.0,
-            "7": 3913263616.0,
-            "8": 3913263616.0,
-            "9": 3913263616.0,
-            "10": 3913263616.0,
-            "11": 3913263616.0,
-            "12": 3913263616.0,
-            "13": 3913263616.0,
-            "14": 3913263616.0,
-            "15": 3913263616.0,
-            "16": 3913263616.0,
-            "17": 3913263616.0,
-            "18": 3913263616.0,
-            "19": 3913263616.0,
-            "20": 3913263616.0,
-            "21": 3913263616.0,
-            "22": 3913263616.0,
-            "23": 3913263616.0,
-            "24": 3913263616.0,
-            "25": 3913263616.0,
-            "26": 3913263616.0,
-            "27": 3913263616.0,
-            "28": 3913263616.0,
-            "29": 3913263616.0,
-            "30": 3913263616.0,
-            "31": 3913263616.0,
-            "32": 3913263616.0,
-            "33": 3913263616.0,
-            "34": 3913263616.0,
-            "35": 3913263616.0,
-            "36": 3913263616.0,
-            "37": 3913263616.0,
-            "38": 3913263616.0,
-            "39": 3913263616.0,
-            "40": 3913263616.0,
-            "41": 3913263616.0,
-            "42": 3913263616.0,
-            "43": 3913263616.0,
-            "44": 3913263616.0,
-            "45": 3913263616.0,
-            "46": 3913263616.0,
-            "47": 3913263616.0,
-            "48": 3913263616.0,
-            "49": 3913263616.0,
-            "50": 3913263616.0
+            "2": 3913646592.0,
+            "3": 3913646592.0,
+            "4": 3913646592.0,
+            "5": 3913646592.0,
+            "6": 3913646592.0,
+            "7": 3913646592.0,
+            "8": 3913646592.0,
+            "9": 3913646592.0,
+            "10": 3913646592.0,
+            "11": 3913646592.0,
+            "12": 3913646592.0,
+            "13": 3913646592.0,
+            "14": 3913646592.0,
+            "15": 3913646592.0,
+            "16": 3913646592.0,
+            "17": 3913646592.0,
+            "18": 3913646592.0,
+            "19": 3913646592.0,
+            "20": 3913646592.0,
+            "21": 3913646592.0,
+            "22": 3913646592.0,
+            "23": 3913646592.0,
+            "24": 3913646592.0,
+            "25": 3913646592.0,
+            "26": 3913646592.0,
+            "27": 3913646592.0,
+            "28": 3913646592.0,
+            "29": 3913646592.0,
+            "30": 3913646592.0,
+            "31": 3913646592.0,
+            "32": 3913646592.0,
+            "33": 3913646592.0,
+            "34": 3913646592.0,
+            "35": 3913646592.0,
+            "36": 3913646592.0,
+            "37": 3913646592.0,
+            "38": 3913646592.0,
+            "39": 3913646592.0,
+            "40": 3913646592.0,
+            "41": 3913646592.0,
+            "42": 3913646592.0,
+            "43": 3913646592.0,
+            "44": 3913646592.0,
+            "45": 3913646592.0,
+            "46": 3913646592.0,
+            "47": 3913646592.0,
+            "48": 3913646592.0,
+            "49": 3913646592.0,
+            "50": 3913646592.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.5204,
-            "2": 0.1877,
-            "3": 0.15851,
-            "4": 0.15284,
-            "5": 0.15092,
-            "6": 0.15084,
-            "7": 0.14786,
-            "8": 0.14787,
-            "9": 0.14698,
-            "10": 0.15104,
-            "11": 0.29695,
-            "12": 0.14795,
-            "13": 0.14771,
-            "14": 0.14662,
-            "15": 0.14409,
-            "16": 0.14378,
-            "17": 0.14431,
-            "18": 0.14329,
-            "19": 0.14334,
-            "20": 0.14441,
-            "21": 0.28011,
-            "22": 0.14378,
-            "23": 0.14643,
-            "24": 0.14572,
-            "25": 0.14331,
-            "26": 0.14307,
-            "27": 0.14541,
-            "28": 0.14512,
-            "29": 0.14536,
-            "30": 0.14358,
-            "31": 0.28944,
-            "32": 0.14533,
-            "33": 0.14477,
-            "34": 0.14423,
-            "35": 0.14395,
-            "36": 0.14486,
-            "37": 0.14319,
-            "38": 0.14455,
-            "39": 0.14454,
-            "40": 0.14537,
-            "41": 0.29312,
-            "42": 0.14458,
-            "43": 0.14749,
-            "44": 0.14448,
-            "45": 0.14501,
-            "46": 0.14588,
-            "47": 0.14249,
-            "48": 0.14564,
-            "49": 0.14388,
-            "50": 0.14222
+            "1": 7.26942,
+            "2": 0.17361,
+            "3": 0.16661,
+            "4": 0.15374,
+            "5": 0.1539,
+            "6": 0.15237,
+            "7": 0.15491,
+            "8": 0.16016,
+            "9": 0.1524,
+            "10": 0.14907,
+            "11": 0.28249,
+            "12": 0.14867,
+            "13": 0.14835,
+            "14": 0.14748,
+            "15": 0.14906,
+            "16": 0.14768,
+            "17": 0.15182,
+            "18": 0.14947,
+            "19": 0.15009,
+            "20": 0.14968,
+            "21": 0.28262,
+            "22": 0.14991,
+            "23": 0.14955,
+            "24": 0.14949,
+            "25": 0.14929,
+            "26": 0.14942,
+            "27": 0.14898,
+            "28": 0.15187,
+            "29": 0.14918,
+            "30": 0.14827,
+            "31": 0.2861,
+            "32": 0.14873,
+            "33": 0.14777,
+            "34": 0.14736,
+            "35": 0.14865,
+            "36": 0.14795,
+            "37": 0.148,
+            "38": 0.14799,
+            "39": 0.14777,
+            "40": 0.14776,
+            "41": 0.28572,
+            "42": 0.14812,
+            "43": 0.14967,
+            "44": 0.14785,
+            "45": 0.14785,
+            "46": 0.14867,
+            "47": 0.14775,
+            "48": 0.14841,
+            "49": 0.14786,
+            "50": 0.14872
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml
index 778e7d361b3..f6a72754edb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..a34edb3389a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82555,
+            "2": 10.83286,
+            "3": 10.82762,
+            "4": 10.79573,
+            "5": 10.85695,
+            "6": 10.86391,
+            "7": 10.82616,
+            "8": 10.82544,
+            "9": 10.83584,
+            "10": 10.79629,
+            "11": 10.8782,
+            "12": 10.85821,
+            "13": 10.85418,
+            "14": 10.87518,
+            "15": 10.79205,
+            "16": 10.80305,
+            "17": 10.77428,
+            "18": 10.8046,
+            "19": 10.79338,
+            "20": 10.69563,
+            "21": 10.68645,
+            "22": 10.53149,
+            "23": 10.70629,
+            "24": 10.57273,
+            "25": 10.5144,
+            "26": 10.58993,
+            "27": 10.60707,
+            "28": 10.57003,
+            "29": 10.58929,
+            "30": 10.34675,
+            "31": 10.07709,
+            "32": 10.46194,
+            "33": 10.45484,
+            "34": 10.19662,
+            "35": 10.25291,
+            "36": 10.20971,
+            "37": 10.34492,
+            "38": 10.17789,
+            "39": 10.4061,
+            "40": 10.07414,
+            "41": 10.12736,
+            "42": 10.20823,
+            "43": 9.81194,
+            "44": 9.93354,
+            "45": 9.80953,
+            "46": 9.79773,
+            "47": 10.11569,
+            "48": 9.83234,
+            "49": 9.50281,
+            "50": 9.88181,
+            "51": 9.83458,
+            "52": 9.71756,
+            "53": 10.05126,
+            "54": 9.94371,
+            "55": 9.87457,
+            "56": 9.6029,
+            "57": 9.45086,
+            "58": 9.811,
+            "59": 9.56395,
+            "60": 9.47155,
+            "61": 9.66553,
+            "62": 9.96353,
+            "63": 9.34709,
+            "64": 9.743,
+            "65": 8.92136,
+            "66": 9.67858,
+            "67": 9.35222,
+            "68": 9.76563,
+            "69": 9.7774,
+            "70": 9.70407,
+            "71": 9.60099,
+            "72": 9.5498,
+            "73": 9.46046,
+            "74": 8.89068,
+            "75": 9.3874,
+            "76": 9.04469,
+            "77": 10.03647,
+            "78": 9.6996,
+            "79": 9.34722,
+            "80": 9.37822,
+            "81": 9.45421,
+            "82": 9.67529,
+            "83": 9.28446,
+            "84": 9.39113,
+            "85": 9.58663,
+            "86": 9.04694,
+            "87": 9.56972,
+            "88": 9.72085,
+            "89": 9.5673,
+            "90": 9.79474,
+            "91": 9.30448,
+            "92": 9.32183,
+            "93": 9.0517,
+            "94": 8.79005,
+            "95": 9.4918,
+            "96": 9.48711,
+            "97": 9.26589,
+            "98": 9.62592,
+            "99": 8.85252,
+            "100": 9.35907
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1651.0,
+            "2": 1716.0,
+            "3": 1772.0,
+            "4": 1774.0,
+            "5": 1920.0,
+            "6": 1864.0,
+            "7": 1830.0,
+            "8": 1695.0,
+            "9": 1858.0,
+            "10": 1367.0,
+            "11": 1915.0,
+            "12": 1797.0,
+            "13": 1899.0,
+            "14": 1769.0,
+            "15": 1880.0,
+            "16": 1806.0,
+            "17": 1822.0,
+            "18": 1686.0,
+            "19": 1728.0,
+            "20": 1667.0,
+            "21": 1897.0,
+            "22": 1703.0,
+            "23": 1967.0,
+            "24": 1595.0,
+            "25": 1583.0,
+            "26": 1684.0,
+            "27": 1911.0,
+            "28": 1969.0,
+            "29": 1864.0,
+            "30": 1943.0,
+            "31": 1535.0,
+            "32": 1895.0,
+            "33": 2078.0,
+            "34": 1739.0,
+            "35": 1940.0,
+            "36": 1919.0,
+            "37": 2460.0,
+            "38": 2107.0,
+            "39": 2261.0,
+            "40": 2059.0,
+            "41": 2183.0,
+            "42": 2269.0,
+            "43": 1972.0,
+            "44": 2040.0,
+            "45": 2093.0,
+            "46": 2140.0,
+            "47": 2476.0,
+            "48": 2311.0,
+            "49": 2165.0,
+            "50": 2411.0,
+            "51": 2471.0,
+            "52": 2670.0,
+            "53": 2883.0,
+            "54": 2589.0,
+            "55": 2427.0,
+            "56": 2774.0,
+            "57": 2246.0,
+            "58": 2994.0,
+            "59": 2922.0,
+            "60": 2416.0,
+            "61": 2960.0,
+            "62": 2646.0,
+            "63": 2488.0,
+            "64": 2956.0,
+            "65": 2746.0,
+            "66": 2864.0,
+            "67": 2794.0,
+            "68": 2703.0,
+            "69": 2990.0,
+            "70": 3012.0,
+            "71": 2884.0,
+            "72": 2536.0,
+            "73": 3054.0,
+            "74": 2100.0,
+            "75": 2573.0,
+            "76": 3076.0,
+            "77": 3025.0,
+            "78": 3014.0,
+            "79": 3083.0,
+            "80": 2989.0,
+            "81": 3452.0,
+            "82": 3253.0,
+            "83": 2759.0,
+            "84": 3186.0,
+            "85": 3247.0,
+            "86": 2624.0,
+            "87": 3594.0,
+            "88": 3009.0,
+            "89": 3286.0,
+            "90": 3354.0,
+            "91": 2869.0,
+            "92": 3156.0,
+            "93": 2809.0,
+            "94": 3350.0,
+            "95": 3033.0,
+            "96": 3323.0,
+            "97": 3091.0,
+            "98": 3356.0,
+            "99": 3326.0,
+            "100": 3144.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0,
+            "51": 759682560.0,
+            "52": 759682560.0,
+            "53": 759682560.0,
+            "54": 759682560.0,
+            "55": 759682560.0,
+            "56": 759682560.0,
+            "57": 759682560.0,
+            "58": 759682560.0,
+            "59": 759682560.0,
+            "60": 759682560.0,
+            "61": 759682560.0,
+            "62": 759682560.0,
+            "63": 759682560.0,
+            "64": 759682560.0,
+            "65": 759682560.0,
+            "66": 759682560.0,
+            "67": 759682560.0,
+            "68": 759682560.0,
+            "69": 759682560.0,
+            "70": 759682560.0,
+            "71": 759682560.0,
+            "72": 759682560.0,
+            "73": 759682560.0,
+            "74": 759682560.0,
+            "75": 759682560.0,
+            "76": 759682560.0,
+            "77": 759682560.0,
+            "78": 759682560.0,
+            "79": 759682560.0,
+            "80": 759682560.0,
+            "81": 759682560.0,
+            "82": 759682560.0,
+            "83": 759682560.0,
+            "84": 759682560.0,
+            "85": 759682560.0,
+            "86": 759682560.0,
+            "87": 759682560.0,
+            "88": 759682560.0,
+            "89": 759682560.0,
+            "90": 759682560.0,
+            "91": 759682560.0,
+            "92": 759682560.0,
+            "93": 759682560.0,
+            "94": 759682560.0,
+            "95": 759682560.0,
+            "96": 759682560.0,
+            "97": 759682560.0,
+            "98": 759682560.0,
+            "99": 759682560.0,
+            "100": 759682560.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2395798528.0,
+            "2": 2677510144.0,
+            "3": 2677510144.0,
+            "4": 2677510144.0,
+            "5": 2677510144.0,
+            "6": 2677510144.0,
+            "7": 2677510144.0,
+            "8": 2677510144.0,
+            "9": 2677510144.0,
+            "10": 2677510144.0,
+            "11": 2677510144.0,
+            "12": 2677510144.0,
+            "13": 2677510144.0,
+            "14": 2677510144.0,
+            "15": 2677510144.0,
+            "16": 2677510144.0,
+            "17": 2677510144.0,
+            "18": 2677510144.0,
+            "19": 2677510144.0,
+            "20": 2677510144.0,
+            "21": 2677510144.0,
+            "22": 2677510144.0,
+            "23": 2677510144.0,
+            "24": 2677510144.0,
+            "25": 2677510144.0,
+            "26": 2677510144.0,
+            "27": 2677510144.0,
+            "28": 2677510144.0,
+            "29": 2677510144.0,
+            "30": 2677510144.0,
+            "31": 2677510144.0,
+            "32": 2677510144.0,
+            "33": 2677510144.0,
+            "34": 2677510144.0,
+            "35": 2677510144.0,
+            "36": 2677510144.0,
+            "37": 2677510144.0,
+            "38": 2677510144.0,
+            "39": 2677510144.0,
+            "40": 2677510144.0,
+            "41": 2677510144.0,
+            "42": 2677510144.0,
+            "43": 2677510144.0,
+            "44": 2677510144.0,
+            "45": 2677510144.0,
+            "46": 2677510144.0,
+            "47": 2677510144.0,
+            "48": 2677510144.0,
+            "49": 2677510144.0,
+            "50": 2677510144.0,
+            "51": 2677510144.0,
+            "52": 2677510144.0,
+            "53": 2677510144.0,
+            "54": 2677510144.0,
+            "55": 2677510144.0,
+            "56": 2677510144.0,
+            "57": 2677510144.0,
+            "58": 2677510144.0,
+            "59": 2677510144.0,
+            "60": 2677510144.0,
+            "61": 2677510144.0,
+            "62": 2677510144.0,
+            "63": 2677510144.0,
+            "64": 2677510144.0,
+            "65": 2677510144.0,
+            "66": 2677510144.0,
+            "67": 2677510144.0,
+            "68": 2677510144.0,
+            "69": 2677510144.0,
+            "70": 2677510144.0,
+            "71": 2677510144.0,
+            "72": 2677510144.0,
+            "73": 2677510144.0,
+            "74": 2677510144.0,
+            "75": 2677510144.0,
+            "76": 2677510144.0,
+            "77": 2677510144.0,
+            "78": 2677510144.0,
+            "79": 2677510144.0,
+            "80": 2677510144.0,
+            "81": 2677510144.0,
+            "82": 2677510144.0,
+            "83": 2677510144.0,
+            "84": 2677510144.0,
+            "85": 2677510144.0,
+            "86": 2677510144.0,
+            "87": 2677510144.0,
+            "88": 2677510144.0,
+            "89": 2677510144.0,
+            "90": 2677510144.0,
+            "91": 2677510144.0,
+            "92": 2677510144.0,
+            "93": 2677510144.0,
+            "94": 2677510144.0,
+            "95": 2677510144.0,
+            "96": 2677510144.0,
+            "97": 2677510144.0,
+            "98": 2677510144.0,
+            "99": 2677510144.0,
+            "100": 2677510144.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.73603,
+            "3": 0.12344,
+            "4": 0.10783,
+            "5": 0.10595,
+            "6": 0.10649,
+            "7": 0.10691,
+            "8": 0.10679,
+            "9": 0.10607,
+            "10": 0.10675,
+            "11": 0.10687,
+            "12": 0.10636,
+            "13": 0.10663,
+            "14": 0.10668,
+            "15": 0.10696,
+            "16": 0.10672,
+            "17": 0.10678,
+            "18": 0.10603,
+            "19": 0.10659,
+            "20": 0.10684,
+            "21": 0.10766,
+            "22": 0.10849,
+            "23": 0.10853,
+            "24": 0.10805,
+            "25": 0.10776,
+            "26": 0.1069,
+            "27": 0.10818,
+            "28": 0.10669,
+            "29": 0.10643,
+            "30": 0.10634,
+            "31": 0.10766,
+            "32": 0.1076,
+            "33": 0.10583,
+            "34": 0.10631,
+            "35": 0.10587,
+            "36": 0.1054,
+            "37": 0.10589,
+            "38": 0.10633,
+            "39": 0.10593,
+            "40": 0.10674,
+            "41": 0.10812,
+            "42": 0.11127,
+            "43": 0.11494,
+            "44": 0.11409,
+            "45": 0.11538,
+            "46": 0.11702,
+            "47": 0.1155,
+            "48": 0.11481,
+            "49": 0.11507,
+            "50": 0.11401,
+            "51": 0.11655,
+            "52": 0.11513,
+            "53": 0.11379,
+            "54": 0.11378,
+            "55": 0.11658,
+            "56": 0.11792,
+            "57": 0.11792,
+            "58": 0.11715,
+            "59": 0.11915,
+            "60": 0.11642,
+            "61": 0.11578,
+            "62": 0.1171,
+            "63": 0.11758,
+            "64": 0.11517,
+            "65": 0.11624,
+            "66": 0.11434,
+            "67": 0.11609,
+            "68": 0.11506,
+            "69": 0.11568,
+            "70": 0.11661,
+            "71": 0.11647,
+            "72": 0.1166,
+            "73": 0.11795,
+            "74": 0.11661,
+            "75": 0.11785,
+            "76": 0.11659,
+            "77": 0.11531,
+            "78": 0.11705,
+            "79": 0.11662,
+            "80": 0.11765,
+            "81": 0.11829,
+            "82": 0.11742,
+            "83": 0.11529,
+            "84": 0.11678,
+            "85": 0.11581,
+            "86": 0.11703,
+            "87": 0.11699,
+            "88": 0.11641,
+            "89": 0.11638,
+            "90": 0.11586,
+            "91": 0.11853,
+            "92": 0.11725,
+            "93": 0.1178,
+            "94": 0.11647,
+            "95": 0.11672,
+            "96": 0.11702,
+            "97": 0.11754,
+            "98": 0.11614,
+            "99": 0.11757,
+            "100": 0.11708
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json
index 603dba4c2e5..2f0a7e29034 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json
@@ -21,89 +21,89 @@
             "15": 10.81973,
             "16": 10.83156,
             "17": 10.79863,
-            "18": 10.81648,
-            "19": 10.8189,
+            "18": 10.8165,
+            "19": 10.81889,
             "20": 10.72685,
-            "21": 10.70581,
-            "22": 10.56347,
-            "23": 10.72794,
-            "24": 10.60761,
+            "21": 10.7058,
+            "22": 10.5635,
+            "23": 10.7279,
+            "24": 10.6076,
             "25": 10.55128,
-            "26": 10.60749,
-            "27": 10.6277,
-            "28": 10.58262,
-            "29": 10.59959,
-            "30": 10.36566,
-            "31": 10.11988,
-            "32": 10.4755,
-            "33": 10.46637,
-            "34": 10.22009,
-            "35": 10.2744,
-            "36": 10.22594,
-            "37": 10.35729,
-            "38": 10.19156,
+            "26": 10.60747,
+            "27": 10.62771,
+            "28": 10.5826,
+            "29": 10.59962,
+            "30": 10.36565,
+            "31": 10.1199,
+            "32": 10.47544,
+            "33": 10.46636,
+            "34": 10.22008,
+            "35": 10.27436,
+            "36": 10.2259,
+            "37": 10.3573,
+            "38": 10.19161,
             "39": 10.41342,
-            "40": 10.0956,
-            "41": 10.15511,
+            "40": 10.09564,
+            "41": 10.15513,
             "42": 10.22085,
-            "43": 9.82797,
-            "44": 9.96276,
-            "45": 9.83425,
-            "46": 9.82209,
-            "47": 10.14765,
-            "48": 9.84681,
-            "49": 9.53377,
-            "50": 9.90532,
-            "51": 9.85116,
-            "52": 9.73516,
-            "53": 10.05863,
-            "54": 9.94369,
+            "43": 9.82792,
+            "44": 9.96282,
+            "45": 9.83422,
+            "46": 9.8221,
+            "47": 10.14764,
+            "48": 9.84684,
+            "49": 9.53373,
+            "50": 9.90531,
+            "51": 9.85118,
+            "52": 9.73512,
+            "53": 10.05864,
+            "54": 9.94367,
             "55": 9.87297,
-            "56": 9.61703,
-            "57": 9.4675,
-            "58": 9.82223,
-            "59": 9.57338,
-            "60": 9.48861,
-            "61": 9.67921,
-            "62": 9.97513,
-            "63": 9.37045,
-            "64": 9.76643,
-            "65": 8.93435,
-            "66": 9.69463,
-            "67": 9.35357,
+            "56": 9.61699,
+            "57": 9.46751,
+            "58": 9.82221,
+            "59": 9.57334,
+            "60": 9.48862,
+            "61": 9.67922,
+            "62": 9.97512,
+            "63": 9.37044,
+            "64": 9.76642,
+            "65": 8.9343,
+            "66": 9.69461,
+            "67": 9.35362,
             "68": 9.76826,
-            "69": 9.77682,
-            "70": 9.72364,
-            "71": 9.59895,
-            "72": 9.56454,
-            "73": 9.48327,
-            "74": 8.92062,
+            "69": 9.77678,
+            "70": 9.72363,
+            "71": 9.59894,
+            "72": 9.56455,
+            "73": 9.48329,
+            "74": 8.92064,
             "75": 9.40392,
-            "76": 9.05301,
-            "77": 10.04175,
+            "76": 9.05297,
+            "77": 10.04178,
             "78": 9.69879,
-            "79": 9.35128,
-            "80": 9.38215,
-            "81": 9.45866,
-            "82": 9.67518,
-            "83": 9.28411,
-            "84": 9.39313,
-            "85": 9.5893,
-            "86": 9.05182,
-            "87": 9.56419,
-            "88": 9.71756,
+            "79": 9.35126,
+            "80": 9.38212,
+            "81": 9.45864,
+            "82": 9.67516,
+            "83": 9.2841,
+            "84": 9.39311,
+            "85": 9.58936,
+            "86": 9.05178,
+            "87": 9.56418,
+            "88": 9.71755,
             "89": 9.57129,
             "90": 9.78202,
-            "91": 9.3061,
-            "92": 9.32048,
-            "93": 9.03942,
-            "94": 8.79522,
-            "95": 9.47913,
-            "96": 9.48454,
-            "97": 9.2699,
-            "98": 9.62563,
-            "99": 8.84255,
-            "100": 9.34982
+            "91": 9.30611,
+            "92": 9.32046,
+            "93": 9.03939,
+            "94": 8.7952,
+            "95": 9.47908,
+            "96": 9.48453,
+            "97": 9.26989,
+            "98": 9.62564,
+            "99": 8.84254,
+            "100": 9.3498
         }
     },
     "num-zeros": {
@@ -126,91 +126,91 @@
             "13": 1931.0,
             "14": 1678.0,
             "15": 1918.0,
-            "16": 1961.0,
-            "17": 1711.0,
-            "18": 1658.0,
-            "19": 1791.0,
-            "20": 1610.0,
-            "21": 1815.0,
-            "22": 1677.0,
-            "23": 1952.0,
-            "24": 1612.0,
-            "25": 1597.0,
-            "26": 1657.0,
-            "27": 1850.0,
-            "28": 2013.0,
-            "29": 1966.0,
-            "30": 1875.0,
-            "31": 1585.0,
-            "32": 1941.0,
-            "33": 2085.0,
-            "34": 1837.0,
-            "35": 2045.0,
-            "36": 1898.0,
-            "37": 2333.0,
-            "38": 2247.0,
-            "39": 2266.0,
-            "40": 2184.0,
-            "41": 2209.0,
-            "42": 2164.0,
-            "43": 2076.0,
-            "44": 2169.0,
-            "45": 2077.0,
-            "46": 2325.0,
-            "47": 2505.0,
-            "48": 2442.0,
-            "49": 2205.0,
-            "50": 2196.0,
-            "51": 2500.0,
-            "52": 2572.0,
-            "53": 2905.0,
-            "54": 2794.0,
-            "55": 2351.0,
-            "56": 2606.0,
-            "57": 2388.0,
-            "58": 2864.0,
-            "59": 2726.0,
-            "60": 2359.0,
-            "61": 2915.0,
-            "62": 2610.0,
-            "63": 2397.0,
-            "64": 2886.0,
-            "65": 2577.0,
-            "66": 2913.0,
-            "67": 2715.0,
-            "68": 2646.0,
-            "69": 2805.0,
-            "70": 3151.0,
-            "71": 2917.0,
-            "72": 2403.0,
-            "73": 2948.0,
-            "74": 1994.0,
-            "75": 2425.0,
-            "76": 2898.0,
-            "77": 3085.0,
-            "78": 3228.0,
-            "79": 2981.0,
-            "80": 3254.0,
-            "81": 3499.0,
-            "82": 3121.0,
-            "83": 2711.0,
-            "84": 3105.0,
-            "85": 3492.0,
-            "86": 2693.0,
-            "87": 3602.0,
-            "88": 3052.0,
-            "89": 3230.0,
-            "90": 3160.0,
-            "91": 2647.0,
-            "92": 3160.0,
-            "93": 2650.0,
-            "94": 3430.0,
-            "95": 3247.0,
-            "96": 3353.0,
-            "97": 3064.0,
-            "98": 3486.0,
-            "99": 3190.0,
-            "100": 3076.0
+            "16": 1945.0,
+            "17": 1707.0,
+            "18": 1635.0,
+            "19": 1720.0,
+            "20": 1609.0,
+            "21": 1813.0,
+            "22": 1682.0,
+            "23": 1908.0,
+            "24": 1620.0,
+            "25": 1563.0,
+            "26": 1640.0,
+            "27": 1775.0,
+            "28": 1873.0,
+            "29": 1969.0,
+            "30": 1896.0,
+            "31": 1588.0,
+            "32": 1907.0,
+            "33": 2180.0,
+            "34": 1850.0,
+            "35": 1987.0,
+            "36": 1901.0,
+            "37": 2358.0,
+            "38": 2253.0,
+            "39": 2364.0,
+            "40": 2173.0,
+            "41": 2234.0,
+            "42": 2281.0,
+            "43": 2027.0,
+            "44": 2127.0,
+            "45": 2170.0,
+            "46": 2317.0,
+            "47": 2438.0,
+            "48": 2391.0,
+            "49": 2276.0,
+            "50": 2205.0,
+            "51": 2647.0,
+            "52": 2533.0,
+            "53": 2935.0,
+            "54": 2623.0,
+            "55": 2386.0,
+            "56": 2664.0,
+            "57": 2391.0,
+            "58": 2863.0,
+            "59": 2758.0,
+            "60": 2456.0,
+            "61": 2865.0,
+            "62": 2559.0,
+            "63": 2463.0,
+            "64": 3014.0,
+            "65": 2526.0,
+            "66": 3010.0,
+            "67": 2723.0,
+            "68": 2616.0,
+            "69": 2739.0,
+            "70": 3188.0,
+            "71": 2919.0,
+            "72": 2355.0,
+            "73": 2921.0,
+            "74": 1944.0,
+            "75": 2454.0,
+            "76": 3005.0,
+            "77": 3204.0,
+            "78": 3244.0,
+            "79": 3047.0,
+            "80": 3220.0,
+            "81": 3492.0,
+            "82": 3205.0,
+            "83": 2692.0,
+            "84": 3149.0,
+            "85": 3256.0,
+            "86": 2562.0,
+            "87": 3753.0,
+            "88": 2921.0,
+            "89": 3239.0,
+            "90": 3001.0,
+            "91": 2656.0,
+            "92": 3146.0,
+            "93": 2642.0,
+            "94": 3289.0,
+            "95": 3324.0,
+            "96": 3350.0,
+            "97": 3079.0,
+            "98": 3564.0,
+            "99": 3215.0,
+            "100": 3238.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 759681536.0,
-            "2": 759681536.0,
-            "3": 759681536.0,
-            "4": 759681536.0,
-            "5": 759681536.0,
-            "6": 759681536.0,
-            "7": 759681536.0,
-            "8": 759681536.0,
-            "9": 759681536.0,
-            "10": 759681536.0,
-            "11": 759681536.0,
-            "12": 759681536.0,
-            "13": 759681536.0,
-            "14": 759681536.0,
-            "15": 759681536.0,
-            "16": 759681536.0,
-            "17": 759681536.0,
-            "18": 759681536.0,
-            "19": 759681536.0,
-            "20": 759681536.0,
-            "21": 759681536.0,
-            "22": 759681536.0,
-            "23": 759681536.0,
-            "24": 759681536.0,
-            "25": 759681536.0,
-            "26": 759681536.0,
-            "27": 759681536.0,
-            "28": 759681536.0,
-            "29": 759681536.0,
-            "30": 759681536.0,
-            "31": 759681536.0,
-            "32": 759681536.0,
-            "33": 759681536.0,
-            "34": 759681536.0,
-            "35": 759681536.0,
-            "36": 759681536.0,
-            "37": 759681536.0,
-            "38": 759681536.0,
-            "39": 759681536.0,
-            "40": 759681536.0,
-            "41": 759681536.0,
-            "42": 759681536.0,
-            "43": 759681536.0,
-            "44": 759681536.0,
-            "45": 759681536.0,
-            "46": 759681536.0,
-            "47": 759681536.0,
-            "48": 759681536.0,
-            "49": 759681536.0,
-            "50": 759681536.0,
-            "51": 759681536.0,
-            "52": 759681536.0,
-            "53": 759681536.0,
-            "54": 759681536.0,
-            "55": 759681536.0,
-            "56": 759681536.0,
-            "57": 759681536.0,
-            "58": 759681536.0,
-            "59": 759681536.0,
-            "60": 759681536.0,
-            "61": 759681536.0,
-            "62": 759681536.0,
-            "63": 759681536.0,
-            "64": 759681536.0,
-            "65": 759681536.0,
-            "66": 759681536.0,
-            "67": 759681536.0,
-            "68": 759681536.0,
-            "69": 759681536.0,
-            "70": 759681536.0,
-            "71": 759681536.0,
-            "72": 759681536.0,
-            "73": 759681536.0,
-            "74": 759681536.0,
-            "75": 759681536.0,
-            "76": 759681536.0,
-            "77": 759681536.0,
-            "78": 759681536.0,
-            "79": 759681536.0,
-            "80": 759681536.0,
-            "81": 759681536.0,
-            "82": 759681536.0,
-            "83": 759681536.0,
-            "84": 759681536.0,
-            "85": 759681536.0,
-            "86": 759681536.0,
-            "87": 759681536.0,
-            "88": 759681536.0,
-            "89": 759681536.0,
-            "90": 759681536.0,
-            "91": 759681536.0,
-            "92": 759681536.0,
-            "93": 759681536.0,
-            "94": 759681536.0,
-            "95": 759681536.0,
-            "96": 759681536.0,
-            "97": 759681536.0,
-            "98": 759681536.0,
-            "99": 759681536.0,
-            "100": 759681536.0
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0,
+            "51": 759682560.0,
+            "52": 759682560.0,
+            "53": 759682560.0,
+            "54": 759682560.0,
+            "55": 759682560.0,
+            "56": 759682560.0,
+            "57": 759682560.0,
+            "58": 759682560.0,
+            "59": 759682560.0,
+            "60": 759682560.0,
+            "61": 759682560.0,
+            "62": 759682560.0,
+            "63": 759682560.0,
+            "64": 759682560.0,
+            "65": 759682560.0,
+            "66": 759682560.0,
+            "67": 759682560.0,
+            "68": 759682560.0,
+            "69": 759682560.0,
+            "70": 759682560.0,
+            "71": 759682560.0,
+            "72": 759682560.0,
+            "73": 759682560.0,
+            "74": 759682560.0,
+            "75": 759682560.0,
+            "76": 759682560.0,
+            "77": 759682560.0,
+            "78": 759682560.0,
+            "79": 759682560.0,
+            "80": 759682560.0,
+            "81": 759682560.0,
+            "82": 759682560.0,
+            "83": 759682560.0,
+            "84": 759682560.0,
+            "85": 759682560.0,
+            "86": 759682560.0,
+            "87": 759682560.0,
+            "88": 759682560.0,
+            "89": 759682560.0,
+            "90": 759682560.0,
+            "91": 759682560.0,
+            "92": 759682560.0,
+            "93": 759682560.0,
+            "94": 759682560.0,
+            "95": 759682560.0,
+            "96": 759682560.0,
+            "97": 759682560.0,
+            "98": 759682560.0,
+            "99": 759682560.0,
+            "100": 759682560.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2358048768.0,
-            "2": 2639760384.0,
-            "3": 2639760384.0,
-            "4": 2639760384.0,
-            "5": 2639760384.0,
-            "6": 2639760384.0,
-            "7": 2639760384.0,
-            "8": 2639760384.0,
-            "9": 2639760384.0,
-            "10": 2639760384.0,
-            "11": 2639760384.0,
-            "12": 2639760384.0,
-            "13": 2639760384.0,
-            "14": 2639760384.0,
-            "15": 2639760384.0,
-            "16": 2639760384.0,
-            "17": 2639760384.0,
-            "18": 2639760384.0,
-            "19": 2639760384.0,
-            "20": 2639760384.0,
-            "21": 2639760384.0,
-            "22": 2639760384.0,
-            "23": 2639760384.0,
-            "24": 2639760384.0,
-            "25": 2639760384.0,
-            "26": 2639760384.0,
-            "27": 2639760384.0,
-            "28": 2639760384.0,
-            "29": 2639760384.0,
-            "30": 2639760384.0,
-            "31": 2639760384.0,
-            "32": 2639760384.0,
-            "33": 2639760384.0,
-            "34": 2639760384.0,
-            "35": 2639760384.0,
-            "36": 2639760384.0,
-            "37": 2639760384.0,
-            "38": 2639760384.0,
-            "39": 2639760384.0,
-            "40": 2639760384.0,
-            "41": 2639760384.0,
-            "42": 2639760384.0,
-            "43": 2639760384.0,
-            "44": 2639760384.0,
-            "45": 2639760384.0,
-            "46": 2639760384.0,
-            "47": 2639760384.0,
-            "48": 2639760384.0,
-            "49": 2639760384.0,
-            "50": 2639760384.0,
-            "51": 2639760384.0,
-            "52": 2639760384.0,
-            "53": 2639760384.0,
-            "54": 2639760384.0,
-            "55": 2639760384.0,
-            "56": 2639760384.0,
-            "57": 2639760384.0,
-            "58": 2639760384.0,
-            "59": 2639760384.0,
-            "60": 2639760384.0,
-            "61": 2639760384.0,
-            "62": 2639760384.0,
-            "63": 2639760384.0,
-            "64": 2639760384.0,
-            "65": 2639760384.0,
-            "66": 2639760384.0,
-            "67": 2639760384.0,
-            "68": 2639760384.0,
-            "69": 2639760384.0,
-            "70": 2639760384.0,
-            "71": 2639760384.0,
-            "72": 2639760384.0,
-            "73": 2639760384.0,
-            "74": 2639760384.0,
-            "75": 2639760384.0,
-            "76": 2639760384.0,
-            "77": 2639760384.0,
-            "78": 2639760384.0,
-            "79": 2639760384.0,
-            "80": 2639760384.0,
-            "81": 2639760384.0,
-            "82": 2639760384.0,
-            "83": 2639760384.0,
-            "84": 2639760384.0,
-            "85": 2639760384.0,
-            "86": 2639760384.0,
-            "87": 2639760384.0,
-            "88": 2639760384.0,
-            "89": 2639760384.0,
-            "90": 2639760384.0,
-            "91": 2639760384.0,
-            "92": 2639760384.0,
-            "93": 2639760384.0,
-            "94": 2639760384.0,
-            "95": 2639760384.0,
-            "96": 2639760384.0,
-            "97": 2639760384.0,
-            "98": 2639760384.0,
-            "99": 2639760384.0,
-            "100": 2639760384.0
+            "1": 2358049792.0,
+            "2": 2639761408.0,
+            "3": 2639761408.0,
+            "4": 2639761408.0,
+            "5": 2639761408.0,
+            "6": 2639761408.0,
+            "7": 2639761408.0,
+            "8": 2639761408.0,
+            "9": 2639761408.0,
+            "10": 2639761408.0,
+            "11": 2639761408.0,
+            "12": 2639761408.0,
+            "13": 2639761408.0,
+            "14": 2639761408.0,
+            "15": 2639761408.0,
+            "16": 2639761408.0,
+            "17": 2639761408.0,
+            "18": 2639761408.0,
+            "19": 2639761408.0,
+            "20": 2639761408.0,
+            "21": 2639761408.0,
+            "22": 2639761408.0,
+            "23": 2639761408.0,
+            "24": 2639761408.0,
+            "25": 2639761408.0,
+            "26": 2639761408.0,
+            "27": 2639761408.0,
+            "28": 2639761408.0,
+            "29": 2639761408.0,
+            "30": 2639761408.0,
+            "31": 2639761408.0,
+            "32": 2639761408.0,
+            "33": 2639761408.0,
+            "34": 2639761408.0,
+            "35": 2639761408.0,
+            "36": 2639761408.0,
+            "37": 2639761408.0,
+            "38": 2639761408.0,
+            "39": 2639761408.0,
+            "40": 2639761408.0,
+            "41": 2639761408.0,
+            "42": 2639761408.0,
+            "43": 2639761408.0,
+            "44": 2639761408.0,
+            "45": 2639761408.0,
+            "46": 2639761408.0,
+            "47": 2639761408.0,
+            "48": 2639761408.0,
+            "49": 2639761408.0,
+            "50": 2639761408.0,
+            "51": 2639761408.0,
+            "52": 2639761408.0,
+            "53": 2639761408.0,
+            "54": 2639761408.0,
+            "55": 2639761408.0,
+            "56": 2639761408.0,
+            "57": 2639761408.0,
+            "58": 2639761408.0,
+            "59": 2639761408.0,
+            "60": 2639761408.0,
+            "61": 2639761408.0,
+            "62": 2639761408.0,
+            "63": 2639761408.0,
+            "64": 2639761408.0,
+            "65": 2639761408.0,
+            "66": 2639761408.0,
+            "67": 2639761408.0,
+            "68": 2639761408.0,
+            "69": 2639761408.0,
+            "70": 2639761408.0,
+            "71": 2639761408.0,
+            "72": 2639761408.0,
+            "73": 2639761408.0,
+            "74": 2639761408.0,
+            "75": 2639761408.0,
+            "76": 2639761408.0,
+            "77": 2639761408.0,
+            "78": 2639761408.0,
+            "79": 2639761408.0,
+            "80": 2639761408.0,
+            "81": 2639761408.0,
+            "82": 2639761408.0,
+            "83": 2639761408.0,
+            "84": 2639761408.0,
+            "85": 2639761408.0,
+            "86": 2639761408.0,
+            "87": 2639761408.0,
+            "88": 2639761408.0,
+            "89": 2639761408.0,
+            "90": 2639761408.0,
+            "91": 2639761408.0,
+            "92": 2639761408.0,
+            "93": 2639761408.0,
+            "94": 2639761408.0,
+            "95": 2639761408.0,
+            "96": 2639761408.0,
+            "97": 2639761408.0,
+            "98": 2639761408.0,
+            "99": 2639761408.0,
+            "100": 2639761408.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 16.0335,
-            "2": 0.14377,
-            "3": 0.129,
-            "4": 0.12162,
-            "5": 0.11612,
-            "6": 0.11324,
-            "7": 0.11415,
-            "8": 0.11274,
-            "9": 0.11392,
-            "10": 0.11729,
-            "11": 0.11228,
-            "12": 0.11141,
-            "13": 0.11245,
-            "14": 0.11042,
-            "15": 0.11174,
-            "16": 0.1114,
-            "17": 0.11204,
-            "18": 0.11241,
-            "19": 0.11298,
-            "20": 0.11272,
-            "21": 0.11169,
-            "22": 0.11228,
-            "23": 0.11255,
-            "24": 0.11124,
-            "25": 0.11188,
-            "26": 0.11351,
-            "27": 0.11159,
-            "28": 0.11318,
-            "29": 0.11016,
-            "30": 0.11051,
-            "31": 0.11184,
-            "32": 0.11116,
-            "33": 0.1106,
-            "34": 0.11105,
-            "35": 0.113,
-            "36": 0.11198,
-            "37": 0.1117,
-            "38": 0.11109,
-            "39": 0.1099,
-            "40": 0.11097,
-            "41": 0.11159,
-            "42": 0.11191,
-            "43": 0.11283,
-            "44": 0.11266,
-            "45": 0.111,
-            "46": 0.11347,
-            "47": 0.1099,
-            "48": 0.10973,
-            "49": 0.11225,
-            "50": 0.11231,
-            "51": 0.1122,
-            "52": 0.10985,
-            "53": 0.11147,
-            "54": 0.11064,
-            "55": 0.11101,
-            "56": 0.11356,
-            "57": 0.11368,
-            "58": 0.11185,
-            "59": 0.11193,
-            "60": 0.11205,
-            "61": 0.11176,
-            "62": 0.11293,
-            "63": 0.1127,
-            "64": 0.11343,
-            "65": 0.11282,
-            "66": 0.11245,
-            "67": 0.11385,
-            "68": 0.11071,
-            "69": 0.11079,
-            "70": 0.112,
-            "71": 0.1108,
-            "72": 0.11299,
-            "73": 0.11305,
-            "74": 0.11343,
-            "75": 0.11155,
-            "76": 0.11323,
-            "77": 0.11174,
-            "78": 0.11138,
-            "79": 0.11246,
-            "80": 0.11252,
-            "81": 0.11217,
-            "82": 0.11269,
-            "83": 0.11312,
-            "84": 0.11075,
-            "85": 0.11227,
-            "86": 0.11159,
-            "87": 0.11227,
-            "88": 0.11227,
-            "89": 0.11277,
-            "90": 0.11219,
-            "91": 0.11067,
-            "92": 0.10961,
-            "93": 0.10907,
-            "94": 0.11584,
-            "95": 0.1087,
-            "96": 0.11107,
-            "97": 0.11046,
-            "98": 0.10986,
-            "99": 0.11249,
-            "100": 0.1095
+            "1": 9.86816,
+            "2": 0.1216,
+            "3": 0.1166,
+            "4": 0.08589,
+            "5": 0.08587,
+            "6": 0.08491,
+            "7": 0.0844,
+            "8": 0.08084,
+            "9": 0.07931,
+            "10": 0.0798,
+            "11": 0.07849,
+            "12": 0.07832,
+            "13": 0.0803,
+            "14": 0.08035,
+            "15": 0.07881,
+            "16": 0.07881,
+            "17": 0.08069,
+            "18": 0.0794,
+            "19": 0.07935,
+            "20": 0.07915,
+            "21": 0.07896,
+            "22": 0.08062,
+            "23": 0.08009,
+            "24": 0.07923,
+            "25": 0.07839,
+            "26": 0.08166,
+            "27": 0.07977,
+            "28": 0.08005,
+            "29": 0.08017,
+            "30": 0.08118,
+            "31": 0.0811,
+            "32": 0.07964,
+            "33": 0.08086,
+            "34": 0.08069,
+            "35": 0.07986,
+            "36": 0.08098,
+            "37": 0.07939,
+            "38": 0.07947,
+            "39": 0.07943,
+            "40": 0.08028,
+            "41": 0.07981,
+            "42": 0.08016,
+            "43": 0.08245,
+            "44": 0.0799,
+            "45": 0.08077,
+            "46": 0.08028,
+            "47": 0.07892,
+            "48": 0.07997,
+            "49": 0.08314,
+            "50": 0.08027,
+            "51": 0.08246,
+            "52": 0.07991,
+            "53": 0.08005,
+            "54": 0.07954,
+            "55": 0.07969,
+            "56": 0.07938,
+            "57": 0.07891,
+            "58": 0.07987,
+            "59": 0.0798,
+            "60": 0.08057,
+            "61": 0.07888,
+            "62": 0.07914,
+            "63": 0.07997,
+            "64": 0.07986,
+            "65": 0.07977,
+            "66": 0.07953,
+            "67": 0.07927,
+            "68": 0.08003,
+            "69": 0.08005,
+            "70": 0.07926,
+            "71": 0.07923,
+            "72": 0.07966,
+            "73": 0.08033,
+            "74": 0.08038,
+            "75": 0.07956,
+            "76": 0.07935,
+            "77": 0.07891,
+            "78": 0.08007,
+            "79": 0.08135,
+            "80": 0.08025,
+            "81": 0.07919,
+            "82": 0.07932,
+            "83": 0.07953,
+            "84": 0.07937,
+            "85": 0.0797,
+            "86": 0.08168,
+            "87": 0.08023,
+            "88": 0.07957,
+            "89": 0.08011,
+            "90": 0.07975,
+            "91": 0.08043,
+            "92": 0.08179,
+            "93": 0.08049,
+            "94": 0.07951,
+            "95": 0.08026,
+            "96": 0.08,
+            "97": 0.07948,
+            "98": 0.0805,
+            "99": 0.07879,
+            "100": 0.07954
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json
index 6081b627567..a620f25b6eb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json
@@ -14,96 +14,96 @@
             "8": 10.83427,
             "9": 10.83995,
             "10": 10.78684,
-            "11": 10.88021,
-            "12": 10.85971,
-            "13": 10.86589,
-            "14": 10.87818,
-            "15": 10.79463,
-            "16": 10.79607,
-            "17": 10.7688,
-            "18": 10.81045,
-            "19": 10.79836,
-            "20": 10.69045,
-            "21": 10.67932,
-            "22": 10.52101,
-            "23": 10.70743,
-            "24": 10.57665,
-            "25": 10.52275,
-            "26": 10.595,
-            "27": 10.5855,
-            "28": 10.56131,
+            "11": 10.88024,
+            "12": 10.85967,
+            "13": 10.86586,
+            "14": 10.87816,
+            "15": 10.79461,
+            "16": 10.79608,
+            "17": 10.76878,
+            "18": 10.81048,
+            "19": 10.79832,
+            "20": 10.69042,
+            "21": 10.67929,
+            "22": 10.52098,
+            "23": 10.70741,
+            "24": 10.57667,
+            "25": 10.52277,
+            "26": 10.59496,
+            "27": 10.58548,
+            "28": 10.56129,
             "29": 10.56894,
-            "30": 10.34527,
-            "31": 10.10019,
-            "32": 10.45229,
+            "30": 10.3453,
+            "31": 10.1002,
+            "32": 10.45227,
             "33": 10.44356,
-            "34": 10.20397,
-            "35": 10.25844,
+            "34": 10.20401,
+            "35": 10.25843,
             "36": 10.2103,
-            "37": 10.32252,
-            "38": 10.1661,
-            "39": 10.38156,
-            "40": 10.07025,
-            "41": 10.13542,
-            "42": 10.19416,
-            "43": 9.80626,
-            "44": 9.92627,
-            "45": 9.8024,
-            "46": 9.79983,
-            "47": 10.11662,
-            "48": 9.81307,
-            "49": 9.50044,
-            "50": 9.87631,
-            "51": 9.82781,
-            "52": 9.71723,
-            "53": 10.03979,
-            "54": 9.92177,
-            "55": 9.85515,
-            "56": 9.59253,
-            "57": 9.44144,
-            "58": 9.79602,
-            "59": 9.55567,
-            "60": 9.4664,
+            "37": 10.32249,
+            "38": 10.16611,
+            "39": 10.38155,
+            "40": 10.07026,
+            "41": 10.13534,
+            "42": 10.19417,
+            "43": 9.80625,
+            "44": 9.92626,
+            "45": 9.80241,
+            "46": 9.79982,
+            "47": 10.11664,
+            "48": 9.81302,
+            "49": 9.50045,
+            "50": 9.87633,
+            "51": 9.82782,
+            "52": 9.71728,
+            "53": 10.03983,
+            "54": 9.92178,
+            "55": 9.85516,
+            "56": 9.59252,
+            "57": 9.44146,
+            "58": 9.79606,
+            "59": 9.55569,
+            "60": 9.46635,
             "61": 9.6666,
-            "62": 9.95363,
-            "63": 9.33626,
-            "64": 9.74152,
-            "65": 8.9178,
-            "66": 9.66632,
+            "62": 9.95362,
+            "63": 9.33627,
+            "64": 9.7415,
+            "65": 8.91782,
+            "66": 9.66633,
             "67": 9.34424,
             "68": 9.75273,
-            "69": 9.75727,
+            "69": 9.75725,
             "70": 9.69242,
-            "71": 9.5868,
-            "72": 9.55099,
+            "71": 9.58679,
+            "72": 9.551,
             "73": 9.46289,
-            "74": 8.90671,
-            "75": 9.37793,
-            "76": 9.04952,
-            "77": 10.0301,
-            "78": 9.69192,
-            "79": 9.33464,
-            "80": 9.3667,
-            "81": 9.44418,
-            "82": 9.66164,
-            "83": 9.27209,
-            "84": 9.38066,
+            "74": 8.90674,
+            "75": 9.37794,
+            "76": 9.04951,
+            "77": 10.03011,
+            "78": 9.69189,
+            "79": 9.33463,
+            "80": 9.36672,
+            "81": 9.44419,
+            "82": 9.66162,
+            "83": 9.2721,
+            "84": 9.38062,
             "85": 9.57618,
-            "86": 9.0424,
-            "87": 9.55703,
+            "86": 9.04242,
+            "87": 9.557,
             "88": 9.70385,
-            "89": 9.56619,
-            "90": 9.77295,
-            "91": 9.29396,
-            "92": 9.31912,
-            "93": 9.03406,
+            "89": 9.56616,
+            "90": 9.77294,
+            "91": 9.29399,
+            "92": 9.31911,
+            "93": 9.03403,
             "94": 8.78526,
-            "95": 9.46938,
-            "96": 9.47497,
-            "97": 9.25688,
-            "98": 9.61835,
-            "99": 8.83233,
-            "100": 9.34557
+            "95": 9.46939,
+            "96": 9.47496,
+            "97": 9.25683,
+            "98": 9.61833,
+            "99": 8.8323,
+            "100": 9.34562
         }
     },
     "num-zeros": {
@@ -119,98 +119,98 @@
             "6": 1823.0,
             "7": 1719.0,
             "8": 1637.0,
-            "9": 1742.0,
-            "10": 1358.0,
-            "11": 1882.0,
-            "12": 1781.0,
-            "13": 1847.0,
-            "14": 1753.0,
-            "15": 1883.0,
-            "16": 1755.0,
-            "17": 1752.0,
-            "18": 1683.0,
-            "19": 1817.0,
-            "20": 1663.0,
-            "21": 1795.0,
-            "22": 1698.0,
-            "23": 1996.0,
-            "24": 1620.0,
-            "25": 1658.0,
-            "26": 1727.0,
-            "27": 1781.0,
-            "28": 2085.0,
-            "29": 1952.0,
-            "30": 1821.0,
-            "31": 1646.0,
-            "32": 1879.0,
-            "33": 2034.0,
-            "34": 1861.0,
-            "35": 1834.0,
-            "36": 1913.0,
-            "37": 2333.0,
-            "38": 2070.0,
-            "39": 2245.0,
-            "40": 2126.0,
-            "41": 2311.0,
-            "42": 2213.0,
-            "43": 1907.0,
-            "44": 1951.0,
-            "45": 2001.0,
-            "46": 2218.0,
-            "47": 2533.0,
-            "48": 2436.0,
-            "49": 2188.0,
-            "50": 2342.0,
-            "51": 2562.0,
-            "52": 2529.0,
-            "53": 3031.0,
-            "54": 2744.0,
-            "55": 2264.0,
-            "56": 2794.0,
-            "57": 2183.0,
-            "58": 2882.0,
-            "59": 2769.0,
-            "60": 2399.0,
-            "61": 3031.0,
-            "62": 2706.0,
-            "63": 2388.0,
-            "64": 3046.0,
-            "65": 2597.0,
-            "66": 3092.0,
-            "67": 2730.0,
-            "68": 2858.0,
-            "69": 2982.0,
-            "70": 3202.0,
-            "71": 2964.0,
-            "72": 2450.0,
-            "73": 2817.0,
-            "74": 1834.0,
-            "75": 2609.0,
-            "76": 3000.0,
-            "77": 3180.0,
-            "78": 3113.0,
-            "79": 3145.0,
-            "80": 3258.0,
-            "81": 3645.0,
-            "82": 3075.0,
-            "83": 2812.0,
-            "84": 3295.0,
-            "85": 3368.0,
-            "86": 2730.0,
-            "87": 3717.0,
-            "88": 3056.0,
-            "89": 3252.0,
-            "90": 2954.0,
-            "91": 2798.0,
-            "92": 3089.0,
-            "93": 2742.0,
-            "94": 3420.0,
-            "95": 3225.0,
-            "96": 3362.0,
-            "97": 3118.0,
-            "98": 3671.0,
-            "99": 3341.0,
-            "100": 3428.0
+            "9": 1776.0,
+            "10": 1356.0,
+            "11": 1851.0,
+            "12": 1753.0,
+            "13": 1865.0,
+            "14": 1686.0,
+            "15": 1859.0,
+            "16": 1834.0,
+            "17": 1776.0,
+            "18": 1609.0,
+            "19": 1771.0,
+            "20": 1624.0,
+            "21": 1885.0,
+            "22": 1740.0,
+            "23": 1950.0,
+            "24": 1707.0,
+            "25": 1746.0,
+            "26": 1809.0,
+            "27": 1822.0,
+            "28": 2039.0,
+            "29": 1989.0,
+            "30": 1888.0,
+            "31": 1607.0,
+            "32": 1891.0,
+            "33": 2102.0,
+            "34": 1900.0,
+            "35": 1939.0,
+            "36": 1937.0,
+            "37": 2319.0,
+            "38": 2215.0,
+            "39": 2289.0,
+            "40": 2081.0,
+            "41": 2341.0,
+            "42": 2227.0,
+            "43": 1889.0,
+            "44": 2002.0,
+            "45": 1989.0,
+            "46": 2300.0,
+            "47": 2473.0,
+            "48": 2407.0,
+            "49": 2291.0,
+            "50": 2423.0,
+            "51": 2489.0,
+            "52": 2624.0,
+            "53": 2894.0,
+            "54": 2672.0,
+            "55": 2317.0,
+            "56": 2736.0,
+            "57": 2197.0,
+            "58": 2903.0,
+            "59": 2833.0,
+            "60": 2448.0,
+            "61": 2942.0,
+            "62": 2603.0,
+            "63": 2412.0,
+            "64": 2913.0,
+            "65": 2665.0,
+            "66": 3011.0,
+            "67": 2573.0,
+            "68": 2848.0,
+            "69": 2990.0,
+            "70": 3095.0,
+            "71": 2974.0,
+            "72": 2383.0,
+            "73": 2769.0,
+            "74": 1867.0,
+            "75": 2542.0,
+            "76": 2962.0,
+            "77": 3172.0,
+            "78": 3190.0,
+            "79": 3132.0,
+            "80": 3350.0,
+            "81": 3621.0,
+            "82": 3145.0,
+            "83": 2739.0,
+            "84": 3366.0,
+            "85": 3493.0,
+            "86": 2693.0,
+            "87": 3840.0,
+            "88": 2919.0,
+            "89": 3191.0,
+            "90": 3013.0,
+            "91": 2796.0,
+            "92": 3092.0,
+            "93": 2693.0,
+            "94": 3339.0,
+            "95": 3297.0,
+            "96": 3553.0,
+            "97": 3085.0,
+            "98": 3564.0,
+            "99": 3313.0,
+            "100": 3482.0
         }
     },
     "mem-allocated-bytes": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.81482,
-            "2": 0.16445,
-            "3": 0.16681,
-            "4": 0.12923,
-            "5": 0.12855,
-            "6": 0.12774,
-            "7": 0.12794,
-            "8": 0.12857,
-            "9": 0.12785,
-            "10": 0.12889,
-            "11": 0.13344,
-            "12": 0.1302,
-            "13": 0.13007,
-            "14": 0.12962,
-            "15": 0.13044,
-            "16": 0.12918,
-            "17": 0.13075,
-            "18": 0.13004,
-            "19": 0.13052,
-            "20": 0.13025,
-            "21": 0.12825,
-            "22": 0.13322,
-            "23": 0.13274,
-            "24": 0.13114,
-            "25": 0.13075,
-            "26": 0.12979,
-            "27": 0.13026,
-            "28": 0.13147,
-            "29": 0.13072,
-            "30": 0.13098,
-            "31": 0.13095,
-            "32": 0.13054,
-            "33": 0.13038,
-            "34": 0.13142,
-            "35": 0.13065,
-            "36": 0.12923,
-            "37": 0.13039,
-            "38": 0.12981,
-            "39": 0.12995,
-            "40": 0.13035,
-            "41": 0.12966,
-            "42": 0.13013,
-            "43": 0.13031,
-            "44": 0.13066,
-            "45": 0.12952,
-            "46": 0.13059,
-            "47": 0.12932,
-            "48": 0.13133,
-            "49": 0.13099,
+            "1": 4.6439,
+            "2": 0.15791,
+            "3": 0.1504,
+            "4": 0.13422,
+            "5": 0.1326,
+            "6": 0.13299,
+            "7": 0.13449,
+            "8": 0.12991,
+            "9": 0.12948,
+            "10": 0.13174,
+            "11": 0.13098,
+            "12": 0.13037,
+            "13": 0.13071,
+            "14": 0.13091,
+            "15": 0.1311,
+            "16": 0.13106,
+            "17": 0.13049,
+            "18": 0.13044,
+            "19": 0.13091,
+            "20": 0.13092,
+            "21": 0.13077,
+            "22": 0.13178,
+            "23": 0.13149,
+            "24": 0.13147,
+            "25": 0.13094,
+            "26": 0.13089,
+            "27": 0.13076,
+            "28": 0.13077,
+            "29": 0.13143,
+            "30": 0.13073,
+            "31": 0.13091,
+            "32": 0.13106,
+            "33": 0.13097,
+            "34": 0.13044,
+            "35": 0.13123,
+            "36": 0.13087,
+            "37": 0.13144,
+            "38": 0.13066,
+            "39": 0.13081,
+            "40": 0.13065,
+            "41": 0.13133,
+            "42": 0.13115,
+            "43": 0.13136,
+            "44": 0.13079,
+            "45": 0.13085,
+            "46": 0.13162,
+            "47": 0.131,
+            "48": 0.13067,
+            "49": 0.13121,
             "50": 0.13032,
-            "51": 0.13345,
-            "52": 0.13027,
-            "53": 0.13035,
-            "54": 0.13064,
-            "55": 0.13026,
-            "56": 0.13053,
-            "57": 0.13106,
-            "58": 0.13032,
-            "59": 0.13178,
-            "60": 0.13233,
-            "61": 0.13005,
-            "62": 0.13045,
-            "63": 0.13061,
-            "64": 0.13066,
-            "65": 0.13102,
-            "66": 0.13143,
-            "67": 0.13033,
-            "68": 0.13066,
-            "69": 0.12904,
-            "70": 0.13059,
-            "71": 0.13052,
-            "72": 0.13076,
-            "73": 0.13215,
-            "74": 0.13173,
-            "75": 0.13126,
-            "76": 0.12946,
-            "77": 0.13071,
-            "78": 0.12973,
-            "79": 0.12962,
-            "80": 0.12976,
-            "81": 0.12993,
-            "82": 0.12829,
-            "83": 0.13132,
-            "84": 0.1304,
-            "85": 0.13095,
-            "86": 0.13112,
-            "87": 0.12994,
-            "88": 0.13287,
-            "89": 0.1284,
-            "90": 0.1303,
-            "91": 0.12966,
-            "92": 0.13139,
-            "93": 0.12932,
-            "94": 0.12687,
-            "95": 0.13012,
-            "96": 0.12919,
-            "97": 0.13166,
-            "98": 0.12958,
-            "99": 0.13126,
-            "100": 0.1303
+            "51": 0.13326,
+            "52": 0.13146,
+            "53": 0.1304,
+            "54": 0.13069,
+            "55": 0.13128,
+            "56": 0.13061,
+            "57": 0.13062,
+            "58": 0.13056,
+            "59": 0.13062,
+            "60": 0.13016,
+            "61": 0.13079,
+            "62": 0.13079,
+            "63": 0.13044,
+            "64": 0.13074,
+            "65": 0.13159,
+            "66": 0.13108,
+            "67": 0.13125,
+            "68": 0.13103,
+            "69": 0.1306,
+            "70": 0.13075,
+            "71": 0.13114,
+            "72": 0.13089,
+            "73": 0.13109,
+            "74": 0.13187,
+            "75": 0.13679,
+            "76": 0.13183,
+            "77": 0.13183,
+            "78": 0.1322,
+            "79": 0.13235,
+            "80": 0.13227,
+            "81": 0.13232,
+            "82": 0.13263,
+            "83": 0.13214,
+            "84": 0.13146,
+            "85": 0.13162,
+            "86": 0.13188,
+            "87": 0.13144,
+            "88": 0.13202,
+            "89": 0.1326,
+            "90": 0.1313,
+            "91": 0.13207,
+            "92": 0.13186,
+            "93": 0.13226,
+            "94": 0.13226,
+            "95": 0.13194,
+            "96": 0.13248,
+            "97": 0.13228,
+            "98": 0.13188,
+            "99": 0.13261,
+            "100": 0.13281
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..0758fd3a8cf
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82555,
+            "2": 10.83286,
+            "3": 10.82763,
+            "4": 10.79573,
+            "5": 10.85699,
+            "6": 10.8639,
+            "7": 10.82612,
+            "8": 10.82543,
+            "9": 10.8359,
+            "10": 10.79633,
+            "11": 10.87819,
+            "12": 10.85823,
+            "13": 10.85425,
+            "14": 10.87526,
+            "15": 10.79206,
+            "16": 10.80309,
+            "17": 10.77438,
+            "18": 10.80484,
+            "19": 10.79368,
+            "20": 10.69574,
+            "21": 10.68657,
+            "22": 10.53162,
+            "23": 10.70642,
+            "24": 10.57336,
+            "25": 10.51534,
+            "26": 10.59088,
+            "27": 10.60779,
+            "28": 10.57051,
+            "29": 10.58978,
+            "30": 10.34722,
+            "31": 10.07772,
+            "32": 10.46349,
+            "33": 10.45726,
+            "34": 10.19975,
+            "35": 10.25642,
+            "36": 10.21264,
+            "37": 10.34717,
+            "38": 10.18011,
+            "39": 10.40833,
+            "40": 10.07628,
+            "41": 10.1297,
+            "42": 10.21174,
+            "43": 9.8171,
+            "44": 9.94032,
+            "45": 9.81748,
+            "46": 9.8063,
+            "47": 10.12475,
+            "48": 9.84049,
+            "49": 9.51015,
+            "50": 9.88941,
+            "51": 9.8426,
+            "52": 9.72578,
+            "53": 10.05977,
+            "54": 9.95226,
+            "55": 9.88321,
+            "56": 9.61276,
+            "57": 9.46222,
+            "58": 9.82313,
+            "59": 9.57665,
+            "60": 9.48518,
+            "61": 9.6788,
+            "62": 9.97777,
+            "63": 9.36212,
+            "64": 9.75714,
+            "65": 8.93499,
+            "66": 9.69281,
+            "67": 9.36709,
+            "68": 9.78179,
+            "69": 9.79451,
+            "70": 9.72295,
+            "71": 9.62027,
+            "72": 9.56974,
+            "73": 9.481,
+            "74": 8.91241,
+            "75": 9.40906,
+            "76": 9.06623,
+            "77": 10.05808,
+            "78": 9.72188,
+            "79": 9.36927,
+            "80": 9.40027,
+            "81": 9.47702,
+            "82": 9.69788,
+            "83": 9.30742,
+            "84": 9.41496,
+            "85": 9.61115,
+            "86": 9.07104,
+            "87": 9.59609,
+            "88": 9.74908,
+            "89": 9.5961,
+            "90": 9.82722,
+            "91": 9.3366,
+            "92": 9.3558,
+            "93": 9.08695,
+            "94": 8.82752,
+            "95": 9.53066,
+            "96": 9.52759,
+            "97": 9.30671,
+            "98": 9.66909,
+            "99": 8.89637,
+            "100": 9.4052
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1651.0,
+            "2": 1716.0,
+            "3": 1760.0,
+            "4": 1771.0,
+            "5": 1899.0,
+            "6": 1905.0,
+            "7": 1842.0,
+            "8": 1706.0,
+            "9": 1891.0,
+            "10": 1543.0,
+            "11": 1937.0,
+            "12": 1794.0,
+            "13": 1982.0,
+            "14": 1727.0,
+            "15": 1890.0,
+            "16": 1746.0,
+            "17": 1818.0,
+            "18": 1651.0,
+            "19": 1782.0,
+            "20": 1698.0,
+            "21": 1950.0,
+            "22": 1702.0,
+            "23": 1972.0,
+            "24": 1551.0,
+            "25": 1587.0,
+            "26": 1773.0,
+            "27": 1791.0,
+            "28": 1858.0,
+            "29": 1950.0,
+            "30": 1951.0,
+            "31": 1499.0,
+            "32": 1823.0,
+            "33": 2055.0,
+            "34": 1788.0,
+            "35": 1877.0,
+            "36": 1933.0,
+            "37": 2302.0,
+            "38": 2181.0,
+            "39": 2223.0,
+            "40": 2009.0,
+            "41": 2178.0,
+            "42": 2185.0,
+            "43": 2041.0,
+            "44": 2069.0,
+            "45": 2004.0,
+            "46": 2212.0,
+            "47": 2446.0,
+            "48": 2290.0,
+            "49": 2183.0,
+            "50": 2323.0,
+            "51": 2587.0,
+            "52": 2574.0,
+            "53": 2831.0,
+            "54": 2602.0,
+            "55": 2403.0,
+            "56": 2822.0,
+            "57": 2223.0,
+            "58": 2954.0,
+            "59": 2871.0,
+            "60": 2518.0,
+            "61": 2922.0,
+            "62": 2677.0,
+            "63": 2533.0,
+            "64": 3023.0,
+            "65": 2609.0,
+            "66": 2960.0,
+            "67": 2867.0,
+            "68": 2652.0,
+            "69": 3053.0,
+            "70": 3011.0,
+            "71": 2870.0,
+            "72": 2460.0,
+            "73": 3114.0,
+            "74": 2017.0,
+            "75": 2527.0,
+            "76": 2954.0,
+            "77": 2955.0,
+            "78": 3055.0,
+            "79": 3098.0,
+            "80": 3047.0,
+            "81": 3362.0,
+            "82": 3296.0,
+            "83": 2825.0,
+            "84": 3113.0,
+            "85": 3196.0,
+            "86": 2666.0,
+            "87": 3583.0,
+            "88": 2985.0,
+            "89": 3259.0,
+            "90": 3220.0,
+            "91": 2781.0,
+            "92": 3090.0,
+            "93": 2686.0,
+            "94": 3474.0,
+            "95": 3147.0,
+            "96": 3418.0,
+            "97": 3036.0,
+            "98": 3411.0,
+            "99": 3152.0,
+            "100": 3098.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 763221504.0,
+            "2": 763221504.0,
+            "3": 763221504.0,
+            "4": 763221504.0,
+            "5": 763221504.0,
+            "6": 763221504.0,
+            "7": 763221504.0,
+            "8": 763221504.0,
+            "9": 763221504.0,
+            "10": 763221504.0,
+            "11": 763221504.0,
+            "12": 763221504.0,
+            "13": 763221504.0,
+            "14": 763221504.0,
+            "15": 763221504.0,
+            "16": 763221504.0,
+            "17": 763221504.0,
+            "18": 763221504.0,
+            "19": 763221504.0,
+            "20": 763221504.0,
+            "21": 763221504.0,
+            "22": 763221504.0,
+            "23": 763221504.0,
+            "24": 763221504.0,
+            "25": 763221504.0,
+            "26": 763221504.0,
+            "27": 763221504.0,
+            "28": 763221504.0,
+            "29": 763221504.0,
+            "30": 763221504.0,
+            "31": 763221504.0,
+            "32": 763221504.0,
+            "33": 763221504.0,
+            "34": 763221504.0,
+            "35": 763221504.0,
+            "36": 763221504.0,
+            "37": 763221504.0,
+            "38": 763221504.0,
+            "39": 763221504.0,
+            "40": 763221504.0,
+            "41": 763221504.0,
+            "42": 763221504.0,
+            "43": 763221504.0,
+            "44": 763221504.0,
+            "45": 763221504.0,
+            "46": 763221504.0,
+            "47": 763221504.0,
+            "48": 763221504.0,
+            "49": 763221504.0,
+            "50": 763221504.0,
+            "51": 763221504.0,
+            "52": 763221504.0,
+            "53": 763221504.0,
+            "54": 763221504.0,
+            "55": 763221504.0,
+            "56": 763221504.0,
+            "57": 763221504.0,
+            "58": 763221504.0,
+            "59": 763221504.0,
+            "60": 763221504.0,
+            "61": 763221504.0,
+            "62": 763221504.0,
+            "63": 763221504.0,
+            "64": 763221504.0,
+            "65": 763221504.0,
+            "66": 763221504.0,
+            "67": 763221504.0,
+            "68": 763221504.0,
+            "69": 763221504.0,
+            "70": 763221504.0,
+            "71": 763221504.0,
+            "72": 763221504.0,
+            "73": 763221504.0,
+            "74": 763221504.0,
+            "75": 763221504.0,
+            "76": 763221504.0,
+            "77": 763221504.0,
+            "78": 763221504.0,
+            "79": 763221504.0,
+            "80": 763221504.0,
+            "81": 763221504.0,
+            "82": 763221504.0,
+            "83": 763221504.0,
+            "84": 763221504.0,
+            "85": 763221504.0,
+            "86": 763221504.0,
+            "87": 763221504.0,
+            "88": 763221504.0,
+            "89": 763221504.0,
+            "90": 763221504.0,
+            "91": 763221504.0,
+            "92": 763221504.0,
+            "93": 763221504.0,
+            "94": 763221504.0,
+            "95": 763221504.0,
+            "96": 763221504.0,
+            "97": 763221504.0,
+            "98": 763221504.0,
+            "99": 763221504.0,
+            "100": 763221504.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2398288896.0,
+            "2": 2681049088.0,
+            "3": 2681049088.0,
+            "4": 2681049088.0,
+            "5": 2681049088.0,
+            "6": 2681049088.0,
+            "7": 2681049088.0,
+            "8": 2681049088.0,
+            "9": 2681049088.0,
+            "10": 2681049088.0,
+            "11": 2681049088.0,
+            "12": 2681049088.0,
+            "13": 2681049088.0,
+            "14": 2681049088.0,
+            "15": 2681049088.0,
+            "16": 2681049088.0,
+            "17": 2681049088.0,
+            "18": 2681049088.0,
+            "19": 2681049088.0,
+            "20": 2681049088.0,
+            "21": 2681049088.0,
+            "22": 2681049088.0,
+            "23": 2681049088.0,
+            "24": 2681049088.0,
+            "25": 2681049088.0,
+            "26": 2681049088.0,
+            "27": 2681049088.0,
+            "28": 2681049088.0,
+            "29": 2681049088.0,
+            "30": 2681049088.0,
+            "31": 2681049088.0,
+            "32": 2681049088.0,
+            "33": 2681049088.0,
+            "34": 2681049088.0,
+            "35": 2681049088.0,
+            "36": 2681049088.0,
+            "37": 2681049088.0,
+            "38": 2681049088.0,
+            "39": 2681049088.0,
+            "40": 2681049088.0,
+            "41": 2681049088.0,
+            "42": 2681049088.0,
+            "43": 2681049088.0,
+            "44": 2681049088.0,
+            "45": 2681049088.0,
+            "46": 2681049088.0,
+            "47": 2681049088.0,
+            "48": 2681049088.0,
+            "49": 2681049088.0,
+            "50": 2681049088.0,
+            "51": 2681049088.0,
+            "52": 2681049088.0,
+            "53": 2681049088.0,
+            "54": 2681049088.0,
+            "55": 2681049088.0,
+            "56": 2681049088.0,
+            "57": 2681049088.0,
+            "58": 2681049088.0,
+            "59": 2681049088.0,
+            "60": 2681049088.0,
+            "61": 2681049088.0,
+            "62": 2681049088.0,
+            "63": 2681049088.0,
+            "64": 2681049088.0,
+            "65": 2681049088.0,
+            "66": 2681049088.0,
+            "67": 2681049088.0,
+            "68": 2681049088.0,
+            "69": 2681049088.0,
+            "70": 2681049088.0,
+            "71": 2681049088.0,
+            "72": 2681049088.0,
+            "73": 2681049088.0,
+            "74": 2681049088.0,
+            "75": 2681049088.0,
+            "76": 2681049088.0,
+            "77": 2681049088.0,
+            "78": 2681049088.0,
+            "79": 2681049088.0,
+            "80": 2681049088.0,
+            "81": 2681049088.0,
+            "82": 2681049088.0,
+            "83": 2681049088.0,
+            "84": 2681049088.0,
+            "85": 2681049088.0,
+            "86": 2681049088.0,
+            "87": 2681049088.0,
+            "88": 2681049088.0,
+            "89": 2681049088.0,
+            "90": 2681049088.0,
+            "91": 2681049088.0,
+            "92": 2681049088.0,
+            "93": 2681049088.0,
+            "94": 2681049088.0,
+            "95": 2681049088.0,
+            "96": 2681049088.0,
+            "97": 2681049088.0,
+            "98": 2681049088.0,
+            "99": 2681049088.0,
+            "100": 2681049088.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.0922,
+            "3": 0.12788,
+            "4": 0.11451,
+            "5": 0.11407,
+            "6": 0.11304,
+            "7": 0.11565,
+            "8": 0.11482,
+            "9": 0.11841,
+            "10": 0.11916,
+            "11": 0.11884,
+            "12": 0.11911,
+            "13": 0.1155,
+            "14": 0.12253,
+            "15": 0.11369,
+            "16": 0.11887,
+            "17": 0.11433,
+            "18": 0.12243,
+            "19": 0.11544,
+            "20": 0.11344,
+            "21": 0.1254,
+            "22": 0.11712,
+            "23": 0.12494,
+            "24": 0.12239,
+            "25": 0.12344,
+            "26": 0.11952,
+            "27": 0.12117,
+            "28": 0.11916,
+            "29": 0.11974,
+            "30": 0.11517,
+            "31": 0.1219,
+            "32": 0.12112,
+            "33": 0.11997,
+            "34": 0.1133,
+            "35": 0.12245,
+            "36": 0.12118,
+            "37": 0.11239,
+            "38": 0.12174,
+            "39": 0.11964,
+            "40": 0.11993,
+            "41": 0.12013,
+            "42": 0.12614,
+            "43": 0.11697,
+            "44": 0.11669,
+            "45": 0.11781,
+            "46": 0.11776,
+            "47": 0.11182,
+            "48": 0.1196,
+            "49": 0.11814,
+            "50": 0.11736,
+            "51": 0.12093,
+            "52": 0.1107,
+            "53": 0.11502,
+            "54": 0.11571,
+            "55": 0.11493,
+            "56": 0.11712,
+            "57": 0.11663,
+            "58": 0.11203,
+            "59": 0.11604,
+            "60": 0.11649,
+            "61": 0.11616,
+            "62": 0.11641,
+            "63": 0.11603,
+            "64": 0.11613,
+            "65": 0.11708,
+            "66": 0.11292,
+            "67": 0.11356,
+            "68": 0.11416,
+            "69": 0.11305,
+            "70": 0.11582,
+            "71": 0.11552,
+            "72": 0.11318,
+            "73": 0.11798,
+            "74": 0.11632,
+            "75": 0.11624,
+            "76": 0.11602,
+            "77": 0.11547,
+            "78": 0.11457,
+            "79": 0.11402,
+            "80": 0.11415,
+            "81": 0.11627,
+            "82": 0.11295,
+            "83": 0.11397,
+            "84": 0.11221,
+            "85": 0.11326,
+            "86": 0.11792,
+            "87": 0.11391,
+            "88": 0.11365,
+            "89": 0.11478,
+            "90": 0.11346,
+            "91": 0.11213,
+            "92": 0.11712,
+            "93": 0.11574,
+            "94": 0.11724,
+            "95": 0.11254,
+            "96": 0.11871,
+            "97": 0.11957,
+            "98": 0.11759,
+            "99": 0.11864,
+            "100": 0.11833
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..f7efd011023
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8426,
+            "52": 9.72578,
+            "53": 10.05977,
+            "54": 9.95226,
+            "55": 9.88321,
+            "56": 9.61276,
+            "57": 9.46222,
+            "58": 9.82313,
+            "59": 9.57665,
+            "60": 9.48518,
+            "61": 9.6788,
+            "62": 9.97777,
+            "63": 9.36212,
+            "64": 9.75714,
+            "65": 8.93499,
+            "66": 9.69281,
+            "67": 9.36709,
+            "68": 9.78179,
+            "69": 9.79451,
+            "70": 9.72295,
+            "71": 9.62027,
+            "72": 9.56974,
+            "73": 9.481,
+            "74": 8.91241,
+            "75": 9.40906,
+            "76": 9.06623,
+            "77": 10.05808,
+            "78": 9.72188,
+            "79": 9.36927,
+            "80": 9.40027,
+            "81": 9.47702,
+            "82": 9.69788,
+            "83": 9.30742,
+            "84": 9.41496,
+            "85": 9.61115,
+            "86": 9.07104,
+            "87": 9.59609,
+            "88": 9.74908,
+            "89": 9.5961,
+            "90": 9.82722,
+            "91": 9.3366,
+            "92": 9.3558,
+            "93": 9.08695,
+            "94": 8.82752,
+            "95": 9.53066,
+            "96": 9.52759,
+            "97": 9.30671,
+            "98": 9.66909,
+            "99": 8.89637,
+            "100": 9.4052
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2587.0,
+            "52": 2574.0,
+            "53": 2831.0,
+            "54": 2602.0,
+            "55": 2403.0,
+            "56": 2822.0,
+            "57": 2223.0,
+            "58": 2954.0,
+            "59": 2871.0,
+            "60": 2518.0,
+            "61": 2922.0,
+            "62": 2677.0,
+            "63": 2533.0,
+            "64": 3023.0,
+            "65": 2609.0,
+            "66": 2960.0,
+            "67": 2867.0,
+            "68": 2652.0,
+            "69": 3053.0,
+            "70": 3011.0,
+            "71": 2870.0,
+            "72": 2460.0,
+            "73": 3114.0,
+            "74": 2017.0,
+            "75": 2527.0,
+            "76": 2954.0,
+            "77": 2955.0,
+            "78": 3055.0,
+            "79": 3098.0,
+            "80": 3047.0,
+            "81": 3362.0,
+            "82": 3296.0,
+            "83": 2825.0,
+            "84": 3113.0,
+            "85": 3196.0,
+            "86": 2666.0,
+            "87": 3583.0,
+            "88": 2985.0,
+            "89": 3259.0,
+            "90": 3220.0,
+            "91": 2781.0,
+            "92": 3090.0,
+            "93": 2686.0,
+            "94": 3474.0,
+            "95": 3147.0,
+            "96": 3418.0,
+            "97": 3036.0,
+            "98": 3411.0,
+            "99": 3152.0,
+            "100": 3098.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 763221504.0,
+            "52": 763221504.0,
+            "53": 763221504.0,
+            "54": 763221504.0,
+            "55": 763221504.0,
+            "56": 763221504.0,
+            "57": 763221504.0,
+            "58": 763221504.0,
+            "59": 763221504.0,
+            "60": 763221504.0,
+            "61": 763221504.0,
+            "62": 763221504.0,
+            "63": 763221504.0,
+            "64": 763221504.0,
+            "65": 763221504.0,
+            "66": 763221504.0,
+            "67": 763221504.0,
+            "68": 763221504.0,
+            "69": 763221504.0,
+            "70": 763221504.0,
+            "71": 763221504.0,
+            "72": 763221504.0,
+            "73": 763221504.0,
+            "74": 763221504.0,
+            "75": 763221504.0,
+            "76": 763221504.0,
+            "77": 763221504.0,
+            "78": 763221504.0,
+            "79": 763221504.0,
+            "80": 763221504.0,
+            "81": 763221504.0,
+            "82": 763221504.0,
+            "83": 763221504.0,
+            "84": 763221504.0,
+            "85": 763221504.0,
+            "86": 763221504.0,
+            "87": 763221504.0,
+            "88": 763221504.0,
+            "89": 763221504.0,
+            "90": 763221504.0,
+            "91": 763221504.0,
+            "92": 763221504.0,
+            "93": 763221504.0,
+            "94": 763221504.0,
+            "95": 763221504.0,
+            "96": 763221504.0,
+            "97": 763221504.0,
+            "98": 763221504.0,
+            "99": 763221504.0,
+            "100": 763221504.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2682096640.0,
+            "52": 2682097664.0,
+            "53": 2682097664.0,
+            "54": 2682097664.0,
+            "55": 2682097664.0,
+            "56": 2682097664.0,
+            "57": 2682097664.0,
+            "58": 2682097664.0,
+            "59": 2682097664.0,
+            "60": 2682097664.0,
+            "61": 2682097664.0,
+            "62": 2682097664.0,
+            "63": 2682097664.0,
+            "64": 2682097664.0,
+            "65": 2682097664.0,
+            "66": 2682097664.0,
+            "67": 2682097664.0,
+            "68": 2682097664.0,
+            "69": 2682097664.0,
+            "70": 2682097664.0,
+            "71": 2682097664.0,
+            "72": 2682097664.0,
+            "73": 2682097664.0,
+            "74": 2682097664.0,
+            "75": 2682097664.0,
+            "76": 2682097664.0,
+            "77": 2682097664.0,
+            "78": 2682097664.0,
+            "79": 2682097664.0,
+            "80": 2682097664.0,
+            "81": 2682097664.0,
+            "82": 2682097664.0,
+            "83": 2682097664.0,
+            "84": 2682097664.0,
+            "85": 2682097664.0,
+            "86": 2682097664.0,
+            "87": 2682097664.0,
+            "88": 2682097664.0,
+            "89": 2682097664.0,
+            "90": 2682097664.0,
+            "91": 2682097664.0,
+            "92": 2682097664.0,
+            "93": 2682097664.0,
+            "94": 2682097664.0,
+            "95": 2682097664.0,
+            "96": 2682097664.0,
+            "97": 2682097664.0,
+            "98": 2682097664.0,
+            "99": 2682097664.0,
+            "100": 2682097664.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 3.06236,
+            "53": 0.13884,
+            "54": 0.12077,
+            "55": 0.12587,
+            "56": 0.12115,
+            "57": 0.12166,
+            "58": 0.12353,
+            "59": 0.1247,
+            "60": 0.12221,
+            "61": 0.12159,
+            "62": 0.12136,
+            "63": 0.13043,
+            "64": 0.12973,
+            "65": 0.13067,
+            "66": 0.14918,
+            "67": 0.11954,
+            "68": 0.11631,
+            "69": 0.11511,
+            "70": 0.11621,
+            "71": 0.11553,
+            "72": 0.11537,
+            "73": 0.11691,
+            "74": 0.11875,
+            "75": 0.11769,
+            "76": 0.11586,
+            "77": 0.11847,
+            "78": 0.11896,
+            "79": 0.11697,
+            "80": 0.11854,
+            "81": 0.11758,
+            "82": 0.11531,
+            "83": 0.11776,
+            "84": 0.11613,
+            "85": 0.11822,
+            "86": 0.11858,
+            "87": 0.11763,
+            "88": 0.11691,
+            "89": 0.11931,
+            "90": 0.11678,
+            "91": 0.11601,
+            "92": 0.11377,
+            "93": 0.11692,
+            "94": 0.11741,
+            "95": 0.11634,
+            "96": 0.1145,
+            "97": 0.12011,
+            "98": 0.11722,
+            "99": 0.11609,
+            "100": 0.11641
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json
index f0d9be9be9d..126681fbe76 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 763220480.0,
-            "2": 763220480.0,
-            "3": 763220480.0,
-            "4": 763220480.0,
-            "5": 763220480.0,
-            "6": 763220480.0,
-            "7": 763220480.0,
-            "8": 763220480.0,
-            "9": 763220480.0,
-            "10": 763220480.0,
-            "11": 763220480.0,
-            "12": 763220480.0,
-            "13": 763220480.0,
-            "14": 763220480.0,
-            "15": 763220480.0,
-            "16": 763220480.0,
-            "17": 763220480.0,
-            "18": 763220480.0,
-            "19": 763220480.0,
-            "20": 763220480.0,
-            "21": 763220480.0,
-            "22": 763220480.0,
-            "23": 763220480.0,
-            "24": 763220480.0,
-            "25": 763220480.0,
-            "26": 763220480.0,
-            "27": 763220480.0,
-            "28": 763220480.0,
-            "29": 763220480.0,
-            "30": 763220480.0,
-            "31": 763220480.0,
-            "32": 763220480.0,
-            "33": 763220480.0,
-            "34": 763220480.0,
-            "35": 763220480.0,
-            "36": 763220480.0,
-            "37": 763220480.0,
-            "38": 763220480.0,
-            "39": 763220480.0,
-            "40": 763220480.0,
-            "41": 763220480.0,
-            "42": 763220480.0,
-            "43": 763220480.0,
-            "44": 763220480.0,
-            "45": 763220480.0,
-            "46": 763220480.0,
-            "47": 763220480.0,
-            "48": 763220480.0,
-            "49": 763220480.0,
-            "50": 763220480.0,
-            "51": 763220480.0,
-            "52": 763220480.0,
-            "53": 763220480.0,
-            "54": 763220480.0,
-            "55": 763220480.0,
-            "56": 763220480.0,
-            "57": 763220480.0,
-            "58": 763220480.0,
-            "59": 763220480.0,
-            "60": 763220480.0,
-            "61": 763220480.0,
-            "62": 763220480.0,
-            "63": 763220480.0,
-            "64": 763220480.0,
-            "65": 763220480.0,
-            "66": 763220480.0,
-            "67": 763220480.0,
-            "68": 763220480.0,
-            "69": 763220480.0,
-            "70": 763220480.0,
-            "71": 763220480.0,
-            "72": 763220480.0,
-            "73": 763220480.0,
-            "74": 763220480.0,
-            "75": 763220480.0,
-            "76": 763220480.0,
-            "77": 763220480.0,
-            "78": 763220480.0,
-            "79": 763220480.0,
-            "80": 763220480.0,
-            "81": 763220480.0,
-            "82": 763220480.0,
-            "83": 763220480.0,
-            "84": 763220480.0,
-            "85": 763220480.0,
-            "86": 763220480.0,
-            "87": 763220480.0,
-            "88": 763220480.0,
-            "89": 763220480.0,
-            "90": 763220480.0,
-            "91": 763220480.0,
-            "92": 763220480.0,
-            "93": 763220480.0,
-            "94": 763220480.0,
-            "95": 763220480.0,
-            "96": 763220480.0,
-            "97": 763220480.0,
-            "98": 763220480.0,
-            "99": 763220480.0,
-            "100": 763220480.0
+            "1": 765318656.0,
+            "2": 765318656.0,
+            "3": 765318656.0,
+            "4": 765318656.0,
+            "5": 765318656.0,
+            "6": 765318656.0,
+            "7": 765318656.0,
+            "8": 765318656.0,
+            "9": 765318656.0,
+            "10": 765318656.0,
+            "11": 765318656.0,
+            "12": 765318656.0,
+            "13": 765318656.0,
+            "14": 765318656.0,
+            "15": 765318656.0,
+            "16": 765318656.0,
+            "17": 765318656.0,
+            "18": 765318656.0,
+            "19": 765318656.0,
+            "20": 765318656.0,
+            "21": 765318656.0,
+            "22": 765318656.0,
+            "23": 765318656.0,
+            "24": 765318656.0,
+            "25": 765318656.0,
+            "26": 765318656.0,
+            "27": 765318656.0,
+            "28": 765318656.0,
+            "29": 765318656.0,
+            "30": 765318656.0,
+            "31": 765318656.0,
+            "32": 765318656.0,
+            "33": 765318656.0,
+            "34": 765318656.0,
+            "35": 765318656.0,
+            "36": 765318656.0,
+            "37": 765318656.0,
+            "38": 765318656.0,
+            "39": 765318656.0,
+            "40": 765318656.0,
+            "41": 765318656.0,
+            "42": 765318656.0,
+            "43": 765318656.0,
+            "44": 765318656.0,
+            "45": 765318656.0,
+            "46": 765318656.0,
+            "47": 765318656.0,
+            "48": 765318656.0,
+            "49": 765318656.0,
+            "50": 765318656.0,
+            "51": 765318656.0,
+            "52": 765318656.0,
+            "53": 765318656.0,
+            "54": 765318656.0,
+            "55": 765318656.0,
+            "56": 765318656.0,
+            "57": 765318656.0,
+            "58": 765318656.0,
+            "59": 765318656.0,
+            "60": 765318656.0,
+            "61": 765318656.0,
+            "62": 765318656.0,
+            "63": 765318656.0,
+            "64": 765318656.0,
+            "65": 765318656.0,
+            "66": 765318656.0,
+            "67": 765318656.0,
+            "68": 765318656.0,
+            "69": 765318656.0,
+            "70": 765318656.0,
+            "71": 765318656.0,
+            "72": 765318656.0,
+            "73": 765318656.0,
+            "74": 765318656.0,
+            "75": 765318656.0,
+            "76": 765318656.0,
+            "77": 765318656.0,
+            "78": 765318656.0,
+            "79": 765318656.0,
+            "80": 765318656.0,
+            "81": 765318656.0,
+            "82": 765318656.0,
+            "83": 765318656.0,
+            "84": 765318656.0,
+            "85": 765318656.0,
+            "86": 765318656.0,
+            "87": 765318656.0,
+            "88": 765318656.0,
+            "89": 765318656.0,
+            "90": 765318656.0,
+            "91": 765318656.0,
+            "92": 765318656.0,
+            "93": 765318656.0,
+            "94": 765318656.0,
+            "95": 765318656.0,
+            "96": 765318656.0,
+            "97": 765318656.0,
+            "98": 765318656.0,
+            "99": 765318656.0,
+            "100": 765318656.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2359490560.0,
-            "2": 2643299328.0,
-            "3": 2643299328.0,
-            "4": 2643299328.0,
-            "5": 2643299328.0,
-            "6": 2643299328.0,
-            "7": 2643299328.0,
-            "8": 2643299328.0,
-            "9": 2643299328.0,
-            "10": 2643299328.0,
-            "11": 2643299328.0,
-            "12": 2643299328.0,
-            "13": 2643299328.0,
-            "14": 2643299328.0,
-            "15": 2643299328.0,
-            "16": 2643299328.0,
-            "17": 2643299328.0,
-            "18": 2643299328.0,
-            "19": 2643299328.0,
-            "20": 2643299328.0,
-            "21": 2643299328.0,
-            "22": 2643299328.0,
-            "23": 2643299328.0,
-            "24": 2643299328.0,
-            "25": 2643299328.0,
-            "26": 2643299328.0,
-            "27": 2643299328.0,
-            "28": 2643299328.0,
-            "29": 2643299328.0,
-            "30": 2643299328.0,
-            "31": 2643299328.0,
-            "32": 2643299328.0,
-            "33": 2643299328.0,
-            "34": 2643299328.0,
-            "35": 2643299328.0,
-            "36": 2643299328.0,
-            "37": 2643299328.0,
-            "38": 2643299328.0,
-            "39": 2643299328.0,
-            "40": 2643299328.0,
-            "41": 2643299328.0,
-            "42": 2643299328.0,
-            "43": 2643299328.0,
-            "44": 2643299328.0,
-            "45": 2643299328.0,
-            "46": 2643299328.0,
-            "47": 2643299328.0,
-            "48": 2643299328.0,
-            "49": 2643299328.0,
-            "50": 2643299328.0,
-            "51": 2643299328.0,
-            "52": 2643299328.0,
-            "53": 2643299328.0,
-            "54": 2643299328.0,
-            "55": 2643299328.0,
-            "56": 2643299328.0,
-            "57": 2643299328.0,
-            "58": 2643299328.0,
-            "59": 2643299328.0,
-            "60": 2643299328.0,
-            "61": 2643299328.0,
-            "62": 2643299328.0,
-            "63": 2643299328.0,
-            "64": 2643299328.0,
-            "65": 2643299328.0,
-            "66": 2643299328.0,
-            "67": 2643299328.0,
-            "68": 2643299328.0,
-            "69": 2643299328.0,
-            "70": 2643299328.0,
-            "71": 2643299328.0,
-            "72": 2643299328.0,
-            "73": 2643299328.0,
-            "74": 2643299328.0,
-            "75": 2643299328.0,
-            "76": 2643299328.0,
-            "77": 2643299328.0,
-            "78": 2643299328.0,
-            "79": 2643299328.0,
-            "80": 2643299328.0,
-            "81": 2643299328.0,
-            "82": 2643299328.0,
-            "83": 2643299328.0,
-            "84": 2643299328.0,
-            "85": 2643299328.0,
-            "86": 2643299328.0,
-            "87": 2643299328.0,
-            "88": 2643299328.0,
-            "89": 2643299328.0,
-            "90": 2643299328.0,
-            "91": 2643299328.0,
-            "92": 2643299328.0,
-            "93": 2643299328.0,
-            "94": 2643299328.0,
-            "95": 2643299328.0,
-            "96": 2643299328.0,
-            "97": 2643299328.0,
-            "98": 2643299328.0,
-            "99": 2643299328.0,
-            "100": 2643299328.0
+            "1": 2360539648.0,
+            "2": 2645397504.0,
+            "3": 2645397504.0,
+            "4": 2645397504.0,
+            "5": 2645397504.0,
+            "6": 2645397504.0,
+            "7": 2645397504.0,
+            "8": 2645397504.0,
+            "9": 2645397504.0,
+            "10": 2645397504.0,
+            "11": 2645397504.0,
+            "12": 2645397504.0,
+            "13": 2645397504.0,
+            "14": 2645397504.0,
+            "15": 2645397504.0,
+            "16": 2645397504.0,
+            "17": 2645397504.0,
+            "18": 2645397504.0,
+            "19": 2645397504.0,
+            "20": 2645397504.0,
+            "21": 2645397504.0,
+            "22": 2645397504.0,
+            "23": 2645397504.0,
+            "24": 2645397504.0,
+            "25": 2645397504.0,
+            "26": 2645397504.0,
+            "27": 2645397504.0,
+            "28": 2645397504.0,
+            "29": 2645397504.0,
+            "30": 2645397504.0,
+            "31": 2645397504.0,
+            "32": 2645397504.0,
+            "33": 2645397504.0,
+            "34": 2645397504.0,
+            "35": 2645397504.0,
+            "36": 2645397504.0,
+            "37": 2645397504.0,
+            "38": 2645397504.0,
+            "39": 2645397504.0,
+            "40": 2645397504.0,
+            "41": 2645397504.0,
+            "42": 2645397504.0,
+            "43": 2645397504.0,
+            "44": 2645397504.0,
+            "45": 2645397504.0,
+            "46": 2645397504.0,
+            "47": 2645397504.0,
+            "48": 2645397504.0,
+            "49": 2645397504.0,
+            "50": 2645397504.0,
+            "51": 2645397504.0,
+            "52": 2645397504.0,
+            "53": 2645397504.0,
+            "54": 2645397504.0,
+            "55": 2645397504.0,
+            "56": 2645397504.0,
+            "57": 2645397504.0,
+            "58": 2645397504.0,
+            "59": 2645397504.0,
+            "60": 2645397504.0,
+            "61": 2645397504.0,
+            "62": 2645397504.0,
+            "63": 2645397504.0,
+            "64": 2645397504.0,
+            "65": 2645397504.0,
+            "66": 2645397504.0,
+            "67": 2645397504.0,
+            "68": 2645397504.0,
+            "69": 2645397504.0,
+            "70": 2645397504.0,
+            "71": 2645397504.0,
+            "72": 2645397504.0,
+            "73": 2645397504.0,
+            "74": 2645397504.0,
+            "75": 2645397504.0,
+            "76": 2645397504.0,
+            "77": 2645397504.0,
+            "78": 2645397504.0,
+            "79": 2645397504.0,
+            "80": 2645397504.0,
+            "81": 2645397504.0,
+            "82": 2645397504.0,
+            "83": 2645397504.0,
+            "84": 2645397504.0,
+            "85": 2645397504.0,
+            "86": 2645397504.0,
+            "87": 2645397504.0,
+            "88": 2645397504.0,
+            "89": 2645397504.0,
+            "90": 2645397504.0,
+            "91": 2645397504.0,
+            "92": 2645397504.0,
+            "93": 2645397504.0,
+            "94": 2645397504.0,
+            "95": 2645397504.0,
+            "96": 2645397504.0,
+            "97": 2645397504.0,
+            "98": 2645397504.0,
+            "99": 2645397504.0,
+            "100": 2645397504.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 17.57509,
-            "2": 0.1453,
-            "3": 0.11184,
-            "4": 0.11457,
-            "5": 0.12345,
-            "6": 0.12167,
-            "7": 0.12451,
-            "8": 0.11003,
-            "9": 0.11229,
-            "10": 0.11078,
-            "11": 0.11178,
-            "12": 0.11071,
-            "13": 0.11183,
-            "14": 0.1131,
-            "15": 0.11195,
-            "16": 0.11109,
-            "17": 0.11155,
-            "18": 0.11436,
-            "19": 0.11335,
-            "20": 0.11235,
-            "21": 0.11323,
-            "22": 0.11234,
-            "23": 0.1131,
-            "24": 0.11154,
-            "25": 0.11274,
-            "26": 0.11525,
-            "27": 0.11435,
-            "28": 0.11247,
-            "29": 0.11318,
-            "30": 0.11126,
-            "31": 0.11489,
-            "32": 0.11045,
-            "33": 0.1114,
-            "34": 0.11253,
-            "35": 0.11114,
-            "36": 0.114,
-            "37": 0.11201,
-            "38": 0.10979,
-            "39": 0.11069,
-            "40": 0.11078,
-            "41": 0.11142,
-            "42": 0.11091,
-            "43": 0.11324,
-            "44": 0.11151,
-            "45": 0.11295,
-            "46": 0.11174,
-            "47": 0.10954,
-            "48": 0.11083,
-            "49": 0.11195,
-            "50": 0.11251,
-            "51": 0.11627,
-            "52": 0.11199,
-            "53": 0.11127,
-            "54": 0.11464,
-            "55": 0.11072,
-            "56": 0.1136,
-            "57": 0.11119,
-            "58": 0.11025,
-            "59": 0.11083,
-            "60": 0.11126,
-            "61": 0.10968,
-            "62": 0.11104,
-            "63": 0.11515,
-            "64": 0.11136,
-            "65": 0.11454,
-            "66": 0.10994,
-            "67": 0.11003,
-            "68": 0.10997,
-            "69": 0.11155,
-            "70": 0.11002,
-            "71": 0.1121,
-            "72": 0.11334,
-            "73": 0.11221,
-            "74": 0.11542,
-            "75": 0.11082,
-            "76": 0.10997,
-            "77": 0.11087,
-            "78": 0.11222,
-            "79": 0.11343,
-            "80": 0.11462,
-            "81": 0.11272,
-            "82": 0.11293,
-            "83": 0.113,
-            "84": 0.11134,
-            "85": 0.11308,
-            "86": 0.11357,
-            "87": 0.11341,
-            "88": 0.11349,
-            "89": 0.11342,
-            "90": 0.11212,
-            "91": 0.11377,
-            "92": 0.11421,
-            "93": 0.1115,
-            "94": 0.11293,
-            "95": 0.11334,
-            "96": 0.11303,
-            "97": 0.11198,
-            "98": 0.11326,
-            "99": 0.11128,
-            "100": 0.1117
+            "1": 11.89927,
+            "2": 0.1153,
+            "3": 0.10368,
+            "4": 0.08198,
+            "5": 0.0823,
+            "6": 0.0813,
+            "7": 0.08053,
+            "8": 0.08097,
+            "9": 0.08083,
+            "10": 0.08105,
+            "11": 0.08193,
+            "12": 0.08083,
+            "13": 0.08063,
+            "14": 0.08095,
+            "15": 0.08115,
+            "16": 0.08099,
+            "17": 0.08128,
+            "18": 0.08134,
+            "19": 0.08147,
+            "20": 0.08174,
+            "21": 0.08185,
+            "22": 0.08175,
+            "23": 0.08109,
+            "24": 0.08065,
+            "25": 0.08488,
+            "26": 0.08433,
+            "27": 0.08446,
+            "28": 0.08482,
+            "29": 0.08645,
+            "30": 0.08469,
+            "31": 0.08623,
+            "32": 0.08474,
+            "33": 0.08443,
+            "34": 0.08442,
+            "35": 0.08287,
+            "36": 0.08188,
+            "37": 0.08068,
+            "38": 0.0808,
+            "39": 0.08041,
+            "40": 0.08119,
+            "41": 0.08373,
+            "42": 0.08116,
+            "43": 0.08394,
+            "44": 0.08252,
+            "45": 0.08182,
+            "46": 0.08217,
+            "47": 0.08115,
+            "48": 0.08122,
+            "49": 0.08084,
+            "50": 0.08062,
+            "51": 0.09006,
+            "52": 0.08529,
+            "53": 0.08552,
+            "54": 0.08335,
+            "55": 0.08266,
+            "56": 0.08016,
+            "57": 0.08221,
+            "58": 0.08,
+            "59": 0.08121,
+            "60": 0.08027,
+            "61": 0.08342,
+            "62": 0.08237,
+            "63": 0.08269,
+            "64": 0.0825,
+            "65": 0.08238,
+            "66": 0.08275,
+            "67": 0.08276,
+            "68": 0.08526,
+            "69": 0.0814,
+            "70": 0.08183,
+            "71": 0.08214,
+            "72": 0.08252,
+            "73": 0.0824,
+            "74": 0.08248,
+            "75": 0.08211,
+            "76": 0.0822,
+            "77": 0.08148,
+            "78": 0.08193,
+            "79": 0.08271,
+            "80": 0.082,
+            "81": 0.08216,
+            "82": 0.08205,
+            "83": 0.0823,
+            "84": 0.08236,
+            "85": 0.08239,
+            "86": 0.0805,
+            "87": 0.07901,
+            "88": 0.07985,
+            "89": 0.07962,
+            "90": 0.07883,
+            "91": 0.07962,
+            "92": 0.07909,
+            "93": 0.07986,
+            "94": 0.08107,
+            "95": 0.08014,
+            "96": 0.07993,
+            "97": 0.08061,
+            "98": 0.0808,
+            "99": 0.07879,
+            "100": 0.07901
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..93e78f67d5d
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8587,
+            "52": 9.74287,
+            "53": 10.06647,
+            "54": 9.95168,
+            "55": 9.88096,
+            "56": 9.62625,
+            "57": 9.47766,
+            "58": 9.8335,
+            "59": 9.58522,
+            "60": 9.50125,
+            "61": 9.69186,
+            "62": 9.98858,
+            "63": 9.38478,
+            "64": 9.78027,
+            "65": 8.94761,
+            "66": 9.70857,
+            "67": 9.36847,
+            "68": 9.78438,
+            "69": 9.79407,
+            "70": 9.7424,
+            "71": 9.61808,
+            "72": 9.58427,
+            "73": 9.50347,
+            "74": 8.9422,
+            "75": 9.42532,
+            "76": 9.07407,
+            "77": 10.06351,
+            "78": 9.7208,
+            "79": 9.37296,
+            "80": 9.40396,
+            "81": 9.48168,
+            "82": 9.69778,
+            "83": 9.30711,
+            "84": 9.41712,
+            "85": 9.61405,
+            "86": 9.07618,
+            "87": 9.59088,
+            "88": 9.7464,
+            "89": 9.59987,
+            "90": 9.81418,
+            "91": 9.33775,
+            "92": 9.35372,
+            "93": 9.07397,
+            "94": 8.8317,
+            "95": 9.5173,
+            "96": 9.52412,
+            "97": 9.30995,
+            "98": 9.66807,
+            "99": 8.8859,
+            "100": 9.39541
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2608.0,
+            "52": 2444.0,
+            "53": 2898.0,
+            "54": 2664.0,
+            "55": 2325.0,
+            "56": 2614.0,
+            "57": 2394.0,
+            "58": 2812.0,
+            "59": 2771.0,
+            "60": 2361.0,
+            "61": 2855.0,
+            "62": 2675.0,
+            "63": 2393.0,
+            "64": 3014.0,
+            "65": 2673.0,
+            "66": 3051.0,
+            "67": 2657.0,
+            "68": 2662.0,
+            "69": 2736.0,
+            "70": 3139.0,
+            "71": 2943.0,
+            "72": 2293.0,
+            "73": 2908.0,
+            "74": 1887.0,
+            "75": 2519.0,
+            "76": 3060.0,
+            "77": 3191.0,
+            "78": 3211.0,
+            "79": 3081.0,
+            "80": 3205.0,
+            "81": 3563.0,
+            "82": 3201.0,
+            "83": 2614.0,
+            "84": 3162.0,
+            "85": 3209.0,
+            "86": 2660.0,
+            "87": 3729.0,
+            "88": 3002.0,
+            "89": 3160.0,
+            "90": 3168.0,
+            "91": 2753.0,
+            "92": 3258.0,
+            "93": 2617.0,
+            "94": 3341.0,
+            "95": 3261.0,
+            "96": 3370.0,
+            "97": 3163.0,
+            "98": 3566.0,
+            "99": 3179.0,
+            "100": 3135.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 763221504.0,
+            "52": 763221504.0,
+            "53": 763221504.0,
+            "54": 763221504.0,
+            "55": 763221504.0,
+            "56": 763221504.0,
+            "57": 763221504.0,
+            "58": 763221504.0,
+            "59": 763221504.0,
+            "60": 763221504.0,
+            "61": 763221504.0,
+            "62": 763221504.0,
+            "63": 763221504.0,
+            "64": 763221504.0,
+            "65": 763221504.0,
+            "66": 763221504.0,
+            "67": 763221504.0,
+            "68": 763221504.0,
+            "69": 763221504.0,
+            "70": 763221504.0,
+            "71": 763221504.0,
+            "72": 763221504.0,
+            "73": 763221504.0,
+            "74": 763221504.0,
+            "75": 763221504.0,
+            "76": 763221504.0,
+            "77": 763221504.0,
+            "78": 763221504.0,
+            "79": 763221504.0,
+            "80": 763221504.0,
+            "81": 763221504.0,
+            "82": 763221504.0,
+            "83": 763221504.0,
+            "84": 763221504.0,
+            "85": 763221504.0,
+            "86": 763221504.0,
+            "87": 763221504.0,
+            "88": 763221504.0,
+            "89": 763221504.0,
+            "90": 763221504.0,
+            "91": 763221504.0,
+            "92": 763221504.0,
+            "93": 763221504.0,
+            "94": 763221504.0,
+            "95": 763221504.0,
+            "96": 763221504.0,
+            "97": 763221504.0,
+            "98": 763221504.0,
+            "99": 763221504.0,
+            "100": 763221504.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2643299328.0,
+            "52": 2643300352.0,
+            "53": 2643300352.0,
+            "54": 2643300352.0,
+            "55": 2643300352.0,
+            "56": 2643300352.0,
+            "57": 2643300352.0,
+            "58": 2643300352.0,
+            "59": 2643300352.0,
+            "60": 2643300352.0,
+            "61": 2643300352.0,
+            "62": 2643300352.0,
+            "63": 2643300352.0,
+            "64": 2643300352.0,
+            "65": 2643300352.0,
+            "66": 2643300352.0,
+            "67": 2643300352.0,
+            "68": 2643300352.0,
+            "69": 2643300352.0,
+            "70": 2643300352.0,
+            "71": 2643300352.0,
+            "72": 2643300352.0,
+            "73": 2643300352.0,
+            "74": 2643300352.0,
+            "75": 2643300352.0,
+            "76": 2643300352.0,
+            "77": 2643300352.0,
+            "78": 2643300352.0,
+            "79": 2643300352.0,
+            "80": 2643300352.0,
+            "81": 2643300352.0,
+            "82": 2643300352.0,
+            "83": 2643300352.0,
+            "84": 2643300352.0,
+            "85": 2643300352.0,
+            "86": 2643300352.0,
+            "87": 2643300352.0,
+            "88": 2643300352.0,
+            "89": 2643300352.0,
+            "90": 2643300352.0,
+            "91": 2643300352.0,
+            "92": 2643300352.0,
+            "93": 2643300352.0,
+            "94": 2643300352.0,
+            "95": 2643300352.0,
+            "96": 2643300352.0,
+            "97": 2643300352.0,
+            "98": 2643300352.0,
+            "99": 2643300352.0,
+            "100": 2643300352.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 12.33731,
+            "52": 0.11786,
+            "53": 0.07991,
+            "54": 0.07986,
+            "55": 0.07966,
+            "56": 0.07938,
+            "57": 0.07961,
+            "58": 0.07896,
+            "59": 0.08173,
+            "60": 0.08111,
+            "61": 0.07932,
+            "62": 0.07983,
+            "63": 0.07857,
+            "64": 0.07987,
+            "65": 0.08064,
+            "66": 0.08044,
+            "67": 0.07986,
+            "68": 0.07972,
+            "69": 0.08138,
+            "70": 0.07961,
+            "71": 0.07849,
+            "72": 0.07845,
+            "73": 0.07863,
+            "74": 0.07911,
+            "75": 0.07806,
+            "76": 0.0788,
+            "77": 0.07844,
+            "78": 0.07863,
+            "79": 0.07852,
+            "80": 0.07836,
+            "81": 0.07846,
+            "82": 0.07827,
+            "83": 0.0783,
+            "84": 0.08097,
+            "85": 0.07901,
+            "86": 0.07807,
+            "87": 0.07812,
+            "88": 0.07877,
+            "89": 0.07887,
+            "90": 0.08111,
+            "91": 0.07881,
+            "92": 0.08093,
+            "93": 0.07971,
+            "94": 0.08058,
+            "95": 0.07862,
+            "96": 0.07919,
+            "97": 0.07748,
+            "98": 0.07748,
+            "99": 0.07818,
+            "100": 0.07748
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json
index 0cb12854799..f68a55e951c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json
@@ -325,7 +325,7 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2367879168.0,
+            "1": 2368927744.0,
             "2": 2651687936.0,
             "3": 2651687936.0,
             "4": 2651687936.0,
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.6402,
-            "2": 0.15932,
-            "3": 0.13183,
-            "4": 0.12969,
-            "5": 0.12913,
-            "6": 0.12942,
-            "7": 0.12823,
-            "8": 0.13014,
-            "9": 0.1305,
-            "10": 0.13004,
-            "11": 0.12983,
-            "12": 0.12943,
-            "13": 0.12925,
-            "14": 0.13022,
-            "15": 0.12947,
-            "16": 0.12988,
-            "17": 0.12984,
-            "18": 0.12989,
-            "19": 0.12987,
-            "20": 0.12935,
-            "21": 0.12974,
-            "22": 0.12965,
-            "23": 0.12983,
-            "24": 0.13037,
-            "25": 0.1293,
-            "26": 0.12914,
-            "27": 0.12908,
-            "28": 0.12909,
-            "29": 0.13186,
-            "30": 0.13433,
-            "31": 0.13401,
-            "32": 0.12902,
-            "33": 0.12808,
-            "34": 0.12907,
-            "35": 0.12884,
-            "36": 0.12913,
-            "37": 0.12932,
-            "38": 0.12992,
-            "39": 0.13072,
-            "40": 0.13131,
-            "41": 0.13172,
-            "42": 0.13072,
-            "43": 0.13259,
-            "44": 0.13124,
-            "45": 0.13129,
-            "46": 0.1291,
-            "47": 0.1308,
-            "48": 0.1301,
-            "49": 0.12906,
-            "50": 0.12828,
-            "51": 0.14265,
-            "52": 0.12979,
-            "53": 0.126,
-            "54": 0.12545,
-            "55": 0.12582,
-            "56": 0.12573,
-            "57": 0.12516,
-            "58": 0.1252,
-            "59": 0.12598,
-            "60": 0.12562,
-            "61": 0.12544,
-            "62": 0.12472,
-            "63": 0.12548,
-            "64": 0.12537,
-            "65": 0.12534,
-            "66": 0.12474,
-            "67": 0.12528,
-            "68": 0.12481,
-            "69": 0.12531,
-            "70": 0.12547,
-            "71": 0.12492,
-            "72": 0.12533,
-            "73": 0.12583,
-            "74": 0.1253,
-            "75": 0.12453,
-            "76": 0.12486,
-            "77": 0.12501,
-            "78": 0.12491,
-            "79": 0.12247,
-            "80": 0.1223,
-            "81": 0.1243,
-            "82": 0.12257,
-            "83": 0.12179,
-            "84": 0.12254,
-            "85": 0.12231,
-            "86": 0.12263,
-            "87": 0.12152,
-            "88": 0.12188,
-            "89": 0.1228,
-            "90": 0.12133,
-            "91": 0.1216,
-            "92": 0.12133,
-            "93": 0.12135,
-            "94": 0.12216,
-            "95": 0.12141,
-            "96": 0.12205,
-            "97": 0.12356,
-            "98": 0.12174,
-            "99": 0.12252,
-            "100": 0.1222
+            "1": 5.40788,
+            "2": 0.15608,
+            "3": 0.1477,
+            "4": 0.13403,
+            "5": 0.13382,
+            "6": 0.13308,
+            "7": 0.1344,
+            "8": 0.13063,
+            "9": 0.12991,
+            "10": 0.13084,
+            "11": 0.13107,
+            "12": 0.13009,
+            "13": 0.13035,
+            "14": 0.13027,
+            "15": 0.13037,
+            "16": 0.1302,
+            "17": 0.12981,
+            "18": 0.12893,
+            "19": 0.12914,
+            "20": 0.12893,
+            "21": 0.12912,
+            "22": 0.1334,
+            "23": 0.13093,
+            "24": 0.13133,
+            "25": 0.13036,
+            "26": 0.13026,
+            "27": 0.13063,
+            "28": 0.13046,
+            "29": 0.13311,
+            "30": 0.13167,
+            "31": 0.13145,
+            "32": 0.13051,
+            "33": 0.13072,
+            "34": 0.1308,
+            "35": 0.13145,
+            "36": 0.13046,
+            "37": 0.13066,
+            "38": 0.13075,
+            "39": 0.13108,
+            "40": 0.1305,
+            "41": 0.13132,
+            "42": 0.1308,
+            "43": 0.13149,
+            "44": 0.13097,
+            "45": 0.13099,
+            "46": 0.13204,
+            "47": 0.13136,
+            "48": 0.13051,
+            "49": 0.13073,
+            "50": 0.13055,
+            "51": 0.1389,
+            "52": 0.13184,
+            "53": 0.13181,
+            "54": 0.13087,
+            "55": 0.13152,
+            "56": 0.13181,
+            "57": 0.13138,
+            "58": 0.13134,
+            "59": 0.13133,
+            "60": 0.13251,
+            "61": 0.13157,
+            "62": 0.13187,
+            "63": 0.13183,
+            "64": 0.13133,
+            "65": 0.13157,
+            "66": 0.13239,
+            "67": 0.13213,
+            "68": 0.13166,
+            "69": 0.13128,
+            "70": 0.13118,
+            "71": 0.13129,
+            "72": 0.1319,
+            "73": 0.13204,
+            "74": 0.13343,
+            "75": 0.13119,
+            "76": 0.13129,
+            "77": 0.13116,
+            "78": 0.13092,
+            "79": 0.13228,
+            "80": 0.13183,
+            "81": 0.13133,
+            "82": 0.13205,
+            "83": 0.13189,
+            "84": 0.13312,
+            "85": 0.13289,
+            "86": 0.13578,
+            "87": 0.13422,
+            "88": 0.1347,
+            "89": 0.13466,
+            "90": 0.13428,
+            "91": 0.13512,
+            "92": 0.13241,
+            "93": 0.12996,
+            "94": 0.1315,
+            "95": 0.12919,
+            "96": 0.12806,
+            "97": 0.12848,
+            "98": 0.12922,
+            "99": 0.12714,
+            "100": 0.12757
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..91d84b88527
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83549,
+            "52": 9.72516,
+            "53": 10.04799,
+            "54": 9.93011,
+            "55": 9.8636,
+            "56": 9.60217,
+            "57": 9.45187,
+            "58": 9.8078,
+            "59": 9.56783,
+            "60": 9.47966,
+            "61": 9.67984,
+            "62": 9.96754,
+            "63": 9.35113,
+            "64": 9.75623,
+            "65": 8.9318,
+            "66": 9.68107,
+            "67": 9.35956,
+            "68": 9.76948,
+            "69": 9.77492,
+            "70": 9.71182,
+            "71": 9.60632,
+            "72": 9.57129,
+            "73": 9.48392,
+            "74": 8.92911,
+            "75": 9.40028,
+            "76": 9.07194,
+            "77": 10.05252,
+            "78": 9.71494,
+            "79": 9.35747,
+            "80": 9.38946,
+            "81": 9.46791,
+            "82": 9.68508,
+            "83": 9.29588,
+            "84": 9.40522,
+            "85": 9.60163,
+            "86": 9.06713,
+            "87": 9.58402,
+            "88": 9.73304,
+            "89": 9.59526,
+            "90": 9.80555,
+            "91": 9.32604,
+            "92": 9.35323,
+            "93": 9.06915,
+            "94": 8.82268,
+            "95": 9.50858,
+            "96": 9.51584,
+            "97": 9.2976,
+            "98": 9.66184,
+            "99": 8.87662,
+            "100": 9.39222
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2441.0,
+            "52": 2482.0,
+            "53": 2916.0,
+            "54": 2550.0,
+            "55": 2347.0,
+            "56": 2765.0,
+            "57": 2116.0,
+            "58": 2968.0,
+            "59": 2810.0,
+            "60": 2384.0,
+            "61": 2912.0,
+            "62": 2554.0,
+            "63": 2364.0,
+            "64": 3035.0,
+            "65": 2648.0,
+            "66": 2979.0,
+            "67": 2741.0,
+            "68": 2799.0,
+            "69": 3071.0,
+            "70": 3098.0,
+            "71": 2950.0,
+            "72": 2342.0,
+            "73": 2829.0,
+            "74": 1840.0,
+            "75": 2426.0,
+            "76": 2941.0,
+            "77": 3245.0,
+            "78": 3272.0,
+            "79": 3066.0,
+            "80": 3221.0,
+            "81": 3565.0,
+            "82": 3162.0,
+            "83": 2876.0,
+            "84": 3180.0,
+            "85": 3410.0,
+            "86": 2778.0,
+            "87": 3752.0,
+            "88": 2995.0,
+            "89": 3264.0,
+            "90": 2940.0,
+            "91": 2791.0,
+            "92": 3118.0,
+            "93": 2634.0,
+            "94": 3464.0,
+            "95": 3344.0,
+            "96": 3499.0,
+            "97": 3122.0,
+            "98": 3568.0,
+            "99": 3272.0,
+            "100": 3476.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 733860352.0,
+            "52": 733860352.0,
+            "53": 733860352.0,
+            "54": 733860352.0,
+            "55": 733860352.0,
+            "56": 733860352.0,
+            "57": 733860352.0,
+            "58": 733860352.0,
+            "59": 733860352.0,
+            "60": 733860352.0,
+            "61": 733860352.0,
+            "62": 733860352.0,
+            "63": 733860352.0,
+            "64": 733860352.0,
+            "65": 733860352.0,
+            "66": 733860352.0,
+            "67": 733860352.0,
+            "68": 733860352.0,
+            "69": 733860352.0,
+            "70": 733860352.0,
+            "71": 733860352.0,
+            "72": 733860352.0,
+            "73": 733860352.0,
+            "74": 733860352.0,
+            "75": 733860352.0,
+            "76": 733860352.0,
+            "77": 733860352.0,
+            "78": 733860352.0,
+            "79": 733860352.0,
+            "80": 733860352.0,
+            "81": 733860352.0,
+            "82": 733860352.0,
+            "83": 733860352.0,
+            "84": 733860352.0,
+            "85": 733860352.0,
+            "86": 733860352.0,
+            "87": 733860352.0,
+            "88": 733860352.0,
+            "89": 733860352.0,
+            "90": 733860352.0,
+            "91": 733860352.0,
+            "92": 733860352.0,
+            "93": 733860352.0,
+            "94": 733860352.0,
+            "95": 733860352.0,
+            "96": 733860352.0,
+            "97": 733860352.0,
+            "98": 733860352.0,
+            "99": 733860352.0,
+            "100": 733860352.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2652735488.0,
+            "52": 2652736512.0,
+            "53": 2652736512.0,
+            "54": 2652736512.0,
+            "55": 2652736512.0,
+            "56": 2652736512.0,
+            "57": 2652736512.0,
+            "58": 2652736512.0,
+            "59": 2652736512.0,
+            "60": 2652736512.0,
+            "61": 2652736512.0,
+            "62": 2652736512.0,
+            "63": 2652736512.0,
+            "64": 2652736512.0,
+            "65": 2652736512.0,
+            "66": 2652736512.0,
+            "67": 2652736512.0,
+            "68": 2652736512.0,
+            "69": 2652736512.0,
+            "70": 2652736512.0,
+            "71": 2652736512.0,
+            "72": 2652736512.0,
+            "73": 2652736512.0,
+            "74": 2652736512.0,
+            "75": 2652736512.0,
+            "76": 2652736512.0,
+            "77": 2652736512.0,
+            "78": 2652736512.0,
+            "79": 2652736512.0,
+            "80": 2652736512.0,
+            "81": 2652736512.0,
+            "82": 2652736512.0,
+            "83": 2652736512.0,
+            "84": 2652736512.0,
+            "85": 2652736512.0,
+            "86": 2652736512.0,
+            "87": 2652736512.0,
+            "88": 2652736512.0,
+            "89": 2652736512.0,
+            "90": 2652736512.0,
+            "91": 2652736512.0,
+            "92": 2652736512.0,
+            "93": 2652736512.0,
+            "94": 2652736512.0,
+            "95": 2652736512.0,
+            "96": 2652736512.0,
+            "97": 2652736512.0,
+            "98": 2652736512.0,
+            "99": 2652736512.0,
+            "100": 2652736512.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6.95149,
+            "52": 0.16138,
+            "53": 0.14143,
+            "54": 0.14147,
+            "55": 0.14039,
+            "56": 0.14065,
+            "57": 0.14197,
+            "58": 0.14092,
+            "59": 0.13304,
+            "60": 0.1316,
+            "61": 0.13067,
+            "62": 0.13101,
+            "63": 0.13087,
+            "64": 0.13347,
+            "65": 0.13501,
+            "66": 0.13486,
+            "67": 0.13415,
+            "68": 0.13402,
+            "69": 0.1339,
+            "70": 0.1332,
+            "71": 0.13414,
+            "72": 0.13291,
+            "73": 0.1334,
+            "74": 0.13397,
+            "75": 0.13253,
+            "76": 0.13314,
+            "77": 0.13317,
+            "78": 0.13335,
+            "79": 0.13316,
+            "80": 0.13312,
+            "81": 0.13302,
+            "82": 0.13404,
+            "83": 0.13393,
+            "84": 0.13355,
+            "85": 0.13237,
+            "86": 0.13361,
+            "87": 0.13268,
+            "88": 0.13156,
+            "89": 0.13245,
+            "90": 0.13179,
+            "91": 0.13173,
+            "92": 0.13158,
+            "93": 0.13204,
+            "94": 0.1318,
+            "95": 0.13972,
+            "96": 0.13128,
+            "97": 0.12988,
+            "98": 0.13091,
+            "99": 0.13155,
+            "100": 0.1314
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..28843c12217
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82555,
+            "2": 10.83286,
+            "3": 10.82763,
+            "4": 10.79573,
+            "5": 10.85699,
+            "6": 10.8639,
+            "7": 10.82612,
+            "8": 10.82542,
+            "9": 10.83587,
+            "10": 10.79627,
+            "11": 10.87822,
+            "12": 10.85824,
+            "13": 10.85426,
+            "14": 10.87526,
+            "15": 10.79208,
+            "16": 10.80307,
+            "17": 10.77438,
+            "18": 10.80487,
+            "19": 10.79369,
+            "20": 10.69576,
+            "21": 10.68654,
+            "22": 10.53161,
+            "23": 10.70646,
+            "24": 10.57337,
+            "25": 10.51533,
+            "26": 10.5909,
+            "27": 10.60777,
+            "28": 10.57049,
+            "29": 10.58979,
+            "30": 10.34722,
+            "31": 10.07771,
+            "32": 10.46349,
+            "33": 10.45722,
+            "34": 10.19974,
+            "35": 10.25643,
+            "36": 10.21263,
+            "37": 10.34718,
+            "38": 10.18009,
+            "39": 10.40838,
+            "40": 10.07629,
+            "41": 10.1297,
+            "42": 10.2117,
+            "43": 9.81708,
+            "44": 9.94034,
+            "45": 9.81748,
+            "46": 9.80633,
+            "47": 10.12473,
+            "48": 9.84047,
+            "49": 9.51012,
+            "50": 9.88943,
+            "51": 9.84256,
+            "52": 9.72573,
+            "53": 10.05974,
+            "54": 9.95226,
+            "55": 9.88318,
+            "56": 9.61275,
+            "57": 9.46219,
+            "58": 9.8231,
+            "59": 9.57666,
+            "60": 9.48516,
+            "61": 9.67876,
+            "62": 9.97782,
+            "63": 9.36212,
+            "64": 9.75714,
+            "65": 8.93494,
+            "66": 9.69283,
+            "67": 9.36708,
+            "68": 9.78178,
+            "69": 9.79452,
+            "70": 9.72296,
+            "71": 9.62031,
+            "72": 9.56974,
+            "73": 9.48101,
+            "74": 8.91241,
+            "75": 9.40905,
+            "76": 9.06617,
+            "77": 10.05809,
+            "78": 9.72194,
+            "79": 9.36927,
+            "80": 9.40029,
+            "81": 9.47702,
+            "82": 9.69787,
+            "83": 9.30742,
+            "84": 9.41492,
+            "85": 9.61113,
+            "86": 9.07103,
+            "87": 9.5961,
+            "88": 9.74909,
+            "89": 9.59604,
+            "90": 9.82722,
+            "91": 9.33657,
+            "92": 9.35582,
+            "93": 9.08689,
+            "94": 8.82754,
+            "95": 9.53065,
+            "96": 9.5276,
+            "97": 9.30672,
+            "98": 9.66905,
+            "99": 8.89635,
+            "100": 9.40525
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1651.0,
+            "2": 1716.0,
+            "3": 1760.0,
+            "4": 1771.0,
+            "5": 1899.0,
+            "6": 1905.0,
+            "7": 1842.0,
+            "8": 1667.0,
+            "9": 1822.0,
+            "10": 1434.0,
+            "11": 1852.0,
+            "12": 1741.0,
+            "13": 1905.0,
+            "14": 1841.0,
+            "15": 1857.0,
+            "16": 1841.0,
+            "17": 1800.0,
+            "18": 1666.0,
+            "19": 1803.0,
+            "20": 1800.0,
+            "21": 1836.0,
+            "22": 1688.0,
+            "23": 1994.0,
+            "24": 1641.0,
+            "25": 1577.0,
+            "26": 1676.0,
+            "27": 1876.0,
+            "28": 1970.0,
+            "29": 1945.0,
+            "30": 1916.0,
+            "31": 1494.0,
+            "32": 1868.0,
+            "33": 2135.0,
+            "34": 1740.0,
+            "35": 1924.0,
+            "36": 1854.0,
+            "37": 2363.0,
+            "38": 2164.0,
+            "39": 2262.0,
+            "40": 2081.0,
+            "41": 2168.0,
+            "42": 2247.0,
+            "43": 2055.0,
+            "44": 2070.0,
+            "45": 1988.0,
+            "46": 2208.0,
+            "47": 2559.0,
+            "48": 2287.0,
+            "49": 2194.0,
+            "50": 2303.0,
+            "51": 2552.0,
+            "52": 2565.0,
+            "53": 2883.0,
+            "54": 2710.0,
+            "55": 2301.0,
+            "56": 2798.0,
+            "57": 2334.0,
+            "58": 2979.0,
+            "59": 2960.0,
+            "60": 2451.0,
+            "61": 2841.0,
+            "62": 2577.0,
+            "63": 2516.0,
+            "64": 2907.0,
+            "65": 2567.0,
+            "66": 2862.0,
+            "67": 2809.0,
+            "68": 2609.0,
+            "69": 2965.0,
+            "70": 2985.0,
+            "71": 2864.0,
+            "72": 2613.0,
+            "73": 3108.0,
+            "74": 2048.0,
+            "75": 2563.0,
+            "76": 3046.0,
+            "77": 3127.0,
+            "78": 2959.0,
+            "79": 3082.0,
+            "80": 3025.0,
+            "81": 3400.0,
+            "82": 3223.0,
+            "83": 2786.0,
+            "84": 3180.0,
+            "85": 3233.0,
+            "86": 2611.0,
+            "87": 3542.0,
+            "88": 3084.0,
+            "89": 3210.0,
+            "90": 3271.0,
+            "91": 2770.0,
+            "92": 3220.0,
+            "93": 2662.0,
+            "94": 3405.0,
+            "95": 3085.0,
+            "96": 3336.0,
+            "97": 3050.0,
+            "98": 3421.0,
+            "99": 3271.0,
+            "100": 3079.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 552325632.0,
+            "2": 552325632.0,
+            "3": 552325632.0,
+            "4": 552325632.0,
+            "5": 552325632.0,
+            "6": 552325632.0,
+            "7": 552325632.0,
+            "8": 552325632.0,
+            "9": 552325632.0,
+            "10": 552325632.0,
+            "11": 552325632.0,
+            "12": 552325632.0,
+            "13": 552325632.0,
+            "14": 552325632.0,
+            "15": 552325632.0,
+            "16": 552325632.0,
+            "17": 552325632.0,
+            "18": 552325632.0,
+            "19": 552325632.0,
+            "20": 552325632.0,
+            "21": 552325632.0,
+            "22": 552325632.0,
+            "23": 552325632.0,
+            "24": 552325632.0,
+            "25": 552325632.0,
+            "26": 552325632.0,
+            "27": 552325632.0,
+            "28": 552325632.0,
+            "29": 552325632.0,
+            "30": 552325632.0,
+            "31": 552325632.0,
+            "32": 552325632.0,
+            "33": 552325632.0,
+            "34": 552325632.0,
+            "35": 552325632.0,
+            "36": 552325632.0,
+            "37": 552325632.0,
+            "38": 552325632.0,
+            "39": 552325632.0,
+            "40": 552325632.0,
+            "41": 552325632.0,
+            "42": 552325632.0,
+            "43": 552325632.0,
+            "44": 552325632.0,
+            "45": 553374208.0,
+            "46": 552325632.0,
+            "47": 552325632.0,
+            "48": 553374208.0,
+            "49": 552325632.0,
+            "50": 552325632.0,
+            "51": 552325632.0,
+            "52": 552325632.0,
+            "53": 552325632.0,
+            "54": 552325632.0,
+            "55": 552325632.0,
+            "56": 552325632.0,
+            "57": 552325632.0,
+            "58": 552325632.0,
+            "59": 552325632.0,
+            "60": 552325632.0,
+            "61": 552325632.0,
+            "62": 552325632.0,
+            "63": 552325632.0,
+            "64": 552325632.0,
+            "65": 552325632.0,
+            "66": 552325632.0,
+            "67": 552325632.0,
+            "68": 552325632.0,
+            "69": 552325632.0,
+            "70": 552325632.0,
+            "71": 552325632.0,
+            "72": 552325632.0,
+            "73": 552325632.0,
+            "74": 552325632.0,
+            "75": 552325632.0,
+            "76": 552325632.0,
+            "77": 552325632.0,
+            "78": 552325632.0,
+            "79": 552325632.0,
+            "80": 552325632.0,
+            "81": 552325632.0,
+            "82": 552325632.0,
+            "83": 552325632.0,
+            "84": 552325632.0,
+            "85": 552325632.0,
+            "86": 552325632.0,
+            "87": 552325632.0,
+            "88": 552325632.0,
+            "89": 552325632.0,
+            "90": 552325632.0,
+            "91": 552325632.0,
+            "92": 552325632.0,
+            "93": 552325632.0,
+            "94": 552325632.0,
+            "95": 552325632.0,
+            "96": 552325632.0,
+            "97": 552325632.0,
+            "98": 552325632.0,
+            "99": 552325632.0,
+            "100": 552325632.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2328239104.0,
+            "2": 2471201792.0,
+            "3": 2471201792.0,
+            "4": 2471201792.0,
+            "5": 2471201792.0,
+            "6": 2471201792.0,
+            "7": 2471201792.0,
+            "8": 2471201792.0,
+            "9": 2471201792.0,
+            "10": 2471201792.0,
+            "11": 2471201792.0,
+            "12": 2471201792.0,
+            "13": 2471201792.0,
+            "14": 2471201792.0,
+            "15": 2471201792.0,
+            "16": 2471201792.0,
+            "17": 2471201792.0,
+            "18": 2471201792.0,
+            "19": 2471201792.0,
+            "20": 2471201792.0,
+            "21": 2471201792.0,
+            "22": 2471201792.0,
+            "23": 2471201792.0,
+            "24": 2471201792.0,
+            "25": 2471201792.0,
+            "26": 2471201792.0,
+            "27": 2471201792.0,
+            "28": 2471201792.0,
+            "29": 2471201792.0,
+            "30": 2471201792.0,
+            "31": 2471201792.0,
+            "32": 2471201792.0,
+            "33": 2471201792.0,
+            "34": 2471201792.0,
+            "35": 2471201792.0,
+            "36": 2471201792.0,
+            "37": 2471201792.0,
+            "38": 2471201792.0,
+            "39": 2471201792.0,
+            "40": 2471201792.0,
+            "41": 2471201792.0,
+            "42": 2471201792.0,
+            "43": 2471201792.0,
+            "44": 2471201792.0,
+            "45": 2471201792.0,
+            "46": 2471201792.0,
+            "47": 2471201792.0,
+            "48": 2471201792.0,
+            "49": 2471201792.0,
+            "50": 2471201792.0,
+            "51": 2471201792.0,
+            "52": 2471201792.0,
+            "53": 2471201792.0,
+            "54": 2471201792.0,
+            "55": 2471201792.0,
+            "56": 2471201792.0,
+            "57": 2471201792.0,
+            "58": 2471201792.0,
+            "59": 2471201792.0,
+            "60": 2471201792.0,
+            "61": 2471201792.0,
+            "62": 2471201792.0,
+            "63": 2471201792.0,
+            "64": 2471201792.0,
+            "65": 2471201792.0,
+            "66": 2471201792.0,
+            "67": 2471201792.0,
+            "68": 2471201792.0,
+            "69": 2471201792.0,
+            "70": 2471201792.0,
+            "71": 2471201792.0,
+            "72": 2471201792.0,
+            "73": 2471201792.0,
+            "74": 2471201792.0,
+            "75": 2471201792.0,
+            "76": 2471201792.0,
+            "77": 2471201792.0,
+            "78": 2471201792.0,
+            "79": 2471201792.0,
+            "80": 2471201792.0,
+            "81": 2471201792.0,
+            "82": 2471201792.0,
+            "83": 2471201792.0,
+            "84": 2471201792.0,
+            "85": 2471201792.0,
+            "86": 2471201792.0,
+            "87": 2471201792.0,
+            "88": 2471201792.0,
+            "89": 2471201792.0,
+            "90": 2471201792.0,
+            "91": 2471201792.0,
+            "92": 2471201792.0,
+            "93": 2471201792.0,
+            "94": 2471201792.0,
+            "95": 2471201792.0,
+            "96": 2471201792.0,
+            "97": 2471201792.0,
+            "98": 2471201792.0,
+            "99": 2471201792.0,
+            "100": 2471201792.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.33855,
+            "3": 0.12562,
+            "4": 0.10973,
+            "5": 0.10864,
+            "6": 0.10778,
+            "7": 0.10885,
+            "8": 0.10884,
+            "9": 0.10877,
+            "10": 0.10868,
+            "11": 0.10997,
+            "12": 0.10853,
+            "13": 0.1086,
+            "14": 0.10927,
+            "15": 0.10879,
+            "16": 0.10908,
+            "17": 0.10873,
+            "18": 0.10883,
+            "19": 0.11028,
+            "20": 0.11031,
+            "21": 0.11086,
+            "22": 0.10971,
+            "23": 0.10987,
+            "24": 0.1089,
+            "25": 0.11118,
+            "26": 0.10952,
+            "27": 0.1165,
+            "28": 0.11961,
+            "29": 0.11977,
+            "30": 0.11657,
+            "31": 0.11728,
+            "32": 0.11689,
+            "33": 0.11642,
+            "34": 0.11739,
+            "35": 0.11665,
+            "36": 0.11537,
+            "37": 0.11552,
+            "38": 0.11544,
+            "39": 0.11538,
+            "40": 0.11584,
+            "41": 0.11597,
+            "42": 0.11635,
+            "43": 0.11593,
+            "44": 0.11678,
+            "45": 0.11608,
+            "46": 0.11637,
+            "47": 0.11572,
+            "48": 0.11577,
+            "49": 0.11481,
+            "50": 0.11561,
+            "51": 0.1213,
+            "52": 0.10892,
+            "53": 0.10742,
+            "54": 0.10842,
+            "55": 0.10806,
+            "56": 0.10869,
+            "57": 0.11057,
+            "58": 0.108,
+            "59": 0.10875,
+            "60": 0.10969,
+            "61": 0.1087,
+            "62": 0.10795,
+            "63": 0.1094,
+            "64": 0.10922,
+            "65": 0.11102,
+            "66": 0.11016,
+            "67": 0.10977,
+            "68": 0.10988,
+            "69": 0.11029,
+            "70": 0.11078,
+            "71": 0.11019,
+            "72": 0.11727,
+            "73": 0.11024,
+            "74": 0.11054,
+            "75": 0.10949,
+            "76": 0.11384,
+            "77": 0.11011,
+            "78": 0.1101,
+            "79": 0.10943,
+            "80": 0.11059,
+            "81": 0.11173,
+            "82": 0.10987,
+            "83": 0.1094,
+            "84": 0.10956,
+            "85": 0.11029,
+            "86": 0.11179,
+            "87": 0.10953,
+            "88": 0.11045,
+            "89": 0.1102,
+            "90": 0.10897,
+            "91": 0.11022,
+            "92": 0.10965,
+            "93": 0.11042,
+            "94": 0.11158,
+            "95": 0.11059,
+            "96": 0.11046,
+            "97": 0.11123,
+            "98": 0.11055,
+            "99": 0.11178,
+            "100": 0.11266
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..43ec9ec960f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84256,
+            "52": 9.72573,
+            "53": 10.05974,
+            "54": 9.95226,
+            "55": 9.88318,
+            "56": 9.61275,
+            "57": 9.46219,
+            "58": 9.8231,
+            "59": 9.57666,
+            "60": 9.48516,
+            "61": 9.67876,
+            "62": 9.97782,
+            "63": 9.36212,
+            "64": 9.75714,
+            "65": 8.93494,
+            "66": 9.69283,
+            "67": 9.36708,
+            "68": 9.78178,
+            "69": 9.79452,
+            "70": 9.72296,
+            "71": 9.62031,
+            "72": 9.56974,
+            "73": 9.48101,
+            "74": 8.91241,
+            "75": 9.40905,
+            "76": 9.06617,
+            "77": 10.05809,
+            "78": 9.72194,
+            "79": 9.36927,
+            "80": 9.40029,
+            "81": 9.47702,
+            "82": 9.69787,
+            "83": 9.30742,
+            "84": 9.41492,
+            "85": 9.61113,
+            "86": 9.07103,
+            "87": 9.5961,
+            "88": 9.74909,
+            "89": 9.59604,
+            "90": 9.82722,
+            "91": 9.33657,
+            "92": 9.35582,
+            "93": 9.08689,
+            "94": 8.82754,
+            "95": 9.53065,
+            "96": 9.5276,
+            "97": 9.30672,
+            "98": 9.66905,
+            "99": 8.89635,
+            "100": 9.40525
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2552.0,
+            "52": 2565.0,
+            "53": 2883.0,
+            "54": 2710.0,
+            "55": 2301.0,
+            "56": 2798.0,
+            "57": 2334.0,
+            "58": 2979.0,
+            "59": 2960.0,
+            "60": 2451.0,
+            "61": 2841.0,
+            "62": 2577.0,
+            "63": 2516.0,
+            "64": 2907.0,
+            "65": 2567.0,
+            "66": 2862.0,
+            "67": 2809.0,
+            "68": 2609.0,
+            "69": 2965.0,
+            "70": 2985.0,
+            "71": 2864.0,
+            "72": 2613.0,
+            "73": 3108.0,
+            "74": 2048.0,
+            "75": 2563.0,
+            "76": 3046.0,
+            "77": 3127.0,
+            "78": 2959.0,
+            "79": 3082.0,
+            "80": 3025.0,
+            "81": 3400.0,
+            "82": 3223.0,
+            "83": 2786.0,
+            "84": 3180.0,
+            "85": 3233.0,
+            "86": 2611.0,
+            "87": 3542.0,
+            "88": 3084.0,
+            "89": 3210.0,
+            "90": 3271.0,
+            "91": 2770.0,
+            "92": 3220.0,
+            "93": 2662.0,
+            "94": 3405.0,
+            "95": 3085.0,
+            "96": 3336.0,
+            "97": 3050.0,
+            "98": 3421.0,
+            "99": 3271.0,
+            "100": 3079.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 552064512.0,
+            "52": 552064512.0,
+            "53": 552064512.0,
+            "54": 552064512.0,
+            "55": 552064512.0,
+            "56": 552064512.0,
+            "57": 552064512.0,
+            "58": 552064512.0,
+            "59": 552064512.0,
+            "60": 552064512.0,
+            "61": 552064512.0,
+            "62": 552064512.0,
+            "63": 552064512.0,
+            "64": 552064512.0,
+            "65": 552064512.0,
+            "66": 552064512.0,
+            "67": 552064512.0,
+            "68": 552064512.0,
+            "69": 552064512.0,
+            "70": 552064512.0,
+            "71": 552064512.0,
+            "72": 552064512.0,
+            "73": 552064512.0,
+            "74": 552064512.0,
+            "75": 552064512.0,
+            "76": 552064512.0,
+            "77": 552064512.0,
+            "78": 552064512.0,
+            "79": 552064512.0,
+            "80": 552064512.0,
+            "81": 552064512.0,
+            "82": 552064512.0,
+            "83": 552064512.0,
+            "84": 552064512.0,
+            "85": 552064512.0,
+            "86": 552064512.0,
+            "87": 552064512.0,
+            "88": 552064512.0,
+            "89": 552064512.0,
+            "90": 552064512.0,
+            "91": 552064512.0,
+            "92": 552064512.0,
+            "93": 552064512.0,
+            "94": 552064512.0,
+            "95": 552064512.0,
+            "96": 552064512.0,
+            "97": 552064512.0,
+            "98": 552064512.0,
+            "99": 552064512.0,
+            "100": 552064512.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2470941696.0,
+            "52": 2470942208.0,
+            "53": 2470942208.0,
+            "54": 2470942208.0,
+            "55": 2470942208.0,
+            "56": 2470942208.0,
+            "57": 2470942208.0,
+            "58": 2470942208.0,
+            "59": 2470942208.0,
+            "60": 2470942208.0,
+            "61": 2470942720.0,
+            "62": 2470942720.0,
+            "63": 2470942720.0,
+            "64": 2470942720.0,
+            "65": 2470942720.0,
+            "66": 2470942720.0,
+            "67": 2470942720.0,
+            "68": 2470942720.0,
+            "69": 2470942720.0,
+            "70": 2470942720.0,
+            "71": 2470942720.0,
+            "72": 2470942720.0,
+            "73": 2470942720.0,
+            "74": 2470942720.0,
+            "75": 2470942720.0,
+            "76": 2470942720.0,
+            "77": 2470942720.0,
+            "78": 2470942720.0,
+            "79": 2470942720.0,
+            "80": 2470942720.0,
+            "81": 2470942720.0,
+            "82": 2470942720.0,
+            "83": 2470942720.0,
+            "84": 2470942720.0,
+            "85": 2470942720.0,
+            "86": 2470942720.0,
+            "87": 2470942720.0,
+            "88": 2470942720.0,
+            "89": 2470942720.0,
+            "90": 2470942720.0,
+            "91": 2470942720.0,
+            "92": 2470942720.0,
+            "93": 2470942720.0,
+            "94": 2470942720.0,
+            "95": 2470942720.0,
+            "96": 2470942720.0,
+            "97": 2470942720.0,
+            "98": 2470942720.0,
+            "99": 2470942720.0,
+            "100": 2470942720.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 3.27475,
+            "53": 0.12363,
+            "54": 0.11177,
+            "55": 0.1112,
+            "56": 0.11272,
+            "57": 0.11284,
+            "58": 0.11161,
+            "59": 0.11167,
+            "60": 0.11262,
+            "61": 0.11171,
+            "62": 0.11092,
+            "63": 0.11143,
+            "64": 0.11171,
+            "65": 0.11299,
+            "66": 0.1124,
+            "67": 0.1119,
+            "68": 0.11174,
+            "69": 0.11252,
+            "70": 0.11217,
+            "71": 0.1112,
+            "72": 0.11653,
+            "73": 0.11887,
+            "74": 0.11966,
+            "75": 0.11921,
+            "76": 0.12192,
+            "77": 0.1219,
+            "78": 0.12342,
+            "79": 0.12312,
+            "80": 0.12263,
+            "81": 0.12762,
+            "82": 0.1234,
+            "83": 0.12364,
+            "84": 0.12458,
+            "85": 0.12385,
+            "86": 0.12395,
+            "87": 0.12307,
+            "88": 0.12362,
+            "89": 0.12421,
+            "90": 0.12452,
+            "91": 0.12623,
+            "92": 0.1253,
+            "93": 0.12482,
+            "94": 0.12453,
+            "95": 0.12892,
+            "96": 0.13902,
+            "97": 0.12489,
+            "98": 0.12331,
+            "99": 0.12522,
+            "100": 0.12499
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
index acadb81abbe..82352c11781 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 551269888.0,
-            "2": 551269888.0,
-            "3": 551269888.0,
-            "4": 552318464.0,
-            "5": 551269888.0,
-            "6": 551269888.0,
-            "7": 551269888.0,
-            "8": 551269888.0,
-            "9": 551269888.0,
-            "10": 551269888.0,
-            "11": 551269888.0,
-            "12": 551269888.0,
-            "13": 551269888.0,
-            "14": 551269888.0,
-            "15": 551269888.0,
-            "16": 551269888.0,
-            "17": 551269888.0,
-            "18": 551269888.0,
-            "19": 551269888.0,
-            "20": 551269888.0,
-            "21": 551269888.0,
-            "22": 551269888.0,
-            "23": 551269888.0,
-            "24": 551269888.0,
-            "25": 551269888.0,
-            "26": 551269888.0,
-            "27": 551269888.0,
-            "28": 551269888.0,
-            "29": 551269888.0,
-            "30": 551269888.0,
-            "31": 551269888.0,
-            "32": 551269888.0,
-            "33": 551269888.0,
-            "34": 551269888.0,
-            "35": 551269888.0,
-            "36": 551269888.0,
-            "37": 551269888.0,
-            "38": 551269888.0,
-            "39": 551269888.0,
-            "40": 551269888.0,
-            "41": 551269888.0,
-            "42": 551269888.0,
-            "43": 551269888.0,
-            "44": 551269888.0,
-            "45": 551269888.0,
-            "46": 551269888.0,
-            "47": 551269888.0,
-            "48": 551269888.0,
-            "49": 551269888.0,
-            "50": 551269888.0,
-            "51": 551269888.0,
-            "52": 551269888.0,
-            "53": 551269888.0,
-            "54": 551269888.0,
-            "55": 551269888.0,
-            "56": 551269888.0,
-            "57": 551269888.0,
-            "58": 551269888.0,
-            "59": 551269888.0,
-            "60": 551269888.0,
-            "61": 551269888.0,
-            "62": 551269888.0,
-            "63": 551269888.0,
-            "64": 551269888.0,
-            "65": 551269888.0,
-            "66": 551269888.0,
-            "67": 551269888.0,
-            "68": 551269888.0,
-            "69": 551269888.0,
-            "70": 551269888.0,
-            "71": 551269888.0,
-            "72": 551269888.0,
-            "73": 551269888.0,
-            "74": 551269888.0,
-            "75": 551269888.0,
-            "76": 551269888.0,
-            "77": 551269888.0,
-            "78": 551269888.0,
-            "79": 551269888.0,
-            "80": 551269888.0,
-            "81": 551269888.0,
-            "82": 551269888.0,
-            "83": 551269888.0,
-            "84": 551269888.0,
-            "85": 551269888.0,
-            "86": 551269888.0,
-            "87": 551269888.0,
-            "88": 551269888.0,
-            "89": 551269888.0,
-            "90": 551269888.0,
-            "91": 551269888.0,
-            "92": 551269888.0,
-            "93": 551269888.0,
-            "94": 551269888.0,
-            "95": 551269888.0,
-            "96": 551269888.0,
-            "97": 551269888.0,
-            "98": 551269888.0,
-            "99": 551269888.0,
-            "100": 551269888.0
+            "1": 551278080.0,
+            "2": 551278080.0,
+            "3": 551278080.0,
+            "4": 551278080.0,
+            "5": 551278080.0,
+            "6": 551278080.0,
+            "7": 551278080.0,
+            "8": 551278080.0,
+            "9": 551278080.0,
+            "10": 551278080.0,
+            "11": 551278080.0,
+            "12": 551278080.0,
+            "13": 551278080.0,
+            "14": 551278080.0,
+            "15": 551278080.0,
+            "16": 551278080.0,
+            "17": 551278080.0,
+            "18": 551278080.0,
+            "19": 551278080.0,
+            "20": 551278080.0,
+            "21": 551278080.0,
+            "22": 551278080.0,
+            "23": 551278080.0,
+            "24": 551278080.0,
+            "25": 551278080.0,
+            "26": 551278080.0,
+            "27": 551278080.0,
+            "28": 551278080.0,
+            "29": 551278080.0,
+            "30": 551278080.0,
+            "31": 551278080.0,
+            "32": 551278080.0,
+            "33": 551278080.0,
+            "34": 551278080.0,
+            "35": 551278080.0,
+            "36": 551278080.0,
+            "37": 551278080.0,
+            "38": 551278080.0,
+            "39": 551278080.0,
+            "40": 551278080.0,
+            "41": 551278080.0,
+            "42": 551278080.0,
+            "43": 551278080.0,
+            "44": 551278080.0,
+            "45": 551278080.0,
+            "46": 551278080.0,
+            "47": 551278080.0,
+            "48": 551278080.0,
+            "49": 551278080.0,
+            "50": 551278080.0,
+            "51": 551278080.0,
+            "52": 551278080.0,
+            "53": 551278080.0,
+            "54": 551278080.0,
+            "55": 551278080.0,
+            "56": 551278080.0,
+            "57": 551278080.0,
+            "58": 551278080.0,
+            "59": 551278080.0,
+            "60": 551278080.0,
+            "61": 551278080.0,
+            "62": 551278080.0,
+            "63": 551278080.0,
+            "64": 551278080.0,
+            "65": 551278080.0,
+            "66": 551278080.0,
+            "67": 551278080.0,
+            "68": 551278080.0,
+            "69": 551278080.0,
+            "70": 551278080.0,
+            "71": 551278080.0,
+            "72": 551278080.0,
+            "73": 551278080.0,
+            "74": 551278080.0,
+            "75": 551278080.0,
+            "76": 551278080.0,
+            "77": 551278080.0,
+            "78": 551278080.0,
+            "79": 551278080.0,
+            "80": 551278080.0,
+            "81": 551278080.0,
+            "82": 551278080.0,
+            "83": 551278080.0,
+            "84": 551278080.0,
+            "85": 551278080.0,
+            "86": 551278080.0,
+            "87": 551278080.0,
+            "88": 551278080.0,
+            "89": 551278080.0,
+            "90": 551278080.0,
+            "91": 551278080.0,
+            "92": 551278080.0,
+            "93": 551278080.0,
+            "94": 551278080.0,
+            "95": 551278080.0,
+            "96": 551278080.0,
+            "97": 551278080.0,
+            "98": 551278080.0,
+            "99": 551278080.0,
+            "100": 551278080.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2290489344.0,
-            "2": 2432397312.0,
-            "3": 2432397312.0,
-            "4": 2432397312.0,
-            "5": 2432397312.0,
-            "6": 2432397312.0,
-            "7": 2432397312.0,
-            "8": 2432397312.0,
-            "9": 2432397312.0,
-            "10": 2432397312.0,
-            "11": 2432397312.0,
-            "12": 2432397312.0,
-            "13": 2432397312.0,
-            "14": 2432397312.0,
-            "15": 2432397312.0,
-            "16": 2432397312.0,
-            "17": 2432397312.0,
-            "18": 2432397312.0,
-            "19": 2432397312.0,
-            "20": 2432397312.0,
-            "21": 2432397312.0,
-            "22": 2432397312.0,
-            "23": 2432397312.0,
-            "24": 2432397312.0,
-            "25": 2432397312.0,
-            "26": 2432397312.0,
-            "27": 2432397312.0,
-            "28": 2432397312.0,
-            "29": 2432397312.0,
-            "30": 2432397312.0,
-            "31": 2432397312.0,
-            "32": 2432397312.0,
-            "33": 2432397312.0,
-            "34": 2432397312.0,
-            "35": 2432397312.0,
-            "36": 2432397312.0,
-            "37": 2432397312.0,
-            "38": 2432397312.0,
-            "39": 2432397312.0,
-            "40": 2432397312.0,
-            "41": 2432397312.0,
-            "42": 2432397312.0,
-            "43": 2432397312.0,
-            "44": 2432397312.0,
-            "45": 2432397312.0,
-            "46": 2432397312.0,
-            "47": 2432397312.0,
-            "48": 2432397312.0,
-            "49": 2432397312.0,
-            "50": 2432397312.0,
-            "51": 2432397312.0,
-            "52": 2432397312.0,
-            "53": 2432397312.0,
-            "54": 2432397312.0,
-            "55": 2432397312.0,
-            "56": 2432397312.0,
-            "57": 2432397312.0,
-            "58": 2432397312.0,
-            "59": 2432397312.0,
-            "60": 2432397312.0,
-            "61": 2432397312.0,
-            "62": 2432397312.0,
-            "63": 2432397312.0,
-            "64": 2432397312.0,
-            "65": 2432397312.0,
-            "66": 2432397312.0,
-            "67": 2432397312.0,
-            "68": 2432397312.0,
-            "69": 2432397312.0,
-            "70": 2432397312.0,
-            "71": 2432397312.0,
-            "72": 2432397312.0,
-            "73": 2432397312.0,
-            "74": 2432397312.0,
-            "75": 2432397312.0,
-            "76": 2432397312.0,
-            "77": 2432397312.0,
-            "78": 2432397312.0,
-            "79": 2432397312.0,
-            "80": 2432397312.0,
-            "81": 2432397312.0,
-            "82": 2432397312.0,
-            "83": 2432397312.0,
-            "84": 2432397312.0,
-            "85": 2432397312.0,
-            "86": 2432397312.0,
-            "87": 2432397312.0,
-            "88": 2432397312.0,
-            "89": 2432397312.0,
-            "90": 2432397312.0,
-            "91": 2432397312.0,
-            "92": 2432397312.0,
-            "93": 2432397312.0,
-            "94": 2432397312.0,
-            "95": 2432397312.0,
-            "96": 2432397312.0,
-            "97": 2432397312.0,
-            "98": 2432397312.0,
-            "99": 2432397312.0,
-            "100": 2432397312.0
+            "1": 2289441792.0,
+            "2": 2432405504.0,
+            "3": 2432405504.0,
+            "4": 2432405504.0,
+            "5": 2432405504.0,
+            "6": 2432405504.0,
+            "7": 2432405504.0,
+            "8": 2432405504.0,
+            "9": 2432405504.0,
+            "10": 2432405504.0,
+            "11": 2432405504.0,
+            "12": 2432405504.0,
+            "13": 2432405504.0,
+            "14": 2432405504.0,
+            "15": 2432405504.0,
+            "16": 2432405504.0,
+            "17": 2432405504.0,
+            "18": 2432405504.0,
+            "19": 2432405504.0,
+            "20": 2432405504.0,
+            "21": 2432405504.0,
+            "22": 2432405504.0,
+            "23": 2432405504.0,
+            "24": 2432405504.0,
+            "25": 2432405504.0,
+            "26": 2432405504.0,
+            "27": 2432405504.0,
+            "28": 2432405504.0,
+            "29": 2432405504.0,
+            "30": 2432405504.0,
+            "31": 2432405504.0,
+            "32": 2432405504.0,
+            "33": 2432405504.0,
+            "34": 2432405504.0,
+            "35": 2432405504.0,
+            "36": 2432405504.0,
+            "37": 2432405504.0,
+            "38": 2432405504.0,
+            "39": 2432405504.0,
+            "40": 2432405504.0,
+            "41": 2432405504.0,
+            "42": 2432405504.0,
+            "43": 2432405504.0,
+            "44": 2432405504.0,
+            "45": 2432405504.0,
+            "46": 2432405504.0,
+            "47": 2432405504.0,
+            "48": 2432405504.0,
+            "49": 2432405504.0,
+            "50": 2432405504.0,
+            "51": 2432405504.0,
+            "52": 2432405504.0,
+            "53": 2432405504.0,
+            "54": 2432405504.0,
+            "55": 2432405504.0,
+            "56": 2432405504.0,
+            "57": 2432405504.0,
+            "58": 2432405504.0,
+            "59": 2432405504.0,
+            "60": 2432405504.0,
+            "61": 2432405504.0,
+            "62": 2432405504.0,
+            "63": 2432405504.0,
+            "64": 2432405504.0,
+            "65": 2432405504.0,
+            "66": 2432405504.0,
+            "67": 2432405504.0,
+            "68": 2432405504.0,
+            "69": 2432405504.0,
+            "70": 2432405504.0,
+            "71": 2432405504.0,
+            "72": 2432405504.0,
+            "73": 2432405504.0,
+            "74": 2432405504.0,
+            "75": 2432405504.0,
+            "76": 2432405504.0,
+            "77": 2432405504.0,
+            "78": 2432405504.0,
+            "79": 2432405504.0,
+            "80": 2432405504.0,
+            "81": 2432405504.0,
+            "82": 2432405504.0,
+            "83": 2432405504.0,
+            "84": 2432405504.0,
+            "85": 2432405504.0,
+            "86": 2432405504.0,
+            "87": 2432405504.0,
+            "88": 2432405504.0,
+            "89": 2432405504.0,
+            "90": 2432405504.0,
+            "91": 2432405504.0,
+            "92": 2432405504.0,
+            "93": 2432405504.0,
+            "94": 2432405504.0,
+            "95": 2432405504.0,
+            "96": 2432405504.0,
+            "97": 2432405504.0,
+            "98": 2432405504.0,
+            "99": 2432405504.0,
+            "100": 2432405504.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 17.61957,
-            "2": 0.12347,
-            "3": 0.11094,
-            "4": 0.11482,
-            "5": 0.11141,
-            "6": 0.10928,
-            "7": 0.10905,
-            "8": 0.11026,
-            "9": 0.11003,
-            "10": 0.11095,
-            "11": 0.11002,
-            "12": 0.1122,
-            "13": 0.11472,
-            "14": 0.11511,
-            "15": 0.11073,
-            "16": 0.11228,
-            "17": 0.11342,
-            "18": 0.11197,
-            "19": 0.11062,
-            "20": 0.11097,
-            "21": 0.11081,
-            "22": 0.11379,
-            "23": 0.10968,
-            "24": 0.11083,
-            "25": 0.11649,
-            "26": 0.11043,
-            "27": 0.11175,
-            "28": 0.11122,
-            "29": 0.11218,
-            "30": 0.11261,
-            "31": 0.11314,
-            "32": 0.10971,
-            "33": 0.11028,
-            "34": 0.11149,
-            "35": 0.11122,
-            "36": 0.11079,
-            "37": 0.11188,
-            "38": 0.1115,
-            "39": 0.11238,
-            "40": 0.11528,
-            "41": 0.11165,
-            "42": 0.11137,
-            "43": 0.11139,
-            "44": 0.11074,
-            "45": 0.11141,
-            "46": 0.11158,
-            "47": 0.1105,
-            "48": 0.11128,
-            "49": 0.11164,
-            "50": 0.11572,
-            "51": 0.11625,
-            "52": 0.10969,
-            "53": 0.10904,
-            "54": 0.1098,
-            "55": 0.10896,
-            "56": 0.11225,
-            "57": 0.11301,
-            "58": 0.11047,
-            "59": 0.10959,
-            "60": 0.11005,
-            "61": 0.11018,
-            "62": 0.10831,
-            "63": 0.10997,
-            "64": 0.10896,
-            "65": 0.11116,
-            "66": 0.11148,
-            "67": 0.1092,
-            "68": 0.10947,
-            "69": 0.10933,
-            "70": 0.10869,
-            "71": 0.10873,
-            "72": 0.10849,
-            "73": 0.10872,
-            "74": 0.10951,
-            "75": 0.1119,
-            "76": 0.1109,
-            "77": 0.10896,
-            "78": 0.10963,
-            "79": 0.11057,
-            "80": 0.10858,
-            "81": 0.10732,
-            "82": 0.10824,
-            "83": 0.11006,
-            "84": 0.11062,
-            "85": 0.1096,
-            "86": 0.10933,
-            "87": 0.11001,
-            "88": 0.11053,
-            "89": 0.10899,
-            "90": 0.10989,
-            "91": 0.10903,
-            "92": 0.10959,
-            "93": 0.11185,
-            "94": 0.11166,
-            "95": 0.11067,
-            "96": 0.11183,
-            "97": 0.11136,
-            "98": 0.11022,
-            "99": 0.11091,
-            "100": 0.10951
+            "1": 12.06542,
+            "2": 0.1206,
+            "3": 0.10179,
+            "4": 0.08257,
+            "5": 0.08196,
+            "6": 0.08184,
+            "7": 0.08247,
+            "8": 0.08147,
+            "9": 0.08127,
+            "10": 0.08228,
+            "11": 0.0839,
+            "12": 0.08236,
+            "13": 0.08232,
+            "14": 0.08218,
+            "15": 0.08336,
+            "16": 0.08213,
+            "17": 0.08296,
+            "18": 0.0816,
+            "19": 0.08269,
+            "20": 0.08138,
+            "21": 0.08303,
+            "22": 0.08243,
+            "23": 0.08357,
+            "24": 0.08151,
+            "25": 0.08392,
+            "26": 0.08247,
+            "27": 0.08229,
+            "28": 0.08279,
+            "29": 0.08232,
+            "30": 0.0824,
+            "31": 0.08146,
+            "32": 0.08912,
+            "33": 0.08386,
+            "34": 0.08198,
+            "35": 0.08188,
+            "36": 0.08394,
+            "37": 0.08154,
+            "38": 0.08111,
+            "39": 0.08175,
+            "40": 0.08143,
+            "41": 0.08312,
+            "42": 0.08219,
+            "43": 0.08218,
+            "44": 0.08316,
+            "45": 0.08162,
+            "46": 0.08265,
+            "47": 0.08169,
+            "48": 0.08346,
+            "49": 0.08176,
+            "50": 0.08213,
+            "51": 0.09096,
+            "52": 0.08501,
+            "53": 0.08473,
+            "54": 0.08165,
+            "55": 0.08129,
+            "56": 0.08244,
+            "57": 0.08158,
+            "58": 0.08104,
+            "59": 0.08185,
+            "60": 0.0834,
+            "61": 0.08139,
+            "62": 0.08134,
+            "63": 0.086,
+            "64": 0.08155,
+            "65": 0.08326,
+            "66": 0.08135,
+            "67": 0.08434,
+            "68": 0.0817,
+            "69": 0.08297,
+            "70": 0.08039,
+            "71": 0.0801,
+            "72": 0.07962,
+            "73": 0.07979,
+            "74": 0.08099,
+            "75": 0.08004,
+            "76": 0.07961,
+            "77": 0.07959,
+            "78": 0.08021,
+            "79": 0.08102,
+            "80": 0.07949,
+            "81": 0.08018,
+            "82": 0.08014,
+            "83": 0.07929,
+            "84": 0.07992,
+            "85": 0.07982,
+            "86": 0.08024,
+            "87": 0.08054,
+            "88": 0.08161,
+            "89": 0.08084,
+            "90": 0.08079,
+            "91": 0.08239,
+            "92": 0.08091,
+            "93": 0.07966,
+            "94": 0.08301,
+            "95": 0.08124,
+            "96": 0.08066,
+            "97": 0.08098,
+            "98": 0.08072,
+            "99": 0.08164,
+            "100": 0.08106
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..490e22e59f4
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85868,
+            "52": 9.74284,
+            "53": 10.06645,
+            "54": 9.95167,
+            "55": 9.88096,
+            "56": 9.62626,
+            "57": 9.47768,
+            "58": 9.83346,
+            "59": 9.58526,
+            "60": 9.50125,
+            "61": 9.69182,
+            "62": 9.98853,
+            "63": 9.38476,
+            "64": 9.7803,
+            "65": 8.94762,
+            "66": 9.70856,
+            "67": 9.36852,
+            "68": 9.78439,
+            "69": 9.79406,
+            "70": 9.74241,
+            "71": 9.61808,
+            "72": 9.58428,
+            "73": 9.5035,
+            "74": 8.94221,
+            "75": 9.42529,
+            "76": 9.07408,
+            "77": 10.06351,
+            "78": 9.7208,
+            "79": 9.37294,
+            "80": 9.40396,
+            "81": 9.48168,
+            "82": 9.69778,
+            "83": 9.30714,
+            "84": 9.41712,
+            "85": 9.61407,
+            "86": 9.07615,
+            "87": 9.59094,
+            "88": 9.74641,
+            "89": 9.59993,
+            "90": 9.8142,
+            "91": 9.33773,
+            "92": 9.35373,
+            "93": 9.07395,
+            "94": 8.83173,
+            "95": 9.51734,
+            "96": 9.52415,
+            "97": 9.30995,
+            "98": 9.66805,
+            "99": 8.88588,
+            "100": 9.39538
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2607.0,
+            "52": 2618.0,
+            "53": 2828.0,
+            "54": 2730.0,
+            "55": 2351.0,
+            "56": 2753.0,
+            "57": 2323.0,
+            "58": 2809.0,
+            "59": 2721.0,
+            "60": 2440.0,
+            "61": 2875.0,
+            "62": 2726.0,
+            "63": 2444.0,
+            "64": 3001.0,
+            "65": 2602.0,
+            "66": 2981.0,
+            "67": 2676.0,
+            "68": 2623.0,
+            "69": 2802.0,
+            "70": 3234.0,
+            "71": 2902.0,
+            "72": 2337.0,
+            "73": 2856.0,
+            "74": 1903.0,
+            "75": 2388.0,
+            "76": 3118.0,
+            "77": 3108.0,
+            "78": 3122.0,
+            "79": 2994.0,
+            "80": 3186.0,
+            "81": 3470.0,
+            "82": 3164.0,
+            "83": 2726.0,
+            "84": 3214.0,
+            "85": 3262.0,
+            "86": 2602.0,
+            "87": 3658.0,
+            "88": 2906.0,
+            "89": 3054.0,
+            "90": 3018.0,
+            "91": 2690.0,
+            "92": 3106.0,
+            "93": 2701.0,
+            "94": 3263.0,
+            "95": 3426.0,
+            "96": 3405.0,
+            "97": 3087.0,
+            "98": 3510.0,
+            "99": 3148.0,
+            "100": 3204.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 694779392.0,
+            "52": 694779392.0,
+            "53": 694779392.0,
+            "54": 694779392.0,
+            "55": 694779392.0,
+            "56": 694779392.0,
+            "57": 694779392.0,
+            "58": 694779392.0,
+            "59": 694779392.0,
+            "60": 694779392.0,
+            "61": 694779392.0,
+            "62": 694779392.0,
+            "63": 694779392.0,
+            "64": 694779392.0,
+            "65": 694779392.0,
+            "66": 694779392.0,
+            "67": 694779392.0,
+            "68": 694779392.0,
+            "69": 694779392.0,
+            "70": 694779392.0,
+            "71": 694779392.0,
+            "72": 694779392.0,
+            "73": 694779392.0,
+            "74": 694779392.0,
+            "75": 694779392.0,
+            "76": 694779392.0,
+            "77": 694779392.0,
+            "78": 694779392.0,
+            "79": 694779392.0,
+            "80": 694779392.0,
+            "81": 694779392.0,
+            "82": 694779392.0,
+            "83": 694779392.0,
+            "84": 694779392.0,
+            "85": 694779392.0,
+            "86": 694779392.0,
+            "87": 694779392.0,
+            "88": 694779392.0,
+            "89": 694779392.0,
+            "90": 694779392.0,
+            "91": 694779392.0,
+            "92": 694779392.0,
+            "93": 694779392.0,
+            "94": 694779392.0,
+            "95": 694779392.0,
+            "96": 694779392.0,
+            "97": 694779392.0,
+            "98": 694779392.0,
+            "99": 694779392.0,
+            "100": 694779392.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2574857216.0,
+            "52": 2574858240.0,
+            "53": 2574858240.0,
+            "54": 2574858240.0,
+            "55": 2574858240.0,
+            "56": 2574858240.0,
+            "57": 2574858240.0,
+            "58": 2574858240.0,
+            "59": 2574858240.0,
+            "60": 2574858240.0,
+            "61": 2574858240.0,
+            "62": 2574858240.0,
+            "63": 2574858240.0,
+            "64": 2574858240.0,
+            "65": 2574858240.0,
+            "66": 2574858240.0,
+            "67": 2574858240.0,
+            "68": 2574858240.0,
+            "69": 2574858240.0,
+            "70": 2574858240.0,
+            "71": 2574858240.0,
+            "72": 2574858240.0,
+            "73": 2574858240.0,
+            "74": 2574858240.0,
+            "75": 2574858240.0,
+            "76": 2574858240.0,
+            "77": 2574858240.0,
+            "78": 2574858240.0,
+            "79": 2574858240.0,
+            "80": 2574858240.0,
+            "81": 2574858240.0,
+            "82": 2574858240.0,
+            "83": 2574858240.0,
+            "84": 2574858240.0,
+            "85": 2574858240.0,
+            "86": 2574858240.0,
+            "87": 2574858240.0,
+            "88": 2574858240.0,
+            "89": 2574858240.0,
+            "90": 2574858240.0,
+            "91": 2574858240.0,
+            "92": 2574858240.0,
+            "93": 2574858240.0,
+            "94": 2574858240.0,
+            "95": 2574858240.0,
+            "96": 2574858240.0,
+            "97": 2574858240.0,
+            "98": 2574858240.0,
+            "99": 2574858240.0,
+            "100": 2574858240.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 11.89299,
+            "52": 0.11287,
+            "53": 0.08679,
+            "54": 0.08602,
+            "55": 0.0852,
+            "56": 0.08169,
+            "57": 0.08199,
+            "58": 0.08035,
+            "59": 0.07992,
+            "60": 0.08061,
+            "61": 0.0805,
+            "62": 0.08001,
+            "63": 0.08077,
+            "64": 0.08064,
+            "65": 0.08121,
+            "66": 0.08051,
+            "67": 0.08071,
+            "68": 0.08067,
+            "69": 0.08042,
+            "70": 0.08041,
+            "71": 0.0815,
+            "72": 0.08101,
+            "73": 0.08129,
+            "74": 0.08058,
+            "75": 0.08105,
+            "76": 0.08085,
+            "77": 0.08323,
+            "78": 0.08354,
+            "79": 0.08364,
+            "80": 0.08354,
+            "81": 0.08367,
+            "82": 0.08118,
+            "83": 0.08169,
+            "84": 0.08345,
+            "85": 0.08141,
+            "86": 0.08179,
+            "87": 0.08142,
+            "88": 0.0817,
+            "89": 0.08146,
+            "90": 0.50232,
+            "91": 0.08211,
+            "92": 0.08131,
+            "93": 0.08164,
+            "94": 0.08213,
+            "95": 0.08221,
+            "96": 0.08288,
+            "97": 0.08215,
+            "98": 0.08186,
+            "99": 0.08239,
+            "100": 0.08223
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
index 5d20ab395ec..691a79fb9b0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
@@ -325,7 +325,7 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2297829376.0,
+            "1": 2298877952.0,
             "2": 2439228416.0,
             "3": 2439228416.0,
             "4": 2439228416.0,
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.8604,
-            "2": 0.16953,
-            "3": 0.13987,
-            "4": 0.13824,
-            "5": 0.13775,
-            "6": 0.13549,
-            "7": 0.13611,
-            "8": 0.13584,
-            "9": 0.13626,
-            "10": 0.13922,
-            "11": 0.13526,
-            "12": 0.13455,
-            "13": 0.13222,
-            "14": 0.13324,
-            "15": 0.1325,
-            "16": 0.13211,
-            "17": 0.13198,
-            "18": 0.13145,
-            "19": 0.13207,
-            "20": 0.13182,
-            "21": 0.13297,
-            "22": 0.1322,
-            "23": 0.13275,
-            "24": 0.1319,
-            "25": 0.13822,
-            "26": 0.13214,
-            "27": 0.13169,
-            "28": 0.13196,
-            "29": 0.13229,
-            "30": 0.13285,
-            "31": 0.13112,
-            "32": 0.13222,
-            "33": 0.13056,
-            "34": 0.13076,
-            "35": 0.13218,
-            "36": 0.13126,
-            "37": 0.13091,
-            "38": 0.13048,
-            "39": 0.13082,
-            "40": 0.1308,
-            "41": 0.13202,
-            "42": 0.1314,
-            "43": 0.13222,
-            "44": 0.13074,
-            "45": 0.13237,
-            "46": 0.13272,
-            "47": 0.13239,
-            "48": 0.13266,
-            "49": 0.13226,
-            "50": 0.13164,
-            "51": 0.13425,
-            "52": 0.13044,
-            "53": 0.13037,
-            "54": 0.13007,
-            "55": 0.1301,
-            "56": 0.13001,
-            "57": 0.13054,
-            "58": 0.12972,
-            "59": 0.13049,
-            "60": 0.13042,
-            "61": 0.12903,
-            "62": 0.13042,
-            "63": 0.13104,
-            "64": 0.13008,
-            "65": 0.13158,
-            "66": 0.13091,
-            "67": 0.13089,
-            "68": 0.13084,
-            "69": 0.12903,
-            "70": 0.13015,
-            "71": 0.12957,
-            "72": 0.12997,
-            "73": 0.13025,
-            "74": 0.12989,
-            "75": 0.13018,
-            "76": 0.12962,
-            "77": 0.13065,
-            "78": 0.12915,
-            "79": 0.13007,
-            "80": 0.12972,
-            "81": 0.1301,
-            "82": 0.12927,
-            "83": 0.1302,
-            "84": 0.12991,
-            "85": 0.13129,
-            "86": 0.13063,
-            "87": 0.13028,
-            "88": 0.1305,
-            "89": 0.13046,
-            "90": 0.12991,
-            "91": 0.13058,
-            "92": 0.13044,
-            "93": 0.13009,
-            "94": 0.1306,
-            "95": 0.13082,
-            "96": 0.13068,
-            "97": 0.13403,
-            "98": 0.13199,
-            "99": 0.13191,
-            "100": 0.13014
+            "1": 5.78436,
+            "2": 0.15737,
+            "3": 0.15175,
+            "4": 0.13338,
+            "5": 0.13371,
+            "6": 0.13122,
+            "7": 0.13094,
+            "8": 0.13089,
+            "9": 0.13127,
+            "10": 0.1325,
+            "11": 0.13263,
+            "12": 0.13197,
+            "13": 0.1321,
+            "14": 0.13177,
+            "15": 0.13107,
+            "16": 0.13105,
+            "17": 0.13225,
+            "18": 0.13154,
+            "19": 0.13094,
+            "20": 0.13082,
+            "21": 0.13074,
+            "22": 0.13108,
+            "23": 0.13092,
+            "24": 0.13137,
+            "25": 0.13097,
+            "26": 0.13061,
+            "27": 0.13081,
+            "28": 0.13087,
+            "29": 0.13114,
+            "30": 0.1316,
+            "31": 0.13201,
+            "32": 0.13122,
+            "33": 0.13114,
+            "34": 0.13117,
+            "35": 0.13149,
+            "36": 0.13065,
+            "37": 0.13085,
+            "38": 0.13105,
+            "39": 0.13143,
+            "40": 0.13125,
+            "41": 0.13337,
+            "42": 0.13078,
+            "43": 0.13258,
+            "44": 0.13138,
+            "45": 0.13103,
+            "46": 0.13168,
+            "47": 0.13123,
+            "48": 0.13091,
+            "49": 0.13137,
+            "50": 0.13118,
+            "51": 0.13768,
+            "52": 0.13317,
+            "53": 0.1336,
+            "54": 0.1328,
+            "55": 0.13244,
+            "56": 0.13289,
+            "57": 0.13268,
+            "58": 0.13228,
+            "59": 0.13233,
+            "60": 0.13203,
+            "61": 0.13361,
+            "62": 0.13211,
+            "63": 0.13195,
+            "64": 0.13158,
+            "65": 0.13275,
+            "66": 0.13199,
+            "67": 0.13166,
+            "68": 0.13257,
+            "69": 0.13175,
+            "70": 0.13157,
+            "71": 0.13714,
+            "72": 0.13192,
+            "73": 0.13291,
+            "74": 0.13314,
+            "75": 0.13276,
+            "76": 0.13221,
+            "77": 0.13203,
+            "78": 0.13255,
+            "79": 0.13169,
+            "80": 0.13279,
+            "81": 0.13297,
+            "82": 0.13191,
+            "83": 0.13163,
+            "84": 0.13271,
+            "85": 0.13215,
+            "86": 0.13225,
+            "87": 0.13265,
+            "88": 0.13135,
+            "89": 0.13216,
+            "90": 0.13163,
+            "91": 0.1317,
+            "92": 0.13178,
+            "93": 0.13167,
+            "94": 0.13291,
+            "95": 0.13256,
+            "96": 0.13258,
+            "97": 0.13202,
+            "98": 0.13253,
+            "99": 0.13337,
+            "100": 0.13354
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..56ff788b9ee
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.83548,
+            "52": 9.72518,
+            "53": 10.04799,
+            "54": 9.93007,
+            "55": 9.86362,
+            "56": 9.60218,
+            "57": 9.45185,
+            "58": 9.80781,
+            "59": 9.56786,
+            "60": 9.47966,
+            "61": 9.67985,
+            "62": 9.9675,
+            "63": 9.35111,
+            "64": 9.75622,
+            "65": 8.93178,
+            "66": 9.68108,
+            "67": 9.35959,
+            "68": 9.76948,
+            "69": 9.77494,
+            "70": 9.71179,
+            "71": 9.60631,
+            "72": 9.57134,
+            "73": 9.48393,
+            "74": 8.92913,
+            "75": 9.4003,
+            "76": 9.07189,
+            "77": 10.05248,
+            "78": 9.71492,
+            "79": 9.35744,
+            "80": 9.38946,
+            "81": 9.46798,
+            "82": 9.68509,
+            "83": 9.29591,
+            "84": 9.40521,
+            "85": 9.60161,
+            "86": 9.06713,
+            "87": 9.58406,
+            "88": 9.73301,
+            "89": 9.59528,
+            "90": 9.80559,
+            "91": 9.32603,
+            "92": 9.3532,
+            "93": 9.06916,
+            "94": 8.82266,
+            "95": 9.50858,
+            "96": 9.51587,
+            "97": 9.29763,
+            "98": 9.66187,
+            "99": 8.87661,
+            "100": 9.39222
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2554.0,
+            "52": 2619.0,
+            "53": 2863.0,
+            "54": 2688.0,
+            "55": 2406.0,
+            "56": 2649.0,
+            "57": 2175.0,
+            "58": 2856.0,
+            "59": 2775.0,
+            "60": 2307.0,
+            "61": 2914.0,
+            "62": 2644.0,
+            "63": 2362.0,
+            "64": 2946.0,
+            "65": 2578.0,
+            "66": 3122.0,
+            "67": 2697.0,
+            "68": 2687.0,
+            "69": 2956.0,
+            "70": 3157.0,
+            "71": 3028.0,
+            "72": 2294.0,
+            "73": 2876.0,
+            "74": 1887.0,
+            "75": 2523.0,
+            "76": 2937.0,
+            "77": 3162.0,
+            "78": 3318.0,
+            "79": 3074.0,
+            "80": 3213.0,
+            "81": 3664.0,
+            "82": 3238.0,
+            "83": 2838.0,
+            "84": 3251.0,
+            "85": 3275.0,
+            "86": 2748.0,
+            "87": 3758.0,
+            "88": 3023.0,
+            "89": 3267.0,
+            "90": 3085.0,
+            "91": 2812.0,
+            "92": 3116.0,
+            "93": 2665.0,
+            "94": 3380.0,
+            "95": 3236.0,
+            "96": 3462.0,
+            "97": 3002.0,
+            "98": 3545.0,
+            "99": 3265.0,
+            "100": 3458.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 665418240.0,
+            "52": 665418240.0,
+            "53": 665418240.0,
+            "54": 665418240.0,
+            "55": 665418240.0,
+            "56": 665418240.0,
+            "57": 665418240.0,
+            "58": 665418240.0,
+            "59": 665418240.0,
+            "60": 665418240.0,
+            "61": 665418240.0,
+            "62": 665418240.0,
+            "63": 665418240.0,
+            "64": 665418240.0,
+            "65": 665418240.0,
+            "66": 665418240.0,
+            "67": 665418240.0,
+            "68": 665418240.0,
+            "69": 665418240.0,
+            "70": 665418240.0,
+            "71": 665418240.0,
+            "72": 665418240.0,
+            "73": 665418240.0,
+            "74": 665418240.0,
+            "75": 665418240.0,
+            "76": 665418240.0,
+            "77": 665418240.0,
+            "78": 665418240.0,
+            "79": 665418240.0,
+            "80": 665418240.0,
+            "81": 665418240.0,
+            "82": 665418240.0,
+            "83": 665418240.0,
+            "84": 665418240.0,
+            "85": 665418240.0,
+            "86": 665418240.0,
+            "87": 665418240.0,
+            "88": 665418240.0,
+            "89": 665418240.0,
+            "90": 665418240.0,
+            "91": 665418240.0,
+            "92": 665418240.0,
+            "93": 665418240.0,
+            "94": 665418240.0,
+            "95": 665418240.0,
+            "96": 665418240.0,
+            "97": 665418240.0,
+            "98": 665418240.0,
+            "99": 665418240.0,
+            "100": 665418240.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2584293376.0,
+            "52": 2584294400.0,
+            "53": 2584294400.0,
+            "54": 2584294400.0,
+            "55": 2584294400.0,
+            "56": 2584294400.0,
+            "57": 2584294400.0,
+            "58": 2584294400.0,
+            "59": 2584294400.0,
+            "60": 2584294400.0,
+            "61": 2584294400.0,
+            "62": 2584294400.0,
+            "63": 2584294400.0,
+            "64": 2584294400.0,
+            "65": 2584294400.0,
+            "66": 2584294400.0,
+            "67": 2584294400.0,
+            "68": 2584294400.0,
+            "69": 2584294400.0,
+            "70": 2584294400.0,
+            "71": 2584294400.0,
+            "72": 2584294400.0,
+            "73": 2584294400.0,
+            "74": 2584294400.0,
+            "75": 2584294400.0,
+            "76": 2584294400.0,
+            "77": 2584294400.0,
+            "78": 2584294400.0,
+            "79": 2584294400.0,
+            "80": 2584294400.0,
+            "81": 2584294400.0,
+            "82": 2584294400.0,
+            "83": 2584294400.0,
+            "84": 2584294400.0,
+            "85": 2584294400.0,
+            "86": 2584294400.0,
+            "87": 2584294400.0,
+            "88": 2584294400.0,
+            "89": 2584294400.0,
+            "90": 2584294400.0,
+            "91": 2584294400.0,
+            "92": 2584294400.0,
+            "93": 2584294400.0,
+            "94": 2584294400.0,
+            "95": 2584294400.0,
+            "96": 2584294400.0,
+            "97": 2584294400.0,
+            "98": 2584294400.0,
+            "99": 2584294400.0,
+            "100": 2584294400.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 5.37898,
+            "52": 0.16971,
+            "53": 0.14151,
+            "54": 0.1405,
+            "55": 0.13911,
+            "56": 0.13857,
+            "57": 0.13809,
+            "58": 0.13698,
+            "59": 0.13775,
+            "60": 0.13751,
+            "61": 0.1373,
+            "62": 0.13729,
+            "63": 0.13806,
+            "64": 0.13698,
+            "65": 0.13838,
+            "66": 0.13833,
+            "67": 0.13702,
+            "68": 0.13614,
+            "69": 0.13521,
+            "70": 0.13469,
+            "71": 0.13425,
+            "72": 0.13475,
+            "73": 0.13506,
+            "74": 0.13559,
+            "75": 0.13539,
+            "76": 0.13477,
+            "77": 0.13458,
+            "78": 0.13576,
+            "79": 0.13452,
+            "80": 0.13517,
+            "81": 0.13478,
+            "82": 0.13453,
+            "83": 0.13498,
+            "84": 0.13478,
+            "85": 0.13424,
+            "86": 0.13432,
+            "87": 0.1342,
+            "88": 0.13455,
+            "89": 0.13469,
+            "90": 0.13451,
+            "91": 0.13468,
+            "92": 0.13446,
+            "93": 0.1351,
+            "94": 0.13437,
+            "95": 0.13457,
+            "96": 0.13491,
+            "97": 0.13442,
+            "98": 0.13661,
+            "99": 0.13617,
+            "100": 0.13595
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml
index fed75814df5..ad7854aeacb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..ab954626b0e
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82555,
+            "2": 10.83286,
+            "3": 10.82763,
+            "4": 10.79573,
+            "5": 10.85699,
+            "6": 10.8639,
+            "7": 10.82612,
+            "8": 10.82542,
+            "9": 10.83587,
+            "10": 10.79627,
+            "11": 10.87822,
+            "12": 10.85824,
+            "13": 10.85426,
+            "14": 10.87526,
+            "15": 10.79208,
+            "16": 10.80307,
+            "17": 10.77438,
+            "18": 10.80487,
+            "19": 10.79369,
+            "20": 10.69576,
+            "21": 10.68654,
+            "22": 10.53161,
+            "23": 10.70646,
+            "24": 10.57337,
+            "25": 10.51533,
+            "26": 10.5909,
+            "27": 10.60777,
+            "28": 10.57049,
+            "29": 10.58979,
+            "30": 10.34722,
+            "31": 10.07771,
+            "32": 10.46349,
+            "33": 10.45722,
+            "34": 10.19974,
+            "35": 10.25643,
+            "36": 10.21263,
+            "37": 10.34718,
+            "38": 10.18009,
+            "39": 10.40838,
+            "40": 10.07629,
+            "41": 10.1297,
+            "42": 10.2117,
+            "43": 9.81708,
+            "44": 9.94034,
+            "45": 9.81748,
+            "46": 9.80633,
+            "47": 10.12473,
+            "48": 9.84047,
+            "49": 9.51012,
+            "50": 9.88943,
+            "51": 9.84256,
+            "52": 9.72573,
+            "53": 10.05974,
+            "54": 9.95226,
+            "55": 9.88318,
+            "56": 9.61275,
+            "57": 9.46219,
+            "58": 9.8231,
+            "59": 9.57666,
+            "60": 9.48516,
+            "61": 9.67876,
+            "62": 9.97782,
+            "63": 9.36212,
+            "64": 9.75714,
+            "65": 8.93494,
+            "66": 9.69283,
+            "67": 9.36708,
+            "68": 9.78178,
+            "69": 9.79452,
+            "70": 9.72296,
+            "71": 9.62031,
+            "72": 9.56974,
+            "73": 9.48101,
+            "74": 8.91241,
+            "75": 9.40905,
+            "76": 9.06617,
+            "77": 10.05809,
+            "78": 9.72194,
+            "79": 9.36927,
+            "80": 9.40029,
+            "81": 9.47702,
+            "82": 9.69787,
+            "83": 9.30742,
+            "84": 9.41492,
+            "85": 9.61113,
+            "86": 9.07103,
+            "87": 9.5961,
+            "88": 9.74909,
+            "89": 9.59604,
+            "90": 9.82722,
+            "91": 9.33657,
+            "92": 9.35582,
+            "93": 9.08689,
+            "94": 8.82754,
+            "95": 9.53065,
+            "96": 9.5276,
+            "97": 9.30672,
+            "98": 9.66905,
+            "99": 8.89635,
+            "100": 9.40525
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1651.0,
+            "2": 1716.0,
+            "3": 1760.0,
+            "4": 1771.0,
+            "5": 1899.0,
+            "6": 1905.0,
+            "7": 1842.0,
+            "8": 1667.0,
+            "9": 1822.0,
+            "10": 1434.0,
+            "11": 1852.0,
+            "12": 1741.0,
+            "13": 1905.0,
+            "14": 1841.0,
+            "15": 1857.0,
+            "16": 1841.0,
+            "17": 1800.0,
+            "18": 1666.0,
+            "19": 1803.0,
+            "20": 1800.0,
+            "21": 1836.0,
+            "22": 1688.0,
+            "23": 1994.0,
+            "24": 1641.0,
+            "25": 1577.0,
+            "26": 1676.0,
+            "27": 1876.0,
+            "28": 1970.0,
+            "29": 1945.0,
+            "30": 1916.0,
+            "31": 1494.0,
+            "32": 1868.0,
+            "33": 2135.0,
+            "34": 1740.0,
+            "35": 1924.0,
+            "36": 1854.0,
+            "37": 2363.0,
+            "38": 2164.0,
+            "39": 2262.0,
+            "40": 2081.0,
+            "41": 2168.0,
+            "42": 2247.0,
+            "43": 2055.0,
+            "44": 2070.0,
+            "45": 1988.0,
+            "46": 2208.0,
+            "47": 2559.0,
+            "48": 2287.0,
+            "49": 2194.0,
+            "50": 2303.0,
+            "51": 2552.0,
+            "52": 2565.0,
+            "53": 2883.0,
+            "54": 2710.0,
+            "55": 2301.0,
+            "56": 2798.0,
+            "57": 2334.0,
+            "58": 2979.0,
+            "59": 2960.0,
+            "60": 2451.0,
+            "61": 2841.0,
+            "62": 2577.0,
+            "63": 2516.0,
+            "64": 2907.0,
+            "65": 2567.0,
+            "66": 2862.0,
+            "67": 2809.0,
+            "68": 2609.0,
+            "69": 2965.0,
+            "70": 2985.0,
+            "71": 2864.0,
+            "72": 2613.0,
+            "73": 3108.0,
+            "74": 2048.0,
+            "75": 2563.0,
+            "76": 3046.0,
+            "77": 3127.0,
+            "78": 2959.0,
+            "79": 3082.0,
+            "80": 3025.0,
+            "81": 3400.0,
+            "82": 3223.0,
+            "83": 2786.0,
+            "84": 3180.0,
+            "85": 3233.0,
+            "86": 2611.0,
+            "87": 3542.0,
+            "88": 3084.0,
+            "89": 3210.0,
+            "90": 3271.0,
+            "91": 2770.0,
+            "92": 3220.0,
+            "93": 2662.0,
+            "94": 3405.0,
+            "95": 3085.0,
+            "96": 3336.0,
+            "97": 3050.0,
+            "98": 3421.0,
+            "99": 3271.0,
+            "100": 3079.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 548399616.0,
+            "2": 548399616.0,
+            "3": 548399616.0,
+            "4": 548399616.0,
+            "5": 548399616.0,
+            "6": 548399616.0,
+            "7": 548399616.0,
+            "8": 548399616.0,
+            "9": 548399616.0,
+            "10": 548399616.0,
+            "11": 548399616.0,
+            "12": 548399616.0,
+            "13": 548399616.0,
+            "14": 548399616.0,
+            "15": 548399616.0,
+            "16": 548399616.0,
+            "17": 548399616.0,
+            "18": 548399616.0,
+            "19": 548399616.0,
+            "20": 548399616.0,
+            "21": 548399616.0,
+            "22": 548399616.0,
+            "23": 548399616.0,
+            "24": 548399616.0,
+            "25": 548399616.0,
+            "26": 548399616.0,
+            "27": 548399616.0,
+            "28": 548399616.0,
+            "29": 548399616.0,
+            "30": 548399616.0,
+            "31": 548399616.0,
+            "32": 548399616.0,
+            "33": 548399616.0,
+            "34": 548399616.0,
+            "35": 548399616.0,
+            "36": 548399616.0,
+            "37": 548399616.0,
+            "38": 548399616.0,
+            "39": 548399616.0,
+            "40": 548399616.0,
+            "41": 548399616.0,
+            "42": 548399616.0,
+            "43": 548399616.0,
+            "44": 548399616.0,
+            "45": 548399616.0,
+            "46": 548399616.0,
+            "47": 548399616.0,
+            "48": 548399616.0,
+            "49": 548399616.0,
+            "50": 548399616.0,
+            "51": 548399616.0,
+            "52": 548399616.0,
+            "53": 548399616.0,
+            "54": 548399616.0,
+            "55": 548399616.0,
+            "56": 548399616.0,
+            "57": 548399616.0,
+            "58": 548399616.0,
+            "59": 548399616.0,
+            "60": 548399616.0,
+            "61": 548399616.0,
+            "62": 548399616.0,
+            "63": 548399616.0,
+            "64": 548399616.0,
+            "65": 548399616.0,
+            "66": 548399616.0,
+            "67": 548399616.0,
+            "68": 548399616.0,
+            "69": 548399616.0,
+            "70": 548399616.0,
+            "71": 548399616.0,
+            "72": 548399616.0,
+            "73": 548399616.0,
+            "74": 548399616.0,
+            "75": 548399616.0,
+            "76": 548399616.0,
+            "77": 548399616.0,
+            "78": 548399616.0,
+            "79": 548399616.0,
+            "80": 548399616.0,
+            "81": 548399616.0,
+            "82": 548399616.0,
+            "83": 548399616.0,
+            "84": 548399616.0,
+            "85": 548399616.0,
+            "86": 548399616.0,
+            "87": 548399616.0,
+            "88": 548399616.0,
+            "89": 548399616.0,
+            "90": 548399616.0,
+            "91": 548399616.0,
+            "92": 548399616.0,
+            "93": 548399616.0,
+            "94": 548399616.0,
+            "95": 548399616.0,
+            "96": 548399616.0,
+            "97": 548399616.0,
+            "98": 548399616.0,
+            "99": 548399616.0,
+            "100": 548399616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2325370880.0,
+            "2": 2466227200.0,
+            "3": 2466227200.0,
+            "4": 2466227200.0,
+            "5": 2466227200.0,
+            "6": 2466227200.0,
+            "7": 2466227200.0,
+            "8": 2466227200.0,
+            "9": 2466227200.0,
+            "10": 2466227200.0,
+            "11": 2466227200.0,
+            "12": 2466227200.0,
+            "13": 2466227200.0,
+            "14": 2466227200.0,
+            "15": 2466227200.0,
+            "16": 2466227200.0,
+            "17": 2466227200.0,
+            "18": 2466227200.0,
+            "19": 2466227200.0,
+            "20": 2466227200.0,
+            "21": 2466227200.0,
+            "22": 2466227200.0,
+            "23": 2466227200.0,
+            "24": 2466227200.0,
+            "25": 2466227200.0,
+            "26": 2466227200.0,
+            "27": 2466227200.0,
+            "28": 2466227200.0,
+            "29": 2466227200.0,
+            "30": 2466227200.0,
+            "31": 2466227200.0,
+            "32": 2466227200.0,
+            "33": 2466227200.0,
+            "34": 2466227200.0,
+            "35": 2466227200.0,
+            "36": 2466227200.0,
+            "37": 2466227200.0,
+            "38": 2466227200.0,
+            "39": 2466227200.0,
+            "40": 2466227200.0,
+            "41": 2466227200.0,
+            "42": 2466227200.0,
+            "43": 2466227200.0,
+            "44": 2466227200.0,
+            "45": 2466227200.0,
+            "46": 2466227200.0,
+            "47": 2466227200.0,
+            "48": 2466227200.0,
+            "49": 2466227200.0,
+            "50": 2466227200.0,
+            "51": 2466227200.0,
+            "52": 2466227200.0,
+            "53": 2466227200.0,
+            "54": 2466227200.0,
+            "55": 2466227200.0,
+            "56": 2466227200.0,
+            "57": 2466227200.0,
+            "58": 2466227200.0,
+            "59": 2466227200.0,
+            "60": 2466227200.0,
+            "61": 2466227200.0,
+            "62": 2466227200.0,
+            "63": 2466227200.0,
+            "64": 2466227200.0,
+            "65": 2466227200.0,
+            "66": 2466227200.0,
+            "67": 2466227200.0,
+            "68": 2466227200.0,
+            "69": 2466227200.0,
+            "70": 2466227200.0,
+            "71": 2466227200.0,
+            "72": 2466227200.0,
+            "73": 2466227200.0,
+            "74": 2466227200.0,
+            "75": 2466227200.0,
+            "76": 2466227200.0,
+            "77": 2466227200.0,
+            "78": 2466227200.0,
+            "79": 2466227200.0,
+            "80": 2466227200.0,
+            "81": 2466227200.0,
+            "82": 2466227200.0,
+            "83": 2466227200.0,
+            "84": 2466227200.0,
+            "85": 2466227200.0,
+            "86": 2466227200.0,
+            "87": 2466227200.0,
+            "88": 2466227200.0,
+            "89": 2466227200.0,
+            "90": 2466227200.0,
+            "91": 2466227200.0,
+            "92": 2466227200.0,
+            "93": 2466227200.0,
+            "94": 2466227200.0,
+            "95": 2466227200.0,
+            "96": 2466227200.0,
+            "97": 2466227200.0,
+            "98": 2466227200.0,
+            "99": 2466227200.0,
+            "100": 2466227200.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.76786,
+            "2": 0.13256,
+            "3": 0.26046,
+            "4": 0.11178,
+            "5": 0.24866,
+            "6": 0.1232,
+            "7": 0.21705,
+            "8": 0.25373,
+            "9": 0.10845,
+            "10": 0.24176,
+            "11": 0.32229,
+            "12": 0.16743,
+            "13": 0.27675,
+            "14": 0.10674,
+            "15": 0.23677,
+            "16": 0.24253,
+            "17": 0.1093,
+            "18": 0.10679,
+            "19": 0.10721,
+            "20": 0.25414,
+            "21": 0.21498,
+            "22": 0.10728,
+            "23": 0.10796,
+            "24": 0.12419,
+            "25": 0.11194,
+            "26": 0.10802,
+            "27": 0.36403,
+            "28": 0.10527,
+            "29": 0.10971,
+            "30": 0.10869,
+            "31": 0.25185,
+            "32": 0.20786,
+            "33": 0.1097,
+            "34": 0.10836,
+            "35": 0.23722,
+            "36": 0.12158,
+            "37": 0.1137,
+            "38": 0.10759,
+            "39": 0.2238,
+            "40": 0.23329,
+            "41": 0.20392,
+            "42": 0.10935,
+            "43": 0.11981,
+            "44": 0.11039,
+            "45": 0.10755,
+            "46": 0.10875,
+            "47": 0.22415,
+            "48": 0.11024,
+            "49": 0.47527,
+            "50": 0.11071,
+            "51": 0.21161,
+            "52": 0.10861,
+            "53": 0.10793,
+            "54": 0.24873,
+            "55": 0.21365,
+            "56": 0.1064,
+            "57": 0.20935,
+            "58": 0.24181,
+            "59": 0.14913,
+            "60": 0.10905,
+            "61": 0.20375,
+            "62": 0.20001,
+            "63": 0.20843,
+            "64": 0.11035,
+            "65": 0.23806,
+            "66": 0.11206,
+            "67": 0.10915,
+            "68": 0.22684,
+            "69": 0.10627,
+            "70": 0.24098,
+            "71": 0.20399,
+            "72": 0.1078,
+            "73": 0.1103,
+            "74": 0.11151,
+            "75": 0.11175,
+            "76": 0.11055,
+            "77": 0.10702,
+            "78": 0.11005,
+            "79": 0.11071,
+            "80": 0.11049,
+            "81": 0.54906,
+            "82": 0.10895,
+            "83": 0.23816,
+            "84": 0.11114,
+            "85": 0.10811,
+            "86": 0.11137,
+            "87": 0.11047,
+            "88": 0.22025,
+            "89": 0.22508,
+            "90": 0.10735,
+            "91": 0.21332,
+            "92": 0.23884,
+            "93": 0.10845,
+            "94": 0.10944,
+            "95": 0.22451,
+            "96": 0.10871,
+            "97": 0.28678,
+            "98": 0.11138,
+            "99": 0.11082,
+            "100": 0.11057
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
index e45c3949555..852f0cf6ee6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.90397,
-            "2": 0.16607,
-            "3": 0.13982,
-            "4": 0.14032,
-            "5": 0.13765,
-            "6": 0.13651,
-            "7": 0.13453,
-            "8": 0.13413,
-            "9": 0.13703,
-            "10": 0.13873,
-            "11": 0.28364,
-            "12": 0.13723,
-            "13": 0.13756,
-            "14": 0.1379,
-            "15": 0.14148,
-            "16": 0.1356,
-            "17": 0.13661,
-            "18": 0.13568,
-            "19": 0.13637,
-            "20": 0.1367,
-            "21": 0.28276,
-            "22": 0.13722,
-            "23": 0.13404,
-            "24": 0.13414,
-            "25": 0.1341,
-            "26": 0.13595,
+            "1": 12.07462,
+            "2": 0.1735,
+            "3": 0.1566,
+            "4": 0.13588,
+            "5": 0.13681,
+            "6": 0.13636,
+            "7": 0.13617,
+            "8": 0.13757,
+            "9": 0.13674,
+            "10": 0.13723,
+            "11": 0.30898,
+            "12": 0.13427,
+            "13": 0.13436,
+            "14": 0.13398,
+            "15": 0.1343,
+            "16": 0.13416,
+            "17": 0.13488,
+            "18": 0.13457,
+            "19": 0.1346,
+            "20": 0.13478,
+            "21": 0.27765,
+            "22": 0.13422,
+            "23": 0.13459,
+            "24": 0.1337,
+            "25": 0.13474,
+            "26": 0.13421,
             "27": 0.13446,
-            "28": 0.13477,
-            "29": 0.13439,
-            "30": 0.13383,
-            "31": 0.27955,
-            "32": 0.13416,
-            "33": 0.13472,
-            "34": 0.13383,
-            "35": 0.13499,
-            "36": 0.13468,
-            "37": 0.13332,
-            "38": 0.13449,
-            "39": 0.13488,
-            "40": 0.1347,
-            "41": 0.2818,
-            "42": 0.13497,
-            "43": 0.13495,
-            "44": 0.13372,
-            "45": 0.13385,
-            "46": 0.13479,
-            "47": 0.13339,
-            "48": 0.13334,
-            "49": 0.13393,
-            "50": 0.13346,
-            "51": 0.2815,
-            "52": 0.13492,
-            "53": 0.13387,
-            "54": 0.13407,
-            "55": 0.13263,
-            "56": 0.13379,
-            "57": 0.13439,
-            "58": 0.13407,
-            "59": 0.13481,
-            "60": 0.13407,
-            "61": 0.28073,
-            "62": 0.13474,
-            "63": 0.13363,
-            "64": 0.13359,
-            "65": 0.13323,
-            "66": 0.13437,
-            "67": 0.13391,
-            "68": 0.13344,
-            "69": 0.21561,
-            "70": 0.1337,
-            "71": 0.27778,
-            "72": 0.13359,
-            "73": 0.13364,
-            "74": 0.13406,
-            "75": 0.13376,
-            "76": 0.13308,
-            "77": 0.13263,
-            "78": 0.13172,
-            "79": 0.13328,
-            "80": 0.13387,
-            "81": 0.28018,
-            "82": 0.13437,
-            "83": 0.13645,
-            "84": 0.13548,
-            "85": 0.13558,
-            "86": 0.13447,
-            "87": 0.13492,
-            "88": 0.13361,
-            "89": 0.13427,
-            "90": 0.13332,
-            "91": 0.27771,
-            "92": 0.13375,
-            "93": 0.1331,
-            "94": 0.13317,
-            "95": 0.13408,
-            "96": 0.13418,
-            "97": 0.13752,
-            "98": 0.13493,
-            "99": 0.13408,
-            "100": 0.13136
+            "28": 0.13381,
+            "29": 0.134,
+            "30": 0.13373,
+            "31": 0.27812,
+            "32": 0.13383,
+            "33": 0.13406,
+            "34": 0.13341,
+            "35": 0.13501,
+            "36": 0.13349,
+            "37": 0.13319,
+            "38": 0.13345,
+            "39": 0.13383,
+            "40": 0.13285,
+            "41": 0.29258,
+            "42": 0.13394,
+            "43": 0.13373,
+            "44": 0.13332,
+            "45": 0.13359,
+            "46": 0.13504,
+            "47": 0.13407,
+            "48": 0.13352,
+            "49": 0.13439,
+            "50": 0.1334,
+            "51": 0.28209,
+            "52": 0.13691,
+            "53": 0.13662,
+            "54": 0.13717,
+            "55": 0.13691,
+            "56": 0.13684,
+            "57": 0.13847,
+            "58": 0.13658,
+            "59": 0.13753,
+            "60": 0.13745,
+            "61": 0.30258,
+            "62": 0.13813,
+            "63": 0.14191,
+            "64": 0.13802,
+            "65": 0.13764,
+            "66": 0.13783,
+            "67": 0.13952,
+            "68": 0.13799,
+            "69": 0.13795,
+            "70": 0.13735,
+            "71": 0.30569,
+            "72": 0.13924,
+            "73": 0.1384,
+            "74": 0.13859,
+            "75": 0.13793,
+            "76": 0.13693,
+            "77": 0.13831,
+            "78": 0.13768,
+            "79": 0.1392,
+            "80": 0.13806,
+            "81": 0.30792,
+            "82": 0.1386,
+            "83": 0.13782,
+            "84": 0.13746,
+            "85": 0.13781,
+            "86": 0.13783,
+            "87": 0.13772,
+            "88": 0.13728,
+            "89": 0.13847,
+            "90": 0.13748,
+            "91": 0.31327,
+            "92": 0.13717,
+            "93": 0.138,
+            "94": 0.13824,
+            "95": 0.13692,
+            "96": 0.13681,
+            "97": 0.138,
+            "98": 0.13737,
+            "99": 0.13804,
+            "100": 0.13722
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..6785ccf3405
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.96249,
+            "2": 10.97263,
+            "3": 10.95604,
+            "4": 10.99185,
+            "5": 10.94911,
+            "6": 10.94586,
+            "7": 10.97227,
+            "8": 10.96531,
+            "9": 10.95864,
+            "10": 10.96522,
+            "11": 10.92975,
+            "12": 10.93134,
+            "13": 10.93843,
+            "14": 10.93051,
+            "15": 10.92149,
+            "16": 10.91358,
+            "17": 10.89583,
+            "18": 10.88315,
+            "19": 10.88739,
+            "20": 10.81664,
+            "21": 10.77565,
+            "22": 10.71731,
+            "23": 10.77156,
+            "24": 10.70856,
+            "25": 10.67765,
+            "26": 10.70309,
+            "27": 10.69021,
+            "28": 10.62094,
+            "29": 10.61335,
+            "30": 10.46967,
+            "31": 10.2743,
+            "32": 10.52078,
+            "33": 10.51563,
+            "34": 10.3085,
+            "35": 10.35579,
+            "36": 10.31814,
+            "37": 10.39823,
+            "38": 10.26329,
+            "39": 10.44238,
+            "40": 10.17104,
+            "41": 10.20058,
+            "42": 10.26164,
+            "43": 9.9303,
+            "44": 10.02911,
+            "45": 9.9202,
+            "46": 9.88631,
+            "47": 10.18638,
+            "48": 9.90626,
+            "49": 9.60031,
+            "50": 9.96555,
+            "51": 9.89946,
+            "52": 9.78501,
+            "53": 10.1053,
+            "54": 9.98473,
+            "55": 9.90831,
+            "56": 9.65981,
+            "57": 9.52396,
+            "58": 9.87215,
+            "59": 9.6169,
+            "60": 9.54609,
+            "61": 9.7001,
+            "62": 9.99569,
+            "63": 9.41669,
+            "64": 9.79572,
+            "65": 8.97339,
+            "66": 9.72409,
+            "67": 9.38538,
+            "68": 9.79899,
+            "69": 9.80931,
+            "70": 9.76598,
+            "71": 9.63141,
+            "72": 9.59357,
+            "73": 9.51102,
+            "74": 8.95643,
+            "75": 9.42625,
+            "76": 9.11036,
+            "77": 10.06643,
+            "78": 9.72178,
+            "79": 9.39646,
+            "80": 9.40915,
+            "81": 9.49577,
+            "82": 9.69623,
+            "83": 9.33227,
+            "84": 9.43138,
+            "85": 9.62886,
+            "86": 9.06094,
+            "87": 9.60054,
+            "88": 9.77282,
+            "89": 9.61807,
+            "90": 9.824,
+            "91": 9.3519,
+            "92": 9.37754,
+            "93": 9.09307,
+            "94": 8.83497,
+            "95": 9.52251,
+            "96": 9.53024,
+            "97": 9.32185,
+            "98": 9.68444,
+            "99": 8.8844,
+            "100": 9.4165
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 22727740.0,
+            "2": 22924404.0,
+            "3": 22597002.0,
+            "4": 23219532.0,
+            "5": 22715420.0,
+            "6": 23021500.0,
+            "7": 22771380.0,
+            "8": 22926852.0,
+            "9": 22841780.0,
+            "10": 22917780.0,
+            "11": 22500516.0,
+            "12": 22459810.0,
+            "13": 22917164.0,
+            "14": 22388968.0,
+            "15": 22821358.0,
+            "16": 22831192.0,
+            "17": 22819736.0,
+            "18": 22582350.0,
+            "19": 22618104.0,
+            "20": 22693400.0,
+            "21": 22739610.0,
+            "22": 22800008.0,
+            "23": 22538272.0,
+            "24": 22771352.0,
+            "25": 22819066.0,
+            "26": 22547720.0,
+            "27": 22469212.0,
+            "28": 22453960.0,
+            "29": 22529656.0,
+            "30": 22630960.0,
+            "31": 22955540.0,
+            "32": 22584916.0,
+            "33": 22558336.0,
+            "34": 22835478.0,
+            "35": 22787746.0,
+            "36": 22589468.0,
+            "37": 22496828.0,
+            "38": 22896094.0,
+            "39": 22802714.0,
+            "40": 22657992.0,
+            "41": 22659460.0,
+            "42": 22667202.0,
+            "43": 22977092.0,
+            "44": 22746836.0,
+            "45": 22675370.0,
+            "46": 22884172.0,
+            "47": 22633868.0,
+            "48": 22928116.0,
+            "49": 22727456.0,
+            "50": 22904148.0,
+            "51": 22792094.0,
+            "52": 22748864.0,
+            "53": 22925208.0,
+            "54": 22840064.0,
+            "55": 22518576.0,
+            "56": 22877644.0,
+            "57": 23113416.0,
+            "58": 22845068.0,
+            "59": 22715704.0,
+            "60": 22743324.0,
+            "61": 22723260.0,
+            "62": 22672600.0,
+            "63": 22846484.0,
+            "64": 22822992.0,
+            "65": 23061634.0,
+            "66": 22729736.0,
+            "67": 22908874.0,
+            "68": 22610620.0,
+            "69": 22583304.0,
+            "70": 22828816.0,
+            "71": 22748974.0,
+            "72": 22654840.0,
+            "73": 22741132.0,
+            "74": 23047902.0,
+            "75": 23054368.0,
+            "76": 22901688.0,
+            "77": 22272290.0,
+            "78": 22789530.0,
+            "79": 22743876.0,
+            "80": 22706184.0,
+            "81": 22891292.0,
+            "82": 22778490.0,
+            "83": 22839152.0,
+            "84": 23009710.0,
+            "85": 22711788.0,
+            "86": 23103398.0,
+            "87": 22735162.0,
+            "88": 22637356.0,
+            "89": 22498244.0,
+            "90": 22972336.0,
+            "91": 22767438.0,
+            "92": 22808640.0,
+            "93": 22658540.0,
+            "94": 22912524.0,
+            "95": 23048146.0,
+            "96": 22828804.0,
+            "97": 22608672.0,
+            "98": 22763072.0,
+            "99": 22906218.0,
+            "100": 23015634.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 548399616.0,
+            "2": 548399616.0,
+            "3": 548399616.0,
+            "4": 548399616.0,
+            "5": 548399616.0,
+            "6": 548399616.0,
+            "7": 548399616.0,
+            "8": 548399616.0,
+            "9": 548399616.0,
+            "10": 548399616.0,
+            "11": 548399616.0,
+            "12": 548399616.0,
+            "13": 548399616.0,
+            "14": 548399616.0,
+            "15": 548399616.0,
+            "16": 548399616.0,
+            "17": 548399616.0,
+            "18": 548399616.0,
+            "19": 548399616.0,
+            "20": 548399616.0,
+            "21": 548399616.0,
+            "22": 548399616.0,
+            "23": 548399616.0,
+            "24": 548399616.0,
+            "25": 548399616.0,
+            "26": 548399616.0,
+            "27": 548399616.0,
+            "28": 548399616.0,
+            "29": 548399616.0,
+            "30": 548399616.0,
+            "31": 548399616.0,
+            "32": 548399616.0,
+            "33": 548399616.0,
+            "34": 548399616.0,
+            "35": 548399616.0,
+            "36": 548399616.0,
+            "37": 548399616.0,
+            "38": 548399616.0,
+            "39": 548399616.0,
+            "40": 548399616.0,
+            "41": 548399616.0,
+            "42": 548399616.0,
+            "43": 548399616.0,
+            "44": 548399616.0,
+            "45": 548399616.0,
+            "46": 548399616.0,
+            "47": 548399616.0,
+            "48": 548399616.0,
+            "49": 548399616.0,
+            "50": 548399616.0,
+            "51": 548399616.0,
+            "52": 548399616.0,
+            "53": 548399616.0,
+            "54": 548399616.0,
+            "55": 548399616.0,
+            "56": 548399616.0,
+            "57": 548399616.0,
+            "58": 548399616.0,
+            "59": 548399616.0,
+            "60": 548399616.0,
+            "61": 548399616.0,
+            "62": 548399616.0,
+            "63": 548399616.0,
+            "64": 548399616.0,
+            "65": 548399616.0,
+            "66": 548399616.0,
+            "67": 548399616.0,
+            "68": 548399616.0,
+            "69": 548399616.0,
+            "70": 548399616.0,
+            "71": 548399616.0,
+            "72": 548399616.0,
+            "73": 548399616.0,
+            "74": 548399616.0,
+            "75": 548399616.0,
+            "76": 548399616.0,
+            "77": 548399616.0,
+            "78": 548399616.0,
+            "79": 548399616.0,
+            "80": 548399616.0,
+            "81": 548399616.0,
+            "82": 548399616.0,
+            "83": 548399616.0,
+            "84": 548399616.0,
+            "85": 548399616.0,
+            "86": 548399616.0,
+            "87": 548399616.0,
+            "88": 548399616.0,
+            "89": 548399616.0,
+            "90": 548399616.0,
+            "91": 548399616.0,
+            "92": 548399616.0,
+            "93": 548399616.0,
+            "94": 548399616.0,
+            "95": 548399616.0,
+            "96": 548399616.0,
+            "97": 548399616.0,
+            "98": 548399616.0,
+            "99": 548399616.0,
+            "100": 548399616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2325370880.0,
+            "2": 2466227200.0,
+            "3": 2466227200.0,
+            "4": 2466227200.0,
+            "5": 2466227200.0,
+            "6": 2466227200.0,
+            "7": 2466227200.0,
+            "8": 2466227200.0,
+            "9": 2466227200.0,
+            "10": 2466227200.0,
+            "11": 2466227200.0,
+            "12": 2466227200.0,
+            "13": 2466227200.0,
+            "14": 2466227200.0,
+            "15": 2466227200.0,
+            "16": 2466227200.0,
+            "17": 2466227200.0,
+            "18": 2466227200.0,
+            "19": 2466227200.0,
+            "20": 2466227200.0,
+            "21": 2466227200.0,
+            "22": 2466227200.0,
+            "23": 2466227200.0,
+            "24": 2466227200.0,
+            "25": 2466227200.0,
+            "26": 2466227200.0,
+            "27": 2466227200.0,
+            "28": 2466227200.0,
+            "29": 2466227200.0,
+            "30": 2466227200.0,
+            "31": 2466227200.0,
+            "32": 2466227200.0,
+            "33": 2466227200.0,
+            "34": 2466227200.0,
+            "35": 2466227200.0,
+            "36": 2466227200.0,
+            "37": 2466227200.0,
+            "38": 2466227200.0,
+            "39": 2466227200.0,
+            "40": 2466227200.0,
+            "41": 2466227200.0,
+            "42": 2466227200.0,
+            "43": 2466227200.0,
+            "44": 2466227200.0,
+            "45": 2466227200.0,
+            "46": 2466227200.0,
+            "47": 2466227200.0,
+            "48": 2466227200.0,
+            "49": 2466227200.0,
+            "50": 2466227200.0,
+            "51": 2466227200.0,
+            "52": 2466227200.0,
+            "53": 2466227200.0,
+            "54": 2466227200.0,
+            "55": 2466227200.0,
+            "56": 2466227200.0,
+            "57": 2466227200.0,
+            "58": 2466227200.0,
+            "59": 2466227200.0,
+            "60": 2466227200.0,
+            "61": 2466227200.0,
+            "62": 2466227200.0,
+            "63": 2466227200.0,
+            "64": 2466227200.0,
+            "65": 2466227200.0,
+            "66": 2466227200.0,
+            "67": 2466227200.0,
+            "68": 2466227200.0,
+            "69": 2466227200.0,
+            "70": 2466227200.0,
+            "71": 2466227200.0,
+            "72": 2466227200.0,
+            "73": 2466227200.0,
+            "74": 2466227200.0,
+            "75": 2466227200.0,
+            "76": 2466227200.0,
+            "77": 2466227200.0,
+            "78": 2466227200.0,
+            "79": 2466227200.0,
+            "80": 2466227200.0,
+            "81": 2466227200.0,
+            "82": 2466227200.0,
+            "83": 2466227200.0,
+            "84": 2466227200.0,
+            "85": 2466227200.0,
+            "86": 2466227200.0,
+            "87": 2466227200.0,
+            "88": 2466227200.0,
+            "89": 2466227200.0,
+            "90": 2466227200.0,
+            "91": 2466227200.0,
+            "92": 2466227200.0,
+            "93": 2466227200.0,
+            "94": 2466227200.0,
+            "95": 2466227200.0,
+            "96": 2466227200.0,
+            "97": 2466227200.0,
+            "98": 2466227200.0,
+            "99": 2466227200.0,
+            "100": 2466227200.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.43543,
+            "2": 0.13665,
+            "3": 0.25165,
+            "4": 0.12154,
+            "5": 0.12485,
+            "6": 0.12299,
+            "7": 0.15052,
+            "8": 0.1169,
+            "9": 0.22995,
+            "10": 0.12843,
+            "11": 0.12174,
+            "12": 0.12421,
+            "13": 0.22106,
+            "14": 0.15546,
+            "15": 0.12535,
+            "16": 0.12448,
+            "17": 0.12283,
+            "18": 0.12251,
+            "19": 0.12247,
+            "20": 0.12198,
+            "21": 0.12007,
+            "22": 0.22391,
+            "23": 0.12977,
+            "24": 0.12857,
+            "25": 0.24314,
+            "26": 0.13193,
+            "27": 0.12813,
+            "28": 0.12875,
+            "29": 0.22448,
+            "30": 0.12465,
+            "31": 0.23898,
+            "32": 0.12577,
+            "33": 0.12563,
+            "34": 0.12562,
+            "35": 0.15646,
+            "36": 0.12633,
+            "37": 0.12485,
+            "38": 0.21163,
+            "39": 0.13978,
+            "40": 0.12472,
+            "41": 0.12409,
+            "42": 0.12462,
+            "43": 0.12837,
+            "44": 0.12431,
+            "45": 0.12445,
+            "46": 0.23272,
+            "47": 0.12786,
+            "48": 0.12842,
+            "49": 0.22766,
+            "50": 0.1262,
+            "51": 0.13206,
+            "52": 0.21451,
+            "53": 0.13634,
+            "54": 0.11899,
+            "55": 0.12242,
+            "56": 0.24089,
+            "57": 0.12507,
+            "58": 0.12886,
+            "59": 0.1281,
+            "60": 0.22921,
+            "61": 0.13825,
+            "62": 0.22494,
+            "63": 0.27913,
+            "64": 0.16101,
+            "65": 0.27886,
+            "66": 0.13864,
+            "67": 0.21998,
+            "68": 0.1264,
+            "69": 0.12091,
+            "70": 0.22463,
+            "71": 0.12416,
+            "72": 0.17663,
+            "73": 0.12113,
+            "74": 0.12227,
+            "75": 0.21518,
+            "76": 0.11973,
+            "77": 0.15395,
+            "78": 0.19544,
+            "79": 0.23282,
+            "80": 0.23167,
+            "81": 0.12293,
+            "82": 0.23426,
+            "83": 0.23926,
+            "84": 0.12806,
+            "85": 0.12027,
+            "86": 0.23455,
+            "87": 0.12541,
+            "88": 0.1208,
+            "89": 0.11759,
+            "90": 0.11849,
+            "91": 0.24522,
+            "92": 0.1157,
+            "93": 0.23994,
+            "94": 0.12794,
+            "95": 0.18044,
+            "96": 0.30003,
+            "97": 0.12202,
+            "98": 0.1229,
+            "99": 0.12193,
+            "100": 0.23044
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json
index 7848ef42dd8..65edeb55e3d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 13.44016,
-            "2": 0.17357,
-            "3": 0.14155,
-            "4": 0.14433,
-            "5": 0.14312,
-            "6": 0.14041,
-            "7": 0.14082,
-            "8": 0.13921,
-            "9": 0.1399,
-            "10": 0.13856,
-            "11": 0.13995,
-            "12": 0.13864,
-            "13": 0.13803,
-            "14": 0.13783,
-            "15": 0.13752,
-            "16": 0.13882,
-            "17": 0.13834,
-            "18": 0.13863,
-            "19": 0.13872,
-            "20": 0.1384,
-            "21": 0.13424,
-            "22": 0.13105,
-            "23": 0.13094,
-            "24": 0.1307,
-            "25": 0.13252,
-            "26": 0.13172,
-            "27": 0.12995,
-            "28": 0.13015,
-            "29": 0.13002,
-            "30": 0.13019,
-            "31": 0.13071,
-            "32": 0.13106,
-            "33": 0.1305,
-            "34": 0.13023,
-            "35": 0.13178,
-            "36": 0.13167,
-            "37": 0.13002,
-            "38": 0.13094,
-            "39": 0.13093,
-            "40": 0.13167,
-            "41": 0.13178,
-            "42": 0.13107,
-            "43": 0.1328,
-            "44": 0.13048,
-            "45": 0.13046,
-            "46": 0.13126,
-            "47": 0.12901,
-            "48": 0.12854,
-            "49": 0.12862,
-            "50": 0.12918,
-            "51": 0.14204,
-            "52": 0.13766,
-            "53": 0.13573,
-            "54": 0.13601,
-            "55": 0.13392,
-            "56": 0.13591,
-            "57": 0.13683,
-            "58": 0.13487,
-            "59": 0.13645,
-            "60": 0.13627,
-            "61": 0.13507,
-            "62": 0.13578,
-            "63": 0.13619,
-            "64": 0.13556,
-            "65": 0.13673,
-            "66": 0.13706,
-            "67": 0.13535,
-            "68": 0.13581,
-            "69": 0.1342,
-            "70": 0.13519,
-            "71": 0.13563,
-            "72": 0.13553,
-            "73": 0.13626,
-            "74": 0.13636,
-            "75": 0.1351,
-            "76": 0.13531,
-            "77": 0.1341,
-            "78": 0.13121,
-            "79": 0.13164,
-            "80": 0.1338,
-            "81": 0.13214,
-            "82": 0.13227,
-            "83": 0.13301,
-            "84": 0.13291,
-            "85": 0.13384,
-            "86": 0.13276,
-            "87": 0.13499,
-            "88": 0.13549,
-            "89": 0.13554,
-            "90": 0.13505,
-            "91": 0.13486,
-            "92": 0.13406,
-            "93": 0.13522,
-            "94": 0.13615,
-            "95": 0.1365,
-            "96": 0.13586,
-            "97": 0.13623,
-            "98": 0.13603,
-            "99": 0.13615,
-            "100": 0.13526
+            "1": 7.21369,
+            "2": 0.1831,
+            "3": 0.15682,
+            "4": 0.14056,
+            "5": 0.13853,
+            "6": 0.13587,
+            "7": 0.13515,
+            "8": 0.13475,
+            "9": 0.13511,
+            "10": 0.13623,
+            "11": 0.13495,
+            "12": 0.13604,
+            "13": 0.13619,
+            "14": 0.13493,
+            "15": 0.13654,
+            "16": 0.135,
+            "17": 0.13441,
+            "18": 0.13422,
+            "19": 0.13368,
+            "20": 0.13434,
+            "21": 0.13405,
+            "22": 0.13547,
+            "23": 0.13766,
+            "24": 0.14005,
+            "25": 0.1397,
+            "26": 0.13807,
+            "27": 0.13719,
+            "28": 0.13707,
+            "29": 0.1384,
+            "30": 0.13799,
+            "31": 0.13774,
+            "32": 0.13838,
+            "33": 0.13846,
+            "34": 0.13735,
+            "35": 0.1399,
+            "36": 0.13989,
+            "37": 0.13915,
+            "38": 0.1394,
+            "39": 0.14001,
+            "40": 0.13993,
+            "41": 0.13938,
+            "42": 0.14004,
+            "43": 0.14041,
+            "44": 0.14062,
+            "45": 0.13996,
+            "46": 0.14021,
+            "47": 0.14,
+            "48": 0.13971,
+            "49": 0.13941,
+            "50": 0.13887,
+            "51": 0.14225,
+            "52": 0.13981,
+            "53": 0.13886,
+            "54": 0.13925,
+            "55": 0.141,
+            "56": 0.13843,
+            "57": 0.14096,
+            "58": 0.13853,
+            "59": 0.13902,
+            "60": 0.13975,
+            "61": 0.13772,
+            "62": 0.13889,
+            "63": 0.1372,
+            "64": 0.13725,
+            "65": 0.13793,
+            "66": 0.13913,
+            "67": 0.13885,
+            "68": 0.13752,
+            "69": 0.13831,
+            "70": 0.13735,
+            "71": 0.13736,
+            "72": 0.13847,
+            "73": 0.13902,
+            "74": 0.13786,
+            "75": 0.1382,
+            "76": 0.13854,
+            "77": 0.13828,
+            "78": 0.13847,
+            "79": 0.13887,
+            "80": 0.13758,
+            "81": 0.13798,
+            "82": 0.13775,
+            "83": 0.13914,
+            "84": 0.13872,
+            "85": 0.13875,
+            "86": 0.13942,
+            "87": 0.13828,
+            "88": 0.1378,
+            "89": 0.13834,
+            "90": 0.1384,
+            "91": 0.13837,
+            "92": 0.13872,
+            "93": 0.13843,
+            "94": 0.13831,
+            "95": 0.13887,
+            "96": 0.13825,
+            "97": 0.13822,
+            "98": 0.13872,
+            "99": 0.13922,
+            "100": 0.13751
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..2999f912c8f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81847,
+            "2": 10.81935,
+            "3": 10.83689,
+            "4": 10.83432,
+            "5": 10.84975,
+            "6": 10.83477,
+            "7": 10.82465,
+            "8": 10.81547,
+            "9": 10.87712,
+            "10": 10.88236,
+            "11": 10.87197,
+            "12": 10.82476,
+            "13": 10.84812,
+            "14": 10.81966,
+            "15": 10.80548,
+            "16": 10.80144,
+            "17": 10.77232,
+            "18": 10.78639,
+            "19": 10.74499,
+            "20": 10.62485,
+            "21": 10.68096,
+            "22": 10.65118,
+            "23": 10.76355,
+            "24": 10.61936,
+            "25": 10.46094,
+            "26": 10.59639,
+            "27": 10.54041,
+            "28": 10.44451,
+            "29": 10.39564,
+            "30": 10.40393,
+            "31": 10.51276,
+            "32": 10.32147,
+            "33": 10.26365,
+            "34": 10.46889,
+            "35": 9.96002,
+            "36": 10.11577,
+            "37": 10.0112,
+            "38": 10.38367,
+            "39": 9.78625,
+            "40": 10.10474,
+            "41": 10.13172,
+            "42": 10.02873,
+            "43": 10.20988,
+            "44": 10.07363,
+            "45": 9.69403,
+            "46": 9.99615,
+            "47": 9.93462,
+            "48": 9.6742,
+            "49": 9.91778,
+            "50": 9.93162,
+            "51": 9.80504,
+            "52": 9.32627,
+            "53": 9.6594,
+            "54": 9.87232,
+            "55": 9.99774,
+            "56": 9.83023,
+            "57": 9.75542,
+            "58": 9.82528,
+            "59": 9.32819,
+            "60": 9.35425,
+            "61": 9.44562,
+            "62": 10.20265,
+            "63": 9.362,
+            "64": 9.63412,
+            "65": 9.71326,
+            "66": 9.53682,
+            "67": 9.67365,
+            "68": 9.5994,
+            "69": 9.38537,
+            "70": 9.75361,
+            "71": 9.88632,
+            "72": 9.70683,
+            "73": 9.40123,
+            "74": 9.44529,
+            "75": 8.96867,
+            "76": 9.57975,
+            "77": 9.62562,
+            "78": 9.40252,
+            "79": 9.54279,
+            "80": 9.32635,
+            "81": 9.70785,
+            "82": 9.91615,
+            "83": 9.33512,
+            "84": 9.47626,
+            "85": 8.98192,
+            "86": 9.67249,
+            "87": 9.44309,
+            "88": 9.59721,
+            "89": 9.53706,
+            "90": 9.56301,
+            "91": 9.63798,
+            "92": 9.14066,
+            "93": 9.4357,
+            "94": 9.55795,
+            "95": 9.14422,
+            "96": 8.77023,
+            "97": 9.58717,
+            "98": 9.79488,
+            "99": 9.38629,
+            "100": 9.21781
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1093.0,
+            "2": 1211.0,
+            "3": 1288.0,
+            "4": 1273.0,
+            "5": 1242.0,
+            "6": 1323.0,
+            "7": 1211.0,
+            "8": 999.0,
+            "9": 1427.0,
+            "10": 1373.0,
+            "11": 1223.0,
+            "12": 1326.0,
+            "13": 1295.0,
+            "14": 1137.0,
+            "15": 1228.0,
+            "16": 1206.0,
+            "17": 1192.0,
+            "18": 1345.0,
+            "19": 1109.0,
+            "20": 1104.0,
+            "21": 1244.0,
+            "22": 1180.0,
+            "23": 1301.0,
+            "24": 1301.0,
+            "25": 1101.0,
+            "26": 1277.0,
+            "27": 1268.0,
+            "28": 1267.0,
+            "29": 1314.0,
+            "30": 1418.0,
+            "31": 1467.0,
+            "32": 1463.0,
+            "33": 1457.0,
+            "34": 1519.0,
+            "35": 1308.0,
+            "36": 1289.0,
+            "37": 1397.0,
+            "38": 1566.0,
+            "39": 1356.0,
+            "40": 1499.0,
+            "41": 1618.0,
+            "42": 1607.0,
+            "43": 1715.0,
+            "44": 1532.0,
+            "45": 1441.0,
+            "46": 1780.0,
+            "47": 1585.0,
+            "48": 1610.0,
+            "49": 1736.0,
+            "50": 1689.0,
+            "51": 1743.0,
+            "52": 1684.0,
+            "53": 1829.0,
+            "54": 1884.0,
+            "55": 1833.0,
+            "56": 2031.0,
+            "57": 1941.0,
+            "58": 1755.0,
+            "59": 1637.0,
+            "60": 1841.0,
+            "61": 2259.0,
+            "62": 2132.0,
+            "63": 2034.0,
+            "64": 1929.0,
+            "65": 2296.0,
+            "66": 2209.0,
+            "67": 2152.0,
+            "68": 2259.0,
+            "69": 2150.0,
+            "70": 2498.0,
+            "71": 2338.0,
+            "72": 2491.0,
+            "73": 2089.0,
+            "74": 2324.0,
+            "75": 1882.0,
+            "76": 2210.0,
+            "77": 2293.0,
+            "78": 2482.0,
+            "79": 2651.0,
+            "80": 1935.0,
+            "81": 2339.0,
+            "82": 2512.0,
+            "83": 2503.0,
+            "84": 2027.0,
+            "85": 2248.0,
+            "86": 2323.0,
+            "87": 2665.0,
+            "88": 2316.0,
+            "89": 2574.0,
+            "90": 2400.0,
+            "91": 2451.0,
+            "92": 1991.0,
+            "93": 2150.0,
+            "94": 2443.0,
+            "95": 2381.0,
+            "96": 2114.0,
+            "97": 2288.0,
+            "98": 2287.0,
+            "99": 2302.0,
+            "100": 2104.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0,
+            "51": 759682560.0,
+            "52": 759682560.0,
+            "53": 759682560.0,
+            "54": 759682560.0,
+            "55": 759682560.0,
+            "56": 759682560.0,
+            "57": 759682560.0,
+            "58": 759682560.0,
+            "59": 759682560.0,
+            "60": 759682560.0,
+            "61": 759682560.0,
+            "62": 759682560.0,
+            "63": 759682560.0,
+            "64": 759682560.0,
+            "65": 759682560.0,
+            "66": 759682560.0,
+            "67": 759682560.0,
+            "68": 759682560.0,
+            "69": 759682560.0,
+            "70": 759682560.0,
+            "71": 759682560.0,
+            "72": 759682560.0,
+            "73": 759682560.0,
+            "74": 759682560.0,
+            "75": 759682560.0,
+            "76": 759682560.0,
+            "77": 759682560.0,
+            "78": 759682560.0,
+            "79": 759682560.0,
+            "80": 759682560.0,
+            "81": 759682560.0,
+            "82": 759682560.0,
+            "83": 759682560.0,
+            "84": 759682560.0,
+            "85": 759682560.0,
+            "86": 759682560.0,
+            "87": 759682560.0,
+            "88": 759682560.0,
+            "89": 759682560.0,
+            "90": 759682560.0,
+            "91": 759682560.0,
+            "92": 759682560.0,
+            "93": 759682560.0,
+            "94": 759682560.0,
+            "95": 759682560.0,
+            "96": 759682560.0,
+            "97": 759682560.0,
+            "98": 759682560.0,
+            "99": 759682560.0,
+            "100": 759682560.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2542994944.0,
+            "2": 2824706560.0,
+            "3": 2824706560.0,
+            "4": 2824706560.0,
+            "5": 2824706560.0,
+            "6": 2824706560.0,
+            "7": 2824706560.0,
+            "8": 2824706560.0,
+            "9": 2824706560.0,
+            "10": 2824706560.0,
+            "11": 2824706560.0,
+            "12": 2824706560.0,
+            "13": 2824706560.0,
+            "14": 2824706560.0,
+            "15": 2824706560.0,
+            "16": 2824706560.0,
+            "17": 2824706560.0,
+            "18": 2824706560.0,
+            "19": 2824706560.0,
+            "20": 2824706560.0,
+            "21": 2824706560.0,
+            "22": 2824706560.0,
+            "23": 2824706560.0,
+            "24": 2824706560.0,
+            "25": 2824706560.0,
+            "26": 2824706560.0,
+            "27": 2824706560.0,
+            "28": 2824706560.0,
+            "29": 2824706560.0,
+            "30": 2824706560.0,
+            "31": 2824706560.0,
+            "32": 2824706560.0,
+            "33": 2824706560.0,
+            "34": 2824706560.0,
+            "35": 2824706560.0,
+            "36": 2824706560.0,
+            "37": 2824706560.0,
+            "38": 2824706560.0,
+            "39": 2824706560.0,
+            "40": 2824706560.0,
+            "41": 2824706560.0,
+            "42": 2824706560.0,
+            "43": 2824706560.0,
+            "44": 2824706560.0,
+            "45": 2824706560.0,
+            "46": 2824706560.0,
+            "47": 2824706560.0,
+            "48": 2824706560.0,
+            "49": 2824706560.0,
+            "50": 2824706560.0,
+            "51": 2824706560.0,
+            "52": 2824706560.0,
+            "53": 2824706560.0,
+            "54": 2824706560.0,
+            "55": 2824706560.0,
+            "56": 2824706560.0,
+            "57": 2824706560.0,
+            "58": 2824706560.0,
+            "59": 2824706560.0,
+            "60": 2824706560.0,
+            "61": 2824706560.0,
+            "62": 2824706560.0,
+            "63": 2824706560.0,
+            "64": 2824706560.0,
+            "65": 2824706560.0,
+            "66": 2824706560.0,
+            "67": 2824706560.0,
+            "68": 2824706560.0,
+            "69": 2824706560.0,
+            "70": 2824706560.0,
+            "71": 2824706560.0,
+            "72": 2824706560.0,
+            "73": 2824706560.0,
+            "74": 2824706560.0,
+            "75": 2824706560.0,
+            "76": 2824706560.0,
+            "77": 2824706560.0,
+            "78": 2824706560.0,
+            "79": 2824706560.0,
+            "80": 2824706560.0,
+            "81": 2824706560.0,
+            "82": 2824706560.0,
+            "83": 2824706560.0,
+            "84": 2824706560.0,
+            "85": 2824706560.0,
+            "86": 2824706560.0,
+            "87": 2824706560.0,
+            "88": 2824706560.0,
+            "89": 2824706560.0,
+            "90": 2824706560.0,
+            "91": 2824706560.0,
+            "92": 2824706560.0,
+            "93": 2824706560.0,
+            "94": 2824706560.0,
+            "95": 2824706560.0,
+            "96": 2824706560.0,
+            "97": 2824706560.0,
+            "98": 2824706560.0,
+            "99": 2824706560.0,
+            "100": 2824706560.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 9.46016,
+            "2": 0.42187,
+            "3": 0.15692,
+            "4": 0.35623,
+            "5": 0.25874,
+            "6": 0.17276,
+            "7": 0.18359,
+            "8": 0.19391,
+            "9": 0.19884,
+            "10": 0.27267,
+            "11": 0.25203,
+            "12": 0.16389,
+            "13": 0.35153,
+            "14": 0.15991,
+            "15": 0.268,
+            "16": 0.1778,
+            "17": 0.18774,
+            "18": 0.18637,
+            "19": 0.17789,
+            "20": 0.22748,
+            "21": 0.23632,
+            "22": 0.15657,
+            "23": 0.30888,
+            "24": 0.15208,
+            "25": 0.14888,
+            "26": 0.22189,
+            "27": 0.17979,
+            "28": 0.24137,
+            "29": 0.2423,
+            "30": 0.27274,
+            "31": 0.26218,
+            "32": 0.20249,
+            "33": 0.41473,
+            "34": 0.23104,
+            "35": 0.3203,
+            "36": 0.20187,
+            "37": 0.15959,
+            "38": 0.35951,
+            "39": 0.15125,
+            "40": 0.15444,
+            "41": 0.15359,
+            "42": 0.35395,
+            "43": 0.29841,
+            "44": 0.14696,
+            "45": 0.15582,
+            "46": 0.4465,
+            "47": 0.15406,
+            "48": 0.16257,
+            "49": 0.15478,
+            "50": 0.15489,
+            "51": 0.1534,
+            "52": 0.40345,
+            "53": 0.14379,
+            "54": 0.31104,
+            "55": 0.14226,
+            "56": 0.23475,
+            "57": 0.31848,
+            "58": 0.1553,
+            "59": 0.15368,
+            "60": 0.24773,
+            "61": 0.26981,
+            "62": 0.14177,
+            "63": 0.15237,
+            "64": 0.18307,
+            "65": 0.23266,
+            "66": 0.24928,
+            "67": 0.36215,
+            "68": 0.15228,
+            "69": 0.21389,
+            "70": 0.35043,
+            "71": 0.14126,
+            "72": 0.3495,
+            "73": 0.23925,
+            "74": 0.23063,
+            "75": 0.14077,
+            "76": 0.14281,
+            "77": 0.14126,
+            "78": 0.14448,
+            "79": 0.14178,
+            "80": 0.22094,
+            "81": 0.13999,
+            "82": 0.30865,
+            "83": 0.14029,
+            "84": 0.15021,
+            "85": 0.14158,
+            "86": 0.14189,
+            "87": 0.14288,
+            "88": 0.22637,
+            "89": 0.14095,
+            "90": 0.23496,
+            "91": 0.18038,
+            "92": 0.14174,
+            "93": 0.1569,
+            "94": 0.34426,
+            "95": 0.14211,
+            "96": 0.14174,
+            "97": 0.14527,
+            "98": 0.14364,
+            "99": 0.1424,
+            "100": 0.21352
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json
index 4771e4e3c8c..facbb05b6ce 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 8.66407,
-            "2": 0.18828,
-            "3": 0.15715,
-            "4": 0.15685,
-            "5": 0.1544,
-            "6": 0.15356,
-            "7": 0.15196,
-            "8": 0.15101,
-            "9": 0.15114,
-            "10": 0.15067,
-            "11": 0.15113,
-            "12": 0.15109,
-            "13": 0.15255,
-            "14": 0.15181,
-            "15": 0.15165,
-            "16": 0.14989,
-            "17": 0.15094,
-            "18": 0.15062,
-            "19": 0.15148,
-            "20": 0.15014,
-            "21": 0.15114,
-            "22": 0.14973,
-            "23": 0.15192,
-            "24": 0.15003,
-            "25": 0.15228,
-            "26": 0.15066,
-            "27": 0.15209,
-            "28": 0.15056,
-            "29": 0.1516,
-            "30": 0.15083,
-            "31": 0.15211,
-            "32": 0.15028,
-            "33": 0.1518,
-            "34": 0.1494,
-            "35": 0.1521,
-            "36": 0.15002,
-            "37": 0.15257,
-            "38": 0.15095,
-            "39": 0.1517,
-            "40": 0.1501,
-            "41": 0.15352,
-            "42": 0.15453,
-            "43": 0.15187,
-            "44": 0.15281,
-            "45": 0.15294,
-            "46": 0.15214,
-            "47": 0.15376,
-            "48": 0.15363,
-            "49": 0.15977,
-            "50": 0.15249,
-            "51": 0.15543,
-            "52": 0.15363,
-            "53": 0.15379,
-            "54": 0.15555,
-            "55": 0.15252,
-            "56": 0.15295,
-            "57": 0.15496,
-            "58": 0.15756,
-            "59": 0.15345,
-            "60": 0.15784,
-            "61": 0.1581,
-            "62": 0.15302,
-            "63": 0.15579,
-            "64": 0.1536,
-            "65": 0.15523,
-            "66": 0.15593,
-            "67": 0.15868,
-            "68": 0.15303,
-            "69": 0.1554,
-            "70": 0.15409,
-            "71": 0.15229,
-            "72": 0.15299,
-            "73": 0.15495,
-            "74": 0.15601,
-            "75": 0.15285,
-            "76": 0.15774,
-            "77": 0.15171,
-            "78": 0.15423,
-            "79": 0.15398,
-            "80": 0.15445,
-            "81": 0.15381,
-            "82": 0.15311,
-            "83": 0.15584,
-            "84": 0.15556,
-            "85": 0.15506,
-            "86": 0.15314,
-            "87": 0.15269,
-            "88": 0.15515,
-            "89": 0.15923,
-            "90": 0.15325,
-            "91": 0.15755,
-            "92": 0.1543,
-            "93": 0.15481,
-            "94": 0.15321,
-            "95": 0.15397,
-            "96": 0.15322,
-            "97": 0.15471,
-            "98": 0.15631,
-            "99": 0.15271,
-            "100": 0.15653
+            "1": 4.52697,
+            "2": 0.21474,
+            "3": 0.18314,
+            "4": 0.16433,
+            "5": 0.16389,
+            "6": 0.16359,
+            "7": 0.16288,
+            "8": 0.16485,
+            "9": 0.16341,
+            "10": 0.16636,
+            "11": 0.16459,
+            "12": 0.16651,
+            "13": 0.16923,
+            "14": 0.16588,
+            "15": 0.16651,
+            "16": 0.16571,
+            "17": 0.16475,
+            "18": 0.16415,
+            "19": 0.16344,
+            "20": 0.16403,
+            "21": 0.16411,
+            "22": 0.16617,
+            "23": 0.16394,
+            "24": 0.16115,
+            "25": 0.16345,
+            "26": 0.16393,
+            "27": 0.16292,
+            "28": 0.16353,
+            "29": 0.1621,
+            "30": 0.1632,
+            "31": 0.16184,
+            "32": 0.16212,
+            "33": 0.16236,
+            "34": 0.16223,
+            "35": 0.16188,
+            "36": 0.16211,
+            "37": 0.16174,
+            "38": 0.16217,
+            "39": 0.16213,
+            "40": 0.16319,
+            "41": 0.1679,
+            "42": 0.17056,
+            "43": 0.16263,
+            "44": 0.1638,
+            "45": 0.16323,
+            "46": 0.16272,
+            "47": 0.16241,
+            "48": 0.16364,
+            "49": 0.16119,
+            "50": 0.16337,
+            "51": 0.16229,
+            "52": 0.16049,
+            "53": 0.16182,
+            "54": 0.15929,
+            "55": 0.15979,
+            "56": 0.15935,
+            "57": 0.15888,
+            "58": 0.16004,
+            "59": 0.15878,
+            "60": 0.15969,
+            "61": 0.16006,
+            "62": 0.15989,
+            "63": 0.15996,
+            "64": 0.15989,
+            "65": 0.15888,
+            "66": 0.15863,
+            "67": 0.15963,
+            "68": 0.15962,
+            "69": 0.15986,
+            "70": 0.15937,
+            "71": 0.15986,
+            "72": 0.15975,
+            "73": 0.16047,
+            "74": 0.15974,
+            "75": 0.1605,
+            "76": 0.15902,
+            "77": 0.16002,
+            "78": 0.15954,
+            "79": 0.16066,
+            "80": 0.15999,
+            "81": 0.15955,
+            "82": 0.15938,
+            "83": 0.16064,
+            "84": 0.15923,
+            "85": 0.15974,
+            "86": 0.1596,
+            "87": 0.16022,
+            "88": 0.15929,
+            "89": 0.15973,
+            "90": 0.16082,
+            "91": 0.15947,
+            "92": 0.16049,
+            "93": 0.1592,
+            "94": 0.15949,
+            "95": 0.16054,
+            "96": 0.1606,
+            "97": 0.15901,
+            "98": 0.15935,
+            "99": 0.16016,
+            "100": 0.15993
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..8a90b6fb7df
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81848,
+            "2": 10.8198,
+            "3": 10.83668,
+            "4": 10.83525,
+            "5": 10.84996,
+            "6": 10.83445,
+            "7": 10.82529,
+            "8": 10.81514,
+            "9": 10.87713,
+            "10": 10.88261,
+            "11": 10.87195,
+            "12": 10.8249,
+            "13": 10.84823,
+            "14": 10.81959,
+            "15": 10.80596,
+            "16": 10.80141,
+            "17": 10.77143,
+            "18": 10.78633,
+            "19": 10.74566,
+            "20": 10.62432,
+            "21": 10.68067,
+            "22": 10.65086,
+            "23": 10.76421,
+            "24": 10.61849,
+            "25": 10.46057,
+            "26": 10.59622,
+            "27": 10.54041,
+            "28": 10.44496,
+            "29": 10.39552,
+            "30": 10.40391,
+            "31": 10.51272,
+            "32": 10.32089,
+            "33": 10.26353,
+            "34": 10.46902,
+            "35": 9.95972,
+            "36": 10.11517,
+            "37": 10.01099,
+            "38": 10.38317,
+            "39": 9.78588,
+            "40": 10.10413,
+            "41": 10.13151,
+            "42": 10.02832,
+            "43": 10.2098,
+            "44": 10.07339,
+            "45": 9.69361,
+            "46": 9.99604,
+            "47": 9.93464,
+            "48": 9.67414,
+            "49": 9.91775,
+            "50": 9.93121
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1118.0,
+            "2": 1206.0,
+            "3": 1308.0,
+            "4": 1243.0,
+            "5": 1256.0,
+            "6": 1296.0,
+            "7": 1259.0,
+            "8": 1023.0,
+            "9": 1295.0,
+            "10": 1319.0,
+            "11": 1282.0,
+            "12": 1361.0,
+            "13": 1336.0,
+            "14": 1176.0,
+            "15": 1188.0,
+            "16": 1255.0,
+            "17": 1182.0,
+            "18": 1341.0,
+            "19": 1043.0,
+            "20": 1099.0,
+            "21": 1248.0,
+            "22": 1233.0,
+            "23": 1369.0,
+            "24": 1365.0,
+            "25": 1073.0,
+            "26": 1245.0,
+            "27": 1211.0,
+            "28": 1306.0,
+            "29": 1317.0,
+            "30": 1426.0,
+            "31": 1476.0,
+            "32": 1399.0,
+            "33": 1444.0,
+            "34": 1483.0,
+            "35": 1242.0,
+            "36": 1326.0,
+            "37": 1447.0,
+            "38": 1542.0,
+            "39": 1342.0,
+            "40": 1560.0,
+            "41": 1611.0,
+            "42": 1607.0,
+            "43": 1651.0,
+            "44": 1594.0,
+            "45": 1499.0,
+            "46": 1744.0,
+            "47": 1571.0,
+            "48": 1523.0,
+            "49": 1629.0,
+            "50": 1747.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 759682560.0,
+            "2": 759682560.0,
+            "3": 759682560.0,
+            "4": 759682560.0,
+            "5": 759682560.0,
+            "6": 759682560.0,
+            "7": 759682560.0,
+            "8": 759682560.0,
+            "9": 759682560.0,
+            "10": 759682560.0,
+            "11": 759682560.0,
+            "12": 759682560.0,
+            "13": 759682560.0,
+            "14": 759682560.0,
+            "15": 759682560.0,
+            "16": 759682560.0,
+            "17": 759682560.0,
+            "18": 759682560.0,
+            "19": 759682560.0,
+            "20": 759682560.0,
+            "21": 759682560.0,
+            "22": 759682560.0,
+            "23": 759682560.0,
+            "24": 759682560.0,
+            "25": 759682560.0,
+            "26": 759682560.0,
+            "27": 759682560.0,
+            "28": 759682560.0,
+            "29": 759682560.0,
+            "30": 759682560.0,
+            "31": 759682560.0,
+            "32": 759682560.0,
+            "33": 759682560.0,
+            "34": 759682560.0,
+            "35": 759682560.0,
+            "36": 759682560.0,
+            "37": 759682560.0,
+            "38": 759682560.0,
+            "39": 759682560.0,
+            "40": 759682560.0,
+            "41": 759682560.0,
+            "42": 759682560.0,
+            "43": 759682560.0,
+            "44": 759682560.0,
+            "45": 759682560.0,
+            "46": 759682560.0,
+            "47": 759682560.0,
+            "48": 759682560.0,
+            "49": 759682560.0,
+            "50": 759682560.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4340903936.0,
+            "2": 4622615552.0,
+            "3": 4622615552.0,
+            "4": 4622615552.0,
+            "5": 4622615552.0,
+            "6": 4622615552.0,
+            "7": 4622615552.0,
+            "8": 4622615552.0,
+            "9": 4622615552.0,
+            "10": 4622615552.0,
+            "11": 4622615552.0,
+            "12": 4622615552.0,
+            "13": 4622615552.0,
+            "14": 4622615552.0,
+            "15": 4622615552.0,
+            "16": 4622615552.0,
+            "17": 4622615552.0,
+            "18": 4622615552.0,
+            "19": 4622615552.0,
+            "20": 4622615552.0,
+            "21": 4622615552.0,
+            "22": 4622615552.0,
+            "23": 4622615552.0,
+            "24": 4622615552.0,
+            "25": 4622615552.0,
+            "26": 4622615552.0,
+            "27": 4622615552.0,
+            "28": 4622615552.0,
+            "29": 4622615552.0,
+            "30": 4622615552.0,
+            "31": 4622615552.0,
+            "32": 4622615552.0,
+            "33": 4622615552.0,
+            "34": 4622615552.0,
+            "35": 4622615552.0,
+            "36": 4622615552.0,
+            "37": 4622615552.0,
+            "38": 4622615552.0,
+            "39": 4622615552.0,
+            "40": 4622615552.0,
+            "41": 4622615552.0,
+            "42": 4622615552.0,
+            "43": 4622615552.0,
+            "44": 4622615552.0,
+            "45": 4622615552.0,
+            "46": 4622615552.0,
+            "47": 4622615552.0,
+            "48": 4622615552.0,
+            "49": 4622615552.0,
+            "50": 4622615552.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.1265,
+            "3": 0.15779,
+            "4": 0.14192,
+            "5": 0.14446,
+            "6": 0.14251,
+            "7": 0.14375,
+            "8": 0.1446,
+            "9": 0.14351,
+            "10": 0.14568,
+            "11": 0.14477,
+            "12": 0.14491,
+            "13": 0.1447,
+            "14": 0.14656,
+            "15": 0.14652,
+            "16": 0.14521,
+            "17": 0.14638,
+            "18": 0.14483,
+            "19": 0.14549,
+            "20": 0.14457,
+            "21": 0.14306,
+            "22": 0.14559,
+            "23": 0.14596,
+            "24": 0.14513,
+            "25": 0.14367,
+            "26": 0.14368,
+            "27": 0.14398,
+            "28": 0.14369,
+            "29": 0.14435,
+            "30": 0.14415,
+            "31": 0.1433,
+            "32": 0.14342,
+            "33": 0.1441,
+            "34": 0.14372,
+            "35": 0.14431,
+            "36": 0.1454,
+            "37": 0.14634,
+            "38": 0.14514,
+            "39": 0.14529,
+            "40": 0.14504,
+            "41": 0.14496,
+            "42": 0.14436,
+            "43": 0.14492,
+            "44": 0.14452,
+            "45": 0.14629,
+            "46": 0.14514,
+            "47": 0.14578,
+            "48": 0.1442,
+            "49": 0.14396,
+            "50": 0.14376
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json
index 3f213856697..399a2c50a8d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.74796,
-            "2": 0.16361,
-            "3": 0.12487,
-            "4": 0.11772,
-            "5": 0.11849,
-            "6": 0.11989,
-            "7": 0.11765,
-            "8": 0.11845,
-            "9": 0.11909,
-            "10": 0.11808,
-            "11": 0.11972,
-            "12": 0.12,
-            "13": 0.11843,
-            "14": 0.11918,
-            "15": 0.11921,
-            "16": 0.11744,
-            "17": 0.11954,
-            "18": 0.11987,
-            "19": 0.12032,
-            "20": 0.11887,
-            "21": 0.16664,
-            "22": 0.14091,
-            "23": 0.11946,
-            "24": 0.11878,
-            "25": 0.12175,
-            "26": 0.16637,
-            "27": 0.12057,
-            "28": 0.11963,
-            "29": 0.11766,
-            "30": 0.11771,
-            "31": 0.11891,
-            "32": 0.11873,
-            "33": 0.12109,
-            "34": 0.12022,
-            "35": 0.11979,
-            "36": 0.12012,
-            "37": 0.11942,
-            "38": 0.12115,
-            "39": 0.1194,
-            "40": 0.12047,
-            "41": 0.12028,
-            "42": 0.12169,
-            "43": 0.12404,
-            "44": 0.12402,
-            "45": 0.12356,
-            "46": 0.12029,
-            "47": 0.11637,
-            "48": 0.11959,
-            "49": 0.11817,
-            "50": 0.12162
+            "1": 10.04337,
+            "2": 0.16822,
+            "3": 0.13237,
+            "4": 0.10427,
+            "5": 0.10319,
+            "6": 0.10424,
+            "7": 0.10225,
+            "8": 0.10398,
+            "9": 0.10251,
+            "10": 0.10246,
+            "11": 0.10345,
+            "12": 0.103,
+            "13": 0.10547,
+            "14": 0.10352,
+            "15": 0.10359,
+            "16": 0.1027,
+            "17": 0.10378,
+            "18": 0.10313,
+            "19": 0.10368,
+            "20": 0.10223,
+            "21": 0.10211,
+            "22": 0.1031,
+            "23": 0.10247,
+            "24": 0.1027,
+            "25": 0.10174,
+            "26": 0.10084,
+            "27": 0.10138,
+            "28": 0.10076,
+            "29": 0.10064,
+            "30": 0.10061,
+            "31": 0.10034,
+            "32": 0.10099,
+            "33": 0.10117,
+            "34": 0.10033,
+            "35": 0.10174,
+            "36": 0.10259,
+            "37": 0.1046,
+            "38": 0.10281,
+            "39": 0.10332,
+            "40": 0.10619,
+            "41": 0.10943,
+            "42": 0.10864,
+            "43": 0.10388,
+            "44": 0.10366,
+            "45": 0.10485,
+            "46": 0.10446,
+            "47": 0.10301,
+            "48": 0.10412,
+            "49": 0.10182,
+            "50": 0.10428
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json
index 16019e9879e..f8dcbbe7370 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.03263,
-            "2": 0.21266,
-            "3": 0.17373,
-            "4": 0.17827,
-            "5": 0.17392,
-            "6": 0.17641,
-            "7": 0.17509,
-            "8": 0.17211,
-            "9": 0.17464,
-            "10": 0.21373,
-            "11": 0.17143,
-            "12": 0.17137,
-            "13": 0.17701,
-            "14": 0.17242,
-            "15": 0.16945,
-            "16": 0.1686,
-            "17": 0.16945,
-            "18": 0.16793,
-            "19": 0.16997,
-            "20": 0.16992,
-            "21": 0.17016,
-            "22": 0.16832,
-            "23": 0.16853,
-            "24": 0.16912,
-            "25": 0.16822,
-            "26": 0.16908,
-            "27": 0.16609,
-            "28": 0.239,
-            "29": 0.16968,
-            "30": 0.16763,
-            "31": 0.16962,
-            "32": 0.16788,
-            "33": 0.1681,
-            "34": 0.16749,
-            "35": 0.16866,
-            "36": 0.1697,
-            "37": 0.16838,
-            "38": 0.16867,
-            "39": 0.16699,
-            "40": 0.17098,
-            "41": 0.1671,
-            "42": 0.17036,
-            "43": 0.16755,
-            "44": 0.16699,
-            "45": 0.1678,
-            "46": 0.17136,
-            "47": 0.16725,
-            "48": 0.17257,
-            "49": 0.16903,
-            "50": 0.1687
+            "1": 4.65524,
+            "2": 0.20203,
+            "3": 0.1867,
+            "4": 0.16962,
+            "5": 0.16879,
+            "6": 0.16945,
+            "7": 0.16988,
+            "8": 0.16975,
+            "9": 0.16924,
+            "10": 0.16948,
+            "11": 0.17005,
+            "12": 0.16958,
+            "13": 0.16927,
+            "14": 0.16868,
+            "15": 0.1691,
+            "16": 0.16964,
+            "17": 0.17076,
+            "18": 0.16992,
+            "19": 0.17012,
+            "20": 0.17014,
+            "21": 0.16937,
+            "22": 0.16994,
+            "23": 0.16976,
+            "24": 0.16985,
+            "25": 0.16941,
+            "26": 0.16946,
+            "27": 0.16954,
+            "28": 0.16999,
+            "29": 0.17047,
+            "30": 0.17035,
+            "31": 0.16906,
+            "32": 0.17029,
+            "33": 0.17019,
+            "34": 0.17057,
+            "35": 0.17053,
+            "36": 0.16952,
+            "37": 0.16983,
+            "38": 0.16978,
+            "39": 0.17145,
+            "40": 0.17013,
+            "41": 0.17043,
+            "42": 0.17038,
+            "43": 0.1705,
+            "44": 0.17028,
+            "45": 0.17067,
+            "46": 0.16968,
+            "47": 0.16977,
+            "48": 0.16977,
+            "49": 0.16921,
+            "50": 0.17026
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..da72109d85d
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93757,
+            "2": 10.92393,
+            "3": 10.94318,
+            "4": 10.93348,
+            "5": 10.93027,
+            "6": 10.92214,
+            "7": 10.9129,
+            "8": 10.92494,
+            "9": 10.94302,
+            "10": 10.92589,
+            "11": 10.89715,
+            "12": 10.91085,
+            "13": 10.91359,
+            "14": 10.90092,
+            "15": 10.87211,
+            "16": 10.86524,
+            "17": 10.869,
+            "18": 10.85374,
+            "19": 10.84295,
+            "20": 10.76663,
+            "21": 10.74374,
+            "22": 10.67695,
+            "23": 10.72701,
+            "24": 10.66494,
+            "25": 10.62546,
+            "26": 10.654,
+            "27": 10.62035,
+            "28": 10.56813,
+            "29": 10.56412,
+            "30": 10.41005,
+            "31": 10.21717,
+            "32": 10.46613,
+            "33": 10.47136,
+            "34": 10.26038,
+            "35": 10.30272,
+            "36": 10.264,
+            "37": 10.35196,
+            "38": 10.22183,
+            "39": 10.38981,
+            "40": 10.11089,
+            "41": 10.13597,
+            "42": 10.21619,
+            "43": 9.89444,
+            "44": 9.985,
+            "45": 9.87317,
+            "46": 9.86222,
+            "47": 10.13614,
+            "48": 9.86196,
+            "49": 9.56912,
+            "50": 9.91564
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 22727242.0,
+            "2": 22924896.0,
+            "3": 22597216.0,
+            "4": 23219716.0,
+            "5": 22714012.0,
+            "6": 23021178.0,
+            "7": 22770808.0,
+            "8": 22926716.0,
+            "9": 22842500.0,
+            "10": 22918960.0,
+            "11": 22500834.0,
+            "12": 22460340.0,
+            "13": 22917536.0,
+            "14": 22388990.0,
+            "15": 22821224.0,
+            "16": 22831266.0,
+            "17": 22819108.0,
+            "18": 22582264.0,
+            "19": 22617384.0,
+            "20": 22693436.0,
+            "21": 22739352.0,
+            "22": 22800104.0,
+            "23": 22539998.0,
+            "24": 22771512.0,
+            "25": 22819132.0,
+            "26": 22547588.0,
+            "27": 22468844.0,
+            "28": 22453516.0,
+            "29": 22529320.0,
+            "30": 22630996.0,
+            "31": 22955520.0,
+            "32": 22585756.0,
+            "33": 22557744.0,
+            "34": 22835696.0,
+            "35": 22787828.0,
+            "36": 22588412.0,
+            "37": 22498040.0,
+            "38": 22896082.0,
+            "39": 22801992.0,
+            "40": 22657536.0,
+            "41": 22659220.0,
+            "42": 22667844.0,
+            "43": 22975904.0,
+            "44": 22745960.0,
+            "45": 22675400.0,
+            "46": 22884844.0,
+            "47": 22633716.0,
+            "48": 22928608.0,
+            "49": 22727282.0,
+            "50": 22904808.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 688128512.0,
+            "2": 688128512.0,
+            "3": 688128512.0,
+            "4": 688128512.0,
+            "5": 688128512.0,
+            "6": 688128512.0,
+            "7": 688128512.0,
+            "8": 688128512.0,
+            "9": 688128512.0,
+            "10": 688128512.0,
+            "11": 688128512.0,
+            "12": 688128512.0,
+            "13": 688128512.0,
+            "14": 688128512.0,
+            "15": 688128512.0,
+            "16": 688128512.0,
+            "17": 688128512.0,
+            "18": 688128512.0,
+            "19": 688128512.0,
+            "20": 688128512.0,
+            "21": 688128512.0,
+            "22": 688128512.0,
+            "23": 688128512.0,
+            "24": 688128512.0,
+            "25": 688128512.0,
+            "26": 688128512.0,
+            "27": 688128512.0,
+            "28": 688128512.0,
+            "29": 688128512.0,
+            "30": 688128512.0,
+            "31": 688128512.0,
+            "32": 688128512.0,
+            "33": 688128512.0,
+            "34": 688128512.0,
+            "35": 688128512.0,
+            "36": 688128512.0,
+            "37": 688128512.0,
+            "38": 688128512.0,
+            "39": 688128512.0,
+            "40": 688128512.0,
+            "41": 688128512.0,
+            "42": 688128512.0,
+            "43": 688128512.0,
+            "44": 688128512.0,
+            "45": 688128512.0,
+            "46": 688128512.0,
+            "47": 688128512.0,
+            "48": 688128512.0,
+            "49": 688128512.0,
+            "50": 688128512.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 2158025216.0,
+            "2": 2416613888.0,
+            "3": 2416613888.0,
+            "4": 2416613888.0,
+            "5": 2416613888.0,
+            "6": 2416613888.0,
+            "7": 2416613888.0,
+            "8": 2416613888.0,
+            "9": 2416613888.0,
+            "10": 2416613888.0,
+            "11": 2416613888.0,
+            "12": 2416613888.0,
+            "13": 2416613888.0,
+            "14": 2416613888.0,
+            "15": 2416613888.0,
+            "16": 2416613888.0,
+            "17": 2416613888.0,
+            "18": 2416613888.0,
+            "19": 2416613888.0,
+            "20": 2416613888.0,
+            "21": 2416613888.0,
+            "22": 2416613888.0,
+            "23": 2416613888.0,
+            "24": 2416613888.0,
+            "25": 2416613888.0,
+            "26": 2416613888.0,
+            "27": 2416613888.0,
+            "28": 2416613888.0,
+            "29": 2416613888.0,
+            "30": 2416613888.0,
+            "31": 2416613888.0,
+            "32": 2416613888.0,
+            "33": 2416613888.0,
+            "34": 2416613888.0,
+            "35": 2416613888.0,
+            "36": 2416613888.0,
+            "37": 2416613888.0,
+            "38": 2416613888.0,
+            "39": 2416613888.0,
+            "40": 2416613888.0,
+            "41": 2416613888.0,
+            "42": 2416613888.0,
+            "43": 2416613888.0,
+            "44": 2416613888.0,
+            "45": 2416613888.0,
+            "46": 2416613888.0,
+            "47": 2416613888.0,
+            "48": 2416613888.0,
+            "49": 2416613888.0,
+            "50": 2416613888.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.44061,
+            "3": 0.15375,
+            "4": 0.14111,
+            "5": 0.14053,
+            "6": 0.14023,
+            "7": 0.14152,
+            "8": 0.14128,
+            "9": 0.1417,
+            "10": 0.14155,
+            "11": 0.14076,
+            "12": 0.1405,
+            "13": 0.14129,
+            "14": 0.14106,
+            "15": 0.14101,
+            "16": 0.14178,
+            "17": 0.14173,
+            "18": 0.14103,
+            "19": 0.14094,
+            "20": 0.14012,
+            "21": 0.14153,
+            "22": 0.14228,
+            "23": 0.14118,
+            "24": 0.14079,
+            "25": 0.14034,
+            "26": 0.14027,
+            "27": 0.13947,
+            "28": 0.13928,
+            "29": 0.1398,
+            "30": 0.14085,
+            "31": 0.14179,
+            "32": 0.13944,
+            "33": 0.14174,
+            "34": 0.1436,
+            "35": 0.13902,
+            "36": 0.13933,
+            "37": 0.13922,
+            "38": 0.13997,
+            "39": 0.13881,
+            "40": 0.13924,
+            "41": 0.1392,
+            "42": 0.14092,
+            "43": 0.14136,
+            "44": 0.14035,
+            "45": 0.13841,
+            "46": 0.1411,
+            "47": 0.13878,
+            "48": 0.14005,
+            "49": 0.13925,
+            "50": 0.13845
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json
index ea2bd7effce..8a17375878f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 689176064.0,
-            "2": 689176064.0,
-            "3": 689176064.0,
-            "4": 689176064.0,
-            "5": 689176064.0,
-            "6": 689176064.0,
-            "7": 689176064.0,
-            "8": 689176064.0,
-            "9": 689176064.0,
-            "10": 689176064.0,
-            "11": 689176064.0,
-            "12": 689176064.0,
-            "13": 689176064.0,
-            "14": 689176064.0,
-            "15": 689176064.0,
-            "16": 689176064.0,
-            "17": 689176064.0,
-            "18": 689176064.0,
-            "19": 689176064.0,
-            "20": 689176064.0,
-            "21": 689176064.0,
-            "22": 689176064.0,
-            "23": 689176064.0,
-            "24": 689176064.0,
-            "25": 689176064.0,
-            "26": 689176064.0,
-            "27": 689176064.0,
-            "28": 689176064.0,
-            "29": 689176064.0,
-            "30": 689176064.0,
-            "31": 689176064.0,
-            "32": 689176064.0,
-            "33": 689176064.0,
-            "34": 689176064.0,
-            "35": 689176064.0,
-            "36": 689176064.0,
-            "37": 689176064.0,
-            "38": 689176064.0,
-            "39": 689176064.0,
-            "40": 689176064.0,
-            "41": 689176064.0,
-            "42": 689176064.0,
-            "43": 689176064.0,
-            "44": 689176064.0,
-            "45": 689176064.0,
-            "46": 689176064.0,
-            "47": 689176064.0,
-            "48": 689176064.0,
-            "49": 689176064.0,
-            "50": 689176064.0
+            "1": 687079936.0,
+            "2": 687079936.0,
+            "3": 687079936.0,
+            "4": 687079936.0,
+            "5": 687079936.0,
+            "6": 687079936.0,
+            "7": 687079936.0,
+            "8": 687079936.0,
+            "9": 687079936.0,
+            "10": 687079936.0,
+            "11": 687079936.0,
+            "12": 687079936.0,
+            "13": 687079936.0,
+            "14": 687079936.0,
+            "15": 687079936.0,
+            "16": 687079936.0,
+            "17": 687079936.0,
+            "18": 687079936.0,
+            "19": 687079936.0,
+            "20": 687079936.0,
+            "21": 687079936.0,
+            "22": 687079936.0,
+            "23": 687079936.0,
+            "24": 687079936.0,
+            "25": 687079936.0,
+            "26": 687079936.0,
+            "27": 687079936.0,
+            "28": 687079936.0,
+            "29": 687079936.0,
+            "30": 687079936.0,
+            "31": 687079936.0,
+            "32": 687079936.0,
+            "33": 687079936.0,
+            "34": 687079936.0,
+            "35": 687079936.0,
+            "36": 687079936.0,
+            "37": 687079936.0,
+            "38": 687079936.0,
+            "39": 687079936.0,
+            "40": 687079936.0,
+            "41": 687079936.0,
+            "42": 687079936.0,
+            "43": 687079936.0,
+            "44": 687079936.0,
+            "45": 687079936.0,
+            "46": 687079936.0,
+            "47": 687079936.0,
+            "48": 687079936.0,
+            "49": 687079936.0,
+            "50": 687079936.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2158024192.0,
-            "2": 2416613888.0,
-            "3": 2416613888.0,
-            "4": 2416613888.0,
-            "5": 2416613888.0,
-            "6": 2416613888.0,
-            "7": 2416613888.0,
-            "8": 2416613888.0,
-            "9": 2416613888.0,
-            "10": 2416613888.0,
-            "11": 2416613888.0,
-            "12": 2416613888.0,
-            "13": 2416613888.0,
-            "14": 2416613888.0,
-            "15": 2416613888.0,
-            "16": 2416613888.0,
-            "17": 2416613888.0,
-            "18": 2416613888.0,
-            "19": 2416613888.0,
-            "20": 2416613888.0,
-            "21": 2416613888.0,
-            "22": 2416613888.0,
-            "23": 2416613888.0,
-            "24": 2416613888.0,
-            "25": 2416613888.0,
-            "26": 2416613888.0,
-            "27": 2416613888.0,
-            "28": 2416613888.0,
-            "29": 2416613888.0,
-            "30": 2416613888.0,
-            "31": 2416613888.0,
-            "32": 2416613888.0,
-            "33": 2416613888.0,
-            "34": 2416613888.0,
-            "35": 2416613888.0,
-            "36": 2416613888.0,
-            "37": 2416613888.0,
-            "38": 2416613888.0,
-            "39": 2416613888.0,
-            "40": 2416613888.0,
-            "41": 2416613888.0,
-            "42": 2416613888.0,
-            "43": 2416613888.0,
-            "44": 2416613888.0,
-            "45": 2416613888.0,
-            "46": 2416613888.0,
-            "47": 2416613888.0,
-            "48": 2416613888.0,
-            "49": 2416613888.0,
-            "50": 2416613888.0
+            "1": 2158025216.0,
+            "2": 2414517760.0,
+            "3": 2414517760.0,
+            "4": 2414517760.0,
+            "5": 2414517760.0,
+            "6": 2414517760.0,
+            "7": 2414517760.0,
+            "8": 2414517760.0,
+            "9": 2414517760.0,
+            "10": 2414517760.0,
+            "11": 2414517760.0,
+            "12": 2414517760.0,
+            "13": 2414517760.0,
+            "14": 2414517760.0,
+            "15": 2414517760.0,
+            "16": 2414517760.0,
+            "17": 2414517760.0,
+            "18": 2414517760.0,
+            "19": 2414517760.0,
+            "20": 2414517760.0,
+            "21": 2414517760.0,
+            "22": 2414517760.0,
+            "23": 2414517760.0,
+            "24": 2414517760.0,
+            "25": 2414517760.0,
+            "26": 2414517760.0,
+            "27": 2414517760.0,
+            "28": 2414517760.0,
+            "29": 2414517760.0,
+            "30": 2414517760.0,
+            "31": 2414517760.0,
+            "32": 2414517760.0,
+            "33": 2414517760.0,
+            "34": 2414517760.0,
+            "35": 2414517760.0,
+            "36": 2414517760.0,
+            "37": 2414517760.0,
+            "38": 2414517760.0,
+            "39": 2414517760.0,
+            "40": 2414517760.0,
+            "41": 2414517760.0,
+            "42": 2414517760.0,
+            "43": 2414517760.0,
+            "44": 2414517760.0,
+            "45": 2414517760.0,
+            "46": 2414517760.0,
+            "47": 2414517760.0,
+            "48": 2414517760.0,
+            "49": 2414517760.0,
+            "50": 2414517760.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.4694,
-            "2": 0.13977,
-            "3": 0.12731,
-            "4": 0.12879,
-            "5": 0.11865,
-            "6": 0.118,
-            "7": 0.11942,
-            "8": 0.11938,
-            "9": 0.11951,
-            "10": 0.11735,
-            "11": 0.11836,
-            "12": 0.11978,
-            "13": 0.11914,
-            "14": 0.11821,
-            "15": 0.11692,
-            "16": 0.11708,
-            "17": 0.11825,
-            "18": 0.11909,
-            "19": 0.11996,
-            "20": 0.11962,
-            "21": 0.12002,
-            "22": 0.11972,
-            "23": 0.11943,
-            "24": 0.11873,
-            "25": 0.11787,
-            "26": 0.1172,
-            "27": 0.11703,
-            "28": 0.12106,
-            "29": 0.11863,
-            "30": 0.11927,
-            "31": 0.11941,
-            "32": 0.11801,
-            "33": 0.11903,
-            "34": 0.1181,
-            "35": 0.11794,
-            "36": 0.11973,
-            "37": 0.11831,
-            "38": 0.11753,
-            "39": 0.11901,
-            "40": 0.11713,
-            "41": 0.11926,
-            "42": 0.11756,
-            "43": 0.1189,
-            "44": 0.11853,
-            "45": 0.12132,
-            "46": 0.11905,
-            "47": 0.11892,
-            "48": 0.11664,
-            "49": 0.11721,
-            "50": 0.11854
+            "1": 11.04447,
+            "2": 0.15303,
+            "3": 0.11363,
+            "4": 0.09774,
+            "5": 0.09666,
+            "6": 0.0975,
+            "7": 0.09718,
+            "8": 0.09631,
+            "9": 0.09764,
+            "10": 0.0962,
+            "11": 0.09842,
+            "12": 0.09595,
+            "13": 0.09748,
+            "14": 0.09614,
+            "15": 0.09539,
+            "16": 0.09589,
+            "17": 0.09791,
+            "18": 0.0971,
+            "19": 0.09598,
+            "20": 0.09703,
+            "21": 0.09477,
+            "22": 0.09625,
+            "23": 0.09521,
+            "24": 0.09591,
+            "25": 0.09662,
+            "26": 0.09594,
+            "27": 0.096,
+            "28": 0.09633,
+            "29": 0.09553,
+            "30": 0.09789,
+            "31": 0.09628,
+            "32": 0.09629,
+            "33": 0.09555,
+            "34": 0.09528,
+            "35": 0.09554,
+            "36": 0.09515,
+            "37": 0.09514,
+            "38": 0.09534,
+            "39": 0.0958,
+            "40": 0.09495,
+            "41": 0.09747,
+            "42": 0.0951,
+            "43": 0.09603,
+            "44": 0.09547,
+            "45": 0.09561,
+            "46": 0.09761,
+            "47": 0.09506,
+            "48": 0.09637,
+            "49": 0.09518,
+            "50": 0.09512
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json
index 775784e5ee0..06a1af0c063 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 659815936.0,
-            "2": 659815936.0,
-            "3": 659815936.0,
-            "4": 659815936.0,
-            "5": 659815936.0,
-            "6": 659815936.0,
-            "7": 659815936.0,
-            "8": 659815936.0,
-            "9": 659815936.0,
-            "10": 659815936.0,
-            "11": 659815936.0,
-            "12": 659815936.0,
-            "13": 659815936.0,
-            "14": 659815936.0,
-            "15": 659815936.0,
-            "16": 659815936.0,
-            "17": 659815936.0,
-            "18": 659815936.0,
-            "19": 659815936.0,
-            "20": 659815936.0,
-            "21": 659815936.0,
-            "22": 659815936.0,
-            "23": 659815936.0,
-            "24": 659815936.0,
-            "25": 659815936.0,
-            "26": 659815936.0,
-            "27": 659815936.0,
-            "28": 659815936.0,
-            "29": 659815936.0,
-            "30": 659815936.0,
-            "31": 659815936.0,
-            "32": 659815936.0,
-            "33": 659815936.0,
-            "34": 659815936.0,
-            "35": 659815936.0,
-            "36": 659815936.0,
-            "37": 659815936.0,
-            "38": 659815936.0,
-            "39": 659815936.0,
-            "40": 659815936.0,
-            "41": 659815936.0,
-            "42": 659815936.0,
-            "43": 659815936.0,
-            "44": 659815936.0,
-            "45": 659815936.0,
-            "46": 659815936.0,
-            "47": 659815936.0,
-            "48": 659815936.0,
-            "49": 659815936.0,
-            "50": 659815936.0
+            "1": 657718784.0,
+            "2": 657718784.0,
+            "3": 657718784.0,
+            "4": 657718784.0,
+            "5": 657718784.0,
+            "6": 657718784.0,
+            "7": 657718784.0,
+            "8": 657718784.0,
+            "9": 657718784.0,
+            "10": 657718784.0,
+            "11": 657718784.0,
+            "12": 657718784.0,
+            "13": 657718784.0,
+            "14": 657718784.0,
+            "15": 657718784.0,
+            "16": 657718784.0,
+            "17": 657718784.0,
+            "18": 657718784.0,
+            "19": 657718784.0,
+            "20": 657718784.0,
+            "21": 657718784.0,
+            "22": 657718784.0,
+            "23": 657718784.0,
+            "24": 657718784.0,
+            "25": 657718784.0,
+            "26": 657718784.0,
+            "27": 657718784.0,
+            "28": 657718784.0,
+            "29": 657718784.0,
+            "30": 657718784.0,
+            "31": 657718784.0,
+            "32": 657718784.0,
+            "33": 657718784.0,
+            "34": 657718784.0,
+            "35": 657718784.0,
+            "36": 657718784.0,
+            "37": 657718784.0,
+            "38": 657718784.0,
+            "39": 657718784.0,
+            "40": 657718784.0,
+            "41": 657718784.0,
+            "42": 657718784.0,
+            "43": 657718784.0,
+            "44": 657718784.0,
+            "45": 657718784.0,
+            "46": 657718784.0,
+            "47": 657718784.0,
+            "48": 657718784.0,
+            "49": 657718784.0,
+            "50": 657718784.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -176,55 +176,55 @@
         "step_interval": 1,
         "values": {
             "1": 2128664064.0,
-            "2": 2387253760.0,
-            "3": 2387253760.0,
-            "4": 2387253760.0,
-            "5": 2387253760.0,
-            "6": 2387253760.0,
-            "7": 2387253760.0,
-            "8": 2387253760.0,
-            "9": 2387253760.0,
-            "10": 2387253760.0,
-            "11": 2387253760.0,
-            "12": 2387253760.0,
-            "13": 2387253760.0,
-            "14": 2387253760.0,
-            "15": 2387253760.0,
-            "16": 2387253760.0,
-            "17": 2387253760.0,
-            "18": 2387253760.0,
-            "19": 2387253760.0,
-            "20": 2387253760.0,
-            "21": 2387253760.0,
-            "22": 2387253760.0,
-            "23": 2387253760.0,
-            "24": 2387253760.0,
-            "25": 2387253760.0,
-            "26": 2387253760.0,
-            "27": 2387253760.0,
-            "28": 2387253760.0,
-            "29": 2387253760.0,
-            "30": 2387253760.0,
-            "31": 2387253760.0,
-            "32": 2387253760.0,
-            "33": 2387253760.0,
-            "34": 2387253760.0,
-            "35": 2387253760.0,
-            "36": 2387253760.0,
-            "37": 2387253760.0,
-            "38": 2387253760.0,
-            "39": 2387253760.0,
-            "40": 2387253760.0,
-            "41": 2387253760.0,
-            "42": 2387253760.0,
-            "43": 2387253760.0,
-            "44": 2387253760.0,
-            "45": 2387253760.0,
-            "46": 2387253760.0,
-            "47": 2387253760.0,
-            "48": 2387253760.0,
-            "49": 2387253760.0,
-            "50": 2387253760.0
+            "2": 2385156608.0,
+            "3": 2385156608.0,
+            "4": 2385156608.0,
+            "5": 2385156608.0,
+            "6": 2385156608.0,
+            "7": 2385156608.0,
+            "8": 2385156608.0,
+            "9": 2385156608.0,
+            "10": 2385156608.0,
+            "11": 2385156608.0,
+            "12": 2385156608.0,
+            "13": 2385156608.0,
+            "14": 2385156608.0,
+            "15": 2385156608.0,
+            "16": 2385156608.0,
+            "17": 2385156608.0,
+            "18": 2385156608.0,
+            "19": 2385156608.0,
+            "20": 2385156608.0,
+            "21": 2385156608.0,
+            "22": 2385156608.0,
+            "23": 2385156608.0,
+            "24": 2385156608.0,
+            "25": 2385156608.0,
+            "26": 2385156608.0,
+            "27": 2385156608.0,
+            "28": 2385156608.0,
+            "29": 2385156608.0,
+            "30": 2385156608.0,
+            "31": 2385156608.0,
+            "32": 2385156608.0,
+            "33": 2385156608.0,
+            "34": 2385156608.0,
+            "35": 2385156608.0,
+            "36": 2385156608.0,
+            "37": 2385156608.0,
+            "38": 2385156608.0,
+            "39": 2385156608.0,
+            "40": 2385156608.0,
+            "41": 2385156608.0,
+            "42": 2385156608.0,
+            "43": 2385156608.0,
+            "44": 2385156608.0,
+            "45": 2385156608.0,
+            "46": 2385156608.0,
+            "47": 2385156608.0,
+            "48": 2385156608.0,
+            "49": 2385156608.0,
+            "50": 2385156608.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.48419,
-            "2": 0.19482,
-            "3": 0.26748,
-            "4": 0.1633,
-            "5": 0.15828,
-            "6": 0.15656,
-            "7": 0.1572,
-            "8": 0.15759,
-            "9": 0.15735,
-            "10": 0.15751,
-            "11": 0.15648,
-            "12": 0.15605,
-            "13": 0.15693,
-            "14": 0.15672,
-            "15": 0.15676,
-            "16": 0.15664,
-            "17": 0.15683,
-            "18": 0.15646,
-            "19": 0.15696,
-            "20": 0.15623,
-            "21": 0.15652,
-            "22": 0.15759,
-            "23": 0.15729,
-            "24": 0.15687,
-            "25": 0.15563,
-            "26": 0.1575,
-            "27": 0.15616,
-            "28": 0.15855,
-            "29": 0.15771,
-            "30": 0.15851,
-            "31": 0.1579,
-            "32": 0.1587,
-            "33": 0.1577,
-            "34": 0.15827,
-            "35": 0.15808,
-            "36": 0.15825,
-            "37": 0.1583,
-            "38": 0.15836,
-            "39": 0.15797,
-            "40": 0.15829,
-            "41": 0.15787,
-            "42": 0.15789,
-            "43": 0.15839,
-            "44": 0.15862,
-            "45": 0.15727,
-            "46": 0.15919,
-            "47": 0.15859,
-            "48": 0.15898,
-            "49": 0.15832,
-            "50": 0.1586
+            "1": 4.20554,
+            "2": 0.17937,
+            "3": 0.16839,
+            "4": 0.15493,
+            "5": 0.15446,
+            "6": 0.15139,
+            "7": 0.15305,
+            "8": 0.15189,
+            "9": 0.15216,
+            "10": 0.15599,
+            "11": 0.15357,
+            "12": 0.15419,
+            "13": 0.15436,
+            "14": 0.15288,
+            "15": 0.15253,
+            "16": 0.15223,
+            "17": 0.15315,
+            "18": 0.15292,
+            "19": 0.15296,
+            "20": 0.15256,
+            "21": 0.15297,
+            "22": 0.15389,
+            "23": 0.15399,
+            "24": 0.15299,
+            "25": 0.15347,
+            "26": 0.15651,
+            "27": 0.15552,
+            "28": 0.15444,
+            "29": 0.15801,
+            "30": 0.15708,
+            "31": 0.15903,
+            "32": 0.15742,
+            "33": 0.15743,
+            "34": 0.15818,
+            "35": 0.15832,
+            "36": 0.15788,
+            "37": 0.1571,
+            "38": 0.15852,
+            "39": 0.15701,
+            "40": 0.15794,
+            "41": 0.15813,
+            "42": 0.15763,
+            "43": 0.15873,
+            "44": 0.15814,
+            "45": 0.15802,
+            "46": 0.15831,
+            "47": 0.1573,
+            "48": 0.1585,
+            "49": 0.15823,
+            "50": 0.15801
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..28de0d56b1b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.85072,
+            "2": 10.87664,
+            "3": 10.85783,
+            "4": 10.84306,
+            "5": 10.88146,
+            "6": 10.87139,
+            "7": 10.89191,
+            "8": 10.85963,
+            "9": 10.86934,
+            "10": 10.8278,
+            "11": 10.90311,
+            "12": 10.87801,
+            "13": 10.87305,
+            "14": 10.89032,
+            "15": 10.87011,
+            "16": 10.8511,
+            "17": 10.84459,
+            "18": 10.84726,
+            "19": 10.86383,
+            "20": 10.82208,
+            "21": 10.79825,
+            "22": 10.73204,
+            "23": 10.81839,
+            "24": 10.74606,
+            "25": 10.71761,
+            "26": 10.77202,
+            "27": 10.77401,
+            "28": 10.72063,
+            "29": 10.72787,
+            "30": 10.59722,
+            "31": 10.42528,
+            "32": 10.6597,
+            "33": 10.6513,
+            "34": 10.49325,
+            "35": 10.52835,
+            "36": 10.49365,
+            "37": 10.57261,
+            "38": 10.44872,
+            "39": 10.58148,
+            "40": 10.32557,
+            "41": 10.36356,
+            "42": 10.41806,
+            "43": 10.12507,
+            "44": 10.22734,
+            "45": 10.12083,
+            "46": 10.10118,
+            "47": 10.36102,
+            "48": 10.09786,
+            "49": 9.8396,
+            "50": 10.15591
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 22727256.0,
+            "2": 22925250.0,
+            "3": 22596852.0,
+            "4": 23219000.0,
+            "5": 22714020.0,
+            "6": 23020792.0,
+            "7": 22771170.0,
+            "8": 22926228.0,
+            "9": 22842640.0,
+            "10": 22918308.0,
+            "11": 22499960.0,
+            "12": 22459596.0,
+            "13": 22916016.0,
+            "14": 22388008.0,
+            "15": 22821540.0,
+            "16": 22830500.0,
+            "17": 22818592.0,
+            "18": 22582030.0,
+            "19": 22617218.0,
+            "20": 22693536.0,
+            "21": 22739118.0,
+            "22": 22798904.0,
+            "23": 22538834.0,
+            "24": 22770708.0,
+            "25": 22818172.0,
+            "26": 22547374.0,
+            "27": 22467964.0,
+            "28": 22452370.0,
+            "29": 22528234.0,
+            "30": 22630740.0,
+            "31": 22954650.0,
+            "32": 22584568.0,
+            "33": 22557506.0,
+            "34": 22835004.0,
+            "35": 22787526.0,
+            "36": 22588580.0,
+            "37": 22496788.0,
+            "38": 22895632.0,
+            "39": 22800112.0,
+            "40": 22657224.0,
+            "41": 22658160.0,
+            "42": 22666840.0,
+            "43": 22975312.0,
+            "44": 22745190.0,
+            "45": 22674440.0,
+            "46": 22883296.0,
+            "47": 22633056.0,
+            "48": 22927568.0,
+            "49": 22727008.0,
+            "50": 22903184.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 640822784.0,
+            "2": 640822784.0,
+            "3": 640822784.0,
+            "4": 640822784.0,
+            "5": 640822784.0,
+            "6": 640822784.0,
+            "7": 640822784.0,
+            "8": 640822784.0,
+            "9": 640822784.0,
+            "10": 640822784.0,
+            "11": 640822784.0,
+            "12": 640822784.0,
+            "13": 640822784.0,
+            "14": 640822784.0,
+            "15": 640822784.0,
+            "16": 640822784.0,
+            "17": 640822784.0,
+            "18": 640822784.0,
+            "19": 640822784.0,
+            "20": 640822784.0,
+            "21": 640822784.0,
+            "22": 640822784.0,
+            "23": 640822784.0,
+            "24": 640822784.0,
+            "25": 640822784.0,
+            "26": 640822784.0,
+            "27": 640822784.0,
+            "28": 640822784.0,
+            "29": 640822784.0,
+            "30": 640822784.0,
+            "31": 640822784.0,
+            "32": 640822784.0,
+            "33": 640822784.0,
+            "34": 640822784.0,
+            "35": 640822784.0,
+            "36": 640822784.0,
+            "37": 640822784.0,
+            "38": 640822784.0,
+            "39": 640822784.0,
+            "40": 640822784.0,
+            "41": 640822784.0,
+            "42": 640822784.0,
+            "43": 640822784.0,
+            "44": 640822784.0,
+            "45": 640822784.0,
+            "46": 640822784.0,
+            "47": 640822784.0,
+            "48": 640822784.0,
+            "49": 640822784.0,
+            "50": 640822784.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 2610027008.0,
+            "2": 2842349056.0,
+            "3": 2842349056.0,
+            "4": 2842349056.0,
+            "5": 2842349056.0,
+            "6": 2842349056.0,
+            "7": 2842349056.0,
+            "8": 2842349056.0,
+            "9": 2842349056.0,
+            "10": 2842349056.0,
+            "11": 2842349056.0,
+            "12": 2842349056.0,
+            "13": 2842349056.0,
+            "14": 2842349056.0,
+            "15": 2842349056.0,
+            "16": 2842349056.0,
+            "17": 2842349056.0,
+            "18": 2842349056.0,
+            "19": 2842349056.0,
+            "20": 2842349056.0,
+            "21": 2842349056.0,
+            "22": 2842349056.0,
+            "23": 2842349056.0,
+            "24": 2842349056.0,
+            "25": 2842349056.0,
+            "26": 2842349056.0,
+            "27": 2842349056.0,
+            "28": 2842349056.0,
+            "29": 2842349056.0,
+            "30": 2842349056.0,
+            "31": 2842349056.0,
+            "32": 2842349056.0,
+            "33": 2842349056.0,
+            "34": 2842349056.0,
+            "35": 2842349056.0,
+            "36": 2842349056.0,
+            "37": 2842349056.0,
+            "38": 2842349056.0,
+            "39": 2842349056.0,
+            "40": 2842349056.0,
+            "41": 2842349056.0,
+            "42": 2842349056.0,
+            "43": 2842349056.0,
+            "44": 2842349056.0,
+            "45": 2842349056.0,
+            "46": 2842349056.0,
+            "47": 2842349056.0,
+            "48": 2842349056.0,
+            "49": 2842349056.0,
+            "50": 2842349056.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.91983,
+            "3": 0.07964,
+            "4": 0.0755,
+            "5": 0.06521,
+            "6": 0.07949,
+            "7": 0.0691,
+            "8": 0.06527,
+            "9": 0.09221,
+            "10": 0.0948,
+            "11": 0.07486,
+            "12": 0.06312,
+            "13": 0.06422,
+            "14": 0.0656,
+            "15": 0.07274,
+            "16": 0.06384,
+            "17": 0.06441,
+            "18": 0.06446,
+            "19": 0.06349,
+            "20": 0.06319,
+            "21": 0.06302,
+            "22": 0.06467,
+            "23": 0.06428,
+            "24": 0.06395,
+            "25": 0.06411,
+            "26": 0.06398,
+            "27": 0.06336,
+            "28": 0.06345,
+            "29": 0.07201,
+            "30": 0.06458,
+            "31": 0.06379,
+            "32": 0.06337,
+            "33": 0.06262,
+            "34": 0.06257,
+            "35": 0.06407,
+            "36": 0.06308,
+            "37": 0.06324,
+            "38": 0.06353,
+            "39": 0.06346,
+            "40": 0.06294,
+            "41": 0.06471,
+            "42": 0.06426,
+            "43": 0.06446,
+            "44": 0.06426,
+            "45": 0.06337,
+            "46": 0.06427,
+            "47": 0.06421,
+            "48": 0.06315,
+            "49": 0.0639,
+            "50": 0.06324
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json
index 8f65ccec75e..bf7a46b3f3c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 638724608.0,
-            "2": 638724608.0,
-            "3": 638724608.0,
-            "4": 638724608.0,
-            "5": 638724608.0,
-            "6": 638724608.0,
-            "7": 638724608.0,
-            "8": 638724608.0,
-            "9": 638724608.0,
-            "10": 638724608.0,
-            "11": 638724608.0,
-            "12": 638724608.0,
-            "13": 638724608.0,
-            "14": 638724608.0,
-            "15": 638724608.0,
-            "16": 638724608.0,
-            "17": 638724608.0,
-            "18": 638724608.0,
-            "19": 638724608.0,
-            "20": 638724608.0,
-            "21": 638724608.0,
-            "22": 638724608.0,
-            "23": 638724608.0,
-            "24": 638724608.0,
-            "25": 638724608.0,
-            "26": 638724608.0,
-            "27": 638724608.0,
-            "28": 638724608.0,
-            "29": 638724608.0,
-            "30": 638724608.0,
-            "31": 638724608.0,
-            "32": 638724608.0,
-            "33": 638724608.0,
-            "34": 638724608.0,
-            "35": 638724608.0,
-            "36": 638724608.0,
-            "37": 638724608.0,
-            "38": 638724608.0,
-            "39": 638724608.0,
-            "40": 638724608.0,
-            "41": 638724608.0,
-            "42": 638724608.0,
-            "43": 638724608.0,
-            "44": 638724608.0,
-            "45": 638724608.0,
-            "46": 638724608.0,
-            "47": 638724608.0,
-            "48": 638724608.0,
-            "49": 638724608.0,
-            "50": 638724608.0
+            "1": 640822784.0,
+            "2": 640822784.0,
+            "3": 640822784.0,
+            "4": 640822784.0,
+            "5": 640822784.0,
+            "6": 640822784.0,
+            "7": 640822784.0,
+            "8": 640822784.0,
+            "9": 640822784.0,
+            "10": 640822784.0,
+            "11": 640822784.0,
+            "12": 640822784.0,
+            "13": 640822784.0,
+            "14": 640822784.0,
+            "15": 640822784.0,
+            "16": 640822784.0,
+            "17": 640822784.0,
+            "18": 640822784.0,
+            "19": 640822784.0,
+            "20": 640822784.0,
+            "21": 640822784.0,
+            "22": 640822784.0,
+            "23": 640822784.0,
+            "24": 640822784.0,
+            "25": 640822784.0,
+            "26": 640822784.0,
+            "27": 640822784.0,
+            "28": 640822784.0,
+            "29": 640822784.0,
+            "30": 641740288.0,
+            "31": 640822784.0,
+            "32": 640822784.0,
+            "33": 640822784.0,
+            "34": 640822784.0,
+            "35": 640822784.0,
+            "36": 640822784.0,
+            "37": 640822784.0,
+            "38": 640822784.0,
+            "39": 640822784.0,
+            "40": 640822784.0,
+            "41": 640822784.0,
+            "42": 640822784.0,
+            "43": 640822784.0,
+            "44": 640822784.0,
+            "45": 640822784.0,
+            "46": 640822784.0,
+            "47": 640822784.0,
+            "48": 641740288.0,
+            "49": 640822784.0,
+            "50": 640822784.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2610025984.0,
-            "2": 2840250880.0,
-            "3": 2840250880.0,
-            "4": 2840250880.0,
-            "5": 2840250880.0,
-            "6": 2840250880.0,
-            "7": 2840250880.0,
-            "8": 2840250880.0,
-            "9": 2840250880.0,
-            "10": 2840250880.0,
-            "11": 2840250880.0,
-            "12": 2840250880.0,
-            "13": 2840250880.0,
-            "14": 2840250880.0,
-            "15": 2840250880.0,
-            "16": 2840250880.0,
-            "17": 2840250880.0,
-            "18": 2840250880.0,
-            "19": 2840250880.0,
-            "20": 2840250880.0,
-            "21": 2840250880.0,
-            "22": 2840250880.0,
-            "23": 2840250880.0,
-            "24": 2840250880.0,
-            "25": 2840250880.0,
-            "26": 2840250880.0,
-            "27": 2840250880.0,
-            "28": 2840250880.0,
-            "29": 2840250880.0,
-            "30": 2840250880.0,
-            "31": 2840250880.0,
-            "32": 2840250880.0,
-            "33": 2840250880.0,
-            "34": 2840250880.0,
-            "35": 2840250880.0,
-            "36": 2840250880.0,
-            "37": 2840250880.0,
-            "38": 2840250880.0,
-            "39": 2840250880.0,
-            "40": 2840250880.0,
-            "41": 2840250880.0,
-            "42": 2840250880.0,
-            "43": 2840250880.0,
-            "44": 2840250880.0,
-            "45": 2840250880.0,
-            "46": 2840250880.0,
-            "47": 2840250880.0,
-            "48": 2840250880.0,
-            "49": 2840250880.0,
-            "50": 2840250880.0
+            "1": 2610027008.0,
+            "2": 2842349056.0,
+            "3": 2842349056.0,
+            "4": 2843266560.0,
+            "5": 2843266560.0,
+            "6": 2843266560.0,
+            "7": 2843266560.0,
+            "8": 2843266560.0,
+            "9": 2843266560.0,
+            "10": 2843266560.0,
+            "11": 2843266560.0,
+            "12": 2843266560.0,
+            "13": 2843266560.0,
+            "14": 2843266560.0,
+            "15": 2843266560.0,
+            "16": 2843266560.0,
+            "17": 2843266560.0,
+            "18": 2843266560.0,
+            "19": 2843266560.0,
+            "20": 2843266560.0,
+            "21": 2843266560.0,
+            "22": 2843266560.0,
+            "23": 2843266560.0,
+            "24": 2843266560.0,
+            "25": 2843266560.0,
+            "26": 2843266560.0,
+            "27": 2843266560.0,
+            "28": 2843266560.0,
+            "29": 2843266560.0,
+            "30": 2843266560.0,
+            "31": 2843266560.0,
+            "32": 2843266560.0,
+            "33": 2843266560.0,
+            "34": 2843266560.0,
+            "35": 2843266560.0,
+            "36": 2843266560.0,
+            "37": 2843266560.0,
+            "38": 2843266560.0,
+            "39": 2843266560.0,
+            "40": 2843266560.0,
+            "41": 2843266560.0,
+            "42": 2843266560.0,
+            "43": 2843266560.0,
+            "44": 2843266560.0,
+            "45": 2843266560.0,
+            "46": 2843266560.0,
+            "47": 2843266560.0,
+            "48": 2843266560.0,
+            "49": 2843266560.0,
+            "50": 2843266560.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 12.45868,
-            "2": 0.10817,
-            "3": 0.08964,
-            "4": 0.08342,
-            "5": 0.08198,
-            "6": 0.08179,
-            "7": 0.08172,
-            "8": 0.08319,
-            "9": 0.07964,
-            "10": 0.07872,
-            "11": 0.07783,
-            "12": 0.07839,
-            "13": 0.07961,
-            "14": 0.07913,
-            "15": 0.08021,
-            "16": 0.07965,
-            "17": 0.07946,
-            "18": 0.07924,
-            "19": 0.0792,
-            "20": 0.07919,
-            "21": 0.07872,
-            "22": 0.07958,
-            "23": 0.07857,
-            "24": 0.0793,
-            "25": 0.07936,
-            "26": 0.07956,
-            "27": 0.07904,
-            "28": 0.07939,
-            "29": 0.08007,
-            "30": 0.07912,
-            "31": 0.07945,
-            "32": 0.07845,
-            "33": 0.07804,
-            "34": 0.07801,
-            "35": 0.07775,
-            "36": 0.07835,
-            "37": 0.0781,
-            "38": 0.07939,
-            "39": 0.07789,
-            "40": 0.07803,
-            "41": 0.07935,
-            "42": 0.07838,
-            "43": 0.07862,
-            "44": 0.07884,
-            "45": 0.07747,
-            "46": 0.07832,
-            "47": 0.07792,
-            "48": 0.07896,
-            "49": 0.07798,
-            "50": 0.0779
+            "1": 11.63091,
+            "2": 0.10057,
+            "3": 0.08189,
+            "4": 0.05797,
+            "5": 0.05721,
+            "6": 0.05698,
+            "7": 0.05706,
+            "8": 0.05717,
+            "9": 0.05757,
+            "10": 0.05769,
+            "11": 0.05657,
+            "12": 0.05708,
+            "13": 0.05676,
+            "14": 0.05712,
+            "15": 0.05745,
+            "16": 0.05704,
+            "17": 0.05756,
+            "18": 0.05699,
+            "19": 0.05682,
+            "20": 0.05715,
+            "21": 0.0569,
+            "22": 0.05766,
+            "23": 0.0572,
+            "24": 0.05719,
+            "25": 0.05674,
+            "26": 0.05685,
+            "27": 0.05682,
+            "28": 0.05657,
+            "29": 0.0565,
+            "30": 0.05693,
+            "31": 0.05726,
+            "32": 0.05673,
+            "33": 0.05675,
+            "34": 0.05664,
+            "35": 0.05717,
+            "36": 0.05653,
+            "37": 0.05652,
+            "38": 0.05671,
+            "39": 0.05659,
+            "40": 0.05731,
+            "41": 0.05949,
+            "42": 0.05669,
+            "43": 0.05723,
+            "44": 0.05695,
+            "45": 0.05766,
+            "46": 0.05736,
+            "47": 0.05802,
+            "48": 0.05662,
+            "49": 0.05689,
+            "50": 0.05838
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json
index 44d53d6e9d6..7995900ad8f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 609364480.0,
-            "2": 609364480.0,
-            "3": 609364480.0,
-            "4": 609364480.0,
-            "5": 609364480.0,
-            "6": 609364480.0,
-            "7": 609364480.0,
-            "8": 609364480.0,
-            "9": 609364480.0,
-            "10": 609364480.0,
-            "11": 609364480.0,
-            "12": 609364480.0,
-            "13": 609364480.0,
-            "14": 609364480.0,
-            "15": 609364480.0,
-            "16": 609364480.0,
-            "17": 609364480.0,
-            "18": 609364480.0,
-            "19": 609364480.0,
-            "20": 609364480.0,
-            "21": 609364480.0,
-            "22": 609364480.0,
-            "23": 609364480.0,
-            "24": 609364480.0,
-            "25": 609364480.0,
-            "26": 609364480.0,
-            "27": 609364480.0,
-            "28": 609364480.0,
-            "29": 609364480.0,
-            "30": 609364480.0,
-            "31": 609364480.0,
-            "32": 609364480.0,
-            "33": 609364480.0,
-            "34": 609364480.0,
-            "35": 609364480.0,
-            "36": 609364480.0,
-            "37": 609364480.0,
-            "38": 609364480.0,
-            "39": 609364480.0,
-            "40": 609364480.0,
-            "41": 609364480.0,
-            "42": 609364480.0,
-            "43": 609364480.0,
-            "44": 609364480.0,
-            "45": 609364480.0,
-            "46": 609364480.0,
-            "47": 609364480.0,
-            "48": 609364480.0,
-            "49": 609364480.0,
-            "50": 609364480.0
+            "1": 611461632.0,
+            "2": 611461632.0,
+            "3": 611461632.0,
+            "4": 611461632.0,
+            "5": 611461632.0,
+            "6": 611461632.0,
+            "7": 611461632.0,
+            "8": 611461632.0,
+            "9": 611461632.0,
+            "10": 611461632.0,
+            "11": 611461632.0,
+            "12": 611461632.0,
+            "13": 611461632.0,
+            "14": 611461632.0,
+            "15": 611461632.0,
+            "16": 611461632.0,
+            "17": 611461632.0,
+            "18": 611461632.0,
+            "19": 611461632.0,
+            "20": 611461632.0,
+            "21": 611461632.0,
+            "22": 611461632.0,
+            "23": 611461632.0,
+            "24": 611461632.0,
+            "25": 611461632.0,
+            "26": 611461632.0,
+            "27": 611461632.0,
+            "28": 611461632.0,
+            "29": 611461632.0,
+            "30": 611461632.0,
+            "31": 611461632.0,
+            "32": 611461632.0,
+            "33": 611461632.0,
+            "34": 611461632.0,
+            "35": 611461632.0,
+            "36": 611461632.0,
+            "37": 611461632.0,
+            "38": 611461632.0,
+            "39": 611461632.0,
+            "40": 611461632.0,
+            "41": 611461632.0,
+            "42": 611461632.0,
+            "43": 611461632.0,
+            "44": 611461632.0,
+            "45": 611461632.0,
+            "46": 611461632.0,
+            "47": 611461632.0,
+            "48": 611461632.0,
+            "49": 611461632.0,
+            "50": 611461632.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -176,55 +176,55 @@
         "step_interval": 1,
         "values": {
             "1": 2580665856.0,
-            "2": 2810890752.0,
-            "3": 2811808256.0,
-            "4": 2811808256.0,
-            "5": 2811808256.0,
-            "6": 2811808256.0,
-            "7": 2811808256.0,
-            "8": 2811808256.0,
-            "9": 2811808256.0,
-            "10": 2811808256.0,
-            "11": 2811808256.0,
-            "12": 2811808256.0,
-            "13": 2811808256.0,
-            "14": 2811808256.0,
-            "15": 2811808256.0,
-            "16": 2811808256.0,
-            "17": 2811808256.0,
-            "18": 2811808256.0,
-            "19": 2811808256.0,
-            "20": 2811808256.0,
-            "21": 2811808256.0,
-            "22": 2811808256.0,
-            "23": 2811808256.0,
-            "24": 2811808256.0,
-            "25": 2811808256.0,
-            "26": 2811808256.0,
-            "27": 2811808256.0,
-            "28": 2811808256.0,
-            "29": 2811808256.0,
-            "30": 2811808256.0,
-            "31": 2811808256.0,
-            "32": 2811808256.0,
-            "33": 2811808256.0,
-            "34": 2811808256.0,
-            "35": 2811808256.0,
-            "36": 2811808256.0,
-            "37": 2811808256.0,
-            "38": 2811808256.0,
-            "39": 2811808256.0,
-            "40": 2811808256.0,
-            "41": 2811808256.0,
-            "42": 2811808256.0,
-            "43": 2811808256.0,
-            "44": 2811808256.0,
-            "45": 2811808256.0,
-            "46": 2811808256.0,
-            "47": 2811808256.0,
-            "48": 2811808256.0,
-            "49": 2811808256.0,
-            "50": 2811808256.0
+            "2": 2812987904.0,
+            "3": 2812987904.0,
+            "4": 2812987904.0,
+            "5": 2812987904.0,
+            "6": 2812987904.0,
+            "7": 2812987904.0,
+            "8": 2812987904.0,
+            "9": 2812987904.0,
+            "10": 2812987904.0,
+            "11": 2812987904.0,
+            "12": 2812987904.0,
+            "13": 2812987904.0,
+            "14": 2812987904.0,
+            "15": 2812987904.0,
+            "16": 2812987904.0,
+            "17": 2812987904.0,
+            "18": 2812987904.0,
+            "19": 2812987904.0,
+            "20": 2812987904.0,
+            "21": 2812987904.0,
+            "22": 2812987904.0,
+            "23": 2812987904.0,
+            "24": 2812987904.0,
+            "25": 2812987904.0,
+            "26": 2812987904.0,
+            "27": 2812987904.0,
+            "28": 2812987904.0,
+            "29": 2812987904.0,
+            "30": 2812987904.0,
+            "31": 2812987904.0,
+            "32": 2812987904.0,
+            "33": 2812987904.0,
+            "34": 2812987904.0,
+            "35": 2812987904.0,
+            "36": 2812987904.0,
+            "37": 2812987904.0,
+            "38": 2812987904.0,
+            "39": 2812987904.0,
+            "40": 2812987904.0,
+            "41": 2812987904.0,
+            "42": 2812987904.0,
+            "43": 2812987904.0,
+            "44": 2812987904.0,
+            "45": 2812987904.0,
+            "46": 2812987904.0,
+            "47": 2812987904.0,
+            "48": 2812987904.0,
+            "49": 2812987904.0,
+            "50": 2812987904.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.118,
-            "2": 0.12375,
-            "3": 0.31133,
-            "4": 0.09209,
-            "5": 0.09124,
-            "6": 0.09155,
-            "7": 0.09163,
-            "8": 0.0915,
-            "9": 0.09161,
-            "10": 0.09407,
-            "11": 0.09038,
-            "12": 0.09031,
-            "13": 0.09069,
-            "14": 0.09024,
-            "15": 0.09043,
-            "16": 0.08996,
-            "17": 0.09133,
-            "18": 0.09072,
-            "19": 0.09048,
-            "20": 0.09016,
-            "21": 0.09061,
-            "22": 0.09073,
-            "23": 0.09098,
-            "24": 0.09135,
-            "25": 0.09235,
-            "26": 0.09059,
-            "27": 0.09009,
-            "28": 0.09049,
-            "29": 0.09147,
-            "30": 0.09097,
-            "31": 0.09098,
-            "32": 0.09045,
-            "33": 0.09082,
-            "34": 0.08994,
-            "35": 0.09054,
-            "36": 0.09124,
-            "37": 0.09063,
-            "38": 0.08989,
-            "39": 0.09234,
-            "40": 0.09165,
-            "41": 0.09179,
-            "42": 0.09165,
-            "43": 0.09235,
-            "44": 0.09147,
-            "45": 0.0922,
-            "46": 0.09192,
-            "47": 0.09138,
-            "48": 0.09278,
-            "49": 0.09145,
-            "50": 0.09175
+            "1": 5.29488,
+            "2": 0.12291,
+            "3": 0.10694,
+            "4": 0.09161,
+            "5": 0.09138,
+            "6": 0.09229,
+            "7": 0.09025,
+            "8": 0.08872,
+            "9": 0.08988,
+            "10": 0.08934,
+            "11": 0.08865,
+            "12": 0.08864,
+            "13": 0.08947,
+            "14": 0.08897,
+            "15": 0.08938,
+            "16": 0.08885,
+            "17": 0.08914,
+            "18": 0.08802,
+            "19": 0.08997,
+            "20": 0.08786,
+            "21": 0.08941,
+            "22": 0.08893,
+            "23": 0.08869,
+            "24": 0.08862,
+            "25": 0.08883,
+            "26": 0.08857,
+            "27": 0.08808,
+            "28": 0.088,
+            "29": 0.08839,
+            "30": 0.088,
+            "31": 0.08888,
+            "32": 0.08825,
+            "33": 0.08778,
+            "34": 0.08749,
+            "35": 0.0885,
+            "36": 0.08731,
+            "37": 0.08765,
+            "38": 0.08815,
+            "39": 0.08808,
+            "40": 0.08731,
+            "41": 0.08911,
+            "42": 0.08759,
+            "43": 0.08898,
+            "44": 0.08797,
+            "45": 0.08803,
+            "46": 0.08736,
+            "47": 0.08757,
+            "48": 0.0873,
+            "49": 0.08751,
+            "50": 0.08746
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml
index dea9b4aad98..d074a823ffd 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml
@@ -46,7 +46,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml
new file mode 100644
index 00000000000..ee2c093e0ab
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml
@@ -0,0 +1,82 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+  ENABLE_LIGHTWEIGHT_MODE: true
+MODEL_ARGS:
+  # Add network size args
+  --untie-embeddings-and-output-weights: true
+  --num-layers: 6
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --group-query-attention: true
+  --num-query-groups: 2
+  --swiglu: true
+  --position-embedding-type: rope
+  --rotary-percent: 0.5
+  --no-rope-fusion: true #TODO: We can remove this once upgrading to the DEV container
+  --apply-layernorm-1p: true
+  --apply-wd-to-qk-layernorm: true
+  --attention-output-gate: true
+  --experimental-attention-variant: gated_delta_net
+  --linear-attention-freq: 3
+  --linear-conv-kernel-dim: 4
+  --linear-key-head-dim: 64
+  --linear-value-head-dim: 64
+  --linear-num-key-heads: 4
+  --linear-num-value-heads: 8
+  # Add MoE args
+  --num-experts: 32
+  --moe-ffn-hidden-size: 64
+  --moe-shared-expert-intermediate-size: 64
+  --moe-shared-expert-gate: true
+  --moe-router-load-balancing-type: aux_loss
+  --moe-router-topk: 8
+  --disable-bias-linear: true
+  --moe-router-dtype: fp32
+  # Add logging args
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 50
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 25
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 1
+  --sequence-parallel: true
+  --untie-embeddings-and-output-weights: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
index 556fcfbcf11..1171dfc454e 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
@@ -61,7 +61,6 @@ MODEL_ARGS:
   --no-gradient-accumulation-fusion: true
   --use-checkpoint-opt_param-scheduler: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml
index 3175a07cc88..57bcdeb7ca6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..a5fecfacf8f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.87037,
+            "2": 10.87119,
+            "3": 10.84723,
+            "4": 10.83181,
+            "5": 10.86879,
+            "6": 10.8876,
+            "7": 10.86094,
+            "8": 10.86859,
+            "9": 10.85425,
+            "10": 10.82318,
+            "11": 10.86741,
+            "12": 10.86662,
+            "13": 10.88537,
+            "14": 10.88991,
+            "15": 10.81369,
+            "16": 10.80328,
+            "17": 10.77723,
+            "18": 10.81064,
+            "19": 10.80526,
+            "20": 10.7034,
+            "21": 10.67013,
+            "22": 10.51206,
+            "23": 10.69987,
+            "24": 10.56044,
+            "25": 10.49854,
+            "26": 10.57876,
+            "27": 10.56747,
+            "28": 10.53107,
+            "29": 10.55838,
+            "30": 10.32726,
+            "31": 10.04382,
+            "32": 10.42576,
+            "33": 10.41931,
+            "34": 10.15673,
+            "35": 10.21897,
+            "36": 10.16206,
+            "37": 10.29717,
+            "38": 10.1323,
+            "39": 10.35955,
+            "40": 10.02292,
+            "41": 10.06592,
+            "42": 10.15514,
+            "43": 9.7561,
+            "44": 9.86986,
+            "45": 9.75091,
+            "46": 9.73604,
+            "47": 10.07473,
+            "48": 9.77502,
+            "49": 9.43421,
+            "50": 9.84343,
+            "51": 9.78575,
+            "52": 9.67077,
+            "53": 10.00722,
+            "54": 9.89701,
+            "55": 9.82613,
+            "56": 9.54829,
+            "57": 9.40075,
+            "58": 9.77419,
+            "59": 9.51686,
+            "60": 9.42722,
+            "61": 9.63404,
+            "62": 9.93883,
+            "63": 9.30503,
+            "64": 9.71265,
+            "65": 8.86835,
+            "66": 9.64476,
+            "67": 9.31344,
+            "68": 9.73448,
+            "69": 9.755,
+            "70": 9.68616,
+            "71": 9.57699,
+            "72": 9.53063,
+            "73": 9.43094,
+            "74": 8.85481,
+            "75": 9.35821,
+            "76": 9.01443,
+            "77": 10.02645,
+            "78": 9.6811,
+            "79": 9.33347,
+            "80": 9.35483,
+            "81": 9.44132,
+            "82": 9.66188,
+            "83": 9.26309,
+            "84": 9.37181,
+            "85": 9.57429,
+            "86": 9.0344,
+            "87": 9.56188,
+            "88": 9.71279,
+            "89": 9.55801,
+            "90": 9.79197,
+            "91": 9.29017,
+            "92": 9.31612,
+            "93": 9.04053,
+            "94": 8.78283,
+            "95": 9.49395,
+            "96": 9.48877,
+            "97": 9.26048,
+            "98": 9.63126,
+            "99": 8.85095,
+            "100": 9.36493
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 617.0,
+            "2": 610.0,
+            "3": 593.0,
+            "4": 664.0,
+            "5": 648.0,
+            "6": 669.0,
+            "7": 605.0,
+            "8": 612.0,
+            "9": 596.0,
+            "10": 559.0,
+            "11": 659.0,
+            "12": 590.0,
+            "13": 663.0,
+            "14": 675.0,
+            "15": 672.0,
+            "16": 715.0,
+            "17": 627.0,
+            "18": 614.0,
+            "19": 680.0,
+            "20": 570.0,
+            "21": 674.0,
+            "22": 593.0,
+            "23": 633.0,
+            "24": 612.0,
+            "25": 606.0,
+            "26": 669.0,
+            "27": 625.0,
+            "28": 753.0,
+            "29": 733.0,
+            "30": 661.0,
+            "31": 648.0,
+            "32": 688.0,
+            "33": 786.0,
+            "34": 689.0,
+            "35": 675.0,
+            "36": 734.0,
+            "37": 807.0,
+            "38": 799.0,
+            "39": 831.0,
+            "40": 745.0,
+            "41": 780.0,
+            "42": 868.0,
+            "43": 713.0,
+            "44": 751.0,
+            "45": 817.0,
+            "46": 856.0,
+            "47": 934.0,
+            "48": 906.0,
+            "49": 840.0,
+            "50": 799.0,
+            "51": 923.0,
+            "52": 897.0,
+            "53": 1019.0,
+            "54": 908.0,
+            "55": 839.0,
+            "56": 976.0,
+            "57": 853.0,
+            "58": 1024.0,
+            "59": 988.0,
+            "60": 870.0,
+            "61": 1041.0,
+            "62": 961.0,
+            "63": 847.0,
+            "64": 1053.0,
+            "65": 1004.0,
+            "66": 1005.0,
+            "67": 938.0,
+            "68": 1006.0,
+            "69": 1110.0,
+            "70": 985.0,
+            "71": 1002.0,
+            "72": 958.0,
+            "73": 997.0,
+            "74": 705.0,
+            "75": 870.0,
+            "76": 1088.0,
+            "77": 1153.0,
+            "78": 1083.0,
+            "79": 1035.0,
+            "80": 1122.0,
+            "81": 1209.0,
+            "82": 1132.0,
+            "83": 1024.0,
+            "84": 1122.0,
+            "85": 1141.0,
+            "86": 861.0,
+            "87": 1190.0,
+            "88": 1115.0,
+            "89": 1128.0,
+            "90": 1107.0,
+            "91": 1128.0,
+            "92": 1160.0,
+            "93": 973.0,
+            "94": 1117.0,
+            "95": 1022.0,
+            "96": 1178.0,
+            "97": 1068.0,
+            "98": 1278.0,
+            "99": 1071.0,
+            "100": 1175.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 638631424.0,
+            "2": 638631424.0,
+            "3": 638631424.0,
+            "4": 638631424.0,
+            "5": 638631424.0,
+            "6": 638631424.0,
+            "7": 638631424.0,
+            "8": 638631424.0,
+            "9": 638631424.0,
+            "10": 638631424.0,
+            "11": 638631424.0,
+            "12": 638631424.0,
+            "13": 638631424.0,
+            "14": 638631424.0,
+            "15": 638631424.0,
+            "16": 638631424.0,
+            "17": 638631424.0,
+            "18": 638631424.0,
+            "19": 638631424.0,
+            "20": 638631424.0,
+            "21": 638631424.0,
+            "22": 638631424.0,
+            "23": 638631424.0,
+            "24": 638631424.0,
+            "25": 638631424.0,
+            "26": 638631424.0,
+            "27": 638631424.0,
+            "28": 638631424.0,
+            "29": 638631424.0,
+            "30": 638631424.0,
+            "31": 638631424.0,
+            "32": 638631424.0,
+            "33": 638631424.0,
+            "34": 638631424.0,
+            "35": 638631424.0,
+            "36": 638631424.0,
+            "37": 638631424.0,
+            "38": 638631424.0,
+            "39": 638631424.0,
+            "40": 638631424.0,
+            "41": 638631424.0,
+            "42": 638631424.0,
+            "43": 638631424.0,
+            "44": 638631424.0,
+            "45": 638631424.0,
+            "46": 638631424.0,
+            "47": 638631424.0,
+            "48": 638631424.0,
+            "49": 638631424.0,
+            "50": 638631424.0,
+            "51": 638631424.0,
+            "52": 638631424.0,
+            "53": 638631424.0,
+            "54": 638631424.0,
+            "55": 638631424.0,
+            "56": 638631424.0,
+            "57": 638631424.0,
+            "58": 638631424.0,
+            "59": 638631424.0,
+            "60": 638631424.0,
+            "61": 638631424.0,
+            "62": 638631424.0,
+            "63": 638631424.0,
+            "64": 638631424.0,
+            "65": 638631424.0,
+            "66": 638631424.0,
+            "67": 638631424.0,
+            "68": 638631424.0,
+            "69": 638631424.0,
+            "70": 638631424.0,
+            "71": 638631424.0,
+            "72": 638631424.0,
+            "73": 638631424.0,
+            "74": 638631424.0,
+            "75": 638631424.0,
+            "76": 638631424.0,
+            "77": 638631424.0,
+            "78": 638631424.0,
+            "79": 638631424.0,
+            "80": 638631424.0,
+            "81": 638631424.0,
+            "82": 638631424.0,
+            "83": 638631424.0,
+            "84": 638631424.0,
+            "85": 638631424.0,
+            "86": 638631424.0,
+            "87": 638631424.0,
+            "88": 638631424.0,
+            "89": 638631424.0,
+            "90": 638631424.0,
+            "91": 638631424.0,
+            "92": 638631424.0,
+            "93": 638631424.0,
+            "94": 638631424.0,
+            "95": 638631424.0,
+            "96": 638631424.0,
+            "97": 638631424.0,
+            "98": 638631424.0,
+            "99": 638631424.0,
+            "100": 638631424.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 910633472.0,
+            "2": 1170499584.0,
+            "3": 1170499584.0,
+            "4": 1170499584.0,
+            "5": 1170499584.0,
+            "6": 1170499584.0,
+            "7": 1170500096.0,
+            "8": 1170500096.0,
+            "9": 1170500096.0,
+            "10": 1170500096.0,
+            "11": 1170761728.0,
+            "12": 1170761728.0,
+            "13": 1170761728.0,
+            "14": 1173644800.0,
+            "15": 1173644800.0,
+            "16": 1178888192.0,
+            "17": 1178888192.0,
+            "18": 1178888192.0,
+            "19": 1178888192.0,
+            "20": 1178888192.0,
+            "21": 1178888192.0,
+            "22": 1178888192.0,
+            "23": 1178888192.0,
+            "24": 1178888192.0,
+            "25": 1178888192.0,
+            "26": 1178888192.0,
+            "27": 1178888192.0,
+            "28": 1178888192.0,
+            "29": 1178888192.0,
+            "30": 1178888192.0,
+            "31": 1178888192.0,
+            "32": 1178888192.0,
+            "33": 1178888192.0,
+            "34": 1178888192.0,
+            "35": 1178888192.0,
+            "36": 1178888192.0,
+            "37": 1179936768.0,
+            "38": 1179936768.0,
+            "39": 1179936768.0,
+            "40": 1179936768.0,
+            "41": 1179936768.0,
+            "42": 1179936768.0,
+            "43": 1179936768.0,
+            "44": 1179936768.0,
+            "45": 1179936768.0,
+            "46": 1179936768.0,
+            "47": 1179936768.0,
+            "48": 1179936768.0,
+            "49": 1179936768.0,
+            "50": 1179936768.0,
+            "51": 1179936768.0,
+            "52": 1179936768.0,
+            "53": 1179936768.0,
+            "54": 1179936768.0,
+            "55": 1179936768.0,
+            "56": 1179936768.0,
+            "57": 1179936768.0,
+            "58": 1179936768.0,
+            "59": 1179936768.0,
+            "60": 1179936768.0,
+            "61": 1179936768.0,
+            "62": 1179936768.0,
+            "63": 1179936768.0,
+            "64": 1179936768.0,
+            "65": 1179936768.0,
+            "66": 1179936768.0,
+            "67": 1179936768.0,
+            "68": 1179936768.0,
+            "69": 1179936768.0,
+            "70": 1179936768.0,
+            "71": 1179936768.0,
+            "72": 1179936768.0,
+            "73": 1179936768.0,
+            "74": 1179936768.0,
+            "75": 1179936768.0,
+            "76": 1179936768.0,
+            "77": 1179936768.0,
+            "78": 1179936768.0,
+            "79": 1179936768.0,
+            "80": 1179936768.0,
+            "81": 1179936768.0,
+            "82": 1179936768.0,
+            "83": 1179936768.0,
+            "84": 1179936768.0,
+            "85": 1179936768.0,
+            "86": 1179936768.0,
+            "87": 1179936768.0,
+            "88": 1179936768.0,
+            "89": 1179936768.0,
+            "90": 1179936768.0,
+            "91": 1179936768.0,
+            "92": 1179936768.0,
+            "93": 1179936768.0,
+            "94": 1179936768.0,
+            "95": 1179936768.0,
+            "96": 1179936768.0,
+            "97": 1179936768.0,
+            "98": 1179936768.0,
+            "99": 1180984832.0,
+            "100": 1180984832.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.94981,
+            "3": 0.44844,
+            "4": 0.45032,
+            "5": 0.45007,
+            "6": 0.45413,
+            "7": 0.45595,
+            "8": 0.46147,
+            "9": 0.45703,
+            "10": 0.45713,
+            "11": 0.45843,
+            "12": 0.46481,
+            "13": 0.46367,
+            "14": 0.46145,
+            "15": 0.46144,
+            "16": 0.45944,
+            "17": 0.46053,
+            "18": 0.46234,
+            "19": 0.46318,
+            "20": 0.46597,
+            "21": 0.46872,
+            "22": 0.46167,
+            "23": 0.46295,
+            "24": 0.46293,
+            "25": 0.4609,
+            "26": 0.46534,
+            "27": 0.46202,
+            "28": 0.46538,
+            "29": 0.45857,
+            "30": 0.45499,
+            "31": 0.45555,
+            "32": 0.45778,
+            "33": 0.45371,
+            "34": 0.4591,
+            "35": 0.45853,
+            "36": 0.46033,
+            "37": 0.46964,
+            "38": 0.46372,
+            "39": 0.47429,
+            "40": 0.4563,
+            "41": 0.45921,
+            "42": 0.46933,
+            "43": 0.45644,
+            "44": 0.46471,
+            "45": 0.4574,
+            "46": 0.46247,
+            "47": 0.45727,
+            "48": 0.45962,
+            "49": 0.45179,
+            "50": 0.45444,
+            "51": 0.4599,
+            "52": 0.44679,
+            "53": 0.45022,
+            "54": 0.45041,
+            "55": 0.45771,
+            "56": 0.45328,
+            "57": 0.45098,
+            "58": 0.44748,
+            "59": 0.44807,
+            "60": 0.4538,
+            "61": 0.45222,
+            "62": 0.44954,
+            "63": 0.44907,
+            "64": 0.45008,
+            "65": 0.44883,
+            "66": 0.4485,
+            "67": 0.44967,
+            "68": 0.45395,
+            "69": 0.45369,
+            "70": 0.45227,
+            "71": 0.45433,
+            "72": 0.45362,
+            "73": 0.45783,
+            "74": 0.45269,
+            "75": 0.45513,
+            "76": 0.45076,
+            "77": 0.4512,
+            "78": 0.4499,
+            "79": 0.45799,
+            "80": 0.45507,
+            "81": 0.45882,
+            "82": 0.46542,
+            "83": 0.45653,
+            "84": 0.46726,
+            "85": 0.47932,
+            "86": 0.45972,
+            "87": 0.46195,
+            "88": 0.46285,
+            "89": 0.46098,
+            "90": 0.46499,
+            "91": 0.46284,
+            "92": 0.46284,
+            "93": 0.45889,
+            "94": 0.45485,
+            "95": 0.45165,
+            "96": 0.45389,
+            "97": 0.45854,
+            "98": 0.45665,
+            "99": 0.46287,
+            "100": 0.47613
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..21e65ea8685
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.78575,
+            "52": 9.67079,
+            "53": 10.00718,
+            "54": 9.897,
+            "55": 9.82613,
+            "56": 9.54826,
+            "57": 9.40078,
+            "58": 9.77416,
+            "59": 9.51683,
+            "60": 9.42721,
+            "61": 9.63407,
+            "62": 9.93885,
+            "63": 9.30502,
+            "64": 9.71263,
+            "65": 8.86836,
+            "66": 9.64475,
+            "67": 9.31349,
+            "68": 9.73448,
+            "69": 9.75501,
+            "70": 9.68613,
+            "71": 9.57698,
+            "72": 9.53067,
+            "73": 9.43091,
+            "74": 8.85477,
+            "75": 9.35819,
+            "76": 9.01446,
+            "77": 10.02647,
+            "78": 9.68112,
+            "79": 9.33348,
+            "80": 9.35484,
+            "81": 9.44135,
+            "82": 9.66189,
+            "83": 9.2631,
+            "84": 9.37182,
+            "85": 9.57428,
+            "86": 9.03438,
+            "87": 9.56188,
+            "88": 9.7128,
+            "89": 9.55803,
+            "90": 9.79197,
+            "91": 9.2902,
+            "92": 9.31613,
+            "93": 9.04053,
+            "94": 8.78282,
+            "95": 9.49399,
+            "96": 9.48876,
+            "97": 9.2605,
+            "98": 9.6313,
+            "99": 8.85096,
+            "100": 9.36491
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 930.0,
+            "52": 935.0,
+            "53": 1000.0,
+            "54": 919.0,
+            "55": 868.0,
+            "56": 1000.0,
+            "57": 827.0,
+            "58": 1023.0,
+            "59": 1019.0,
+            "60": 876.0,
+            "61": 1017.0,
+            "62": 936.0,
+            "63": 963.0,
+            "64": 1082.0,
+            "65": 982.0,
+            "66": 1037.0,
+            "67": 986.0,
+            "68": 1083.0,
+            "69": 1055.0,
+            "70": 1040.0,
+            "71": 999.0,
+            "72": 883.0,
+            "73": 1019.0,
+            "74": 728.0,
+            "75": 847.0,
+            "76": 1083.0,
+            "77": 1150.0,
+            "78": 1105.0,
+            "79": 1071.0,
+            "80": 1139.0,
+            "81": 1195.0,
+            "82": 1064.0,
+            "83": 1012.0,
+            "84": 1105.0,
+            "85": 1121.0,
+            "86": 836.0,
+            "87": 1193.0,
+            "88": 1096.0,
+            "89": 1116.0,
+            "90": 1162.0,
+            "91": 1098.0,
+            "92": 1160.0,
+            "93": 906.0,
+            "94": 1177.0,
+            "95": 1117.0,
+            "96": 1232.0,
+            "97": 1115.0,
+            "98": 1241.0,
+            "99": 1032.0,
+            "100": 1132.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 640336384.0,
+            "52": 640336384.0,
+            "53": 640336384.0,
+            "54": 640336384.0,
+            "55": 640336384.0,
+            "56": 640336384.0,
+            "57": 640336384.0,
+            "58": 640336384.0,
+            "59": 640336384.0,
+            "60": 640336384.0,
+            "61": 640336384.0,
+            "62": 640336384.0,
+            "63": 640336384.0,
+            "64": 640336384.0,
+            "65": 640336384.0,
+            "66": 640336384.0,
+            "67": 640336384.0,
+            "68": 640336384.0,
+            "69": 640336384.0,
+            "70": 640336384.0,
+            "71": 640336384.0,
+            "72": 640336384.0,
+            "73": 640336384.0,
+            "74": 640336384.0,
+            "75": 640336384.0,
+            "76": 640336384.0,
+            "77": 640336384.0,
+            "78": 640336384.0,
+            "79": 640336384.0,
+            "80": 640336384.0,
+            "81": 640336384.0,
+            "82": 640336384.0,
+            "83": 640336384.0,
+            "84": 640336384.0,
+            "85": 640336384.0,
+            "86": 640336384.0,
+            "87": 640336384.0,
+            "88": 640336384.0,
+            "89": 640336384.0,
+            "90": 640336384.0,
+            "91": 640336384.0,
+            "92": 640336384.0,
+            "93": 640336384.0,
+            "94": 640336384.0,
+            "95": 640336384.0,
+            "96": 640336384.0,
+            "97": 640336384.0,
+            "98": 640336384.0,
+            "99": 640336384.0,
+            "100": 640336384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1171155456.0,
+            "52": 1174302208.0,
+            "53": 1176398848.0,
+            "54": 1176398848.0,
+            "55": 1176398848.0,
+            "56": 1176398848.0,
+            "57": 1176398848.0,
+            "58": 1176398848.0,
+            "59": 1178496512.0,
+            "60": 1178496512.0,
+            "61": 1178496512.0,
+            "62": 1178496512.0,
+            "63": 1178496512.0,
+            "64": 1178496512.0,
+            "65": 1178496512.0,
+            "66": 1178496512.0,
+            "67": 1178496512.0,
+            "68": 1178496512.0,
+            "69": 1178496512.0,
+            "70": 1178496512.0,
+            "71": 1178496512.0,
+            "72": 1178496512.0,
+            "73": 1178496512.0,
+            "74": 1178496512.0,
+            "75": 1178496512.0,
+            "76": 1178496512.0,
+            "77": 1178496512.0,
+            "78": 1178496512.0,
+            "79": 1178496512.0,
+            "80": 1178496512.0,
+            "81": 1178496512.0,
+            "82": 1178496512.0,
+            "83": 1178496512.0,
+            "84": 1178496512.0,
+            "85": 1178496512.0,
+            "86": 1178496512.0,
+            "87": 1178496512.0,
+            "88": 1178496512.0,
+            "89": 1178496512.0,
+            "90": 1178496512.0,
+            "91": 1178496512.0,
+            "92": 1178496512.0,
+            "93": 1178496512.0,
+            "94": 1178496512.0,
+            "95": 1178496512.0,
+            "96": 1178496512.0,
+            "97": 1178496512.0,
+            "98": 1178496512.0,
+            "99": 1178496512.0,
+            "100": 1178496512.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.54283,
+            "53": 0.43318,
+            "54": 0.42496,
+            "55": 0.42557,
+            "56": 0.42913,
+            "57": 0.44479,
+            "58": 0.45904,
+            "59": 0.46949,
+            "60": 0.46125,
+            "61": 0.46841,
+            "62": 0.4736,
+            "63": 0.46017,
+            "64": 0.45223,
+            "65": 0.45568,
+            "66": 0.44984,
+            "67": 0.44794,
+            "68": 0.45062,
+            "69": 0.45415,
+            "70": 0.46315,
+            "71": 0.45069,
+            "72": 0.45122,
+            "73": 0.45026,
+            "74": 0.44997,
+            "75": 0.44929,
+            "76": 0.45314,
+            "77": 0.45848,
+            "78": 0.4566,
+            "79": 0.45909,
+            "80": 0.46265,
+            "81": 0.4592,
+            "82": 0.47898,
+            "83": 0.47817,
+            "84": 0.46757,
+            "85": 0.46663,
+            "86": 0.46924,
+            "87": 0.48331,
+            "88": 0.46217,
+            "89": 0.4596,
+            "90": 0.45471,
+            "91": 0.45598,
+            "92": 0.45849,
+            "93": 0.4626,
+            "94": 0.46398,
+            "95": 0.45663,
+            "96": 0.45814,
+            "97": 0.45394,
+            "98": 0.45984,
+            "99": 0.47284,
+            "100": 0.46707
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
index e88d1fcb739..fcabeb878a2 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.85163,
-            "2": 10.85389,
+            "1": 10.85166,
+            "2": 10.85388,
             "3": 10.83866,
             "4": 10.84328,
-            "5": 10.8787,
-            "6": 10.87586,
-            "7": 10.86186,
-            "8": 10.84928,
-            "9": 10.84877,
-            "10": 10.80639,
-            "11": 10.88679,
-            "12": 10.85682,
-            "13": 10.86235,
-            "14": 10.87768,
-            "15": 10.81037,
-            "16": 10.81984,
-            "17": 10.7828,
-            "18": 10.80322,
-            "19": 10.78358,
-            "20": 10.68694,
-            "21": 10.66905,
-            "22": 10.52315,
-            "23": 10.68436,
-            "24": 10.56577,
-            "25": 10.49705,
+            "5": 10.87866,
+            "6": 10.87587,
+            "7": 10.86182,
+            "8": 10.84929,
+            "9": 10.84878,
+            "10": 10.80638,
+            "11": 10.88681,
+            "12": 10.85678,
+            "13": 10.86232,
+            "14": 10.87763,
+            "15": 10.81038,
+            "16": 10.81986,
+            "17": 10.78278,
+            "18": 10.80323,
+            "19": 10.78355,
+            "20": 10.68693,
+            "21": 10.66908,
+            "22": 10.52312,
+            "23": 10.68433,
+            "24": 10.56579,
+            "25": 10.49704,
             "26": 10.56553,
-            "27": 10.58171,
+            "27": 10.58173,
             "28": 10.52995,
             "29": 10.55561,
-            "30": 10.32672,
-            "31": 10.07636,
-            "32": 10.43058,
-            "33": 10.42455,
-            "34": 10.16647,
-            "35": 10.22486,
-            "36": 10.18341,
-            "37": 10.29956,
-            "38": 10.14498,
-            "39": 10.37061,
-            "40": 10.04385,
-            "41": 10.0945,
-            "42": 10.17381,
-            "43": 9.77538,
-            "44": 9.90308,
-            "45": 9.779,
-            "46": 9.76548,
-            "47": 10.10723,
-            "48": 9.80029,
-            "49": 9.47526,
-            "50": 9.85792,
-            "51": 9.80039,
-            "52": 9.69506,
-            "53": 10.0285,
-            "54": 9.9143,
-            "55": 9.83807,
+            "30": 10.32669,
+            "31": 10.07637,
+            "32": 10.43055,
+            "33": 10.42453,
+            "34": 10.1665,
+            "35": 10.22484,
+            "36": 10.18342,
+            "37": 10.29954,
+            "38": 10.14501,
+            "39": 10.37065,
+            "40": 10.04387,
+            "41": 10.09449,
+            "42": 10.17379,
+            "43": 9.77531,
+            "44": 9.9031,
+            "45": 9.77897,
+            "46": 9.7655,
+            "47": 10.10719,
+            "48": 9.80026,
+            "49": 9.47522,
+            "50": 9.85791,
+            "51": 9.80035,
+            "52": 9.69511,
+            "53": 10.02853,
+            "54": 9.91431,
+            "55": 9.83806,
             "56": 9.57833,
-            "57": 9.42582,
+            "57": 9.42585,
             "58": 9.79172,
-            "59": 9.53617,
-            "60": 9.44186,
-            "61": 9.65656,
-            "62": 9.94377,
-            "63": 9.32151,
-            "64": 9.73339,
-            "65": 8.88427,
-            "66": 9.65533,
-            "67": 9.32106,
-            "68": 9.75064,
-            "69": 9.764,
-            "70": 9.70469,
-            "71": 9.56861,
-            "72": 9.53902,
+            "59": 9.53621,
+            "60": 9.44189,
+            "61": 9.65658,
+            "62": 9.94379,
+            "63": 9.3214,
+            "64": 9.73336,
+            "65": 8.88432,
+            "66": 9.65534,
+            "67": 9.32102,
+            "68": 9.75059,
+            "69": 9.76397,
+            "70": 9.70471,
+            "71": 9.56854,
+            "72": 9.53904,
             "73": 9.45226,
-            "74": 8.87736,
-            "75": 9.37933,
+            "74": 8.87739,
+            "75": 9.37931,
             "76": 9.01867,
             "77": 10.03519,
-            "78": 9.69263,
-            "79": 9.33459,
-            "80": 9.36591,
-            "81": 9.43919,
-            "82": 9.66572,
-            "83": 9.25441,
-            "84": 9.378,
-            "85": 9.57422,
-            "86": 9.03277,
+            "78": 9.69265,
+            "79": 9.33455,
+            "80": 9.36593,
+            "81": 9.4392,
+            "82": 9.66573,
+            "83": 9.25449,
+            "84": 9.37805,
+            "85": 9.57423,
+            "86": 9.03275,
             "87": 9.55775,
             "88": 9.71521,
-            "89": 9.55703,
-            "90": 9.788,
-            "91": 9.29518,
-            "92": 9.31516,
-            "93": 9.03246,
-            "94": 8.79087,
-            "95": 9.48833,
-            "96": 9.49574,
-            "97": 9.2713,
-            "98": 9.64071,
-            "99": 8.84741,
-            "100": 9.35871
+            "89": 9.55701,
+            "90": 9.78806,
+            "91": 9.29516,
+            "92": 9.31513,
+            "93": 9.03243,
+            "94": 8.79086,
+            "95": 9.48838,
+            "96": 9.49572,
+            "97": 9.27133,
+            "98": 9.6407,
+            "99": 8.84739,
+            "100": 9.35873
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 600.0,
-            "2": 574.0,
-            "3": 677.0,
-            "4": 617.0,
-            "5": 669.0,
-            "6": 650.0,
-            "7": 700.0,
+            "1": 615.0,
+            "2": 567.0,
+            "3": 639.0,
+            "4": 642.0,
+            "5": 662.0,
+            "6": 700.0,
+            "7": 710.0,
             "8": 624.0,
-            "9": 649.0,
-            "10": 562.0,
-            "11": 661.0,
-            "12": 622.0,
-            "13": 711.0,
-            "14": 656.0,
-            "15": 688.0,
-            "16": 667.0,
-            "17": 696.0,
-            "18": 660.0,
-            "19": 607.0,
-            "20": 649.0,
-            "21": 646.0,
-            "22": 653.0,
-            "23": 743.0,
-            "24": 678.0,
-            "25": 663.0,
-            "26": 661.0,
-            "27": 703.0,
-            "28": 769.0,
-            "29": 775.0,
-            "30": 767.0,
-            "31": 606.0,
-            "32": 755.0,
-            "33": 764.0,
-            "34": 676.0,
-            "35": 779.0,
-            "36": 768.0,
-            "37": 824.0,
-            "38": 808.0,
-            "39": 893.0,
-            "40": 795.0,
-            "41": 774.0,
-            "42": 895.0,
-            "43": 758.0,
-            "44": 770.0,
-            "45": 738.0,
+            "9": 630.0,
+            "10": 524.0,
+            "11": 720.0,
+            "12": 664.0,
+            "13": 674.0,
+            "14": 680.0,
+            "15": 695.0,
+            "16": 700.0,
+            "17": 670.0,
+            "18": 690.0,
+            "19": 632.0,
+            "20": 640.0,
+            "21": 656.0,
+            "22": 647.0,
+            "23": 731.0,
+            "24": 647.0,
+            "25": 628.0,
+            "26": 651.0,
+            "27": 673.0,
+            "28": 758.0,
+            "29": 784.0,
+            "30": 718.0,
+            "31": 564.0,
+            "32": 765.0,
+            "33": 817.0,
+            "34": 703.0,
+            "35": 705.0,
+            "36": 759.0,
+            "37": 812.0,
+            "38": 826.0,
+            "39": 849.0,
+            "40": 827.0,
+            "41": 819.0,
+            "42": 845.0,
+            "43": 716.0,
+            "44": 759.0,
+            "45": 727.0,
             "46": 856.0,
-            "47": 912.0,
-            "48": 843.0,
-            "49": 884.0,
-            "50": 782.0,
-            "51": 967.0,
-            "52": 940.0,
-            "53": 988.0,
-            "54": 937.0,
-            "55": 870.0,
-            "56": 981.0,
-            "57": 838.0,
-            "58": 909.0,
-            "59": 969.0,
-            "60": 821.0,
-            "61": 1016.0,
-            "62": 953.0,
-            "63": 895.0,
-            "64": 1137.0,
-            "65": 917.0,
-            "66": 1050.0,
-            "67": 946.0,
-            "68": 974.0,
-            "69": 1091.0,
-            "70": 1024.0,
-            "71": 1104.0,
-            "72": 888.0,
-            "73": 967.0,
-            "74": 657.0,
-            "75": 879.0,
-            "76": 977.0,
-            "77": 1172.0,
-            "78": 1085.0,
-            "79": 1107.0,
-            "80": 1178.0,
-            "81": 1236.0,
-            "82": 1103.0,
-            "83": 975.0,
-            "84": 1164.0,
-            "85": 1160.0,
-            "86": 879.0,
-            "87": 1184.0,
-            "88": 1102.0,
-            "89": 1105.0,
-            "90": 1122.0,
-            "91": 1065.0,
-            "92": 1090.0,
-            "93": 848.0,
-            "94": 1158.0,
-            "95": 1173.0,
-            "96": 1140.0,
-            "97": 1074.0,
-            "98": 1203.0,
-            "99": 1141.0,
-            "100": 1111.0
+            "47": 962.0,
+            "48": 827.0,
+            "49": 873.0,
+            "50": 804.0,
+            "51": 908.0,
+            "52": 927.0,
+            "53": 989.0,
+            "54": 941.0,
+            "55": 852.0,
+            "56": 912.0,
+            "57": 880.0,
+            "58": 952.0,
+            "59": 984.0,
+            "60": 801.0,
+            "61": 1030.0,
+            "62": 918.0,
+            "63": 910.0,
+            "64": 1061.0,
+            "65": 982.0,
+            "66": 1062.0,
+            "67": 964.0,
+            "68": 973.0,
+            "69": 1075.0,
+            "70": 1008.0,
+            "71": 1050.0,
+            "72": 918.0,
+            "73": 992.0,
+            "74": 677.0,
+            "75": 907.0,
+            "76": 1055.0,
+            "77": 1107.0,
+            "78": 1134.0,
+            "79": 1049.0,
+            "80": 1086.0,
+            "81": 1209.0,
+            "82": 1072.0,
+            "83": 1028.0,
+            "84": 1165.0,
+            "85": 1194.0,
+            "86": 884.0,
+            "87": 1206.0,
+            "88": 1080.0,
+            "89": 1155.0,
+            "90": 1062.0,
+            "91": 1141.0,
+            "92": 1133.0,
+            "93": 900.0,
+            "94": 1126.0,
+            "95": 1096.0,
+            "96": 1109.0,
+            "97": 1052.0,
+            "98": 1249.0,
+            "99": 1150.0,
+            "100": 1090.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 689356288.0,
-            "2": 689356288.0,
-            "3": 689356288.0,
-            "4": 689356288.0,
-            "5": 689356288.0,
-            "6": 689356288.0,
-            "7": 689356288.0,
-            "8": 689356288.0,
-            "9": 689356288.0,
-            "10": 689356288.0,
-            "11": 689356288.0,
-            "12": 689356288.0,
-            "13": 689356288.0,
-            "14": 689356288.0,
-            "15": 689356288.0,
-            "16": 689356288.0,
-            "17": 689356288.0,
-            "18": 689356288.0,
-            "19": 689356288.0,
-            "20": 689356288.0,
-            "21": 689356288.0,
-            "22": 689356288.0,
-            "23": 689356288.0,
-            "24": 689356288.0,
-            "25": 689356288.0,
-            "26": 689356288.0,
-            "27": 689356288.0,
-            "28": 689356288.0,
-            "29": 689356288.0,
-            "30": 689356288.0,
-            "31": 689356288.0,
-            "32": 689356288.0,
-            "33": 689356288.0,
-            "34": 689356288.0,
-            "35": 689356288.0,
-            "36": 689356288.0,
-            "37": 689356288.0,
-            "38": 689356288.0,
-            "39": 689356288.0,
-            "40": 689356288.0,
-            "41": 689356288.0,
-            "42": 689356288.0,
-            "43": 689356288.0,
-            "44": 689356288.0,
-            "45": 689356288.0,
-            "46": 689356288.0,
-            "47": 689356288.0,
-            "48": 689356288.0,
-            "49": 689356288.0,
-            "50": 689356288.0,
-            "51": 689356288.0,
-            "52": 689356288.0,
-            "53": 689356288.0,
-            "54": 689356288.0,
-            "55": 689356288.0,
-            "56": 689356288.0,
-            "57": 689356288.0,
-            "58": 689356288.0,
-            "59": 689356288.0,
-            "60": 689356288.0,
-            "61": 689356288.0,
-            "62": 689356288.0,
-            "63": 689356288.0,
-            "64": 689356288.0,
-            "65": 689356288.0,
-            "66": 689356288.0,
-            "67": 689356288.0,
-            "68": 689356288.0,
-            "69": 689356288.0,
-            "70": 689356288.0,
-            "71": 689356288.0,
-            "72": 689356288.0,
-            "73": 689356288.0,
-            "74": 689356288.0,
-            "75": 689356288.0,
-            "76": 689356288.0,
-            "77": 689356288.0,
-            "78": 689356288.0,
-            "79": 689356288.0,
-            "80": 689356288.0,
-            "81": 689356288.0,
-            "82": 689356288.0,
-            "83": 689356288.0,
-            "84": 689356288.0,
-            "85": 689356288.0,
-            "86": 689356288.0,
-            "87": 689356288.0,
-            "88": 689356288.0,
-            "89": 689356288.0,
-            "90": 689356288.0,
-            "91": 689356288.0,
-            "92": 689356288.0,
-            "93": 689356288.0,
-            "94": 689356288.0,
-            "95": 689356288.0,
-            "96": 689356288.0,
-            "97": 689356288.0,
-            "98": 689356288.0,
-            "99": 689356288.0,
-            "100": 689356288.0
+            "1": 690404864.0,
+            "2": 690404864.0,
+            "3": 690404864.0,
+            "4": 690404864.0,
+            "5": 690404864.0,
+            "6": 690404864.0,
+            "7": 690404864.0,
+            "8": 690404864.0,
+            "9": 690404864.0,
+            "10": 690404864.0,
+            "11": 690404864.0,
+            "12": 690404864.0,
+            "13": 690404864.0,
+            "14": 690404864.0,
+            "15": 690404864.0,
+            "16": 690404864.0,
+            "17": 690404864.0,
+            "18": 690404864.0,
+            "19": 690404864.0,
+            "20": 690404864.0,
+            "21": 690404864.0,
+            "22": 690404864.0,
+            "23": 690404864.0,
+            "24": 690404864.0,
+            "25": 690404864.0,
+            "26": 690404864.0,
+            "27": 690404864.0,
+            "28": 690404864.0,
+            "29": 690404864.0,
+            "30": 690404864.0,
+            "31": 690404864.0,
+            "32": 690404864.0,
+            "33": 690404864.0,
+            "34": 690404864.0,
+            "35": 690404864.0,
+            "36": 690404864.0,
+            "37": 690404864.0,
+            "38": 690404864.0,
+            "39": 690404864.0,
+            "40": 690404864.0,
+            "41": 690404864.0,
+            "42": 690404864.0,
+            "43": 690404864.0,
+            "44": 690404864.0,
+            "45": 690404864.0,
+            "46": 690404864.0,
+            "47": 690404864.0,
+            "48": 690404864.0,
+            "49": 690404864.0,
+            "50": 690404864.0,
+            "51": 690404864.0,
+            "52": 690404864.0,
+            "53": 690404864.0,
+            "54": 690404864.0,
+            "55": 690404864.0,
+            "56": 690404864.0,
+            "57": 690404864.0,
+            "58": 690404864.0,
+            "59": 690404864.0,
+            "60": 690404864.0,
+            "61": 690404864.0,
+            "62": 690404864.0,
+            "63": 690404864.0,
+            "64": 690404864.0,
+            "65": 690404864.0,
+            "66": 690404864.0,
+            "67": 690404864.0,
+            "68": 690404864.0,
+            "69": 690404864.0,
+            "70": 690404864.0,
+            "71": 690404864.0,
+            "72": 690404864.0,
+            "73": 690404864.0,
+            "74": 690404864.0,
+            "75": 690404864.0,
+            "76": 690404864.0,
+            "77": 690404864.0,
+            "78": 690404864.0,
+            "79": 690404864.0,
+            "80": 690404864.0,
+            "81": 690404864.0,
+            "82": 690404864.0,
+            "83": 690404864.0,
+            "84": 690404864.0,
+            "85": 690404864.0,
+            "86": 690404864.0,
+            "87": 690404864.0,
+            "88": 690404864.0,
+            "89": 690404864.0,
+            "90": 690404864.0,
+            "91": 690404864.0,
+            "92": 690404864.0,
+            "93": 690404864.0,
+            "94": 690404864.0,
+            "95": 690404864.0,
+            "96": 690404864.0,
+            "97": 690404864.0,
+            "98": 690404864.0,
+            "99": 690404864.0,
+            "100": 690404864.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 959652864.0,
-            "2": 1221223936.0,
-            "3": 1221224960.0,
-            "4": 1221224960.0,
-            "5": 1221224960.0,
-            "6": 1221224960.0,
-            "7": 1221224960.0,
-            "8": 1221224960.0,
-            "9": 1221224960.0,
-            "10": 1221224960.0,
-            "11": 1221224960.0,
-            "12": 1221224960.0,
-            "13": 1221224960.0,
-            "14": 1221224960.0,
-            "15": 1221224960.0,
-            "16": 1221224960.0,
-            "17": 1221224960.0,
-            "18": 1221224960.0,
-            "19": 1221224960.0,
-            "20": 1221224960.0,
-            "21": 1221224960.0,
-            "22": 1221224960.0,
-            "23": 1221224960.0,
-            "24": 1221224960.0,
-            "25": 1221224960.0,
-            "26": 1221224960.0,
-            "27": 1221224960.0,
-            "28": 1221224960.0,
-            "29": 1221224960.0,
-            "30": 1221224960.0,
-            "31": 1221224960.0,
-            "32": 1221224960.0,
-            "33": 1221224960.0,
-            "34": 1221224960.0,
-            "35": 1221224960.0,
-            "36": 1221224960.0,
-            "37": 1221224960.0,
-            "38": 1221224960.0,
-            "39": 1221224960.0,
-            "40": 1221224960.0,
-            "41": 1221224960.0,
-            "42": 1221224960.0,
-            "43": 1221224960.0,
-            "44": 1221224960.0,
-            "45": 1221224960.0,
-            "46": 1221224960.0,
-            "47": 1221224960.0,
-            "48": 1221224960.0,
-            "49": 1221224960.0,
-            "50": 1221224960.0,
-            "51": 1221224960.0,
-            "52": 1221224960.0,
-            "53": 1221224960.0,
-            "54": 1221224960.0,
-            "55": 1221224960.0,
-            "56": 1221224960.0,
-            "57": 1221224960.0,
-            "58": 1221224960.0,
-            "59": 1221224960.0,
-            "60": 1221224960.0,
-            "61": 1221224960.0,
-            "62": 1221224960.0,
-            "63": 1221224960.0,
-            "64": 1221224960.0,
-            "65": 1221224960.0,
-            "66": 1221224960.0,
-            "67": 1221224960.0,
-            "68": 1221224960.0,
-            "69": 1221224960.0,
-            "70": 1221224960.0,
-            "71": 1221224960.0,
-            "72": 1221224960.0,
-            "73": 1221224960.0,
-            "74": 1221224960.0,
-            "75": 1221224960.0,
-            "76": 1221224960.0,
-            "77": 1221224960.0,
-            "78": 1221224960.0,
-            "79": 1221224960.0,
-            "80": 1221224960.0,
-            "81": 1221224960.0,
-            "82": 1221224960.0,
-            "83": 1221224960.0,
-            "84": 1221224960.0,
-            "85": 1221224960.0,
-            "86": 1221224960.0,
-            "87": 1221224960.0,
-            "88": 1221224960.0,
-            "89": 1221224960.0,
-            "90": 1221224960.0,
-            "91": 1221224960.0,
-            "92": 1221224960.0,
-            "93": 1221224960.0,
-            "94": 1221224960.0,
-            "95": 1221224960.0,
-            "96": 1221224960.0,
-            "97": 1221224960.0,
-            "98": 1221224960.0,
-            "99": 1221224960.0,
-            "100": 1221224960.0
+            "1": 963848704.0,
+            "2": 1223319552.0,
+            "3": 1223321600.0,
+            "4": 1226467840.0,
+            "5": 1226467840.0,
+            "6": 1226467840.0,
+            "7": 1226467840.0,
+            "8": 1226467840.0,
+            "9": 1226467840.0,
+            "10": 1226467840.0,
+            "11": 1226467840.0,
+            "12": 1226467840.0,
+            "13": 1226467840.0,
+            "14": 1226467840.0,
+            "15": 1226467840.0,
+            "16": 1226467840.0,
+            "17": 1226467840.0,
+            "18": 1226467840.0,
+            "19": 1226467840.0,
+            "20": 1226467840.0,
+            "21": 1226467840.0,
+            "22": 1226467840.0,
+            "23": 1226467840.0,
+            "24": 1226467840.0,
+            "25": 1226467840.0,
+            "26": 1226467840.0,
+            "27": 1226467840.0,
+            "28": 1226467840.0,
+            "29": 1226467840.0,
+            "30": 1226467840.0,
+            "31": 1226467840.0,
+            "32": 1226467840.0,
+            "33": 1226467840.0,
+            "34": 1226467840.0,
+            "35": 1226467840.0,
+            "36": 1226467840.0,
+            "37": 1226467840.0,
+            "38": 1226467840.0,
+            "39": 1226467840.0,
+            "40": 1226467840.0,
+            "41": 1226467840.0,
+            "42": 1226467840.0,
+            "43": 1226467840.0,
+            "44": 1226467840.0,
+            "45": 1226467840.0,
+            "46": 1226467840.0,
+            "47": 1226467840.0,
+            "48": 1226467840.0,
+            "49": 1226467840.0,
+            "50": 1226467840.0,
+            "51": 1226467840.0,
+            "52": 1226467840.0,
+            "53": 1226467840.0,
+            "54": 1226467840.0,
+            "55": 1226467840.0,
+            "56": 1226467840.0,
+            "57": 1226467840.0,
+            "58": 1226467840.0,
+            "59": 1226467840.0,
+            "60": 1226467840.0,
+            "61": 1226467840.0,
+            "62": 1226467840.0,
+            "63": 1226467840.0,
+            "64": 1226467840.0,
+            "65": 1226467840.0,
+            "66": 1228564480.0,
+            "67": 1228564480.0,
+            "68": 1228564480.0,
+            "69": 1228564480.0,
+            "70": 1228564480.0,
+            "71": 1228564480.0,
+            "72": 1228564480.0,
+            "73": 1228564480.0,
+            "74": 1228564480.0,
+            "75": 1228564480.0,
+            "76": 1228564480.0,
+            "77": 1228564480.0,
+            "78": 1228564480.0,
+            "79": 1228564480.0,
+            "80": 1228564480.0,
+            "81": 1228564480.0,
+            "82": 1228564480.0,
+            "83": 1228564480.0,
+            "84": 1228564480.0,
+            "85": 1228564480.0,
+            "86": 1228564480.0,
+            "87": 1228564480.0,
+            "88": 1228564480.0,
+            "89": 1228564480.0,
+            "90": 1228564480.0,
+            "91": 1228564480.0,
+            "92": 1228564480.0,
+            "93": 1228564480.0,
+            "94": 1228564480.0,
+            "95": 1228564480.0,
+            "96": 1228564480.0,
+            "97": 1228564480.0,
+            "98": 1228564480.0,
+            "99": 1228564480.0,
+            "100": 1228564480.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.34397,
-            "2": 0.2989,
-            "3": 0.28701,
-            "4": 0.28299,
-            "5": 0.28509,
-            "6": 0.28378,
-            "7": 0.28776,
-            "8": 0.28423,
-            "9": 0.28722,
-            "10": 0.28077,
-            "11": 0.28936,
-            "12": 0.28752,
-            "13": 0.2827,
-            "14": 0.28574,
-            "15": 0.28467,
-            "16": 0.28217,
-            "17": 0.28486,
-            "18": 0.28581,
-            "19": 0.28155,
-            "20": 0.28509,
-            "21": 0.28251,
-            "22": 0.28381,
-            "23": 0.27876,
-            "24": 0.28748,
-            "25": 0.28028,
-            "26": 0.28778,
-            "27": 0.28262,
-            "28": 0.28332,
-            "29": 0.28115,
-            "30": 0.28178,
-            "31": 0.28495,
-            "32": 0.28165,
-            "33": 0.28663,
-            "34": 0.29207,
-            "35": 0.28688,
-            "36": 0.27656,
-            "37": 0.28363,
-            "38": 0.28429,
-            "39": 0.28629,
-            "40": 0.27969,
-            "41": 0.27978,
-            "42": 0.28454,
-            "43": 0.28022,
-            "44": 0.28402,
-            "45": 0.27645,
-            "46": 0.28795,
-            "47": 0.28097,
-            "48": 0.28395,
-            "49": 0.28183,
-            "50": 0.28615,
-            "51": 0.28373,
-            "52": 0.27449,
-            "53": 0.27345,
-            "54": 0.27869,
-            "55": 0.27079,
-            "56": 0.27901,
-            "57": 0.27662,
-            "58": 0.27749,
-            "59": 0.27681,
-            "60": 0.27639,
-            "61": 0.27275,
-            "62": 0.27644,
-            "63": 0.27655,
-            "64": 0.2741,
-            "65": 0.27749,
-            "66": 0.27321,
-            "67": 0.27962,
-            "68": 0.2759,
-            "69": 0.27771,
-            "70": 0.27472,
-            "71": 0.27602,
-            "72": 0.27221,
-            "73": 0.27682,
-            "74": 0.27563,
-            "75": 0.27287,
-            "76": 0.27345,
-            "77": 0.27491,
-            "78": 0.27512,
-            "79": 0.27463,
-            "80": 0.27721,
-            "81": 0.27482,
-            "82": 0.27638,
-            "83": 0.27219,
-            "84": 0.27519,
-            "85": 0.27727,
-            "86": 0.2756,
-            "87": 0.27351,
-            "88": 0.27369,
-            "89": 0.27604,
-            "90": 0.27461,
-            "91": 0.27436,
-            "92": 0.27679,
-            "93": 0.27705,
-            "94": 0.27348,
-            "95": 0.28014,
-            "96": 0.27482,
-            "97": 0.27546,
-            "98": 0.27381,
-            "99": 0.27767,
-            "100": 0.27505
+            "1": 26.73247,
+            "2": 0.28783,
+            "3": 0.26296,
+            "4": 0.24972,
+            "5": 0.2479,
+            "6": 0.24714,
+            "7": 0.24726,
+            "8": 0.24855,
+            "9": 0.24703,
+            "10": 0.24477,
+            "11": 0.24467,
+            "12": 0.24519,
+            "13": 0.24528,
+            "14": 0.24363,
+            "15": 0.24416,
+            "16": 0.24464,
+            "17": 0.24373,
+            "18": 0.24449,
+            "19": 0.24381,
+            "20": 0.24223,
+            "21": 0.24321,
+            "22": 0.24402,
+            "23": 0.24351,
+            "24": 0.24104,
+            "25": 0.2457,
+            "26": 0.26018,
+            "27": 0.24263,
+            "28": 0.24452,
+            "29": 0.24554,
+            "30": 0.24449,
+            "31": 0.24131,
+            "32": 0.24436,
+            "33": 0.24229,
+            "34": 0.24145,
+            "35": 0.24151,
+            "36": 0.24069,
+            "37": 0.24346,
+            "38": 0.24255,
+            "39": 0.2406,
+            "40": 0.2461,
+            "41": 0.24292,
+            "42": 0.24219,
+            "43": 0.24382,
+            "44": 0.24308,
+            "45": 0.24494,
+            "46": 0.24068,
+            "47": 0.24147,
+            "48": 0.24203,
+            "49": 0.24203,
+            "50": 0.67265,
+            "51": 0.25099,
+            "52": 0.24353,
+            "53": 0.2433,
+            "54": 0.2415,
+            "55": 0.24839,
+            "56": 0.24674,
+            "57": 0.25418,
+            "58": 0.24862,
+            "59": 0.24888,
+            "60": 0.24709,
+            "61": 0.24747,
+            "62": 0.24661,
+            "63": 0.2473,
+            "64": 0.24646,
+            "65": 0.24565,
+            "66": 0.24543,
+            "67": 0.24477,
+            "68": 0.24661,
+            "69": 0.24448,
+            "70": 0.24685,
+            "71": 0.24516,
+            "72": 0.2468,
+            "73": 0.2464,
+            "74": 0.24577,
+            "75": 0.24431,
+            "76": 0.248,
+            "77": 0.24567,
+            "78": 0.24542,
+            "79": 0.24648,
+            "80": 0.24639,
+            "81": 0.24794,
+            "82": 0.24579,
+            "83": 0.24552,
+            "84": 0.24513,
+            "85": 0.24815,
+            "86": 0.2459,
+            "87": 0.24473,
+            "88": 0.24826,
+            "89": 0.24495,
+            "90": 0.24673,
+            "91": 0.24489,
+            "92": 0.2447,
+            "93": 0.24508,
+            "94": 0.24553,
+            "95": 0.24031,
+            "96": 0.24272,
+            "97": 0.24481,
+            "98": 0.24216,
+            "99": 0.24091,
+            "100": 0.24384
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..3b380aa8354
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.80035,
+            "52": 9.69509,
+            "53": 10.02853,
+            "54": 9.9143,
+            "55": 9.8381,
+            "56": 9.57833,
+            "57": 9.42584,
+            "58": 9.79167,
+            "59": 9.53621,
+            "60": 9.44186,
+            "61": 9.65657,
+            "62": 9.94379,
+            "63": 9.32145,
+            "64": 9.73337,
+            "65": 8.88429,
+            "66": 9.65529,
+            "67": 9.32104,
+            "68": 9.75065,
+            "69": 9.764,
+            "70": 9.70469,
+            "71": 9.56858,
+            "72": 9.53904,
+            "73": 9.45226,
+            "74": 8.87738,
+            "75": 9.37933,
+            "76": 9.01863,
+            "77": 10.0352,
+            "78": 9.69262,
+            "79": 9.33456,
+            "80": 9.36592,
+            "81": 9.43916,
+            "82": 9.66575,
+            "83": 9.25444,
+            "84": 9.37804,
+            "85": 9.57421,
+            "86": 9.03275,
+            "87": 9.55774,
+            "88": 9.71525,
+            "89": 9.55707,
+            "90": 9.78808,
+            "91": 9.29516,
+            "92": 9.31517,
+            "93": 9.03243,
+            "94": 8.79084,
+            "95": 9.48835,
+            "96": 9.49573,
+            "97": 9.27132,
+            "98": 9.64071,
+            "99": 8.84737,
+            "100": 9.35874
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 903.0,
+            "52": 949.0,
+            "53": 1088.0,
+            "54": 951.0,
+            "55": 860.0,
+            "56": 937.0,
+            "57": 858.0,
+            "58": 1036.0,
+            "59": 925.0,
+            "60": 897.0,
+            "61": 1029.0,
+            "62": 921.0,
+            "63": 901.0,
+            "64": 1087.0,
+            "65": 919.0,
+            "66": 1033.0,
+            "67": 996.0,
+            "68": 963.0,
+            "69": 1003.0,
+            "70": 1100.0,
+            "71": 1057.0,
+            "72": 901.0,
+            "73": 1061.0,
+            "74": 728.0,
+            "75": 943.0,
+            "76": 1070.0,
+            "77": 1164.0,
+            "78": 1138.0,
+            "79": 1046.0,
+            "80": 1162.0,
+            "81": 1204.0,
+            "82": 1108.0,
+            "83": 998.0,
+            "84": 1165.0,
+            "85": 1164.0,
+            "86": 904.0,
+            "87": 1222.0,
+            "88": 1098.0,
+            "89": 1129.0,
+            "90": 1176.0,
+            "91": 1102.0,
+            "92": 1174.0,
+            "93": 894.0,
+            "94": 1187.0,
+            "95": 1128.0,
+            "96": 1204.0,
+            "97": 1108.0,
+            "98": 1311.0,
+            "99": 1148.0,
+            "100": 1085.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 690405888.0,
+            "52": 690405888.0,
+            "53": 690405888.0,
+            "54": 690405888.0,
+            "55": 690405888.0,
+            "56": 690405888.0,
+            "57": 690405888.0,
+            "58": 690405888.0,
+            "59": 690405888.0,
+            "60": 690405888.0,
+            "61": 690405888.0,
+            "62": 690405888.0,
+            "63": 690405888.0,
+            "64": 690405888.0,
+            "65": 690405888.0,
+            "66": 690405888.0,
+            "67": 690405888.0,
+            "68": 690405888.0,
+            "69": 690405888.0,
+            "70": 690405888.0,
+            "71": 690405888.0,
+            "72": 690405888.0,
+            "73": 690405888.0,
+            "74": 690405888.0,
+            "75": 690405888.0,
+            "76": 690405888.0,
+            "77": 690405888.0,
+            "78": 690405888.0,
+            "79": 690405888.0,
+            "80": 690405888.0,
+            "81": 690405888.0,
+            "82": 690405888.0,
+            "83": 690405888.0,
+            "84": 690405888.0,
+            "85": 690405888.0,
+            "86": 690405888.0,
+            "87": 690405888.0,
+            "88": 690405888.0,
+            "89": 690405888.0,
+            "90": 690405888.0,
+            "91": 690405888.0,
+            "92": 690405888.0,
+            "93": 690405888.0,
+            "94": 690405888.0,
+            "95": 690405888.0,
+            "96": 690405888.0,
+            "97": 690405888.0,
+            "98": 690405888.0,
+            "99": 690405888.0,
+            "100": 690405888.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1223321088.0,
+            "52": 1226468864.0,
+            "53": 1226468864.0,
+            "54": 1228565504.0,
+            "55": 1228565504.0,
+            "56": 1228565504.0,
+            "57": 1228565504.0,
+            "58": 1228565504.0,
+            "59": 1228565504.0,
+            "60": 1228565504.0,
+            "61": 1228565504.0,
+            "62": 1228565504.0,
+            "63": 1228565504.0,
+            "64": 1228565504.0,
+            "65": 1228565504.0,
+            "66": 1228565504.0,
+            "67": 1228565504.0,
+            "68": 1228565504.0,
+            "69": 1228565504.0,
+            "70": 1228565504.0,
+            "71": 1228565504.0,
+            "72": 1228565504.0,
+            "73": 1228565504.0,
+            "74": 1228566016.0,
+            "75": 1228566016.0,
+            "76": 1228566016.0,
+            "77": 1228566016.0,
+            "78": 1228566016.0,
+            "79": 1228566016.0,
+            "80": 1228566016.0,
+            "81": 1228566016.0,
+            "82": 1228566016.0,
+            "83": 1228566016.0,
+            "84": 1228566016.0,
+            "85": 1228566016.0,
+            "86": 1228566016.0,
+            "87": 1228566016.0,
+            "88": 1228566016.0,
+            "89": 1228566016.0,
+            "90": 1228566016.0,
+            "91": 1228566016.0,
+            "92": 1228566016.0,
+            "93": 1228566016.0,
+            "94": 1228566016.0,
+            "95": 1228566016.0,
+            "96": 1228566016.0,
+            "97": 1228566016.0,
+            "98": 1228566016.0,
+            "99": 1228566016.0,
+            "100": 1228566016.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 25.67788,
+            "52": 0.27964,
+            "53": 0.25526,
+            "54": 0.2537,
+            "55": 0.2523,
+            "56": 0.25288,
+            "57": 0.25243,
+            "58": 0.2522,
+            "59": 0.25578,
+            "60": 0.25303,
+            "61": 0.25704,
+            "62": 0.25347,
+            "63": 0.2528,
+            "64": 0.25153,
+            "65": 0.25122,
+            "66": 0.25213,
+            "67": 0.25303,
+            "68": 0.2521,
+            "69": 0.25248,
+            "70": 0.25281,
+            "71": 0.25433,
+            "72": 0.25335,
+            "73": 0.2575,
+            "74": 0.25031,
+            "75": 0.25434,
+            "76": 0.2531,
+            "77": 0.25113,
+            "78": 0.24927,
+            "79": 0.24552,
+            "80": 0.24948,
+            "81": 0.24453,
+            "82": 0.24712,
+            "83": 0.2471,
+            "84": 0.24736,
+            "85": 0.24646,
+            "86": 0.24642,
+            "87": 0.24815,
+            "88": 0.2471,
+            "89": 0.24587,
+            "90": 0.24585,
+            "91": 0.24688,
+            "92": 0.24797,
+            "93": 0.25482,
+            "94": 0.2575,
+            "95": 0.25582,
+            "96": 0.25432,
+            "97": 0.25729,
+            "98": 0.25905,
+            "99": 0.2577,
+            "100": 0.25797
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
index 7c012c1a85c..c8c8b2bbc63 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.88759, "5": 10.90192, "10": 10.86852, "15": 10.84829, "20": 10.71772, "25": 10.54267, "30": 10.33644, "35": 10.23973, "40": 10.03267, "45": 9.76819, "50": 9.85325, "55": 9.82266, "60": 9.43752, "65": 8.87843, "70": 9.68161, "75": 9.37198, "80": 9.35656, "85": 9.57143, "90": 9.77728, "95": 9.4856, "100": 9.35907}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 593.0, "5": 652.0, "10": 560.0, "15": 661.0, "20": 582.0, "25": 585.0, "30": 641.0, "35": 776.0, "40": 759.0, "45": 798.0, "50": 914.0, "55": 880.0, "60": 850.0, "65": 943.0, "70": 1067.0, "75": 874.0, "80": 1086.0, "85": 1093.0, "90": 1124.0, "95": 1118.0, "100": 1169.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 609795072.0, "5": 609795072.0, "10": 609795072.0, "15": 609795072.0, "20": 609795072.0, "25": 609795072.0, "30": 609795072.0, "35": 609795072.0, "40": 609795072.0, "45": 609795072.0, "50": 609795072.0, "55": 609795072.0, "60": 609795072.0, "65": 609795072.0, "70": 609795072.0, "75": 609795072.0, "80": 609795072.0, "85": 609795072.0, "90": 609795072.0, "95": 609795072.0, "100": 609795072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 881296384.0, "5": 1141688320.0, "10": 1143770624.0, "15": 1143770624.0, "20": 1143770624.0, "25": 1143770624.0, "30": 1143770624.0, "35": 1143770624.0, "40": 1143770624.0, "45": 1143770624.0, "50": 1143770624.0, "55": 1143770624.0, "60": 1143770624.0, "65": 1143770624.0, "70": 1143770624.0, "75": 1143770624.0, "80": 1143770624.0, "85": 1143770624.0, "90": 1143784448.0, "95": 1143784448.0, "100": 1143784448.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.21791, "5": 0.32637, "10": 0.34092, "15": 0.32491, "20": 0.32495, "25": 0.34258, "30": 0.32373, "35": 0.32364, "40": 0.33541, "45": 0.32433, "50": 0.323, "55": 0.32727, "60": 0.3458, "65": 0.32544, "70": 0.33008, "75": 0.33089, "80": 0.32333, "85": 0.3359, "90": 0.32368, "95": 0.3227, "100": 0.3389}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.88762,
+            "2": 10.90373,
+            "3": 10.87084,
+            "4": 10.8703,
+            "5": 10.90194,
+            "6": 10.90847,
+            "7": 10.88783,
+            "8": 10.87729,
+            "9": 10.88358,
+            "10": 10.86852,
+            "11": 10.88097,
+            "12": 10.88498,
+            "13": 10.90366,
+            "14": 10.89975,
+            "15": 10.84831,
+            "16": 10.84519,
+            "17": 10.80088,
+            "18": 10.82615,
+            "19": 10.81894,
+            "20": 10.71775,
+            "21": 10.69282,
+            "22": 10.57372,
+            "23": 10.70805,
+            "24": 10.58158,
+            "25": 10.54269,
+            "26": 10.60192,
+            "27": 10.59774,
+            "28": 10.55016,
+            "29": 10.5634,
+            "30": 10.33643,
+            "31": 10.09542,
+            "32": 10.43666,
+            "33": 10.43053,
+            "34": 10.1772,
+            "35": 10.23973,
+            "36": 10.18243,
+            "37": 10.30498,
+            "38": 10.14899,
+            "39": 10.35867,
+            "40": 10.03262,
+            "41": 10.08767,
+            "42": 10.16354,
+            "43": 9.78193,
+            "44": 9.89592,
+            "45": 9.76818,
+            "46": 9.76745,
+            "47": 10.08837,
+            "48": 9.78338,
+            "49": 9.4572,
+            "50": 9.85324,
+            "51": 9.78849,
+            "52": 9.67829,
+            "53": 10.01953,
+            "54": 9.90017,
+            "55": 9.82266,
+            "56": 9.5637,
+            "57": 9.4179,
+            "58": 9.77443,
+            "59": 9.52364,
+            "60": 9.43755,
+            "61": 9.64826,
+            "62": 9.9369,
+            "63": 9.30557,
+            "64": 9.72234,
+            "65": 8.87843,
+            "66": 9.65136,
+            "67": 9.31594,
+            "68": 9.73881,
+            "69": 9.74595,
+            "70": 9.68157,
+            "71": 9.56047,
+            "72": 9.5391,
+            "73": 9.44519,
+            "74": 8.88645,
+            "75": 9.37195,
+            "76": 9.03135,
+            "77": 10.03088,
+            "78": 9.68941,
+            "79": 9.33246,
+            "80": 9.35652,
+            "81": 9.43617,
+            "82": 9.65385,
+            "83": 9.25759,
+            "84": 9.36534,
+            "85": 9.57143,
+            "86": 9.03651,
+            "87": 9.55864,
+            "88": 9.70773,
+            "89": 9.55528,
+            "90": 9.77728,
+            "91": 9.29749,
+            "92": 9.32182,
+            "93": 9.02995,
+            "94": 8.78447,
+            "95": 9.4856,
+            "96": 9.48706,
+            "97": 9.27003,
+            "98": 9.63514,
+            "99": 8.83979,
+            "100": 9.35906
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 609.0,
+            "2": 618.0,
+            "3": 638.0,
+            "4": 584.0,
+            "5": 663.0,
+            "6": 688.0,
+            "7": 647.0,
+            "8": 577.0,
+            "9": 690.0,
+            "10": 550.0,
+            "11": 704.0,
+            "12": 610.0,
+            "13": 645.0,
+            "14": 666.0,
+            "15": 652.0,
+            "16": 609.0,
+            "17": 623.0,
+            "18": 625.0,
+            "19": 637.0,
+            "20": 649.0,
+            "21": 668.0,
+            "22": 612.0,
+            "23": 671.0,
+            "24": 619.0,
+            "25": 614.0,
+            "26": 641.0,
+            "27": 611.0,
+            "28": 706.0,
+            "29": 716.0,
+            "30": 663.0,
+            "31": 603.0,
+            "32": 669.0,
+            "33": 760.0,
+            "34": 684.0,
+            "35": 679.0,
+            "36": 731.0,
+            "37": 792.0,
+            "38": 767.0,
+            "39": 852.0,
+            "40": 771.0,
+            "41": 800.0,
+            "42": 830.0,
+            "43": 750.0,
+            "44": 767.0,
+            "45": 821.0,
+            "46": 798.0,
+            "47": 922.0,
+            "48": 902.0,
+            "49": 839.0,
+            "50": 854.0,
+            "51": 960.0,
+            "52": 843.0,
+            "53": 1097.0,
+            "54": 940.0,
+            "55": 904.0,
+            "56": 926.0,
+            "57": 832.0,
+            "58": 1049.0,
+            "59": 948.0,
+            "60": 853.0,
+            "61": 1032.0,
+            "62": 964.0,
+            "63": 951.0,
+            "64": 1077.0,
+            "65": 956.0,
+            "66": 1065.0,
+            "67": 939.0,
+            "68": 1023.0,
+            "69": 1051.0,
+            "70": 1120.0,
+            "71": 1060.0,
+            "72": 849.0,
+            "73": 1014.0,
+            "74": 705.0,
+            "75": 838.0,
+            "76": 1045.0,
+            "77": 1118.0,
+            "78": 1125.0,
+            "79": 977.0,
+            "80": 1113.0,
+            "81": 1149.0,
+            "82": 1071.0,
+            "83": 1023.0,
+            "84": 1117.0,
+            "85": 1070.0,
+            "86": 857.0,
+            "87": 1139.0,
+            "88": 1071.0,
+            "89": 1160.0,
+            "90": 1062.0,
+            "91": 1091.0,
+            "92": 1184.0,
+            "93": 860.0,
+            "94": 1125.0,
+            "95": 1151.0,
+            "96": 1211.0,
+            "97": 1011.0,
+            "98": 1240.0,
+            "99": 1098.0,
+            "100": 1129.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 609140224.0,
+            "2": 609140224.0,
+            "3": 609140224.0,
+            "4": 609140224.0,
+            "5": 609140224.0,
+            "6": 609140224.0,
+            "7": 609140224.0,
+            "8": 609140224.0,
+            "9": 609140224.0,
+            "10": 609140224.0,
+            "11": 609140224.0,
+            "12": 609140224.0,
+            "13": 609140224.0,
+            "14": 609140224.0,
+            "15": 609140224.0,
+            "16": 609140224.0,
+            "17": 609140224.0,
+            "18": 609140224.0,
+            "19": 609140224.0,
+            "20": 609140224.0,
+            "21": 609140224.0,
+            "22": 609140224.0,
+            "23": 609140224.0,
+            "24": 609140224.0,
+            "25": 609140224.0,
+            "26": 609140224.0,
+            "27": 609140224.0,
+            "28": 609140224.0,
+            "29": 609140224.0,
+            "30": 609140224.0,
+            "31": 609140224.0,
+            "32": 609140224.0,
+            "33": 609140224.0,
+            "34": 609140224.0,
+            "35": 609140224.0,
+            "36": 609140224.0,
+            "37": 609140224.0,
+            "38": 609140224.0,
+            "39": 609140224.0,
+            "40": 609140224.0,
+            "41": 609140224.0,
+            "42": 609140224.0,
+            "43": 609140224.0,
+            "44": 609140224.0,
+            "45": 609140224.0,
+            "46": 609140224.0,
+            "47": 609140224.0,
+            "48": 609140224.0,
+            "49": 609140224.0,
+            "50": 609140224.0,
+            "51": 609140224.0,
+            "52": 609140224.0,
+            "53": 609140224.0,
+            "54": 609140224.0,
+            "55": 609140224.0,
+            "56": 609140224.0,
+            "57": 609140224.0,
+            "58": 609140224.0,
+            "59": 609140224.0,
+            "60": 609140224.0,
+            "61": 609140224.0,
+            "62": 609140224.0,
+            "63": 609140224.0,
+            "64": 609140224.0,
+            "65": 609140224.0,
+            "66": 609140224.0,
+            "67": 609140224.0,
+            "68": 609140224.0,
+            "69": 609140224.0,
+            "70": 609140224.0,
+            "71": 609140224.0,
+            "72": 609140224.0,
+            "73": 609140224.0,
+            "74": 609140224.0,
+            "75": 609140224.0,
+            "76": 609140224.0,
+            "77": 609140224.0,
+            "78": 609140224.0,
+            "79": 609140224.0,
+            "80": 609140224.0,
+            "81": 609140224.0,
+            "82": 609140224.0,
+            "83": 609140224.0,
+            "84": 609140224.0,
+            "85": 609140224.0,
+            "86": 609140224.0,
+            "87": 609140224.0,
+            "88": 609140224.0,
+            "89": 609140224.0,
+            "90": 609140224.0,
+            "91": 609140224.0,
+            "92": 609140224.0,
+            "93": 609140224.0,
+            "94": 609140224.0,
+            "95": 609140224.0,
+            "96": 609140224.0,
+            "97": 609140224.0,
+            "98": 609140224.0,
+            "99": 609140224.0,
+            "100": 609140224.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 880223232.0,
+            "2": 1150445056.0,
+            "3": 1150445056.0,
+            "4": 1152542720.0,
+            "5": 1152542720.0,
+            "6": 1152542720.0,
+            "7": 1152542720.0,
+            "8": 1152542720.0,
+            "9": 1152542720.0,
+            "10": 1152542720.0,
+            "11": 1152542720.0,
+            "12": 1152542720.0,
+            "13": 1152542720.0,
+            "14": 1152542720.0,
+            "15": 1152542720.0,
+            "16": 1152542720.0,
+            "17": 1152542720.0,
+            "18": 1152542720.0,
+            "19": 1152542720.0,
+            "20": 1152542720.0,
+            "21": 1152542720.0,
+            "22": 1152542720.0,
+            "23": 1152542720.0,
+            "24": 1152542720.0,
+            "25": 1152542720.0,
+            "26": 1152542720.0,
+            "27": 1153460736.0,
+            "28": 1153460736.0,
+            "29": 1153460736.0,
+            "30": 1153460736.0,
+            "31": 1153460736.0,
+            "32": 1153460736.0,
+            "33": 1153460736.0,
+            "34": 1153460736.0,
+            "35": 1153460736.0,
+            "36": 1153460736.0,
+            "37": 1153460736.0,
+            "38": 1153460736.0,
+            "39": 1153460736.0,
+            "40": 1153460736.0,
+            "41": 1153460736.0,
+            "42": 1153460736.0,
+            "43": 1153460736.0,
+            "44": 1153460736.0,
+            "45": 1153460736.0,
+            "46": 1153460736.0,
+            "47": 1153460736.0,
+            "48": 1153460736.0,
+            "49": 1153460736.0,
+            "50": 1153460736.0,
+            "51": 1153460736.0,
+            "52": 1153460736.0,
+            "53": 1153460736.0,
+            "54": 1153460736.0,
+            "55": 1153460736.0,
+            "56": 1153460736.0,
+            "57": 1153460736.0,
+            "58": 1153460736.0,
+            "59": 1153460736.0,
+            "60": 1153460736.0,
+            "61": 1153460736.0,
+            "62": 1153460736.0,
+            "63": 1153460736.0,
+            "64": 1153460736.0,
+            "65": 1153460736.0,
+            "66": 1153460736.0,
+            "67": 1153460736.0,
+            "68": 1153460736.0,
+            "69": 1153460736.0,
+            "70": 1153460736.0,
+            "71": 1153460736.0,
+            "72": 1153460736.0,
+            "73": 1153460736.0,
+            "74": 1153460736.0,
+            "75": 1153460736.0,
+            "76": 1153460736.0,
+            "77": 1153460736.0,
+            "78": 1153460736.0,
+            "79": 1153460736.0,
+            "80": 1153460736.0,
+            "81": 1153460736.0,
+            "82": 1153460736.0,
+            "83": 1153460736.0,
+            "84": 1153460736.0,
+            "85": 1153460736.0,
+            "86": 1153460736.0,
+            "87": 1153460736.0,
+            "88": 1153460736.0,
+            "89": 1153460736.0,
+            "90": 1153460736.0,
+            "91": 1153460736.0,
+            "92": 1153460736.0,
+            "93": 1153460736.0,
+            "94": 1153460736.0,
+            "95": 1153460736.0,
+            "96": 1153460736.0,
+            "97": 1153460736.0,
+            "98": 1153460736.0,
+            "99": 1153460736.0,
+            "100": 1153460736.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 14.84186,
+            "2": 0.40445,
+            "3": 0.37825,
+            "4": 0.36592,
+            "5": 0.36636,
+            "6": 0.36609,
+            "7": 0.36611,
+            "8": 0.36712,
+            "9": 0.36621,
+            "10": 0.3668,
+            "11": 0.36731,
+            "12": 0.36501,
+            "13": 0.36592,
+            "14": 0.36633,
+            "15": 0.36689,
+            "16": 0.36886,
+            "17": 0.36624,
+            "18": 0.36649,
+            "19": 0.36595,
+            "20": 0.36539,
+            "21": 0.36582,
+            "22": 0.36824,
+            "23": 0.36684,
+            "24": 0.36474,
+            "25": 0.36651,
+            "26": 0.36402,
+            "27": 0.3665,
+            "28": 0.36596,
+            "29": 0.3683,
+            "30": 0.38775,
+            "31": 0.36759,
+            "32": 0.36551,
+            "33": 0.36889,
+            "34": 0.80549,
+            "35": 0.36014,
+            "36": 0.36023,
+            "37": 0.74512,
+            "38": 0.37154,
+            "39": 0.35739,
+            "40": 0.79726,
+            "41": 0.35594,
+            "42": 0.35485,
+            "43": 0.82879,
+            "44": 0.35555,
+            "45": 0.3543,
+            "46": 0.35396,
+            "47": 0.35419,
+            "48": 0.35366,
+            "49": 0.68813,
+            "50": 0.35739,
+            "51": 0.3635,
+            "52": 0.36241,
+            "53": 0.35898,
+            "54": 0.36085,
+            "55": 0.35981,
+            "56": 0.35989,
+            "57": 0.36149,
+            "58": 0.36219,
+            "59": 0.36015,
+            "60": 0.36165,
+            "61": 0.35985,
+            "62": 0.36093,
+            "63": 0.3622,
+            "64": 0.3576,
+            "65": 0.36027,
+            "66": 0.36035,
+            "67": 0.36194,
+            "68": 0.35988,
+            "69": 0.35888,
+            "70": 0.3603,
+            "71": 0.36034,
+            "72": 0.35844,
+            "73": 0.35834,
+            "74": 0.36016,
+            "75": 0.36243,
+            "76": 0.3612,
+            "77": 0.35873,
+            "78": 0.36065,
+            "79": 0.35851,
+            "80": 0.35864,
+            "81": 0.36332,
+            "82": 0.36043,
+            "83": 0.35786,
+            "84": 0.35965,
+            "85": 0.35924,
+            "86": 0.35886,
+            "87": 0.36811,
+            "88": 0.36592,
+            "89": 0.36483,
+            "90": 0.36595,
+            "91": 0.36082,
+            "92": 0.3625,
+            "93": 0.35948,
+            "94": 0.35859,
+            "95": 0.35947,
+            "96": 0.35991,
+            "97": 0.36021,
+            "98": 0.35991,
+            "99": 0.35971,
+            "100": 0.35838
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..a9134cc22bc
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.78849,
+            "52": 9.67829,
+            "53": 10.01954,
+            "54": 9.90021,
+            "55": 9.82264,
+            "56": 9.56375,
+            "57": 9.4179,
+            "58": 9.7744,
+            "59": 9.52369,
+            "60": 9.43754,
+            "61": 9.64825,
+            "62": 9.93694,
+            "63": 9.30556,
+            "64": 9.72236,
+            "65": 8.87844,
+            "66": 9.65135,
+            "67": 9.31592,
+            "68": 9.7388,
+            "69": 9.74594,
+            "70": 9.68162,
+            "71": 9.5605,
+            "72": 9.53911,
+            "73": 9.44523,
+            "74": 8.88645,
+            "75": 9.37201,
+            "76": 9.03136,
+            "77": 10.03083,
+            "78": 9.68941,
+            "79": 9.3325,
+            "80": 9.35653,
+            "81": 9.43622,
+            "82": 9.65384,
+            "83": 9.2576,
+            "84": 9.36531,
+            "85": 9.57144,
+            "86": 9.03655,
+            "87": 9.55863,
+            "88": 9.70775,
+            "89": 9.55528,
+            "90": 9.77727,
+            "91": 9.2975,
+            "92": 9.32182,
+            "93": 9.02989,
+            "94": 8.78447,
+            "95": 9.48562,
+            "96": 9.48704,
+            "97": 9.27003,
+            "98": 9.63514,
+            "99": 8.8398,
+            "100": 9.35907
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1017.0,
+            "52": 937.0,
+            "53": 1026.0,
+            "54": 948.0,
+            "55": 841.0,
+            "56": 980.0,
+            "57": 765.0,
+            "58": 1018.0,
+            "59": 999.0,
+            "60": 874.0,
+            "61": 1056.0,
+            "62": 954.0,
+            "63": 920.0,
+            "64": 1089.0,
+            "65": 884.0,
+            "66": 1087.0,
+            "67": 952.0,
+            "68": 1047.0,
+            "69": 1088.0,
+            "70": 1074.0,
+            "71": 1037.0,
+            "72": 810.0,
+            "73": 1025.0,
+            "74": 741.0,
+            "75": 920.0,
+            "76": 1040.0,
+            "77": 1141.0,
+            "78": 1082.0,
+            "79": 1080.0,
+            "80": 1042.0,
+            "81": 1205.0,
+            "82": 1051.0,
+            "83": 960.0,
+            "84": 1184.0,
+            "85": 1109.0,
+            "86": 797.0,
+            "87": 1202.0,
+            "88": 1015.0,
+            "89": 1139.0,
+            "90": 987.0,
+            "91": 1050.0,
+            "92": 1163.0,
+            "93": 881.0,
+            "94": 1102.0,
+            "95": 1125.0,
+            "96": 1193.0,
+            "97": 1112.0,
+            "98": 1239.0,
+            "99": 1121.0,
+            "100": 1154.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 610975232.0,
+            "52": 610975232.0,
+            "53": 610975232.0,
+            "54": 610975232.0,
+            "55": 610975232.0,
+            "56": 610975232.0,
+            "57": 610975232.0,
+            "58": 610975232.0,
+            "59": 610975232.0,
+            "60": 610975232.0,
+            "61": 610975232.0,
+            "62": 610975232.0,
+            "63": 610975232.0,
+            "64": 610975232.0,
+            "65": 610975232.0,
+            "66": 610975232.0,
+            "67": 610975232.0,
+            "68": 610975232.0,
+            "69": 610975232.0,
+            "70": 610975232.0,
+            "71": 610975232.0,
+            "72": 610975232.0,
+            "73": 610975232.0,
+            "74": 610975232.0,
+            "75": 610975232.0,
+            "76": 610975232.0,
+            "77": 610975232.0,
+            "78": 610975232.0,
+            "79": 610975232.0,
+            "80": 610975232.0,
+            "81": 610975232.0,
+            "82": 610975232.0,
+            "83": 610975232.0,
+            "84": 610975232.0,
+            "85": 610975232.0,
+            "86": 610975232.0,
+            "87": 610975232.0,
+            "88": 610975232.0,
+            "89": 610975232.0,
+            "90": 610975232.0,
+            "91": 610975232.0,
+            "92": 610975232.0,
+            "93": 610975232.0,
+            "94": 610975232.0,
+            "95": 610975232.0,
+            "96": 610975232.0,
+            "97": 610975232.0,
+            "98": 610975232.0,
+            "99": 610975232.0,
+            "100": 610975232.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1146775040.0,
+            "52": 1146776064.0,
+            "53": 1146776064.0,
+            "54": 1146776064.0,
+            "55": 1146776064.0,
+            "56": 1146776064.0,
+            "57": 1146776064.0,
+            "58": 1146776064.0,
+            "59": 1146776064.0,
+            "60": 1146776064.0,
+            "61": 1146776064.0,
+            "62": 1146776064.0,
+            "63": 1146776064.0,
+            "64": 1146776064.0,
+            "65": 1146776064.0,
+            "66": 1146776064.0,
+            "67": 1147824640.0,
+            "68": 1147824640.0,
+            "69": 1147824640.0,
+            "70": 1147824640.0,
+            "71": 1147824640.0,
+            "72": 1147824640.0,
+            "73": 1147824640.0,
+            "74": 1147824640.0,
+            "75": 1147824640.0,
+            "76": 1147824640.0,
+            "77": 1147824640.0,
+            "78": 1147824640.0,
+            "79": 1147824640.0,
+            "80": 1147824640.0,
+            "81": 1147824640.0,
+            "82": 1147824640.0,
+            "83": 1147824640.0,
+            "84": 1147824640.0,
+            "85": 1147824640.0,
+            "86": 1147824640.0,
+            "87": 1147824640.0,
+            "88": 1147824640.0,
+            "89": 1147824640.0,
+            "90": 1147824640.0,
+            "91": 1147824640.0,
+            "92": 1147824640.0,
+            "93": 1147824640.0,
+            "94": 1147824640.0,
+            "95": 1147824640.0,
+            "96": 1147824640.0,
+            "97": 1147824640.0,
+            "98": 1147824640.0,
+            "99": 1147824640.0,
+            "100": 1147824640.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 14.91489,
+            "52": 0.3901,
+            "53": 0.37105,
+            "54": 0.36976,
+            "55": 0.36846,
+            "56": 0.36819,
+            "57": 0.36943,
+            "58": 0.36873,
+            "59": 0.37048,
+            "60": 0.3696,
+            "61": 0.36867,
+            "62": 0.36991,
+            "63": 0.36919,
+            "64": 0.36728,
+            "65": 0.36884,
+            "66": 0.37058,
+            "67": 0.36765,
+            "68": 0.36925,
+            "69": 0.36821,
+            "70": 0.36876,
+            "71": 0.36845,
+            "72": 0.36856,
+            "73": 0.36946,
+            "74": 0.36927,
+            "75": 0.36875,
+            "76": 0.36813,
+            "77": 0.37033,
+            "78": 0.36854,
+            "79": 0.36796,
+            "80": 0.36964,
+            "81": 0.36883,
+            "82": 0.36983,
+            "83": 0.37114,
+            "84": 0.36966,
+            "85": 0.36965,
+            "86": 0.36722,
+            "87": 0.36512,
+            "88": 0.3663,
+            "89": 0.36544,
+            "90": 0.3634,
+            "91": 0.36718,
+            "92": 0.3648,
+            "93": 0.36513,
+            "94": 0.36611,
+            "95": 0.3655,
+            "96": 0.36533,
+            "97": 0.3669,
+            "98": 0.36517,
+            "99": 0.36574,
+            "100": 0.36518
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..52d95069cff
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.87192,
+            "2": 10.87243,
+            "3": 10.86245,
+            "4": 10.84367,
+            "5": 10.87782,
+            "6": 10.89351,
+            "7": 10.87195,
+            "8": 10.87656,
+            "9": 10.86866,
+            "10": 10.83844,
+            "11": 10.87549,
+            "12": 10.87587,
+            "13": 10.89089,
+            "14": 10.89697,
+            "15": 10.83165,
+            "16": 10.82447,
+            "17": 10.80203,
+            "18": 10.82966,
+            "19": 10.82308,
+            "20": 10.73682,
+            "21": 10.71008,
+            "22": 10.56492,
+            "23": 10.73066,
+            "24": 10.60695,
+            "25": 10.55578,
+            "26": 10.62423,
+            "27": 10.6196,
+            "28": 10.57904,
+            "29": 10.60302,
+            "30": 10.38932,
+            "31": 10.12985,
+            "32": 10.47779,
+            "33": 10.47516,
+            "34": 10.22981,
+            "35": 10.28817,
+            "36": 10.23457,
+            "37": 10.35363,
+            "38": 10.20006,
+            "39": 10.41054,
+            "40": 10.09837,
+            "41": 10.13918,
+            "42": 10.22109,
+            "43": 9.85049,
+            "44": 9.95421,
+            "45": 9.84312,
+            "46": 9.82557,
+            "47": 10.13684,
+            "48": 9.8549,
+            "49": 9.53552,
+            "50": 9.91111,
+            "51": 9.85898,
+            "52": 9.75133,
+            "53": 10.06617,
+            "54": 9.95613,
+            "55": 9.89104,
+            "56": 9.62508,
+            "57": 9.47981,
+            "58": 9.83478,
+            "59": 9.58498,
+            "60": 9.49806,
+            "61": 9.69192,
+            "62": 9.98825,
+            "63": 9.37824,
+            "64": 9.76808,
+            "65": 8.94514,
+            "66": 9.70125,
+            "67": 9.37149,
+            "68": 9.78313,
+            "69": 9.79923,
+            "70": 9.7312,
+            "71": 9.62753,
+            "72": 9.58452,
+            "73": 9.48417,
+            "74": 8.92523,
+            "75": 9.4118,
+            "76": 9.0796,
+            "77": 10.06083,
+            "78": 9.7215,
+            "79": 9.38109,
+            "80": 9.40161,
+            "81": 9.48468,
+            "82": 9.70219,
+            "83": 9.31549,
+            "84": 9.41786,
+            "85": 9.61785,
+            "86": 9.077,
+            "87": 9.59967,
+            "88": 9.75356,
+            "89": 9.60341,
+            "90": 9.82789,
+            "91": 9.33668,
+            "92": 9.36036,
+            "93": 9.08765,
+            "94": 8.83052,
+            "95": 9.5296,
+            "96": 9.53024,
+            "97": 9.30627,
+            "98": 9.67298,
+            "99": 8.89917,
+            "100": 9.40828
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1622.0,
+            "2": 1753.0,
+            "3": 1697.0,
+            "4": 1783.0,
+            "5": 2009.0,
+            "6": 1855.0,
+            "7": 1765.0,
+            "8": 1627.0,
+            "9": 1798.0,
+            "10": 1429.0,
+            "11": 1819.0,
+            "12": 1654.0,
+            "13": 1862.0,
+            "14": 1742.0,
+            "15": 1868.0,
+            "16": 1932.0,
+            "17": 1713.0,
+            "18": 1692.0,
+            "19": 1721.0,
+            "20": 1579.0,
+            "21": 1788.0,
+            "22": 1769.0,
+            "23": 1944.0,
+            "24": 1664.0,
+            "25": 1628.0,
+            "26": 1641.0,
+            "27": 1835.0,
+            "28": 1956.0,
+            "29": 2013.0,
+            "30": 1885.0,
+            "31": 1576.0,
+            "32": 1933.0,
+            "33": 2119.0,
+            "34": 1856.0,
+            "35": 1965.0,
+            "36": 1971.0,
+            "37": 2255.0,
+            "38": 2088.0,
+            "39": 2451.0,
+            "40": 2172.0,
+            "41": 2296.0,
+            "42": 2276.0,
+            "43": 1969.0,
+            "44": 2094.0,
+            "45": 2044.0,
+            "46": 2227.0,
+            "47": 2648.0,
+            "48": 2394.0,
+            "49": 2407.0,
+            "50": 2297.0,
+            "51": 2554.0,
+            "52": 2466.0,
+            "53": 2923.0,
+            "54": 2612.0,
+            "55": 2351.0,
+            "56": 2757.0,
+            "57": 2313.0,
+            "58": 2798.0,
+            "59": 2750.0,
+            "60": 2376.0,
+            "61": 2848.0,
+            "62": 2668.0,
+            "63": 2468.0,
+            "64": 2818.0,
+            "65": 2630.0,
+            "66": 2992.0,
+            "67": 2802.0,
+            "68": 2794.0,
+            "69": 2851.0,
+            "70": 3059.0,
+            "71": 2869.0,
+            "72": 2424.0,
+            "73": 3035.0,
+            "74": 2113.0,
+            "75": 2485.0,
+            "76": 2782.0,
+            "77": 3252.0,
+            "78": 3149.0,
+            "79": 3192.0,
+            "80": 3229.0,
+            "81": 3397.0,
+            "82": 3297.0,
+            "83": 2766.0,
+            "84": 3192.0,
+            "85": 3206.0,
+            "86": 2648.0,
+            "87": 3709.0,
+            "88": 2962.0,
+            "89": 3273.0,
+            "90": 3149.0,
+            "91": 2825.0,
+            "92": 3047.0,
+            "93": 2918.0,
+            "94": 3432.0,
+            "95": 3266.0,
+            "96": 3574.0,
+            "97": 3190.0,
+            "98": 3564.0,
+            "99": 2977.0,
+            "100": 3249.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 491224576.0,
+            "2": 491224576.0,
+            "3": 491224576.0,
+            "4": 491224576.0,
+            "5": 491224576.0,
+            "6": 491224576.0,
+            "7": 491224576.0,
+            "8": 491224576.0,
+            "9": 491224576.0,
+            "10": 491224576.0,
+            "11": 491224576.0,
+            "12": 491224576.0,
+            "13": 491224576.0,
+            "14": 491224576.0,
+            "15": 491224576.0,
+            "16": 491224576.0,
+            "17": 491224576.0,
+            "18": 491224576.0,
+            "19": 491224576.0,
+            "20": 491224576.0,
+            "21": 491224576.0,
+            "22": 491224576.0,
+            "23": 491224576.0,
+            "24": 491224576.0,
+            "25": 491224576.0,
+            "26": 491224576.0,
+            "27": 491224576.0,
+            "28": 491224576.0,
+            "29": 491224576.0,
+            "30": 491224576.0,
+            "31": 491224576.0,
+            "32": 491224576.0,
+            "33": 491224576.0,
+            "34": 491224576.0,
+            "35": 491224576.0,
+            "36": 491224576.0,
+            "37": 491224576.0,
+            "38": 491224576.0,
+            "39": 491224576.0,
+            "40": 491224576.0,
+            "41": 491224576.0,
+            "42": 491224576.0,
+            "43": 491224576.0,
+            "44": 491224576.0,
+            "45": 491224576.0,
+            "46": 491224576.0,
+            "47": 491224576.0,
+            "48": 491224576.0,
+            "49": 491224576.0,
+            "50": 491224576.0,
+            "51": 491224576.0,
+            "52": 491224576.0,
+            "53": 491224576.0,
+            "54": 491224576.0,
+            "55": 491224576.0,
+            "56": 491224576.0,
+            "57": 491224576.0,
+            "58": 491224576.0,
+            "59": 491224576.0,
+            "60": 491224576.0,
+            "61": 491224576.0,
+            "62": 491224576.0,
+            "63": 491224576.0,
+            "64": 491224576.0,
+            "65": 491224576.0,
+            "66": 491224576.0,
+            "67": 491224576.0,
+            "68": 491224576.0,
+            "69": 491224576.0,
+            "70": 491224576.0,
+            "71": 491224576.0,
+            "72": 491224576.0,
+            "73": 491224576.0,
+            "74": 491224576.0,
+            "75": 491224576.0,
+            "76": 491224576.0,
+            "77": 491224576.0,
+            "78": 491224576.0,
+            "79": 491224576.0,
+            "80": 491224576.0,
+            "81": 491224576.0,
+            "82": 491224576.0,
+            "83": 491224576.0,
+            "84": 491224576.0,
+            "85": 491224576.0,
+            "86": 491224576.0,
+            "87": 491224576.0,
+            "88": 491224576.0,
+            "89": 491224576.0,
+            "90": 491224576.0,
+            "91": 491224576.0,
+            "92": 491224576.0,
+            "93": 491224576.0,
+            "94": 491224576.0,
+            "95": 491224576.0,
+            "96": 491224576.0,
+            "97": 491224576.0,
+            "98": 491224576.0,
+            "99": 491224576.0,
+            "100": 491224576.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1578442240.0,
+            "2": 1706868224.0,
+            "3": 1706868224.0,
+            "4": 1706868224.0,
+            "5": 1706868224.0,
+            "6": 1706868224.0,
+            "7": 1706868224.0,
+            "8": 1706868224.0,
+            "9": 1706868224.0,
+            "10": 1706868224.0,
+            "11": 1706868224.0,
+            "12": 1706868224.0,
+            "13": 1706868224.0,
+            "14": 1706868224.0,
+            "15": 1706868224.0,
+            "16": 1706868224.0,
+            "17": 1706868224.0,
+            "18": 1706868224.0,
+            "19": 1706868224.0,
+            "20": 1706868224.0,
+            "21": 1706868224.0,
+            "22": 1706868224.0,
+            "23": 1706868224.0,
+            "24": 1706868224.0,
+            "25": 1706868224.0,
+            "26": 1706868224.0,
+            "27": 1706868224.0,
+            "28": 1706868224.0,
+            "29": 1706868224.0,
+            "30": 1706868224.0,
+            "31": 1706868224.0,
+            "32": 1706868224.0,
+            "33": 1706868224.0,
+            "34": 1706868224.0,
+            "35": 1706868224.0,
+            "36": 1706868224.0,
+            "37": 1706868224.0,
+            "38": 1706868224.0,
+            "39": 1706868224.0,
+            "40": 1706868224.0,
+            "41": 1706868224.0,
+            "42": 1706868224.0,
+            "43": 1706868224.0,
+            "44": 1706868224.0,
+            "45": 1706868224.0,
+            "46": 1706868224.0,
+            "47": 1706868224.0,
+            "48": 1706868224.0,
+            "49": 1706868224.0,
+            "50": 1706868224.0,
+            "51": 1706868224.0,
+            "52": 1706868224.0,
+            "53": 1706868224.0,
+            "54": 1706868224.0,
+            "55": 1706868224.0,
+            "56": 1706868224.0,
+            "57": 1706868224.0,
+            "58": 1706868224.0,
+            "59": 1706868224.0,
+            "60": 1706868224.0,
+            "61": 1706868224.0,
+            "62": 1706868224.0,
+            "63": 1706868224.0,
+            "64": 1706868224.0,
+            "65": 1706868224.0,
+            "66": 1706868224.0,
+            "67": 1706868224.0,
+            "68": 1706868224.0,
+            "69": 1706868224.0,
+            "70": 1706868224.0,
+            "71": 1706868224.0,
+            "72": 1706868224.0,
+            "73": 1706868224.0,
+            "74": 1706868224.0,
+            "75": 1706868224.0,
+            "76": 1706868224.0,
+            "77": 1706868224.0,
+            "78": 1706868224.0,
+            "79": 1706868224.0,
+            "80": 1706868224.0,
+            "81": 1706868224.0,
+            "82": 1706868224.0,
+            "83": 1706868224.0,
+            "84": 1706868224.0,
+            "85": 1706868224.0,
+            "86": 1706868224.0,
+            "87": 1706868224.0,
+            "88": 1706868224.0,
+            "89": 1706868224.0,
+            "90": 1706868224.0,
+            "91": 1706868224.0,
+            "92": 1706868224.0,
+            "93": 1706868224.0,
+            "94": 1706868224.0,
+            "95": 1706868224.0,
+            "96": 1706868224.0,
+            "97": 1706868224.0,
+            "98": 1706868224.0,
+            "99": 1706868224.0,
+            "100": 1706868224.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.23604,
+            "3": 0.17137,
+            "4": 0.15759,
+            "5": 0.15886,
+            "6": 0.15986,
+            "7": 0.16006,
+            "8": 0.16032,
+            "9": 0.15956,
+            "10": 0.15866,
+            "11": 0.16034,
+            "12": 0.161,
+            "13": 0.16092,
+            "14": 0.16138,
+            "15": 0.16079,
+            "16": 0.16106,
+            "17": 0.16054,
+            "18": 0.16039,
+            "19": 0.15987,
+            "20": 0.1604,
+            "21": 0.1606,
+            "22": 0.1605,
+            "23": 0.16063,
+            "24": 0.16081,
+            "25": 0.16081,
+            "26": 0.16009,
+            "27": 0.16063,
+            "28": 0.16056,
+            "29": 0.16163,
+            "30": 0.16078,
+            "31": 0.16052,
+            "32": 0.16157,
+            "33": 0.16141,
+            "34": 0.1609,
+            "35": 0.16067,
+            "36": 0.164,
+            "37": 0.16064,
+            "38": 0.16086,
+            "39": 0.16108,
+            "40": 0.1619,
+            "41": 0.15987,
+            "42": 0.16141,
+            "43": 0.16345,
+            "44": 0.15987,
+            "45": 0.16151,
+            "46": 0.16073,
+            "47": 0.16034,
+            "48": 0.15782,
+            "49": 0.15892,
+            "50": 0.15976,
+            "51": 0.17905,
+            "52": 0.16268,
+            "53": 0.15809,
+            "54": 0.15783,
+            "55": 0.1601,
+            "56": 0.16197,
+            "57": 0.16434,
+            "58": 0.16544,
+            "59": 0.16658,
+            "60": 0.16487,
+            "61": 0.16473,
+            "62": 0.1655,
+            "63": 0.16592,
+            "64": 0.1663,
+            "65": 0.16721,
+            "66": 0.16644,
+            "67": 0.16736,
+            "68": 0.16682,
+            "69": 0.16612,
+            "70": 0.1673,
+            "71": 0.1652,
+            "72": 0.16908,
+            "73": 0.16732,
+            "74": 0.16542,
+            "75": 0.16546,
+            "76": 0.16719,
+            "77": 0.16547,
+            "78": 0.16719,
+            "79": 0.16664,
+            "80": 0.16167,
+            "81": 0.163,
+            "82": 0.16309,
+            "83": 0.16502,
+            "84": 0.16364,
+            "85": 0.1648,
+            "86": 0.16491,
+            "87": 0.16514,
+            "88": 0.16442,
+            "89": 0.16591,
+            "90": 0.16301,
+            "91": 0.16462,
+            "92": 0.16639,
+            "93": 0.16358,
+            "94": 0.16489,
+            "95": 0.16504,
+            "96": 0.16457,
+            "97": 0.163,
+            "98": 0.16359,
+            "99": 0.16433,
+            "100": 0.16527
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..a3bcbb68249
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85898,
+            "52": 9.75133,
+            "53": 10.06617,
+            "54": 9.95613,
+            "55": 9.89104,
+            "56": 9.62508,
+            "57": 9.47981,
+            "58": 9.83478,
+            "59": 9.58498,
+            "60": 9.49806,
+            "61": 9.69192,
+            "62": 9.98825,
+            "63": 9.37824,
+            "64": 9.76808,
+            "65": 8.94514,
+            "66": 9.70125,
+            "67": 9.37149,
+            "68": 9.78313,
+            "69": 9.79923,
+            "70": 9.7312,
+            "71": 9.62753,
+            "72": 9.58452,
+            "73": 9.48417,
+            "74": 8.92523,
+            "75": 9.4118,
+            "76": 9.0796,
+            "77": 10.06083,
+            "78": 9.7215,
+            "79": 9.38109,
+            "80": 9.40161,
+            "81": 9.48468,
+            "82": 9.70219,
+            "83": 9.31549,
+            "84": 9.41786,
+            "85": 9.61785,
+            "86": 9.077,
+            "87": 9.59967,
+            "88": 9.75356,
+            "89": 9.60341,
+            "90": 9.82789,
+            "91": 9.33668,
+            "92": 9.36036,
+            "93": 9.08765,
+            "94": 8.83052,
+            "95": 9.5296,
+            "96": 9.53024,
+            "97": 9.30627,
+            "98": 9.67298,
+            "99": 8.89917,
+            "100": 9.40828
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2554.0,
+            "52": 2466.0,
+            "53": 2923.0,
+            "54": 2612.0,
+            "55": 2351.0,
+            "56": 2757.0,
+            "57": 2313.0,
+            "58": 2798.0,
+            "59": 2750.0,
+            "60": 2376.0,
+            "61": 2848.0,
+            "62": 2668.0,
+            "63": 2468.0,
+            "64": 2818.0,
+            "65": 2630.0,
+            "66": 2992.0,
+            "67": 2802.0,
+            "68": 2794.0,
+            "69": 2851.0,
+            "70": 3059.0,
+            "71": 2869.0,
+            "72": 2424.0,
+            "73": 3035.0,
+            "74": 2113.0,
+            "75": 2485.0,
+            "76": 2782.0,
+            "77": 3252.0,
+            "78": 3149.0,
+            "79": 3192.0,
+            "80": 3229.0,
+            "81": 3397.0,
+            "82": 3297.0,
+            "83": 2766.0,
+            "84": 3192.0,
+            "85": 3206.0,
+            "86": 2648.0,
+            "87": 3709.0,
+            "88": 2962.0,
+            "89": 3273.0,
+            "90": 3149.0,
+            "91": 2825.0,
+            "92": 3047.0,
+            "93": 2918.0,
+            "94": 3432.0,
+            "95": 3266.0,
+            "96": 3574.0,
+            "97": 3190.0,
+            "98": 3564.0,
+            "99": 2977.0,
+            "100": 3249.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 492274176.0,
+            "52": 492274176.0,
+            "53": 492274176.0,
+            "54": 492274176.0,
+            "55": 492274176.0,
+            "56": 492274176.0,
+            "57": 492274176.0,
+            "58": 492274176.0,
+            "59": 492274176.0,
+            "60": 492274176.0,
+            "61": 492274176.0,
+            "62": 492274176.0,
+            "63": 492274176.0,
+            "64": 492274176.0,
+            "65": 492274176.0,
+            "66": 492274176.0,
+            "67": 492274176.0,
+            "68": 492274176.0,
+            "69": 492274176.0,
+            "70": 492274176.0,
+            "71": 492274176.0,
+            "72": 492274176.0,
+            "73": 492274176.0,
+            "74": 492274176.0,
+            "75": 492274176.0,
+            "76": 492274176.0,
+            "77": 492274176.0,
+            "78": 492274176.0,
+            "79": 492274176.0,
+            "80": 492274176.0,
+            "81": 492274176.0,
+            "82": 492274176.0,
+            "83": 492274176.0,
+            "84": 492274176.0,
+            "85": 492274176.0,
+            "86": 492274176.0,
+            "87": 492274176.0,
+            "88": 492274176.0,
+            "89": 492274176.0,
+            "90": 492274176.0,
+            "91": 492274176.0,
+            "92": 492274176.0,
+            "93": 492274176.0,
+            "94": 492274176.0,
+            "95": 492274176.0,
+            "96": 492274176.0,
+            "97": 492274176.0,
+            "98": 492274176.0,
+            "99": 492274176.0,
+            "100": 492274176.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1709489664.0,
+            "52": 1709490688.0,
+            "53": 1709490688.0,
+            "54": 1709490688.0,
+            "55": 1709490688.0,
+            "56": 1709490688.0,
+            "57": 1709490688.0,
+            "58": 1709490688.0,
+            "59": 1709490688.0,
+            "60": 1709490688.0,
+            "61": 1709490688.0,
+            "62": 1709490688.0,
+            "63": 1709490688.0,
+            "64": 1709490688.0,
+            "65": 1709490688.0,
+            "66": 1709490688.0,
+            "67": 1709490688.0,
+            "68": 1709490688.0,
+            "69": 1709490688.0,
+            "70": 1709490688.0,
+            "71": 1709490688.0,
+            "72": 1709490688.0,
+            "73": 1709490688.0,
+            "74": 1709490688.0,
+            "75": 1709490688.0,
+            "76": 1709490688.0,
+            "77": 1709490688.0,
+            "78": 1709490688.0,
+            "79": 1709490688.0,
+            "80": 1709490688.0,
+            "81": 1709490688.0,
+            "82": 1709490688.0,
+            "83": 1709490688.0,
+            "84": 1709490688.0,
+            "85": 1709490688.0,
+            "86": 1709490688.0,
+            "87": 1709490688.0,
+            "88": 1709490688.0,
+            "89": 1709490688.0,
+            "90": 1709490688.0,
+            "91": 1709490688.0,
+            "92": 1709490688.0,
+            "93": 1709490688.0,
+            "94": 1709490688.0,
+            "95": 1709490688.0,
+            "96": 1709490688.0,
+            "97": 1709490688.0,
+            "98": 1709490688.0,
+            "99": 1709490688.0,
+            "100": 1709490688.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 3.02213,
+            "53": 0.17862,
+            "54": 0.16745,
+            "55": 0.16648,
+            "56": 0.16673,
+            "57": 0.16292,
+            "58": 0.19638,
+            "59": 0.16818,
+            "60": 0.16539,
+            "61": 0.16364,
+            "62": 0.16301,
+            "63": 0.1601,
+            "64": 0.16122,
+            "65": 0.16293,
+            "66": 0.16244,
+            "67": 0.16253,
+            "68": 0.16237,
+            "69": 0.16026,
+            "70": 0.17045,
+            "71": 0.15999,
+            "72": 0.1709,
+            "73": 0.16315,
+            "74": 0.1602,
+            "75": 0.15985,
+            "76": 0.15963,
+            "77": 0.15943,
+            "78": 0.15987,
+            "79": 0.16,
+            "80": 0.16033,
+            "81": 0.16099,
+            "82": 0.16037,
+            "83": 0.16139,
+            "84": 0.16563,
+            "85": 0.16553,
+            "86": 0.16519,
+            "87": 0.16488,
+            "88": 0.16176,
+            "89": 0.16185,
+            "90": 0.16148,
+            "91": 0.16234,
+            "92": 0.1601,
+            "93": 0.16147,
+            "94": 0.16081,
+            "95": 0.16284,
+            "96": 0.16144,
+            "97": 0.16225,
+            "98": 0.16162,
+            "99": 0.16044,
+            "100": 0.16202
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
index 077c5e1317a..13ad7566828 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 490700288.0,
-            "2": 490700288.0,
-            "3": 490700288.0,
-            "4": 490700288.0,
-            "5": 490700288.0,
-            "6": 490700288.0,
-            "7": 490700288.0,
-            "8": 490700288.0,
-            "9": 490700288.0,
-            "10": 490700288.0,
-            "11": 490700288.0,
-            "12": 490700288.0,
-            "13": 490700288.0,
-            "14": 490700288.0,
-            "15": 490700288.0,
-            "16": 490700288.0,
-            "17": 490700288.0,
-            "18": 490700288.0,
-            "19": 490700288.0,
-            "20": 490700288.0,
-            "21": 490700288.0,
-            "22": 490700288.0,
-            "23": 490700288.0,
-            "24": 490700288.0,
-            "25": 490700288.0,
-            "26": 490700288.0,
-            "27": 490700288.0,
-            "28": 490700288.0,
-            "29": 490700288.0,
-            "30": 490700288.0,
-            "31": 490700288.0,
-            "32": 490700288.0,
-            "33": 490700288.0,
-            "34": 490700288.0,
-            "35": 490700288.0,
-            "36": 490700288.0,
-            "37": 490700288.0,
-            "38": 490700288.0,
-            "39": 490700288.0,
-            "40": 490700288.0,
-            "41": 490700288.0,
-            "42": 490700288.0,
-            "43": 490700288.0,
-            "44": 490700288.0,
-            "45": 490700288.0,
-            "46": 490700288.0,
-            "47": 490700288.0,
-            "48": 490700288.0,
-            "49": 490700288.0,
-            "50": 490700288.0,
-            "51": 490700288.0,
-            "52": 490700288.0,
-            "53": 490700288.0,
-            "54": 490700288.0,
-            "55": 490700288.0,
-            "56": 490700288.0,
-            "57": 490700288.0,
-            "58": 490700288.0,
-            "59": 490700288.0,
-            "60": 490700288.0,
-            "61": 490700288.0,
-            "62": 490700288.0,
-            "63": 490700288.0,
-            "64": 490700288.0,
-            "65": 490700288.0,
-            "66": 490700288.0,
-            "67": 490700288.0,
-            "68": 490700288.0,
-            "69": 490700288.0,
-            "70": 490700288.0,
-            "71": 490700288.0,
-            "72": 490700288.0,
-            "73": 490700288.0,
-            "74": 490700288.0,
-            "75": 490700288.0,
-            "76": 490700288.0,
-            "77": 490700288.0,
-            "78": 490700288.0,
-            "79": 490700288.0,
-            "80": 490700288.0,
-            "81": 490700288.0,
-            "82": 490700288.0,
-            "83": 490700288.0,
-            "84": 490700288.0,
-            "85": 490700288.0,
-            "86": 490700288.0,
-            "87": 490700288.0,
-            "88": 490700288.0,
-            "89": 490700288.0,
-            "90": 490700288.0,
-            "91": 490700288.0,
-            "92": 490700288.0,
-            "93": 490700288.0,
-            "94": 490700288.0,
-            "95": 490700288.0,
-            "96": 490700288.0,
-            "97": 490700288.0,
-            "98": 490700288.0,
-            "99": 490700288.0,
-            "100": 490700288.0
+            "1": 491224576.0,
+            "2": 491224576.0,
+            "3": 491224576.0,
+            "4": 491224576.0,
+            "5": 491224576.0,
+            "6": 491224576.0,
+            "7": 491224576.0,
+            "8": 491224576.0,
+            "9": 491224576.0,
+            "10": 491224576.0,
+            "11": 491224576.0,
+            "12": 491224576.0,
+            "13": 491224576.0,
+            "14": 491224576.0,
+            "15": 491224576.0,
+            "16": 491224576.0,
+            "17": 491224576.0,
+            "18": 491224576.0,
+            "19": 491224576.0,
+            "20": 491224576.0,
+            "21": 491224576.0,
+            "22": 491224576.0,
+            "23": 491224576.0,
+            "24": 491224576.0,
+            "25": 491224576.0,
+            "26": 491224576.0,
+            "27": 491224576.0,
+            "28": 491224576.0,
+            "29": 491224576.0,
+            "30": 491224576.0,
+            "31": 491224576.0,
+            "32": 491224576.0,
+            "33": 491224576.0,
+            "34": 491224576.0,
+            "35": 491224576.0,
+            "36": 491224576.0,
+            "37": 491224576.0,
+            "38": 491224576.0,
+            "39": 491224576.0,
+            "40": 491224576.0,
+            "41": 491224576.0,
+            "42": 491224576.0,
+            "43": 491224576.0,
+            "44": 491224576.0,
+            "45": 491224576.0,
+            "46": 491224576.0,
+            "47": 491224576.0,
+            "48": 491224576.0,
+            "49": 491224576.0,
+            "50": 491224576.0,
+            "51": 491224576.0,
+            "52": 491224576.0,
+            "53": 491224576.0,
+            "54": 491224576.0,
+            "55": 491224576.0,
+            "56": 491224576.0,
+            "57": 491224576.0,
+            "58": 491224576.0,
+            "59": 491224576.0,
+            "60": 491224576.0,
+            "61": 491224576.0,
+            "62": 491224576.0,
+            "63": 491224576.0,
+            "64": 491224576.0,
+            "65": 491224576.0,
+            "66": 491224576.0,
+            "67": 491224576.0,
+            "68": 491224576.0,
+            "69": 491224576.0,
+            "70": 491224576.0,
+            "71": 491224576.0,
+            "72": 491224576.0,
+            "73": 491224576.0,
+            "74": 491224576.0,
+            "75": 491224576.0,
+            "76": 491224576.0,
+            "77": 491224576.0,
+            "78": 491224576.0,
+            "79": 491224576.0,
+            "80": 491224576.0,
+            "81": 491224576.0,
+            "82": 491224576.0,
+            "83": 491224576.0,
+            "84": 491224576.0,
+            "85": 491224576.0,
+            "86": 491224576.0,
+            "87": 491224576.0,
+            "88": 491224576.0,
+            "89": 491224576.0,
+            "90": 491224576.0,
+            "91": 491224576.0,
+            "92": 491224576.0,
+            "93": 491224576.0,
+            "94": 491224576.0,
+            "95": 491224576.0,
+            "96": 491224576.0,
+            "97": 491224576.0,
+            "98": 491224576.0,
+            "99": 491224576.0,
+            "100": 491224576.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,7 +325,7 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1553275392.0,
+            "1": 1553276416.0,
             "2": 1681702400.0,
             "3": 1681702400.0,
             "4": 1681702400.0,
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.96096,
-            "2": 0.14328,
-            "3": 0.13234,
-            "4": 0.12983,
-            "5": 0.1339,
-            "6": 0.13424,
-            "7": 0.13558,
-            "8": 0.13644,
-            "9": 0.13434,
-            "10": 0.13106,
-            "11": 0.13377,
-            "12": 0.13148,
-            "13": 0.13136,
-            "14": 0.13331,
-            "15": 0.13429,
-            "16": 0.13208,
-            "17": 0.1316,
-            "18": 0.13139,
-            "19": 0.1287,
-            "20": 0.13199,
-            "21": 0.1318,
-            "22": 0.13196,
-            "23": 0.13019,
-            "24": 0.1317,
-            "25": 0.13217,
-            "26": 0.12983,
-            "27": 0.12928,
-            "28": 0.13258,
-            "29": 0.13441,
-            "30": 0.13276,
-            "31": 0.13264,
-            "32": 0.13228,
-            "33": 0.13159,
-            "34": 0.13219,
-            "35": 0.133,
-            "36": 0.13166,
-            "37": 0.13174,
-            "38": 0.1304,
-            "39": 0.1314,
-            "40": 0.13029,
-            "41": 0.13074,
-            "42": 0.12839,
-            "43": 0.13136,
-            "44": 0.13209,
-            "45": 0.12923,
-            "46": 0.13318,
-            "47": 0.1319,
-            "48": 0.13259,
-            "49": 0.13079,
-            "50": 0.12933,
-            "51": 0.15172,
-            "52": 0.1333,
-            "53": 0.14462,
-            "54": 0.13216,
-            "55": 0.13399,
-            "56": 0.13553,
-            "57": 0.13325,
-            "58": 0.13361,
-            "59": 0.13333,
-            "60": 0.13354,
-            "61": 0.13207,
-            "62": 0.1338,
-            "63": 0.13105,
-            "64": 0.13392,
-            "65": 0.13319,
-            "66": 0.13384,
-            "67": 0.13217,
-            "68": 0.13367,
-            "69": 0.13229,
-            "70": 0.13221,
-            "71": 0.1335,
-            "72": 0.13557,
-            "73": 0.13385,
-            "74": 0.13485,
-            "75": 0.13327,
-            "76": 0.13288,
-            "77": 0.13329,
-            "78": 0.13402,
-            "79": 0.13416,
-            "80": 0.13423,
-            "81": 0.13316,
-            "82": 0.13278,
-            "83": 0.13364,
-            "84": 0.13264,
-            "85": 0.13203,
-            "86": 0.13235,
-            "87": 0.13381,
-            "88": 0.13365,
-            "89": 0.13338,
-            "90": 0.1334,
-            "91": 0.13418,
-            "92": 0.13669,
-            "93": 0.13477,
-            "94": 0.13244,
-            "95": 0.13237,
-            "96": 0.13182,
-            "97": 0.13149,
-            "98": 0.13223,
-            "99": 0.13163,
-            "100": 0.1326
+            "1": 13.28736,
+            "2": 0.1399,
+            "3": 0.12618,
+            "4": 0.10709,
+            "5": 0.11408,
+            "6": 0.10894,
+            "7": 0.10708,
+            "8": 0.10773,
+            "9": 0.10787,
+            "10": 0.10884,
+            "11": 0.10818,
+            "12": 0.10774,
+            "13": 0.1067,
+            "14": 0.1065,
+            "15": 0.10599,
+            "16": 0.10552,
+            "17": 0.10782,
+            "18": 0.10913,
+            "19": 0.10816,
+            "20": 0.10759,
+            "21": 0.108,
+            "22": 0.10902,
+            "23": 0.1076,
+            "24": 0.1068,
+            "25": 0.10674,
+            "26": 0.10699,
+            "27": 0.10678,
+            "28": 0.10642,
+            "29": 0.1066,
+            "30": 0.10707,
+            "31": 0.10794,
+            "32": 0.10702,
+            "33": 0.10586,
+            "34": 0.10612,
+            "35": 0.10628,
+            "36": 0.10631,
+            "37": 0.10573,
+            "38": 0.10617,
+            "39": 0.10563,
+            "40": 0.1064,
+            "41": 0.1059,
+            "42": 0.1054,
+            "43": 0.10691,
+            "44": 0.10833,
+            "45": 0.10638,
+            "46": 0.10655,
+            "47": 0.10676,
+            "48": 0.10825,
+            "49": 0.10534,
+            "50": 0.10635,
+            "51": 0.12108,
+            "52": 0.12016,
+            "53": 0.11315,
+            "54": 0.10912,
+            "55": 0.11556,
+            "56": 0.10742,
+            "57": 0.10784,
+            "58": 0.11719,
+            "59": 0.10791,
+            "60": 0.10886,
+            "61": 0.11563,
+            "62": 0.10714,
+            "63": 0.10967,
+            "64": 0.11569,
+            "65": 0.10753,
+            "66": 0.1078,
+            "67": 0.10545,
+            "68": 0.10522,
+            "69": 0.10496,
+            "70": 0.10544,
+            "71": 0.10719,
+            "72": 0.10708,
+            "73": 0.1062,
+            "74": 0.10663,
+            "75": 0.10766,
+            "76": 0.10634,
+            "77": 0.106,
+            "78": 0.10757,
+            "79": 0.10574,
+            "80": 0.10548,
+            "81": 0.1068,
+            "82": 0.10639,
+            "83": 0.10598,
+            "84": 0.10693,
+            "85": 0.10553,
+            "86": 0.10606,
+            "87": 0.10692,
+            "88": 0.10564,
+            "89": 0.10633,
+            "90": 0.10625,
+            "91": 0.10563,
+            "92": 0.10508,
+            "93": 0.10937,
+            "94": 0.10519,
+            "95": 0.10566,
+            "96": 0.11009,
+            "97": 0.10631,
+            "98": 0.10595,
+            "99": 0.10785,
+            "100": 0.10678
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..22ee15f7925
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8709,
+            "52": 9.7737,
+            "53": 10.08149,
+            "54": 9.97376,
+            "55": 9.90036,
+            "56": 9.64783,
+            "57": 9.50136,
+            "58": 9.85199,
+            "59": 9.6034,
+            "60": 9.50993,
+            "61": 9.71315,
+            "62": 9.99373,
+            "63": 9.39358,
+            "64": 9.78904,
+            "65": 8.96358,
+            "66": 9.71142,
+            "67": 9.38175,
+            "68": 9.79833,
+            "69": 9.80889,
+            "70": 9.75039,
+            "71": 9.62004,
+            "72": 9.59387,
+            "73": 9.50631,
+            "74": 8.94916,
+            "75": 9.43188,
+            "76": 9.08702,
+            "77": 10.06886,
+            "78": 9.73459,
+            "79": 9.38325,
+            "80": 9.41272,
+            "81": 9.48499,
+            "82": 9.70672,
+            "83": 9.30939,
+            "84": 9.42428,
+            "85": 9.61991,
+            "86": 9.07811,
+            "87": 9.59541,
+            "88": 9.75596,
+            "89": 9.60274,
+            "90": 9.82165,
+            "91": 9.34268,
+            "92": 9.35878,
+            "93": 9.08116,
+            "94": 8.83791,
+            "95": 9.5238,
+            "96": 9.53556,
+            "97": 9.31807,
+            "98": 9.68183,
+            "99": 8.89422,
+            "100": 9.40138
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2514.0,
+            "52": 2430.0,
+            "53": 2840.0,
+            "54": 2677.0,
+            "55": 2394.0,
+            "56": 2601.0,
+            "57": 2341.0,
+            "58": 2837.0,
+            "59": 2789.0,
+            "60": 2425.0,
+            "61": 2923.0,
+            "62": 2591.0,
+            "63": 2416.0,
+            "64": 2937.0,
+            "65": 2572.0,
+            "66": 3008.0,
+            "67": 2843.0,
+            "68": 2761.0,
+            "69": 2834.0,
+            "70": 3108.0,
+            "71": 2989.0,
+            "72": 2316.0,
+            "73": 2950.0,
+            "74": 1899.0,
+            "75": 2378.0,
+            "76": 2962.0,
+            "77": 3343.0,
+            "78": 3183.0,
+            "79": 2979.0,
+            "80": 3209.0,
+            "81": 3583.0,
+            "82": 3160.0,
+            "83": 2776.0,
+            "84": 3242.0,
+            "85": 3425.0,
+            "86": 2720.0,
+            "87": 3820.0,
+            "88": 3050.0,
+            "89": 3297.0,
+            "90": 3069.0,
+            "91": 2685.0,
+            "92": 3061.0,
+            "93": 2584.0,
+            "94": 3338.0,
+            "95": 3406.0,
+            "96": 3389.0,
+            "97": 3104.0,
+            "98": 3583.0,
+            "99": 3229.0,
+            "100": 3225.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 492274176.0,
+            "52": 492274176.0,
+            "53": 492274176.0,
+            "54": 492274176.0,
+            "55": 492274176.0,
+            "56": 492274176.0,
+            "57": 492274176.0,
+            "58": 492274176.0,
+            "59": 492274176.0,
+            "60": 492274176.0,
+            "61": 492274176.0,
+            "62": 492274176.0,
+            "63": 492274176.0,
+            "64": 492274176.0,
+            "65": 492274176.0,
+            "66": 492274176.0,
+            "67": 492274176.0,
+            "68": 492274176.0,
+            "69": 492274176.0,
+            "70": 492274176.0,
+            "71": 492274176.0,
+            "72": 492274176.0,
+            "73": 492274176.0,
+            "74": 492274176.0,
+            "75": 492274176.0,
+            "76": 492274176.0,
+            "77": 492274176.0,
+            "78": 492274176.0,
+            "79": 492274176.0,
+            "80": 492274176.0,
+            "81": 492274176.0,
+            "82": 492274176.0,
+            "83": 492274176.0,
+            "84": 492274176.0,
+            "85": 492274176.0,
+            "86": 492274176.0,
+            "87": 492274176.0,
+            "88": 492274176.0,
+            "89": 492274176.0,
+            "90": 492274176.0,
+            "91": 492274176.0,
+            "92": 492274176.0,
+            "93": 492274176.0,
+            "94": 492274176.0,
+            "95": 492274176.0,
+            "96": 492274176.0,
+            "97": 492274176.0,
+            "98": 492274176.0,
+            "99": 492274176.0,
+            "100": 492274176.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1684323840.0,
+            "52": 1684324864.0,
+            "53": 1684324864.0,
+            "54": 1684324864.0,
+            "55": 1684324864.0,
+            "56": 1684324864.0,
+            "57": 1684324864.0,
+            "58": 1684324864.0,
+            "59": 1684324864.0,
+            "60": 1684324864.0,
+            "61": 1684324864.0,
+            "62": 1684324864.0,
+            "63": 1684324864.0,
+            "64": 1684324864.0,
+            "65": 1684324864.0,
+            "66": 1684324864.0,
+            "67": 1684324864.0,
+            "68": 1684324864.0,
+            "69": 1684324864.0,
+            "70": 1684324864.0,
+            "71": 1684324864.0,
+            "72": 1684324864.0,
+            "73": 1684324864.0,
+            "74": 1684324864.0,
+            "75": 1684324864.0,
+            "76": 1684324864.0,
+            "77": 1684324864.0,
+            "78": 1684324864.0,
+            "79": 1684324864.0,
+            "80": 1684324864.0,
+            "81": 1684324864.0,
+            "82": 1684324864.0,
+            "83": 1684324864.0,
+            "84": 1684324864.0,
+            "85": 1684324864.0,
+            "86": 1684324864.0,
+            "87": 1684324864.0,
+            "88": 1684324864.0,
+            "89": 1684324864.0,
+            "90": 1684324864.0,
+            "91": 1684324864.0,
+            "92": 1684324864.0,
+            "93": 1684324864.0,
+            "94": 1684324864.0,
+            "95": 1684324864.0,
+            "96": 1684324864.0,
+            "97": 1684324864.0,
+            "98": 1684324864.0,
+            "99": 1684324864.0,
+            "100": 1684324864.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 11.56176,
+            "52": 0.13774,
+            "53": 0.11414,
+            "54": 0.11045,
+            "55": 0.1125,
+            "56": 0.11106,
+            "57": 0.11016,
+            "58": 0.11042,
+            "59": 0.11057,
+            "60": 0.10826,
+            "61": 0.10921,
+            "62": 0.10786,
+            "63": 0.10755,
+            "64": 0.10814,
+            "65": 0.10772,
+            "66": 0.10843,
+            "67": 0.10895,
+            "68": 0.10806,
+            "69": 0.10877,
+            "70": 0.10793,
+            "71": 0.11024,
+            "72": 0.10933,
+            "73": 0.10647,
+            "74": 0.10846,
+            "75": 0.11298,
+            "76": 0.13322,
+            "77": 0.11871,
+            "78": 0.10859,
+            "79": 0.106,
+            "80": 0.10554,
+            "81": 0.10679,
+            "82": 0.10538,
+            "83": 0.10499,
+            "84": 0.10608,
+            "85": 0.10469,
+            "86": 0.10435,
+            "87": 0.10772,
+            "88": 0.10459,
+            "89": 0.10545,
+            "90": 0.10691,
+            "91": 0.10516,
+            "92": 0.10438,
+            "93": 0.10542,
+            "94": 0.10744,
+            "95": 0.10521,
+            "96": 0.10614,
+            "97": 0.10613,
+            "98": 0.1077,
+            "99": 0.10781,
+            "100": 0.10442
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json
index 3be93706d81..26272ae12c0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.17153,
-            "2": 0.2103,
-            "3": 0.21541,
-            "4": 0.21948,
-            "5": 0.17282,
-            "6": 0.16921,
-            "7": 0.1711,
-            "8": 0.16967,
-            "9": 0.17064,
-            "10": 0.16972,
-            "11": 0.1696,
-            "12": 0.1701,
-            "13": 0.16923,
-            "14": 0.16942,
-            "15": 0.16782,
-            "16": 0.17,
-            "17": 0.16748,
-            "18": 0.16821,
-            "19": 0.16739,
-            "20": 0.16883,
-            "21": 0.16894,
-            "22": 0.16847,
-            "23": 0.16846,
-            "24": 0.16887,
-            "25": 0.16905,
-            "26": 0.16873,
-            "27": 0.16876,
-            "28": 0.16868,
-            "29": 0.1706,
-            "30": 0.17379,
-            "31": 0.17109,
-            "32": 0.17107,
-            "33": 0.17072,
-            "34": 0.17137,
-            "35": 0.17105,
-            "36": 0.17106,
-            "37": 0.17077,
-            "38": 0.17115,
-            "39": 0.17067,
-            "40": 0.17057,
-            "41": 0.17099,
-            "42": 0.17074,
-            "43": 0.17091,
-            "44": 0.17078,
-            "45": 0.17104,
-            "46": 0.17055,
-            "47": 0.17137,
-            "48": 0.17086,
-            "49": 0.17081,
-            "50": 0.17053,
-            "51": 0.17448,
-            "52": 0.16607,
-            "53": 0.16686,
-            "54": 0.16608,
-            "55": 0.16654,
-            "56": 0.16591,
-            "57": 0.16614,
-            "58": 0.1659,
-            "59": 0.16577,
-            "60": 0.16589,
-            "61": 0.16557,
-            "62": 0.16528,
-            "63": 0.16612,
-            "64": 0.1658,
-            "65": 0.16543,
-            "66": 0.1651,
-            "67": 0.16559,
-            "68": 0.16502,
-            "69": 0.16533,
-            "70": 0.16636,
-            "71": 0.16516,
-            "72": 0.1657,
-            "73": 0.1656,
-            "74": 0.16521,
-            "75": 0.16623,
-            "76": 0.16628,
-            "77": 0.16593,
-            "78": 0.16615,
-            "79": 0.1658,
-            "80": 0.16904,
-            "81": 0.16665,
-            "82": 0.16575,
-            "83": 0.16623,
-            "84": 0.16603,
-            "85": 0.16577,
-            "86": 0.16568,
-            "87": 0.16525,
-            "88": 0.16531,
-            "89": 0.16616,
-            "90": 0.16544,
-            "91": 0.16581,
-            "92": 0.16545,
-            "93": 0.16603,
-            "94": 0.16501,
-            "95": 0.16632,
-            "96": 0.16545,
-            "97": 0.16577,
-            "98": 0.19996,
-            "99": 0.19154,
-            "100": 0.19156
+            "1": 5.31573,
+            "2": 0.18576,
+            "3": 0.17476,
+            "4": 0.16336,
+            "5": 0.16444,
+            "6": 0.16376,
+            "7": 0.16391,
+            "8": 0.16436,
+            "9": 0.1647,
+            "10": 0.16442,
+            "11": 0.16651,
+            "12": 0.16415,
+            "13": 0.1639,
+            "14": 0.16341,
+            "15": 0.16405,
+            "16": 0.16336,
+            "17": 0.1649,
+            "18": 0.16416,
+            "19": 0.16368,
+            "20": 0.16287,
+            "21": 0.16352,
+            "22": 0.16266,
+            "23": 0.16606,
+            "24": 0.16733,
+            "25": 0.15996,
+            "26": 0.16017,
+            "27": 0.15966,
+            "28": 0.15989,
+            "29": 0.16042,
+            "30": 0.16078,
+            "31": 0.1603,
+            "32": 0.16003,
+            "33": 0.15993,
+            "34": 0.16031,
+            "35": 0.16091,
+            "36": 0.16047,
+            "37": 0.16035,
+            "38": 0.16032,
+            "39": 0.16044,
+            "40": 0.15963,
+            "41": 0.15984,
+            "42": 0.16183,
+            "43": 0.16457,
+            "44": 0.16023,
+            "45": 0.15984,
+            "46": 0.15948,
+            "47": 0.1592,
+            "48": 0.15954,
+            "49": 0.16019,
+            "50": 0.15913,
+            "51": 0.1678,
+            "52": 0.1599,
+            "53": 0.16055,
+            "54": 0.15919,
+            "55": 0.15968,
+            "56": 0.15917,
+            "57": 0.15882,
+            "58": 0.15853,
+            "59": 0.16041,
+            "60": 0.15905,
+            "61": 0.16002,
+            "62": 0.15878,
+            "63": 0.15894,
+            "64": 0.15851,
+            "65": 0.1593,
+            "66": 0.15905,
+            "67": 0.15864,
+            "68": 0.15939,
+            "69": 0.15875,
+            "70": 0.16002,
+            "71": 0.15947,
+            "72": 0.15984,
+            "73": 0.15928,
+            "74": 0.16024,
+            "75": 0.15992,
+            "76": 0.15976,
+            "77": 0.1599,
+            "78": 0.15928,
+            "79": 0.15924,
+            "80": 0.15931,
+            "81": 0.15912,
+            "82": 0.15858,
+            "83": 0.15936,
+            "84": 0.15981,
+            "85": 0.16066,
+            "86": 0.15948,
+            "87": 0.15924,
+            "88": 0.15893,
+            "89": 0.16025,
+            "90": 0.15868,
+            "91": 0.15895,
+            "92": 0.15857,
+            "93": 0.15929,
+            "94": 0.15913,
+            "95": 0.15916,
+            "96": 0.15869,
+            "97": 0.15992,
+            "98": 0.15991,
+            "99": 0.15932,
+            "100": 0.15959
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..8b98843a405
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86094,
+            "52": 9.75697,
+            "53": 10.07633,
+            "54": 9.96082,
+            "55": 9.88565,
+            "56": 9.6349,
+            "57": 9.4925,
+            "58": 9.83099,
+            "59": 9.59122,
+            "60": 9.50798,
+            "61": 9.7061,
+            "62": 9.98413,
+            "63": 9.37604,
+            "64": 9.77938,
+            "65": 8.95852,
+            "66": 9.70596,
+            "67": 9.37402,
+            "68": 9.78683,
+            "69": 9.78932,
+            "70": 9.72766,
+            "71": 9.61135,
+            "72": 9.59178,
+            "73": 9.49896,
+            "74": 8.95742,
+            "75": 9.42469,
+            "76": 9.09651,
+            "77": 10.06653,
+            "78": 9.73149,
+            "79": 9.37959,
+            "80": 9.40394,
+            "81": 9.48277,
+            "82": 9.69318,
+            "83": 9.31104,
+            "84": 9.4139,
+            "85": 9.61469,
+            "86": 9.07793,
+            "87": 9.59662,
+            "88": 9.74827,
+            "89": 9.60196,
+            "90": 9.81239,
+            "91": 9.34524,
+            "92": 9.36524,
+            "93": 9.07745,
+            "94": 8.83182,
+            "95": 9.521,
+            "96": 9.52525,
+            "97": 9.31322,
+            "98": 9.677,
+            "99": 8.88904,
+            "100": 9.40063
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2742.0,
+            "52": 2671.0,
+            "53": 3066.0,
+            "54": 2782.0,
+            "55": 2510.0,
+            "56": 2874.0,
+            "57": 2304.0,
+            "58": 3111.0,
+            "59": 2862.0,
+            "60": 2374.0,
+            "61": 2977.0,
+            "62": 2740.0,
+            "63": 2394.0,
+            "64": 3232.0,
+            "65": 2720.0,
+            "66": 3277.0,
+            "67": 2810.0,
+            "68": 2830.0,
+            "69": 3094.0,
+            "70": 3327.0,
+            "71": 3106.0,
+            "72": 2261.0,
+            "73": 3147.0,
+            "74": 1902.0,
+            "75": 2545.0,
+            "76": 2905.0,
+            "77": 3468.0,
+            "78": 3432.0,
+            "79": 3336.0,
+            "80": 3434.0,
+            "81": 3605.0,
+            "82": 3269.0,
+            "83": 2891.0,
+            "84": 3343.0,
+            "85": 3501.0,
+            "86": 2786.0,
+            "87": 3872.0,
+            "88": 3019.0,
+            "89": 3407.0,
+            "90": 3023.0,
+            "91": 2630.0,
+            "92": 3186.0,
+            "93": 2746.0,
+            "94": 3526.0,
+            "95": 3414.0,
+            "96": 3546.0,
+            "97": 3339.0,
+            "98": 3758.0,
+            "99": 3058.0,
+            "100": 3454.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 463961600.0,
+            "52": 463961600.0,
+            "53": 463961600.0,
+            "54": 463961600.0,
+            "55": 463961600.0,
+            "56": 463961600.0,
+            "57": 463961600.0,
+            "58": 463961600.0,
+            "59": 463961600.0,
+            "60": 463961600.0,
+            "61": 463961600.0,
+            "62": 463961600.0,
+            "63": 463961600.0,
+            "64": 463961600.0,
+            "65": 463961600.0,
+            "66": 463961600.0,
+            "67": 463961600.0,
+            "68": 463961600.0,
+            "69": 463961600.0,
+            "70": 463961600.0,
+            "71": 463961600.0,
+            "72": 463961600.0,
+            "73": 463961600.0,
+            "74": 463961600.0,
+            "75": 463961600.0,
+            "76": 463961600.0,
+            "77": 463961600.0,
+            "78": 463961600.0,
+            "79": 463961600.0,
+            "80": 463961600.0,
+            "81": 463961600.0,
+            "82": 463961600.0,
+            "83": 463961600.0,
+            "84": 463961600.0,
+            "85": 463961600.0,
+            "86": 463961600.0,
+            "87": 463961600.0,
+            "88": 463961600.0,
+            "89": 463961600.0,
+            "90": 463961600.0,
+            "91": 463961600.0,
+            "92": 463961600.0,
+            "93": 463961600.0,
+            "94": 463961600.0,
+            "95": 463961600.0,
+            "96": 463961600.0,
+            "97": 463961600.0,
+            "98": 463961600.0,
+            "99": 463961600.0,
+            "100": 463961600.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1680128512.0,
+            "52": 1680129536.0,
+            "53": 1680129536.0,
+            "54": 1680129536.0,
+            "55": 1680129536.0,
+            "56": 1680129536.0,
+            "57": 1680129536.0,
+            "58": 1680129536.0,
+            "59": 1680129536.0,
+            "60": 1680129536.0,
+            "61": 1680129536.0,
+            "62": 1680129536.0,
+            "63": 1680129536.0,
+            "64": 1680129536.0,
+            "65": 1680129536.0,
+            "66": 1680129536.0,
+            "67": 1680129536.0,
+            "68": 1680129536.0,
+            "69": 1680129536.0,
+            "70": 1680129536.0,
+            "71": 1680129536.0,
+            "72": 1680129536.0,
+            "73": 1680129536.0,
+            "74": 1680129536.0,
+            "75": 1680129536.0,
+            "76": 1680129536.0,
+            "77": 1680129536.0,
+            "78": 1680129536.0,
+            "79": 1680129536.0,
+            "80": 1680129536.0,
+            "81": 1680129536.0,
+            "82": 1680129536.0,
+            "83": 1680129536.0,
+            "84": 1680129536.0,
+            "85": 1680129536.0,
+            "86": 1680129536.0,
+            "87": 1680129536.0,
+            "88": 1680129536.0,
+            "89": 1680129536.0,
+            "90": 1680129536.0,
+            "91": 1680129536.0,
+            "92": 1680129536.0,
+            "93": 1680129536.0,
+            "94": 1680129536.0,
+            "95": 1680129536.0,
+            "96": 1680129536.0,
+            "97": 1680129536.0,
+            "98": 1680129536.0,
+            "99": 1680129536.0,
+            "100": 1680129536.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.38745,
+            "52": 0.1947,
+            "53": 0.16573,
+            "54": 0.16451,
+            "55": 0.16409,
+            "56": 0.16412,
+            "57": 0.16377,
+            "58": 0.17013,
+            "59": 0.16235,
+            "60": 0.16219,
+            "61": 0.1625,
+            "62": 0.16258,
+            "63": 0.16255,
+            "64": 0.1621,
+            "65": 0.16202,
+            "66": 0.16189,
+            "67": 0.16236,
+            "68": 0.1626,
+            "69": 0.16239,
+            "70": 0.16282,
+            "71": 0.16351,
+            "72": 0.16315,
+            "73": 0.16226,
+            "74": 0.16223,
+            "75": 0.16293,
+            "76": 0.16215,
+            "77": 0.16226,
+            "78": 0.1618,
+            "79": 0.16297,
+            "80": 0.16219,
+            "81": 0.1623,
+            "82": 0.16257,
+            "83": 0.16228,
+            "84": 0.16177,
+            "85": 0.16159,
+            "86": 0.16175,
+            "87": 0.16211,
+            "88": 0.16542,
+            "89": 0.16094,
+            "90": 0.16115,
+            "91": 0.16067,
+            "92": 0.16092,
+            "93": 0.1611,
+            "94": 0.15979,
+            "95": 0.1611,
+            "96": 0.16078,
+            "97": 0.16074,
+            "98": 0.16087,
+            "99": 0.15996,
+            "100": 0.1607
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..9066ef0c241
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84733,
+            "4": 10.85571,
+            "5": 10.86,
+            "6": 10.87733,
+            "7": 10.86555,
+            "8": 10.84913,
+            "9": 10.86609,
+            "10": 10.82473,
+            "11": 10.85618,
+            "12": 10.85374,
+            "13": 10.86788,
+            "14": 10.87119,
+            "15": 10.82235,
+            "16": 10.79991,
+            "17": 10.77431,
+            "18": 10.78345,
+            "19": 10.79308,
+            "20": 10.68226,
+            "21": 10.6471,
+            "22": 10.50917,
+            "23": 10.66827,
+            "24": 10.54193,
+            "25": 10.4928,
+            "26": 10.55931,
+            "27": 10.54238,
+            "28": 10.51129,
+            "29": 10.53257,
+            "30": 10.28992,
+            "31": 10.02853,
+            "32": 10.38885,
+            "33": 10.39593,
+            "34": 10.13446,
+            "35": 10.18932,
+            "36": 10.13355,
+            "37": 10.27381,
+            "38": 10.10751,
+            "39": 10.34007,
+            "40": 9.98538,
+            "41": 10.06414,
+            "42": 10.13744,
+            "43": 9.73381,
+            "44": 9.86305,
+            "45": 9.73723,
+            "46": 9.71343,
+            "47": 10.07757,
+            "48": 9.76768,
+            "49": 9.41987,
+            "50": 9.81687
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 567.0,
+            "2": 584.0,
+            "3": 598.0,
+            "4": 633.0,
+            "5": 630.0,
+            "6": 645.0,
+            "7": 645.0,
+            "8": 674.0,
+            "9": 625.0,
+            "10": 500.0,
+            "11": 669.0,
+            "12": 554.0,
+            "13": 681.0,
+            "14": 633.0,
+            "15": 623.0,
+            "16": 592.0,
+            "17": 636.0,
+            "18": 625.0,
+            "19": 633.0,
+            "20": 587.0,
+            "21": 696.0,
+            "22": 585.0,
+            "23": 681.0,
+            "24": 639.0,
+            "25": 587.0,
+            "26": 642.0,
+            "27": 639.0,
+            "28": 744.0,
+            "29": 746.0,
+            "30": 685.0,
+            "31": 603.0,
+            "32": 719.0,
+            "33": 850.0,
+            "34": 696.0,
+            "35": 737.0,
+            "36": 738.0,
+            "37": 840.0,
+            "38": 757.0,
+            "39": 828.0,
+            "40": 828.0,
+            "41": 787.0,
+            "42": 883.0,
+            "43": 703.0,
+            "44": 850.0,
+            "45": 840.0,
+            "46": 837.0,
+            "47": 915.0,
+            "48": 849.0,
+            "49": 915.0,
+            "50": 892.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 463110656.0,
+            "2": 463110656.0,
+            "3": 463110656.0,
+            "4": 463110656.0,
+            "5": 463110656.0,
+            "6": 463110656.0,
+            "7": 463110656.0,
+            "8": 463110656.0,
+            "9": 463110656.0,
+            "10": 463110656.0,
+            "11": 463110656.0,
+            "12": 463110656.0,
+            "13": 463110656.0,
+            "14": 463110656.0,
+            "15": 463110656.0,
+            "16": 463110656.0,
+            "17": 463110656.0,
+            "18": 463110656.0,
+            "19": 463110656.0,
+            "20": 463110656.0,
+            "21": 463110656.0,
+            "22": 463110656.0,
+            "23": 463110656.0,
+            "24": 463110656.0,
+            "25": 463110656.0,
+            "26": 463110656.0,
+            "27": 463110656.0,
+            "28": 463110656.0,
+            "29": 463110656.0,
+            "30": 463110656.0,
+            "31": 463110656.0,
+            "32": 463110656.0,
+            "33": 463110656.0,
+            "34": 463110656.0,
+            "35": 463110656.0,
+            "36": 463110656.0,
+            "37": 463110656.0,
+            "38": 463110656.0,
+            "39": 463110656.0,
+            "40": 463110656.0,
+            "41": 463110656.0,
+            "42": 463110656.0,
+            "43": 463110656.0,
+            "44": 463110656.0,
+            "45": 463110656.0,
+            "46": 463110656.0,
+            "47": 463110656.0,
+            "48": 463110656.0,
+            "49": 463110656.0,
+            "50": 463110656.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 704587264.0,
+            "2": 887671296.0,
+            "3": 887671296.0,
+            "4": 887671296.0,
+            "5": 887671296.0,
+            "6": 887671296.0,
+            "7": 887671296.0,
+            "8": 887671296.0,
+            "9": 887671296.0,
+            "10": 887671296.0,
+            "11": 887674368.0,
+            "12": 887674368.0,
+            "13": 887674368.0,
+            "14": 887674368.0,
+            "15": 887674368.0,
+            "16": 887674368.0,
+            "17": 887674368.0,
+            "18": 887674368.0,
+            "19": 887674368.0,
+            "20": 887674368.0,
+            "21": 887674368.0,
+            "22": 887674368.0,
+            "23": 887674368.0,
+            "24": 887674368.0,
+            "25": 887674368.0,
+            "26": 887674368.0,
+            "27": 887674368.0,
+            "28": 887674368.0,
+            "29": 887674368.0,
+            "30": 887674368.0,
+            "31": 887674368.0,
+            "32": 887674368.0,
+            "33": 887674368.0,
+            "34": 887674368.0,
+            "35": 887674368.0,
+            "36": 887674368.0,
+            "37": 887674368.0,
+            "38": 887674368.0,
+            "39": 887674368.0,
+            "40": 887674368.0,
+            "41": 887674368.0,
+            "42": 887674368.0,
+            "43": 887674368.0,
+            "44": 887674368.0,
+            "45": 887674368.0,
+            "46": 887674368.0,
+            "47": 887674368.0,
+            "48": 887674368.0,
+            "49": 887674368.0,
+            "50": 887674368.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.90343,
+            "3": 0.46196,
+            "4": 0.44553,
+            "5": 0.44562,
+            "6": 0.4419,
+            "7": 0.44421,
+            "8": 0.44706,
+            "9": 0.44217,
+            "10": 0.44007,
+            "11": 0.42363,
+            "12": 0.42376,
+            "13": 0.41997,
+            "14": 0.42079,
+            "15": 0.49345,
+            "16": 0.51617,
+            "17": 0.52444,
+            "18": 0.52822,
+            "19": 0.53053,
+            "20": 0.52743,
+            "21": 0.5289,
+            "22": 0.52655,
+            "23": 0.52934,
+            "24": 0.52619,
+            "25": 0.52343,
+            "26": 0.52071,
+            "27": 0.50241,
+            "28": 0.48165,
+            "29": 0.47009,
+            "30": 0.46549,
+            "31": 0.46432,
+            "32": 0.47167,
+            "33": 0.53326,
+            "34": 0.49042,
+            "35": 0.46143,
+            "36": 0.45859,
+            "37": 0.45093,
+            "38": 0.45152,
+            "39": 0.4443,
+            "40": 0.44393,
+            "41": 0.43638,
+            "42": 0.44204,
+            "43": 0.43923,
+            "44": 0.44115,
+            "45": 0.44017,
+            "46": 0.47447,
+            "47": 0.49352,
+            "48": 0.4943,
+            "49": 0.49375,
+            "50": 0.49263
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json
index 02b4683ea0b..5b1ee17f8f6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json
@@ -4,55 +4,55 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86284,
-            "4": 10.84009,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86282,
+            "4": 10.84007,
             "5": 10.87856,
-            "6": 10.88856,
-            "7": 10.86532,
-            "8": 10.86017,
-            "9": 10.8599,
-            "10": 10.82981,
-            "11": 10.8895,
-            "12": 10.8751,
-            "13": 10.87423,
+            "6": 10.88852,
+            "7": 10.86536,
+            "8": 10.86015,
+            "9": 10.85991,
+            "10": 10.82982,
+            "11": 10.88947,
+            "12": 10.87511,
+            "13": 10.87422,
             "14": 10.89675,
-            "15": 10.82054,
-            "16": 10.82504,
+            "15": 10.82056,
+            "16": 10.82497,
             "17": 10.78983,
             "18": 10.81029,
-            "19": 10.80535,
-            "20": 10.70398,
-            "21": 10.66993,
-            "22": 10.50643,
-            "23": 10.69004,
-            "24": 10.56314,
-            "25": 10.4942,
-            "26": 10.56628,
-            "27": 10.58025,
+            "19": 10.80528,
+            "20": 10.70396,
+            "21": 10.6699,
+            "22": 10.50641,
+            "23": 10.69006,
+            "24": 10.56312,
+            "25": 10.49418,
+            "26": 10.56627,
+            "27": 10.58023,
             "28": 10.51571,
-            "29": 10.55299,
-            "30": 10.30549,
-            "31": 10.02245,
-            "32": 10.40614,
+            "29": 10.55296,
+            "30": 10.30551,
+            "31": 10.02244,
+            "32": 10.40618,
             "33": 10.39874,
-            "34": 10.13771,
+            "34": 10.1377,
             "35": 10.20184,
-            "36": 10.16052,
-            "37": 10.28973,
-            "38": 10.11474,
+            "36": 10.1605,
+            "37": 10.28975,
+            "38": 10.11483,
             "39": 10.361,
-            "40": 10.01903,
+            "40": 10.01905,
             "41": 10.07292,
-            "42": 10.14698,
-            "43": 9.74687,
-            "44": 9.87766,
-            "45": 9.74966,
-            "46": 9.73383,
-            "47": 10.07535,
-            "48": 9.78068,
-            "49": 9.44784,
+            "42": 10.14697,
+            "43": 9.74684,
+            "44": 9.87763,
+            "45": 9.74962,
+            "46": 9.73382,
+            "47": 10.07536,
+            "48": 9.78071,
+            "49": 9.44783,
             "50": 9.8399
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 653.0,
-            "2": 642.0,
-            "3": 630.0,
-            "4": 585.0,
-            "5": 635.0,
-            "6": 687.0,
-            "7": 615.0,
-            "8": 601.0,
-            "9": 607.0,
-            "10": 522.0,
-            "11": 637.0,
-            "12": 675.0,
-            "13": 649.0,
-            "14": 648.0,
-            "15": 640.0,
-            "16": 602.0,
-            "17": 668.0,
-            "18": 634.0,
-            "19": 593.0,
-            "20": 579.0,
-            "21": 633.0,
-            "22": 597.0,
-            "23": 756.0,
-            "24": 612.0,
-            "25": 591.0,
-            "26": 620.0,
-            "27": 700.0,
-            "28": 705.0,
-            "29": 795.0,
-            "30": 752.0,
-            "31": 628.0,
-            "32": 712.0,
-            "33": 752.0,
-            "34": 737.0,
-            "35": 741.0,
-            "36": 770.0,
-            "37": 861.0,
-            "38": 823.0,
-            "39": 812.0,
-            "40": 814.0,
-            "41": 826.0,
-            "42": 801.0,
-            "43": 769.0,
-            "44": 822.0,
-            "45": 777.0,
-            "46": 828.0,
-            "47": 878.0,
-            "48": 915.0,
-            "49": 908.0,
-            "50": 848.0
+            "1": 572.0,
+            "2": 656.0,
+            "3": 649.0,
+            "4": 631.0,
+            "5": 658.0,
+            "6": 636.0,
+            "7": 636.0,
+            "8": 542.0,
+            "9": 653.0,
+            "10": 551.0,
+            "11": 681.0,
+            "12": 642.0,
+            "13": 624.0,
+            "14": 658.0,
+            "15": 682.0,
+            "16": 659.0,
+            "17": 620.0,
+            "18": 603.0,
+            "19": 634.0,
+            "20": 639.0,
+            "21": 634.0,
+            "22": 602.0,
+            "23": 731.0,
+            "24": 620.0,
+            "25": 611.0,
+            "26": 626.0,
+            "27": 683.0,
+            "28": 668.0,
+            "29": 713.0,
+            "30": 712.0,
+            "31": 616.0,
+            "32": 786.0,
+            "33": 800.0,
+            "34": 702.0,
+            "35": 684.0,
+            "36": 664.0,
+            "37": 831.0,
+            "38": 802.0,
+            "39": 919.0,
+            "40": 802.0,
+            "41": 791.0,
+            "42": 840.0,
+            "43": 718.0,
+            "44": 756.0,
+            "45": 765.0,
+            "46": 809.0,
+            "47": 839.0,
+            "48": 827.0,
+            "49": 935.0,
+            "50": 839.0
         }
     },
     "mem-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 757801472.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0
+            "1": 757802496.0,
+            "2": 935777792.0,
+            "3": 938397696.0,
+            "4": 938397696.0,
+            "5": 938397696.0,
+            "6": 938397696.0,
+            "7": 938397696.0,
+            "8": 938397696.0,
+            "9": 938397696.0,
+            "10": 938398208.0,
+            "11": 938398208.0,
+            "12": 938398208.0,
+            "13": 938398208.0,
+            "14": 938398720.0,
+            "15": 938398720.0,
+            "16": 938398720.0,
+            "17": 938398720.0,
+            "18": 938398720.0,
+            "19": 938398720.0,
+            "20": 938398720.0,
+            "21": 938398720.0,
+            "22": 938398720.0,
+            "23": 938398720.0,
+            "24": 938398720.0,
+            "25": 938399232.0,
+            "26": 938399232.0,
+            "27": 938399232.0,
+            "28": 938399232.0,
+            "29": 938399232.0,
+            "30": 938399232.0,
+            "31": 938399232.0,
+            "32": 938399232.0,
+            "33": 938399232.0,
+            "34": 938399232.0,
+            "35": 938399232.0,
+            "36": 938399232.0,
+            "37": 938399232.0,
+            "38": 938399232.0,
+            "39": 938399232.0,
+            "40": 938399232.0,
+            "41": 938399232.0,
+            "42": 938399232.0,
+            "43": 938399232.0,
+            "44": 938399232.0,
+            "45": 938399232.0,
+            "46": 938399232.0,
+            "47": 938399232.0,
+            "48": 938399232.0,
+            "49": 938399232.0,
+            "50": 938399232.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 15.78036,
-            "2": 0.34723,
-            "3": 0.33492,
-            "4": 0.3292,
-            "5": 0.33036,
-            "6": 0.34971,
-            "7": 0.33848,
-            "8": 0.33262,
-            "9": 0.34028,
-            "10": 0.3518,
-            "11": 0.34239,
-            "12": 0.33211,
-            "13": 0.32961,
-            "14": 0.33263,
-            "15": 0.32808,
-            "16": 0.33152,
-            "17": 0.33313,
-            "18": 0.329,
-            "19": 0.3317,
-            "20": 0.33143,
-            "21": 0.34166,
-            "22": 0.33873,
-            "23": 0.34817,
-            "24": 0.3415,
-            "25": 0.34495,
-            "26": 0.32592,
-            "27": 0.32935,
-            "28": 0.33233,
-            "29": 0.328,
-            "30": 0.32746,
-            "31": 0.3275,
-            "32": 0.327,
-            "33": 0.32765,
-            "34": 0.32542,
-            "35": 0.32703,
-            "36": 0.33052,
-            "37": 0.33413,
-            "38": 0.32701,
-            "39": 0.32816,
-            "40": 0.32555,
-            "41": 0.33676,
-            "42": 0.33367,
-            "43": 0.33748,
-            "44": 0.33125,
-            "45": 0.32793,
-            "46": 0.33387,
-            "47": 0.32628,
-            "48": 0.32993,
-            "49": 0.32747,
-            "50": 0.327
+            "1": 35.36663,
+            "2": 0.35208,
+            "3": 0.32012,
+            "4": 0.29736,
+            "5": 0.30009,
+            "6": 0.29722,
+            "7": 0.29604,
+            "8": 0.29598,
+            "9": 0.30123,
+            "10": 0.29278,
+            "11": 0.29195,
+            "12": 0.30003,
+            "13": 0.2957,
+            "14": 0.2935,
+            "15": 0.29372,
+            "16": 0.2984,
+            "17": 0.29013,
+            "18": 0.29041,
+            "19": 0.2934,
+            "20": 0.29454,
+            "21": 0.2936,
+            "22": 0.29663,
+            "23": 0.29453,
+            "24": 0.29404,
+            "25": 0.2912,
+            "26": 0.29009,
+            "27": 0.29448,
+            "28": 0.29043,
+            "29": 0.29359,
+            "30": 0.29413,
+            "31": 0.29317,
+            "32": 0.29247,
+            "33": 0.29418,
+            "34": 0.2938,
+            "35": 0.29207,
+            "36": 0.31485,
+            "37": 0.29543,
+            "38": 0.29402,
+            "39": 0.29262,
+            "40": 0.2957,
+            "41": 0.29348,
+            "42": 0.29242,
+            "43": 0.29117,
+            "44": 0.2927,
+            "45": 0.29263,
+            "46": 0.29024,
+            "47": 0.29404,
+            "48": 0.28901,
+            "49": 0.28844,
+            "50": 0.29053
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..fea64911d52
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84733,
+            "4": 10.85571,
+            "5": 10.86,
+            "6": 10.87733,
+            "7": 10.86555,
+            "8": 10.84913,
+            "9": 10.86609,
+            "10": 10.82473,
+            "11": 10.85618,
+            "12": 10.85374,
+            "13": 10.86788,
+            "14": 10.87119,
+            "15": 10.82235,
+            "16": 10.79991,
+            "17": 10.77431,
+            "18": 10.78345,
+            "19": 10.79308,
+            "20": 10.68226,
+            "21": 10.6471,
+            "22": 10.50917,
+            "23": 10.66827,
+            "24": 10.54193,
+            "25": 10.4928,
+            "26": 10.55931,
+            "27": 10.54238,
+            "28": 10.51129,
+            "29": 10.53257,
+            "30": 10.28992,
+            "31": 10.02853,
+            "32": 10.38885,
+            "33": 10.39593,
+            "34": 10.13446,
+            "35": 10.18932,
+            "36": 10.13355,
+            "37": 10.27381,
+            "38": 10.10751,
+            "39": 10.34007,
+            "40": 9.98538,
+            "41": 10.06414,
+            "42": 10.13744,
+            "43": 9.73381,
+            "44": 9.86305,
+            "45": 9.73723,
+            "46": 9.71343,
+            "47": 10.07757,
+            "48": 9.76768,
+            "49": 9.41987,
+            "50": 9.81687
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 567.0,
+            "2": 584.0,
+            "3": 598.0,
+            "4": 633.0,
+            "5": 630.0,
+            "6": 645.0,
+            "7": 645.0,
+            "8": 674.0,
+            "9": 625.0,
+            "10": 500.0,
+            "11": 669.0,
+            "12": 554.0,
+            "13": 681.0,
+            "14": 633.0,
+            "15": 623.0,
+            "16": 592.0,
+            "17": 636.0,
+            "18": 625.0,
+            "19": 633.0,
+            "20": 587.0,
+            "21": 696.0,
+            "22": 585.0,
+            "23": 681.0,
+            "24": 639.0,
+            "25": 587.0,
+            "26": 642.0,
+            "27": 639.0,
+            "28": 744.0,
+            "29": 746.0,
+            "30": 685.0,
+            "31": 603.0,
+            "32": 719.0,
+            "33": 850.0,
+            "34": 696.0,
+            "35": 737.0,
+            "36": 738.0,
+            "37": 840.0,
+            "38": 757.0,
+            "39": 828.0,
+            "40": 828.0,
+            "41": 787.0,
+            "42": 883.0,
+            "43": 703.0,
+            "44": 850.0,
+            "45": 840.0,
+            "46": 837.0,
+            "47": 915.0,
+            "48": 849.0,
+            "49": 915.0,
+            "50": 892.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 463110656.0,
+            "2": 463110656.0,
+            "3": 463110656.0,
+            "4": 463110656.0,
+            "5": 463110656.0,
+            "6": 463110656.0,
+            "7": 463110656.0,
+            "8": 463110656.0,
+            "9": 463110656.0,
+            "10": 463110656.0,
+            "11": 463110656.0,
+            "12": 463110656.0,
+            "13": 463110656.0,
+            "14": 463110656.0,
+            "15": 463110656.0,
+            "16": 463110656.0,
+            "17": 463110656.0,
+            "18": 463110656.0,
+            "19": 463110656.0,
+            "20": 463110656.0,
+            "21": 463110656.0,
+            "22": 463110656.0,
+            "23": 463110656.0,
+            "24": 463110656.0,
+            "25": 463110656.0,
+            "26": 463110656.0,
+            "27": 463110656.0,
+            "28": 463110656.0,
+            "29": 463110656.0,
+            "30": 463110656.0,
+            "31": 463110656.0,
+            "32": 463110656.0,
+            "33": 463110656.0,
+            "34": 463110656.0,
+            "35": 463110656.0,
+            "36": 463110656.0,
+            "37": 463110656.0,
+            "38": 463110656.0,
+            "39": 463110656.0,
+            "40": 463110656.0,
+            "41": 463110656.0,
+            "42": 463110656.0,
+            "43": 463110656.0,
+            "44": 463110656.0,
+            "45": 463110656.0,
+            "46": 463110656.0,
+            "47": 463110656.0,
+            "48": 463110656.0,
+            "49": 463110656.0,
+            "50": 463110656.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 707730944.0,
+            "2": 887671296.0,
+            "3": 887671296.0,
+            "4": 887671296.0,
+            "5": 887671296.0,
+            "6": 887671296.0,
+            "7": 887671296.0,
+            "8": 887671296.0,
+            "9": 887671296.0,
+            "10": 887671296.0,
+            "11": 887671296.0,
+            "12": 887671296.0,
+            "13": 887671296.0,
+            "14": 887671296.0,
+            "15": 887671296.0,
+            "16": 887671296.0,
+            "17": 887671296.0,
+            "18": 887671296.0,
+            "19": 887671296.0,
+            "20": 887671296.0,
+            "21": 887671296.0,
+            "22": 887671296.0,
+            "23": 887671296.0,
+            "24": 887671296.0,
+            "25": 887671296.0,
+            "26": 887671296.0,
+            "27": 887671296.0,
+            "28": 887671296.0,
+            "29": 887671296.0,
+            "30": 887671296.0,
+            "31": 887671296.0,
+            "32": 887671296.0,
+            "33": 887671296.0,
+            "34": 887671296.0,
+            "35": 887671296.0,
+            "36": 887671296.0,
+            "37": 887671296.0,
+            "38": 887671296.0,
+            "39": 887671296.0,
+            "40": 887671296.0,
+            "41": 887671296.0,
+            "42": 887671296.0,
+            "43": 887671296.0,
+            "44": 887671296.0,
+            "45": 887671296.0,
+            "46": 887671296.0,
+            "47": 887671296.0,
+            "48": 887671296.0,
+            "49": 887671296.0,
+            "50": 887671296.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.59451,
+            "3": 0.48132,
+            "4": 0.46344,
+            "5": 0.46723,
+            "6": 0.47347,
+            "7": 0.48756,
+            "8": 0.49394,
+            "9": 0.49358,
+            "10": 0.495,
+            "11": 0.49567,
+            "12": 0.49577,
+            "13": 0.49608,
+            "14": 0.49247,
+            "15": 0.49553,
+            "16": 0.49581,
+            "17": 0.49335,
+            "18": 0.5003,
+            "19": 0.49904,
+            "20": 0.50095,
+            "21": 0.49831,
+            "22": 0.49726,
+            "23": 0.49738,
+            "24": 0.50198,
+            "25": 0.49901,
+            "26": 0.50161,
+            "27": 0.50183,
+            "28": 0.49371,
+            "29": 0.49579,
+            "30": 0.49585,
+            "31": 0.49614,
+            "32": 0.49424,
+            "33": 0.49565,
+            "34": 0.49645,
+            "35": 0.50022,
+            "36": 0.50076,
+            "37": 0.49676,
+            "38": 0.4972,
+            "39": 0.49438,
+            "40": 0.49751,
+            "41": 0.49485,
+            "42": 0.49564,
+            "43": 0.4958,
+            "44": 0.49763,
+            "45": 0.49766,
+            "46": 0.50005,
+            "47": 0.49885,
+            "48": 0.50156,
+            "49": 0.50235,
+            "50": 0.49766
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json
index f2adbef4530..f5628621ad5 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json
@@ -4,55 +4,55 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86284,
-            "4": 10.84009,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86282,
+            "4": 10.84007,
             "5": 10.87856,
-            "6": 10.88856,
-            "7": 10.86532,
-            "8": 10.86017,
-            "9": 10.8599,
-            "10": 10.82981,
-            "11": 10.8895,
-            "12": 10.8751,
-            "13": 10.87423,
+            "6": 10.88852,
+            "7": 10.86536,
+            "8": 10.86015,
+            "9": 10.85991,
+            "10": 10.82982,
+            "11": 10.88947,
+            "12": 10.87511,
+            "13": 10.87422,
             "14": 10.89675,
-            "15": 10.82054,
-            "16": 10.82504,
+            "15": 10.82056,
+            "16": 10.82497,
             "17": 10.78983,
             "18": 10.81029,
-            "19": 10.80535,
-            "20": 10.70398,
-            "21": 10.66993,
-            "22": 10.50643,
-            "23": 10.69004,
-            "24": 10.56314,
-            "25": 10.4942,
-            "26": 10.56628,
-            "27": 10.58025,
+            "19": 10.80528,
+            "20": 10.70396,
+            "21": 10.6699,
+            "22": 10.50641,
+            "23": 10.69006,
+            "24": 10.56312,
+            "25": 10.49418,
+            "26": 10.56627,
+            "27": 10.58023,
             "28": 10.51571,
-            "29": 10.55299,
-            "30": 10.30549,
-            "31": 10.02245,
-            "32": 10.40614,
+            "29": 10.55296,
+            "30": 10.30551,
+            "31": 10.02244,
+            "32": 10.40618,
             "33": 10.39874,
-            "34": 10.13771,
+            "34": 10.1377,
             "35": 10.20184,
-            "36": 10.16052,
-            "37": 10.28973,
-            "38": 10.11474,
+            "36": 10.1605,
+            "37": 10.28975,
+            "38": 10.11483,
             "39": 10.361,
-            "40": 10.01903,
+            "40": 10.01905,
             "41": 10.07292,
-            "42": 10.14698,
-            "43": 9.74687,
-            "44": 9.87766,
-            "45": 9.74966,
-            "46": 9.73383,
-            "47": 10.07535,
-            "48": 9.78068,
-            "49": 9.44784,
+            "42": 10.14697,
+            "43": 9.74684,
+            "44": 9.87763,
+            "45": 9.74962,
+            "46": 9.73382,
+            "47": 10.07536,
+            "48": 9.78071,
+            "49": 9.44783,
             "50": 9.8399
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 653.0,
-            "2": 642.0,
-            "3": 630.0,
-            "4": 585.0,
-            "5": 635.0,
-            "6": 687.0,
-            "7": 615.0,
-            "8": 601.0,
-            "9": 607.0,
-            "10": 522.0,
-            "11": 637.0,
-            "12": 675.0,
-            "13": 649.0,
-            "14": 648.0,
-            "15": 640.0,
-            "16": 602.0,
-            "17": 668.0,
-            "18": 634.0,
-            "19": 593.0,
-            "20": 579.0,
-            "21": 633.0,
-            "22": 597.0,
-            "23": 756.0,
-            "24": 612.0,
-            "25": 591.0,
-            "26": 620.0,
-            "27": 700.0,
-            "28": 705.0,
-            "29": 795.0,
-            "30": 752.0,
-            "31": 628.0,
-            "32": 712.0,
-            "33": 752.0,
-            "34": 737.0,
-            "35": 741.0,
-            "36": 770.0,
-            "37": 861.0,
-            "38": 823.0,
-            "39": 812.0,
-            "40": 814.0,
-            "41": 826.0,
-            "42": 801.0,
-            "43": 769.0,
-            "44": 822.0,
-            "45": 777.0,
-            "46": 828.0,
-            "47": 878.0,
-            "48": 915.0,
-            "49": 908.0,
-            "50": 848.0
+            "1": 572.0,
+            "2": 656.0,
+            "3": 649.0,
+            "4": 631.0,
+            "5": 658.0,
+            "6": 636.0,
+            "7": 636.0,
+            "8": 542.0,
+            "9": 653.0,
+            "10": 551.0,
+            "11": 681.0,
+            "12": 642.0,
+            "13": 624.0,
+            "14": 658.0,
+            "15": 682.0,
+            "16": 659.0,
+            "17": 620.0,
+            "18": 603.0,
+            "19": 634.0,
+            "20": 639.0,
+            "21": 634.0,
+            "22": 602.0,
+            "23": 731.0,
+            "24": 620.0,
+            "25": 611.0,
+            "26": 626.0,
+            "27": 683.0,
+            "28": 668.0,
+            "29": 713.0,
+            "30": 712.0,
+            "31": 616.0,
+            "32": 786.0,
+            "33": 800.0,
+            "34": 702.0,
+            "35": 684.0,
+            "36": 664.0,
+            "37": 831.0,
+            "38": 802.0,
+            "39": 919.0,
+            "40": 802.0,
+            "41": 791.0,
+            "42": 840.0,
+            "43": 718.0,
+            "44": 756.0,
+            "45": 765.0,
+            "46": 809.0,
+            "47": 839.0,
+            "48": 827.0,
+            "49": 935.0,
+            "50": 839.0
         }
     },
     "mem-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759898624.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0
+            "1": 755704320.0,
+            "2": 938398720.0,
+            "3": 938398720.0,
+            "4": 938398720.0,
+            "5": 938398720.0,
+            "6": 938399232.0,
+            "7": 938399232.0,
+            "8": 938399232.0,
+            "9": 938399232.0,
+            "10": 938399232.0,
+            "11": 938399232.0,
+            "12": 938399232.0,
+            "13": 938399232.0,
+            "14": 938399232.0,
+            "15": 938399232.0,
+            "16": 938399232.0,
+            "17": 938399232.0,
+            "18": 938399232.0,
+            "19": 938399232.0,
+            "20": 938399232.0,
+            "21": 938399232.0,
+            "22": 938399232.0,
+            "23": 938399232.0,
+            "24": 938399232.0,
+            "25": 938399232.0,
+            "26": 938399232.0,
+            "27": 938399232.0,
+            "28": 938399232.0,
+            "29": 938399232.0,
+            "30": 938399232.0,
+            "31": 938399232.0,
+            "32": 938399232.0,
+            "33": 938399232.0,
+            "34": 938399232.0,
+            "35": 938399232.0,
+            "36": 938399232.0,
+            "37": 938399232.0,
+            "38": 938399232.0,
+            "39": 938399232.0,
+            "40": 938399232.0,
+            "41": 938399232.0,
+            "42": 938399232.0,
+            "43": 938399232.0,
+            "44": 938399232.0,
+            "45": 938399232.0,
+            "46": 938399232.0,
+            "47": 938399232.0,
+            "48": 938399232.0,
+            "49": 938399232.0,
+            "50": 938399232.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 16.72434,
-            "2": 0.40342,
-            "3": 0.32477,
-            "4": 0.32459,
-            "5": 0.32511,
-            "6": 0.32478,
-            "7": 0.32469,
-            "8": 0.32479,
-            "9": 0.32229,
-            "10": 0.32534,
-            "11": 0.32568,
-            "12": 0.32325,
-            "13": 0.3234,
-            "14": 0.32735,
-            "15": 0.32264,
-            "16": 0.32664,
-            "17": 0.32289,
-            "18": 0.32328,
-            "19": 0.32997,
-            "20": 0.32955,
-            "21": 0.32699,
-            "22": 0.3292,
-            "23": 0.32982,
-            "24": 0.32452,
-            "25": 0.32644,
-            "26": 0.32596,
-            "27": 0.32426,
-            "28": 0.32527,
-            "29": 0.32409,
-            "30": 0.32549,
-            "31": 0.32259,
-            "32": 0.32488,
-            "33": 0.32331,
-            "34": 0.3242,
-            "35": 0.3261,
-            "36": 0.32048,
-            "37": 0.32127,
-            "38": 0.32479,
-            "39": 0.32338,
-            "40": 0.32137,
-            "41": 0.32292,
-            "42": 0.32202,
-            "43": 0.32321,
-            "44": 0.32105,
-            "45": 0.32265,
-            "46": 0.32148,
-            "47": 0.32443,
-            "48": 0.32158,
-            "49": 0.32089,
-            "50": 0.32389
+            "1": 35.29813,
+            "2": 0.37906,
+            "3": 0.30948,
+            "4": 0.2886,
+            "5": 0.28858,
+            "6": 0.29461,
+            "7": 0.28328,
+            "8": 0.28783,
+            "9": 0.28448,
+            "10": 0.28698,
+            "11": 0.28404,
+            "12": 0.28717,
+            "13": 0.2828,
+            "14": 0.2846,
+            "15": 0.28648,
+            "16": 0.28793,
+            "17": 0.28473,
+            "18": 0.28326,
+            "19": 0.28524,
+            "20": 0.29094,
+            "21": 0.29401,
+            "22": 0.28944,
+            "23": 0.28693,
+            "24": 0.29508,
+            "25": 0.28683,
+            "26": 0.28507,
+            "27": 0.2849,
+            "28": 0.28658,
+            "29": 0.28518,
+            "30": 0.28539,
+            "31": 0.2829,
+            "32": 0.28482,
+            "33": 0.28454,
+            "34": 0.28634,
+            "35": 0.28739,
+            "36": 0.28563,
+            "37": 0.28401,
+            "38": 0.28251,
+            "39": 0.28156,
+            "40": 0.28197,
+            "41": 0.28236,
+            "42": 0.27995,
+            "43": 0.28293,
+            "44": 0.28018,
+            "45": 0.28419,
+            "46": 0.28512,
+            "47": 0.2818,
+            "48": 0.28099,
+            "49": 0.2831,
+            "50": 0.28153
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..bbc822686c6
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84735,
+            "4": 10.85571,
+            "5": 10.86002,
+            "6": 10.87733,
+            "7": 10.86562,
+            "8": 10.84914,
+            "9": 10.86609,
+            "10": 10.82473,
+            "11": 10.85616,
+            "12": 10.85369,
+            "13": 10.86791,
+            "14": 10.87114,
+            "15": 10.82234,
+            "16": 10.79991,
+            "17": 10.7743,
+            "18": 10.78346,
+            "19": 10.79307,
+            "20": 10.68222,
+            "21": 10.64709,
+            "22": 10.50919,
+            "23": 10.6683,
+            "24": 10.54196,
+            "25": 10.49283,
+            "26": 10.55931,
+            "27": 10.5424,
+            "28": 10.51132,
+            "29": 10.53257,
+            "30": 10.28992,
+            "31": 10.02852,
+            "32": 10.38881,
+            "33": 10.39597,
+            "34": 10.13449,
+            "35": 10.18929,
+            "36": 10.13354,
+            "37": 10.27384,
+            "38": 10.10753,
+            "39": 10.34011,
+            "40": 9.98544,
+            "41": 10.06413,
+            "42": 10.13747,
+            "43": 9.73382,
+            "44": 9.86306,
+            "45": 9.73725,
+            "46": 9.71343,
+            "47": 10.07755,
+            "48": 9.76765,
+            "49": 9.41988,
+            "50": 9.81692
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 593.0,
+            "2": 604.0,
+            "3": 621.0,
+            "4": 634.0,
+            "5": 658.0,
+            "6": 640.0,
+            "7": 624.0,
+            "8": 576.0,
+            "9": 622.0,
+            "10": 481.0,
+            "11": 703.0,
+            "12": 606.0,
+            "13": 667.0,
+            "14": 652.0,
+            "15": 654.0,
+            "16": 625.0,
+            "17": 598.0,
+            "18": 534.0,
+            "19": 627.0,
+            "20": 616.0,
+            "21": 720.0,
+            "22": 601.0,
+            "23": 647.0,
+            "24": 615.0,
+            "25": 577.0,
+            "26": 654.0,
+            "27": 661.0,
+            "28": 705.0,
+            "29": 681.0,
+            "30": 725.0,
+            "31": 613.0,
+            "32": 766.0,
+            "33": 801.0,
+            "34": 690.0,
+            "35": 697.0,
+            "36": 733.0,
+            "37": 839.0,
+            "38": 806.0,
+            "39": 841.0,
+            "40": 858.0,
+            "41": 837.0,
+            "42": 812.0,
+            "43": 696.0,
+            "44": 819.0,
+            "45": 753.0,
+            "46": 840.0,
+            "47": 921.0,
+            "48": 863.0,
+            "49": 850.0,
+            "50": 830.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 460882432.0,
+            "2": 460882432.0,
+            "3": 460882432.0,
+            "4": 460882432.0,
+            "5": 460882432.0,
+            "6": 460882432.0,
+            "7": 460882432.0,
+            "8": 460882432.0,
+            "9": 460882432.0,
+            "10": 460882432.0,
+            "11": 460882432.0,
+            "12": 460882432.0,
+            "13": 460882432.0,
+            "14": 460882432.0,
+            "15": 460882432.0,
+            "16": 460882432.0,
+            "17": 460882432.0,
+            "18": 460882432.0,
+            "19": 460882432.0,
+            "20": 460882432.0,
+            "21": 460882432.0,
+            "22": 460882432.0,
+            "23": 460882432.0,
+            "24": 460882432.0,
+            "25": 460882432.0,
+            "26": 460882432.0,
+            "27": 460882432.0,
+            "28": 460882432.0,
+            "29": 460882432.0,
+            "30": 460882432.0,
+            "31": 460882432.0,
+            "32": 460882432.0,
+            "33": 460882432.0,
+            "34": 460882432.0,
+            "35": 460882432.0,
+            "36": 460882432.0,
+            "37": 460882432.0,
+            "38": 460882432.0,
+            "39": 460882432.0,
+            "40": 460882432.0,
+            "41": 460882432.0,
+            "42": 460882432.0,
+            "43": 460882432.0,
+            "44": 460882432.0,
+            "45": 460882432.0,
+            "46": 460882432.0,
+            "47": 460882432.0,
+            "48": 460882432.0,
+            "49": 460882432.0,
+            "50": 460882432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 705635840.0,
+            "2": 883348992.0,
+            "3": 883348992.0,
+            "4": 883348992.0,
+            "5": 883348992.0,
+            "6": 883348992.0,
+            "7": 883348992.0,
+            "8": 883348992.0,
+            "9": 883348992.0,
+            "10": 883348992.0,
+            "11": 883348992.0,
+            "12": 883348992.0,
+            "13": 883348992.0,
+            "14": 883348992.0,
+            "15": 883348992.0,
+            "16": 883348992.0,
+            "17": 883348992.0,
+            "18": 883348992.0,
+            "19": 883348992.0,
+            "20": 885443584.0,
+            "21": 885445120.0,
+            "22": 885445120.0,
+            "23": 885445120.0,
+            "24": 885445120.0,
+            "25": 885445120.0,
+            "26": 885445120.0,
+            "27": 885445120.0,
+            "28": 885445120.0,
+            "29": 885445120.0,
+            "30": 885445120.0,
+            "31": 885446144.0,
+            "32": 885446144.0,
+            "33": 885446144.0,
+            "34": 885446144.0,
+            "35": 885446144.0,
+            "36": 885446144.0,
+            "37": 885446144.0,
+            "38": 885446144.0,
+            "39": 885446144.0,
+            "40": 885446144.0,
+            "41": 885446144.0,
+            "42": 885446144.0,
+            "43": 885446144.0,
+            "44": 885446144.0,
+            "45": 885446144.0,
+            "46": 885446144.0,
+            "47": 885446144.0,
+            "48": 885446144.0,
+            "49": 885446144.0,
+            "50": 885446144.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.8465,
+            "3": 0.43585,
+            "4": 0.41053,
+            "5": 0.40923,
+            "6": 0.40815,
+            "7": 0.41221,
+            "8": 0.41419,
+            "9": 0.41058,
+            "10": 0.40643,
+            "11": 0.40558,
+            "12": 0.40602,
+            "13": 0.4063,
+            "14": 0.40517,
+            "15": 0.40811,
+            "16": 0.40555,
+            "17": 0.40549,
+            "18": 0.40655,
+            "19": 0.40455,
+            "20": 0.40661,
+            "21": 0.40594,
+            "22": 0.406,
+            "23": 0.40409,
+            "24": 0.40593,
+            "25": 0.40476,
+            "26": 0.40466,
+            "27": 0.40486,
+            "28": 0.40491,
+            "29": 0.41169,
+            "30": 0.40768,
+            "31": 0.40772,
+            "32": 0.40874,
+            "33": 0.40861,
+            "34": 0.40706,
+            "35": 0.40837,
+            "36": 0.40765,
+            "37": 0.40963,
+            "38": 0.40873,
+            "39": 0.40653,
+            "40": 0.4068,
+            "41": 0.40742,
+            "42": 0.40739,
+            "43": 0.43116,
+            "44": 0.40318,
+            "45": 0.41555,
+            "46": 0.40362,
+            "47": 0.40203,
+            "48": 0.40164,
+            "49": 0.40396,
+            "50": 0.40607
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json
index a74ab8d8415..4f56833e7b4 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json
@@ -4,55 +4,55 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86281,
-            "4": 10.84011,
-            "5": 10.87855,
-            "6": 10.88849,
-            "7": 10.86536,
-            "8": 10.86016,
-            "9": 10.85987,
-            "10": 10.82979,
-            "11": 10.88946,
-            "12": 10.87508,
-            "13": 10.87423,
-            "14": 10.89679,
-            "15": 10.82052,
-            "16": 10.825,
-            "17": 10.78984,
-            "18": 10.81026,
-            "19": 10.80535,
-            "20": 10.70395,
-            "21": 10.66988,
-            "22": 10.50641,
-            "23": 10.69004,
-            "24": 10.56309,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86283,
+            "4": 10.84007,
+            "5": 10.87858,
+            "6": 10.88853,
+            "7": 10.86535,
+            "8": 10.86014,
+            "9": 10.85986,
+            "10": 10.82982,
+            "11": 10.8895,
+            "12": 10.87512,
+            "13": 10.87426,
+            "14": 10.89677,
+            "15": 10.82053,
+            "16": 10.82502,
+            "17": 10.78982,
+            "18": 10.81027,
+            "19": 10.80531,
+            "20": 10.70397,
+            "21": 10.66991,
+            "22": 10.50643,
+            "23": 10.69005,
+            "24": 10.56312,
             "25": 10.49417,
-            "26": 10.56626,
-            "27": 10.58024,
-            "28": 10.51572,
-            "29": 10.55294,
-            "30": 10.30552,
-            "31": 10.02243,
-            "32": 10.40616,
-            "33": 10.39875,
+            "26": 10.56627,
+            "27": 10.58022,
+            "28": 10.51571,
+            "29": 10.55299,
+            "30": 10.30551,
+            "31": 10.02246,
+            "32": 10.40615,
+            "33": 10.39877,
             "34": 10.13772,
-            "35": 10.20189,
-            "36": 10.16048,
-            "37": 10.28972,
-            "38": 10.11479,
+            "35": 10.20183,
+            "36": 10.16051,
+            "37": 10.28969,
+            "38": 10.11485,
             "39": 10.361,
-            "40": 10.01902,
-            "41": 10.07292,
-            "42": 10.14694,
-            "43": 9.74686,
-            "44": 9.87768,
-            "45": 9.74966,
-            "46": 9.7338,
-            "47": 10.07535,
+            "40": 10.01897,
+            "41": 10.07294,
+            "42": 10.14697,
+            "43": 9.74687,
+            "44": 9.87765,
+            "45": 9.74965,
+            "46": 9.73384,
+            "47": 10.07538,
             "48": 9.7807,
-            "49": 9.44783,
+            "49": 9.4478,
             "50": 9.83991
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 600.0,
-            "2": 620.0,
-            "3": 606.0,
-            "4": 684.0,
-            "5": 647.0,
-            "6": 679.0,
-            "7": 630.0,
-            "8": 568.0,
-            "9": 627.0,
-            "10": 519.0,
-            "11": 635.0,
-            "12": 640.0,
-            "13": 677.0,
-            "14": 631.0,
-            "15": 668.0,
-            "16": 666.0,
-            "17": 671.0,
-            "18": 623.0,
-            "19": 658.0,
-            "20": 639.0,
-            "21": 624.0,
-            "22": 614.0,
-            "23": 741.0,
-            "24": 607.0,
-            "25": 636.0,
-            "26": 639.0,
-            "27": 689.0,
-            "28": 751.0,
-            "29": 724.0,
-            "30": 771.0,
-            "31": 564.0,
-            "32": 750.0,
-            "33": 765.0,
-            "34": 693.0,
-            "35": 737.0,
-            "36": 754.0,
-            "37": 807.0,
-            "38": 786.0,
-            "39": 879.0,
-            "40": 737.0,
+            "1": 565.0,
+            "2": 674.0,
+            "3": 644.0,
+            "4": 621.0,
+            "5": 633.0,
+            "6": 641.0,
+            "7": 595.0,
+            "8": 543.0,
+            "9": 654.0,
+            "10": 529.0,
+            "11": 674.0,
+            "12": 661.0,
+            "13": 675.0,
+            "14": 643.0,
+            "15": 634.0,
+            "16": 659.0,
+            "17": 682.0,
+            "18": 639.0,
+            "19": 625.0,
+            "20": 633.0,
+            "21": 596.0,
+            "22": 628.0,
+            "23": 708.0,
+            "24": 616.0,
+            "25": 605.0,
+            "26": 645.0,
+            "27": 692.0,
+            "28": 796.0,
+            "29": 783.0,
+            "30": 681.0,
+            "31": 587.0,
+            "32": 719.0,
+            "33": 764.0,
+            "34": 731.0,
+            "35": 725.0,
+            "36": 695.0,
+            "37": 815.0,
+            "38": 759.0,
+            "39": 857.0,
+            "40": 772.0,
             "41": 817.0,
-            "42": 857.0,
-            "43": 709.0,
-            "44": 808.0,
-            "45": 795.0,
-            "46": 837.0,
-            "47": 879.0,
-            "48": 899.0,
-            "49": 890.0,
-            "50": 860.0
+            "42": 778.0,
+            "43": 728.0,
+            "44": 810.0,
+            "45": 770.0,
+            "46": 858.0,
+            "47": 881.0,
+            "48": 894.0,
+            "49": 906.0,
+            "50": 808.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0
+            "1": 512262656.0,
+            "2": 512262656.0,
+            "3": 512262656.0,
+            "4": 512262656.0,
+            "5": 512262656.0,
+            "6": 512262656.0,
+            "7": 512262656.0,
+            "8": 512262656.0,
+            "9": 512262656.0,
+            "10": 512262656.0,
+            "11": 512262656.0,
+            "12": 512262656.0,
+            "13": 512262656.0,
+            "14": 512262656.0,
+            "15": 512262656.0,
+            "16": 512262656.0,
+            "17": 512262656.0,
+            "18": 512262656.0,
+            "19": 512262656.0,
+            "20": 512262656.0,
+            "21": 512262656.0,
+            "22": 512262656.0,
+            "23": 512262656.0,
+            "24": 512262656.0,
+            "25": 512262656.0,
+            "26": 512262656.0,
+            "27": 512262656.0,
+            "28": 512262656.0,
+            "29": 512262656.0,
+            "30": 512262656.0,
+            "31": 512262656.0,
+            "32": 512262656.0,
+            "33": 512262656.0,
+            "34": 512262656.0,
+            "35": 512262656.0,
+            "36": 512262656.0,
+            "37": 512262656.0,
+            "38": 512262656.0,
+            "39": 512262656.0,
+            "40": 512262656.0,
+            "41": 512262656.0,
+            "42": 512262656.0,
+            "43": 512262656.0,
+            "44": 512262656.0,
+            "45": 512262656.0,
+            "46": 512262656.0,
+            "47": 512262656.0,
+            "48": 512262656.0,
+            "49": 512262656.0,
+            "50": 512262656.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759895552.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 934204928.0,
-            "25": 934204928.0,
-            "26": 934204928.0,
-            "27": 934204928.0,
-            "28": 934204928.0,
-            "29": 934204928.0,
-            "30": 934204928.0,
-            "31": 934204928.0,
-            "32": 934204928.0,
-            "33": 934204928.0,
-            "34": 934204928.0,
-            "35": 934204928.0,
-            "36": 934204928.0,
-            "37": 934204928.0,
-            "38": 934204928.0,
-            "39": 934204928.0,
-            "40": 934204928.0,
-            "41": 934204928.0,
-            "42": 934204928.0,
-            "43": 934204928.0,
-            "44": 934204928.0,
-            "45": 934204928.0,
-            "46": 934204928.0,
-            "47": 934204928.0,
-            "48": 934204928.0,
-            "49": 934204928.0,
-            "50": 934204928.0
+            "1": 755704832.0,
+            "2": 935776768.0,
+            "3": 935777792.0,
+            "4": 935777792.0,
+            "5": 935777792.0,
+            "6": 935777792.0,
+            "7": 935777792.0,
+            "8": 935777792.0,
+            "9": 935777792.0,
+            "10": 935777792.0,
+            "11": 935777792.0,
+            "12": 935777792.0,
+            "13": 935777792.0,
+            "14": 935777792.0,
+            "15": 935777792.0,
+            "16": 935777792.0,
+            "17": 935777792.0,
+            "18": 935777792.0,
+            "19": 935777792.0,
+            "20": 935777792.0,
+            "21": 935777792.0,
+            "22": 935777792.0,
+            "23": 935777792.0,
+            "24": 935777792.0,
+            "25": 935777792.0,
+            "26": 935777792.0,
+            "27": 935777792.0,
+            "28": 935777792.0,
+            "29": 935777792.0,
+            "30": 935777792.0,
+            "31": 935777792.0,
+            "32": 935777792.0,
+            "33": 935777792.0,
+            "34": 935777792.0,
+            "35": 935777792.0,
+            "36": 935777792.0,
+            "37": 935777792.0,
+            "38": 935777792.0,
+            "39": 935777792.0,
+            "40": 935777792.0,
+            "41": 935777792.0,
+            "42": 935777792.0,
+            "43": 935777792.0,
+            "44": 935777792.0,
+            "45": 935777792.0,
+            "46": 935777792.0,
+            "47": 935777792.0,
+            "48": 935777792.0,
+            "49": 935777792.0,
+            "50": 935777792.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 16.61636,
-            "2": 0.35255,
-            "3": 0.33784,
-            "4": 0.33448,
-            "5": 0.33388,
-            "6": 0.33362,
-            "7": 0.33399,
-            "8": 0.33377,
-            "9": 0.3345,
-            "10": 0.33436,
-            "11": 0.33616,
-            "12": 0.33216,
-            "13": 0.32717,
-            "14": 0.3285,
-            "15": 0.31893,
-            "16": 0.32207,
-            "17": 0.32068,
-            "18": 0.3232,
-            "19": 0.31799,
-            "20": 0.32295,
-            "21": 0.32148,
-            "22": 0.3312,
-            "23": 0.33388,
-            "24": 0.33493,
-            "25": 0.33793,
-            "26": 0.33838,
-            "27": 0.33827,
-            "28": 0.34,
-            "29": 0.33074,
-            "30": 0.32608,
-            "31": 0.32629,
-            "32": 0.3285,
-            "33": 0.32776,
-            "34": 0.32575,
-            "35": 0.32648,
-            "36": 0.3252,
-            "37": 0.32697,
-            "38": 0.33001,
-            "39": 0.3354,
-            "40": 0.33513,
-            "41": 0.33447,
-            "42": 0.3352,
-            "43": 0.33163,
-            "44": 0.32495,
-            "45": 0.32668,
-            "46": 0.32429,
-            "47": 0.32917,
-            "48": 0.32614,
-            "49": 0.32637,
-            "50": 0.32702
+            "1": 35.15129,
+            "2": 0.34045,
+            "3": 0.3152,
+            "4": 0.29475,
+            "5": 0.29106,
+            "6": 0.28743,
+            "7": 0.28892,
+            "8": 0.28712,
+            "9": 0.28802,
+            "10": 0.28716,
+            "11": 0.28668,
+            "12": 0.37009,
+            "13": 0.28782,
+            "14": 0.29043,
+            "15": 0.28814,
+            "16": 0.2878,
+            "17": 0.28821,
+            "18": 0.28923,
+            "19": 0.28805,
+            "20": 0.28779,
+            "21": 0.28749,
+            "22": 0.28772,
+            "23": 0.29149,
+            "24": 0.28826,
+            "25": 0.28991,
+            "26": 0.28778,
+            "27": 0.29505,
+            "28": 0.29056,
+            "29": 0.28756,
+            "30": 0.28994,
+            "31": 0.28927,
+            "32": 0.28762,
+            "33": 0.29152,
+            "34": 0.28825,
+            "35": 0.29628,
+            "36": 0.29294,
+            "37": 0.29051,
+            "38": 0.28817,
+            "39": 0.28808,
+            "40": 0.28772,
+            "41": 0.28911,
+            "42": 0.28638,
+            "43": 0.28641,
+            "44": 0.28736,
+            "45": 0.28638,
+            "46": 0.29104,
+            "47": 0.2889,
+            "48": 0.28851,
+            "49": 0.2881,
+            "50": 0.28761
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json
index 936ff15865c..b6821c7a8c1 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json
@@ -2,91 +2,286 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.93292,
-            "5": 10.9297,
-            "10": 10.90476,
-            "15": 10.87124,
-            "20": 10.74998,
-            "25": 10.53758,
+            "1": 10.93295,
+            "2": 10.93424,
+            "3": 10.91344,
+            "4": 10.90321,
+            "5": 10.92967,
+            "6": 10.93657,
+            "7": 10.90278,
+            "8": 10.92113,
+            "9": 10.90705,
+            "10": 10.90473,
+            "11": 10.8879,
+            "12": 10.91735,
+            "13": 10.91188,
+            "14": 10.91508,
+            "15": 10.87123,
+            "16": 10.86129,
+            "17": 10.82696,
+            "18": 10.85677,
+            "19": 10.84055,
+            "20": 10.74996,
+            "21": 10.71507,
+            "22": 10.58113,
+            "23": 10.72643,
+            "24": 10.6073,
+            "25": 10.53754,
+            "26": 10.61066,
+            "27": 10.59929,
+            "28": 10.54953,
+            "29": 10.56604,
             "30": 10.32549,
-            "35": 10.2289,
+            "31": 10.06695,
+            "32": 10.43809,
+            "33": 10.42363,
+            "34": 10.16014,
+            "35": 10.22895,
+            "36": 10.17616,
+            "37": 10.29235,
+            "38": 10.13295,
+            "39": 10.34955,
             "40": 10.01976,
-            "45": 9.7555,
+            "41": 10.07538,
+            "42": 10.15408,
+            "43": 9.76087,
+            "44": 9.88357,
+            "45": 9.75548,
+            "46": 9.74957,
+            "47": 10.07546,
+            "48": 9.77937,
+            "49": 9.43818,
             "50": 9.84069
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 591.0,
-            "5": 683.0,
-            "10": 544.0,
-            "15": 595.0,
-            "20": 611.0,
-            "25": 625.0,
-            "30": 698.0,
+            "1": 631.0,
+            "2": 663.0,
+            "3": 622.0,
+            "4": 621.0,
+            "5": 643.0,
+            "6": 635.0,
+            "7": 588.0,
+            "8": 629.0,
+            "9": 654.0,
+            "10": 539.0,
+            "11": 656.0,
+            "12": 638.0,
+            "13": 671.0,
+            "14": 656.0,
+            "15": 624.0,
+            "16": 633.0,
+            "17": 640.0,
+            "18": 609.0,
+            "19": 599.0,
+            "20": 593.0,
+            "21": 598.0,
+            "22": 628.0,
+            "23": 692.0,
+            "24": 601.0,
+            "25": 538.0,
+            "26": 640.0,
+            "27": 651.0,
+            "28": 749.0,
+            "29": 742.0,
+            "30": 687.0,
+            "31": 552.0,
+            "32": 752.0,
+            "33": 779.0,
+            "34": 653.0,
             "35": 687.0,
-            "40": 759.0,
-            "45": 807.0,
-            "50": 864.0
+            "36": 687.0,
+            "37": 813.0,
+            "38": 738.0,
+            "39": 845.0,
+            "40": 697.0,
+            "41": 787.0,
+            "42": 800.0,
+            "43": 677.0,
+            "44": 737.0,
+            "45": 773.0,
+            "46": 876.0,
+            "47": 917.0,
+            "48": 907.0,
+            "49": 853.0,
+            "50": 851.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 431783936.0,
-            "5": 431783936.0,
-            "10": 431783936.0,
-            "15": 431783936.0,
-            "20": 431783936.0,
-            "25": 431783936.0,
-            "30": 431783936.0,
-            "35": 431783936.0,
-            "40": 431783936.0,
-            "45": 431783936.0,
-            "50": 431783936.0
+            "1": 433750528.0,
+            "2": 433750528.0,
+            "3": 433750528.0,
+            "4": 433750528.0,
+            "5": 433750528.0,
+            "6": 433750528.0,
+            "7": 433750528.0,
+            "8": 433750528.0,
+            "9": 433750528.0,
+            "10": 433750528.0,
+            "11": 433750528.0,
+            "12": 433750528.0,
+            "13": 433750528.0,
+            "14": 433750528.0,
+            "15": 433750528.0,
+            "16": 433750528.0,
+            "17": 433750528.0,
+            "18": 433750528.0,
+            "19": 433750528.0,
+            "20": 433750528.0,
+            "21": 433750528.0,
+            "22": 433750528.0,
+            "23": 433750528.0,
+            "24": 433750528.0,
+            "25": 433750528.0,
+            "26": 433750528.0,
+            "27": 433750528.0,
+            "28": 433750528.0,
+            "29": 433750528.0,
+            "30": 433750528.0,
+            "31": 433750528.0,
+            "32": 433750528.0,
+            "33": 433750528.0,
+            "34": 433750528.0,
+            "35": 433750528.0,
+            "36": 433750528.0,
+            "37": 433750528.0,
+            "38": 433750528.0,
+            "39": 433750528.0,
+            "40": 433750528.0,
+            "41": 433750528.0,
+            "42": 433750528.0,
+            "43": 433750528.0,
+            "44": 433750528.0,
+            "45": 433750528.0,
+            "46": 433750528.0,
+            "47": 433750528.0,
+            "48": 433750528.0,
+            "49": 433750528.0,
+            "50": 433750528.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 677335040.0,
-            "5": 853214208.0,
-            "10": 853214208.0,
-            "15": 853214208.0,
-            "20": 854262784.0,
-            "25": 854262784.0,
-            "30": 854262784.0,
-            "35": 854262784.0,
-            "40": 854262784.0,
-            "45": 855311360.0,
-            "50": 855311360.0
+            "1": 678368768.0,
+            "2": 857265664.0,
+            "3": 857265664.0,
+            "4": 857265664.0,
+            "5": 857265664.0,
+            "6": 857265664.0,
+            "7": 857265664.0,
+            "8": 858314240.0,
+            "9": 858314240.0,
+            "10": 858314240.0,
+            "11": 858314240.0,
+            "12": 858314240.0,
+            "13": 858314240.0,
+            "14": 858314240.0,
+            "15": 858314240.0,
+            "16": 858314240.0,
+            "17": 858314240.0,
+            "18": 858314240.0,
+            "19": 858314240.0,
+            "20": 858314240.0,
+            "21": 858314240.0,
+            "22": 858314240.0,
+            "23": 858314240.0,
+            "24": 858314240.0,
+            "25": 858314240.0,
+            "26": 858314240.0,
+            "27": 858314240.0,
+            "28": 858314240.0,
+            "29": 858314240.0,
+            "30": 858314240.0,
+            "31": 858314240.0,
+            "32": 858314240.0,
+            "33": 858314240.0,
+            "34": 858314240.0,
+            "35": 858314240.0,
+            "36": 858314240.0,
+            "37": 858314240.0,
+            "38": 858314240.0,
+            "39": 858314240.0,
+            "40": 858314240.0,
+            "41": 858314240.0,
+            "42": 858314240.0,
+            "43": 858314240.0,
+            "44": 858314240.0,
+            "45": 858314240.0,
+            "46": 858314240.0,
+            "47": 858314240.0,
+            "48": 858314240.0,
+            "49": 858314240.0,
+            "50": 858314240.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 17.70614,
-            "5": 0.41397,
-            "10": 0.40992,
-            "15": 0.40823,
-            "20": 0.40466,
-            "25": 0.40564,
-            "30": 0.40987,
-            "35": 0.41811,
-            "40": 0.40504,
-            "45": 0.4037,
-            "50": 0.40207
+            "1": 16.90659,
+            "2": 0.4661,
+            "3": 0.43523,
+            "4": 0.41158,
+            "5": 0.40972,
+            "6": 0.40877,
+            "7": 0.40926,
+            "8": 0.40538,
+            "9": 0.40596,
+            "10": 0.40505,
+            "11": 0.41352,
+            "12": 0.40662,
+            "13": 0.40449,
+            "14": 0.40315,
+            "15": 0.40941,
+            "16": 0.4018,
+            "17": 0.40517,
+            "18": 0.40633,
+            "19": 0.40147,
+            "20": 0.4015,
+            "21": 0.40319,
+            "22": 0.40228,
+            "23": 0.40026,
+            "24": 0.40314,
+            "25": 0.40407,
+            "26": 0.40203,
+            "27": 0.40678,
+            "28": 0.40499,
+            "29": 0.40202,
+            "30": 0.40033,
+            "31": 0.39945,
+            "32": 0.39857,
+            "33": 0.39767,
+            "34": 0.3978,
+            "35": 0.39783,
+            "36": 0.39797,
+            "37": 0.39761,
+            "38": 0.39787,
+            "39": 0.39865,
+            "40": 0.40084,
+            "41": 0.39882,
+            "42": 0.39896,
+            "43": 0.39904,
+            "44": 0.39935,
+            "45": 0.40068,
+            "46": 0.39796,
+            "47": 0.39862,
+            "48": 0.39951,
+            "49": 0.39974,
+            "50": 0.39951
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml
index b4d63762604..088ababb9cb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml
@@ -45,7 +45,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --attention-backend: flash
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..724f3a7c2b8
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84733,
+            "4": 10.85571,
+            "5": 10.86,
+            "6": 10.87733,
+            "7": 10.86555,
+            "8": 10.84913,
+            "9": 10.86609,
+            "10": 10.82473,
+            "11": 10.85618,
+            "12": 10.85374,
+            "13": 10.86788,
+            "14": 10.87119,
+            "15": 10.82235,
+            "16": 10.79991,
+            "17": 10.77431,
+            "18": 10.78345,
+            "19": 10.79308,
+            "20": 10.68226,
+            "21": 10.6471,
+            "22": 10.50917,
+            "23": 10.66827,
+            "24": 10.54193,
+            "25": 10.4928,
+            "26": 10.55931,
+            "27": 10.54238,
+            "28": 10.51129,
+            "29": 10.53257,
+            "30": 10.28992,
+            "31": 10.02853,
+            "32": 10.38885,
+            "33": 10.39593,
+            "34": 10.13446,
+            "35": 10.18932,
+            "36": 10.13355,
+            "37": 10.27381,
+            "38": 10.10751,
+            "39": 10.34007,
+            "40": 9.98538,
+            "41": 10.06414,
+            "42": 10.13744,
+            "43": 9.73381,
+            "44": 9.86305,
+            "45": 9.73723,
+            "46": 9.71343,
+            "47": 10.07757,
+            "48": 9.76768,
+            "49": 9.41987,
+            "50": 9.81687
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 567.0,
+            "2": 584.0,
+            "3": 598.0,
+            "4": 633.0,
+            "5": 630.0,
+            "6": 645.0,
+            "7": 645.0,
+            "8": 674.0,
+            "9": 625.0,
+            "10": 500.0,
+            "11": 669.0,
+            "12": 554.0,
+            "13": 681.0,
+            "14": 633.0,
+            "15": 623.0,
+            "16": 592.0,
+            "17": 636.0,
+            "18": 625.0,
+            "19": 633.0,
+            "20": 587.0,
+            "21": 696.0,
+            "22": 585.0,
+            "23": 681.0,
+            "24": 639.0,
+            "25": 587.0,
+            "26": 642.0,
+            "27": 639.0,
+            "28": 744.0,
+            "29": 746.0,
+            "30": 685.0,
+            "31": 603.0,
+            "32": 719.0,
+            "33": 850.0,
+            "34": 696.0,
+            "35": 737.0,
+            "36": 738.0,
+            "37": 840.0,
+            "38": 757.0,
+            "39": 828.0,
+            "40": 828.0,
+            "41": 787.0,
+            "42": 883.0,
+            "43": 703.0,
+            "44": 850.0,
+            "45": 840.0,
+            "46": 837.0,
+            "47": 915.0,
+            "48": 849.0,
+            "49": 915.0,
+            "50": 892.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 460882432.0,
+            "2": 460882432.0,
+            "3": 460882432.0,
+            "4": 460882432.0,
+            "5": 460882432.0,
+            "6": 460882432.0,
+            "7": 460882432.0,
+            "8": 460882432.0,
+            "9": 460882432.0,
+            "10": 460882432.0,
+            "11": 460882432.0,
+            "12": 460882432.0,
+            "13": 460882432.0,
+            "14": 460882432.0,
+            "15": 460882432.0,
+            "16": 460882432.0,
+            "17": 460882432.0,
+            "18": 460882432.0,
+            "19": 460882432.0,
+            "20": 460882432.0,
+            "21": 460882432.0,
+            "22": 460882432.0,
+            "23": 460882432.0,
+            "24": 460882432.0,
+            "25": 460882432.0,
+            "26": 460882432.0,
+            "27": 460882432.0,
+            "28": 460882432.0,
+            "29": 460882432.0,
+            "30": 460882432.0,
+            "31": 460882432.0,
+            "32": 460882432.0,
+            "33": 460882432.0,
+            "34": 460882432.0,
+            "35": 460882432.0,
+            "36": 460882432.0,
+            "37": 460882432.0,
+            "38": 460882432.0,
+            "39": 460882432.0,
+            "40": 460882432.0,
+            "41": 460882432.0,
+            "42": 460882432.0,
+            "43": 460882432.0,
+            "44": 460882432.0,
+            "45": 460882432.0,
+            "46": 460882432.0,
+            "47": 460882432.0,
+            "48": 460882432.0,
+            "49": 460882432.0,
+            "50": 460882432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 706684416.0,
+            "2": 885445632.0,
+            "3": 885445632.0,
+            "4": 885445632.0,
+            "5": 885445632.0,
+            "6": 885445632.0,
+            "7": 885445632.0,
+            "8": 885445632.0,
+            "9": 885445632.0,
+            "10": 885445632.0,
+            "11": 885445632.0,
+            "12": 885445632.0,
+            "13": 885445632.0,
+            "14": 885445632.0,
+            "15": 885445632.0,
+            "16": 885445632.0,
+            "17": 885445632.0,
+            "18": 885445632.0,
+            "19": 885445632.0,
+            "20": 885445632.0,
+            "21": 885445632.0,
+            "22": 885445632.0,
+            "23": 885445632.0,
+            "24": 885445632.0,
+            "25": 885445632.0,
+            "26": 885445632.0,
+            "27": 885445632.0,
+            "28": 885445632.0,
+            "29": 885445632.0,
+            "30": 885446144.0,
+            "31": 885446144.0,
+            "32": 885446144.0,
+            "33": 885446144.0,
+            "34": 885446144.0,
+            "35": 885446144.0,
+            "36": 885446144.0,
+            "37": 885446144.0,
+            "38": 885446144.0,
+            "39": 885446144.0,
+            "40": 885446144.0,
+            "41": 885446144.0,
+            "42": 885446144.0,
+            "43": 885446144.0,
+            "44": 885446144.0,
+            "45": 885446144.0,
+            "46": 885446144.0,
+            "47": 885446144.0,
+            "48": 885446144.0,
+            "49": 885446144.0,
+            "50": 885446144.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.33644,
+            "3": 0.43367,
+            "4": 0.41796,
+            "5": 0.42437,
+            "6": 0.41803,
+            "7": 0.42777,
+            "8": 0.49261,
+            "9": 0.5259,
+            "10": 0.5165,
+            "11": 0.50964,
+            "12": 0.51307,
+            "13": 0.5209,
+            "14": 0.52184,
+            "15": 0.51919,
+            "16": 0.52047,
+            "17": 0.52096,
+            "18": 0.51364,
+            "19": 0.52433,
+            "20": 0.5561,
+            "21": 0.6836,
+            "22": 0.64032,
+            "23": 0.47706,
+            "24": 0.46456,
+            "25": 0.46618,
+            "26": 0.46113,
+            "27": 0.46089,
+            "28": 0.45645,
+            "29": 0.44502,
+            "30": 0.44476,
+            "31": 0.44646,
+            "32": 0.44355,
+            "33": 0.44206,
+            "34": 0.4428,
+            "35": 0.43615,
+            "36": 0.43343,
+            "37": 0.4428,
+            "38": 0.43948,
+            "39": 0.42992,
+            "40": 0.44781,
+            "41": 0.44585,
+            "42": 0.43409,
+            "43": 0.42263,
+            "44": 0.41737,
+            "45": 0.41789,
+            "46": 0.41449,
+            "47": 0.41442,
+            "48": 0.41697,
+            "49": 0.41486,
+            "50": 0.41305
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json
index f64661824cb..a470bf65873 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json
@@ -4,55 +4,55 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86284,
-            "4": 10.84009,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86282,
+            "4": 10.84007,
             "5": 10.87856,
-            "6": 10.88856,
-            "7": 10.86532,
-            "8": 10.86017,
-            "9": 10.8599,
-            "10": 10.82981,
-            "11": 10.8895,
-            "12": 10.8751,
-            "13": 10.87423,
+            "6": 10.88852,
+            "7": 10.86536,
+            "8": 10.86015,
+            "9": 10.85991,
+            "10": 10.82982,
+            "11": 10.88947,
+            "12": 10.87511,
+            "13": 10.87422,
             "14": 10.89675,
-            "15": 10.82054,
-            "16": 10.82504,
+            "15": 10.82056,
+            "16": 10.82497,
             "17": 10.78983,
             "18": 10.81029,
-            "19": 10.80535,
-            "20": 10.70398,
-            "21": 10.66993,
-            "22": 10.50643,
-            "23": 10.69004,
-            "24": 10.56314,
-            "25": 10.4942,
-            "26": 10.56628,
-            "27": 10.58025,
+            "19": 10.80528,
+            "20": 10.70396,
+            "21": 10.6699,
+            "22": 10.50641,
+            "23": 10.69006,
+            "24": 10.56312,
+            "25": 10.49418,
+            "26": 10.56627,
+            "27": 10.58023,
             "28": 10.51571,
-            "29": 10.55299,
-            "30": 10.30549,
-            "31": 10.02245,
-            "32": 10.40614,
+            "29": 10.55296,
+            "30": 10.30551,
+            "31": 10.02244,
+            "32": 10.40618,
             "33": 10.39874,
-            "34": 10.13771,
+            "34": 10.1377,
             "35": 10.20184,
-            "36": 10.16052,
-            "37": 10.28973,
-            "38": 10.11474,
+            "36": 10.1605,
+            "37": 10.28975,
+            "38": 10.11483,
             "39": 10.361,
-            "40": 10.01903,
+            "40": 10.01905,
             "41": 10.07292,
-            "42": 10.14698,
-            "43": 9.74687,
-            "44": 9.87766,
-            "45": 9.74966,
-            "46": 9.73383,
-            "47": 10.07535,
-            "48": 9.78068,
-            "49": 9.44784,
+            "42": 10.14697,
+            "43": 9.74684,
+            "44": 9.87763,
+            "45": 9.74962,
+            "46": 9.73382,
+            "47": 10.07536,
+            "48": 9.78071,
+            "49": 9.44783,
             "50": 9.8399
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 653.0,
-            "2": 642.0,
-            "3": 630.0,
-            "4": 585.0,
-            "5": 635.0,
-            "6": 687.0,
-            "7": 615.0,
-            "8": 601.0,
-            "9": 607.0,
-            "10": 522.0,
-            "11": 637.0,
-            "12": 675.0,
-            "13": 649.0,
-            "14": 648.0,
-            "15": 640.0,
-            "16": 602.0,
-            "17": 668.0,
-            "18": 634.0,
-            "19": 593.0,
-            "20": 579.0,
-            "21": 633.0,
-            "22": 597.0,
-            "23": 756.0,
-            "24": 612.0,
-            "25": 591.0,
-            "26": 620.0,
-            "27": 700.0,
-            "28": 705.0,
-            "29": 795.0,
-            "30": 752.0,
-            "31": 628.0,
-            "32": 712.0,
-            "33": 752.0,
-            "34": 737.0,
-            "35": 741.0,
-            "36": 770.0,
-            "37": 861.0,
-            "38": 823.0,
-            "39": 812.0,
-            "40": 814.0,
-            "41": 826.0,
-            "42": 801.0,
-            "43": 769.0,
-            "44": 822.0,
-            "45": 777.0,
-            "46": 828.0,
-            "47": 878.0,
-            "48": 915.0,
-            "49": 908.0,
-            "50": 848.0
+            "1": 572.0,
+            "2": 656.0,
+            "3": 649.0,
+            "4": 631.0,
+            "5": 658.0,
+            "6": 636.0,
+            "7": 636.0,
+            "8": 542.0,
+            "9": 653.0,
+            "10": 551.0,
+            "11": 681.0,
+            "12": 642.0,
+            "13": 624.0,
+            "14": 658.0,
+            "15": 682.0,
+            "16": 659.0,
+            "17": 620.0,
+            "18": 603.0,
+            "19": 634.0,
+            "20": 639.0,
+            "21": 634.0,
+            "22": 602.0,
+            "23": 731.0,
+            "24": 620.0,
+            "25": 611.0,
+            "26": 626.0,
+            "27": 683.0,
+            "28": 668.0,
+            "29": 713.0,
+            "30": 712.0,
+            "31": 616.0,
+            "32": 786.0,
+            "33": 800.0,
+            "34": 702.0,
+            "35": 684.0,
+            "36": 664.0,
+            "37": 831.0,
+            "38": 802.0,
+            "39": 919.0,
+            "40": 802.0,
+            "41": 791.0,
+            "42": 840.0,
+            "43": 718.0,
+            "44": 756.0,
+            "45": 765.0,
+            "46": 809.0,
+            "47": 839.0,
+            "48": 827.0,
+            "49": 935.0,
+            "50": 839.0
         }
     },
     "mem-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759898624.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0
+            "1": 756752896.0,
+            "2": 938398720.0,
+            "3": 938398720.0,
+            "4": 938398720.0,
+            "5": 938398720.0,
+            "6": 938398720.0,
+            "7": 938398720.0,
+            "8": 938398720.0,
+            "9": 938398720.0,
+            "10": 938398720.0,
+            "11": 938398720.0,
+            "12": 938398720.0,
+            "13": 938398720.0,
+            "14": 938398720.0,
+            "15": 938398720.0,
+            "16": 938399232.0,
+            "17": 938399232.0,
+            "18": 938399232.0,
+            "19": 938399232.0,
+            "20": 938399232.0,
+            "21": 938399232.0,
+            "22": 938399232.0,
+            "23": 938399232.0,
+            "24": 938399232.0,
+            "25": 938399232.0,
+            "26": 938399232.0,
+            "27": 938399232.0,
+            "28": 938399232.0,
+            "29": 938399232.0,
+            "30": 938399232.0,
+            "31": 938399232.0,
+            "32": 938399232.0,
+            "33": 938399232.0,
+            "34": 938399232.0,
+            "35": 938399232.0,
+            "36": 938399232.0,
+            "37": 938399232.0,
+            "38": 938399232.0,
+            "39": 938399232.0,
+            "40": 938399232.0,
+            "41": 938399232.0,
+            "42": 938399232.0,
+            "43": 938399232.0,
+            "44": 938399232.0,
+            "45": 938399232.0,
+            "46": 938399232.0,
+            "47": 938399232.0,
+            "48": 938399232.0,
+            "49": 938399232.0,
+            "50": 938399232.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 18.71096,
-            "2": 0.39649,
-            "3": 0.33228,
-            "4": 0.33042,
-            "5": 0.33036,
-            "6": 0.3326,
-            "7": 0.33962,
-            "8": 0.37041,
-            "9": 0.33077,
-            "10": 0.33179,
-            "11": 0.33053,
-            "12": 0.33332,
-            "13": 0.33149,
-            "14": 0.32928,
-            "15": 0.33252,
-            "16": 0.3321,
-            "17": 0.32661,
-            "18": 0.32933,
-            "19": 0.32718,
-            "20": 0.32982,
-            "21": 0.32827,
-            "22": 0.3313,
-            "23": 0.32836,
-            "24": 0.3287,
-            "25": 0.33025,
-            "26": 0.32605,
-            "27": 0.33501,
-            "28": 0.32889,
-            "29": 0.32971,
-            "30": 0.3318,
-            "31": 0.33458,
-            "32": 0.33222,
-            "33": 0.33434,
-            "34": 0.3337,
-            "35": 0.33221,
-            "36": 0.32984,
-            "37": 0.32779,
-            "38": 0.33131,
-            "39": 0.33056,
-            "40": 0.32941,
-            "41": 0.32351,
-            "42": 0.32946,
-            "43": 0.32913,
-            "44": 0.3283,
-            "45": 0.32845,
-            "46": 0.32474,
-            "47": 0.33097,
-            "48": 0.32791,
-            "49": 0.33143,
-            "50": 0.33005
+            "1": 37.38041,
+            "2": 0.33426,
+            "3": 0.30575,
+            "4": 0.2855,
+            "5": 0.28459,
+            "6": 0.28581,
+            "7": 0.28653,
+            "8": 0.28649,
+            "9": 0.28246,
+            "10": 0.28538,
+            "11": 0.28516,
+            "12": 0.28331,
+            "13": 0.28799,
+            "14": 0.28438,
+            "15": 0.28361,
+            "16": 0.28315,
+            "17": 0.2837,
+            "18": 0.28279,
+            "19": 0.28916,
+            "20": 0.28613,
+            "21": 0.2849,
+            "22": 0.2837,
+            "23": 0.2861,
+            "24": 0.28551,
+            "25": 0.28665,
+            "26": 0.28308,
+            "27": 0.28626,
+            "28": 0.28139,
+            "29": 0.28479,
+            "30": 0.28557,
+            "31": 0.28342,
+            "32": 0.28058,
+            "33": 0.2824,
+            "34": 0.28129,
+            "35": 0.28377,
+            "36": 0.28273,
+            "37": 0.28699,
+            "38": 0.28388,
+            "39": 0.28427,
+            "40": 0.28442,
+            "41": 0.28373,
+            "42": 0.28177,
+            "43": 0.28258,
+            "44": 0.28237,
+            "45": 0.2815,
+            "46": 0.28503,
+            "47": 0.2817,
+            "48": 0.28433,
+            "49": 0.28819,
+            "50": 0.28371
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..bb38c983224
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84732,
+            "4": 10.85572,
+            "5": 10.86003,
+            "6": 10.87729,
+            "7": 10.86557,
+            "8": 10.84912,
+            "9": 10.86604,
+            "10": 10.82473,
+            "11": 10.85617,
+            "12": 10.85371,
+            "13": 10.86788,
+            "14": 10.87113,
+            "15": 10.82235,
+            "16": 10.79993,
+            "17": 10.77433,
+            "18": 10.78348,
+            "19": 10.79308,
+            "20": 10.68227,
+            "21": 10.6471,
+            "22": 10.50922,
+            "23": 10.66831,
+            "24": 10.54194,
+            "25": 10.49281,
+            "26": 10.55932,
+            "27": 10.54243,
+            "28": 10.51131,
+            "29": 10.53254,
+            "30": 10.28988,
+            "31": 10.02851,
+            "32": 10.3888,
+            "33": 10.39597,
+            "34": 10.13451,
+            "35": 10.18926,
+            "36": 10.13351,
+            "37": 10.27379,
+            "38": 10.10746,
+            "39": 10.34007,
+            "40": 9.98541,
+            "41": 10.06416,
+            "42": 10.13748,
+            "43": 9.73386,
+            "44": 9.86309,
+            "45": 9.73718,
+            "46": 9.71345,
+            "47": 10.07751,
+            "48": 9.76767,
+            "49": 9.41988,
+            "50": 9.81692
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 568.0,
+            "2": 566.0,
+            "3": 625.0,
+            "4": 604.0,
+            "5": 668.0,
+            "6": 650.0,
+            "7": 602.0,
+            "8": 647.0,
+            "9": 632.0,
+            "10": 527.0,
+            "11": 641.0,
+            "12": 661.0,
+            "13": 666.0,
+            "14": 656.0,
+            "15": 642.0,
+            "16": 603.0,
+            "17": 653.0,
+            "18": 570.0,
+            "19": 674.0,
+            "20": 571.0,
+            "21": 709.0,
+            "22": 663.0,
+            "23": 704.0,
+            "24": 628.0,
+            "25": 568.0,
+            "26": 632.0,
+            "27": 668.0,
+            "28": 736.0,
+            "29": 760.0,
+            "30": 687.0,
+            "31": 589.0,
+            "32": 740.0,
+            "33": 772.0,
+            "34": 713.0,
+            "35": 753.0,
+            "36": 731.0,
+            "37": 873.0,
+            "38": 762.0,
+            "39": 836.0,
+            "40": 864.0,
+            "41": 780.0,
+            "42": 847.0,
+            "43": 740.0,
+            "44": 822.0,
+            "45": 718.0,
+            "46": 826.0,
+            "47": 890.0,
+            "48": 852.0,
+            "49": 872.0,
+            "50": 869.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 460882432.0,
+            "2": 460882432.0,
+            "3": 460882432.0,
+            "4": 460882432.0,
+            "5": 460882432.0,
+            "6": 460882432.0,
+            "7": 460882432.0,
+            "8": 460882432.0,
+            "9": 460882432.0,
+            "10": 460882432.0,
+            "11": 460882432.0,
+            "12": 460882432.0,
+            "13": 460882432.0,
+            "14": 460882432.0,
+            "15": 460882432.0,
+            "16": 460882432.0,
+            "17": 460882432.0,
+            "18": 460882432.0,
+            "19": 460882432.0,
+            "20": 460882432.0,
+            "21": 460882432.0,
+            "22": 460882432.0,
+            "23": 460882432.0,
+            "24": 460882432.0,
+            "25": 460882432.0,
+            "26": 460882432.0,
+            "27": 460882432.0,
+            "28": 460882432.0,
+            "29": 460882432.0,
+            "30": 460882432.0,
+            "31": 460882432.0,
+            "32": 460882432.0,
+            "33": 460882432.0,
+            "34": 460882432.0,
+            "35": 460882432.0,
+            "36": 460882432.0,
+            "37": 460882432.0,
+            "38": 460882432.0,
+            "39": 460882432.0,
+            "40": 460882432.0,
+            "41": 460882432.0,
+            "42": 460882432.0,
+            "43": 460882432.0,
+            "44": 460882432.0,
+            "45": 460882432.0,
+            "46": 460882432.0,
+            "47": 460882432.0,
+            "48": 460882432.0,
+            "49": 460882432.0,
+            "50": 460882432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 705634816.0,
+            "2": 884397568.0,
+            "3": 885443584.0,
+            "4": 885445632.0,
+            "5": 885445632.0,
+            "6": 885445632.0,
+            "7": 885445632.0,
+            "8": 885445632.0,
+            "9": 885445632.0,
+            "10": 885445632.0,
+            "11": 885445632.0,
+            "12": 885445632.0,
+            "13": 885445632.0,
+            "14": 885445632.0,
+            "15": 885446144.0,
+            "16": 885446144.0,
+            "17": 885446144.0,
+            "18": 885446144.0,
+            "19": 885446144.0,
+            "20": 885446144.0,
+            "21": 885446144.0,
+            "22": 885446144.0,
+            "23": 885446144.0,
+            "24": 885446144.0,
+            "25": 885446144.0,
+            "26": 885446144.0,
+            "27": 885446144.0,
+            "28": 885446144.0,
+            "29": 885446144.0,
+            "30": 885446144.0,
+            "31": 885446144.0,
+            "32": 885446144.0,
+            "33": 885446144.0,
+            "34": 885446144.0,
+            "35": 885446144.0,
+            "36": 885446144.0,
+            "37": 885446144.0,
+            "38": 885446144.0,
+            "39": 885446144.0,
+            "40": 885446144.0,
+            "41": 885446144.0,
+            "42": 885446144.0,
+            "43": 885446144.0,
+            "44": 885446144.0,
+            "45": 885446144.0,
+            "46": 885446144.0,
+            "47": 885446144.0,
+            "48": 885446144.0,
+            "49": 885446144.0,
+            "50": 885446144.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.20529,
+            "3": 0.43211,
+            "4": 0.41974,
+            "5": 0.41788,
+            "6": 0.41713,
+            "7": 0.41839,
+            "8": 0.41778,
+            "9": 0.41756,
+            "10": 0.4154,
+            "11": 0.41682,
+            "12": 0.41539,
+            "13": 0.41618,
+            "14": 0.41668,
+            "15": 0.41894,
+            "16": 0.41891,
+            "17": 0.46526,
+            "18": 0.46951,
+            "19": 0.48697,
+            "20": 0.51157,
+            "21": 0.44025,
+            "22": 0.41388,
+            "23": 0.41164,
+            "24": 0.41655,
+            "25": 0.41424,
+            "26": 0.41687,
+            "27": 0.41162,
+            "28": 0.41035,
+            "29": 0.41184,
+            "30": 0.40989,
+            "31": 0.41153,
+            "32": 0.41143,
+            "33": 0.41324,
+            "34": 0.41271,
+            "35": 0.41107,
+            "36": 0.41053,
+            "37": 0.41372,
+            "38": 0.4128,
+            "39": 0.41377,
+            "40": 0.41093,
+            "41": 0.41375,
+            "42": 0.40814,
+            "43": 0.40704,
+            "44": 0.40632,
+            "45": 0.41014,
+            "46": 0.41007,
+            "47": 0.41057,
+            "48": 0.41002,
+            "49": 0.4095,
+            "50": 0.41018
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
index cc1700ed493..98736eb9491 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86285,
-            "4": 10.84007,
-            "5": 10.87854,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86281,
+            "4": 10.84006,
+            "5": 10.87858,
             "6": 10.88852,
-            "7": 10.86537,
-            "8": 10.86015,
-            "9": 10.85985,
-            "10": 10.82982,
+            "7": 10.86536,
+            "8": 10.8602,
+            "9": 10.85986,
+            "10": 10.8298,
             "11": 10.88949,
-            "12": 10.87509,
-            "13": 10.87426,
-            "14": 10.89674,
-            "15": 10.82054,
-            "16": 10.82501,
-            "17": 10.78985,
+            "12": 10.87507,
+            "13": 10.87425,
+            "14": 10.89678,
+            "15": 10.8205,
+            "16": 10.82496,
+            "17": 10.78984,
             "18": 10.81032,
-            "19": 10.8053,
-            "20": 10.70397,
-            "21": 10.66986,
-            "22": 10.50641,
-            "23": 10.69001,
-            "24": 10.56317,
-            "25": 10.49421,
-            "26": 10.56628,
+            "19": 10.80534,
+            "20": 10.70396,
+            "21": 10.66987,
+            "22": 10.5064,
+            "23": 10.69008,
+            "24": 10.56312,
+            "25": 10.49422,
+            "26": 10.56625,
             "27": 10.58022,
-            "28": 10.51574,
-            "29": 10.55292,
-            "30": 10.30549,
+            "28": 10.51576,
+            "29": 10.55299,
+            "30": 10.3055,
             "31": 10.0225,
-            "32": 10.40617,
-            "33": 10.39874,
-            "34": 10.13772,
+            "32": 10.40614,
+            "33": 10.39876,
+            "34": 10.13774,
             "35": 10.20187,
-            "36": 10.16045,
-            "37": 10.28977,
-            "38": 10.11478,
-            "39": 10.36101,
-            "40": 10.01903,
-            "41": 10.07294,
-            "42": 10.14691,
-            "43": 9.74683,
-            "44": 9.87762,
+            "36": 10.16049,
+            "37": 10.28975,
+            "38": 10.11482,
+            "39": 10.36102,
+            "40": 10.01898,
+            "41": 10.07291,
+            "42": 10.14696,
+            "43": 9.74688,
+            "44": 9.87766,
             "45": 9.74966,
-            "46": 9.73384,
-            "47": 10.07535,
-            "48": 9.78069,
+            "46": 9.73386,
+            "47": 10.07538,
+            "48": 9.7807,
             "49": 9.44783,
-            "50": 9.83992
+            "50": 9.83988
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 607.0,
-            "2": 628.0,
-            "3": 600.0,
-            "4": 658.0,
-            "5": 657.0,
-            "6": 707.0,
-            "7": 637.0,
-            "8": 593.0,
-            "9": 632.0,
-            "10": 553.0,
-            "11": 641.0,
-            "12": 631.0,
-            "13": 676.0,
-            "14": 643.0,
-            "15": 623.0,
-            "16": 611.0,
-            "17": 687.0,
-            "18": 622.0,
-            "19": 581.0,
-            "20": 609.0,
-            "21": 652.0,
-            "22": 621.0,
-            "23": 800.0,
+            "1": 641.0,
+            "2": 681.0,
+            "3": 601.0,
+            "4": 636.0,
+            "5": 651.0,
+            "6": 701.0,
+            "7": 639.0,
+            "8": 535.0,
+            "9": 647.0,
+            "10": 513.0,
+            "11": 669.0,
+            "12": 644.0,
+            "13": 680.0,
+            "14": 654.0,
+            "15": 601.0,
+            "16": 616.0,
+            "17": 656.0,
+            "18": 623.0,
+            "19": 649.0,
+            "20": 575.0,
+            "21": 679.0,
+            "22": 556.0,
+            "23": 681.0,
             "24": 618.0,
-            "25": 623.0,
-            "26": 595.0,
-            "27": 679.0,
-            "28": 726.0,
-            "29": 719.0,
-            "30": 723.0,
-            "31": 624.0,
-            "32": 737.0,
-            "33": 776.0,
-            "34": 713.0,
-            "35": 696.0,
-            "36": 759.0,
-            "37": 829.0,
-            "38": 784.0,
-            "39": 798.0,
-            "40": 813.0,
-            "41": 814.0,
-            "42": 880.0,
-            "43": 780.0,
-            "44": 775.0,
-            "45": 759.0,
-            "46": 849.0,
-            "47": 938.0,
-            "48": 876.0,
-            "49": 886.0,
-            "50": 817.0
+            "25": 629.0,
+            "26": 650.0,
+            "27": 704.0,
+            "28": 693.0,
+            "29": 764.0,
+            "30": 725.0,
+            "31": 609.0,
+            "32": 728.0,
+            "33": 790.0,
+            "34": 724.0,
+            "35": 730.0,
+            "36": 717.0,
+            "37": 857.0,
+            "38": 730.0,
+            "39": 897.0,
+            "40": 816.0,
+            "41": 799.0,
+            "42": 845.0,
+            "43": 760.0,
+            "44": 831.0,
+            "45": 786.0,
+            "46": 802.0,
+            "47": 827.0,
+            "48": 846.0,
+            "49": 863.0,
+            "50": 803.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0
+            "1": 512262656.0,
+            "2": 512262656.0,
+            "3": 512262656.0,
+            "4": 512262656.0,
+            "5": 512262656.0,
+            "6": 512262656.0,
+            "7": 512262656.0,
+            "8": 512262656.0,
+            "9": 512262656.0,
+            "10": 512262656.0,
+            "11": 512262656.0,
+            "12": 512262656.0,
+            "13": 512262656.0,
+            "14": 512262656.0,
+            "15": 512262656.0,
+            "16": 512262656.0,
+            "17": 512262656.0,
+            "18": 512262656.0,
+            "19": 512262656.0,
+            "20": 512262656.0,
+            "21": 512262656.0,
+            "22": 512262656.0,
+            "23": 512262656.0,
+            "24": 512262656.0,
+            "25": 512262656.0,
+            "26": 512262656.0,
+            "27": 512262656.0,
+            "28": 512262656.0,
+            "29": 512262656.0,
+            "30": 512262656.0,
+            "31": 512262656.0,
+            "32": 512262656.0,
+            "33": 512262656.0,
+            "34": 512262656.0,
+            "35": 512262656.0,
+            "36": 512262656.0,
+            "37": 512262656.0,
+            "38": 512262656.0,
+            "39": 512262656.0,
+            "40": 512262656.0,
+            "41": 512262656.0,
+            "42": 512262656.0,
+            "43": 512262656.0,
+            "44": 512262656.0,
+            "45": 512262656.0,
+            "46": 512262656.0,
+            "47": 512262656.0,
+            "48": 512262656.0,
+            "49": 512262656.0,
+            "50": 512262656.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759895552.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0
+            "1": 755704832.0,
+            "2": 934729216.0,
+            "3": 934729216.0,
+            "4": 935776768.0,
+            "5": 935776768.0,
+            "6": 935776768.0,
+            "7": 935776768.0,
+            "8": 935776768.0,
+            "9": 935776768.0,
+            "10": 935776768.0,
+            "11": 935777280.0,
+            "12": 935777280.0,
+            "13": 935777280.0,
+            "14": 935777280.0,
+            "15": 935777280.0,
+            "16": 935777280.0,
+            "17": 935777280.0,
+            "18": 935777280.0,
+            "19": 935777280.0,
+            "20": 935777792.0,
+            "21": 935777792.0,
+            "22": 935777792.0,
+            "23": 935777792.0,
+            "24": 935777792.0,
+            "25": 935777792.0,
+            "26": 935777792.0,
+            "27": 935777792.0,
+            "28": 935777792.0,
+            "29": 935777792.0,
+            "30": 935777792.0,
+            "31": 935777792.0,
+            "32": 935777792.0,
+            "33": 935777792.0,
+            "34": 935777792.0,
+            "35": 935777792.0,
+            "36": 935777792.0,
+            "37": 935777792.0,
+            "38": 935777792.0,
+            "39": 935777792.0,
+            "40": 935777792.0,
+            "41": 935777792.0,
+            "42": 935777792.0,
+            "43": 935777792.0,
+            "44": 935777792.0,
+            "45": 935777792.0,
+            "46": 935777792.0,
+            "47": 935777792.0,
+            "48": 935777792.0,
+            "49": 935777792.0,
+            "50": 935777792.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 18.67374,
-            "2": 0.33434,
-            "3": 0.32862,
-            "4": 0.3312,
-            "5": 0.32463,
-            "6": 0.33221,
-            "7": 0.33167,
-            "8": 0.32476,
-            "9": 0.32742,
-            "10": 0.32327,
-            "11": 0.31599,
-            "12": 0.32511,
-            "13": 0.32273,
-            "14": 0.31956,
-            "15": 0.32777,
-            "16": 0.32745,
-            "17": 0.31743,
-            "18": 0.32418,
-            "19": 0.32759,
-            "20": 0.32696,
-            "21": 0.32321,
-            "22": 0.32923,
-            "23": 0.32125,
-            "24": 0.32088,
-            "25": 0.32288,
-            "26": 0.31739,
-            "27": 0.33667,
-            "28": 0.32586,
-            "29": 0.31738,
-            "30": 0.31392,
-            "31": 0.32116,
-            "32": 0.31637,
-            "33": 0.32029,
-            "34": 0.32057,
-            "35": 0.31739,
-            "36": 0.31341,
-            "37": 0.32121,
-            "38": 0.326,
-            "39": 0.31692,
-            "40": 0.31511,
-            "41": 0.32216,
-            "42": 0.31654,
-            "43": 0.32474,
-            "44": 0.32162,
-            "45": 0.31451,
-            "46": 0.31434,
-            "47": 0.32885,
-            "48": 0.31603,
-            "49": 0.31732,
-            "50": 0.3234
+            "1": 37.19618,
+            "2": 0.37449,
+            "3": 0.31644,
+            "4": 0.28217,
+            "5": 0.28413,
+            "6": 0.27992,
+            "7": 0.2812,
+            "8": 0.2853,
+            "9": 0.28038,
+            "10": 0.28373,
+            "11": 0.2867,
+            "12": 0.29151,
+            "13": 0.28727,
+            "14": 0.28521,
+            "15": 0.28348,
+            "16": 0.28599,
+            "17": 0.28521,
+            "18": 0.28496,
+            "19": 0.28665,
+            "20": 0.28808,
+            "21": 0.28617,
+            "22": 0.2849,
+            "23": 0.28018,
+            "24": 0.28162,
+            "25": 0.29703,
+            "26": 0.31265,
+            "27": 0.28109,
+            "28": 0.28283,
+            "29": 0.28046,
+            "30": 0.28567,
+            "31": 0.28446,
+            "32": 0.28496,
+            "33": 0.279,
+            "34": 0.28039,
+            "35": 0.28345,
+            "36": 0.2816,
+            "37": 0.28207,
+            "38": 0.27907,
+            "39": 0.27768,
+            "40": 0.27658,
+            "41": 0.28191,
+            "42": 0.28052,
+            "43": 0.2793,
+            "44": 0.2793,
+            "45": 0.28044,
+            "46": 0.27801,
+            "47": 0.28286,
+            "48": 0.27846,
+            "49": 0.27648,
+            "50": 0.278
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
index 50d3c9c5d20..36ec79d6f72 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92969, "10": 10.90473, "15": 10.87125, "20": 10.75001, "25": 10.53752, "30": 10.32548, "35": 10.22894, "40": 10.01974, "45": 9.75546, "50": 9.84069}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 590.0, "5": 658.0, "10": 523.0, "15": 641.0, "20": 567.0, "25": 606.0, "30": 725.0, "35": 699.0, "40": 783.0, "45": 845.0, "50": 857.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 432177152.0, "5": 432177152.0, "10": 432177152.0, "15": 432177152.0, "20": 432177152.0, "25": 432177152.0, "30": 432177152.0, "35": 432177152.0, "40": 432177152.0, "45": 432177152.0, "50": 432177152.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676286464.0, "5": 857274368.0, "10": 857274368.0, "15": 857274368.0, "20": 857274368.0, "25": 857277440.0, "30": 857277440.0, "35": 857277440.0, "40": 857277440.0, "45": 857277440.0, "50": 857277440.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 21.95554, "5": 0.40686, "10": 0.40586, "15": 0.39829, "20": 0.39913, "25": 0.39679, "30": 0.39346, "35": 0.39107, "40": 0.387, "45": 0.3959, "50": 0.39384}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93295,
+            "2": 10.93424,
+            "3": 10.91348,
+            "4": 10.90316,
+            "5": 10.92971,
+            "6": 10.93656,
+            "7": 10.90279,
+            "8": 10.92114,
+            "9": 10.90707,
+            "10": 10.90475,
+            "11": 10.88789,
+            "12": 10.91738,
+            "13": 10.9119,
+            "14": 10.91508,
+            "15": 10.8712,
+            "16": 10.86127,
+            "17": 10.82695,
+            "18": 10.85672,
+            "19": 10.84058,
+            "20": 10.74994,
+            "21": 10.71505,
+            "22": 10.58118,
+            "23": 10.72639,
+            "24": 10.60727,
+            "25": 10.53751,
+            "26": 10.61069,
+            "27": 10.59925,
+            "28": 10.54953,
+            "29": 10.56605,
+            "30": 10.32549,
+            "31": 10.06697,
+            "32": 10.43809,
+            "33": 10.42357,
+            "34": 10.16016,
+            "35": 10.22897,
+            "36": 10.17616,
+            "37": 10.29236,
+            "38": 10.13296,
+            "39": 10.34952,
+            "40": 10.01973,
+            "41": 10.07536,
+            "42": 10.15409,
+            "43": 9.76087,
+            "44": 9.88353,
+            "45": 9.75547,
+            "46": 9.74963,
+            "47": 10.07544,
+            "48": 9.77937,
+            "49": 9.43814,
+            "50": 9.8407
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 584.0,
+            "2": 667.0,
+            "3": 622.0,
+            "4": 575.0,
+            "5": 641.0,
+            "6": 652.0,
+            "7": 616.0,
+            "8": 578.0,
+            "9": 664.0,
+            "10": 555.0,
+            "11": 661.0,
+            "12": 599.0,
+            "13": 665.0,
+            "14": 672.0,
+            "15": 592.0,
+            "16": 652.0,
+            "17": 643.0,
+            "18": 582.0,
+            "19": 574.0,
+            "20": 568.0,
+            "21": 624.0,
+            "22": 637.0,
+            "23": 655.0,
+            "24": 607.0,
+            "25": 574.0,
+            "26": 650.0,
+            "27": 677.0,
+            "28": 700.0,
+            "29": 717.0,
+            "30": 687.0,
+            "31": 585.0,
+            "32": 649.0,
+            "33": 789.0,
+            "34": 676.0,
+            "35": 740.0,
+            "36": 707.0,
+            "37": 853.0,
+            "38": 796.0,
+            "39": 846.0,
+            "40": 801.0,
+            "41": 801.0,
+            "42": 795.0,
+            "43": 696.0,
+            "44": 765.0,
+            "45": 813.0,
+            "46": 806.0,
+            "47": 905.0,
+            "48": 829.0,
+            "49": 876.0,
+            "50": 842.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 433750528.0,
+            "2": 433750528.0,
+            "3": 433750528.0,
+            "4": 433750528.0,
+            "5": 433750528.0,
+            "6": 433750528.0,
+            "7": 433750528.0,
+            "8": 433750528.0,
+            "9": 433750528.0,
+            "10": 433750528.0,
+            "11": 433750528.0,
+            "12": 433750528.0,
+            "13": 433750528.0,
+            "14": 433750528.0,
+            "15": 433750528.0,
+            "16": 433750528.0,
+            "17": 433750528.0,
+            "18": 433750528.0,
+            "19": 433750528.0,
+            "20": 433750528.0,
+            "21": 433750528.0,
+            "22": 433750528.0,
+            "23": 433750528.0,
+            "24": 433750528.0,
+            "25": 433750528.0,
+            "26": 433750528.0,
+            "27": 433750528.0,
+            "28": 433750528.0,
+            "29": 433750528.0,
+            "30": 433750528.0,
+            "31": 433750528.0,
+            "32": 433750528.0,
+            "33": 433750528.0,
+            "34": 433750528.0,
+            "35": 433750528.0,
+            "36": 433750528.0,
+            "37": 433750528.0,
+            "38": 433750528.0,
+            "39": 433750528.0,
+            "40": 433750528.0,
+            "41": 433750528.0,
+            "42": 433750528.0,
+            "43": 433750528.0,
+            "44": 433750528.0,
+            "45": 433750528.0,
+            "46": 433750528.0,
+            "47": 433750528.0,
+            "48": 433750528.0,
+            "49": 433750528.0,
+            "50": 433750528.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 677322752.0,
+            "2": 858311168.0,
+            "3": 858311168.0,
+            "4": 858312704.0,
+            "5": 858313728.0,
+            "6": 858313728.0,
+            "7": 858313728.0,
+            "8": 858313728.0,
+            "9": 858313728.0,
+            "10": 858313728.0,
+            "11": 858313728.0,
+            "12": 858313728.0,
+            "13": 858313728.0,
+            "14": 858313728.0,
+            "15": 858313728.0,
+            "16": 858313728.0,
+            "17": 858313728.0,
+            "18": 858313728.0,
+            "19": 858314240.0,
+            "20": 858314240.0,
+            "21": 858314240.0,
+            "22": 858314240.0,
+            "23": 858314240.0,
+            "24": 858314240.0,
+            "25": 858314240.0,
+            "26": 858314240.0,
+            "27": 858314240.0,
+            "28": 858314240.0,
+            "29": 858314240.0,
+            "30": 858314240.0,
+            "31": 858314240.0,
+            "32": 858314240.0,
+            "33": 858314240.0,
+            "34": 858314240.0,
+            "35": 858314240.0,
+            "36": 858314240.0,
+            "37": 858314240.0,
+            "38": 858314240.0,
+            "39": 858314240.0,
+            "40": 858314240.0,
+            "41": 858314240.0,
+            "42": 858314240.0,
+            "43": 858314240.0,
+            "44": 858314240.0,
+            "45": 858314240.0,
+            "46": 858314240.0,
+            "47": 858314240.0,
+            "48": 858314240.0,
+            "49": 858314240.0,
+            "50": 858314240.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 21.76594,
+            "2": 0.46379,
+            "3": 0.43243,
+            "4": 0.41208,
+            "5": 0.41118,
+            "6": 0.41286,
+            "7": 0.41188,
+            "8": 0.41137,
+            "9": 0.41313,
+            "10": 0.41246,
+            "11": 0.41206,
+            "12": 0.41297,
+            "13": 0.41065,
+            "14": 0.41339,
+            "15": 0.41164,
+            "16": 0.4123,
+            "17": 0.41103,
+            "18": 0.4126,
+            "19": 0.41173,
+            "20": 0.40973,
+            "21": 0.40983,
+            "22": 0.41192,
+            "23": 0.41174,
+            "24": 0.41275,
+            "25": 0.4103,
+            "26": 0.41066,
+            "27": 0.40962,
+            "28": 0.41015,
+            "29": 0.41299,
+            "30": 0.41138,
+            "31": 0.41272,
+            "32": 0.41313,
+            "33": 0.41105,
+            "34": 0.41154,
+            "35": 0.41101,
+            "36": 0.41364,
+            "37": 0.41532,
+            "38": 0.41685,
+            "39": 0.41569,
+            "40": 0.41646,
+            "41": 0.42457,
+            "42": 0.41274,
+            "43": 0.41244,
+            "44": 0.41106,
+            "45": 0.41405,
+            "46": 0.41346,
+            "47": 0.41825,
+            "48": 0.41512,
+            "49": 0.41064,
+            "50": 0.4153
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..4e8e9932015
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84733,
+            "4": 10.85571,
+            "5": 10.86,
+            "6": 10.87733,
+            "7": 10.86555,
+            "8": 10.84913,
+            "9": 10.86609,
+            "10": 10.82473,
+            "11": 10.85618,
+            "12": 10.85374,
+            "13": 10.86788,
+            "14": 10.87119,
+            "15": 10.82235,
+            "16": 10.79991,
+            "17": 10.77431,
+            "18": 10.78345,
+            "19": 10.79308,
+            "20": 10.68226,
+            "21": 10.6471,
+            "22": 10.50917,
+            "23": 10.66827,
+            "24": 10.54193,
+            "25": 10.4928,
+            "26": 10.55931,
+            "27": 10.54238,
+            "28": 10.51129,
+            "29": 10.53257,
+            "30": 10.28992,
+            "31": 10.02853,
+            "32": 10.38885,
+            "33": 10.39593,
+            "34": 10.13446,
+            "35": 10.18932,
+            "36": 10.13355,
+            "37": 10.27381,
+            "38": 10.10751,
+            "39": 10.34007,
+            "40": 9.98538,
+            "41": 10.06414,
+            "42": 10.13744,
+            "43": 9.73381,
+            "44": 9.86305,
+            "45": 9.73723,
+            "46": 9.71343,
+            "47": 10.07757,
+            "48": 9.76768,
+            "49": 9.41987,
+            "50": 9.81687
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 567.0,
+            "2": 584.0,
+            "3": 598.0,
+            "4": 633.0,
+            "5": 630.0,
+            "6": 645.0,
+            "7": 645.0,
+            "8": 674.0,
+            "9": 625.0,
+            "10": 500.0,
+            "11": 669.0,
+            "12": 554.0,
+            "13": 681.0,
+            "14": 633.0,
+            "15": 623.0,
+            "16": 592.0,
+            "17": 636.0,
+            "18": 625.0,
+            "19": 633.0,
+            "20": 587.0,
+            "21": 696.0,
+            "22": 585.0,
+            "23": 681.0,
+            "24": 639.0,
+            "25": 587.0,
+            "26": 642.0,
+            "27": 639.0,
+            "28": 744.0,
+            "29": 746.0,
+            "30": 685.0,
+            "31": 603.0,
+            "32": 719.0,
+            "33": 850.0,
+            "34": 696.0,
+            "35": 737.0,
+            "36": 738.0,
+            "37": 840.0,
+            "38": 757.0,
+            "39": 828.0,
+            "40": 828.0,
+            "41": 787.0,
+            "42": 883.0,
+            "43": 703.0,
+            "44": 850.0,
+            "45": 840.0,
+            "46": 837.0,
+            "47": 915.0,
+            "48": 849.0,
+            "49": 915.0,
+            "50": 892.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 460882432.0,
+            "2": 460882432.0,
+            "3": 460882432.0,
+            "4": 460882432.0,
+            "5": 460882432.0,
+            "6": 460882432.0,
+            "7": 460882432.0,
+            "8": 460882432.0,
+            "9": 460882432.0,
+            "10": 460882432.0,
+            "11": 460882432.0,
+            "12": 460882432.0,
+            "13": 460882432.0,
+            "14": 460882432.0,
+            "15": 460882432.0,
+            "16": 460882432.0,
+            "17": 460882432.0,
+            "18": 460882432.0,
+            "19": 460882432.0,
+            "20": 460882432.0,
+            "21": 460882432.0,
+            "22": 460882432.0,
+            "23": 460882432.0,
+            "24": 460882432.0,
+            "25": 460882432.0,
+            "26": 460882432.0,
+            "27": 460882432.0,
+            "28": 460882432.0,
+            "29": 460882432.0,
+            "30": 460882432.0,
+            "31": 460882432.0,
+            "32": 460882432.0,
+            "33": 460882432.0,
+            "34": 460882432.0,
+            "35": 460882432.0,
+            "36": 460882432.0,
+            "37": 460882432.0,
+            "38": 460882432.0,
+            "39": 460882432.0,
+            "40": 460882432.0,
+            "41": 460882432.0,
+            "42": 460882432.0,
+            "43": 460882432.0,
+            "44": 460882432.0,
+            "45": 460882432.0,
+            "46": 460882432.0,
+            "47": 460882432.0,
+            "48": 460882432.0,
+            "49": 460882432.0,
+            "50": 460882432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 705635840.0,
+            "2": 883348992.0,
+            "3": 883348992.0,
+            "4": 883348992.0,
+            "5": 883348992.0,
+            "6": 883348992.0,
+            "7": 883348992.0,
+            "8": 883348992.0,
+            "9": 883348992.0,
+            "10": 883348992.0,
+            "11": 883348992.0,
+            "12": 883348992.0,
+            "13": 883348992.0,
+            "14": 883348992.0,
+            "15": 883348992.0,
+            "16": 883348992.0,
+            "17": 883348992.0,
+            "18": 883348992.0,
+            "19": 883348992.0,
+            "20": 883348992.0,
+            "21": 883348992.0,
+            "22": 883348992.0,
+            "23": 883348992.0,
+            "24": 883348992.0,
+            "25": 883348992.0,
+            "26": 883348992.0,
+            "27": 883348992.0,
+            "28": 883348992.0,
+            "29": 883348992.0,
+            "30": 883348992.0,
+            "31": 883348992.0,
+            "32": 883348992.0,
+            "33": 883348992.0,
+            "34": 883348992.0,
+            "35": 883348992.0,
+            "36": 883348992.0,
+            "37": 883348992.0,
+            "38": 883348992.0,
+            "39": 883348992.0,
+            "40": 883348992.0,
+            "41": 883348992.0,
+            "42": 883348992.0,
+            "43": 883348992.0,
+            "44": 883348992.0,
+            "45": 883348992.0,
+            "46": 883348992.0,
+            "47": 883348992.0,
+            "48": 883348992.0,
+            "49": 883348992.0,
+            "50": 883348992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.21004,
+            "3": 0.48981,
+            "4": 0.47344,
+            "5": 0.47824,
+            "6": 0.47946,
+            "7": 0.48311,
+            "8": 0.4801,
+            "9": 0.48448,
+            "10": 0.48375,
+            "11": 0.48291,
+            "12": 0.48722,
+            "13": 0.48237,
+            "14": 0.48101,
+            "15": 0.48357,
+            "16": 0.48502,
+            "17": 0.48354,
+            "18": 0.48307,
+            "19": 0.48204,
+            "20": 0.48295,
+            "21": 0.48064,
+            "22": 0.48504,
+            "23": 0.48487,
+            "24": 0.48367,
+            "25": 0.48061,
+            "26": 0.48279,
+            "27": 0.48417,
+            "28": 0.48173,
+            "29": 0.48221,
+            "30": 0.48351,
+            "31": 0.48309,
+            "32": 0.48067,
+            "33": 0.48269,
+            "34": 0.48404,
+            "35": 0.48325,
+            "36": 0.48418,
+            "37": 0.48111,
+            "38": 0.4835,
+            "39": 0.48361,
+            "40": 0.48287,
+            "41": 0.4882,
+            "42": 0.48161,
+            "43": 0.48229,
+            "44": 0.48219,
+            "45": 0.48623,
+            "46": 0.48196,
+            "47": 0.48211,
+            "48": 0.48322,
+            "49": 0.4833,
+            "50": 0.48355
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json
index cd45ff021d9..f78c3deb59d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json
@@ -4,55 +4,55 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86284,
-            "4": 10.84009,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86282,
+            "4": 10.84007,
             "5": 10.87856,
-            "6": 10.88856,
-            "7": 10.86532,
-            "8": 10.86017,
-            "9": 10.8599,
-            "10": 10.82981,
-            "11": 10.8895,
-            "12": 10.8751,
-            "13": 10.87423,
+            "6": 10.88852,
+            "7": 10.86536,
+            "8": 10.86015,
+            "9": 10.85991,
+            "10": 10.82982,
+            "11": 10.88947,
+            "12": 10.87511,
+            "13": 10.87422,
             "14": 10.89675,
-            "15": 10.82054,
-            "16": 10.82504,
+            "15": 10.82056,
+            "16": 10.82497,
             "17": 10.78983,
             "18": 10.81029,
-            "19": 10.80535,
-            "20": 10.70398,
-            "21": 10.66993,
-            "22": 10.50643,
-            "23": 10.69004,
-            "24": 10.56314,
-            "25": 10.4942,
-            "26": 10.56628,
-            "27": 10.58025,
+            "19": 10.80528,
+            "20": 10.70396,
+            "21": 10.6699,
+            "22": 10.50641,
+            "23": 10.69006,
+            "24": 10.56312,
+            "25": 10.49418,
+            "26": 10.56627,
+            "27": 10.58023,
             "28": 10.51571,
-            "29": 10.55299,
-            "30": 10.30549,
-            "31": 10.02245,
-            "32": 10.40614,
+            "29": 10.55296,
+            "30": 10.30551,
+            "31": 10.02244,
+            "32": 10.40618,
             "33": 10.39874,
-            "34": 10.13771,
+            "34": 10.1377,
             "35": 10.20184,
-            "36": 10.16052,
-            "37": 10.28973,
-            "38": 10.11474,
+            "36": 10.1605,
+            "37": 10.28975,
+            "38": 10.11483,
             "39": 10.361,
-            "40": 10.01903,
+            "40": 10.01905,
             "41": 10.07292,
-            "42": 10.14698,
-            "43": 9.74687,
-            "44": 9.87766,
-            "45": 9.74966,
-            "46": 9.73383,
-            "47": 10.07535,
-            "48": 9.78068,
-            "49": 9.44784,
+            "42": 10.14697,
+            "43": 9.74684,
+            "44": 9.87763,
+            "45": 9.74962,
+            "46": 9.73382,
+            "47": 10.07536,
+            "48": 9.78071,
+            "49": 9.44783,
             "50": 9.8399
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 653.0,
-            "2": 642.0,
-            "3": 630.0,
-            "4": 585.0,
-            "5": 635.0,
-            "6": 687.0,
-            "7": 615.0,
-            "8": 601.0,
-            "9": 607.0,
-            "10": 522.0,
-            "11": 637.0,
-            "12": 675.0,
-            "13": 649.0,
-            "14": 648.0,
-            "15": 640.0,
-            "16": 602.0,
-            "17": 668.0,
-            "18": 634.0,
-            "19": 593.0,
-            "20": 579.0,
-            "21": 633.0,
-            "22": 597.0,
-            "23": 756.0,
-            "24": 612.0,
-            "25": 591.0,
-            "26": 620.0,
-            "27": 700.0,
-            "28": 705.0,
-            "29": 795.0,
-            "30": 752.0,
-            "31": 628.0,
-            "32": 712.0,
-            "33": 752.0,
-            "34": 737.0,
-            "35": 741.0,
-            "36": 770.0,
-            "37": 861.0,
-            "38": 823.0,
-            "39": 812.0,
-            "40": 814.0,
-            "41": 826.0,
-            "42": 801.0,
-            "43": 769.0,
-            "44": 822.0,
-            "45": 777.0,
-            "46": 828.0,
-            "47": 878.0,
-            "48": 915.0,
-            "49": 908.0,
-            "50": 848.0
+            "1": 572.0,
+            "2": 656.0,
+            "3": 649.0,
+            "4": 631.0,
+            "5": 658.0,
+            "6": 636.0,
+            "7": 636.0,
+            "8": 542.0,
+            "9": 653.0,
+            "10": 551.0,
+            "11": 681.0,
+            "12": 642.0,
+            "13": 624.0,
+            "14": 658.0,
+            "15": 682.0,
+            "16": 659.0,
+            "17": 620.0,
+            "18": 603.0,
+            "19": 634.0,
+            "20": 639.0,
+            "21": 634.0,
+            "22": 602.0,
+            "23": 731.0,
+            "24": 620.0,
+            "25": 611.0,
+            "26": 626.0,
+            "27": 683.0,
+            "28": 668.0,
+            "29": 713.0,
+            "30": 712.0,
+            "31": 616.0,
+            "32": 786.0,
+            "33": 800.0,
+            "34": 702.0,
+            "35": 684.0,
+            "36": 664.0,
+            "37": 831.0,
+            "38": 802.0,
+            "39": 919.0,
+            "40": 802.0,
+            "41": 791.0,
+            "42": 840.0,
+            "43": 718.0,
+            "44": 756.0,
+            "45": 765.0,
+            "46": 809.0,
+            "47": 839.0,
+            "48": 827.0,
+            "49": 935.0,
+            "50": 839.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0
+            "1": 512786944.0,
+            "2": 512786944.0,
+            "3": 512786944.0,
+            "4": 512786944.0,
+            "5": 512786944.0,
+            "6": 512786944.0,
+            "7": 512786944.0,
+            "8": 512786944.0,
+            "9": 512786944.0,
+            "10": 512786944.0,
+            "11": 512786944.0,
+            "12": 512786944.0,
+            "13": 512786944.0,
+            "14": 512786944.0,
+            "15": 512786944.0,
+            "16": 512786944.0,
+            "17": 512786944.0,
+            "18": 512786944.0,
+            "19": 512786944.0,
+            "20": 512786944.0,
+            "21": 512786944.0,
+            "22": 512786944.0,
+            "23": 512786944.0,
+            "24": 512786944.0,
+            "25": 512786944.0,
+            "26": 512786944.0,
+            "27": 512786944.0,
+            "28": 512786944.0,
+            "29": 512786944.0,
+            "30": 512786944.0,
+            "31": 512786944.0,
+            "32": 512786944.0,
+            "33": 512786944.0,
+            "34": 512786944.0,
+            "35": 512786944.0,
+            "36": 512786944.0,
+            "37": 512786944.0,
+            "38": 512786944.0,
+            "39": 512786944.0,
+            "40": 512786944.0,
+            "41": 512786944.0,
+            "42": 512786944.0,
+            "43": 512786944.0,
+            "44": 512786944.0,
+            "45": 512786944.0,
+            "46": 512786944.0,
+            "47": 512786944.0,
+            "48": 512786944.0,
+            "49": 512786944.0,
+            "50": 512786944.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759895552.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 934201856.0,
-            "34": 934201856.0,
-            "35": 934201856.0,
-            "36": 934201856.0,
-            "37": 934201856.0,
-            "38": 934201856.0,
-            "39": 934201856.0,
-            "40": 934201856.0,
-            "41": 934201856.0,
-            "42": 934201856.0,
-            "43": 934201856.0,
-            "44": 934201856.0,
-            "45": 934201856.0,
-            "46": 934201856.0,
-            "47": 934201856.0,
-            "48": 934201856.0,
-            "49": 934201856.0,
-            "50": 934201856.0
+            "1": 758851072.0,
+            "2": 937350656.0,
+            "3": 937350656.0,
+            "4": 937350656.0,
+            "5": 937350656.0,
+            "6": 937350656.0,
+            "7": 937350656.0,
+            "8": 937350656.0,
+            "9": 937350656.0,
+            "10": 937350656.0,
+            "11": 937350656.0,
+            "12": 937350656.0,
+            "13": 937350656.0,
+            "14": 937350656.0,
+            "15": 937350656.0,
+            "16": 937350656.0,
+            "17": 937350656.0,
+            "18": 937350656.0,
+            "19": 937350656.0,
+            "20": 937350656.0,
+            "21": 937350656.0,
+            "22": 937350656.0,
+            "23": 937350656.0,
+            "24": 937350656.0,
+            "25": 937350656.0,
+            "26": 937350656.0,
+            "27": 937350656.0,
+            "28": 937350656.0,
+            "29": 937350656.0,
+            "30": 937350656.0,
+            "31": 937350656.0,
+            "32": 937350656.0,
+            "33": 937350656.0,
+            "34": 937350656.0,
+            "35": 937350656.0,
+            "36": 937350656.0,
+            "37": 937350656.0,
+            "38": 937350656.0,
+            "39": 937350656.0,
+            "40": 937350656.0,
+            "41": 937350656.0,
+            "42": 937350656.0,
+            "43": 937350656.0,
+            "44": 937350656.0,
+            "45": 937350656.0,
+            "46": 937350656.0,
+            "47": 937350656.0,
+            "48": 937350656.0,
+            "49": 937350656.0,
+            "50": 937350656.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.72917,
-            "2": 0.36269,
-            "3": 0.33585,
-            "4": 0.33878,
-            "5": 0.33758,
-            "6": 0.33453,
-            "7": 0.33628,
-            "8": 0.33416,
-            "9": 0.33309,
-            "10": 0.33521,
-            "11": 0.33536,
-            "12": 0.33148,
-            "13": 0.33565,
-            "14": 0.33401,
-            "15": 0.33029,
-            "16": 0.33788,
-            "17": 0.33302,
-            "18": 0.33337,
-            "19": 0.33761,
-            "20": 0.33672,
-            "21": 0.33256,
-            "22": 0.3374,
-            "23": 0.33652,
-            "24": 0.33672,
-            "25": 0.33982,
-            "26": 0.3335,
-            "27": 0.3328,
-            "28": 0.33835,
-            "29": 0.33338,
-            "30": 0.33371,
-            "31": 0.33991,
-            "32": 0.33259,
-            "33": 0.33537,
-            "34": 0.33777,
-            "35": 0.33494,
-            "36": 0.33504,
-            "37": 0.33915,
-            "38": 0.33462,
-            "39": 0.33387,
-            "40": 0.33791,
-            "41": 0.33426,
-            "42": 0.33834,
-            "43": 0.33785,
-            "44": 0.32761,
-            "45": 0.32857,
-            "46": 0.33205,
-            "47": 0.3355,
-            "48": 0.33535,
-            "49": 0.33792,
-            "50": 0.33613
+            "1": 35.82214,
+            "2": 0.4147,
+            "3": 0.32319,
+            "4": 0.30032,
+            "5": 0.30017,
+            "6": 0.29443,
+            "7": 0.29684,
+            "8": 0.29654,
+            "9": 0.29369,
+            "10": 0.29144,
+            "11": 0.29461,
+            "12": 0.29494,
+            "13": 0.2989,
+            "14": 0.30075,
+            "15": 0.30668,
+            "16": 0.29656,
+            "17": 0.29426,
+            "18": 0.29342,
+            "19": 0.29461,
+            "20": 0.29689,
+            "21": 0.29944,
+            "22": 0.29592,
+            "23": 0.29544,
+            "24": 0.29391,
+            "25": 0.29356,
+            "26": 0.29086,
+            "27": 0.29138,
+            "28": 0.29613,
+            "29": 0.29464,
+            "30": 0.29623,
+            "31": 0.29357,
+            "32": 0.2941,
+            "33": 0.29995,
+            "34": 0.29721,
+            "35": 0.29459,
+            "36": 0.29391,
+            "37": 0.29408,
+            "38": 0.29673,
+            "39": 0.2977,
+            "40": 0.29439,
+            "41": 0.29458,
+            "42": 0.29561,
+            "43": 0.29392,
+            "44": 0.3078,
+            "45": 0.29321,
+            "46": 0.28828,
+            "47": 0.28745,
+            "48": 0.30287,
+            "49": 0.28551,
+            "50": 0.28747
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml
index 6aad7304c19..f45345f9911 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml
@@ -45,7 +45,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..35348e75b0f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84734,
+            "4": 10.85574,
+            "5": 10.86001,
+            "6": 10.8773,
+            "7": 10.86559,
+            "8": 10.84909,
+            "9": 10.86608,
+            "10": 10.82475,
+            "11": 10.85619,
+            "12": 10.85375,
+            "13": 10.86788,
+            "14": 10.87116,
+            "15": 10.82232,
+            "16": 10.79995,
+            "17": 10.77433,
+            "18": 10.78345,
+            "19": 10.79307,
+            "20": 10.68225,
+            "21": 10.64714,
+            "22": 10.50916,
+            "23": 10.66829,
+            "24": 10.54192,
+            "25": 10.49279,
+            "26": 10.55934,
+            "27": 10.54241,
+            "28": 10.51128,
+            "29": 10.53257,
+            "30": 10.28988,
+            "31": 10.02847,
+            "32": 10.38882,
+            "33": 10.39596,
+            "34": 10.13452,
+            "35": 10.18931,
+            "36": 10.13354,
+            "37": 10.27379,
+            "38": 10.10752,
+            "39": 10.34011,
+            "40": 9.98539,
+            "41": 10.06415,
+            "42": 10.13747,
+            "43": 9.73381,
+            "44": 9.86306,
+            "45": 9.73727,
+            "46": 9.71341,
+            "47": 10.07754,
+            "48": 9.76766,
+            "49": 9.41987,
+            "50": 9.81689
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 588.0,
+            "2": 591.0,
+            "3": 656.0,
+            "4": 602.0,
+            "5": 609.0,
+            "6": 600.0,
+            "7": 596.0,
+            "8": 640.0,
+            "9": 653.0,
+            "10": 535.0,
+            "11": 657.0,
+            "12": 620.0,
+            "13": 700.0,
+            "14": 630.0,
+            "15": 628.0,
+            "16": 590.0,
+            "17": 604.0,
+            "18": 566.0,
+            "19": 580.0,
+            "20": 561.0,
+            "21": 643.0,
+            "22": 622.0,
+            "23": 679.0,
+            "24": 611.0,
+            "25": 575.0,
+            "26": 628.0,
+            "27": 640.0,
+            "28": 727.0,
+            "29": 736.0,
+            "30": 729.0,
+            "31": 575.0,
+            "32": 726.0,
+            "33": 773.0,
+            "34": 634.0,
+            "35": 720.0,
+            "36": 690.0,
+            "37": 818.0,
+            "38": 730.0,
+            "39": 754.0,
+            "40": 809.0,
+            "41": 787.0,
+            "42": 849.0,
+            "43": 757.0,
+            "44": 861.0,
+            "45": 825.0,
+            "46": 881.0,
+            "47": 915.0,
+            "48": 846.0,
+            "49": 853.0,
+            "50": 816.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 463110656.0,
+            "2": 463110656.0,
+            "3": 463110656.0,
+            "4": 463110656.0,
+            "5": 463110656.0,
+            "6": 463110656.0,
+            "7": 463110656.0,
+            "8": 463110656.0,
+            "9": 463110656.0,
+            "10": 463110656.0,
+            "11": 463110656.0,
+            "12": 463110656.0,
+            "13": 463110656.0,
+            "14": 463110656.0,
+            "15": 463110656.0,
+            "16": 463110656.0,
+            "17": 463110656.0,
+            "18": 463110656.0,
+            "19": 463110656.0,
+            "20": 463110656.0,
+            "21": 463110656.0,
+            "22": 463110656.0,
+            "23": 463110656.0,
+            "24": 463110656.0,
+            "25": 463110656.0,
+            "26": 463110656.0,
+            "27": 463110656.0,
+            "28": 463110656.0,
+            "29": 463110656.0,
+            "30": 463110656.0,
+            "31": 463110656.0,
+            "32": 463110656.0,
+            "33": 463110656.0,
+            "34": 463110656.0,
+            "35": 463110656.0,
+            "36": 463110656.0,
+            "37": 463110656.0,
+            "38": 463110656.0,
+            "39": 463110656.0,
+            "40": 463110656.0,
+            "41": 463110656.0,
+            "42": 463110656.0,
+            "43": 463110656.0,
+            "44": 463110656.0,
+            "45": 463110656.0,
+            "46": 463110656.0,
+            "47": 463110656.0,
+            "48": 463110656.0,
+            "49": 463110656.0,
+            "50": 463110656.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 704587264.0,
+            "2": 887671808.0,
+            "3": 887671808.0,
+            "4": 887671808.0,
+            "5": 887672320.0,
+            "6": 887672320.0,
+            "7": 887672320.0,
+            "8": 887674368.0,
+            "9": 887674368.0,
+            "10": 887674368.0,
+            "11": 887674368.0,
+            "12": 887674368.0,
+            "13": 887674368.0,
+            "14": 887674368.0,
+            "15": 887674368.0,
+            "16": 887674368.0,
+            "17": 887674368.0,
+            "18": 887674368.0,
+            "19": 887674368.0,
+            "20": 887674368.0,
+            "21": 887674368.0,
+            "22": 887674368.0,
+            "23": 887674368.0,
+            "24": 887674368.0,
+            "25": 887674368.0,
+            "26": 887674368.0,
+            "27": 887674368.0,
+            "28": 887674368.0,
+            "29": 887674368.0,
+            "30": 887674368.0,
+            "31": 887674368.0,
+            "32": 887674368.0,
+            "33": 887674368.0,
+            "34": 887674368.0,
+            "35": 887674368.0,
+            "36": 887674368.0,
+            "37": 887674368.0,
+            "38": 887674368.0,
+            "39": 887674368.0,
+            "40": 887674368.0,
+            "41": 887674368.0,
+            "42": 887674368.0,
+            "43": 887674368.0,
+            "44": 887674368.0,
+            "45": 887674368.0,
+            "46": 887674368.0,
+            "47": 887674368.0,
+            "48": 887674368.0,
+            "49": 887674368.0,
+            "50": 887674368.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.32392,
+            "3": 0.4432,
+            "4": 0.41997,
+            "5": 0.42011,
+            "6": 0.41602,
+            "7": 0.41695,
+            "8": 0.42042,
+            "9": 0.41532,
+            "10": 0.41013,
+            "11": 0.40983,
+            "12": 0.41104,
+            "13": 0.41182,
+            "14": 0.41252,
+            "15": 0.42002,
+            "16": 0.47232,
+            "17": 0.49274,
+            "18": 0.49507,
+            "19": 0.49112,
+            "20": 0.48715,
+            "21": 0.48361,
+            "22": 0.48476,
+            "23": 0.4789,
+            "24": 0.47778,
+            "25": 0.4792,
+            "26": 0.48432,
+            "27": 0.48617,
+            "28": 0.48159,
+            "29": 0.48042,
+            "30": 0.47772,
+            "31": 0.47956,
+            "32": 0.47326,
+            "33": 0.4727,
+            "34": 0.47303,
+            "35": 0.46857,
+            "36": 0.46923,
+            "37": 0.46968,
+            "38": 0.4682,
+            "39": 0.45815,
+            "40": 0.45997,
+            "41": 0.45486,
+            "42": 0.45349,
+            "43": 0.44331,
+            "44": 0.44252,
+            "45": 0.44141,
+            "46": 0.44016,
+            "47": 0.43955,
+            "48": 0.43852,
+            "49": 0.43914,
+            "50": 0.43791
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
index 524007ed7d6..c0f918ce574 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
+            "1": 10.86539,
+            "2": 10.85871,
             "3": 10.86283,
-            "4": 10.84007,
-            "5": 10.87854,
+            "4": 10.84012,
+            "5": 10.87855,
             "6": 10.88853,
             "7": 10.86532,
             "8": 10.8602,
-            "9": 10.85991,
+            "9": 10.85989,
             "10": 10.82981,
-            "11": 10.8895,
+            "11": 10.88943,
             "12": 10.87507,
-            "13": 10.87426,
-            "14": 10.89678,
+            "13": 10.87423,
+            "14": 10.89674,
             "15": 10.82054,
-            "16": 10.825,
-            "17": 10.7898,
+            "16": 10.82502,
+            "17": 10.78984,
             "18": 10.8103,
-            "19": 10.80536,
-            "20": 10.70398,
-            "21": 10.66992,
+            "19": 10.80531,
+            "20": 10.70393,
+            "21": 10.66989,
             "22": 10.50644,
-            "23": 10.69005,
-            "24": 10.5631,
-            "25": 10.49418,
-            "26": 10.56626,
-            "27": 10.58028,
+            "23": 10.69001,
+            "24": 10.56313,
+            "25": 10.49417,
+            "26": 10.56631,
+            "27": 10.58022,
             "28": 10.51572,
-            "29": 10.55298,
-            "30": 10.30549,
-            "31": 10.02244,
-            "32": 10.40615,
-            "33": 10.3988,
-            "34": 10.13773,
-            "35": 10.20188,
-            "36": 10.1605,
-            "37": 10.28974,
-            "38": 10.11477,
-            "39": 10.36102,
-            "40": 10.01902,
-            "41": 10.07292,
-            "42": 10.14694,
-            "43": 9.74685,
-            "44": 9.87766,
-            "45": 9.74965,
+            "29": 10.55301,
+            "30": 10.3055,
+            "31": 10.02252,
+            "32": 10.40617,
+            "33": 10.39877,
+            "34": 10.13772,
+            "35": 10.20187,
+            "36": 10.16046,
+            "37": 10.28973,
+            "38": 10.11479,
+            "39": 10.36106,
+            "40": 10.01901,
+            "41": 10.07289,
+            "42": 10.14697,
+            "43": 9.7469,
+            "44": 9.87759,
+            "45": 9.74964,
             "46": 9.73384,
-            "47": 10.07535,
-            "48": 9.7807,
-            "49": 9.44783,
-            "50": 9.83991
+            "47": 10.07538,
+            "48": 9.78069,
+            "49": 9.44785,
+            "50": 9.83992
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 647.0,
-            "2": 614.0,
-            "3": 640.0,
-            "4": 603.0,
-            "5": 600.0,
-            "6": 683.0,
-            "7": 630.0,
-            "8": 565.0,
-            "9": 671.0,
-            "10": 531.0,
-            "11": 670.0,
-            "12": 643.0,
-            "13": 626.0,
-            "14": 635.0,
-            "15": 655.0,
-            "16": 643.0,
-            "17": 693.0,
-            "18": 634.0,
-            "19": 648.0,
-            "20": 644.0,
-            "21": 690.0,
-            "22": 606.0,
-            "23": 694.0,
-            "24": 565.0,
-            "25": 605.0,
-            "26": 636.0,
-            "27": 638.0,
-            "28": 721.0,
-            "29": 750.0,
-            "30": 760.0,
-            "31": 572.0,
-            "32": 705.0,
-            "33": 816.0,
+            "1": 606.0,
+            "2": 636.0,
+            "3": 628.0,
+            "4": 620.0,
+            "5": 632.0,
+            "6": 688.0,
+            "7": 638.0,
+            "8": 601.0,
+            "9": 637.0,
+            "10": 557.0,
+            "11": 644.0,
+            "12": 665.0,
+            "13": 708.0,
+            "14": 658.0,
+            "15": 666.0,
+            "16": 635.0,
+            "17": 712.0,
+            "18": 614.0,
+            "19": 652.0,
+            "20": 627.0,
+            "21": 674.0,
+            "22": 610.0,
+            "23": 760.0,
+            "24": 590.0,
+            "25": 611.0,
+            "26": 637.0,
+            "27": 660.0,
+            "28": 752.0,
+            "29": 735.0,
+            "30": 659.0,
+            "31": 603.0,
+            "32": 791.0,
+            "33": 800.0,
             "34": 737.0,
-            "35": 720.0,
-            "36": 710.0,
-            "37": 862.0,
-            "38": 763.0,
-            "39": 909.0,
-            "40": 795.0,
-            "41": 776.0,
-            "42": 858.0,
-            "43": 771.0,
-            "44": 858.0,
-            "45": 857.0,
-            "46": 864.0,
-            "47": 880.0,
-            "48": 923.0,
-            "49": 899.0,
-            "50": 868.0
+            "35": 738.0,
+            "36": 732.0,
+            "37": 858.0,
+            "38": 799.0,
+            "39": 870.0,
+            "40": 821.0,
+            "41": 788.0,
+            "42": 865.0,
+            "43": 704.0,
+            "44": 761.0,
+            "45": 822.0,
+            "46": 846.0,
+            "47": 871.0,
+            "48": 883.0,
+            "49": 883.0,
+            "50": 857.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0
+            "1": 512786944.0,
+            "2": 512786944.0,
+            "3": 512786944.0,
+            "4": 512786944.0,
+            "5": 512786944.0,
+            "6": 512786944.0,
+            "7": 512786944.0,
+            "8": 512786944.0,
+            "9": 512786944.0,
+            "10": 512786944.0,
+            "11": 512786944.0,
+            "12": 512786944.0,
+            "13": 512786944.0,
+            "14": 512786944.0,
+            "15": 512786944.0,
+            "16": 512786944.0,
+            "17": 512786944.0,
+            "18": 512786944.0,
+            "19": 512786944.0,
+            "20": 512786944.0,
+            "21": 512786944.0,
+            "22": 512786944.0,
+            "23": 512786944.0,
+            "24": 512786944.0,
+            "25": 512786944.0,
+            "26": 512786944.0,
+            "27": 512786944.0,
+            "28": 512786944.0,
+            "29": 512786944.0,
+            "30": 512786944.0,
+            "31": 512786944.0,
+            "32": 512786944.0,
+            "33": 512786944.0,
+            "34": 512786944.0,
+            "35": 512786944.0,
+            "36": 512786944.0,
+            "37": 512786944.0,
+            "38": 512786944.0,
+            "39": 512786944.0,
+            "40": 512786944.0,
+            "41": 512786944.0,
+            "42": 512786944.0,
+            "43": 512786944.0,
+            "44": 512786944.0,
+            "45": 512786944.0,
+            "46": 512786944.0,
+            "47": 512786944.0,
+            "48": 512786944.0,
+            "49": 512786944.0,
+            "50": 512786944.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 757801472.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0
+            "1": 758851072.0,
+            "2": 936302080.0,
+            "3": 936302080.0,
+            "4": 936302080.0,
+            "5": 936302080.0,
+            "6": 936302080.0,
+            "7": 937349632.0,
+            "8": 937349632.0,
+            "9": 937349632.0,
+            "10": 937349632.0,
+            "11": 937349632.0,
+            "12": 937350656.0,
+            "13": 937350656.0,
+            "14": 937350656.0,
+            "15": 937350656.0,
+            "16": 937350656.0,
+            "17": 937350656.0,
+            "18": 937350656.0,
+            "19": 937350656.0,
+            "20": 937350656.0,
+            "21": 937350656.0,
+            "22": 937350656.0,
+            "23": 937350656.0,
+            "24": 937350656.0,
+            "25": 937350656.0,
+            "26": 937350656.0,
+            "27": 937350656.0,
+            "28": 937350656.0,
+            "29": 937350656.0,
+            "30": 937350656.0,
+            "31": 937350656.0,
+            "32": 937350656.0,
+            "33": 937350656.0,
+            "34": 937350656.0,
+            "35": 937350656.0,
+            "36": 937350656.0,
+            "37": 937350656.0,
+            "38": 937350656.0,
+            "39": 937350656.0,
+            "40": 937350656.0,
+            "41": 937350656.0,
+            "42": 937350656.0,
+            "43": 937350656.0,
+            "44": 937350656.0,
+            "45": 937350656.0,
+            "46": 937350656.0,
+            "47": 937350656.0,
+            "48": 937350656.0,
+            "49": 937350656.0,
+            "50": 937350656.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.58309,
-            "2": 0.34736,
-            "3": 0.32683,
-            "4": 0.3279,
-            "5": 0.32934,
-            "6": 0.33179,
-            "7": 0.3281,
-            "8": 0.3324,
-            "9": 0.32989,
-            "10": 0.32742,
-            "11": 0.33009,
-            "12": 0.3345,
-            "13": 0.33455,
-            "14": 0.3346,
-            "15": 0.33747,
-            "16": 0.33625,
-            "17": 0.3454,
-            "18": 0.33586,
-            "19": 0.33227,
-            "20": 0.33242,
-            "21": 0.33093,
-            "22": 0.33378,
-            "23": 0.33439,
-            "24": 0.33159,
-            "25": 0.32826,
-            "26": 0.33259,
-            "27": 0.33154,
-            "28": 0.32855,
-            "29": 0.32973,
-            "30": 0.33267,
-            "31": 0.33156,
-            "32": 0.32832,
-            "33": 0.33304,
-            "34": 0.32817,
-            "35": 0.32993,
-            "36": 0.33154,
-            "37": 0.32842,
-            "38": 0.32508,
-            "39": 0.33067,
-            "40": 0.33115,
-            "41": 0.32719,
-            "42": 0.33205,
-            "43": 0.3472,
-            "44": 0.33564,
-            "45": 0.33202,
-            "46": 0.33051,
-            "47": 0.32871,
-            "48": 0.33055,
-            "49": 0.33399,
-            "50": 0.33114
+            "1": 36.51522,
+            "2": 0.33765,
+            "3": 0.3066,
+            "4": 0.28763,
+            "5": 0.29777,
+            "6": 0.28582,
+            "7": 0.28832,
+            "8": 0.2868,
+            "9": 0.28478,
+            "10": 0.28471,
+            "11": 0.2819,
+            "12": 0.28335,
+            "13": 0.2836,
+            "14": 0.28168,
+            "15": 0.28103,
+            "16": 0.28016,
+            "17": 0.28046,
+            "18": 0.27976,
+            "19": 0.28362,
+            "20": 0.28005,
+            "21": 0.32339,
+            "22": 0.32249,
+            "23": 0.28055,
+            "24": 0.28159,
+            "25": 0.27999,
+            "26": 0.28072,
+            "27": 0.28355,
+            "28": 0.28084,
+            "29": 0.28109,
+            "30": 0.28649,
+            "31": 0.28181,
+            "32": 0.28256,
+            "33": 0.28162,
+            "34": 0.2786,
+            "35": 0.27925,
+            "36": 0.2774,
+            "37": 0.27817,
+            "38": 0.28082,
+            "39": 0.27778,
+            "40": 0.27826,
+            "41": 0.27788,
+            "42": 0.27618,
+            "43": 0.28026,
+            "44": 0.27755,
+            "45": 0.27871,
+            "46": 0.27725,
+            "47": 0.27974,
+            "48": 0.29559,
+            "49": 0.28231,
+            "50": 0.28057
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
index e60c6b8950b..227d76695c3 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92968, "10": 10.90471, "15": 10.87119, "20": 10.74996, "25": 10.53752, "30": 10.32551, "35": 10.22893, "40": 10.01972, "45": 9.75543, "50": 9.8407}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 585.0, "5": 676.0, "10": 542.0, "15": 625.0, "20": 553.0, "25": 595.0, "30": 748.0, "35": 665.0, "40": 762.0, "45": 757.0, "50": 856.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 432177152.0, "5": 432177152.0, "10": 432177152.0, "15": 432177152.0, "20": 432177152.0, "25": 432177152.0, "30": 432177152.0, "35": 432177152.0, "40": 432177152.0, "45": 432177152.0, "50": 432177152.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 677334528.0, "5": 856228864.0, "10": 856228864.0, "15": 856228864.0, "20": 856228864.0, "25": 856228864.0, "30": 856228864.0, "35": 856228864.0, "40": 856228864.0, "45": 856228864.0, "50": 856228864.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22.20877, "5": 0.40055, "10": 0.40235, "15": 0.40045, "20": 0.39406, "25": 0.39764, "30": 0.39555, "35": 0.39211, "40": 0.38588, "45": 0.38484, "50": 0.38002}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93295,
+            "2": 10.93424,
+            "3": 10.91344,
+            "4": 10.90324,
+            "5": 10.92971,
+            "6": 10.93653,
+            "7": 10.90278,
+            "8": 10.92115,
+            "9": 10.90706,
+            "10": 10.90471,
+            "11": 10.88787,
+            "12": 10.91736,
+            "13": 10.91188,
+            "14": 10.91505,
+            "15": 10.87122,
+            "16": 10.86124,
+            "17": 10.82702,
+            "18": 10.85679,
+            "19": 10.84058,
+            "20": 10.75,
+            "21": 10.71507,
+            "22": 10.58119,
+            "23": 10.72644,
+            "24": 10.60726,
+            "25": 10.53754,
+            "26": 10.61067,
+            "27": 10.59932,
+            "28": 10.54957,
+            "29": 10.566,
+            "30": 10.3255,
+            "31": 10.067,
+            "32": 10.43808,
+            "33": 10.4236,
+            "34": 10.16018,
+            "35": 10.2289,
+            "36": 10.17613,
+            "37": 10.29237,
+            "38": 10.13293,
+            "39": 10.34957,
+            "40": 10.01976,
+            "41": 10.07533,
+            "42": 10.15411,
+            "43": 9.7609,
+            "44": 9.88358,
+            "45": 9.75546,
+            "46": 9.74964,
+            "47": 10.07547,
+            "48": 9.77936,
+            "49": 9.43821,
+            "50": 9.84068
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 605.0,
+            "2": 625.0,
+            "3": 652.0,
+            "4": 624.0,
+            "5": 663.0,
+            "6": 613.0,
+            "7": 650.0,
+            "8": 610.0,
+            "9": 675.0,
+            "10": 560.0,
+            "11": 630.0,
+            "12": 603.0,
+            "13": 667.0,
+            "14": 652.0,
+            "15": 625.0,
+            "16": 621.0,
+            "17": 588.0,
+            "18": 591.0,
+            "19": 599.0,
+            "20": 599.0,
+            "21": 617.0,
+            "22": 566.0,
+            "23": 696.0,
+            "24": 619.0,
+            "25": 539.0,
+            "26": 564.0,
+            "27": 645.0,
+            "28": 745.0,
+            "29": 738.0,
+            "30": 668.0,
+            "31": 596.0,
+            "32": 698.0,
+            "33": 722.0,
+            "34": 651.0,
+            "35": 705.0,
+            "36": 710.0,
+            "37": 783.0,
+            "38": 773.0,
+            "39": 913.0,
+            "40": 772.0,
+            "41": 813.0,
+            "42": 799.0,
+            "43": 683.0,
+            "44": 769.0,
+            "45": 784.0,
+            "46": 820.0,
+            "47": 874.0,
+            "48": 885.0,
+            "49": 814.0,
+            "50": 840.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 431522304.0,
+            "2": 431522304.0,
+            "3": 431522304.0,
+            "4": 431522304.0,
+            "5": 431522304.0,
+            "6": 431522304.0,
+            "7": 431522304.0,
+            "8": 431522304.0,
+            "9": 431522304.0,
+            "10": 431522304.0,
+            "11": 431522304.0,
+            "12": 431522304.0,
+            "13": 431522304.0,
+            "14": 431522304.0,
+            "15": 431522304.0,
+            "16": 431522304.0,
+            "17": 431522304.0,
+            "18": 431522304.0,
+            "19": 431522304.0,
+            "20": 431522304.0,
+            "21": 431522304.0,
+            "22": 431522304.0,
+            "23": 431522304.0,
+            "24": 431522304.0,
+            "25": 431522304.0,
+            "26": 431522304.0,
+            "27": 431522304.0,
+            "28": 431522304.0,
+            "29": 431522304.0,
+            "30": 431522304.0,
+            "31": 431522304.0,
+            "32": 431522304.0,
+            "33": 431522304.0,
+            "34": 431522304.0,
+            "35": 431522304.0,
+            "36": 431522304.0,
+            "37": 431522304.0,
+            "38": 431522304.0,
+            "39": 431522304.0,
+            "40": 431522304.0,
+            "41": 431522304.0,
+            "42": 431522304.0,
+            "43": 431522304.0,
+            "44": 431522304.0,
+            "45": 431522304.0,
+            "46": 431522304.0,
+            "47": 431522304.0,
+            "48": 431522304.0,
+            "49": 431522304.0,
+            "50": 431522304.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 676274688.0,
+            "2": 861328896.0,
+            "3": 861328896.0,
+            "4": 861328896.0,
+            "5": 861328896.0,
+            "6": 861328896.0,
+            "7": 861328896.0,
+            "8": 861328896.0,
+            "9": 861328896.0,
+            "10": 861328896.0,
+            "11": 861328896.0,
+            "12": 861328896.0,
+            "13": 861328896.0,
+            "14": 861328896.0,
+            "15": 861328896.0,
+            "16": 861328896.0,
+            "17": 861328896.0,
+            "18": 861328896.0,
+            "19": 861328896.0,
+            "20": 861328896.0,
+            "21": 861328896.0,
+            "22": 861328896.0,
+            "23": 861328896.0,
+            "24": 861328896.0,
+            "25": 861328896.0,
+            "26": 861328896.0,
+            "27": 861328896.0,
+            "28": 861328896.0,
+            "29": 861328896.0,
+            "30": 861328896.0,
+            "31": 861328896.0,
+            "32": 861328896.0,
+            "33": 861328896.0,
+            "34": 861328896.0,
+            "35": 861328896.0,
+            "36": 861328896.0,
+            "37": 861328896.0,
+            "38": 861328896.0,
+            "39": 861328896.0,
+            "40": 861328896.0,
+            "41": 861328896.0,
+            "42": 861328896.0,
+            "43": 861328896.0,
+            "44": 861328896.0,
+            "45": 861328896.0,
+            "46": 861328896.0,
+            "47": 861328896.0,
+            "48": 861328896.0,
+            "49": 861328896.0,
+            "50": 861328896.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 17.98339,
+            "2": 0.51543,
+            "3": 0.43144,
+            "4": 0.41368,
+            "5": 0.41459,
+            "6": 0.42035,
+            "7": 0.41166,
+            "8": 0.41088,
+            "9": 0.40219,
+            "10": 0.39929,
+            "11": 0.40001,
+            "12": 0.40539,
+            "13": 0.40407,
+            "14": 0.40122,
+            "15": 0.40141,
+            "16": 0.39925,
+            "17": 0.4019,
+            "18": 0.40627,
+            "19": 0.40221,
+            "20": 0.40001,
+            "21": 0.40901,
+            "22": 0.40318,
+            "23": 0.40162,
+            "24": 0.40025,
+            "25": 0.405,
+            "26": 0.40173,
+            "27": 0.40154,
+            "28": 0.40124,
+            "29": 0.39975,
+            "30": 0.39939,
+            "31": 0.39959,
+            "32": 0.40081,
+            "33": 0.40069,
+            "34": 0.40167,
+            "35": 0.40089,
+            "36": 0.4008,
+            "37": 0.40204,
+            "38": 0.39997,
+            "39": 0.40129,
+            "40": 0.40009,
+            "41": 0.40125,
+            "42": 0.40029,
+            "43": 0.4015,
+            "44": 0.40069,
+            "45": 0.40137,
+            "46": 0.40258,
+            "47": 0.40025,
+            "48": 0.39925,
+            "49": 0.39977,
+            "50": 0.39869
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..08b446921f5
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84734,
+            "4": 10.85571,
+            "5": 10.86005,
+            "6": 10.87729,
+            "7": 10.86561,
+            "8": 10.84911,
+            "9": 10.86607,
+            "10": 10.82477,
+            "11": 10.85618,
+            "12": 10.85372,
+            "13": 10.8679,
+            "14": 10.87115,
+            "15": 10.82234,
+            "16": 10.79992,
+            "17": 10.7743,
+            "18": 10.78345,
+            "19": 10.79307,
+            "20": 10.68226,
+            "21": 10.6471,
+            "22": 10.50919,
+            "23": 10.66829,
+            "24": 10.54194,
+            "25": 10.49284,
+            "26": 10.55935,
+            "27": 10.54236,
+            "28": 10.51129,
+            "29": 10.53259,
+            "30": 10.28991,
+            "31": 10.02854,
+            "32": 10.38881,
+            "33": 10.39595,
+            "34": 10.13448,
+            "35": 10.18931,
+            "36": 10.13349,
+            "37": 10.2738,
+            "38": 10.10751,
+            "39": 10.34007,
+            "40": 9.98538,
+            "41": 10.06415,
+            "42": 10.13746,
+            "43": 9.73384,
+            "44": 9.86303,
+            "45": 9.73723,
+            "46": 9.71344,
+            "47": 10.07753,
+            "48": 9.76766,
+            "49": 9.4199,
+            "50": 9.8169
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 561.0,
+            "2": 574.0,
+            "3": 615.0,
+            "4": 612.0,
+            "5": 664.0,
+            "6": 648.0,
+            "7": 593.0,
+            "8": 587.0,
+            "9": 622.0,
+            "10": 528.0,
+            "11": 652.0,
+            "12": 592.0,
+            "13": 640.0,
+            "14": 634.0,
+            "15": 646.0,
+            "16": 666.0,
+            "17": 596.0,
+            "18": 617.0,
+            "19": 635.0,
+            "20": 582.0,
+            "21": 698.0,
+            "22": 608.0,
+            "23": 643.0,
+            "24": 645.0,
+            "25": 584.0,
+            "26": 619.0,
+            "27": 669.0,
+            "28": 702.0,
+            "29": 714.0,
+            "30": 683.0,
+            "31": 604.0,
+            "32": 722.0,
+            "33": 758.0,
+            "34": 674.0,
+            "35": 705.0,
+            "36": 782.0,
+            "37": 828.0,
+            "38": 796.0,
+            "39": 884.0,
+            "40": 832.0,
+            "41": 821.0,
+            "42": 813.0,
+            "43": 749.0,
+            "44": 856.0,
+            "45": 792.0,
+            "46": 774.0,
+            "47": 914.0,
+            "48": 832.0,
+            "49": 821.0,
+            "50": 871.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 460882432.0,
+            "2": 460882432.0,
+            "3": 460882432.0,
+            "4": 460882432.0,
+            "5": 460882432.0,
+            "6": 460882432.0,
+            "7": 460882432.0,
+            "8": 460882432.0,
+            "9": 460882432.0,
+            "10": 460882432.0,
+            "11": 460882432.0,
+            "12": 460882432.0,
+            "13": 460882432.0,
+            "14": 460882432.0,
+            "15": 460882432.0,
+            "16": 460882432.0,
+            "17": 460882432.0,
+            "18": 460882432.0,
+            "19": 460882432.0,
+            "20": 460882432.0,
+            "21": 460882432.0,
+            "22": 460882432.0,
+            "23": 460882432.0,
+            "24": 460882432.0,
+            "25": 460882432.0,
+            "26": 460882432.0,
+            "27": 460882432.0,
+            "28": 460882432.0,
+            "29": 460882432.0,
+            "30": 460882432.0,
+            "31": 460882432.0,
+            "32": 460882432.0,
+            "33": 460882432.0,
+            "34": 460882432.0,
+            "35": 460882432.0,
+            "36": 460882432.0,
+            "37": 460882432.0,
+            "38": 460882432.0,
+            "39": 460882432.0,
+            "40": 460882432.0,
+            "41": 460882432.0,
+            "42": 460882432.0,
+            "43": 460882432.0,
+            "44": 460882432.0,
+            "45": 460882432.0,
+            "46": 460882432.0,
+            "47": 460882432.0,
+            "48": 460882432.0,
+            "49": 460882432.0,
+            "50": 460882432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 705635328.0,
+            "2": 883348992.0,
+            "3": 884397568.0,
+            "4": 885445632.0,
+            "5": 885445632.0,
+            "6": 885445632.0,
+            "7": 885445632.0,
+            "8": 885445632.0,
+            "9": 885445632.0,
+            "10": 885445632.0,
+            "11": 885445632.0,
+            "12": 885445632.0,
+            "13": 885445632.0,
+            "14": 885445632.0,
+            "15": 885445632.0,
+            "16": 885445632.0,
+            "17": 885445632.0,
+            "18": 885445632.0,
+            "19": 885445632.0,
+            "20": 885445632.0,
+            "21": 885445632.0,
+            "22": 885445632.0,
+            "23": 885446144.0,
+            "24": 885446144.0,
+            "25": 885446144.0,
+            "26": 885446144.0,
+            "27": 885446144.0,
+            "28": 885446144.0,
+            "29": 885446144.0,
+            "30": 885446144.0,
+            "31": 885446144.0,
+            "32": 885446144.0,
+            "33": 885446144.0,
+            "34": 885446144.0,
+            "35": 885446144.0,
+            "36": 886492672.0,
+            "37": 886492672.0,
+            "38": 886492672.0,
+            "39": 886492672.0,
+            "40": 886492672.0,
+            "41": 886492672.0,
+            "42": 886492672.0,
+            "43": 886492672.0,
+            "44": 886492672.0,
+            "45": 886492672.0,
+            "46": 886492672.0,
+            "47": 886492672.0,
+            "48": 886492672.0,
+            "49": 886493696.0,
+            "50": 886493696.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.84499,
+            "3": 0.50052,
+            "4": 0.45861,
+            "5": 0.44336,
+            "6": 0.44062,
+            "7": 0.43954,
+            "8": 0.44061,
+            "9": 0.44129,
+            "10": 0.44028,
+            "11": 0.44106,
+            "12": 0.4893,
+            "13": 0.49424,
+            "14": 0.49729,
+            "15": 0.4969,
+            "16": 0.49673,
+            "17": 0.49876,
+            "18": 0.4992,
+            "19": 0.49565,
+            "20": 0.48635,
+            "21": 0.46659,
+            "22": 0.45563,
+            "23": 0.44842,
+            "24": 0.4425,
+            "25": 0.44486,
+            "26": 0.43654,
+            "27": 0.43626,
+            "28": 0.43493,
+            "29": 0.43571,
+            "30": 0.43296,
+            "31": 0.4336,
+            "32": 0.43346,
+            "33": 0.45798,
+            "34": 0.47046,
+            "35": 0.47986,
+            "36": 0.48443,
+            "37": 0.48862,
+            "38": 0.48621,
+            "39": 0.48674,
+            "40": 0.48663,
+            "41": 0.48915,
+            "42": 0.48901,
+            "43": 0.4567,
+            "44": 0.46536,
+            "45": 0.47673,
+            "46": 0.48141,
+            "47": 0.48283,
+            "48": 0.4896,
+            "49": 0.48736,
+            "50": 0.50085
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json
index fb8e93ed571..f31eb533b69 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86285,
-            "4": 10.84011,
-            "5": 10.87856,
-            "6": 10.88852,
-            "7": 10.86536,
-            "8": 10.86016,
-            "9": 10.85989,
-            "10": 10.82982,
-            "11": 10.88947,
-            "12": 10.8751,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.8628,
+            "4": 10.84012,
+            "5": 10.87852,
+            "6": 10.88851,
+            "7": 10.86537,
+            "8": 10.86019,
+            "9": 10.85987,
+            "10": 10.82981,
+            "11": 10.88948,
+            "12": 10.87505,
             "13": 10.87425,
-            "14": 10.89675,
-            "15": 10.82051,
-            "16": 10.82498,
-            "17": 10.78982,
-            "18": 10.81029,
-            "19": 10.80533,
-            "20": 10.70397,
-            "21": 10.66991,
-            "22": 10.50644,
-            "23": 10.69004,
-            "24": 10.56312,
+            "14": 10.89676,
+            "15": 10.82055,
+            "16": 10.82497,
+            "17": 10.78983,
+            "18": 10.81028,
+            "19": 10.80534,
+            "20": 10.70396,
+            "21": 10.6699,
+            "22": 10.50646,
+            "23": 10.69008,
+            "24": 10.56313,
             "25": 10.49421,
-            "26": 10.56627,
-            "27": 10.58027,
+            "26": 10.56629,
+            "27": 10.58025,
             "28": 10.51573,
-            "29": 10.553,
-            "30": 10.30549,
-            "31": 10.02248,
-            "32": 10.40616,
-            "33": 10.39874,
-            "34": 10.13771,
+            "29": 10.55296,
+            "30": 10.30548,
+            "31": 10.02246,
+            "32": 10.40617,
+            "33": 10.39878,
+            "34": 10.13774,
             "35": 10.20187,
-            "36": 10.16049,
-            "37": 10.28975,
-            "38": 10.11483,
-            "39": 10.36101,
-            "40": 10.01902,
-            "41": 10.07289,
+            "36": 10.1605,
+            "37": 10.28973,
+            "38": 10.1148,
+            "39": 10.36099,
+            "40": 10.01904,
+            "41": 10.07292,
             "42": 10.14695,
-            "43": 9.74689,
-            "44": 9.87763,
-            "45": 9.74967,
-            "46": 9.73381,
-            "47": 10.07535,
-            "48": 9.78068,
-            "49": 9.44781,
-            "50": 9.8399
+            "43": 9.74685,
+            "44": 9.8776,
+            "45": 9.74964,
+            "46": 9.73384,
+            "47": 10.07538,
+            "48": 9.7807,
+            "49": 9.44782,
+            "50": 9.83987
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 625.0,
-            "2": 644.0,
-            "3": 614.0,
-            "4": 636.0,
-            "5": 605.0,
-            "6": 649.0,
-            "7": 606.0,
-            "8": 559.0,
-            "9": 658.0,
-            "10": 524.0,
-            "11": 693.0,
-            "12": 598.0,
-            "13": 702.0,
-            "14": 660.0,
-            "15": 638.0,
-            "16": 596.0,
-            "17": 662.0,
-            "18": 586.0,
-            "19": 594.0,
-            "20": 598.0,
-            "21": 656.0,
-            "22": 608.0,
-            "23": 706.0,
-            "24": 609.0,
-            "25": 610.0,
-            "26": 632.0,
-            "27": 664.0,
-            "28": 766.0,
-            "29": 765.0,
-            "30": 755.0,
-            "31": 606.0,
-            "32": 708.0,
-            "33": 775.0,
-            "34": 735.0,
-            "35": 729.0,
-            "36": 739.0,
-            "37": 840.0,
-            "38": 749.0,
-            "39": 911.0,
-            "40": 763.0,
-            "41": 830.0,
-            "42": 835.0,
-            "43": 755.0,
-            "44": 823.0,
-            "45": 799.0,
-            "46": 811.0,
-            "47": 869.0,
-            "48": 839.0,
-            "49": 897.0,
-            "50": 869.0
+            "1": 605.0,
+            "2": 642.0,
+            "3": 634.0,
+            "4": 637.0,
+            "5": 630.0,
+            "6": 692.0,
+            "7": 692.0,
+            "8": 551.0,
+            "9": 638.0,
+            "10": 549.0,
+            "11": 666.0,
+            "12": 644.0,
+            "13": 631.0,
+            "14": 639.0,
+            "15": 636.0,
+            "16": 669.0,
+            "17": 676.0,
+            "18": 635.0,
+            "19": 613.0,
+            "20": 637.0,
+            "21": 631.0,
+            "22": 588.0,
+            "23": 784.0,
+            "24": 596.0,
+            "25": 572.0,
+            "26": 619.0,
+            "27": 717.0,
+            "28": 725.0,
+            "29": 775.0,
+            "30": 722.0,
+            "31": 613.0,
+            "32": 737.0,
+            "33": 823.0,
+            "34": 699.0,
+            "35": 720.0,
+            "36": 702.0,
+            "37": 843.0,
+            "38": 826.0,
+            "39": 854.0,
+            "40": 764.0,
+            "41": 834.0,
+            "42": 820.0,
+            "43": 744.0,
+            "44": 840.0,
+            "45": 788.0,
+            "46": 798.0,
+            "47": 863.0,
+            "48": 888.0,
+            "49": 867.0,
+            "50": 814.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0
+            "1": 512786944.0,
+            "2": 512786944.0,
+            "3": 512786944.0,
+            "4": 512786944.0,
+            "5": 512786944.0,
+            "6": 512786944.0,
+            "7": 512786944.0,
+            "8": 512786944.0,
+            "9": 512786944.0,
+            "10": 512786944.0,
+            "11": 512786944.0,
+            "12": 512786944.0,
+            "13": 512786944.0,
+            "14": 512786944.0,
+            "15": 512786944.0,
+            "16": 512786944.0,
+            "17": 512786944.0,
+            "18": 512786944.0,
+            "19": 512786944.0,
+            "20": 512786944.0,
+            "21": 512786944.0,
+            "22": 512786944.0,
+            "23": 512786944.0,
+            "24": 512786944.0,
+            "25": 512786944.0,
+            "26": 512786944.0,
+            "27": 512786944.0,
+            "28": 512786944.0,
+            "29": 512786944.0,
+            "30": 512786944.0,
+            "31": 512786944.0,
+            "32": 512786944.0,
+            "33": 512786944.0,
+            "34": 512786944.0,
+            "35": 512786944.0,
+            "36": 512786944.0,
+            "37": 512786944.0,
+            "38": 512786944.0,
+            "39": 512786944.0,
+            "40": 512786944.0,
+            "41": 512786944.0,
+            "42": 512786944.0,
+            "43": 512786944.0,
+            "44": 512786944.0,
+            "45": 512786944.0,
+            "46": 512786944.0,
+            "47": 512786944.0,
+            "48": 512786944.0,
+            "49": 512786944.0,
+            "50": 512786944.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 759898624.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 934202368.0,
-            "5": 934202368.0,
-            "6": 934202368.0,
-            "7": 934202368.0,
-            "8": 934202368.0,
-            "9": 934202368.0,
-            "10": 934202368.0,
-            "11": 934202368.0,
-            "12": 934202368.0,
-            "13": 934202368.0,
-            "14": 934202368.0,
-            "15": 934202368.0,
-            "16": 934202368.0,
-            "17": 934202368.0,
-            "18": 934202368.0,
-            "19": 934202368.0,
-            "20": 934202368.0,
-            "21": 934202368.0,
-            "22": 934202368.0,
-            "23": 934202368.0,
-            "24": 934202368.0,
-            "25": 934202368.0,
-            "26": 934202368.0,
-            "27": 934202368.0,
-            "28": 934202368.0,
-            "29": 934202368.0,
-            "30": 934202368.0,
-            "31": 934202368.0,
-            "32": 934202368.0,
-            "33": 934202368.0,
-            "34": 934202368.0,
-            "35": 934202368.0,
-            "36": 934202368.0,
-            "37": 934202368.0,
-            "38": 934202368.0,
-            "39": 934202368.0,
-            "40": 934202368.0,
-            "41": 934202368.0,
-            "42": 934202368.0,
-            "43": 934202368.0,
-            "44": 934202368.0,
-            "45": 934202368.0,
-            "46": 934202368.0,
-            "47": 934202368.0,
-            "48": 934202368.0,
-            "49": 934202368.0,
-            "50": 934202368.0
+            "1": 758851072.0,
+            "2": 937350144.0,
+            "3": 937350656.0,
+            "4": 937350656.0,
+            "5": 937350656.0,
+            "6": 937350656.0,
+            "7": 937350656.0,
+            "8": 937350656.0,
+            "9": 937350656.0,
+            "10": 937350656.0,
+            "11": 937350656.0,
+            "12": 937350656.0,
+            "13": 937350656.0,
+            "14": 937350656.0,
+            "15": 937350656.0,
+            "16": 937350656.0,
+            "17": 937350656.0,
+            "18": 937350656.0,
+            "19": 937350656.0,
+            "20": 937350656.0,
+            "21": 937350656.0,
+            "22": 937350656.0,
+            "23": 937350656.0,
+            "24": 937350656.0,
+            "25": 937350656.0,
+            "26": 937350656.0,
+            "27": 937350656.0,
+            "28": 937350656.0,
+            "29": 937350656.0,
+            "30": 937350656.0,
+            "31": 937350656.0,
+            "32": 937350656.0,
+            "33": 937350656.0,
+            "34": 937350656.0,
+            "35": 937350656.0,
+            "36": 937350656.0,
+            "37": 937350656.0,
+            "38": 937350656.0,
+            "39": 937350656.0,
+            "40": 937350656.0,
+            "41": 937350656.0,
+            "42": 937350656.0,
+            "43": 937350656.0,
+            "44": 937350656.0,
+            "45": 937350656.0,
+            "46": 937350656.0,
+            "47": 937350656.0,
+            "48": 937350656.0,
+            "49": 937350656.0,
+            "50": 937350656.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 15.91359,
-            "2": 0.40136,
-            "3": 0.32913,
-            "4": 0.33946,
-            "5": 0.32404,
-            "6": 0.31963,
-            "7": 0.32283,
-            "8": 0.32302,
-            "9": 0.32004,
-            "10": 0.32058,
-            "11": 0.33128,
-            "12": 0.32725,
-            "13": 0.3253,
-            "14": 0.32532,
-            "15": 0.32194,
-            "16": 0.32237,
-            "17": 0.31946,
-            "18": 0.31937,
-            "19": 0.3185,
-            "20": 0.3193,
-            "21": 0.32216,
-            "22": 0.328,
-            "23": 0.32251,
-            "24": 0.32294,
-            "25": 0.32205,
-            "26": 0.32393,
-            "27": 0.32132,
-            "28": 0.32221,
-            "29": 0.32269,
-            "30": 0.32422,
-            "31": 0.32527,
-            "32": 0.32866,
-            "33": 0.32346,
-            "34": 0.32064,
-            "35": 0.3199,
-            "36": 0.32198,
-            "37": 0.32252,
-            "38": 0.32103,
-            "39": 0.32486,
-            "40": 0.32573,
-            "41": 0.32643,
-            "42": 0.3234,
-            "43": 0.32778,
-            "44": 0.32302,
-            "45": 0.32434,
-            "46": 0.32532,
-            "47": 0.32115,
-            "48": 0.31979,
-            "49": 0.3233,
-            "50": 0.31776
+            "1": 33.51618,
+            "2": 0.36883,
+            "3": 0.30428,
+            "4": 0.28577,
+            "5": 0.28543,
+            "6": 0.28865,
+            "7": 0.32712,
+            "8": 0.32792,
+            "9": 0.28343,
+            "10": 0.28485,
+            "11": 0.28657,
+            "12": 0.28232,
+            "13": 0.28318,
+            "14": 0.28116,
+            "15": 0.28207,
+            "16": 0.28249,
+            "17": 0.2834,
+            "18": 0.28247,
+            "19": 0.28307,
+            "20": 0.28306,
+            "21": 0.28204,
+            "22": 0.28265,
+            "23": 0.28005,
+            "24": 0.2819,
+            "25": 0.2815,
+            "26": 0.28084,
+            "27": 0.28108,
+            "28": 0.28074,
+            "29": 0.28159,
+            "30": 0.28148,
+            "31": 0.28071,
+            "32": 0.27992,
+            "33": 0.28304,
+            "34": 0.28251,
+            "35": 0.28377,
+            "36": 0.28373,
+            "37": 0.28263,
+            "38": 0.28146,
+            "39": 0.28084,
+            "40": 0.28168,
+            "41": 0.28075,
+            "42": 0.27996,
+            "43": 0.2815,
+            "44": 0.28058,
+            "45": 0.2814,
+            "46": 0.28356,
+            "47": 0.28026,
+            "48": 0.28452,
+            "49": 0.28225,
+            "50": 0.28075
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json
index 215ddcea45c..421e66150ce 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92965, "10": 10.90473, "15": 10.87127, "20": 10.74997, "25": 10.53754, "30": 10.32548, "35": 10.22895, "40": 10.01975, "45": 9.75546, "50": 9.84069}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 585.0, "5": 675.0, "10": 544.0, "15": 619.0, "20": 579.0, "25": 620.0, "30": 678.0, "35": 717.0, "40": 813.0, "45": 746.0, "50": 841.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 432177152.0, "5": 432177152.0, "10": 432177152.0, "15": 432177152.0, "20": 432177152.0, "25": 432177152.0, "30": 432177152.0, "35": 432177152.0, "40": 432177152.0, "45": 432177152.0, "50": 432177152.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676283904.0, "5": 856228864.0, "10": 857276928.0, "15": 857276928.0, "20": 857276928.0, "25": 857276928.0, "30": 857276928.0, "35": 857276928.0, "40": 857276928.0, "45": 857276928.0, "50": 857276928.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.34002, "5": 0.40276, "10": 0.39665, "15": 0.39344, "20": 0.39157, "25": 0.3871, "30": 0.38802, "35": 0.39196, "40": 0.38964, "45": 0.39313, "50": 0.39241}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93295,
+            "2": 10.93424,
+            "3": 10.91347,
+            "4": 10.90322,
+            "5": 10.92969,
+            "6": 10.93655,
+            "7": 10.90279,
+            "8": 10.92115,
+            "9": 10.90706,
+            "10": 10.90476,
+            "11": 10.88788,
+            "12": 10.91742,
+            "13": 10.91192,
+            "14": 10.91504,
+            "15": 10.87121,
+            "16": 10.86129,
+            "17": 10.82702,
+            "18": 10.85676,
+            "19": 10.84055,
+            "20": 10.75002,
+            "21": 10.71507,
+            "22": 10.58115,
+            "23": 10.72645,
+            "24": 10.60725,
+            "25": 10.53755,
+            "26": 10.61068,
+            "27": 10.59926,
+            "28": 10.54954,
+            "29": 10.56605,
+            "30": 10.3255,
+            "31": 10.06696,
+            "32": 10.43809,
+            "33": 10.42362,
+            "34": 10.16017,
+            "35": 10.22893,
+            "36": 10.17616,
+            "37": 10.29235,
+            "38": 10.13293,
+            "39": 10.34957,
+            "40": 10.01975,
+            "41": 10.07533,
+            "42": 10.15406,
+            "43": 9.76091,
+            "44": 9.88358,
+            "45": 9.75547,
+            "46": 9.74961,
+            "47": 10.07549,
+            "48": 9.77934,
+            "49": 9.43812,
+            "50": 9.8407
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 604.0,
+            "2": 606.0,
+            "3": 671.0,
+            "4": 620.0,
+            "5": 670.0,
+            "6": 594.0,
+            "7": 646.0,
+            "8": 558.0,
+            "9": 627.0,
+            "10": 591.0,
+            "11": 682.0,
+            "12": 595.0,
+            "13": 692.0,
+            "14": 633.0,
+            "15": 636.0,
+            "16": 670.0,
+            "17": 644.0,
+            "18": 570.0,
+            "19": 591.0,
+            "20": 570.0,
+            "21": 643.0,
+            "22": 577.0,
+            "23": 657.0,
+            "24": 572.0,
+            "25": 611.0,
+            "26": 637.0,
+            "27": 649.0,
+            "28": 730.0,
+            "29": 737.0,
+            "30": 685.0,
+            "31": 548.0,
+            "32": 689.0,
+            "33": 735.0,
+            "34": 665.0,
+            "35": 700.0,
+            "36": 701.0,
+            "37": 855.0,
+            "38": 786.0,
+            "39": 794.0,
+            "40": 808.0,
+            "41": 844.0,
+            "42": 835.0,
+            "43": 678.0,
+            "44": 750.0,
+            "45": 771.0,
+            "46": 831.0,
+            "47": 920.0,
+            "48": 892.0,
+            "49": 824.0,
+            "50": 795.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 431522304.0,
+            "2": 431522304.0,
+            "3": 431522304.0,
+            "4": 431522304.0,
+            "5": 431522304.0,
+            "6": 431522304.0,
+            "7": 431522304.0,
+            "8": 431522304.0,
+            "9": 431522304.0,
+            "10": 431522304.0,
+            "11": 431522304.0,
+            "12": 431522304.0,
+            "13": 431522304.0,
+            "14": 431522304.0,
+            "15": 431522304.0,
+            "16": 431522304.0,
+            "17": 431522304.0,
+            "18": 431522304.0,
+            "19": 431522304.0,
+            "20": 431522304.0,
+            "21": 431522304.0,
+            "22": 431522304.0,
+            "23": 431522304.0,
+            "24": 431522304.0,
+            "25": 431522304.0,
+            "26": 431522304.0,
+            "27": 431522304.0,
+            "28": 431522304.0,
+            "29": 431522304.0,
+            "30": 431522304.0,
+            "31": 431522304.0,
+            "32": 431522304.0,
+            "33": 431522304.0,
+            "34": 431522304.0,
+            "35": 431522304.0,
+            "36": 431522304.0,
+            "37": 431522304.0,
+            "38": 431522304.0,
+            "39": 431522304.0,
+            "40": 431522304.0,
+            "41": 431522304.0,
+            "42": 431522304.0,
+            "43": 431522304.0,
+            "44": 431522304.0,
+            "45": 431522304.0,
+            "46": 431522304.0,
+            "47": 431522304.0,
+            "48": 431522304.0,
+            "49": 431522304.0,
+            "50": 431522304.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 678369280.0,
+            "2": 861326336.0,
+            "3": 861326336.0,
+            "4": 861326336.0,
+            "5": 861326848.0,
+            "6": 861328896.0,
+            "7": 861328896.0,
+            "8": 861328896.0,
+            "9": 861328896.0,
+            "10": 861328896.0,
+            "11": 861328896.0,
+            "12": 861328896.0,
+            "13": 861328896.0,
+            "14": 861328896.0,
+            "15": 861328896.0,
+            "16": 861328896.0,
+            "17": 861328896.0,
+            "18": 861328896.0,
+            "19": 861328896.0,
+            "20": 861328896.0,
+            "21": 861328896.0,
+            "22": 861328896.0,
+            "23": 861328896.0,
+            "24": 861328896.0,
+            "25": 861328896.0,
+            "26": 861328896.0,
+            "27": 861328896.0,
+            "28": 861328896.0,
+            "29": 861328896.0,
+            "30": 861328896.0,
+            "31": 861328896.0,
+            "32": 861328896.0,
+            "33": 861328896.0,
+            "34": 861328896.0,
+            "35": 861328896.0,
+            "36": 861328896.0,
+            "37": 861328896.0,
+            "38": 861328896.0,
+            "39": 861328896.0,
+            "40": 861328896.0,
+            "41": 861328896.0,
+            "42": 861328896.0,
+            "43": 861328896.0,
+            "44": 861328896.0,
+            "45": 861328896.0,
+            "46": 861328896.0,
+            "47": 861328896.0,
+            "48": 861328896.0,
+            "49": 861328896.0,
+            "50": 861328896.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 16.47831,
+            "2": 0.44885,
+            "3": 0.42205,
+            "4": 0.40517,
+            "5": 0.40824,
+            "6": 0.40476,
+            "7": 0.40726,
+            "8": 0.40671,
+            "9": 0.40354,
+            "10": 0.41027,
+            "11": 0.44095,
+            "12": 0.43962,
+            "13": 0.44029,
+            "14": 0.44506,
+            "15": 0.43995,
+            "16": 0.44228,
+            "17": 0.44479,
+            "18": 0.43969,
+            "19": 0.43999,
+            "20": 0.43737,
+            "21": 0.44549,
+            "22": 0.44572,
+            "23": 0.44259,
+            "24": 0.44105,
+            "25": 0.44312,
+            "26": 0.44437,
+            "27": 0.44718,
+            "28": 0.44344,
+            "29": 0.44315,
+            "30": 0.43332,
+            "31": 0.4392,
+            "32": 0.43861,
+            "33": 0.40986,
+            "34": 0.40961,
+            "35": 0.40907,
+            "36": 0.41022,
+            "37": 0.41003,
+            "38": 0.41205,
+            "39": 0.41822,
+            "40": 0.41387,
+            "41": 0.4147,
+            "42": 0.41362,
+            "43": 0.4135,
+            "44": 0.41365,
+            "45": 0.41109,
+            "46": 0.41218,
+            "47": 0.41209,
+            "48": 0.41473,
+            "49": 0.41335,
+            "50": 0.41197
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..7fa492bd7eb
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86836,
+            "2": 10.88595,
+            "3": 10.86559,
+            "4": 10.86891,
+            "5": 10.87418,
+            "6": 10.89057,
+            "7": 10.87676,
+            "8": 10.86476,
+            "9": 10.88235,
+            "10": 10.84582,
+            "11": 10.87163,
+            "12": 10.87421,
+            "13": 10.8816,
+            "14": 10.88885,
+            "15": 10.83934,
+            "16": 10.82496,
+            "17": 10.80146,
+            "18": 10.81236,
+            "19": 10.82152,
+            "20": 10.71935,
+            "21": 10.69086,
+            "22": 10.57422,
+            "23": 10.71096,
+            "24": 10.59783,
+            "25": 10.55559,
+            "26": 10.61516,
+            "27": 10.60451,
+            "28": 10.56481,
+            "29": 10.58476,
+            "30": 10.35947,
+            "31": 10.12155,
+            "32": 10.45234,
+            "33": 10.45724,
+            "34": 10.21987,
+            "35": 10.26441,
+            "36": 10.21037,
+            "37": 10.33955,
+            "38": 10.18012,
+            "39": 10.39592,
+            "40": 10.06635,
+            "41": 10.14165,
+            "42": 10.20849,
+            "43": 9.83127,
+            "44": 9.94857,
+            "45": 9.82845,
+            "46": 9.80455,
+            "47": 10.14227,
+            "48": 9.84463,
+            "49": 9.52192,
+            "50": 9.88604
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1673.0,
+            "2": 1596.0,
+            "3": 1676.0,
+            "4": 1672.0,
+            "5": 1818.0,
+            "6": 1740.0,
+            "7": 1845.0,
+            "8": 1651.0,
+            "9": 1820.0,
+            "10": 1351.0,
+            "11": 1811.0,
+            "12": 1655.0,
+            "13": 1748.0,
+            "14": 1719.0,
+            "15": 1801.0,
+            "16": 1829.0,
+            "17": 1828.0,
+            "18": 1545.0,
+            "19": 1727.0,
+            "20": 1654.0,
+            "21": 1874.0,
+            "22": 1567.0,
+            "23": 1955.0,
+            "24": 1609.0,
+            "25": 1474.0,
+            "26": 1750.0,
+            "27": 1682.0,
+            "28": 1927.0,
+            "29": 1949.0,
+            "30": 1837.0,
+            "31": 1606.0,
+            "32": 1849.0,
+            "33": 2085.0,
+            "34": 1799.0,
+            "35": 1933.0,
+            "36": 1928.0,
+            "37": 2325.0,
+            "38": 2099.0,
+            "39": 2424.0,
+            "40": 2112.0,
+            "41": 2240.0,
+            "42": 2181.0,
+            "43": 1934.0,
+            "44": 2042.0,
+            "45": 2041.0,
+            "46": 2183.0,
+            "47": 2424.0,
+            "48": 2250.0,
+            "49": 2208.0,
+            "50": 2425.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 465207808.0,
+            "2": 466256384.0,
+            "3": 466256384.0,
+            "4": 466256384.0,
+            "5": 466256384.0,
+            "6": 466256384.0,
+            "7": 466256384.0,
+            "8": 466256384.0,
+            "9": 466256384.0,
+            "10": 466256384.0,
+            "11": 466256384.0,
+            "12": 466256384.0,
+            "13": 466256384.0,
+            "14": 466256384.0,
+            "15": 466256384.0,
+            "16": 466256384.0,
+            "17": 466256384.0,
+            "18": 466256384.0,
+            "19": 466256384.0,
+            "20": 466256384.0,
+            "21": 466256384.0,
+            "22": 466256384.0,
+            "23": 466256384.0,
+            "24": 466256384.0,
+            "25": 466256384.0,
+            "26": 466256384.0,
+            "27": 466256384.0,
+            "28": 466256384.0,
+            "29": 466256384.0,
+            "30": 466256384.0,
+            "31": 466256384.0,
+            "32": 466256384.0,
+            "33": 466256384.0,
+            "34": 466256384.0,
+            "35": 466256384.0,
+            "36": 466256384.0,
+            "37": 466256384.0,
+            "38": 466256384.0,
+            "39": 466256384.0,
+            "40": 466256384.0,
+            "41": 466256384.0,
+            "42": 466256384.0,
+            "43": 466256384.0,
+            "44": 466256384.0,
+            "45": 466256384.0,
+            "46": 466256384.0,
+            "47": 466256384.0,
+            "48": 466256384.0,
+            "49": 466256384.0,
+            "50": 466256384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1608546816.0,
+            "2": 1789536768.0,
+            "3": 1789536768.0,
+            "4": 1789536768.0,
+            "5": 1789536768.0,
+            "6": 1789536768.0,
+            "7": 1789536768.0,
+            "8": 1789536768.0,
+            "9": 1789536768.0,
+            "10": 1789536768.0,
+            "11": 1789536768.0,
+            "12": 1789536768.0,
+            "13": 1789536768.0,
+            "14": 1789536768.0,
+            "15": 1789536768.0,
+            "16": 1789536768.0,
+            "17": 1789536768.0,
+            "18": 1789536768.0,
+            "19": 1789536768.0,
+            "20": 1789536768.0,
+            "21": 1789536768.0,
+            "22": 1789536768.0,
+            "23": 1789536768.0,
+            "24": 1789536768.0,
+            "25": 1789536768.0,
+            "26": 1789536768.0,
+            "27": 1789536768.0,
+            "28": 1789536768.0,
+            "29": 1789536768.0,
+            "30": 1789536768.0,
+            "31": 1789536768.0,
+            "32": 1789536768.0,
+            "33": 1789536768.0,
+            "34": 1789536768.0,
+            "35": 1789536768.0,
+            "36": 1789536768.0,
+            "37": 1789536768.0,
+            "38": 1789536768.0,
+            "39": 1789536768.0,
+            "40": 1789536768.0,
+            "41": 1789536768.0,
+            "42": 1789536768.0,
+            "43": 1789536768.0,
+            "44": 1789536768.0,
+            "45": 1789536768.0,
+            "46": 1789536768.0,
+            "47": 1789536768.0,
+            "48": 1789536768.0,
+            "49": 1789536768.0,
+            "50": 1789536768.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.74426,
+            "3": 0.17387,
+            "4": 0.16297,
+            "5": 0.16176,
+            "6": 0.16395,
+            "7": 0.16386,
+            "8": 0.16289,
+            "9": 0.16327,
+            "10": 0.16443,
+            "11": 0.16326,
+            "12": 0.16343,
+            "13": 0.16293,
+            "14": 0.16535,
+            "15": 0.16415,
+            "16": 0.16437,
+            "17": 0.16472,
+            "18": 0.16441,
+            "19": 0.1632,
+            "20": 0.16239,
+            "21": 0.1634,
+            "22": 0.16387,
+            "23": 0.16433,
+            "24": 0.1624,
+            "25": 0.16133,
+            "26": 0.16119,
+            "27": 0.16136,
+            "28": 0.16141,
+            "29": 0.1614,
+            "30": 0.16073,
+            "31": 0.16146,
+            "32": 0.16038,
+            "33": 0.16019,
+            "34": 0.16109,
+            "35": 0.16035,
+            "36": 0.15933,
+            "37": 0.15978,
+            "38": 0.17485,
+            "39": 0.15932,
+            "40": 0.15877,
+            "41": 0.15919,
+            "42": 0.15903,
+            "43": 0.1594,
+            "44": 0.15734,
+            "45": 0.15857,
+            "46": 0.15791,
+            "47": 0.15837,
+            "48": 0.15781,
+            "49": 0.15813,
+            "50": 0.15862
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
index 379b1c16f29..7dd87fe6932 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
@@ -6,53 +6,53 @@
         "values": {
             "1": 10.85949,
             "2": 10.85553,
-            "3": 10.86548,
-            "4": 10.84554,
-            "5": 10.88344,
-            "6": 10.89429,
-            "7": 10.87068,
+            "3": 10.8655,
+            "4": 10.84551,
+            "5": 10.88343,
+            "6": 10.8943,
+            "7": 10.87063,
             "8": 10.86983,
-            "9": 10.86919,
-            "10": 10.83883,
+            "9": 10.86921,
+            "10": 10.83884,
             "11": 10.89435,
-            "12": 10.8798,
-            "13": 10.87987,
-            "14": 10.90317,
+            "12": 10.87978,
+            "13": 10.87984,
+            "14": 10.90319,
             "15": 10.8405,
-            "16": 10.83786,
-            "17": 10.80668,
-            "18": 10.83025,
-            "19": 10.82262,
+            "16": 10.83787,
+            "17": 10.80669,
+            "18": 10.83026,
+            "19": 10.82265,
             "20": 10.73192,
-            "21": 10.7075,
-            "22": 10.56005,
-            "23": 10.72406,
-            "24": 10.61116,
-            "25": 10.5481,
-            "26": 10.61334,
-            "27": 10.6305,
-            "28": 10.56645,
-            "29": 10.59672,
-            "30": 10.37136,
-            "31": 10.11721,
-            "32": 10.46127,
+            "21": 10.70754,
+            "22": 10.56006,
+            "23": 10.72404,
+            "24": 10.61114,
+            "25": 10.54813,
+            "26": 10.61328,
+            "27": 10.63051,
+            "28": 10.56643,
+            "29": 10.59671,
+            "30": 10.37137,
+            "31": 10.11718,
+            "32": 10.4613,
             "33": 10.45247,
             "34": 10.21687,
-            "35": 10.27171,
-            "36": 10.2312,
+            "35": 10.27176,
+            "36": 10.23121,
             "37": 10.34809,
-            "38": 10.18842,
+            "38": 10.1884,
             "39": 10.41042,
             "40": 10.09426,
-            "41": 10.14711,
-            "42": 10.21247,
-            "43": 9.84106,
-            "44": 9.95919,
-            "45": 9.84082,
-            "46": 9.82482,
-            "47": 10.13882,
-            "48": 9.85839,
-            "49": 9.5472,
+            "41": 10.14707,
+            "42": 10.21244,
+            "43": 9.84107,
+            "44": 9.95916,
+            "45": 9.84087,
+            "46": 9.82484,
+            "47": 10.1388,
+            "48": 9.85842,
+            "49": 9.54724,
             "50": 9.90883
         }
     },
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1690.0,
+            "1": 1660.0,
             "2": 1776.0,
-            "3": 1642.0,
-            "4": 1825.0,
-            "5": 1809.0,
-            "6": 1795.0,
-            "7": 1830.0,
-            "8": 1626.0,
-            "9": 1878.0,
-            "10": 1423.0,
-            "11": 1868.0,
-            "12": 1653.0,
-            "13": 1897.0,
-            "14": 1783.0,
-            "15": 1861.0,
-            "16": 1938.0,
-            "17": 1825.0,
-            "18": 1730.0,
-            "19": 1727.0,
-            "20": 1735.0,
-            "21": 1783.0,
-            "22": 1576.0,
-            "23": 1949.0,
-            "24": 1630.0,
-            "25": 1498.0,
-            "26": 1649.0,
-            "27": 1809.0,
-            "28": 2019.0,
-            "29": 2009.0,
-            "30": 1832.0,
-            "31": 1524.0,
-            "32": 1943.0,
-            "33": 2081.0,
-            "34": 1888.0,
-            "35": 1935.0,
-            "36": 1898.0,
-            "37": 2325.0,
-            "38": 2070.0,
-            "39": 2248.0,
-            "40": 2199.0,
-            "41": 2264.0,
-            "42": 2349.0,
-            "43": 2087.0,
-            "44": 2107.0,
-            "45": 2098.0,
-            "46": 2407.0,
-            "47": 2456.0,
-            "48": 2404.0,
-            "49": 2417.0,
-            "50": 2407.0
+            "3": 1685.0,
+            "4": 1830.0,
+            "5": 1876.0,
+            "6": 1881.0,
+            "7": 1773.0,
+            "8": 1628.0,
+            "9": 1868.0,
+            "10": 1353.0,
+            "11": 1926.0,
+            "12": 1737.0,
+            "13": 1848.0,
+            "14": 1643.0,
+            "15": 1917.0,
+            "16": 1839.0,
+            "17": 1856.0,
+            "18": 1706.0,
+            "19": 1744.0,
+            "20": 1662.0,
+            "21": 1877.0,
+            "22": 1569.0,
+            "23": 2062.0,
+            "24": 1569.0,
+            "25": 1560.0,
+            "26": 1701.0,
+            "27": 1772.0,
+            "28": 1894.0,
+            "29": 2094.0,
+            "30": 1838.0,
+            "31": 1538.0,
+            "32": 1980.0,
+            "33": 2060.0,
+            "34": 1919.0,
+            "35": 1885.0,
+            "36": 1906.0,
+            "37": 2286.0,
+            "38": 2045.0,
+            "39": 2285.0,
+            "40": 2096.0,
+            "41": 2265.0,
+            "42": 2248.0,
+            "43": 2040.0,
+            "44": 2114.0,
+            "45": 2134.0,
+            "46": 2443.0,
+            "47": 2479.0,
+            "48": 2455.0,
+            "49": 2402.0,
+            "50": 2416.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 516194816.0,
-            "2": 516194816.0,
-            "3": 516194816.0,
-            "4": 516194816.0,
-            "5": 516194816.0,
-            "6": 516194816.0,
-            "7": 516194816.0,
-            "8": 516194816.0,
-            "9": 516194816.0,
-            "10": 516194816.0,
-            "11": 516194816.0,
-            "12": 516194816.0,
-            "13": 516194816.0,
-            "14": 516194816.0,
-            "15": 516194816.0,
-            "16": 516194816.0,
-            "17": 516194816.0,
-            "18": 516194816.0,
-            "19": 516194816.0,
-            "20": 516194816.0,
-            "21": 516194816.0,
-            "22": 516194816.0,
-            "23": 516194816.0,
-            "24": 516194816.0,
-            "25": 516194816.0,
-            "26": 516194816.0,
-            "27": 516194816.0,
-            "28": 516194816.0,
-            "29": 516194816.0,
-            "30": 516194816.0,
-            "31": 516194816.0,
-            "32": 516194816.0,
-            "33": 516194816.0,
-            "34": 516194816.0,
-            "35": 516194816.0,
-            "36": 516194816.0,
-            "37": 516194816.0,
-            "38": 516194816.0,
-            "39": 516194816.0,
-            "40": 516194816.0,
-            "41": 516194816.0,
-            "42": 516194816.0,
-            "43": 516194816.0,
-            "44": 516194816.0,
-            "45": 516194816.0,
-            "46": 516194816.0,
-            "47": 516194816.0,
-            "48": 516194816.0,
-            "49": 516194816.0,
-            "50": 516194816.0
+            "1": 514359808.0,
+            "2": 514359808.0,
+            "3": 514359808.0,
+            "4": 514359808.0,
+            "5": 514359808.0,
+            "6": 514359808.0,
+            "7": 514359808.0,
+            "8": 514359808.0,
+            "9": 514359808.0,
+            "10": 514359808.0,
+            "11": 514359808.0,
+            "12": 514359808.0,
+            "13": 514359808.0,
+            "14": 514359808.0,
+            "15": 514359808.0,
+            "16": 514359808.0,
+            "17": 514359808.0,
+            "18": 514359808.0,
+            "19": 514359808.0,
+            "20": 514359808.0,
+            "21": 514359808.0,
+            "22": 514359808.0,
+            "23": 514359808.0,
+            "24": 514359808.0,
+            "25": 514359808.0,
+            "26": 514359808.0,
+            "27": 514359808.0,
+            "28": 514359808.0,
+            "29": 514359808.0,
+            "30": 514359808.0,
+            "31": 514359808.0,
+            "32": 514359808.0,
+            "33": 514359808.0,
+            "34": 514359808.0,
+            "35": 514359808.0,
+            "36": 514359808.0,
+            "37": 514359808.0,
+            "38": 514359808.0,
+            "39": 514359808.0,
+            "40": 514359808.0,
+            "41": 514359808.0,
+            "42": 514359808.0,
+            "43": 514359808.0,
+            "44": 514359808.0,
+            "45": 514359808.0,
+            "46": 514359808.0,
+            "47": 514359808.0,
+            "48": 514359808.0,
+            "49": 514359808.0,
+            "50": 514359808.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1670130688.0,
-            "2": 1840523776.0,
-            "3": 1840523776.0,
-            "4": 1840523776.0,
-            "5": 1840523776.0,
-            "6": 1840523776.0,
-            "7": 1840523776.0,
-            "8": 1840523776.0,
-            "9": 1840523776.0,
-            "10": 1840523776.0,
-            "11": 1840523776.0,
-            "12": 1840523776.0,
-            "13": 1840523776.0,
-            "14": 1840523776.0,
-            "15": 1840523776.0,
-            "16": 1840523776.0,
-            "17": 1840523776.0,
-            "18": 1840523776.0,
-            "19": 1840523776.0,
-            "20": 1840523776.0,
-            "21": 1840523776.0,
-            "22": 1840523776.0,
-            "23": 1840523776.0,
-            "24": 1840523776.0,
-            "25": 1840523776.0,
-            "26": 1840523776.0,
-            "27": 1840523776.0,
-            "28": 1840523776.0,
-            "29": 1840523776.0,
-            "30": 1840523776.0,
-            "31": 1840523776.0,
-            "32": 1840523776.0,
-            "33": 1840523776.0,
-            "34": 1840523776.0,
-            "35": 1840523776.0,
-            "36": 1840523776.0,
-            "37": 1840523776.0,
-            "38": 1840523776.0,
-            "39": 1840523776.0,
-            "40": 1840523776.0,
-            "41": 1840523776.0,
-            "42": 1840523776.0,
-            "43": 1840523776.0,
-            "44": 1840523776.0,
-            "45": 1840523776.0,
-            "46": 1840523776.0,
-            "47": 1840523776.0,
-            "48": 1840523776.0,
-            "49": 1840523776.0,
-            "50": 1840523776.0
+            "1": 1670148096.0,
+            "2": 1837640192.0,
+            "3": 1837640192.0,
+            "4": 1837640192.0,
+            "5": 1837640192.0,
+            "6": 1837640192.0,
+            "7": 1837640192.0,
+            "8": 1837640192.0,
+            "9": 1837640192.0,
+            "10": 1837640192.0,
+            "11": 1837640192.0,
+            "12": 1837640192.0,
+            "13": 1837640192.0,
+            "14": 1837640192.0,
+            "15": 1837640192.0,
+            "16": 1837640192.0,
+            "17": 1837640192.0,
+            "18": 1837640192.0,
+            "19": 1837640192.0,
+            "20": 1837640192.0,
+            "21": 1837640192.0,
+            "22": 1837640192.0,
+            "23": 1837640192.0,
+            "24": 1837640192.0,
+            "25": 1837640192.0,
+            "26": 1837640192.0,
+            "27": 1837640192.0,
+            "28": 1837640192.0,
+            "29": 1837640192.0,
+            "30": 1837640192.0,
+            "31": 1837640192.0,
+            "32": 1837640192.0,
+            "33": 1837640192.0,
+            "34": 1837640192.0,
+            "35": 1837640192.0,
+            "36": 1837640192.0,
+            "37": 1837640192.0,
+            "38": 1837640192.0,
+            "39": 1837640192.0,
+            "40": 1837640192.0,
+            "41": 1837640192.0,
+            "42": 1837640192.0,
+            "43": 1837640192.0,
+            "44": 1837640192.0,
+            "45": 1837640192.0,
+            "46": 1837640192.0,
+            "47": 1837640192.0,
+            "48": 1837640192.0,
+            "49": 1837640192.0,
+            "50": 1837640192.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 15.2683,
-            "2": 0.15358,
-            "3": 0.13619,
-            "4": 0.13976,
-            "5": 0.13713,
-            "6": 0.13753,
-            "7": 0.13575,
-            "8": 0.13485,
-            "9": 0.13779,
-            "10": 0.13697,
-            "11": 0.14178,
-            "12": 0.1397,
-            "13": 0.13744,
-            "14": 0.14039,
-            "15": 0.13739,
-            "16": 0.1361,
-            "17": 0.13816,
-            "18": 0.13722,
-            "19": 0.15342,
-            "20": 0.14613,
-            "21": 0.14806,
-            "22": 0.14423,
-            "23": 0.14791,
-            "24": 0.14345,
-            "25": 0.14474,
-            "26": 0.14564,
-            "27": 0.14168,
-            "28": 0.14148,
-            "29": 0.13863,
-            "30": 0.13751,
-            "31": 0.14015,
-            "32": 0.13821,
-            "33": 0.14038,
-            "34": 0.13859,
-            "35": 0.14531,
-            "36": 0.14468,
-            "37": 0.13783,
-            "38": 0.13787,
-            "39": 0.13879,
-            "40": 0.14072,
-            "41": 0.14065,
-            "42": 0.13865,
-            "43": 0.13953,
-            "44": 0.13882,
-            "45": 0.13622,
-            "46": 0.14034,
-            "47": 0.13659,
-            "48": 0.14369,
-            "49": 0.13987,
-            "50": 0.13803
+            "1": 9.56969,
+            "2": 0.15621,
+            "3": 0.13591,
+            "4": 0.11846,
+            "5": 0.11755,
+            "6": 0.1173,
+            "7": 0.11302,
+            "8": 0.11176,
+            "9": 0.11094,
+            "10": 0.11205,
+            "11": 0.11214,
+            "12": 0.11069,
+            "13": 0.11128,
+            "14": 0.11089,
+            "15": 0.11218,
+            "16": 0.11119,
+            "17": 0.11088,
+            "18": 0.11035,
+            "19": 0.11159,
+            "20": 0.11079,
+            "21": 0.11182,
+            "22": 0.11081,
+            "23": 0.11148,
+            "24": 0.1122,
+            "25": 0.11117,
+            "26": 0.11184,
+            "27": 0.11686,
+            "28": 0.10976,
+            "29": 0.11011,
+            "30": 0.11235,
+            "31": 0.11032,
+            "32": 0.11316,
+            "33": 0.11177,
+            "34": 0.11253,
+            "35": 0.11045,
+            "36": 0.11022,
+            "37": 0.11032,
+            "38": 0.11201,
+            "39": 0.11511,
+            "40": 0.11021,
+            "41": 0.1116,
+            "42": 0.11045,
+            "43": 0.11205,
+            "44": 0.11101,
+            "45": 0.10943,
+            "46": 0.11006,
+            "47": 0.11008,
+            "48": 0.11033,
+            "49": 0.11205,
+            "50": 0.11073
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
index f0460352ce3..c0c97884af8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
@@ -2,91 +2,286 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.92655,
-            "5": 10.92721,
-            "10": 10.90788,
-            "15": 10.88293,
-            "20": 10.77594,
-            "25": 10.59265,
-            "30": 10.39169,
-            "35": 10.29699,
-            "40": 10.09664,
-            "45": 9.84469,
-            "50": 9.90944
+            "1": 10.92228,
+            "2": 10.92833,
+            "3": 10.91713,
+            "4": 10.90497,
+            "5": 10.92809,
+            "6": 10.93672,
+            "7": 10.90401,
+            "8": 10.92229,
+            "9": 10.91253,
+            "10": 10.90846,
+            "11": 10.89336,
+            "12": 10.92081,
+            "13": 10.91489,
+            "14": 10.92148,
+            "15": 10.8843,
+            "16": 10.87455,
+            "17": 10.83919,
+            "18": 10.87311,
+            "19": 10.85334,
+            "20": 10.77493,
+            "21": 10.74758,
+            "22": 10.63148,
+            "23": 10.75623,
+            "24": 10.65569,
+            "25": 10.59216,
+            "26": 10.65326,
+            "27": 10.6488,
+            "28": 10.5966,
+            "29": 10.61012,
+            "30": 10.39285,
+            "31": 10.15722,
+            "32": 10.49215,
+            "33": 10.47941,
+            "34": 10.24018,
+            "35": 10.29713,
+            "36": 10.24563,
+            "37": 10.35285,
+            "38": 10.20535,
+            "39": 10.40419,
+            "40": 10.09552,
+            "41": 10.15278,
+            "42": 10.21882,
+            "43": 9.85529,
+            "44": 9.96247,
+            "45": 9.84617,
+            "46": 9.83801,
+            "47": 10.1389,
+            "48": 9.85697,
+            "49": 9.53751,
+            "50": 9.9088
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1610.0,
-            "5": 1901.0,
-            "10": 1373.0,
-            "15": 1954.0,
-            "20": 1614.0,
-            "25": 1625.0,
-            "30": 1953.0,
-            "35": 1904.0,
-            "40": 2127.0,
-            "45": 2199.0,
-            "50": 2451.0
+            "1": 1683.0,
+            "2": 1789.0,
+            "3": 1705.0,
+            "4": 1788.0,
+            "5": 1911.0,
+            "6": 1820.0,
+            "7": 1935.0,
+            "8": 1724.0,
+            "9": 1964.0,
+            "10": 1499.0,
+            "11": 1906.0,
+            "12": 1864.0,
+            "13": 1941.0,
+            "14": 1882.0,
+            "15": 1914.0,
+            "16": 1816.0,
+            "17": 1814.0,
+            "18": 1735.0,
+            "19": 1765.0,
+            "20": 1633.0,
+            "21": 1858.0,
+            "22": 1702.0,
+            "23": 1957.0,
+            "24": 1663.0,
+            "25": 1580.0,
+            "26": 1773.0,
+            "27": 1964.0,
+            "28": 2058.0,
+            "29": 2109.0,
+            "30": 1904.0,
+            "31": 1580.0,
+            "32": 1928.0,
+            "33": 2226.0,
+            "34": 1919.0,
+            "35": 1920.0,
+            "36": 1980.0,
+            "37": 2309.0,
+            "38": 2303.0,
+            "39": 2437.0,
+            "40": 2238.0,
+            "41": 2326.0,
+            "42": 2254.0,
+            "43": 2060.0,
+            "44": 2146.0,
+            "45": 2102.0,
+            "46": 2345.0,
+            "47": 2550.0,
+            "48": 2499.0,
+            "49": 2276.0,
+            "50": 2574.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 438468608.0,
-            "5": 438468608.0,
-            "10": 438468608.0,
-            "15": 438468608.0,
-            "20": 438468608.0,
-            "25": 438468608.0,
-            "30": 438468608.0,
-            "35": 438468608.0,
-            "40": 438468608.0,
-            "45": 438468608.0,
-            "50": 438468608.0
+            "1": 436765184.0,
+            "2": 436765184.0,
+            "3": 436765184.0,
+            "4": 436765184.0,
+            "5": 436765184.0,
+            "6": 436765184.0,
+            "7": 436765184.0,
+            "8": 436765184.0,
+            "9": 436765184.0,
+            "10": 436765184.0,
+            "11": 436765184.0,
+            "12": 436765184.0,
+            "13": 436765184.0,
+            "14": 436765184.0,
+            "15": 436765184.0,
+            "16": 436765184.0,
+            "17": 436765184.0,
+            "18": 436765184.0,
+            "19": 436765184.0,
+            "20": 436765184.0,
+            "21": 436765184.0,
+            "22": 436765184.0,
+            "23": 436765184.0,
+            "24": 436765184.0,
+            "25": 436765184.0,
+            "26": 436765184.0,
+            "27": 436765184.0,
+            "28": 436765184.0,
+            "29": 436765184.0,
+            "30": 436765184.0,
+            "31": 436765184.0,
+            "32": 436765184.0,
+            "33": 436765184.0,
+            "34": 436765184.0,
+            "35": 436765184.0,
+            "36": 436765184.0,
+            "37": 436765184.0,
+            "38": 436765184.0,
+            "39": 436765184.0,
+            "40": 436765184.0,
+            "41": 436765184.0,
+            "42": 436765184.0,
+            "43": 436765184.0,
+            "44": 436765184.0,
+            "45": 436765184.0,
+            "46": 436765184.0,
+            "47": 436765184.0,
+            "48": 436765184.0,
+            "49": 436765184.0,
+            "50": 436765184.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 2658189824.0,
-            "5": 2658189824.0,
-            "10": 2658189824.0,
-            "15": 2658189824.0,
-            "20": 2658189824.0,
-            "25": 2658189824.0,
-            "30": 2658189824.0,
-            "35": 2658189824.0,
-            "40": 2658189824.0,
-            "45": 2658189824.0,
-            "50": 2658189824.0
+            "1": 1591768576.0,
+            "2": 1772628480.0,
+            "3": 1772628480.0,
+            "4": 1772628480.0,
+            "5": 1772628480.0,
+            "6": 1772628480.0,
+            "7": 1772628480.0,
+            "8": 1772628480.0,
+            "9": 1772628480.0,
+            "10": 1772628480.0,
+            "11": 1772628480.0,
+            "12": 1772628480.0,
+            "13": 1772628480.0,
+            "14": 1772628480.0,
+            "15": 1772628480.0,
+            "16": 1772628480.0,
+            "17": 1772628480.0,
+            "18": 1772628480.0,
+            "19": 1772628480.0,
+            "20": 1772628480.0,
+            "21": 1772628480.0,
+            "22": 1772628480.0,
+            "23": 1772628480.0,
+            "24": 1772628480.0,
+            "25": 1772628480.0,
+            "26": 1772628480.0,
+            "27": 1772628480.0,
+            "28": 1772628480.0,
+            "29": 1772628480.0,
+            "30": 1772628480.0,
+            "31": 1772628480.0,
+            "32": 1772628480.0,
+            "33": 1772628480.0,
+            "34": 1772628480.0,
+            "35": 1772628480.0,
+            "36": 1772628480.0,
+            "37": 1772628480.0,
+            "38": 1772628480.0,
+            "39": 1772628480.0,
+            "40": 1772628480.0,
+            "41": 1772628480.0,
+            "42": 1772628480.0,
+            "43": 1772628480.0,
+            "44": 1772628480.0,
+            "45": 1772628480.0,
+            "46": 1772628480.0,
+            "47": 1772628480.0,
+            "48": 1772628480.0,
+            "49": 1772628480.0,
+            "50": 1772628480.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 16.65648,
-            "5": 0.18713,
-            "10": 0.18827,
-            "15": 0.18525,
-            "20": 0.18524,
-            "25": 0.18364,
-            "30": 0.18457,
-            "35": 0.18387,
-            "40": 0.18487,
-            "45": 0.18218,
-            "50": 0.18439
+            "1": 3.95122,
+            "2": 0.20629,
+            "3": 0.19002,
+            "4": 0.17151,
+            "5": 0.16892,
+            "6": 0.16922,
+            "7": 0.16965,
+            "8": 0.17504,
+            "9": 0.17459,
+            "10": 0.17897,
+            "11": 0.17409,
+            "12": 0.1744,
+            "13": 0.17287,
+            "14": 0.17379,
+            "15": 0.17494,
+            "16": 0.17728,
+            "17": 0.17415,
+            "18": 0.17375,
+            "19": 0.17472,
+            "20": 0.17419,
+            "21": 0.17564,
+            "22": 0.17531,
+            "23": 0.17363,
+            "24": 0.17467,
+            "25": 0.17519,
+            "26": 0.17584,
+            "27": 0.17619,
+            "28": 0.17299,
+            "29": 0.17468,
+            "30": 0.17335,
+            "31": 0.17523,
+            "32": 0.17349,
+            "33": 0.17387,
+            "34": 0.17508,
+            "35": 0.1743,
+            "36": 0.17468,
+            "37": 0.17489,
+            "38": 0.17296,
+            "39": 0.17553,
+            "40": 0.1747,
+            "41": 0.17437,
+            "42": 0.17471,
+            "43": 0.17492,
+            "44": 0.17376,
+            "45": 0.17488,
+            "46": 0.17514,
+            "47": 0.17599,
+            "48": 0.17634,
+            "49": 0.17525,
+            "50": 0.17524
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..31729dd5fe5
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.94863,
+            "2": 10.95748,
+            "3": 10.95458,
+            "4": 10.95314,
+            "5": 10.94301,
+            "6": 10.93709,
+            "7": 10.94818,
+            "8": 10.94698,
+            "9": 10.94866,
+            "10": 10.95119,
+            "11": 10.9406,
+            "12": 10.94105,
+            "13": 10.94375,
+            "14": 10.94739,
+            "15": 10.9429,
+            "16": 10.93682,
+            "17": 10.94182,
+            "18": 10.93022,
+            "19": 10.93614,
+            "20": 10.92135,
+            "21": 10.91434,
+            "22": 10.92114,
+            "23": 10.92039,
+            "24": 10.91062,
+            "25": 10.91171,
+            "26": 10.9101,
+            "27": 10.90559,
+            "28": 10.87901,
+            "29": 10.87862,
+            "30": 10.82431,
+            "31": 10.7917,
+            "32": 10.85763,
+            "33": 10.85278,
+            "34": 10.80465,
+            "35": 10.81124,
+            "36": 10.79299,
+            "37": 10.82161,
+            "38": 10.74654,
+            "39": 10.79066,
+            "40": 10.67639,
+            "41": 10.71189,
+            "42": 10.72663,
+            "43": 10.58635,
+            "44": 10.63487,
+            "45": 10.59555,
+            "46": 10.58202,
+            "47": 10.67878,
+            "48": 10.55683,
+            "49": 10.43321,
+            "50": 10.57623
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 22792076.0,
+            "2": 22989660.0,
+            "3": 22661158.0,
+            "4": 23283080.0,
+            "5": 22778860.0,
+            "6": 23085232.0,
+            "7": 22834892.0,
+            "8": 22990502.0,
+            "9": 22906480.0,
+            "10": 22983488.0,
+            "11": 22563552.0,
+            "12": 22523694.0,
+            "13": 22980968.0,
+            "14": 22453154.0,
+            "15": 22885546.0,
+            "16": 22895028.0,
+            "17": 22882956.0,
+            "18": 22647168.0,
+            "19": 22682056.0,
+            "20": 22757444.0,
+            "21": 22803808.0,
+            "22": 22864026.0,
+            "23": 22603204.0,
+            "24": 22835232.0,
+            "25": 22883270.0,
+            "26": 22611998.0,
+            "27": 22532132.0,
+            "28": 22516960.0,
+            "29": 22593572.0,
+            "30": 22695024.0,
+            "31": 23019244.0,
+            "32": 22648204.0,
+            "33": 22623192.0,
+            "34": 22899922.0,
+            "35": 22852560.0,
+            "36": 22652964.0,
+            "37": 22559866.0,
+            "38": 22960222.0,
+            "39": 22864432.0,
+            "40": 22721420.0,
+            "41": 22722086.0,
+            "42": 22730128.0,
+            "43": 23040178.0,
+            "44": 22809816.0,
+            "45": 22738252.0,
+            "46": 22947510.0,
+            "47": 22697018.0,
+            "48": 22992168.0,
+            "49": 22790946.0,
+            "50": 22969044.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 387483136.0,
+            "2": 387483136.0,
+            "3": 387483136.0,
+            "4": 387483136.0,
+            "5": 387483136.0,
+            "6": 387483136.0,
+            "7": 387483136.0,
+            "8": 387483136.0,
+            "9": 387483136.0,
+            "10": 387483136.0,
+            "11": 387483136.0,
+            "12": 387483136.0,
+            "13": 387483136.0,
+            "14": 387483136.0,
+            "15": 387483136.0,
+            "16": 387483136.0,
+            "17": 387483136.0,
+            "18": 387483136.0,
+            "19": 387483136.0,
+            "20": 387483136.0,
+            "21": 387483136.0,
+            "22": 387483136.0,
+            "23": 387483136.0,
+            "24": 387483136.0,
+            "25": 387483136.0,
+            "26": 387483136.0,
+            "27": 387483136.0,
+            "28": 387483136.0,
+            "29": 387483136.0,
+            "30": 387483136.0,
+            "31": 387483136.0,
+            "32": 387483136.0,
+            "33": 387483136.0,
+            "34": 387483136.0,
+            "35": 387483136.0,
+            "36": 387483136.0,
+            "37": 387483136.0,
+            "38": 387483136.0,
+            "39": 387483136.0,
+            "40": 387483136.0,
+            "41": 387483136.0,
+            "42": 387483136.0,
+            "43": 387483136.0,
+            "44": 387483136.0,
+            "45": 387483136.0,
+            "46": 387483136.0,
+            "47": 387483136.0,
+            "48": 387483136.0,
+            "49": 387483136.0,
+            "50": 387483136.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1122385408.0,
+            "2": 1245635072.0,
+            "3": 1245635072.0,
+            "4": 1245635072.0,
+            "5": 1245635072.0,
+            "6": 1245635072.0,
+            "7": 1245635072.0,
+            "8": 1245635072.0,
+            "9": 1245635072.0,
+            "10": 1245635072.0,
+            "11": 1245635072.0,
+            "12": 1245635072.0,
+            "13": 1245635072.0,
+            "14": 1245635072.0,
+            "15": 1245635072.0,
+            "16": 1245635072.0,
+            "17": 1245635072.0,
+            "18": 1245635072.0,
+            "19": 1245635072.0,
+            "20": 1245635072.0,
+            "21": 1245635072.0,
+            "22": 1245635072.0,
+            "23": 1245635072.0,
+            "24": 1245635072.0,
+            "25": 1245635072.0,
+            "26": 1245635072.0,
+            "27": 1245635072.0,
+            "28": 1245635072.0,
+            "29": 1245635072.0,
+            "30": 1245635072.0,
+            "31": 1245635072.0,
+            "32": 1245635072.0,
+            "33": 1245635072.0,
+            "34": 1245635072.0,
+            "35": 1245635072.0,
+            "36": 1245635072.0,
+            "37": 1245635072.0,
+            "38": 1245635072.0,
+            "39": 1245635072.0,
+            "40": 1245635072.0,
+            "41": 1245635072.0,
+            "42": 1245635072.0,
+            "43": 1245635072.0,
+            "44": 1245635072.0,
+            "45": 1245635072.0,
+            "46": 1245635072.0,
+            "47": 1245635072.0,
+            "48": 1245635072.0,
+            "49": 1245635072.0,
+            "50": 1245635072.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.75152,
+            "3": 0.1678,
+            "4": 0.1543,
+            "5": 0.15772,
+            "6": 0.15798,
+            "7": 0.15886,
+            "8": 0.16038,
+            "9": 0.15983,
+            "10": 0.16009,
+            "11": 0.15881,
+            "12": 0.16004,
+            "13": 0.15648,
+            "14": 0.15396,
+            "15": 0.15394,
+            "16": 0.1544,
+            "17": 0.15329,
+            "18": 0.1539,
+            "19": 0.15442,
+            "20": 0.1521,
+            "21": 0.15368,
+            "22": 0.15287,
+            "23": 0.15397,
+            "24": 0.15553,
+            "25": 0.15617,
+            "26": 0.15925,
+            "27": 0.145,
+            "28": 0.14456,
+            "29": 0.14869,
+            "30": 0.15407,
+            "31": 0.15556,
+            "32": 0.15651,
+            "33": 0.15726,
+            "34": 0.1574,
+            "35": 0.15981,
+            "36": 0.16037,
+            "37": 0.16044,
+            "38": 0.15744,
+            "39": 0.15875,
+            "40": 0.15964,
+            "41": 0.15984,
+            "42": 0.1605,
+            "43": 0.15901,
+            "44": 0.16037,
+            "45": 0.1616,
+            "46": 0.16046,
+            "47": 0.16125,
+            "48": 0.16168,
+            "49": 0.1611,
+            "50": 0.15977
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..49a5e4f8a21
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 10.9101,
+            "27": 10.90559,
+            "28": 10.87901,
+            "29": 10.87862,
+            "30": 10.82431,
+            "31": 10.7917,
+            "32": 10.85763,
+            "33": 10.85278,
+            "34": 10.80465,
+            "35": 10.81124,
+            "36": 10.79299,
+            "37": 10.82161,
+            "38": 10.74654,
+            "39": 10.79066,
+            "40": 10.67639,
+            "41": 10.71189,
+            "42": 10.72663,
+            "43": 10.58635,
+            "44": 10.63487,
+            "45": 10.59555,
+            "46": 10.58202,
+            "47": 10.67878,
+            "48": 10.55683,
+            "49": 10.43321,
+            "50": 10.57623
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 22611998.0,
+            "27": 22532132.0,
+            "28": 22516960.0,
+            "29": 22593572.0,
+            "30": 22695024.0,
+            "31": 23019244.0,
+            "32": 22648204.0,
+            "33": 22623192.0,
+            "34": 22899922.0,
+            "35": 22852560.0,
+            "36": 22652964.0,
+            "37": 22559866.0,
+            "38": 22960222.0,
+            "39": 22864432.0,
+            "40": 22721420.0,
+            "41": 22722086.0,
+            "42": 22730128.0,
+            "43": 23040178.0,
+            "44": 22809816.0,
+            "45": 22738252.0,
+            "46": 22947510.0,
+            "47": 22697018.0,
+            "48": 22992168.0,
+            "49": 22790946.0,
+            "50": 22969044.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 389056000.0,
+            "27": 389056000.0,
+            "28": 389056000.0,
+            "29": 389056000.0,
+            "30": 389056000.0,
+            "31": 389056000.0,
+            "32": 389056000.0,
+            "33": 389056000.0,
+            "34": 389056000.0,
+            "35": 389056000.0,
+            "36": 389056000.0,
+            "37": 389056000.0,
+            "38": 389056000.0,
+            "39": 389056000.0,
+            "40": 389056000.0,
+            "41": 389056000.0,
+            "42": 389056000.0,
+            "43": 389056000.0,
+            "44": 389056000.0,
+            "45": 389056000.0,
+            "46": 389056000.0,
+            "47": 389056000.0,
+            "48": 389056000.0,
+            "49": 389056000.0,
+            "50": 389056000.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 1247206912.0,
+            "27": 1247207936.0,
+            "28": 1247207936.0,
+            "29": 1247207936.0,
+            "30": 1247207936.0,
+            "31": 1247207936.0,
+            "32": 1247207936.0,
+            "33": 1247207936.0,
+            "34": 1247207936.0,
+            "35": 1247207936.0,
+            "36": 1247207936.0,
+            "37": 1247207936.0,
+            "38": 1247207936.0,
+            "39": 1247207936.0,
+            "40": 1247207936.0,
+            "41": 1247207936.0,
+            "42": 1247207936.0,
+            "43": 1247207936.0,
+            "44": 1247207936.0,
+            "45": 1247207936.0,
+            "46": 1247207936.0,
+            "47": 1247207936.0,
+            "48": 1247207936.0,
+            "49": 1247207936.0,
+            "50": 1247207936.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": 2.4291,
+            "28": 0.15494,
+            "29": 0.14099,
+            "30": 0.13913,
+            "31": 0.1391,
+            "32": 0.13835,
+            "33": 0.13909,
+            "34": 0.13882,
+            "35": 0.13918,
+            "36": 0.13936,
+            "37": 0.1396,
+            "38": 0.14038,
+            "39": 0.14154,
+            "40": 0.14205,
+            "41": 0.14186,
+            "42": 0.1401,
+            "43": 0.14017,
+            "44": 0.14,
+            "45": 0.13933,
+            "46": 0.13921,
+            "47": 0.13941,
+            "48": 0.13867,
+            "49": 0.14055,
+            "50": 0.14041
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json
index d0103111a28..bb945f7d249 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json
@@ -44,16 +44,16 @@
             "38": 10.72434,
             "39": 10.78066,
             "40": 10.65927,
-            "41": 10.69208,
-            "42": 10.70973,
-            "43": 10.56128,
-            "44": 10.61369,
-            "45": 10.56875,
-            "46": 10.54455,
+            "41": 10.69209,
+            "42": 10.70974,
+            "43": 10.56129,
+            "44": 10.61371,
+            "45": 10.56874,
+            "46": 10.54454,
             "47": 10.66751,
             "48": 10.53792,
-            "49": 10.40861,
-            "50": 10.55421
+            "49": 10.40859,
+            "50": 10.5542
         }
     },
     "num-zeros": {
@@ -100,17 +100,17 @@
             "37": 22560476.0,
             "38": 22960058.0,
             "39": 22865476.0,
-            "40": 22721680.0,
+            "40": 22721690.0,
             "41": 22723112.0,
-            "42": 22730726.0,
-            "43": 23039588.0,
-            "44": 22810020.0,
-            "45": 22738904.0,
-            "46": 22948334.0,
-            "47": 22696668.0,
-            "48": 22992832.0,
-            "49": 22791208.0,
-            "50": 22968272.0
+            "42": 22730692.0,
+            "43": 23039608.0,
+            "44": 22809964.0,
+            "45": 22738932.0,
+            "46": 22948360.0,
+            "47": 22696800.0,
+            "48": 22992776.0,
+            "49": 22791104.0,
+            "50": 22968342.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 387744256.0,
-            "2": 387744256.0,
-            "3": 387744256.0,
-            "4": 387744256.0,
-            "5": 387744256.0,
-            "6": 387744256.0,
-            "7": 387744256.0,
-            "8": 387744256.0,
-            "9": 387744256.0,
-            "10": 387744256.0,
-            "11": 387744256.0,
-            "12": 387744256.0,
-            "13": 387744256.0,
-            "14": 387744256.0,
-            "15": 387744256.0,
-            "16": 387744256.0,
-            "17": 387744256.0,
-            "18": 387744256.0,
-            "19": 387744256.0,
-            "20": 387744256.0,
-            "21": 387744256.0,
-            "22": 387744256.0,
-            "23": 387744256.0,
-            "24": 387744256.0,
-            "25": 387744256.0,
-            "26": 387744256.0,
-            "27": 387744256.0,
-            "28": 387744256.0,
-            "29": 387744256.0,
-            "30": 387744256.0,
-            "31": 387744256.0,
-            "32": 387744256.0,
-            "33": 387744256.0,
-            "34": 387744256.0,
-            "35": 387744256.0,
-            "36": 387744256.0,
-            "37": 387744256.0,
-            "38": 387744256.0,
-            "39": 387744256.0,
-            "40": 387744256.0,
-            "41": 387744256.0,
-            "42": 387744256.0,
-            "43": 387744256.0,
-            "44": 387744256.0,
-            "45": 387744256.0,
-            "46": 387744256.0,
-            "47": 387744256.0,
-            "48": 387744256.0,
-            "49": 387744256.0,
-            "50": 387744256.0
+            "1": 387483136.0,
+            "2": 387483136.0,
+            "3": 387483136.0,
+            "4": 387483136.0,
+            "5": 387483136.0,
+            "6": 387483136.0,
+            "7": 387483136.0,
+            "8": 387483136.0,
+            "9": 387483136.0,
+            "10": 387483136.0,
+            "11": 387483136.0,
+            "12": 387483136.0,
+            "13": 387483136.0,
+            "14": 387483136.0,
+            "15": 387483136.0,
+            "16": 387483136.0,
+            "17": 387483136.0,
+            "18": 387483136.0,
+            "19": 387483136.0,
+            "20": 387483136.0,
+            "21": 387483136.0,
+            "22": 387483136.0,
+            "23": 387483136.0,
+            "24": 387483136.0,
+            "25": 387483136.0,
+            "26": 387483136.0,
+            "27": 387483136.0,
+            "28": 387483136.0,
+            "29": 387483136.0,
+            "30": 387483136.0,
+            "31": 387483136.0,
+            "32": 387483136.0,
+            "33": 387483136.0,
+            "34": 387483136.0,
+            "35": 387483136.0,
+            "36": 387483136.0,
+            "37": 387483136.0,
+            "38": 387483136.0,
+            "39": 387483136.0,
+            "40": 387483136.0,
+            "41": 387483136.0,
+            "42": 387483136.0,
+            "43": 387483136.0,
+            "44": 387483136.0,
+            "45": 387483136.0,
+            "46": 387483136.0,
+            "47": 387483136.0,
+            "48": 387483136.0,
+            "49": 387483136.0,
+            "50": 387483136.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1122646528.0,
-            "2": 1245896192.0,
-            "3": 1245896192.0,
-            "4": 1245896192.0,
-            "5": 1245896192.0,
-            "6": 1245896192.0,
-            "7": 1245896192.0,
-            "8": 1245896192.0,
-            "9": 1245896192.0,
-            "10": 1245896192.0,
-            "11": 1245896192.0,
-            "12": 1245896192.0,
-            "13": 1245896192.0,
-            "14": 1245896192.0,
-            "15": 1245896192.0,
-            "16": 1245896192.0,
-            "17": 1245896192.0,
-            "18": 1245896192.0,
-            "19": 1245896192.0,
-            "20": 1245896192.0,
-            "21": 1245896192.0,
-            "22": 1245896192.0,
-            "23": 1245896192.0,
-            "24": 1245896192.0,
-            "25": 1245896192.0,
-            "26": 1245896192.0,
-            "27": 1245896192.0,
-            "28": 1245896192.0,
-            "29": 1245896192.0,
-            "30": 1245896192.0,
-            "31": 1245896192.0,
-            "32": 1245896192.0,
-            "33": 1245896192.0,
-            "34": 1245896192.0,
-            "35": 1245896192.0,
-            "36": 1245896192.0,
-            "37": 1245896192.0,
-            "38": 1245896192.0,
-            "39": 1245896192.0,
-            "40": 1245896192.0,
-            "41": 1245896192.0,
-            "42": 1245896192.0,
-            "43": 1245896192.0,
-            "44": 1245896192.0,
-            "45": 1245896192.0,
-            "46": 1245896192.0,
-            "47": 1245896192.0,
-            "48": 1245896192.0,
-            "49": 1245896192.0,
-            "50": 1245896192.0
+            "1": 1122385408.0,
+            "2": 1245635072.0,
+            "3": 1245635072.0,
+            "4": 1245635072.0,
+            "5": 1245635072.0,
+            "6": 1245635072.0,
+            "7": 1245635072.0,
+            "8": 1245635072.0,
+            "9": 1245635072.0,
+            "10": 1245635072.0,
+            "11": 1245635072.0,
+            "12": 1245635072.0,
+            "13": 1245635072.0,
+            "14": 1245635072.0,
+            "15": 1245635072.0,
+            "16": 1245635072.0,
+            "17": 1245635072.0,
+            "18": 1245635072.0,
+            "19": 1245635072.0,
+            "20": 1245635072.0,
+            "21": 1245635072.0,
+            "22": 1245635072.0,
+            "23": 1245635072.0,
+            "24": 1245635072.0,
+            "25": 1245635072.0,
+            "26": 1245635072.0,
+            "27": 1245635072.0,
+            "28": 1245635072.0,
+            "29": 1245635072.0,
+            "30": 1245635072.0,
+            "31": 1245635072.0,
+            "32": 1245635072.0,
+            "33": 1245635072.0,
+            "34": 1245635072.0,
+            "35": 1245635072.0,
+            "36": 1245635072.0,
+            "37": 1245635072.0,
+            "38": 1245635072.0,
+            "39": 1245635072.0,
+            "40": 1245635072.0,
+            "41": 1245635072.0,
+            "42": 1245635072.0,
+            "43": 1245635072.0,
+            "44": 1245635072.0,
+            "45": 1245635072.0,
+            "46": 1245635072.0,
+            "47": 1245635072.0,
+            "48": 1245635072.0,
+            "49": 1245635072.0,
+            "50": 1245635072.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.86323,
-            "2": 0.13474,
-            "3": 0.1236,
-            "4": 0.12168,
-            "5": 0.12406,
-            "6": 0.12501,
-            "7": 0.12711,
-            "8": 0.12778,
-            "9": 0.12839,
-            "10": 0.12143,
-            "11": 0.12109,
-            "12": 0.12077,
-            "13": 0.11905,
-            "14": 0.12184,
-            "15": 0.12152,
-            "16": 0.11812,
-            "17": 0.11693,
-            "18": 0.11549,
-            "19": 0.11712,
-            "20": 0.11675,
-            "21": 0.11877,
-            "22": 0.11837,
-            "23": 0.11757,
-            "24": 0.11636,
-            "25": 0.11722,
-            "26": 0.12393,
-            "27": 0.11736,
-            "28": 0.11759,
-            "29": 0.11945,
-            "30": 0.11726,
-            "31": 0.12096,
-            "32": 0.12206,
-            "33": 0.11734,
-            "34": 0.11894,
-            "35": 0.11695,
-            "36": 0.11712,
-            "37": 0.11489,
-            "38": 0.11866,
-            "39": 0.11749,
-            "40": 0.11829,
-            "41": 0.11674,
-            "42": 0.1181,
-            "43": 0.11808,
-            "44": 0.11621,
-            "45": 0.11832,
-            "46": 0.12031,
-            "47": 0.12023,
-            "48": 0.11643,
-            "49": 0.11855,
-            "50": 0.11792
+            "1": 11.55479,
+            "2": 0.135,
+            "3": 0.11559,
+            "4": 0.10311,
+            "5": 0.10091,
+            "6": 0.10054,
+            "7": 0.10125,
+            "8": 0.10194,
+            "9": 0.10124,
+            "10": 0.10175,
+            "11": 0.10044,
+            "12": 0.10706,
+            "13": 0.10279,
+            "14": 0.10111,
+            "15": 0.10071,
+            "16": 0.10185,
+            "17": 0.10255,
+            "18": 0.10134,
+            "19": 0.10086,
+            "20": 0.10058,
+            "21": 0.10136,
+            "22": 0.09986,
+            "23": 0.10128,
+            "24": 0.1004,
+            "25": 0.10123,
+            "26": 0.10374,
+            "27": 0.09272,
+            "28": 0.09193,
+            "29": 0.09389,
+            "30": 0.09165,
+            "31": 0.09164,
+            "32": 0.09201,
+            "33": 0.09402,
+            "34": 0.09129,
+            "35": 0.09235,
+            "36": 0.09303,
+            "37": 0.09091,
+            "38": 0.09089,
+            "39": 0.09141,
+            "40": 0.09122,
+            "41": 0.0948,
+            "42": 0.09477,
+            "43": 0.09276,
+            "44": 0.09423,
+            "45": 0.09477,
+            "46": 0.09451,
+            "47": 0.0941,
+            "48": 0.0934,
+            "49": 0.09315,
+            "50": 0.09366
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..f9b157ad760
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 10.88155,
+            "27": 10.88649,
+            "28": 10.85679,
+            "29": 10.85657,
+            "30": 10.81423,
+            "31": 10.76651,
+            "32": 10.83131,
+            "33": 10.83158,
+            "34": 10.78071,
+            "35": 10.78865,
+            "36": 10.78003,
+            "37": 10.80446,
+            "38": 10.72434,
+            "39": 10.78066,
+            "40": 10.65927,
+            "41": 10.69209,
+            "42": 10.70974,
+            "43": 10.56129,
+            "44": 10.61371,
+            "45": 10.56874,
+            "46": 10.54454,
+            "47": 10.66751,
+            "48": 10.53792,
+            "49": 10.40859,
+            "50": 10.5542
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 22611358.0,
+            "27": 22532968.0,
+            "28": 22517794.0,
+            "29": 22593448.0,
+            "30": 22695256.0,
+            "31": 23019472.0,
+            "32": 22648896.0,
+            "33": 22622516.0,
+            "34": 22899620.0,
+            "35": 22851572.0,
+            "36": 22653160.0,
+            "37": 22560476.0,
+            "38": 22960058.0,
+            "39": 22865476.0,
+            "40": 22721690.0,
+            "41": 22723112.0,
+            "42": 22730692.0,
+            "43": 23039608.0,
+            "44": 22809964.0,
+            "45": 22738932.0,
+            "46": 22948360.0,
+            "47": 22696800.0,
+            "48": 22992776.0,
+            "49": 22791104.0,
+            "50": 22968342.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 389056000.0,
+            "27": 389056000.0,
+            "28": 389056000.0,
+            "29": 389056000.0,
+            "30": 389056000.0,
+            "31": 389056000.0,
+            "32": 389056000.0,
+            "33": 389056000.0,
+            "34": 389056000.0,
+            "35": 389056000.0,
+            "36": 389056000.0,
+            "37": 389056000.0,
+            "38": 389056000.0,
+            "39": 389056000.0,
+            "40": 389056000.0,
+            "41": 389056000.0,
+            "42": 389056000.0,
+            "43": 389056000.0,
+            "44": 389056000.0,
+            "45": 389056000.0,
+            "46": 389056000.0,
+            "47": 389056000.0,
+            "48": 389056000.0,
+            "49": 389056000.0,
+            "50": 389056000.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 1247206912.0,
+            "27": 1247207936.0,
+            "28": 1247207936.0,
+            "29": 1247207936.0,
+            "30": 1247207936.0,
+            "31": 1247207936.0,
+            "32": 1247207936.0,
+            "33": 1247207936.0,
+            "34": 1247207936.0,
+            "35": 1247207936.0,
+            "36": 1247207936.0,
+            "37": 1247207936.0,
+            "38": 1247207936.0,
+            "39": 1247207936.0,
+            "40": 1247207936.0,
+            "41": 1247207936.0,
+            "42": 1247207936.0,
+            "43": 1247207936.0,
+            "44": 1247207936.0,
+            "45": 1247207936.0,
+            "46": 1247207936.0,
+            "47": 1247207936.0,
+            "48": 1247207936.0,
+            "49": 1247207936.0,
+            "50": 1247207936.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": 8.47899,
+            "27": 0.12956,
+            "28": 0.10522,
+            "29": 0.09836,
+            "30": 0.09498,
+            "31": 0.09443,
+            "32": 0.09442,
+            "33": 0.09859,
+            "34": 0.09556,
+            "35": 0.0936,
+            "36": 0.0976,
+            "37": 0.09323,
+            "38": 0.09427,
+            "39": 0.09365,
+            "40": 0.09264,
+            "41": 0.09618,
+            "42": 0.09384,
+            "43": 0.0938,
+            "44": 0.09376,
+            "45": 0.093,
+            "46": 0.09376,
+            "47": 0.0942,
+            "48": 0.09416,
+            "49": 0.09367,
+            "50": 0.09361
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml
index 3d1b350ced0..d6384e7f604 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml
@@ -45,7 +45,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..caa1e54ee64
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86897,
+            "2": 10.88544,
+            "3": 10.86473,
+            "4": 10.86826,
+            "5": 10.87436,
+            "6": 10.89005,
+            "7": 10.87769,
+            "8": 10.86364,
+            "9": 10.88282,
+            "10": 10.84687,
+            "11": 10.87102,
+            "12": 10.87345,
+            "13": 10.8814,
+            "14": 10.8877,
+            "15": 10.83869,
+            "16": 10.8239,
+            "17": 10.80197,
+            "18": 10.81094,
+            "19": 10.82192,
+            "20": 10.71791,
+            "21": 10.68914,
+            "22": 10.57271,
+            "23": 10.7081,
+            "24": 10.59543,
+            "25": 10.55292,
+            "26": 10.61257,
+            "27": 10.60051,
+            "28": 10.56173,
+            "29": 10.58089,
+            "30": 10.35595,
+            "31": 10.1182,
+            "32": 10.44815,
+            "33": 10.4542,
+            "34": 10.21553,
+            "35": 10.26124,
+            "36": 10.20776,
+            "37": 10.33673,
+            "38": 10.17741,
+            "39": 10.39297,
+            "40": 10.06349,
+            "41": 10.13887,
+            "42": 10.2056,
+            "43": 9.82809,
+            "44": 9.94547,
+            "45": 9.82561,
+            "46": 9.80186,
+            "47": 10.14049,
+            "48": 9.84276,
+            "49": 9.52016,
+            "50": 9.88454,
+            "51": 9.84743,
+            "52": 9.74209,
+            "53": 10.05697,
+            "54": 9.9505,
+            "55": 9.88145,
+            "56": 9.61274,
+            "57": 9.4687,
+            "58": 9.82193,
+            "59": 9.57642,
+            "60": 9.49762,
+            "61": 9.69189,
+            "62": 9.9867,
+            "63": 9.37512,
+            "64": 9.76679,
+            "65": 8.94648,
+            "66": 9.7023,
+            "67": 9.36326,
+            "68": 9.7831,
+            "69": 9.7986,
+            "70": 9.7317,
+            "71": 9.62571,
+            "72": 9.58488,
+            "73": 9.48967,
+            "74": 8.9286,
+            "75": 9.40862,
+            "76": 9.07925,
+            "77": 10.0594,
+            "78": 9.72288,
+            "79": 9.37784,
+            "80": 9.40429,
+            "81": 9.48309,
+            "82": 9.7004,
+            "83": 9.31595,
+            "84": 9.41838,
+            "85": 9.61685,
+            "86": 9.07533,
+            "87": 9.59616,
+            "88": 9.75215,
+            "89": 9.60184,
+            "90": 9.82281,
+            "91": 9.34037,
+            "92": 9.35854,
+            "93": 9.08805,
+            "94": 8.83037,
+            "95": 9.5266,
+            "96": 9.53049,
+            "97": 9.30389,
+            "98": 9.67196,
+            "99": 8.89637,
+            "100": 9.40644
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1657.0,
+            "3": 1580.0,
+            "4": 1839.0,
+            "5": 1862.0,
+            "6": 1724.0,
+            "7": 1714.0,
+            "8": 1670.0,
+            "9": 1762.0,
+            "10": 1358.0,
+            "11": 1734.0,
+            "12": 1682.0,
+            "13": 1761.0,
+            "14": 1731.0,
+            "15": 1788.0,
+            "16": 1801.0,
+            "17": 1866.0,
+            "18": 1636.0,
+            "19": 1709.0,
+            "20": 1607.0,
+            "21": 1821.0,
+            "22": 1666.0,
+            "23": 1991.0,
+            "24": 1585.0,
+            "25": 1587.0,
+            "26": 1631.0,
+            "27": 1714.0,
+            "28": 1966.0,
+            "29": 1997.0,
+            "30": 1851.0,
+            "31": 1581.0,
+            "32": 1864.0,
+            "33": 2107.0,
+            "34": 1846.0,
+            "35": 1982.0,
+            "36": 1904.0,
+            "37": 2373.0,
+            "38": 2172.0,
+            "39": 2343.0,
+            "40": 2149.0,
+            "41": 2331.0,
+            "42": 2199.0,
+            "43": 1914.0,
+            "44": 2065.0,
+            "45": 2081.0,
+            "46": 2352.0,
+            "47": 2497.0,
+            "48": 2303.0,
+            "49": 2346.0,
+            "50": 2411.0,
+            "51": 2491.0,
+            "52": 2552.0,
+            "53": 2980.0,
+            "54": 2680.0,
+            "55": 2274.0,
+            "56": 2734.0,
+            "57": 2319.0,
+            "58": 2907.0,
+            "59": 2886.0,
+            "60": 2566.0,
+            "61": 2855.0,
+            "62": 2704.0,
+            "63": 2370.0,
+            "64": 2998.0,
+            "65": 2563.0,
+            "66": 2868.0,
+            "67": 2762.0,
+            "68": 2739.0,
+            "69": 2730.0,
+            "70": 3156.0,
+            "71": 2803.0,
+            "72": 2506.0,
+            "73": 2896.0,
+            "74": 1937.0,
+            "75": 2450.0,
+            "76": 2794.0,
+            "77": 3047.0,
+            "78": 3104.0,
+            "79": 3069.0,
+            "80": 3286.0,
+            "81": 3543.0,
+            "82": 3192.0,
+            "83": 2614.0,
+            "84": 3273.0,
+            "85": 3111.0,
+            "86": 2680.0,
+            "87": 3654.0,
+            "88": 3117.0,
+            "89": 3351.0,
+            "90": 3086.0,
+            "91": 2721.0,
+            "92": 3045.0,
+            "93": 2672.0,
+            "94": 3326.0,
+            "95": 3125.0,
+            "96": 3309.0,
+            "97": 3208.0,
+            "98": 3572.0,
+            "99": 2980.0,
+            "100": 3355.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 514359808.0,
+            "2": 514359808.0,
+            "3": 514359808.0,
+            "4": 514359808.0,
+            "5": 514359808.0,
+            "6": 514359808.0,
+            "7": 514359808.0,
+            "8": 514359808.0,
+            "9": 514359808.0,
+            "10": 514359808.0,
+            "11": 514359808.0,
+            "12": 514359808.0,
+            "13": 514359808.0,
+            "14": 514359808.0,
+            "15": 514359808.0,
+            "16": 514359808.0,
+            "17": 514359808.0,
+            "18": 514359808.0,
+            "19": 514359808.0,
+            "20": 514359808.0,
+            "21": 514359808.0,
+            "22": 514359808.0,
+            "23": 514359808.0,
+            "24": 514359808.0,
+            "25": 514359808.0,
+            "26": 514359808.0,
+            "27": 514359808.0,
+            "28": 514359808.0,
+            "29": 514359808.0,
+            "30": 514359808.0,
+            "31": 514359808.0,
+            "32": 514359808.0,
+            "33": 514359808.0,
+            "34": 514359808.0,
+            "35": 514359808.0,
+            "36": 514359808.0,
+            "37": 514359808.0,
+            "38": 514359808.0,
+            "39": 514359808.0,
+            "40": 514359808.0,
+            "41": 514359808.0,
+            "42": 514359808.0,
+            "43": 514359808.0,
+            "44": 514359808.0,
+            "45": 514359808.0,
+            "46": 514359808.0,
+            "47": 514359808.0,
+            "48": 514359808.0,
+            "49": 514359808.0,
+            "50": 514359808.0,
+            "51": 514359808.0,
+            "52": 514359808.0,
+            "53": 514359808.0,
+            "54": 514359808.0,
+            "55": 514359808.0,
+            "56": 514359808.0,
+            "57": 514359808.0,
+            "58": 514359808.0,
+            "59": 514359808.0,
+            "60": 514359808.0,
+            "61": 514359808.0,
+            "62": 514359808.0,
+            "63": 514359808.0,
+            "64": 514359808.0,
+            "65": 514359808.0,
+            "66": 514359808.0,
+            "67": 514359808.0,
+            "68": 514359808.0,
+            "69": 514359808.0,
+            "70": 514359808.0,
+            "71": 514359808.0,
+            "72": 514359808.0,
+            "73": 514359808.0,
+            "74": 514359808.0,
+            "75": 514359808.0,
+            "76": 514359808.0,
+            "77": 514359808.0,
+            "78": 514359808.0,
+            "79": 514359808.0,
+            "80": 514359808.0,
+            "81": 514359808.0,
+            "82": 514359808.0,
+            "83": 514359808.0,
+            "84": 514359808.0,
+            "85": 514359808.0,
+            "86": 514359808.0,
+            "87": 514359808.0,
+            "88": 514359808.0,
+            "89": 514359808.0,
+            "90": 514359808.0,
+            "91": 514359808.0,
+            "92": 514359808.0,
+            "93": 514359808.0,
+            "94": 514359808.0,
+            "95": 514359808.0,
+            "96": 514359808.0,
+            "97": 514359808.0,
+            "98": 514359808.0,
+            "99": 514359808.0,
+            "100": 514359808.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1258060288.0,
+            "2": 1437084160.0,
+            "3": 1437084160.0,
+            "4": 1437084160.0,
+            "5": 1437084160.0,
+            "6": 1437084160.0,
+            "7": 1437084160.0,
+            "8": 1437084160.0,
+            "9": 1437084160.0,
+            "10": 1437084160.0,
+            "11": 1437084160.0,
+            "12": 1437084160.0,
+            "13": 1437084160.0,
+            "14": 1437084160.0,
+            "15": 1437084160.0,
+            "16": 1437084160.0,
+            "17": 1437084160.0,
+            "18": 1437084160.0,
+            "19": 1437084160.0,
+            "20": 1437084160.0,
+            "21": 1437084160.0,
+            "22": 1437084160.0,
+            "23": 1437084160.0,
+            "24": 1437084160.0,
+            "25": 1437084160.0,
+            "26": 1437084160.0,
+            "27": 1437084160.0,
+            "28": 1437084160.0,
+            "29": 1437084160.0,
+            "30": 1437084160.0,
+            "31": 1437084160.0,
+            "32": 1437084160.0,
+            "33": 1437084160.0,
+            "34": 1437084160.0,
+            "35": 1437084160.0,
+            "36": 1437084160.0,
+            "37": 1437084160.0,
+            "38": 1437084160.0,
+            "39": 1437084160.0,
+            "40": 1437084160.0,
+            "41": 1437084160.0,
+            "42": 1437084160.0,
+            "43": 1437084160.0,
+            "44": 1437084160.0,
+            "45": 1437084160.0,
+            "46": 1437084160.0,
+            "47": 1437084160.0,
+            "48": 1437084160.0,
+            "49": 1437084160.0,
+            "50": 1437084160.0,
+            "51": 1437084160.0,
+            "52": 1437084160.0,
+            "53": 1437084160.0,
+            "54": 1437084160.0,
+            "55": 1437084160.0,
+            "56": 1437084160.0,
+            "57": 1437084160.0,
+            "58": 1437084160.0,
+            "59": 1437084160.0,
+            "60": 1437084160.0,
+            "61": 1437084160.0,
+            "62": 1437084160.0,
+            "63": 1437084160.0,
+            "64": 1437084160.0,
+            "65": 1437084160.0,
+            "66": 1437084160.0,
+            "67": 1437084160.0,
+            "68": 1437084160.0,
+            "69": 1437084160.0,
+            "70": 1437084160.0,
+            "71": 1437084160.0,
+            "72": 1437084160.0,
+            "73": 1437084160.0,
+            "74": 1437084160.0,
+            "75": 1437084160.0,
+            "76": 1437084160.0,
+            "77": 1437084160.0,
+            "78": 1437084160.0,
+            "79": 1437084160.0,
+            "80": 1437084160.0,
+            "81": 1437084160.0,
+            "82": 1437084160.0,
+            "83": 1437084160.0,
+            "84": 1437084160.0,
+            "85": 1437084160.0,
+            "86": 1437084160.0,
+            "87": 1437084160.0,
+            "88": 1437084160.0,
+            "89": 1437084160.0,
+            "90": 1437084160.0,
+            "91": 1437084160.0,
+            "92": 1437084160.0,
+            "93": 1437084160.0,
+            "94": 1437084160.0,
+            "95": 1437084160.0,
+            "96": 1437084160.0,
+            "97": 1437084160.0,
+            "98": 1437084160.0,
+            "99": 1437084160.0,
+            "100": 1437084160.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.48366,
+            "3": 0.20961,
+            "4": 0.19355,
+            "5": 0.19146,
+            "6": 0.19108,
+            "7": 0.19236,
+            "8": 0.19259,
+            "9": 0.19267,
+            "10": 0.19436,
+            "11": 0.19257,
+            "12": 0.19432,
+            "13": 0.19332,
+            "14": 0.19442,
+            "15": 0.19393,
+            "16": 0.19417,
+            "17": 0.19555,
+            "18": 0.19451,
+            "19": 0.19452,
+            "20": 0.19555,
+            "21": 0.19375,
+            "22": 0.19402,
+            "23": 0.19539,
+            "24": 0.19475,
+            "25": 0.19576,
+            "26": 0.19424,
+            "27": 0.19514,
+            "28": 0.19519,
+            "29": 0.19578,
+            "30": 0.19503,
+            "31": 0.19394,
+            "32": 0.19582,
+            "33": 0.19444,
+            "34": 0.19405,
+            "35": 0.19498,
+            "36": 0.19463,
+            "37": 0.19572,
+            "38": 0.19362,
+            "39": 0.19492,
+            "40": 0.19487,
+            "41": 0.19497,
+            "42": 0.19617,
+            "43": 0.19571,
+            "44": 0.19661,
+            "45": 0.19634,
+            "46": 0.19537,
+            "47": 0.19646,
+            "48": 0.19658,
+            "49": 0.19727,
+            "50": 0.19567,
+            "51": 0.21203,
+            "52": 0.19551,
+            "53": 0.19415,
+            "54": 0.19434,
+            "55": 0.19584,
+            "56": 0.19437,
+            "57": 0.19536,
+            "58": 0.20364,
+            "59": 0.20029,
+            "60": 0.1929,
+            "61": 0.19274,
+            "62": 0.19364,
+            "63": 0.19667,
+            "64": 0.19406,
+            "65": 0.19781,
+            "66": 0.19435,
+            "67": 0.19308,
+            "68": 0.1932,
+            "69": 0.19478,
+            "70": 0.19591,
+            "71": 0.19922,
+            "72": 0.19646,
+            "73": 0.19646,
+            "74": 0.19739,
+            "75": 0.19817,
+            "76": 0.20056,
+            "77": 0.19655,
+            "78": 0.19459,
+            "79": 0.19478,
+            "80": 0.19638,
+            "81": 0.19329,
+            "82": 0.19254,
+            "83": 0.19379,
+            "84": 0.19435,
+            "85": 0.19517,
+            "86": 0.19446,
+            "87": 0.19464,
+            "88": 0.19501,
+            "89": 0.19544,
+            "90": 0.19268,
+            "91": 0.19425,
+            "92": 0.1933,
+            "93": 0.19366,
+            "94": 0.19328,
+            "95": 0.19408,
+            "96": 0.19474,
+            "97": 0.19719,
+            "98": 0.19535,
+            "99": 0.19604,
+            "100": 0.19554
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..848d772bc72
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84743,
+            "52": 9.74209,
+            "53": 10.05697,
+            "54": 9.9505,
+            "55": 9.88145,
+            "56": 9.61274,
+            "57": 9.4687,
+            "58": 9.82193,
+            "59": 9.57642,
+            "60": 9.49762,
+            "61": 9.69189,
+            "62": 9.9867,
+            "63": 9.37512,
+            "64": 9.76679,
+            "65": 8.94648,
+            "66": 9.7023,
+            "67": 9.36326,
+            "68": 9.7831,
+            "69": 9.7986,
+            "70": 9.7317,
+            "71": 9.62571,
+            "72": 9.58488,
+            "73": 9.48967,
+            "74": 8.9286,
+            "75": 9.40862,
+            "76": 9.07925,
+            "77": 10.0594,
+            "78": 9.72288,
+            "79": 9.37784,
+            "80": 9.40429,
+            "81": 9.48309,
+            "82": 9.7004,
+            "83": 9.31595,
+            "84": 9.41838,
+            "85": 9.61685,
+            "86": 9.07533,
+            "87": 9.59616,
+            "88": 9.75215,
+            "89": 9.60184,
+            "90": 9.82281,
+            "91": 9.34037,
+            "92": 9.35854,
+            "93": 9.08805,
+            "94": 8.83037,
+            "95": 9.5266,
+            "96": 9.53049,
+            "97": 9.30389,
+            "98": 9.67196,
+            "99": 8.89637,
+            "100": 9.40644
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2491.0,
+            "52": 2552.0,
+            "53": 2980.0,
+            "54": 2680.0,
+            "55": 2274.0,
+            "56": 2734.0,
+            "57": 2319.0,
+            "58": 2907.0,
+            "59": 2886.0,
+            "60": 2566.0,
+            "61": 2855.0,
+            "62": 2704.0,
+            "63": 2370.0,
+            "64": 2998.0,
+            "65": 2563.0,
+            "66": 2868.0,
+            "67": 2762.0,
+            "68": 2739.0,
+            "69": 2730.0,
+            "70": 3156.0,
+            "71": 2803.0,
+            "72": 2506.0,
+            "73": 2896.0,
+            "74": 1937.0,
+            "75": 2450.0,
+            "76": 2794.0,
+            "77": 3047.0,
+            "78": 3104.0,
+            "79": 3069.0,
+            "80": 3286.0,
+            "81": 3543.0,
+            "82": 3192.0,
+            "83": 2614.0,
+            "84": 3273.0,
+            "85": 3111.0,
+            "86": 2680.0,
+            "87": 3654.0,
+            "88": 3117.0,
+            "89": 3351.0,
+            "90": 3086.0,
+            "91": 2721.0,
+            "92": 3045.0,
+            "93": 2672.0,
+            "94": 3326.0,
+            "95": 3125.0,
+            "96": 3309.0,
+            "97": 3208.0,
+            "98": 3572.0,
+            "99": 2980.0,
+            "100": 3355.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 515933696.0,
+            "52": 515933696.0,
+            "53": 515933696.0,
+            "54": 515933696.0,
+            "55": 515933696.0,
+            "56": 515933696.0,
+            "57": 515933696.0,
+            "58": 515933696.0,
+            "59": 515933696.0,
+            "60": 515933696.0,
+            "61": 515933696.0,
+            "62": 515933696.0,
+            "63": 515933696.0,
+            "64": 515933696.0,
+            "65": 515933696.0,
+            "66": 515933696.0,
+            "67": 515933696.0,
+            "68": 515933696.0,
+            "69": 515933696.0,
+            "70": 515933696.0,
+            "71": 515933696.0,
+            "72": 515933696.0,
+            "73": 515933696.0,
+            "74": 515933696.0,
+            "75": 515933696.0,
+            "76": 515933696.0,
+            "77": 515933696.0,
+            "78": 515933696.0,
+            "79": 515933696.0,
+            "80": 515933696.0,
+            "81": 515933696.0,
+            "82": 515933696.0,
+            "83": 515933696.0,
+            "84": 515933696.0,
+            "85": 515933696.0,
+            "86": 515933696.0,
+            "87": 515933696.0,
+            "88": 515933696.0,
+            "89": 515933696.0,
+            "90": 515933696.0,
+            "91": 515933696.0,
+            "92": 515933696.0,
+            "93": 515933696.0,
+            "94": 515933696.0,
+            "95": 515933696.0,
+            "96": 515933696.0,
+            "97": 515933696.0,
+            "98": 515933696.0,
+            "99": 515933696.0,
+            "100": 515933696.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1440754176.0,
+            "52": 1440755200.0,
+            "53": 1440755200.0,
+            "54": 1440755200.0,
+            "55": 1440755200.0,
+            "56": 1440755200.0,
+            "57": 1440755200.0,
+            "58": 1440755200.0,
+            "59": 1440755200.0,
+            "60": 1440755200.0,
+            "61": 1440755200.0,
+            "62": 1440755200.0,
+            "63": 1440755200.0,
+            "64": 1440755200.0,
+            "65": 1440755200.0,
+            "66": 1440755200.0,
+            "67": 1440755200.0,
+            "68": 1440755200.0,
+            "69": 1440755200.0,
+            "70": 1440755200.0,
+            "71": 1440755200.0,
+            "72": 1440755200.0,
+            "73": 1440755200.0,
+            "74": 1440755200.0,
+            "75": 1440755200.0,
+            "76": 1440755200.0,
+            "77": 1440755200.0,
+            "78": 1440755200.0,
+            "79": 1440755200.0,
+            "80": 1440755200.0,
+            "81": 1440755200.0,
+            "82": 1440755200.0,
+            "83": 1440755200.0,
+            "84": 1440755200.0,
+            "85": 1440755200.0,
+            "86": 1440755200.0,
+            "87": 1440755200.0,
+            "88": 1440755200.0,
+            "89": 1440755200.0,
+            "90": 1440755200.0,
+            "91": 1440755200.0,
+            "92": 1440755200.0,
+            "93": 1440755200.0,
+            "94": 1440755200.0,
+            "95": 1440755200.0,
+            "96": 1440755200.0,
+            "97": 1440755200.0,
+            "98": 1440755200.0,
+            "99": 1440755200.0,
+            "100": 1440755200.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.50794,
+            "53": 0.19671,
+            "54": 0.18376,
+            "55": 0.18594,
+            "56": 0.18674,
+            "57": 0.18474,
+            "58": 0.18412,
+            "59": 0.18456,
+            "60": 0.18456,
+            "61": 0.18623,
+            "62": 0.18524,
+            "63": 0.18624,
+            "64": 0.18621,
+            "65": 0.18695,
+            "66": 0.18541,
+            "67": 0.1857,
+            "68": 0.18575,
+            "69": 0.18658,
+            "70": 0.1875,
+            "71": 0.18753,
+            "72": 0.18718,
+            "73": 0.18797,
+            "74": 0.18972,
+            "75": 0.18765,
+            "76": 0.18764,
+            "77": 0.18827,
+            "78": 0.18801,
+            "79": 0.18785,
+            "80": 0.18903,
+            "81": 0.18889,
+            "82": 0.18772,
+            "83": 0.18876,
+            "84": 0.18791,
+            "85": 0.18973,
+            "86": 0.18948,
+            "87": 0.18998,
+            "88": 0.18905,
+            "89": 0.1898,
+            "90": 0.1895,
+            "91": 0.18953,
+            "92": 0.18969,
+            "93": 0.18888,
+            "94": 0.18888,
+            "95": 0.18773,
+            "96": 0.18832,
+            "97": 0.18919,
+            "98": 0.189,
+            "99": 0.1888,
+            "100": 0.188
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
index 7c1078c0b3d..aab9c0cb891 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 517505536.0,
-            "2": 517505536.0,
-            "3": 517505536.0,
-            "4": 517505536.0,
-            "5": 517505536.0,
-            "6": 517505536.0,
-            "7": 517505536.0,
-            "8": 517505536.0,
-            "9": 517505536.0,
-            "10": 517505536.0,
-            "11": 517505536.0,
-            "12": 517505536.0,
-            "13": 517505536.0,
-            "14": 517505536.0,
-            "15": 517505536.0,
-            "16": 517505536.0,
-            "17": 517505536.0,
-            "18": 517505536.0,
-            "19": 517505536.0,
-            "20": 517505536.0,
-            "21": 517505536.0,
-            "22": 517505536.0,
-            "23": 517505536.0,
-            "24": 517505536.0,
-            "25": 517505536.0,
-            "26": 517505536.0,
-            "27": 517505536.0,
-            "28": 517505536.0,
-            "29": 517505536.0,
-            "30": 517505536.0,
-            "31": 517505536.0,
-            "32": 517505536.0,
-            "33": 517505536.0,
-            "34": 517505536.0,
-            "35": 517505536.0,
-            "36": 517505536.0,
-            "37": 517505536.0,
-            "38": 517505536.0,
-            "39": 517505536.0,
-            "40": 517505536.0,
-            "41": 517505536.0,
-            "42": 517505536.0,
-            "43": 517505536.0,
-            "44": 517505536.0,
-            "45": 517505536.0,
-            "46": 517505536.0,
-            "47": 517505536.0,
-            "48": 517505536.0,
-            "49": 517505536.0,
-            "50": 517505536.0,
-            "51": 517505536.0,
-            "52": 517505536.0,
-            "53": 517505536.0,
-            "54": 517505536.0,
-            "55": 517505536.0,
-            "56": 517505536.0,
-            "57": 517505536.0,
-            "58": 517505536.0,
-            "59": 517505536.0,
-            "60": 517505536.0,
-            "61": 517505536.0,
-            "62": 517505536.0,
-            "63": 517505536.0,
-            "64": 517505536.0,
-            "65": 517505536.0,
-            "66": 517505536.0,
-            "67": 517505536.0,
-            "68": 517505536.0,
-            "69": 517505536.0,
-            "70": 517505536.0,
-            "71": 517505536.0,
-            "72": 517505536.0,
-            "73": 517505536.0,
-            "74": 517505536.0,
-            "75": 517505536.0,
-            "76": 517505536.0,
-            "77": 517505536.0,
-            "78": 517505536.0,
-            "79": 517505536.0,
-            "80": 517505536.0,
-            "81": 517505536.0,
-            "82": 517505536.0,
-            "83": 517505536.0,
-            "84": 517505536.0,
-            "85": 517505536.0,
-            "86": 517505536.0,
-            "87": 517505536.0,
-            "88": 517505536.0,
-            "89": 517505536.0,
-            "90": 517505536.0,
-            "91": 517505536.0,
-            "92": 517505536.0,
-            "93": 517505536.0,
-            "94": 517505536.0,
-            "95": 517505536.0,
-            "96": 517505536.0,
-            "97": 517505536.0,
-            "98": 517505536.0,
-            "99": 517505536.0,
-            "100": 517505536.0
+            "1": 516456960.0,
+            "2": 516456960.0,
+            "3": 516456960.0,
+            "4": 516456960.0,
+            "5": 516456960.0,
+            "6": 516456960.0,
+            "7": 516456960.0,
+            "8": 516456960.0,
+            "9": 516456960.0,
+            "10": 516456960.0,
+            "11": 516456960.0,
+            "12": 516456960.0,
+            "13": 516456960.0,
+            "14": 516456960.0,
+            "15": 516456960.0,
+            "16": 516456960.0,
+            "17": 516456960.0,
+            "18": 516456960.0,
+            "19": 516456960.0,
+            "20": 516456960.0,
+            "21": 516456960.0,
+            "22": 516456960.0,
+            "23": 516456960.0,
+            "24": 516456960.0,
+            "25": 516456960.0,
+            "26": 516456960.0,
+            "27": 516456960.0,
+            "28": 516456960.0,
+            "29": 516456960.0,
+            "30": 516456960.0,
+            "31": 516456960.0,
+            "32": 516456960.0,
+            "33": 516456960.0,
+            "34": 516456960.0,
+            "35": 516456960.0,
+            "36": 516456960.0,
+            "37": 516456960.0,
+            "38": 516456960.0,
+            "39": 516456960.0,
+            "40": 516456960.0,
+            "41": 516456960.0,
+            "42": 516456960.0,
+            "43": 516456960.0,
+            "44": 516456960.0,
+            "45": 516456960.0,
+            "46": 516456960.0,
+            "47": 516456960.0,
+            "48": 516456960.0,
+            "49": 516456960.0,
+            "50": 516456960.0,
+            "51": 516456960.0,
+            "52": 516456960.0,
+            "53": 516456960.0,
+            "54": 516456960.0,
+            "55": 516456960.0,
+            "56": 516456960.0,
+            "57": 516456960.0,
+            "58": 516456960.0,
+            "59": 516456960.0,
+            "60": 516456960.0,
+            "61": 516456960.0,
+            "62": 516456960.0,
+            "63": 516456960.0,
+            "64": 516456960.0,
+            "65": 516456960.0,
+            "66": 516456960.0,
+            "67": 516456960.0,
+            "68": 516456960.0,
+            "69": 516456960.0,
+            "70": 516456960.0,
+            "71": 516456960.0,
+            "72": 516456960.0,
+            "73": 516456960.0,
+            "74": 516456960.0,
+            "75": 516456960.0,
+            "76": 516456960.0,
+            "77": 516456960.0,
+            "78": 516456960.0,
+            "79": 516456960.0,
+            "80": 516456960.0,
+            "81": 516456960.0,
+            "82": 516456960.0,
+            "83": 516456960.0,
+            "84": 516456960.0,
+            "85": 516456960.0,
+            "86": 516456960.0,
+            "87": 516456960.0,
+            "88": 516456960.0,
+            "89": 516456960.0,
+            "90": 516456960.0,
+            "91": 516456960.0,
+            "92": 516456960.0,
+            "93": 516456960.0,
+            "94": 516456960.0,
+            "95": 516456960.0,
+            "96": 516456960.0,
+            "97": 516456960.0,
+            "98": 516456960.0,
+            "99": 516456960.0,
+            "100": 516456960.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1246524928.0,
-            "2": 1428695552.0,
-            "3": 1428695552.0,
-            "4": 1428695552.0,
-            "5": 1428695552.0,
-            "6": 1428695552.0,
-            "7": 1428695552.0,
-            "8": 1428695552.0,
-            "9": 1428695552.0,
-            "10": 1428695552.0,
-            "11": 1428695552.0,
-            "12": 1428695552.0,
-            "13": 1428695552.0,
-            "14": 1428695552.0,
-            "15": 1428695552.0,
-            "16": 1428695552.0,
-            "17": 1428695552.0,
-            "18": 1428695552.0,
-            "19": 1428695552.0,
-            "20": 1428695552.0,
-            "21": 1428695552.0,
-            "22": 1428695552.0,
-            "23": 1428695552.0,
-            "24": 1428695552.0,
-            "25": 1428695552.0,
-            "26": 1428695552.0,
-            "27": 1428695552.0,
-            "28": 1428695552.0,
-            "29": 1428695552.0,
-            "30": 1428695552.0,
-            "31": 1428695552.0,
-            "32": 1428695552.0,
-            "33": 1428695552.0,
-            "34": 1428695552.0,
-            "35": 1428695552.0,
-            "36": 1428695552.0,
-            "37": 1428695552.0,
-            "38": 1428695552.0,
-            "39": 1428695552.0,
-            "40": 1428695552.0,
-            "41": 1428695552.0,
-            "42": 1428695552.0,
-            "43": 1428695552.0,
-            "44": 1428695552.0,
-            "45": 1428695552.0,
-            "46": 1428695552.0,
-            "47": 1428695552.0,
-            "48": 1428695552.0,
-            "49": 1428695552.0,
-            "50": 1428695552.0,
-            "51": 1428695552.0,
-            "52": 1428695552.0,
-            "53": 1428695552.0,
-            "54": 1428695552.0,
-            "55": 1428695552.0,
-            "56": 1428695552.0,
-            "57": 1428695552.0,
-            "58": 1428695552.0,
-            "59": 1428695552.0,
-            "60": 1428695552.0,
-            "61": 1428695552.0,
-            "62": 1428695552.0,
-            "63": 1428695552.0,
-            "64": 1428695552.0,
-            "65": 1428695552.0,
-            "66": 1428695552.0,
-            "67": 1428695552.0,
-            "68": 1428695552.0,
-            "69": 1428695552.0,
-            "70": 1428695552.0,
-            "71": 1428695552.0,
-            "72": 1428695552.0,
-            "73": 1428695552.0,
-            "74": 1428695552.0,
-            "75": 1428695552.0,
-            "76": 1428695552.0,
-            "77": 1428695552.0,
-            "78": 1428695552.0,
-            "79": 1428695552.0,
-            "80": 1428695552.0,
-            "81": 1428695552.0,
-            "82": 1428695552.0,
-            "83": 1428695552.0,
-            "84": 1428695552.0,
-            "85": 1428695552.0,
-            "86": 1428695552.0,
-            "87": 1428695552.0,
-            "88": 1428695552.0,
-            "89": 1428695552.0,
-            "90": 1428695552.0,
-            "91": 1428695552.0,
-            "92": 1428695552.0,
-            "93": 1428695552.0,
-            "94": 1428695552.0,
-            "95": 1428695552.0,
-            "96": 1428695552.0,
-            "97": 1428695552.0,
-            "98": 1428695552.0,
-            "99": 1428695552.0,
-            "100": 1428695552.0
+            "1": 1246525952.0,
+            "2": 1426598400.0,
+            "3": 1426598400.0,
+            "4": 1426598400.0,
+            "5": 1426598400.0,
+            "6": 1426598400.0,
+            "7": 1426598400.0,
+            "8": 1426598400.0,
+            "9": 1426598400.0,
+            "10": 1426598400.0,
+            "11": 1426598400.0,
+            "12": 1426598400.0,
+            "13": 1426598400.0,
+            "14": 1426598400.0,
+            "15": 1426598400.0,
+            "16": 1426598400.0,
+            "17": 1426598400.0,
+            "18": 1426598400.0,
+            "19": 1426598400.0,
+            "20": 1426598400.0,
+            "21": 1426598400.0,
+            "22": 1426598400.0,
+            "23": 1426598400.0,
+            "24": 1426598400.0,
+            "25": 1426598400.0,
+            "26": 1426598400.0,
+            "27": 1426598400.0,
+            "28": 1426598400.0,
+            "29": 1426598400.0,
+            "30": 1426598400.0,
+            "31": 1426598400.0,
+            "32": 1426598400.0,
+            "33": 1426598400.0,
+            "34": 1426598400.0,
+            "35": 1426598400.0,
+            "36": 1426598400.0,
+            "37": 1426598400.0,
+            "38": 1426598400.0,
+            "39": 1426598400.0,
+            "40": 1426598400.0,
+            "41": 1426598400.0,
+            "42": 1426598400.0,
+            "43": 1426598400.0,
+            "44": 1426598400.0,
+            "45": 1426598400.0,
+            "46": 1426598400.0,
+            "47": 1426598400.0,
+            "48": 1426598400.0,
+            "49": 1426598400.0,
+            "50": 1426598400.0,
+            "51": 1426598400.0,
+            "52": 1426598400.0,
+            "53": 1426598400.0,
+            "54": 1426598400.0,
+            "55": 1426598400.0,
+            "56": 1426598400.0,
+            "57": 1426598400.0,
+            "58": 1426598400.0,
+            "59": 1426598400.0,
+            "60": 1426598400.0,
+            "61": 1426598400.0,
+            "62": 1426598400.0,
+            "63": 1426598400.0,
+            "64": 1426598400.0,
+            "65": 1426598400.0,
+            "66": 1426598400.0,
+            "67": 1426598400.0,
+            "68": 1426598400.0,
+            "69": 1426598400.0,
+            "70": 1426598400.0,
+            "71": 1426598400.0,
+            "72": 1426598400.0,
+            "73": 1426598400.0,
+            "74": 1426598400.0,
+            "75": 1426598400.0,
+            "76": 1426598400.0,
+            "77": 1426598400.0,
+            "78": 1426598400.0,
+            "79": 1426598400.0,
+            "80": 1426598400.0,
+            "81": 1426598400.0,
+            "82": 1426598400.0,
+            "83": 1426598400.0,
+            "84": 1426598400.0,
+            "85": 1426598400.0,
+            "86": 1426598400.0,
+            "87": 1426598400.0,
+            "88": 1426598400.0,
+            "89": 1426598400.0,
+            "90": 1426598400.0,
+            "91": 1426598400.0,
+            "92": 1426598400.0,
+            "93": 1426598400.0,
+            "94": 1426598400.0,
+            "95": 1426598400.0,
+            "96": 1426598400.0,
+            "97": 1426598400.0,
+            "98": 1426598400.0,
+            "99": 1426598400.0,
+            "100": 1426598400.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.96359,
-            "2": 0.17007,
-            "3": 0.15511,
-            "4": 0.15439,
-            "5": 0.15477,
-            "6": 0.15459,
-            "7": 0.15427,
-            "8": 0.15173,
-            "9": 0.15484,
-            "10": 0.15363,
-            "11": 0.15353,
-            "12": 0.15567,
-            "13": 0.15258,
-            "14": 0.15438,
-            "15": 0.15305,
-            "16": 0.15314,
-            "17": 0.15342,
-            "18": 0.15282,
-            "19": 0.15336,
-            "20": 0.15333,
-            "21": 0.15174,
-            "22": 0.15412,
-            "23": 0.15337,
-            "24": 0.15464,
-            "25": 0.15638,
-            "26": 0.15618,
-            "27": 0.15599,
-            "28": 0.15616,
-            "29": 0.15792,
-            "30": 0.15422,
-            "31": 0.15441,
-            "32": 0.15356,
-            "33": 0.15622,
-            "34": 0.15397,
-            "35": 0.15443,
-            "36": 0.15392,
-            "37": 0.15454,
-            "38": 0.15581,
-            "39": 0.15513,
-            "40": 0.15813,
-            "41": 0.1595,
-            "42": 0.15604,
-            "43": 0.15809,
-            "44": 0.15585,
-            "45": 0.15659,
-            "46": 0.15599,
-            "47": 0.15378,
-            "48": 0.15475,
-            "49": 0.1544,
-            "50": 0.15569,
-            "51": 0.16391,
-            "52": 0.16196,
-            "53": 0.16029,
-            "54": 0.16138,
-            "55": 0.15673,
-            "56": 0.1503,
-            "57": 0.15071,
-            "58": 0.15268,
-            "59": 0.15095,
-            "60": 0.15189,
-            "61": 0.15199,
-            "62": 0.14938,
-            "63": 0.15046,
-            "64": 0.14924,
-            "65": 0.15129,
-            "66": 0.14938,
-            "67": 0.15233,
-            "68": 0.15028,
-            "69": 0.1525,
-            "70": 0.15334,
-            "71": 0.15152,
-            "72": 0.15138,
-            "73": 0.15304,
-            "74": 0.1515,
-            "75": 0.15282,
-            "76": 0.1518,
-            "77": 0.15193,
-            "78": 0.15262,
-            "79": 0.15274,
-            "80": 0.15251,
-            "81": 0.15108,
-            "82": 0.15199,
-            "83": 0.15046,
-            "84": 0.15298,
-            "85": 0.15063,
-            "86": 0.15132,
-            "87": 0.15257,
-            "88": 0.15109,
-            "89": 0.1502,
-            "90": 0.15259,
-            "91": 0.15063,
-            "92": 0.15237,
-            "93": 0.15096,
-            "94": 0.1517,
-            "95": 0.15049,
-            "96": 0.15002,
-            "97": 0.15011,
-            "98": 0.15349,
-            "99": 0.1565,
-            "100": 0.15223
+            "1": 8.65189,
+            "2": 0.17932,
+            "3": 0.14636,
+            "4": 0.12538,
+            "5": 0.12402,
+            "6": 0.12459,
+            "7": 0.12481,
+            "8": 0.12323,
+            "9": 0.12314,
+            "10": 0.12506,
+            "11": 0.1247,
+            "12": 0.124,
+            "13": 0.12299,
+            "14": 0.12337,
+            "15": 0.12552,
+            "16": 0.12432,
+            "17": 0.12285,
+            "18": 0.1235,
+            "19": 0.12341,
+            "20": 0.12389,
+            "21": 0.12311,
+            "22": 0.12402,
+            "23": 0.12319,
+            "24": 0.12321,
+            "25": 0.12382,
+            "26": 0.12336,
+            "27": 0.12353,
+            "28": 0.12251,
+            "29": 0.12528,
+            "30": 0.12437,
+            "31": 0.12503,
+            "32": 0.12365,
+            "33": 0.1224,
+            "34": 0.12436,
+            "35": 0.12606,
+            "36": 0.12382,
+            "37": 0.12451,
+            "38": 0.12292,
+            "39": 0.1228,
+            "40": 0.12355,
+            "41": 0.12426,
+            "42": 0.12483,
+            "43": 0.12585,
+            "44": 0.12964,
+            "45": 0.12442,
+            "46": 0.12437,
+            "47": 0.12371,
+            "48": 0.12305,
+            "49": 0.12517,
+            "50": 0.12295,
+            "51": 0.14312,
+            "52": 0.1306,
+            "53": 0.12394,
+            "54": 0.12469,
+            "55": 0.12368,
+            "56": 0.12394,
+            "57": 0.12303,
+            "58": 0.12356,
+            "59": 0.12328,
+            "60": 0.12317,
+            "61": 0.12286,
+            "62": 0.12321,
+            "63": 0.12386,
+            "64": 0.12303,
+            "65": 0.12369,
+            "66": 0.12284,
+            "67": 0.12276,
+            "68": 0.1233,
+            "69": 0.12275,
+            "70": 0.12331,
+            "71": 0.12204,
+            "72": 0.12226,
+            "73": 0.12258,
+            "74": 0.12222,
+            "75": 0.12284,
+            "76": 0.12277,
+            "77": 0.12539,
+            "78": 0.12356,
+            "79": 0.1224,
+            "80": 0.12283,
+            "81": 0.12341,
+            "82": 0.12375,
+            "83": 0.1222,
+            "84": 0.12248,
+            "85": 0.12367,
+            "86": 0.12361,
+            "87": 0.12373,
+            "88": 0.124,
+            "89": 0.1217,
+            "90": 0.12316,
+            "91": 0.12421,
+            "92": 0.12415,
+            "93": 0.1244,
+            "94": 0.12547,
+            "95": 0.12292,
+            "96": 0.12216,
+            "97": 0.12313,
+            "98": 0.12301,
+            "99": 0.1248,
+            "100": 0.12337
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..f8f216592e7
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8558,
+            "52": 9.75237,
+            "53": 10.07589,
+            "54": 9.95688,
+            "55": 9.88203,
+            "56": 9.6313,
+            "57": 9.48649,
+            "58": 9.83109,
+            "59": 9.58897,
+            "60": 9.50643,
+            "61": 9.70363,
+            "62": 9.98286,
+            "63": 9.38302,
+            "64": 9.77901,
+            "65": 8.95166,
+            "66": 9.70158,
+            "67": 9.37203,
+            "68": 9.78849,
+            "69": 9.79851,
+            "70": 9.74737,
+            "71": 9.61908,
+            "72": 9.58502,
+            "73": 9.49721,
+            "74": 8.93927,
+            "75": 9.42703,
+            "76": 9.0802,
+            "77": 10.06567,
+            "78": 9.72893,
+            "79": 9.3776,
+            "80": 9.40982,
+            "81": 9.47976,
+            "82": 9.7018,
+            "83": 9.30612,
+            "84": 9.4209,
+            "85": 9.61371,
+            "86": 9.07649,
+            "87": 9.5945,
+            "88": 9.75068,
+            "89": 9.60238,
+            "90": 9.81898,
+            "91": 9.33894,
+            "92": 9.35716,
+            "93": 9.07879,
+            "94": 8.83503,
+            "95": 9.52172,
+            "96": 9.53003,
+            "97": 9.31306,
+            "98": 9.67783,
+            "99": 8.89058,
+            "100": 9.39725
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2613.0,
+            "52": 2538.0,
+            "53": 2792.0,
+            "54": 2801.0,
+            "55": 2216.0,
+            "56": 2858.0,
+            "57": 2381.0,
+            "58": 2854.0,
+            "59": 2787.0,
+            "60": 2457.0,
+            "61": 2941.0,
+            "62": 2543.0,
+            "63": 2408.0,
+            "64": 2968.0,
+            "65": 2472.0,
+            "66": 2977.0,
+            "67": 2839.0,
+            "68": 2775.0,
+            "69": 2832.0,
+            "70": 3057.0,
+            "71": 2909.0,
+            "72": 2421.0,
+            "73": 2982.0,
+            "74": 1922.0,
+            "75": 2474.0,
+            "76": 3059.0,
+            "77": 3177.0,
+            "78": 3067.0,
+            "79": 3052.0,
+            "80": 3338.0,
+            "81": 3644.0,
+            "82": 3234.0,
+            "83": 2798.0,
+            "84": 3196.0,
+            "85": 3324.0,
+            "86": 2855.0,
+            "87": 3820.0,
+            "88": 2962.0,
+            "89": 3379.0,
+            "90": 3096.0,
+            "91": 2857.0,
+            "92": 3077.0,
+            "93": 2693.0,
+            "94": 3312.0,
+            "95": 3399.0,
+            "96": 3378.0,
+            "97": 3030.0,
+            "98": 3619.0,
+            "99": 3160.0,
+            "100": 3128.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 696530432.0,
+            "52": 696530432.0,
+            "53": 696530432.0,
+            "54": 696530432.0,
+            "55": 696530432.0,
+            "56": 696530432.0,
+            "57": 696530432.0,
+            "58": 696530432.0,
+            "59": 696530432.0,
+            "60": 696530432.0,
+            "61": 696530432.0,
+            "62": 696530432.0,
+            "63": 696530432.0,
+            "64": 696530432.0,
+            "65": 696530432.0,
+            "66": 696530432.0,
+            "67": 696530432.0,
+            "68": 696530432.0,
+            "69": 696530432.0,
+            "70": 696530432.0,
+            "71": 696530432.0,
+            "72": 696530432.0,
+            "73": 696530432.0,
+            "74": 696530432.0,
+            "75": 696530432.0,
+            "76": 696530432.0,
+            "77": 696530432.0,
+            "78": 696530432.0,
+            "79": 696530432.0,
+            "80": 696530432.0,
+            "81": 696530432.0,
+            "82": 696530432.0,
+            "83": 696530432.0,
+            "84": 696530432.0,
+            "85": 696530432.0,
+            "86": 696530432.0,
+            "87": 696530432.0,
+            "88": 696530432.0,
+            "89": 696530432.0,
+            "90": 696530432.0,
+            "91": 696530432.0,
+            "92": 696530432.0,
+            "93": 696530432.0,
+            "94": 696530432.0,
+            "95": 696530432.0,
+            "96": 696530432.0,
+            "97": 696530432.0,
+            "98": 696530432.0,
+            "99": 696530432.0,
+            "100": 696530432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1606670848.0,
+            "52": 1606671872.0,
+            "53": 1606671872.0,
+            "54": 1606671872.0,
+            "55": 1606671872.0,
+            "56": 1606671872.0,
+            "57": 1606671872.0,
+            "58": 1606671872.0,
+            "59": 1606671872.0,
+            "60": 1606671872.0,
+            "61": 1606671872.0,
+            "62": 1606671872.0,
+            "63": 1606671872.0,
+            "64": 1606671872.0,
+            "65": 1606671872.0,
+            "66": 1606671872.0,
+            "67": 1606671872.0,
+            "68": 1606671872.0,
+            "69": 1606671872.0,
+            "70": 1606671872.0,
+            "71": 1606671872.0,
+            "72": 1606671872.0,
+            "73": 1606671872.0,
+            "74": 1606671872.0,
+            "75": 1606671872.0,
+            "76": 1606671872.0,
+            "77": 1606671872.0,
+            "78": 1606671872.0,
+            "79": 1606671872.0,
+            "80": 1606671872.0,
+            "81": 1606671872.0,
+            "82": 1606671872.0,
+            "83": 1606671872.0,
+            "84": 1606671872.0,
+            "85": 1606671872.0,
+            "86": 1606671872.0,
+            "87": 1606671872.0,
+            "88": 1606671872.0,
+            "89": 1606671872.0,
+            "90": 1606671872.0,
+            "91": 1606671872.0,
+            "92": 1606671872.0,
+            "93": 1606671872.0,
+            "94": 1606671872.0,
+            "95": 1606671872.0,
+            "96": 1606671872.0,
+            "97": 1606671872.0,
+            "98": 1606671872.0,
+            "99": 1606671872.0,
+            "100": 1606671872.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.42109,
+            "52": 0.15643,
+            "53": 0.13049,
+            "54": 0.13624,
+            "55": 0.13521,
+            "56": 0.13263,
+            "57": 0.13088,
+            "58": 0.13077,
+            "59": 0.13083,
+            "60": 0.13167,
+            "61": 0.13236,
+            "62": 0.1318,
+            "63": 0.1298,
+            "64": 0.12659,
+            "65": 0.13241,
+            "66": 0.13279,
+            "67": 0.13136,
+            "68": 0.13156,
+            "69": 0.13048,
+            "70": 0.13134,
+            "71": 0.1306,
+            "72": 0.13073,
+            "73": 0.13104,
+            "74": 0.1307,
+            "75": 0.12918,
+            "76": 0.13046,
+            "77": 0.12748,
+            "78": 0.12438,
+            "79": 0.12456,
+            "80": 0.12401,
+            "81": 0.12459,
+            "82": 0.12524,
+            "83": 0.12443,
+            "84": 0.12519,
+            "85": 0.12459,
+            "86": 0.12453,
+            "87": 0.12733,
+            "88": 0.12682,
+            "89": 0.12512,
+            "90": 0.12406,
+            "91": 0.12452,
+            "92": 0.12425,
+            "93": 0.12737,
+            "94": 0.12561,
+            "95": 0.12766,
+            "96": 0.12743,
+            "97": 0.12696,
+            "98": 0.12713,
+            "99": 0.12566,
+            "100": 0.12444
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
index 16e4a038563..29bb4241810 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.63807,
-            "2": 0.24295,
-            "3": 0.21281,
-            "4": 0.20931,
-            "5": 0.20554,
-            "6": 0.20827,
-            "7": 0.20618,
-            "8": 0.20701,
-            "9": 0.2077,
-            "10": 0.20875,
-            "11": 0.20704,
-            "12": 0.20735,
-            "13": 0.20734,
-            "14": 0.20659,
-            "15": 0.2071,
-            "16": 0.20766,
-            "17": 0.20579,
-            "18": 0.20511,
-            "19": 0.20563,
-            "20": 0.20589,
-            "21": 0.205,
-            "22": 0.20541,
-            "23": 0.2062,
-            "24": 0.20485,
-            "25": 0.20487,
-            "26": 0.20458,
-            "27": 0.20496,
-            "28": 0.20545,
-            "29": 0.20623,
-            "30": 0.20511,
-            "31": 0.20822,
-            "32": 0.20482,
-            "33": 0.20538,
-            "34": 0.20452,
-            "35": 0.21063,
-            "36": 0.20987,
-            "37": 0.20831,
-            "38": 0.2088,
-            "39": 0.20816,
-            "40": 0.20875,
-            "41": 0.20857,
-            "42": 0.20959,
-            "43": 0.20886,
-            "44": 0.2086,
-            "45": 0.20776,
-            "46": 0.20831,
-            "47": 0.20853,
-            "48": 0.2086,
-            "49": 0.20813,
-            "50": 0.209,
-            "51": 0.20574,
-            "52": 0.19892,
-            "53": 0.19904,
-            "54": 0.19867,
-            "55": 0.19897,
-            "56": 0.20031,
-            "57": 0.19874,
-            "58": 0.19971,
-            "59": 0.2002,
-            "60": 0.19847,
-            "61": 0.19948,
-            "62": 0.20017,
-            "63": 0.19926,
-            "64": 0.19923,
-            "65": 0.19974,
-            "66": 0.19915,
-            "67": 0.19992,
-            "68": 0.19949,
-            "69": 0.19842,
-            "70": 0.19824,
-            "71": 0.2012,
-            "72": 0.20144,
-            "73": 0.20339,
-            "74": 0.19815,
-            "75": 0.19802,
-            "76": 0.19898,
-            "77": 0.20003,
-            "78": 0.20017,
-            "79": 0.20157,
-            "80": 0.20266,
-            "81": 0.20004,
-            "82": 0.19937,
-            "83": 0.2008,
-            "84": 0.2009,
-            "85": 0.20194,
-            "86": 0.2015,
-            "87": 0.20004,
-            "88": 0.20091,
-            "89": 0.19998,
-            "90": 0.19993,
-            "91": 0.20008,
-            "92": 0.19991,
-            "93": 0.19979,
-            "94": 0.19939,
-            "95": 0.20098,
-            "96": 0.20045,
-            "97": 0.19917,
-            "98": 0.20012,
-            "99": 0.19963,
-            "100": 0.19848
+            "1": 4.68458,
+            "2": 0.34484,
+            "3": 0.20879,
+            "4": 0.19358,
+            "5": 0.20092,
+            "6": 0.20176,
+            "7": 0.19316,
+            "8": 0.19111,
+            "9": 0.1921,
+            "10": 0.19155,
+            "11": 0.1921,
+            "12": 0.19089,
+            "13": 0.19091,
+            "14": 0.19273,
+            "15": 0.19306,
+            "16": 0.19124,
+            "17": 0.19058,
+            "18": 0.19068,
+            "19": 0.1894,
+            "20": 0.1897,
+            "21": 0.18966,
+            "22": 0.19023,
+            "23": 0.191,
+            "24": 0.18993,
+            "25": 0.19096,
+            "26": 0.19035,
+            "27": 0.19016,
+            "28": 0.18918,
+            "29": 0.18955,
+            "30": 0.18937,
+            "31": 0.18938,
+            "32": 0.18928,
+            "33": 0.18984,
+            "34": 0.18904,
+            "35": 0.18964,
+            "36": 0.18935,
+            "37": 0.18986,
+            "38": 0.19014,
+            "39": 0.18982,
+            "40": 0.18988,
+            "41": 0.19,
+            "42": 0.18994,
+            "43": 0.18983,
+            "44": 0.18983,
+            "45": 0.18997,
+            "46": 0.18936,
+            "47": 0.18969,
+            "48": 0.19034,
+            "49": 0.1892,
+            "50": 0.18945,
+            "51": 0.20301,
+            "52": 0.19526,
+            "53": 0.19506,
+            "54": 0.19396,
+            "55": 0.19539,
+            "56": 0.19467,
+            "57": 0.19181,
+            "58": 0.18922,
+            "59": 0.19013,
+            "60": 0.19039,
+            "61": 0.1891,
+            "62": 0.19198,
+            "63": 0.18813,
+            "64": 0.18836,
+            "65": 0.18934,
+            "66": 0.18939,
+            "67": 0.18844,
+            "68": 0.18865,
+            "69": 0.18927,
+            "70": 0.18882,
+            "71": 0.18864,
+            "72": 0.18848,
+            "73": 0.18879,
+            "74": 0.18944,
+            "75": 0.18858,
+            "76": 0.18852,
+            "77": 0.18875,
+            "78": 0.18849,
+            "79": 0.18926,
+            "80": 0.18829,
+            "81": 0.18908,
+            "82": 0.18904,
+            "83": 0.18872,
+            "84": 0.18777,
+            "85": 0.18882,
+            "86": 0.18885,
+            "87": 0.18923,
+            "88": 0.1889,
+            "89": 0.18951,
+            "90": 0.1886,
+            "91": 0.19049,
+            "92": 0.19005,
+            "93": 0.18948,
+            "94": 0.18876,
+            "95": 0.19048,
+            "96": 0.18863,
+            "97": 0.18791,
+            "98": 0.1895,
+            "99": 0.18965,
+            "100": 0.18845
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..a7ad841079e
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84971,
+            "52": 9.74156,
+            "53": 10.06322,
+            "54": 9.94581,
+            "55": 9.87731,
+            "56": 9.62746,
+            "57": 9.47259,
+            "58": 9.82912,
+            "59": 9.583,
+            "60": 9.49181,
+            "61": 9.69961,
+            "62": 9.98089,
+            "63": 9.37212,
+            "64": 9.7756,
+            "65": 8.9433,
+            "66": 9.69993,
+            "67": 9.36414,
+            "68": 9.78706,
+            "69": 9.78397,
+            "70": 9.72288,
+            "71": 9.60749,
+            "72": 9.58416,
+            "73": 9.49093,
+            "74": 8.94864,
+            "75": 9.41807,
+            "76": 9.08721,
+            "77": 10.06283,
+            "78": 9.729,
+            "79": 9.37091,
+            "80": 9.40033,
+            "81": 9.47754,
+            "82": 9.69121,
+            "83": 9.30762,
+            "84": 9.41252,
+            "85": 9.61132,
+            "86": 9.07621,
+            "87": 9.59459,
+            "88": 9.74768,
+            "89": 9.6068,
+            "90": 9.81078,
+            "91": 9.34441,
+            "92": 9.36535,
+            "93": 9.07743,
+            "94": 8.82975,
+            "95": 9.51676,
+            "96": 9.52546,
+            "97": 9.31031,
+            "98": 9.67812,
+            "99": 8.88848,
+            "100": 9.40128
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2735.0,
+            "52": 2607.0,
+            "53": 2951.0,
+            "54": 2672.0,
+            "55": 2451.0,
+            "56": 2712.0,
+            "57": 2392.0,
+            "58": 2979.0,
+            "59": 2869.0,
+            "60": 2435.0,
+            "61": 2938.0,
+            "62": 2669.0,
+            "63": 2392.0,
+            "64": 2998.0,
+            "65": 2689.0,
+            "66": 3285.0,
+            "67": 2782.0,
+            "68": 2753.0,
+            "69": 2958.0,
+            "70": 3271.0,
+            "71": 3040.0,
+            "72": 2504.0,
+            "73": 3096.0,
+            "74": 1910.0,
+            "75": 2617.0,
+            "76": 3081.0,
+            "77": 3390.0,
+            "78": 3186.0,
+            "79": 3320.0,
+            "80": 3483.0,
+            "81": 3782.0,
+            "82": 3516.0,
+            "83": 2864.0,
+            "84": 3396.0,
+            "85": 3247.0,
+            "86": 2785.0,
+            "87": 3762.0,
+            "88": 3102.0,
+            "89": 3483.0,
+            "90": 3076.0,
+            "91": 2643.0,
+            "92": 3198.0,
+            "93": 2666.0,
+            "94": 3390.0,
+            "95": 3410.0,
+            "96": 3508.0,
+            "97": 3178.0,
+            "98": 3865.0,
+            "99": 3143.0,
+            "100": 3357.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 487621120.0,
+            "52": 487621120.0,
+            "53": 487621120.0,
+            "54": 487621120.0,
+            "55": 487621120.0,
+            "56": 487621120.0,
+            "57": 487621120.0,
+            "58": 487621120.0,
+            "59": 487621120.0,
+            "60": 487621120.0,
+            "61": 487621120.0,
+            "62": 487621120.0,
+            "63": 487621120.0,
+            "64": 487621120.0,
+            "65": 487621120.0,
+            "66": 487621120.0,
+            "67": 487621120.0,
+            "68": 487621120.0,
+            "69": 487621120.0,
+            "70": 487621120.0,
+            "71": 487621120.0,
+            "72": 487621120.0,
+            "73": 487621120.0,
+            "74": 487621120.0,
+            "75": 487621120.0,
+            "76": 487621120.0,
+            "77": 487621120.0,
+            "78": 487621120.0,
+            "79": 487621120.0,
+            "80": 487621120.0,
+            "81": 487621120.0,
+            "82": 487621120.0,
+            "83": 487621120.0,
+            "84": 487621120.0,
+            "85": 487621120.0,
+            "86": 487621120.0,
+            "87": 487621120.0,
+            "88": 487621120.0,
+            "89": 487621120.0,
+            "90": 487621120.0,
+            "91": 487621120.0,
+            "92": 487621120.0,
+            "93": 487621120.0,
+            "94": 487621120.0,
+            "95": 487621120.0,
+            "96": 487621120.0,
+            "97": 487621120.0,
+            "98": 487621120.0,
+            "99": 487621120.0,
+            "100": 487621120.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1412441600.0,
+            "52": 1412442624.0,
+            "53": 1412442624.0,
+            "54": 1412442624.0,
+            "55": 1412442624.0,
+            "56": 1412442624.0,
+            "57": 1412442624.0,
+            "58": 1412442624.0,
+            "59": 1412442624.0,
+            "60": 1412442624.0,
+            "61": 1412442624.0,
+            "62": 1412442624.0,
+            "63": 1412442624.0,
+            "64": 1412442624.0,
+            "65": 1412442624.0,
+            "66": 1412442624.0,
+            "67": 1412442624.0,
+            "68": 1412442624.0,
+            "69": 1412442624.0,
+            "70": 1412442624.0,
+            "71": 1412442624.0,
+            "72": 1412442624.0,
+            "73": 1412442624.0,
+            "74": 1412442624.0,
+            "75": 1412442624.0,
+            "76": 1412442624.0,
+            "77": 1412442624.0,
+            "78": 1412442624.0,
+            "79": 1412442624.0,
+            "80": 1412442624.0,
+            "81": 1412442624.0,
+            "82": 1412442624.0,
+            "83": 1412442624.0,
+            "84": 1412442624.0,
+            "85": 1412442624.0,
+            "86": 1412442624.0,
+            "87": 1412442624.0,
+            "88": 1412442624.0,
+            "89": 1412442624.0,
+            "90": 1412442624.0,
+            "91": 1412442624.0,
+            "92": 1412442624.0,
+            "93": 1412442624.0,
+            "94": 1412442624.0,
+            "95": 1412442624.0,
+            "96": 1412442624.0,
+            "97": 1412442624.0,
+            "98": 1412442624.0,
+            "99": 1412442624.0,
+            "100": 1412442624.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.94248,
+            "52": 0.22763,
+            "53": 0.2042,
+            "54": 0.20275,
+            "55": 0.19946,
+            "56": 0.19904,
+            "57": 0.19835,
+            "58": 0.19899,
+            "59": 0.19773,
+            "60": 0.1984,
+            "61": 0.19823,
+            "62": 0.19759,
+            "63": 0.19781,
+            "64": 0.19644,
+            "65": 0.19746,
+            "66": 0.19818,
+            "67": 0.19673,
+            "68": 0.19692,
+            "69": 0.19752,
+            "70": 0.19608,
+            "71": 0.19615,
+            "72": 0.19651,
+            "73": 0.19666,
+            "74": 0.1968,
+            "75": 0.19633,
+            "76": 0.19633,
+            "77": 0.19638,
+            "78": 0.19631,
+            "79": 0.19652,
+            "80": 0.19633,
+            "81": 0.19737,
+            "82": 0.19691,
+            "83": 0.19652,
+            "84": 0.1968,
+            "85": 0.19796,
+            "86": 0.19783,
+            "87": 0.19656,
+            "88": 0.19754,
+            "89": 0.19687,
+            "90": 0.19705,
+            "91": 0.19684,
+            "92": 0.19665,
+            "93": 0.19712,
+            "94": 0.19703,
+            "95": 0.19667,
+            "96": 0.1973,
+            "97": 0.19754,
+            "98": 0.19757,
+            "99": 0.1962,
+            "100": 0.19706
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..bffbd713b20
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86244,
+            "2": 10.88582,
+            "3": 10.84733,
+            "4": 10.85571,
+            "5": 10.86009,
+            "6": 10.87728,
+            "7": 10.86559,
+            "8": 10.84911,
+            "9": 10.86605,
+            "10": 10.82478,
+            "11": 10.85618,
+            "12": 10.85375,
+            "13": 10.86786,
+            "14": 10.87115,
+            "15": 10.82231,
+            "16": 10.79989,
+            "17": 10.77429,
+            "18": 10.78346,
+            "19": 10.79306,
+            "20": 10.68226,
+            "21": 10.64716,
+            "22": 10.50918,
+            "23": 10.66831,
+            "24": 10.54197,
+            "25": 10.49277,
+            "26": 10.55929,
+            "27": 10.54236,
+            "28": 10.5113,
+            "29": 10.53258,
+            "30": 10.28987,
+            "31": 10.02854,
+            "32": 10.38881,
+            "33": 10.39595,
+            "34": 10.13452,
+            "35": 10.1893,
+            "36": 10.13353,
+            "37": 10.27381,
+            "38": 10.10749,
+            "39": 10.34007,
+            "40": 9.98535,
+            "41": 10.06411,
+            "42": 10.13748,
+            "43": 9.73379,
+            "44": 9.86307,
+            "45": 9.73726,
+            "46": 9.71341,
+            "47": 10.07754,
+            "48": 9.76762,
+            "49": 9.41989,
+            "50": 9.81685,
+            "51": 9.7742,
+            "52": 9.66444,
+            "53": 10.00151,
+            "54": 9.89155,
+            "55": 9.81852,
+            "56": 9.54337,
+            "57": 9.39452,
+            "58": 9.76573,
+            "59": 9.50935,
+            "60": 9.42821,
+            "61": 9.63468,
+            "62": 9.93891,
+            "63": 9.30458,
+            "64": 9.70984,
+            "65": 8.86888,
+            "66": 9.64952,
+            "67": 9.30815,
+            "68": 9.73505,
+            "69": 9.75596,
+            "70": 9.68706,
+            "71": 9.57535,
+            "72": 9.53075,
+            "73": 9.43678,
+            "74": 8.85586,
+            "75": 9.35532,
+            "76": 9.01377,
+            "77": 10.02449,
+            "78": 9.68205,
+            "79": 9.33138,
+            "80": 9.35467,
+            "81": 9.43621,
+            "82": 9.65855,
+            "83": 9.26269,
+            "84": 9.36921,
+            "85": 9.57104,
+            "86": 9.0332,
+            "87": 9.55973,
+            "88": 9.71077,
+            "89": 9.55411,
+            "90": 9.78662,
+            "91": 9.29091,
+            "92": 9.31236,
+            "93": 9.03976,
+            "94": 8.78112,
+            "95": 9.49175,
+            "96": 9.49071,
+            "97": 9.25827,
+            "98": 9.63001,
+            "99": 8.84688,
+            "100": 9.36199
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 562.0,
+            "2": 595.0,
+            "3": 621.0,
+            "4": 618.0,
+            "5": 626.0,
+            "6": 671.0,
+            "7": 641.0,
+            "8": 647.0,
+            "9": 601.0,
+            "10": 522.0,
+            "11": 648.0,
+            "12": 639.0,
+            "13": 681.0,
+            "14": 686.0,
+            "15": 638.0,
+            "16": 620.0,
+            "17": 601.0,
+            "18": 556.0,
+            "19": 628.0,
+            "20": 551.0,
+            "21": 695.0,
+            "22": 620.0,
+            "23": 708.0,
+            "24": 553.0,
+            "25": 566.0,
+            "26": 597.0,
+            "27": 672.0,
+            "28": 717.0,
+            "29": 763.0,
+            "30": 694.0,
+            "31": 627.0,
+            "32": 696.0,
+            "33": 820.0,
+            "34": 674.0,
+            "35": 741.0,
+            "36": 733.0,
+            "37": 848.0,
+            "38": 788.0,
+            "39": 863.0,
+            "40": 812.0,
+            "41": 813.0,
+            "42": 812.0,
+            "43": 706.0,
+            "44": 810.0,
+            "45": 732.0,
+            "46": 863.0,
+            "47": 914.0,
+            "48": 886.0,
+            "49": 786.0,
+            "50": 872.0,
+            "51": 952.0,
+            "52": 963.0,
+            "53": 1095.0,
+            "54": 956.0,
+            "55": 844.0,
+            "56": 969.0,
+            "57": 831.0,
+            "58": 985.0,
+            "59": 1062.0,
+            "60": 868.0,
+            "61": 975.0,
+            "62": 897.0,
+            "63": 928.0,
+            "64": 1085.0,
+            "65": 1058.0,
+            "66": 1068.0,
+            "67": 966.0,
+            "68": 999.0,
+            "69": 1021.0,
+            "70": 1103.0,
+            "71": 1068.0,
+            "72": 884.0,
+            "73": 1027.0,
+            "74": 757.0,
+            "75": 818.0,
+            "76": 981.0,
+            "77": 1091.0,
+            "78": 1135.0,
+            "79": 1105.0,
+            "80": 1126.0,
+            "81": 1181.0,
+            "82": 1095.0,
+            "83": 981.0,
+            "84": 1154.0,
+            "85": 1139.0,
+            "86": 804.0,
+            "87": 1216.0,
+            "88": 1139.0,
+            "89": 1113.0,
+            "90": 1071.0,
+            "91": 1180.0,
+            "92": 1100.0,
+            "93": 846.0,
+            "94": 1155.0,
+            "95": 1071.0,
+            "96": 1123.0,
+            "97": 1074.0,
+            "98": 1188.0,
+            "99": 1161.0,
+            "100": 1153.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 462062080.0,
+            "2": 462062080.0,
+            "3": 462062080.0,
+            "4": 462062080.0,
+            "5": 462062080.0,
+            "6": 462062080.0,
+            "7": 462062080.0,
+            "8": 462062080.0,
+            "9": 462062080.0,
+            "10": 462062080.0,
+            "11": 462062080.0,
+            "12": 462062080.0,
+            "13": 462062080.0,
+            "14": 462062080.0,
+            "15": 462062080.0,
+            "16": 462062080.0,
+            "17": 462062080.0,
+            "18": 462062080.0,
+            "19": 462062080.0,
+            "20": 462062080.0,
+            "21": 462062080.0,
+            "22": 462062080.0,
+            "23": 462062080.0,
+            "24": 462062080.0,
+            "25": 462062080.0,
+            "26": 462062080.0,
+            "27": 462062080.0,
+            "28": 462062080.0,
+            "29": 462062080.0,
+            "30": 462062080.0,
+            "31": 462062080.0,
+            "32": 462062080.0,
+            "33": 462062080.0,
+            "34": 462062080.0,
+            "35": 462062080.0,
+            "36": 462062080.0,
+            "37": 462062080.0,
+            "38": 462062080.0,
+            "39": 462062080.0,
+            "40": 462062080.0,
+            "41": 462062080.0,
+            "42": 462062080.0,
+            "43": 462062080.0,
+            "44": 462062080.0,
+            "45": 462062080.0,
+            "46": 462062080.0,
+            "47": 462062080.0,
+            "48": 462062080.0,
+            "49": 462062080.0,
+            "50": 462062080.0,
+            "51": 462062080.0,
+            "52": 462062080.0,
+            "53": 462062080.0,
+            "54": 462062080.0,
+            "55": 462062080.0,
+            "56": 462062080.0,
+            "57": 462062080.0,
+            "58": 462062080.0,
+            "59": 462062080.0,
+            "60": 462062080.0,
+            "61": 462062080.0,
+            "62": 462062080.0,
+            "63": 462062080.0,
+            "64": 462062080.0,
+            "65": 462062080.0,
+            "66": 462062080.0,
+            "67": 462062080.0,
+            "68": 462062080.0,
+            "69": 462062080.0,
+            "70": 462062080.0,
+            "71": 462062080.0,
+            "72": 462062080.0,
+            "73": 462062080.0,
+            "74": 462062080.0,
+            "75": 462062080.0,
+            "76": 462062080.0,
+            "77": 462062080.0,
+            "78": 462062080.0,
+            "79": 462062080.0,
+            "80": 462062080.0,
+            "81": 462062080.0,
+            "82": 462062080.0,
+            "83": 462062080.0,
+            "84": 462062080.0,
+            "85": 462062080.0,
+            "86": 462062080.0,
+            "87": 462062080.0,
+            "88": 462062080.0,
+            "89": 462062080.0,
+            "90": 462062080.0,
+            "91": 462062080.0,
+            "92": 462062080.0,
+            "93": 462062080.0,
+            "94": 462062080.0,
+            "95": 462062080.0,
+            "96": 462062080.0,
+            "97": 462062080.0,
+            "98": 462062080.0,
+            "99": 462062080.0,
+            "100": 462062080.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 703538688.0,
+            "2": 884528640.0,
+            "3": 884528640.0,
+            "4": 884528640.0,
+            "5": 884528640.0,
+            "6": 884528640.0,
+            "7": 884528640.0,
+            "8": 884528640.0,
+            "9": 884528640.0,
+            "10": 884528640.0,
+            "11": 884528640.0,
+            "12": 884528640.0,
+            "13": 884528640.0,
+            "14": 884528640.0,
+            "15": 884528640.0,
+            "16": 884528640.0,
+            "17": 884528640.0,
+            "18": 884528640.0,
+            "19": 884528640.0,
+            "20": 884528640.0,
+            "21": 884528640.0,
+            "22": 884528640.0,
+            "23": 884528640.0,
+            "24": 884528640.0,
+            "25": 884528640.0,
+            "26": 884528640.0,
+            "27": 884528640.0,
+            "28": 884528640.0,
+            "29": 884528640.0,
+            "30": 884528640.0,
+            "31": 884528640.0,
+            "32": 884528640.0,
+            "33": 884528640.0,
+            "34": 884528640.0,
+            "35": 884528640.0,
+            "36": 884528640.0,
+            "37": 884528640.0,
+            "38": 884528640.0,
+            "39": 884528640.0,
+            "40": 884528640.0,
+            "41": 884528640.0,
+            "42": 884528640.0,
+            "43": 884528640.0,
+            "44": 884528640.0,
+            "45": 884528640.0,
+            "46": 884528640.0,
+            "47": 884528640.0,
+            "48": 884528640.0,
+            "49": 884528640.0,
+            "50": 884528640.0,
+            "51": 884528640.0,
+            "52": 884528640.0,
+            "53": 884528640.0,
+            "54": 884528640.0,
+            "55": 884528640.0,
+            "56": 884528640.0,
+            "57": 884528640.0,
+            "58": 884528640.0,
+            "59": 884528640.0,
+            "60": 884528640.0,
+            "61": 884528640.0,
+            "62": 884528640.0,
+            "63": 884528640.0,
+            "64": 884528640.0,
+            "65": 884528640.0,
+            "66": 885575168.0,
+            "67": 885575168.0,
+            "68": 885575168.0,
+            "69": 885575168.0,
+            "70": 885575168.0,
+            "71": 885575168.0,
+            "72": 885575168.0,
+            "73": 885575168.0,
+            "74": 885575168.0,
+            "75": 885575168.0,
+            "76": 885575168.0,
+            "77": 885575168.0,
+            "78": 885575168.0,
+            "79": 885575168.0,
+            "80": 885575168.0,
+            "81": 885575168.0,
+            "82": 885575168.0,
+            "83": 885575168.0,
+            "84": 885575168.0,
+            "85": 885575168.0,
+            "86": 885575168.0,
+            "87": 885575168.0,
+            "88": 885575168.0,
+            "89": 885575168.0,
+            "90": 885575168.0,
+            "91": 885575168.0,
+            "92": 885575168.0,
+            "93": 885575168.0,
+            "94": 885575168.0,
+            "95": 885575168.0,
+            "96": 885575168.0,
+            "97": 885575168.0,
+            "98": 885575168.0,
+            "99": 885575168.0,
+            "100": 885575168.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.73344,
+            "3": 0.435,
+            "4": 0.42269,
+            "5": 0.42143,
+            "6": 0.42169,
+            "7": 0.42023,
+            "8": 0.42024,
+            "9": 0.42222,
+            "10": 0.42242,
+            "11": 0.42053,
+            "12": 0.42268,
+            "13": 0.41982,
+            "14": 0.41832,
+            "15": 0.41832,
+            "16": 0.41936,
+            "17": 0.41957,
+            "18": 0.41869,
+            "19": 0.4182,
+            "20": 0.41746,
+            "21": 0.41737,
+            "22": 0.40981,
+            "23": 0.4096,
+            "24": 0.40573,
+            "25": 0.40471,
+            "26": 0.40427,
+            "27": 0.40639,
+            "28": 0.40633,
+            "29": 0.40533,
+            "30": 0.40576,
+            "31": 0.40376,
+            "32": 0.40338,
+            "33": 0.40605,
+            "34": 0.40135,
+            "35": 0.40398,
+            "36": 0.40309,
+            "37": 0.40852,
+            "38": 0.40572,
+            "39": 0.40092,
+            "40": 0.40543,
+            "41": 0.40495,
+            "42": 0.40518,
+            "43": 0.40074,
+            "44": 0.40306,
+            "45": 0.40179,
+            "46": 0.40307,
+            "47": 0.40246,
+            "48": 0.4024,
+            "49": 0.40234,
+            "50": 0.40238,
+            "51": 0.41546,
+            "52": 0.39352,
+            "53": 0.39475,
+            "54": 0.39652,
+            "55": 0.40055,
+            "56": 0.39993,
+            "57": 0.40166,
+            "58": 0.40151,
+            "59": 0.40191,
+            "60": 0.40194,
+            "61": 0.40084,
+            "62": 0.39989,
+            "63": 0.40157,
+            "64": 0.40012,
+            "65": 0.40034,
+            "66": 0.40082,
+            "67": 0.40008,
+            "68": 0.39842,
+            "69": 0.39844,
+            "70": 0.40021,
+            "71": 0.39935,
+            "72": 0.40145,
+            "73": 0.39804,
+            "74": 0.39495,
+            "75": 0.39605,
+            "76": 0.39578,
+            "77": 0.39653,
+            "78": 0.39694,
+            "79": 0.95682,
+            "80": 0.39818,
+            "81": 0.39646,
+            "82": 0.39909,
+            "83": 0.4044,
+            "84": 0.39893,
+            "85": 0.39807,
+            "86": 0.39917,
+            "87": 0.39513,
+            "88": 0.39647,
+            "89": 0.39761,
+            "90": 0.39883,
+            "91": 0.39867,
+            "92": 0.39686,
+            "93": 0.39611,
+            "94": 0.39717,
+            "95": 0.39645,
+            "96": 0.39632,
+            "97": 0.39808,
+            "98": 0.39732,
+            "99": 0.39829,
+            "100": 0.39861
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..1bf0169b170
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.7742,
+            "52": 9.66445,
+            "53": 10.00151,
+            "54": 9.89155,
+            "55": 9.81849,
+            "56": 9.54335,
+            "57": 9.39451,
+            "58": 9.76573,
+            "59": 9.5093,
+            "60": 9.42825,
+            "61": 9.63467,
+            "62": 9.93887,
+            "63": 9.30457,
+            "64": 9.70983,
+            "65": 8.86882,
+            "66": 9.64953,
+            "67": 9.3082,
+            "68": 9.73505,
+            "69": 9.7559,
+            "70": 9.68706,
+            "71": 9.57534,
+            "72": 9.53073,
+            "73": 9.43677,
+            "74": 8.85587,
+            "75": 9.35529,
+            "76": 9.01373,
+            "77": 10.02452,
+            "78": 9.68203,
+            "79": 9.33141,
+            "80": 9.35469,
+            "81": 9.43623,
+            "82": 9.65853,
+            "83": 9.26266,
+            "84": 9.36921,
+            "85": 9.571,
+            "86": 9.03325,
+            "87": 9.55972,
+            "88": 9.71078,
+            "89": 9.5541,
+            "90": 9.78661,
+            "91": 9.29086,
+            "92": 9.31236,
+            "93": 9.03977,
+            "94": 8.78115,
+            "95": 9.49176,
+            "96": 9.4907,
+            "97": 9.25833,
+            "98": 9.63003,
+            "99": 8.84687,
+            "100": 9.36199
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 920.0,
+            "52": 908.0,
+            "53": 1052.0,
+            "54": 889.0,
+            "55": 846.0,
+            "56": 985.0,
+            "57": 838.0,
+            "58": 1021.0,
+            "59": 1045.0,
+            "60": 896.0,
+            "61": 987.0,
+            "62": 926.0,
+            "63": 912.0,
+            "64": 1081.0,
+            "65": 991.0,
+            "66": 1095.0,
+            "67": 964.0,
+            "68": 938.0,
+            "69": 1005.0,
+            "70": 1013.0,
+            "71": 1082.0,
+            "72": 896.0,
+            "73": 1035.0,
+            "74": 687.0,
+            "75": 920.0,
+            "76": 1063.0,
+            "77": 1086.0,
+            "78": 1136.0,
+            "79": 1065.0,
+            "80": 1111.0,
+            "81": 1229.0,
+            "82": 1100.0,
+            "83": 944.0,
+            "84": 1182.0,
+            "85": 1100.0,
+            "86": 790.0,
+            "87": 1132.0,
+            "88": 1071.0,
+            "89": 1148.0,
+            "90": 1121.0,
+            "91": 1120.0,
+            "92": 1115.0,
+            "93": 944.0,
+            "94": 1126.0,
+            "95": 1116.0,
+            "96": 1115.0,
+            "97": 995.0,
+            "98": 1234.0,
+            "99": 1120.0,
+            "100": 1148.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 461669888.0,
+            "52": 461669888.0,
+            "53": 461669888.0,
+            "54": 461669888.0,
+            "55": 461669888.0,
+            "56": 461669888.0,
+            "57": 461669888.0,
+            "58": 461669888.0,
+            "59": 461669888.0,
+            "60": 461669888.0,
+            "61": 461669888.0,
+            "62": 461669888.0,
+            "63": 461669888.0,
+            "64": 461669888.0,
+            "65": 461669888.0,
+            "66": 461669888.0,
+            "67": 461669888.0,
+            "68": 461669888.0,
+            "69": 461669888.0,
+            "70": 461669888.0,
+            "71": 461669888.0,
+            "72": 461669888.0,
+            "73": 461669888.0,
+            "74": 461669888.0,
+            "75": 461669888.0,
+            "76": 461669888.0,
+            "77": 461669888.0,
+            "78": 461669888.0,
+            "79": 461669888.0,
+            "80": 461669888.0,
+            "81": 461669888.0,
+            "82": 461669888.0,
+            "83": 461669888.0,
+            "84": 461669888.0,
+            "85": 461669888.0,
+            "86": 461669888.0,
+            "87": 461669888.0,
+            "88": 461669888.0,
+            "89": 461669888.0,
+            "90": 461669888.0,
+            "91": 461669888.0,
+            "92": 461669888.0,
+            "93": 461669888.0,
+            "94": 461669888.0,
+            "95": 461669888.0,
+            "96": 461669888.0,
+            "97": 461669888.0,
+            "98": 461669888.0,
+            "99": 461669888.0,
+            "100": 461669888.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 889377280.0,
+            "52": 889378304.0,
+            "53": 889378304.0,
+            "54": 889378304.0,
+            "55": 889378304.0,
+            "56": 889378304.0,
+            "57": 889378304.0,
+            "58": 889378304.0,
+            "59": 889378304.0,
+            "60": 889378304.0,
+            "61": 889378304.0,
+            "62": 889378304.0,
+            "63": 889378304.0,
+            "64": 889378304.0,
+            "65": 889378304.0,
+            "66": 889378304.0,
+            "67": 889378304.0,
+            "68": 889378304.0,
+            "69": 889378304.0,
+            "70": 889378304.0,
+            "71": 889378304.0,
+            "72": 889378304.0,
+            "73": 889378304.0,
+            "74": 889378304.0,
+            "75": 889378304.0,
+            "76": 889378304.0,
+            "77": 889378304.0,
+            "78": 889378304.0,
+            "79": 889378304.0,
+            "80": 889378304.0,
+            "81": 889378304.0,
+            "82": 889378304.0,
+            "83": 889378304.0,
+            "84": 889378304.0,
+            "85": 889378304.0,
+            "86": 889378304.0,
+            "87": 889378304.0,
+            "88": 889378304.0,
+            "89": 889378304.0,
+            "90": 889378816.0,
+            "91": 889378816.0,
+            "92": 889378816.0,
+            "93": 889378816.0,
+            "94": 889378816.0,
+            "95": 889378816.0,
+            "96": 889378816.0,
+            "97": 889378816.0,
+            "98": 889378816.0,
+            "99": 889378816.0,
+            "100": 889378816.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 3.77319,
+            "53": 0.42363,
+            "54": 0.41071,
+            "55": 0.41011,
+            "56": 0.40905,
+            "57": 0.40957,
+            "58": 0.41032,
+            "59": 0.40997,
+            "60": 0.4109,
+            "61": 0.4104,
+            "62": 0.40989,
+            "63": 0.40974,
+            "64": 0.40928,
+            "65": 0.40668,
+            "66": 0.4076,
+            "67": 0.41006,
+            "68": 0.41114,
+            "69": 0.40437,
+            "70": 0.40702,
+            "71": 0.4095,
+            "72": 0.41064,
+            "73": 0.40549,
+            "74": 0.40683,
+            "75": 0.4055,
+            "76": 0.40589,
+            "77": 0.40198,
+            "78": 0.40196,
+            "79": 0.40383,
+            "80": 0.40596,
+            "81": 0.40678,
+            "82": 0.40646,
+            "83": 0.40861,
+            "84": 0.40858,
+            "85": 0.40709,
+            "86": 0.40475,
+            "87": 0.41028,
+            "88": 0.40188,
+            "89": 0.40272,
+            "90": 0.4034,
+            "91": 0.40676,
+            "92": 0.40732,
+            "93": 0.40103,
+            "94": 0.40501,
+            "95": 0.4043,
+            "96": 0.40452,
+            "97": 0.40255,
+            "98": 0.40532,
+            "99": 0.40632,
+            "100": 0.4042
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
index c677311f507..990bbe865d6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.86535,
-            "2": 10.85873,
-            "3": 10.86285,
-            "4": 10.84007,
-            "5": 10.87856,
-            "6": 10.88856,
+            "1": 10.86539,
+            "2": 10.85871,
+            "3": 10.86283,
+            "4": 10.84009,
+            "5": 10.87851,
+            "6": 10.88849,
             "7": 10.86538,
-            "8": 10.86017,
-            "9": 10.85991,
-            "10": 10.8298,
+            "8": 10.86018,
+            "9": 10.8599,
+            "10": 10.82981,
             "11": 10.88947,
-            "12": 10.87508,
-            "13": 10.87422,
-            "14": 10.89677,
-            "15": 10.8205,
-            "16": 10.82499,
-            "17": 10.78984,
-            "18": 10.81029,
-            "19": 10.80536,
-            "20": 10.70396,
-            "21": 10.6699,
-            "22": 10.50644,
-            "23": 10.69003,
-            "24": 10.5631,
+            "12": 10.87505,
+            "13": 10.87426,
+            "14": 10.89675,
+            "15": 10.82051,
+            "16": 10.82497,
+            "17": 10.78982,
+            "18": 10.81028,
+            "19": 10.80533,
+            "20": 10.70395,
+            "21": 10.66991,
+            "22": 10.50641,
+            "23": 10.69006,
+            "24": 10.56313,
             "25": 10.49417,
-            "26": 10.56624,
-            "27": 10.58026,
-            "28": 10.51571,
-            "29": 10.553,
-            "30": 10.30552,
-            "31": 10.02249,
-            "32": 10.40613,
-            "33": 10.3988,
-            "34": 10.13771,
-            "35": 10.20186,
-            "36": 10.16052,
-            "37": 10.28975,
-            "38": 10.1148,
-            "39": 10.36102,
-            "40": 10.01904,
-            "41": 10.07292,
-            "42": 10.14696,
-            "43": 9.74683,
-            "44": 9.87763,
-            "45": 9.74966,
-            "46": 9.73387,
-            "47": 10.07534,
+            "26": 10.56627,
+            "27": 10.58021,
+            "28": 10.51572,
+            "29": 10.55296,
+            "30": 10.3055,
+            "31": 10.02245,
+            "32": 10.40616,
+            "33": 10.39874,
+            "34": 10.13773,
+            "35": 10.20185,
+            "36": 10.16056,
+            "37": 10.28972,
+            "38": 10.11479,
+            "39": 10.36099,
+            "40": 10.01899,
+            "41": 10.07293,
+            "42": 10.14693,
+            "43": 9.74686,
+            "44": 9.87761,
+            "45": 9.74968,
+            "46": 9.73385,
+            "47": 10.07539,
             "48": 9.78069,
-            "49": 9.4478,
-            "50": 9.83991,
-            "51": 9.78025,
-            "52": 9.67263,
-            "53": 10.0201,
-            "54": 9.89789,
-            "55": 9.81664,
-            "56": 9.56044,
-            "57": 9.41178,
-            "58": 9.77419,
-            "59": 9.51794,
-            "60": 9.43538,
-            "61": 9.64484,
+            "49": 9.44781,
+            "50": 9.83993,
+            "51": 9.78026,
+            "52": 9.67268,
+            "53": 10.02014,
+            "54": 9.89787,
+            "55": 9.81661,
+            "56": 9.56042,
+            "57": 9.41177,
+            "58": 9.77417,
+            "59": 9.51799,
+            "60": 9.43536,
+            "61": 9.64482,
             "62": 9.93004,
-            "63": 9.30911,
-            "64": 9.72068,
-            "65": 8.87154,
-            "66": 9.64427,
+            "63": 9.3091,
+            "64": 9.72065,
+            "65": 8.87152,
+            "66": 9.64429,
             "67": 9.31328,
             "68": 9.74067,
-            "69": 9.75334,
+            "69": 9.75333,
             "70": 9.70004,
-            "71": 9.56556,
-            "72": 9.53094,
-            "73": 9.44386,
-            "74": 8.86782,
-            "75": 9.37314,
-            "76": 9.01274,
-            "77": 10.02855,
+            "71": 9.5656,
+            "72": 9.53096,
+            "73": 9.44383,
+            "74": 8.86781,
+            "75": 9.3731,
+            "76": 9.01276,
+            "77": 10.02858,
             "78": 9.68739,
-            "79": 9.328,
-            "80": 9.36168,
-            "81": 9.43367,
+            "79": 9.32798,
+            "80": 9.36164,
+            "81": 9.43365,
             "82": 9.66094,
-            "83": 9.25139,
-            "84": 9.37352,
-            "85": 9.56939,
+            "83": 9.25142,
+            "84": 9.37355,
+            "85": 9.56941,
             "86": 9.03181,
             "87": 9.55584,
-            "88": 9.71055,
-            "89": 9.55395,
-            "90": 9.78475,
-            "91": 9.29077,
-            "92": 9.31245,
-            "93": 9.03142,
-            "94": 8.78671,
-            "95": 9.4873,
-            "96": 9.49052,
-            "97": 9.26684,
-            "98": 9.63648,
-            "99": 8.84333,
-            "100": 9.35549
+            "88": 9.71056,
+            "89": 9.55398,
+            "90": 9.78471,
+            "91": 9.29078,
+            "92": 9.31244,
+            "93": 9.03139,
+            "94": 8.78668,
+            "95": 9.48732,
+            "96": 9.4905,
+            "97": 9.26686,
+            "98": 9.63647,
+            "99": 8.84336,
+            "100": 9.35551
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 585.0,
-            "2": 648.0,
-            "3": 630.0,
-            "4": 656.0,
-            "5": 620.0,
-            "6": 637.0,
-            "7": 641.0,
-            "8": 581.0,
-            "9": 660.0,
-            "10": 504.0,
-            "11": 664.0,
-            "12": 639.0,
-            "13": 670.0,
-            "14": 666.0,
-            "15": 652.0,
-            "16": 624.0,
-            "17": 704.0,
-            "18": 579.0,
-            "19": 682.0,
-            "20": 623.0,
-            "21": 657.0,
-            "22": 561.0,
-            "23": 763.0,
-            "24": 593.0,
-            "25": 629.0,
-            "26": 669.0,
-            "27": 691.0,
-            "28": 738.0,
-            "29": 788.0,
-            "30": 744.0,
-            "31": 604.0,
-            "32": 736.0,
-            "33": 787.0,
-            "34": 706.0,
-            "35": 692.0,
-            "36": 714.0,
-            "37": 835.0,
-            "38": 768.0,
-            "39": 894.0,
-            "40": 764.0,
-            "41": 852.0,
-            "42": 878.0,
-            "43": 733.0,
-            "44": 827.0,
-            "45": 785.0,
-            "46": 877.0,
-            "47": 927.0,
-            "48": 873.0,
-            "49": 891.0,
-            "50": 869.0,
-            "51": 928.0,
-            "52": 968.0,
-            "53": 1089.0,
-            "54": 966.0,
-            "55": 913.0,
-            "56": 983.0,
-            "57": 889.0,
-            "58": 1063.0,
-            "59": 1005.0,
-            "60": 876.0,
-            "61": 1043.0,
-            "62": 897.0,
-            "63": 971.0,
-            "64": 1100.0,
-            "65": 911.0,
-            "66": 1107.0,
-            "67": 948.0,
-            "68": 1033.0,
-            "69": 1064.0,
-            "70": 1118.0,
-            "71": 1032.0,
-            "72": 854.0,
-            "73": 1007.0,
-            "74": 739.0,
-            "75": 877.0,
-            "76": 1075.0,
-            "77": 1108.0,
-            "78": 1103.0,
-            "79": 980.0,
-            "80": 1055.0,
-            "81": 1240.0,
-            "82": 1101.0,
-            "83": 1007.0,
-            "84": 1147.0,
-            "85": 1157.0,
-            "86": 897.0,
-            "87": 1247.0,
-            "88": 1015.0,
-            "89": 1155.0,
-            "90": 1138.0,
-            "91": 1141.0,
-            "92": 1142.0,
-            "93": 947.0,
-            "94": 1116.0,
-            "95": 1119.0,
-            "96": 1099.0,
-            "97": 997.0,
-            "98": 1188.0,
-            "99": 1141.0,
-            "100": 1102.0
+            "1": 597.0,
+            "2": 647.0,
+            "3": 637.0,
+            "4": 610.0,
+            "5": 635.0,
+            "6": 696.0,
+            "7": 660.0,
+            "8": 563.0,
+            "9": 609.0,
+            "10": 515.0,
+            "11": 716.0,
+            "12": 570.0,
+            "13": 661.0,
+            "14": 668.0,
+            "15": 654.0,
+            "16": 630.0,
+            "17": 671.0,
+            "18": 624.0,
+            "19": 624.0,
+            "20": 615.0,
+            "21": 655.0,
+            "22": 563.0,
+            "23": 719.0,
+            "24": 632.0,
+            "25": 605.0,
+            "26": 613.0,
+            "27": 655.0,
+            "28": 690.0,
+            "29": 769.0,
+            "30": 655.0,
+            "31": 602.0,
+            "32": 721.0,
+            "33": 800.0,
+            "34": 727.0,
+            "35": 739.0,
+            "36": 722.0,
+            "37": 792.0,
+            "38": 721.0,
+            "39": 793.0,
+            "40": 758.0,
+            "41": 868.0,
+            "42": 813.0,
+            "43": 761.0,
+            "44": 836.0,
+            "45": 803.0,
+            "46": 809.0,
+            "47": 881.0,
+            "48": 849.0,
+            "49": 868.0,
+            "50": 856.0,
+            "51": 923.0,
+            "52": 936.0,
+            "53": 1031.0,
+            "54": 967.0,
+            "55": 838.0,
+            "56": 1001.0,
+            "57": 887.0,
+            "58": 1072.0,
+            "59": 1004.0,
+            "60": 898.0,
+            "61": 1016.0,
+            "62": 912.0,
+            "63": 903.0,
+            "64": 998.0,
+            "65": 943.0,
+            "66": 1132.0,
+            "67": 967.0,
+            "68": 998.0,
+            "69": 1028.0,
+            "70": 1034.0,
+            "71": 1084.0,
+            "72": 889.0,
+            "73": 1054.0,
+            "74": 685.0,
+            "75": 899.0,
+            "76": 1042.0,
+            "77": 1171.0,
+            "78": 1099.0,
+            "79": 1026.0,
+            "80": 1139.0,
+            "81": 1262.0,
+            "82": 1077.0,
+            "83": 982.0,
+            "84": 1080.0,
+            "85": 1114.0,
+            "86": 813.0,
+            "87": 1191.0,
+            "88": 1075.0,
+            "89": 1091.0,
+            "90": 1079.0,
+            "91": 1094.0,
+            "92": 1132.0,
+            "93": 983.0,
+            "94": 1160.0,
+            "95": 1117.0,
+            "96": 1186.0,
+            "97": 1031.0,
+            "98": 1215.0,
+            "99": 1185.0,
+            "100": 1147.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 510689792.0,
-            "2": 510689792.0,
-            "3": 510689792.0,
-            "4": 510689792.0,
-            "5": 510689792.0,
-            "6": 510689792.0,
-            "7": 510689792.0,
-            "8": 510689792.0,
-            "9": 510689792.0,
-            "10": 510689792.0,
-            "11": 510689792.0,
-            "12": 510689792.0,
-            "13": 510689792.0,
-            "14": 510689792.0,
-            "15": 510689792.0,
-            "16": 510689792.0,
-            "17": 510689792.0,
-            "18": 510689792.0,
-            "19": 510689792.0,
-            "20": 510689792.0,
-            "21": 510689792.0,
-            "22": 510689792.0,
-            "23": 510689792.0,
-            "24": 510689792.0,
-            "25": 510689792.0,
-            "26": 510689792.0,
-            "27": 510689792.0,
-            "28": 510689792.0,
-            "29": 510689792.0,
-            "30": 510689792.0,
-            "31": 510689792.0,
-            "32": 510689792.0,
-            "33": 510689792.0,
-            "34": 510689792.0,
-            "35": 510689792.0,
-            "36": 510689792.0,
-            "37": 510689792.0,
-            "38": 510689792.0,
-            "39": 510689792.0,
-            "40": 510689792.0,
-            "41": 510689792.0,
-            "42": 510689792.0,
-            "43": 510689792.0,
-            "44": 510689792.0,
-            "45": 510689792.0,
-            "46": 510689792.0,
-            "47": 510689792.0,
-            "48": 510689792.0,
-            "49": 510689792.0,
-            "50": 510689792.0,
-            "51": 510689792.0,
-            "52": 510689792.0,
-            "53": 510689792.0,
-            "54": 510689792.0,
-            "55": 510689792.0,
-            "56": 510689792.0,
-            "57": 510689792.0,
-            "58": 510689792.0,
-            "59": 510689792.0,
-            "60": 510689792.0,
-            "61": 510689792.0,
-            "62": 510689792.0,
-            "63": 510689792.0,
-            "64": 510689792.0,
-            "65": 510689792.0,
-            "66": 510689792.0,
-            "67": 510689792.0,
-            "68": 510689792.0,
-            "69": 510689792.0,
-            "70": 510689792.0,
-            "71": 510689792.0,
-            "72": 510689792.0,
-            "73": 510689792.0,
-            "74": 510689792.0,
-            "75": 510689792.0,
-            "76": 510689792.0,
-            "77": 510689792.0,
-            "78": 510689792.0,
-            "79": 510689792.0,
-            "80": 510689792.0,
-            "81": 510689792.0,
-            "82": 510689792.0,
-            "83": 510689792.0,
-            "84": 510689792.0,
-            "85": 510689792.0,
-            "86": 510689792.0,
-            "87": 510689792.0,
-            "88": 510689792.0,
-            "89": 510689792.0,
-            "90": 510689792.0,
-            "91": 510689792.0,
-            "92": 510689792.0,
-            "93": 510689792.0,
-            "94": 510689792.0,
-            "95": 510689792.0,
-            "96": 510689792.0,
-            "97": 510689792.0,
-            "98": 510689792.0,
-            "99": 510689792.0,
-            "100": 510689792.0
+            "1": 512786944.0,
+            "2": 512786944.0,
+            "3": 512786944.0,
+            "4": 512786944.0,
+            "5": 512786944.0,
+            "6": 512786944.0,
+            "7": 512786944.0,
+            "8": 512786944.0,
+            "9": 512786944.0,
+            "10": 512786944.0,
+            "11": 512786944.0,
+            "12": 512786944.0,
+            "13": 512786944.0,
+            "14": 512786944.0,
+            "15": 512786944.0,
+            "16": 512786944.0,
+            "17": 512786944.0,
+            "18": 512786944.0,
+            "19": 512786944.0,
+            "20": 512786944.0,
+            "21": 512786944.0,
+            "22": 512786944.0,
+            "23": 512786944.0,
+            "24": 512786944.0,
+            "25": 512786944.0,
+            "26": 512786944.0,
+            "27": 512786944.0,
+            "28": 512786944.0,
+            "29": 512786944.0,
+            "30": 512786944.0,
+            "31": 512786944.0,
+            "32": 512786944.0,
+            "33": 512786944.0,
+            "34": 512786944.0,
+            "35": 512786944.0,
+            "36": 512786944.0,
+            "37": 512786944.0,
+            "38": 512786944.0,
+            "39": 512786944.0,
+            "40": 512786944.0,
+            "41": 512786944.0,
+            "42": 512786944.0,
+            "43": 512786944.0,
+            "44": 512786944.0,
+            "45": 512786944.0,
+            "46": 512786944.0,
+            "47": 512786944.0,
+            "48": 512786944.0,
+            "49": 512786944.0,
+            "50": 512786944.0,
+            "51": 512786944.0,
+            "52": 512786944.0,
+            "53": 512786944.0,
+            "54": 512786944.0,
+            "55": 512786944.0,
+            "56": 512786944.0,
+            "57": 512786944.0,
+            "58": 512786944.0,
+            "59": 512786944.0,
+            "60": 512786944.0,
+            "61": 512786944.0,
+            "62": 512786944.0,
+            "63": 512786944.0,
+            "64": 512786944.0,
+            "65": 512786944.0,
+            "66": 512786944.0,
+            "67": 512786944.0,
+            "68": 512786944.0,
+            "69": 512786944.0,
+            "70": 512786944.0,
+            "71": 512786944.0,
+            "72": 512786944.0,
+            "73": 512786944.0,
+            "74": 512786944.0,
+            "75": 512786944.0,
+            "76": 512786944.0,
+            "77": 512786944.0,
+            "78": 512786944.0,
+            "79": 512786944.0,
+            "80": 512786944.0,
+            "81": 512786944.0,
+            "82": 512786944.0,
+            "83": 512786944.0,
+            "84": 512786944.0,
+            "85": 512786944.0,
+            "86": 512786944.0,
+            "87": 512786944.0,
+            "88": 512786944.0,
+            "89": 512786944.0,
+            "90": 512786944.0,
+            "91": 512786944.0,
+            "92": 512786944.0,
+            "93": 512786944.0,
+            "94": 512786944.0,
+            "95": 512786944.0,
+            "96": 512786944.0,
+            "97": 512786944.0,
+            "98": 512786944.0,
+            "99": 512786944.0,
+            "100": 512786944.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 759895552.0,
-            "2": 933156352.0,
-            "3": 933156352.0,
-            "4": 933156352.0,
-            "5": 933156352.0,
-            "6": 933156352.0,
-            "7": 933156352.0,
-            "8": 933156352.0,
-            "9": 933156352.0,
-            "10": 933156352.0,
-            "11": 933156352.0,
-            "12": 933156352.0,
-            "13": 933156352.0,
-            "14": 933156352.0,
-            "15": 933156352.0,
-            "16": 933156352.0,
-            "17": 933156352.0,
-            "18": 933156352.0,
-            "19": 933156352.0,
-            "20": 933156352.0,
-            "21": 933156352.0,
-            "22": 933156352.0,
-            "23": 933156352.0,
-            "24": 933156352.0,
-            "25": 933156352.0,
-            "26": 933156352.0,
-            "27": 933156352.0,
-            "28": 933156352.0,
-            "29": 933156352.0,
-            "30": 933156352.0,
-            "31": 933156352.0,
-            "32": 933156352.0,
-            "33": 933156352.0,
-            "34": 933156352.0,
-            "35": 933156352.0,
-            "36": 933156352.0,
-            "37": 933156352.0,
-            "38": 933156352.0,
-            "39": 933156352.0,
-            "40": 933156352.0,
-            "41": 933156352.0,
-            "42": 933156352.0,
-            "43": 933156352.0,
-            "44": 933156352.0,
-            "45": 933156352.0,
-            "46": 933156352.0,
-            "47": 933156352.0,
-            "48": 933156352.0,
-            "49": 933156352.0,
-            "50": 933156352.0,
-            "51": 933156352.0,
-            "52": 933156352.0,
-            "53": 933156352.0,
-            "54": 933156352.0,
-            "55": 933156352.0,
-            "56": 933156352.0,
-            "57": 933156352.0,
-            "58": 933156352.0,
-            "59": 933156352.0,
-            "60": 933156352.0,
-            "61": 933156352.0,
-            "62": 933156352.0,
-            "63": 933156352.0,
-            "64": 933156352.0,
-            "65": 933156352.0,
-            "66": 933156352.0,
-            "67": 933156352.0,
-            "68": 933156352.0,
-            "69": 933156352.0,
-            "70": 933156352.0,
-            "71": 933156352.0,
-            "72": 933156352.0,
-            "73": 933156352.0,
-            "74": 933156352.0,
-            "75": 933156352.0,
-            "76": 933156352.0,
-            "77": 933156352.0,
-            "78": 933156352.0,
-            "79": 933156352.0,
-            "80": 933156352.0,
-            "81": 933156352.0,
-            "82": 933156352.0,
-            "83": 933156352.0,
-            "84": 933156352.0,
-            "85": 933156352.0,
-            "86": 933156352.0,
-            "87": 933156352.0,
-            "88": 933156352.0,
-            "89": 933156352.0,
-            "90": 933156352.0,
-            "91": 933156352.0,
-            "92": 933156352.0,
-            "93": 933156352.0,
-            "94": 933156352.0,
-            "95": 933156352.0,
-            "96": 933156352.0,
-            "97": 933156352.0,
-            "98": 933156352.0,
-            "99": 933156352.0,
-            "100": 933156352.0
+            "1": 758850560.0,
+            "2": 937349632.0,
+            "3": 937349632.0,
+            "4": 937349632.0,
+            "5": 937349632.0,
+            "6": 937349632.0,
+            "7": 937349632.0,
+            "8": 937350144.0,
+            "9": 937350144.0,
+            "10": 937350656.0,
+            "11": 937350656.0,
+            "12": 937350656.0,
+            "13": 937350656.0,
+            "14": 937350656.0,
+            "15": 937350656.0,
+            "16": 937350656.0,
+            "17": 937350656.0,
+            "18": 937350656.0,
+            "19": 937350656.0,
+            "20": 937350656.0,
+            "21": 937350656.0,
+            "22": 937350656.0,
+            "23": 937350656.0,
+            "24": 937350656.0,
+            "25": 937350656.0,
+            "26": 937350656.0,
+            "27": 937350656.0,
+            "28": 937350656.0,
+            "29": 937350656.0,
+            "30": 937350656.0,
+            "31": 937350656.0,
+            "32": 937350656.0,
+            "33": 937350656.0,
+            "34": 937350656.0,
+            "35": 937350656.0,
+            "36": 937350656.0,
+            "37": 937350656.0,
+            "38": 937350656.0,
+            "39": 937350656.0,
+            "40": 937350656.0,
+            "41": 937350656.0,
+            "42": 937350656.0,
+            "43": 937350656.0,
+            "44": 937350656.0,
+            "45": 937350656.0,
+            "46": 937350656.0,
+            "47": 937350656.0,
+            "48": 937350656.0,
+            "49": 937350656.0,
+            "50": 937350656.0,
+            "51": 937350656.0,
+            "52": 937350656.0,
+            "53": 937350656.0,
+            "54": 937350656.0,
+            "55": 937350656.0,
+            "56": 937350656.0,
+            "57": 937350656.0,
+            "58": 937350656.0,
+            "59": 937350656.0,
+            "60": 937350656.0,
+            "61": 937350656.0,
+            "62": 937350656.0,
+            "63": 937350656.0,
+            "64": 937350656.0,
+            "65": 937350656.0,
+            "66": 937350656.0,
+            "67": 937350656.0,
+            "68": 937350656.0,
+            "69": 937350656.0,
+            "70": 937350656.0,
+            "71": 937350656.0,
+            "72": 937350656.0,
+            "73": 937350656.0,
+            "74": 937350656.0,
+            "75": 937350656.0,
+            "76": 937350656.0,
+            "77": 937350656.0,
+            "78": 937350656.0,
+            "79": 937350656.0,
+            "80": 937350656.0,
+            "81": 937350656.0,
+            "82": 937350656.0,
+            "83": 937350656.0,
+            "84": 937350656.0,
+            "85": 937350656.0,
+            "86": 937350656.0,
+            "87": 937350656.0,
+            "88": 937350656.0,
+            "89": 937350656.0,
+            "90": 937350656.0,
+            "91": 937350656.0,
+            "92": 937350656.0,
+            "93": 937350656.0,
+            "94": 937350656.0,
+            "95": 937350656.0,
+            "96": 937350656.0,
+            "97": 937350656.0,
+            "98": 937350656.0,
+            "99": 937350656.0,
+            "100": 937350656.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 15.91944,
-            "2": 0.35854,
-            "3": 0.34422,
-            "4": 0.34655,
-            "5": 0.33791,
-            "6": 0.34327,
-            "7": 0.34394,
-            "8": 0.3383,
-            "9": 0.34058,
-            "10": 0.32396,
-            "11": 0.32631,
-            "12": 0.33064,
-            "13": 0.32832,
-            "14": 0.32645,
-            "15": 0.32686,
-            "16": 0.32351,
-            "17": 0.32796,
-            "18": 0.33094,
-            "19": 0.32865,
-            "20": 0.32722,
-            "21": 0.32666,
-            "22": 0.32679,
-            "23": 0.32717,
-            "24": 0.32824,
-            "25": 0.32793,
-            "26": 0.32517,
-            "27": 0.326,
-            "28": 0.32627,
-            "29": 0.32627,
-            "30": 0.32688,
-            "31": 0.32603,
-            "32": 0.32544,
-            "33": 0.32613,
-            "34": 0.32696,
-            "35": 0.32522,
-            "36": 0.32966,
-            "37": 0.32462,
-            "38": 0.32724,
-            "39": 0.32622,
-            "40": 0.32646,
-            "41": 0.32504,
-            "42": 0.32464,
-            "43": 0.3299,
-            "44": 0.32495,
-            "45": 0.32382,
-            "46": 0.32567,
-            "47": 0.32847,
-            "48": 0.32521,
-            "49": 0.32738,
-            "50": 0.32495,
-            "51": 0.33517,
-            "52": 0.33963,
-            "53": 0.33084,
-            "54": 0.3299,
-            "55": 0.33062,
-            "56": 0.32923,
-            "57": 0.32909,
-            "58": 0.331,
-            "59": 0.32595,
-            "60": 0.32446,
-            "61": 0.32961,
-            "62": 0.33126,
-            "63": 0.32393,
-            "64": 0.32986,
-            "65": 0.32836,
-            "66": 0.32921,
-            "67": 0.32945,
-            "68": 0.32848,
-            "69": 0.32625,
-            "70": 0.32898,
-            "71": 0.33227,
-            "72": 0.32403,
-            "73": 0.3284,
-            "74": 0.32761,
-            "75": 0.32791,
-            "76": 0.33223,
-            "77": 0.33113,
-            "78": 0.32546,
-            "79": 0.32925,
-            "80": 0.33175,
-            "81": 0.33071,
-            "82": 0.32698,
-            "83": 0.32738,
-            "84": 0.32835,
-            "85": 0.32729,
-            "86": 0.33228,
-            "87": 0.32668,
-            "88": 0.33091,
-            "89": 0.32825,
-            "90": 0.32752,
-            "91": 0.32814,
-            "92": 0.33195,
-            "93": 0.32686,
-            "94": 0.33172,
-            "95": 0.33336,
-            "96": 0.32938,
-            "97": 0.33024,
-            "98": 0.32939,
-            "99": 0.32654,
-            "100": 0.3311
+            "1": 33.75672,
+            "2": 0.32538,
+            "3": 0.30979,
+            "4": 0.29132,
+            "5": 0.28673,
+            "6": 0.29044,
+            "7": 0.28928,
+            "8": 0.28782,
+            "9": 0.28716,
+            "10": 0.29487,
+            "11": 0.28718,
+            "12": 0.28269,
+            "13": 0.28219,
+            "14": 0.28189,
+            "15": 0.28466,
+            "16": 0.28241,
+            "17": 0.28424,
+            "18": 0.28237,
+            "19": 0.2825,
+            "20": 0.28165,
+            "21": 0.28578,
+            "22": 0.28723,
+            "23": 0.28406,
+            "24": 0.28161,
+            "25": 0.28206,
+            "26": 0.28395,
+            "27": 0.28087,
+            "28": 0.28029,
+            "29": 0.28081,
+            "30": 0.28035,
+            "31": 0.27965,
+            "32": 0.28051,
+            "33": 0.28076,
+            "34": 0.2798,
+            "35": 0.27825,
+            "36": 0.28669,
+            "37": 0.28531,
+            "38": 0.28497,
+            "39": 0.28165,
+            "40": 0.28034,
+            "41": 0.27847,
+            "42": 0.27754,
+            "43": 0.28102,
+            "44": 0.27958,
+            "45": 0.27967,
+            "46": 0.28044,
+            "47": 0.27794,
+            "48": 0.28143,
+            "49": 0.27941,
+            "50": 0.28096,
+            "51": 0.29673,
+            "52": 0.28031,
+            "53": 0.28708,
+            "54": 0.28243,
+            "55": 0.28247,
+            "56": 0.28076,
+            "57": 0.28031,
+            "58": 0.27896,
+            "59": 0.27986,
+            "60": 0.28148,
+            "61": 0.27915,
+            "62": 0.28166,
+            "63": 0.28345,
+            "64": 0.28119,
+            "65": 0.28241,
+            "66": 0.28032,
+            "67": 0.28162,
+            "68": 0.2838,
+            "69": 0.28382,
+            "70": 0.28245,
+            "71": 0.28204,
+            "72": 0.28468,
+            "73": 0.28238,
+            "74": 0.28182,
+            "75": 0.28321,
+            "76": 0.28243,
+            "77": 0.28435,
+            "78": 0.28226,
+            "79": 0.28216,
+            "80": 0.28198,
+            "81": 0.28267,
+            "82": 0.28258,
+            "83": 0.283,
+            "84": 0.68437,
+            "85": 0.28406,
+            "86": 0.28139,
+            "87": 0.28473,
+            "88": 0.28619,
+            "89": 0.28286,
+            "90": 0.28309,
+            "91": 0.28733,
+            "92": 0.28154,
+            "93": 0.28434,
+            "94": 0.28361,
+            "95": 0.28379,
+            "96": 0.28667,
+            "97": 0.2826,
+            "98": 0.28464,
+            "99": 0.28558,
+            "100": 0.2859
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..1ce44c0962c
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.78026,
+            "52": 9.67272,
+            "53": 10.02012,
+            "54": 9.89791,
+            "55": 9.81665,
+            "56": 9.56044,
+            "57": 9.4118,
+            "58": 9.77417,
+            "59": 9.51797,
+            "60": 9.43538,
+            "61": 9.64483,
+            "62": 9.93003,
+            "63": 9.30914,
+            "64": 9.72064,
+            "65": 8.87154,
+            "66": 9.6443,
+            "67": 9.3133,
+            "68": 9.74067,
+            "69": 9.75331,
+            "70": 9.70008,
+            "71": 9.56555,
+            "72": 9.53094,
+            "73": 9.44386,
+            "74": 8.86784,
+            "75": 9.3731,
+            "76": 9.01275,
+            "77": 10.02855,
+            "78": 9.68737,
+            "79": 9.328,
+            "80": 9.36163,
+            "81": 9.43365,
+            "82": 9.66095,
+            "83": 9.25139,
+            "84": 9.37351,
+            "85": 9.5694,
+            "86": 9.03181,
+            "87": 9.55583,
+            "88": 9.71053,
+            "89": 9.55398,
+            "90": 9.78474,
+            "91": 9.29074,
+            "92": 9.3124,
+            "93": 9.03138,
+            "94": 8.78672,
+            "95": 9.48731,
+            "96": 9.49047,
+            "97": 9.26687,
+            "98": 9.63648,
+            "99": 8.84331,
+            "100": 9.3555
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 960.0,
+            "52": 970.0,
+            "53": 1045.0,
+            "54": 961.0,
+            "55": 923.0,
+            "56": 1019.0,
+            "57": 841.0,
+            "58": 1004.0,
+            "59": 1037.0,
+            "60": 895.0,
+            "61": 1040.0,
+            "62": 961.0,
+            "63": 902.0,
+            "64": 1056.0,
+            "65": 922.0,
+            "66": 1099.0,
+            "67": 1049.0,
+            "68": 1009.0,
+            "69": 1109.0,
+            "70": 1071.0,
+            "71": 1121.0,
+            "72": 894.0,
+            "73": 1041.0,
+            "74": 731.0,
+            "75": 929.0,
+            "76": 1076.0,
+            "77": 1111.0,
+            "78": 1058.0,
+            "79": 1042.0,
+            "80": 1112.0,
+            "81": 1233.0,
+            "82": 1119.0,
+            "83": 1018.0,
+            "84": 1162.0,
+            "85": 1189.0,
+            "86": 894.0,
+            "87": 1298.0,
+            "88": 1076.0,
+            "89": 1107.0,
+            "90": 1134.0,
+            "91": 1079.0,
+            "92": 1171.0,
+            "93": 928.0,
+            "94": 1150.0,
+            "95": 1176.0,
+            "96": 1207.0,
+            "97": 1049.0,
+            "98": 1192.0,
+            "99": 1082.0,
+            "100": 1082.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 693384704.0,
+            "52": 693384704.0,
+            "53": 693384704.0,
+            "54": 693384704.0,
+            "55": 693384704.0,
+            "56": 693384704.0,
+            "57": 693384704.0,
+            "58": 693384704.0,
+            "59": 693384704.0,
+            "60": 693384704.0,
+            "61": 693384704.0,
+            "62": 693384704.0,
+            "63": 693384704.0,
+            "64": 693384704.0,
+            "65": 693384704.0,
+            "66": 693384704.0,
+            "67": 693384704.0,
+            "68": 693384704.0,
+            "69": 693384704.0,
+            "70": 693384704.0,
+            "71": 693384704.0,
+            "72": 693384704.0,
+            "73": 693384704.0,
+            "74": 693384704.0,
+            "75": 693384704.0,
+            "76": 693384704.0,
+            "77": 693384704.0,
+            "78": 693384704.0,
+            "79": 693384704.0,
+            "80": 693384704.0,
+            "81": 693384704.0,
+            "82": 693384704.0,
+            "83": 693384704.0,
+            "84": 693384704.0,
+            "85": 693384704.0,
+            "86": 693384704.0,
+            "87": 693384704.0,
+            "88": 693384704.0,
+            "89": 693384704.0,
+            "90": 693384704.0,
+            "91": 693384704.0,
+            "92": 693384704.0,
+            "93": 693384704.0,
+            "94": 693384704.0,
+            "95": 693384704.0,
+            "96": 693384704.0,
+            "97": 693384704.0,
+            "98": 693384704.0,
+            "99": 693384704.0,
+            "100": 693384704.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1118993408.0,
+            "52": 1118993408.0,
+            "53": 1118993408.0,
+            "54": 1118993408.0,
+            "55": 1118993408.0,
+            "56": 1118993408.0,
+            "57": 1118993408.0,
+            "58": 1118993408.0,
+            "59": 1118993408.0,
+            "60": 1118993408.0,
+            "61": 1118993408.0,
+            "62": 1118993408.0,
+            "63": 1118993408.0,
+            "64": 1118993408.0,
+            "65": 1118993408.0,
+            "66": 1118993408.0,
+            "67": 1118993408.0,
+            "68": 1118993408.0,
+            "69": 1118993408.0,
+            "70": 1118993408.0,
+            "71": 1118993408.0,
+            "72": 1118993408.0,
+            "73": 1118993408.0,
+            "74": 1118993408.0,
+            "75": 1118993408.0,
+            "76": 1118993408.0,
+            "77": 1118993408.0,
+            "78": 1118993408.0,
+            "79": 1118993408.0,
+            "80": 1118993408.0,
+            "81": 1118993408.0,
+            "82": 1118993408.0,
+            "83": 1118993408.0,
+            "84": 1118993408.0,
+            "85": 1118993408.0,
+            "86": 1118993408.0,
+            "87": 1118993408.0,
+            "88": 1118993408.0,
+            "89": 1118993408.0,
+            "90": 1118993408.0,
+            "91": 1118993408.0,
+            "92": 1118993408.0,
+            "93": 1118993408.0,
+            "94": 1118993408.0,
+            "95": 1118993408.0,
+            "96": 1118993408.0,
+            "97": 1118993408.0,
+            "98": 1118993408.0,
+            "99": 1118993408.0,
+            "100": 1118993408.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 34.29507,
+            "52": 0.3376,
+            "53": 0.30049,
+            "54": 0.29407,
+            "55": 0.28696,
+            "56": 0.29147,
+            "57": 0.28499,
+            "58": 0.28472,
+            "59": 0.28545,
+            "60": 0.28609,
+            "61": 0.2861,
+            "62": 0.28427,
+            "63": 0.28328,
+            "64": 0.28944,
+            "65": 0.28429,
+            "66": 0.31251,
+            "67": 0.28579,
+            "68": 0.28489,
+            "69": 0.28347,
+            "70": 0.28227,
+            "71": 0.28508,
+            "72": 0.28217,
+            "73": 0.27896,
+            "74": 0.28082,
+            "75": 0.28386,
+            "76": 0.28438,
+            "77": 0.2834,
+            "78": 0.28181,
+            "79": 0.28078,
+            "80": 0.27927,
+            "81": 0.28147,
+            "82": 0.28131,
+            "83": 0.28333,
+            "84": 0.29099,
+            "85": 0.28669,
+            "86": 0.28394,
+            "87": 0.28298,
+            "88": 0.28081,
+            "89": 0.28349,
+            "90": 0.28455,
+            "91": 0.28426,
+            "92": 0.28166,
+            "93": 0.28252,
+            "94": 0.28323,
+            "95": 0.28319,
+            "96": 0.28167,
+            "97": 0.28018,
+            "98": 0.2832,
+            "99": 0.28544,
+            "100": 0.28341
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
index d51aa6cf4b8..305e2861ba0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92969, "10": 10.90473, "15": 10.87121, "20": 10.74997, "25": 10.53751, "30": 10.32549, "35": 10.22894, "40": 10.01974, "45": 9.75549, "50": 9.84069, "55": 9.81451, "60": 9.42443, "65": 8.86707, "70": 9.67897, "75": 9.36665, "80": 9.35303, "85": 9.56706, "90": 9.77585, "95": 9.48329, "100": 9.3588}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 582.0, "5": 618.0, "10": 496.0, "15": 672.0, "20": 600.0, "25": 619.0, "30": 678.0, "35": 697.0, "40": 775.0, "45": 770.0, "50": 894.0, "55": 906.0, "60": 932.0, "65": 960.0, "70": 1106.0, "75": 889.0, "80": 1186.0, "85": 1068.0, "90": 1077.0, "95": 1054.0, "100": 1160.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 431783936.0, "5": 431783936.0, "10": 431783936.0, "15": 431783936.0, "20": 431783936.0, "25": 431783936.0, "30": 431783936.0, "35": 431783936.0, "40": 431783936.0, "45": 431783936.0, "50": 431783936.0, "55": 431783936.0, "60": 431783936.0, "65": 431783936.0, "70": 431783936.0, "75": 431783936.0, "80": 431783936.0, "85": 431783936.0, "90": 431783936.0, "95": 431783936.0, "100": 431783936.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.90186, "5": 0.37688, "10": 0.37024, "15": 0.381, "20": 0.38683, "25": 0.39543, "30": 0.38049, "35": 0.36959, "40": 0.36509, "45": 0.364, "50": 0.36469, "55": 0.37647, "60": 0.37716, "65": 0.39072, "70": 0.39183, "75": 0.55129, "80": 0.39335, "85": 0.40289, "90": 0.41031, "95": 0.39498, "100": 0.3918}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93295,
+            "2": 10.93424,
+            "3": 10.91344,
+            "4": 10.90322,
+            "5": 10.92968,
+            "6": 10.93657,
+            "7": 10.90276,
+            "8": 10.92115,
+            "9": 10.90707,
+            "10": 10.90476,
+            "11": 10.88788,
+            "12": 10.91733,
+            "13": 10.91195,
+            "14": 10.91509,
+            "15": 10.87119,
+            "16": 10.86125,
+            "17": 10.82702,
+            "18": 10.85673,
+            "19": 10.84055,
+            "20": 10.74999,
+            "21": 10.71506,
+            "22": 10.58115,
+            "23": 10.72644,
+            "24": 10.6073,
+            "25": 10.5375,
+            "26": 10.61069,
+            "27": 10.5993,
+            "28": 10.54958,
+            "29": 10.56604,
+            "30": 10.32547,
+            "31": 10.067,
+            "32": 10.43808,
+            "33": 10.4236,
+            "34": 10.16016,
+            "35": 10.22895,
+            "36": 10.17614,
+            "37": 10.29234,
+            "38": 10.13297,
+            "39": 10.34954,
+            "40": 10.01975,
+            "41": 10.07535,
+            "42": 10.15411,
+            "43": 9.76087,
+            "44": 9.88356,
+            "45": 9.75546,
+            "46": 9.74961,
+            "47": 10.07545,
+            "48": 9.77936,
+            "49": 9.43816,
+            "50": 9.84068,
+            "51": 9.77754,
+            "52": 9.66521,
+            "53": 10.00741,
+            "54": 9.88875,
+            "55": 9.81454,
+            "56": 9.55923,
+            "57": 9.39915,
+            "58": 9.77272,
+            "59": 9.51594,
+            "60": 9.42442,
+            "61": 9.64311,
+            "62": 9.93502,
+            "63": 9.30274,
+            "64": 9.72154,
+            "65": 8.86709,
+            "66": 9.64655,
+            "67": 9.30856,
+            "68": 9.74064,
+            "69": 9.74152,
+            "70": 9.67899,
+            "71": 9.55875,
+            "72": 9.53277,
+            "73": 9.4385,
+            "74": 8.8823,
+            "75": 9.36667,
+            "76": 9.02475,
+            "77": 10.02955,
+            "78": 9.68853,
+            "79": 9.32607,
+            "80": 9.35305,
+            "81": 9.4325,
+            "82": 9.65191,
+            "83": 9.25404,
+            "84": 9.36521,
+            "85": 9.56708,
+            "86": 9.03549,
+            "87": 9.55775,
+            "88": 9.70743,
+            "89": 9.55898,
+            "90": 9.77585,
+            "91": 9.29644,
+            "92": 9.32116,
+            "93": 9.02865,
+            "94": 8.78309,
+            "95": 9.48327,
+            "96": 9.48473,
+            "97": 9.26675,
+            "98": 9.63739,
+            "99": 8.83895,
+            "100": 9.35878
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 575.0,
+            "2": 614.0,
+            "3": 618.0,
+            "4": 588.0,
+            "5": 658.0,
+            "6": 625.0,
+            "7": 635.0,
+            "8": 591.0,
+            "9": 692.0,
+            "10": 524.0,
+            "11": 700.0,
+            "12": 628.0,
+            "13": 680.0,
+            "14": 644.0,
+            "15": 614.0,
+            "16": 692.0,
+            "17": 646.0,
+            "18": 594.0,
+            "19": 608.0,
+            "20": 585.0,
+            "21": 666.0,
+            "22": 575.0,
+            "23": 672.0,
+            "24": 628.0,
+            "25": 623.0,
+            "26": 614.0,
+            "27": 678.0,
+            "28": 748.0,
+            "29": 717.0,
+            "30": 649.0,
+            "31": 582.0,
+            "32": 677.0,
+            "33": 793.0,
+            "34": 658.0,
+            "35": 685.0,
+            "36": 752.0,
+            "37": 842.0,
+            "38": 786.0,
+            "39": 800.0,
+            "40": 776.0,
+            "41": 804.0,
+            "42": 818.0,
+            "43": 743.0,
+            "44": 783.0,
+            "45": 797.0,
+            "46": 802.0,
+            "47": 891.0,
+            "48": 931.0,
+            "49": 793.0,
+            "50": 810.0,
+            "51": 913.0,
+            "52": 862.0,
+            "53": 982.0,
+            "54": 908.0,
+            "55": 889.0,
+            "56": 1012.0,
+            "57": 865.0,
+            "58": 954.0,
+            "59": 985.0,
+            "60": 924.0,
+            "61": 964.0,
+            "62": 954.0,
+            "63": 848.0,
+            "64": 983.0,
+            "65": 902.0,
+            "66": 1148.0,
+            "67": 973.0,
+            "68": 960.0,
+            "69": 1050.0,
+            "70": 1071.0,
+            "71": 1046.0,
+            "72": 833.0,
+            "73": 997.0,
+            "74": 711.0,
+            "75": 871.0,
+            "76": 1024.0,
+            "77": 1165.0,
+            "78": 1124.0,
+            "79": 1101.0,
+            "80": 1162.0,
+            "81": 1147.0,
+            "82": 1079.0,
+            "83": 959.0,
+            "84": 1124.0,
+            "85": 1142.0,
+            "86": 907.0,
+            "87": 1201.0,
+            "88": 1109.0,
+            "89": 1119.0,
+            "90": 1093.0,
+            "91": 1082.0,
+            "92": 1145.0,
+            "93": 926.0,
+            "94": 1074.0,
+            "95": 1165.0,
+            "96": 1161.0,
+            "97": 1029.0,
+            "98": 1199.0,
+            "99": 1192.0,
+            "100": 1083.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 431522304.0,
+            "2": 431522304.0,
+            "3": 431522304.0,
+            "4": 431522304.0,
+            "5": 431522304.0,
+            "6": 431522304.0,
+            "7": 431522304.0,
+            "8": 431522304.0,
+            "9": 431522304.0,
+            "10": 431522304.0,
+            "11": 431522304.0,
+            "12": 431522304.0,
+            "13": 431522304.0,
+            "14": 431522304.0,
+            "15": 431522304.0,
+            "16": 431522304.0,
+            "17": 431522304.0,
+            "18": 431522304.0,
+            "19": 431522304.0,
+            "20": 431522304.0,
+            "21": 431522304.0,
+            "22": 431522304.0,
+            "23": 431522304.0,
+            "24": 431522304.0,
+            "25": 431522304.0,
+            "26": 431522304.0,
+            "27": 431522304.0,
+            "28": 431522304.0,
+            "29": 431522304.0,
+            "30": 431522304.0,
+            "31": 431522304.0,
+            "32": 431522304.0,
+            "33": 431522304.0,
+            "34": 431522304.0,
+            "35": 431522304.0,
+            "36": 431522304.0,
+            "37": 431522304.0,
+            "38": 431522304.0,
+            "39": 431522304.0,
+            "40": 431522304.0,
+            "41": 431522304.0,
+            "42": 431522304.0,
+            "43": 431522304.0,
+            "44": 431522304.0,
+            "45": 431522304.0,
+            "46": 431522304.0,
+            "47": 431522304.0,
+            "48": 431522304.0,
+            "49": 431522304.0,
+            "50": 431522304.0,
+            "51": 431522304.0,
+            "52": 431522304.0,
+            "53": 431522304.0,
+            "54": 431522304.0,
+            "55": 431522304.0,
+            "56": 431522304.0,
+            "57": 431522304.0,
+            "58": 431522304.0,
+            "59": 431522304.0,
+            "60": 431522304.0,
+            "61": 431522304.0,
+            "62": 431522304.0,
+            "63": 431522304.0,
+            "64": 431522304.0,
+            "65": 431522304.0,
+            "66": 431522304.0,
+            "67": 431522304.0,
+            "68": 431522304.0,
+            "69": 431522304.0,
+            "70": 431522304.0,
+            "71": 431522304.0,
+            "72": 431522304.0,
+            "73": 431522304.0,
+            "74": 431522304.0,
+            "75": 431522304.0,
+            "76": 431522304.0,
+            "77": 431522304.0,
+            "78": 431522304.0,
+            "79": 431522304.0,
+            "80": 431522304.0,
+            "81": 431522304.0,
+            "82": 431522304.0,
+            "83": 431522304.0,
+            "84": 431522304.0,
+            "85": 431522304.0,
+            "86": 431522304.0,
+            "87": 431522304.0,
+            "88": 431522304.0,
+            "89": 431522304.0,
+            "90": 431522304.0,
+            "91": 431522304.0,
+            "92": 431522304.0,
+            "93": 431522304.0,
+            "94": 431522304.0,
+            "95": 431522304.0,
+            "96": 431522304.0,
+            "97": 431522304.0,
+            "98": 431522304.0,
+            "99": 431522304.0,
+            "100": 431522304.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 678369280.0,
+            "2": 861326848.0,
+            "3": 861328384.0,
+            "4": 861328384.0,
+            "5": 861328896.0,
+            "6": 861328896.0,
+            "7": 861328896.0,
+            "8": 861328896.0,
+            "9": 861328896.0,
+            "10": 861328896.0,
+            "11": 861328896.0,
+            "12": 861328896.0,
+            "13": 861328896.0,
+            "14": 861328896.0,
+            "15": 861328896.0,
+            "16": 861328896.0,
+            "17": 861328896.0,
+            "18": 861328896.0,
+            "19": 861328896.0,
+            "20": 861328896.0,
+            "21": 861328896.0,
+            "22": 861328896.0,
+            "23": 861328896.0,
+            "24": 861328896.0,
+            "25": 861328896.0,
+            "26": 861328896.0,
+            "27": 861328896.0,
+            "28": 861328896.0,
+            "29": 861328896.0,
+            "30": 861328896.0,
+            "31": 861328896.0,
+            "32": 861328896.0,
+            "33": 861328896.0,
+            "34": 861328896.0,
+            "35": 861328896.0,
+            "36": 861328896.0,
+            "37": 861328896.0,
+            "38": 861328896.0,
+            "39": 861328896.0,
+            "40": 861328896.0,
+            "41": 861328896.0,
+            "42": 861328896.0,
+            "43": 861328896.0,
+            "44": 861328896.0,
+            "45": 861328896.0,
+            "46": 861328896.0,
+            "47": 861328896.0,
+            "48": 861328896.0,
+            "49": 861328896.0,
+            "50": 861328896.0,
+            "51": 861328896.0,
+            "52": 861328896.0,
+            "53": 861328896.0,
+            "54": 861328896.0,
+            "55": 861328896.0,
+            "56": 861328896.0,
+            "57": 861328896.0,
+            "58": 861328896.0,
+            "59": 861328896.0,
+            "60": 861328896.0,
+            "61": 861328896.0,
+            "62": 861328896.0,
+            "63": 861328896.0,
+            "64": 861328896.0,
+            "65": 861328896.0,
+            "66": 861328896.0,
+            "67": 861328896.0,
+            "68": 861328896.0,
+            "69": 861328896.0,
+            "70": 861328896.0,
+            "71": 861328896.0,
+            "72": 861328896.0,
+            "73": 861328896.0,
+            "74": 861328896.0,
+            "75": 861328896.0,
+            "76": 861328896.0,
+            "77": 861328896.0,
+            "78": 861328896.0,
+            "79": 861328896.0,
+            "80": 861328896.0,
+            "81": 861328896.0,
+            "82": 861328896.0,
+            "83": 861328896.0,
+            "84": 861328896.0,
+            "85": 861328896.0,
+            "86": 861328896.0,
+            "87": 861328896.0,
+            "88": 861328896.0,
+            "89": 861328896.0,
+            "90": 861328896.0,
+            "91": 861328896.0,
+            "92": 861328896.0,
+            "93": 861328896.0,
+            "94": 861328896.0,
+            "95": 861328896.0,
+            "96": 861328896.0,
+            "97": 861328896.0,
+            "98": 861328896.0,
+            "99": 861328896.0,
+            "100": 861328896.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 16.94419,
+            "2": 0.46293,
+            "3": 0.43323,
+            "4": 0.41124,
+            "5": 0.41337,
+            "6": 0.41008,
+            "7": 0.41384,
+            "8": 0.41526,
+            "9": 0.41249,
+            "10": 0.41837,
+            "11": 0.41987,
+            "12": 0.42279,
+            "13": 0.41933,
+            "14": 0.42011,
+            "15": 0.42058,
+            "16": 0.41981,
+            "17": 0.42742,
+            "18": 0.41843,
+            "19": 0.41598,
+            "20": 0.4167,
+            "21": 0.4156,
+            "22": 0.41702,
+            "23": 0.4169,
+            "24": 0.41743,
+            "25": 0.41779,
+            "26": 0.41667,
+            "27": 0.41879,
+            "28": 0.41658,
+            "29": 0.4158,
+            "30": 0.41602,
+            "31": 0.41609,
+            "32": 0.41672,
+            "33": 0.41727,
+            "34": 0.41721,
+            "35": 0.41711,
+            "36": 0.41695,
+            "37": 0.41937,
+            "38": 0.41806,
+            "39": 0.417,
+            "40": 0.41717,
+            "41": 0.41772,
+            "42": 0.41463,
+            "43": 0.41752,
+            "44": 0.41751,
+            "45": 0.41653,
+            "46": 0.41569,
+            "47": 0.4202,
+            "48": 0.41969,
+            "49": 0.42062,
+            "50": 0.42196,
+            "51": 0.9121,
+            "52": 0.41319,
+            "53": 0.41164,
+            "54": 0.41017,
+            "55": 0.4114,
+            "56": 0.41164,
+            "57": 0.41138,
+            "58": 0.40994,
+            "59": 0.41137,
+            "60": 0.41062,
+            "61": 0.41152,
+            "62": 0.41366,
+            "63": 0.4107,
+            "64": 0.41226,
+            "65": 0.41176,
+            "66": 0.41026,
+            "67": 0.41204,
+            "68": 0.4122,
+            "69": 0.41122,
+            "70": 0.41376,
+            "71": 0.41137,
+            "72": 0.41098,
+            "73": 0.41047,
+            "74": 0.4109,
+            "75": 0.4132,
+            "76": 0.41301,
+            "77": 0.41293,
+            "78": 0.41243,
+            "79": 0.41053,
+            "80": 0.41164,
+            "81": 0.40993,
+            "82": 0.41202,
+            "83": 0.41372,
+            "84": 0.4109,
+            "85": 0.4122,
+            "86": 0.41126,
+            "87": 0.41232,
+            "88": 0.41314,
+            "89": 0.41115,
+            "90": 0.41218,
+            "91": 0.4144,
+            "92": 0.41696,
+            "93": 0.41972,
+            "94": 0.42467,
+            "95": 0.4157,
+            "96": 0.41335,
+            "97": 0.41389,
+            "98": 0.4112,
+            "99": 0.41259,
+            "100": 0.41414
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..2453c036dba
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.77754,
+            "52": 9.66523,
+            "53": 10.00743,
+            "54": 9.88877,
+            "55": 9.81452,
+            "56": 9.55922,
+            "57": 9.39915,
+            "58": 9.77267,
+            "59": 9.51591,
+            "60": 9.42443,
+            "61": 9.64313,
+            "62": 9.93504,
+            "63": 9.30269,
+            "64": 9.72154,
+            "65": 8.8671,
+            "66": 9.64654,
+            "67": 9.30858,
+            "68": 9.74062,
+            "69": 9.74154,
+            "70": 9.679,
+            "71": 9.55873,
+            "72": 9.53281,
+            "73": 9.43848,
+            "74": 8.88229,
+            "75": 9.36665,
+            "76": 9.02477,
+            "77": 10.02954,
+            "78": 9.68857,
+            "79": 9.32609,
+            "80": 9.35306,
+            "81": 9.43247,
+            "82": 9.65188,
+            "83": 9.25407,
+            "84": 9.36521,
+            "85": 9.56705,
+            "86": 9.03549,
+            "87": 9.55774,
+            "88": 9.70742,
+            "89": 9.55898,
+            "90": 9.77582,
+            "91": 9.29648,
+            "92": 9.32118,
+            "93": 9.02866,
+            "94": 8.7831,
+            "95": 9.48329,
+            "96": 9.48475,
+            "97": 9.26673,
+            "98": 9.63742,
+            "99": 8.839,
+            "100": 9.35878
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 896.0,
+            "52": 882.0,
+            "53": 967.0,
+            "54": 942.0,
+            "55": 870.0,
+            "56": 923.0,
+            "57": 825.0,
+            "58": 1049.0,
+            "59": 968.0,
+            "60": 865.0,
+            "61": 981.0,
+            "62": 954.0,
+            "63": 820.0,
+            "64": 1016.0,
+            "65": 940.0,
+            "66": 1085.0,
+            "67": 1020.0,
+            "68": 987.0,
+            "69": 1062.0,
+            "70": 1082.0,
+            "71": 1048.0,
+            "72": 855.0,
+            "73": 1061.0,
+            "74": 664.0,
+            "75": 883.0,
+            "76": 1018.0,
+            "77": 1199.0,
+            "78": 1121.0,
+            "79": 1119.0,
+            "80": 1138.0,
+            "81": 1228.0,
+            "82": 1145.0,
+            "83": 906.0,
+            "84": 1179.0,
+            "85": 1108.0,
+            "86": 826.0,
+            "87": 1236.0,
+            "88": 1067.0,
+            "89": 1133.0,
+            "90": 1059.0,
+            "91": 1052.0,
+            "92": 1187.0,
+            "93": 894.0,
+            "94": 1074.0,
+            "95": 1088.0,
+            "96": 1138.0,
+            "97": 1004.0,
+            "98": 1204.0,
+            "99": 1107.0,
+            "100": 1104.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 433357312.0,
+            "52": 433357312.0,
+            "53": 433357312.0,
+            "54": 433357312.0,
+            "55": 433357312.0,
+            "56": 433357312.0,
+            "57": 433357312.0,
+            "58": 433357312.0,
+            "59": 433357312.0,
+            "60": 433357312.0,
+            "61": 433357312.0,
+            "62": 433357312.0,
+            "63": 433357312.0,
+            "64": 433357312.0,
+            "65": 433357312.0,
+            "66": 433357312.0,
+            "67": 433357312.0,
+            "68": 433357312.0,
+            "69": 433357312.0,
+            "70": 433357312.0,
+            "71": 433357312.0,
+            "72": 433357312.0,
+            "73": 433357312.0,
+            "74": 433357312.0,
+            "75": 433357312.0,
+            "76": 433357312.0,
+            "77": 433357312.0,
+            "78": 433357312.0,
+            "79": 433357312.0,
+            "80": 433357312.0,
+            "81": 433357312.0,
+            "82": 433357312.0,
+            "83": 433357312.0,
+            "84": 433357312.0,
+            "85": 433357312.0,
+            "86": 433357312.0,
+            "87": 433357312.0,
+            "88": 433357312.0,
+            "89": 433357312.0,
+            "90": 433357312.0,
+            "91": 433357312.0,
+            "92": 433357312.0,
+            "93": 433357312.0,
+            "94": 433357312.0,
+            "95": 433357312.0,
+            "96": 433357312.0,
+            "97": 433357312.0,
+            "98": 433357312.0,
+            "99": 433357312.0,
+            "100": 433357312.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 861062656.0,
+            "52": 861850624.0,
+            "53": 861850624.0,
+            "54": 861850624.0,
+            "55": 861850624.0,
+            "56": 861850624.0,
+            "57": 861850624.0,
+            "58": 861850624.0,
+            "59": 861850624.0,
+            "60": 861850624.0,
+            "61": 861850624.0,
+            "62": 861850624.0,
+            "63": 861850624.0,
+            "64": 861850624.0,
+            "65": 861850624.0,
+            "66": 861850624.0,
+            "67": 861850624.0,
+            "68": 861850624.0,
+            "69": 861850624.0,
+            "70": 861850624.0,
+            "71": 861852160.0,
+            "72": 861852160.0,
+            "73": 861852160.0,
+            "74": 861852160.0,
+            "75": 861852160.0,
+            "76": 861852160.0,
+            "77": 861853184.0,
+            "78": 861853184.0,
+            "79": 861853184.0,
+            "80": 861853184.0,
+            "81": 861853184.0,
+            "82": 861853184.0,
+            "83": 861853184.0,
+            "84": 861853184.0,
+            "85": 861853184.0,
+            "86": 861853184.0,
+            "87": 861853184.0,
+            "88": 861853184.0,
+            "89": 861853184.0,
+            "90": 861853184.0,
+            "91": 861853184.0,
+            "92": 861853184.0,
+            "93": 861853184.0,
+            "94": 861853184.0,
+            "95": 861853184.0,
+            "96": 861853184.0,
+            "97": 861853184.0,
+            "98": 861853184.0,
+            "99": 861853184.0,
+            "100": 861853184.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 19.65307,
+            "52": 0.45052,
+            "53": 0.42082,
+            "54": 0.41811,
+            "55": 0.41814,
+            "56": 0.41733,
+            "57": 0.41818,
+            "58": 0.418,
+            "59": 0.41748,
+            "60": 0.41977,
+            "61": 0.41771,
+            "62": 0.42393,
+            "63": 0.42754,
+            "64": 0.42379,
+            "65": 0.42104,
+            "66": 0.42071,
+            "67": 0.4201,
+            "68": 0.41916,
+            "69": 0.41995,
+            "70": 0.4222,
+            "71": 0.42158,
+            "72": 0.42185,
+            "73": 0.41889,
+            "74": 0.42962,
+            "75": 0.42666,
+            "76": 0.4191,
+            "77": 0.421,
+            "78": 0.42068,
+            "79": 0.41987,
+            "80": 0.41899,
+            "81": 0.41896,
+            "82": 0.42029,
+            "83": 0.41923,
+            "84": 0.419,
+            "85": 0.42028,
+            "86": 0.41955,
+            "87": 0.41973,
+            "88": 0.41946,
+            "89": 0.41924,
+            "90": 0.42048,
+            "91": 0.42238,
+            "92": 0.42092,
+            "93": 0.42289,
+            "94": 0.42394,
+            "95": 0.42171,
+            "96": 0.42176,
+            "97": 0.42119,
+            "98": 0.42004,
+            "99": 0.42349,
+            "100": 0.42222
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..04cd8d66a75
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86836,
+            "2": 10.88595,
+            "3": 10.86558,
+            "4": 10.86895,
+            "5": 10.87421,
+            "6": 10.89062,
+            "7": 10.87675,
+            "8": 10.86479,
+            "9": 10.88234,
+            "10": 10.8458,
+            "11": 10.87162,
+            "12": 10.8742,
+            "13": 10.88161,
+            "14": 10.88884,
+            "15": 10.83933,
+            "16": 10.82498,
+            "17": 10.80147,
+            "18": 10.81236,
+            "19": 10.82152,
+            "20": 10.71934,
+            "21": 10.6909,
+            "22": 10.57424,
+            "23": 10.71092,
+            "24": 10.5978,
+            "25": 10.55562,
+            "26": 10.61523,
+            "27": 10.60449,
+            "28": 10.56485,
+            "29": 10.58474,
+            "30": 10.35951,
+            "31": 10.12154,
+            "32": 10.45235,
+            "33": 10.4572,
+            "34": 10.2199,
+            "35": 10.26443,
+            "36": 10.21037,
+            "37": 10.33956,
+            "38": 10.18016,
+            "39": 10.39593,
+            "40": 10.0663,
+            "41": 10.14164,
+            "42": 10.20852,
+            "43": 9.8313,
+            "44": 9.94856,
+            "45": 9.82849,
+            "46": 9.80457,
+            "47": 10.14229,
+            "48": 9.84462,
+            "49": 9.52191,
+            "50": 9.88601,
+            "51": 9.8498,
+            "52": 9.74427,
+            "53": 10.05843,
+            "54": 9.95125,
+            "55": 9.88343,
+            "56": 9.61327,
+            "57": 9.46899,
+            "58": 9.82164,
+            "59": 9.57703,
+            "60": 9.49784,
+            "61": 9.69255,
+            "62": 9.98596,
+            "63": 9.37402,
+            "64": 9.76603,
+            "65": 8.94654,
+            "66": 9.70099,
+            "67": 9.36365,
+            "68": 9.78238,
+            "69": 9.7988,
+            "70": 9.73169,
+            "71": 9.62505,
+            "72": 9.58309,
+            "73": 9.4882,
+            "74": 8.92607,
+            "75": 9.40727,
+            "76": 9.07708,
+            "77": 10.0586,
+            "78": 9.72209,
+            "79": 9.37663,
+            "80": 9.40272,
+            "81": 9.48207,
+            "82": 9.69954,
+            "83": 9.31354,
+            "84": 9.4173,
+            "85": 9.61582,
+            "86": 9.07431,
+            "87": 9.59556,
+            "88": 9.75064,
+            "89": 9.6004,
+            "90": 9.82205,
+            "91": 9.33874,
+            "92": 9.35779,
+            "93": 9.08668,
+            "94": 8.8296,
+            "95": 9.52596,
+            "96": 9.52974,
+            "97": 9.30335,
+            "98": 9.67136,
+            "99": 8.89539,
+            "100": 9.40567
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1580.0,
+            "2": 1610.0,
+            "3": 1625.0,
+            "4": 1685.0,
+            "5": 1825.0,
+            "6": 1771.0,
+            "7": 1831.0,
+            "8": 1645.0,
+            "9": 1814.0,
+            "10": 1387.0,
+            "11": 1742.0,
+            "12": 1649.0,
+            "13": 1757.0,
+            "14": 1705.0,
+            "15": 1827.0,
+            "16": 1765.0,
+            "17": 1835.0,
+            "18": 1602.0,
+            "19": 1814.0,
+            "20": 1735.0,
+            "21": 1895.0,
+            "22": 1594.0,
+            "23": 1902.0,
+            "24": 1633.0,
+            "25": 1574.0,
+            "26": 1681.0,
+            "27": 1676.0,
+            "28": 1961.0,
+            "29": 1851.0,
+            "30": 1863.0,
+            "31": 1499.0,
+            "32": 1896.0,
+            "33": 2118.0,
+            "34": 1725.0,
+            "35": 1879.0,
+            "36": 1880.0,
+            "37": 2347.0,
+            "38": 2044.0,
+            "39": 2283.0,
+            "40": 2155.0,
+            "41": 2224.0,
+            "42": 2169.0,
+            "43": 1958.0,
+            "44": 2050.0,
+            "45": 2130.0,
+            "46": 2346.0,
+            "47": 2418.0,
+            "48": 2243.0,
+            "49": 2161.0,
+            "50": 2479.0,
+            "51": 2480.0,
+            "52": 2545.0,
+            "53": 2875.0,
+            "54": 2652.0,
+            "55": 2384.0,
+            "56": 2742.0,
+            "57": 2201.0,
+            "58": 2755.0,
+            "59": 2954.0,
+            "60": 2367.0,
+            "61": 2889.0,
+            "62": 2721.0,
+            "63": 2438.0,
+            "64": 2928.0,
+            "65": 2567.0,
+            "66": 2751.0,
+            "67": 2802.0,
+            "68": 2714.0,
+            "69": 2884.0,
+            "70": 3124.0,
+            "71": 2813.0,
+            "72": 2504.0,
+            "73": 2852.0,
+            "74": 1975.0,
+            "75": 2429.0,
+            "76": 2850.0,
+            "77": 3008.0,
+            "78": 3110.0,
+            "79": 3114.0,
+            "80": 3284.0,
+            "81": 3574.0,
+            "82": 3207.0,
+            "83": 2530.0,
+            "84": 3169.0,
+            "85": 3150.0,
+            "86": 2588.0,
+            "87": 3845.0,
+            "88": 3094.0,
+            "89": 3389.0,
+            "90": 3077.0,
+            "91": 2872.0,
+            "92": 3012.0,
+            "93": 2685.0,
+            "94": 3279.0,
+            "95": 3231.0,
+            "96": 3422.0,
+            "97": 3154.0,
+            "98": 3498.0,
+            "99": 3043.0,
+            "100": 3361.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 465207808.0,
+            "2": 466256384.0,
+            "3": 466256384.0,
+            "4": 466256384.0,
+            "5": 466256384.0,
+            "6": 466256384.0,
+            "7": 466256384.0,
+            "8": 466256384.0,
+            "9": 466256384.0,
+            "10": 466256384.0,
+            "11": 466256384.0,
+            "12": 466256384.0,
+            "13": 466256384.0,
+            "14": 466256384.0,
+            "15": 466256384.0,
+            "16": 466256384.0,
+            "17": 466256384.0,
+            "18": 466256384.0,
+            "19": 466256384.0,
+            "20": 466256384.0,
+            "21": 466256384.0,
+            "22": 466256384.0,
+            "23": 466256384.0,
+            "24": 466256384.0,
+            "25": 466256384.0,
+            "26": 466256384.0,
+            "27": 466256384.0,
+            "28": 466256384.0,
+            "29": 466256384.0,
+            "30": 466256384.0,
+            "31": 466256384.0,
+            "32": 466256384.0,
+            "33": 466256384.0,
+            "34": 466256384.0,
+            "35": 466256384.0,
+            "36": 466256384.0,
+            "37": 466256384.0,
+            "38": 466256384.0,
+            "39": 466256384.0,
+            "40": 466256384.0,
+            "41": 466256384.0,
+            "42": 466256384.0,
+            "43": 466256384.0,
+            "44": 466256384.0,
+            "45": 466256384.0,
+            "46": 466256384.0,
+            "47": 466256384.0,
+            "48": 466256384.0,
+            "49": 466256384.0,
+            "50": 466256384.0,
+            "51": 466256384.0,
+            "52": 466256384.0,
+            "53": 466256384.0,
+            "54": 466256384.0,
+            "55": 466256384.0,
+            "56": 466256384.0,
+            "57": 466256384.0,
+            "58": 466256384.0,
+            "59": 466256384.0,
+            "60": 466256384.0,
+            "61": 466256384.0,
+            "62": 466256384.0,
+            "63": 466256384.0,
+            "64": 466256384.0,
+            "65": 466256384.0,
+            "66": 466256384.0,
+            "67": 466256384.0,
+            "68": 466256384.0,
+            "69": 466256384.0,
+            "70": 466256384.0,
+            "71": 466256384.0,
+            "72": 466256384.0,
+            "73": 466256384.0,
+            "74": 466256384.0,
+            "75": 466256384.0,
+            "76": 466256384.0,
+            "77": 466256384.0,
+            "78": 466256384.0,
+            "79": 466256384.0,
+            "80": 466256384.0,
+            "81": 466256384.0,
+            "82": 466256384.0,
+            "83": 466256384.0,
+            "84": 466256384.0,
+            "85": 466256384.0,
+            "86": 466256384.0,
+            "87": 466256384.0,
+            "88": 466256384.0,
+            "89": 466256384.0,
+            "90": 466256384.0,
+            "91": 466256384.0,
+            "92": 466256384.0,
+            "93": 466256384.0,
+            "94": 466256384.0,
+            "95": 466256384.0,
+            "96": 466256384.0,
+            "97": 466256384.0,
+            "98": 466256384.0,
+            "99": 466256384.0,
+            "100": 466256384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1728999424.0,
+            "2": 1789536768.0,
+            "3": 1789536768.0,
+            "4": 1789536768.0,
+            "5": 1789536768.0,
+            "6": 1789536768.0,
+            "7": 1789536768.0,
+            "8": 1789536768.0,
+            "9": 1789536768.0,
+            "10": 1789536768.0,
+            "11": 1789536768.0,
+            "12": 1789536768.0,
+            "13": 1789536768.0,
+            "14": 1789536768.0,
+            "15": 1789536768.0,
+            "16": 1789536768.0,
+            "17": 1789536768.0,
+            "18": 1789536768.0,
+            "19": 1789536768.0,
+            "20": 1789536768.0,
+            "21": 1789536768.0,
+            "22": 1789536768.0,
+            "23": 1789536768.0,
+            "24": 1789536768.0,
+            "25": 1789536768.0,
+            "26": 1789536768.0,
+            "27": 1789536768.0,
+            "28": 1789536768.0,
+            "29": 1789536768.0,
+            "30": 1789536768.0,
+            "31": 1789536768.0,
+            "32": 1789536768.0,
+            "33": 1789536768.0,
+            "34": 1789536768.0,
+            "35": 1789536768.0,
+            "36": 1789536768.0,
+            "37": 1789536768.0,
+            "38": 1789536768.0,
+            "39": 1789536768.0,
+            "40": 1789536768.0,
+            "41": 1789536768.0,
+            "42": 1789536768.0,
+            "43": 1789536768.0,
+            "44": 1789536768.0,
+            "45": 1789536768.0,
+            "46": 1789536768.0,
+            "47": 1789536768.0,
+            "48": 1789536768.0,
+            "49": 1789536768.0,
+            "50": 1789536768.0,
+            "51": 1789536768.0,
+            "52": 1789536768.0,
+            "53": 1789536768.0,
+            "54": 1789536768.0,
+            "55": 1789536768.0,
+            "56": 1789536768.0,
+            "57": 1789536768.0,
+            "58": 1789536768.0,
+            "59": 1789536768.0,
+            "60": 1789536768.0,
+            "61": 1789536768.0,
+            "62": 1789536768.0,
+            "63": 1789536768.0,
+            "64": 1789536768.0,
+            "65": 1789536768.0,
+            "66": 1789536768.0,
+            "67": 1789536768.0,
+            "68": 1789536768.0,
+            "69": 1789536768.0,
+            "70": 1789536768.0,
+            "71": 1789536768.0,
+            "72": 1789536768.0,
+            "73": 1789536768.0,
+            "74": 1789536768.0,
+            "75": 1789536768.0,
+            "76": 1789536768.0,
+            "77": 1789536768.0,
+            "78": 1789536768.0,
+            "79": 1789536768.0,
+            "80": 1789536768.0,
+            "81": 1789536768.0,
+            "82": 1789536768.0,
+            "83": 1789536768.0,
+            "84": 1789536768.0,
+            "85": 1789536768.0,
+            "86": 1789536768.0,
+            "87": 1789536768.0,
+            "88": 1789536768.0,
+            "89": 1789536768.0,
+            "90": 1789536768.0,
+            "91": 1789536768.0,
+            "92": 1789536768.0,
+            "93": 1789536768.0,
+            "94": 1789536768.0,
+            "95": 1789536768.0,
+            "96": 1789536768.0,
+            "97": 1789536768.0,
+            "98": 1789536768.0,
+            "99": 1789536768.0,
+            "100": 1789536768.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.69056,
+            "3": 0.17999,
+            "4": 0.16451,
+            "5": 0.16508,
+            "6": 0.17235,
+            "7": 0.16329,
+            "8": 0.1626,
+            "9": 0.16188,
+            "10": 0.16733,
+            "11": 0.16471,
+            "12": 0.16323,
+            "13": 0.16176,
+            "14": 0.16306,
+            "15": 0.16415,
+            "16": 0.16286,
+            "17": 0.16013,
+            "18": 0.16147,
+            "19": 0.17142,
+            "20": 0.1614,
+            "21": 0.16056,
+            "22": 0.16073,
+            "23": 0.1704,
+            "24": 0.16109,
+            "25": 0.16097,
+            "26": 0.16623,
+            "27": 0.15978,
+            "28": 0.17015,
+            "29": 0.17103,
+            "30": 0.18177,
+            "31": 0.18267,
+            "32": 0.18537,
+            "33": 0.18546,
+            "34": 0.18686,
+            "35": 0.18715,
+            "36": 0.18598,
+            "37": 0.18556,
+            "38": 0.18847,
+            "39": 0.187,
+            "40": 0.18548,
+            "41": 0.19477,
+            "42": 0.18691,
+            "43": 0.18628,
+            "44": 0.18945,
+            "45": 0.18687,
+            "46": 0.18766,
+            "47": 0.18828,
+            "48": 0.1885,
+            "49": 0.18744,
+            "50": 0.18918,
+            "51": 0.20273,
+            "52": 0.182,
+            "53": 0.18,
+            "54": 0.17575,
+            "55": 0.17407,
+            "56": 0.17222,
+            "57": 0.16988,
+            "58": 0.17015,
+            "59": 0.17038,
+            "60": 0.16865,
+            "61": 0.16894,
+            "62": 0.16852,
+            "63": 0.16574,
+            "64": 0.16829,
+            "65": 0.16644,
+            "66": 0.16896,
+            "67": 0.16934,
+            "68": 0.1675,
+            "69": 0.16535,
+            "70": 0.16738,
+            "71": 0.17159,
+            "72": 0.18394,
+            "73": 0.18193,
+            "74": 0.18302,
+            "75": 0.1832,
+            "76": 0.18125,
+            "77": 0.17794,
+            "78": 0.17778,
+            "79": 0.17611,
+            "80": 0.17384,
+            "81": 0.17173,
+            "82": 0.16989,
+            "83": 0.16782,
+            "84": 0.16781,
+            "85": 0.16901,
+            "86": 0.16737,
+            "87": 0.16701,
+            "88": 0.16719,
+            "89": 0.16644,
+            "90": 0.16551,
+            "91": 0.16712,
+            "92": 0.16502,
+            "93": 0.16672,
+            "94": 0.1665,
+            "95": 0.1653,
+            "96": 0.16686,
+            "97": 0.16586,
+            "98": 0.16635,
+            "99": 0.1655,
+            "100": 0.16563
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..558ad752f07
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8498,
+            "52": 9.74428,
+            "53": 10.05842,
+            "54": 9.95129,
+            "55": 9.88341,
+            "56": 9.61325,
+            "57": 9.46897,
+            "58": 9.82163,
+            "59": 9.57702,
+            "60": 9.49787,
+            "61": 9.69255,
+            "62": 9.98598,
+            "63": 9.37403,
+            "64": 9.76601,
+            "65": 8.94652,
+            "66": 9.70103,
+            "67": 9.36369,
+            "68": 9.7824,
+            "69": 9.79882,
+            "70": 9.73168,
+            "71": 9.6251,
+            "72": 9.58313,
+            "73": 9.4882,
+            "74": 8.92611,
+            "75": 9.40723,
+            "76": 9.07704,
+            "77": 10.05859,
+            "78": 9.72209,
+            "79": 9.37661,
+            "80": 9.40273,
+            "81": 9.48205,
+            "82": 9.69955,
+            "83": 9.31352,
+            "84": 9.41732,
+            "85": 9.61583,
+            "86": 9.07429,
+            "87": 9.59556,
+            "88": 9.75065,
+            "89": 9.60041,
+            "90": 9.82204,
+            "91": 9.33875,
+            "92": 9.35776,
+            "93": 9.08668,
+            "94": 8.82962,
+            "95": 9.52594,
+            "96": 9.52969,
+            "97": 9.30331,
+            "98": 9.67138,
+            "99": 8.89538,
+            "100": 9.40569
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2514.0,
+            "52": 2584.0,
+            "53": 3025.0,
+            "54": 2653.0,
+            "55": 2268.0,
+            "56": 2637.0,
+            "57": 2225.0,
+            "58": 2904.0,
+            "59": 2970.0,
+            "60": 2399.0,
+            "61": 2925.0,
+            "62": 2639.0,
+            "63": 2383.0,
+            "64": 2889.0,
+            "65": 2675.0,
+            "66": 2992.0,
+            "67": 2764.0,
+            "68": 2725.0,
+            "69": 2865.0,
+            "70": 3077.0,
+            "71": 2923.0,
+            "72": 2414.0,
+            "73": 2906.0,
+            "74": 1947.0,
+            "75": 2449.0,
+            "76": 2976.0,
+            "77": 3163.0,
+            "78": 3186.0,
+            "79": 3172.0,
+            "80": 3344.0,
+            "81": 3625.0,
+            "82": 3289.0,
+            "83": 2699.0,
+            "84": 3102.0,
+            "85": 3227.0,
+            "86": 2754.0,
+            "87": 3714.0,
+            "88": 3004.0,
+            "89": 3321.0,
+            "90": 3134.0,
+            "91": 2714.0,
+            "92": 3077.0,
+            "93": 2631.0,
+            "94": 3309.0,
+            "95": 3226.0,
+            "96": 3473.0,
+            "97": 3216.0,
+            "98": 3581.0,
+            "99": 3061.0,
+            "100": 3419.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 465864192.0,
+            "52": 465864192.0,
+            "53": 465864192.0,
+            "54": 465864192.0,
+            "55": 465864192.0,
+            "56": 465864192.0,
+            "57": 465864192.0,
+            "58": 465864192.0,
+            "59": 465864192.0,
+            "60": 465864192.0,
+            "61": 465864192.0,
+            "62": 465864192.0,
+            "63": 465864192.0,
+            "64": 465864192.0,
+            "65": 465864192.0,
+            "66": 465864192.0,
+            "67": 465864192.0,
+            "68": 465864192.0,
+            "69": 465864192.0,
+            "70": 465864192.0,
+            "71": 465864192.0,
+            "72": 465864192.0,
+            "73": 465864192.0,
+            "74": 465864192.0,
+            "75": 465864192.0,
+            "76": 465864192.0,
+            "77": 465864192.0,
+            "78": 465864192.0,
+            "79": 465864192.0,
+            "80": 465864192.0,
+            "81": 465864192.0,
+            "82": 465864192.0,
+            "83": 465864192.0,
+            "84": 465864192.0,
+            "85": 465864192.0,
+            "86": 465864192.0,
+            "87": 465864192.0,
+            "88": 465864192.0,
+            "89": 465864192.0,
+            "90": 465864192.0,
+            "91": 465864192.0,
+            "92": 465864192.0,
+            "93": 465864192.0,
+            "94": 465864192.0,
+            "95": 465864192.0,
+            "96": 465864192.0,
+            "97": 465864192.0,
+            "98": 465864192.0,
+            "99": 465864192.0,
+            "100": 465864192.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1911693312.0,
+            "52": 1911693312.0,
+            "53": 1911693312.0,
+            "54": 1911693312.0,
+            "55": 1911693312.0,
+            "56": 1911693312.0,
+            "57": 1911693312.0,
+            "58": 1911693312.0,
+            "59": 1911693312.0,
+            "60": 1911693312.0,
+            "61": 1911693312.0,
+            "62": 1911693312.0,
+            "63": 1911693312.0,
+            "64": 1911693312.0,
+            "65": 1911693312.0,
+            "66": 1911693312.0,
+            "67": 1911693312.0,
+            "68": 1911693312.0,
+            "69": 1911693312.0,
+            "70": 1911693312.0,
+            "71": 1911693312.0,
+            "72": 1911693312.0,
+            "73": 1911693312.0,
+            "74": 1911693312.0,
+            "75": 1911693312.0,
+            "76": 1911693312.0,
+            "77": 1911693312.0,
+            "78": 1911693312.0,
+            "79": 1911693312.0,
+            "80": 1911693312.0,
+            "81": 1911693312.0,
+            "82": 1911693312.0,
+            "83": 1911693312.0,
+            "84": 1911693312.0,
+            "85": 1911693312.0,
+            "86": 1911693312.0,
+            "87": 1911693312.0,
+            "88": 1911693312.0,
+            "89": 1911693312.0,
+            "90": 1911693312.0,
+            "91": 1911693312.0,
+            "92": 1911693312.0,
+            "93": 1911693312.0,
+            "94": 1911693312.0,
+            "95": 1911693312.0,
+            "96": 1911693312.0,
+            "97": 1911693312.0,
+            "98": 1911693312.0,
+            "99": 1911693312.0,
+            "100": 1911693312.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.81421,
+            "53": 0.16853,
+            "54": 0.15862,
+            "55": 0.15827,
+            "56": 0.15923,
+            "57": 0.15884,
+            "58": 0.15972,
+            "59": 0.15955,
+            "60": 0.16137,
+            "61": 0.15962,
+            "62": 0.16098,
+            "63": 0.15948,
+            "64": 0.15791,
+            "65": 0.15969,
+            "66": 0.15933,
+            "67": 0.17128,
+            "68": 0.15958,
+            "69": 0.16526,
+            "70": 0.15854,
+            "71": 0.16076,
+            "72": 0.15949,
+            "73": 0.1598,
+            "74": 0.15944,
+            "75": 0.15956,
+            "76": 0.1605,
+            "77": 0.15954,
+            "78": 0.15934,
+            "79": 0.16153,
+            "80": 0.16883,
+            "81": 0.16008,
+            "82": 0.16051,
+            "83": 0.16043,
+            "84": 0.16049,
+            "85": 0.16138,
+            "86": 0.16025,
+            "87": 0.16089,
+            "88": 0.15937,
+            "89": 0.16098,
+            "90": 0.16047,
+            "91": 0.16142,
+            "92": 0.1613,
+            "93": 0.16027,
+            "94": 0.16427,
+            "95": 0.16157,
+            "96": 0.16144,
+            "97": 0.16147,
+            "98": 0.16068,
+            "99": 0.16024,
+            "100": 0.15949
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
index eb0e5f82b03..13709a61234 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json
@@ -6,104 +6,104 @@
         "values": {
             "1": 10.85949,
             "2": 10.85553,
-            "3": 10.86548,
+            "3": 10.86549,
             "4": 10.84554,
-            "5": 10.88344,
-            "6": 10.89429,
-            "7": 10.87068,
-            "8": 10.86983,
-            "9": 10.86919,
+            "5": 10.88343,
+            "6": 10.89431,
+            "7": 10.87071,
+            "8": 10.86985,
+            "9": 10.86923,
             "10": 10.83883,
-            "11": 10.89435,
-            "12": 10.8798,
+            "11": 10.89433,
+            "12": 10.87981,
             "13": 10.87987,
-            "14": 10.90317,
-            "15": 10.8405,
-            "16": 10.83786,
-            "17": 10.80668,
-            "18": 10.83025,
-            "19": 10.82262,
-            "20": 10.73192,
-            "21": 10.7075,
-            "22": 10.56005,
+            "14": 10.90321,
+            "15": 10.84051,
+            "16": 10.83788,
+            "17": 10.8067,
+            "18": 10.83029,
+            "19": 10.82265,
+            "20": 10.73194,
+            "21": 10.70748,
+            "22": 10.56007,
             "23": 10.72406,
-            "24": 10.61116,
-            "25": 10.5481,
-            "26": 10.61334,
-            "27": 10.6305,
-            "28": 10.56645,
-            "29": 10.59672,
-            "30": 10.37136,
-            "31": 10.11721,
-            "32": 10.46127,
-            "33": 10.45247,
+            "24": 10.61115,
+            "25": 10.54815,
+            "26": 10.61326,
+            "27": 10.63058,
+            "28": 10.56646,
+            "29": 10.59668,
+            "30": 10.37135,
+            "31": 10.11724,
+            "32": 10.46129,
+            "33": 10.45251,
             "34": 10.21687,
-            "35": 10.27171,
-            "36": 10.2312,
-            "37": 10.34809,
-            "38": 10.18842,
-            "39": 10.41042,
-            "40": 10.09426,
-            "41": 10.14711,
-            "42": 10.21247,
-            "43": 9.84106,
-            "44": 9.95919,
-            "45": 9.84082,
-            "46": 9.82482,
-            "47": 10.13882,
-            "48": 9.85839,
+            "35": 10.2717,
+            "36": 10.23118,
+            "37": 10.34811,
+            "38": 10.18844,
+            "39": 10.4104,
+            "40": 10.09431,
+            "41": 10.14712,
+            "42": 10.21245,
+            "43": 9.84104,
+            "44": 9.95916,
+            "45": 9.84088,
+            "46": 9.82483,
+            "47": 10.13881,
+            "48": 9.85842,
             "49": 9.5472,
             "50": 9.90883,
             "51": 9.85585,
             "52": 9.75243,
-            "53": 10.07588,
-            "54": 9.95691,
-            "55": 9.88207,
-            "56": 9.63139,
-            "57": 9.48649,
-            "58": 9.83116,
-            "59": 9.58907,
-            "60": 9.50648,
-            "61": 9.70368,
-            "62": 9.98289,
-            "63": 9.38314,
-            "64": 9.7791,
-            "65": 8.95182,
-            "66": 9.70161,
+            "53": 10.07586,
+            "54": 9.95687,
+            "55": 9.88208,
+            "56": 9.63141,
+            "57": 9.48653,
+            "58": 9.83119,
+            "59": 9.58905,
+            "60": 9.50652,
+            "61": 9.7037,
+            "62": 9.98292,
+            "63": 9.38312,
+            "64": 9.77906,
+            "65": 8.95185,
+            "66": 9.70159,
             "67": 9.37209,
-            "68": 9.78856,
-            "69": 9.79856,
-            "70": 9.74748,
+            "68": 9.78851,
+            "69": 9.79857,
+            "70": 9.74745,
             "71": 9.6191,
-            "72": 9.585,
-            "73": 9.49728,
-            "74": 8.93928,
-            "75": 9.42702,
+            "72": 9.58502,
+            "73": 9.4973,
+            "74": 8.93931,
+            "75": 9.42703,
             "76": 9.08022,
-            "77": 10.06569,
-            "78": 9.72897,
-            "79": 9.37772,
-            "80": 9.41001,
-            "81": 9.47977,
-            "82": 9.70183,
-            "83": 9.30621,
-            "84": 9.42098,
-            "85": 9.61377,
-            "86": 9.07654,
-            "87": 9.59456,
-            "88": 9.75071,
+            "77": 10.0657,
+            "78": 9.72894,
+            "79": 9.37773,
+            "80": 9.41006,
+            "81": 9.4798,
+            "82": 9.70181,
+            "83": 9.30619,
+            "84": 9.42095,
+            "85": 9.6138,
+            "86": 9.07653,
+            "87": 9.59452,
+            "88": 9.75069,
             "89": 9.60243,
-            "90": 9.81899,
-            "91": 9.33898,
-            "92": 9.35718,
-            "93": 9.07884,
-            "94": 8.83509,
-            "95": 9.52175,
-            "96": 9.53007,
-            "97": 9.31309,
-            "98": 9.67781,
-            "99": 8.89061,
-            "100": 9.39729
+            "90": 9.81897,
+            "91": 9.33895,
+            "92": 9.35716,
+            "93": 9.07885,
+            "94": 8.83508,
+            "95": 9.52177,
+            "96": 9.53006,
+            "97": 9.31311,
+            "98": 9.67783,
+            "99": 8.89063,
+            "100": 9.39728
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1690.0,
-            "2": 1776.0,
-            "3": 1642.0,
-            "4": 1825.0,
-            "5": 1809.0,
-            "6": 1795.0,
-            "7": 1830.0,
-            "8": 1626.0,
-            "9": 1878.0,
-            "10": 1423.0,
-            "11": 1868.0,
-            "12": 1653.0,
-            "13": 1897.0,
-            "14": 1783.0,
-            "15": 1861.0,
-            "16": 1938.0,
-            "17": 1825.0,
-            "18": 1730.0,
-            "19": 1727.0,
-            "20": 1735.0,
-            "21": 1783.0,
-            "22": 1576.0,
-            "23": 1949.0,
-            "24": 1630.0,
-            "25": 1498.0,
-            "26": 1649.0,
-            "27": 1809.0,
-            "28": 2019.0,
-            "29": 2009.0,
-            "30": 1832.0,
-            "31": 1524.0,
-            "32": 1943.0,
-            "33": 2081.0,
-            "34": 1888.0,
-            "35": 1935.0,
-            "36": 1898.0,
-            "37": 2325.0,
-            "38": 2070.0,
-            "39": 2248.0,
-            "40": 2199.0,
-            "41": 2264.0,
-            "42": 2349.0,
-            "43": 2087.0,
-            "44": 2107.0,
-            "45": 2098.0,
-            "46": 2407.0,
-            "47": 2456.0,
-            "48": 2404.0,
-            "49": 2417.0,
-            "50": 2407.0,
-            "51": 2578.0,
-            "52": 2630.0,
-            "53": 2857.0,
-            "54": 2818.0,
-            "55": 2368.0,
-            "56": 2757.0,
-            "57": 2423.0,
-            "58": 2776.0,
-            "59": 2742.0,
-            "60": 2371.0,
-            "61": 2906.0,
-            "62": 2517.0,
-            "63": 2374.0,
-            "64": 2995.0,
-            "65": 2634.0,
-            "66": 2995.0,
-            "67": 2884.0,
-            "68": 2840.0,
-            "69": 2766.0,
-            "70": 3006.0,
-            "71": 3023.0,
-            "72": 2386.0,
-            "73": 2958.0,
-            "74": 1851.0,
-            "75": 2585.0,
-            "76": 2973.0,
-            "77": 3244.0,
-            "78": 3142.0,
-            "79": 3185.0,
-            "80": 3249.0,
-            "81": 3665.0,
-            "82": 3153.0,
-            "83": 2821.0,
-            "84": 3083.0,
-            "85": 3247.0,
-            "86": 2734.0,
-            "87": 3759.0,
-            "88": 2968.0,
-            "89": 3282.0,
-            "90": 3064.0,
-            "91": 2908.0,
-            "92": 2946.0,
-            "93": 2592.0,
-            "94": 3363.0,
-            "95": 3423.0,
-            "96": 3259.0,
-            "97": 2976.0,
-            "98": 3683.0,
-            "99": 3173.0,
-            "100": 3143.0
+            "1": 1675.0,
+            "2": 1744.0,
+            "3": 1725.0,
+            "4": 1850.0,
+            "5": 1942.0,
+            "6": 1919.0,
+            "7": 1794.0,
+            "8": 1612.0,
+            "9": 1826.0,
+            "10": 1481.0,
+            "11": 1852.0,
+            "12": 1654.0,
+            "13": 1809.0,
+            "14": 1847.0,
+            "15": 1914.0,
+            "16": 1874.0,
+            "17": 1882.0,
+            "18": 1639.0,
+            "19": 1787.0,
+            "20": 1701.0,
+            "21": 1842.0,
+            "22": 1573.0,
+            "23": 2018.0,
+            "24": 1509.0,
+            "25": 1540.0,
+            "26": 1694.0,
+            "27": 1769.0,
+            "28": 1966.0,
+            "29": 2057.0,
+            "30": 1820.0,
+            "31": 1566.0,
+            "32": 1898.0,
+            "33": 2074.0,
+            "34": 1865.0,
+            "35": 1908.0,
+            "36": 1925.0,
+            "37": 2274.0,
+            "38": 2094.0,
+            "39": 2312.0,
+            "40": 2053.0,
+            "41": 2209.0,
+            "42": 2303.0,
+            "43": 2019.0,
+            "44": 2102.0,
+            "45": 2222.0,
+            "46": 2393.0,
+            "47": 2409.0,
+            "48": 2336.0,
+            "49": 2342.0,
+            "50": 2395.0,
+            "51": 2653.0,
+            "52": 2603.0,
+            "53": 2986.0,
+            "54": 2776.0,
+            "55": 2370.0,
+            "56": 2805.0,
+            "57": 2448.0,
+            "58": 2867.0,
+            "59": 2702.0,
+            "60": 2437.0,
+            "61": 2841.0,
+            "62": 2562.0,
+            "63": 2493.0,
+            "64": 2971.0,
+            "65": 2559.0,
+            "66": 3069.0,
+            "67": 2927.0,
+            "68": 2738.0,
+            "69": 2846.0,
+            "70": 3041.0,
+            "71": 3061.0,
+            "72": 2389.0,
+            "73": 3015.0,
+            "74": 1837.0,
+            "75": 2460.0,
+            "76": 3001.0,
+            "77": 3192.0,
+            "78": 3080.0,
+            "79": 3147.0,
+            "80": 3379.0,
+            "81": 3688.0,
+            "82": 3186.0,
+            "83": 2693.0,
+            "84": 3246.0,
+            "85": 3306.0,
+            "86": 2812.0,
+            "87": 3720.0,
+            "88": 2956.0,
+            "89": 3306.0,
+            "90": 3020.0,
+            "91": 2788.0,
+            "92": 3021.0,
+            "93": 2685.0,
+            "94": 3409.0,
+            "95": 3254.0,
+            "96": 3349.0,
+            "97": 2981.0,
+            "98": 3551.0,
+            "99": 3273.0,
+            "100": 3175.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 516194816.0,
-            "2": 516194816.0,
-            "3": 516194816.0,
-            "4": 516194816.0,
-            "5": 516194816.0,
-            "6": 516194816.0,
-            "7": 516194816.0,
-            "8": 516194816.0,
-            "9": 516194816.0,
-            "10": 516194816.0,
-            "11": 516194816.0,
-            "12": 516194816.0,
-            "13": 516194816.0,
-            "14": 516194816.0,
-            "15": 516194816.0,
-            "16": 516194816.0,
-            "17": 516194816.0,
-            "18": 516194816.0,
-            "19": 516194816.0,
-            "20": 516194816.0,
-            "21": 516194816.0,
-            "22": 516194816.0,
-            "23": 516194816.0,
-            "24": 516194816.0,
-            "25": 516194816.0,
-            "26": 516194816.0,
-            "27": 516194816.0,
-            "28": 516194816.0,
-            "29": 516194816.0,
-            "30": 516194816.0,
-            "31": 516194816.0,
-            "32": 516194816.0,
-            "33": 516194816.0,
-            "34": 516194816.0,
-            "35": 516194816.0,
-            "36": 516194816.0,
-            "37": 516194816.0,
-            "38": 516194816.0,
-            "39": 516194816.0,
-            "40": 516194816.0,
-            "41": 516194816.0,
-            "42": 516194816.0,
-            "43": 516194816.0,
-            "44": 516194816.0,
-            "45": 516194816.0,
-            "46": 516194816.0,
-            "47": 516194816.0,
-            "48": 516194816.0,
-            "49": 516194816.0,
-            "50": 516194816.0,
-            "51": 516194816.0,
-            "52": 516194816.0,
-            "53": 516194816.0,
-            "54": 516194816.0,
-            "55": 516194816.0,
-            "56": 516194816.0,
-            "57": 516194816.0,
-            "58": 516194816.0,
-            "59": 516194816.0,
-            "60": 516194816.0,
-            "61": 516194816.0,
-            "62": 516194816.0,
-            "63": 516194816.0,
-            "64": 516194816.0,
-            "65": 516194816.0,
-            "66": 516194816.0,
-            "67": 516194816.0,
-            "68": 516194816.0,
-            "69": 516194816.0,
-            "70": 516194816.0,
-            "71": 516194816.0,
-            "72": 516194816.0,
-            "73": 516194816.0,
-            "74": 516194816.0,
-            "75": 516194816.0,
-            "76": 516194816.0,
-            "77": 516194816.0,
-            "78": 516194816.0,
-            "79": 516194816.0,
-            "80": 516194816.0,
-            "81": 516194816.0,
-            "82": 516194816.0,
-            "83": 516194816.0,
-            "84": 516194816.0,
-            "85": 516194816.0,
-            "86": 516194816.0,
-            "87": 516194816.0,
-            "88": 516194816.0,
-            "89": 516194816.0,
-            "90": 516194816.0,
-            "91": 516194816.0,
-            "92": 516194816.0,
-            "93": 516194816.0,
-            "94": 516194816.0,
-            "95": 516194816.0,
-            "96": 516194816.0,
-            "97": 516194816.0,
-            "98": 516194816.0,
-            "99": 516194816.0,
-            "100": 516194816.0
+            "1": 514359808.0,
+            "2": 514359808.0,
+            "3": 514359808.0,
+            "4": 514359808.0,
+            "5": 514359808.0,
+            "6": 514359808.0,
+            "7": 514359808.0,
+            "8": 514359808.0,
+            "9": 514359808.0,
+            "10": 514359808.0,
+            "11": 514359808.0,
+            "12": 514359808.0,
+            "13": 514359808.0,
+            "14": 514359808.0,
+            "15": 514359808.0,
+            "16": 514359808.0,
+            "17": 514359808.0,
+            "18": 514359808.0,
+            "19": 514359808.0,
+            "20": 514359808.0,
+            "21": 514359808.0,
+            "22": 514359808.0,
+            "23": 514359808.0,
+            "24": 514359808.0,
+            "25": 514359808.0,
+            "26": 514359808.0,
+            "27": 514359808.0,
+            "28": 514359808.0,
+            "29": 514359808.0,
+            "30": 514359808.0,
+            "31": 514359808.0,
+            "32": 514359808.0,
+            "33": 514359808.0,
+            "34": 514359808.0,
+            "35": 514359808.0,
+            "36": 514359808.0,
+            "37": 514359808.0,
+            "38": 514359808.0,
+            "39": 514359808.0,
+            "40": 514359808.0,
+            "41": 514359808.0,
+            "42": 514359808.0,
+            "43": 514359808.0,
+            "44": 514359808.0,
+            "45": 514359808.0,
+            "46": 514359808.0,
+            "47": 514359808.0,
+            "48": 514359808.0,
+            "49": 514359808.0,
+            "50": 514359808.0,
+            "51": 514359808.0,
+            "52": 514359808.0,
+            "53": 514359808.0,
+            "54": 514359808.0,
+            "55": 514359808.0,
+            "56": 514359808.0,
+            "57": 514359808.0,
+            "58": 514359808.0,
+            "59": 514359808.0,
+            "60": 514359808.0,
+            "61": 514359808.0,
+            "62": 514359808.0,
+            "63": 514359808.0,
+            "64": 514359808.0,
+            "65": 514359808.0,
+            "66": 514359808.0,
+            "67": 514359808.0,
+            "68": 514359808.0,
+            "69": 514359808.0,
+            "70": 514359808.0,
+            "71": 514359808.0,
+            "72": 514359808.0,
+            "73": 514359808.0,
+            "74": 514359808.0,
+            "75": 514359808.0,
+            "76": 514359808.0,
+            "77": 514359808.0,
+            "78": 514359808.0,
+            "79": 514359808.0,
+            "80": 514359808.0,
+            "81": 514359808.0,
+            "82": 514359808.0,
+            "83": 514359808.0,
+            "84": 514359808.0,
+            "85": 514359808.0,
+            "86": 514359808.0,
+            "87": 514359808.0,
+            "88": 514359808.0,
+            "89": 514359808.0,
+            "90": 514359808.0,
+            "91": 514359808.0,
+            "92": 514359808.0,
+            "93": 514359808.0,
+            "94": 514359808.0,
+            "95": 514359808.0,
+            "96": 514359808.0,
+            "97": 514359808.0,
+            "98": 514359808.0,
+            "99": 514359808.0,
+            "100": 514359808.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1670130688.0,
-            "2": 1840523776.0,
-            "3": 1840523776.0,
-            "4": 1840523776.0,
-            "5": 1840523776.0,
-            "6": 1840523776.0,
-            "7": 1840523776.0,
-            "8": 1840523776.0,
-            "9": 1840523776.0,
-            "10": 1840523776.0,
-            "11": 1840523776.0,
-            "12": 1840523776.0,
-            "13": 1840523776.0,
-            "14": 1840523776.0,
-            "15": 1840523776.0,
-            "16": 1840523776.0,
-            "17": 1840523776.0,
-            "18": 1840523776.0,
-            "19": 1840523776.0,
-            "20": 1840523776.0,
-            "21": 1840523776.0,
-            "22": 1840523776.0,
-            "23": 1840523776.0,
-            "24": 1840523776.0,
-            "25": 1840523776.0,
-            "26": 1840523776.0,
-            "27": 1840523776.0,
-            "28": 1840523776.0,
-            "29": 1840523776.0,
-            "30": 1840523776.0,
-            "31": 1840523776.0,
-            "32": 1840523776.0,
-            "33": 1840523776.0,
-            "34": 1840523776.0,
-            "35": 1840523776.0,
-            "36": 1840523776.0,
-            "37": 1840523776.0,
-            "38": 1840523776.0,
-            "39": 1840523776.0,
-            "40": 1840523776.0,
-            "41": 1840523776.0,
-            "42": 1840523776.0,
-            "43": 1840523776.0,
-            "44": 1840523776.0,
-            "45": 1840523776.0,
-            "46": 1840523776.0,
-            "47": 1840523776.0,
-            "48": 1840523776.0,
-            "49": 1840523776.0,
-            "50": 1840523776.0,
-            "51": 1840523776.0,
-            "52": 1840523776.0,
-            "53": 1840523776.0,
-            "54": 1840523776.0,
-            "55": 1840523776.0,
-            "56": 1840523776.0,
-            "57": 1840523776.0,
-            "58": 1840523776.0,
-            "59": 1840523776.0,
-            "60": 1840523776.0,
-            "61": 1840523776.0,
-            "62": 1840523776.0,
-            "63": 1840523776.0,
-            "64": 1840523776.0,
-            "65": 1840523776.0,
-            "66": 1840523776.0,
-            "67": 1840523776.0,
-            "68": 1840523776.0,
-            "69": 1840523776.0,
-            "70": 1840523776.0,
-            "71": 1840523776.0,
-            "72": 1840523776.0,
-            "73": 1840523776.0,
-            "74": 1840523776.0,
-            "75": 1840523776.0,
-            "76": 1840523776.0,
-            "77": 1840523776.0,
-            "78": 1840523776.0,
-            "79": 1840523776.0,
-            "80": 1840523776.0,
-            "81": 1840523776.0,
-            "82": 1840523776.0,
-            "83": 1841310208.0,
-            "84": 1841310208.0,
-            "85": 1841310208.0,
-            "86": 1841310208.0,
-            "87": 1841310208.0,
-            "88": 1841310208.0,
-            "89": 1841310208.0,
-            "90": 1841310208.0,
-            "91": 1841310208.0,
-            "92": 1841310208.0,
-            "93": 1841310208.0,
-            "94": 1841310208.0,
-            "95": 1841310208.0,
-            "96": 1841310208.0,
-            "97": 1841310208.0,
-            "98": 1841310208.0,
-            "99": 1841310208.0,
-            "100": 1841310208.0
+            "1": 1670148096.0,
+            "2": 1837640192.0,
+            "3": 1837640192.0,
+            "4": 1837640192.0,
+            "5": 1837640192.0,
+            "6": 1837640192.0,
+            "7": 1837640192.0,
+            "8": 1837640192.0,
+            "9": 1837640192.0,
+            "10": 1837640192.0,
+            "11": 1837640192.0,
+            "12": 1837640192.0,
+            "13": 1837640192.0,
+            "14": 1837640192.0,
+            "15": 1837640192.0,
+            "16": 1837640192.0,
+            "17": 1837640192.0,
+            "18": 1837640192.0,
+            "19": 1837640192.0,
+            "20": 1837640192.0,
+            "21": 1837640192.0,
+            "22": 1837640192.0,
+            "23": 1837640192.0,
+            "24": 1837640192.0,
+            "25": 1837640192.0,
+            "26": 1837640192.0,
+            "27": 1837640192.0,
+            "28": 1837640192.0,
+            "29": 1837640192.0,
+            "30": 1837640192.0,
+            "31": 1837640192.0,
+            "32": 1837640192.0,
+            "33": 1837640192.0,
+            "34": 1837640192.0,
+            "35": 1837640192.0,
+            "36": 1837640192.0,
+            "37": 1837640192.0,
+            "38": 1837640192.0,
+            "39": 1837640192.0,
+            "40": 1837640192.0,
+            "41": 1837640192.0,
+            "42": 1837640192.0,
+            "43": 1837640192.0,
+            "44": 1837640192.0,
+            "45": 1837640192.0,
+            "46": 1837640192.0,
+            "47": 1837640192.0,
+            "48": 1837640192.0,
+            "49": 1837640192.0,
+            "50": 1837640192.0,
+            "51": 1837640192.0,
+            "52": 1837640192.0,
+            "53": 1837640192.0,
+            "54": 1837640192.0,
+            "55": 1837640192.0,
+            "56": 1837640192.0,
+            "57": 1837640192.0,
+            "58": 1837640192.0,
+            "59": 1837640192.0,
+            "60": 1837640192.0,
+            "61": 1837640192.0,
+            "62": 1837640192.0,
+            "63": 1837640192.0,
+            "64": 1837640192.0,
+            "65": 1837640192.0,
+            "66": 1837640192.0,
+            "67": 1837640192.0,
+            "68": 1837640192.0,
+            "69": 1837640192.0,
+            "70": 1837640192.0,
+            "71": 1837640192.0,
+            "72": 1837640192.0,
+            "73": 1837640192.0,
+            "74": 1837640192.0,
+            "75": 1837640192.0,
+            "76": 1837640192.0,
+            "77": 1837640192.0,
+            "78": 1837640192.0,
+            "79": 1837640192.0,
+            "80": 1837640192.0,
+            "81": 1837640192.0,
+            "82": 1837640192.0,
+            "83": 1837640192.0,
+            "84": 1837640192.0,
+            "85": 1837640192.0,
+            "86": 1837640192.0,
+            "87": 1837640192.0,
+            "88": 1837640192.0,
+            "89": 1837640192.0,
+            "90": 1837640192.0,
+            "91": 1837640192.0,
+            "92": 1837640192.0,
+            "93": 1837640192.0,
+            "94": 1837640192.0,
+            "95": 1837640192.0,
+            "96": 1837640192.0,
+            "97": 1837640192.0,
+            "98": 1837640192.0,
+            "99": 1837640192.0,
+            "100": 1837640192.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 15.65402,
-            "2": 0.15533,
-            "3": 0.13713,
-            "4": 0.14193,
-            "5": 0.13861,
-            "6": 0.13948,
-            "7": 0.13637,
-            "8": 0.13619,
-            "9": 0.14162,
-            "10": 0.13725,
-            "11": 0.13988,
-            "12": 0.14179,
-            "13": 0.14346,
-            "14": 0.14488,
-            "15": 0.1468,
-            "16": 0.14288,
-            "17": 0.13708,
-            "18": 0.13765,
-            "19": 0.13957,
-            "20": 0.13778,
-            "21": 0.13931,
-            "22": 0.13758,
-            "23": 0.13751,
-            "24": 0.14023,
-            "25": 0.14508,
-            "26": 0.15744,
-            "27": 0.15391,
-            "28": 0.15519,
-            "29": 0.14118,
-            "30": 0.1391,
-            "31": 0.13604,
-            "32": 0.1366,
-            "33": 0.13813,
-            "34": 0.13786,
-            "35": 0.13728,
-            "36": 0.13981,
-            "37": 0.14024,
-            "38": 0.13688,
-            "39": 0.13391,
-            "40": 0.13738,
-            "41": 0.14059,
-            "42": 0.13512,
-            "43": 0.13775,
-            "44": 0.13641,
-            "45": 0.13686,
-            "46": 0.14053,
-            "47": 0.13951,
-            "48": 0.14166,
-            "49": 0.13555,
-            "50": 0.13577,
-            "51": 0.14328,
-            "52": 0.14201,
-            "53": 0.13861,
-            "54": 0.13965,
-            "55": 0.13807,
-            "56": 0.14044,
-            "57": 0.14358,
-            "58": 0.14042,
-            "59": 0.13858,
-            "60": 0.13959,
-            "61": 0.13788,
-            "62": 0.14032,
-            "63": 0.13843,
-            "64": 0.13942,
-            "65": 0.13742,
-            "66": 0.13948,
-            "67": 0.14263,
-            "68": 0.13848,
-            "69": 0.13944,
-            "70": 0.13874,
-            "71": 0.14302,
-            "72": 0.13748,
-            "73": 0.13837,
-            "74": 0.13911,
-            "75": 0.13965,
-            "76": 0.1466,
-            "77": 0.14259,
-            "78": 0.13635,
-            "79": 0.14025,
-            "80": 0.14725,
-            "81": 0.14592,
-            "82": 0.14832,
-            "83": 0.14727,
-            "84": 0.14437,
-            "85": 0.13721,
-            "86": 0.14235,
-            "87": 0.13812,
-            "88": 0.13937,
-            "89": 0.1389,
-            "90": 0.13661,
-            "91": 0.1432,
-            "92": 0.1389,
-            "93": 0.13881,
-            "94": 0.13803,
-            "95": 0.13815,
-            "96": 0.14203,
-            "97": 0.13816,
-            "98": 0.13963,
-            "99": 0.14236,
-            "100": 0.14371
+            "1": 9.53425,
+            "2": 0.1525,
+            "3": 0.1318,
+            "4": 0.11378,
+            "5": 0.11192,
+            "6": 0.11218,
+            "7": 0.11154,
+            "8": 0.11173,
+            "9": 0.11229,
+            "10": 0.11154,
+            "11": 0.11167,
+            "12": 0.11151,
+            "13": 0.11086,
+            "14": 0.11183,
+            "15": 0.1112,
+            "16": 0.11119,
+            "17": 0.11049,
+            "18": 0.11127,
+            "19": 0.11165,
+            "20": 0.11158,
+            "21": 0.11135,
+            "22": 0.1116,
+            "23": 0.11105,
+            "24": 0.11218,
+            "25": 0.11189,
+            "26": 0.11148,
+            "27": 0.11258,
+            "28": 0.11129,
+            "29": 0.11127,
+            "30": 0.11264,
+            "31": 0.11113,
+            "32": 0.11139,
+            "33": 0.11019,
+            "34": 0.11118,
+            "35": 0.11227,
+            "36": 0.11007,
+            "37": 0.11047,
+            "38": 0.1112,
+            "39": 0.11057,
+            "40": 0.1122,
+            "41": 0.11135,
+            "42": 0.11041,
+            "43": 0.1105,
+            "44": 0.11017,
+            "45": 0.11127,
+            "46": 0.11089,
+            "47": 0.11064,
+            "48": 0.11167,
+            "49": 0.11021,
+            "50": 0.111,
+            "51": 0.13065,
+            "52": 0.12181,
+            "53": 0.11254,
+            "54": 0.11131,
+            "55": 0.11274,
+            "56": 0.11203,
+            "57": 0.11122,
+            "58": 0.11071,
+            "59": 0.1147,
+            "60": 0.11126,
+            "61": 0.11099,
+            "62": 0.11099,
+            "63": 0.11124,
+            "64": 0.11385,
+            "65": 0.11135,
+            "66": 0.11119,
+            "67": 0.11002,
+            "68": 0.11148,
+            "69": 0.11088,
+            "70": 0.1124,
+            "71": 0.11625,
+            "72": 0.11347,
+            "73": 0.11265,
+            "74": 0.11196,
+            "75": 0.11175,
+            "76": 0.11084,
+            "77": 0.10995,
+            "78": 0.11184,
+            "79": 0.10992,
+            "80": 0.11019,
+            "81": 0.1106,
+            "82": 0.11145,
+            "83": 0.11121,
+            "84": 0.11016,
+            "85": 0.11204,
+            "86": 0.11064,
+            "87": 0.11178,
+            "88": 0.11053,
+            "89": 0.11128,
+            "90": 0.11129,
+            "91": 0.11264,
+            "92": 0.1113,
+            "93": 0.1105,
+            "94": 0.11459,
+            "95": 0.11356,
+            "96": 0.10985,
+            "97": 0.1104,
+            "98": 0.11182,
+            "99": 0.11024,
+            "100": 0.11054
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..97a4288db23
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85585,
+            "52": 9.75241,
+            "53": 10.07586,
+            "54": 9.95689,
+            "55": 9.88209,
+            "56": 9.63139,
+            "57": 9.48651,
+            "58": 9.83118,
+            "59": 9.58907,
+            "60": 9.5065,
+            "61": 9.7037,
+            "62": 9.98291,
+            "63": 9.38318,
+            "64": 9.77909,
+            "65": 8.95183,
+            "66": 9.70161,
+            "67": 9.37209,
+            "68": 9.78854,
+            "69": 9.79856,
+            "70": 9.74746,
+            "71": 9.61908,
+            "72": 9.58507,
+            "73": 9.49728,
+            "74": 8.9393,
+            "75": 9.42707,
+            "76": 9.08024,
+            "77": 10.06567,
+            "78": 9.72898,
+            "79": 9.37773,
+            "80": 9.41002,
+            "81": 9.47979,
+            "82": 9.70181,
+            "83": 9.30624,
+            "84": 9.42099,
+            "85": 9.6138,
+            "86": 9.07653,
+            "87": 9.59455,
+            "88": 9.75073,
+            "89": 9.60246,
+            "90": 9.81898,
+            "91": 9.33898,
+            "92": 9.35717,
+            "93": 9.07886,
+            "94": 8.8351,
+            "95": 9.52175,
+            "96": 9.5301,
+            "97": 9.3131,
+            "98": 9.67785,
+            "99": 8.89062,
+            "100": 9.39726
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2553.0,
+            "52": 2558.0,
+            "53": 2867.0,
+            "54": 2887.0,
+            "55": 2364.0,
+            "56": 2737.0,
+            "57": 2446.0,
+            "58": 2933.0,
+            "59": 2696.0,
+            "60": 2423.0,
+            "61": 3055.0,
+            "62": 2568.0,
+            "63": 2454.0,
+            "64": 2951.0,
+            "65": 2655.0,
+            "66": 3084.0,
+            "67": 2895.0,
+            "68": 2774.0,
+            "69": 2948.0,
+            "70": 3026.0,
+            "71": 2920.0,
+            "72": 2346.0,
+            "73": 2943.0,
+            "74": 1862.0,
+            "75": 2492.0,
+            "76": 3006.0,
+            "77": 3124.0,
+            "78": 3129.0,
+            "79": 3132.0,
+            "80": 3296.0,
+            "81": 3746.0,
+            "82": 3327.0,
+            "83": 2719.0,
+            "84": 3230.0,
+            "85": 3271.0,
+            "86": 2743.0,
+            "87": 3821.0,
+            "88": 2989.0,
+            "89": 3310.0,
+            "90": 3031.0,
+            "91": 2802.0,
+            "92": 3065.0,
+            "93": 2744.0,
+            "94": 3417.0,
+            "95": 3408.0,
+            "96": 3345.0,
+            "97": 3086.0,
+            "98": 3708.0,
+            "99": 3174.0,
+            "100": 3141.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 696530432.0,
+            "52": 696530432.0,
+            "53": 696530432.0,
+            "54": 696530432.0,
+            "55": 696530432.0,
+            "56": 696530432.0,
+            "57": 696530432.0,
+            "58": 696530432.0,
+            "59": 696530432.0,
+            "60": 696530432.0,
+            "61": 696530432.0,
+            "62": 696530432.0,
+            "63": 696530432.0,
+            "64": 696530432.0,
+            "65": 696530432.0,
+            "66": 696530432.0,
+            "67": 696530432.0,
+            "68": 696530432.0,
+            "69": 696530432.0,
+            "70": 696530432.0,
+            "71": 696530432.0,
+            "72": 696530432.0,
+            "73": 696530432.0,
+            "74": 696530432.0,
+            "75": 696530432.0,
+            "76": 696530432.0,
+            "77": 696530432.0,
+            "78": 696530432.0,
+            "79": 696530432.0,
+            "80": 696530432.0,
+            "81": 696530432.0,
+            "82": 696530432.0,
+            "83": 696530432.0,
+            "84": 696530432.0,
+            "85": 696530432.0,
+            "86": 696530432.0,
+            "87": 696530432.0,
+            "88": 696530432.0,
+            "89": 696530432.0,
+            "90": 696530432.0,
+            "91": 696530432.0,
+            "92": 696530432.0,
+            "93": 696530432.0,
+            "94": 696530432.0,
+            "95": 696530432.0,
+            "96": 696530432.0,
+            "97": 696530432.0,
+            "98": 696530432.0,
+            "99": 696530432.0,
+            "100": 696530432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2031341568.0,
+            "52": 2031341568.0,
+            "53": 2031341568.0,
+            "54": 2031341568.0,
+            "55": 2031341568.0,
+            "56": 2031341568.0,
+            "57": 2031341568.0,
+            "58": 2031341568.0,
+            "59": 2031341568.0,
+            "60": 2031341568.0,
+            "61": 2031341568.0,
+            "62": 2031341568.0,
+            "63": 2031341568.0,
+            "64": 2031341568.0,
+            "65": 2031341568.0,
+            "66": 2031341568.0,
+            "67": 2031341568.0,
+            "68": 2031341568.0,
+            "69": 2031341568.0,
+            "70": 2031341568.0,
+            "71": 2031341568.0,
+            "72": 2031341568.0,
+            "73": 2031341568.0,
+            "74": 2031341568.0,
+            "75": 2031341568.0,
+            "76": 2031341568.0,
+            "77": 2031341568.0,
+            "78": 2031341568.0,
+            "79": 2031341568.0,
+            "80": 2031341568.0,
+            "81": 2031341568.0,
+            "82": 2031341568.0,
+            "83": 2031341568.0,
+            "84": 2031341568.0,
+            "85": 2031341568.0,
+            "86": 2031341568.0,
+            "87": 2031341568.0,
+            "88": 2031341568.0,
+            "89": 2031341568.0,
+            "90": 2031341568.0,
+            "91": 2031341568.0,
+            "92": 2031341568.0,
+            "93": 2031341568.0,
+            "94": 2031341568.0,
+            "95": 2031341568.0,
+            "96": 2031341568.0,
+            "97": 2031341568.0,
+            "98": 2031341568.0,
+            "99": 2031341568.0,
+            "100": 2031341568.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.51716,
+            "52": 0.17953,
+            "53": 0.13809,
+            "54": 0.13557,
+            "55": 0.13446,
+            "56": 0.13644,
+            "57": 0.13533,
+            "58": 0.12827,
+            "59": 0.12403,
+            "60": 0.12008,
+            "61": 0.11711,
+            "62": 0.11537,
+            "63": 0.11423,
+            "64": 0.11329,
+            "65": 0.11414,
+            "66": 0.11444,
+            "67": 0.11357,
+            "68": 0.11307,
+            "69": 0.11383,
+            "70": 0.11317,
+            "71": 0.11391,
+            "72": 0.11323,
+            "73": 0.11305,
+            "74": 0.11159,
+            "75": 0.11212,
+            "76": 0.11331,
+            "77": 0.11201,
+            "78": 0.11136,
+            "79": 0.11362,
+            "80": 0.11395,
+            "81": 0.11649,
+            "82": 0.11432,
+            "83": 0.11438,
+            "84": 0.11332,
+            "85": 0.11369,
+            "86": 0.11489,
+            "87": 0.11276,
+            "88": 0.1132,
+            "89": 0.11853,
+            "90": 0.11588,
+            "91": 0.11412,
+            "92": 0.11248,
+            "93": 0.11752,
+            "94": 0.11825,
+            "95": 0.11624,
+            "96": 0.11545,
+            "97": 0.11325,
+            "98": 0.11377,
+            "99": 0.11384,
+            "100": 0.11275
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
index 3c656cc949e..ccdfa9ac12e 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json
@@ -2,141 +2,536 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.92655,
-            "5": 10.92717,
-            "10": 10.90792,
-            "15": 10.88294,
-            "20": 10.77597,
-            "25": 10.59265,
-            "30": 10.39175,
-            "35": 10.29702,
-            "40": 10.09661,
-            "45": 9.84468,
-            "50": 9.90943,
-            "55": 9.87772,
-            "60": 9.49123,
-            "65": 8.94254,
-            "70": 9.72275,
-            "75": 9.41892,
-            "80": 9.4006,
-            "85": 9.61185,
-            "90": 9.81025,
-            "95": 9.51723,
-            "100": 9.40135
+            "1": 10.92228,
+            "2": 10.92833,
+            "3": 10.9171,
+            "4": 10.90497,
+            "5": 10.92805,
+            "6": 10.9367,
+            "7": 10.90405,
+            "8": 10.92231,
+            "9": 10.91258,
+            "10": 10.90849,
+            "11": 10.89333,
+            "12": 10.92084,
+            "13": 10.91496,
+            "14": 10.92147,
+            "15": 10.88434,
+            "16": 10.87455,
+            "17": 10.83916,
+            "18": 10.87305,
+            "19": 10.85329,
+            "20": 10.77493,
+            "21": 10.74754,
+            "22": 10.63151,
+            "23": 10.75621,
+            "24": 10.65566,
+            "25": 10.59217,
+            "26": 10.6533,
+            "27": 10.64878,
+            "28": 10.59653,
+            "29": 10.61011,
+            "30": 10.39283,
+            "31": 10.15724,
+            "32": 10.49222,
+            "33": 10.47943,
+            "34": 10.24015,
+            "35": 10.2971,
+            "36": 10.2456,
+            "37": 10.35281,
+            "38": 10.20531,
+            "39": 10.4042,
+            "40": 10.0955,
+            "41": 10.15277,
+            "42": 10.21885,
+            "43": 9.85522,
+            "44": 9.96244,
+            "45": 9.84618,
+            "46": 9.83799,
+            "47": 10.13882,
+            "48": 9.85698,
+            "49": 9.53751,
+            "50": 9.90881,
+            "51": 9.84975,
+            "52": 9.74161,
+            "53": 10.06325,
+            "54": 9.94588,
+            "55": 9.87743,
+            "56": 9.62751,
+            "57": 9.47268,
+            "58": 9.82914,
+            "59": 9.58307,
+            "60": 9.49183,
+            "61": 9.6996,
+            "62": 9.98093,
+            "63": 9.37223,
+            "64": 9.77562,
+            "65": 8.9434,
+            "66": 9.69995,
+            "67": 9.36423,
+            "68": 9.78704,
+            "69": 9.78393,
+            "70": 9.72294,
+            "71": 9.6074,
+            "72": 9.5842,
+            "73": 9.49096,
+            "74": 8.94874,
+            "75": 9.41816,
+            "76": 9.08732,
+            "77": 10.06288,
+            "78": 9.72904,
+            "79": 9.37094,
+            "80": 9.40034,
+            "81": 9.47762,
+            "82": 9.69127,
+            "83": 9.30769,
+            "84": 9.4126,
+            "85": 9.61136,
+            "86": 9.07624,
+            "87": 9.59463,
+            "88": 9.74771,
+            "89": 9.60681,
+            "90": 9.81083,
+            "91": 9.34451,
+            "92": 9.3654,
+            "93": 9.07749,
+            "94": 8.82979,
+            "95": 9.51679,
+            "96": 9.5255,
+            "97": 9.31042,
+            "98": 9.67816,
+            "99": 8.8885,
+            "100": 9.40133
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1676.0,
-            "5": 1938.0,
-            "10": 1402.0,
-            "15": 1768.0,
-            "20": 1651.0,
-            "25": 1671.0,
-            "30": 1908.0,
-            "35": 1915.0,
+            "1": 1712.0,
+            "2": 1881.0,
+            "3": 1751.0,
+            "4": 1774.0,
+            "5": 1918.0,
+            "6": 1854.0,
+            "7": 1911.0,
+            "8": 1805.0,
+            "9": 2004.0,
+            "10": 1454.0,
+            "11": 1894.0,
+            "12": 1849.0,
+            "13": 1979.0,
+            "14": 1898.0,
+            "15": 1911.0,
+            "16": 1867.0,
+            "17": 1857.0,
+            "18": 1662.0,
+            "19": 1835.0,
+            "20": 1699.0,
+            "21": 1824.0,
+            "22": 1714.0,
+            "23": 1963.0,
+            "24": 1705.0,
+            "25": 1632.0,
+            "26": 1807.0,
+            "27": 1895.0,
+            "28": 2017.0,
+            "29": 2082.0,
+            "30": 1933.0,
+            "31": 1618.0,
+            "32": 1952.0,
+            "33": 2137.0,
+            "34": 1944.0,
+            "35": 2051.0,
+            "36": 1989.0,
+            "37": 2452.0,
+            "38": 2233.0,
+            "39": 2486.0,
             "40": 2163.0,
-            "45": 2125.0,
-            "50": 2496.0,
-            "55": 2392.0,
-            "60": 2334.0,
-            "65": 2771.0,
-            "70": 3234.0,
-            "75": 2675.0,
-            "80": 3564.0,
-            "85": 3284.0,
-            "90": 3079.0,
-            "95": 3405.0,
-            "100": 3430.0
+            "41": 2380.0,
+            "42": 2299.0,
+            "43": 1970.0,
+            "44": 2110.0,
+            "45": 2033.0,
+            "46": 2365.0,
+            "47": 2636.0,
+            "48": 2462.0,
+            "49": 2351.0,
+            "50": 2526.0,
+            "51": 2604.0,
+            "52": 2554.0,
+            "53": 3020.0,
+            "54": 2645.0,
+            "55": 2449.0,
+            "56": 2729.0,
+            "57": 2438.0,
+            "58": 3141.0,
+            "59": 2784.0,
+            "60": 2501.0,
+            "61": 2876.0,
+            "62": 2611.0,
+            "63": 2367.0,
+            "64": 3084.0,
+            "65": 2831.0,
+            "66": 3358.0,
+            "67": 2825.0,
+            "68": 2816.0,
+            "69": 3037.0,
+            "70": 3265.0,
+            "71": 3105.0,
+            "72": 2546.0,
+            "73": 3030.0,
+            "74": 1951.0,
+            "75": 2615.0,
+            "76": 2976.0,
+            "77": 3452.0,
+            "78": 3285.0,
+            "79": 3243.0,
+            "80": 3483.0,
+            "81": 3696.0,
+            "82": 3350.0,
+            "83": 2802.0,
+            "84": 3346.0,
+            "85": 3210.0,
+            "86": 2868.0,
+            "87": 3804.0,
+            "88": 3014.0,
+            "89": 3346.0,
+            "90": 3037.0,
+            "91": 2796.0,
+            "92": 3267.0,
+            "93": 2761.0,
+            "94": 3459.0,
+            "95": 3435.0,
+            "96": 3605.0,
+            "97": 3075.0,
+            "98": 3765.0,
+            "99": 3082.0,
+            "100": 3412.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 435847168.0,
-            "5": 435847168.0,
-            "10": 435847168.0,
-            "15": 435847168.0,
-            "20": 435847168.0,
-            "25": 435847168.0,
-            "30": 435847168.0,
-            "35": 435847168.0,
-            "40": 435847168.0,
-            "45": 435847168.0,
-            "50": 435847168.0,
-            "55": 435847168.0,
-            "60": 435847168.0,
-            "65": 436895744.0,
-            "70": 435847168.0,
-            "75": 435847168.0,
-            "80": 435847168.0,
-            "85": 435847168.0,
-            "90": 435847168.0,
-            "95": 435847168.0,
-            "100": 435847168.0
+            "1": 436765184.0,
+            "2": 436765184.0,
+            "3": 436765184.0,
+            "4": 436765184.0,
+            "5": 436765184.0,
+            "6": 436765184.0,
+            "7": 436765184.0,
+            "8": 436765184.0,
+            "9": 436765184.0,
+            "10": 436765184.0,
+            "11": 436765184.0,
+            "12": 436765184.0,
+            "13": 436765184.0,
+            "14": 436765184.0,
+            "15": 436765184.0,
+            "16": 436765184.0,
+            "17": 436765184.0,
+            "18": 436765184.0,
+            "19": 436765184.0,
+            "20": 436765184.0,
+            "21": 436765184.0,
+            "22": 436765184.0,
+            "23": 436765184.0,
+            "24": 436765184.0,
+            "25": 436765184.0,
+            "26": 436765184.0,
+            "27": 436765184.0,
+            "28": 436765184.0,
+            "29": 436765184.0,
+            "30": 436765184.0,
+            "31": 436765184.0,
+            "32": 436765184.0,
+            "33": 436765184.0,
+            "34": 436765184.0,
+            "35": 436765184.0,
+            "36": 436765184.0,
+            "37": 436765184.0,
+            "38": 436765184.0,
+            "39": 436765184.0,
+            "40": 436765184.0,
+            "41": 436765184.0,
+            "42": 436765184.0,
+            "43": 436765184.0,
+            "44": 436765184.0,
+            "45": 436765184.0,
+            "46": 436765184.0,
+            "47": 436765184.0,
+            "48": 436765184.0,
+            "49": 436765184.0,
+            "50": 436765184.0,
+            "51": 436765184.0,
+            "52": 436765184.0,
+            "53": 436765184.0,
+            "54": 436765184.0,
+            "55": 436765184.0,
+            "56": 436765184.0,
+            "57": 436765184.0,
+            "58": 436765184.0,
+            "59": 436765184.0,
+            "60": 436765184.0,
+            "61": 436765184.0,
+            "62": 436765184.0,
+            "63": 436765184.0,
+            "64": 436765184.0,
+            "65": 436765184.0,
+            "66": 436765184.0,
+            "67": 436765184.0,
+            "68": 436765184.0,
+            "69": 436765184.0,
+            "70": 436765184.0,
+            "71": 436765184.0,
+            "72": 436765184.0,
+            "73": 436765184.0,
+            "74": 436765184.0,
+            "75": 436765184.0,
+            "76": 436765184.0,
+            "77": 436765184.0,
+            "78": 436765184.0,
+            "79": 436765184.0,
+            "80": 436765184.0,
+            "81": 436765184.0,
+            "82": 436765184.0,
+            "83": 436765184.0,
+            "84": 436765184.0,
+            "85": 436765184.0,
+            "86": 436765184.0,
+            "87": 436765184.0,
+            "88": 436765184.0,
+            "89": 436765184.0,
+            "90": 436765184.0,
+            "91": 436765184.0,
+            "92": 436765184.0,
+            "93": 436765184.0,
+            "94": 436765184.0,
+            "95": 436765184.0,
+            "96": 436765184.0,
+            "97": 436765184.0,
+            "98": 436765184.0,
+            "99": 436765184.0,
+            "100": 436765184.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 2658189824.0,
-            "5": 2658189824.0,
-            "10": 2658189824.0,
-            "15": 2658189824.0,
-            "20": 2658189824.0,
-            "25": 2658189824.0,
-            "30": 2658189824.0,
-            "35": 2658189824.0,
-            "40": 2658189824.0,
-            "45": 2658189824.0,
-            "50": 2658189824.0,
-            "55": 2658189824.0,
-            "60": 2658189824.0,
-            "65": 2658189824.0,
-            "70": 2658189824.0,
-            "75": 2658189824.0,
-            "80": 2658189824.0,
-            "85": 2658189824.0,
-            "90": 2658189824.0,
-            "95": 2658189824.0,
-            "100": 2658189824.0
+            "1": 1591768576.0,
+            "2": 1772628480.0,
+            "3": 1772628480.0,
+            "4": 1772628480.0,
+            "5": 1772628480.0,
+            "6": 1772628480.0,
+            "7": 1772628480.0,
+            "8": 1772628480.0,
+            "9": 1772628480.0,
+            "10": 1772628480.0,
+            "11": 1772628480.0,
+            "12": 1772628480.0,
+            "13": 1772628480.0,
+            "14": 1772628480.0,
+            "15": 1772628480.0,
+            "16": 1772628480.0,
+            "17": 1772628480.0,
+            "18": 1772628480.0,
+            "19": 1772628480.0,
+            "20": 1772628480.0,
+            "21": 1772628480.0,
+            "22": 1772628480.0,
+            "23": 1772628480.0,
+            "24": 1772628480.0,
+            "25": 1772628480.0,
+            "26": 1772628480.0,
+            "27": 1772628480.0,
+            "28": 1772628480.0,
+            "29": 1772628480.0,
+            "30": 1772628480.0,
+            "31": 1772628480.0,
+            "32": 1772628480.0,
+            "33": 1772628480.0,
+            "34": 1772628480.0,
+            "35": 1772628480.0,
+            "36": 1772628480.0,
+            "37": 1772628480.0,
+            "38": 1772628480.0,
+            "39": 1772628480.0,
+            "40": 1772628480.0,
+            "41": 1772628480.0,
+            "42": 1772628480.0,
+            "43": 1772628480.0,
+            "44": 1772628480.0,
+            "45": 1772628480.0,
+            "46": 1772628480.0,
+            "47": 1772628480.0,
+            "48": 1772628480.0,
+            "49": 1772628480.0,
+            "50": 1772628480.0,
+            "51": 1772628480.0,
+            "52": 1772628480.0,
+            "53": 1772628480.0,
+            "54": 1772628480.0,
+            "55": 1772628480.0,
+            "56": 1772628480.0,
+            "57": 1772628480.0,
+            "58": 1772628480.0,
+            "59": 1772628480.0,
+            "60": 1772628480.0,
+            "61": 1772628480.0,
+            "62": 1772628480.0,
+            "63": 1772628480.0,
+            "64": 1772628480.0,
+            "65": 1772628480.0,
+            "66": 1772628480.0,
+            "67": 1772628480.0,
+            "68": 1772628480.0,
+            "69": 1772628480.0,
+            "70": 1772628480.0,
+            "71": 1772628480.0,
+            "72": 1772628480.0,
+            "73": 1772628480.0,
+            "74": 1772628480.0,
+            "75": 1772628480.0,
+            "76": 1772628480.0,
+            "77": 1772628480.0,
+            "78": 1772628480.0,
+            "79": 1772628480.0,
+            "80": 1772628480.0,
+            "81": 1772628480.0,
+            "82": 1772628480.0,
+            "83": 1772628480.0,
+            "84": 1772628480.0,
+            "85": 1772628480.0,
+            "86": 1772628480.0,
+            "87": 1772628480.0,
+            "88": 1772628480.0,
+            "89": 1772628480.0,
+            "90": 1772628480.0,
+            "91": 1772628480.0,
+            "92": 1772628480.0,
+            "93": 1772628480.0,
+            "94": 1772628480.0,
+            "95": 1772628480.0,
+            "96": 1772628480.0,
+            "97": 1772628480.0,
+            "98": 1772628480.0,
+            "99": 1772628480.0,
+            "100": 1772628480.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 17.69921,
-            "5": 0.18742,
-            "10": 0.18714,
-            "15": 0.18669,
-            "20": 0.18537,
-            "25": 0.18342,
-            "30": 0.18538,
-            "35": 0.18528,
-            "40": 0.18464,
-            "45": 0.18361,
-            "50": 0.18481,
-            "55": 0.18002,
-            "60": 0.17775,
-            "65": 0.17974,
-            "70": 0.17928,
-            "75": 0.17891,
-            "80": 0.17759,
-            "85": 0.18266,
-            "90": 0.18242,
-            "95": 0.18179,
-            "100": 0.18252
+            "1": 6.79884,
+            "2": 0.21326,
+            "3": 0.18469,
+            "4": 0.17105,
+            "5": 0.16929,
+            "6": 0.17076,
+            "7": 0.16854,
+            "8": 0.17395,
+            "9": 0.17202,
+            "10": 0.17285,
+            "11": 0.17206,
+            "12": 0.17207,
+            "13": 0.17163,
+            "14": 0.17259,
+            "15": 0.17327,
+            "16": 0.17397,
+            "17": 0.17148,
+            "18": 0.21472,
+            "19": 0.17296,
+            "20": 0.17251,
+            "21": 0.17267,
+            "22": 0.17535,
+            "23": 0.17343,
+            "24": 0.17203,
+            "25": 0.17337,
+            "26": 0.16951,
+            "27": 0.17011,
+            "28": 0.16817,
+            "29": 0.16977,
+            "30": 0.17071,
+            "31": 0.17041,
+            "32": 0.17011,
+            "33": 0.17101,
+            "34": 0.16967,
+            "35": 0.17036,
+            "36": 0.16981,
+            "37": 0.1698,
+            "38": 0.16954,
+            "39": 0.16912,
+            "40": 0.16943,
+            "41": 0.16939,
+            "42": 0.16854,
+            "43": 0.16921,
+            "44": 0.17053,
+            "45": 0.17026,
+            "46": 0.16981,
+            "47": 0.17026,
+            "48": 0.1704,
+            "49": 0.16972,
+            "50": 0.16914,
+            "51": 0.18301,
+            "52": 0.1739,
+            "53": 0.17306,
+            "54": 0.17414,
+            "55": 0.17269,
+            "56": 0.1744,
+            "57": 0.17288,
+            "58": 0.17544,
+            "59": 0.17344,
+            "60": 0.17444,
+            "61": 0.55151,
+            "62": 0.17447,
+            "63": 0.17397,
+            "64": 0.17325,
+            "65": 0.1739,
+            "66": 0.17369,
+            "67": 0.17326,
+            "68": 0.17374,
+            "69": 0.17249,
+            "70": 0.17298,
+            "71": 0.17197,
+            "72": 0.17208,
+            "73": 0.17303,
+            "74": 0.16725,
+            "75": 0.16595,
+            "76": 0.16671,
+            "77": 0.16787,
+            "78": 0.16647,
+            "79": 0.16683,
+            "80": 0.16672,
+            "81": 0.17084,
+            "82": 0.17024,
+            "83": 0.16993,
+            "84": 0.16957,
+            "85": 0.16932,
+            "86": 0.16994,
+            "87": 0.17023,
+            "88": 0.16646,
+            "89": 0.16652,
+            "90": 0.16596,
+            "91": 0.16647,
+            "92": 0.1665,
+            "93": 0.16668,
+            "94": 0.16609,
+            "95": 0.16694,
+            "96": 0.1659,
+            "97": 0.16601,
+            "98": 0.1667,
+            "99": 0.16701,
+            "100": 0.16618
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..adaf33cdb3a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84975,
+            "52": 9.74158,
+            "53": 10.0633,
+            "54": 9.94586,
+            "55": 9.87745,
+            "56": 9.62752,
+            "57": 9.47269,
+            "58": 9.82917,
+            "59": 9.58307,
+            "60": 9.49185,
+            "61": 9.6996,
+            "62": 9.98097,
+            "63": 9.37221,
+            "64": 9.77563,
+            "65": 8.94343,
+            "66": 9.69995,
+            "67": 9.36421,
+            "68": 9.78708,
+            "69": 9.78401,
+            "70": 9.72291,
+            "71": 9.60742,
+            "72": 9.5842,
+            "73": 9.49098,
+            "74": 8.94874,
+            "75": 9.41818,
+            "76": 9.08725,
+            "77": 10.06288,
+            "78": 9.72905,
+            "79": 9.37096,
+            "80": 9.40039,
+            "81": 9.47763,
+            "82": 9.69127,
+            "83": 9.30765,
+            "84": 9.41259,
+            "85": 9.61135,
+            "86": 9.07623,
+            "87": 9.59462,
+            "88": 9.74773,
+            "89": 9.6068,
+            "90": 9.81083,
+            "91": 9.34454,
+            "92": 9.3654,
+            "93": 9.0775,
+            "94": 8.82983,
+            "95": 9.5168,
+            "96": 9.52551,
+            "97": 9.31042,
+            "98": 9.67813,
+            "99": 8.88855,
+            "100": 9.40136
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2671.0,
+            "52": 2566.0,
+            "53": 2911.0,
+            "54": 2791.0,
+            "55": 2483.0,
+            "56": 2736.0,
+            "57": 2395.0,
+            "58": 3067.0,
+            "59": 2911.0,
+            "60": 2426.0,
+            "61": 2925.0,
+            "62": 2654.0,
+            "63": 2346.0,
+            "64": 3123.0,
+            "65": 2768.0,
+            "66": 3220.0,
+            "67": 2841.0,
+            "68": 2870.0,
+            "69": 2949.0,
+            "70": 3222.0,
+            "71": 3138.0,
+            "72": 2479.0,
+            "73": 3021.0,
+            "74": 1933.0,
+            "75": 2682.0,
+            "76": 3015.0,
+            "77": 3415.0,
+            "78": 3237.0,
+            "79": 3269.0,
+            "80": 3527.0,
+            "81": 3623.0,
+            "82": 3347.0,
+            "83": 2804.0,
+            "84": 3348.0,
+            "85": 3335.0,
+            "86": 2823.0,
+            "87": 3721.0,
+            "88": 3081.0,
+            "89": 3553.0,
+            "90": 3044.0,
+            "91": 2775.0,
+            "92": 3246.0,
+            "93": 2705.0,
+            "94": 3450.0,
+            "95": 3420.0,
+            "96": 3599.0,
+            "97": 2959.0,
+            "98": 3792.0,
+            "99": 3166.0,
+            "100": 3330.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 437551616.0,
+            "52": 437551616.0,
+            "53": 437551616.0,
+            "54": 437551616.0,
+            "55": 437551616.0,
+            "56": 437551616.0,
+            "57": 437551616.0,
+            "58": 437551616.0,
+            "59": 437551616.0,
+            "60": 437551616.0,
+            "61": 437551616.0,
+            "62": 437551616.0,
+            "63": 437551616.0,
+            "64": 437551616.0,
+            "65": 437551616.0,
+            "66": 437551616.0,
+            "67": 437551616.0,
+            "68": 437551616.0,
+            "69": 437551616.0,
+            "70": 437551616.0,
+            "71": 437551616.0,
+            "72": 437551616.0,
+            "73": 437551616.0,
+            "74": 437551616.0,
+            "75": 437551616.0,
+            "76": 437551616.0,
+            "77": 437551616.0,
+            "78": 437551616.0,
+            "79": 437551616.0,
+            "80": 437551616.0,
+            "81": 437551616.0,
+            "82": 437551616.0,
+            "83": 437551616.0,
+            "84": 437551616.0,
+            "85": 437551616.0,
+            "86": 437551616.0,
+            "87": 437551616.0,
+            "88": 437551616.0,
+            "89": 437551616.0,
+            "90": 437551616.0,
+            "91": 437551616.0,
+            "92": 437551616.0,
+            "93": 437551616.0,
+            "94": 437551616.0,
+            "95": 437551616.0,
+            "96": 437551616.0,
+            "97": 437551616.0,
+            "98": 437551616.0,
+            "99": 437551616.0,
+            "100": 437551616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1774462464.0,
+            "52": 1774463488.0,
+            "53": 1774463488.0,
+            "54": 1774463488.0,
+            "55": 1774463488.0,
+            "56": 1774463488.0,
+            "57": 1774463488.0,
+            "58": 1774463488.0,
+            "59": 1774463488.0,
+            "60": 1774463488.0,
+            "61": 1774463488.0,
+            "62": 1774463488.0,
+            "63": 1774463488.0,
+            "64": 1774463488.0,
+            "65": 1774463488.0,
+            "66": 1774463488.0,
+            "67": 1774463488.0,
+            "68": 1774463488.0,
+            "69": 1774463488.0,
+            "70": 1774463488.0,
+            "71": 1774463488.0,
+            "72": 1774463488.0,
+            "73": 1774463488.0,
+            "74": 1774463488.0,
+            "75": 1774463488.0,
+            "76": 1774463488.0,
+            "77": 1774463488.0,
+            "78": 1774463488.0,
+            "79": 1774463488.0,
+            "80": 1774463488.0,
+            "81": 1774463488.0,
+            "82": 1774463488.0,
+            "83": 1774463488.0,
+            "84": 1774463488.0,
+            "85": 1774463488.0,
+            "86": 1774463488.0,
+            "87": 1774463488.0,
+            "88": 1774463488.0,
+            "89": 1774463488.0,
+            "90": 1774463488.0,
+            "91": 1774463488.0,
+            "92": 1774463488.0,
+            "93": 1774463488.0,
+            "94": 1774463488.0,
+            "95": 1774463488.0,
+            "96": 1774463488.0,
+            "97": 1774463488.0,
+            "98": 1774463488.0,
+            "99": 1774463488.0,
+            "100": 1774463488.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4.74138,
+            "52": 0.19833,
+            "53": 0.17523,
+            "54": 0.17326,
+            "55": 0.17289,
+            "56": 0.17406,
+            "57": 0.17353,
+            "58": 0.17413,
+            "59": 0.1741,
+            "60": 0.18,
+            "61": 0.17815,
+            "62": 0.1771,
+            "63": 0.17887,
+            "64": 0.17716,
+            "65": 0.18267,
+            "66": 0.18368,
+            "67": 0.18326,
+            "68": 0.1822,
+            "69": 0.18471,
+            "70": 0.17793,
+            "71": 0.17586,
+            "72": 0.17439,
+            "73": 0.17531,
+            "74": 0.17811,
+            "75": 0.18496,
+            "76": 0.17711,
+            "77": 0.17788,
+            "78": 0.17629,
+            "79": 0.1758,
+            "80": 0.17563,
+            "81": 0.17581,
+            "82": 0.17682,
+            "83": 0.17641,
+            "84": 0.17489,
+            "85": 0.17508,
+            "86": 0.17588,
+            "87": 0.176,
+            "88": 0.17581,
+            "89": 0.17485,
+            "90": 0.17493,
+            "91": 0.17412,
+            "92": 0.17456,
+            "93": 0.17597,
+            "94": 0.17515,
+            "95": 0.17511,
+            "96": 0.17499,
+            "97": 0.17485,
+            "98": 0.1758,
+            "99": 0.17572,
+            "100": 0.17544
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..27eb21de0f0
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86897,
+            "2": 10.88544,
+            "3": 10.86473,
+            "4": 10.86826,
+            "5": 10.87436,
+            "6": 10.89005,
+            "7": 10.87769,
+            "8": 10.86364,
+            "9": 10.88282,
+            "10": 10.84687,
+            "11": 10.87102,
+            "12": 10.87345,
+            "13": 10.8814,
+            "14": 10.8877,
+            "15": 10.83869,
+            "16": 10.8239,
+            "17": 10.80197,
+            "18": 10.81094,
+            "19": 10.82192,
+            "20": 10.71791,
+            "21": 10.68914,
+            "22": 10.57271,
+            "23": 10.7081,
+            "24": 10.59543,
+            "25": 10.55292,
+            "26": 10.61257,
+            "27": 10.60051,
+            "28": 10.56173,
+            "29": 10.58089,
+            "30": 10.35595,
+            "31": 10.1182,
+            "32": 10.44815,
+            "33": 10.4542,
+            "34": 10.21553,
+            "35": 10.26124,
+            "36": 10.20776,
+            "37": 10.33673,
+            "38": 10.17741,
+            "39": 10.39297,
+            "40": 10.06349,
+            "41": 10.13887,
+            "42": 10.2056,
+            "43": 9.82809,
+            "44": 9.94547,
+            "45": 9.82561,
+            "46": 9.80186,
+            "47": 10.14049,
+            "48": 9.84276,
+            "49": 9.52016,
+            "50": 9.88454,
+            "51": 9.84743,
+            "52": 9.74209,
+            "53": 10.05697,
+            "54": 9.9505,
+            "55": 9.88145,
+            "56": 9.61274,
+            "57": 9.4687,
+            "58": 9.82193,
+            "59": 9.57642,
+            "60": 9.49762,
+            "61": 9.69189,
+            "62": 9.9867,
+            "63": 9.37512,
+            "64": 9.76679,
+            "65": 8.94648,
+            "66": 9.7023,
+            "67": 9.36326,
+            "68": 9.7831,
+            "69": 9.7986,
+            "70": 9.7317,
+            "71": 9.62571,
+            "72": 9.58488,
+            "73": 9.48967,
+            "74": 8.9286,
+            "75": 9.40862,
+            "76": 9.07925,
+            "77": 10.0594,
+            "78": 9.72288,
+            "79": 9.37784,
+            "80": 9.40429,
+            "81": 9.48309,
+            "82": 9.7004,
+            "83": 9.31595,
+            "84": 9.41838,
+            "85": 9.61685,
+            "86": 9.07533,
+            "87": 9.59616,
+            "88": 9.75215,
+            "89": 9.60184,
+            "90": 9.82281,
+            "91": 9.34037,
+            "92": 9.35854,
+            "93": 9.08805,
+            "94": 8.83037,
+            "95": 9.5266,
+            "96": 9.53049,
+            "97": 9.30389,
+            "98": 9.67196,
+            "99": 8.89637,
+            "100": 9.40644
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1657.0,
+            "3": 1580.0,
+            "4": 1839.0,
+            "5": 1862.0,
+            "6": 1724.0,
+            "7": 1714.0,
+            "8": 1670.0,
+            "9": 1762.0,
+            "10": 1358.0,
+            "11": 1734.0,
+            "12": 1682.0,
+            "13": 1761.0,
+            "14": 1731.0,
+            "15": 1788.0,
+            "16": 1801.0,
+            "17": 1866.0,
+            "18": 1636.0,
+            "19": 1709.0,
+            "20": 1607.0,
+            "21": 1821.0,
+            "22": 1666.0,
+            "23": 1991.0,
+            "24": 1585.0,
+            "25": 1587.0,
+            "26": 1631.0,
+            "27": 1714.0,
+            "28": 1966.0,
+            "29": 1997.0,
+            "30": 1851.0,
+            "31": 1581.0,
+            "32": 1864.0,
+            "33": 2107.0,
+            "34": 1846.0,
+            "35": 1982.0,
+            "36": 1904.0,
+            "37": 2373.0,
+            "38": 2172.0,
+            "39": 2343.0,
+            "40": 2149.0,
+            "41": 2331.0,
+            "42": 2199.0,
+            "43": 1914.0,
+            "44": 2065.0,
+            "45": 2081.0,
+            "46": 2352.0,
+            "47": 2497.0,
+            "48": 2303.0,
+            "49": 2346.0,
+            "50": 2411.0,
+            "51": 2491.0,
+            "52": 2552.0,
+            "53": 2980.0,
+            "54": 2680.0,
+            "55": 2274.0,
+            "56": 2734.0,
+            "57": 2319.0,
+            "58": 2907.0,
+            "59": 2886.0,
+            "60": 2566.0,
+            "61": 2855.0,
+            "62": 2704.0,
+            "63": 2370.0,
+            "64": 2998.0,
+            "65": 2563.0,
+            "66": 2868.0,
+            "67": 2762.0,
+            "68": 2739.0,
+            "69": 2730.0,
+            "70": 3156.0,
+            "71": 2803.0,
+            "72": 2506.0,
+            "73": 2896.0,
+            "74": 1937.0,
+            "75": 2450.0,
+            "76": 2794.0,
+            "77": 3047.0,
+            "78": 3104.0,
+            "79": 3069.0,
+            "80": 3286.0,
+            "81": 3543.0,
+            "82": 3192.0,
+            "83": 2614.0,
+            "84": 3273.0,
+            "85": 3111.0,
+            "86": 2680.0,
+            "87": 3654.0,
+            "88": 3117.0,
+            "89": 3351.0,
+            "90": 3086.0,
+            "91": 2721.0,
+            "92": 3045.0,
+            "93": 2672.0,
+            "94": 3326.0,
+            "95": 3125.0,
+            "96": 3309.0,
+            "97": 3208.0,
+            "98": 3572.0,
+            "99": 2980.0,
+            "100": 3355.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 514359808.0,
+            "2": 514359808.0,
+            "3": 514359808.0,
+            "4": 514359808.0,
+            "5": 514359808.0,
+            "6": 514359808.0,
+            "7": 514359808.0,
+            "8": 514359808.0,
+            "9": 514359808.0,
+            "10": 514359808.0,
+            "11": 514359808.0,
+            "12": 514359808.0,
+            "13": 514359808.0,
+            "14": 514359808.0,
+            "15": 514359808.0,
+            "16": 514359808.0,
+            "17": 514359808.0,
+            "18": 514359808.0,
+            "19": 514359808.0,
+            "20": 514359808.0,
+            "21": 514359808.0,
+            "22": 514359808.0,
+            "23": 514359808.0,
+            "24": 514359808.0,
+            "25": 514359808.0,
+            "26": 514359808.0,
+            "27": 514359808.0,
+            "28": 514359808.0,
+            "29": 514359808.0,
+            "30": 514359808.0,
+            "31": 514359808.0,
+            "32": 514359808.0,
+            "33": 514359808.0,
+            "34": 514359808.0,
+            "35": 514359808.0,
+            "36": 514359808.0,
+            "37": 514359808.0,
+            "38": 514359808.0,
+            "39": 514359808.0,
+            "40": 514359808.0,
+            "41": 514359808.0,
+            "42": 514359808.0,
+            "43": 514359808.0,
+            "44": 514359808.0,
+            "45": 514359808.0,
+            "46": 514359808.0,
+            "47": 514359808.0,
+            "48": 514359808.0,
+            "49": 514359808.0,
+            "50": 514359808.0,
+            "51": 514359808.0,
+            "52": 514359808.0,
+            "53": 514359808.0,
+            "54": 514359808.0,
+            "55": 514359808.0,
+            "56": 514359808.0,
+            "57": 514359808.0,
+            "58": 514359808.0,
+            "59": 514359808.0,
+            "60": 514359808.0,
+            "61": 514359808.0,
+            "62": 514359808.0,
+            "63": 514359808.0,
+            "64": 514359808.0,
+            "65": 514359808.0,
+            "66": 514359808.0,
+            "67": 514359808.0,
+            "68": 514359808.0,
+            "69": 514359808.0,
+            "70": 514359808.0,
+            "71": 514359808.0,
+            "72": 514359808.0,
+            "73": 514359808.0,
+            "74": 514359808.0,
+            "75": 514359808.0,
+            "76": 514359808.0,
+            "77": 514359808.0,
+            "78": 514359808.0,
+            "79": 514359808.0,
+            "80": 514359808.0,
+            "81": 514359808.0,
+            "82": 514359808.0,
+            "83": 514359808.0,
+            "84": 514359808.0,
+            "85": 514359808.0,
+            "86": 514359808.0,
+            "87": 514359808.0,
+            "88": 514359808.0,
+            "89": 514359808.0,
+            "90": 514359808.0,
+            "91": 514359808.0,
+            "92": 514359808.0,
+            "93": 514359808.0,
+            "94": 514359808.0,
+            "95": 514359808.0,
+            "96": 514359808.0,
+            "97": 514359808.0,
+            "98": 514359808.0,
+            "99": 514359808.0,
+            "100": 514359808.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1258060288.0,
+            "2": 1437084160.0,
+            "3": 1437084160.0,
+            "4": 1437084160.0,
+            "5": 1437084160.0,
+            "6": 1437084160.0,
+            "7": 1437084160.0,
+            "8": 1437084160.0,
+            "9": 1437084160.0,
+            "10": 1437084160.0,
+            "11": 1437084160.0,
+            "12": 1437084160.0,
+            "13": 1437084160.0,
+            "14": 1437084160.0,
+            "15": 1437084160.0,
+            "16": 1437084160.0,
+            "17": 1437084160.0,
+            "18": 1437084160.0,
+            "19": 1437084160.0,
+            "20": 1437084160.0,
+            "21": 1437084160.0,
+            "22": 1437084160.0,
+            "23": 1437084160.0,
+            "24": 1437084160.0,
+            "25": 1437084160.0,
+            "26": 1437084160.0,
+            "27": 1437084160.0,
+            "28": 1437084160.0,
+            "29": 1437084160.0,
+            "30": 1437084160.0,
+            "31": 1437084160.0,
+            "32": 1437084160.0,
+            "33": 1437084160.0,
+            "34": 1437084160.0,
+            "35": 1437084160.0,
+            "36": 1437084160.0,
+            "37": 1437084160.0,
+            "38": 1437084160.0,
+            "39": 1437084160.0,
+            "40": 1437084160.0,
+            "41": 1437084160.0,
+            "42": 1437084160.0,
+            "43": 1437084160.0,
+            "44": 1437084160.0,
+            "45": 1437084160.0,
+            "46": 1437084160.0,
+            "47": 1437084160.0,
+            "48": 1437084160.0,
+            "49": 1437084160.0,
+            "50": 1437084160.0,
+            "51": 1437084160.0,
+            "52": 1437084160.0,
+            "53": 1437084160.0,
+            "54": 1437084160.0,
+            "55": 1437084160.0,
+            "56": 1437084160.0,
+            "57": 1437084160.0,
+            "58": 1437084160.0,
+            "59": 1437084160.0,
+            "60": 1437084160.0,
+            "61": 1437084160.0,
+            "62": 1437084160.0,
+            "63": 1437084160.0,
+            "64": 1437084160.0,
+            "65": 1437084160.0,
+            "66": 1437084160.0,
+            "67": 1437084160.0,
+            "68": 1437084160.0,
+            "69": 1437084160.0,
+            "70": 1437084160.0,
+            "71": 1437084160.0,
+            "72": 1437084160.0,
+            "73": 1437084160.0,
+            "74": 1437084160.0,
+            "75": 1437084160.0,
+            "76": 1437084160.0,
+            "77": 1437084160.0,
+            "78": 1437084160.0,
+            "79": 1437084160.0,
+            "80": 1437084160.0,
+            "81": 1437084160.0,
+            "82": 1437084160.0,
+            "83": 1437084160.0,
+            "84": 1437084160.0,
+            "85": 1437084160.0,
+            "86": 1437084160.0,
+            "87": 1437084160.0,
+            "88": 1437084160.0,
+            "89": 1437084160.0,
+            "90": 1437084160.0,
+            "91": 1437084160.0,
+            "92": 1437084160.0,
+            "93": 1437084160.0,
+            "94": 1437084160.0,
+            "95": 1437084160.0,
+            "96": 1437084160.0,
+            "97": 1437084160.0,
+            "98": 1437084160.0,
+            "99": 1437084160.0,
+            "100": 1437084160.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.6815,
+            "3": 0.23582,
+            "4": 0.21969,
+            "5": 0.22399,
+            "6": 0.21848,
+            "7": 0.21944,
+            "8": 0.21989,
+            "9": 0.22542,
+            "10": 0.22685,
+            "11": 0.22859,
+            "12": 0.22734,
+            "13": 0.22735,
+            "14": 0.22682,
+            "15": 0.22731,
+            "16": 0.22724,
+            "17": 0.22774,
+            "18": 0.2253,
+            "19": 0.21338,
+            "20": 0.21612,
+            "21": 0.22487,
+            "22": 0.2609,
+            "23": 0.34495,
+            "24": 0.40538,
+            "25": 0.27265,
+            "26": 0.22852,
+            "27": 0.23498,
+            "28": 0.23458,
+            "29": 0.2356,
+            "30": 0.23223,
+            "31": 0.23427,
+            "32": 0.23193,
+            "33": 0.23007,
+            "34": 0.22762,
+            "35": 0.22604,
+            "36": 0.22153,
+            "37": 0.21923,
+            "38": 0.21718,
+            "39": 0.2162,
+            "40": 0.21653,
+            "41": 0.21673,
+            "42": 0.21416,
+            "43": 0.21439,
+            "44": 0.2141,
+            "45": 0.21364,
+            "46": 0.21263,
+            "47": 0.2139,
+            "48": 0.21445,
+            "49": 0.21424,
+            "50": 0.21381,
+            "51": 0.21544,
+            "52": 0.21075,
+            "53": 0.21292,
+            "54": 0.21407,
+            "55": 0.2167,
+            "56": 0.21877,
+            "57": 0.21861,
+            "58": 0.22087,
+            "59": 0.21999,
+            "60": 0.21884,
+            "61": 0.21841,
+            "62": 0.21988,
+            "63": 0.21876,
+            "64": 0.21811,
+            "65": 0.21795,
+            "66": 0.2197,
+            "67": 0.22005,
+            "68": 0.21994,
+            "69": 0.21937,
+            "70": 0.21964,
+            "71": 0.22007,
+            "72": 0.221,
+            "73": 0.22145,
+            "74": 0.22069,
+            "75": 0.22126,
+            "76": 0.21984,
+            "77": 0.22096,
+            "78": 0.2231,
+            "79": 0.22168,
+            "80": 0.21932,
+            "81": 0.21748,
+            "82": 0.21971,
+            "83": 0.22113,
+            "84": 0.22096,
+            "85": 0.22316,
+            "86": 0.22043,
+            "87": 0.22198,
+            "88": 0.2247,
+            "89": 0.2219,
+            "90": 0.22258,
+            "91": 0.22224,
+            "92": 0.22132,
+            "93": 0.22182,
+            "94": 0.22397,
+            "95": 0.22547,
+            "96": 0.22177,
+            "97": 0.22282,
+            "98": 0.22255,
+            "99": 0.22417,
+            "100": 0.22334
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..0889d8315f2
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84743,
+            "52": 9.74209,
+            "53": 10.05697,
+            "54": 9.9505,
+            "55": 9.88145,
+            "56": 9.61274,
+            "57": 9.4687,
+            "58": 9.82193,
+            "59": 9.57642,
+            "60": 9.49762,
+            "61": 9.69189,
+            "62": 9.9867,
+            "63": 9.37512,
+            "64": 9.76679,
+            "65": 8.94648,
+            "66": 9.7023,
+            "67": 9.36326,
+            "68": 9.7831,
+            "69": 9.7986,
+            "70": 9.7317,
+            "71": 9.62571,
+            "72": 9.58488,
+            "73": 9.48967,
+            "74": 8.9286,
+            "75": 9.40862,
+            "76": 9.07925,
+            "77": 10.0594,
+            "78": 9.72288,
+            "79": 9.37784,
+            "80": 9.40429,
+            "81": 9.48309,
+            "82": 9.7004,
+            "83": 9.31595,
+            "84": 9.41838,
+            "85": 9.61685,
+            "86": 9.07533,
+            "87": 9.59616,
+            "88": 9.75215,
+            "89": 9.60184,
+            "90": 9.82281,
+            "91": 9.34037,
+            "92": 9.35854,
+            "93": 9.08805,
+            "94": 8.83037,
+            "95": 9.5266,
+            "96": 9.53049,
+            "97": 9.30389,
+            "98": 9.67196,
+            "99": 8.89637,
+            "100": 9.40644
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2491.0,
+            "52": 2552.0,
+            "53": 2980.0,
+            "54": 2680.0,
+            "55": 2274.0,
+            "56": 2734.0,
+            "57": 2319.0,
+            "58": 2907.0,
+            "59": 2886.0,
+            "60": 2566.0,
+            "61": 2855.0,
+            "62": 2704.0,
+            "63": 2370.0,
+            "64": 2998.0,
+            "65": 2563.0,
+            "66": 2868.0,
+            "67": 2762.0,
+            "68": 2739.0,
+            "69": 2730.0,
+            "70": 3156.0,
+            "71": 2803.0,
+            "72": 2506.0,
+            "73": 2896.0,
+            "74": 1937.0,
+            "75": 2450.0,
+            "76": 2794.0,
+            "77": 3047.0,
+            "78": 3104.0,
+            "79": 3069.0,
+            "80": 3286.0,
+            "81": 3543.0,
+            "82": 3192.0,
+            "83": 2614.0,
+            "84": 3273.0,
+            "85": 3111.0,
+            "86": 2680.0,
+            "87": 3654.0,
+            "88": 3117.0,
+            "89": 3351.0,
+            "90": 3086.0,
+            "91": 2721.0,
+            "92": 3045.0,
+            "93": 2672.0,
+            "94": 3326.0,
+            "95": 3125.0,
+            "96": 3309.0,
+            "97": 3208.0,
+            "98": 3572.0,
+            "99": 2980.0,
+            "100": 3355.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 515933696.0,
+            "52": 515933696.0,
+            "53": 515933696.0,
+            "54": 515933696.0,
+            "55": 515933696.0,
+            "56": 515933696.0,
+            "57": 515933696.0,
+            "58": 515933696.0,
+            "59": 515933696.0,
+            "60": 515933696.0,
+            "61": 515933696.0,
+            "62": 515933696.0,
+            "63": 515933696.0,
+            "64": 515933696.0,
+            "65": 515933696.0,
+            "66": 515933696.0,
+            "67": 515933696.0,
+            "68": 515933696.0,
+            "69": 515933696.0,
+            "70": 515933696.0,
+            "71": 515933696.0,
+            "72": 515933696.0,
+            "73": 515933696.0,
+            "74": 515933696.0,
+            "75": 515933696.0,
+            "76": 515933696.0,
+            "77": 515933696.0,
+            "78": 515933696.0,
+            "79": 515933696.0,
+            "80": 515933696.0,
+            "81": 515933696.0,
+            "82": 515933696.0,
+            "83": 515933696.0,
+            "84": 515933696.0,
+            "85": 515933696.0,
+            "86": 515933696.0,
+            "87": 515933696.0,
+            "88": 515933696.0,
+            "89": 515933696.0,
+            "90": 515933696.0,
+            "91": 515933696.0,
+            "92": 515933696.0,
+            "93": 515933696.0,
+            "94": 515933696.0,
+            "95": 515933696.0,
+            "96": 515933696.0,
+            "97": 515933696.0,
+            "98": 515933696.0,
+            "99": 515933696.0,
+            "100": 515933696.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1440754176.0,
+            "52": 1440755200.0,
+            "53": 1440755200.0,
+            "54": 1440755200.0,
+            "55": 1440755200.0,
+            "56": 1440755200.0,
+            "57": 1440755200.0,
+            "58": 1440755200.0,
+            "59": 1440755200.0,
+            "60": 1440755200.0,
+            "61": 1440755200.0,
+            "62": 1440755200.0,
+            "63": 1440755200.0,
+            "64": 1440755200.0,
+            "65": 1440755200.0,
+            "66": 1440755200.0,
+            "67": 1440755200.0,
+            "68": 1440755200.0,
+            "69": 1440755200.0,
+            "70": 1440755200.0,
+            "71": 1440755200.0,
+            "72": 1440755200.0,
+            "73": 1440755200.0,
+            "74": 1440755200.0,
+            "75": 1440755200.0,
+            "76": 1440755200.0,
+            "77": 1440755200.0,
+            "78": 1440755200.0,
+            "79": 1440755200.0,
+            "80": 1440755200.0,
+            "81": 1440755200.0,
+            "82": 1440755200.0,
+            "83": 1440755200.0,
+            "84": 1440755200.0,
+            "85": 1440755200.0,
+            "86": 1440755200.0,
+            "87": 1440755200.0,
+            "88": 1440755200.0,
+            "89": 1440755200.0,
+            "90": 1440755200.0,
+            "91": 1440755200.0,
+            "92": 1440755200.0,
+            "93": 1440755200.0,
+            "94": 1440755200.0,
+            "95": 1440755200.0,
+            "96": 1440755200.0,
+            "97": 1440755200.0,
+            "98": 1440755200.0,
+            "99": 1440755200.0,
+            "100": 1440755200.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.76016,
+            "53": 0.23476,
+            "54": 0.223,
+            "55": 0.22429,
+            "56": 0.21901,
+            "57": 0.22278,
+            "58": 0.22163,
+            "59": 0.22214,
+            "60": 0.22201,
+            "61": 0.2247,
+            "62": 0.22616,
+            "63": 0.22396,
+            "64": 0.23047,
+            "65": 0.23737,
+            "66": 0.24455,
+            "67": 0.23295,
+            "68": 0.22857,
+            "69": 0.22662,
+            "70": 0.22814,
+            "71": 0.2322,
+            "72": 0.233,
+            "73": 0.22777,
+            "74": 0.22898,
+            "75": 0.23307,
+            "76": 0.23163,
+            "77": 0.23205,
+            "78": 0.23196,
+            "79": 0.2324,
+            "80": 0.23104,
+            "81": 0.23192,
+            "82": 0.23206,
+            "83": 0.22902,
+            "84": 0.23961,
+            "85": 0.24378,
+            "86": 0.24255,
+            "87": 0.24283,
+            "88": 0.24429,
+            "89": 0.24795,
+            "90": 0.2492,
+            "91": 0.2493,
+            "92": 0.24516,
+            "93": 0.24543,
+            "94": 0.23595,
+            "95": 0.23484,
+            "96": 0.23416,
+            "97": 0.24493,
+            "98": 0.24676,
+            "99": 0.24195,
+            "100": 0.2459
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json
index e895f06a28a..6e4aa9e48e0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 517505536.0,
-            "2": 517505536.0,
-            "3": 517505536.0,
-            "4": 517505536.0,
-            "5": 517505536.0,
-            "6": 517505536.0,
-            "7": 517505536.0,
-            "8": 517505536.0,
-            "9": 517505536.0,
-            "10": 517505536.0,
-            "11": 517505536.0,
-            "12": 517505536.0,
-            "13": 517505536.0,
-            "14": 517505536.0,
-            "15": 517505536.0,
-            "16": 517505536.0,
-            "17": 517505536.0,
-            "18": 517505536.0,
-            "19": 517505536.0,
-            "20": 517505536.0,
-            "21": 517505536.0,
-            "22": 517505536.0,
-            "23": 517505536.0,
-            "24": 517505536.0,
-            "25": 517505536.0,
-            "26": 517505536.0,
-            "27": 517505536.0,
-            "28": 517505536.0,
-            "29": 517505536.0,
-            "30": 517505536.0,
-            "31": 517505536.0,
-            "32": 517505536.0,
-            "33": 517505536.0,
-            "34": 517505536.0,
-            "35": 517505536.0,
-            "36": 517505536.0,
-            "37": 517505536.0,
-            "38": 517505536.0,
-            "39": 517505536.0,
-            "40": 517505536.0,
-            "41": 517505536.0,
-            "42": 517505536.0,
-            "43": 517505536.0,
-            "44": 517505536.0,
-            "45": 517505536.0,
-            "46": 517505536.0,
-            "47": 517505536.0,
-            "48": 517505536.0,
-            "49": 517505536.0,
-            "50": 517505536.0,
-            "51": 517505536.0,
-            "52": 517505536.0,
-            "53": 517505536.0,
-            "54": 517505536.0,
-            "55": 517505536.0,
-            "56": 517505536.0,
-            "57": 517505536.0,
-            "58": 517505536.0,
-            "59": 517505536.0,
-            "60": 517505536.0,
-            "61": 517505536.0,
-            "62": 517505536.0,
-            "63": 517505536.0,
-            "64": 517505536.0,
-            "65": 517505536.0,
-            "66": 517505536.0,
-            "67": 517505536.0,
-            "68": 517505536.0,
-            "69": 517505536.0,
-            "70": 517505536.0,
-            "71": 517505536.0,
-            "72": 517505536.0,
-            "73": 517505536.0,
-            "74": 517505536.0,
-            "75": 517505536.0,
-            "76": 517505536.0,
-            "77": 517505536.0,
-            "78": 517505536.0,
-            "79": 517505536.0,
-            "80": 517505536.0,
-            "81": 517505536.0,
-            "82": 517505536.0,
-            "83": 517505536.0,
-            "84": 517505536.0,
-            "85": 517505536.0,
-            "86": 517505536.0,
-            "87": 517505536.0,
-            "88": 517505536.0,
-            "89": 517505536.0,
-            "90": 517505536.0,
-            "91": 517505536.0,
-            "92": 517505536.0,
-            "93": 517505536.0,
-            "94": 517505536.0,
-            "95": 517505536.0,
-            "96": 517505536.0,
-            "97": 517505536.0,
-            "98": 517505536.0,
-            "99": 517505536.0,
-            "100": 517505536.0
+            "1": 516456960.0,
+            "2": 516456960.0,
+            "3": 516456960.0,
+            "4": 516456960.0,
+            "5": 516456960.0,
+            "6": 516456960.0,
+            "7": 516456960.0,
+            "8": 516456960.0,
+            "9": 516456960.0,
+            "10": 516456960.0,
+            "11": 516456960.0,
+            "12": 516456960.0,
+            "13": 516456960.0,
+            "14": 516456960.0,
+            "15": 516456960.0,
+            "16": 516456960.0,
+            "17": 516456960.0,
+            "18": 516456960.0,
+            "19": 516456960.0,
+            "20": 516456960.0,
+            "21": 516456960.0,
+            "22": 516456960.0,
+            "23": 516456960.0,
+            "24": 516456960.0,
+            "25": 516456960.0,
+            "26": 516456960.0,
+            "27": 516456960.0,
+            "28": 516456960.0,
+            "29": 516456960.0,
+            "30": 516456960.0,
+            "31": 516456960.0,
+            "32": 516456960.0,
+            "33": 516456960.0,
+            "34": 516456960.0,
+            "35": 516456960.0,
+            "36": 516456960.0,
+            "37": 516456960.0,
+            "38": 516456960.0,
+            "39": 516456960.0,
+            "40": 516456960.0,
+            "41": 516456960.0,
+            "42": 516456960.0,
+            "43": 516456960.0,
+            "44": 516456960.0,
+            "45": 516456960.0,
+            "46": 516456960.0,
+            "47": 516456960.0,
+            "48": 516456960.0,
+            "49": 516456960.0,
+            "50": 516456960.0,
+            "51": 516456960.0,
+            "52": 516456960.0,
+            "53": 516456960.0,
+            "54": 516456960.0,
+            "55": 516456960.0,
+            "56": 516456960.0,
+            "57": 516456960.0,
+            "58": 516456960.0,
+            "59": 516456960.0,
+            "60": 516456960.0,
+            "61": 516456960.0,
+            "62": 516456960.0,
+            "63": 516456960.0,
+            "64": 516456960.0,
+            "65": 516456960.0,
+            "66": 516456960.0,
+            "67": 516456960.0,
+            "68": 516456960.0,
+            "69": 516456960.0,
+            "70": 516456960.0,
+            "71": 516456960.0,
+            "72": 516456960.0,
+            "73": 516456960.0,
+            "74": 516456960.0,
+            "75": 516456960.0,
+            "76": 516456960.0,
+            "77": 516456960.0,
+            "78": 516456960.0,
+            "79": 516456960.0,
+            "80": 516456960.0,
+            "81": 516456960.0,
+            "82": 516456960.0,
+            "83": 516456960.0,
+            "84": 516456960.0,
+            "85": 516456960.0,
+            "86": 516456960.0,
+            "87": 516456960.0,
+            "88": 516456960.0,
+            "89": 516456960.0,
+            "90": 516456960.0,
+            "91": 516456960.0,
+            "92": 516456960.0,
+            "93": 516456960.0,
+            "94": 516456960.0,
+            "95": 516456960.0,
+            "96": 516456960.0,
+            "97": 516456960.0,
+            "98": 516456960.0,
+            "99": 516456960.0,
+            "100": 516456960.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1246524928.0,
-            "2": 1428695552.0,
-            "3": 1428695552.0,
-            "4": 1428695552.0,
-            "5": 1428695552.0,
-            "6": 1428695552.0,
-            "7": 1428695552.0,
-            "8": 1428695552.0,
-            "9": 1428695552.0,
-            "10": 1428695552.0,
-            "11": 1428695552.0,
-            "12": 1428695552.0,
-            "13": 1428695552.0,
-            "14": 1428695552.0,
-            "15": 1428695552.0,
-            "16": 1428695552.0,
-            "17": 1428695552.0,
-            "18": 1428695552.0,
-            "19": 1428695552.0,
-            "20": 1428695552.0,
-            "21": 1428695552.0,
-            "22": 1428695552.0,
-            "23": 1428695552.0,
-            "24": 1428695552.0,
-            "25": 1428695552.0,
-            "26": 1428695552.0,
-            "27": 1428695552.0,
-            "28": 1428695552.0,
-            "29": 1428695552.0,
-            "30": 1428695552.0,
-            "31": 1428695552.0,
-            "32": 1428695552.0,
-            "33": 1428695552.0,
-            "34": 1428695552.0,
-            "35": 1428695552.0,
-            "36": 1428695552.0,
-            "37": 1428695552.0,
-            "38": 1428695552.0,
-            "39": 1428695552.0,
-            "40": 1428695552.0,
-            "41": 1428695552.0,
-            "42": 1428695552.0,
-            "43": 1428695552.0,
-            "44": 1428695552.0,
-            "45": 1428695552.0,
-            "46": 1428695552.0,
-            "47": 1428695552.0,
-            "48": 1428695552.0,
-            "49": 1428695552.0,
-            "50": 1428695552.0,
-            "51": 1428695552.0,
-            "52": 1428695552.0,
-            "53": 1428695552.0,
-            "54": 1428695552.0,
-            "55": 1428695552.0,
-            "56": 1428695552.0,
-            "57": 1428695552.0,
-            "58": 1428695552.0,
-            "59": 1428695552.0,
-            "60": 1428695552.0,
-            "61": 1428695552.0,
-            "62": 1428695552.0,
-            "63": 1428695552.0,
-            "64": 1428695552.0,
-            "65": 1428695552.0,
-            "66": 1428695552.0,
-            "67": 1428695552.0,
-            "68": 1428695552.0,
-            "69": 1428695552.0,
-            "70": 1428695552.0,
-            "71": 1428695552.0,
-            "72": 1428695552.0,
-            "73": 1428695552.0,
-            "74": 1428695552.0,
-            "75": 1428695552.0,
-            "76": 1428695552.0,
-            "77": 1428695552.0,
-            "78": 1428695552.0,
-            "79": 1428695552.0,
-            "80": 1428695552.0,
-            "81": 1428695552.0,
-            "82": 1428695552.0,
-            "83": 1428695552.0,
-            "84": 1428695552.0,
-            "85": 1428695552.0,
-            "86": 1428695552.0,
-            "87": 1428695552.0,
-            "88": 1428695552.0,
-            "89": 1428695552.0,
-            "90": 1428695552.0,
-            "91": 1428695552.0,
-            "92": 1428695552.0,
-            "93": 1428695552.0,
-            "94": 1428695552.0,
-            "95": 1428695552.0,
-            "96": 1428695552.0,
-            "97": 1428695552.0,
-            "98": 1428695552.0,
-            "99": 1428695552.0,
-            "100": 1428695552.0
+            "1": 1246525952.0,
+            "2": 1426598400.0,
+            "3": 1426598400.0,
+            "4": 1426598400.0,
+            "5": 1426598400.0,
+            "6": 1426598400.0,
+            "7": 1426598400.0,
+            "8": 1426598400.0,
+            "9": 1426598400.0,
+            "10": 1426598400.0,
+            "11": 1426598400.0,
+            "12": 1426598400.0,
+            "13": 1426598400.0,
+            "14": 1426598400.0,
+            "15": 1426598400.0,
+            "16": 1426598400.0,
+            "17": 1426598400.0,
+            "18": 1426598400.0,
+            "19": 1426598400.0,
+            "20": 1426598400.0,
+            "21": 1426598400.0,
+            "22": 1426598400.0,
+            "23": 1426598400.0,
+            "24": 1426598400.0,
+            "25": 1426598400.0,
+            "26": 1426598400.0,
+            "27": 1426598400.0,
+            "28": 1426598400.0,
+            "29": 1426598400.0,
+            "30": 1426598400.0,
+            "31": 1426598400.0,
+            "32": 1426598400.0,
+            "33": 1426598400.0,
+            "34": 1426598400.0,
+            "35": 1426598400.0,
+            "36": 1426598400.0,
+            "37": 1426598400.0,
+            "38": 1426598400.0,
+            "39": 1426598400.0,
+            "40": 1426598400.0,
+            "41": 1426598400.0,
+            "42": 1426598400.0,
+            "43": 1426598400.0,
+            "44": 1426598400.0,
+            "45": 1426598400.0,
+            "46": 1426598400.0,
+            "47": 1426598400.0,
+            "48": 1426598400.0,
+            "49": 1426598400.0,
+            "50": 1426598400.0,
+            "51": 1426598400.0,
+            "52": 1426598400.0,
+            "53": 1426598400.0,
+            "54": 1426598400.0,
+            "55": 1426598400.0,
+            "56": 1426598400.0,
+            "57": 1426598400.0,
+            "58": 1426598400.0,
+            "59": 1426598400.0,
+            "60": 1426598400.0,
+            "61": 1426598400.0,
+            "62": 1426598400.0,
+            "63": 1426598400.0,
+            "64": 1426598400.0,
+            "65": 1426598400.0,
+            "66": 1426598400.0,
+            "67": 1426598400.0,
+            "68": 1426598400.0,
+            "69": 1426598400.0,
+            "70": 1426598400.0,
+            "71": 1426598400.0,
+            "72": 1426598400.0,
+            "73": 1426598400.0,
+            "74": 1426598400.0,
+            "75": 1426598400.0,
+            "76": 1426598400.0,
+            "77": 1426598400.0,
+            "78": 1426598400.0,
+            "79": 1426598400.0,
+            "80": 1426598400.0,
+            "81": 1426598400.0,
+            "82": 1426598400.0,
+            "83": 1426598400.0,
+            "84": 1426598400.0,
+            "85": 1426598400.0,
+            "86": 1426598400.0,
+            "87": 1426598400.0,
+            "88": 1426598400.0,
+            "89": 1426598400.0,
+            "90": 1426598400.0,
+            "91": 1426598400.0,
+            "92": 1426598400.0,
+            "93": 1426598400.0,
+            "94": 1426598400.0,
+            "95": 1426598400.0,
+            "96": 1426598400.0,
+            "97": 1426598400.0,
+            "98": 1426598400.0,
+            "99": 1426598400.0,
+            "100": 1426598400.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.77129,
-            "2": 0.18805,
-            "3": 0.15486,
-            "4": 0.15531,
-            "5": 0.15342,
-            "6": 0.15402,
-            "7": 0.15787,
-            "8": 0.15837,
-            "9": 0.15422,
-            "10": 0.1531,
-            "11": 0.1531,
-            "12": 0.1521,
-            "13": 0.15206,
-            "14": 0.15281,
-            "15": 0.15025,
-            "16": 0.15321,
-            "17": 0.15383,
-            "18": 0.15265,
-            "19": 0.15535,
-            "20": 0.15414,
-            "21": 0.15275,
-            "22": 0.152,
-            "23": 0.15456,
-            "24": 0.15209,
-            "25": 0.15358,
-            "26": 0.15228,
-            "27": 0.15217,
-            "28": 0.15204,
-            "29": 0.1526,
-            "30": 0.15259,
-            "31": 0.15237,
-            "32": 0.15885,
-            "33": 0.1577,
-            "34": 0.16029,
-            "35": 0.15618,
-            "36": 0.16006,
-            "37": 0.15686,
-            "38": 0.15897,
-            "39": 0.15985,
-            "40": 0.15818,
-            "41": 0.15734,
-            "42": 0.15623,
-            "43": 0.15982,
-            "44": 0.15844,
-            "45": 0.15965,
-            "46": 0.15995,
-            "47": 0.1576,
-            "48": 0.15787,
-            "49": 0.15857,
-            "50": 0.16598,
-            "51": 0.15831,
-            "52": 0.15281,
-            "53": 0.15278,
-            "54": 0.15155,
-            "55": 0.1544,
-            "56": 0.15102,
-            "57": 0.1505,
-            "58": 0.15177,
-            "59": 0.15275,
-            "60": 0.15179,
-            "61": 0.15138,
-            "62": 0.153,
-            "63": 0.14962,
-            "64": 0.15104,
-            "65": 0.15104,
-            "66": 0.1541,
-            "67": 0.15089,
-            "68": 0.15178,
-            "69": 0.15241,
-            "70": 0.1524,
-            "71": 0.14991,
-            "72": 0.15107,
-            "73": 0.15205,
-            "74": 0.15105,
-            "75": 0.14944,
-            "76": 0.15086,
-            "77": 0.15066,
-            "78": 0.15037,
-            "79": 0.1517,
-            "80": 0.1535,
-            "81": 0.15067,
-            "82": 0.15202,
-            "83": 0.1513,
-            "84": 0.15157,
-            "85": 0.15077,
-            "86": 0.15249,
-            "87": 0.15259,
-            "88": 0.15065,
-            "89": 0.15236,
-            "90": 0.15088,
-            "91": 0.15271,
-            "92": 0.15124,
-            "93": 0.15371,
-            "94": 0.14949,
-            "95": 0.15169,
-            "96": 0.15061,
-            "97": 0.15123,
-            "98": 0.15143,
-            "99": 0.15292,
-            "100": 0.15348
+            "1": 8.71736,
+            "2": 0.17115,
+            "3": 0.15694,
+            "4": 0.13982,
+            "5": 0.13869,
+            "6": 0.1336,
+            "7": 0.13504,
+            "8": 0.13243,
+            "9": 0.13367,
+            "10": 0.13419,
+            "11": 0.13733,
+            "12": 0.13769,
+            "13": 0.13945,
+            "14": 0.13947,
+            "15": 0.1359,
+            "16": 0.13522,
+            "17": 0.13429,
+            "18": 0.13312,
+            "19": 0.13374,
+            "20": 0.13297,
+            "21": 0.13311,
+            "22": 0.13277,
+            "23": 0.13534,
+            "24": 0.13287,
+            "25": 0.12793,
+            "26": 0.12692,
+            "27": 0.1283,
+            "28": 0.13508,
+            "29": 0.13475,
+            "30": 0.1318,
+            "31": 0.13396,
+            "32": 0.13344,
+            "33": 0.13398,
+            "34": 0.13071,
+            "35": 0.1284,
+            "36": 0.12752,
+            "37": 0.12689,
+            "38": 0.12666,
+            "39": 0.12799,
+            "40": 0.12834,
+            "41": 0.12686,
+            "42": 0.12597,
+            "43": 0.1242,
+            "44": 0.12724,
+            "45": 0.12459,
+            "46": 0.12693,
+            "47": 0.12473,
+            "48": 0.12666,
+            "49": 0.12677,
+            "50": 0.12611,
+            "51": 0.14947,
+            "52": 0.12685,
+            "53": 0.12533,
+            "54": 0.12565,
+            "55": 0.12664,
+            "56": 0.12771,
+            "57": 0.12644,
+            "58": 0.12656,
+            "59": 0.12707,
+            "60": 0.12763,
+            "61": 0.12599,
+            "62": 0.12667,
+            "63": 0.12558,
+            "64": 0.12865,
+            "65": 0.12684,
+            "66": 0.12749,
+            "67": 0.12671,
+            "68": 0.12725,
+            "69": 0.1267,
+            "70": 0.1263,
+            "71": 0.12741,
+            "72": 0.12748,
+            "73": 0.1278,
+            "74": 0.12653,
+            "75": 0.12606,
+            "76": 0.12649,
+            "77": 0.12666,
+            "78": 0.12626,
+            "79": 0.12702,
+            "80": 0.12831,
+            "81": 0.12686,
+            "82": 0.12628,
+            "83": 0.12693,
+            "84": 0.12714,
+            "85": 0.12632,
+            "86": 0.12756,
+            "87": 0.12631,
+            "88": 0.12895,
+            "89": 0.1284,
+            "90": 0.12636,
+            "91": 0.12805,
+            "92": 0.12691,
+            "93": 0.12665,
+            "94": 0.12749,
+            "95": 0.12697,
+            "96": 0.12622,
+            "97": 0.12701,
+            "98": 0.12878,
+            "99": 0.12567,
+            "100": 0.12677
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..6ec68f2ce41
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8558,
+            "52": 9.75237,
+            "53": 10.07589,
+            "54": 9.95688,
+            "55": 9.88203,
+            "56": 9.6313,
+            "57": 9.48649,
+            "58": 9.83109,
+            "59": 9.58897,
+            "60": 9.50643,
+            "61": 9.70363,
+            "62": 9.98286,
+            "63": 9.38302,
+            "64": 9.77901,
+            "65": 8.95166,
+            "66": 9.70158,
+            "67": 9.37203,
+            "68": 9.78849,
+            "69": 9.79851,
+            "70": 9.74737,
+            "71": 9.61908,
+            "72": 9.58502,
+            "73": 9.49721,
+            "74": 8.93927,
+            "75": 9.42703,
+            "76": 9.0802,
+            "77": 10.06567,
+            "78": 9.72893,
+            "79": 9.3776,
+            "80": 9.40982,
+            "81": 9.47976,
+            "82": 9.7018,
+            "83": 9.30612,
+            "84": 9.4209,
+            "85": 9.61371,
+            "86": 9.07649,
+            "87": 9.5945,
+            "88": 9.75068,
+            "89": 9.60238,
+            "90": 9.81898,
+            "91": 9.33894,
+            "92": 9.35716,
+            "93": 9.07879,
+            "94": 8.83503,
+            "95": 9.52172,
+            "96": 9.53003,
+            "97": 9.31306,
+            "98": 9.67783,
+            "99": 8.89058,
+            "100": 9.39725
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2613.0,
+            "52": 2538.0,
+            "53": 2792.0,
+            "54": 2801.0,
+            "55": 2216.0,
+            "56": 2858.0,
+            "57": 2381.0,
+            "58": 2854.0,
+            "59": 2787.0,
+            "60": 2457.0,
+            "61": 2941.0,
+            "62": 2543.0,
+            "63": 2408.0,
+            "64": 2968.0,
+            "65": 2472.0,
+            "66": 2977.0,
+            "67": 2839.0,
+            "68": 2775.0,
+            "69": 2832.0,
+            "70": 3057.0,
+            "71": 2909.0,
+            "72": 2421.0,
+            "73": 2982.0,
+            "74": 1922.0,
+            "75": 2474.0,
+            "76": 3059.0,
+            "77": 3177.0,
+            "78": 3067.0,
+            "79": 3052.0,
+            "80": 3338.0,
+            "81": 3644.0,
+            "82": 3234.0,
+            "83": 2798.0,
+            "84": 3196.0,
+            "85": 3324.0,
+            "86": 2855.0,
+            "87": 3820.0,
+            "88": 2962.0,
+            "89": 3379.0,
+            "90": 3096.0,
+            "91": 2857.0,
+            "92": 3077.0,
+            "93": 2693.0,
+            "94": 3312.0,
+            "95": 3399.0,
+            "96": 3378.0,
+            "97": 3030.0,
+            "98": 3619.0,
+            "99": 3160.0,
+            "100": 3128.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 696530432.0,
+            "52": 696530432.0,
+            "53": 696530432.0,
+            "54": 696530432.0,
+            "55": 696530432.0,
+            "56": 696530432.0,
+            "57": 696530432.0,
+            "58": 696530432.0,
+            "59": 696530432.0,
+            "60": 696530432.0,
+            "61": 696530432.0,
+            "62": 696530432.0,
+            "63": 696530432.0,
+            "64": 696530432.0,
+            "65": 696530432.0,
+            "66": 696530432.0,
+            "67": 696530432.0,
+            "68": 696530432.0,
+            "69": 696530432.0,
+            "70": 696530432.0,
+            "71": 696530432.0,
+            "72": 696530432.0,
+            "73": 696530432.0,
+            "74": 696530432.0,
+            "75": 696530432.0,
+            "76": 696530432.0,
+            "77": 696530432.0,
+            "78": 696530432.0,
+            "79": 696530432.0,
+            "80": 696530432.0,
+            "81": 696530432.0,
+            "82": 696530432.0,
+            "83": 696530432.0,
+            "84": 696530432.0,
+            "85": 696530432.0,
+            "86": 696530432.0,
+            "87": 696530432.0,
+            "88": 696530432.0,
+            "89": 696530432.0,
+            "90": 696530432.0,
+            "91": 696530432.0,
+            "92": 696530432.0,
+            "93": 696530432.0,
+            "94": 696530432.0,
+            "95": 696530432.0,
+            "96": 696530432.0,
+            "97": 696530432.0,
+            "98": 696530432.0,
+            "99": 696530432.0,
+            "100": 696530432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1606670848.0,
+            "52": 1606671872.0,
+            "53": 1606671872.0,
+            "54": 1606671872.0,
+            "55": 1606671872.0,
+            "56": 1606671872.0,
+            "57": 1606671872.0,
+            "58": 1606671872.0,
+            "59": 1606671872.0,
+            "60": 1606671872.0,
+            "61": 1606671872.0,
+            "62": 1606671872.0,
+            "63": 1606671872.0,
+            "64": 1606671872.0,
+            "65": 1606671872.0,
+            "66": 1606671872.0,
+            "67": 1606671872.0,
+            "68": 1606671872.0,
+            "69": 1606671872.0,
+            "70": 1606671872.0,
+            "71": 1606671872.0,
+            "72": 1606671872.0,
+            "73": 1606671872.0,
+            "74": 1606671872.0,
+            "75": 1606671872.0,
+            "76": 1606671872.0,
+            "77": 1606671872.0,
+            "78": 1606671872.0,
+            "79": 1606671872.0,
+            "80": 1606671872.0,
+            "81": 1606671872.0,
+            "82": 1606671872.0,
+            "83": 1606671872.0,
+            "84": 1606671872.0,
+            "85": 1606671872.0,
+            "86": 1606671872.0,
+            "87": 1606671872.0,
+            "88": 1606671872.0,
+            "89": 1606671872.0,
+            "90": 1606671872.0,
+            "91": 1606671872.0,
+            "92": 1606671872.0,
+            "93": 1606671872.0,
+            "94": 1606671872.0,
+            "95": 1606671872.0,
+            "96": 1606671872.0,
+            "97": 1606671872.0,
+            "98": 1606671872.0,
+            "99": 1606671872.0,
+            "100": 1606671872.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.58328,
+            "52": 0.16493,
+            "53": 0.12792,
+            "54": 0.12753,
+            "55": 0.1267,
+            "56": 0.12717,
+            "57": 0.12953,
+            "58": 0.12905,
+            "59": 0.12926,
+            "60": 0.12957,
+            "61": 0.1301,
+            "62": 0.13084,
+            "63": 0.1303,
+            "64": 0.12945,
+            "65": 0.12867,
+            "66": 0.12977,
+            "67": 0.12566,
+            "68": 0.12615,
+            "69": 0.12561,
+            "70": 0.12549,
+            "71": 0.12626,
+            "72": 0.12735,
+            "73": 0.12717,
+            "74": 0.12589,
+            "75": 0.12587,
+            "76": 0.12712,
+            "77": 0.12613,
+            "78": 0.12598,
+            "79": 0.12558,
+            "80": 0.1269,
+            "81": 0.1257,
+            "82": 0.12655,
+            "83": 0.12569,
+            "84": 0.12762,
+            "85": 0.12805,
+            "86": 0.12546,
+            "87": 0.12592,
+            "88": 0.12681,
+            "89": 0.12765,
+            "90": 0.12626,
+            "91": 0.12713,
+            "92": 0.12614,
+            "93": 0.12723,
+            "94": 0.1263,
+            "95": 0.12688,
+            "96": 0.1288,
+            "97": 0.12614,
+            "98": 0.12731,
+            "99": 0.12875,
+            "100": 0.1257
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json
index 97ea213f560..297f18f6544 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.25138,
-            "2": 0.23075,
-            "3": 0.20054,
-            "4": 0.20395,
-            "5": 0.20085,
-            "6": 0.19693,
-            "7": 0.1984,
-            "8": 0.19691,
-            "9": 0.19734,
-            "10": 0.19831,
-            "11": 0.19755,
-            "12": 0.20036,
-            "13": 0.19718,
-            "14": 0.20205,
-            "15": 0.19931,
-            "16": 0.1974,
-            "17": 0.19891,
-            "18": 0.19725,
-            "19": 0.19744,
-            "20": 0.19621,
-            "21": 0.19556,
-            "22": 0.1957,
-            "23": 0.19653,
-            "24": 0.19561,
-            "25": 0.19465,
-            "26": 0.19483,
-            "27": 0.19566,
-            "28": 0.19514,
-            "29": 0.19571,
-            "30": 0.19512,
-            "31": 0.19603,
-            "32": 0.19794,
-            "33": 0.19597,
-            "34": 0.20052,
-            "35": 0.19938,
-            "36": 0.19968,
-            "37": 0.19971,
-            "38": 0.19989,
-            "39": 0.20233,
-            "40": 0.20594,
-            "41": 0.20596,
-            "42": 0.20875,
-            "43": 0.20692,
-            "44": 0.20224,
-            "45": 0.20492,
-            "46": 0.20483,
-            "47": 0.20404,
-            "48": 0.20062,
-            "49": 0.1998,
-            "50": 0.19944,
-            "51": 0.21056,
-            "52": 0.20322,
-            "53": 0.20394,
-            "54": 0.20267,
-            "55": 0.20305,
-            "56": 0.20261,
-            "57": 0.20266,
-            "58": 0.2023,
-            "59": 0.20259,
-            "60": 0.20297,
-            "61": 0.20333,
-            "62": 0.20344,
-            "63": 0.20255,
-            "64": 0.20203,
-            "65": 0.20288,
-            "66": 0.20295,
-            "67": 0.20276,
-            "68": 0.20255,
-            "69": 0.20306,
-            "70": 0.20225,
-            "71": 0.20236,
-            "72": 0.20262,
-            "73": 0.2033,
-            "74": 0.20279,
-            "75": 0.20276,
-            "76": 0.20185,
-            "77": 0.20283,
-            "78": 0.20284,
-            "79": 0.2021,
-            "80": 0.20273,
-            "81": 0.20261,
-            "82": 0.20101,
-            "83": 0.20222,
-            "84": 0.20269,
-            "85": 0.20272,
-            "86": 0.20286,
-            "87": 0.20079,
-            "88": 0.20309,
-            "89": 0.2026,
-            "90": 0.20209,
-            "91": 0.20371,
-            "92": 0.20302,
-            "93": 0.20226,
-            "94": 0.20222,
-            "95": 0.20289,
-            "96": 0.20273,
-            "97": 0.20346,
-            "98": 0.20283,
-            "99": 0.20241,
-            "100": 0.20343
+            "1": 3.6904,
+            "2": 0.22693,
+            "3": 0.20753,
+            "4": 0.19573,
+            "5": 0.19555,
+            "6": 0.19486,
+            "7": 0.19003,
+            "8": 0.19034,
+            "9": 0.19191,
+            "10": 0.19136,
+            "11": 0.19037,
+            "12": 0.19056,
+            "13": 0.19097,
+            "14": 0.19327,
+            "15": 0.19082,
+            "16": 0.19093,
+            "17": 0.19066,
+            "18": 0.1904,
+            "19": 0.19061,
+            "20": 0.1898,
+            "21": 0.19121,
+            "22": 0.18935,
+            "23": 0.18948,
+            "24": 0.18927,
+            "25": 0.19032,
+            "26": 0.18931,
+            "27": 0.18951,
+            "28": 0.18931,
+            "29": 0.18948,
+            "30": 0.18971,
+            "31": 0.18911,
+            "32": 0.18996,
+            "33": 0.18993,
+            "34": 0.18929,
+            "35": 0.19088,
+            "36": 0.18935,
+            "37": 0.18973,
+            "38": 0.18947,
+            "39": 0.1909,
+            "40": 0.18932,
+            "41": 0.1896,
+            "42": 0.18785,
+            "43": 0.18782,
+            "44": 0.18772,
+            "45": 0.18893,
+            "46": 0.18908,
+            "47": 0.18889,
+            "48": 0.18856,
+            "49": 0.18904,
+            "50": 0.18893,
+            "51": 0.20447,
+            "52": 0.19453,
+            "53": 0.19364,
+            "54": 0.19383,
+            "55": 0.19491,
+            "56": 0.19307,
+            "57": 0.19375,
+            "58": 0.19268,
+            "59": 0.19288,
+            "60": 0.19183,
+            "61": 0.19216,
+            "62": 0.19218,
+            "63": 0.19491,
+            "64": 0.193,
+            "65": 0.19286,
+            "66": 0.19394,
+            "67": 0.19246,
+            "68": 0.19136,
+            "69": 0.19255,
+            "70": 0.19206,
+            "71": 0.19299,
+            "72": 0.19313,
+            "73": 0.19366,
+            "74": 0.19232,
+            "75": 0.1936,
+            "76": 0.19319,
+            "77": 0.19301,
+            "78": 0.19344,
+            "79": 0.19291,
+            "80": 0.1933,
+            "81": 0.19357,
+            "82": 0.19253,
+            "83": 0.19257,
+            "84": 0.19311,
+            "85": 0.19403,
+            "86": 0.1921,
+            "87": 0.19221,
+            "88": 0.19252,
+            "89": 0.19392,
+            "90": 0.1925,
+            "91": 0.19468,
+            "92": 0.19302,
+            "93": 0.19255,
+            "94": 0.19249,
+            "95": 0.19418,
+            "96": 0.19216,
+            "97": 0.19224,
+            "98": 0.19469,
+            "99": 0.19297,
+            "100": 0.19245
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..e9d40c1a306
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84971,
+            "52": 9.74156,
+            "53": 10.06322,
+            "54": 9.94581,
+            "55": 9.87731,
+            "56": 9.62746,
+            "57": 9.47259,
+            "58": 9.82912,
+            "59": 9.583,
+            "60": 9.49181,
+            "61": 9.69961,
+            "62": 9.98089,
+            "63": 9.37212,
+            "64": 9.7756,
+            "65": 8.9433,
+            "66": 9.69993,
+            "67": 9.36414,
+            "68": 9.78706,
+            "69": 9.78397,
+            "70": 9.72288,
+            "71": 9.60749,
+            "72": 9.58416,
+            "73": 9.49093,
+            "74": 8.94864,
+            "75": 9.41807,
+            "76": 9.08721,
+            "77": 10.06283,
+            "78": 9.729,
+            "79": 9.37091,
+            "80": 9.40033,
+            "81": 9.47754,
+            "82": 9.69121,
+            "83": 9.30762,
+            "84": 9.41252,
+            "85": 9.61132,
+            "86": 9.07621,
+            "87": 9.59459,
+            "88": 9.74768,
+            "89": 9.6068,
+            "90": 9.81078,
+            "91": 9.34441,
+            "92": 9.36535,
+            "93": 9.07743,
+            "94": 8.82975,
+            "95": 9.51676,
+            "96": 9.52546,
+            "97": 9.31031,
+            "98": 9.67812,
+            "99": 8.88848,
+            "100": 9.40128
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2735.0,
+            "52": 2607.0,
+            "53": 2951.0,
+            "54": 2672.0,
+            "55": 2451.0,
+            "56": 2712.0,
+            "57": 2392.0,
+            "58": 2979.0,
+            "59": 2869.0,
+            "60": 2435.0,
+            "61": 2938.0,
+            "62": 2669.0,
+            "63": 2392.0,
+            "64": 2998.0,
+            "65": 2689.0,
+            "66": 3285.0,
+            "67": 2782.0,
+            "68": 2753.0,
+            "69": 2958.0,
+            "70": 3271.0,
+            "71": 3040.0,
+            "72": 2504.0,
+            "73": 3096.0,
+            "74": 1910.0,
+            "75": 2617.0,
+            "76": 3081.0,
+            "77": 3390.0,
+            "78": 3186.0,
+            "79": 3320.0,
+            "80": 3483.0,
+            "81": 3782.0,
+            "82": 3516.0,
+            "83": 2864.0,
+            "84": 3396.0,
+            "85": 3247.0,
+            "86": 2785.0,
+            "87": 3762.0,
+            "88": 3102.0,
+            "89": 3483.0,
+            "90": 3076.0,
+            "91": 2643.0,
+            "92": 3198.0,
+            "93": 2666.0,
+            "94": 3390.0,
+            "95": 3410.0,
+            "96": 3508.0,
+            "97": 3178.0,
+            "98": 3865.0,
+            "99": 3143.0,
+            "100": 3357.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 487621120.0,
+            "52": 487621120.0,
+            "53": 487621120.0,
+            "54": 487621120.0,
+            "55": 487621120.0,
+            "56": 487621120.0,
+            "57": 487621120.0,
+            "58": 487621120.0,
+            "59": 487621120.0,
+            "60": 487621120.0,
+            "61": 487621120.0,
+            "62": 487621120.0,
+            "63": 487621120.0,
+            "64": 487621120.0,
+            "65": 487621120.0,
+            "66": 487621120.0,
+            "67": 487621120.0,
+            "68": 487621120.0,
+            "69": 487621120.0,
+            "70": 487621120.0,
+            "71": 487621120.0,
+            "72": 487621120.0,
+            "73": 487621120.0,
+            "74": 487621120.0,
+            "75": 487621120.0,
+            "76": 487621120.0,
+            "77": 487621120.0,
+            "78": 487621120.0,
+            "79": 487621120.0,
+            "80": 487621120.0,
+            "81": 487621120.0,
+            "82": 487621120.0,
+            "83": 487621120.0,
+            "84": 487621120.0,
+            "85": 487621120.0,
+            "86": 487621120.0,
+            "87": 487621120.0,
+            "88": 487621120.0,
+            "89": 487621120.0,
+            "90": 487621120.0,
+            "91": 487621120.0,
+            "92": 487621120.0,
+            "93": 487621120.0,
+            "94": 487621120.0,
+            "95": 487621120.0,
+            "96": 487621120.0,
+            "97": 487621120.0,
+            "98": 487621120.0,
+            "99": 487621120.0,
+            "100": 487621120.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1412441600.0,
+            "52": 1412442624.0,
+            "53": 1412442624.0,
+            "54": 1412442624.0,
+            "55": 1412442624.0,
+            "56": 1412442624.0,
+            "57": 1412442624.0,
+            "58": 1412442624.0,
+            "59": 1412442624.0,
+            "60": 1412442624.0,
+            "61": 1412442624.0,
+            "62": 1412442624.0,
+            "63": 1412442624.0,
+            "64": 1412442624.0,
+            "65": 1412442624.0,
+            "66": 1412442624.0,
+            "67": 1412442624.0,
+            "68": 1412442624.0,
+            "69": 1412442624.0,
+            "70": 1412442624.0,
+            "71": 1412442624.0,
+            "72": 1412442624.0,
+            "73": 1412442624.0,
+            "74": 1412442624.0,
+            "75": 1412442624.0,
+            "76": 1412442624.0,
+            "77": 1412442624.0,
+            "78": 1412442624.0,
+            "79": 1412442624.0,
+            "80": 1412442624.0,
+            "81": 1412442624.0,
+            "82": 1412442624.0,
+            "83": 1412442624.0,
+            "84": 1412442624.0,
+            "85": 1412442624.0,
+            "86": 1412442624.0,
+            "87": 1412442624.0,
+            "88": 1412442624.0,
+            "89": 1412442624.0,
+            "90": 1412442624.0,
+            "91": 1412442624.0,
+            "92": 1412442624.0,
+            "93": 1412442624.0,
+            "94": 1412442624.0,
+            "95": 1412442624.0,
+            "96": 1412442624.0,
+            "97": 1412442624.0,
+            "98": 1412442624.0,
+            "99": 1412442624.0,
+            "100": 1412442624.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.82592,
+            "52": 0.24571,
+            "53": 0.19937,
+            "54": 0.1942,
+            "55": 0.19469,
+            "56": 0.19438,
+            "57": 0.19525,
+            "58": 0.19539,
+            "59": 0.19507,
+            "60": 0.19574,
+            "61": 0.19507,
+            "62": 0.19522,
+            "63": 0.1942,
+            "64": 0.19521,
+            "65": 0.19461,
+            "66": 0.19519,
+            "67": 0.19508,
+            "68": 0.19346,
+            "69": 0.19457,
+            "70": 0.1935,
+            "71": 0.19426,
+            "72": 0.19396,
+            "73": 0.19419,
+            "74": 0.19399,
+            "75": 0.19449,
+            "76": 0.19338,
+            "77": 0.19376,
+            "78": 0.19428,
+            "79": 0.19399,
+            "80": 0.19356,
+            "81": 0.19404,
+            "82": 0.19431,
+            "83": 0.19348,
+            "84": 0.19448,
+            "85": 0.19466,
+            "86": 0.1934,
+            "87": 0.19394,
+            "88": 0.19435,
+            "89": 0.19356,
+            "90": 0.19446,
+            "91": 0.19388,
+            "92": 0.19324,
+            "93": 0.19462,
+            "94": 0.1939,
+            "95": 0.19479,
+            "96": 0.19331,
+            "97": 0.19382,
+            "98": 0.19427,
+            "99": 0.1943,
+            "100": 0.19433
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..ff9932eb6ca
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86897,
+            "2": 10.88544,
+            "3": 10.86473,
+            "4": 10.86827,
+            "5": 10.87437,
+            "6": 10.89003,
+            "7": 10.87769,
+            "8": 10.86367,
+            "9": 10.88281,
+            "10": 10.84686,
+            "11": 10.87102,
+            "12": 10.87349,
+            "13": 10.8814,
+            "14": 10.88766,
+            "15": 10.83865,
+            "16": 10.8239,
+            "17": 10.80196,
+            "18": 10.81095,
+            "19": 10.82197,
+            "20": 10.71791,
+            "21": 10.68917,
+            "22": 10.57276,
+            "23": 10.70813,
+            "24": 10.59542,
+            "25": 10.55293,
+            "26": 10.61254,
+            "27": 10.6005,
+            "28": 10.56172,
+            "29": 10.58085,
+            "30": 10.35594,
+            "31": 10.11818,
+            "32": 10.44815,
+            "33": 10.45422,
+            "34": 10.21552,
+            "35": 10.26123,
+            "36": 10.20776,
+            "37": 10.3367,
+            "38": 10.17742,
+            "39": 10.39293,
+            "40": 10.06352,
+            "41": 10.13888,
+            "42": 10.2056,
+            "43": 9.82811,
+            "44": 9.94544,
+            "45": 9.82557,
+            "46": 9.80182,
+            "47": 10.14052,
+            "48": 9.84281,
+            "49": 9.52013,
+            "50": 9.88457,
+            "51": 9.8474,
+            "52": 9.74209,
+            "53": 10.05695,
+            "54": 9.95048,
+            "55": 9.88137,
+            "56": 9.61274,
+            "57": 9.46865,
+            "58": 9.82191,
+            "59": 9.57642,
+            "60": 9.49763,
+            "61": 9.6919,
+            "62": 9.98672,
+            "63": 9.37511,
+            "64": 9.76682,
+            "65": 8.94645,
+            "66": 9.70228,
+            "67": 9.36325,
+            "68": 9.78311,
+            "69": 9.79861,
+            "70": 9.73171,
+            "71": 9.62575,
+            "72": 9.58482,
+            "73": 9.48964,
+            "74": 8.92857,
+            "75": 9.40863,
+            "76": 9.07924,
+            "77": 10.05936,
+            "78": 9.72284,
+            "79": 9.37782,
+            "80": 9.40428,
+            "81": 9.48314,
+            "82": 9.70039,
+            "83": 9.31593,
+            "84": 9.41835,
+            "85": 9.61687,
+            "86": 9.07538,
+            "87": 9.59618,
+            "88": 9.75215,
+            "89": 9.60188,
+            "90": 9.82284,
+            "91": 9.34035,
+            "92": 9.35853,
+            "93": 9.08806,
+            "94": 8.83039,
+            "95": 9.5266,
+            "96": 9.53046,
+            "97": 9.30391,
+            "98": 9.67197,
+            "99": 8.89638,
+            "100": 9.40645
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 56.0,
+            "2": 68.0,
+            "3": 65.0,
+            "4": 66.0,
+            "5": 62.0,
+            "6": 64.0,
+            "7": 61.0,
+            "8": 81.0,
+            "9": 60.0,
+            "10": 59.0,
+            "11": 73.0,
+            "12": 60.0,
+            "13": 62.0,
+            "14": 72.0,
+            "15": 56.0,
+            "16": 70.0,
+            "17": 67.0,
+            "18": 62.0,
+            "19": 61.0,
+            "20": 64.0,
+            "21": 73.0,
+            "22": 69.0,
+            "23": 77.0,
+            "24": 53.0,
+            "25": 63.0,
+            "26": 66.0,
+            "27": 66.0,
+            "28": 77.0,
+            "29": 70.0,
+            "30": 56.0,
+            "31": 61.0,
+            "32": 64.0,
+            "33": 77.0,
+            "34": 68.0,
+            "35": 78.0,
+            "36": 74.0,
+            "37": 79.0,
+            "38": 60.0,
+            "39": 73.0,
+            "40": 73.0,
+            "41": 78.0,
+            "42": 76.0,
+            "43": 82.0,
+            "44": 87.0,
+            "45": 83.0,
+            "46": 72.0,
+            "47": 70.0,
+            "48": 64.0,
+            "49": 82.0,
+            "50": 88.0,
+            "51": 71.0,
+            "52": 53.0,
+            "53": 77.0,
+            "54": 92.0,
+            "55": 67.0,
+            "56": 92.0,
+            "57": 86.0,
+            "58": 79.0,
+            "59": 74.0,
+            "60": 70.0,
+            "61": 98.0,
+            "62": 71.0,
+            "63": 64.0,
+            "64": 83.0,
+            "65": 89.0,
+            "66": 86.0,
+            "67": 62.0,
+            "68": 67.0,
+            "69": 57.0,
+            "70": 90.0,
+            "71": 66.0,
+            "72": 61.0,
+            "73": 76.0,
+            "74": 52.0,
+            "75": 63.0,
+            "76": 78.0,
+            "77": 78.0,
+            "78": 87.0,
+            "79": 83.0,
+            "80": 77.0,
+            "81": 102.0,
+            "82": 74.0,
+            "83": 67.0,
+            "84": 68.0,
+            "85": 96.0,
+            "86": 89.0,
+            "87": 92.0,
+            "88": 81.0,
+            "89": 47.0,
+            "90": 76.0,
+            "91": 70.0,
+            "92": 82.0,
+            "93": 58.0,
+            "94": 76.0,
+            "95": 71.0,
+            "96": 92.0,
+            "97": 67.0,
+            "98": 88.0,
+            "99": 66.0,
+            "100": 69.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 545423872.0,
+            "2": 545423872.0,
+            "3": 545423872.0,
+            "4": 545423872.0,
+            "5": 545423872.0,
+            "6": 545423872.0,
+            "7": 545423872.0,
+            "8": 545423872.0,
+            "9": 545423872.0,
+            "10": 545423872.0,
+            "11": 545423872.0,
+            "12": 545423872.0,
+            "13": 545423872.0,
+            "14": 545423872.0,
+            "15": 545423872.0,
+            "16": 545423872.0,
+            "17": 545423872.0,
+            "18": 545423872.0,
+            "19": 545423872.0,
+            "20": 545423872.0,
+            "21": 545423872.0,
+            "22": 545423872.0,
+            "23": 545423872.0,
+            "24": 545423872.0,
+            "25": 545423872.0,
+            "26": 545423872.0,
+            "27": 545423872.0,
+            "28": 545423872.0,
+            "29": 545423872.0,
+            "30": 545423872.0,
+            "31": 545423872.0,
+            "32": 545423872.0,
+            "33": 545423872.0,
+            "34": 545423872.0,
+            "35": 545423872.0,
+            "36": 545423872.0,
+            "37": 545423872.0,
+            "38": 545423872.0,
+            "39": 545423872.0,
+            "40": 545423872.0,
+            "41": 545423872.0,
+            "42": 545423872.0,
+            "43": 545423872.0,
+            "44": 545423872.0,
+            "45": 545423872.0,
+            "46": 545423872.0,
+            "47": 545423872.0,
+            "48": 545423872.0,
+            "49": 545423872.0,
+            "50": 545423872.0,
+            "51": 545423872.0,
+            "52": 545423872.0,
+            "53": 545423872.0,
+            "54": 545423872.0,
+            "55": 545423872.0,
+            "56": 545423872.0,
+            "57": 545423872.0,
+            "58": 545423872.0,
+            "59": 545423872.0,
+            "60": 545423872.0,
+            "61": 545423872.0,
+            "62": 545423872.0,
+            "63": 545423872.0,
+            "64": 545423872.0,
+            "65": 545423872.0,
+            "66": 545423872.0,
+            "67": 545423872.0,
+            "68": 545423872.0,
+            "69": 545423872.0,
+            "70": 545423872.0,
+            "71": 545423872.0,
+            "72": 545423872.0,
+            "73": 545423872.0,
+            "74": 545423872.0,
+            "75": 545423872.0,
+            "76": 545423872.0,
+            "77": 545423872.0,
+            "78": 545423872.0,
+            "79": 545423872.0,
+            "80": 545423872.0,
+            "81": 545423872.0,
+            "82": 545423872.0,
+            "83": 545423872.0,
+            "84": 545423872.0,
+            "85": 545423872.0,
+            "86": 545423872.0,
+            "87": 545423872.0,
+            "88": 545423872.0,
+            "89": 545423872.0,
+            "90": 545423872.0,
+            "91": 545423872.0,
+            "92": 545423872.0,
+            "93": 545423872.0,
+            "94": 545423872.0,
+            "95": 545423872.0,
+            "96": 545423872.0,
+            "97": 545423872.0,
+            "98": 545423872.0,
+            "99": 545423872.0,
+            "100": 545423872.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1726380544.0,
+            "2": 1906452992.0,
+            "3": 1906452992.0,
+            "4": 1906452992.0,
+            "5": 1906452992.0,
+            "6": 1906452992.0,
+            "7": 1906452992.0,
+            "8": 1906452992.0,
+            "9": 1906452992.0,
+            "10": 1906452992.0,
+            "11": 1906452992.0,
+            "12": 1906452992.0,
+            "13": 1906452992.0,
+            "14": 1906452992.0,
+            "15": 1906452992.0,
+            "16": 1906452992.0,
+            "17": 1906452992.0,
+            "18": 1906452992.0,
+            "19": 1906452992.0,
+            "20": 1906452992.0,
+            "21": 1906452992.0,
+            "22": 1906452992.0,
+            "23": 1906452992.0,
+            "24": 1906452992.0,
+            "25": 1906452992.0,
+            "26": 1906452992.0,
+            "27": 1906452992.0,
+            "28": 1906452992.0,
+            "29": 1906452992.0,
+            "30": 1906452992.0,
+            "31": 1906452992.0,
+            "32": 1906452992.0,
+            "33": 1906452992.0,
+            "34": 1906452992.0,
+            "35": 1906452992.0,
+            "36": 1906452992.0,
+            "37": 1906452992.0,
+            "38": 1906452992.0,
+            "39": 1906452992.0,
+            "40": 1906452992.0,
+            "41": 1906452992.0,
+            "42": 1906452992.0,
+            "43": 1906452992.0,
+            "44": 1906452992.0,
+            "45": 1906452992.0,
+            "46": 1906452992.0,
+            "47": 1906452992.0,
+            "48": 1906452992.0,
+            "49": 1906452992.0,
+            "50": 1906452992.0,
+            "51": 1906452992.0,
+            "52": 1906452992.0,
+            "53": 1906452992.0,
+            "54": 1906452992.0,
+            "55": 1906452992.0,
+            "56": 1906452992.0,
+            "57": 1906452992.0,
+            "58": 1906452992.0,
+            "59": 1906452992.0,
+            "60": 1906452992.0,
+            "61": 1906452992.0,
+            "62": 1906452992.0,
+            "63": 1906452992.0,
+            "64": 1906452992.0,
+            "65": 1906452992.0,
+            "66": 1906452992.0,
+            "67": 1906452992.0,
+            "68": 1906452992.0,
+            "69": 1906452992.0,
+            "70": 1906452992.0,
+            "71": 1906452992.0,
+            "72": 1906452992.0,
+            "73": 1906452992.0,
+            "74": 1906452992.0,
+            "75": 1906452992.0,
+            "76": 1906452992.0,
+            "77": 1906452992.0,
+            "78": 1906452992.0,
+            "79": 1906452992.0,
+            "80": 1906452992.0,
+            "81": 1906452992.0,
+            "82": 1906452992.0,
+            "83": 1906452992.0,
+            "84": 1906452992.0,
+            "85": 1906452992.0,
+            "86": 1906452992.0,
+            "87": 1906452992.0,
+            "88": 1906452992.0,
+            "89": 1906452992.0,
+            "90": 1906452992.0,
+            "91": 1906452992.0,
+            "92": 1906452992.0,
+            "93": 1906452992.0,
+            "94": 1906452992.0,
+            "95": 1906452992.0,
+            "96": 1906452992.0,
+            "97": 1906452992.0,
+            "98": 1906452992.0,
+            "99": 1906452992.0,
+            "100": 1906452992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.61029,
+            "3": 0.19483,
+            "4": 0.18062,
+            "5": 0.18259,
+            "6": 0.18221,
+            "7": 0.18364,
+            "8": 0.18219,
+            "9": 0.18127,
+            "10": 0.18188,
+            "11": 0.18181,
+            "12": 0.1831,
+            "13": 0.18466,
+            "14": 0.18253,
+            "15": 0.18081,
+            "16": 0.18182,
+            "17": 0.18252,
+            "18": 0.18383,
+            "19": 0.18271,
+            "20": 0.18292,
+            "21": 0.18157,
+            "22": 0.18322,
+            "23": 0.18379,
+            "24": 0.18403,
+            "25": 0.18149,
+            "26": 0.18219,
+            "27": 0.18279,
+            "28": 0.18152,
+            "29": 0.18195,
+            "30": 0.18138,
+            "31": 0.18264,
+            "32": 0.18261,
+            "33": 0.18358,
+            "34": 0.18161,
+            "35": 0.18269,
+            "36": 0.18132,
+            "37": 0.18252,
+            "38": 0.18362,
+            "39": 0.18411,
+            "40": 0.18423,
+            "41": 0.1835,
+            "42": 0.18364,
+            "43": 0.18373,
+            "44": 0.18306,
+            "45": 0.18336,
+            "46": 0.18278,
+            "47": 0.18327,
+            "48": 0.18332,
+            "49": 0.18271,
+            "50": 0.18318,
+            "51": 0.19806,
+            "52": 0.1799,
+            "53": 0.18156,
+            "54": 0.1807,
+            "55": 0.17954,
+            "56": 0.18001,
+            "57": 0.18039,
+            "58": 0.181,
+            "59": 0.18041,
+            "60": 0.17989,
+            "61": 0.18137,
+            "62": 0.18121,
+            "63": 0.18088,
+            "64": 0.1801,
+            "65": 0.18077,
+            "66": 0.18006,
+            "67": 0.18112,
+            "68": 0.18089,
+            "69": 0.18124,
+            "70": 0.17966,
+            "71": 0.18084,
+            "72": 0.18137,
+            "73": 0.18132,
+            "74": 0.18078,
+            "75": 0.18129,
+            "76": 0.18079,
+            "77": 0.1816,
+            "78": 0.18161,
+            "79": 0.18227,
+            "80": 0.18173,
+            "81": 0.18145,
+            "82": 0.18143,
+            "83": 0.18128,
+            "84": 0.18207,
+            "85": 0.18121,
+            "86": 0.18062,
+            "87": 0.17981,
+            "88": 0.18098,
+            "89": 0.18014,
+            "90": 0.17967,
+            "91": 0.18153,
+            "92": 0.18175,
+            "93": 0.18107,
+            "94": 0.17803,
+            "95": 0.17796,
+            "96": 0.17757,
+            "97": 0.17815,
+            "98": 0.17979,
+            "99": 0.18056,
+            "100": 0.18044
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..5a291771fe9
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8474,
+            "52": 9.74209,
+            "53": 10.05695,
+            "54": 9.95048,
+            "55": 9.88137,
+            "56": 9.61274,
+            "57": 9.46865,
+            "58": 9.82191,
+            "59": 9.57642,
+            "60": 9.49763,
+            "61": 9.6919,
+            "62": 9.98672,
+            "63": 9.37511,
+            "64": 9.76682,
+            "65": 8.94645,
+            "66": 9.70228,
+            "67": 9.36325,
+            "68": 9.78311,
+            "69": 9.79861,
+            "70": 9.73171,
+            "71": 9.62575,
+            "72": 9.58482,
+            "73": 9.48964,
+            "74": 8.92857,
+            "75": 9.40863,
+            "76": 9.07924,
+            "77": 10.05936,
+            "78": 9.72284,
+            "79": 9.37782,
+            "80": 9.40428,
+            "81": 9.48314,
+            "82": 9.70039,
+            "83": 9.31593,
+            "84": 9.41835,
+            "85": 9.61687,
+            "86": 9.07538,
+            "87": 9.59618,
+            "88": 9.75215,
+            "89": 9.60188,
+            "90": 9.82284,
+            "91": 9.34035,
+            "92": 9.35853,
+            "93": 9.08806,
+            "94": 8.83039,
+            "95": 9.5266,
+            "96": 9.53046,
+            "97": 9.30391,
+            "98": 9.67197,
+            "99": 8.89638,
+            "100": 9.40645
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 71.0,
+            "52": 53.0,
+            "53": 77.0,
+            "54": 92.0,
+            "55": 67.0,
+            "56": 92.0,
+            "57": 86.0,
+            "58": 79.0,
+            "59": 74.0,
+            "60": 70.0,
+            "61": 98.0,
+            "62": 71.0,
+            "63": 64.0,
+            "64": 83.0,
+            "65": 89.0,
+            "66": 86.0,
+            "67": 62.0,
+            "68": 67.0,
+            "69": 57.0,
+            "70": 90.0,
+            "71": 66.0,
+            "72": 61.0,
+            "73": 76.0,
+            "74": 52.0,
+            "75": 63.0,
+            "76": 78.0,
+            "77": 78.0,
+            "78": 87.0,
+            "79": 83.0,
+            "80": 77.0,
+            "81": 102.0,
+            "82": 74.0,
+            "83": 67.0,
+            "84": 68.0,
+            "85": 96.0,
+            "86": 89.0,
+            "87": 92.0,
+            "88": 81.0,
+            "89": 47.0,
+            "90": 76.0,
+            "91": 70.0,
+            "92": 82.0,
+            "93": 58.0,
+            "94": 76.0,
+            "95": 71.0,
+            "96": 92.0,
+            "97": 67.0,
+            "98": 88.0,
+            "99": 66.0,
+            "100": 69.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 544900608.0,
+            "52": 544900608.0,
+            "53": 544900608.0,
+            "54": 544900608.0,
+            "55": 544900608.0,
+            "56": 544900608.0,
+            "57": 544900608.0,
+            "58": 544900608.0,
+            "59": 544900608.0,
+            "60": 544900608.0,
+            "61": 544900608.0,
+            "62": 544900608.0,
+            "63": 544900608.0,
+            "64": 544900608.0,
+            "65": 544900608.0,
+            "66": 544900608.0,
+            "67": 544900608.0,
+            "68": 544900608.0,
+            "69": 544900608.0,
+            "70": 544900608.0,
+            "71": 544900608.0,
+            "72": 544900608.0,
+            "73": 544900608.0,
+            "74": 544900608.0,
+            "75": 544900608.0,
+            "76": 544900608.0,
+            "77": 544900608.0,
+            "78": 544900608.0,
+            "79": 544900608.0,
+            "80": 544900608.0,
+            "81": 544900608.0,
+            "82": 544900608.0,
+            "83": 544900608.0,
+            "84": 544900608.0,
+            "85": 544900608.0,
+            "86": 544900608.0,
+            "87": 544900608.0,
+            "88": 544900608.0,
+            "89": 544900608.0,
+            "90": 544900608.0,
+            "91": 544900608.0,
+            "92": 544900608.0,
+            "93": 544900608.0,
+            "94": 544900608.0,
+            "95": 544900608.0,
+            "96": 544900608.0,
+            "97": 544900608.0,
+            "98": 544900608.0,
+            "99": 544900608.0,
+            "100": 544900608.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1909074432.0,
+            "52": 1909075456.0,
+            "53": 1909075456.0,
+            "54": 1909075456.0,
+            "55": 1909075456.0,
+            "56": 1909075456.0,
+            "57": 1909075456.0,
+            "58": 1909075456.0,
+            "59": 1909075456.0,
+            "60": 1909075456.0,
+            "61": 1909075456.0,
+            "62": 1909075456.0,
+            "63": 1909075456.0,
+            "64": 1909075456.0,
+            "65": 1909075456.0,
+            "66": 1909075456.0,
+            "67": 1909075456.0,
+            "68": 1909075456.0,
+            "69": 1909075456.0,
+            "70": 1909075456.0,
+            "71": 1909075456.0,
+            "72": 1909075456.0,
+            "73": 1909075456.0,
+            "74": 1909075456.0,
+            "75": 1909075456.0,
+            "76": 1909075456.0,
+            "77": 1909075456.0,
+            "78": 1909075456.0,
+            "79": 1909075456.0,
+            "80": 1909075456.0,
+            "81": 1909075456.0,
+            "82": 1909075456.0,
+            "83": 1909075456.0,
+            "84": 1909075456.0,
+            "85": 1909075456.0,
+            "86": 1909075456.0,
+            "87": 1909075456.0,
+            "88": 1909075456.0,
+            "89": 1909075456.0,
+            "90": 1909075456.0,
+            "91": 1909075456.0,
+            "92": 1909075456.0,
+            "93": 1909075456.0,
+            "94": 1909075456.0,
+            "95": 1909075456.0,
+            "96": 1909075456.0,
+            "97": 1909075456.0,
+            "98": 1909075456.0,
+            "99": 1909075456.0,
+            "100": 1909075456.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.55843,
+            "53": 0.19589,
+            "54": 0.18471,
+            "55": 0.18349,
+            "56": 0.18435,
+            "57": 0.18418,
+            "58": 0.18539,
+            "59": 0.18565,
+            "60": 0.18678,
+            "61": 0.18442,
+            "62": 0.18318,
+            "63": 0.18314,
+            "64": 0.18315,
+            "65": 0.18517,
+            "66": 0.18507,
+            "67": 0.18538,
+            "68": 0.18295,
+            "69": 0.18459,
+            "70": 0.18533,
+            "71": 0.1857,
+            "72": 0.1861,
+            "73": 0.18507,
+            "74": 0.18454,
+            "75": 0.18322,
+            "76": 0.18308,
+            "77": 0.18278,
+            "78": 0.18413,
+            "79": 0.18228,
+            "80": 0.18383,
+            "81": 0.18491,
+            "82": 0.18405,
+            "83": 0.18374,
+            "84": 0.18428,
+            "85": 0.18358,
+            "86": 0.18433,
+            "87": 0.18542,
+            "88": 0.18544,
+            "89": 0.1847,
+            "90": 0.18536,
+            "91": 0.18553,
+            "92": 0.18571,
+            "93": 0.18611,
+            "94": 0.18506,
+            "95": 0.18462,
+            "96": 0.18458,
+            "97": 0.18459,
+            "98": 0.18525,
+            "99": 0.18232,
+            "100": 0.18404
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json
index c1aaf21cf26..f1a58884e99 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 546472448.0,
-            "2": 546472448.0,
-            "3": 546472448.0,
-            "4": 546472448.0,
-            "5": 546472448.0,
-            "6": 546472448.0,
-            "7": 546472448.0,
-            "8": 546472448.0,
-            "9": 546472448.0,
-            "10": 546472448.0,
-            "11": 546472448.0,
-            "12": 546472448.0,
-            "13": 546472448.0,
-            "14": 546472448.0,
-            "15": 546472448.0,
-            "16": 546472448.0,
-            "17": 546472448.0,
-            "18": 546472448.0,
-            "19": 546472448.0,
-            "20": 546472448.0,
-            "21": 546472448.0,
-            "22": 546472448.0,
-            "23": 546472448.0,
-            "24": 546472448.0,
-            "25": 546472448.0,
-            "26": 546472448.0,
-            "27": 546472448.0,
-            "28": 546472448.0,
-            "29": 546472448.0,
-            "30": 546472448.0,
-            "31": 546472448.0,
-            "32": 546472448.0,
-            "33": 546472448.0,
-            "34": 546472448.0,
-            "35": 546472448.0,
-            "36": 546472448.0,
-            "37": 546472448.0,
-            "38": 546472448.0,
-            "39": 546472448.0,
-            "40": 546472448.0,
-            "41": 546472448.0,
-            "42": 546472448.0,
-            "43": 546472448.0,
-            "44": 546472448.0,
-            "45": 546472448.0,
-            "46": 546472448.0,
-            "47": 546472448.0,
-            "48": 546472448.0,
-            "49": 546472448.0,
-            "50": 546472448.0,
-            "51": 546472448.0,
-            "52": 546472448.0,
-            "53": 546472448.0,
-            "54": 546472448.0,
-            "55": 546472448.0,
-            "56": 546472448.0,
-            "57": 546472448.0,
-            "58": 546472448.0,
-            "59": 546472448.0,
-            "60": 546472448.0,
-            "61": 546472448.0,
-            "62": 546472448.0,
-            "63": 546472448.0,
-            "64": 546472448.0,
-            "65": 546472448.0,
-            "66": 546472448.0,
-            "67": 546472448.0,
-            "68": 546472448.0,
-            "69": 546472448.0,
-            "70": 546472448.0,
-            "71": 546472448.0,
-            "72": 546472448.0,
-            "73": 546472448.0,
-            "74": 546472448.0,
-            "75": 546472448.0,
-            "76": 546472448.0,
-            "77": 546472448.0,
-            "78": 546472448.0,
-            "79": 546472448.0,
-            "80": 546472448.0,
-            "81": 546472448.0,
-            "82": 546472448.0,
-            "83": 546472448.0,
-            "84": 546472448.0,
-            "85": 546472448.0,
-            "86": 546472448.0,
-            "87": 546472448.0,
-            "88": 546472448.0,
-            "89": 546472448.0,
-            "90": 546472448.0,
-            "91": 546472448.0,
-            "92": 546472448.0,
-            "93": 546472448.0,
-            "94": 546472448.0,
-            "95": 546472448.0,
-            "96": 546472448.0,
-            "97": 546472448.0,
-            "98": 546472448.0,
-            "99": 546472448.0,
-            "100": 546472448.0
+            "1": 545423872.0,
+            "2": 545423872.0,
+            "3": 545423872.0,
+            "4": 545423872.0,
+            "5": 545423872.0,
+            "6": 545423872.0,
+            "7": 545423872.0,
+            "8": 545423872.0,
+            "9": 545423872.0,
+            "10": 545423872.0,
+            "11": 545423872.0,
+            "12": 545423872.0,
+            "13": 545423872.0,
+            "14": 545423872.0,
+            "15": 545423872.0,
+            "16": 545423872.0,
+            "17": 545423872.0,
+            "18": 545423872.0,
+            "19": 545423872.0,
+            "20": 545423872.0,
+            "21": 545423872.0,
+            "22": 545423872.0,
+            "23": 545423872.0,
+            "24": 545423872.0,
+            "25": 545423872.0,
+            "26": 545423872.0,
+            "27": 545423872.0,
+            "28": 545423872.0,
+            "29": 545423872.0,
+            "30": 545423872.0,
+            "31": 545423872.0,
+            "32": 545423872.0,
+            "33": 545423872.0,
+            "34": 545423872.0,
+            "35": 545423872.0,
+            "36": 545423872.0,
+            "37": 545423872.0,
+            "38": 545423872.0,
+            "39": 545423872.0,
+            "40": 545423872.0,
+            "41": 545423872.0,
+            "42": 545423872.0,
+            "43": 545423872.0,
+            "44": 545423872.0,
+            "45": 545423872.0,
+            "46": 545423872.0,
+            "47": 545423872.0,
+            "48": 545423872.0,
+            "49": 545423872.0,
+            "50": 545423872.0,
+            "51": 545423872.0,
+            "52": 545423872.0,
+            "53": 545423872.0,
+            "54": 545423872.0,
+            "55": 545423872.0,
+            "56": 545423872.0,
+            "57": 545423872.0,
+            "58": 545423872.0,
+            "59": 545423872.0,
+            "60": 545423872.0,
+            "61": 545423872.0,
+            "62": 545423872.0,
+            "63": 545423872.0,
+            "64": 545423872.0,
+            "65": 545423872.0,
+            "66": 545423872.0,
+            "67": 545423872.0,
+            "68": 545423872.0,
+            "69": 545423872.0,
+            "70": 545423872.0,
+            "71": 545423872.0,
+            "72": 545423872.0,
+            "73": 545423872.0,
+            "74": 545423872.0,
+            "75": 545423872.0,
+            "76": 545423872.0,
+            "77": 545423872.0,
+            "78": 545423872.0,
+            "79": 545423872.0,
+            "80": 545423872.0,
+            "81": 545423872.0,
+            "82": 545423872.0,
+            "83": 545423872.0,
+            "84": 545423872.0,
+            "85": 545423872.0,
+            "86": 545423872.0,
+            "87": 545423872.0,
+            "88": 545423872.0,
+            "89": 545423872.0,
+            "90": 545423872.0,
+            "91": 545423872.0,
+            "92": 545423872.0,
+            "93": 545423872.0,
+            "94": 545423872.0,
+            "95": 545423872.0,
+            "96": 545423872.0,
+            "97": 545423872.0,
+            "98": 545423872.0,
+            "99": 545423872.0,
+            "100": 545423872.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1713796608.0,
-            "2": 1895967232.0,
-            "3": 1895967232.0,
-            "4": 1895967232.0,
-            "5": 1895967232.0,
-            "6": 1895967232.0,
-            "7": 1895967232.0,
-            "8": 1895967232.0,
-            "9": 1895967232.0,
-            "10": 1895967232.0,
-            "11": 1895967232.0,
-            "12": 1895967232.0,
-            "13": 1895967232.0,
-            "14": 1895967232.0,
-            "15": 1895967232.0,
-            "16": 1895967232.0,
-            "17": 1895967232.0,
-            "18": 1895967232.0,
-            "19": 1895967232.0,
-            "20": 1895967232.0,
-            "21": 1895967232.0,
-            "22": 1895967232.0,
-            "23": 1895967232.0,
-            "24": 1895967232.0,
-            "25": 1895967232.0,
-            "26": 1895967232.0,
-            "27": 1895967232.0,
-            "28": 1895967232.0,
-            "29": 1895967232.0,
-            "30": 1895967232.0,
-            "31": 1895967232.0,
-            "32": 1895967232.0,
-            "33": 1895967232.0,
-            "34": 1895967232.0,
-            "35": 1895967232.0,
-            "36": 1895967232.0,
-            "37": 1895967232.0,
-            "38": 1895967232.0,
-            "39": 1895967232.0,
-            "40": 1895967232.0,
-            "41": 1895967232.0,
-            "42": 1895967232.0,
-            "43": 1895967232.0,
-            "44": 1895967232.0,
-            "45": 1895967232.0,
-            "46": 1895967232.0,
-            "47": 1895967232.0,
-            "48": 1895967232.0,
-            "49": 1895967232.0,
-            "50": 1895967232.0,
-            "51": 1895967232.0,
-            "52": 1895967232.0,
-            "53": 1895967232.0,
-            "54": 1895967232.0,
-            "55": 1895967232.0,
-            "56": 1895967232.0,
-            "57": 1895967232.0,
-            "58": 1895967232.0,
-            "59": 1895967232.0,
-            "60": 1895967232.0,
-            "61": 1895967232.0,
-            "62": 1895967232.0,
-            "63": 1895967232.0,
-            "64": 1895967232.0,
-            "65": 1895967232.0,
-            "66": 1895967232.0,
-            "67": 1895967232.0,
-            "68": 1895967232.0,
-            "69": 1895967232.0,
-            "70": 1895967232.0,
-            "71": 1895967232.0,
-            "72": 1895967232.0,
-            "73": 1895967232.0,
-            "74": 1895967232.0,
-            "75": 1895967232.0,
-            "76": 1895967232.0,
-            "77": 1895967232.0,
-            "78": 1895967232.0,
-            "79": 1895967232.0,
-            "80": 1895967232.0,
-            "81": 1895967232.0,
-            "82": 1895967232.0,
-            "83": 1895967232.0,
-            "84": 1895967232.0,
-            "85": 1895967232.0,
-            "86": 1895967232.0,
-            "87": 1895967232.0,
-            "88": 1895967232.0,
-            "89": 1895967232.0,
-            "90": 1895967232.0,
-            "91": 1895967232.0,
-            "92": 1895967232.0,
-            "93": 1895967232.0,
-            "94": 1895967232.0,
-            "95": 1895967232.0,
-            "96": 1895967232.0,
-            "97": 1895967232.0,
-            "98": 1895967232.0,
-            "99": 1895967232.0,
-            "100": 1895967232.0
+            "1": 1713797632.0,
+            "2": 1893870080.0,
+            "3": 1893870080.0,
+            "4": 1893870080.0,
+            "5": 1893870080.0,
+            "6": 1893870080.0,
+            "7": 1893870080.0,
+            "8": 1893870080.0,
+            "9": 1893870080.0,
+            "10": 1893870080.0,
+            "11": 1893870080.0,
+            "12": 1893870080.0,
+            "13": 1893870080.0,
+            "14": 1893870080.0,
+            "15": 1893870080.0,
+            "16": 1893870080.0,
+            "17": 1893870080.0,
+            "18": 1893870080.0,
+            "19": 1893870080.0,
+            "20": 1893870080.0,
+            "21": 1893870080.0,
+            "22": 1893870080.0,
+            "23": 1893870080.0,
+            "24": 1893870080.0,
+            "25": 1893870080.0,
+            "26": 1893870080.0,
+            "27": 1893870080.0,
+            "28": 1893870080.0,
+            "29": 1893870080.0,
+            "30": 1893870080.0,
+            "31": 1893870080.0,
+            "32": 1893870080.0,
+            "33": 1893870080.0,
+            "34": 1893870080.0,
+            "35": 1893870080.0,
+            "36": 1893870080.0,
+            "37": 1893870080.0,
+            "38": 1893870080.0,
+            "39": 1893870080.0,
+            "40": 1893870080.0,
+            "41": 1893870080.0,
+            "42": 1893870080.0,
+            "43": 1893870080.0,
+            "44": 1893870080.0,
+            "45": 1893870080.0,
+            "46": 1893870080.0,
+            "47": 1893870080.0,
+            "48": 1893870080.0,
+            "49": 1893870080.0,
+            "50": 1893870080.0,
+            "51": 1893870080.0,
+            "52": 1893870080.0,
+            "53": 1893870080.0,
+            "54": 1893870080.0,
+            "55": 1893870080.0,
+            "56": 1893870080.0,
+            "57": 1893870080.0,
+            "58": 1893870080.0,
+            "59": 1893870080.0,
+            "60": 1893870080.0,
+            "61": 1893870080.0,
+            "62": 1893870080.0,
+            "63": 1893870080.0,
+            "64": 1893870080.0,
+            "65": 1893870080.0,
+            "66": 1893870080.0,
+            "67": 1893870080.0,
+            "68": 1893870080.0,
+            "69": 1893870080.0,
+            "70": 1893870080.0,
+            "71": 1893870080.0,
+            "72": 1893870080.0,
+            "73": 1893870080.0,
+            "74": 1893870080.0,
+            "75": 1893870080.0,
+            "76": 1893870080.0,
+            "77": 1893870080.0,
+            "78": 1893870080.0,
+            "79": 1893870080.0,
+            "80": 1893870080.0,
+            "81": 1893870080.0,
+            "82": 1893870080.0,
+            "83": 1893870080.0,
+            "84": 1893870080.0,
+            "85": 1893870080.0,
+            "86": 1893870080.0,
+            "87": 1893870080.0,
+            "88": 1893870080.0,
+            "89": 1893870080.0,
+            "90": 1893870080.0,
+            "91": 1893870080.0,
+            "92": 1893870080.0,
+            "93": 1893870080.0,
+            "94": 1893870080.0,
+            "95": 1893870080.0,
+            "96": 1893870080.0,
+            "97": 1893870080.0,
+            "98": 1893870080.0,
+            "99": 1893870080.0,
+            "100": 1893870080.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.81196,
-            "2": 0.17008,
-            "3": 0.15523,
-            "4": 0.15249,
-            "5": 0.15434,
-            "6": 0.15515,
-            "7": 0.15378,
-            "8": 0.1528,
-            "9": 0.15287,
-            "10": 0.15479,
-            "11": 0.15442,
-            "12": 0.15952,
-            "13": 0.15843,
-            "14": 0.15559,
-            "15": 0.15333,
-            "16": 0.15363,
-            "17": 0.15594,
-            "18": 0.153,
-            "19": 0.15542,
-            "20": 0.15304,
-            "21": 0.15492,
-            "22": 0.15277,
-            "23": 0.15803,
-            "24": 0.1545,
-            "25": 0.15639,
-            "26": 0.15419,
-            "27": 0.15381,
-            "28": 0.15423,
-            "29": 0.15354,
-            "30": 0.1554,
-            "31": 0.15389,
-            "32": 0.15608,
-            "33": 0.15361,
-            "34": 0.15437,
-            "35": 0.15233,
-            "36": 0.15499,
-            "37": 0.15114,
-            "38": 0.15259,
-            "39": 0.15269,
-            "40": 0.1516,
-            "41": 0.15052,
-            "42": 0.15122,
-            "43": 0.15389,
-            "44": 0.15261,
-            "45": 0.15376,
-            "46": 0.15091,
-            "47": 0.15197,
-            "48": 0.15131,
-            "49": 0.15083,
-            "50": 0.152,
-            "51": 0.15723,
-            "52": 0.15481,
-            "53": 0.15087,
-            "54": 0.15175,
-            "55": 0.15331,
-            "56": 0.15504,
-            "57": 0.15471,
-            "58": 0.1549,
-            "59": 0.15621,
-            "60": 0.1533,
-            "61": 0.15499,
-            "62": 0.15222,
-            "63": 0.15091,
-            "64": 0.1535,
-            "65": 0.15463,
-            "66": 0.15169,
-            "67": 0.15591,
-            "68": 0.15173,
-            "69": 0.1509,
-            "70": 0.15063,
-            "71": 0.15755,
-            "72": 0.1545,
-            "73": 0.15374,
-            "74": 0.15306,
-            "75": 0.15223,
-            "76": 0.15203,
-            "77": 0.15194,
-            "78": 0.15284,
-            "79": 0.15345,
-            "80": 0.15138,
-            "81": 0.15298,
-            "82": 0.15115,
-            "83": 0.15281,
-            "84": 0.1544,
-            "85": 0.15277,
-            "86": 0.15368,
-            "87": 0.15373,
-            "88": 0.15359,
-            "89": 0.15205,
-            "90": 0.1535,
-            "91": 0.15459,
-            "92": 0.15406,
-            "93": 0.15133,
-            "94": 0.1533,
-            "95": 0.15198,
-            "96": 0.15195,
-            "97": 0.1533,
-            "98": 0.15406,
-            "99": 0.1528,
-            "100": 0.15371
+            "1": 8.61654,
+            "2": 0.16646,
+            "3": 0.14939,
+            "4": 0.12694,
+            "5": 0.1251,
+            "6": 0.12545,
+            "7": 0.12533,
+            "8": 0.1271,
+            "9": 0.1261,
+            "10": 0.12491,
+            "11": 0.12876,
+            "12": 0.13422,
+            "13": 0.13211,
+            "14": 0.12395,
+            "15": 0.12563,
+            "16": 0.12703,
+            "17": 0.1243,
+            "18": 0.12651,
+            "19": 0.12452,
+            "20": 0.12538,
+            "21": 0.1244,
+            "22": 0.12395,
+            "23": 0.12379,
+            "24": 0.12455,
+            "25": 0.12457,
+            "26": 0.12444,
+            "27": 0.12397,
+            "28": 0.125,
+            "29": 0.13321,
+            "30": 0.13442,
+            "31": 0.13329,
+            "32": 0.12696,
+            "33": 0.12493,
+            "34": 0.12398,
+            "35": 0.12918,
+            "36": 0.13252,
+            "37": 0.13148,
+            "38": 0.13338,
+            "39": 0.13083,
+            "40": 0.13113,
+            "41": 0.13061,
+            "42": 0.1295,
+            "43": 0.1305,
+            "44": 0.13132,
+            "45": 0.13148,
+            "46": 0.13113,
+            "47": 0.13116,
+            "48": 0.12551,
+            "49": 0.12779,
+            "50": 0.12989,
+            "51": 0.1367,
+            "52": 0.13188,
+            "53": 0.13008,
+            "54": 0.13122,
+            "55": 0.12979,
+            "56": 0.12943,
+            "57": 0.13002,
+            "58": 0.12923,
+            "59": 0.12984,
+            "60": 0.13209,
+            "61": 0.13094,
+            "62": 0.13083,
+            "63": 0.12826,
+            "64": 0.13104,
+            "65": 0.1292,
+            "66": 0.12985,
+            "67": 0.1295,
+            "68": 0.12398,
+            "69": 0.12509,
+            "70": 0.12208,
+            "71": 0.12371,
+            "72": 0.12256,
+            "73": 0.12266,
+            "74": 0.12476,
+            "75": 0.12866,
+            "76": 0.12272,
+            "77": 0.12403,
+            "78": 0.12307,
+            "79": 0.12209,
+            "80": 0.12352,
+            "81": 0.12155,
+            "82": 0.12329,
+            "83": 0.12201,
+            "84": 0.12239,
+            "85": 0.12414,
+            "86": 0.12372,
+            "87": 0.12357,
+            "88": 0.12705,
+            "89": 0.1249,
+            "90": 0.12289,
+            "91": 0.12523,
+            "92": 0.51175,
+            "93": 0.12454,
+            "94": 0.12634,
+            "95": 0.12226,
+            "96": 0.12255,
+            "97": 0.12357,
+            "98": 0.12405,
+            "99": 0.12419,
+            "100": 0.12384
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..fdc5f0244ea
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85581,
+            "52": 9.75235,
+            "53": 10.07582,
+            "54": 9.95687,
+            "55": 9.882,
+            "56": 9.63137,
+            "57": 9.48647,
+            "58": 9.83111,
+            "59": 9.58896,
+            "60": 9.50647,
+            "61": 9.70361,
+            "62": 9.98283,
+            "63": 9.38302,
+            "64": 9.77906,
+            "65": 8.95171,
+            "66": 9.70162,
+            "67": 9.372,
+            "68": 9.78849,
+            "69": 9.79851,
+            "70": 9.74738,
+            "71": 9.61908,
+            "72": 9.58496,
+            "73": 9.49723,
+            "74": 8.93927,
+            "75": 9.42706,
+            "76": 9.08018,
+            "77": 10.06566,
+            "78": 9.72889,
+            "79": 9.37757,
+            "80": 9.40987,
+            "81": 9.47974,
+            "82": 9.70177,
+            "83": 9.30611,
+            "84": 9.42088,
+            "85": 9.61376,
+            "86": 9.07651,
+            "87": 9.59452,
+            "88": 9.75067,
+            "89": 9.60239,
+            "90": 9.81895,
+            "91": 9.33895,
+            "92": 9.35712,
+            "93": 9.07879,
+            "94": 8.83504,
+            "95": 9.52168,
+            "96": 9.53002,
+            "97": 9.31306,
+            "98": 9.67783,
+            "99": 8.89053,
+            "100": 9.39725
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 70.0,
+            "52": 81.0,
+            "53": 95.0,
+            "54": 101.0,
+            "55": 58.0,
+            "56": 90.0,
+            "57": 83.0,
+            "58": 90.0,
+            "59": 79.0,
+            "60": 84.0,
+            "61": 92.0,
+            "62": 102.0,
+            "63": 78.0,
+            "64": 73.0,
+            "65": 81.0,
+            "66": 88.0,
+            "67": 54.0,
+            "68": 57.0,
+            "69": 72.0,
+            "70": 88.0,
+            "71": 82.0,
+            "72": 64.0,
+            "73": 78.0,
+            "74": 76.0,
+            "75": 70.0,
+            "76": 78.0,
+            "77": 67.0,
+            "78": 86.0,
+            "79": 76.0,
+            "80": 90.0,
+            "81": 92.0,
+            "82": 72.0,
+            "83": 61.0,
+            "84": 65.0,
+            "85": 89.0,
+            "86": 73.0,
+            "87": 89.0,
+            "88": 63.0,
+            "89": 83.0,
+            "90": 72.0,
+            "91": 55.0,
+            "92": 63.0,
+            "93": 47.0,
+            "94": 74.0,
+            "95": 70.0,
+            "96": 73.0,
+            "97": 80.0,
+            "98": 76.0,
+            "99": 68.0,
+            "100": 75.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 725497344.0,
+            "52": 725497344.0,
+            "53": 725497344.0,
+            "54": 725497344.0,
+            "55": 725497344.0,
+            "56": 725497344.0,
+            "57": 725497344.0,
+            "58": 725497344.0,
+            "59": 725497344.0,
+            "60": 725497344.0,
+            "61": 725497344.0,
+            "62": 725497344.0,
+            "63": 725497344.0,
+            "64": 725497344.0,
+            "65": 725497344.0,
+            "66": 725497344.0,
+            "67": 725497344.0,
+            "68": 725497344.0,
+            "69": 725497344.0,
+            "70": 725497344.0,
+            "71": 725497344.0,
+            "72": 725497344.0,
+            "73": 725497344.0,
+            "74": 725497344.0,
+            "75": 725497344.0,
+            "76": 725497344.0,
+            "77": 725497344.0,
+            "78": 725497344.0,
+            "79": 725497344.0,
+            "80": 725497344.0,
+            "81": 725497344.0,
+            "82": 725497344.0,
+            "83": 725497344.0,
+            "84": 725497344.0,
+            "85": 725497344.0,
+            "86": 725497344.0,
+            "87": 725497344.0,
+            "88": 725497344.0,
+            "89": 725497344.0,
+            "90": 725497344.0,
+            "91": 725497344.0,
+            "92": 725497344.0,
+            "93": 725497344.0,
+            "94": 725497344.0,
+            "95": 725497344.0,
+            "96": 725497344.0,
+            "97": 725497344.0,
+            "98": 725497344.0,
+            "99": 725497344.0,
+            "100": 725497344.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2074991104.0,
+            "52": 2074992128.0,
+            "53": 2074992128.0,
+            "54": 2074992128.0,
+            "55": 2074992128.0,
+            "56": 2074992128.0,
+            "57": 2074992128.0,
+            "58": 2074992128.0,
+            "59": 2074992128.0,
+            "60": 2074992128.0,
+            "61": 2074992128.0,
+            "62": 2074992128.0,
+            "63": 2074992128.0,
+            "64": 2074992128.0,
+            "65": 2074992128.0,
+            "66": 2074992128.0,
+            "67": 2074992128.0,
+            "68": 2074992128.0,
+            "69": 2074992128.0,
+            "70": 2074992128.0,
+            "71": 2074992128.0,
+            "72": 2074992128.0,
+            "73": 2074992128.0,
+            "74": 2074992128.0,
+            "75": 2074992128.0,
+            "76": 2074992128.0,
+            "77": 2074992128.0,
+            "78": 2074992128.0,
+            "79": 2074992128.0,
+            "80": 2074992128.0,
+            "81": 2074992128.0,
+            "82": 2074992128.0,
+            "83": 2074992128.0,
+            "84": 2074992128.0,
+            "85": 2074992128.0,
+            "86": 2074992128.0,
+            "87": 2074992128.0,
+            "88": 2074992128.0,
+            "89": 2074992128.0,
+            "90": 2074992128.0,
+            "91": 2074992128.0,
+            "92": 2074992128.0,
+            "93": 2074992128.0,
+            "94": 2074992128.0,
+            "95": 2074992128.0,
+            "96": 2074992128.0,
+            "97": 2074992128.0,
+            "98": 2074992128.0,
+            "99": 2074992128.0,
+            "100": 2074992128.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.08401,
+            "52": 0.17107,
+            "53": 0.13414,
+            "54": 0.13296,
+            "55": 0.12627,
+            "56": 0.12542,
+            "57": 0.12564,
+            "58": 0.12468,
+            "59": 0.1245,
+            "60": 0.12595,
+            "61": 0.1248,
+            "62": 0.12424,
+            "63": 0.1263,
+            "64": 0.12611,
+            "65": 0.12448,
+            "66": 0.1268,
+            "67": 0.12509,
+            "68": 0.12463,
+            "69": 0.12587,
+            "70": 0.12403,
+            "71": 0.12788,
+            "72": 0.12581,
+            "73": 0.12599,
+            "74": 0.12429,
+            "75": 0.12845,
+            "76": 0.12517,
+            "77": 0.12546,
+            "78": 0.1257,
+            "79": 0.12526,
+            "80": 0.12602,
+            "81": 0.13237,
+            "82": 0.12452,
+            "83": 0.13316,
+            "84": 0.13434,
+            "85": 0.1319,
+            "86": 0.13456,
+            "87": 0.13266,
+            "88": 0.13492,
+            "89": 0.1345,
+            "90": 0.13063,
+            "91": 0.13342,
+            "92": 0.13139,
+            "93": 0.13378,
+            "94": 0.13513,
+            "95": 0.13196,
+            "96": 0.13396,
+            "97": 0.12722,
+            "98": 0.12492,
+            "99": 0.12599,
+            "100": 0.12635
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json
index 96cf765384a..c89ea54f89f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.10882,
-            "2": 0.24563,
-            "3": 0.21507,
-            "4": 0.21225,
-            "5": 0.21165,
-            "6": 0.21127,
-            "7": 0.21406,
-            "8": 0.21402,
-            "9": 0.21175,
-            "10": 0.19518,
-            "11": 0.19565,
-            "12": 0.19461,
-            "13": 0.19428,
-            "14": 0.19385,
-            "15": 0.19329,
-            "16": 0.19311,
-            "17": 0.19391,
-            "18": 0.19383,
-            "19": 0.19364,
-            "20": 0.19408,
-            "21": 0.19327,
-            "22": 0.19729,
-            "23": 0.19599,
-            "24": 0.19601,
-            "25": 0.1965,
-            "26": 0.19683,
-            "27": 0.19626,
-            "28": 0.19667,
-            "29": 0.1989,
-            "30": 0.19644,
-            "31": 0.19728,
-            "32": 0.19614,
-            "33": 0.1973,
-            "34": 0.1971,
-            "35": 0.19674,
-            "36": 0.19628,
-            "37": 0.19578,
-            "38": 0.19629,
-            "39": 0.19673,
-            "40": 0.19712,
-            "41": 0.19593,
-            "42": 0.1969,
-            "43": 0.19639,
-            "44": 0.20378,
-            "45": 0.19737,
-            "46": 0.19738,
-            "47": 0.19532,
-            "48": 0.19579,
-            "49": 0.19617,
-            "50": 0.19695,
-            "51": 0.20318,
-            "52": 0.19428,
-            "53": 0.19415,
-            "54": 0.19663,
-            "55": 0.19266,
-            "56": 0.19426,
-            "57": 0.19455,
-            "58": 0.19473,
-            "59": 0.19413,
-            "60": 0.19467,
-            "61": 0.19511,
-            "62": 0.19475,
-            "63": 0.19464,
-            "64": 0.19452,
-            "65": 0.19445,
-            "66": 0.19395,
-            "67": 0.19423,
-            "68": 0.19431,
-            "69": 0.19512,
-            "70": 0.1941,
-            "71": 0.19453,
-            "72": 0.19467,
-            "73": 0.19615,
-            "74": 0.19355,
-            "75": 0.19419,
-            "76": 0.19407,
-            "77": 0.19455,
-            "78": 0.19511,
-            "79": 0.19498,
-            "80": 0.19577,
-            "81": 0.19399,
-            "82": 0.19362,
-            "83": 0.19425,
-            "84": 0.19418,
-            "85": 0.19432,
-            "86": 0.20057,
-            "87": 0.19522,
-            "88": 0.19447,
-            "89": 0.19472,
-            "90": 0.19377,
-            "91": 0.19433,
-            "92": 0.19432,
-            "93": 0.19456,
-            "94": 0.19394,
-            "95": 0.19417,
-            "96": 0.19476,
-            "97": 0.19423,
-            "98": 0.19401,
-            "99": 0.19403,
-            "100": 0.19364
+            "1": 4.2285,
+            "2": 0.2225,
+            "3": 0.20464,
+            "4": 0.18763,
+            "5": 0.18448,
+            "6": 0.18488,
+            "7": 0.1868,
+            "8": 0.18507,
+            "9": 0.18639,
+            "10": 0.18525,
+            "11": 0.185,
+            "12": 0.1892,
+            "13": 0.18964,
+            "14": 0.18674,
+            "15": 0.18659,
+            "16": 0.18641,
+            "17": 0.1862,
+            "18": 0.18503,
+            "19": 0.18484,
+            "20": 0.18494,
+            "21": 0.18464,
+            "22": 0.18544,
+            "23": 0.18496,
+            "24": 0.18402,
+            "25": 0.18506,
+            "26": 0.18392,
+            "27": 0.18476,
+            "28": 0.18508,
+            "29": 0.18537,
+            "30": 0.18566,
+            "31": 0.18562,
+            "32": 0.1846,
+            "33": 0.18516,
+            "34": 0.1847,
+            "35": 0.18539,
+            "36": 0.18474,
+            "37": 0.18449,
+            "38": 0.18492,
+            "39": 0.18406,
+            "40": 0.1848,
+            "41": 0.18488,
+            "42": 0.18457,
+            "43": 0.18477,
+            "44": 0.18339,
+            "45": 0.18392,
+            "46": 0.18291,
+            "47": 0.1845,
+            "48": 0.18355,
+            "49": 0.18321,
+            "50": 0.1836,
+            "51": 0.19691,
+            "52": 0.18837,
+            "53": 0.18901,
+            "54": 0.18882,
+            "55": 0.18866,
+            "56": 0.18799,
+            "57": 0.18879,
+            "58": 0.18717,
+            "59": 0.18786,
+            "60": 0.18816,
+            "61": 0.18754,
+            "62": 0.18765,
+            "63": 0.18797,
+            "64": 0.18736,
+            "65": 0.19017,
+            "66": 0.18805,
+            "67": 0.18724,
+            "68": 0.18718,
+            "69": 0.18876,
+            "70": 0.18803,
+            "71": 0.18742,
+            "72": 0.1906,
+            "73": 0.18971,
+            "74": 0.58261,
+            "75": 0.18725,
+            "76": 0.1877,
+            "77": 0.18725,
+            "78": 0.18828,
+            "79": 0.1888,
+            "80": 0.1867,
+            "81": 0.18809,
+            "82": 0.18881,
+            "83": 0.18773,
+            "84": 0.18814,
+            "85": 0.18863,
+            "86": 0.18809,
+            "87": 0.18728,
+            "88": 0.18747,
+            "89": 0.18808,
+            "90": 0.18818,
+            "91": 0.18719,
+            "92": 0.18753,
+            "93": 0.18888,
+            "94": 0.18938,
+            "95": 0.18815,
+            "96": 0.18883,
+            "97": 0.18854,
+            "98": 0.19027,
+            "99": 0.18914,
+            "100": 0.18784
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..b44b9766e91
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84971,
+            "52": 9.74156,
+            "53": 10.06324,
+            "54": 9.94584,
+            "55": 9.87735,
+            "56": 9.62744,
+            "57": 9.4726,
+            "58": 9.82907,
+            "59": 9.58298,
+            "60": 9.49182,
+            "61": 9.6996,
+            "62": 9.98091,
+            "63": 9.37212,
+            "64": 9.77558,
+            "65": 8.94327,
+            "66": 9.69991,
+            "67": 9.3641,
+            "68": 9.78706,
+            "69": 9.78396,
+            "70": 9.72291,
+            "71": 9.60749,
+            "72": 9.58417,
+            "73": 9.4909,
+            "74": 8.94863,
+            "75": 9.41807,
+            "76": 9.08721,
+            "77": 10.06284,
+            "78": 9.729,
+            "79": 9.37087,
+            "80": 9.40029,
+            "81": 9.47753,
+            "82": 9.69123,
+            "83": 9.30764,
+            "84": 9.4125,
+            "85": 9.61132,
+            "86": 9.07624,
+            "87": 9.59459,
+            "88": 9.74769,
+            "89": 9.60678,
+            "90": 9.81079,
+            "91": 9.34443,
+            "92": 9.36534,
+            "93": 9.07741,
+            "94": 8.82974,
+            "95": 9.51676,
+            "96": 9.52545,
+            "97": 9.31031,
+            "98": 9.67811,
+            "99": 8.88848,
+            "100": 9.40128
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 77.0,
+            "52": 100.0,
+            "53": 71.0,
+            "54": 67.0,
+            "55": 70.0,
+            "56": 83.0,
+            "57": 74.0,
+            "58": 106.0,
+            "59": 72.0,
+            "60": 98.0,
+            "61": 67.0,
+            "62": 73.0,
+            "63": 77.0,
+            "64": 94.0,
+            "65": 82.0,
+            "66": 87.0,
+            "67": 65.0,
+            "68": 78.0,
+            "69": 59.0,
+            "70": 102.0,
+            "71": 82.0,
+            "72": 60.0,
+            "73": 96.0,
+            "74": 61.0,
+            "75": 64.0,
+            "76": 70.0,
+            "77": 84.0,
+            "78": 93.0,
+            "79": 102.0,
+            "80": 71.0,
+            "81": 88.0,
+            "82": 85.0,
+            "83": 75.0,
+            "84": 69.0,
+            "85": 84.0,
+            "86": 66.0,
+            "87": 93.0,
+            "88": 96.0,
+            "89": 73.0,
+            "90": 77.0,
+            "91": 66.0,
+            "92": 86.0,
+            "93": 63.0,
+            "94": 60.0,
+            "95": 70.0,
+            "96": 65.0,
+            "97": 67.0,
+            "98": 96.0,
+            "99": 54.0,
+            "100": 77.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 516588032.0,
+            "52": 516588032.0,
+            "53": 516588032.0,
+            "54": 516588032.0,
+            "55": 516588032.0,
+            "56": 516588032.0,
+            "57": 516588032.0,
+            "58": 516588032.0,
+            "59": 516588032.0,
+            "60": 516588032.0,
+            "61": 516588032.0,
+            "62": 516588032.0,
+            "63": 516588032.0,
+            "64": 516588032.0,
+            "65": 516588032.0,
+            "66": 516588032.0,
+            "67": 516588032.0,
+            "68": 516588032.0,
+            "69": 516588032.0,
+            "70": 516588032.0,
+            "71": 516588032.0,
+            "72": 516588032.0,
+            "73": 516588032.0,
+            "74": 516588032.0,
+            "75": 516588032.0,
+            "76": 516588032.0,
+            "77": 516588032.0,
+            "78": 516588032.0,
+            "79": 516588032.0,
+            "80": 516588032.0,
+            "81": 516588032.0,
+            "82": 516588032.0,
+            "83": 516588032.0,
+            "84": 516588032.0,
+            "85": 516588032.0,
+            "86": 516588032.0,
+            "87": 516588032.0,
+            "88": 516588032.0,
+            "89": 516588032.0,
+            "90": 516588032.0,
+            "91": 516588032.0,
+            "92": 516588032.0,
+            "93": 516588032.0,
+            "94": 516588032.0,
+            "95": 516588032.0,
+            "96": 516588032.0,
+            "97": 516588032.0,
+            "98": 516588032.0,
+            "99": 516588032.0,
+            "100": 516588032.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1879713280.0,
+            "52": 1879714304.0,
+            "53": 1879714304.0,
+            "54": 1879714304.0,
+            "55": 1879714304.0,
+            "56": 1879714304.0,
+            "57": 1879714304.0,
+            "58": 1879714304.0,
+            "59": 1879714304.0,
+            "60": 1879714304.0,
+            "61": 1879714304.0,
+            "62": 1879714304.0,
+            "63": 1879714304.0,
+            "64": 1879714304.0,
+            "65": 1879714304.0,
+            "66": 1879714304.0,
+            "67": 1879714304.0,
+            "68": 1879714304.0,
+            "69": 1879714304.0,
+            "70": 1879714304.0,
+            "71": 1879714304.0,
+            "72": 1879714304.0,
+            "73": 1879714304.0,
+            "74": 1879714304.0,
+            "75": 1879714304.0,
+            "76": 1879714304.0,
+            "77": 1879714304.0,
+            "78": 1879714304.0,
+            "79": 1879714304.0,
+            "80": 1879714304.0,
+            "81": 1879714304.0,
+            "82": 1879714304.0,
+            "83": 1879714304.0,
+            "84": 1879714304.0,
+            "85": 1879714304.0,
+            "86": 1879714304.0,
+            "87": 1879714304.0,
+            "88": 1879714304.0,
+            "89": 1879714304.0,
+            "90": 1879714304.0,
+            "91": 1879714304.0,
+            "92": 1879714304.0,
+            "93": 1879714304.0,
+            "94": 1879714304.0,
+            "95": 1879714304.0,
+            "96": 1879714304.0,
+            "97": 1879714304.0,
+            "98": 1879714304.0,
+            "99": 1879714304.0,
+            "100": 1879714304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.66406,
+            "52": 0.2158,
+            "53": 0.20019,
+            "54": 0.19602,
+            "55": 0.19005,
+            "56": 0.19244,
+            "57": 0.19305,
+            "58": 0.19241,
+            "59": 0.19133,
+            "60": 0.19108,
+            "61": 0.19083,
+            "62": 0.19044,
+            "63": 0.19122,
+            "64": 0.19085,
+            "65": 0.19237,
+            "66": 0.19162,
+            "67": 0.19273,
+            "68": 0.19427,
+            "69": 0.19391,
+            "70": 0.19124,
+            "71": 0.19263,
+            "72": 0.19156,
+            "73": 0.19165,
+            "74": 0.1912,
+            "75": 0.1916,
+            "76": 0.19244,
+            "77": 0.19754,
+            "78": 0.19743,
+            "79": 0.19729,
+            "80": 0.19745,
+            "81": 0.19719,
+            "82": 0.19703,
+            "83": 0.19876,
+            "84": 0.19042,
+            "85": 0.18981,
+            "86": 0.18931,
+            "87": 0.19021,
+            "88": 0.18916,
+            "89": 0.19085,
+            "90": 0.19016,
+            "91": 0.19021,
+            "92": 0.19141,
+            "93": 0.19167,
+            "94": 0.19089,
+            "95": 0.19116,
+            "96": 0.18907,
+            "97": 0.19161,
+            "98": 0.19075,
+            "99": 0.1909,
+            "100": 0.19241
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml
index 7e2261ae518..487227e5abd 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml
@@ -46,7 +46,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..3cfdeafee58
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86897,
+            "2": 10.88544,
+            "3": 10.86473,
+            "4": 10.86826,
+            "5": 10.87436,
+            "6": 10.89005,
+            "7": 10.87769,
+            "8": 10.86364,
+            "9": 10.88282,
+            "10": 10.84687,
+            "11": 10.87102,
+            "12": 10.87345,
+            "13": 10.8814,
+            "14": 10.8877,
+            "15": 10.83869,
+            "16": 10.8239,
+            "17": 10.80197,
+            "18": 10.81094,
+            "19": 10.82192,
+            "20": 10.71791,
+            "21": 10.68914,
+            "22": 10.57271,
+            "23": 10.7081,
+            "24": 10.59543,
+            "25": 10.55292,
+            "26": 10.61257,
+            "27": 10.60051,
+            "28": 10.56173,
+            "29": 10.58089,
+            "30": 10.35595,
+            "31": 10.1182,
+            "32": 10.44815,
+            "33": 10.4542,
+            "34": 10.21553,
+            "35": 10.26124,
+            "36": 10.20776,
+            "37": 10.33673,
+            "38": 10.17741,
+            "39": 10.39297,
+            "40": 10.06349,
+            "41": 10.13887,
+            "42": 10.2056,
+            "43": 9.82809,
+            "44": 9.94547,
+            "45": 9.82561,
+            "46": 9.80186,
+            "47": 10.14049,
+            "48": 9.84276,
+            "49": 9.52016,
+            "50": 9.88454,
+            "51": 9.84743,
+            "52": 9.74209,
+            "53": 10.05697,
+            "54": 9.9505,
+            "55": 9.88145,
+            "56": 9.61274,
+            "57": 9.4687,
+            "58": 9.82193,
+            "59": 9.57642,
+            "60": 9.49762,
+            "61": 9.69189,
+            "62": 9.9867,
+            "63": 9.37512,
+            "64": 9.76679,
+            "65": 8.94648,
+            "66": 9.7023,
+            "67": 9.36326,
+            "68": 9.7831,
+            "69": 9.7986,
+            "70": 9.7317,
+            "71": 9.62571,
+            "72": 9.58488,
+            "73": 9.48967,
+            "74": 8.9286,
+            "75": 9.40862,
+            "76": 9.07925,
+            "77": 10.0594,
+            "78": 9.72288,
+            "79": 9.37784,
+            "80": 9.40429,
+            "81": 9.48309,
+            "82": 9.7004,
+            "83": 9.31595,
+            "84": 9.41838,
+            "85": 9.61685,
+            "86": 9.07533,
+            "87": 9.59616,
+            "88": 9.75215,
+            "89": 9.60184,
+            "90": 9.82281,
+            "91": 9.34037,
+            "92": 9.35854,
+            "93": 9.08805,
+            "94": 8.83037,
+            "95": 9.5266,
+            "96": 9.53049,
+            "97": 9.30389,
+            "98": 9.67196,
+            "99": 8.89637,
+            "100": 9.40644
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1657.0,
+            "3": 1580.0,
+            "4": 1839.0,
+            "5": 1862.0,
+            "6": 1724.0,
+            "7": 1714.0,
+            "8": 1670.0,
+            "9": 1762.0,
+            "10": 1358.0,
+            "11": 1734.0,
+            "12": 1682.0,
+            "13": 1761.0,
+            "14": 1731.0,
+            "15": 1788.0,
+            "16": 1801.0,
+            "17": 1866.0,
+            "18": 1636.0,
+            "19": 1709.0,
+            "20": 1607.0,
+            "21": 1821.0,
+            "22": 1666.0,
+            "23": 1991.0,
+            "24": 1585.0,
+            "25": 1587.0,
+            "26": 1631.0,
+            "27": 1714.0,
+            "28": 1966.0,
+            "29": 1997.0,
+            "30": 1851.0,
+            "31": 1581.0,
+            "32": 1864.0,
+            "33": 2107.0,
+            "34": 1846.0,
+            "35": 1982.0,
+            "36": 1904.0,
+            "37": 2373.0,
+            "38": 2172.0,
+            "39": 2343.0,
+            "40": 2149.0,
+            "41": 2331.0,
+            "42": 2199.0,
+            "43": 1914.0,
+            "44": 2065.0,
+            "45": 2081.0,
+            "46": 2352.0,
+            "47": 2497.0,
+            "48": 2303.0,
+            "49": 2346.0,
+            "50": 2411.0,
+            "51": 2491.0,
+            "52": 2552.0,
+            "53": 2980.0,
+            "54": 2680.0,
+            "55": 2274.0,
+            "56": 2734.0,
+            "57": 2319.0,
+            "58": 2907.0,
+            "59": 2886.0,
+            "60": 2566.0,
+            "61": 2855.0,
+            "62": 2704.0,
+            "63": 2370.0,
+            "64": 2998.0,
+            "65": 2563.0,
+            "66": 2868.0,
+            "67": 2762.0,
+            "68": 2739.0,
+            "69": 2730.0,
+            "70": 3156.0,
+            "71": 2803.0,
+            "72": 2506.0,
+            "73": 2896.0,
+            "74": 1937.0,
+            "75": 2450.0,
+            "76": 2794.0,
+            "77": 3047.0,
+            "78": 3104.0,
+            "79": 3069.0,
+            "80": 3286.0,
+            "81": 3543.0,
+            "82": 3192.0,
+            "83": 2614.0,
+            "84": 3273.0,
+            "85": 3111.0,
+            "86": 2680.0,
+            "87": 3654.0,
+            "88": 3117.0,
+            "89": 3351.0,
+            "90": 3086.0,
+            "91": 2721.0,
+            "92": 3045.0,
+            "93": 2672.0,
+            "94": 3326.0,
+            "95": 3125.0,
+            "96": 3309.0,
+            "97": 3208.0,
+            "98": 3572.0,
+            "99": 2980.0,
+            "100": 3355.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 511860224.0,
+            "2": 511860224.0,
+            "3": 511860224.0,
+            "4": 511860224.0,
+            "5": 511860224.0,
+            "6": 511860224.0,
+            "7": 511860224.0,
+            "8": 511860224.0,
+            "9": 511860224.0,
+            "10": 511860224.0,
+            "11": 511860224.0,
+            "12": 511860224.0,
+            "13": 511860224.0,
+            "14": 511860224.0,
+            "15": 511860224.0,
+            "16": 511860224.0,
+            "17": 511860224.0,
+            "18": 511860224.0,
+            "19": 511860224.0,
+            "20": 511860224.0,
+            "21": 511860224.0,
+            "22": 511860224.0,
+            "23": 511860224.0,
+            "24": 511860224.0,
+            "25": 511860224.0,
+            "26": 511860224.0,
+            "27": 511860224.0,
+            "28": 511860224.0,
+            "29": 511860224.0,
+            "30": 511860224.0,
+            "31": 511860224.0,
+            "32": 511860224.0,
+            "33": 511860224.0,
+            "34": 511860224.0,
+            "35": 511860224.0,
+            "36": 511860224.0,
+            "37": 511860224.0,
+            "38": 511860224.0,
+            "39": 511860224.0,
+            "40": 511860224.0,
+            "41": 511860224.0,
+            "42": 511860224.0,
+            "43": 511860224.0,
+            "44": 511860224.0,
+            "45": 511860224.0,
+            "46": 511860224.0,
+            "47": 511860224.0,
+            "48": 511860224.0,
+            "49": 511860224.0,
+            "50": 511860224.0,
+            "51": 511860224.0,
+            "52": 511860224.0,
+            "53": 511860224.0,
+            "54": 511860224.0,
+            "55": 511860224.0,
+            "56": 511860224.0,
+            "57": 511860224.0,
+            "58": 511860224.0,
+            "59": 511860224.0,
+            "60": 511860224.0,
+            "61": 511860224.0,
+            "62": 511860224.0,
+            "63": 511860224.0,
+            "64": 511860224.0,
+            "65": 511860224.0,
+            "66": 511860224.0,
+            "67": 511860224.0,
+            "68": 511860224.0,
+            "69": 511860224.0,
+            "70": 511860224.0,
+            "71": 511860224.0,
+            "72": 511860224.0,
+            "73": 511860224.0,
+            "74": 511860224.0,
+            "75": 511860224.0,
+            "76": 511860224.0,
+            "77": 511860224.0,
+            "78": 511860224.0,
+            "79": 511860224.0,
+            "80": 511860224.0,
+            "81": 511860224.0,
+            "82": 511860224.0,
+            "83": 511860224.0,
+            "84": 511860224.0,
+            "85": 511860224.0,
+            "86": 511860224.0,
+            "87": 511860224.0,
+            "88": 511860224.0,
+            "89": 511860224.0,
+            "90": 511860224.0,
+            "91": 511860224.0,
+            "92": 511860224.0,
+            "93": 511860224.0,
+            "94": 511860224.0,
+            "95": 511860224.0,
+            "96": 511860224.0,
+            "97": 511860224.0,
+            "98": 511860224.0,
+            "99": 511860224.0,
+            "100": 511860224.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1251365376.0,
+            "2": 1430390272.0,
+            "3": 1430390272.0,
+            "4": 1430390272.0,
+            "5": 1430390272.0,
+            "6": 1430390272.0,
+            "7": 1430390272.0,
+            "8": 1430390272.0,
+            "9": 1430390272.0,
+            "10": 1430390272.0,
+            "11": 1430390272.0,
+            "12": 1430390272.0,
+            "13": 1430390272.0,
+            "14": 1430390272.0,
+            "15": 1430390272.0,
+            "16": 1430390272.0,
+            "17": 1430390272.0,
+            "18": 1430390272.0,
+            "19": 1430390272.0,
+            "20": 1430390272.0,
+            "21": 1430390272.0,
+            "22": 1430390272.0,
+            "23": 1430390272.0,
+            "24": 1430390272.0,
+            "25": 1430390272.0,
+            "26": 1430390272.0,
+            "27": 1430390272.0,
+            "28": 1430390272.0,
+            "29": 1430390272.0,
+            "30": 1430390272.0,
+            "31": 1430390272.0,
+            "32": 1430390272.0,
+            "33": 1430390272.0,
+            "34": 1430390272.0,
+            "35": 1430390272.0,
+            "36": 1430390272.0,
+            "37": 1430390272.0,
+            "38": 1430390272.0,
+            "39": 1430390272.0,
+            "40": 1430390272.0,
+            "41": 1430390272.0,
+            "42": 1430390272.0,
+            "43": 1430390272.0,
+            "44": 1430390272.0,
+            "45": 1430390272.0,
+            "46": 1430390272.0,
+            "47": 1430390272.0,
+            "48": 1430390272.0,
+            "49": 1430390272.0,
+            "50": 1430390272.0,
+            "51": 1430390272.0,
+            "52": 1430390272.0,
+            "53": 1430390272.0,
+            "54": 1430390272.0,
+            "55": 1430390272.0,
+            "56": 1430390272.0,
+            "57": 1430390272.0,
+            "58": 1430390272.0,
+            "59": 1430390272.0,
+            "60": 1430390272.0,
+            "61": 1430390272.0,
+            "62": 1430390272.0,
+            "63": 1430390272.0,
+            "64": 1430390272.0,
+            "65": 1430390272.0,
+            "66": 1430390272.0,
+            "67": 1430390272.0,
+            "68": 1430390272.0,
+            "69": 1430390272.0,
+            "70": 1430390272.0,
+            "71": 1430390272.0,
+            "72": 1430390272.0,
+            "73": 1430390272.0,
+            "74": 1430390272.0,
+            "75": 1430390272.0,
+            "76": 1430390272.0,
+            "77": 1430390272.0,
+            "78": 1430390272.0,
+            "79": 1430390272.0,
+            "80": 1430390272.0,
+            "81": 1430390272.0,
+            "82": 1430390272.0,
+            "83": 1430390272.0,
+            "84": 1430390272.0,
+            "85": 1430390272.0,
+            "86": 1430390272.0,
+            "87": 1430390272.0,
+            "88": 1430390272.0,
+            "89": 1430390272.0,
+            "90": 1430390272.0,
+            "91": 1430390272.0,
+            "92": 1430390272.0,
+            "93": 1430390272.0,
+            "94": 1430390272.0,
+            "95": 1430390272.0,
+            "96": 1430390272.0,
+            "97": 1430390272.0,
+            "98": 1430390272.0,
+            "99": 1430390272.0,
+            "100": 1430390272.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 5.9274,
+            "2": 0.21272,
+            "3": 0.22152,
+            "4": 0.1871,
+            "5": 0.21307,
+            "6": 0.21965,
+            "7": 0.22219,
+            "8": 0.22237,
+            "9": 0.22411,
+            "10": 0.22202,
+            "11": 0.22123,
+            "12": 0.22038,
+            "13": 0.22083,
+            "14": 0.21999,
+            "15": 0.21683,
+            "16": 0.22088,
+            "17": 0.22103,
+            "18": 0.22014,
+            "19": 0.21937,
+            "20": 0.21984,
+            "21": 0.21934,
+            "22": 0.22176,
+            "23": 0.21919,
+            "24": 0.21956,
+            "25": 0.21941,
+            "26": 0.5044,
+            "27": 0.22459,
+            "28": 0.22027,
+            "29": 0.21989,
+            "30": 0.22088,
+            "31": 0.22111,
+            "32": 0.22371,
+            "33": 0.22449,
+            "34": 0.22278,
+            "35": 0.22512,
+            "36": 0.2238,
+            "37": 0.22153,
+            "38": 0.22287,
+            "39": 0.22369,
+            "40": 0.22242,
+            "41": 0.22005,
+            "42": 0.22123,
+            "43": 0.22176,
+            "44": 0.22219,
+            "45": 0.22209,
+            "46": 0.22213,
+            "47": 0.22118,
+            "48": 0.22156,
+            "49": 0.22452,
+            "50": 0.22094,
+            "51": 0.23758,
+            "52": 0.22018,
+            "53": 0.22125,
+            "54": 0.22334,
+            "55": 0.22156,
+            "56": 0.22191,
+            "57": 0.54851,
+            "58": 0.22402,
+            "59": 0.22203,
+            "60": 0.22556,
+            "61": 0.22485,
+            "62": 0.22511,
+            "63": 0.22362,
+            "64": 0.22461,
+            "65": 0.2231,
+            "66": 0.22489,
+            "67": 0.2248,
+            "68": 0.22682,
+            "69": 0.22568,
+            "70": 0.22662,
+            "71": 0.22741,
+            "72": 0.22865,
+            "73": 0.22913,
+            "74": 0.2291,
+            "75": 0.22782,
+            "76": 0.81496,
+            "77": 0.23726,
+            "78": 0.22937,
+            "79": 0.22963,
+            "80": 0.22908,
+            "81": 0.2307,
+            "82": 0.22778,
+            "83": 0.22872,
+            "84": 0.2297,
+            "85": 0.22998,
+            "86": 0.22898,
+            "87": 0.22903,
+            "88": 0.22865,
+            "89": 0.22964,
+            "90": 0.23194,
+            "91": 0.22888,
+            "92": 0.23063,
+            "93": 0.22825,
+            "94": 0.23,
+            "95": 0.22281,
+            "96": 0.22333,
+            "97": 0.2242,
+            "98": 0.22437,
+            "99": 0.22403,
+            "100": 0.22146
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json
index d6134cdcc5a..756fbc3b53c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 8.20377,
-            "2": 0.2288,
-            "3": 0.19616,
-            "4": 0.19587,
-            "5": 0.19737,
-            "6": 0.19775,
-            "7": 0.19658,
-            "8": 0.19621,
-            "9": 0.19557,
-            "10": 0.19534,
-            "11": 0.19453,
-            "12": 0.1949,
-            "13": 0.19522,
-            "14": 0.19865,
-            "15": 0.20415,
-            "16": 0.19686,
-            "17": 0.1985,
-            "18": 0.19858,
-            "19": 0.19709,
-            "20": 0.19609,
-            "21": 0.19758,
-            "22": 0.19837,
-            "23": 0.19786,
-            "24": 0.19688,
-            "25": 0.1972,
-            "26": 0.19859,
-            "27": 0.19814,
-            "28": 0.1989,
-            "29": 0.1984,
-            "30": 0.19783,
-            "31": 0.19727,
-            "32": 0.19754,
-            "33": 0.19648,
-            "34": 0.19977,
-            "35": 0.19847,
-            "36": 0.19696,
-            "37": 0.20498,
-            "38": 0.20415,
-            "39": 0.20225,
-            "40": 0.19712,
-            "41": 0.19751,
-            "42": 0.19764,
-            "43": 0.19738,
-            "44": 0.19703,
-            "45": 0.19703,
-            "46": 0.19814,
-            "47": 0.19757,
-            "48": 0.19759,
-            "49": 0.19688,
-            "50": 0.20181,
-            "51": 0.22215,
-            "52": 0.2134,
-            "53": 0.2129,
-            "54": 0.2133,
-            "55": 0.21255,
-            "56": 0.21221,
-            "57": 0.21233,
-            "58": 0.2124,
-            "59": 0.21242,
-            "60": 0.21258,
-            "61": 0.21219,
-            "62": 0.21255,
-            "63": 0.21385,
-            "64": 0.2127,
-            "65": 0.21252,
-            "66": 0.21191,
-            "67": 0.21327,
-            "68": 0.21176,
-            "69": 0.2127,
-            "70": 0.21284,
-            "71": 0.21291,
-            "72": 0.21265,
-            "73": 0.21221,
-            "74": 0.21387,
-            "75": 0.21247,
-            "76": 0.21204,
-            "77": 0.21169,
-            "78": 0.21259,
-            "79": 0.21196,
-            "80": 0.21204,
-            "81": 0.21211,
-            "82": 0.21314,
-            "83": 0.21268,
-            "84": 0.21291,
-            "85": 0.21328,
-            "86": 0.2128,
-            "87": 0.21213,
-            "88": 0.21192,
-            "89": 0.21242,
-            "90": 0.21253,
-            "91": 0.21252,
-            "92": 0.21236,
-            "93": 0.21254,
-            "94": 0.21255,
-            "95": 0.21209,
-            "96": 0.21345,
-            "97": 0.21202,
-            "98": 0.21234,
-            "99": 0.21237,
-            "100": 0.21317
+            "1": 4.18215,
+            "2": 0.24102,
+            "3": 0.22538,
+            "4": 0.19265,
+            "5": 0.1927,
+            "6": 0.19409,
+            "7": 0.19316,
+            "8": 0.20321,
+            "9": 0.19569,
+            "10": 0.19176,
+            "11": 0.19371,
+            "12": 0.1915,
+            "13": 0.1999,
+            "14": 0.19198,
+            "15": 0.19063,
+            "16": 0.18985,
+            "17": 0.19307,
+            "18": 0.19389,
+            "19": 0.18963,
+            "20": 0.18912,
+            "21": 0.18939,
+            "22": 0.19051,
+            "23": 0.19061,
+            "24": 0.18863,
+            "25": 0.18777,
+            "26": 0.18904,
+            "27": 0.18951,
+            "28": 0.18898,
+            "29": 0.18846,
+            "30": 0.18884,
+            "31": 0.18892,
+            "32": 0.18966,
+            "33": 0.1906,
+            "34": 0.18855,
+            "35": 0.18874,
+            "36": 0.18902,
+            "37": 0.18886,
+            "38": 0.2005,
+            "39": 0.18875,
+            "40": 0.18823,
+            "41": 0.18805,
+            "42": 0.1885,
+            "43": 0.18816,
+            "44": 0.1884,
+            "45": 0.18934,
+            "46": 0.18913,
+            "47": 0.18837,
+            "48": 0.18793,
+            "49": 0.18776,
+            "50": 0.19086,
+            "51": 0.20025,
+            "52": 0.19114,
+            "53": 0.19106,
+            "54": 0.19178,
+            "55": 0.1907,
+            "56": 0.1918,
+            "57": 0.19088,
+            "58": 0.19169,
+            "59": 0.19055,
+            "60": 0.19039,
+            "61": 0.19129,
+            "62": 0.19114,
+            "63": 0.19039,
+            "64": 0.19023,
+            "65": 0.19101,
+            "66": 0.19064,
+            "67": 0.19048,
+            "68": 0.19034,
+            "69": 0.19008,
+            "70": 0.19082,
+            "71": 0.19018,
+            "72": 0.19111,
+            "73": 0.18977,
+            "74": 0.19049,
+            "75": 0.19112,
+            "76": 0.19169,
+            "77": 0.1913,
+            "78": 0.1905,
+            "79": 0.19033,
+            "80": 0.19026,
+            "81": 0.18982,
+            "82": 0.18941,
+            "83": 0.19009,
+            "84": 0.18968,
+            "85": 0.1902,
+            "86": 0.19092,
+            "87": 0.19042,
+            "88": 0.18999,
+            "89": 0.19013,
+            "90": 0.18962,
+            "91": 0.18986,
+            "92": 0.18975,
+            "93": 0.19013,
+            "94": 0.19113,
+            "95": 0.19019,
+            "96": 0.19136,
+            "97": 0.18954,
+            "98": 0.18934,
+            "99": 0.19002,
+            "100": 0.18991
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..ce275a70055
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84971,
+            "52": 9.74156,
+            "53": 10.06322,
+            "54": 9.94581,
+            "55": 9.87731,
+            "56": 9.62746,
+            "57": 9.47259,
+            "58": 9.82912,
+            "59": 9.583,
+            "60": 9.49181,
+            "61": 9.69961,
+            "62": 9.98089,
+            "63": 9.37212,
+            "64": 9.7756,
+            "65": 8.9433,
+            "66": 9.69993,
+            "67": 9.36414,
+            "68": 9.78706,
+            "69": 9.78397,
+            "70": 9.72288,
+            "71": 9.60749,
+            "72": 9.58416,
+            "73": 9.49093,
+            "74": 8.94864,
+            "75": 9.41807,
+            "76": 9.08721,
+            "77": 10.06283,
+            "78": 9.729,
+            "79": 9.37091,
+            "80": 9.40033,
+            "81": 9.47754,
+            "82": 9.69121,
+            "83": 9.30762,
+            "84": 9.41252,
+            "85": 9.61132,
+            "86": 9.07621,
+            "87": 9.59459,
+            "88": 9.74768,
+            "89": 9.6068,
+            "90": 9.81078,
+            "91": 9.34441,
+            "92": 9.36535,
+            "93": 9.07743,
+            "94": 8.82975,
+            "95": 9.51676,
+            "96": 9.52546,
+            "97": 9.31031,
+            "98": 9.67812,
+            "99": 8.88848,
+            "100": 9.40128
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2735.0,
+            "52": 2607.0,
+            "53": 2951.0,
+            "54": 2672.0,
+            "55": 2451.0,
+            "56": 2712.0,
+            "57": 2392.0,
+            "58": 2979.0,
+            "59": 2869.0,
+            "60": 2435.0,
+            "61": 2938.0,
+            "62": 2669.0,
+            "63": 2392.0,
+            "64": 2998.0,
+            "65": 2689.0,
+            "66": 3285.0,
+            "67": 2782.0,
+            "68": 2753.0,
+            "69": 2958.0,
+            "70": 3271.0,
+            "71": 3040.0,
+            "72": 2504.0,
+            "73": 3096.0,
+            "74": 1910.0,
+            "75": 2617.0,
+            "76": 3081.0,
+            "77": 3390.0,
+            "78": 3186.0,
+            "79": 3320.0,
+            "80": 3483.0,
+            "81": 3782.0,
+            "82": 3516.0,
+            "83": 2864.0,
+            "84": 3396.0,
+            "85": 3247.0,
+            "86": 2785.0,
+            "87": 3762.0,
+            "88": 3102.0,
+            "89": 3483.0,
+            "90": 3076.0,
+            "91": 2643.0,
+            "92": 3198.0,
+            "93": 2666.0,
+            "94": 3390.0,
+            "95": 3410.0,
+            "96": 3508.0,
+            "97": 3178.0,
+            "98": 3865.0,
+            "99": 3143.0,
+            "100": 3357.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 482499072.0,
+            "52": 482499072.0,
+            "53": 482499072.0,
+            "54": 482499072.0,
+            "55": 482499072.0,
+            "56": 482499072.0,
+            "57": 482499072.0,
+            "58": 482499072.0,
+            "59": 482499072.0,
+            "60": 482499072.0,
+            "61": 482499072.0,
+            "62": 482499072.0,
+            "63": 482499072.0,
+            "64": 482499072.0,
+            "65": 482499072.0,
+            "66": 482499072.0,
+            "67": 482499072.0,
+            "68": 482499072.0,
+            "69": 482499072.0,
+            "70": 482499072.0,
+            "71": 482499072.0,
+            "72": 482499072.0,
+            "73": 482499072.0,
+            "74": 482499072.0,
+            "75": 482499072.0,
+            "76": 482499072.0,
+            "77": 482499072.0,
+            "78": 482499072.0,
+            "79": 482499072.0,
+            "80": 482499072.0,
+            "81": 482499072.0,
+            "82": 482499072.0,
+            "83": 482499072.0,
+            "84": 482499072.0,
+            "85": 482499072.0,
+            "86": 482499072.0,
+            "87": 482499072.0,
+            "88": 482499072.0,
+            "89": 482499072.0,
+            "90": 482499072.0,
+            "91": 482499072.0,
+            "92": 482499072.0,
+            "93": 482499072.0,
+            "94": 482499072.0,
+            "95": 482499072.0,
+            "96": 482499072.0,
+            "97": 482499072.0,
+            "98": 482499072.0,
+            "99": 482499072.0,
+            "100": 482499072.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1401028096.0,
+            "52": 1401029120.0,
+            "53": 1401029120.0,
+            "54": 1401029120.0,
+            "55": 1401029120.0,
+            "56": 1401029120.0,
+            "57": 1401029120.0,
+            "58": 1401029120.0,
+            "59": 1401029120.0,
+            "60": 1401029120.0,
+            "61": 1401029120.0,
+            "62": 1401029120.0,
+            "63": 1401029120.0,
+            "64": 1401029120.0,
+            "65": 1401029120.0,
+            "66": 1401029120.0,
+            "67": 1401029120.0,
+            "68": 1401029120.0,
+            "69": 1401029120.0,
+            "70": 1401029120.0,
+            "71": 1401029120.0,
+            "72": 1401029120.0,
+            "73": 1401029120.0,
+            "74": 1401029120.0,
+            "75": 1401029120.0,
+            "76": 1401029120.0,
+            "77": 1401029120.0,
+            "78": 1401029120.0,
+            "79": 1401029120.0,
+            "80": 1401029120.0,
+            "81": 1401029120.0,
+            "82": 1401029120.0,
+            "83": 1401029120.0,
+            "84": 1401029120.0,
+            "85": 1401029120.0,
+            "86": 1401029120.0,
+            "87": 1401029120.0,
+            "88": 1401029120.0,
+            "89": 1401029120.0,
+            "90": 1401029120.0,
+            "91": 1401029120.0,
+            "92": 1401029120.0,
+            "93": 1401029120.0,
+            "94": 1401029120.0,
+            "95": 1401029120.0,
+            "96": 1401029120.0,
+            "97": 1401029120.0,
+            "98": 1401029120.0,
+            "99": 1401029120.0,
+            "100": 1401029120.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4.32401,
+            "52": 0.21688,
+            "53": 0.18518,
+            "54": 0.19488,
+            "55": 0.1986,
+            "56": 0.19975,
+            "57": 0.18475,
+            "58": 0.18368,
+            "59": 0.18376,
+            "60": 0.18447,
+            "61": 0.18462,
+            "62": 0.18451,
+            "63": 0.18353,
+            "64": 0.21625,
+            "65": 0.18791,
+            "66": 0.18877,
+            "67": 0.18755,
+            "68": 0.18846,
+            "69": 0.18722,
+            "70": 0.18704,
+            "71": 0.18789,
+            "72": 0.18975,
+            "73": 0.18773,
+            "74": 0.1875,
+            "75": 0.18938,
+            "76": 0.18771,
+            "77": 0.18773,
+            "78": 0.18744,
+            "79": 0.18693,
+            "80": 0.18783,
+            "81": 0.18742,
+            "82": 0.18723,
+            "83": 0.18781,
+            "84": 0.18777,
+            "85": 0.18758,
+            "86": 0.18679,
+            "87": 0.18708,
+            "88": 0.18812,
+            "89": 0.18758,
+            "90": 0.18811,
+            "91": 0.18925,
+            "92": 0.18753,
+            "93": 0.18733,
+            "94": 0.18737,
+            "95": 0.18854,
+            "96": 0.18834,
+            "97": 0.18793,
+            "98": 0.18731,
+            "99": 0.18778,
+            "100": 0.18797
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..e00e8181bf7
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86836,
+            "2": 10.88595,
+            "3": 10.86559,
+            "4": 10.8689,
+            "5": 10.8742,
+            "6": 10.89059,
+            "7": 10.87676,
+            "8": 10.86476,
+            "9": 10.88236,
+            "10": 10.84582,
+            "11": 10.87163,
+            "12": 10.87422,
+            "13": 10.88163,
+            "14": 10.88884,
+            "15": 10.83936,
+            "16": 10.82495,
+            "17": 10.80144,
+            "18": 10.81238,
+            "19": 10.82157,
+            "20": 10.71932,
+            "21": 10.69086,
+            "22": 10.57426,
+            "23": 10.71097,
+            "24": 10.5978,
+            "25": 10.55559,
+            "26": 10.61521,
+            "27": 10.6045,
+            "28": 10.56482,
+            "29": 10.58474,
+            "30": 10.35947,
+            "31": 10.12154,
+            "32": 10.45234,
+            "33": 10.45723,
+            "34": 10.21986,
+            "35": 10.26445,
+            "36": 10.21034,
+            "37": 10.33954,
+            "38": 10.18014,
+            "39": 10.39592,
+            "40": 10.06632,
+            "41": 10.14164,
+            "42": 10.20846,
+            "43": 9.83129,
+            "44": 9.94859,
+            "45": 9.82846,
+            "46": 9.80459,
+            "47": 10.1423,
+            "48": 9.84463,
+            "49": 9.52196,
+            "50": 9.88602,
+            "51": 9.84981,
+            "52": 9.74433,
+            "53": 10.05841,
+            "54": 9.95128,
+            "55": 9.88345,
+            "56": 9.61328,
+            "57": 9.46898,
+            "58": 9.82164,
+            "59": 9.577,
+            "60": 9.49788,
+            "61": 9.69254,
+            "62": 9.98596,
+            "63": 9.37406,
+            "64": 9.76602,
+            "65": 8.94652,
+            "66": 9.70103,
+            "67": 9.36368,
+            "68": 9.78239,
+            "69": 9.79883,
+            "70": 9.73167,
+            "71": 9.62508,
+            "72": 9.58312,
+            "73": 9.4882,
+            "74": 8.92612,
+            "75": 9.40726,
+            "76": 9.07709,
+            "77": 10.05858,
+            "78": 9.72206,
+            "79": 9.37662,
+            "80": 9.40272,
+            "81": 9.48208,
+            "82": 9.69955,
+            "83": 9.31357,
+            "84": 9.41731,
+            "85": 9.61585,
+            "86": 9.0743,
+            "87": 9.59556,
+            "88": 9.75063,
+            "89": 9.60037,
+            "90": 9.82206,
+            "91": 9.33875,
+            "92": 9.3578,
+            "93": 9.08666,
+            "94": 8.82958,
+            "95": 9.52592,
+            "96": 9.52973,
+            "97": 9.30331,
+            "98": 9.67138,
+            "99": 8.89537,
+            "100": 9.40567
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1581.0,
+            "3": 1660.0,
+            "4": 1639.0,
+            "5": 1858.0,
+            "6": 1746.0,
+            "7": 1789.0,
+            "8": 1599.0,
+            "9": 1866.0,
+            "10": 1400.0,
+            "11": 1838.0,
+            "12": 1702.0,
+            "13": 1844.0,
+            "14": 1707.0,
+            "15": 1824.0,
+            "16": 1828.0,
+            "17": 1810.0,
+            "18": 1568.0,
+            "19": 1747.0,
+            "20": 1605.0,
+            "21": 1936.0,
+            "22": 1586.0,
+            "23": 1869.0,
+            "24": 1508.0,
+            "25": 1506.0,
+            "26": 1674.0,
+            "27": 1742.0,
+            "28": 1978.0,
+            "29": 1867.0,
+            "30": 1888.0,
+            "31": 1551.0,
+            "32": 1866.0,
+            "33": 2085.0,
+            "34": 1816.0,
+            "35": 1884.0,
+            "36": 1866.0,
+            "37": 2390.0,
+            "38": 2008.0,
+            "39": 2403.0,
+            "40": 2077.0,
+            "41": 2225.0,
+            "42": 2252.0,
+            "43": 1924.0,
+            "44": 2075.0,
+            "45": 1956.0,
+            "46": 2175.0,
+            "47": 2425.0,
+            "48": 2225.0,
+            "49": 2244.0,
+            "50": 2430.0,
+            "51": 2464.0,
+            "52": 2572.0,
+            "53": 2977.0,
+            "54": 2621.0,
+            "55": 2248.0,
+            "56": 2813.0,
+            "57": 2293.0,
+            "58": 2874.0,
+            "59": 2959.0,
+            "60": 2499.0,
+            "61": 2762.0,
+            "62": 2658.0,
+            "63": 2472.0,
+            "64": 2840.0,
+            "65": 2587.0,
+            "66": 2880.0,
+            "67": 2813.0,
+            "68": 2775.0,
+            "69": 2821.0,
+            "70": 3127.0,
+            "71": 2870.0,
+            "72": 2609.0,
+            "73": 2835.0,
+            "74": 1993.0,
+            "75": 2474.0,
+            "76": 2896.0,
+            "77": 3050.0,
+            "78": 3120.0,
+            "79": 3093.0,
+            "80": 3284.0,
+            "81": 3502.0,
+            "82": 3169.0,
+            "83": 2614.0,
+            "84": 3087.0,
+            "85": 3140.0,
+            "86": 2590.0,
+            "87": 3631.0,
+            "88": 3120.0,
+            "89": 3308.0,
+            "90": 3137.0,
+            "91": 2801.0,
+            "92": 2977.0,
+            "93": 2727.0,
+            "94": 3180.0,
+            "95": 3264.0,
+            "96": 3436.0,
+            "97": 3329.0,
+            "98": 3661.0,
+            "99": 3152.0,
+            "100": 3367.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 465207808.0,
+            "2": 466256384.0,
+            "3": 466256384.0,
+            "4": 466256384.0,
+            "5": 466256384.0,
+            "6": 466256384.0,
+            "7": 466256384.0,
+            "8": 466256384.0,
+            "9": 466256384.0,
+            "10": 466256384.0,
+            "11": 466256384.0,
+            "12": 466256384.0,
+            "13": 466256384.0,
+            "14": 466256384.0,
+            "15": 466256384.0,
+            "16": 466256384.0,
+            "17": 466256384.0,
+            "18": 466256384.0,
+            "19": 466256384.0,
+            "20": 466256384.0,
+            "21": 466256384.0,
+            "22": 466256384.0,
+            "23": 466256384.0,
+            "24": 466256384.0,
+            "25": 466256384.0,
+            "26": 466256384.0,
+            "27": 466256384.0,
+            "28": 466256384.0,
+            "29": 466256384.0,
+            "30": 466256384.0,
+            "31": 466256384.0,
+            "32": 466256384.0,
+            "33": 466256384.0,
+            "34": 466256384.0,
+            "35": 466256384.0,
+            "36": 466256384.0,
+            "37": 466256384.0,
+            "38": 466256384.0,
+            "39": 466256384.0,
+            "40": 466256384.0,
+            "41": 466256384.0,
+            "42": 466256384.0,
+            "43": 466256384.0,
+            "44": 466256384.0,
+            "45": 466256384.0,
+            "46": 466256384.0,
+            "47": 466256384.0,
+            "48": 466256384.0,
+            "49": 466256384.0,
+            "50": 466256384.0,
+            "51": 466256384.0,
+            "52": 466256384.0,
+            "53": 466256384.0,
+            "54": 466256384.0,
+            "55": 466256384.0,
+            "56": 466256384.0,
+            "57": 466256384.0,
+            "58": 466256384.0,
+            "59": 466256384.0,
+            "60": 466256384.0,
+            "61": 466256384.0,
+            "62": 466256384.0,
+            "63": 466256384.0,
+            "64": 466256384.0,
+            "65": 466256384.0,
+            "66": 466256384.0,
+            "67": 466256384.0,
+            "68": 466256384.0,
+            "69": 466256384.0,
+            "70": 466256384.0,
+            "71": 466256384.0,
+            "72": 466256384.0,
+            "73": 466256384.0,
+            "74": 466256384.0,
+            "75": 466256384.0,
+            "76": 466256384.0,
+            "77": 466256384.0,
+            "78": 466256384.0,
+            "79": 466256384.0,
+            "80": 466256384.0,
+            "81": 466256384.0,
+            "82": 466256384.0,
+            "83": 466256384.0,
+            "84": 466256384.0,
+            "85": 466256384.0,
+            "86": 466256384.0,
+            "87": 466256384.0,
+            "88": 466256384.0,
+            "89": 466256384.0,
+            "90": 466256384.0,
+            "91": 466256384.0,
+            "92": 466256384.0,
+            "93": 466256384.0,
+            "94": 466256384.0,
+            "95": 466256384.0,
+            "96": 466256384.0,
+            "97": 466256384.0,
+            "98": 466256384.0,
+            "99": 466256384.0,
+            "100": 466256384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1195407872.0,
+            "2": 1376397824.0,
+            "3": 1376397824.0,
+            "4": 1376397824.0,
+            "5": 1376397824.0,
+            "6": 1376397824.0,
+            "7": 1376397824.0,
+            "8": 1376397824.0,
+            "9": 1376397824.0,
+            "10": 1376397824.0,
+            "11": 1376397824.0,
+            "12": 1376397824.0,
+            "13": 1376397824.0,
+            "14": 1376397824.0,
+            "15": 1376397824.0,
+            "16": 1376397824.0,
+            "17": 1376397824.0,
+            "18": 1376397824.0,
+            "19": 1376397824.0,
+            "20": 1376397824.0,
+            "21": 1376397824.0,
+            "22": 1376397824.0,
+            "23": 1376397824.0,
+            "24": 1376397824.0,
+            "25": 1376397824.0,
+            "26": 1376397824.0,
+            "27": 1376397824.0,
+            "28": 1376397824.0,
+            "29": 1376397824.0,
+            "30": 1376397824.0,
+            "31": 1376397824.0,
+            "32": 1376397824.0,
+            "33": 1376397824.0,
+            "34": 1376397824.0,
+            "35": 1376397824.0,
+            "36": 1376397824.0,
+            "37": 1376397824.0,
+            "38": 1376397824.0,
+            "39": 1376397824.0,
+            "40": 1376397824.0,
+            "41": 1376397824.0,
+            "42": 1376397824.0,
+            "43": 1376397824.0,
+            "44": 1376397824.0,
+            "45": 1376397824.0,
+            "46": 1376397824.0,
+            "47": 1376397824.0,
+            "48": 1376397824.0,
+            "49": 1376397824.0,
+            "50": 1376397824.0,
+            "51": 1376397824.0,
+            "52": 1376397824.0,
+            "53": 1376397824.0,
+            "54": 1376397824.0,
+            "55": 1376397824.0,
+            "56": 1376397824.0,
+            "57": 1376397824.0,
+            "58": 1376397824.0,
+            "59": 1376397824.0,
+            "60": 1376397824.0,
+            "61": 1376397824.0,
+            "62": 1376397824.0,
+            "63": 1376397824.0,
+            "64": 1376397824.0,
+            "65": 1376397824.0,
+            "66": 1376397824.0,
+            "67": 1376397824.0,
+            "68": 1376397824.0,
+            "69": 1376397824.0,
+            "70": 1376397824.0,
+            "71": 1376397824.0,
+            "72": 1376397824.0,
+            "73": 1376397824.0,
+            "74": 1376397824.0,
+            "75": 1376397824.0,
+            "76": 1376397824.0,
+            "77": 1376397824.0,
+            "78": 1376397824.0,
+            "79": 1376397824.0,
+            "80": 1376397824.0,
+            "81": 1376397824.0,
+            "82": 1376397824.0,
+            "83": 1376397824.0,
+            "84": 1376397824.0,
+            "85": 1376397824.0,
+            "86": 1376397824.0,
+            "87": 1376397824.0,
+            "88": 1376397824.0,
+            "89": 1376397824.0,
+            "90": 1376397824.0,
+            "91": 1376397824.0,
+            "92": 1376397824.0,
+            "93": 1376397824.0,
+            "94": 1376397824.0,
+            "95": 1376397824.0,
+            "96": 1376397824.0,
+            "97": 1376397824.0,
+            "98": 1376397824.0,
+            "99": 1376397824.0,
+            "100": 1376397824.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.70208,
+            "3": 0.19598,
+            "4": 0.17697,
+            "5": 0.17606,
+            "6": 0.17518,
+            "7": 0.17383,
+            "8": 0.17622,
+            "9": 0.17697,
+            "10": 0.17845,
+            "11": 0.17811,
+            "12": 0.17772,
+            "13": 0.17922,
+            "14": 0.17797,
+            "15": 0.17934,
+            "16": 0.18103,
+            "17": 0.18059,
+            "18": 0.17963,
+            "19": 0.18123,
+            "20": 0.18073,
+            "21": 0.18061,
+            "22": 0.18072,
+            "23": 0.17975,
+            "24": 0.18067,
+            "25": 0.18039,
+            "26": 0.17929,
+            "27": 0.17948,
+            "28": 0.17917,
+            "29": 0.17952,
+            "30": 0.17908,
+            "31": 0.1795,
+            "32": 0.17963,
+            "33": 0.17979,
+            "34": 0.18047,
+            "35": 0.18032,
+            "36": 0.18061,
+            "37": 0.17948,
+            "38": 0.17887,
+            "39": 0.17897,
+            "40": 0.17957,
+            "41": 0.17982,
+            "42": 0.1808,
+            "43": 0.17965,
+            "44": 0.1807,
+            "45": 0.18099,
+            "46": 0.17964,
+            "47": 0.17861,
+            "48": 0.17946,
+            "49": 0.18292,
+            "50": 0.18089,
+            "51": 0.18183,
+            "52": 0.16643,
+            "53": 0.16906,
+            "54": 0.16731,
+            "55": 0.16773,
+            "56": 0.16957,
+            "57": 0.1691,
+            "58": 0.17123,
+            "59": 0.17207,
+            "60": 0.17308,
+            "61": 0.17219,
+            "62": 0.17353,
+            "63": 0.17543,
+            "64": 0.17335,
+            "65": 0.17469,
+            "66": 0.17402,
+            "67": 0.17585,
+            "68": 0.17421,
+            "69": 0.17363,
+            "70": 0.1748,
+            "71": 0.17377,
+            "72": 0.17421,
+            "73": 0.17466,
+            "74": 0.17508,
+            "75": 0.17297,
+            "76": 0.17297,
+            "77": 0.17289,
+            "78": 0.17516,
+            "79": 0.17501,
+            "80": 0.17483,
+            "81": 0.17493,
+            "82": 0.17481,
+            "83": 0.17496,
+            "84": 0.17501,
+            "85": 0.17642,
+            "86": 0.17507,
+            "87": 0.17445,
+            "88": 0.17535,
+            "89": 0.17531,
+            "90": 0.17467,
+            "91": 0.17485,
+            "92": 0.17537,
+            "93": 0.17577,
+            "94": 0.1757,
+            "95": 0.1752,
+            "96": 0.17534,
+            "97": 0.17544,
+            "98": 0.17458,
+            "99": 0.17379,
+            "100": 0.17525
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..bb062f69f88
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84981,
+            "52": 9.74427,
+            "53": 10.05843,
+            "54": 9.9513,
+            "55": 9.88338,
+            "56": 9.61325,
+            "57": 9.46895,
+            "58": 9.82166,
+            "59": 9.57703,
+            "60": 9.49787,
+            "61": 9.69257,
+            "62": 9.98595,
+            "63": 9.37399,
+            "64": 9.76604,
+            "65": 8.94651,
+            "66": 9.70103,
+            "67": 9.36368,
+            "68": 9.78235,
+            "69": 9.79883,
+            "70": 9.73165,
+            "71": 9.62507,
+            "72": 9.5831,
+            "73": 9.48817,
+            "74": 8.92613,
+            "75": 9.40726,
+            "76": 9.07706,
+            "77": 10.0586,
+            "78": 9.72205,
+            "79": 9.37661,
+            "80": 9.4027,
+            "81": 9.48209,
+            "82": 9.69951,
+            "83": 9.31355,
+            "84": 9.41731,
+            "85": 9.61584,
+            "86": 9.07426,
+            "87": 9.59553,
+            "88": 9.75065,
+            "89": 9.60039,
+            "90": 9.82207,
+            "91": 9.33876,
+            "92": 9.35777,
+            "93": 9.08671,
+            "94": 8.82959,
+            "95": 9.52597,
+            "96": 9.52973,
+            "97": 9.30334,
+            "98": 9.67135,
+            "99": 8.89539,
+            "100": 9.40569
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2483.0,
+            "52": 2539.0,
+            "53": 2798.0,
+            "54": 2526.0,
+            "55": 2228.0,
+            "56": 2738.0,
+            "57": 2252.0,
+            "58": 2797.0,
+            "59": 2893.0,
+            "60": 2453.0,
+            "61": 2929.0,
+            "62": 2698.0,
+            "63": 2347.0,
+            "64": 2902.0,
+            "65": 2556.0,
+            "66": 2922.0,
+            "67": 2829.0,
+            "68": 2669.0,
+            "69": 2814.0,
+            "70": 3041.0,
+            "71": 2872.0,
+            "72": 2512.0,
+            "73": 2971.0,
+            "74": 1870.0,
+            "75": 2349.0,
+            "76": 2844.0,
+            "77": 3121.0,
+            "78": 3116.0,
+            "79": 3196.0,
+            "80": 3164.0,
+            "81": 3454.0,
+            "82": 3176.0,
+            "83": 2613.0,
+            "84": 3093.0,
+            "85": 3128.0,
+            "86": 2792.0,
+            "87": 3771.0,
+            "88": 3108.0,
+            "89": 3297.0,
+            "90": 3042.0,
+            "91": 2850.0,
+            "92": 2873.0,
+            "93": 2709.0,
+            "94": 3294.0,
+            "95": 3282.0,
+            "96": 3536.0,
+            "97": 3150.0,
+            "98": 3479.0,
+            "99": 3113.0,
+            "100": 3370.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 465864192.0,
+            "52": 465864192.0,
+            "53": 465864192.0,
+            "54": 465864192.0,
+            "55": 465864192.0,
+            "56": 465864192.0,
+            "57": 465864192.0,
+            "58": 465864192.0,
+            "59": 465864192.0,
+            "60": 465864192.0,
+            "61": 465864192.0,
+            "62": 465864192.0,
+            "63": 465864192.0,
+            "64": 465864192.0,
+            "65": 465864192.0,
+            "66": 465864192.0,
+            "67": 465864192.0,
+            "68": 465864192.0,
+            "69": 465864192.0,
+            "70": 465864192.0,
+            "71": 465864192.0,
+            "72": 465864192.0,
+            "73": 465864192.0,
+            "74": 465864192.0,
+            "75": 465864192.0,
+            "76": 465864192.0,
+            "77": 465864192.0,
+            "78": 465864192.0,
+            "79": 465864192.0,
+            "80": 465864192.0,
+            "81": 465864192.0,
+            "82": 465864192.0,
+            "83": 465864192.0,
+            "84": 465864192.0,
+            "85": 465864192.0,
+            "86": 465864192.0,
+            "87": 465864192.0,
+            "88": 465864192.0,
+            "89": 465864192.0,
+            "90": 465864192.0,
+            "91": 465864192.0,
+            "92": 465864192.0,
+            "93": 465864192.0,
+            "94": 465864192.0,
+            "95": 465864192.0,
+            "96": 465864192.0,
+            "97": 465864192.0,
+            "98": 465864192.0,
+            "99": 465864192.0,
+            "100": 465864192.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1378101760.0,
+            "52": 1378102784.0,
+            "53": 1378102784.0,
+            "54": 1378102784.0,
+            "55": 1378102784.0,
+            "56": 1378102784.0,
+            "57": 1378102784.0,
+            "58": 1378102784.0,
+            "59": 1378102784.0,
+            "60": 1378102784.0,
+            "61": 1378102784.0,
+            "62": 1378102784.0,
+            "63": 1378102784.0,
+            "64": 1378102784.0,
+            "65": 1378102784.0,
+            "66": 1378102784.0,
+            "67": 1378102784.0,
+            "68": 1378102784.0,
+            "69": 1378102784.0,
+            "70": 1378102784.0,
+            "71": 1378102784.0,
+            "72": 1378102784.0,
+            "73": 1378102784.0,
+            "74": 1378102784.0,
+            "75": 1378102784.0,
+            "76": 1378102784.0,
+            "77": 1378102784.0,
+            "78": 1378102784.0,
+            "79": 1378102784.0,
+            "80": 1378102784.0,
+            "81": 1378102784.0,
+            "82": 1378102784.0,
+            "83": 1378102784.0,
+            "84": 1378102784.0,
+            "85": 1378102784.0,
+            "86": 1378102784.0,
+            "87": 1378102784.0,
+            "88": 1378102784.0,
+            "89": 1378102784.0,
+            "90": 1378102784.0,
+            "91": 1378102784.0,
+            "92": 1378102784.0,
+            "93": 1378102784.0,
+            "94": 1378102784.0,
+            "95": 1378102784.0,
+            "96": 1378102784.0,
+            "97": 1378102784.0,
+            "98": 1378102784.0,
+            "99": 1378102784.0,
+            "100": 1378102784.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.91958,
+            "53": 0.19241,
+            "54": 0.1787,
+            "55": 0.17896,
+            "56": 0.17921,
+            "57": 0.17976,
+            "58": 0.17909,
+            "59": 0.18055,
+            "60": 0.18105,
+            "61": 0.18154,
+            "62": 0.18039,
+            "63": 0.18167,
+            "64": 0.1811,
+            "65": 0.18155,
+            "66": 0.18214,
+            "67": 0.18228,
+            "68": 0.18197,
+            "69": 0.18293,
+            "70": 0.1812,
+            "71": 0.18051,
+            "72": 0.18186,
+            "73": 0.18056,
+            "74": 0.17931,
+            "75": 0.17657,
+            "76": 0.17103,
+            "77": 0.16975,
+            "78": 0.17336,
+            "79": 0.17356,
+            "80": 0.17203,
+            "81": 0.17343,
+            "82": 0.17407,
+            "83": 0.17347,
+            "84": 0.17434,
+            "85": 0.17348,
+            "86": 0.17449,
+            "87": 0.17439,
+            "88": 0.17349,
+            "89": 0.17397,
+            "90": 0.17349,
+            "91": 0.17383,
+            "92": 0.17402,
+            "93": 0.17583,
+            "94": 0.17507,
+            "95": 0.17414,
+            "96": 0.17276,
+            "97": 0.17329,
+            "98": 0.17376,
+            "99": 0.17325,
+            "100": 0.17482
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json
index 80f6783f6f2..ab389cd452c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 518291968.0,
-            "2": 518291968.0,
-            "3": 518291968.0,
-            "4": 518291968.0,
-            "5": 518291968.0,
-            "6": 518291968.0,
-            "7": 518291968.0,
-            "8": 518291968.0,
-            "9": 518291968.0,
-            "10": 518291968.0,
-            "11": 518291968.0,
-            "12": 518291968.0,
-            "13": 518291968.0,
-            "14": 518291968.0,
-            "15": 518291968.0,
-            "16": 518291968.0,
-            "17": 518291968.0,
-            "18": 518291968.0,
-            "19": 518291968.0,
-            "20": 518291968.0,
-            "21": 518291968.0,
-            "22": 518291968.0,
-            "23": 518291968.0,
-            "24": 518291968.0,
-            "25": 518291968.0,
-            "26": 518291968.0,
-            "27": 518291968.0,
-            "28": 518291968.0,
-            "29": 518291968.0,
-            "30": 518291968.0,
-            "31": 518291968.0,
-            "32": 518291968.0,
-            "33": 518291968.0,
-            "34": 518291968.0,
-            "35": 518291968.0,
-            "36": 518291968.0,
-            "37": 518291968.0,
-            "38": 518291968.0,
-            "39": 518291968.0,
-            "40": 518291968.0,
-            "41": 518291968.0,
-            "42": 518291968.0,
-            "43": 518291968.0,
-            "44": 518291968.0,
-            "45": 518291968.0,
-            "46": 518291968.0,
-            "47": 518291968.0,
-            "48": 518291968.0,
-            "49": 518291968.0,
-            "50": 518291968.0,
-            "51": 518291968.0,
-            "52": 518291968.0,
-            "53": 518291968.0,
-            "54": 518291968.0,
-            "55": 518291968.0,
-            "56": 518291968.0,
-            "57": 518291968.0,
-            "58": 518291968.0,
-            "59": 518291968.0,
-            "60": 518291968.0,
-            "61": 518291968.0,
-            "62": 518291968.0,
-            "63": 518291968.0,
-            "64": 518291968.0,
-            "65": 518291968.0,
-            "66": 518291968.0,
-            "67": 518291968.0,
-            "68": 518291968.0,
-            "69": 518291968.0,
-            "70": 518291968.0,
-            "71": 518291968.0,
-            "72": 518291968.0,
-            "73": 518291968.0,
-            "74": 518291968.0,
-            "75": 518291968.0,
-            "76": 518291968.0,
-            "77": 518291968.0,
-            "78": 518291968.0,
-            "79": 518291968.0,
-            "80": 518291968.0,
-            "81": 518291968.0,
-            "82": 518291968.0,
-            "83": 518291968.0,
-            "84": 518291968.0,
-            "85": 518291968.0,
-            "86": 518291968.0,
-            "87": 518291968.0,
-            "88": 518291968.0,
-            "89": 518291968.0,
-            "90": 518291968.0,
-            "91": 518291968.0,
-            "92": 518291968.0,
-            "93": 518291968.0,
-            "94": 518291968.0,
-            "95": 518291968.0,
-            "96": 518291968.0,
-            "97": 518291968.0,
-            "98": 518291968.0,
-            "99": 518291968.0,
-            "100": 518291968.0
+            "1": 516456960.0,
+            "2": 516456960.0,
+            "3": 516456960.0,
+            "4": 516456960.0,
+            "5": 516456960.0,
+            "6": 516456960.0,
+            "7": 516456960.0,
+            "8": 516456960.0,
+            "9": 516456960.0,
+            "10": 516456960.0,
+            "11": 516456960.0,
+            "12": 516456960.0,
+            "13": 516456960.0,
+            "14": 516456960.0,
+            "15": 516456960.0,
+            "16": 516456960.0,
+            "17": 516456960.0,
+            "18": 516456960.0,
+            "19": 516456960.0,
+            "20": 516456960.0,
+            "21": 516456960.0,
+            "22": 516456960.0,
+            "23": 516456960.0,
+            "24": 516456960.0,
+            "25": 516456960.0,
+            "26": 516456960.0,
+            "27": 516456960.0,
+            "28": 516456960.0,
+            "29": 516456960.0,
+            "30": 516456960.0,
+            "31": 516456960.0,
+            "32": 516456960.0,
+            "33": 516456960.0,
+            "34": 516456960.0,
+            "35": 516456960.0,
+            "36": 516456960.0,
+            "37": 516456960.0,
+            "38": 516456960.0,
+            "39": 516456960.0,
+            "40": 516456960.0,
+            "41": 516456960.0,
+            "42": 516456960.0,
+            "43": 516456960.0,
+            "44": 516456960.0,
+            "45": 516456960.0,
+            "46": 516456960.0,
+            "47": 516456960.0,
+            "48": 516456960.0,
+            "49": 516456960.0,
+            "50": 516456960.0,
+            "51": 516456960.0,
+            "52": 516456960.0,
+            "53": 516456960.0,
+            "54": 516456960.0,
+            "55": 516456960.0,
+            "56": 516456960.0,
+            "57": 516456960.0,
+            "58": 516456960.0,
+            "59": 516456960.0,
+            "60": 516456960.0,
+            "61": 516456960.0,
+            "62": 516456960.0,
+            "63": 516456960.0,
+            "64": 516456960.0,
+            "65": 516456960.0,
+            "66": 516456960.0,
+            "67": 516456960.0,
+            "68": 516456960.0,
+            "69": 516456960.0,
+            "70": 516456960.0,
+            "71": 516456960.0,
+            "72": 516456960.0,
+            "73": 516456960.0,
+            "74": 516456960.0,
+            "75": 516456960.0,
+            "76": 516456960.0,
+            "77": 516456960.0,
+            "78": 516456960.0,
+            "79": 516456960.0,
+            "80": 516456960.0,
+            "81": 516456960.0,
+            "82": 516456960.0,
+            "83": 516456960.0,
+            "84": 516456960.0,
+            "85": 516456960.0,
+            "86": 516456960.0,
+            "87": 516456960.0,
+            "88": 516456960.0,
+            "89": 516456960.0,
+            "90": 516456960.0,
+            "91": 516456960.0,
+            "92": 516456960.0,
+            "93": 516456960.0,
+            "94": 516456960.0,
+            "95": 516456960.0,
+            "96": 516456960.0,
+            "97": 516456960.0,
+            "98": 516456960.0,
+            "99": 516456960.0,
+            "100": 516456960.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1245476352.0,
-            "2": 1429481984.0,
-            "3": 1429481984.0,
-            "4": 1429481984.0,
-            "5": 1429481984.0,
-            "6": 1429481984.0,
-            "7": 1429481984.0,
-            "8": 1429481984.0,
-            "9": 1429481984.0,
-            "10": 1429481984.0,
-            "11": 1429481984.0,
-            "12": 1429481984.0,
-            "13": 1429481984.0,
-            "14": 1429481984.0,
-            "15": 1429481984.0,
-            "16": 1429481984.0,
-            "17": 1429481984.0,
-            "18": 1429481984.0,
-            "19": 1429481984.0,
-            "20": 1429481984.0,
-            "21": 1429481984.0,
-            "22": 1429481984.0,
-            "23": 1429481984.0,
-            "24": 1429481984.0,
-            "25": 1429481984.0,
-            "26": 1429481984.0,
-            "27": 1429481984.0,
-            "28": 1429481984.0,
-            "29": 1429481984.0,
-            "30": 1429481984.0,
-            "31": 1429481984.0,
-            "32": 1429481984.0,
-            "33": 1429481984.0,
-            "34": 1429481984.0,
-            "35": 1429481984.0,
-            "36": 1429481984.0,
-            "37": 1429481984.0,
-            "38": 1429481984.0,
-            "39": 1429481984.0,
-            "40": 1429481984.0,
-            "41": 1429481984.0,
-            "42": 1429481984.0,
-            "43": 1429481984.0,
-            "44": 1429481984.0,
-            "45": 1429481984.0,
-            "46": 1429481984.0,
-            "47": 1429481984.0,
-            "48": 1429481984.0,
-            "49": 1429481984.0,
-            "50": 1429481984.0,
-            "51": 1429481984.0,
-            "52": 1429481984.0,
-            "53": 1429481984.0,
-            "54": 1429481984.0,
-            "55": 1429481984.0,
-            "56": 1429481984.0,
-            "57": 1429481984.0,
-            "58": 1429481984.0,
-            "59": 1429481984.0,
-            "60": 1429481984.0,
-            "61": 1429481984.0,
-            "62": 1429481984.0,
-            "63": 1429481984.0,
-            "64": 1429481984.0,
-            "65": 1429481984.0,
-            "66": 1429481984.0,
-            "67": 1429481984.0,
-            "68": 1429481984.0,
-            "69": 1429481984.0,
-            "70": 1429481984.0,
-            "71": 1429481984.0,
-            "72": 1429481984.0,
-            "73": 1429481984.0,
-            "74": 1429481984.0,
-            "75": 1429481984.0,
-            "76": 1429481984.0,
-            "77": 1429481984.0,
-            "78": 1429481984.0,
-            "79": 1429481984.0,
-            "80": 1429481984.0,
-            "81": 1429481984.0,
-            "82": 1429481984.0,
-            "83": 1429481984.0,
-            "84": 1429481984.0,
-            "85": 1429481984.0,
-            "86": 1429481984.0,
-            "87": 1429481984.0,
-            "88": 1429481984.0,
-            "89": 1429481984.0,
-            "90": 1429481984.0,
-            "91": 1429481984.0,
-            "92": 1429481984.0,
-            "93": 1429481984.0,
-            "94": 1429481984.0,
-            "95": 1429481984.0,
-            "96": 1429481984.0,
-            "97": 1429481984.0,
-            "98": 1429481984.0,
-            "99": 1429481984.0,
-            "100": 1429481984.0
+            "1": 1246525952.0,
+            "2": 1426598400.0,
+            "3": 1426598400.0,
+            "4": 1426598400.0,
+            "5": 1426598400.0,
+            "6": 1426598400.0,
+            "7": 1426598400.0,
+            "8": 1426598400.0,
+            "9": 1426598400.0,
+            "10": 1426598400.0,
+            "11": 1426598400.0,
+            "12": 1426598400.0,
+            "13": 1426598400.0,
+            "14": 1426598400.0,
+            "15": 1426598400.0,
+            "16": 1426598400.0,
+            "17": 1426598400.0,
+            "18": 1426598400.0,
+            "19": 1426598400.0,
+            "20": 1426598400.0,
+            "21": 1426598400.0,
+            "22": 1426598400.0,
+            "23": 1426598400.0,
+            "24": 1426598400.0,
+            "25": 1426598400.0,
+            "26": 1426598400.0,
+            "27": 1426598400.0,
+            "28": 1426598400.0,
+            "29": 1426598400.0,
+            "30": 1426598400.0,
+            "31": 1426598400.0,
+            "32": 1426598400.0,
+            "33": 1426598400.0,
+            "34": 1426598400.0,
+            "35": 1426598400.0,
+            "36": 1426598400.0,
+            "37": 1426598400.0,
+            "38": 1426598400.0,
+            "39": 1426598400.0,
+            "40": 1426598400.0,
+            "41": 1426598400.0,
+            "42": 1426598400.0,
+            "43": 1426598400.0,
+            "44": 1426598400.0,
+            "45": 1426598400.0,
+            "46": 1426598400.0,
+            "47": 1426598400.0,
+            "48": 1426598400.0,
+            "49": 1426598400.0,
+            "50": 1426598400.0,
+            "51": 1426598400.0,
+            "52": 1426598400.0,
+            "53": 1426598400.0,
+            "54": 1426598400.0,
+            "55": 1426598400.0,
+            "56": 1426598400.0,
+            "57": 1426598400.0,
+            "58": 1426598400.0,
+            "59": 1426598400.0,
+            "60": 1426598400.0,
+            "61": 1426598400.0,
+            "62": 1426598400.0,
+            "63": 1426598400.0,
+            "64": 1426598400.0,
+            "65": 1426598400.0,
+            "66": 1426598400.0,
+            "67": 1426598400.0,
+            "68": 1426598400.0,
+            "69": 1426598400.0,
+            "70": 1426598400.0,
+            "71": 1426598400.0,
+            "72": 1426598400.0,
+            "73": 1426598400.0,
+            "74": 1426598400.0,
+            "75": 1426598400.0,
+            "76": 1426598400.0,
+            "77": 1426598400.0,
+            "78": 1426598400.0,
+            "79": 1426598400.0,
+            "80": 1426598400.0,
+            "81": 1426598400.0,
+            "82": 1426598400.0,
+            "83": 1426598400.0,
+            "84": 1426598400.0,
+            "85": 1426598400.0,
+            "86": 1426598400.0,
+            "87": 1426598400.0,
+            "88": 1426598400.0,
+            "89": 1426598400.0,
+            "90": 1426598400.0,
+            "91": 1426598400.0,
+            "92": 1426598400.0,
+            "93": 1426598400.0,
+            "94": 1426598400.0,
+            "95": 1426598400.0,
+            "96": 1426598400.0,
+            "97": 1426598400.0,
+            "98": 1426598400.0,
+            "99": 1426598400.0,
+            "100": 1426598400.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.65353,
-            "2": 0.15729,
-            "3": 0.13911,
-            "4": 0.14117,
-            "5": 0.14172,
-            "6": 0.14091,
-            "7": 0.14103,
-            "8": 0.14008,
-            "9": 0.14444,
-            "10": 0.14215,
-            "11": 0.143,
-            "12": 0.14395,
-            "13": 0.14101,
-            "14": 0.14112,
-            "15": 0.14126,
-            "16": 0.14286,
-            "17": 0.14201,
-            "18": 0.14405,
-            "19": 0.14472,
-            "20": 0.14424,
-            "21": 0.14746,
-            "22": 0.14732,
-            "23": 0.14871,
-            "24": 0.14885,
-            "25": 0.14732,
-            "26": 0.14775,
-            "27": 0.14978,
-            "28": 0.14685,
-            "29": 0.15004,
-            "30": 0.14663,
-            "31": 0.14925,
-            "32": 0.14679,
-            "33": 0.14465,
-            "34": 0.14701,
-            "35": 0.14556,
-            "36": 0.14835,
-            "37": 0.14562,
-            "38": 0.14971,
-            "39": 0.14881,
-            "40": 0.14688,
-            "41": 0.14373,
-            "42": 0.14577,
-            "43": 0.14595,
-            "44": 0.1465,
-            "45": 0.14283,
-            "46": 0.14194,
-            "47": 0.14334,
-            "48": 0.14235,
-            "49": 0.14347,
-            "50": 0.14228,
-            "51": 0.14946,
-            "52": 0.14427,
-            "53": 0.14469,
-            "54": 0.14466,
-            "55": 0.14197,
-            "56": 0.14396,
-            "57": 0.14283,
-            "58": 0.14383,
-            "59": 0.14201,
-            "60": 0.14448,
-            "61": 0.14593,
-            "62": 0.14316,
-            "63": 0.14235,
-            "64": 0.14447,
-            "65": 0.14383,
-            "66": 0.14456,
-            "67": 0.14508,
-            "68": 0.1452,
-            "69": 0.14518,
-            "70": 0.1449,
-            "71": 0.14576,
-            "72": 0.14328,
-            "73": 0.14352,
-            "74": 0.1504,
-            "75": 0.15058,
-            "76": 0.14825,
-            "77": 0.14229,
-            "78": 0.14494,
-            "79": 0.14518,
-            "80": 0.14464,
-            "81": 0.1461,
-            "82": 0.14482,
-            "83": 0.14487,
-            "84": 0.14272,
-            "85": 0.14154,
-            "86": 0.14252,
-            "87": 0.1447,
-            "88": 0.14327,
-            "89": 0.1441,
-            "90": 0.14688,
-            "91": 0.14346,
-            "92": 0.14427,
-            "93": 0.14222,
-            "94": 0.14464,
-            "95": 0.14507,
-            "96": 0.14196,
-            "97": 0.1438,
-            "98": 0.14103,
-            "99": 0.14644,
-            "100": 0.14474
+            "1": 8.55796,
+            "2": 0.16015,
+            "3": 0.14079,
+            "4": 0.11738,
+            "5": 0.12195,
+            "6": 0.12441,
+            "7": 0.1172,
+            "8": 0.11692,
+            "9": 0.11919,
+            "10": 0.12076,
+            "11": 0.12158,
+            "12": 0.12094,
+            "13": 0.11812,
+            "14": 0.11938,
+            "15": 0.1172,
+            "16": 0.11613,
+            "17": 0.11557,
+            "18": 0.11401,
+            "19": 0.11498,
+            "20": 0.11349,
+            "21": 0.11351,
+            "22": 0.11386,
+            "23": 0.11441,
+            "24": 0.11363,
+            "25": 0.1167,
+            "26": 0.1134,
+            "27": 0.11514,
+            "28": 0.12945,
+            "29": 0.12623,
+            "30": 0.11515,
+            "31": 0.11213,
+            "32": 0.11356,
+            "33": 0.11231,
+            "34": 0.11288,
+            "35": 0.11401,
+            "36": 0.11375,
+            "37": 0.1131,
+            "38": 0.11218,
+            "39": 0.11367,
+            "40": 0.11358,
+            "41": 0.11254,
+            "42": 0.11336,
+            "43": 0.11318,
+            "44": 0.11297,
+            "45": 0.11264,
+            "46": 0.11205,
+            "47": 0.11364,
+            "48": 0.11191,
+            "49": 0.11164,
+            "50": 0.11224,
+            "51": 0.12452,
+            "52": 0.11481,
+            "53": 0.11411,
+            "54": 0.11453,
+            "55": 0.11486,
+            "56": 0.1126,
+            "57": 0.11285,
+            "58": 0.11369,
+            "59": 0.11438,
+            "60": 0.11423,
+            "61": 0.11347,
+            "62": 0.1144,
+            "63": 0.11359,
+            "64": 0.11501,
+            "65": 0.11372,
+            "66": 0.11274,
+            "67": 0.11362,
+            "68": 0.11321,
+            "69": 0.11196,
+            "70": 0.11191,
+            "71": 0.11138,
+            "72": 0.11254,
+            "73": 0.11635,
+            "74": 0.11349,
+            "75": 0.11272,
+            "76": 0.1135,
+            "77": 0.11299,
+            "78": 0.11411,
+            "79": 0.11258,
+            "80": 0.113,
+            "81": 0.11306,
+            "82": 0.11448,
+            "83": 0.11412,
+            "84": 0.11261,
+            "85": 0.11298,
+            "86": 0.11478,
+            "87": 0.1143,
+            "88": 0.11208,
+            "89": 0.11453,
+            "90": 0.11257,
+            "91": 0.11387,
+            "92": 0.11269,
+            "93": 0.1133,
+            "94": 0.11392,
+            "95": 0.11421,
+            "96": 0.1138,
+            "97": 0.11394,
+            "98": 0.1141,
+            "99": 0.1139,
+            "100": 0.11305
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..262e81423cd
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85583,
+            "52": 9.75242,
+            "53": 10.07589,
+            "54": 9.95688,
+            "55": 9.88208,
+            "56": 9.63141,
+            "57": 9.48651,
+            "58": 9.83118,
+            "59": 9.58905,
+            "60": 9.50651,
+            "61": 9.7037,
+            "62": 9.98291,
+            "63": 9.38315,
+            "64": 9.77906,
+            "65": 8.95179,
+            "66": 9.7016,
+            "67": 9.37206,
+            "68": 9.78852,
+            "69": 9.79859,
+            "70": 9.74746,
+            "71": 9.6191,
+            "72": 9.58502,
+            "73": 9.49725,
+            "74": 8.93933,
+            "75": 9.42706,
+            "76": 9.08024,
+            "77": 10.06571,
+            "78": 9.72896,
+            "79": 9.37772,
+            "80": 9.40999,
+            "81": 9.47983,
+            "82": 9.70184,
+            "83": 9.30625,
+            "84": 9.42095,
+            "85": 9.61378,
+            "86": 9.07656,
+            "87": 9.59458,
+            "88": 9.75068,
+            "89": 9.60243,
+            "90": 9.81901,
+            "91": 9.33899,
+            "92": 9.35717,
+            "93": 9.07883,
+            "94": 8.8351,
+            "95": 9.52171,
+            "96": 9.53008,
+            "97": 9.31309,
+            "98": 9.67785,
+            "99": 8.89061,
+            "100": 9.39726
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2528.0,
+            "52": 2535.0,
+            "53": 2875.0,
+            "54": 2862.0,
+            "55": 2406.0,
+            "56": 2733.0,
+            "57": 2347.0,
+            "58": 2918.0,
+            "59": 2759.0,
+            "60": 2404.0,
+            "61": 3022.0,
+            "62": 2494.0,
+            "63": 2452.0,
+            "64": 2838.0,
+            "65": 2549.0,
+            "66": 3044.0,
+            "67": 2887.0,
+            "68": 2637.0,
+            "69": 2860.0,
+            "70": 3034.0,
+            "71": 2989.0,
+            "72": 2355.0,
+            "73": 3034.0,
+            "74": 1904.0,
+            "75": 2538.0,
+            "76": 3012.0,
+            "77": 3193.0,
+            "78": 2994.0,
+            "79": 3097.0,
+            "80": 3254.0,
+            "81": 3671.0,
+            "82": 3299.0,
+            "83": 2793.0,
+            "84": 3146.0,
+            "85": 3329.0,
+            "86": 2769.0,
+            "87": 3766.0,
+            "88": 3021.0,
+            "89": 3286.0,
+            "90": 3029.0,
+            "91": 2772.0,
+            "92": 2955.0,
+            "93": 2852.0,
+            "94": 3411.0,
+            "95": 3271.0,
+            "96": 3279.0,
+            "97": 3054.0,
+            "98": 3643.0,
+            "99": 3303.0,
+            "100": 3142.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 696530432.0,
+            "52": 696530432.0,
+            "53": 696530432.0,
+            "54": 696530432.0,
+            "55": 696530432.0,
+            "56": 696530432.0,
+            "57": 696530432.0,
+            "58": 696530432.0,
+            "59": 696530432.0,
+            "60": 696530432.0,
+            "61": 696530432.0,
+            "62": 696530432.0,
+            "63": 696530432.0,
+            "64": 696530432.0,
+            "65": 696530432.0,
+            "66": 696530432.0,
+            "67": 696530432.0,
+            "68": 696530432.0,
+            "69": 696530432.0,
+            "70": 696530432.0,
+            "71": 696530432.0,
+            "72": 696530432.0,
+            "73": 696530432.0,
+            "74": 696530432.0,
+            "75": 696530432.0,
+            "76": 696530432.0,
+            "77": 696530432.0,
+            "78": 696530432.0,
+            "79": 696530432.0,
+            "80": 696530432.0,
+            "81": 696530432.0,
+            "82": 696530432.0,
+            "83": 696530432.0,
+            "84": 696530432.0,
+            "85": 696530432.0,
+            "86": 696530432.0,
+            "87": 696530432.0,
+            "88": 696530432.0,
+            "89": 696530432.0,
+            "90": 696530432.0,
+            "91": 696530432.0,
+            "92": 696530432.0,
+            "93": 696530432.0,
+            "94": 696530432.0,
+            "95": 696530432.0,
+            "96": 696530432.0,
+            "97": 696530432.0,
+            "98": 696530432.0,
+            "99": 696530432.0,
+            "100": 696530432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1606670848.0,
+            "52": 1606671872.0,
+            "53": 1606671872.0,
+            "54": 1606671872.0,
+            "55": 1606671872.0,
+            "56": 1606671872.0,
+            "57": 1606671872.0,
+            "58": 1606671872.0,
+            "59": 1606671872.0,
+            "60": 1606671872.0,
+            "61": 1606671872.0,
+            "62": 1606671872.0,
+            "63": 1606671872.0,
+            "64": 1606671872.0,
+            "65": 1606671872.0,
+            "66": 1606671872.0,
+            "67": 1606671872.0,
+            "68": 1606671872.0,
+            "69": 1606671872.0,
+            "70": 1606671872.0,
+            "71": 1606671872.0,
+            "72": 1606671872.0,
+            "73": 1606671872.0,
+            "74": 1606671872.0,
+            "75": 1606671872.0,
+            "76": 1606671872.0,
+            "77": 1606671872.0,
+            "78": 1606671872.0,
+            "79": 1606671872.0,
+            "80": 1606671872.0,
+            "81": 1606671872.0,
+            "82": 1606671872.0,
+            "83": 1606671872.0,
+            "84": 1606671872.0,
+            "85": 1606671872.0,
+            "86": 1606671872.0,
+            "87": 1606671872.0,
+            "88": 1606671872.0,
+            "89": 1606671872.0,
+            "90": 1606671872.0,
+            "91": 1606671872.0,
+            "92": 1606671872.0,
+            "93": 1606671872.0,
+            "94": 1606671872.0,
+            "95": 1606671872.0,
+            "96": 1606671872.0,
+            "97": 1606671872.0,
+            "98": 1606671872.0,
+            "99": 1606671872.0,
+            "100": 1606671872.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.21698,
+            "52": 0.15014,
+            "53": 0.12142,
+            "54": 0.12079,
+            "55": 0.12087,
+            "56": 0.11996,
+            "57": 0.12048,
+            "58": 0.12044,
+            "59": 0.12,
+            "60": 0.12024,
+            "61": 0.11964,
+            "62": 0.1216,
+            "63": 0.12133,
+            "64": 0.12065,
+            "65": 0.11968,
+            "66": 0.12123,
+            "67": 0.11973,
+            "68": 0.11993,
+            "69": 0.12002,
+            "70": 0.12021,
+            "71": 0.11952,
+            "72": 0.12017,
+            "73": 0.1196,
+            "74": 0.11995,
+            "75": 0.12119,
+            "76": 0.12147,
+            "77": 0.12101,
+            "78": 0.12058,
+            "79": 0.12234,
+            "80": 0.12023,
+            "81": 0.12099,
+            "82": 0.12135,
+            "83": 0.11794,
+            "84": 0.11366,
+            "85": 0.11362,
+            "86": 0.11298,
+            "87": 0.11323,
+            "88": 0.11437,
+            "89": 0.11389,
+            "90": 0.11505,
+            "91": 0.11411,
+            "92": 0.11424,
+            "93": 0.11409,
+            "94": 0.11311,
+            "95": 0.11421,
+            "96": 0.11364,
+            "97": 0.11399,
+            "98": 0.11382,
+            "99": 0.1137,
+            "100": 0.11717
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json
index e88f2c340d5..3874b80ddea 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json
@@ -2,141 +2,536 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.92655,
-            "5": 10.92719,
-            "10": 10.90792,
-            "15": 10.88292,
-            "20": 10.77597,
-            "25": 10.59263,
-            "30": 10.39174,
-            "35": 10.29698,
-            "40": 10.09664,
-            "45": 9.8447,
-            "50": 9.90944,
-            "55": 9.8777,
-            "60": 9.49123,
-            "65": 8.94255,
-            "70": 9.72279,
-            "75": 9.4189,
-            "80": 9.40055,
-            "85": 9.61189,
-            "90": 9.81027,
-            "95": 9.51723,
+            "1": 10.92228,
+            "2": 10.92833,
+            "3": 10.91713,
+            "4": 10.90495,
+            "5": 10.92808,
+            "6": 10.93674,
+            "7": 10.90402,
+            "8": 10.92227,
+            "9": 10.91254,
+            "10": 10.9085,
+            "11": 10.89337,
+            "12": 10.92084,
+            "13": 10.91494,
+            "14": 10.92149,
+            "15": 10.88433,
+            "16": 10.87456,
+            "17": 10.83921,
+            "18": 10.87308,
+            "19": 10.85328,
+            "20": 10.77491,
+            "21": 10.74755,
+            "22": 10.63144,
+            "23": 10.75622,
+            "24": 10.65564,
+            "25": 10.59217,
+            "26": 10.65329,
+            "27": 10.64878,
+            "28": 10.59653,
+            "29": 10.61014,
+            "30": 10.39286,
+            "31": 10.15722,
+            "32": 10.49224,
+            "33": 10.47942,
+            "34": 10.24013,
+            "35": 10.29715,
+            "36": 10.24564,
+            "37": 10.35285,
+            "38": 10.20534,
+            "39": 10.40417,
+            "40": 10.09551,
+            "41": 10.15275,
+            "42": 10.21879,
+            "43": 9.85523,
+            "44": 9.96245,
+            "45": 9.84616,
+            "46": 9.83799,
+            "47": 10.13884,
+            "48": 9.85698,
+            "49": 9.5375,
+            "50": 9.90879,
+            "51": 9.84975,
+            "52": 9.74159,
+            "53": 10.06327,
+            "54": 9.9459,
+            "55": 9.87743,
+            "56": 9.62749,
+            "57": 9.47268,
+            "58": 9.82918,
+            "59": 9.58307,
+            "60": 9.49187,
+            "61": 9.69959,
+            "62": 9.98095,
+            "63": 9.37226,
+            "64": 9.77561,
+            "65": 8.94344,
+            "66": 9.69994,
+            "67": 9.3642,
+            "68": 9.78704,
+            "69": 9.78396,
+            "70": 9.72293,
+            "71": 9.60744,
+            "72": 9.58422,
+            "73": 9.49093,
+            "74": 8.94876,
+            "75": 9.41814,
+            "76": 9.08731,
+            "77": 10.06286,
+            "78": 9.72902,
+            "79": 9.37093,
+            "80": 9.40038,
+            "81": 9.47763,
+            "82": 9.69129,
+            "83": 9.30768,
+            "84": 9.41257,
+            "85": 9.61139,
+            "86": 9.07621,
+            "87": 9.59461,
+            "88": 9.74776,
+            "89": 9.60681,
+            "90": 9.81085,
+            "91": 9.34453,
+            "92": 9.36537,
+            "93": 9.07751,
+            "94": 8.82977,
+            "95": 9.5168,
+            "96": 9.52549,
+            "97": 9.31038,
+            "98": 9.67816,
+            "99": 8.8885,
             "100": 9.40135
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1597.0,
-            "5": 1937.0,
-            "10": 1408.0,
-            "15": 1893.0,
-            "20": 1612.0,
-            "25": 1633.0,
-            "30": 1941.0,
-            "35": 2005.0,
-            "40": 2164.0,
-            "45": 2053.0,
-            "50": 2437.0,
-            "55": 2409.0,
-            "60": 2419.0,
-            "65": 2713.0,
-            "70": 3098.0,
-            "75": 2685.0,
-            "80": 3562.0,
-            "85": 3262.0,
-            "90": 3059.0,
-            "95": 3380.0,
-            "100": 3331.0
+            "1": 1686.0,
+            "2": 1781.0,
+            "3": 1710.0,
+            "4": 1721.0,
+            "5": 1915.0,
+            "6": 1840.0,
+            "7": 1923.0,
+            "8": 1740.0,
+            "9": 1904.0,
+            "10": 1501.0,
+            "11": 1902.0,
+            "12": 1815.0,
+            "13": 1919.0,
+            "14": 1911.0,
+            "15": 1953.0,
+            "16": 1875.0,
+            "17": 1835.0,
+            "18": 1725.0,
+            "19": 1755.0,
+            "20": 1680.0,
+            "21": 1823.0,
+            "22": 1751.0,
+            "23": 1966.0,
+            "24": 1652.0,
+            "25": 1619.0,
+            "26": 1847.0,
+            "27": 1890.0,
+            "28": 1990.0,
+            "29": 2013.0,
+            "30": 1924.0,
+            "31": 1602.0,
+            "32": 1911.0,
+            "33": 2246.0,
+            "34": 1989.0,
+            "35": 2000.0,
+            "36": 2116.0,
+            "37": 2402.0,
+            "38": 2298.0,
+            "39": 2567.0,
+            "40": 2163.0,
+            "41": 2333.0,
+            "42": 2300.0,
+            "43": 1996.0,
+            "44": 2153.0,
+            "45": 2130.0,
+            "46": 2301.0,
+            "47": 2552.0,
+            "48": 2428.0,
+            "49": 2290.0,
+            "50": 2566.0,
+            "51": 2688.0,
+            "52": 2651.0,
+            "53": 2961.0,
+            "54": 2714.0,
+            "55": 2381.0,
+            "56": 2747.0,
+            "57": 2435.0,
+            "58": 2979.0,
+            "59": 2834.0,
+            "60": 2440.0,
+            "61": 2844.0,
+            "62": 2761.0,
+            "63": 2449.0,
+            "64": 3041.0,
+            "65": 2711.0,
+            "66": 3212.0,
+            "67": 2724.0,
+            "68": 2866.0,
+            "69": 2992.0,
+            "70": 3273.0,
+            "71": 3119.0,
+            "72": 2480.0,
+            "73": 3140.0,
+            "74": 1959.0,
+            "75": 2732.0,
+            "76": 3088.0,
+            "77": 3496.0,
+            "78": 3193.0,
+            "79": 3370.0,
+            "80": 3523.0,
+            "81": 3655.0,
+            "82": 3409.0,
+            "83": 2797.0,
+            "84": 3476.0,
+            "85": 3443.0,
+            "86": 2736.0,
+            "87": 3762.0,
+            "88": 3082.0,
+            "89": 3460.0,
+            "90": 2999.0,
+            "91": 2667.0,
+            "92": 3190.0,
+            "93": 2704.0,
+            "94": 3348.0,
+            "95": 3464.0,
+            "96": 3616.0,
+            "97": 3124.0,
+            "98": 3688.0,
+            "99": 3176.0,
+            "100": 3301.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 435847168.0,
-            "5": 435847168.0,
-            "10": 436895744.0,
-            "15": 435847168.0,
-            "20": 435847168.0,
-            "25": 436895744.0,
-            "30": 436895744.0,
-            "35": 435847168.0,
-            "40": 435847168.0,
-            "45": 435847168.0,
-            "50": 435847168.0,
-            "55": 436895744.0,
-            "60": 436895744.0,
-            "65": 436895744.0,
-            "70": 435847168.0,
-            "75": 435847168.0,
-            "80": 436895744.0,
-            "85": 436895744.0,
-            "90": 436895744.0,
-            "95": 435847168.0,
-            "100": 436895744.0
+            "1": 436765184.0,
+            "2": 436765184.0,
+            "3": 436765184.0,
+            "4": 436765184.0,
+            "5": 436765184.0,
+            "6": 436765184.0,
+            "7": 436765184.0,
+            "8": 436765184.0,
+            "9": 436765184.0,
+            "10": 436765184.0,
+            "11": 436765184.0,
+            "12": 436765184.0,
+            "13": 436765184.0,
+            "14": 436765184.0,
+            "15": 436765184.0,
+            "16": 436765184.0,
+            "17": 436765184.0,
+            "18": 436765184.0,
+            "19": 436765184.0,
+            "20": 436765184.0,
+            "21": 436765184.0,
+            "22": 436765184.0,
+            "23": 436765184.0,
+            "24": 436765184.0,
+            "25": 436765184.0,
+            "26": 436765184.0,
+            "27": 436765184.0,
+            "28": 436765184.0,
+            "29": 436765184.0,
+            "30": 436765184.0,
+            "31": 436765184.0,
+            "32": 436765184.0,
+            "33": 436765184.0,
+            "34": 436765184.0,
+            "35": 436765184.0,
+            "36": 436765184.0,
+            "37": 436765184.0,
+            "38": 436765184.0,
+            "39": 436765184.0,
+            "40": 436765184.0,
+            "41": 436765184.0,
+            "42": 436765184.0,
+            "43": 436765184.0,
+            "44": 436765184.0,
+            "45": 436765184.0,
+            "46": 436765184.0,
+            "47": 436765184.0,
+            "48": 436765184.0,
+            "49": 436765184.0,
+            "50": 436765184.0,
+            "51": 436765184.0,
+            "52": 436765184.0,
+            "53": 436765184.0,
+            "54": 436765184.0,
+            "55": 436765184.0,
+            "56": 436765184.0,
+            "57": 436765184.0,
+            "58": 436765184.0,
+            "59": 436765184.0,
+            "60": 436765184.0,
+            "61": 436765184.0,
+            "62": 436765184.0,
+            "63": 436765184.0,
+            "64": 436765184.0,
+            "65": 436765184.0,
+            "66": 436765184.0,
+            "67": 436765184.0,
+            "68": 436765184.0,
+            "69": 436765184.0,
+            "70": 436765184.0,
+            "71": 436765184.0,
+            "72": 436765184.0,
+            "73": 436765184.0,
+            "74": 436765184.0,
+            "75": 436765184.0,
+            "76": 436765184.0,
+            "77": 436765184.0,
+            "78": 436765184.0,
+            "79": 436765184.0,
+            "80": 436765184.0,
+            "81": 436765184.0,
+            "82": 436765184.0,
+            "83": 436765184.0,
+            "84": 436765184.0,
+            "85": 436765184.0,
+            "86": 436765184.0,
+            "87": 436765184.0,
+            "88": 436765184.0,
+            "89": 436765184.0,
+            "90": 436765184.0,
+            "91": 436765184.0,
+            "92": 436765184.0,
+            "93": 436765184.0,
+            "94": 436765184.0,
+            "95": 436765184.0,
+            "96": 436765184.0,
+            "97": 436765184.0,
+            "98": 436765184.0,
+            "99": 436765184.0,
+            "100": 436765184.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1179683840.0,
-            "5": 1359626240.0,
-            "10": 1359626240.0,
-            "15": 1359626240.0,
-            "20": 1359626240.0,
-            "25": 1359626240.0,
-            "30": 1359626240.0,
-            "35": 1359626240.0,
-            "40": 1359626240.0,
-            "45": 1359626240.0,
-            "50": 1359626240.0,
-            "55": 1359626240.0,
-            "60": 1359626240.0,
-            "65": 1359626240.0,
-            "70": 1359626240.0,
-            "75": 1359626240.0,
-            "80": 1359626240.0,
-            "85": 1359626240.0,
-            "90": 1359626240.0,
-            "95": 1359626240.0,
-            "100": 1359626240.0
+            "1": 1178629632.0,
+            "2": 1359489536.0,
+            "3": 1359489536.0,
+            "4": 1359489536.0,
+            "5": 1359489536.0,
+            "6": 1359489536.0,
+            "7": 1359489536.0,
+            "8": 1359489536.0,
+            "9": 1359489536.0,
+            "10": 1359489536.0,
+            "11": 1359489536.0,
+            "12": 1359489536.0,
+            "13": 1359489536.0,
+            "14": 1359489536.0,
+            "15": 1359489536.0,
+            "16": 1359489536.0,
+            "17": 1359489536.0,
+            "18": 1359489536.0,
+            "19": 1359489536.0,
+            "20": 1359489536.0,
+            "21": 1359489536.0,
+            "22": 1359489536.0,
+            "23": 1359489536.0,
+            "24": 1359489536.0,
+            "25": 1359489536.0,
+            "26": 1359489536.0,
+            "27": 1359489536.0,
+            "28": 1359489536.0,
+            "29": 1359489536.0,
+            "30": 1359489536.0,
+            "31": 1359489536.0,
+            "32": 1359489536.0,
+            "33": 1359489536.0,
+            "34": 1359489536.0,
+            "35": 1359489536.0,
+            "36": 1359489536.0,
+            "37": 1359489536.0,
+            "38": 1359489536.0,
+            "39": 1359489536.0,
+            "40": 1359489536.0,
+            "41": 1359489536.0,
+            "42": 1359489536.0,
+            "43": 1359489536.0,
+            "44": 1359489536.0,
+            "45": 1359489536.0,
+            "46": 1359489536.0,
+            "47": 1359489536.0,
+            "48": 1359489536.0,
+            "49": 1359489536.0,
+            "50": 1359489536.0,
+            "51": 1359489536.0,
+            "52": 1359489536.0,
+            "53": 1359489536.0,
+            "54": 1359489536.0,
+            "55": 1359489536.0,
+            "56": 1359489536.0,
+            "57": 1359489536.0,
+            "58": 1359489536.0,
+            "59": 1359489536.0,
+            "60": 1359489536.0,
+            "61": 1359489536.0,
+            "62": 1359489536.0,
+            "63": 1359489536.0,
+            "64": 1359489536.0,
+            "65": 1359489536.0,
+            "66": 1359489536.0,
+            "67": 1359489536.0,
+            "68": 1359489536.0,
+            "69": 1359489536.0,
+            "70": 1359489536.0,
+            "71": 1359489536.0,
+            "72": 1359489536.0,
+            "73": 1359489536.0,
+            "74": 1359489536.0,
+            "75": 1359489536.0,
+            "76": 1359489536.0,
+            "77": 1359489536.0,
+            "78": 1359489536.0,
+            "79": 1359489536.0,
+            "80": 1359489536.0,
+            "81": 1359489536.0,
+            "82": 1359489536.0,
+            "83": 1359489536.0,
+            "84": 1359489536.0,
+            "85": 1359489536.0,
+            "86": 1359489536.0,
+            "87": 1359489536.0,
+            "88": 1359489536.0,
+            "89": 1359489536.0,
+            "90": 1359489536.0,
+            "91": 1359489536.0,
+            "92": 1359489536.0,
+            "93": 1359489536.0,
+            "94": 1359489536.0,
+            "95": 1359489536.0,
+            "96": 1359489536.0,
+            "97": 1359489536.0,
+            "98": 1359489536.0,
+            "99": 1359489536.0,
+            "100": 1359489536.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 11.0451,
-            "5": 0.18574,
-            "10": 0.18706,
-            "15": 0.18796,
-            "20": 0.18918,
-            "25": 0.19125,
-            "30": 0.19342,
-            "35": 0.18767,
-            "40": 0.18791,
-            "45": 0.18872,
-            "50": 0.18792,
-            "55": 0.19099,
-            "60": 0.19807,
-            "65": 0.19727,
-            "70": 0.1971,
-            "75": 0.19083,
-            "80": 0.1891,
-            "85": 0.19438,
-            "90": 0.19306,
-            "95": 0.18999,
-            "100": 0.1938
+            "1": 4.17595,
+            "2": 0.21653,
+            "3": 0.20393,
+            "4": 0.1777,
+            "5": 0.17559,
+            "6": 0.17527,
+            "7": 0.17404,
+            "8": 0.17527,
+            "9": 0.17461,
+            "10": 0.17454,
+            "11": 0.17381,
+            "12": 0.17386,
+            "13": 0.174,
+            "14": 0.17411,
+            "15": 0.17381,
+            "16": 0.17541,
+            "17": 0.17524,
+            "18": 0.17473,
+            "19": 0.17526,
+            "20": 0.17472,
+            "21": 0.17459,
+            "22": 0.17459,
+            "23": 0.17482,
+            "24": 0.17424,
+            "25": 0.17389,
+            "26": 0.17466,
+            "27": 0.17418,
+            "28": 0.17458,
+            "29": 0.17404,
+            "30": 0.17516,
+            "31": 0.17358,
+            "32": 0.17747,
+            "33": 0.17373,
+            "34": 0.17438,
+            "35": 0.17497,
+            "36": 0.17566,
+            "37": 0.17619,
+            "38": 0.17653,
+            "39": 0.1758,
+            "40": 0.17382,
+            "41": 0.17487,
+            "42": 0.17435,
+            "43": 0.17455,
+            "44": 0.17454,
+            "45": 0.17399,
+            "46": 0.17424,
+            "47": 0.17456,
+            "48": 0.1738,
+            "49": 0.17414,
+            "50": 0.17386,
+            "51": 0.18789,
+            "52": 0.17663,
+            "53": 0.17792,
+            "54": 0.17728,
+            "55": 0.17626,
+            "56": 0.17729,
+            "57": 0.17786,
+            "58": 0.17863,
+            "59": 0.18049,
+            "60": 0.1845,
+            "61": 0.1781,
+            "62": 0.1787,
+            "63": 0.17855,
+            "64": 0.17717,
+            "65": 0.1776,
+            "66": 0.17832,
+            "67": 0.18005,
+            "68": 0.17716,
+            "69": 0.17733,
+            "70": 0.17706,
+            "71": 0.17683,
+            "72": 0.17613,
+            "73": 0.17725,
+            "74": 0.17735,
+            "75": 0.17807,
+            "76": 0.1806,
+            "77": 0.17886,
+            "78": 0.17653,
+            "79": 0.17801,
+            "80": 0.1774,
+            "81": 0.17784,
+            "82": 0.17692,
+            "83": 0.17721,
+            "84": 0.17851,
+            "85": 0.17973,
+            "86": 0.17641,
+            "87": 0.17796,
+            "88": 0.1791,
+            "89": 0.1778,
+            "90": 0.17818,
+            "91": 0.17974,
+            "92": 0.18142,
+            "93": 0.18143,
+            "94": 0.18024,
+            "95": 0.17737,
+            "96": 0.17757,
+            "97": 0.17906,
+            "98": 0.18024,
+            "99": 0.17614,
+            "100": 0.17615
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..38fc27ca5d3
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.84975,
+            "52": 9.74157,
+            "53": 10.06328,
+            "54": 9.94585,
+            "55": 9.87742,
+            "56": 9.6275,
+            "57": 9.47269,
+            "58": 9.82916,
+            "59": 9.58304,
+            "60": 9.49186,
+            "61": 9.69958,
+            "62": 9.98093,
+            "63": 9.37224,
+            "64": 9.77563,
+            "65": 8.94344,
+            "66": 9.69995,
+            "67": 9.36421,
+            "68": 9.78707,
+            "69": 9.78397,
+            "70": 9.72291,
+            "71": 9.60744,
+            "72": 9.58421,
+            "73": 9.49098,
+            "74": 8.94877,
+            "75": 9.41814,
+            "76": 9.08732,
+            "77": 10.06287,
+            "78": 9.72903,
+            "79": 9.37093,
+            "80": 9.40035,
+            "81": 9.47763,
+            "82": 9.69127,
+            "83": 9.3077,
+            "84": 9.41261,
+            "85": 9.61135,
+            "86": 9.07622,
+            "87": 9.5946,
+            "88": 9.74773,
+            "89": 9.60683,
+            "90": 9.81083,
+            "91": 9.34451,
+            "92": 9.36535,
+            "93": 9.07752,
+            "94": 8.82979,
+            "95": 9.51678,
+            "96": 9.52548,
+            "97": 9.3104,
+            "98": 9.67816,
+            "99": 8.88853,
+            "100": 9.40134
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2611.0,
+            "52": 2567.0,
+            "53": 2899.0,
+            "54": 2696.0,
+            "55": 2388.0,
+            "56": 2904.0,
+            "57": 2463.0,
+            "58": 3004.0,
+            "59": 2743.0,
+            "60": 2481.0,
+            "61": 2882.0,
+            "62": 2640.0,
+            "63": 2329.0,
+            "64": 3053.0,
+            "65": 2698.0,
+            "66": 3171.0,
+            "67": 2762.0,
+            "68": 2852.0,
+            "69": 2993.0,
+            "70": 3111.0,
+            "71": 3118.0,
+            "72": 2477.0,
+            "73": 3073.0,
+            "74": 1987.0,
+            "75": 2626.0,
+            "76": 2906.0,
+            "77": 3416.0,
+            "78": 3291.0,
+            "79": 3330.0,
+            "80": 3538.0,
+            "81": 3684.0,
+            "82": 3450.0,
+            "83": 2796.0,
+            "84": 3313.0,
+            "85": 3417.0,
+            "86": 2750.0,
+            "87": 3783.0,
+            "88": 3067.0,
+            "89": 3523.0,
+            "90": 3036.0,
+            "91": 2662.0,
+            "92": 3172.0,
+            "93": 2638.0,
+            "94": 3365.0,
+            "95": 3463.0,
+            "96": 3698.0,
+            "97": 3041.0,
+            "98": 3808.0,
+            "99": 3231.0,
+            "100": 3373.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 437551616.0,
+            "52": 437551616.0,
+            "53": 437551616.0,
+            "54": 437551616.0,
+            "55": 437551616.0,
+            "56": 437551616.0,
+            "57": 437551616.0,
+            "58": 437551616.0,
+            "59": 437551616.0,
+            "60": 437551616.0,
+            "61": 437551616.0,
+            "62": 437551616.0,
+            "63": 437551616.0,
+            "64": 437551616.0,
+            "65": 437551616.0,
+            "66": 437551616.0,
+            "67": 437551616.0,
+            "68": 437551616.0,
+            "69": 437551616.0,
+            "70": 437551616.0,
+            "71": 437551616.0,
+            "72": 437551616.0,
+            "73": 437551616.0,
+            "74": 437551616.0,
+            "75": 437551616.0,
+            "76": 437551616.0,
+            "77": 437551616.0,
+            "78": 437551616.0,
+            "79": 437551616.0,
+            "80": 437551616.0,
+            "81": 437551616.0,
+            "82": 437551616.0,
+            "83": 437551616.0,
+            "84": 437551616.0,
+            "85": 437551616.0,
+            "86": 437551616.0,
+            "87": 437551616.0,
+            "88": 437551616.0,
+            "89": 437551616.0,
+            "90": 437551616.0,
+            "91": 437551616.0,
+            "92": 437551616.0,
+            "93": 437551616.0,
+            "94": 437551616.0,
+            "95": 437551616.0,
+            "96": 437551616.0,
+            "97": 437551616.0,
+            "98": 437551616.0,
+            "99": 437551616.0,
+            "100": 437551616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1361323520.0,
+            "52": 1361324544.0,
+            "53": 1361324544.0,
+            "54": 1361324544.0,
+            "55": 1361324544.0,
+            "56": 1361324544.0,
+            "57": 1361324544.0,
+            "58": 1361324544.0,
+            "59": 1361324544.0,
+            "60": 1361324544.0,
+            "61": 1361324544.0,
+            "62": 1361324544.0,
+            "63": 1361324544.0,
+            "64": 1361324544.0,
+            "65": 1361324544.0,
+            "66": 1361324544.0,
+            "67": 1361324544.0,
+            "68": 1361324544.0,
+            "69": 1361324544.0,
+            "70": 1361324544.0,
+            "71": 1361324544.0,
+            "72": 1361324544.0,
+            "73": 1361324544.0,
+            "74": 1361324544.0,
+            "75": 1361324544.0,
+            "76": 1361324544.0,
+            "77": 1361324544.0,
+            "78": 1361324544.0,
+            "79": 1361324544.0,
+            "80": 1361324544.0,
+            "81": 1361324544.0,
+            "82": 1361324544.0,
+            "83": 1361324544.0,
+            "84": 1361324544.0,
+            "85": 1361324544.0,
+            "86": 1361324544.0,
+            "87": 1361324544.0,
+            "88": 1361324544.0,
+            "89": 1361324544.0,
+            "90": 1361324544.0,
+            "91": 1361324544.0,
+            "92": 1361324544.0,
+            "93": 1361324544.0,
+            "94": 1361324544.0,
+            "95": 1361324544.0,
+            "96": 1361324544.0,
+            "97": 1361324544.0,
+            "98": 1361324544.0,
+            "99": 1361324544.0,
+            "100": 1361324544.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.70609,
+            "52": 0.21752,
+            "53": 0.18577,
+            "54": 0.18466,
+            "55": 0.18165,
+            "56": 0.18049,
+            "57": 0.18614,
+            "58": 0.18682,
+            "59": 0.18039,
+            "60": 0.18204,
+            "61": 0.18258,
+            "62": 0.18091,
+            "63": 0.18358,
+            "64": 0.18229,
+            "65": 0.18033,
+            "66": 0.17977,
+            "67": 0.17991,
+            "68": 0.18063,
+            "69": 0.17985,
+            "70": 0.1801,
+            "71": 0.17962,
+            "72": 0.17965,
+            "73": 0.18018,
+            "74": 0.17894,
+            "75": 0.17969,
+            "76": 0.17978,
+            "77": 0.18125,
+            "78": 0.18038,
+            "79": 0.18003,
+            "80": 0.18018,
+            "81": 0.17963,
+            "82": 0.18021,
+            "83": 0.17905,
+            "84": 0.1801,
+            "85": 0.1801,
+            "86": 0.18063,
+            "87": 0.18031,
+            "88": 0.17967,
+            "89": 0.18064,
+            "90": 0.17981,
+            "91": 0.18039,
+            "92": 0.18318,
+            "93": 0.18018,
+            "94": 0.18097,
+            "95": 0.18141,
+            "96": 0.17593,
+            "97": 0.17726,
+            "98": 0.17621,
+            "99": 0.17602,
+            "100": 0.17627
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..7dd5b31f34f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.87163,
+            "2": 10.87238,
+            "3": 10.86215,
+            "4": 10.84334,
+            "5": 10.8781,
+            "6": 10.8937,
+            "7": 10.87187,
+            "8": 10.87789,
+            "9": 10.86815,
+            "10": 10.83758,
+            "11": 10.87595,
+            "12": 10.87605,
+            "13": 10.89163,
+            "14": 10.89707,
+            "15": 10.83373,
+            "16": 10.82462,
+            "17": 10.80227,
+            "18": 10.82965,
+            "19": 10.82299,
+            "20": 10.73839,
+            "21": 10.70969,
+            "22": 10.5649,
+            "23": 10.73038,
+            "24": 10.6062,
+            "25": 10.55515,
+            "26": 10.62333,
+            "27": 10.61393,
+            "28": 10.57726,
+            "29": 10.60204,
+            "30": 10.38732,
+            "31": 10.12791,
+            "32": 10.4758,
+            "33": 10.47238,
+            "34": 10.22665,
+            "35": 10.28584,
+            "36": 10.23138,
+            "37": 10.35035,
+            "38": 10.19674,
+            "39": 10.40798,
+            "40": 10.09496,
+            "41": 10.13593,
+            "42": 10.21728,
+            "43": 9.84575,
+            "44": 9.94965,
+            "45": 9.83809,
+            "46": 9.821,
+            "47": 10.13316,
+            "48": 9.85047,
+            "49": 9.53,
+            "50": 9.90689,
+            "51": 9.85498,
+            "52": 9.74731,
+            "53": 10.06267,
+            "54": 9.95301,
+            "55": 9.88728,
+            "56": 9.6211,
+            "57": 9.47571,
+            "58": 9.83152,
+            "59": 9.58168,
+            "60": 9.49439,
+            "61": 9.68902,
+            "62": 9.9857,
+            "63": 9.37411,
+            "64": 9.7651,
+            "65": 8.94171,
+            "66": 9.69872,
+            "67": 9.36899,
+            "68": 9.78075,
+            "69": 9.79729,
+            "70": 9.72884,
+            "71": 9.62546,
+            "72": 9.58193,
+            "73": 9.48195,
+            "74": 8.92206,
+            "75": 9.4096,
+            "76": 9.07711,
+            "77": 10.05905,
+            "78": 9.7196,
+            "79": 9.37915,
+            "80": 9.39953,
+            "81": 9.4826,
+            "82": 9.70045,
+            "83": 9.31347,
+            "84": 9.41605,
+            "85": 9.61616,
+            "86": 9.07519,
+            "87": 9.59811,
+            "88": 9.75175,
+            "89": 9.60152,
+            "90": 9.82639,
+            "91": 9.33477,
+            "92": 9.3587,
+            "93": 9.08591,
+            "94": 8.82888,
+            "95": 9.52816,
+            "96": 9.52866,
+            "97": 9.30468,
+            "98": 9.67128,
+            "99": 8.89752,
+            "100": 9.40653
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1731.0,
+            "2": 1804.0,
+            "3": 1704.0,
+            "4": 1768.0,
+            "5": 2006.0,
+            "6": 1918.0,
+            "7": 1815.0,
+            "8": 1654.0,
+            "9": 1919.0,
+            "10": 1481.0,
+            "11": 1876.0,
+            "12": 1795.0,
+            "13": 1915.0,
+            "14": 1830.0,
+            "15": 2029.0,
+            "16": 1948.0,
+            "17": 1838.0,
+            "18": 1747.0,
+            "19": 1789.0,
+            "20": 1771.0,
+            "21": 1876.0,
+            "22": 1854.0,
+            "23": 2069.0,
+            "24": 1684.0,
+            "25": 1732.0,
+            "26": 1803.0,
+            "27": 1919.0,
+            "28": 2095.0,
+            "29": 2041.0,
+            "30": 1919.0,
+            "31": 1704.0,
+            "32": 1869.0,
+            "33": 2184.0,
+            "34": 1846.0,
+            "35": 1923.0,
+            "36": 2071.0,
+            "37": 2407.0,
+            "38": 2209.0,
+            "39": 2462.0,
+            "40": 2275.0,
+            "41": 2369.0,
+            "42": 2305.0,
+            "43": 2048.0,
+            "44": 2171.0,
+            "45": 2119.0,
+            "46": 2287.0,
+            "47": 2499.0,
+            "48": 2361.0,
+            "49": 2398.0,
+            "50": 2321.0,
+            "51": 2604.0,
+            "52": 2579.0,
+            "53": 3020.0,
+            "54": 2705.0,
+            "55": 2369.0,
+            "56": 2752.0,
+            "57": 2351.0,
+            "58": 2902.0,
+            "59": 2786.0,
+            "60": 2511.0,
+            "61": 2861.0,
+            "62": 2715.0,
+            "63": 2476.0,
+            "64": 2944.0,
+            "65": 2791.0,
+            "66": 3095.0,
+            "67": 2945.0,
+            "68": 2853.0,
+            "69": 2919.0,
+            "70": 3113.0,
+            "71": 2898.0,
+            "72": 2554.0,
+            "73": 3029.0,
+            "74": 2044.0,
+            "75": 2601.0,
+            "76": 2957.0,
+            "77": 3204.0,
+            "78": 3197.0,
+            "79": 3123.0,
+            "80": 3255.0,
+            "81": 3582.0,
+            "82": 3338.0,
+            "83": 2799.0,
+            "84": 3225.0,
+            "85": 3372.0,
+            "86": 2818.0,
+            "87": 3881.0,
+            "88": 3040.0,
+            "89": 3335.0,
+            "90": 3256.0,
+            "91": 2903.0,
+            "92": 3202.0,
+            "93": 2806.0,
+            "94": 3422.0,
+            "95": 3348.0,
+            "96": 3594.0,
+            "97": 3290.0,
+            "98": 3746.0,
+            "99": 3085.0,
+            "100": 3366.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 246999552.0,
+            "2": 246999552.0,
+            "3": 246999552.0,
+            "4": 246999552.0,
+            "5": 246999552.0,
+            "6": 246999552.0,
+            "7": 246999552.0,
+            "8": 246999552.0,
+            "9": 246999552.0,
+            "10": 246999552.0,
+            "11": 246999552.0,
+            "12": 246999552.0,
+            "13": 246999552.0,
+            "14": 246999552.0,
+            "15": 246999552.0,
+            "16": 246999552.0,
+            "17": 246999552.0,
+            "18": 246999552.0,
+            "19": 246999552.0,
+            "20": 246999552.0,
+            "21": 246999552.0,
+            "22": 246999552.0,
+            "23": 246999552.0,
+            "24": 246999552.0,
+            "25": 246999552.0,
+            "26": 246999552.0,
+            "27": 246999552.0,
+            "28": 246999552.0,
+            "29": 246999552.0,
+            "30": 246999552.0,
+            "31": 246999552.0,
+            "32": 246999552.0,
+            "33": 246999552.0,
+            "34": 246999552.0,
+            "35": 246999552.0,
+            "36": 246999552.0,
+            "37": 246999552.0,
+            "38": 246999552.0,
+            "39": 246999552.0,
+            "40": 246999552.0,
+            "41": 246999552.0,
+            "42": 246999552.0,
+            "43": 246999552.0,
+            "44": 246999552.0,
+            "45": 246999552.0,
+            "46": 246999552.0,
+            "47": 246999552.0,
+            "48": 246999552.0,
+            "49": 246999552.0,
+            "50": 246999552.0,
+            "51": 246999552.0,
+            "52": 246999552.0,
+            "53": 246999552.0,
+            "54": 246999552.0,
+            "55": 246999552.0,
+            "56": 246999552.0,
+            "57": 246999552.0,
+            "58": 246999552.0,
+            "59": 246999552.0,
+            "60": 246999552.0,
+            "61": 246999552.0,
+            "62": 246999552.0,
+            "63": 246999552.0,
+            "64": 246999552.0,
+            "65": 246999552.0,
+            "66": 246999552.0,
+            "67": 246999552.0,
+            "68": 246999552.0,
+            "69": 246999552.0,
+            "70": 246999552.0,
+            "71": 246999552.0,
+            "72": 246999552.0,
+            "73": 246999552.0,
+            "74": 246999552.0,
+            "75": 246999552.0,
+            "76": 246999552.0,
+            "77": 246999552.0,
+            "78": 246999552.0,
+            "79": 246999552.0,
+            "80": 246999552.0,
+            "81": 246999552.0,
+            "82": 246999552.0,
+            "83": 246999552.0,
+            "84": 246999552.0,
+            "85": 246999552.0,
+            "86": 246999552.0,
+            "87": 246999552.0,
+            "88": 246999552.0,
+            "89": 246999552.0,
+            "90": 246999552.0,
+            "91": 246999552.0,
+            "92": 246999552.0,
+            "93": 246999552.0,
+            "94": 246999552.0,
+            "95": 246999552.0,
+            "96": 246999552.0,
+            "97": 246999552.0,
+            "98": 246999552.0,
+            "99": 246999552.0,
+            "100": 246999552.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1528374784.0,
+            "2": 1528375808.0,
+            "3": 1528375808.0,
+            "4": 1528375808.0,
+            "5": 1528375808.0,
+            "6": 1528375808.0,
+            "7": 1528375808.0,
+            "8": 1528375808.0,
+            "9": 1528375808.0,
+            "10": 1528375808.0,
+            "11": 1528375808.0,
+            "12": 1528375808.0,
+            "13": 1528375808.0,
+            "14": 1528375808.0,
+            "15": 1528375808.0,
+            "16": 1528375808.0,
+            "17": 1528375808.0,
+            "18": 1528375808.0,
+            "19": 1528375808.0,
+            "20": 1528375808.0,
+            "21": 1528375808.0,
+            "22": 1528375808.0,
+            "23": 1528375808.0,
+            "24": 1528375808.0,
+            "25": 1528375808.0,
+            "26": 1528375808.0,
+            "27": 1528375808.0,
+            "28": 1528375808.0,
+            "29": 1528375808.0,
+            "30": 1528375808.0,
+            "31": 1528375808.0,
+            "32": 1528375808.0,
+            "33": 1528375808.0,
+            "34": 1528375808.0,
+            "35": 1528375808.0,
+            "36": 1528375808.0,
+            "37": 1528375808.0,
+            "38": 1528375808.0,
+            "39": 1528375808.0,
+            "40": 1528375808.0,
+            "41": 1528375808.0,
+            "42": 1528375808.0,
+            "43": 1528375808.0,
+            "44": 1528375808.0,
+            "45": 1528375808.0,
+            "46": 1528375808.0,
+            "47": 1528375808.0,
+            "48": 1528375808.0,
+            "49": 1528375808.0,
+            "50": 1528375808.0,
+            "51": 1528375808.0,
+            "52": 1528375808.0,
+            "53": 1528375808.0,
+            "54": 1528375808.0,
+            "55": 1528375808.0,
+            "56": 1528375808.0,
+            "57": 1528375808.0,
+            "58": 1528375808.0,
+            "59": 1528375808.0,
+            "60": 1528375808.0,
+            "61": 1528375808.0,
+            "62": 1528375808.0,
+            "63": 1528375808.0,
+            "64": 1528375808.0,
+            "65": 1528375808.0,
+            "66": 1528375808.0,
+            "67": 1528375808.0,
+            "68": 1528375808.0,
+            "69": 1528375808.0,
+            "70": 1528375808.0,
+            "71": 1528375808.0,
+            "72": 1528375808.0,
+            "73": 1528375808.0,
+            "74": 1528375808.0,
+            "75": 1528375808.0,
+            "76": 1528375808.0,
+            "77": 1528375808.0,
+            "78": 1528375808.0,
+            "79": 1528375808.0,
+            "80": 1528375808.0,
+            "81": 1528375808.0,
+            "82": 1528375808.0,
+            "83": 1528375808.0,
+            "84": 1528375808.0,
+            "85": 1528375808.0,
+            "86": 1528375808.0,
+            "87": 1528375808.0,
+            "88": 1528375808.0,
+            "89": 1528375808.0,
+            "90": 1528375808.0,
+            "91": 1528375808.0,
+            "92": 1528375808.0,
+            "93": 1528375808.0,
+            "94": 1528375808.0,
+            "95": 1528375808.0,
+            "96": 1528375808.0,
+            "97": 1528375808.0,
+            "98": 1528375808.0,
+            "99": 1528375808.0,
+            "100": 1528375808.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 1.71105,
+            "3": 0.22879,
+            "4": 0.22169,
+            "5": 0.21979,
+            "6": 0.21933,
+            "7": 0.21836,
+            "8": 0.22054,
+            "9": 0.22096,
+            "10": 0.22079,
+            "11": 0.22255,
+            "12": 0.21905,
+            "13": 0.22266,
+            "14": 0.22261,
+            "15": 0.22192,
+            "16": 0.21928,
+            "17": 0.22014,
+            "18": 0.2213,
+            "19": 0.22242,
+            "20": 0.22097,
+            "21": 0.21936,
+            "22": 0.22091,
+            "23": 0.22071,
+            "24": 0.22306,
+            "25": 0.22073,
+            "26": 0.22028,
+            "27": 0.22198,
+            "28": 0.22294,
+            "29": 0.22204,
+            "30": 0.21896,
+            "31": 0.22144,
+            "32": 0.22279,
+            "33": 0.22428,
+            "34": 0.22247,
+            "35": 0.22192,
+            "36": 0.22242,
+            "37": 0.22321,
+            "38": 0.22186,
+            "39": 0.22242,
+            "40": 0.22098,
+            "41": 0.22254,
+            "42": 0.55234,
+            "43": 0.22432,
+            "44": 0.22103,
+            "45": 0.22202,
+            "46": 0.2216,
+            "47": 0.22107,
+            "48": 0.21878,
+            "49": 0.22338,
+            "50": 0.22181,
+            "51": 0.22588,
+            "52": 0.22221,
+            "53": 0.22214,
+            "54": 0.22059,
+            "55": 0.22088,
+            "56": 0.22231,
+            "57": 0.2231,
+            "58": 0.22228,
+            "59": 0.22136,
+            "60": 0.22087,
+            "61": 0.22171,
+            "62": 0.22165,
+            "63": 0.22149,
+            "64": 0.22165,
+            "65": 0.22916,
+            "66": 0.25667,
+            "67": 0.22585,
+            "68": 0.2212,
+            "69": 0.22322,
+            "70": 0.22332,
+            "71": 0.22291,
+            "72": 0.22074,
+            "73": 0.2214,
+            "74": 0.22287,
+            "75": 0.21929,
+            "76": 0.22246,
+            "77": 0.22148,
+            "78": 0.22442,
+            "79": 0.22465,
+            "80": 0.22859,
+            "81": 0.22464,
+            "82": 0.22391,
+            "83": 0.22417,
+            "84": 0.22202,
+            "85": 0.22369,
+            "86": 0.22224,
+            "87": 0.22245,
+            "88": 0.22255,
+            "89": 0.22379,
+            "90": 0.22356,
+            "91": 0.22229,
+            "92": 0.22297,
+            "93": 0.22525,
+            "94": 0.21956,
+            "95": 0.22318,
+            "96": 0.22361,
+            "97": 0.22246,
+            "98": 0.22326,
+            "99": 0.22121,
+            "100": 0.22214
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json
index dbfceceac77..feb49a01aad 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.84466,
-            "2": 10.84794,
-            "3": 10.84925,
-            "4": 10.84332,
-            "5": 10.88244,
-            "6": 10.88079,
-            "7": 10.86575,
-            "8": 10.85546,
-            "9": 10.85543,
-            "10": 10.81818,
-            "11": 10.88769,
-            "12": 10.8634,
-            "13": 10.86681,
-            "14": 10.88414,
-            "15": 10.82464,
-            "16": 10.82854,
-            "17": 10.79491,
-            "18": 10.81492,
-            "19": 10.80133,
-            "20": 10.7181,
-            "21": 10.69905,
-            "22": 10.56744,
-            "23": 10.717,
-            "24": 10.60443,
-            "25": 10.55007,
-            "26": 10.60907,
-            "27": 10.62028,
-            "28": 10.5752,
-            "29": 10.59624,
-            "30": 10.38327,
-            "31": 10.1537,
-            "32": 10.48026,
-            "33": 10.47378,
-            "34": 10.2366,
-            "35": 10.28843,
-            "36": 10.24838,
-            "37": 10.35354,
-            "38": 10.20794,
-            "39": 10.41884,
-            "40": 10.1122,
-            "41": 10.16092,
-            "42": 10.23301,
-            "43": 9.86118,
-            "44": 9.97698,
-            "45": 9.86493,
-            "46": 9.84883,
-            "47": 10.16617,
-            "48": 9.87132,
-            "49": 9.56691,
-            "50": 9.92114,
-            "51": 9.86695,
-            "52": 9.76956,
-            "53": 10.07809,
-            "54": 9.97027,
-            "55": 9.89683,
-            "56": 9.64394,
-            "57": 9.49728,
-            "58": 9.84867,
-            "59": 9.59977,
-            "60": 9.50631,
-            "61": 9.71011,
-            "62": 9.99101,
-            "63": 9.38968,
-            "64": 9.78595,
-            "65": 8.95983,
-            "66": 9.70876,
-            "67": 9.37892,
-            "68": 9.79599,
-            "69": 9.80666,
-            "70": 9.74795,
-            "71": 9.61779,
-            "72": 9.59127,
-            "73": 9.50398,
-            "74": 8.94624,
-            "75": 9.42942,
-            "76": 9.08423,
-            "77": 10.06698,
-            "78": 9.73256,
-            "79": 9.38117,
-            "80": 9.41061,
-            "81": 9.48289,
-            "82": 9.70492,
-            "83": 9.30713,
-            "84": 9.42241,
-            "85": 9.61802,
-            "86": 9.07631,
-            "87": 9.59382,
-            "88": 9.75419,
-            "89": 9.60093,
-            "90": 9.82013,
-            "91": 9.3407,
-            "92": 9.35717,
-            "93": 9.07927,
-            "94": 8.83613,
-            "95": 9.5223,
-            "96": 9.53379,
-            "97": 9.31633,
-            "98": 9.68007,
+            "1": 10.84445,
+            "2": 10.84755,
+            "3": 10.84905,
+            "4": 10.844,
+            "5": 10.88133,
+            "6": 10.88069,
+            "7": 10.86435,
+            "8": 10.85483,
+            "9": 10.85577,
+            "10": 10.81851,
+            "11": 10.88835,
+            "12": 10.86318,
+            "13": 10.86739,
+            "14": 10.88397,
+            "15": 10.82443,
+            "16": 10.82905,
+            "17": 10.7953,
+            "18": 10.81529,
+            "19": 10.80121,
+            "20": 10.71826,
+            "21": 10.69956,
+            "22": 10.56756,
+            "23": 10.7171,
+            "24": 10.60451,
+            "25": 10.55018,
+            "26": 10.60859,
+            "27": 10.62013,
+            "28": 10.57541,
+            "29": 10.59599,
+            "30": 10.38364,
+            "31": 10.15409,
+            "32": 10.48036,
+            "33": 10.47379,
+            "34": 10.23693,
+            "35": 10.28857,
+            "36": 10.24862,
+            "37": 10.35357,
+            "38": 10.20827,
+            "39": 10.41871,
+            "40": 10.11266,
+            "41": 10.16079,
+            "42": 10.23304,
+            "43": 9.86146,
+            "44": 9.97719,
+            "45": 9.8651,
+            "46": 9.8486,
+            "47": 10.16607,
+            "48": 9.87126,
+            "49": 9.56738,
+            "50": 9.92137,
+            "51": 9.86682,
+            "52": 9.7694,
+            "53": 10.07839,
+            "54": 9.96992,
+            "55": 9.89678,
+            "56": 9.64417,
+            "57": 9.49737,
+            "58": 9.84853,
+            "59": 9.59973,
+            "60": 9.5062,
+            "61": 9.71028,
+            "62": 9.99079,
+            "63": 9.38989,
+            "64": 9.78616,
+            "65": 8.95963,
+            "66": 9.70879,
+            "67": 9.3791,
+            "68": 9.79602,
+            "69": 9.80692,
+            "70": 9.74781,
+            "71": 9.61777,
+            "72": 9.59105,
+            "73": 9.50417,
+            "74": 8.94629,
+            "75": 9.42953,
+            "76": 9.08443,
+            "77": 10.06697,
+            "78": 9.73245,
+            "79": 9.38132,
+            "80": 9.41079,
+            "81": 9.48315,
+            "82": 9.70491,
+            "83": 9.30719,
+            "84": 9.42254,
+            "85": 9.61799,
+            "86": 9.07625,
+            "87": 9.59384,
+            "88": 9.75414,
+            "89": 9.60107,
+            "90": 9.8203,
+            "91": 9.34086,
+            "92": 9.35733,
+            "93": 9.07939,
+            "94": 8.83611,
+            "95": 9.52231,
+            "96": 9.53388,
+            "97": 9.31636,
+            "98": 9.68001,
             "99": 8.89242,
-            "100": 9.39964
+            "100": 9.3998
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1770.0,
-            "2": 1809.0,
+            "1": 1814.0,
+            "2": 1725.0,
             "3": 1782.0,
-            "4": 1916.0,
-            "5": 1973.0,
-            "6": 1955.0,
-            "7": 2046.0,
-            "8": 1773.0,
-            "9": 1815.0,
-            "10": 1432.0,
-            "11": 1961.0,
-            "12": 1828.0,
-            "13": 1967.0,
-            "14": 1825.0,
-            "15": 1980.0,
-            "16": 1889.0,
-            "17": 1866.0,
-            "18": 1827.0,
-            "19": 1876.0,
-            "20": 1715.0,
-            "21": 2046.0,
-            "22": 1872.0,
-            "23": 2168.0,
+            "4": 1955.0,
+            "5": 1930.0,
+            "6": 1875.0,
+            "7": 1951.0,
+            "8": 1800.0,
+            "9": 1914.0,
+            "10": 1495.0,
+            "11": 1987.0,
+            "12": 1811.0,
+            "13": 2030.0,
+            "14": 1930.0,
+            "15": 1948.0,
+            "16": 1933.0,
+            "17": 1892.0,
+            "18": 1781.0,
+            "19": 1985.0,
+            "20": 1812.0,
+            "21": 2115.0,
+            "22": 1885.0,
+            "23": 2120.0,
             "24": 1814.0,
-            "25": 1715.0,
-            "26": 1721.0,
-            "27": 1822.0,
-            "28": 2102.0,
-            "29": 2112.0,
-            "30": 2020.0,
-            "31": 1569.0,
-            "32": 2022.0,
-            "33": 2256.0,
-            "34": 1884.0,
-            "35": 2034.0,
-            "36": 2027.0,
-            "37": 2438.0,
-            "38": 2363.0,
-            "39": 2526.0,
-            "40": 2254.0,
-            "41": 2328.0,
-            "42": 2409.0,
-            "43": 2126.0,
-            "44": 2166.0,
-            "45": 2230.0,
-            "46": 2487.0,
-            "47": 2605.0,
-            "48": 2351.0,
-            "49": 2413.0,
-            "50": 2274.0,
-            "51": 2579.0,
-            "52": 2508.0,
-            "53": 2879.0,
-            "54": 2744.0,
-            "55": 2402.0,
-            "56": 2720.0,
-            "57": 2384.0,
-            "58": 3002.0,
-            "59": 2743.0,
-            "60": 2457.0,
-            "61": 2976.0,
-            "62": 2631.0,
-            "63": 2349.0,
-            "64": 3077.0,
-            "65": 2634.0,
-            "66": 3076.0,
-            "67": 2906.0,
-            "68": 2759.0,
-            "69": 2907.0,
-            "70": 3045.0,
-            "71": 3159.0,
-            "72": 2506.0,
-            "73": 2956.0,
-            "74": 1945.0,
-            "75": 2467.0,
-            "76": 2979.0,
-            "77": 3209.0,
-            "78": 3122.0,
-            "79": 3048.0,
-            "80": 3389.0,
-            "81": 3799.0,
-            "82": 3272.0,
-            "83": 2962.0,
-            "84": 3328.0,
-            "85": 3462.0,
-            "86": 3071.0,
-            "87": 3900.0,
-            "88": 3128.0,
-            "89": 3469.0,
-            "90": 3095.0,
-            "91": 2769.0,
-            "92": 3168.0,
-            "93": 2713.0,
-            "94": 3416.0,
-            "95": 3515.0,
-            "96": 3425.0,
-            "97": 3223.0,
-            "98": 3769.0,
-            "99": 3230.0,
-            "100": 3219.0
+            "25": 1705.0,
+            "26": 1815.0,
+            "27": 1870.0,
+            "28": 2162.0,
+            "29": 2104.0,
+            "30": 2061.0,
+            "31": 1666.0,
+            "32": 2010.0,
+            "33": 2157.0,
+            "34": 1918.0,
+            "35": 2000.0,
+            "36": 1966.0,
+            "37": 2421.0,
+            "38": 2318.0,
+            "39": 2488.0,
+            "40": 2213.0,
+            "41": 2361.0,
+            "42": 2330.0,
+            "43": 2092.0,
+            "44": 2184.0,
+            "45": 2237.0,
+            "46": 2311.0,
+            "47": 2645.0,
+            "48": 2374.0,
+            "49": 2345.0,
+            "50": 2357.0,
+            "51": 2627.0,
+            "52": 2530.0,
+            "53": 2856.0,
+            "54": 2776.0,
+            "55": 2346.0,
+            "56": 2679.0,
+            "57": 2410.0,
+            "58": 2990.0,
+            "59": 2835.0,
+            "60": 2502.0,
+            "61": 2984.0,
+            "62": 2692.0,
+            "63": 2463.0,
+            "64": 3009.0,
+            "65": 2587.0,
+            "66": 3126.0,
+            "67": 2793.0,
+            "68": 2665.0,
+            "69": 2776.0,
+            "70": 3135.0,
+            "71": 3151.0,
+            "72": 2424.0,
+            "73": 2926.0,
+            "74": 1921.0,
+            "75": 2347.0,
+            "76": 3026.0,
+            "77": 3283.0,
+            "78": 3224.0,
+            "79": 3165.0,
+            "80": 3311.0,
+            "81": 3792.0,
+            "82": 3279.0,
+            "83": 2867.0,
+            "84": 3381.0,
+            "85": 3415.0,
+            "86": 2962.0,
+            "87": 3822.0,
+            "88": 3311.0,
+            "89": 3392.0,
+            "90": 3184.0,
+            "91": 2795.0,
+            "92": 3121.0,
+            "93": 2731.0,
+            "94": 3503.0,
+            "95": 3473.0,
+            "96": 3465.0,
+            "97": 3299.0,
+            "98": 3663.0,
+            "99": 3394.0,
+            "100": 3235.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 246998528.0,
-            "2": 246998528.0,
-            "3": 246998528.0,
-            "4": 246998528.0,
-            "5": 246998528.0,
-            "6": 246998528.0,
-            "7": 246998528.0,
-            "8": 246998528.0,
-            "9": 246998528.0,
-            "10": 246998528.0,
-            "11": 246998528.0,
-            "12": 246998528.0,
-            "13": 246998528.0,
-            "14": 246998528.0,
-            "15": 246998528.0,
-            "16": 246998528.0,
-            "17": 246998528.0,
-            "18": 246998528.0,
-            "19": 246998528.0,
-            "20": 246998528.0,
-            "21": 246998528.0,
-            "22": 246998528.0,
-            "23": 246998528.0,
-            "24": 246998528.0,
-            "25": 246998528.0,
-            "26": 246998528.0,
-            "27": 246998528.0,
-            "28": 246998528.0,
-            "29": 246998528.0,
-            "30": 246998528.0,
-            "31": 246998528.0,
-            "32": 246998528.0,
-            "33": 246998528.0,
-            "34": 246998528.0,
-            "35": 246998528.0,
-            "36": 246998528.0,
-            "37": 246998528.0,
-            "38": 246998528.0,
-            "39": 246998528.0,
-            "40": 246998528.0,
-            "41": 246998528.0,
-            "42": 246998528.0,
-            "43": 246998528.0,
-            "44": 246998528.0,
-            "45": 246998528.0,
-            "46": 246998528.0,
-            "47": 246998528.0,
-            "48": 246998528.0,
-            "49": 246998528.0,
-            "50": 246998528.0,
-            "51": 246998528.0,
-            "52": 246998528.0,
-            "53": 246998528.0,
-            "54": 246998528.0,
-            "55": 246998528.0,
-            "56": 246998528.0,
-            "57": 246998528.0,
-            "58": 246998528.0,
-            "59": 246998528.0,
-            "60": 246998528.0,
-            "61": 246998528.0,
-            "62": 246998528.0,
-            "63": 246998528.0,
-            "64": 246998528.0,
-            "65": 246998528.0,
-            "66": 246998528.0,
-            "67": 246998528.0,
-            "68": 246998528.0,
-            "69": 246998528.0,
-            "70": 246998528.0,
-            "71": 246998528.0,
-            "72": 246998528.0,
-            "73": 246998528.0,
-            "74": 246998528.0,
-            "75": 246998528.0,
-            "76": 246998528.0,
-            "77": 246998528.0,
-            "78": 246998528.0,
-            "79": 246998528.0,
-            "80": 246998528.0,
-            "81": 246998528.0,
-            "82": 246998528.0,
-            "83": 246998528.0,
-            "84": 246998528.0,
-            "85": 246998528.0,
-            "86": 246998528.0,
-            "87": 246998528.0,
-            "88": 246998528.0,
-            "89": 246998528.0,
-            "90": 246998528.0,
-            "91": 246998528.0,
-            "92": 246998528.0,
-            "93": 246998528.0,
-            "94": 246998528.0,
-            "95": 246998528.0,
-            "96": 246998528.0,
-            "97": 246998528.0,
-            "98": 246998528.0,
-            "99": 246998528.0,
-            "100": 246998528.0
+            "1": 700103168.0,
+            "2": 700103168.0,
+            "3": 700103168.0,
+            "4": 700103168.0,
+            "5": 700103168.0,
+            "6": 700103168.0,
+            "7": 700103168.0,
+            "8": 700103168.0,
+            "9": 700103168.0,
+            "10": 700103168.0,
+            "11": 700103168.0,
+            "12": 700103168.0,
+            "13": 700103168.0,
+            "14": 700103168.0,
+            "15": 700103168.0,
+            "16": 700103168.0,
+            "17": 700103168.0,
+            "18": 700103168.0,
+            "19": 700103168.0,
+            "20": 700103168.0,
+            "21": 700103168.0,
+            "22": 700103168.0,
+            "23": 700103168.0,
+            "24": 700103168.0,
+            "25": 700103168.0,
+            "26": 700103168.0,
+            "27": 700103168.0,
+            "28": 700103168.0,
+            "29": 700103168.0,
+            "30": 700103168.0,
+            "31": 700103168.0,
+            "32": 700103168.0,
+            "33": 700103168.0,
+            "34": 700103168.0,
+            "35": 700103168.0,
+            "36": 700103168.0,
+            "37": 700103168.0,
+            "38": 700103168.0,
+            "39": 700103168.0,
+            "40": 700103168.0,
+            "41": 700103168.0,
+            "42": 700103168.0,
+            "43": 700103168.0,
+            "44": 700103168.0,
+            "45": 700103168.0,
+            "46": 700103168.0,
+            "47": 700103168.0,
+            "48": 700103168.0,
+            "49": 700103168.0,
+            "50": 700103168.0,
+            "51": 700103168.0,
+            "52": 700103168.0,
+            "53": 700103168.0,
+            "54": 700103168.0,
+            "55": 700103168.0,
+            "56": 700103168.0,
+            "57": 700103168.0,
+            "58": 700103168.0,
+            "59": 700103168.0,
+            "60": 700103168.0,
+            "61": 700103168.0,
+            "62": 700103168.0,
+            "63": 700103168.0,
+            "64": 700103168.0,
+            "65": 700103168.0,
+            "66": 700103168.0,
+            "67": 700103168.0,
+            "68": 700103168.0,
+            "69": 700103168.0,
+            "70": 700103168.0,
+            "71": 700103168.0,
+            "72": 700103168.0,
+            "73": 700103168.0,
+            "74": 700103168.0,
+            "75": 700103168.0,
+            "76": 700103168.0,
+            "77": 700103168.0,
+            "78": 700103168.0,
+            "79": 700103168.0,
+            "80": 700103168.0,
+            "81": 700103168.0,
+            "82": 700103168.0,
+            "83": 700103168.0,
+            "84": 700103168.0,
+            "85": 700103168.0,
+            "86": 700103168.0,
+            "87": 700103168.0,
+            "88": 700103168.0,
+            "89": 700103168.0,
+            "90": 700103168.0,
+            "91": 700103168.0,
+            "92": 700103168.0,
+            "93": 700103168.0,
+            "94": 700103168.0,
+            "95": 700103168.0,
+            "96": 700103168.0,
+            "97": 700103168.0,
+            "98": 700103168.0,
+            "99": 700103168.0,
+            "100": 700103168.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1503207936.0,
-            "2": 1503208960.0,
-            "3": 1503208960.0,
-            "4": 1503208960.0,
-            "5": 1503208960.0,
-            "6": 1503208960.0,
-            "7": 1503208960.0,
-            "8": 1503208960.0,
-            "9": 1503208960.0,
-            "10": 1503208960.0,
-            "11": 1503208960.0,
-            "12": 1503208960.0,
-            "13": 1503208960.0,
-            "14": 1503208960.0,
-            "15": 1503208960.0,
-            "16": 1503208960.0,
-            "17": 1503208960.0,
-            "18": 1503208960.0,
-            "19": 1503208960.0,
-            "20": 1503208960.0,
-            "21": 1503208960.0,
-            "22": 1503208960.0,
-            "23": 1503208960.0,
-            "24": 1503208960.0,
-            "25": 1503208960.0,
-            "26": 1503208960.0,
-            "27": 1503208960.0,
-            "28": 1503208960.0,
-            "29": 1503208960.0,
-            "30": 1503208960.0,
-            "31": 1503208960.0,
-            "32": 1503208960.0,
-            "33": 1503208960.0,
-            "34": 1503208960.0,
-            "35": 1503208960.0,
-            "36": 1503208960.0,
-            "37": 1503208960.0,
-            "38": 1503208960.0,
-            "39": 1503208960.0,
-            "40": 1503208960.0,
-            "41": 1503208960.0,
-            "42": 1503208960.0,
-            "43": 1503208960.0,
-            "44": 1503208960.0,
-            "45": 1503208960.0,
-            "46": 1503208960.0,
-            "47": 1503208960.0,
-            "48": 1503208960.0,
-            "49": 1503208960.0,
-            "50": 1503208960.0,
-            "51": 1503208960.0,
-            "52": 1503208960.0,
-            "53": 1503208960.0,
-            "54": 1503208960.0,
-            "55": 1503208960.0,
-            "56": 1503208960.0,
-            "57": 1503208960.0,
-            "58": 1503208960.0,
-            "59": 1503208960.0,
-            "60": 1503208960.0,
-            "61": 1503208960.0,
-            "62": 1503208960.0,
-            "63": 1503208960.0,
-            "64": 1503208960.0,
-            "65": 1503208960.0,
-            "66": 1503208960.0,
-            "67": 1503208960.0,
-            "68": 1503208960.0,
-            "69": 1503208960.0,
-            "70": 1503208960.0,
-            "71": 1503208960.0,
-            "72": 1503208960.0,
-            "73": 1503208960.0,
-            "74": 1503208960.0,
-            "75": 1503208960.0,
-            "76": 1503208960.0,
-            "77": 1503208960.0,
-            "78": 1503208960.0,
-            "79": 1503208960.0,
-            "80": 1503208960.0,
-            "81": 1503208960.0,
-            "82": 1503208960.0,
-            "83": 1503208960.0,
-            "84": 1503208960.0,
-            "85": 1503208960.0,
-            "86": 1503208960.0,
-            "87": 1503208960.0,
-            "88": 1503208960.0,
-            "89": 1503208960.0,
-            "90": 1503208960.0,
-            "91": 1503208960.0,
-            "92": 1503208960.0,
-            "93": 1503208960.0,
-            "94": 1503208960.0,
-            "95": 1503208960.0,
-            "96": 1503208960.0,
-            "97": 1503208960.0,
-            "98": 1503208960.0,
-            "99": 1503208960.0,
-            "100": 1503208960.0
+            "1": 1956312576.0,
+            "2": 1956313600.0,
+            "3": 1956313600.0,
+            "4": 1956313600.0,
+            "5": 1956313600.0,
+            "6": 1956313600.0,
+            "7": 1956313600.0,
+            "8": 1956313600.0,
+            "9": 1956313600.0,
+            "10": 1956313600.0,
+            "11": 1956313600.0,
+            "12": 1956313600.0,
+            "13": 1956313600.0,
+            "14": 1956313600.0,
+            "15": 1956313600.0,
+            "16": 1956313600.0,
+            "17": 1956313600.0,
+            "18": 1956313600.0,
+            "19": 1956313600.0,
+            "20": 1956313600.0,
+            "21": 1956313600.0,
+            "22": 1956313600.0,
+            "23": 1956313600.0,
+            "24": 1956313600.0,
+            "25": 1956313600.0,
+            "26": 1956313600.0,
+            "27": 1956313600.0,
+            "28": 1956313600.0,
+            "29": 1956313600.0,
+            "30": 1956313600.0,
+            "31": 1956313600.0,
+            "32": 1956313600.0,
+            "33": 1956313600.0,
+            "34": 1956313600.0,
+            "35": 1956313600.0,
+            "36": 1956313600.0,
+            "37": 1956313600.0,
+            "38": 1956313600.0,
+            "39": 1956313600.0,
+            "40": 1956313600.0,
+            "41": 1956313600.0,
+            "42": 1956313600.0,
+            "43": 1956313600.0,
+            "44": 1956313600.0,
+            "45": 1956313600.0,
+            "46": 1956313600.0,
+            "47": 1956313600.0,
+            "48": 1956313600.0,
+            "49": 1956313600.0,
+            "50": 1956313600.0,
+            "51": 1956313600.0,
+            "52": 1956313600.0,
+            "53": 1956313600.0,
+            "54": 1956313600.0,
+            "55": 1956313600.0,
+            "56": 1956313600.0,
+            "57": 1956313600.0,
+            "58": 1956313600.0,
+            "59": 1956313600.0,
+            "60": 1956313600.0,
+            "61": 1956313600.0,
+            "62": 1956313600.0,
+            "63": 1956313600.0,
+            "64": 1956313600.0,
+            "65": 1956313600.0,
+            "66": 1956313600.0,
+            "67": 1956313600.0,
+            "68": 1956313600.0,
+            "69": 1956313600.0,
+            "70": 1956313600.0,
+            "71": 1956313600.0,
+            "72": 1956313600.0,
+            "73": 1956313600.0,
+            "74": 1956313600.0,
+            "75": 1956313600.0,
+            "76": 1956313600.0,
+            "77": 1956313600.0,
+            "78": 1956313600.0,
+            "79": 1956313600.0,
+            "80": 1956313600.0,
+            "81": 1956313600.0,
+            "82": 1956313600.0,
+            "83": 1956313600.0,
+            "84": 1956313600.0,
+            "85": 1956313600.0,
+            "86": 1956313600.0,
+            "87": 1956313600.0,
+            "88": 1956313600.0,
+            "89": 1956313600.0,
+            "90": 1956313600.0,
+            "91": 1956313600.0,
+            "92": 1956313600.0,
+            "93": 1956313600.0,
+            "94": 1956313600.0,
+            "95": 1956313600.0,
+            "96": 1956313600.0,
+            "97": 1956313600.0,
+            "98": 1956313600.0,
+            "99": 1956313600.0,
+            "100": 1956313600.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.90789,
-            "2": 0.23993,
-            "3": 0.20829,
-            "4": 0.18489,
-            "5": 0.18237,
-            "6": 0.17507,
-            "7": 0.17401,
-            "8": 0.17758,
-            "9": 0.17734,
-            "10": 0.17577,
-            "11": 0.17329,
-            "12": 0.17635,
-            "13": 0.17559,
-            "14": 0.17588,
-            "15": 0.17556,
-            "16": 0.17798,
-            "17": 0.17347,
-            "18": 0.17346,
-            "19": 0.17675,
-            "20": 0.17518,
-            "21": 0.17864,
-            "22": 0.17833,
-            "23": 0.1827,
-            "24": 0.1775,
-            "25": 0.17745,
-            "26": 0.1755,
-            "27": 0.17594,
-            "28": 0.18475,
-            "29": 0.17599,
-            "30": 0.17452,
-            "31": 0.17601,
-            "32": 0.17743,
-            "33": 0.17355,
-            "34": 0.18205,
-            "35": 0.17672,
-            "36": 0.17728,
-            "37": 0.17438,
-            "38": 0.17752,
-            "39": 0.18463,
-            "40": 0.17673,
-            "41": 0.17505,
-            "42": 0.17657,
-            "43": 0.1769,
-            "44": 0.19406,
-            "45": 0.20743,
-            "46": 0.18263,
-            "47": 0.16986,
-            "48": 0.17268,
-            "49": 0.17404,
-            "50": 0.17381,
-            "51": 0.1735,
-            "52": 0.1693,
-            "53": 0.17058,
-            "54": 0.17247,
-            "55": 0.1773,
-            "56": 0.17259,
-            "57": 0.17109,
-            "58": 0.17178,
-            "59": 0.17167,
-            "60": 0.17568,
-            "61": 0.17729,
-            "62": 0.16999,
-            "63": 0.17091,
-            "64": 0.17034,
-            "65": 0.17236,
-            "66": 0.17625,
-            "67": 0.17591,
-            "68": 0.17126,
-            "69": 0.17159,
-            "70": 0.17123,
-            "71": 0.17221,
-            "72": 0.17877,
-            "73": 0.17426,
-            "74": 0.17035,
-            "75": 0.1721,
-            "76": 0.17327,
-            "77": 0.17396,
-            "78": 0.17631,
-            "79": 0.17485,
-            "80": 0.17347,
-            "81": 0.17358,
-            "82": 0.17087,
-            "83": 0.17164,
-            "84": 0.17784,
-            "85": 0.17401,
-            "86": 0.18008,
-            "87": 0.17399,
-            "88": 0.17322,
-            "89": 0.17239,
-            "90": 0.17856,
-            "91": 0.17078,
-            "92": 0.18016,
-            "93": 0.18343,
-            "94": 0.18085,
-            "95": 0.175,
-            "96": 0.17786,
-            "97": 0.17064,
-            "98": 0.17229,
-            "99": 0.17164,
-            "100": 0.20496
+            "1": 4.9999,
+            "2": 0.17604,
+            "3": 0.16654,
+            "4": 0.15324,
+            "5": 0.14982,
+            "6": 0.15181,
+            "7": 0.15028,
+            "8": 0.15021,
+            "9": 0.14947,
+            "10": 0.15037,
+            "11": 0.15211,
+            "12": 0.15245,
+            "13": 0.1517,
+            "14": 0.15044,
+            "15": 0.15166,
+            "16": 0.14955,
+            "17": 0.15212,
+            "18": 0.15368,
+            "19": 0.15062,
+            "20": 0.15093,
+            "21": 0.1573,
+            "22": 0.15817,
+            "23": 0.14955,
+            "24": 0.14912,
+            "25": 0.15491,
+            "26": 0.14937,
+            "27": 0.15155,
+            "28": 0.15055,
+            "29": 0.14603,
+            "30": 0.14602,
+            "31": 0.14824,
+            "32": 0.14477,
+            "33": 0.14671,
+            "34": 0.14693,
+            "35": 0.14738,
+            "36": 0.14504,
+            "37": 0.14513,
+            "38": 0.14512,
+            "39": 0.14473,
+            "40": 0.14614,
+            "41": 0.14578,
+            "42": 0.14684,
+            "43": 0.14487,
+            "44": 0.14547,
+            "45": 0.145,
+            "46": 0.14486,
+            "47": 0.14751,
+            "48": 0.14552,
+            "49": 0.14493,
+            "50": 0.14395,
+            "51": 0.1521,
+            "52": 0.14666,
+            "53": 0.14801,
+            "54": 0.14826,
+            "55": 0.14557,
+            "56": 0.15142,
+            "57": 0.14933,
+            "58": 0.14555,
+            "59": 0.14614,
+            "60": 0.15938,
+            "61": 0.16219,
+            "62": 0.14894,
+            "63": 0.14392,
+            "64": 0.14433,
+            "65": 0.1452,
+            "66": 0.14488,
+            "67": 0.14508,
+            "68": 0.14493,
+            "69": 0.14702,
+            "70": 0.14432,
+            "71": 0.14412,
+            "72": 0.14561,
+            "73": 0.15534,
+            "74": 0.14715,
+            "75": 0.14564,
+            "76": 0.146,
+            "77": 0.14498,
+            "78": 0.14433,
+            "79": 0.14454,
+            "80": 0.1457,
+            "81": 0.14534,
+            "82": 0.14499,
+            "83": 0.14463,
+            "84": 0.1456,
+            "85": 0.14456,
+            "86": 0.1456,
+            "87": 0.14661,
+            "88": 0.1469,
+            "89": 0.14537,
+            "90": 0.14515,
+            "91": 0.14627,
+            "92": 0.14607,
+            "93": 0.14633,
+            "94": 0.14863,
+            "95": 0.14553,
+            "96": 0.14487,
+            "97": 0.14462,
+            "98": 0.14685,
+            "99": 0.14551,
+            "100": 0.14614
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..85eca8a168b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.84059,
+            "2": 10.85204,
+            "3": 10.84133,
+            "4": 10.84535,
+            "5": 10.85551,
+            "6": 10.86422,
+            "7": 10.85246,
+            "8": 10.84439,
+            "9": 10.84792,
+            "10": 10.81313,
+            "11": 10.8561,
+            "12": 10.84243,
+            "13": 10.86076,
+            "14": 10.8495,
+            "15": 10.81525,
+            "16": 10.80923,
+            "17": 10.78383,
+            "18": 10.79178,
+            "19": 10.79409,
+            "20": 10.70535,
+            "21": 10.69778,
+            "22": 10.58348,
+            "23": 10.69235,
+            "24": 10.60608,
+            "25": 10.56718,
+            "26": 10.61425,
+            "27": 10.60614,
+            "28": 10.55901,
+            "29": 10.56486,
+            "30": 10.37865,
+            "31": 10.16183,
+            "32": 10.45519,
+            "33": 10.45018,
+            "34": 10.23984,
+            "35": 10.27323,
+            "36": 10.24226,
+            "37": 10.34516,
+            "38": 10.21732,
+            "39": 10.39456,
+            "40": 10.09506,
+            "41": 10.15057,
+            "42": 10.21211,
+            "43": 9.87993,
+            "44": 9.97831,
+            "45": 9.85574,
+            "46": 9.83355,
+            "47": 10.14081,
+            "48": 9.86387,
+            "49": 9.55497,
+            "50": 9.91604
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1653.0,
+            "2": 1752.0,
+            "3": 1624.0,
+            "4": 1773.0,
+            "5": 2051.0,
+            "6": 1884.0,
+            "7": 1841.0,
+            "8": 1684.0,
+            "9": 1859.0,
+            "10": 1545.0,
+            "11": 1863.0,
+            "12": 1746.0,
+            "13": 2004.0,
+            "14": 1896.0,
+            "15": 1934.0,
+            "16": 2001.0,
+            "17": 1933.0,
+            "18": 1793.0,
+            "19": 1900.0,
+            "20": 1792.0,
+            "21": 2062.0,
+            "22": 1795.0,
+            "23": 1997.0,
+            "24": 1666.0,
+            "25": 1607.0,
+            "26": 1745.0,
+            "27": 1880.0,
+            "28": 1887.0,
+            "29": 2023.0,
+            "30": 1964.0,
+            "31": 1609.0,
+            "32": 1793.0,
+            "33": 2102.0,
+            "34": 1891.0,
+            "35": 1869.0,
+            "36": 1984.0,
+            "37": 2446.0,
+            "38": 2088.0,
+            "39": 2394.0,
+            "40": 2182.0,
+            "41": 2110.0,
+            "42": 2180.0,
+            "43": 1931.0,
+            "44": 2082.0,
+            "45": 2079.0,
+            "46": 2189.0,
+            "47": 2510.0,
+            "48": 2197.0,
+            "49": 2282.0,
+            "50": 2160.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1480436736.0,
+            "2": 1542892032.0,
+            "3": 1542892032.0,
+            "4": 1542892032.0,
+            "5": 1542892032.0,
+            "6": 1542892032.0,
+            "7": 1542892032.0,
+            "8": 1542892032.0,
+            "9": 1542892032.0,
+            "10": 1542892032.0,
+            "11": 1542892032.0,
+            "12": 1542892032.0,
+            "13": 1542892032.0,
+            "14": 1542892032.0,
+            "15": 1542892032.0,
+            "16": 1542892032.0,
+            "17": 1542892032.0,
+            "18": 1542892032.0,
+            "19": 1542892032.0,
+            "20": 1542892032.0,
+            "21": 1542892032.0,
+            "22": 1542892032.0,
+            "23": 1542892032.0,
+            "24": 1542892032.0,
+            "25": 1542892032.0,
+            "26": 1542892032.0,
+            "27": 1542892032.0,
+            "28": 1542892032.0,
+            "29": 1542892032.0,
+            "30": 1542892032.0,
+            "31": 1542892032.0,
+            "32": 1542892032.0,
+            "33": 1542892032.0,
+            "34": 1542892032.0,
+            "35": 1542892032.0,
+            "36": 1542892032.0,
+            "37": 1542892032.0,
+            "38": 1542892032.0,
+            "39": 1542892032.0,
+            "40": 1542892032.0,
+            "41": 1542892032.0,
+            "42": 1542892032.0,
+            "43": 1542892032.0,
+            "44": 1542892032.0,
+            "45": 1542892032.0,
+            "46": 1542892032.0,
+            "47": 1542892032.0,
+            "48": 1542892032.0,
+            "49": 1542892032.0,
+            "50": 1542892032.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.48029,
+            "3": 0.33127,
+            "4": 0.31594,
+            "5": 0.32143,
+            "6": 0.31919,
+            "7": 0.31884,
+            "8": 0.32129,
+            "9": 0.31988,
+            "10": 0.32069,
+            "11": 0.31907,
+            "12": 0.31959,
+            "13": 0.32211,
+            "14": 0.31964,
+            "15": 0.31855,
+            "16": 0.32013,
+            "17": 0.32004,
+            "18": 0.31786,
+            "19": 0.31755,
+            "20": 0.31944,
+            "21": 0.31998,
+            "22": 0.32066,
+            "23": 0.32079,
+            "24": 0.31728,
+            "25": 0.31689,
+            "26": 0.31831,
+            "27": 0.31727,
+            "28": 0.31999,
+            "29": 0.31997,
+            "30": 0.31824,
+            "31": 0.31724,
+            "32": 0.33433,
+            "33": 0.43748,
+            "34": 0.63551,
+            "35": 0.35878,
+            "36": 0.31703,
+            "37": 0.31709,
+            "38": 0.32151,
+            "39": 0.31762,
+            "40": 0.3204,
+            "41": 0.3741,
+            "42": 0.37991,
+            "43": 0.3738,
+            "44": 0.38277,
+            "45": 0.38,
+            "46": 0.37409,
+            "47": 0.36543,
+            "48": 0.37113,
+            "49": 0.36281,
+            "50": 0.36274
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
index 2bfd32d0721..dcd92db1774 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 299203072.0,
-            "2": 299203072.0,
-            "3": 299203072.0,
-            "4": 299203072.0,
-            "5": 299203072.0,
-            "6": 299203072.0,
-            "7": 299203072.0,
-            "8": 299203072.0,
-            "9": 299203072.0,
-            "10": 299203072.0,
-            "11": 299203072.0,
-            "12": 299203072.0,
-            "13": 299203072.0,
-            "14": 299203072.0,
-            "15": 299203072.0,
-            "16": 299203072.0,
-            "17": 299203072.0,
-            "18": 299203072.0,
-            "19": 299203072.0,
-            "20": 299203072.0,
-            "21": 299203072.0,
-            "22": 299203072.0,
-            "23": 299203072.0,
-            "24": 299203072.0,
-            "25": 299203072.0,
-            "26": 299203072.0,
-            "27": 299203072.0,
-            "28": 299203072.0,
-            "29": 299203072.0,
-            "30": 299203072.0,
-            "31": 299203072.0,
-            "32": 299203072.0,
-            "33": 299203072.0,
-            "34": 299203072.0,
-            "35": 299203072.0,
-            "36": 299203072.0,
-            "37": 299203072.0,
-            "38": 299203072.0,
-            "39": 299203072.0,
-            "40": 299203072.0,
-            "41": 299203072.0,
-            "42": 299203072.0,
-            "43": 299203072.0,
-            "44": 299203072.0,
-            "45": 299203072.0,
-            "46": 299203072.0,
-            "47": 299203072.0,
-            "48": 299203072.0,
-            "49": 299203072.0,
-            "50": 299203072.0
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1477945856.0,
-            "2": 1542891008.0,
-            "3": 1542891008.0,
-            "4": 1542891008.0,
-            "5": 1542891008.0,
-            "6": 1542891008.0,
-            "7": 1542891008.0,
-            "8": 1542891008.0,
-            "9": 1542891008.0,
-            "10": 1542891008.0,
-            "11": 1542891008.0,
-            "12": 1542891008.0,
-            "13": 1542891008.0,
-            "14": 1542891008.0,
-            "15": 1542891008.0,
-            "16": 1542891008.0,
-            "17": 1542891008.0,
-            "18": 1542891008.0,
-            "19": 1542891008.0,
-            "20": 1542891008.0,
-            "21": 1542891008.0,
-            "22": 1542891008.0,
-            "23": 1542891008.0,
-            "24": 1542891008.0,
-            "25": 1542891008.0,
-            "26": 1542891008.0,
-            "27": 1542891008.0,
-            "28": 1542891008.0,
-            "29": 1542891008.0,
-            "30": 1542891008.0,
-            "31": 1542891008.0,
-            "32": 1542891008.0,
-            "33": 1542891008.0,
-            "34": 1542891008.0,
-            "35": 1542891008.0,
-            "36": 1542891008.0,
-            "37": 1542891008.0,
-            "38": 1542891008.0,
-            "39": 1542891008.0,
-            "40": 1542891008.0,
-            "41": 1542891008.0,
-            "42": 1542891008.0,
-            "43": 1542891008.0,
-            "44": 1542891008.0,
-            "45": 1542891008.0,
-            "46": 1542891008.0,
-            "47": 1542891008.0,
-            "48": 1542891008.0,
-            "49": 1542891008.0,
-            "50": 1542891008.0
+            "1": 1478995456.0,
+            "2": 1545382400.0,
+            "3": 1545382400.0,
+            "4": 1545382400.0,
+            "5": 1545382400.0,
+            "6": 1545382400.0,
+            "7": 1545382400.0,
+            "8": 1545382400.0,
+            "9": 1545382400.0,
+            "10": 1545382400.0,
+            "11": 1545382400.0,
+            "12": 1545382400.0,
+            "13": 1545382400.0,
+            "14": 1545382400.0,
+            "15": 1545382400.0,
+            "16": 1545382400.0,
+            "17": 1545382400.0,
+            "18": 1545382400.0,
+            "19": 1545382400.0,
+            "20": 1545382400.0,
+            "21": 1545382400.0,
+            "22": 1545382400.0,
+            "23": 1545382400.0,
+            "24": 1545382400.0,
+            "25": 1545382400.0,
+            "26": 1545382400.0,
+            "27": 1545382400.0,
+            "28": 1545382400.0,
+            "29": 1545382400.0,
+            "30": 1545382400.0,
+            "31": 1545382400.0,
+            "32": 1545382400.0,
+            "33": 1545382400.0,
+            "34": 1545382400.0,
+            "35": 1545382400.0,
+            "36": 1545382400.0,
+            "37": 1545382400.0,
+            "38": 1545382400.0,
+            "39": 1545382400.0,
+            "40": 1545382400.0,
+            "41": 1545382400.0,
+            "42": 1545382400.0,
+            "43": 1545382400.0,
+            "44": 1545382400.0,
+            "45": 1545382400.0,
+            "46": 1545382400.0,
+            "47": 1545382400.0,
+            "48": 1545382400.0,
+            "49": 1545382400.0,
+            "50": 1545382400.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 8.86827,
-            "2": 0.25581,
-            "3": 0.24685,
-            "4": 0.24528,
-            "5": 0.24786,
-            "6": 0.25055,
-            "7": 0.2473,
-            "8": 0.24843,
-            "9": 0.24646,
-            "10": 0.24448,
-            "11": 0.24595,
-            "12": 0.24375,
-            "13": 0.24607,
-            "14": 0.2438,
-            "15": 0.24496,
-            "16": 0.24469,
-            "17": 0.24672,
-            "18": 0.2472,
-            "19": 0.24412,
-            "20": 0.24734,
-            "21": 0.24525,
-            "22": 0.24726,
-            "23": 0.24425,
-            "24": 0.2467,
-            "25": 0.24589,
-            "26": 0.24521,
-            "27": 0.24972,
-            "28": 0.24969,
-            "29": 0.24951,
-            "30": 0.24819,
-            "31": 0.25039,
-            "32": 0.24983,
-            "33": 0.25363,
-            "34": 0.25237,
-            "35": 0.24992,
-            "36": 0.24811,
-            "37": 0.25001,
-            "38": 0.24929,
-            "39": 0.24928,
-            "40": 0.24894,
-            "41": 0.24934,
-            "42": 0.24889,
-            "43": 0.24734,
-            "44": 0.24821,
-            "45": 0.2492,
-            "46": 0.24867,
-            "47": 0.25083,
-            "48": 0.24933,
-            "49": 0.24988,
-            "50": 0.25012
+            "1": 9.29646,
+            "2": 0.25495,
+            "3": 0.23221,
+            "4": 0.21344,
+            "5": 0.21407,
+            "6": 0.2135,
+            "7": 0.2133,
+            "8": 0.2143,
+            "9": 0.2448,
+            "10": 0.21516,
+            "11": 0.21366,
+            "12": 0.21308,
+            "13": 0.21405,
+            "14": 0.21663,
+            "15": 0.21321,
+            "16": 0.21331,
+            "17": 0.21649,
+            "18": 0.21423,
+            "19": 0.21617,
+            "20": 0.21504,
+            "21": 0.21521,
+            "22": 0.21474,
+            "23": 0.21516,
+            "24": 0.21334,
+            "25": 0.21673,
+            "26": 0.2145,
+            "27": 0.21534,
+            "28": 0.21454,
+            "29": 0.21458,
+            "30": 0.21608,
+            "31": 0.2147,
+            "32": 0.21508,
+            "33": 0.21429,
+            "34": 0.21502,
+            "35": 0.21469,
+            "36": 0.21553,
+            "37": 0.21385,
+            "38": 0.21644,
+            "39": 0.2164,
+            "40": 0.21622,
+            "41": 0.21355,
+            "42": 0.21641,
+            "43": 0.21488,
+            "44": 0.21246,
+            "45": 0.58026,
+            "46": 0.2168,
+            "47": 0.21774,
+            "48": 0.21503,
+            "49": 0.21695,
+            "50": 0.21799
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
index 5dd18b2b701..f6ec6ecdaca 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1449634304.0,
-            "2": 1516021248.0,
-            "3": 1516021248.0,
-            "4": 1516021248.0,
-            "5": 1516021248.0,
-            "6": 1516021248.0,
-            "7": 1516021248.0,
-            "8": 1516021248.0,
-            "9": 1516021248.0,
-            "10": 1516021248.0,
-            "11": 1516021248.0,
-            "12": 1516021248.0,
-            "13": 1516021248.0,
-            "14": 1516021248.0,
-            "15": 1516021248.0,
-            "16": 1516021248.0,
-            "17": 1516021248.0,
-            "18": 1516021248.0,
-            "19": 1516021248.0,
-            "20": 1516021248.0,
-            "21": 1516021248.0,
-            "22": 1516021248.0,
-            "23": 1516021248.0,
-            "24": 1516021248.0,
-            "25": 1516021248.0,
-            "26": 1516021248.0,
-            "27": 1516021248.0,
-            "28": 1516021248.0,
-            "29": 1516021248.0,
-            "30": 1516021248.0,
-            "31": 1516021248.0,
-            "32": 1516021248.0,
-            "33": 1516021248.0,
-            "34": 1516021248.0,
-            "35": 1516021248.0,
-            "36": 1516021248.0,
-            "37": 1516021248.0,
-            "38": 1516021248.0,
-            "39": 1516021248.0,
-            "40": 1516021248.0,
-            "41": 1516021248.0,
-            "42": 1516021248.0,
-            "43": 1516021248.0,
-            "44": 1516021248.0,
-            "45": 1516021248.0,
-            "46": 1516021248.0,
-            "47": 1516021248.0,
-            "48": 1516021248.0,
-            "49": 1516021248.0,
-            "50": 1516021248.0
+            "1": 1448585728.0,
+            "2": 1513530880.0,
+            "3": 1513530880.0,
+            "4": 1513530880.0,
+            "5": 1513530880.0,
+            "6": 1513530880.0,
+            "7": 1513530880.0,
+            "8": 1513530880.0,
+            "9": 1513530880.0,
+            "10": 1513530880.0,
+            "11": 1513530880.0,
+            "12": 1513530880.0,
+            "13": 1513530880.0,
+            "14": 1513530880.0,
+            "15": 1513530880.0,
+            "16": 1513530880.0,
+            "17": 1513530880.0,
+            "18": 1513530880.0,
+            "19": 1513530880.0,
+            "20": 1513530880.0,
+            "21": 1513530880.0,
+            "22": 1513530880.0,
+            "23": 1513530880.0,
+            "24": 1513530880.0,
+            "25": 1513530880.0,
+            "26": 1513530880.0,
+            "27": 1513530880.0,
+            "28": 1513530880.0,
+            "29": 1513530880.0,
+            "30": 1513530880.0,
+            "31": 1513530880.0,
+            "32": 1513530880.0,
+            "33": 1513530880.0,
+            "34": 1513530880.0,
+            "35": 1513530880.0,
+            "36": 1513530880.0,
+            "37": 1513530880.0,
+            "38": 1513530880.0,
+            "39": 1513530880.0,
+            "40": 1513530880.0,
+            "41": 1513530880.0,
+            "42": 1513530880.0,
+            "43": 1513530880.0,
+            "44": 1513530880.0,
+            "45": 1513530880.0,
+            "46": 1513530880.0,
+            "47": 1513530880.0,
+            "48": 1513530880.0,
+            "49": 1513530880.0,
+            "50": 1513530880.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4.8193,
-            "2": 0.36983,
-            "3": 0.32405,
-            "4": 0.32179,
-            "5": 0.32037,
-            "6": 0.32162,
-            "7": 0.32479,
-            "8": 0.32031,
-            "9": 0.32398,
-            "10": 0.32296,
-            "11": 0.32125,
-            "12": 0.32185,
-            "13": 0.323,
-            "14": 0.32307,
-            "15": 0.32035,
-            "16": 0.31953,
-            "17": 0.32119,
-            "18": 0.32536,
-            "19": 0.32368,
-            "20": 0.32071,
-            "21": 0.32043,
-            "22": 0.32093,
-            "23": 0.32096,
-            "24": 0.31999,
-            "25": 0.32046,
-            "26": 0.31988,
-            "27": 0.32184,
-            "28": 0.32107,
-            "29": 0.32078,
-            "30": 0.32174,
-            "31": 0.32345,
-            "32": 0.32975,
-            "33": 0.32181,
-            "34": 0.32294,
-            "35": 0.32426,
-            "36": 0.32184,
-            "37": 0.32175,
-            "38": 0.32222,
-            "39": 0.32058,
-            "40": 0.32111,
-            "41": 0.33546,
-            "42": 0.32505,
-            "43": 0.32502,
-            "44": 0.32486,
-            "45": 0.32683,
-            "46": 0.32331,
-            "47": 0.322,
-            "48": 0.32205,
-            "49": 0.32128,
-            "50": 0.32053
+            "1": 3.59395,
+            "2": 0.38136,
+            "3": 0.33497,
+            "4": 0.31659,
+            "5": 0.321,
+            "6": 0.3174,
+            "7": 0.31686,
+            "8": 0.31682,
+            "9": 0.32441,
+            "10": 0.31766,
+            "11": 0.31647,
+            "12": 0.31676,
+            "13": 0.31706,
+            "14": 0.31701,
+            "15": 0.31716,
+            "16": 0.31906,
+            "17": 0.31727,
+            "18": 0.31834,
+            "19": 0.31964,
+            "20": 0.31956,
+            "21": 0.3203,
+            "22": 0.32057,
+            "23": 0.32049,
+            "24": 0.31892,
+            "25": 0.32081,
+            "26": 0.31964,
+            "27": 0.31915,
+            "28": 0.31828,
+            "29": 0.31932,
+            "30": 0.31791,
+            "31": 0.31931,
+            "32": 0.31993,
+            "33": 0.31989,
+            "34": 0.32088,
+            "35": 0.31904,
+            "36": 0.65249,
+            "37": 0.3209,
+            "38": 0.31853,
+            "39": 0.32906,
+            "40": 0.3183,
+            "41": 0.32008,
+            "42": 0.31904,
+            "43": 0.31861,
+            "44": 0.3189,
+            "45": 0.31881,
+            "46": 0.31915,
+            "47": 0.31943,
+            "48": 0.31889,
+            "49": 0.3186,
+            "50": 0.31887
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml
index 9d8de380f83..944dfb0b489 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..c4a7b5d2ff0
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83993,
+            "2": 10.85182,
+            "3": 10.84166,
+            "4": 10.84441,
+            "5": 10.85514,
+            "6": 10.86428,
+            "7": 10.85243,
+            "8": 10.84464,
+            "9": 10.84864,
+            "10": 10.81333,
+            "11": 10.85638,
+            "12": 10.84233,
+            "13": 10.86046,
+            "14": 10.84976,
+            "15": 10.81618,
+            "16": 10.80886,
+            "17": 10.78242,
+            "18": 10.79155,
+            "19": 10.79495,
+            "20": 10.7055,
+            "21": 10.6978,
+            "22": 10.58349,
+            "23": 10.69268,
+            "24": 10.60558,
+            "25": 10.56742,
+            "26": 10.61456,
+            "27": 10.6067,
+            "28": 10.55905,
+            "29": 10.56526,
+            "30": 10.37918,
+            "31": 10.16276,
+            "32": 10.45543,
+            "33": 10.45037,
+            "34": 10.23993,
+            "35": 10.27354,
+            "36": 10.24224,
+            "37": 10.34559,
+            "38": 10.21738,
+            "39": 10.39453,
+            "40": 10.095,
+            "41": 10.15093,
+            "42": 10.21235,
+            "43": 9.87982,
+            "44": 9.97875,
+            "45": 9.85588,
+            "46": 9.83349,
+            "47": 10.14101,
+            "48": 9.86418,
+            "49": 9.55509,
+            "50": 9.91636,
+            "51": 9.86104,
+            "52": 9.75109,
+            "53": 10.06631,
+            "54": 9.95634,
+            "55": 9.89354,
+            "56": 9.637,
+            "57": 9.49142,
+            "58": 9.8341,
+            "59": 9.5931,
+            "60": 9.51379,
+            "61": 9.69183,
+            "62": 9.99162,
+            "63": 9.39196,
+            "64": 9.77455,
+            "65": 8.96319,
+            "66": 9.70663,
+            "67": 9.3789,
+            "68": 9.78328,
+            "69": 9.79736,
+            "70": 9.73753,
+            "71": 9.62711,
+            "72": 9.58907,
+            "73": 9.50446,
+            "74": 8.94975,
+            "75": 9.4278,
+            "76": 9.08764,
+            "77": 10.06759,
+            "78": 9.72141,
+            "79": 9.3861,
+            "80": 9.40495,
+            "81": 9.48596,
+            "82": 9.70195,
+            "83": 9.31553,
+            "84": 9.41806,
+            "85": 9.61378,
+            "86": 9.08145,
+            "87": 9.59631,
+            "88": 9.75008,
+            "89": 9.60386,
+            "90": 9.82838,
+            "91": 9.33622,
+            "92": 9.35764,
+            "93": 9.08795,
+            "94": 8.83437,
+            "95": 9.53352,
+            "96": 9.53315,
+            "97": 9.31129,
+            "98": 9.67176,
+            "99": 8.89816,
+            "100": 9.40969
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1756.0,
+            "3": 1698.0,
+            "4": 1764.0,
+            "5": 2045.0,
+            "6": 1927.0,
+            "7": 1901.0,
+            "8": 1768.0,
+            "9": 1823.0,
+            "10": 1456.0,
+            "11": 1884.0,
+            "12": 1834.0,
+            "13": 2003.0,
+            "14": 1786.0,
+            "15": 1879.0,
+            "16": 1948.0,
+            "17": 1849.0,
+            "18": 1718.0,
+            "19": 1870.0,
+            "20": 1750.0,
+            "21": 1977.0,
+            "22": 1741.0,
+            "23": 1946.0,
+            "24": 1642.0,
+            "25": 1636.0,
+            "26": 1817.0,
+            "27": 1926.0,
+            "28": 1981.0,
+            "29": 1993.0,
+            "30": 1929.0,
+            "31": 1630.0,
+            "32": 1896.0,
+            "33": 2115.0,
+            "34": 1824.0,
+            "35": 1960.0,
+            "36": 1935.0,
+            "37": 2410.0,
+            "38": 2259.0,
+            "39": 2428.0,
+            "40": 2119.0,
+            "41": 2278.0,
+            "42": 2118.0,
+            "43": 1992.0,
+            "44": 2041.0,
+            "45": 1992.0,
+            "46": 2158.0,
+            "47": 2416.0,
+            "48": 2338.0,
+            "49": 2315.0,
+            "50": 2242.0,
+            "51": 2431.0,
+            "52": 2467.0,
+            "53": 2794.0,
+            "54": 2675.0,
+            "55": 2313.0,
+            "56": 2597.0,
+            "57": 2278.0,
+            "58": 2887.0,
+            "59": 2701.0,
+            "60": 2190.0,
+            "61": 2764.0,
+            "62": 2576.0,
+            "63": 2405.0,
+            "64": 2903.0,
+            "65": 2516.0,
+            "66": 2885.0,
+            "67": 2700.0,
+            "68": 2682.0,
+            "69": 2987.0,
+            "70": 3141.0,
+            "71": 3055.0,
+            "72": 2413.0,
+            "73": 2864.0,
+            "74": 1870.0,
+            "75": 2450.0,
+            "76": 3032.0,
+            "77": 3230.0,
+            "78": 3125.0,
+            "79": 2982.0,
+            "80": 3203.0,
+            "81": 3657.0,
+            "82": 3174.0,
+            "83": 2818.0,
+            "84": 3190.0,
+            "85": 3166.0,
+            "86": 2793.0,
+            "87": 3635.0,
+            "88": 3005.0,
+            "89": 3373.0,
+            "90": 3066.0,
+            "91": 2857.0,
+            "92": 3080.0,
+            "93": 2533.0,
+            "94": 3303.0,
+            "95": 3270.0,
+            "96": 3416.0,
+            "97": 3085.0,
+            "98": 3437.0,
+            "99": 3243.0,
+            "100": 3119.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0,
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 999540224.0,
+            "2": 1065140736.0,
+            "3": 1065140736.0,
+            "4": 1065140736.0,
+            "5": 1065140736.0,
+            "6": 1065140736.0,
+            "7": 1065140736.0,
+            "8": 1065140736.0,
+            "9": 1065140736.0,
+            "10": 1065140736.0,
+            "11": 1065140736.0,
+            "12": 1065140736.0,
+            "13": 1065140736.0,
+            "14": 1065140736.0,
+            "15": 1065140736.0,
+            "16": 1065140736.0,
+            "17": 1065140736.0,
+            "18": 1065140736.0,
+            "19": 1065140736.0,
+            "20": 1065140736.0,
+            "21": 1065140736.0,
+            "22": 1065140736.0,
+            "23": 1065140736.0,
+            "24": 1065140736.0,
+            "25": 1065140736.0,
+            "26": 1065140736.0,
+            "27": 1065140736.0,
+            "28": 1065140736.0,
+            "29": 1065140736.0,
+            "30": 1065140736.0,
+            "31": 1065140736.0,
+            "32": 1065140736.0,
+            "33": 1065140736.0,
+            "34": 1065140736.0,
+            "35": 1065140736.0,
+            "36": 1065140736.0,
+            "37": 1065140736.0,
+            "38": 1065140736.0,
+            "39": 1065140736.0,
+            "40": 1065140736.0,
+            "41": 1065140736.0,
+            "42": 1065140736.0,
+            "43": 1065140736.0,
+            "44": 1065140736.0,
+            "45": 1065140736.0,
+            "46": 1065140736.0,
+            "47": 1065140736.0,
+            "48": 1065140736.0,
+            "49": 1065140736.0,
+            "50": 1065140736.0,
+            "51": 1065140736.0,
+            "52": 1065140736.0,
+            "53": 1065140736.0,
+            "54": 1065140736.0,
+            "55": 1065140736.0,
+            "56": 1065140736.0,
+            "57": 1065140736.0,
+            "58": 1065140736.0,
+            "59": 1065140736.0,
+            "60": 1065140736.0,
+            "61": 1065140736.0,
+            "62": 1065140736.0,
+            "63": 1065140736.0,
+            "64": 1065140736.0,
+            "65": 1065140736.0,
+            "66": 1065140736.0,
+            "67": 1065140736.0,
+            "68": 1065140736.0,
+            "69": 1065140736.0,
+            "70": 1065140736.0,
+            "71": 1065140736.0,
+            "72": 1065140736.0,
+            "73": 1065140736.0,
+            "74": 1065140736.0,
+            "75": 1065140736.0,
+            "76": 1065140736.0,
+            "77": 1065140736.0,
+            "78": 1065140736.0,
+            "79": 1065140736.0,
+            "80": 1065140736.0,
+            "81": 1065140736.0,
+            "82": 1065140736.0,
+            "83": 1065140736.0,
+            "84": 1065140736.0,
+            "85": 1065140736.0,
+            "86": 1065140736.0,
+            "87": 1065140736.0,
+            "88": 1065140736.0,
+            "89": 1065140736.0,
+            "90": 1065140736.0,
+            "91": 1065140736.0,
+            "92": 1065140736.0,
+            "93": 1065140736.0,
+            "94": 1065140736.0,
+            "95": 1065140736.0,
+            "96": 1065140736.0,
+            "97": 1065140736.0,
+            "98": 1065140736.0,
+            "99": 1065140736.0,
+            "100": 1065140736.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.39769,
+            "3": 0.36211,
+            "4": 0.34515,
+            "5": 0.34772,
+            "6": 0.34875,
+            "7": 0.33703,
+            "8": 0.33347,
+            "9": 0.33755,
+            "10": 0.3382,
+            "11": 0.34227,
+            "12": 0.34002,
+            "13": 0.34177,
+            "14": 0.34923,
+            "15": 0.34952,
+            "16": 0.34726,
+            "17": 0.34899,
+            "18": 0.34889,
+            "19": 0.3485,
+            "20": 0.34828,
+            "21": 0.35065,
+            "22": 0.35089,
+            "23": 0.34992,
+            "24": 0.34939,
+            "25": 0.34831,
+            "26": 0.35035,
+            "27": 0.3455,
+            "28": 0.34034,
+            "29": 0.352,
+            "30": 0.35218,
+            "31": 0.3474,
+            "32": 0.34883,
+            "33": 0.35242,
+            "34": 0.35219,
+            "35": 0.35394,
+            "36": 0.35651,
+            "37": 0.35447,
+            "38": 0.35431,
+            "39": 0.35341,
+            "40": 0.35274,
+            "41": 0.35275,
+            "42": 0.35259,
+            "43": 0.35167,
+            "44": 0.35288,
+            "45": 0.35066,
+            "46": 0.3519,
+            "47": 0.35146,
+            "48": 0.34993,
+            "49": 0.35082,
+            "50": 0.35229,
+            "51": 0.6173,
+            "52": 0.34392,
+            "53": 0.34427,
+            "54": 0.3476,
+            "55": 0.34816,
+            "56": 0.34515,
+            "57": 0.34404,
+            "58": 0.34542,
+            "59": 0.34546,
+            "60": 0.34635,
+            "61": 0.35023,
+            "62": 0.34884,
+            "63": 0.3484,
+            "64": 0.34644,
+            "65": 0.34943,
+            "66": 0.34821,
+            "67": 0.34706,
+            "68": 0.34645,
+            "69": 0.34888,
+            "70": 0.34562,
+            "71": 0.34952,
+            "72": 0.34911,
+            "73": 0.34968,
+            "74": 0.34895,
+            "75": 0.34861,
+            "76": 0.34704,
+            "77": 0.34924,
+            "78": 0.35302,
+            "79": 0.35161,
+            "80": 0.34618,
+            "81": 0.35136,
+            "82": 0.3518,
+            "83": 0.34829,
+            "84": 0.34739,
+            "85": 0.34831,
+            "86": 0.34725,
+            "87": 0.34629,
+            "88": 0.35011,
+            "89": 0.34978,
+            "90": 0.34956,
+            "91": 0.34919,
+            "92": 0.35021,
+            "93": 0.34979,
+            "94": 0.35425,
+            "95": 0.34862,
+            "96": 0.34704,
+            "97": 0.34718,
+            "98": 0.34842,
+            "99": 0.35045,
+            "100": 0.349
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..e8ad4bfea94
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86104,
+            "52": 9.75109,
+            "53": 10.06631,
+            "54": 9.95634,
+            "55": 9.89354,
+            "56": 9.637,
+            "57": 9.49142,
+            "58": 9.8341,
+            "59": 9.5931,
+            "60": 9.51379,
+            "61": 9.69183,
+            "62": 9.99162,
+            "63": 9.39196,
+            "64": 9.77455,
+            "65": 8.96319,
+            "66": 9.70663,
+            "67": 9.3789,
+            "68": 9.78328,
+            "69": 9.79736,
+            "70": 9.73753,
+            "71": 9.62711,
+            "72": 9.58907,
+            "73": 9.50446,
+            "74": 8.94975,
+            "75": 9.4278,
+            "76": 9.08764,
+            "77": 10.06759,
+            "78": 9.72141,
+            "79": 9.3861,
+            "80": 9.40495,
+            "81": 9.48596,
+            "82": 9.70195,
+            "83": 9.31553,
+            "84": 9.41806,
+            "85": 9.61378,
+            "86": 9.08145,
+            "87": 9.59631,
+            "88": 9.75008,
+            "89": 9.60386,
+            "90": 9.82838,
+            "91": 9.33622,
+            "92": 9.35764,
+            "93": 9.08795,
+            "94": 8.83437,
+            "95": 9.53352,
+            "96": 9.53315,
+            "97": 9.31129,
+            "98": 9.67176,
+            "99": 8.89816,
+            "100": 9.40969
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2431.0,
+            "52": 2467.0,
+            "53": 2794.0,
+            "54": 2675.0,
+            "55": 2313.0,
+            "56": 2597.0,
+            "57": 2278.0,
+            "58": 2887.0,
+            "59": 2701.0,
+            "60": 2190.0,
+            "61": 2764.0,
+            "62": 2576.0,
+            "63": 2405.0,
+            "64": 2903.0,
+            "65": 2516.0,
+            "66": 2885.0,
+            "67": 2700.0,
+            "68": 2682.0,
+            "69": 2987.0,
+            "70": 3141.0,
+            "71": 3055.0,
+            "72": 2413.0,
+            "73": 2864.0,
+            "74": 1870.0,
+            "75": 2450.0,
+            "76": 3032.0,
+            "77": 3230.0,
+            "78": 3125.0,
+            "79": 2982.0,
+            "80": 3203.0,
+            "81": 3657.0,
+            "82": 3174.0,
+            "83": 2818.0,
+            "84": 3190.0,
+            "85": 3166.0,
+            "86": 2793.0,
+            "87": 3635.0,
+            "88": 3005.0,
+            "89": 3373.0,
+            "90": 3066.0,
+            "91": 2857.0,
+            "92": 3080.0,
+            "93": 2533.0,
+            "94": 3303.0,
+            "95": 3270.0,
+            "96": 3416.0,
+            "97": 3085.0,
+            "98": 3437.0,
+            "99": 3243.0,
+            "100": 3119.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 365196800.0,
+            "52": 365196800.0,
+            "53": 365196800.0,
+            "54": 365196800.0,
+            "55": 365196800.0,
+            "56": 365196800.0,
+            "57": 365196800.0,
+            "58": 365196800.0,
+            "59": 365196800.0,
+            "60": 365196800.0,
+            "61": 365196800.0,
+            "62": 365196800.0,
+            "63": 365196800.0,
+            "64": 365196800.0,
+            "65": 365196800.0,
+            "66": 365196800.0,
+            "67": 365196800.0,
+            "68": 365196800.0,
+            "69": 365196800.0,
+            "70": 365196800.0,
+            "71": 365196800.0,
+            "72": 365196800.0,
+            "73": 365196800.0,
+            "74": 365196800.0,
+            "75": 365196800.0,
+            "76": 365196800.0,
+            "77": 365196800.0,
+            "78": 365196800.0,
+            "79": 365196800.0,
+            "80": 365196800.0,
+            "81": 365196800.0,
+            "82": 365196800.0,
+            "83": 365196800.0,
+            "84": 365196800.0,
+            "85": 365196800.0,
+            "86": 365196800.0,
+            "87": 365196800.0,
+            "88": 365196800.0,
+            "89": 365196800.0,
+            "90": 365196800.0,
+            "91": 365196800.0,
+            "92": 365196800.0,
+            "93": 365196800.0,
+            "94": 365196800.0,
+            "95": 365196800.0,
+            "96": 365196800.0,
+            "97": 365196800.0,
+            "98": 365196800.0,
+            "99": 365196800.0,
+            "100": 365196800.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1132180992.0,
+            "52": 1132182016.0,
+            "53": 1132182016.0,
+            "54": 1132182016.0,
+            "55": 1132182016.0,
+            "56": 1132182016.0,
+            "57": 1132182016.0,
+            "58": 1132182016.0,
+            "59": 1132182016.0,
+            "60": 1132182016.0,
+            "61": 1132182016.0,
+            "62": 1132182016.0,
+            "63": 1132182016.0,
+            "64": 1132182016.0,
+            "65": 1132182016.0,
+            "66": 1132182016.0,
+            "67": 1132182016.0,
+            "68": 1132182016.0,
+            "69": 1132182016.0,
+            "70": 1132182016.0,
+            "71": 1132182016.0,
+            "72": 1132182016.0,
+            "73": 1132182016.0,
+            "74": 1132182016.0,
+            "75": 1132182016.0,
+            "76": 1132182016.0,
+            "77": 1132182016.0,
+            "78": 1132182016.0,
+            "79": 1132182016.0,
+            "80": 1132182016.0,
+            "81": 1132182016.0,
+            "82": 1132182016.0,
+            "83": 1132182016.0,
+            "84": 1132182016.0,
+            "85": 1132182016.0,
+            "86": 1132182016.0,
+            "87": 1132182016.0,
+            "88": 1132182016.0,
+            "89": 1132182016.0,
+            "90": 1132182016.0,
+            "91": 1132182016.0,
+            "92": 1132182016.0,
+            "93": 1132182016.0,
+            "94": 1132182016.0,
+            "95": 1132182016.0,
+            "96": 1132182016.0,
+            "97": 1132182016.0,
+            "98": 1132182016.0,
+            "99": 1132182016.0,
+            "100": 1132182016.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.25993,
+            "53": 0.33819,
+            "54": 0.33044,
+            "55": 0.3308,
+            "56": 0.32801,
+            "57": 0.33341,
+            "58": 0.33637,
+            "59": 0.33863,
+            "60": 0.33792,
+            "61": 0.33956,
+            "62": 0.34024,
+            "63": 0.33835,
+            "64": 0.34288,
+            "65": 0.33962,
+            "66": 0.34436,
+            "67": 0.34564,
+            "68": 0.34571,
+            "69": 0.34698,
+            "70": 0.34462,
+            "71": 0.34382,
+            "72": 0.3445,
+            "73": 0.3446,
+            "74": 0.34215,
+            "75": 0.34655,
+            "76": 0.34397,
+            "77": 0.34288,
+            "78": 0.34323,
+            "79": 0.34095,
+            "80": 0.34228,
+            "81": 0.34072,
+            "82": 0.34333,
+            "83": 0.34118,
+            "84": 0.34195,
+            "85": 0.34307,
+            "86": 0.34341,
+            "87": 0.34409,
+            "88": 0.34372,
+            "89": 0.34284,
+            "90": 0.34363,
+            "91": 0.347,
+            "92": 0.34448,
+            "93": 0.3445,
+            "94": 0.34642,
+            "95": 0.34511,
+            "96": 0.34515,
+            "97": 0.34484,
+            "98": 0.34543,
+            "99": 0.34503,
+            "100": 0.34832
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
index b61916ffd95..76ec80299fc 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 299203072.0,
-            "2": 299203072.0,
-            "3": 299203072.0,
-            "4": 299203072.0,
-            "5": 299203072.0,
-            "6": 299203072.0,
-            "7": 299203072.0,
-            "8": 299203072.0,
-            "9": 299203072.0,
-            "10": 299203072.0,
-            "11": 299203072.0,
-            "12": 299203072.0,
-            "13": 299203072.0,
-            "14": 299203072.0,
-            "15": 299203072.0,
-            "16": 299203072.0,
-            "17": 299203072.0,
-            "18": 299203072.0,
-            "19": 299203072.0,
-            "20": 299203072.0,
-            "21": 299203072.0,
-            "22": 299203072.0,
-            "23": 299203072.0,
-            "24": 299203072.0,
-            "25": 299203072.0,
-            "26": 299203072.0,
-            "27": 299203072.0,
-            "28": 299203072.0,
-            "29": 299203072.0,
-            "30": 299203072.0,
-            "31": 299203072.0,
-            "32": 299203072.0,
-            "33": 299203072.0,
-            "34": 299203072.0,
-            "35": 299203072.0,
-            "36": 299203072.0,
-            "37": 299203072.0,
-            "38": 299203072.0,
-            "39": 299203072.0,
-            "40": 299203072.0,
-            "41": 299203072.0,
-            "42": 299203072.0,
-            "43": 299203072.0,
-            "44": 299203072.0,
-            "45": 299203072.0,
-            "46": 299203072.0,
-            "47": 299203072.0,
-            "48": 299203072.0,
-            "49": 299203072.0,
-            "50": 299203072.0,
-            "51": 299203072.0,
-            "52": 299203072.0,
-            "53": 299203072.0,
-            "54": 299203072.0,
-            "55": 299203072.0,
-            "56": 299203072.0,
-            "57": 299203072.0,
-            "58": 299203072.0,
-            "59": 299203072.0,
-            "60": 299203072.0,
-            "61": 299203072.0,
-            "62": 299203072.0,
-            "63": 299203072.0,
-            "64": 299203072.0,
-            "65": 299203072.0,
-            "66": 299203072.0,
-            "67": 299203072.0,
-            "68": 299203072.0,
-            "69": 299203072.0,
-            "70": 299203072.0,
-            "71": 299203072.0,
-            "72": 299203072.0,
-            "73": 299203072.0,
-            "74": 299203072.0,
-            "75": 299203072.0,
-            "76": 299203072.0,
-            "77": 299203072.0,
-            "78": 299203072.0,
-            "79": 299203072.0,
-            "80": 299203072.0,
-            "81": 299203072.0,
-            "82": 299203072.0,
-            "83": 299203072.0,
-            "84": 299203072.0,
-            "85": 299203072.0,
-            "86": 299203072.0,
-            "87": 299203072.0,
-            "88": 299203072.0,
-            "89": 299203072.0,
-            "90": 299203072.0,
-            "91": 299203072.0,
-            "92": 299203072.0,
-            "93": 299203072.0,
-            "94": 299203072.0,
-            "95": 299203072.0,
-            "96": 299203072.0,
-            "97": 299203072.0,
-            "98": 299203072.0,
-            "99": 299203072.0,
-            "100": 299203072.0
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0,
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 977125888.0,
-            "2": 1042071040.0,
-            "3": 1042071040.0,
-            "4": 1042071040.0,
-            "5": 1042071040.0,
-            "6": 1042071040.0,
-            "7": 1042071040.0,
-            "8": 1042071040.0,
-            "9": 1042071040.0,
-            "10": 1042071040.0,
-            "11": 1042071040.0,
-            "12": 1042071040.0,
-            "13": 1042071040.0,
-            "14": 1042071040.0,
-            "15": 1042071040.0,
-            "16": 1042071040.0,
-            "17": 1042071040.0,
-            "18": 1042071040.0,
-            "19": 1042071040.0,
-            "20": 1042071040.0,
-            "21": 1042071040.0,
-            "22": 1042071040.0,
-            "23": 1042071040.0,
-            "24": 1042071040.0,
-            "25": 1042071040.0,
-            "26": 1042071040.0,
-            "27": 1042071040.0,
-            "28": 1042071040.0,
-            "29": 1042071040.0,
-            "30": 1042071040.0,
-            "31": 1042071040.0,
-            "32": 1042071040.0,
-            "33": 1042071040.0,
-            "34": 1042071040.0,
-            "35": 1042071040.0,
-            "36": 1042071040.0,
-            "37": 1042071040.0,
-            "38": 1042071040.0,
-            "39": 1042071040.0,
-            "40": 1042071040.0,
-            "41": 1042071040.0,
-            "42": 1042071040.0,
-            "43": 1042071040.0,
-            "44": 1042071040.0,
-            "45": 1042071040.0,
-            "46": 1042071040.0,
-            "47": 1042071040.0,
-            "48": 1042071040.0,
-            "49": 1042071040.0,
-            "50": 1042071040.0,
-            "51": 1042071040.0,
-            "52": 1042071040.0,
-            "53": 1042071040.0,
-            "54": 1042071040.0,
-            "55": 1042071040.0,
-            "56": 1042071040.0,
-            "57": 1042071040.0,
-            "58": 1042071040.0,
-            "59": 1042071040.0,
-            "60": 1042071040.0,
-            "61": 1042071040.0,
-            "62": 1042071040.0,
-            "63": 1042071040.0,
-            "64": 1042071040.0,
-            "65": 1042071040.0,
-            "66": 1042071040.0,
-            "67": 1042071040.0,
-            "68": 1042071040.0,
-            "69": 1042071040.0,
-            "70": 1042071040.0,
-            "71": 1042071040.0,
-            "72": 1042071040.0,
-            "73": 1042071040.0,
-            "74": 1042071040.0,
-            "75": 1042071040.0,
-            "76": 1042071040.0,
-            "77": 1042071040.0,
-            "78": 1042071040.0,
-            "79": 1042071040.0,
-            "80": 1042071040.0,
-            "81": 1042071040.0,
-            "82": 1042071040.0,
-            "83": 1042071040.0,
-            "84": 1042071040.0,
-            "85": 1042071040.0,
-            "86": 1042071040.0,
-            "87": 1042071040.0,
-            "88": 1042071040.0,
-            "89": 1042071040.0,
-            "90": 1042071040.0,
-            "91": 1042071040.0,
-            "92": 1042071040.0,
-            "93": 1042071040.0,
-            "94": 1042071040.0,
-            "95": 1042071040.0,
-            "96": 1042071040.0,
-            "97": 1042071040.0,
-            "98": 1042071040.0,
-            "99": 1042071040.0,
-            "100": 1042071040.0
+            "1": 977519616.0,
+            "2": 1042465280.0,
+            "3": 1042465280.0,
+            "4": 1042465280.0,
+            "5": 1042465280.0,
+            "6": 1042465280.0,
+            "7": 1042465280.0,
+            "8": 1042465280.0,
+            "9": 1042465280.0,
+            "10": 1042465280.0,
+            "11": 1042465280.0,
+            "12": 1042465280.0,
+            "13": 1042465280.0,
+            "14": 1042465280.0,
+            "15": 1042465280.0,
+            "16": 1042465280.0,
+            "17": 1042465280.0,
+            "18": 1042465280.0,
+            "19": 1042465280.0,
+            "20": 1042465280.0,
+            "21": 1042465280.0,
+            "22": 1042465280.0,
+            "23": 1042465280.0,
+            "24": 1042465280.0,
+            "25": 1042465280.0,
+            "26": 1042465280.0,
+            "27": 1042465280.0,
+            "28": 1042465280.0,
+            "29": 1042465280.0,
+            "30": 1042465280.0,
+            "31": 1042465280.0,
+            "32": 1042465280.0,
+            "33": 1042465280.0,
+            "34": 1042465280.0,
+            "35": 1042465280.0,
+            "36": 1042465280.0,
+            "37": 1042465280.0,
+            "38": 1042465280.0,
+            "39": 1042465280.0,
+            "40": 1042465280.0,
+            "41": 1042465280.0,
+            "42": 1042465280.0,
+            "43": 1042465280.0,
+            "44": 1042465280.0,
+            "45": 1042465280.0,
+            "46": 1042465280.0,
+            "47": 1042465280.0,
+            "48": 1042465280.0,
+            "49": 1042465280.0,
+            "50": 1042465280.0,
+            "51": 1042465280.0,
+            "52": 1042465280.0,
+            "53": 1042465280.0,
+            "54": 1042465280.0,
+            "55": 1042465280.0,
+            "56": 1042465280.0,
+            "57": 1042465280.0,
+            "58": 1042465280.0,
+            "59": 1042465280.0,
+            "60": 1042465280.0,
+            "61": 1042465280.0,
+            "62": 1042465280.0,
+            "63": 1042465280.0,
+            "64": 1042465280.0,
+            "65": 1042465280.0,
+            "66": 1042465280.0,
+            "67": 1042465280.0,
+            "68": 1042465280.0,
+            "69": 1042465280.0,
+            "70": 1042465280.0,
+            "71": 1042465280.0,
+            "72": 1042465280.0,
+            "73": 1042465280.0,
+            "74": 1042465280.0,
+            "75": 1042465280.0,
+            "76": 1042465280.0,
+            "77": 1042465280.0,
+            "78": 1042465280.0,
+            "79": 1042465280.0,
+            "80": 1042465280.0,
+            "81": 1042465280.0,
+            "82": 1042465280.0,
+            "83": 1042465280.0,
+            "84": 1042465280.0,
+            "85": 1042465280.0,
+            "86": 1042465280.0,
+            "87": 1042465280.0,
+            "88": 1042465280.0,
+            "89": 1042465280.0,
+            "90": 1042465280.0,
+            "91": 1042465280.0,
+            "92": 1042465280.0,
+            "93": 1042465280.0,
+            "94": 1042465280.0,
+            "95": 1042465280.0,
+            "96": 1042465280.0,
+            "97": 1042465280.0,
+            "98": 1042465280.0,
+            "99": 1042465280.0,
+            "100": 1042465280.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.40872,
-            "2": 0.25886,
-            "3": 0.22849,
-            "4": 0.21099,
-            "5": 0.21193,
-            "6": 0.20863,
-            "7": 0.20987,
-            "8": 0.21014,
-            "9": 0.21139,
-            "10": 0.21148,
-            "11": 0.21513,
-            "12": 0.21915,
-            "13": 0.21037,
-            "14": 0.20786,
-            "15": 0.20927,
-            "16": 0.20756,
-            "17": 0.21005,
-            "18": 0.21022,
-            "19": 0.21019,
-            "20": 0.21012,
-            "21": 0.20995,
-            "22": 0.21005,
-            "23": 0.21213,
-            "24": 0.20995,
-            "25": 0.20776,
-            "26": 0.21296,
-            "27": 0.20984,
-            "28": 0.21526,
-            "29": 0.21164,
-            "30": 0.21175,
-            "31": 0.21062,
-            "32": 0.21292,
-            "33": 0.20962,
-            "34": 0.21025,
-            "35": 0.20968,
-            "36": 0.21367,
-            "37": 0.20989,
-            "38": 0.21034,
-            "39": 0.20979,
-            "40": 0.21092,
-            "41": 0.21065,
-            "42": 0.20865,
-            "43": 0.20939,
-            "44": 0.21656,
-            "45": 0.21131,
-            "46": 0.21087,
-            "47": 0.23723,
-            "48": 0.21006,
-            "49": 0.21157,
-            "50": 0.20975,
-            "51": 0.21952,
-            "52": 0.21306,
-            "53": 0.21253,
-            "54": 0.21223,
-            "55": 0.21336,
-            "56": 0.21514,
-            "57": 0.21536,
-            "58": 0.21288,
-            "59": 0.21211,
-            "60": 0.21298,
-            "61": 0.21285,
-            "62": 0.21438,
-            "63": 0.21461,
-            "64": 0.21382,
-            "65": 0.22082,
-            "66": 0.21222,
-            "67": 0.21414,
-            "68": 0.21315,
-            "69": 0.2153,
-            "70": 0.2172,
-            "71": 0.21323,
-            "72": 0.21366,
-            "73": 0.21434,
-            "74": 0.21455,
-            "75": 0.21545,
-            "76": 0.21631,
-            "77": 0.21419,
-            "78": 0.21365,
-            "79": 0.21514,
-            "80": 0.21447,
-            "81": 0.21379,
-            "82": 0.21487,
-            "83": 0.21038,
-            "84": 0.21708,
-            "85": 0.21166,
-            "86": 0.2141,
-            "87": 0.21613,
-            "88": 0.21214,
-            "89": 0.21499,
-            "90": 0.21811,
-            "91": 0.21563,
-            "92": 0.2152,
-            "93": 0.21548,
-            "94": 0.21863,
-            "95": 0.21366,
-            "96": 0.21458,
-            "97": 0.21279,
-            "98": 0.21555,
-            "99": 0.213,
-            "100": 0.2112
+            "1": 9.3573,
+            "2": 0.22781,
+            "3": 0.20223,
+            "4": 0.18298,
+            "5": 0.18347,
+            "6": 0.18262,
+            "7": 0.18305,
+            "8": 0.18295,
+            "9": 0.18205,
+            "10": 0.18986,
+            "11": 0.18455,
+            "12": 0.18245,
+            "13": 0.18257,
+            "14": 0.18276,
+            "15": 0.18245,
+            "16": 0.18291,
+            "17": 0.18246,
+            "18": 0.18732,
+            "19": 0.18256,
+            "20": 0.17944,
+            "21": 0.18071,
+            "22": 0.17927,
+            "23": 0.18026,
+            "24": 0.17928,
+            "25": 0.17797,
+            "26": 0.17889,
+            "27": 0.17809,
+            "28": 0.17769,
+            "29": 0.1779,
+            "30": 0.17904,
+            "31": 0.1865,
+            "32": 0.17922,
+            "33": 0.17866,
+            "34": 0.17807,
+            "35": 0.17828,
+            "36": 0.17941,
+            "37": 0.17744,
+            "38": 0.17752,
+            "39": 0.17793,
+            "40": 0.17906,
+            "41": 0.17769,
+            "42": 0.17938,
+            "43": 0.17822,
+            "44": 0.17848,
+            "45": 0.17846,
+            "46": 0.17952,
+            "47": 0.17854,
+            "48": 0.17937,
+            "49": 0.17929,
+            "50": 0.17767,
+            "51": 0.19143,
+            "52": 0.18056,
+            "53": 0.18054,
+            "54": 0.18173,
+            "55": 0.18101,
+            "56": 0.18146,
+            "57": 0.1796,
+            "58": 0.18116,
+            "59": 0.18351,
+            "60": 0.17824,
+            "61": 0.17784,
+            "62": 0.17757,
+            "63": 0.17868,
+            "64": 0.17881,
+            "65": 0.17844,
+            "66": 0.1766,
+            "67": 0.17725,
+            "68": 0.17696,
+            "69": 0.1769,
+            "70": 0.17752,
+            "71": 0.17684,
+            "72": 0.17943,
+            "73": 0.17816,
+            "74": 0.1781,
+            "75": 0.17671,
+            "76": 0.17658,
+            "77": 0.17778,
+            "78": 0.1771,
+            "79": 0.17667,
+            "80": 0.17694,
+            "81": 0.17739,
+            "82": 0.18259,
+            "83": 0.1806,
+            "84": 0.18169,
+            "85": 0.18154,
+            "86": 0.1832,
+            "87": 0.18284,
+            "88": 0.18358,
+            "89": 0.18203,
+            "90": 0.18406,
+            "91": 0.18296,
+            "92": 0.18249,
+            "93": 0.1823,
+            "94": 0.1834,
+            "95": 0.18246,
+            "96": 0.19284,
+            "97": 0.7432,
+            "98": 0.20476,
+            "99": 0.19058,
+            "100": 0.18263
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..59e234529c3
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85839,
+            "52": 9.7506,
+            "53": 10.05817,
+            "54": 9.96076,
+            "55": 9.88738,
+            "56": 9.6344,
+            "57": 9.4967,
+            "58": 9.83343,
+            "59": 9.59391,
+            "60": 9.51376,
+            "61": 9.69928,
+            "62": 9.98089,
+            "63": 9.39065,
+            "64": 9.77599,
+            "65": 8.9571,
+            "66": 9.70054,
+            "67": 9.37,
+            "68": 9.78529,
+            "69": 9.78966,
+            "70": 9.74676,
+            "71": 9.61906,
+            "72": 9.58963,
+            "73": 9.49629,
+            "74": 8.94963,
+            "75": 9.42381,
+            "76": 9.07799,
+            "77": 10.07105,
+            "78": 9.72632,
+            "79": 9.37966,
+            "80": 9.40721,
+            "81": 9.48238,
+            "82": 9.70152,
+            "83": 9.30657,
+            "84": 9.41464,
+            "85": 9.61784,
+            "86": 9.08212,
+            "87": 9.59511,
+            "88": 9.75008,
+            "89": 9.60356,
+            "90": 9.82256,
+            "91": 9.33721,
+            "92": 9.35861,
+            "93": 9.07956,
+            "94": 8.83268,
+            "95": 9.51351,
+            "96": 9.52947,
+            "97": 9.31813,
+            "98": 9.67451,
+            "99": 8.88607,
+            "100": 9.40106
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2621.0,
+            "52": 2597.0,
+            "53": 2926.0,
+            "54": 2633.0,
+            "55": 2206.0,
+            "56": 2627.0,
+            "57": 2328.0,
+            "58": 2886.0,
+            "59": 2639.0,
+            "60": 2157.0,
+            "61": 2736.0,
+            "62": 2544.0,
+            "63": 2332.0,
+            "64": 2948.0,
+            "65": 2630.0,
+            "66": 2931.0,
+            "67": 2717.0,
+            "68": 2643.0,
+            "69": 2955.0,
+            "70": 3040.0,
+            "71": 2882.0,
+            "72": 2390.0,
+            "73": 2812.0,
+            "74": 1844.0,
+            "75": 2461.0,
+            "76": 3067.0,
+            "77": 3152.0,
+            "78": 3018.0,
+            "79": 3008.0,
+            "80": 3104.0,
+            "81": 3589.0,
+            "82": 3218.0,
+            "83": 2748.0,
+            "84": 3217.0,
+            "85": 3167.0,
+            "86": 2876.0,
+            "87": 3604.0,
+            "88": 3017.0,
+            "89": 3249.0,
+            "90": 3069.0,
+            "91": 2865.0,
+            "92": 3074.0,
+            "93": 2680.0,
+            "94": 3392.0,
+            "95": 3206.0,
+            "96": 3401.0,
+            "97": 3107.0,
+            "98": 3624.0,
+            "99": 3007.0,
+            "100": 3111.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 365196800.0,
+            "52": 365196800.0,
+            "53": 365196800.0,
+            "54": 365196800.0,
+            "55": 365196800.0,
+            "56": 365196800.0,
+            "57": 365196800.0,
+            "58": 365196800.0,
+            "59": 365196800.0,
+            "60": 365196800.0,
+            "61": 365196800.0,
+            "62": 365196800.0,
+            "63": 365196800.0,
+            "64": 365196800.0,
+            "65": 365196800.0,
+            "66": 365196800.0,
+            "67": 365196800.0,
+            "68": 365196800.0,
+            "69": 365196800.0,
+            "70": 365196800.0,
+            "71": 365196800.0,
+            "72": 365196800.0,
+            "73": 365196800.0,
+            "74": 365196800.0,
+            "75": 365196800.0,
+            "76": 365196800.0,
+            "77": 365196800.0,
+            "78": 365196800.0,
+            "79": 365196800.0,
+            "80": 365196800.0,
+            "81": 365196800.0,
+            "82": 365196800.0,
+            "83": 365196800.0,
+            "84": 365196800.0,
+            "85": 365196800.0,
+            "86": 365196800.0,
+            "87": 365196800.0,
+            "88": 365196800.0,
+            "89": 365196800.0,
+            "90": 365196800.0,
+            "91": 365196800.0,
+            "92": 365196800.0,
+            "93": 365196800.0,
+            "94": 365196800.0,
+            "95": 365196800.0,
+            "96": 365196800.0,
+            "97": 365196800.0,
+            "98": 365196800.0,
+            "99": 365196800.0,
+            "100": 365196800.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1109505024.0,
+            "52": 1109506560.0,
+            "53": 1109506560.0,
+            "54": 1109506560.0,
+            "55": 1109506560.0,
+            "56": 1109506560.0,
+            "57": 1109506560.0,
+            "58": 1109506560.0,
+            "59": 1109506560.0,
+            "60": 1109506560.0,
+            "61": 1109506560.0,
+            "62": 1109506560.0,
+            "63": 1109506560.0,
+            "64": 1109506560.0,
+            "65": 1109506560.0,
+            "66": 1109506560.0,
+            "67": 1109506560.0,
+            "68": 1109506560.0,
+            "69": 1109506560.0,
+            "70": 1109506560.0,
+            "71": 1109506560.0,
+            "72": 1109506560.0,
+            "73": 1109506560.0,
+            "74": 1109506560.0,
+            "75": 1109506560.0,
+            "76": 1109506560.0,
+            "77": 1109506560.0,
+            "78": 1109506560.0,
+            "79": 1109506560.0,
+            "80": 1109506560.0,
+            "81": 1109506560.0,
+            "82": 1109506560.0,
+            "83": 1109506560.0,
+            "84": 1109506560.0,
+            "85": 1109506560.0,
+            "86": 1109506560.0,
+            "87": 1109506560.0,
+            "88": 1109506560.0,
+            "89": 1109506560.0,
+            "90": 1109506560.0,
+            "91": 1109506560.0,
+            "92": 1109506560.0,
+            "93": 1109506560.0,
+            "94": 1109506560.0,
+            "95": 1109506560.0,
+            "96": 1109506560.0,
+            "97": 1109506560.0,
+            "98": 1109506560.0,
+            "99": 1109506560.0,
+            "100": 1109506560.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.93427,
+            "52": 0.21812,
+            "53": 0.185,
+            "54": 0.18425,
+            "55": 0.18174,
+            "56": 0.18062,
+            "57": 0.17992,
+            "58": 0.17969,
+            "59": 0.18021,
+            "60": 0.18245,
+            "61": 0.18025,
+            "62": 0.18048,
+            "63": 0.18064,
+            "64": 0.18121,
+            "65": 0.17955,
+            "66": 0.18229,
+            "67": 0.17924,
+            "68": 0.18046,
+            "69": 0.18052,
+            "70": 0.17985,
+            "71": 0.18045,
+            "72": 0.17993,
+            "73": 0.17909,
+            "74": 0.18421,
+            "75": 0.18068,
+            "76": 0.18347,
+            "77": 0.18157,
+            "78": 0.18084,
+            "79": 0.17981,
+            "80": 0.17936,
+            "81": 0.17999,
+            "82": 0.18094,
+            "83": 0.17982,
+            "84": 0.18317,
+            "85": 0.18036,
+            "86": 0.1809,
+            "87": 0.17889,
+            "88": 0.17894,
+            "89": 0.17919,
+            "90": 0.17925,
+            "91": 0.17923,
+            "92": 0.17791,
+            "93": 0.17995,
+            "94": 0.17922,
+            "95": 0.17997,
+            "96": 0.17959,
+            "97": 0.1793,
+            "98": 0.1799,
+            "99": 0.17942,
+            "100": 0.17849
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
index 0cc3719ac53..1e42aa887f6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 971619840.0,
-            "2": 1036172800.0,
-            "3": 1036172800.0,
-            "4": 1036172800.0,
-            "5": 1036172800.0,
-            "6": 1036172800.0,
-            "7": 1036172800.0,
-            "8": 1036172800.0,
-            "9": 1036172800.0,
-            "10": 1036172800.0,
-            "11": 1036172800.0,
-            "12": 1036172800.0,
-            "13": 1036172800.0,
-            "14": 1036172800.0,
-            "15": 1036172800.0,
-            "16": 1036172800.0,
-            "17": 1036172800.0,
-            "18": 1036172800.0,
-            "19": 1036172800.0,
-            "20": 1036172800.0,
-            "21": 1036172800.0,
-            "22": 1036172800.0,
-            "23": 1036172800.0,
-            "24": 1036172800.0,
-            "25": 1036172800.0,
-            "26": 1036172800.0,
-            "27": 1036172800.0,
-            "28": 1036172800.0,
-            "29": 1036172800.0,
-            "30": 1036172800.0,
-            "31": 1036172800.0,
-            "32": 1036172800.0,
-            "33": 1036172800.0,
-            "34": 1036172800.0,
-            "35": 1036172800.0,
-            "36": 1036172800.0,
-            "37": 1036172800.0,
-            "38": 1036172800.0,
-            "39": 1036172800.0,
-            "40": 1036172800.0,
-            "41": 1036172800.0,
-            "42": 1036172800.0,
-            "43": 1036172800.0,
-            "44": 1036172800.0,
-            "45": 1036172800.0,
-            "46": 1036172800.0,
-            "47": 1036172800.0,
-            "48": 1036172800.0,
-            "49": 1036172800.0,
-            "50": 1036172800.0,
-            "51": 1036172800.0,
-            "52": 1036172800.0,
-            "53": 1036172800.0,
-            "54": 1036172800.0,
-            "55": 1036172800.0,
-            "56": 1036172800.0,
-            "57": 1036172800.0,
-            "58": 1036172800.0,
-            "59": 1036172800.0,
-            "60": 1036172800.0,
-            "61": 1036172800.0,
-            "62": 1036172800.0,
-            "63": 1036172800.0,
-            "64": 1036172800.0,
-            "65": 1036172800.0,
-            "66": 1036172800.0,
-            "67": 1036172800.0,
-            "68": 1036172800.0,
-            "69": 1036172800.0,
-            "70": 1036172800.0,
-            "71": 1036172800.0,
-            "72": 1036172800.0,
-            "73": 1036172800.0,
-            "74": 1036172800.0,
-            "75": 1036172800.0,
-            "76": 1036172800.0,
-            "77": 1036172800.0,
-            "78": 1036172800.0,
-            "79": 1036172800.0,
-            "80": 1036172800.0,
-            "81": 1036172800.0,
-            "82": 1036172800.0,
-            "83": 1036172800.0,
-            "84": 1036172800.0,
-            "85": 1036172800.0,
-            "86": 1036172800.0,
-            "87": 1036172800.0,
-            "88": 1036172800.0,
-            "89": 1036172800.0,
-            "90": 1036172800.0,
-            "91": 1036172800.0,
-            "92": 1036172800.0,
-            "93": 1036172800.0,
-            "94": 1036172800.0,
-            "95": 1036172800.0,
-            "96": 1036172800.0,
-            "97": 1036172800.0,
-            "98": 1036172800.0,
-            "99": 1036172800.0,
-            "100": 1036172800.0
+            "1": 968737280.0,
+            "2": 1035779584.0,
+            "3": 1035779584.0,
+            "4": 1035779584.0,
+            "5": 1035779584.0,
+            "6": 1035779584.0,
+            "7": 1035779584.0,
+            "8": 1035779584.0,
+            "9": 1035779584.0,
+            "10": 1035779584.0,
+            "11": 1035779584.0,
+            "12": 1035779584.0,
+            "13": 1035779584.0,
+            "14": 1035779584.0,
+            "15": 1035779584.0,
+            "16": 1035779584.0,
+            "17": 1035779584.0,
+            "18": 1035779584.0,
+            "19": 1035779584.0,
+            "20": 1035779584.0,
+            "21": 1035779584.0,
+            "22": 1035779584.0,
+            "23": 1035779584.0,
+            "24": 1035779584.0,
+            "25": 1035779584.0,
+            "26": 1035779584.0,
+            "27": 1035779584.0,
+            "28": 1035779584.0,
+            "29": 1035779584.0,
+            "30": 1035779584.0,
+            "31": 1035779584.0,
+            "32": 1035779584.0,
+            "33": 1035779584.0,
+            "34": 1035779584.0,
+            "35": 1035779584.0,
+            "36": 1035779584.0,
+            "37": 1035779584.0,
+            "38": 1035779584.0,
+            "39": 1035779584.0,
+            "40": 1035779584.0,
+            "41": 1035779584.0,
+            "42": 1035779584.0,
+            "43": 1035779584.0,
+            "44": 1035779584.0,
+            "45": 1035779584.0,
+            "46": 1035779584.0,
+            "47": 1035779584.0,
+            "48": 1035779584.0,
+            "49": 1035779584.0,
+            "50": 1035779584.0,
+            "51": 1035779584.0,
+            "52": 1035779584.0,
+            "53": 1035779584.0,
+            "54": 1035779584.0,
+            "55": 1035779584.0,
+            "56": 1035779584.0,
+            "57": 1035779584.0,
+            "58": 1035779584.0,
+            "59": 1035779584.0,
+            "60": 1035779584.0,
+            "61": 1035779584.0,
+            "62": 1035779584.0,
+            "63": 1035779584.0,
+            "64": 1035779584.0,
+            "65": 1035779584.0,
+            "66": 1035779584.0,
+            "67": 1035779584.0,
+            "68": 1035779584.0,
+            "69": 1035779584.0,
+            "70": 1035779584.0,
+            "71": 1035779584.0,
+            "72": 1035779584.0,
+            "73": 1035779584.0,
+            "74": 1035779584.0,
+            "75": 1035779584.0,
+            "76": 1035779584.0,
+            "77": 1035779584.0,
+            "78": 1035779584.0,
+            "79": 1035779584.0,
+            "80": 1035779584.0,
+            "81": 1035779584.0,
+            "82": 1035779584.0,
+            "83": 1035779584.0,
+            "84": 1035779584.0,
+            "85": 1035779584.0,
+            "86": 1035779584.0,
+            "87": 1035779584.0,
+            "88": 1035779584.0,
+            "89": 1035779584.0,
+            "90": 1035779584.0,
+            "91": 1035779584.0,
+            "92": 1035779584.0,
+            "93": 1035779584.0,
+            "94": 1035779584.0,
+            "95": 1035779584.0,
+            "96": 1035779584.0,
+            "97": 1035779584.0,
+            "98": 1035779584.0,
+            "99": 1035779584.0,
+            "100": 1035779584.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.22987,
-            "2": 0.54363,
-            "3": 0.2879,
-            "4": 0.28745,
-            "5": 0.28509,
-            "6": 0.28364,
-            "7": 0.28401,
-            "8": 0.28235,
-            "9": 0.28321,
-            "10": 0.32258,
-            "11": 0.28697,
-            "12": 0.27808,
-            "13": 0.27857,
-            "14": 0.27833,
-            "15": 0.28035,
-            "16": 0.27859,
-            "17": 0.27841,
-            "18": 0.27879,
-            "19": 0.27874,
-            "20": 0.27806,
-            "21": 0.27812,
-            "22": 0.2783,
-            "23": 0.27919,
-            "24": 0.27841,
-            "25": 0.27852,
-            "26": 0.27871,
-            "27": 0.27891,
-            "28": 0.28056,
-            "29": 0.27909,
-            "30": 0.2797,
-            "31": 0.27903,
-            "32": 0.27895,
-            "33": 0.27929,
-            "34": 0.27838,
-            "35": 0.27904,
-            "36": 0.2787,
-            "37": 0.28662,
-            "38": 0.27812,
-            "39": 0.27805,
-            "40": 0.27846,
-            "41": 0.27884,
-            "42": 0.27807,
-            "43": 0.27794,
-            "44": 0.27825,
-            "45": 0.28052,
-            "46": 0.27856,
-            "47": 0.27832,
-            "48": 0.27799,
-            "49": 0.2783,
-            "50": 0.27861,
-            "51": 0.2915,
-            "52": 0.28668,
-            "53": 0.28545,
-            "54": 0.28632,
-            "55": 0.28616,
-            "56": 0.28735,
-            "57": 0.28738,
-            "58": 0.28556,
-            "59": 0.28453,
-            "60": 0.28543,
-            "61": 0.28452,
-            "62": 0.28404,
-            "63": 0.28542,
-            "64": 0.28492,
-            "65": 0.28488,
-            "66": 0.2861,
-            "67": 0.286,
-            "68": 0.28505,
-            "69": 0.28531,
-            "70": 0.28377,
-            "71": 0.28517,
-            "72": 0.28454,
-            "73": 0.2853,
-            "74": 0.28678,
-            "75": 0.28484,
-            "76": 0.28523,
-            "77": 0.28548,
-            "78": 0.28488,
-            "79": 0.28559,
-            "80": 0.28528,
-            "81": 0.28479,
-            "82": 0.28465,
-            "83": 0.28506,
-            "84": 0.28493,
-            "85": 0.28486,
-            "86": 0.28572,
-            "87": 0.28404,
-            "88": 0.28473,
-            "89": 0.28431,
-            "90": 0.28945,
-            "91": 0.28446,
-            "92": 0.28489,
-            "93": 0.28474,
-            "94": 0.28484,
-            "95": 0.28526,
-            "96": 0.28573,
-            "97": 0.28411,
-            "98": 0.28402,
-            "99": 0.28413,
-            "100": 0.28454
+            "1": 3.63869,
+            "2": 0.35485,
+            "3": 0.2965,
+            "4": 0.28503,
+            "5": 0.28544,
+            "6": 0.284,
+            "7": 0.28704,
+            "8": 0.28585,
+            "9": 0.286,
+            "10": 0.2866,
+            "11": 0.28746,
+            "12": 0.28519,
+            "13": 0.28493,
+            "14": 0.28132,
+            "15": 0.2846,
+            "16": 0.28078,
+            "17": 0.28134,
+            "18": 0.28108,
+            "19": 0.2801,
+            "20": 0.2818,
+            "21": 0.284,
+            "22": 0.28379,
+            "23": 0.27982,
+            "24": 0.2809,
+            "25": 0.28033,
+            "26": 0.2874,
+            "27": 0.28134,
+            "28": 0.28215,
+            "29": 0.28078,
+            "30": 0.28261,
+            "31": 0.28205,
+            "32": 0.28244,
+            "33": 0.28032,
+            "34": 0.2817,
+            "35": 0.28205,
+            "36": 0.28735,
+            "37": 0.2784,
+            "38": 0.27979,
+            "39": 0.28067,
+            "40": 0.28107,
+            "41": 0.27649,
+            "42": 0.27759,
+            "43": 0.27572,
+            "44": 0.27583,
+            "45": 0.27792,
+            "46": 0.27869,
+            "47": 0.2795,
+            "48": 0.2786,
+            "49": 0.27878,
+            "50": 0.28026,
+            "51": 0.28359,
+            "52": 0.27724,
+            "53": 0.2767,
+            "54": 0.2768,
+            "55": 0.27579,
+            "56": 0.27548,
+            "57": 0.27664,
+            "58": 0.27959,
+            "59": 0.27651,
+            "60": 0.27706,
+            "61": 0.2749,
+            "62": 0.27575,
+            "63": 0.27689,
+            "64": 0.27661,
+            "65": 0.27463,
+            "66": 0.27502,
+            "67": 0.27556,
+            "68": 0.27753,
+            "69": 0.27586,
+            "70": 0.27562,
+            "71": 0.27486,
+            "72": 0.27586,
+            "73": 0.27532,
+            "74": 0.27545,
+            "75": 0.27539,
+            "76": 0.27606,
+            "77": 0.27649,
+            "78": 0.27585,
+            "79": 0.27645,
+            "80": 0.27617,
+            "81": 0.27569,
+            "82": 0.276,
+            "83": 0.27704,
+            "84": 0.27698,
+            "85": 0.27571,
+            "86": 0.27734,
+            "87": 0.27615,
+            "88": 0.2754,
+            "89": 0.27602,
+            "90": 0.27562,
+            "91": 0.27544,
+            "92": 0.27569,
+            "93": 0.27668,
+            "94": 0.27578,
+            "95": 0.27544,
+            "96": 0.27608,
+            "97": 0.27604,
+            "98": 0.2754,
+            "99": 0.2768,
+            "100": 0.27965
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..3f4651acab9
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85065,
+            "52": 9.7464,
+            "53": 10.07271,
+            "54": 9.95757,
+            "55": 9.87725,
+            "56": 9.62951,
+            "57": 9.48816,
+            "58": 9.83239,
+            "59": 9.58985,
+            "60": 9.50827,
+            "61": 9.6947,
+            "62": 9.99304,
+            "63": 9.37511,
+            "64": 9.77996,
+            "65": 8.95215,
+            "66": 9.71323,
+            "67": 9.37884,
+            "68": 9.78794,
+            "69": 9.79078,
+            "70": 9.7308,
+            "71": 9.61793,
+            "72": 9.59094,
+            "73": 9.49435,
+            "74": 8.94865,
+            "75": 9.43606,
+            "76": 9.09894,
+            "77": 10.06437,
+            "78": 9.73006,
+            "79": 9.37771,
+            "80": 9.41266,
+            "81": 9.4854,
+            "82": 9.69576,
+            "83": 9.32017,
+            "84": 9.42235,
+            "85": 9.61578,
+            "86": 9.07218,
+            "87": 9.59328,
+            "88": 9.7509,
+            "89": 9.61159,
+            "90": 9.82148,
+            "91": 9.35304,
+            "92": 9.36254,
+            "93": 9.08747,
+            "94": 8.83398,
+            "95": 9.51923,
+            "96": 9.52595,
+            "97": 9.31413,
+            "98": 9.67414,
+            "99": 8.88869,
+            "100": 9.40651
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2873.0,
+            "52": 2946.0,
+            "53": 3158.0,
+            "54": 2907.0,
+            "55": 2740.0,
+            "56": 3029.0,
+            "57": 2489.0,
+            "58": 3327.0,
+            "59": 3042.0,
+            "60": 2780.0,
+            "61": 3302.0,
+            "62": 2961.0,
+            "63": 2702.0,
+            "64": 3318.0,
+            "65": 2909.0,
+            "66": 3513.0,
+            "67": 2959.0,
+            "68": 2963.0,
+            "69": 3171.0,
+            "70": 3547.0,
+            "71": 3246.0,
+            "72": 2586.0,
+            "73": 3301.0,
+            "74": 2135.0,
+            "75": 2752.0,
+            "76": 3275.0,
+            "77": 3648.0,
+            "78": 3472.0,
+            "79": 3536.0,
+            "80": 3685.0,
+            "81": 4159.0,
+            "82": 3488.0,
+            "83": 3179.0,
+            "84": 3639.0,
+            "85": 3631.0,
+            "86": 3045.0,
+            "87": 4315.0,
+            "88": 3481.0,
+            "89": 3819.0,
+            "90": 3323.0,
+            "91": 3014.0,
+            "92": 3581.0,
+            "93": 2932.0,
+            "94": 3715.0,
+            "95": 3593.0,
+            "96": 3764.0,
+            "97": 3582.0,
+            "98": 3998.0,
+            "99": 3406.0,
+            "100": 3521.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 335835648.0,
+            "52": 335835648.0,
+            "53": 335835648.0,
+            "54": 335835648.0,
+            "55": 335835648.0,
+            "56": 335835648.0,
+            "57": 335835648.0,
+            "58": 335835648.0,
+            "59": 335835648.0,
+            "60": 335835648.0,
+            "61": 335835648.0,
+            "62": 335835648.0,
+            "63": 335835648.0,
+            "64": 335835648.0,
+            "65": 335835648.0,
+            "66": 335835648.0,
+            "67": 335835648.0,
+            "68": 335835648.0,
+            "69": 335835648.0,
+            "70": 335835648.0,
+            "71": 335835648.0,
+            "72": 335835648.0,
+            "73": 335835648.0,
+            "74": 335835648.0,
+            "75": 335835648.0,
+            "76": 335835648.0,
+            "77": 335835648.0,
+            "78": 335835648.0,
+            "79": 335835648.0,
+            "80": 335835648.0,
+            "81": 335835648.0,
+            "82": 335835648.0,
+            "83": 335835648.0,
+            "84": 335835648.0,
+            "85": 335835648.0,
+            "86": 335835648.0,
+            "87": 335835648.0,
+            "88": 335835648.0,
+            "89": 335835648.0,
+            "90": 335835648.0,
+            "91": 335835648.0,
+            "92": 335835648.0,
+            "93": 335835648.0,
+            "94": 335835648.0,
+            "95": 335835648.0,
+            "96": 335835648.0,
+            "97": 335835648.0,
+            "98": 335835648.0,
+            "99": 335835648.0,
+            "100": 335835648.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1102819840.0,
+            "52": 1102820864.0,
+            "53": 1102820864.0,
+            "54": 1102820864.0,
+            "55": 1102820864.0,
+            "56": 1102820864.0,
+            "57": 1102820864.0,
+            "58": 1102820864.0,
+            "59": 1102820864.0,
+            "60": 1102820864.0,
+            "61": 1102820864.0,
+            "62": 1102820864.0,
+            "63": 1102820864.0,
+            "64": 1102820864.0,
+            "65": 1102820864.0,
+            "66": 1102820864.0,
+            "67": 1102820864.0,
+            "68": 1102820864.0,
+            "69": 1102820864.0,
+            "70": 1102820864.0,
+            "71": 1102820864.0,
+            "72": 1102820864.0,
+            "73": 1102820864.0,
+            "74": 1102820864.0,
+            "75": 1102820864.0,
+            "76": 1102820864.0,
+            "77": 1102820864.0,
+            "78": 1102820864.0,
+            "79": 1102820864.0,
+            "80": 1102820864.0,
+            "81": 1102820864.0,
+            "82": 1102820864.0,
+            "83": 1102820864.0,
+            "84": 1102820864.0,
+            "85": 1102820864.0,
+            "86": 1102820864.0,
+            "87": 1102820864.0,
+            "88": 1102820864.0,
+            "89": 1102820864.0,
+            "90": 1102820864.0,
+            "91": 1102820864.0,
+            "92": 1102820864.0,
+            "93": 1102820864.0,
+            "94": 1102820864.0,
+            "95": 1102820864.0,
+            "96": 1102820864.0,
+            "97": 1102820864.0,
+            "98": 1102820864.0,
+            "99": 1102820864.0,
+            "100": 1102820864.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.5579,
+            "52": 0.32293,
+            "53": 0.28783,
+            "54": 0.28913,
+            "55": 0.28732,
+            "56": 0.28223,
+            "57": 0.28119,
+            "58": 0.27795,
+            "59": 0.27722,
+            "60": 0.2792,
+            "61": 0.27899,
+            "62": 0.27773,
+            "63": 0.27717,
+            "64": 0.27611,
+            "65": 0.275,
+            "66": 0.27585,
+            "67": 0.27453,
+            "68": 0.27615,
+            "69": 0.27494,
+            "70": 0.27615,
+            "71": 0.27345,
+            "72": 0.27521,
+            "73": 0.27345,
+            "74": 0.27408,
+            "75": 0.27342,
+            "76": 0.27402,
+            "77": 0.27422,
+            "78": 0.27428,
+            "79": 0.27445,
+            "80": 0.27343,
+            "81": 0.27423,
+            "82": 0.27491,
+            "83": 0.27456,
+            "84": 0.27288,
+            "85": 0.27478,
+            "86": 0.27469,
+            "87": 0.27542,
+            "88": 0.27502,
+            "89": 0.27521,
+            "90": 0.27591,
+            "91": 0.27499,
+            "92": 0.27376,
+            "93": 0.27416,
+            "94": 0.27576,
+            "95": 0.27431,
+            "96": 0.27449,
+            "97": 0.27428,
+            "98": 0.27432,
+            "99": 0.2742,
+            "100": 0.27503
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..e73d1df6682
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83993,
+            "2": 10.85182,
+            "3": 10.84166,
+            "4": 10.84441,
+            "5": 10.85514,
+            "6": 10.86428,
+            "7": 10.85243,
+            "8": 10.84464,
+            "9": 10.84864,
+            "10": 10.81333,
+            "11": 10.85638,
+            "12": 10.84233,
+            "13": 10.86046,
+            "14": 10.84976,
+            "15": 10.81618,
+            "16": 10.80886,
+            "17": 10.78242,
+            "18": 10.79155,
+            "19": 10.79495,
+            "20": 10.7055,
+            "21": 10.6978,
+            "22": 10.58349,
+            "23": 10.69268,
+            "24": 10.60558,
+            "25": 10.56742,
+            "26": 10.61456,
+            "27": 10.6067,
+            "28": 10.55905,
+            "29": 10.56526,
+            "30": 10.37918,
+            "31": 10.16276,
+            "32": 10.45543,
+            "33": 10.45037,
+            "34": 10.23993,
+            "35": 10.27354,
+            "36": 10.24224,
+            "37": 10.34559,
+            "38": 10.21738,
+            "39": 10.39453,
+            "40": 10.095,
+            "41": 10.15093,
+            "42": 10.21235,
+            "43": 9.87982,
+            "44": 9.97875,
+            "45": 9.85588,
+            "46": 9.83349,
+            "47": 10.14101,
+            "48": 9.86418,
+            "49": 9.55509,
+            "50": 9.91636,
+            "51": 9.86104,
+            "52": 9.75109,
+            "53": 10.06631,
+            "54": 9.95634,
+            "55": 9.89354,
+            "56": 9.637,
+            "57": 9.49142,
+            "58": 9.8341,
+            "59": 9.5931,
+            "60": 9.51379,
+            "61": 9.69183,
+            "62": 9.99162,
+            "63": 9.39196,
+            "64": 9.77455,
+            "65": 8.96319,
+            "66": 9.70663,
+            "67": 9.3789,
+            "68": 9.78328,
+            "69": 9.79736,
+            "70": 9.73753,
+            "71": 9.62711,
+            "72": 9.58907,
+            "73": 9.50446,
+            "74": 8.94975,
+            "75": 9.4278,
+            "76": 9.08764,
+            "77": 10.06759,
+            "78": 9.72141,
+            "79": 9.3861,
+            "80": 9.40495,
+            "81": 9.48596,
+            "82": 9.70195,
+            "83": 9.31553,
+            "84": 9.41806,
+            "85": 9.61378,
+            "86": 9.08145,
+            "87": 9.59631,
+            "88": 9.75008,
+            "89": 9.60386,
+            "90": 9.82838,
+            "91": 9.33622,
+            "92": 9.35764,
+            "93": 9.08795,
+            "94": 8.83437,
+            "95": 9.53352,
+            "96": 9.53315,
+            "97": 9.31129,
+            "98": 9.67176,
+            "99": 8.89816,
+            "100": 9.40969
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1621.0,
+            "2": 1756.0,
+            "3": 1698.0,
+            "4": 1764.0,
+            "5": 2045.0,
+            "6": 1927.0,
+            "7": 1901.0,
+            "8": 1768.0,
+            "9": 1823.0,
+            "10": 1456.0,
+            "11": 1884.0,
+            "12": 1834.0,
+            "13": 2003.0,
+            "14": 1786.0,
+            "15": 1879.0,
+            "16": 1948.0,
+            "17": 1849.0,
+            "18": 1718.0,
+            "19": 1870.0,
+            "20": 1750.0,
+            "21": 1977.0,
+            "22": 1741.0,
+            "23": 1946.0,
+            "24": 1642.0,
+            "25": 1636.0,
+            "26": 1817.0,
+            "27": 1926.0,
+            "28": 1981.0,
+            "29": 1993.0,
+            "30": 1929.0,
+            "31": 1630.0,
+            "32": 1896.0,
+            "33": 2115.0,
+            "34": 1824.0,
+            "35": 1960.0,
+            "36": 1935.0,
+            "37": 2410.0,
+            "38": 2259.0,
+            "39": 2428.0,
+            "40": 2119.0,
+            "41": 2278.0,
+            "42": 2118.0,
+            "43": 1992.0,
+            "44": 2041.0,
+            "45": 1992.0,
+            "46": 2158.0,
+            "47": 2416.0,
+            "48": 2338.0,
+            "49": 2315.0,
+            "50": 2242.0,
+            "51": 2431.0,
+            "52": 2467.0,
+            "53": 2794.0,
+            "54": 2675.0,
+            "55": 2313.0,
+            "56": 2597.0,
+            "57": 2278.0,
+            "58": 2887.0,
+            "59": 2701.0,
+            "60": 2190.0,
+            "61": 2764.0,
+            "62": 2576.0,
+            "63": 2405.0,
+            "64": 2903.0,
+            "65": 2516.0,
+            "66": 2885.0,
+            "67": 2700.0,
+            "68": 2682.0,
+            "69": 2987.0,
+            "70": 3141.0,
+            "71": 3055.0,
+            "72": 2413.0,
+            "73": 2864.0,
+            "74": 1870.0,
+            "75": 2450.0,
+            "76": 3032.0,
+            "77": 3230.0,
+            "78": 3125.0,
+            "79": 2982.0,
+            "80": 3203.0,
+            "81": 3657.0,
+            "82": 3174.0,
+            "83": 2818.0,
+            "84": 3190.0,
+            "85": 3166.0,
+            "86": 2793.0,
+            "87": 3635.0,
+            "88": 3005.0,
+            "89": 3373.0,
+            "90": 3066.0,
+            "91": 2857.0,
+            "92": 3080.0,
+            "93": 2533.0,
+            "94": 3303.0,
+            "95": 3270.0,
+            "96": 3416.0,
+            "97": 3085.0,
+            "98": 3437.0,
+            "99": 3243.0,
+            "100": 3119.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0,
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 999540224.0,
+            "2": 1065140736.0,
+            "3": 1065140736.0,
+            "4": 1065140736.0,
+            "5": 1065140736.0,
+            "6": 1065140736.0,
+            "7": 1065140736.0,
+            "8": 1065140736.0,
+            "9": 1065140736.0,
+            "10": 1065140736.0,
+            "11": 1065140736.0,
+            "12": 1065140736.0,
+            "13": 1065140736.0,
+            "14": 1065140736.0,
+            "15": 1065140736.0,
+            "16": 1065140736.0,
+            "17": 1065140736.0,
+            "18": 1065140736.0,
+            "19": 1065140736.0,
+            "20": 1065140736.0,
+            "21": 1065140736.0,
+            "22": 1065140736.0,
+            "23": 1065140736.0,
+            "24": 1065140736.0,
+            "25": 1065140736.0,
+            "26": 1065140736.0,
+            "27": 1065140736.0,
+            "28": 1065140736.0,
+            "29": 1065140736.0,
+            "30": 1065140736.0,
+            "31": 1065140736.0,
+            "32": 1065140736.0,
+            "33": 1065140736.0,
+            "34": 1065140736.0,
+            "35": 1065140736.0,
+            "36": 1065140736.0,
+            "37": 1065140736.0,
+            "38": 1065140736.0,
+            "39": 1065140736.0,
+            "40": 1065140736.0,
+            "41": 1065140736.0,
+            "42": 1065140736.0,
+            "43": 1065140736.0,
+            "44": 1065140736.0,
+            "45": 1065140736.0,
+            "46": 1065140736.0,
+            "47": 1065140736.0,
+            "48": 1065140736.0,
+            "49": 1065140736.0,
+            "50": 1065140736.0,
+            "51": 1065140736.0,
+            "52": 1065140736.0,
+            "53": 1065140736.0,
+            "54": 1065140736.0,
+            "55": 1065140736.0,
+            "56": 1065140736.0,
+            "57": 1065140736.0,
+            "58": 1065140736.0,
+            "59": 1065140736.0,
+            "60": 1065140736.0,
+            "61": 1065140736.0,
+            "62": 1065140736.0,
+            "63": 1065140736.0,
+            "64": 1065140736.0,
+            "65": 1065140736.0,
+            "66": 1065140736.0,
+            "67": 1065140736.0,
+            "68": 1065140736.0,
+            "69": 1065140736.0,
+            "70": 1065140736.0,
+            "71": 1065140736.0,
+            "72": 1065140736.0,
+            "73": 1065140736.0,
+            "74": 1065140736.0,
+            "75": 1065140736.0,
+            "76": 1065140736.0,
+            "77": 1065140736.0,
+            "78": 1065140736.0,
+            "79": 1065140736.0,
+            "80": 1065140736.0,
+            "81": 1065140736.0,
+            "82": 1065140736.0,
+            "83": 1065140736.0,
+            "84": 1065140736.0,
+            "85": 1065140736.0,
+            "86": 1065140736.0,
+            "87": 1065140736.0,
+            "88": 1065140736.0,
+            "89": 1065140736.0,
+            "90": 1065140736.0,
+            "91": 1065140736.0,
+            "92": 1065140736.0,
+            "93": 1065140736.0,
+            "94": 1065140736.0,
+            "95": 1065140736.0,
+            "96": 1065140736.0,
+            "97": 1065140736.0,
+            "98": 1065140736.0,
+            "99": 1065140736.0,
+            "100": 1065140736.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.50092,
+            "3": 0.30393,
+            "4": 0.296,
+            "5": 0.29464,
+            "6": 0.29386,
+            "7": 0.29621,
+            "8": 0.2946,
+            "9": 0.29682,
+            "10": 0.29745,
+            "11": 0.3056,
+            "12": 0.30475,
+            "13": 0.30581,
+            "14": 0.3052,
+            "15": 0.31033,
+            "16": 0.30534,
+            "17": 0.30586,
+            "18": 0.3053,
+            "19": 0.30668,
+            "20": 0.3062,
+            "21": 0.31086,
+            "22": 0.30673,
+            "23": 0.30645,
+            "24": 0.30648,
+            "25": 0.30922,
+            "26": 0.30442,
+            "27": 0.30196,
+            "28": 0.3042,
+            "29": 0.30389,
+            "30": 0.30468,
+            "31": 0.30661,
+            "32": 0.30468,
+            "33": 0.30645,
+            "34": 0.30588,
+            "35": 0.3037,
+            "36": 0.30433,
+            "37": 0.30504,
+            "38": 0.30676,
+            "39": 0.30639,
+            "40": 0.30854,
+            "41": 0.31017,
+            "42": 0.30559,
+            "43": 0.30359,
+            "44": 0.30728,
+            "45": 0.30737,
+            "46": 0.30728,
+            "47": 0.30866,
+            "48": 0.30981,
+            "49": 0.3097,
+            "50": 0.30633,
+            "51": 0.31798,
+            "52": 0.30466,
+            "53": 0.30302,
+            "54": 0.30516,
+            "55": 0.30263,
+            "56": 0.30315,
+            "57": 0.30305,
+            "58": 0.30451,
+            "59": 0.30443,
+            "60": 0.30525,
+            "61": 0.30503,
+            "62": 0.3063,
+            "63": 0.30517,
+            "64": 0.30552,
+            "65": 0.30685,
+            "66": 0.30584,
+            "67": 0.31593,
+            "68": 0.34589,
+            "69": 0.30682,
+            "70": 0.30582,
+            "71": 0.30682,
+            "72": 0.30578,
+            "73": 0.30496,
+            "74": 0.30689,
+            "75": 0.30927,
+            "76": 0.31024,
+            "77": 0.3125,
+            "78": 0.31093,
+            "79": 0.31106,
+            "80": 0.30717,
+            "81": 0.30815,
+            "82": 0.30914,
+            "83": 0.30911,
+            "84": 0.30335,
+            "85": 0.29792,
+            "86": 0.2997,
+            "87": 0.3032,
+            "88": 0.30139,
+            "89": 0.30675,
+            "90": 0.30412,
+            "91": 0.30454,
+            "92": 0.30497,
+            "93": 0.30233,
+            "94": 0.30714,
+            "95": 0.30673,
+            "96": 0.30193,
+            "97": 0.30472,
+            "98": 0.3103,
+            "99": 0.30957,
+            "100": 0.30828
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..235ce034813
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86104,
+            "52": 9.75109,
+            "53": 10.06631,
+            "54": 9.95634,
+            "55": 9.89354,
+            "56": 9.637,
+            "57": 9.49142,
+            "58": 9.8341,
+            "59": 9.5931,
+            "60": 9.51379,
+            "61": 9.69183,
+            "62": 9.99162,
+            "63": 9.39196,
+            "64": 9.77455,
+            "65": 8.96319,
+            "66": 9.70663,
+            "67": 9.3789,
+            "68": 9.78328,
+            "69": 9.79736,
+            "70": 9.73753,
+            "71": 9.62711,
+            "72": 9.58907,
+            "73": 9.50446,
+            "74": 8.94975,
+            "75": 9.4278,
+            "76": 9.08764,
+            "77": 10.06759,
+            "78": 9.72141,
+            "79": 9.3861,
+            "80": 9.40495,
+            "81": 9.48596,
+            "82": 9.70195,
+            "83": 9.31553,
+            "84": 9.41806,
+            "85": 9.61378,
+            "86": 9.08145,
+            "87": 9.59631,
+            "88": 9.75008,
+            "89": 9.60386,
+            "90": 9.82838,
+            "91": 9.33622,
+            "92": 9.35764,
+            "93": 9.08795,
+            "94": 8.83437,
+            "95": 9.53352,
+            "96": 9.53315,
+            "97": 9.31129,
+            "98": 9.67176,
+            "99": 8.89816,
+            "100": 9.40969
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2431.0,
+            "52": 2467.0,
+            "53": 2794.0,
+            "54": 2675.0,
+            "55": 2313.0,
+            "56": 2597.0,
+            "57": 2278.0,
+            "58": 2887.0,
+            "59": 2701.0,
+            "60": 2190.0,
+            "61": 2764.0,
+            "62": 2576.0,
+            "63": 2405.0,
+            "64": 2903.0,
+            "65": 2516.0,
+            "66": 2885.0,
+            "67": 2700.0,
+            "68": 2682.0,
+            "69": 2987.0,
+            "70": 3141.0,
+            "71": 3055.0,
+            "72": 2413.0,
+            "73": 2864.0,
+            "74": 1870.0,
+            "75": 2450.0,
+            "76": 3032.0,
+            "77": 3230.0,
+            "78": 3125.0,
+            "79": 2982.0,
+            "80": 3203.0,
+            "81": 3657.0,
+            "82": 3174.0,
+            "83": 2818.0,
+            "84": 3190.0,
+            "85": 3166.0,
+            "86": 2793.0,
+            "87": 3635.0,
+            "88": 3005.0,
+            "89": 3373.0,
+            "90": 3066.0,
+            "91": 2857.0,
+            "92": 3080.0,
+            "93": 2533.0,
+            "94": 3303.0,
+            "95": 3270.0,
+            "96": 3416.0,
+            "97": 3085.0,
+            "98": 3437.0,
+            "99": 3243.0,
+            "100": 3119.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1064091136.0,
+            "52": 1064092160.0,
+            "53": 1064092160.0,
+            "54": 1064092160.0,
+            "55": 1064092160.0,
+            "56": 1064092160.0,
+            "57": 1064092160.0,
+            "58": 1064092160.0,
+            "59": 1064092160.0,
+            "60": 1064092160.0,
+            "61": 1064092160.0,
+            "62": 1064092160.0,
+            "63": 1064092160.0,
+            "64": 1064092160.0,
+            "65": 1064092160.0,
+            "66": 1064092160.0,
+            "67": 1064092160.0,
+            "68": 1064092160.0,
+            "69": 1064092160.0,
+            "70": 1064092160.0,
+            "71": 1064092160.0,
+            "72": 1064092160.0,
+            "73": 1064092160.0,
+            "74": 1064092160.0,
+            "75": 1064092160.0,
+            "76": 1064092160.0,
+            "77": 1064092160.0,
+            "78": 1064092160.0,
+            "79": 1064092160.0,
+            "80": 1064092160.0,
+            "81": 1064092160.0,
+            "82": 1064092160.0,
+            "83": 1064092160.0,
+            "84": 1064092160.0,
+            "85": 1064092160.0,
+            "86": 1064092160.0,
+            "87": 1064092160.0,
+            "88": 1064092160.0,
+            "89": 1064092160.0,
+            "90": 1064092160.0,
+            "91": 1064092160.0,
+            "92": 1064092160.0,
+            "93": 1064092160.0,
+            "94": 1064092160.0,
+            "95": 1064092160.0,
+            "96": 1064092160.0,
+            "97": 1064092160.0,
+            "98": 1064092160.0,
+            "99": 1064092160.0,
+            "100": 1064092160.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.14914,
+            "53": 0.55064,
+            "54": 0.29762,
+            "55": 0.30112,
+            "56": 0.30035,
+            "57": 0.29809,
+            "58": 0.30057,
+            "59": 0.29654,
+            "60": 0.29579,
+            "61": 0.2954,
+            "62": 0.29808,
+            "63": 0.2968,
+            "64": 0.29639,
+            "65": 0.29803,
+            "66": 0.29816,
+            "67": 0.2978,
+            "68": 0.29666,
+            "69": 0.29937,
+            "70": 0.29987,
+            "71": 0.29726,
+            "72": 0.29888,
+            "73": 0.29841,
+            "74": 0.29818,
+            "75": 0.29888,
+            "76": 0.30018,
+            "77": 0.29543,
+            "78": 0.29515,
+            "79": 0.29942,
+            "80": 0.30103,
+            "81": 0.30071,
+            "82": 0.30152,
+            "83": 0.30277,
+            "84": 0.30368,
+            "85": 0.30349,
+            "86": 0.30411,
+            "87": 0.30141,
+            "88": 0.30339,
+            "89": 0.3072,
+            "90": 0.30468,
+            "91": 0.30297,
+            "92": 0.30317,
+            "93": 0.30255,
+            "94": 0.29992,
+            "95": 0.30116,
+            "96": 0.29306,
+            "97": 0.29403,
+            "98": 0.29399,
+            "99": 0.29473,
+            "100": 0.2958
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
index 6937fb9bd55..2d2d349a867 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 299203072.0,
-            "2": 299203072.0,
-            "3": 299203072.0,
-            "4": 299203072.0,
-            "5": 299203072.0,
-            "6": 299203072.0,
-            "7": 299203072.0,
-            "8": 299203072.0,
-            "9": 299203072.0,
-            "10": 299203072.0,
-            "11": 299203072.0,
-            "12": 299203072.0,
-            "13": 299203072.0,
-            "14": 299203072.0,
-            "15": 299203072.0,
-            "16": 299203072.0,
-            "17": 299203072.0,
-            "18": 299203072.0,
-            "19": 299203072.0,
-            "20": 299203072.0,
-            "21": 299203072.0,
-            "22": 299203072.0,
-            "23": 299203072.0,
-            "24": 299203072.0,
-            "25": 299203072.0,
-            "26": 299203072.0,
-            "27": 299203072.0,
-            "28": 299203072.0,
-            "29": 299203072.0,
-            "30": 299203072.0,
-            "31": 299203072.0,
-            "32": 299203072.0,
-            "33": 299203072.0,
-            "34": 299203072.0,
-            "35": 299203072.0,
-            "36": 299203072.0,
-            "37": 299203072.0,
-            "38": 299203072.0,
-            "39": 299203072.0,
-            "40": 299203072.0,
-            "41": 299203072.0,
-            "42": 299203072.0,
-            "43": 299203072.0,
-            "44": 299203072.0,
-            "45": 299203072.0,
-            "46": 299203072.0,
-            "47": 299203072.0,
-            "48": 299203072.0,
-            "49": 299203072.0,
-            "50": 299203072.0,
-            "51": 299203072.0,
-            "52": 299203072.0,
-            "53": 299203072.0,
-            "54": 299203072.0,
-            "55": 299203072.0,
-            "56": 299203072.0,
-            "57": 299203072.0,
-            "58": 299203072.0,
-            "59": 299203072.0,
-            "60": 299203072.0,
-            "61": 299203072.0,
-            "62": 299203072.0,
-            "63": 299203072.0,
-            "64": 299203072.0,
-            "65": 299203072.0,
-            "66": 299203072.0,
-            "67": 299203072.0,
-            "68": 299203072.0,
-            "69": 299203072.0,
-            "70": 299203072.0,
-            "71": 299203072.0,
-            "72": 299203072.0,
-            "73": 299203072.0,
-            "74": 299203072.0,
-            "75": 299203072.0,
-            "76": 299203072.0,
-            "77": 299203072.0,
-            "78": 299203072.0,
-            "79": 299203072.0,
-            "80": 299203072.0,
-            "81": 299203072.0,
-            "82": 299203072.0,
-            "83": 299203072.0,
-            "84": 299203072.0,
-            "85": 299203072.0,
-            "86": 299203072.0,
-            "87": 299203072.0,
-            "88": 299203072.0,
-            "89": 299203072.0,
-            "90": 299203072.0,
-            "91": 299203072.0,
-            "92": 299203072.0,
-            "93": 299203072.0,
-            "94": 299203072.0,
-            "95": 299203072.0,
-            "96": 299203072.0,
-            "97": 299203072.0,
-            "98": 299203072.0,
-            "99": 299203072.0,
-            "100": 299203072.0
+            "1": 299204096.0,
+            "2": 299204096.0,
+            "3": 299204096.0,
+            "4": 299204096.0,
+            "5": 299204096.0,
+            "6": 299204096.0,
+            "7": 299204096.0,
+            "8": 299204096.0,
+            "9": 299204096.0,
+            "10": 299204096.0,
+            "11": 299204096.0,
+            "12": 299204096.0,
+            "13": 299204096.0,
+            "14": 299204096.0,
+            "15": 299204096.0,
+            "16": 299204096.0,
+            "17": 299204096.0,
+            "18": 299204096.0,
+            "19": 299204096.0,
+            "20": 299204096.0,
+            "21": 299204096.0,
+            "22": 299204096.0,
+            "23": 299204096.0,
+            "24": 299204096.0,
+            "25": 299204096.0,
+            "26": 299204096.0,
+            "27": 299204096.0,
+            "28": 299204096.0,
+            "29": 299204096.0,
+            "30": 299204096.0,
+            "31": 299204096.0,
+            "32": 299204096.0,
+            "33": 299204096.0,
+            "34": 299204096.0,
+            "35": 299204096.0,
+            "36": 299204096.0,
+            "37": 299204096.0,
+            "38": 299204096.0,
+            "39": 299204096.0,
+            "40": 299204096.0,
+            "41": 299204096.0,
+            "42": 299204096.0,
+            "43": 299204096.0,
+            "44": 299204096.0,
+            "45": 299204096.0,
+            "46": 299204096.0,
+            "47": 299204096.0,
+            "48": 299204096.0,
+            "49": 299204096.0,
+            "50": 299204096.0,
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 977125888.0,
-            "2": 1042071040.0,
-            "3": 1042071040.0,
-            "4": 1042071040.0,
-            "5": 1042071040.0,
-            "6": 1042071040.0,
-            "7": 1042071040.0,
-            "8": 1042071040.0,
-            "9": 1042071040.0,
-            "10": 1042071040.0,
-            "11": 1042071040.0,
-            "12": 1042071040.0,
-            "13": 1042071040.0,
-            "14": 1042071040.0,
-            "15": 1042071040.0,
-            "16": 1042071040.0,
-            "17": 1042071040.0,
-            "18": 1042071040.0,
-            "19": 1042071040.0,
-            "20": 1042071040.0,
-            "21": 1042071040.0,
-            "22": 1042071040.0,
-            "23": 1042071040.0,
-            "24": 1042071040.0,
-            "25": 1042071040.0,
-            "26": 1042071040.0,
-            "27": 1042071040.0,
-            "28": 1042071040.0,
-            "29": 1042071040.0,
-            "30": 1042071040.0,
-            "31": 1042071040.0,
-            "32": 1042071040.0,
-            "33": 1042071040.0,
-            "34": 1042071040.0,
-            "35": 1042071040.0,
-            "36": 1042071040.0,
-            "37": 1042071040.0,
-            "38": 1042071040.0,
-            "39": 1042071040.0,
-            "40": 1042071040.0,
-            "41": 1042071040.0,
-            "42": 1042071040.0,
-            "43": 1042071040.0,
-            "44": 1042071040.0,
-            "45": 1042071040.0,
-            "46": 1042071040.0,
-            "47": 1042071040.0,
-            "48": 1042071040.0,
-            "49": 1042071040.0,
-            "50": 1042071040.0,
-            "51": 1042071040.0,
-            "52": 1042071040.0,
-            "53": 1042071040.0,
-            "54": 1042071040.0,
-            "55": 1042071040.0,
-            "56": 1042071040.0,
-            "57": 1042071040.0,
-            "58": 1042071040.0,
-            "59": 1042071040.0,
-            "60": 1042071040.0,
-            "61": 1042071040.0,
-            "62": 1042071040.0,
-            "63": 1042071040.0,
-            "64": 1042071040.0,
-            "65": 1042071040.0,
-            "66": 1042071040.0,
-            "67": 1042071040.0,
-            "68": 1042071040.0,
-            "69": 1042071040.0,
-            "70": 1042071040.0,
-            "71": 1042071040.0,
-            "72": 1042071040.0,
-            "73": 1042071040.0,
-            "74": 1042071040.0,
-            "75": 1042071040.0,
-            "76": 1042071040.0,
-            "77": 1042071040.0,
-            "78": 1042071040.0,
-            "79": 1042071040.0,
-            "80": 1042071040.0,
-            "81": 1042071040.0,
-            "82": 1042071040.0,
-            "83": 1042071040.0,
-            "84": 1042071040.0,
-            "85": 1042071040.0,
-            "86": 1042071040.0,
-            "87": 1042071040.0,
-            "88": 1042071040.0,
-            "89": 1042071040.0,
-            "90": 1042071040.0,
-            "91": 1042071040.0,
-            "92": 1042071040.0,
-            "93": 1042071040.0,
-            "94": 1042071040.0,
-            "95": 1042071040.0,
-            "96": 1042071040.0,
-            "97": 1042071040.0,
-            "98": 1042071040.0,
-            "99": 1042071040.0,
-            "100": 1042071040.0
+            "1": 977520128.0,
+            "2": 1042465280.0,
+            "3": 1042465280.0,
+            "4": 1042465280.0,
+            "5": 1042465280.0,
+            "6": 1042465280.0,
+            "7": 1042465280.0,
+            "8": 1042465280.0,
+            "9": 1042465280.0,
+            "10": 1042465280.0,
+            "11": 1042465280.0,
+            "12": 1042465280.0,
+            "13": 1042465280.0,
+            "14": 1042465280.0,
+            "15": 1042465280.0,
+            "16": 1042465280.0,
+            "17": 1042465280.0,
+            "18": 1042465280.0,
+            "19": 1042465280.0,
+            "20": 1042465280.0,
+            "21": 1042465280.0,
+            "22": 1042465280.0,
+            "23": 1042465280.0,
+            "24": 1042465280.0,
+            "25": 1042465280.0,
+            "26": 1042465280.0,
+            "27": 1042465280.0,
+            "28": 1042465280.0,
+            "29": 1042465280.0,
+            "30": 1042465280.0,
+            "31": 1042465280.0,
+            "32": 1042465280.0,
+            "33": 1042465280.0,
+            "34": 1042465280.0,
+            "35": 1042465280.0,
+            "36": 1042465280.0,
+            "37": 1042465280.0,
+            "38": 1042465280.0,
+            "39": 1042465280.0,
+            "40": 1042465280.0,
+            "41": 1042465280.0,
+            "42": 1042465280.0,
+            "43": 1042465280.0,
+            "44": 1042465280.0,
+            "45": 1042465280.0,
+            "46": 1042465280.0,
+            "47": 1042465280.0,
+            "48": 1042465280.0,
+            "49": 1042465280.0,
+            "50": 1042465280.0,
+            "51": 1042465280.0,
+            "52": 1042465280.0,
+            "53": 1042465280.0,
+            "54": 1042465280.0,
+            "55": 1042465280.0,
+            "56": 1042465280.0,
+            "57": 1042465280.0,
+            "58": 1042465280.0,
+            "59": 1042465280.0,
+            "60": 1042465280.0,
+            "61": 1042465280.0,
+            "62": 1042465280.0,
+            "63": 1042465280.0,
+            "64": 1042465280.0,
+            "65": 1042465280.0,
+            "66": 1042465280.0,
+            "67": 1042465280.0,
+            "68": 1042465280.0,
+            "69": 1042465280.0,
+            "70": 1042465280.0,
+            "71": 1042465280.0,
+            "72": 1042465280.0,
+            "73": 1042465280.0,
+            "74": 1042465280.0,
+            "75": 1042465280.0,
+            "76": 1042465280.0,
+            "77": 1042465280.0,
+            "78": 1042465280.0,
+            "79": 1042465280.0,
+            "80": 1042465280.0,
+            "81": 1042465280.0,
+            "82": 1042465280.0,
+            "83": 1042465280.0,
+            "84": 1042465280.0,
+            "85": 1042465280.0,
+            "86": 1042465280.0,
+            "87": 1042465280.0,
+            "88": 1042465280.0,
+            "89": 1042465280.0,
+            "90": 1042465280.0,
+            "91": 1042465280.0,
+            "92": 1042465280.0,
+            "93": 1042465280.0,
+            "94": 1042465280.0,
+            "95": 1042465280.0,
+            "96": 1042465280.0,
+            "97": 1042465280.0,
+            "98": 1042465280.0,
+            "99": 1042465280.0,
+            "100": 1042465280.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.71841,
-            "2": 0.23136,
-            "3": 0.22493,
-            "4": 0.22779,
-            "5": 0.22663,
-            "6": 0.22036,
-            "7": 0.23806,
-            "8": 0.23483,
-            "9": 0.21894,
-            "10": 0.22798,
-            "11": 0.22166,
-            "12": 0.22477,
-            "13": 0.21586,
-            "14": 0.2289,
-            "15": 0.21846,
-            "16": 0.22439,
-            "17": 0.22351,
-            "18": 0.21894,
-            "19": 0.22165,
-            "20": 0.23,
-            "21": 0.21688,
-            "22": 0.21901,
-            "23": 0.21714,
-            "24": 0.2185,
-            "25": 0.21681,
-            "26": 0.21775,
-            "27": 0.21816,
-            "28": 0.21837,
-            "29": 0.21776,
-            "30": 0.21739,
-            "31": 0.21725,
-            "32": 0.21929,
-            "33": 0.2156,
-            "34": 0.21959,
-            "35": 0.21865,
-            "36": 0.21696,
-            "37": 0.21952,
-            "38": 0.21797,
-            "39": 0.21568,
-            "40": 0.21803,
-            "41": 0.21756,
-            "42": 0.21877,
-            "43": 0.21676,
-            "44": 0.21677,
-            "45": 0.21721,
-            "46": 0.22075,
-            "47": 0.21856,
-            "48": 0.21933,
-            "49": 0.21808,
-            "50": 0.21813,
-            "51": 0.22296,
-            "52": 0.22336,
-            "53": 0.21692,
-            "54": 0.21796,
-            "55": 0.21788,
-            "56": 0.22002,
-            "57": 0.21845,
-            "58": 0.21989,
-            "59": 0.21686,
-            "60": 0.22032,
-            "61": 0.22127,
-            "62": 0.21716,
-            "63": 0.21811,
-            "64": 0.21821,
-            "65": 0.22368,
-            "66": 0.22001,
-            "67": 0.21796,
-            "68": 0.21889,
-            "69": 0.22034,
-            "70": 0.2227,
-            "71": 0.2211,
-            "72": 0.2167,
-            "73": 0.21687,
-            "74": 0.22416,
-            "75": 0.22056,
-            "76": 0.22116,
-            "77": 0.21759,
-            "78": 0.21843,
-            "79": 0.22272,
-            "80": 0.21922,
-            "81": 0.2196,
-            "82": 0.22739,
-            "83": 0.22344,
-            "84": 0.21981,
-            "85": 0.22041,
-            "86": 0.22015,
-            "87": 0.21885,
-            "88": 0.2239,
-            "89": 0.22975,
-            "90": 0.23365,
-            "91": 0.22476,
-            "92": 0.22336,
-            "93": 0.21913,
-            "94": 0.22057,
-            "95": 0.21711,
-            "96": 0.21724,
-            "97": 0.22153,
-            "98": 0.21996,
-            "99": 0.21866,
-            "100": 0.21935
+            "1": 9.84544,
+            "2": 0.22725,
+            "3": 0.20768,
+            "4": 0.18628,
+            "5": 0.18333,
+            "6": 0.18666,
+            "7": 0.18629,
+            "8": 0.18455,
+            "9": 0.18539,
+            "10": 0.18537,
+            "11": 0.18771,
+            "12": 0.18396,
+            "13": 0.18789,
+            "14": 0.18938,
+            "15": 0.18649,
+            "16": 0.18634,
+            "17": 0.18623,
+            "18": 0.18688,
+            "19": 0.18602,
+            "20": 0.18599,
+            "21": 0.18725,
+            "22": 0.19085,
+            "23": 0.18959,
+            "24": 0.19257,
+            "25": 0.18881,
+            "26": 0.18884,
+            "27": 0.18993,
+            "28": 0.1897,
+            "29": 0.19097,
+            "30": 0.1895,
+            "31": 0.19115,
+            "32": 0.18792,
+            "33": 0.19346,
+            "34": 0.19005,
+            "35": 0.18315,
+            "36": 0.18197,
+            "37": 0.18748,
+            "38": 0.18402,
+            "39": 0.18451,
+            "40": 0.1843,
+            "41": 0.18427,
+            "42": 0.18674,
+            "43": 0.18376,
+            "44": 0.18419,
+            "45": 0.55191,
+            "46": 0.18443,
+            "47": 0.18303,
+            "48": 0.18819,
+            "49": 0.19592,
+            "50": 0.1913,
+            "51": 0.19759,
+            "52": 0.19085,
+            "53": 0.19262,
+            "54": 0.19058,
+            "55": 0.18897,
+            "56": 0.1883,
+            "57": 0.18757,
+            "58": 0.18848,
+            "59": 0.19004,
+            "60": 0.18932,
+            "61": 0.1889,
+            "62": 0.18729,
+            "63": 0.18757,
+            "64": 0.18917,
+            "65": 0.18796,
+            "66": 0.1903,
+            "67": 0.18985,
+            "68": 0.18947,
+            "69": 0.19134,
+            "70": 0.19142,
+            "71": 0.18328,
+            "72": 0.18321,
+            "73": 0.18529,
+            "74": 0.18166,
+            "75": 0.18265,
+            "76": 0.18168,
+            "77": 0.18263,
+            "78": 0.18274,
+            "79": 0.18238,
+            "80": 0.18213,
+            "81": 0.18186,
+            "82": 0.1829,
+            "83": 0.18266,
+            "84": 0.18204,
+            "85": 0.18191,
+            "86": 0.18213,
+            "87": 0.1812,
+            "88": 0.18092,
+            "89": 0.18123,
+            "90": 0.22177,
+            "91": 0.18593,
+            "92": 0.18075,
+            "93": 0.18389,
+            "94": 0.18596,
+            "95": 0.18215,
+            "96": 0.18128,
+            "97": 0.18129,
+            "98": 0.18622,
+            "99": 0.18532,
+            "100": 0.18343
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..8faf633ade5
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85839,
+            "52": 9.7506,
+            "53": 10.05817,
+            "54": 9.96076,
+            "55": 9.88738,
+            "56": 9.6344,
+            "57": 9.4967,
+            "58": 9.83343,
+            "59": 9.59391,
+            "60": 9.51376,
+            "61": 9.69928,
+            "62": 9.98089,
+            "63": 9.39065,
+            "64": 9.77599,
+            "65": 8.9571,
+            "66": 9.70054,
+            "67": 9.37,
+            "68": 9.78529,
+            "69": 9.78966,
+            "70": 9.74676,
+            "71": 9.61906,
+            "72": 9.58963,
+            "73": 9.49629,
+            "74": 8.94963,
+            "75": 9.42381,
+            "76": 9.07799,
+            "77": 10.07105,
+            "78": 9.72632,
+            "79": 9.37966,
+            "80": 9.40721,
+            "81": 9.48238,
+            "82": 9.70152,
+            "83": 9.30657,
+            "84": 9.41464,
+            "85": 9.61784,
+            "86": 9.08212,
+            "87": 9.59511,
+            "88": 9.75008,
+            "89": 9.60356,
+            "90": 9.82256,
+            "91": 9.33721,
+            "92": 9.35861,
+            "93": 9.07956,
+            "94": 8.83268,
+            "95": 9.51351,
+            "96": 9.52947,
+            "97": 9.31813,
+            "98": 9.67451,
+            "99": 8.88607,
+            "100": 9.40106
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2621.0,
+            "52": 2597.0,
+            "53": 2926.0,
+            "54": 2633.0,
+            "55": 2206.0,
+            "56": 2627.0,
+            "57": 2328.0,
+            "58": 2886.0,
+            "59": 2639.0,
+            "60": 2157.0,
+            "61": 2736.0,
+            "62": 2544.0,
+            "63": 2332.0,
+            "64": 2948.0,
+            "65": 2630.0,
+            "66": 2931.0,
+            "67": 2717.0,
+            "68": 2643.0,
+            "69": 2955.0,
+            "70": 3040.0,
+            "71": 2882.0,
+            "72": 2390.0,
+            "73": 2812.0,
+            "74": 1844.0,
+            "75": 2461.0,
+            "76": 3067.0,
+            "77": 3152.0,
+            "78": 3018.0,
+            "79": 3008.0,
+            "80": 3104.0,
+            "81": 3589.0,
+            "82": 3218.0,
+            "83": 2748.0,
+            "84": 3217.0,
+            "85": 3167.0,
+            "86": 2876.0,
+            "87": 3604.0,
+            "88": 3017.0,
+            "89": 3249.0,
+            "90": 3069.0,
+            "91": 2865.0,
+            "92": 3074.0,
+            "93": 2680.0,
+            "94": 3392.0,
+            "95": 3206.0,
+            "96": 3401.0,
+            "97": 3107.0,
+            "98": 3624.0,
+            "99": 3007.0,
+            "100": 3111.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 299204096.0,
+            "52": 299204096.0,
+            "53": 299204096.0,
+            "54": 299204096.0,
+            "55": 299204096.0,
+            "56": 299204096.0,
+            "57": 299204096.0,
+            "58": 299204096.0,
+            "59": 299204096.0,
+            "60": 299204096.0,
+            "61": 299204096.0,
+            "62": 299204096.0,
+            "63": 299204096.0,
+            "64": 299204096.0,
+            "65": 299204096.0,
+            "66": 299204096.0,
+            "67": 299204096.0,
+            "68": 299204096.0,
+            "69": 299204096.0,
+            "70": 299204096.0,
+            "71": 299204096.0,
+            "72": 299204096.0,
+            "73": 299204096.0,
+            "74": 299204096.0,
+            "75": 299204096.0,
+            "76": 299204096.0,
+            "77": 299204096.0,
+            "78": 299204096.0,
+            "79": 299204096.0,
+            "80": 299204096.0,
+            "81": 299204096.0,
+            "82": 299204096.0,
+            "83": 299204096.0,
+            "84": 299204096.0,
+            "85": 299204096.0,
+            "86": 299204096.0,
+            "87": 299204096.0,
+            "88": 299204096.0,
+            "89": 299204096.0,
+            "90": 299204096.0,
+            "91": 299204096.0,
+            "92": 299204096.0,
+            "93": 299204096.0,
+            "94": 299204096.0,
+            "95": 299204096.0,
+            "96": 299204096.0,
+            "97": 299204096.0,
+            "98": 299204096.0,
+            "99": 299204096.0,
+            "100": 299204096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1043512832.0,
+            "52": 1043513856.0,
+            "53": 1043513856.0,
+            "54": 1043513856.0,
+            "55": 1043513856.0,
+            "56": 1043513856.0,
+            "57": 1043513856.0,
+            "58": 1043513856.0,
+            "59": 1043513856.0,
+            "60": 1043513856.0,
+            "61": 1043513856.0,
+            "62": 1043513856.0,
+            "63": 1043513856.0,
+            "64": 1043513856.0,
+            "65": 1043513856.0,
+            "66": 1043513856.0,
+            "67": 1043513856.0,
+            "68": 1043513856.0,
+            "69": 1043513856.0,
+            "70": 1043513856.0,
+            "71": 1043513856.0,
+            "72": 1043513856.0,
+            "73": 1043513856.0,
+            "74": 1043513856.0,
+            "75": 1043513856.0,
+            "76": 1043513856.0,
+            "77": 1043513856.0,
+            "78": 1043513856.0,
+            "79": 1043513856.0,
+            "80": 1043513856.0,
+            "81": 1043513856.0,
+            "82": 1043513856.0,
+            "83": 1043513856.0,
+            "84": 1043513856.0,
+            "85": 1043513856.0,
+            "86": 1043513856.0,
+            "87": 1043513856.0,
+            "88": 1043513856.0,
+            "89": 1043513856.0,
+            "90": 1043513856.0,
+            "91": 1043513856.0,
+            "92": 1043513856.0,
+            "93": 1043513856.0,
+            "94": 1043513856.0,
+            "95": 1043513856.0,
+            "96": 1043513856.0,
+            "97": 1043513856.0,
+            "98": 1043513856.0,
+            "99": 1043513856.0,
+            "100": 1043513856.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.95772,
+            "52": 0.21047,
+            "53": 0.18237,
+            "54": 0.18097,
+            "55": 0.18447,
+            "56": 0.18543,
+            "57": 0.18444,
+            "58": 0.18116,
+            "59": 0.18103,
+            "60": 0.185,
+            "61": 0.1869,
+            "62": 0.18215,
+            "63": 0.18074,
+            "64": 0.22859,
+            "65": 0.21818,
+            "66": 0.18939,
+            "67": 0.18821,
+            "68": 0.18642,
+            "69": 0.18318,
+            "70": 0.18267,
+            "71": 0.18226,
+            "72": 0.18124,
+            "73": 0.18054,
+            "74": 0.181,
+            "75": 0.18224,
+            "76": 0.18157,
+            "77": 0.18131,
+            "78": 0.18061,
+            "79": 0.18038,
+            "80": 0.18002,
+            "81": 0.18191,
+            "82": 0.18082,
+            "83": 0.17971,
+            "84": 0.18144,
+            "85": 0.18174,
+            "86": 0.1827,
+            "87": 0.1801,
+            "88": 0.18046,
+            "89": 0.18183,
+            "90": 0.18427,
+            "91": 0.18374,
+            "92": 0.18303,
+            "93": 0.1818,
+            "94": 0.18288,
+            "95": 0.18263,
+            "96": 0.18209,
+            "97": 0.18261,
+            "98": 0.18231,
+            "99": 0.18192,
+            "100": 0.18287
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
index 1641ae309dc..2b3b03b42bc 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 971619840.0,
-            "2": 1036172800.0,
-            "3": 1036172800.0,
-            "4": 1036172800.0,
-            "5": 1036172800.0,
-            "6": 1036172800.0,
-            "7": 1036172800.0,
-            "8": 1036172800.0,
-            "9": 1036172800.0,
-            "10": 1036172800.0,
-            "11": 1036172800.0,
-            "12": 1036172800.0,
-            "13": 1036172800.0,
-            "14": 1036172800.0,
-            "15": 1036172800.0,
-            "16": 1036172800.0,
-            "17": 1036172800.0,
-            "18": 1036172800.0,
-            "19": 1036172800.0,
-            "20": 1036172800.0,
-            "21": 1036172800.0,
-            "22": 1036172800.0,
-            "23": 1036172800.0,
-            "24": 1036172800.0,
-            "25": 1036172800.0,
-            "26": 1036172800.0,
-            "27": 1036172800.0,
-            "28": 1036172800.0,
-            "29": 1036172800.0,
-            "30": 1036172800.0,
-            "31": 1036172800.0,
-            "32": 1036172800.0,
-            "33": 1036172800.0,
-            "34": 1036172800.0,
-            "35": 1036172800.0,
-            "36": 1036172800.0,
-            "37": 1036172800.0,
-            "38": 1036172800.0,
-            "39": 1036172800.0,
-            "40": 1036172800.0,
-            "41": 1036172800.0,
-            "42": 1036172800.0,
-            "43": 1036172800.0,
-            "44": 1036172800.0,
-            "45": 1036172800.0,
-            "46": 1036172800.0,
-            "47": 1036172800.0,
-            "48": 1036172800.0,
-            "49": 1036172800.0,
-            "50": 1036172800.0,
-            "51": 1036172800.0,
-            "52": 1036172800.0,
-            "53": 1036172800.0,
-            "54": 1036172800.0,
-            "55": 1036172800.0,
-            "56": 1036172800.0,
-            "57": 1036172800.0,
-            "58": 1036172800.0,
-            "59": 1036172800.0,
-            "60": 1036172800.0,
-            "61": 1036172800.0,
-            "62": 1036172800.0,
-            "63": 1036172800.0,
-            "64": 1036172800.0,
-            "65": 1036172800.0,
-            "66": 1036172800.0,
-            "67": 1036172800.0,
-            "68": 1036172800.0,
-            "69": 1036172800.0,
-            "70": 1036172800.0,
-            "71": 1036172800.0,
-            "72": 1036172800.0,
-            "73": 1036172800.0,
-            "74": 1036172800.0,
-            "75": 1036172800.0,
-            "76": 1036172800.0,
-            "77": 1036172800.0,
-            "78": 1036172800.0,
-            "79": 1036172800.0,
-            "80": 1036172800.0,
-            "81": 1036172800.0,
-            "82": 1036172800.0,
-            "83": 1036172800.0,
-            "84": 1036172800.0,
-            "85": 1036172800.0,
-            "86": 1036172800.0,
-            "87": 1036172800.0,
-            "88": 1036172800.0,
-            "89": 1036172800.0,
-            "90": 1036172800.0,
-            "91": 1036172800.0,
-            "92": 1036172800.0,
-            "93": 1036172800.0,
-            "94": 1036172800.0,
-            "95": 1036172800.0,
-            "96": 1036172800.0,
-            "97": 1036172800.0,
-            "98": 1036172800.0,
-            "99": 1036172800.0,
-            "100": 1036172800.0
+            "1": 968737280.0,
+            "2": 1035779584.0,
+            "3": 1035779584.0,
+            "4": 1035779584.0,
+            "5": 1035779584.0,
+            "6": 1035779584.0,
+            "7": 1035779584.0,
+            "8": 1035779584.0,
+            "9": 1035779584.0,
+            "10": 1035779584.0,
+            "11": 1035779584.0,
+            "12": 1035779584.0,
+            "13": 1035779584.0,
+            "14": 1035779584.0,
+            "15": 1035779584.0,
+            "16": 1035779584.0,
+            "17": 1035779584.0,
+            "18": 1035779584.0,
+            "19": 1035779584.0,
+            "20": 1035779584.0,
+            "21": 1035779584.0,
+            "22": 1035779584.0,
+            "23": 1035779584.0,
+            "24": 1035779584.0,
+            "25": 1035779584.0,
+            "26": 1035779584.0,
+            "27": 1035779584.0,
+            "28": 1035779584.0,
+            "29": 1035779584.0,
+            "30": 1035779584.0,
+            "31": 1035779584.0,
+            "32": 1035779584.0,
+            "33": 1035779584.0,
+            "34": 1035779584.0,
+            "35": 1035779584.0,
+            "36": 1035779584.0,
+            "37": 1035779584.0,
+            "38": 1035779584.0,
+            "39": 1035779584.0,
+            "40": 1035779584.0,
+            "41": 1035779584.0,
+            "42": 1035779584.0,
+            "43": 1035779584.0,
+            "44": 1035779584.0,
+            "45": 1035779584.0,
+            "46": 1035779584.0,
+            "47": 1035779584.0,
+            "48": 1035779584.0,
+            "49": 1035779584.0,
+            "50": 1035779584.0,
+            "51": 1035779584.0,
+            "52": 1035779584.0,
+            "53": 1035779584.0,
+            "54": 1035779584.0,
+            "55": 1035779584.0,
+            "56": 1035779584.0,
+            "57": 1035779584.0,
+            "58": 1035779584.0,
+            "59": 1035779584.0,
+            "60": 1035779584.0,
+            "61": 1035779584.0,
+            "62": 1035779584.0,
+            "63": 1035779584.0,
+            "64": 1035779584.0,
+            "65": 1035779584.0,
+            "66": 1035779584.0,
+            "67": 1035779584.0,
+            "68": 1035779584.0,
+            "69": 1035779584.0,
+            "70": 1035779584.0,
+            "71": 1035779584.0,
+            "72": 1035779584.0,
+            "73": 1035779584.0,
+            "74": 1035779584.0,
+            "75": 1035779584.0,
+            "76": 1035779584.0,
+            "77": 1035779584.0,
+            "78": 1035779584.0,
+            "79": 1035779584.0,
+            "80": 1035779584.0,
+            "81": 1035779584.0,
+            "82": 1035779584.0,
+            "83": 1035779584.0,
+            "84": 1035779584.0,
+            "85": 1035779584.0,
+            "86": 1035779584.0,
+            "87": 1035779584.0,
+            "88": 1035779584.0,
+            "89": 1035779584.0,
+            "90": 1035779584.0,
+            "91": 1035779584.0,
+            "92": 1035779584.0,
+            "93": 1035779584.0,
+            "94": 1035779584.0,
+            "95": 1035779584.0,
+            "96": 1035779584.0,
+            "97": 1035779584.0,
+            "98": 1035779584.0,
+            "99": 1035779584.0,
+            "100": 1035779584.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 5.18846,
-            "2": 0.36168,
-            "3": 0.29466,
-            "4": 0.29234,
-            "5": 0.29276,
-            "6": 0.29792,
-            "7": 0.29352,
-            "8": 0.2936,
-            "9": 0.29237,
-            "10": 0.29769,
-            "11": 0.29346,
-            "12": 0.29527,
-            "13": 0.29315,
-            "14": 0.29363,
-            "15": 0.29305,
-            "16": 0.29641,
-            "17": 0.29489,
-            "18": 0.29861,
-            "19": 0.29574,
-            "20": 0.29312,
-            "21": 0.29388,
-            "22": 0.29283,
-            "23": 0.29431,
-            "24": 0.29335,
-            "25": 0.29314,
-            "26": 0.29296,
-            "27": 0.29356,
-            "28": 0.29335,
-            "29": 0.29568,
-            "30": 0.29411,
-            "31": 0.29379,
-            "32": 0.29273,
-            "33": 0.29354,
-            "34": 0.29433,
-            "35": 0.29411,
-            "36": 0.29363,
-            "37": 0.2938,
-            "38": 0.29351,
-            "39": 0.29356,
-            "40": 0.29298,
-            "41": 0.29347,
-            "42": 0.29413,
-            "43": 0.29252,
-            "44": 0.29273,
-            "45": 0.29334,
-            "46": 0.29356,
-            "47": 0.29382,
-            "48": 0.29398,
-            "49": 0.2936,
-            "50": 0.29316,
-            "51": 0.29514,
-            "52": 0.28916,
-            "53": 0.29005,
-            "54": 0.28929,
-            "55": 0.28956,
-            "56": 0.28848,
-            "57": 0.28858,
-            "58": 0.28768,
-            "59": 0.28853,
-            "60": 0.29008,
-            "61": 0.2889,
-            "62": 0.28847,
-            "63": 0.28786,
-            "64": 0.28795,
-            "65": 0.28879,
-            "66": 0.28923,
-            "67": 0.28915,
-            "68": 0.28861,
-            "69": 0.28895,
-            "70": 0.28885,
-            "71": 0.28882,
-            "72": 0.28775,
-            "73": 0.28792,
-            "74": 0.28799,
-            "75": 0.28754,
-            "76": 0.28789,
-            "77": 0.2888,
-            "78": 0.28929,
-            "79": 0.28854,
-            "80": 0.28894,
-            "81": 0.28751,
-            "82": 0.28815,
-            "83": 0.2885,
-            "84": 0.28813,
-            "85": 0.28933,
-            "86": 0.28794,
-            "87": 0.28758,
-            "88": 0.28772,
-            "89": 0.28903,
-            "90": 0.28798,
-            "91": 0.28695,
-            "92": 0.28757,
-            "93": 0.28831,
-            "94": 0.28828,
-            "95": 0.28871,
-            "96": 0.28746,
-            "97": 0.28767,
-            "98": 0.28881,
-            "99": 0.2875,
-            "100": 0.28775
+            "1": 6.36449,
+            "2": 0.41478,
+            "3": 0.30241,
+            "4": 0.2884,
+            "5": 0.28755,
+            "6": 0.28808,
+            "7": 0.28797,
+            "8": 0.28869,
+            "9": 0.28996,
+            "10": 0.28886,
+            "11": 0.28738,
+            "12": 0.28795,
+            "13": 0.28791,
+            "14": 0.28704,
+            "15": 0.28904,
+            "16": 0.28588,
+            "17": 0.28849,
+            "18": 0.28778,
+            "19": 0.28792,
+            "20": 0.29039,
+            "21": 0.287,
+            "22": 0.28626,
+            "23": 0.28702,
+            "24": 0.2849,
+            "25": 0.28626,
+            "26": 0.28568,
+            "27": 0.28568,
+            "28": 0.2854,
+            "29": 0.28285,
+            "30": 0.28684,
+            "31": 0.28623,
+            "32": 0.28599,
+            "33": 0.2876,
+            "34": 0.29486,
+            "35": 0.29154,
+            "36": 0.29138,
+            "37": 0.2898,
+            "38": 0.28925,
+            "39": 0.62385,
+            "40": 0.29181,
+            "41": 0.28932,
+            "42": 0.2907,
+            "43": 0.29195,
+            "44": 0.29,
+            "45": 0.29106,
+            "46": 0.28915,
+            "47": 0.28992,
+            "48": 0.32778,
+            "49": 0.34367,
+            "50": 0.33689,
+            "51": 0.34514,
+            "52": 0.33403,
+            "53": 0.33545,
+            "54": 0.33248,
+            "55": 0.33236,
+            "56": 0.33296,
+            "57": 0.33492,
+            "58": 0.33381,
+            "59": 0.33223,
+            "60": 0.33257,
+            "61": 0.33335,
+            "62": 0.33224,
+            "63": 0.33253,
+            "64": 0.33281,
+            "65": 0.33219,
+            "66": 0.31003,
+            "67": 0.2827,
+            "68": 0.28133,
+            "69": 0.28172,
+            "70": 0.28132,
+            "71": 0.2812,
+            "72": 0.28195,
+            "73": 0.28303,
+            "74": 0.28159,
+            "75": 0.28199,
+            "76": 0.28303,
+            "77": 0.28083,
+            "78": 0.28252,
+            "79": 0.28214,
+            "80": 0.2819,
+            "81": 0.28155,
+            "82": 0.28205,
+            "83": 0.28156,
+            "84": 0.28192,
+            "85": 0.28236,
+            "86": 0.28154,
+            "87": 0.28274,
+            "88": 0.28199,
+            "89": 0.2816,
+            "90": 0.28156,
+            "91": 0.28254,
+            "92": 0.28186,
+            "93": 0.28161,
+            "94": 0.28181,
+            "95": 0.28289,
+            "96": 0.28181,
+            "97": 0.2827,
+            "98": 0.28237,
+            "99": 0.28238,
+            "100": 0.2826
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..13fcd39e949
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85065,
+            "52": 9.7464,
+            "53": 10.07271,
+            "54": 9.95757,
+            "55": 9.87725,
+            "56": 9.62951,
+            "57": 9.48816,
+            "58": 9.83239,
+            "59": 9.58985,
+            "60": 9.50827,
+            "61": 9.6947,
+            "62": 9.99304,
+            "63": 9.37511,
+            "64": 9.77996,
+            "65": 8.95215,
+            "66": 9.71323,
+            "67": 9.37884,
+            "68": 9.78794,
+            "69": 9.79078,
+            "70": 9.7308,
+            "71": 9.61793,
+            "72": 9.59094,
+            "73": 9.49435,
+            "74": 8.94865,
+            "75": 9.43606,
+            "76": 9.09894,
+            "77": 10.06437,
+            "78": 9.73006,
+            "79": 9.37771,
+            "80": 9.41266,
+            "81": 9.4854,
+            "82": 9.69576,
+            "83": 9.32017,
+            "84": 9.42235,
+            "85": 9.61578,
+            "86": 9.07218,
+            "87": 9.59328,
+            "88": 9.7509,
+            "89": 9.61159,
+            "90": 9.82148,
+            "91": 9.35304,
+            "92": 9.36254,
+            "93": 9.08747,
+            "94": 8.83398,
+            "95": 9.51923,
+            "96": 9.52595,
+            "97": 9.31413,
+            "98": 9.67414,
+            "99": 8.88869,
+            "100": 9.40651
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2873.0,
+            "52": 2946.0,
+            "53": 3158.0,
+            "54": 2907.0,
+            "55": 2740.0,
+            "56": 3029.0,
+            "57": 2489.0,
+            "58": 3327.0,
+            "59": 3042.0,
+            "60": 2780.0,
+            "61": 3302.0,
+            "62": 2961.0,
+            "63": 2702.0,
+            "64": 3318.0,
+            "65": 2909.0,
+            "66": 3513.0,
+            "67": 2959.0,
+            "68": 2963.0,
+            "69": 3171.0,
+            "70": 3547.0,
+            "71": 3246.0,
+            "72": 2586.0,
+            "73": 3301.0,
+            "74": 2135.0,
+            "75": 2752.0,
+            "76": 3275.0,
+            "77": 3648.0,
+            "78": 3472.0,
+            "79": 3536.0,
+            "80": 3685.0,
+            "81": 4159.0,
+            "82": 3488.0,
+            "83": 3179.0,
+            "84": 3639.0,
+            "85": 3631.0,
+            "86": 3045.0,
+            "87": 4315.0,
+            "88": 3481.0,
+            "89": 3819.0,
+            "90": 3323.0,
+            "91": 3014.0,
+            "92": 3581.0,
+            "93": 2932.0,
+            "94": 3715.0,
+            "95": 3593.0,
+            "96": 3764.0,
+            "97": 3582.0,
+            "98": 3998.0,
+            "99": 3406.0,
+            "100": 3521.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 269842944.0,
+            "52": 269842944.0,
+            "53": 269842944.0,
+            "54": 269842944.0,
+            "55": 269842944.0,
+            "56": 269842944.0,
+            "57": 269842944.0,
+            "58": 269842944.0,
+            "59": 269842944.0,
+            "60": 269842944.0,
+            "61": 269842944.0,
+            "62": 269842944.0,
+            "63": 269842944.0,
+            "64": 269842944.0,
+            "65": 269842944.0,
+            "66": 269842944.0,
+            "67": 269842944.0,
+            "68": 269842944.0,
+            "69": 269842944.0,
+            "70": 269842944.0,
+            "71": 269842944.0,
+            "72": 269842944.0,
+            "73": 269842944.0,
+            "74": 269842944.0,
+            "75": 269842944.0,
+            "76": 269842944.0,
+            "77": 269842944.0,
+            "78": 269842944.0,
+            "79": 269842944.0,
+            "80": 269842944.0,
+            "81": 269842944.0,
+            "82": 269842944.0,
+            "83": 269842944.0,
+            "84": 269842944.0,
+            "85": 269842944.0,
+            "86": 269842944.0,
+            "87": 269842944.0,
+            "88": 269842944.0,
+            "89": 269842944.0,
+            "90": 269842944.0,
+            "91": 269842944.0,
+            "92": 269842944.0,
+            "93": 269842944.0,
+            "94": 269842944.0,
+            "95": 269842944.0,
+            "96": 269842944.0,
+            "97": 269842944.0,
+            "98": 269842944.0,
+            "99": 269842944.0,
+            "100": 269842944.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1036827136.0,
+            "52": 1036828160.0,
+            "53": 1036828160.0,
+            "54": 1036828160.0,
+            "55": 1036828160.0,
+            "56": 1036828160.0,
+            "57": 1036828160.0,
+            "58": 1036828160.0,
+            "59": 1036828160.0,
+            "60": 1036828160.0,
+            "61": 1036828160.0,
+            "62": 1036828160.0,
+            "63": 1036828160.0,
+            "64": 1036828160.0,
+            "65": 1036828160.0,
+            "66": 1036828160.0,
+            "67": 1036828160.0,
+            "68": 1036828160.0,
+            "69": 1036828160.0,
+            "70": 1036828160.0,
+            "71": 1036828160.0,
+            "72": 1036828160.0,
+            "73": 1036828160.0,
+            "74": 1036828160.0,
+            "75": 1036828160.0,
+            "76": 1036828160.0,
+            "77": 1036828160.0,
+            "78": 1036828160.0,
+            "79": 1036828160.0,
+            "80": 1036828160.0,
+            "81": 1036828160.0,
+            "82": 1036828160.0,
+            "83": 1036828160.0,
+            "84": 1036828160.0,
+            "85": 1036828160.0,
+            "86": 1036828160.0,
+            "87": 1036828160.0,
+            "88": 1036828160.0,
+            "89": 1036828160.0,
+            "90": 1036828160.0,
+            "91": 1036828160.0,
+            "92": 1036828160.0,
+            "93": 1036828160.0,
+            "94": 1036828160.0,
+            "95": 1036828160.0,
+            "96": 1036828160.0,
+            "97": 1036828160.0,
+            "98": 1036828160.0,
+            "99": 1036828160.0,
+            "100": 1036828160.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.60062,
+            "52": 0.34561,
+            "53": 0.29071,
+            "54": 0.29184,
+            "55": 0.2948,
+            "56": 0.29077,
+            "57": 0.2916,
+            "58": 0.29134,
+            "59": 0.29145,
+            "60": 0.29253,
+            "61": 0.29047,
+            "62": 0.29158,
+            "63": 0.2928,
+            "64": 0.29153,
+            "65": 0.29135,
+            "66": 0.2908,
+            "67": 0.29054,
+            "68": 0.29078,
+            "69": 0.28979,
+            "70": 0.29041,
+            "71": 0.29099,
+            "72": 0.29052,
+            "73": 0.29156,
+            "74": 0.29178,
+            "75": 0.28944,
+            "76": 0.28907,
+            "77": 0.29079,
+            "78": 0.2907,
+            "79": 0.29278,
+            "80": 0.29007,
+            "81": 0.28964,
+            "82": 0.28902,
+            "83": 0.2899,
+            "84": 0.28906,
+            "85": 0.28955,
+            "86": 0.28766,
+            "87": 0.29175,
+            "88": 0.28899,
+            "89": 0.2875,
+            "90": 0.28943,
+            "91": 0.29161,
+            "92": 0.28815,
+            "93": 0.29145,
+            "94": 0.28977,
+            "95": 0.28998,
+            "96": 0.29062,
+            "97": 0.29169,
+            "98": 0.29269,
+            "99": 0.29163,
+            "100": 0.29161
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..65fc98f8dd4
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.84013,
+            "2": 10.8521,
+            "3": 10.84145,
+            "4": 10.84467,
+            "5": 10.85514,
+            "6": 10.8635,
+            "7": 10.85198,
+            "8": 10.84642,
+            "9": 10.84925,
+            "10": 10.81263,
+            "11": 10.85666,
+            "12": 10.8427,
+            "13": 10.86033,
+            "14": 10.8502,
+            "15": 10.81715,
+            "16": 10.80956,
+            "17": 10.78133,
+            "18": 10.79323,
+            "19": 10.79687,
+            "20": 10.7086,
+            "21": 10.70208,
+            "22": 10.58835,
+            "23": 10.69694,
+            "24": 10.60843,
+            "25": 10.57217,
+            "26": 10.6184,
+            "27": 10.61356,
+            "28": 10.56381,
+            "29": 10.56984,
+            "30": 10.38372,
+            "31": 10.17138,
+            "32": 10.45911,
+            "33": 10.4549,
+            "34": 10.24801,
+            "35": 10.27909,
+            "36": 10.24807,
+            "37": 10.35043,
+            "38": 10.22169,
+            "39": 10.39797,
+            "40": 10.09945,
+            "41": 10.15733,
+            "42": 10.21607,
+            "43": 9.88836,
+            "44": 9.98422,
+            "45": 9.8641,
+            "46": 9.84157,
+            "47": 10.1451,
+            "48": 9.87164,
+            "49": 9.56255,
+            "50": 9.9195,
+            "51": 9.86714,
+            "52": 9.75686,
+            "53": 10.06973,
+            "54": 9.95909,
+            "55": 9.89872,
+            "56": 9.63952,
+            "57": 9.4936,
+            "58": 9.83608,
+            "59": 9.59679,
+            "60": 9.51626,
+            "61": 9.69468,
+            "62": 9.99033,
+            "63": 9.39041,
+            "64": 9.77374,
+            "65": 8.96559,
+            "66": 9.70319,
+            "67": 9.38057,
+            "68": 9.78256,
+            "69": 9.79804,
+            "70": 9.73697,
+            "71": 9.62634,
+            "72": 9.582,
+            "73": 9.50018,
+            "74": 8.93897,
+            "75": 9.42247,
+            "76": 9.08151,
+            "77": 10.06555,
+            "78": 9.71951,
+            "79": 9.38365,
+            "80": 9.4005,
+            "81": 9.48215,
+            "82": 9.69917,
+            "83": 9.30951,
+            "84": 9.41595,
+            "85": 9.61112,
+            "86": 9.07822,
+            "87": 9.59519,
+            "88": 9.74646,
+            "89": 9.60078,
+            "90": 9.82618,
+            "91": 9.32913,
+            "92": 9.35518,
+            "93": 9.08231,
+            "94": 8.83,
+            "95": 9.53112,
+            "96": 9.52889,
+            "97": 9.30954,
+            "98": 9.66956,
+            "99": 8.89675,
+            "100": 9.4083
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1649.0,
+            "2": 1829.0,
+            "3": 1726.0,
+            "4": 1713.0,
+            "5": 2036.0,
+            "6": 1824.0,
+            "7": 1894.0,
+            "8": 1742.0,
+            "9": 1834.0,
+            "10": 1485.0,
+            "11": 1871.0,
+            "12": 1772.0,
+            "13": 2030.0,
+            "14": 1885.0,
+            "15": 1946.0,
+            "16": 1947.0,
+            "17": 1965.0,
+            "18": 1798.0,
+            "19": 1881.0,
+            "20": 1859.0,
+            "21": 1900.0,
+            "22": 1701.0,
+            "23": 2140.0,
+            "24": 1655.0,
+            "25": 1680.0,
+            "26": 1783.0,
+            "27": 1856.0,
+            "28": 1985.0,
+            "29": 2065.0,
+            "30": 1944.0,
+            "31": 1667.0,
+            "32": 1941.0,
+            "33": 2159.0,
+            "34": 1869.0,
+            "35": 1955.0,
+            "36": 2070.0,
+            "37": 2409.0,
+            "38": 2151.0,
+            "39": 2456.0,
+            "40": 2130.0,
+            "41": 2184.0,
+            "42": 2275.0,
+            "43": 2002.0,
+            "44": 2112.0,
+            "45": 1981.0,
+            "46": 2250.0,
+            "47": 2543.0,
+            "48": 2167.0,
+            "49": 2247.0,
+            "50": 2295.0,
+            "51": 2492.0,
+            "52": 2583.0,
+            "53": 2788.0,
+            "54": 2678.0,
+            "55": 2301.0,
+            "56": 2724.0,
+            "57": 2272.0,
+            "58": 2999.0,
+            "59": 2686.0,
+            "60": 2330.0,
+            "61": 2852.0,
+            "62": 2703.0,
+            "63": 2277.0,
+            "64": 2990.0,
+            "65": 2475.0,
+            "66": 2892.0,
+            "67": 2646.0,
+            "68": 2650.0,
+            "69": 2845.0,
+            "70": 3145.0,
+            "71": 2913.0,
+            "72": 2573.0,
+            "73": 2850.0,
+            "74": 1865.0,
+            "75": 2466.0,
+            "76": 3055.0,
+            "77": 3185.0,
+            "78": 3106.0,
+            "79": 3053.0,
+            "80": 3184.0,
+            "81": 3447.0,
+            "82": 3296.0,
+            "83": 2726.0,
+            "84": 3276.0,
+            "85": 3336.0,
+            "86": 2803.0,
+            "87": 3643.0,
+            "88": 3013.0,
+            "89": 3185.0,
+            "90": 3126.0,
+            "91": 3076.0,
+            "92": 3139.0,
+            "93": 2665.0,
+            "94": 3302.0,
+            "95": 3282.0,
+            "96": 3404.0,
+            "97": 3215.0,
+            "98": 3465.0,
+            "99": 3128.0,
+            "100": 3231.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 397748736.0,
+            "2": 397748736.0,
+            "3": 397748736.0,
+            "4": 397748736.0,
+            "5": 397748736.0,
+            "6": 397748736.0,
+            "7": 397748736.0,
+            "8": 397748736.0,
+            "9": 397748736.0,
+            "10": 397748736.0,
+            "11": 397748736.0,
+            "12": 397748736.0,
+            "13": 397748736.0,
+            "14": 397748736.0,
+            "15": 397748736.0,
+            "16": 397748736.0,
+            "17": 397748736.0,
+            "18": 397748736.0,
+            "19": 397748736.0,
+            "20": 397748736.0,
+            "21": 397748736.0,
+            "22": 397748736.0,
+            "23": 397748736.0,
+            "24": 397748736.0,
+            "25": 397748736.0,
+            "26": 397748736.0,
+            "27": 397748736.0,
+            "28": 397748736.0,
+            "29": 397748736.0,
+            "30": 397748736.0,
+            "31": 397748736.0,
+            "32": 397748736.0,
+            "33": 397748736.0,
+            "34": 397748736.0,
+            "35": 397748736.0,
+            "36": 397748736.0,
+            "37": 397748736.0,
+            "38": 397748736.0,
+            "39": 397748736.0,
+            "40": 397748736.0,
+            "41": 397748736.0,
+            "42": 397748736.0,
+            "43": 397748736.0,
+            "44": 397748736.0,
+            "45": 397748736.0,
+            "46": 397748736.0,
+            "47": 397748736.0,
+            "48": 397748736.0,
+            "49": 397748736.0,
+            "50": 397748736.0,
+            "51": 397748736.0,
+            "52": 397748736.0,
+            "53": 397748736.0,
+            "54": 397748736.0,
+            "55": 397748736.0,
+            "56": 397748736.0,
+            "57": 397748736.0,
+            "58": 397748736.0,
+            "59": 397748736.0,
+            "60": 397748736.0,
+            "61": 397748736.0,
+            "62": 397748736.0,
+            "63": 397748736.0,
+            "64": 397748736.0,
+            "65": 397748736.0,
+            "66": 397748736.0,
+            "67": 397748736.0,
+            "68": 397748736.0,
+            "69": 397748736.0,
+            "70": 397748736.0,
+            "71": 397748736.0,
+            "72": 397748736.0,
+            "73": 397748736.0,
+            "74": 397748736.0,
+            "75": 397748736.0,
+            "76": 397748736.0,
+            "77": 397748736.0,
+            "78": 397748736.0,
+            "79": 397748736.0,
+            "80": 397748736.0,
+            "81": 397748736.0,
+            "82": 397748736.0,
+            "83": 397748736.0,
+            "84": 397748736.0,
+            "85": 397748736.0,
+            "86": 397748736.0,
+            "87": 397748736.0,
+            "88": 397748736.0,
+            "89": 397748736.0,
+            "90": 397748736.0,
+            "91": 397748736.0,
+            "92": 397748736.0,
+            "93": 397748736.0,
+            "94": 397748736.0,
+            "95": 397748736.0,
+            "96": 397748736.0,
+            "97": 397748736.0,
+            "98": 397748736.0,
+            "99": 397748736.0,
+            "100": 397748736.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1057339904.0,
+            "2": 1190421504.0,
+            "3": 1190421504.0,
+            "4": 1190421504.0,
+            "5": 1190421504.0,
+            "6": 1190421504.0,
+            "7": 1190421504.0,
+            "8": 1190421504.0,
+            "9": 1190421504.0,
+            "10": 1190421504.0,
+            "11": 1190421504.0,
+            "12": 1190421504.0,
+            "13": 1190421504.0,
+            "14": 1190421504.0,
+            "15": 1190421504.0,
+            "16": 1190421504.0,
+            "17": 1190421504.0,
+            "18": 1190421504.0,
+            "19": 1190421504.0,
+            "20": 1190421504.0,
+            "21": 1190421504.0,
+            "22": 1190421504.0,
+            "23": 1190421504.0,
+            "24": 1190421504.0,
+            "25": 1190421504.0,
+            "26": 1190421504.0,
+            "27": 1190421504.0,
+            "28": 1190421504.0,
+            "29": 1190421504.0,
+            "30": 1190421504.0,
+            "31": 1190421504.0,
+            "32": 1190421504.0,
+            "33": 1190421504.0,
+            "34": 1190421504.0,
+            "35": 1190421504.0,
+            "36": 1190421504.0,
+            "37": 1190421504.0,
+            "38": 1190421504.0,
+            "39": 1190421504.0,
+            "40": 1190421504.0,
+            "41": 1190421504.0,
+            "42": 1190421504.0,
+            "43": 1190421504.0,
+            "44": 1190421504.0,
+            "45": 1190421504.0,
+            "46": 1190421504.0,
+            "47": 1190421504.0,
+            "48": 1190421504.0,
+            "49": 1190421504.0,
+            "50": 1190421504.0,
+            "51": 1190421504.0,
+            "52": 1190421504.0,
+            "53": 1190421504.0,
+            "54": 1190421504.0,
+            "55": 1190421504.0,
+            "56": 1190421504.0,
+            "57": 1190421504.0,
+            "58": 1190421504.0,
+            "59": 1190421504.0,
+            "60": 1190421504.0,
+            "61": 1190421504.0,
+            "62": 1190421504.0,
+            "63": 1190421504.0,
+            "64": 1190421504.0,
+            "65": 1190421504.0,
+            "66": 1190421504.0,
+            "67": 1190421504.0,
+            "68": 1190421504.0,
+            "69": 1190421504.0,
+            "70": 1190421504.0,
+            "71": 1190421504.0,
+            "72": 1190421504.0,
+            "73": 1190421504.0,
+            "74": 1190421504.0,
+            "75": 1190421504.0,
+            "76": 1190421504.0,
+            "77": 1190421504.0,
+            "78": 1190421504.0,
+            "79": 1190421504.0,
+            "80": 1190421504.0,
+            "81": 1190421504.0,
+            "82": 1190421504.0,
+            "83": 1190421504.0,
+            "84": 1190421504.0,
+            "85": 1190421504.0,
+            "86": 1190421504.0,
+            "87": 1190421504.0,
+            "88": 1190421504.0,
+            "89": 1190421504.0,
+            "90": 1190421504.0,
+            "91": 1190421504.0,
+            "92": 1190421504.0,
+            "93": 1190421504.0,
+            "94": 1190421504.0,
+            "95": 1190421504.0,
+            "96": 1190421504.0,
+            "97": 1190421504.0,
+            "98": 1190421504.0,
+            "99": 1190421504.0,
+            "100": 1190421504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.73217,
+            "3": 0.44849,
+            "4": 0.46632,
+            "5": 0.47877,
+            "6": 0.48831,
+            "7": 0.48769,
+            "8": 0.50745,
+            "9": 0.48778,
+            "10": 0.49192,
+            "11": 0.49758,
+            "12": 0.5014,
+            "13": 0.49698,
+            "14": 0.49958,
+            "15": 0.49877,
+            "16": 0.50112,
+            "17": 0.49678,
+            "18": 0.49696,
+            "19": 0.49583,
+            "20": 0.49823,
+            "21": 0.50092,
+            "22": 0.50313,
+            "23": 0.50157,
+            "24": 0.50564,
+            "25": 0.50173,
+            "26": 0.50691,
+            "27": 0.5209,
+            "28": 0.51519,
+            "29": 0.50283,
+            "30": 0.50601,
+            "31": 0.50139,
+            "32": 0.507,
+            "33": 0.50335,
+            "34": 0.50467,
+            "35": 0.50168,
+            "36": 0.49771,
+            "37": 0.49868,
+            "38": 0.49794,
+            "39": 0.49729,
+            "40": 0.4917,
+            "41": 0.49294,
+            "42": 0.48867,
+            "43": 0.49291,
+            "44": 0.49762,
+            "45": 0.49672,
+            "46": 0.50694,
+            "47": 0.49816,
+            "48": 0.4942,
+            "49": 0.5031,
+            "50": 0.50121,
+            "51": 0.48839,
+            "52": 0.49123,
+            "53": 0.83615,
+            "54": 0.49979,
+            "55": 0.50032,
+            "56": 0.5025,
+            "57": 0.50465,
+            "58": 0.5032,
+            "59": 0.52509,
+            "60": 0.51125,
+            "61": 0.50912,
+            "62": 0.50722,
+            "63": 0.51052,
+            "64": 0.50743,
+            "65": 0.51588,
+            "66": 0.51203,
+            "67": 0.51526,
+            "68": 0.50806,
+            "69": 0.51012,
+            "70": 0.51073,
+            "71": 0.50805,
+            "72": 0.51001,
+            "73": 0.52219,
+            "74": 0.50785,
+            "75": 0.50971,
+            "76": 0.50837,
+            "77": 0.51328,
+            "78": 0.51109,
+            "79": 0.50795,
+            "80": 0.86855,
+            "81": 0.51135,
+            "82": 0.50858,
+            "83": 0.51273,
+            "84": 0.50989,
+            "85": 0.51087,
+            "86": 0.51808,
+            "87": 0.5247,
+            "88": 0.51417,
+            "89": 0.5201,
+            "90": 0.90988,
+            "91": 0.54215,
+            "92": 0.52369,
+            "93": 0.51835,
+            "94": 0.52068,
+            "95": 0.5186,
+            "96": 0.52052,
+            "97": 0.51882,
+            "98": 0.52061,
+            "99": 0.51758,
+            "100": 0.51114
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..074f4cc53b8
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86714,
+            "52": 9.75686,
+            "53": 10.06973,
+            "54": 9.95909,
+            "55": 9.89872,
+            "56": 9.63952,
+            "57": 9.4936,
+            "58": 9.83608,
+            "59": 9.59679,
+            "60": 9.51626,
+            "61": 9.69468,
+            "62": 9.99033,
+            "63": 9.39041,
+            "64": 9.77374,
+            "65": 8.96559,
+            "66": 9.70319,
+            "67": 9.38057,
+            "68": 9.78256,
+            "69": 9.79804,
+            "70": 9.73697,
+            "71": 9.62634,
+            "72": 9.582,
+            "73": 9.50018,
+            "74": 8.93897,
+            "75": 9.42247,
+            "76": 9.08151,
+            "77": 10.06555,
+            "78": 9.71951,
+            "79": 9.38365,
+            "80": 9.4005,
+            "81": 9.48215,
+            "82": 9.69917,
+            "83": 9.30951,
+            "84": 9.41595,
+            "85": 9.61112,
+            "86": 9.07822,
+            "87": 9.59519,
+            "88": 9.74646,
+            "89": 9.60078,
+            "90": 9.82618,
+            "91": 9.32913,
+            "92": 9.35518,
+            "93": 9.08231,
+            "94": 8.83,
+            "95": 9.53112,
+            "96": 9.52889,
+            "97": 9.30954,
+            "98": 9.66956,
+            "99": 8.89675,
+            "100": 9.4083
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2492.0,
+            "52": 2583.0,
+            "53": 2788.0,
+            "54": 2678.0,
+            "55": 2301.0,
+            "56": 2724.0,
+            "57": 2272.0,
+            "58": 2999.0,
+            "59": 2686.0,
+            "60": 2330.0,
+            "61": 2852.0,
+            "62": 2703.0,
+            "63": 2277.0,
+            "64": 2990.0,
+            "65": 2475.0,
+            "66": 2892.0,
+            "67": 2646.0,
+            "68": 2650.0,
+            "69": 2845.0,
+            "70": 3145.0,
+            "71": 2913.0,
+            "72": 2573.0,
+            "73": 2850.0,
+            "74": 1865.0,
+            "75": 2466.0,
+            "76": 3055.0,
+            "77": 3185.0,
+            "78": 3106.0,
+            "79": 3053.0,
+            "80": 3184.0,
+            "81": 3447.0,
+            "82": 3296.0,
+            "83": 2726.0,
+            "84": 3276.0,
+            "85": 3336.0,
+            "86": 2803.0,
+            "87": 3643.0,
+            "88": 3013.0,
+            "89": 3185.0,
+            "90": 3126.0,
+            "91": 3076.0,
+            "92": 3139.0,
+            "93": 2665.0,
+            "94": 3302.0,
+            "95": 3282.0,
+            "96": 3404.0,
+            "97": 3215.0,
+            "98": 3465.0,
+            "99": 3128.0,
+            "100": 3231.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 397748736.0,
+            "52": 397748736.0,
+            "53": 397748736.0,
+            "54": 397748736.0,
+            "55": 397748736.0,
+            "56": 397748736.0,
+            "57": 397748736.0,
+            "58": 397748736.0,
+            "59": 397748736.0,
+            "60": 397748736.0,
+            "61": 397748736.0,
+            "62": 397748736.0,
+            "63": 397748736.0,
+            "64": 397748736.0,
+            "65": 397748736.0,
+            "66": 397748736.0,
+            "67": 397748736.0,
+            "68": 397748736.0,
+            "69": 397748736.0,
+            "70": 397748736.0,
+            "71": 397748736.0,
+            "72": 397748736.0,
+            "73": 397748736.0,
+            "74": 397748736.0,
+            "75": 397748736.0,
+            "76": 397748736.0,
+            "77": 397748736.0,
+            "78": 397748736.0,
+            "79": 397748736.0,
+            "80": 397748736.0,
+            "81": 397748736.0,
+            "82": 397748736.0,
+            "83": 397748736.0,
+            "84": 397748736.0,
+            "85": 397748736.0,
+            "86": 397748736.0,
+            "87": 397748736.0,
+            "88": 397748736.0,
+            "89": 397748736.0,
+            "90": 397748736.0,
+            "91": 397748736.0,
+            "92": 397748736.0,
+            "93": 397748736.0,
+            "94": 397748736.0,
+            "95": 397748736.0,
+            "96": 397748736.0,
+            "97": 397748736.0,
+            "98": 397748736.0,
+            "99": 397748736.0,
+            "100": 397748736.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1191471616.0,
+            "52": 1191472640.0,
+            "53": 1191472640.0,
+            "54": 1191472640.0,
+            "55": 1191472640.0,
+            "56": 1191472640.0,
+            "57": 1191472640.0,
+            "58": 1191472640.0,
+            "59": 1191472640.0,
+            "60": 1191472640.0,
+            "61": 1191472640.0,
+            "62": 1191472640.0,
+            "63": 1191472640.0,
+            "64": 1191472640.0,
+            "65": 1191472640.0,
+            "66": 1191472640.0,
+            "67": 1191472640.0,
+            "68": 1191472640.0,
+            "69": 1191472640.0,
+            "70": 1191472640.0,
+            "71": 1191472640.0,
+            "72": 1191472640.0,
+            "73": 1191472640.0,
+            "74": 1191472640.0,
+            "75": 1191472640.0,
+            "76": 1191472640.0,
+            "77": 1191472640.0,
+            "78": 1191472640.0,
+            "79": 1191472640.0,
+            "80": 1191472640.0,
+            "81": 1191472640.0,
+            "82": 1191472640.0,
+            "83": 1191472640.0,
+            "84": 1191472640.0,
+            "85": 1191472640.0,
+            "86": 1191472640.0,
+            "87": 1191472640.0,
+            "88": 1191472640.0,
+            "89": 1191472640.0,
+            "90": 1191472640.0,
+            "91": 1191472640.0,
+            "92": 1191472640.0,
+            "93": 1191472640.0,
+            "94": 1191472640.0,
+            "95": 1191472640.0,
+            "96": 1191472640.0,
+            "97": 1191472640.0,
+            "98": 1191472640.0,
+            "99": 1191472640.0,
+            "100": 1191472640.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.49404,
+            "53": 0.49738,
+            "54": 0.47517,
+            "55": 0.48073,
+            "56": 0.48124,
+            "57": 0.48531,
+            "58": 0.46905,
+            "59": 0.45198,
+            "60": 0.44375,
+            "61": 0.45318,
+            "62": 0.43328,
+            "63": 0.44116,
+            "64": 0.43199,
+            "65": 0.43219,
+            "66": 0.43466,
+            "67": 0.43576,
+            "68": 0.43222,
+            "69": 0.42871,
+            "70": 0.42889,
+            "71": 0.43506,
+            "72": 0.43141,
+            "73": 0.42482,
+            "74": 0.4278,
+            "75": 0.42933,
+            "76": 0.42676,
+            "77": 0.43206,
+            "78": 0.43106,
+            "79": 0.43328,
+            "80": 0.429,
+            "81": 0.4294,
+            "82": 0.43619,
+            "83": 0.42881,
+            "84": 0.44023,
+            "85": 0.43778,
+            "86": 0.4293,
+            "87": 0.42266,
+            "88": 0.43088,
+            "89": 0.4333,
+            "90": 0.42756,
+            "91": 0.42474,
+            "92": 0.43075,
+            "93": 0.43032,
+            "94": 0.42748,
+            "95": 0.43116,
+            "96": 0.43174,
+            "97": 0.42434,
+            "98": 0.42337,
+            "99": 0.42353,
+            "100": 0.4287
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json
index d5d1de46cac..0b8045d999a 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 397747712.0,
-            "2": 397747712.0,
-            "3": 397747712.0,
-            "4": 397747712.0,
-            "5": 397747712.0,
-            "6": 397747712.0,
-            "7": 397747712.0,
-            "8": 397747712.0,
-            "9": 397747712.0,
-            "10": 397747712.0,
-            "11": 397747712.0,
-            "12": 397747712.0,
-            "13": 397747712.0,
-            "14": 397747712.0,
-            "15": 397747712.0,
-            "16": 397747712.0,
-            "17": 397747712.0,
-            "18": 397747712.0,
-            "19": 397747712.0,
-            "20": 397747712.0,
-            "21": 397747712.0,
-            "22": 397747712.0,
-            "23": 397747712.0,
-            "24": 397747712.0,
-            "25": 397747712.0,
-            "26": 397747712.0,
-            "27": 397747712.0,
-            "28": 397747712.0,
-            "29": 397747712.0,
-            "30": 397747712.0,
-            "31": 397747712.0,
-            "32": 397747712.0,
-            "33": 397747712.0,
-            "34": 397747712.0,
-            "35": 397747712.0,
-            "36": 397747712.0,
-            "37": 397747712.0,
-            "38": 397747712.0,
-            "39": 397747712.0,
-            "40": 397747712.0,
-            "41": 397747712.0,
-            "42": 397747712.0,
-            "43": 397747712.0,
-            "44": 397747712.0,
-            "45": 397747712.0,
-            "46": 397747712.0,
-            "47": 397747712.0,
-            "48": 397747712.0,
-            "49": 397747712.0,
-            "50": 397747712.0,
-            "51": 397747712.0,
-            "52": 397747712.0,
-            "53": 397747712.0,
-            "54": 397747712.0,
-            "55": 397747712.0,
-            "56": 397747712.0,
-            "57": 397747712.0,
-            "58": 397747712.0,
-            "59": 397747712.0,
-            "60": 397747712.0,
-            "61": 397747712.0,
-            "62": 397747712.0,
-            "63": 397747712.0,
-            "64": 397747712.0,
-            "65": 397747712.0,
-            "66": 397747712.0,
-            "67": 397747712.0,
-            "68": 397747712.0,
-            "69": 397747712.0,
-            "70": 397747712.0,
-            "71": 397747712.0,
-            "72": 397747712.0,
-            "73": 397747712.0,
-            "74": 397747712.0,
-            "75": 397747712.0,
-            "76": 397747712.0,
-            "77": 397747712.0,
-            "78": 397747712.0,
-            "79": 397747712.0,
-            "80": 397747712.0,
-            "81": 397747712.0,
-            "82": 397747712.0,
-            "83": 397747712.0,
-            "84": 397747712.0,
-            "85": 397747712.0,
-            "86": 397747712.0,
-            "87": 397747712.0,
-            "88": 397747712.0,
-            "89": 397747712.0,
-            "90": 397747712.0,
-            "91": 397747712.0,
-            "92": 397747712.0,
-            "93": 397747712.0,
-            "94": 397747712.0,
-            "95": 397747712.0,
-            "96": 397747712.0,
-            "97": 397747712.0,
-            "98": 397747712.0,
-            "99": 397747712.0,
-            "100": 397747712.0
+            "1": 397748736.0,
+            "2": 397748736.0,
+            "3": 397748736.0,
+            "4": 397748736.0,
+            "5": 397748736.0,
+            "6": 397748736.0,
+            "7": 397748736.0,
+            "8": 397748736.0,
+            "9": 397748736.0,
+            "10": 397748736.0,
+            "11": 397748736.0,
+            "12": 397748736.0,
+            "13": 397748736.0,
+            "14": 397748736.0,
+            "15": 397748736.0,
+            "16": 397748736.0,
+            "17": 397748736.0,
+            "18": 397748736.0,
+            "19": 397748736.0,
+            "20": 397748736.0,
+            "21": 397748736.0,
+            "22": 397748736.0,
+            "23": 397748736.0,
+            "24": 397748736.0,
+            "25": 397748736.0,
+            "26": 397748736.0,
+            "27": 397748736.0,
+            "28": 397748736.0,
+            "29": 397748736.0,
+            "30": 397748736.0,
+            "31": 397748736.0,
+            "32": 397748736.0,
+            "33": 397748736.0,
+            "34": 397748736.0,
+            "35": 397748736.0,
+            "36": 397748736.0,
+            "37": 397748736.0,
+            "38": 397748736.0,
+            "39": 397748736.0,
+            "40": 397748736.0,
+            "41": 397748736.0,
+            "42": 397748736.0,
+            "43": 397748736.0,
+            "44": 397748736.0,
+            "45": 397748736.0,
+            "46": 397748736.0,
+            "47": 397748736.0,
+            "48": 397748736.0,
+            "49": 397748736.0,
+            "50": 397748736.0,
+            "51": 397748736.0,
+            "52": 397748736.0,
+            "53": 397748736.0,
+            "54": 397748736.0,
+            "55": 397748736.0,
+            "56": 397748736.0,
+            "57": 397748736.0,
+            "58": 397748736.0,
+            "59": 397748736.0,
+            "60": 397748736.0,
+            "61": 397748736.0,
+            "62": 397748736.0,
+            "63": 397748736.0,
+            "64": 397748736.0,
+            "65": 397748736.0,
+            "66": 397748736.0,
+            "67": 397748736.0,
+            "68": 397748736.0,
+            "69": 397748736.0,
+            "70": 397748736.0,
+            "71": 397748736.0,
+            "72": 397748736.0,
+            "73": 397748736.0,
+            "74": 397748736.0,
+            "75": 397748736.0,
+            "76": 397748736.0,
+            "77": 397748736.0,
+            "78": 397748736.0,
+            "79": 397748736.0,
+            "80": 397748736.0,
+            "81": 397748736.0,
+            "82": 397748736.0,
+            "83": 397748736.0,
+            "84": 397748736.0,
+            "85": 397748736.0,
+            "86": 397748736.0,
+            "87": 397748736.0,
+            "88": 397748736.0,
+            "89": 397748736.0,
+            "90": 397748736.0,
+            "91": 397748736.0,
+            "92": 397748736.0,
+            "93": 397748736.0,
+            "94": 397748736.0,
+            "95": 397748736.0,
+            "96": 397748736.0,
+            "97": 397748736.0,
+            "98": 397748736.0,
+            "99": 397748736.0,
+            "100": 397748736.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -326,105 +326,105 @@
         "step_interval": 1,
         "values": {
             "1": 1044755968.0,
-            "2": 1177840128.0,
-            "3": 1177840128.0,
-            "4": 1177840128.0,
-            "5": 1177840128.0,
-            "6": 1177840128.0,
-            "7": 1177840128.0,
-            "8": 1177840128.0,
-            "9": 1177840128.0,
-            "10": 1177840128.0,
-            "11": 1177840128.0,
-            "12": 1177840128.0,
-            "13": 1177840128.0,
-            "14": 1177840128.0,
-            "15": 1177840128.0,
-            "16": 1177840128.0,
-            "17": 1177840128.0,
-            "18": 1177840128.0,
-            "19": 1177840128.0,
-            "20": 1177840128.0,
-            "21": 1177840128.0,
-            "22": 1177840128.0,
-            "23": 1177840128.0,
-            "24": 1177840128.0,
-            "25": 1177840128.0,
-            "26": 1177840128.0,
-            "27": 1177840128.0,
-            "28": 1177840128.0,
-            "29": 1177840128.0,
-            "30": 1177840128.0,
-            "31": 1177840128.0,
-            "32": 1177840128.0,
-            "33": 1177840128.0,
-            "34": 1177840128.0,
-            "35": 1177840128.0,
-            "36": 1177840128.0,
-            "37": 1177840128.0,
-            "38": 1177840128.0,
-            "39": 1177840128.0,
-            "40": 1177840128.0,
-            "41": 1177840128.0,
-            "42": 1177840128.0,
-            "43": 1177840128.0,
-            "44": 1177840128.0,
-            "45": 1177840128.0,
-            "46": 1177840128.0,
-            "47": 1177840128.0,
-            "48": 1177840128.0,
-            "49": 1177840128.0,
-            "50": 1177840128.0,
-            "51": 1177840128.0,
-            "52": 1177840128.0,
-            "53": 1177840128.0,
-            "54": 1177840128.0,
-            "55": 1177840128.0,
-            "56": 1177840128.0,
-            "57": 1177840128.0,
-            "58": 1177840128.0,
-            "59": 1177840128.0,
-            "60": 1177840128.0,
-            "61": 1177840128.0,
-            "62": 1177840128.0,
-            "63": 1177840128.0,
-            "64": 1177840128.0,
-            "65": 1177840128.0,
-            "66": 1177840128.0,
-            "67": 1177840128.0,
-            "68": 1177840128.0,
-            "69": 1177840128.0,
-            "70": 1177840128.0,
-            "71": 1177840128.0,
-            "72": 1177840128.0,
-            "73": 1177840128.0,
-            "74": 1177840128.0,
-            "75": 1177840128.0,
-            "76": 1177840128.0,
-            "77": 1177840128.0,
-            "78": 1177840128.0,
-            "79": 1177840128.0,
-            "80": 1177840128.0,
-            "81": 1177840128.0,
-            "82": 1177840128.0,
-            "83": 1177840128.0,
-            "84": 1177840128.0,
-            "85": 1177840128.0,
-            "86": 1177840128.0,
-            "87": 1177840128.0,
-            "88": 1177840128.0,
-            "89": 1177840128.0,
-            "90": 1177840128.0,
-            "91": 1177840128.0,
-            "92": 1177840128.0,
-            "93": 1177840128.0,
-            "94": 1177840128.0,
-            "95": 1177840128.0,
-            "96": 1177840128.0,
-            "97": 1177840128.0,
-            "98": 1177840128.0,
-            "99": 1177840128.0,
-            "100": 1177840128.0
+            "2": 1178234368.0,
+            "3": 1178234368.0,
+            "4": 1178234368.0,
+            "5": 1178234368.0,
+            "6": 1178234368.0,
+            "7": 1178234368.0,
+            "8": 1178234368.0,
+            "9": 1178234368.0,
+            "10": 1178234368.0,
+            "11": 1178234368.0,
+            "12": 1178234368.0,
+            "13": 1178234368.0,
+            "14": 1178234368.0,
+            "15": 1178234368.0,
+            "16": 1178234368.0,
+            "17": 1178234368.0,
+            "18": 1178234368.0,
+            "19": 1178234368.0,
+            "20": 1178234368.0,
+            "21": 1178234368.0,
+            "22": 1178234368.0,
+            "23": 1178234368.0,
+            "24": 1178234368.0,
+            "25": 1178234368.0,
+            "26": 1178234368.0,
+            "27": 1178234368.0,
+            "28": 1178234368.0,
+            "29": 1178234368.0,
+            "30": 1178234368.0,
+            "31": 1178234368.0,
+            "32": 1178234368.0,
+            "33": 1178234368.0,
+            "34": 1178234368.0,
+            "35": 1178234368.0,
+            "36": 1178234368.0,
+            "37": 1178234368.0,
+            "38": 1178234368.0,
+            "39": 1178234368.0,
+            "40": 1178234368.0,
+            "41": 1178234368.0,
+            "42": 1178234368.0,
+            "43": 1178234368.0,
+            "44": 1178234368.0,
+            "45": 1178234368.0,
+            "46": 1178234368.0,
+            "47": 1178234368.0,
+            "48": 1178234368.0,
+            "49": 1178234368.0,
+            "50": 1178234368.0,
+            "51": 1178234368.0,
+            "52": 1178234368.0,
+            "53": 1178234368.0,
+            "54": 1178234368.0,
+            "55": 1178234368.0,
+            "56": 1178234368.0,
+            "57": 1178234368.0,
+            "58": 1178234368.0,
+            "59": 1178234368.0,
+            "60": 1178234368.0,
+            "61": 1178234368.0,
+            "62": 1178234368.0,
+            "63": 1178234368.0,
+            "64": 1178234368.0,
+            "65": 1178234368.0,
+            "66": 1178234368.0,
+            "67": 1178234368.0,
+            "68": 1178234368.0,
+            "69": 1178234368.0,
+            "70": 1178234368.0,
+            "71": 1178234368.0,
+            "72": 1178234368.0,
+            "73": 1178234368.0,
+            "74": 1178234368.0,
+            "75": 1178234368.0,
+            "76": 1178234368.0,
+            "77": 1178234368.0,
+            "78": 1178234368.0,
+            "79": 1178234368.0,
+            "80": 1178234368.0,
+            "81": 1178234368.0,
+            "82": 1178234368.0,
+            "83": 1178234368.0,
+            "84": 1178234368.0,
+            "85": 1178234368.0,
+            "86": 1178234368.0,
+            "87": 1178234368.0,
+            "88": 1178234368.0,
+            "89": 1178234368.0,
+            "90": 1178234368.0,
+            "91": 1178234368.0,
+            "92": 1178234368.0,
+            "93": 1178234368.0,
+            "94": 1178234368.0,
+            "95": 1178234368.0,
+            "96": 1178234368.0,
+            "97": 1178234368.0,
+            "98": 1178234368.0,
+            "99": 1178234368.0,
+            "100": 1178234368.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.61367,
-            "2": 0.31935,
-            "3": 0.29274,
-            "4": 0.28637,
-            "5": 0.2844,
-            "6": 0.29788,
-            "7": 0.2902,
-            "8": 0.28573,
-            "9": 0.29136,
-            "10": 0.29884,
-            "11": 0.29048,
-            "12": 0.2896,
-            "13": 0.29421,
-            "14": 0.29008,
-            "15": 0.2871,
-            "16": 0.28903,
-            "17": 0.2924,
-            "18": 0.28887,
-            "19": 0.28926,
-            "20": 0.30241,
-            "21": 0.29571,
-            "22": 0.28966,
-            "23": 0.29177,
-            "24": 0.29106,
-            "25": 0.28884,
-            "26": 0.28921,
-            "27": 0.29461,
-            "28": 0.28664,
-            "29": 0.28881,
-            "30": 0.29392,
-            "31": 0.29062,
-            "32": 0.28778,
-            "33": 0.29055,
-            "34": 0.29409,
-            "35": 0.29169,
-            "36": 0.29211,
-            "37": 0.29809,
-            "38": 0.29114,
-            "39": 0.29052,
-            "40": 0.2919,
-            "41": 0.2953,
-            "42": 0.28957,
-            "43": 0.29349,
-            "44": 0.30062,
-            "45": 0.28999,
-            "46": 0.29486,
-            "47": 0.29689,
-            "48": 0.29092,
-            "49": 0.29024,
-            "50": 0.28916,
-            "51": 0.30865,
-            "52": 0.29957,
-            "53": 0.28833,
-            "54": 0.29375,
-            "55": 0.29176,
-            "56": 0.29338,
-            "57": 0.28952,
-            "58": 0.29232,
-            "59": 0.29026,
-            "60": 0.28767,
-            "61": 0.29364,
-            "62": 0.2935,
-            "63": 0.29522,
-            "64": 0.29495,
-            "65": 0.29509,
-            "66": 0.29643,
-            "67": 0.29584,
-            "68": 0.29853,
-            "69": 0.29821,
-            "70": 0.29334,
-            "71": 0.29579,
-            "72": 0.29325,
-            "73": 0.29403,
-            "74": 0.29671,
-            "75": 0.63106,
-            "76": 0.29142,
-            "77": 0.29491,
-            "78": 0.29437,
-            "79": 0.29239,
-            "80": 0.29453,
-            "81": 0.29509,
-            "82": 0.29493,
-            "83": 0.2915,
-            "84": 0.30181,
-            "85": 0.29305,
-            "86": 0.28823,
-            "87": 0.29337,
-            "88": 0.29025,
-            "89": 0.28953,
-            "90": 0.29694,
-            "91": 0.29077,
-            "92": 0.29411,
-            "93": 0.28767,
-            "94": 0.29313,
-            "95": 0.29276,
-            "96": 0.29197,
-            "97": 0.29466,
-            "98": 0.29321,
-            "99": 0.29311,
-            "100": 0.29175
+            "1": 10.36091,
+            "2": 0.34885,
+            "3": 0.28252,
+            "4": 0.26078,
+            "5": 0.25876,
+            "6": 0.25718,
+            "7": 0.26528,
+            "8": 0.26311,
+            "9": 0.26375,
+            "10": 0.26354,
+            "11": 0.26207,
+            "12": 0.26033,
+            "13": 0.26467,
+            "14": 0.26281,
+            "15": 0.26355,
+            "16": 0.26138,
+            "17": 0.2649,
+            "18": 0.26631,
+            "19": 0.26244,
+            "20": 0.26263,
+            "21": 0.26939,
+            "22": 0.26538,
+            "23": 0.26644,
+            "24": 0.26284,
+            "25": 0.26534,
+            "26": 0.2629,
+            "27": 0.2631,
+            "28": 0.26216,
+            "29": 0.26306,
+            "30": 0.26559,
+            "31": 0.26198,
+            "32": 0.26229,
+            "33": 0.26263,
+            "34": 0.26154,
+            "35": 0.26277,
+            "36": 0.26291,
+            "37": 0.26156,
+            "38": 0.26052,
+            "39": 0.26366,
+            "40": 0.26065,
+            "41": 0.26364,
+            "42": 0.62325,
+            "43": 0.26139,
+            "44": 0.2631,
+            "45": 0.26374,
+            "46": 0.26054,
+            "47": 0.26187,
+            "48": 0.26188,
+            "49": 0.25929,
+            "50": 0.25984,
+            "51": 0.26978,
+            "52": 0.26013,
+            "53": 0.26513,
+            "54": 0.26111,
+            "55": 0.26044,
+            "56": 0.2624,
+            "57": 0.26412,
+            "58": 0.26108,
+            "59": 0.26051,
+            "60": 0.263,
+            "61": 0.26363,
+            "62": 0.27145,
+            "63": 0.27074,
+            "64": 0.26955,
+            "65": 0.65636,
+            "66": 0.26945,
+            "67": 0.27333,
+            "68": 0.27517,
+            "69": 0.27206,
+            "70": 0.27181,
+            "71": 0.27216,
+            "72": 0.9521,
+            "73": 0.27086,
+            "74": 0.27375,
+            "75": 0.89877,
+            "76": 0.27077,
+            "77": 0.26534,
+            "78": 0.2565,
+            "79": 0.26961,
+            "80": 0.26648,
+            "81": 0.26175,
+            "82": 0.26268,
+            "83": 0.26668,
+            "84": 0.26108,
+            "85": 0.25906,
+            "86": 0.25936,
+            "87": 0.25961,
+            "88": 0.25714,
+            "89": 0.26171,
+            "90": 0.26239,
+            "91": 0.26137,
+            "92": 0.25975,
+            "93": 0.25965,
+            "94": 0.2611,
+            "95": 0.25793,
+            "96": 0.26009,
+            "97": 0.26077,
+            "98": 0.25869,
+            "99": 0.2601,
+            "100": 0.25909
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..25df8735936
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86379,
+            "52": 9.75652,
+            "53": 10.06157,
+            "54": 9.96418,
+            "55": 9.89204,
+            "56": 9.63681,
+            "57": 9.49807,
+            "58": 9.83504,
+            "59": 9.59701,
+            "60": 9.51573,
+            "61": 9.70155,
+            "62": 9.97973,
+            "63": 9.38914,
+            "64": 9.77552,
+            "65": 8.95939,
+            "66": 9.6978,
+            "67": 9.37174,
+            "68": 9.78449,
+            "69": 9.79058,
+            "70": 9.74555,
+            "71": 9.61867,
+            "72": 9.58317,
+            "73": 9.49175,
+            "74": 8.939,
+            "75": 9.41848,
+            "76": 9.07237,
+            "77": 10.06903,
+            "78": 9.72443,
+            "79": 9.3767,
+            "80": 9.40261,
+            "81": 9.47859,
+            "82": 9.6984,
+            "83": 9.30086,
+            "84": 9.41299,
+            "85": 9.61514,
+            "86": 9.07881,
+            "87": 9.59402,
+            "88": 9.74658,
+            "89": 9.60096,
+            "90": 9.81999,
+            "91": 9.32977,
+            "92": 9.35625,
+            "93": 9.07406,
+            "94": 8.82774,
+            "95": 9.51099,
+            "96": 9.52501,
+            "97": 9.3163,
+            "98": 9.67278,
+            "99": 8.88493,
+            "100": 9.39984
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2563.0,
+            "52": 2431.0,
+            "53": 2917.0,
+            "54": 2655.0,
+            "55": 2307.0,
+            "56": 2605.0,
+            "57": 2385.0,
+            "58": 2952.0,
+            "59": 2730.0,
+            "60": 2287.0,
+            "61": 2904.0,
+            "62": 2601.0,
+            "63": 2452.0,
+            "64": 2810.0,
+            "65": 2544.0,
+            "66": 2914.0,
+            "67": 2664.0,
+            "68": 2709.0,
+            "69": 2967.0,
+            "70": 3049.0,
+            "71": 2936.0,
+            "72": 2410.0,
+            "73": 2991.0,
+            "74": 1882.0,
+            "75": 2539.0,
+            "76": 3060.0,
+            "77": 3219.0,
+            "78": 3023.0,
+            "79": 3084.0,
+            "80": 3101.0,
+            "81": 3530.0,
+            "82": 3298.0,
+            "83": 2666.0,
+            "84": 3154.0,
+            "85": 3288.0,
+            "86": 2827.0,
+            "87": 3720.0,
+            "88": 3168.0,
+            "89": 3275.0,
+            "90": 3168.0,
+            "91": 2919.0,
+            "92": 3071.0,
+            "93": 2751.0,
+            "94": 3412.0,
+            "95": 3186.0,
+            "96": 3429.0,
+            "97": 3083.0,
+            "98": 3477.0,
+            "99": 3093.0,
+            "100": 3212.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 397748736.0,
+            "52": 397748736.0,
+            "53": 397748736.0,
+            "54": 397748736.0,
+            "55": 397748736.0,
+            "56": 397748736.0,
+            "57": 397748736.0,
+            "58": 397748736.0,
+            "59": 397748736.0,
+            "60": 397748736.0,
+            "61": 397748736.0,
+            "62": 397748736.0,
+            "63": 397748736.0,
+            "64": 397748736.0,
+            "65": 397748736.0,
+            "66": 397748736.0,
+            "67": 397748736.0,
+            "68": 397748736.0,
+            "69": 397748736.0,
+            "70": 397748736.0,
+            "71": 397748736.0,
+            "72": 397748736.0,
+            "73": 397748736.0,
+            "74": 397748736.0,
+            "75": 397748736.0,
+            "76": 397748736.0,
+            "77": 397748736.0,
+            "78": 397748736.0,
+            "79": 397748736.0,
+            "80": 397748736.0,
+            "81": 397748736.0,
+            "82": 397748736.0,
+            "83": 397748736.0,
+            "84": 397748736.0,
+            "85": 397748736.0,
+            "86": 397748736.0,
+            "87": 397748736.0,
+            "88": 397748736.0,
+            "89": 397748736.0,
+            "90": 397748736.0,
+            "91": 397748736.0,
+            "92": 397748736.0,
+            "93": 397748736.0,
+            "94": 397748736.0,
+            "95": 397748736.0,
+            "96": 397748736.0,
+            "97": 397748736.0,
+            "98": 397748736.0,
+            "99": 397748736.0,
+            "100": 397748736.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1179281920.0,
+            "52": 1179282944.0,
+            "53": 1179282944.0,
+            "54": 1179282944.0,
+            "55": 1179282944.0,
+            "56": 1179282944.0,
+            "57": 1179282944.0,
+            "58": 1179282944.0,
+            "59": 1179282944.0,
+            "60": 1179282944.0,
+            "61": 1179282944.0,
+            "62": 1179282944.0,
+            "63": 1179282944.0,
+            "64": 1179282944.0,
+            "65": 1179282944.0,
+            "66": 1179282944.0,
+            "67": 1179282944.0,
+            "68": 1179282944.0,
+            "69": 1179282944.0,
+            "70": 1179282944.0,
+            "71": 1179282944.0,
+            "72": 1179282944.0,
+            "73": 1179282944.0,
+            "74": 1179282944.0,
+            "75": 1179282944.0,
+            "76": 1179282944.0,
+            "77": 1179282944.0,
+            "78": 1179282944.0,
+            "79": 1179282944.0,
+            "80": 1179282944.0,
+            "81": 1179282944.0,
+            "82": 1179282944.0,
+            "83": 1179282944.0,
+            "84": 1179282944.0,
+            "85": 1179282944.0,
+            "86": 1179282944.0,
+            "87": 1179282944.0,
+            "88": 1179282944.0,
+            "89": 1179282944.0,
+            "90": 1179282944.0,
+            "91": 1179282944.0,
+            "92": 1179282944.0,
+            "93": 1179282944.0,
+            "94": 1179282944.0,
+            "95": 1179282944.0,
+            "96": 1179282944.0,
+            "97": 1179282944.0,
+            "98": 1179282944.0,
+            "99": 1179282944.0,
+            "100": 1179282944.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.28969,
+            "52": 0.28668,
+            "53": 0.25532,
+            "54": 0.25658,
+            "55": 0.25678,
+            "56": 0.25808,
+            "57": 0.25759,
+            "58": 0.2573,
+            "59": 0.25595,
+            "60": 0.25655,
+            "61": 0.25748,
+            "62": 0.25355,
+            "63": 0.25645,
+            "64": 0.25544,
+            "65": 0.25465,
+            "66": 0.25429,
+            "67": 0.25503,
+            "68": 0.25478,
+            "69": 0.25435,
+            "70": 0.25389,
+            "71": 0.25473,
+            "72": 0.254,
+            "73": 0.25451,
+            "74": 0.25381,
+            "75": 0.25278,
+            "76": 0.25503,
+            "77": 0.25251,
+            "78": 0.25271,
+            "79": 0.25524,
+            "80": 0.25494,
+            "81": 0.25321,
+            "82": 0.25436,
+            "83": 0.25713,
+            "84": 0.25332,
+            "85": 0.25392,
+            "86": 0.25232,
+            "87": 0.25246,
+            "88": 0.25419,
+            "89": 0.25306,
+            "90": 0.25417,
+            "91": 0.25642,
+            "92": 0.25493,
+            "93": 0.2529,
+            "94": 0.25478,
+            "95": 0.25685,
+            "96": 0.25271,
+            "97": 0.25387,
+            "98": 0.25551,
+            "99": 0.25384,
+            "100": 0.2519
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json
index 1091699bf9a..d4f8136d68c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 4.72553,
-            "2": 0.52446,
-            "3": 0.41527,
-            "4": 0.41699,
-            "5": 0.41496,
-            "6": 0.41411,
-            "7": 0.41829,
-            "8": 0.41655,
-            "9": 0.41643,
-            "10": 0.42008,
-            "11": 0.41959,
-            "12": 0.41842,
-            "13": 0.41485,
-            "14": 0.41643,
-            "15": 0.41486,
-            "16": 0.41617,
-            "17": 0.41476,
-            "18": 0.42598,
-            "19": 0.41821,
-            "20": 0.41457,
-            "21": 0.41579,
-            "22": 0.41438,
-            "23": 0.41644,
-            "24": 0.41499,
-            "25": 0.41537,
-            "26": 0.41593,
-            "27": 0.42875,
-            "28": 0.41636,
-            "29": 0.41505,
-            "30": 0.4148,
-            "31": 0.41806,
-            "32": 0.41549,
-            "33": 0.41482,
-            "34": 0.41559,
-            "35": 0.4156,
-            "36": 0.4152,
-            "37": 0.4152,
-            "38": 0.4154,
-            "39": 0.41674,
-            "40": 0.41745,
-            "41": 0.41582,
-            "42": 0.41548,
-            "43": 0.41428,
-            "44": 0.4158,
-            "45": 0.41469,
-            "46": 0.41584,
-            "47": 0.41662,
-            "48": 0.41588,
-            "49": 0.41548,
-            "50": 0.42504,
-            "51": 0.41857,
-            "52": 0.40985,
-            "53": 0.40877,
-            "54": 0.41013,
-            "55": 0.40869,
-            "56": 0.84381,
-            "57": 0.41437,
-            "58": 0.42502,
-            "59": 0.41122,
-            "60": 0.41956,
-            "61": 0.40996,
-            "62": 0.40983,
-            "63": 0.41144,
-            "64": 0.41126,
-            "65": 0.41361,
-            "66": 0.41243,
-            "67": 0.41431,
-            "68": 0.4396,
-            "69": 0.42434,
-            "70": 0.41269,
-            "71": 0.42108,
-            "72": 0.41357,
-            "73": 0.41116,
-            "74": 0.41086,
-            "75": 0.41041,
-            "76": 0.41106,
-            "77": 0.41,
-            "78": 0.41669,
-            "79": 0.41627,
-            "80": 0.41237,
-            "81": 0.41157,
-            "82": 0.41168,
-            "83": 0.41229,
-            "84": 0.41209,
-            "85": 0.41258,
-            "86": 0.41294,
-            "87": 0.41185,
-            "88": 0.41106,
-            "89": 0.41159,
-            "90": 0.41277,
-            "91": 0.41162,
-            "92": 0.41309,
-            "93": 0.41351,
-            "94": 0.40941,
-            "95": 0.40961,
-            "96": 0.41012,
-            "97": 0.40887,
-            "98": 0.40809,
-            "99": 0.40865,
-            "100": 0.40854
+            "1": 4.0346,
+            "2": 0.53704,
+            "3": 0.42719,
+            "4": 0.41535,
+            "5": 0.40389,
+            "6": 0.40332,
+            "7": 0.40402,
+            "8": 0.40471,
+            "9": 0.40343,
+            "10": 0.40348,
+            "11": 0.3985,
+            "12": 0.39842,
+            "13": 0.39603,
+            "14": 0.39492,
+            "15": 0.39651,
+            "16": 0.39564,
+            "17": 0.39567,
+            "18": 0.39657,
+            "19": 0.39768,
+            "20": 0.39761,
+            "21": 0.39891,
+            "22": 0.39636,
+            "23": 0.39698,
+            "24": 0.39738,
+            "25": 0.39624,
+            "26": 0.39431,
+            "27": 0.39658,
+            "28": 0.39585,
+            "29": 0.39364,
+            "30": 0.39529,
+            "31": 0.39497,
+            "32": 0.39598,
+            "33": 0.39773,
+            "34": 0.39643,
+            "35": 0.39763,
+            "36": 0.39632,
+            "37": 0.39546,
+            "38": 0.3982,
+            "39": 0.7438,
+            "40": 0.39448,
+            "41": 0.39549,
+            "42": 0.39538,
+            "43": 0.39526,
+            "44": 0.39405,
+            "45": 0.39698,
+            "46": 0.39664,
+            "47": 0.39462,
+            "48": 0.39535,
+            "49": 0.39382,
+            "50": 0.3941,
+            "51": 0.43707,
+            "52": 0.43149,
+            "53": 0.42387,
+            "54": 0.43267,
+            "55": 0.43104,
+            "56": 1.05764,
+            "57": 0.39732,
+            "58": 0.39576,
+            "59": 0.3984,
+            "60": 0.40214,
+            "61": 0.4001,
+            "62": 0.90991,
+            "63": 0.39865,
+            "64": 0.39618,
+            "65": 0.39554,
+            "66": 0.79331,
+            "67": 0.39478,
+            "68": 0.39551,
+            "69": 0.39587,
+            "70": 0.39669,
+            "71": 0.39593,
+            "72": 0.93958,
+            "73": 0.39773,
+            "74": 0.39717,
+            "75": 0.3961,
+            "76": 0.39596,
+            "77": 0.39649,
+            "78": 0.39584,
+            "79": 0.39596,
+            "80": 0.39568,
+            "81": 0.39433,
+            "82": 0.39598,
+            "83": 0.39548,
+            "84": 0.39563,
+            "85": 0.39555,
+            "86": 0.39811,
+            "87": 0.39515,
+            "88": 0.39682,
+            "89": 0.39662,
+            "90": 0.39566,
+            "91": 0.39589,
+            "92": 0.39584,
+            "93": 0.39725,
+            "94": 0.39593,
+            "95": 0.39495,
+            "96": 0.39495,
+            "97": 0.39567,
+            "98": 0.39566,
+            "99": 0.3973,
+            "100": 0.39539
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..680d04eb6a6
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8567,
+            "52": 9.75178,
+            "53": 10.07652,
+            "54": 9.96084,
+            "55": 9.88221,
+            "56": 9.63206,
+            "57": 9.49147,
+            "58": 9.83408,
+            "59": 9.59352,
+            "60": 9.51388,
+            "61": 9.69802,
+            "62": 9.99154,
+            "63": 9.3723,
+            "64": 9.77839,
+            "65": 8.95518,
+            "66": 9.70976,
+            "67": 9.38198,
+            "68": 9.78701,
+            "69": 9.793,
+            "70": 9.73033,
+            "71": 9.61752,
+            "72": 9.58459,
+            "73": 9.48958,
+            "74": 8.94015,
+            "75": 9.43092,
+            "76": 9.09168,
+            "77": 10.06222,
+            "78": 9.72696,
+            "79": 9.37408,
+            "80": 9.40676,
+            "81": 9.47995,
+            "82": 9.69225,
+            "83": 9.31299,
+            "84": 9.41921,
+            "85": 9.61096,
+            "86": 9.06853,
+            "87": 9.59119,
+            "88": 9.74582,
+            "89": 9.60624,
+            "90": 9.81746,
+            "91": 9.34247,
+            "92": 9.35856,
+            "93": 9.07894,
+            "94": 8.82753,
+            "95": 9.51606,
+            "96": 9.52063,
+            "97": 9.31097,
+            "98": 9.67055,
+            "99": 8.88626,
+            "100": 9.40485
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2829.0,
+            "52": 2809.0,
+            "53": 3230.0,
+            "54": 2864.0,
+            "55": 2706.0,
+            "56": 2917.0,
+            "57": 2529.0,
+            "58": 3339.0,
+            "59": 3051.0,
+            "60": 2623.0,
+            "61": 3287.0,
+            "62": 2913.0,
+            "63": 2639.0,
+            "64": 3154.0,
+            "65": 2856.0,
+            "66": 3465.0,
+            "67": 2934.0,
+            "68": 2985.0,
+            "69": 3298.0,
+            "70": 3653.0,
+            "71": 3260.0,
+            "72": 2684.0,
+            "73": 3232.0,
+            "74": 2191.0,
+            "75": 2766.0,
+            "76": 3335.0,
+            "77": 3793.0,
+            "78": 3608.0,
+            "79": 3384.0,
+            "80": 3782.0,
+            "81": 3969.0,
+            "82": 3640.0,
+            "83": 3237.0,
+            "84": 3606.0,
+            "85": 3553.0,
+            "86": 3160.0,
+            "87": 4130.0,
+            "88": 3430.0,
+            "89": 3818.0,
+            "90": 3363.0,
+            "91": 3041.0,
+            "92": 3524.0,
+            "93": 3060.0,
+            "94": 3575.0,
+            "95": 3463.0,
+            "96": 3921.0,
+            "97": 3597.0,
+            "98": 4039.0,
+            "99": 3435.0,
+            "100": 3548.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 368387584.0,
+            "52": 368387584.0,
+            "53": 368387584.0,
+            "54": 368387584.0,
+            "55": 368387584.0,
+            "56": 368387584.0,
+            "57": 368387584.0,
+            "58": 368387584.0,
+            "59": 368387584.0,
+            "60": 368387584.0,
+            "61": 368387584.0,
+            "62": 368387584.0,
+            "63": 368387584.0,
+            "64": 368387584.0,
+            "65": 368387584.0,
+            "66": 368387584.0,
+            "67": 368387584.0,
+            "68": 368387584.0,
+            "69": 368387584.0,
+            "70": 368387584.0,
+            "71": 368387584.0,
+            "72": 368387584.0,
+            "73": 368387584.0,
+            "74": 368387584.0,
+            "75": 368387584.0,
+            "76": 368387584.0,
+            "77": 368387584.0,
+            "78": 368387584.0,
+            "79": 368387584.0,
+            "80": 368387584.0,
+            "81": 368387584.0,
+            "82": 368387584.0,
+            "83": 368387584.0,
+            "84": 368387584.0,
+            "85": 368387584.0,
+            "86": 368387584.0,
+            "87": 368387584.0,
+            "88": 368387584.0,
+            "89": 368387584.0,
+            "90": 368387584.0,
+            "91": 368387584.0,
+            "92": 368387584.0,
+            "93": 368387584.0,
+            "94": 368387584.0,
+            "95": 368387584.0,
+            "96": 368387584.0,
+            "97": 368387584.0,
+            "98": 368387584.0,
+            "99": 368387584.0,
+            "100": 368387584.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1162110464.0,
+            "52": 1162111488.0,
+            "53": 1162111488.0,
+            "54": 1162111488.0,
+            "55": 1162111488.0,
+            "56": 1162111488.0,
+            "57": 1162111488.0,
+            "58": 1162111488.0,
+            "59": 1162111488.0,
+            "60": 1162111488.0,
+            "61": 1162111488.0,
+            "62": 1162111488.0,
+            "63": 1162111488.0,
+            "64": 1162111488.0,
+            "65": 1162111488.0,
+            "66": 1162111488.0,
+            "67": 1162111488.0,
+            "68": 1162111488.0,
+            "69": 1162111488.0,
+            "70": 1162111488.0,
+            "71": 1162111488.0,
+            "72": 1162111488.0,
+            "73": 1162111488.0,
+            "74": 1162111488.0,
+            "75": 1162111488.0,
+            "76": 1162111488.0,
+            "77": 1162111488.0,
+            "78": 1162111488.0,
+            "79": 1162111488.0,
+            "80": 1162111488.0,
+            "81": 1162111488.0,
+            "82": 1162111488.0,
+            "83": 1162111488.0,
+            "84": 1162111488.0,
+            "85": 1162111488.0,
+            "86": 1162111488.0,
+            "87": 1162111488.0,
+            "88": 1162111488.0,
+            "89": 1162111488.0,
+            "90": 1162111488.0,
+            "91": 1162111488.0,
+            "92": 1162111488.0,
+            "93": 1162111488.0,
+            "94": 1162111488.0,
+            "95": 1162111488.0,
+            "96": 1162111488.0,
+            "97": 1162111488.0,
+            "98": 1162111488.0,
+            "99": 1162111488.0,
+            "100": 1162111488.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6.59174,
+            "52": 0.42614,
+            "53": 0.39758,
+            "54": 0.39842,
+            "55": 0.39876,
+            "56": 0.39663,
+            "57": 0.39728,
+            "58": 0.39765,
+            "59": 0.39654,
+            "60": 0.39664,
+            "61": 0.3959,
+            "62": 0.39703,
+            "63": 0.39487,
+            "64": 0.39391,
+            "65": 0.3946,
+            "66": 0.39321,
+            "67": 0.39339,
+            "68": 0.39323,
+            "69": 0.39386,
+            "70": 0.39664,
+            "71": 0.39421,
+            "72": 0.39561,
+            "73": 0.3947,
+            "74": 0.3944,
+            "75": 0.39483,
+            "76": 0.39467,
+            "77": 0.39476,
+            "78": 0.39408,
+            "79": 0.395,
+            "80": 0.39426,
+            "81": 0.39421,
+            "82": 0.39474,
+            "83": 0.39376,
+            "84": 0.39492,
+            "85": 0.39449,
+            "86": 0.39328,
+            "87": 0.39468,
+            "88": 0.39375,
+            "89": 0.39395,
+            "90": 0.39427,
+            "91": 0.39417,
+            "92": 0.39443,
+            "93": 0.39424,
+            "94": 0.39416,
+            "95": 0.39486,
+            "96": 0.39653,
+            "97": 0.39395,
+            "98": 0.39533,
+            "99": 0.39459,
+            "100": 0.39587
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml
index 0f842738f62..755e9ba49e9 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..771860e086a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.90251,
+            "2": 10.9138,
+            "3": 10.90169,
+            "4": 10.90727,
+            "5": 10.90448,
+            "6": 10.91653,
+            "7": 10.9127,
+            "8": 10.89505,
+            "9": 10.91558,
+            "10": 10.87283,
+            "11": 10.90373,
+            "12": 10.904,
+            "13": 10.91828,
+            "14": 10.9074,
+            "15": 10.87552,
+            "16": 10.85479,
+            "17": 10.83186,
+            "18": 10.84055,
+            "19": 10.84215,
+            "20": 10.75044,
+            "21": 10.73632,
+            "22": 10.62985,
+            "23": 10.74027,
+            "24": 10.64442,
+            "25": 10.60239,
+            "26": 10.64921,
+            "27": 10.64076,
+            "28": 10.5875,
+            "29": 10.59166,
+            "30": 10.38964,
+            "31": 10.18179,
+            "32": 10.49231,
+            "33": 10.48774,
+            "34": 10.26318,
+            "35": 10.29231,
+            "36": 10.25552,
+            "37": 10.37375,
+            "38": 10.23555,
+            "39": 10.42349,
+            "40": 10.10951,
+            "41": 10.1753,
+            "42": 10.23159,
+            "43": 9.87321,
+            "44": 9.99176,
+            "45": 9.8665,
+            "46": 9.84548,
+            "47": 10.17372,
+            "48": 9.87147,
+            "49": 9.55758,
+            "50": 9.92554,
+            "51": 9.87398,
+            "52": 9.76585,
+            "53": 10.0827,
+            "54": 9.97272,
+            "55": 9.90734,
+            "56": 9.64218,
+            "57": 9.48857,
+            "58": 9.8427,
+            "59": 9.60111,
+            "60": 9.52012,
+            "61": 9.70057,
+            "62": 9.99645,
+            "63": 9.39065,
+            "64": 9.77613,
+            "65": 8.96632,
+            "66": 9.70945,
+            "67": 9.38769,
+            "68": 9.78891,
+            "69": 9.80803,
+            "70": 9.74237,
+            "71": 9.63381,
+            "72": 9.59118,
+            "73": 9.50696,
+            "74": 8.94245,
+            "75": 9.42902,
+            "76": 9.0883,
+            "77": 10.07151,
+            "78": 9.72685,
+            "79": 9.38721,
+            "80": 9.4057,
+            "81": 9.48702,
+            "82": 9.70482,
+            "83": 9.31556,
+            "84": 9.42107,
+            "85": 9.61466,
+            "86": 9.08461,
+            "87": 9.59902,
+            "88": 9.75368,
+            "89": 9.60598,
+            "90": 9.83154,
+            "91": 9.33878,
+            "92": 9.36034,
+            "93": 9.09038,
+            "94": 8.83714,
+            "95": 9.53804,
+            "96": 9.5339,
+            "97": 9.31316,
+            "98": 9.67422,
+            "99": 8.90347,
+            "100": 9.41497
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1558.0,
+            "2": 1700.0,
+            "3": 1576.0,
+            "4": 1748.0,
+            "5": 1809.0,
+            "6": 1822.0,
+            "7": 1801.0,
+            "8": 1568.0,
+            "9": 1788.0,
+            "10": 1395.0,
+            "11": 1909.0,
+            "12": 1795.0,
+            "13": 1860.0,
+            "14": 1765.0,
+            "15": 1878.0,
+            "16": 1938.0,
+            "17": 1760.0,
+            "18": 1708.0,
+            "19": 1744.0,
+            "20": 1706.0,
+            "21": 1825.0,
+            "22": 1656.0,
+            "23": 1933.0,
+            "24": 1615.0,
+            "25": 1607.0,
+            "26": 1586.0,
+            "27": 1799.0,
+            "28": 1802.0,
+            "29": 1886.0,
+            "30": 1885.0,
+            "31": 1529.0,
+            "32": 1823.0,
+            "33": 1998.0,
+            "34": 1760.0,
+            "35": 1891.0,
+            "36": 1999.0,
+            "37": 2340.0,
+            "38": 2149.0,
+            "39": 2308.0,
+            "40": 2231.0,
+            "41": 2153.0,
+            "42": 2285.0,
+            "43": 1916.0,
+            "44": 2014.0,
+            "45": 1914.0,
+            "46": 2192.0,
+            "47": 2491.0,
+            "48": 2179.0,
+            "49": 2231.0,
+            "50": 2285.0,
+            "51": 2371.0,
+            "52": 2512.0,
+            "53": 2624.0,
+            "54": 2501.0,
+            "55": 2218.0,
+            "56": 2649.0,
+            "57": 2213.0,
+            "58": 2763.0,
+            "59": 2526.0,
+            "60": 2261.0,
+            "61": 2835.0,
+            "62": 2497.0,
+            "63": 2406.0,
+            "64": 2736.0,
+            "65": 2546.0,
+            "66": 2800.0,
+            "67": 2572.0,
+            "68": 2686.0,
+            "69": 2768.0,
+            "70": 2992.0,
+            "71": 2834.0,
+            "72": 2391.0,
+            "73": 2816.0,
+            "74": 1859.0,
+            "75": 2369.0,
+            "76": 2849.0,
+            "77": 3224.0,
+            "78": 3000.0,
+            "79": 3139.0,
+            "80": 3215.0,
+            "81": 3443.0,
+            "82": 3149.0,
+            "83": 2715.0,
+            "84": 3170.0,
+            "85": 3313.0,
+            "86": 2748.0,
+            "87": 3534.0,
+            "88": 3004.0,
+            "89": 3336.0,
+            "90": 3117.0,
+            "91": 2912.0,
+            "92": 3082.0,
+            "93": 2671.0,
+            "94": 3380.0,
+            "95": 3185.0,
+            "96": 3513.0,
+            "97": 3137.0,
+            "98": 3523.0,
+            "99": 3099.0,
+            "100": 3148.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 261759488.0,
+            "2": 261759488.0,
+            "3": 261759488.0,
+            "4": 261759488.0,
+            "5": 261759488.0,
+            "6": 261759488.0,
+            "7": 261759488.0,
+            "8": 261759488.0,
+            "9": 261759488.0,
+            "10": 261759488.0,
+            "11": 261759488.0,
+            "12": 261759488.0,
+            "13": 261759488.0,
+            "14": 261759488.0,
+            "15": 261759488.0,
+            "16": 261759488.0,
+            "17": 261759488.0,
+            "18": 261759488.0,
+            "19": 261759488.0,
+            "20": 261759488.0,
+            "21": 261759488.0,
+            "22": 261759488.0,
+            "23": 261759488.0,
+            "24": 261759488.0,
+            "25": 261759488.0,
+            "26": 261759488.0,
+            "27": 261759488.0,
+            "28": 261759488.0,
+            "29": 261759488.0,
+            "30": 261759488.0,
+            "31": 261759488.0,
+            "32": 261759488.0,
+            "33": 261759488.0,
+            "34": 261759488.0,
+            "35": 261759488.0,
+            "36": 261759488.0,
+            "37": 261759488.0,
+            "38": 261759488.0,
+            "39": 261759488.0,
+            "40": 261759488.0,
+            "41": 261759488.0,
+            "42": 261759488.0,
+            "43": 261759488.0,
+            "44": 261759488.0,
+            "45": 261759488.0,
+            "46": 261759488.0,
+            "47": 261759488.0,
+            "48": 261759488.0,
+            "49": 261759488.0,
+            "50": 261759488.0,
+            "51": 261759488.0,
+            "52": 261759488.0,
+            "53": 261759488.0,
+            "54": 261759488.0,
+            "55": 261759488.0,
+            "56": 261759488.0,
+            "57": 261759488.0,
+            "58": 261759488.0,
+            "59": 261759488.0,
+            "60": 261759488.0,
+            "61": 261759488.0,
+            "62": 261759488.0,
+            "63": 261759488.0,
+            "64": 261759488.0,
+            "65": 261759488.0,
+            "66": 261759488.0,
+            "67": 261759488.0,
+            "68": 261759488.0,
+            "69": 261759488.0,
+            "70": 261759488.0,
+            "71": 261759488.0,
+            "72": 261759488.0,
+            "73": 261759488.0,
+            "74": 261759488.0,
+            "75": 261759488.0,
+            "76": 261759488.0,
+            "77": 261759488.0,
+            "78": 261759488.0,
+            "79": 261759488.0,
+            "80": 261759488.0,
+            "81": 261759488.0,
+            "82": 261759488.0,
+            "83": 261759488.0,
+            "84": 261759488.0,
+            "85": 261759488.0,
+            "86": 261759488.0,
+            "87": 261759488.0,
+            "88": 261759488.0,
+            "89": 261759488.0,
+            "90": 261759488.0,
+            "91": 261759488.0,
+            "92": 261759488.0,
+            "93": 261759488.0,
+            "94": 261759488.0,
+            "95": 261759488.0,
+            "96": 261759488.0,
+            "97": 261759488.0,
+            "98": 261759488.0,
+            "99": 261759488.0,
+            "100": 261759488.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 704366080.0,
+            "2": 794219008.0,
+            "3": 794219008.0,
+            "4": 794219008.0,
+            "5": 794219008.0,
+            "6": 794219008.0,
+            "7": 794219008.0,
+            "8": 794219008.0,
+            "9": 794219008.0,
+            "10": 794219008.0,
+            "11": 794219008.0,
+            "12": 794219008.0,
+            "13": 794219008.0,
+            "14": 794219008.0,
+            "15": 794219008.0,
+            "16": 794219008.0,
+            "17": 794219008.0,
+            "18": 794219008.0,
+            "19": 794219008.0,
+            "20": 794219008.0,
+            "21": 794219008.0,
+            "22": 794219008.0,
+            "23": 794219008.0,
+            "24": 794219008.0,
+            "25": 794219008.0,
+            "26": 794219008.0,
+            "27": 794219008.0,
+            "28": 794219008.0,
+            "29": 794219008.0,
+            "30": 794219008.0,
+            "31": 794219008.0,
+            "32": 794219008.0,
+            "33": 794219008.0,
+            "34": 794219008.0,
+            "35": 794219008.0,
+            "36": 794219008.0,
+            "37": 794219008.0,
+            "38": 794219008.0,
+            "39": 794219008.0,
+            "40": 794219008.0,
+            "41": 794219008.0,
+            "42": 794219008.0,
+            "43": 794219008.0,
+            "44": 794219008.0,
+            "45": 794219008.0,
+            "46": 794219008.0,
+            "47": 794219008.0,
+            "48": 794219008.0,
+            "49": 794219008.0,
+            "50": 794219008.0,
+            "51": 794219008.0,
+            "52": 794219008.0,
+            "53": 794219008.0,
+            "54": 794219008.0,
+            "55": 794219008.0,
+            "56": 794219008.0,
+            "57": 794219008.0,
+            "58": 794219008.0,
+            "59": 794219008.0,
+            "60": 794219008.0,
+            "61": 794219008.0,
+            "62": 794219008.0,
+            "63": 794219008.0,
+            "64": 794219008.0,
+            "65": 794219008.0,
+            "66": 794219008.0,
+            "67": 794219008.0,
+            "68": 794219008.0,
+            "69": 794219008.0,
+            "70": 794219008.0,
+            "71": 794219008.0,
+            "72": 794219008.0,
+            "73": 794219008.0,
+            "74": 794219008.0,
+            "75": 794219008.0,
+            "76": 794219008.0,
+            "77": 794219008.0,
+            "78": 794219008.0,
+            "79": 794219008.0,
+            "80": 794219008.0,
+            "81": 794219008.0,
+            "82": 794219008.0,
+            "83": 794219008.0,
+            "84": 794219008.0,
+            "85": 794219008.0,
+            "86": 794219008.0,
+            "87": 794219008.0,
+            "88": 794219008.0,
+            "89": 794219008.0,
+            "90": 794219008.0,
+            "91": 794219008.0,
+            "92": 794219008.0,
+            "93": 794219008.0,
+            "94": 794219008.0,
+            "95": 794219008.0,
+            "96": 794219008.0,
+            "97": 794219008.0,
+            "98": 794219008.0,
+            "99": 794219008.0,
+            "100": 794219008.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.24816,
+            "3": 0.43688,
+            "4": 0.27843,
+            "5": 0.27555,
+            "6": 0.27709,
+            "7": 0.277,
+            "8": 0.27722,
+            "9": 0.27493,
+            "10": 0.277,
+            "11": 0.27605,
+            "12": 0.27617,
+            "13": 0.27539,
+            "14": 0.27709,
+            "15": 0.27461,
+            "16": 0.27313,
+            "17": 0.27396,
+            "18": 0.27419,
+            "19": 0.2729,
+            "20": 0.27386,
+            "21": 0.27451,
+            "22": 0.27496,
+            "23": 0.27291,
+            "24": 0.27491,
+            "25": 0.2752,
+            "26": 0.27531,
+            "27": 0.27661,
+            "28": 0.27544,
+            "29": 0.27432,
+            "30": 0.27338,
+            "31": 0.2738,
+            "32": 0.27312,
+            "33": 0.2732,
+            "34": 0.27439,
+            "35": 0.2727,
+            "36": 0.27455,
+            "37": 0.27368,
+            "38": 0.27316,
+            "39": 0.29032,
+            "40": 0.27694,
+            "41": 0.27622,
+            "42": 0.28477,
+            "43": 0.27626,
+            "44": 0.27624,
+            "45": 0.27486,
+            "46": 0.27565,
+            "47": 0.2747,
+            "48": 0.2742,
+            "49": 0.2754,
+            "50": 0.27741,
+            "51": 0.30004,
+            "52": 0.27365,
+            "53": 0.27134,
+            "54": 0.27213,
+            "55": 0.27342,
+            "56": 0.27158,
+            "57": 0.27123,
+            "58": 0.27216,
+            "59": 0.27272,
+            "60": 0.27106,
+            "61": 0.2721,
+            "62": 0.27338,
+            "63": 0.2716,
+            "64": 0.27194,
+            "65": 0.27219,
+            "66": 0.27183,
+            "67": 0.2734,
+            "68": 0.27398,
+            "69": 0.27633,
+            "70": 0.27236,
+            "71": 0.27322,
+            "72": 0.27105,
+            "73": 0.27181,
+            "74": 0.27247,
+            "75": 0.27172,
+            "76": 0.27237,
+            "77": 0.2696,
+            "78": 0.2681,
+            "79": 0.26821,
+            "80": 0.26803,
+            "81": 0.27079,
+            "82": 0.27045,
+            "83": 0.27549,
+            "84": 0.27227,
+            "85": 0.27313,
+            "86": 0.27337,
+            "87": 0.27499,
+            "88": 0.2754,
+            "89": 0.2753,
+            "90": 0.2744,
+            "91": 0.27474,
+            "92": 0.27214,
+            "93": 0.27687,
+            "94": 0.27473,
+            "95": 0.27478,
+            "96": 0.27394,
+            "97": 0.27801,
+            "98": 0.27283,
+            "99": 0.27237,
+            "100": 0.27512
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..3c3511a921b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.87398,
+            "52": 9.76584,
+            "53": 10.08272,
+            "54": 9.97273,
+            "55": 9.90736,
+            "56": 9.64213,
+            "57": 9.48856,
+            "58": 9.84268,
+            "59": 9.60111,
+            "60": 9.52013,
+            "61": 9.70058,
+            "62": 9.99642,
+            "63": 9.39067,
+            "64": 9.77612,
+            "65": 8.96637,
+            "66": 9.70949,
+            "67": 9.38771,
+            "68": 9.78893,
+            "69": 9.808,
+            "70": 9.74238,
+            "71": 9.63382,
+            "72": 9.59116,
+            "73": 9.50694,
+            "74": 8.94251,
+            "75": 9.42898,
+            "76": 9.0883,
+            "77": 10.07153,
+            "78": 9.72682,
+            "79": 9.38722,
+            "80": 9.40571,
+            "81": 9.48701,
+            "82": 9.70484,
+            "83": 9.31557,
+            "84": 9.42111,
+            "85": 9.61463,
+            "86": 9.08465,
+            "87": 9.59904,
+            "88": 9.75367,
+            "89": 9.606,
+            "90": 9.83155,
+            "91": 9.3388,
+            "92": 9.36037,
+            "93": 9.09036,
+            "94": 8.83711,
+            "95": 9.53804,
+            "96": 9.53392,
+            "97": 9.3132,
+            "98": 9.67422,
+            "99": 8.90347,
+            "100": 9.415
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2423.0,
+            "52": 2500.0,
+            "53": 2764.0,
+            "54": 2630.0,
+            "55": 2192.0,
+            "56": 2558.0,
+            "57": 2261.0,
+            "58": 2856.0,
+            "59": 2691.0,
+            "60": 2271.0,
+            "61": 2730.0,
+            "62": 2517.0,
+            "63": 2411.0,
+            "64": 2842.0,
+            "65": 2476.0,
+            "66": 2911.0,
+            "67": 2596.0,
+            "68": 2658.0,
+            "69": 2791.0,
+            "70": 3035.0,
+            "71": 2882.0,
+            "72": 2367.0,
+            "73": 2798.0,
+            "74": 1871.0,
+            "75": 2433.0,
+            "76": 2936.0,
+            "77": 3145.0,
+            "78": 2937.0,
+            "79": 2949.0,
+            "80": 3208.0,
+            "81": 3626.0,
+            "82": 3215.0,
+            "83": 2746.0,
+            "84": 3128.0,
+            "85": 3291.0,
+            "86": 2686.0,
+            "87": 3535.0,
+            "88": 2983.0,
+            "89": 3431.0,
+            "90": 3105.0,
+            "91": 2840.0,
+            "92": 3101.0,
+            "93": 2561.0,
+            "94": 3334.0,
+            "95": 3249.0,
+            "96": 3468.0,
+            "97": 3077.0,
+            "98": 3515.0,
+            "99": 3067.0,
+            "100": 3131.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 261759488.0,
+            "52": 261759488.0,
+            "53": 261759488.0,
+            "54": 261759488.0,
+            "55": 261759488.0,
+            "56": 261759488.0,
+            "57": 261759488.0,
+            "58": 261759488.0,
+            "59": 261759488.0,
+            "60": 261759488.0,
+            "61": 261759488.0,
+            "62": 261759488.0,
+            "63": 261759488.0,
+            "64": 261759488.0,
+            "65": 261759488.0,
+            "66": 261759488.0,
+            "67": 261759488.0,
+            "68": 261759488.0,
+            "69": 261759488.0,
+            "70": 261759488.0,
+            "71": 261759488.0,
+            "72": 261759488.0,
+            "73": 261759488.0,
+            "74": 261759488.0,
+            "75": 261759488.0,
+            "76": 261759488.0,
+            "77": 261759488.0,
+            "78": 261759488.0,
+            "79": 261759488.0,
+            "80": 261759488.0,
+            "81": 261759488.0,
+            "82": 261759488.0,
+            "83": 261759488.0,
+            "84": 261759488.0,
+            "85": 261759488.0,
+            "86": 261759488.0,
+            "87": 261759488.0,
+            "88": 261759488.0,
+            "89": 261759488.0,
+            "90": 261759488.0,
+            "91": 261759488.0,
+            "92": 261759488.0,
+            "93": 261759488.0,
+            "94": 261759488.0,
+            "95": 261759488.0,
+            "96": 261759488.0,
+            "97": 261759488.0,
+            "98": 261759488.0,
+            "99": 261759488.0,
+            "100": 261759488.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 795135488.0,
+            "52": 795136512.0,
+            "53": 795136512.0,
+            "54": 795136512.0,
+            "55": 795136512.0,
+            "56": 795136512.0,
+            "57": 795136512.0,
+            "58": 795136512.0,
+            "59": 795136512.0,
+            "60": 795136512.0,
+            "61": 795136512.0,
+            "62": 795136512.0,
+            "63": 795136512.0,
+            "64": 795136512.0,
+            "65": 795136512.0,
+            "66": 795136512.0,
+            "67": 795136512.0,
+            "68": 795136512.0,
+            "69": 795136512.0,
+            "70": 795136512.0,
+            "71": 795136512.0,
+            "72": 795136512.0,
+            "73": 795136512.0,
+            "74": 795136512.0,
+            "75": 795136512.0,
+            "76": 795136512.0,
+            "77": 795136512.0,
+            "78": 795136512.0,
+            "79": 795136512.0,
+            "80": 795136512.0,
+            "81": 795136512.0,
+            "82": 795136512.0,
+            "83": 795136512.0,
+            "84": 795136512.0,
+            "85": 795136512.0,
+            "86": 795136512.0,
+            "87": 795136512.0,
+            "88": 795136512.0,
+            "89": 795136512.0,
+            "90": 795136512.0,
+            "91": 795136512.0,
+            "92": 795136512.0,
+            "93": 795136512.0,
+            "94": 795136512.0,
+            "95": 795136512.0,
+            "96": 795136512.0,
+            "97": 795136512.0,
+            "98": 795136512.0,
+            "99": 795136512.0,
+            "100": 795136512.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 3.36268,
+            "53": 0.28919,
+            "54": 0.2725,
+            "55": 0.27972,
+            "56": 0.2728,
+            "57": 0.27382,
+            "58": 0.27288,
+            "59": 0.27294,
+            "60": 0.27575,
+            "61": 0.27075,
+            "62": 0.27057,
+            "63": 0.27211,
+            "64": 0.26991,
+            "65": 0.27298,
+            "66": 0.27045,
+            "67": 0.27231,
+            "68": 0.27315,
+            "69": 0.26969,
+            "70": 0.27037,
+            "71": 0.27028,
+            "72": 0.27191,
+            "73": 0.2714,
+            "74": 0.27082,
+            "75": 0.2722,
+            "76": 0.27153,
+            "77": 0.27331,
+            "78": 0.27142,
+            "79": 0.27368,
+            "80": 0.27144,
+            "81": 0.26895,
+            "82": 0.27139,
+            "83": 0.26946,
+            "84": 0.27033,
+            "85": 0.2702,
+            "86": 0.26955,
+            "87": 0.2686,
+            "88": 0.27213,
+            "89": 0.2709,
+            "90": 0.27061,
+            "91": 0.27274,
+            "92": 0.26989,
+            "93": 0.27031,
+            "94": 0.27054,
+            "95": 0.269,
+            "96": 0.27187,
+            "97": 0.26915,
+            "98": 0.2696,
+            "99": 0.27075,
+            "100": 0.26802
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json
index 5c404dad658..fd1d245462e 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 312352256.0,
-            "2": 312352256.0,
-            "3": 312352256.0,
-            "4": 312352256.0,
-            "5": 312352256.0,
-            "6": 312352256.0,
-            "7": 312352256.0,
-            "8": 312352256.0,
-            "9": 312352256.0,
-            "10": 312352256.0,
-            "11": 312352256.0,
-            "12": 312352256.0,
-            "13": 312352256.0,
-            "14": 312352256.0,
-            "15": 312352256.0,
-            "16": 312352256.0,
-            "17": 312352256.0,
-            "18": 312352256.0,
-            "19": 312352256.0,
-            "20": 312352256.0,
-            "21": 312352256.0,
-            "22": 312352256.0,
-            "23": 312352256.0,
-            "24": 312352256.0,
-            "25": 312352256.0,
-            "26": 312352256.0,
-            "27": 312352256.0,
-            "28": 312352256.0,
-            "29": 312352256.0,
-            "30": 312352256.0,
-            "31": 312352256.0,
-            "32": 312352256.0,
-            "33": 312352256.0,
-            "34": 312352256.0,
-            "35": 312352256.0,
-            "36": 312352256.0,
-            "37": 312352256.0,
-            "38": 312352256.0,
-            "39": 312352256.0,
-            "40": 312352256.0,
-            "41": 312352256.0,
-            "42": 312352256.0,
-            "43": 312352256.0,
-            "44": 312352256.0,
-            "45": 312352256.0,
-            "46": 312352256.0,
-            "47": 312352256.0,
-            "48": 312352256.0,
-            "49": 312352256.0,
-            "50": 312352256.0,
-            "51": 312352256.0,
-            "52": 312352256.0,
-            "53": 312352256.0,
-            "54": 312352256.0,
-            "55": 312352256.0,
-            "56": 312352256.0,
-            "57": 312352256.0,
-            "58": 312352256.0,
-            "59": 312352256.0,
-            "60": 312352256.0,
-            "61": 312352256.0,
-            "62": 312352256.0,
-            "63": 312352256.0,
-            "64": 312352256.0,
-            "65": 312352256.0,
-            "66": 312352256.0,
-            "67": 312352256.0,
-            "68": 312352256.0,
-            "69": 312352256.0,
-            "70": 312352256.0,
-            "71": 312352256.0,
-            "72": 312352256.0,
-            "73": 312352256.0,
-            "74": 312352256.0,
-            "75": 312352256.0,
-            "76": 312352256.0,
-            "77": 312352256.0,
-            "78": 312352256.0,
-            "79": 312352256.0,
-            "80": 312352256.0,
-            "81": 312352256.0,
-            "82": 312352256.0,
-            "83": 312352256.0,
-            "84": 312352256.0,
-            "85": 312352256.0,
-            "86": 312352256.0,
-            "87": 312352256.0,
-            "88": 312352256.0,
-            "89": 312352256.0,
-            "90": 312352256.0,
-            "91": 312352256.0,
-            "92": 312352256.0,
-            "93": 312352256.0,
-            "94": 312352256.0,
-            "95": 312352256.0,
-            "96": 312352256.0,
-            "97": 312352256.0,
-            "98": 312352256.0,
-            "99": 312352256.0,
-            "100": 312352256.0
+            "1": 311828992.0,
+            "2": 311828992.0,
+            "3": 311828992.0,
+            "4": 311828992.0,
+            "5": 311828992.0,
+            "6": 311828992.0,
+            "7": 311828992.0,
+            "8": 311828992.0,
+            "9": 311828992.0,
+            "10": 311828992.0,
+            "11": 311828992.0,
+            "12": 311828992.0,
+            "13": 311828992.0,
+            "14": 311828992.0,
+            "15": 311828992.0,
+            "16": 311828992.0,
+            "17": 311828992.0,
+            "18": 311828992.0,
+            "19": 311828992.0,
+            "20": 311828992.0,
+            "21": 311828992.0,
+            "22": 311828992.0,
+            "23": 311828992.0,
+            "24": 311828992.0,
+            "25": 311828992.0,
+            "26": 311828992.0,
+            "27": 311828992.0,
+            "28": 311828992.0,
+            "29": 311828992.0,
+            "30": 311828992.0,
+            "31": 311828992.0,
+            "32": 311828992.0,
+            "33": 311828992.0,
+            "34": 311828992.0,
+            "35": 311828992.0,
+            "36": 311828992.0,
+            "37": 311828992.0,
+            "38": 311828992.0,
+            "39": 311828992.0,
+            "40": 311828992.0,
+            "41": 311828992.0,
+            "42": 311828992.0,
+            "43": 311828992.0,
+            "44": 311828992.0,
+            "45": 311828992.0,
+            "46": 311828992.0,
+            "47": 311828992.0,
+            "48": 311828992.0,
+            "49": 311828992.0,
+            "50": 311828992.0,
+            "51": 311828992.0,
+            "52": 311828992.0,
+            "53": 311828992.0,
+            "54": 311828992.0,
+            "55": 311828992.0,
+            "56": 311828992.0,
+            "57": 311828992.0,
+            "58": 311828992.0,
+            "59": 311828992.0,
+            "60": 311828992.0,
+            "61": 311828992.0,
+            "62": 311828992.0,
+            "63": 311828992.0,
+            "64": 311828992.0,
+            "65": 311828992.0,
+            "66": 311828992.0,
+            "67": 311828992.0,
+            "68": 311828992.0,
+            "69": 311828992.0,
+            "70": 311828992.0,
+            "71": 311828992.0,
+            "72": 311828992.0,
+            "73": 311828992.0,
+            "74": 311828992.0,
+            "75": 311828992.0,
+            "76": 311828992.0,
+            "77": 311828992.0,
+            "78": 311828992.0,
+            "79": 311828992.0,
+            "80": 311828992.0,
+            "81": 311828992.0,
+            "82": 311828992.0,
+            "83": 311828992.0,
+            "84": 311828992.0,
+            "85": 311828992.0,
+            "86": 311828992.0,
+            "87": 311828992.0,
+            "88": 311828992.0,
+            "89": 311828992.0,
+            "90": 311828992.0,
+            "91": 311828992.0,
+            "92": 311828992.0,
+            "93": 311828992.0,
+            "94": 311828992.0,
+            "95": 311828992.0,
+            "96": 311828992.0,
+            "97": 311828992.0,
+            "98": 311828992.0,
+            "99": 311828992.0,
+            "100": 311828992.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 754434560.0,
-            "2": 843763200.0,
-            "3": 843763200.0,
-            "4": 843763200.0,
-            "5": 843763200.0,
-            "6": 843763200.0,
-            "7": 843763200.0,
-            "8": 843763200.0,
-            "9": 843763200.0,
-            "10": 843763200.0,
-            "11": 843763200.0,
-            "12": 843763200.0,
-            "13": 843763200.0,
-            "14": 843763200.0,
-            "15": 843763200.0,
-            "16": 843763200.0,
-            "17": 843763200.0,
-            "18": 843763200.0,
-            "19": 843763200.0,
-            "20": 843763200.0,
-            "21": 843763200.0,
-            "22": 843763200.0,
-            "23": 843763200.0,
-            "24": 843763200.0,
-            "25": 843763200.0,
-            "26": 843763200.0,
-            "27": 843763200.0,
-            "28": 843763200.0,
-            "29": 843763200.0,
-            "30": 843763200.0,
-            "31": 843763200.0,
-            "32": 843763200.0,
-            "33": 843763200.0,
-            "34": 843763200.0,
-            "35": 843763200.0,
-            "36": 843763200.0,
-            "37": 843763200.0,
-            "38": 843763200.0,
-            "39": 843763200.0,
-            "40": 843763200.0,
-            "41": 843763200.0,
-            "42": 843763200.0,
-            "43": 843763200.0,
-            "44": 843763200.0,
-            "45": 843763200.0,
-            "46": 843763200.0,
-            "47": 843763200.0,
-            "48": 843763200.0,
-            "49": 843763200.0,
-            "50": 843763200.0,
-            "51": 843763200.0,
-            "52": 843763200.0,
-            "53": 843763200.0,
-            "54": 843763200.0,
-            "55": 843763200.0,
-            "56": 843763200.0,
-            "57": 843763200.0,
-            "58": 843763200.0,
-            "59": 843763200.0,
-            "60": 843763200.0,
-            "61": 843763200.0,
-            "62": 843763200.0,
-            "63": 843763200.0,
-            "64": 843763200.0,
-            "65": 843763200.0,
-            "66": 843763200.0,
-            "67": 843763200.0,
-            "68": 843763200.0,
-            "69": 843763200.0,
-            "70": 843763200.0,
-            "71": 843763200.0,
-            "72": 843763200.0,
-            "73": 843763200.0,
-            "74": 843763200.0,
-            "75": 843763200.0,
-            "76": 843763200.0,
-            "77": 843763200.0,
-            "78": 843763200.0,
-            "79": 843763200.0,
-            "80": 843763200.0,
-            "81": 843763200.0,
-            "82": 843763200.0,
-            "83": 843763200.0,
-            "84": 843763200.0,
-            "85": 843763200.0,
-            "86": 843763200.0,
-            "87": 843763200.0,
-            "88": 843763200.0,
-            "89": 843763200.0,
-            "90": 843763200.0,
-            "91": 843763200.0,
-            "92": 843763200.0,
-            "93": 843763200.0,
-            "94": 843763200.0,
-            "95": 843763200.0,
-            "96": 843763200.0,
-            "97": 843763200.0,
-            "98": 843763200.0,
-            "99": 843763200.0,
-            "100": 843763200.0
+            "1": 755484160.0,
+            "2": 844288512.0,
+            "3": 844288512.0,
+            "4": 844288512.0,
+            "5": 844288512.0,
+            "6": 844288512.0,
+            "7": 844288512.0,
+            "8": 844288512.0,
+            "9": 844288512.0,
+            "10": 844288512.0,
+            "11": 844288512.0,
+            "12": 844288512.0,
+            "13": 844288512.0,
+            "14": 844288512.0,
+            "15": 844288512.0,
+            "16": 844288512.0,
+            "17": 844288512.0,
+            "18": 844288512.0,
+            "19": 844288512.0,
+            "20": 844288512.0,
+            "21": 844288512.0,
+            "22": 844288512.0,
+            "23": 844288512.0,
+            "24": 844288512.0,
+            "25": 844288512.0,
+            "26": 844288512.0,
+            "27": 844288512.0,
+            "28": 844288512.0,
+            "29": 844288512.0,
+            "30": 844288512.0,
+            "31": 844288512.0,
+            "32": 844288512.0,
+            "33": 844288512.0,
+            "34": 844288512.0,
+            "35": 844288512.0,
+            "36": 844288512.0,
+            "37": 844288512.0,
+            "38": 844288512.0,
+            "39": 844288512.0,
+            "40": 844288512.0,
+            "41": 844288512.0,
+            "42": 844288512.0,
+            "43": 844288512.0,
+            "44": 844288512.0,
+            "45": 844288512.0,
+            "46": 844288512.0,
+            "47": 844288512.0,
+            "48": 844288512.0,
+            "49": 844288512.0,
+            "50": 844288512.0,
+            "51": 844288512.0,
+            "52": 844288512.0,
+            "53": 844288512.0,
+            "54": 844288512.0,
+            "55": 844288512.0,
+            "56": 844288512.0,
+            "57": 844288512.0,
+            "58": 844288512.0,
+            "59": 844288512.0,
+            "60": 844288512.0,
+            "61": 844288512.0,
+            "62": 844288512.0,
+            "63": 844288512.0,
+            "64": 844288512.0,
+            "65": 844288512.0,
+            "66": 844288512.0,
+            "67": 844288512.0,
+            "68": 844288512.0,
+            "69": 844288512.0,
+            "70": 844288512.0,
+            "71": 844288512.0,
+            "72": 844288512.0,
+            "73": 844288512.0,
+            "74": 844288512.0,
+            "75": 844288512.0,
+            "76": 844288512.0,
+            "77": 844288512.0,
+            "78": 844288512.0,
+            "79": 844288512.0,
+            "80": 844288512.0,
+            "81": 844288512.0,
+            "82": 844288512.0,
+            "83": 844288512.0,
+            "84": 844288512.0,
+            "85": 844288512.0,
+            "86": 844288512.0,
+            "87": 844288512.0,
+            "88": 844288512.0,
+            "89": 844288512.0,
+            "90": 844288512.0,
+            "91": 844288512.0,
+            "92": 844288512.0,
+            "93": 844288512.0,
+            "94": 844288512.0,
+            "95": 844288512.0,
+            "96": 844288512.0,
+            "97": 844288512.0,
+            "98": 844288512.0,
+            "99": 844288512.0,
+            "100": 844288512.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 13.61637,
-            "2": 0.24414,
-            "3": 0.22872,
-            "4": 0.22599,
-            "5": 0.22586,
-            "6": 0.22773,
-            "7": 0.22791,
-            "8": 0.22857,
-            "9": 0.2283,
-            "10": 0.22732,
-            "11": 0.22633,
-            "12": 0.22761,
-            "13": 0.22748,
-            "14": 0.23094,
-            "15": 0.22968,
-            "16": 0.22849,
-            "17": 0.22934,
-            "18": 0.22814,
-            "19": 0.22822,
-            "20": 0.22758,
-            "21": 0.22806,
-            "22": 0.25737,
-            "23": 0.24238,
-            "24": 0.23166,
-            "25": 0.22695,
-            "26": 0.22857,
-            "27": 0.23442,
-            "28": 0.22861,
-            "29": 0.2302,
-            "30": 0.2316,
-            "31": 0.23014,
-            "32": 0.22948,
-            "33": 0.23272,
-            "34": 0.23222,
-            "35": 0.23035,
-            "36": 0.23384,
-            "37": 0.23085,
-            "38": 0.23058,
-            "39": 0.23686,
-            "40": 0.23939,
-            "41": 0.23562,
-            "42": 0.23544,
-            "43": 0.23293,
-            "44": 0.22874,
-            "45": 0.234,
-            "46": 0.22942,
-            "47": 0.23036,
-            "48": 0.23404,
-            "49": 0.2686,
-            "50": 0.24831,
-            "51": 0.28415,
-            "52": 0.23699,
-            "53": 0.26129,
-            "54": 0.2273,
-            "55": 0.22639,
-            "56": 0.22691,
-            "57": 0.22504,
-            "58": 0.22822,
-            "59": 0.22913,
-            "60": 0.22577,
-            "61": 0.23097,
-            "62": 0.22702,
-            "63": 0.22579,
-            "64": 0.22717,
-            "65": 0.22986,
-            "66": 0.22481,
-            "67": 0.22676,
-            "68": 0.22643,
-            "69": 0.22933,
-            "70": 0.23566,
-            "71": 0.22795,
-            "72": 0.22654,
-            "73": 0.2256,
-            "74": 0.22941,
-            "75": 0.23701,
-            "76": 0.23527,
-            "77": 0.23476,
-            "78": 0.23472,
-            "79": 0.22599,
-            "80": 0.22758,
-            "81": 0.22717,
-            "82": 0.22657,
-            "83": 0.22688,
-            "84": 0.22827,
-            "85": 0.22612,
-            "86": 0.22871,
-            "87": 0.23133,
-            "88": 0.22934,
-            "89": 0.22859,
-            "90": 0.22635,
-            "91": 0.22606,
-            "92": 0.2297,
-            "93": 0.22713,
-            "94": 0.2261,
-            "95": 0.227,
-            "96": 0.23135,
-            "97": 0.22866,
-            "98": 0.22601,
-            "99": 0.2277,
-            "100": 0.2323
+            "1": 9.99954,
+            "2": 0.2844,
+            "3": 0.21531,
+            "4": 0.19894,
+            "5": 0.19896,
+            "6": 0.19827,
+            "7": 0.19932,
+            "8": 0.20009,
+            "9": 0.19826,
+            "10": 0.19917,
+            "11": 0.19961,
+            "12": 0.19975,
+            "13": 0.20483,
+            "14": 0.20549,
+            "15": 0.19855,
+            "16": 0.19911,
+            "17": 0.19768,
+            "18": 0.19797,
+            "19": 0.19725,
+            "20": 0.19763,
+            "21": 0.19859,
+            "22": 0.20076,
+            "23": 0.19965,
+            "24": 0.19495,
+            "25": 0.1933,
+            "26": 0.19302,
+            "27": 0.19426,
+            "28": 0.19183,
+            "29": 0.19326,
+            "30": 0.1926,
+            "31": 0.19268,
+            "32": 0.1921,
+            "33": 0.19395,
+            "34": 0.1932,
+            "35": 0.19421,
+            "36": 0.19128,
+            "37": 0.19268,
+            "38": 0.1936,
+            "39": 0.19222,
+            "40": 0.19436,
+            "41": 0.19323,
+            "42": 0.19182,
+            "43": 0.19358,
+            "44": 0.19401,
+            "45": 0.1935,
+            "46": 0.19276,
+            "47": 0.19598,
+            "48": 0.19322,
+            "49": 0.19379,
+            "50": 0.19239,
+            "51": 0.20371,
+            "52": 0.19298,
+            "53": 0.21521,
+            "54": 0.21625,
+            "55": 0.19257,
+            "56": 0.1959,
+            "57": 0.19218,
+            "58": 0.19272,
+            "59": 0.19009,
+            "60": 0.19106,
+            "61": 0.19155,
+            "62": 0.19168,
+            "63": 0.191,
+            "64": 0.19045,
+            "65": 0.19015,
+            "66": 0.19568,
+            "67": 0.19034,
+            "68": 0.19165,
+            "69": 0.19136,
+            "70": 0.19369,
+            "71": 0.19227,
+            "72": 0.19248,
+            "73": 0.18982,
+            "74": 0.18984,
+            "75": 0.18976,
+            "76": 0.19243,
+            "77": 0.19198,
+            "78": 0.18981,
+            "79": 0.18977,
+            "80": 0.19102,
+            "81": 0.18951,
+            "82": 0.19227,
+            "83": 0.18983,
+            "84": 0.19005,
+            "85": 0.18923,
+            "86": 0.18901,
+            "87": 0.1898,
+            "88": 0.18885,
+            "89": 0.18842,
+            "90": 0.18857,
+            "91": 0.18847,
+            "92": 0.18973,
+            "93": 0.19045,
+            "94": 0.1894,
+            "95": 0.18946,
+            "96": 0.18844,
+            "97": 0.18946,
+            "98": 0.1889,
+            "99": 0.1905,
+            "100": 0.19169
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..106835fbcc0
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.8695,
+            "52": 9.76154,
+            "53": 10.08349,
+            "54": 9.97449,
+            "55": 9.89437,
+            "56": 9.6424,
+            "57": 9.50352,
+            "58": 9.84153,
+            "59": 9.60017,
+            "60": 9.51715,
+            "61": 9.70458,
+            "62": 9.98292,
+            "63": 9.39067,
+            "64": 9.7797,
+            "65": 8.96053,
+            "66": 9.70288,
+            "67": 9.3734,
+            "68": 9.78805,
+            "69": 9.79828,
+            "70": 9.74999,
+            "71": 9.62682,
+            "72": 9.59043,
+            "73": 9.49893,
+            "74": 8.94842,
+            "75": 9.42922,
+            "76": 9.08268,
+            "77": 10.07413,
+            "78": 9.73322,
+            "79": 9.38352,
+            "80": 9.40713,
+            "81": 9.48366,
+            "82": 9.70577,
+            "83": 9.3103,
+            "84": 9.41846,
+            "85": 9.62053,
+            "86": 9.08533,
+            "87": 9.59962,
+            "88": 9.75141,
+            "89": 9.60594,
+            "90": 9.8245,
+            "91": 9.33973,
+            "92": 9.36344,
+            "93": 9.08397,
+            "94": 8.83571,
+            "95": 9.51936,
+            "96": 9.53001,
+            "97": 9.31995,
+            "98": 9.67709,
+            "99": 8.88909,
+            "100": 9.40491
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2484.0,
+            "52": 2568.0,
+            "53": 2834.0,
+            "54": 2607.0,
+            "55": 2149.0,
+            "56": 2683.0,
+            "57": 2283.0,
+            "58": 2764.0,
+            "59": 2623.0,
+            "60": 2456.0,
+            "61": 2938.0,
+            "62": 2456.0,
+            "63": 2279.0,
+            "64": 3078.0,
+            "65": 2504.0,
+            "66": 2881.0,
+            "67": 2683.0,
+            "68": 2657.0,
+            "69": 2832.0,
+            "70": 3144.0,
+            "71": 2930.0,
+            "72": 2328.0,
+            "73": 2984.0,
+            "74": 1752.0,
+            "75": 2451.0,
+            "76": 3040.0,
+            "77": 3213.0,
+            "78": 2936.0,
+            "79": 2941.0,
+            "80": 3112.0,
+            "81": 3568.0,
+            "82": 3105.0,
+            "83": 2725.0,
+            "84": 3051.0,
+            "85": 3170.0,
+            "86": 2645.0,
+            "87": 3586.0,
+            "88": 2902.0,
+            "89": 3371.0,
+            "90": 2971.0,
+            "91": 2800.0,
+            "92": 3017.0,
+            "93": 2524.0,
+            "94": 3384.0,
+            "95": 3147.0,
+            "96": 3388.0,
+            "97": 3031.0,
+            "98": 3619.0,
+            "99": 3004.0,
+            "100": 3100.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 311828992.0,
+            "52": 311828992.0,
+            "53": 311828992.0,
+            "54": 311828992.0,
+            "55": 311828992.0,
+            "56": 311828992.0,
+            "57": 311828992.0,
+            "58": 311828992.0,
+            "59": 311828992.0,
+            "60": 311828992.0,
+            "61": 311828992.0,
+            "62": 311828992.0,
+            "63": 311828992.0,
+            "64": 311828992.0,
+            "65": 311828992.0,
+            "66": 311828992.0,
+            "67": 311828992.0,
+            "68": 311828992.0,
+            "69": 311828992.0,
+            "70": 311828992.0,
+            "71": 311828992.0,
+            "72": 311828992.0,
+            "73": 311828992.0,
+            "74": 311828992.0,
+            "75": 311828992.0,
+            "76": 311828992.0,
+            "77": 311828992.0,
+            "78": 311828992.0,
+            "79": 311828992.0,
+            "80": 311828992.0,
+            "81": 311828992.0,
+            "82": 311828992.0,
+            "83": 311828992.0,
+            "84": 311828992.0,
+            "85": 311828992.0,
+            "86": 311828992.0,
+            "87": 311828992.0,
+            "88": 311828992.0,
+            "89": 311828992.0,
+            "90": 311828992.0,
+            "91": 311828992.0,
+            "92": 311828992.0,
+            "93": 311828992.0,
+            "94": 311828992.0,
+            "95": 311828992.0,
+            "96": 311828992.0,
+            "97": 311828992.0,
+            "98": 311828992.0,
+            "99": 311828992.0,
+            "100": 311828992.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 845336064.0,
+            "52": 845337088.0,
+            "53": 845337088.0,
+            "54": 845337088.0,
+            "55": 845337088.0,
+            "56": 845337088.0,
+            "57": 845337088.0,
+            "58": 845337088.0,
+            "59": 845337088.0,
+            "60": 845337088.0,
+            "61": 845337088.0,
+            "62": 845337088.0,
+            "63": 845337088.0,
+            "64": 845337088.0,
+            "65": 845337088.0,
+            "66": 845337088.0,
+            "67": 845337088.0,
+            "68": 845337088.0,
+            "69": 845337088.0,
+            "70": 845337088.0,
+            "71": 845337088.0,
+            "72": 845337088.0,
+            "73": 845337088.0,
+            "74": 845337088.0,
+            "75": 845337088.0,
+            "76": 845337088.0,
+            "77": 845337088.0,
+            "78": 845337088.0,
+            "79": 845337088.0,
+            "80": 845337088.0,
+            "81": 845337088.0,
+            "82": 845337088.0,
+            "83": 845337088.0,
+            "84": 845337088.0,
+            "85": 845337088.0,
+            "86": 845337088.0,
+            "87": 845337088.0,
+            "88": 845337088.0,
+            "89": 845337088.0,
+            "90": 845337088.0,
+            "91": 845337088.0,
+            "92": 845337088.0,
+            "93": 845337088.0,
+            "94": 845337088.0,
+            "95": 845337088.0,
+            "96": 845337088.0,
+            "97": 845337088.0,
+            "98": 845337088.0,
+            "99": 845337088.0,
+            "100": 845337088.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 11.77615,
+            "52": 0.26842,
+            "53": 0.22425,
+            "54": 0.22629,
+            "55": 0.19714,
+            "56": 0.19595,
+            "57": 0.19408,
+            "58": 0.19455,
+            "59": 0.19527,
+            "60": 0.19277,
+            "61": 0.19626,
+            "62": 0.19225,
+            "63": 0.19531,
+            "64": 0.19329,
+            "65": 0.19633,
+            "66": 0.20818,
+            "67": 0.20691,
+            "68": 0.19203,
+            "69": 0.19251,
+            "70": 0.19524,
+            "71": 0.19414,
+            "72": 0.19212,
+            "73": 0.19189,
+            "74": 0.19323,
+            "75": 0.19106,
+            "76": 0.19302,
+            "77": 0.19126,
+            "78": 0.19419,
+            "79": 0.1946,
+            "80": 0.19275,
+            "81": 0.19432,
+            "82": 0.19583,
+            "83": 0.19969,
+            "84": 0.19643,
+            "85": 0.19472,
+            "86": 0.1986,
+            "87": 0.19301,
+            "88": 0.19387,
+            "89": 0.19581,
+            "90": 0.19215,
+            "91": 0.19286,
+            "92": 0.19237,
+            "93": 0.1931,
+            "94": 0.19448,
+            "95": 0.19755,
+            "96": 0.195,
+            "97": 0.19341,
+            "98": 0.19626,
+            "99": 0.19167,
+            "100": 0.19047
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json
index cac9c570ec1..33ed61d5e20 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.8583,
-            "2": 10.85411,
-            "3": 10.8543,
-            "4": 10.84407,
-            "5": 10.87282,
-            "6": 10.8793,
-            "7": 10.84658,
-            "8": 10.86139,
-            "9": 10.87078,
-            "10": 10.83266,
-            "11": 10.86332,
-            "12": 10.87295,
-            "13": 10.87798,
-            "14": 10.88588,
-            "15": 10.82104,
-            "16": 10.82759,
-            "17": 10.80303,
-            "18": 10.82092,
-            "19": 10.80032,
-            "20": 10.71379,
-            "21": 10.69818,
-            "22": 10.57542,
-            "23": 10.72119,
-            "24": 10.60091,
-            "25": 10.5476,
-            "26": 10.61127,
-            "27": 10.61393,
-            "28": 10.57777,
-            "29": 10.57888,
-            "30": 10.36791,
-            "31": 10.13451,
-            "32": 10.47063,
-            "33": 10.47371,
-            "34": 10.23442,
-            "35": 10.28457,
-            "36": 10.23595,
-            "37": 10.35351,
-            "38": 10.20695,
-            "39": 10.40581,
-            "40": 10.08924,
-            "41": 10.16388,
-            "42": 10.22671,
-            "43": 9.86336,
-            "44": 9.98189,
-            "45": 9.84555,
-            "46": 9.85753,
-            "47": 10.16884,
-            "48": 9.86474,
-            "49": 9.54712,
-            "50": 9.91942,
-            "51": 9.86179,
-            "52": 9.76162,
-            "53": 10.08383,
-            "54": 9.96743,
-            "55": 9.89199,
-            "56": 9.63777,
-            "57": 9.49339,
-            "58": 9.83897,
-            "59": 9.59641,
-            "60": 9.50823,
-            "61": 9.70513,
-            "62": 9.99499,
-            "63": 9.38054,
-            "64": 9.78296,
-            "65": 8.95946,
-            "66": 9.71045,
-            "67": 9.38075,
-            "68": 9.78884,
-            "69": 9.79451,
-            "70": 9.73441,
-            "71": 9.62146,
-            "72": 9.58792,
-            "73": 9.49657,
-            "74": 8.9434,
-            "75": 9.43112,
-            "76": 9.09716,
-            "77": 10.0681,
-            "78": 9.73005,
-            "79": 9.37764,
-            "80": 9.41097,
-            "81": 9.48622,
-            "82": 9.69669,
-            "83": 9.3163,
-            "84": 9.42182,
-            "85": 9.61516,
-            "86": 9.07553,
-            "87": 9.59851,
-            "88": 9.75046,
-            "89": 9.61112,
-            "90": 9.82373,
-            "91": 9.35278,
-            "92": 9.36495,
-            "93": 9.08811,
-            "94": 8.83656,
-            "95": 9.52256,
-            "96": 9.52793,
-            "97": 9.31634,
-            "98": 9.67876,
-            "99": 8.89321,
-            "100": 9.40801
+            "1": 10.85936,
+            "2": 10.8548,
+            "3": 10.85198,
+            "4": 10.84316,
+            "5": 10.8725,
+            "6": 10.87861,
+            "7": 10.84626,
+            "8": 10.86367,
+            "9": 10.87211,
+            "10": 10.83111,
+            "11": 10.86068,
+            "12": 10.87273,
+            "13": 10.87988,
+            "14": 10.88658,
+            "15": 10.82024,
+            "16": 10.82685,
+            "17": 10.79977,
+            "18": 10.81982,
+            "19": 10.80036,
+            "20": 10.71402,
+            "21": 10.69897,
+            "22": 10.57449,
+            "23": 10.71973,
+            "24": 10.60276,
+            "25": 10.5461,
+            "26": 10.61045,
+            "27": 10.61226,
+            "28": 10.57728,
+            "29": 10.58002,
+            "30": 10.36711,
+            "31": 10.13446,
+            "32": 10.47126,
+            "33": 10.47458,
+            "34": 10.23197,
+            "35": 10.28446,
+            "36": 10.23439,
+            "37": 10.3534,
+            "38": 10.20693,
+            "39": 10.40598,
+            "40": 10.08969,
+            "41": 10.16335,
+            "42": 10.2256,
+            "43": 9.86391,
+            "44": 9.98249,
+            "45": 9.84549,
+            "46": 9.85808,
+            "47": 10.1689,
+            "48": 9.86658,
+            "49": 9.54555,
+            "50": 9.91938,
+            "51": 9.86073,
+            "52": 9.76125,
+            "53": 10.08412,
+            "54": 9.96565,
+            "55": 9.89124,
+            "56": 9.63923,
+            "57": 9.49364,
+            "58": 9.83867,
+            "59": 9.59623,
+            "60": 9.50909,
+            "61": 9.70543,
+            "62": 9.99515,
+            "63": 9.38102,
+            "64": 9.78222,
+            "65": 8.95965,
+            "66": 9.71007,
+            "67": 9.38014,
+            "68": 9.78825,
+            "69": 9.79432,
+            "70": 9.7352,
+            "71": 9.6222,
+            "72": 9.58803,
+            "73": 9.49714,
+            "74": 8.94242,
+            "75": 9.43219,
+            "76": 9.09756,
+            "77": 10.06849,
+            "78": 9.73057,
+            "79": 9.37757,
+            "80": 9.41117,
+            "81": 9.4863,
+            "82": 9.6976,
+            "83": 9.3167,
+            "84": 9.42154,
+            "85": 9.61502,
+            "86": 9.0763,
+            "87": 9.59888,
+            "88": 9.75044,
+            "89": 9.61234,
+            "90": 9.82363,
+            "91": 9.3537,
+            "92": 9.36524,
+            "93": 9.08832,
+            "94": 8.83613,
+            "95": 9.52262,
+            "96": 9.52735,
+            "97": 9.3169,
+            "98": 9.67958,
+            "99": 8.89279,
+            "100": 9.40809
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1708.0,
-            "2": 1804.0,
-            "3": 1725.0,
-            "4": 1881.0,
-            "5": 2019.0,
-            "6": 2015.0,
-            "7": 2086.0,
-            "8": 1730.0,
-            "9": 2024.0,
-            "10": 1515.0,
-            "11": 2162.0,
-            "12": 1847.0,
-            "13": 2125.0,
-            "14": 2050.0,
-            "15": 1946.0,
-            "16": 2000.0,
-            "17": 1996.0,
-            "18": 1874.0,
-            "19": 2011.0,
-            "20": 1771.0,
-            "21": 2099.0,
-            "22": 1892.0,
-            "23": 2171.0,
-            "24": 1834.0,
-            "25": 1790.0,
-            "26": 1803.0,
-            "27": 1998.0,
-            "28": 2211.0,
-            "29": 2129.0,
-            "30": 2147.0,
-            "31": 1623.0,
-            "32": 2174.0,
-            "33": 2364.0,
-            "34": 2035.0,
-            "35": 2089.0,
-            "36": 2202.0,
-            "37": 2603.0,
-            "38": 2468.0,
-            "39": 2623.0,
-            "40": 2383.0,
-            "41": 2519.0,
-            "42": 2522.0,
-            "43": 2235.0,
-            "44": 2275.0,
-            "45": 2319.0,
-            "46": 2632.0,
-            "47": 2675.0,
-            "48": 2697.0,
-            "49": 2551.0,
-            "50": 2814.0,
-            "51": 2767.0,
-            "52": 2804.0,
-            "53": 3231.0,
-            "54": 2905.0,
-            "55": 2575.0,
-            "56": 3077.0,
-            "57": 2587.0,
-            "58": 3346.0,
-            "59": 3056.0,
-            "60": 2695.0,
-            "61": 3191.0,
-            "62": 2637.0,
-            "63": 2649.0,
-            "64": 3176.0,
-            "65": 2756.0,
-            "66": 3481.0,
-            "67": 2905.0,
+            "1": 1789.0,
+            "2": 1890.0,
+            "3": 1856.0,
+            "4": 2016.0,
+            "5": 2048.0,
+            "6": 1995.0,
+            "7": 1995.0,
+            "8": 1655.0,
+            "9": 1922.0,
+            "10": 1507.0,
+            "11": 2196.0,
+            "12": 1957.0,
+            "13": 2117.0,
+            "14": 2079.0,
+            "15": 2008.0,
+            "16": 1983.0,
+            "17": 2006.0,
+            "18": 1819.0,
+            "19": 1967.0,
+            "20": 1758.0,
+            "21": 2058.0,
+            "22": 1937.0,
+            "23": 2263.0,
+            "24": 1884.0,
+            "25": 1756.0,
+            "26": 1894.0,
+            "27": 2052.0,
+            "28": 2078.0,
+            "29": 2206.0,
+            "30": 2065.0,
+            "31": 1708.0,
+            "32": 2129.0,
+            "33": 2384.0,
+            "34": 2134.0,
+            "35": 2113.0,
+            "36": 2074.0,
+            "37": 2665.0,
+            "38": 2465.0,
+            "39": 2589.0,
+            "40": 2392.0,
+            "41": 2513.0,
+            "42": 2448.0,
+            "43": 2185.0,
+            "44": 2326.0,
+            "45": 2331.0,
+            "46": 2640.0,
+            "47": 2686.0,
+            "48": 2674.0,
+            "49": 2589.0,
+            "50": 2834.0,
+            "51": 2841.0,
+            "52": 2853.0,
+            "53": 3184.0,
+            "54": 2849.0,
+            "55": 2661.0,
+            "56": 3110.0,
+            "57": 2571.0,
+            "58": 3237.0,
+            "59": 2973.0,
+            "60": 2722.0,
+            "61": 3162.0,
+            "62": 2823.0,
+            "63": 2664.0,
+            "64": 3252.0,
+            "65": 2911.0,
+            "66": 3337.0,
+            "67": 2866.0,
             "68": 3114.0,
-            "69": 3133.0,
-            "70": 3533.0,
-            "71": 3225.0,
-            "72": 2621.0,
-            "73": 3297.0,
-            "74": 2145.0,
-            "75": 2799.0,
-            "76": 3354.0,
-            "77": 3466.0,
-            "78": 3485.0,
-            "79": 3464.0,
-            "80": 3614.0,
-            "81": 4011.0,
-            "82": 3694.0,
-            "83": 3201.0,
-            "84": 3655.0,
-            "85": 3597.0,
-            "86": 3096.0,
-            "87": 4103.0,
-            "88": 3306.0,
-            "89": 3839.0,
-            "90": 3352.0,
-            "91": 2980.0,
-            "92": 3452.0,
-            "93": 2967.0,
-            "94": 3773.0,
-            "95": 3589.0,
-            "96": 3800.0,
-            "97": 3412.0,
-            "98": 3998.0,
-            "99": 3483.0,
-            "100": 3651.0
+            "69": 3117.0,
+            "70": 3464.0,
+            "71": 3260.0,
+            "72": 2574.0,
+            "73": 3136.0,
+            "74": 2181.0,
+            "75": 2818.0,
+            "76": 3370.0,
+            "77": 3581.0,
+            "78": 3538.0,
+            "79": 3597.0,
+            "80": 3756.0,
+            "81": 3986.0,
+            "82": 3628.0,
+            "83": 3213.0,
+            "84": 3441.0,
+            "85": 3593.0,
+            "86": 3051.0,
+            "87": 4066.0,
+            "88": 3328.0,
+            "89": 3726.0,
+            "90": 3375.0,
+            "91": 3181.0,
+            "92": 3417.0,
+            "93": 3027.0,
+            "94": 3758.0,
+            "95": 3688.0,
+            "96": 3847.0,
+            "97": 3383.0,
+            "98": 4018.0,
+            "99": 3469.0,
+            "100": 3505.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 232422400.0,
-            "2": 232422400.0,
-            "3": 232422400.0,
-            "4": 232422400.0,
-            "5": 232422400.0,
-            "6": 233470976.0,
-            "7": 232422400.0,
-            "8": 233470976.0,
-            "9": 232422400.0,
-            "10": 232422400.0,
-            "11": 232422400.0,
-            "12": 232422400.0,
-            "13": 232422400.0,
-            "14": 233470976.0,
-            "15": 232422400.0,
-            "16": 232422400.0,
-            "17": 232422400.0,
-            "18": 232422400.0,
-            "19": 232422400.0,
-            "20": 232422400.0,
-            "21": 232422400.0,
-            "22": 232422400.0,
-            "23": 232422400.0,
-            "24": 232422400.0,
-            "25": 232422400.0,
-            "26": 232422400.0,
-            "27": 232422400.0,
-            "28": 232422400.0,
-            "29": 232422400.0,
-            "30": 232422400.0,
-            "31": 232422400.0,
-            "32": 232422400.0,
-            "33": 232422400.0,
-            "34": 232422400.0,
-            "35": 232422400.0,
-            "36": 232422400.0,
-            "37": 232422400.0,
-            "38": 232422400.0,
-            "39": 232422400.0,
-            "40": 232422400.0,
-            "41": 232422400.0,
-            "42": 232422400.0,
-            "43": 232422400.0,
-            "44": 232422400.0,
-            "45": 232422400.0,
-            "46": 232422400.0,
-            "47": 232422400.0,
-            "48": 232422400.0,
-            "49": 233470976.0,
-            "50": 232422400.0,
-            "51": 232422400.0,
-            "52": 232422400.0,
-            "53": 232422400.0,
-            "54": 232422400.0,
-            "55": 233470976.0,
-            "56": 232422400.0,
-            "57": 233470976.0,
-            "58": 232422400.0,
-            "59": 232422400.0,
-            "60": 232422400.0,
-            "61": 232422400.0,
-            "62": 232422400.0,
-            "63": 232422400.0,
-            "64": 232422400.0,
-            "65": 232422400.0,
-            "66": 232422400.0,
-            "67": 232422400.0,
-            "68": 232422400.0,
-            "69": 232422400.0,
-            "70": 232422400.0,
-            "71": 232422400.0,
-            "72": 232422400.0,
-            "73": 232422400.0,
-            "74": 232422400.0,
-            "75": 232422400.0,
-            "76": 232422400.0,
-            "77": 232422400.0,
-            "78": 232422400.0,
-            "79": 232422400.0,
-            "80": 232422400.0,
-            "81": 232422400.0,
-            "82": 232422400.0,
-            "83": 232422400.0,
-            "84": 232422400.0,
-            "85": 232422400.0,
-            "86": 232422400.0,
-            "87": 232422400.0,
-            "88": 232422400.0,
-            "89": 232422400.0,
-            "90": 232422400.0,
-            "91": 232422400.0,
-            "92": 232422400.0,
-            "93": 232422400.0,
-            "94": 232422400.0,
-            "95": 232422400.0,
-            "96": 232422400.0,
-            "97": 232422400.0,
-            "98": 232422400.0,
-            "99": 233470976.0,
-            "100": 232422400.0
+            "1": 233446912.0,
+            "2": 232398336.0,
+            "3": 232398336.0,
+            "4": 232398336.0,
+            "5": 232398336.0,
+            "6": 232398336.0,
+            "7": 232398336.0,
+            "8": 232398336.0,
+            "9": 232398336.0,
+            "10": 232398336.0,
+            "11": 232398336.0,
+            "12": 232398336.0,
+            "13": 232398336.0,
+            "14": 232398336.0,
+            "15": 232398336.0,
+            "16": 232398336.0,
+            "17": 232398336.0,
+            "18": 232398336.0,
+            "19": 232398336.0,
+            "20": 232398336.0,
+            "21": 232398336.0,
+            "22": 232398336.0,
+            "23": 232398336.0,
+            "24": 232398336.0,
+            "25": 232398336.0,
+            "26": 232398336.0,
+            "27": 232398336.0,
+            "28": 232398336.0,
+            "29": 232398336.0,
+            "30": 232398336.0,
+            "31": 232398336.0,
+            "32": 232398336.0,
+            "33": 232398336.0,
+            "34": 232398336.0,
+            "35": 232398336.0,
+            "36": 232398336.0,
+            "37": 232398336.0,
+            "38": 232398336.0,
+            "39": 232398336.0,
+            "40": 232398336.0,
+            "41": 232398336.0,
+            "42": 232398336.0,
+            "43": 232398336.0,
+            "44": 232398336.0,
+            "45": 232398336.0,
+            "46": 232398336.0,
+            "47": 232398336.0,
+            "48": 232398336.0,
+            "49": 232398336.0,
+            "50": 232398336.0,
+            "51": 232398336.0,
+            "52": 232398336.0,
+            "53": 232398336.0,
+            "54": 232398336.0,
+            "55": 232398336.0,
+            "56": 232398336.0,
+            "57": 232398336.0,
+            "58": 232398336.0,
+            "59": 232398336.0,
+            "60": 232398336.0,
+            "61": 232398336.0,
+            "62": 232398336.0,
+            "63": 232398336.0,
+            "64": 232398336.0,
+            "65": 232398336.0,
+            "66": 232398336.0,
+            "67": 232398336.0,
+            "68": 232398336.0,
+            "69": 232398336.0,
+            "70": 232398336.0,
+            "71": 232398336.0,
+            "72": 232398336.0,
+            "73": 232398336.0,
+            "74": 232398336.0,
+            "75": 232398336.0,
+            "76": 232398336.0,
+            "77": 232398336.0,
+            "78": 232398336.0,
+            "79": 232398336.0,
+            "80": 232398336.0,
+            "81": 232398336.0,
+            "82": 232398336.0,
+            "83": 232398336.0,
+            "84": 232398336.0,
+            "85": 232398336.0,
+            "86": 232398336.0,
+            "87": 232398336.0,
+            "88": 232398336.0,
+            "89": 232398336.0,
+            "90": 232398336.0,
+            "91": 232398336.0,
+            "92": 232398336.0,
+            "93": 232398336.0,
+            "94": 232398336.0,
+            "95": 232398336.0,
+            "96": 232398336.0,
+            "97": 232398336.0,
+            "98": 232398336.0,
+            "99": 232398336.0,
+            "100": 232398336.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 683423744.0,
-            "2": 773273600.0,
-            "3": 773276672.0,
-            "4": 773276672.0,
-            "5": 773276672.0,
-            "6": 773276672.0,
-            "7": 773276672.0,
-            "8": 773276672.0,
-            "9": 773276672.0,
-            "10": 773276672.0,
-            "11": 773276672.0,
-            "12": 773276672.0,
-            "13": 773276672.0,
-            "14": 773276672.0,
-            "15": 773276672.0,
-            "16": 773276672.0,
-            "17": 773276672.0,
-            "18": 773276672.0,
-            "19": 773276672.0,
-            "20": 773276672.0,
-            "21": 773276672.0,
-            "22": 773276672.0,
-            "23": 773276672.0,
-            "24": 773276672.0,
-            "25": 773276672.0,
-            "26": 773276672.0,
-            "27": 773276672.0,
-            "28": 773276672.0,
-            "29": 773276672.0,
-            "30": 773276672.0,
-            "31": 773276672.0,
-            "32": 773276672.0,
-            "33": 773276672.0,
-            "34": 773276672.0,
-            "35": 773276672.0,
-            "36": 773276672.0,
-            "37": 773276672.0,
-            "38": 773276672.0,
-            "39": 773276672.0,
-            "40": 773276672.0,
-            "41": 773276672.0,
-            "42": 773276672.0,
-            "43": 773276672.0,
-            "44": 773276672.0,
-            "45": 773276672.0,
-            "46": 773276672.0,
-            "47": 773276672.0,
-            "48": 773276672.0,
-            "49": 773276672.0,
-            "50": 775372800.0,
-            "51": 775372800.0,
-            "52": 775372800.0,
-            "53": 775372800.0,
-            "54": 775372800.0,
-            "55": 775372800.0,
-            "56": 775372800.0,
-            "57": 775372800.0,
-            "58": 775372800.0,
-            "59": 775372800.0,
-            "60": 775372800.0,
-            "61": 775372800.0,
-            "62": 775372800.0,
-            "63": 775372800.0,
-            "64": 775372800.0,
-            "65": 775372800.0,
-            "66": 775372800.0,
-            "67": 775372800.0,
-            "68": 775372800.0,
-            "69": 775372800.0,
-            "70": 775372800.0,
-            "71": 775372800.0,
-            "72": 775372800.0,
-            "73": 775372800.0,
-            "74": 775372800.0,
-            "75": 775372800.0,
-            "76": 775372800.0,
-            "77": 775372800.0,
-            "78": 775372800.0,
-            "79": 775372800.0,
-            "80": 775372800.0,
-            "81": 775372800.0,
-            "82": 775372800.0,
-            "83": 775372800.0,
-            "84": 775372800.0,
-            "85": 775372800.0,
-            "86": 775372800.0,
-            "87": 775372800.0,
-            "88": 775372800.0,
-            "89": 775372800.0,
-            "90": 775372800.0,
-            "91": 775372800.0,
-            "92": 775372800.0,
-            "93": 775372800.0,
-            "94": 775372800.0,
-            "95": 775372800.0,
-            "96": 775372800.0,
-            "97": 775372800.0,
-            "98": 775372800.0,
-            "99": 775373312.0,
-            "100": 775373312.0
+            "1": 686539264.0,
+            "2": 775343616.0,
+            "3": 775343616.0,
+            "4": 775343616.0,
+            "5": 775343616.0,
+            "6": 775343616.0,
+            "7": 775343616.0,
+            "8": 775343616.0,
+            "9": 775343616.0,
+            "10": 775343616.0,
+            "11": 775343616.0,
+            "12": 775343616.0,
+            "13": 775343616.0,
+            "14": 775343616.0,
+            "15": 775343616.0,
+            "16": 775343616.0,
+            "17": 775343616.0,
+            "18": 775343616.0,
+            "19": 775343616.0,
+            "20": 775343616.0,
+            "21": 775343616.0,
+            "22": 775343616.0,
+            "23": 775343616.0,
+            "24": 775343616.0,
+            "25": 775343616.0,
+            "26": 775343616.0,
+            "27": 775343616.0,
+            "28": 775343616.0,
+            "29": 775343616.0,
+            "30": 775343616.0,
+            "31": 775343616.0,
+            "32": 775343616.0,
+            "33": 775343616.0,
+            "34": 775343616.0,
+            "35": 775343616.0,
+            "36": 775343616.0,
+            "37": 775343616.0,
+            "38": 775343616.0,
+            "39": 775343616.0,
+            "40": 775343616.0,
+            "41": 775343616.0,
+            "42": 775343616.0,
+            "43": 775343616.0,
+            "44": 775343616.0,
+            "45": 775343616.0,
+            "46": 775343616.0,
+            "47": 775343616.0,
+            "48": 775343616.0,
+            "49": 775343616.0,
+            "50": 775343616.0,
+            "51": 775343616.0,
+            "52": 775343616.0,
+            "53": 775343616.0,
+            "54": 775343616.0,
+            "55": 775343616.0,
+            "56": 775343616.0,
+            "57": 775343616.0,
+            "58": 775343616.0,
+            "59": 775343616.0,
+            "60": 775343616.0,
+            "61": 775343616.0,
+            "62": 775343616.0,
+            "63": 775343616.0,
+            "64": 775343616.0,
+            "65": 775343616.0,
+            "66": 775343616.0,
+            "67": 775343616.0,
+            "68": 775343616.0,
+            "69": 775343616.0,
+            "70": 775343616.0,
+            "71": 775343616.0,
+            "72": 775343616.0,
+            "73": 775343616.0,
+            "74": 775343616.0,
+            "75": 775343616.0,
+            "76": 775343616.0,
+            "77": 775343616.0,
+            "78": 775343616.0,
+            "79": 775343616.0,
+            "80": 775343616.0,
+            "81": 775343616.0,
+            "82": 775343616.0,
+            "83": 775343616.0,
+            "84": 775343616.0,
+            "85": 775343616.0,
+            "86": 775343616.0,
+            "87": 775343616.0,
+            "88": 775343616.0,
+            "89": 775343616.0,
+            "90": 775343616.0,
+            "91": 775343616.0,
+            "92": 775343616.0,
+            "93": 775343616.0,
+            "94": 775343616.0,
+            "95": 775343616.0,
+            "96": 775343616.0,
+            "97": 775343616.0,
+            "98": 775343616.0,
+            "99": 775343616.0,
+            "100": 775343616.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 16.23173,
-            "2": 0.48632,
-            "3": 0.3184,
-            "4": 0.31067,
-            "5": 0.31575,
-            "6": 0.3127,
-            "7": 0.3096,
-            "8": 0.31392,
-            "9": 0.31591,
-            "10": 0.30891,
-            "11": 0.31209,
-            "12": 0.31271,
-            "13": 0.30582,
-            "14": 0.31032,
-            "15": 0.30879,
-            "16": 0.3077,
-            "17": 0.30689,
-            "18": 0.30824,
-            "19": 0.30953,
-            "20": 0.30728,
-            "21": 0.31141,
-            "22": 0.31157,
-            "23": 0.30569,
-            "24": 0.30896,
-            "25": 0.30916,
-            "26": 0.30674,
-            "27": 0.31017,
-            "28": 0.30716,
-            "29": 0.30734,
-            "30": 0.30698,
-            "31": 0.30881,
-            "32": 0.3089,
-            "33": 0.30647,
-            "34": 0.3112,
-            "35": 0.311,
-            "36": 0.30632,
-            "37": 0.30856,
-            "38": 0.30986,
-            "39": 0.30502,
-            "40": 0.31035,
-            "41": 0.306,
-            "42": 0.30943,
-            "43": 0.30773,
-            "44": 0.30886,
-            "45": 0.30942,
-            "46": 0.30579,
-            "47": 0.31121,
-            "48": 0.31407,
-            "49": 0.30981,
-            "50": 0.30966,
-            "51": 0.3347,
-            "52": 0.35543,
-            "53": 0.31067,
-            "54": 0.30931,
-            "55": 0.31517,
-            "56": 0.30883,
-            "57": 0.30908,
-            "58": 0.31373,
-            "59": 0.30746,
-            "60": 0.31113,
-            "61": 0.31473,
-            "62": 0.30775,
-            "63": 0.31034,
-            "64": 0.31108,
-            "65": 0.3103,
-            "66": 0.3085,
-            "67": 0.31036,
-            "68": 0.31412,
-            "69": 0.30947,
-            "70": 0.30646,
-            "71": 0.31133,
-            "72": 0.30734,
-            "73": 0.31043,
-            "74": 0.31583,
-            "75": 0.3074,
-            "76": 0.30939,
-            "77": 0.3182,
-            "78": 0.30755,
-            "79": 0.30953,
-            "80": 0.3085,
-            "81": 0.31023,
-            "82": 0.30621,
-            "83": 0.30705,
-            "84": 0.31232,
-            "85": 0.30864,
-            "86": 0.31017,
-            "87": 0.3124,
-            "88": 0.30667,
-            "89": 0.31086,
-            "90": 0.31626,
-            "91": 0.30744,
-            "92": 0.30887,
-            "93": 0.31054,
-            "94": 0.31172,
-            "95": 0.31164,
-            "96": 0.31058,
-            "97": 0.31089,
-            "98": 0.30676,
-            "99": 0.3105,
-            "100": 0.31337
+            "1": 5.48931,
+            "2": 0.38781,
+            "3": 0.30745,
+            "4": 0.29469,
+            "5": 0.29328,
+            "6": 0.29844,
+            "7": 0.29347,
+            "8": 0.29314,
+            "9": 0.29281,
+            "10": 0.29323,
+            "11": 0.29135,
+            "12": 0.29127,
+            "13": 0.2914,
+            "14": 0.29074,
+            "15": 0.29691,
+            "16": 0.30283,
+            "17": 0.29988,
+            "18": 0.29873,
+            "19": 0.29704,
+            "20": 0.29912,
+            "21": 0.30262,
+            "22": 0.30204,
+            "23": 0.30199,
+            "24": 0.30225,
+            "25": 0.30036,
+            "26": 0.29842,
+            "27": 0.29878,
+            "28": 0.29797,
+            "29": 0.29719,
+            "30": 0.29875,
+            "31": 0.29743,
+            "32": 0.2987,
+            "33": 0.29958,
+            "34": 0.29843,
+            "35": 0.29886,
+            "36": 0.29816,
+            "37": 0.29796,
+            "38": 0.29796,
+            "39": 0.29692,
+            "40": 0.29756,
+            "41": 0.29712,
+            "42": 0.29674,
+            "43": 0.29758,
+            "44": 0.2971,
+            "45": 0.29798,
+            "46": 0.29812,
+            "47": 0.29773,
+            "48": 0.30095,
+            "49": 0.29437,
+            "50": 0.29498,
+            "51": 0.33787,
+            "52": 0.29219,
+            "53": 0.29371,
+            "54": 0.29832,
+            "55": 0.28876,
+            "56": 0.28903,
+            "57": 0.29103,
+            "58": 0.29066,
+            "59": 0.28874,
+            "60": 0.289,
+            "61": 0.28856,
+            "62": 0.2897,
+            "63": 0.28854,
+            "64": 0.28899,
+            "65": 0.29126,
+            "66": 0.28906,
+            "67": 0.28978,
+            "68": 0.28897,
+            "69": 0.2889,
+            "70": 0.28915,
+            "71": 0.28827,
+            "72": 0.28768,
+            "73": 0.28843,
+            "74": 0.28863,
+            "75": 0.28877,
+            "76": 0.28811,
+            "77": 0.28855,
+            "78": 0.28804,
+            "79": 0.28833,
+            "80": 0.28882,
+            "81": 0.28873,
+            "82": 0.28884,
+            "83": 0.28861,
+            "84": 0.28901,
+            "85": 0.28795,
+            "86": 0.28814,
+            "87": 0.28857,
+            "88": 0.288,
+            "89": 0.28839,
+            "90": 0.28805,
+            "91": 0.28918,
+            "92": 0.2879,
+            "93": 0.28927,
+            "94": 0.28862,
+            "95": 0.28972,
+            "96": 0.28939,
+            "97": 0.288,
+            "98": 0.28768,
+            "99": 0.28865,
+            "100": 0.28729
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..11130fada71
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.86073,
+            "52": 9.76122,
+            "53": 10.08416,
+            "54": 9.96562,
+            "55": 9.89126,
+            "56": 9.63921,
+            "57": 9.4936,
+            "58": 9.83868,
+            "59": 9.59625,
+            "60": 9.50906,
+            "61": 9.7054,
+            "62": 9.99515,
+            "63": 9.38097,
+            "64": 9.78219,
+            "65": 8.95965,
+            "66": 9.71003,
+            "67": 9.38014,
+            "68": 9.78828,
+            "69": 9.79431,
+            "70": 9.7352,
+            "71": 9.62218,
+            "72": 9.58801,
+            "73": 9.49717,
+            "74": 8.94242,
+            "75": 9.43221,
+            "76": 9.09754,
+            "77": 10.06851,
+            "78": 9.73059,
+            "79": 9.37757,
+            "80": 9.41117,
+            "81": 9.48633,
+            "82": 9.69758,
+            "83": 9.3167,
+            "84": 9.42152,
+            "85": 9.61504,
+            "86": 9.07627,
+            "87": 9.59883,
+            "88": 9.75043,
+            "89": 9.61229,
+            "90": 9.82365,
+            "91": 9.35377,
+            "92": 9.36527,
+            "93": 9.08834,
+            "94": 8.83612,
+            "95": 9.52265,
+            "96": 9.52736,
+            "97": 9.31693,
+            "98": 9.67961,
+            "99": 8.89278,
+            "100": 9.40806
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2810.0,
+            "52": 2895.0,
+            "53": 3212.0,
+            "54": 2965.0,
+            "55": 2665.0,
+            "56": 3040.0,
+            "57": 2570.0,
+            "58": 3275.0,
+            "59": 3010.0,
+            "60": 2665.0,
+            "61": 3106.0,
+            "62": 2811.0,
+            "63": 2762.0,
+            "64": 3180.0,
+            "65": 2941.0,
+            "66": 3474.0,
+            "67": 2980.0,
+            "68": 3013.0,
+            "69": 3189.0,
+            "70": 3464.0,
+            "71": 3128.0,
+            "72": 2493.0,
+            "73": 3343.0,
+            "74": 2172.0,
+            "75": 2799.0,
+            "76": 3444.0,
+            "77": 3549.0,
+            "78": 3550.0,
+            "79": 3566.0,
+            "80": 3729.0,
+            "81": 3979.0,
+            "82": 3652.0,
+            "83": 3217.0,
+            "84": 3597.0,
+            "85": 3661.0,
+            "86": 3069.0,
+            "87": 4117.0,
+            "88": 3340.0,
+            "89": 3817.0,
+            "90": 3476.0,
+            "91": 3025.0,
+            "92": 3456.0,
+            "93": 2943.0,
+            "94": 3710.0,
+            "95": 3705.0,
+            "96": 3758.0,
+            "97": 3465.0,
+            "98": 4041.0,
+            "99": 3360.0,
+            "100": 3639.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 232398336.0,
+            "52": 232398336.0,
+            "53": 232398336.0,
+            "54": 232398336.0,
+            "55": 232398336.0,
+            "56": 232398336.0,
+            "57": 232398336.0,
+            "58": 232398336.0,
+            "59": 232398336.0,
+            "60": 232398336.0,
+            "61": 232398336.0,
+            "62": 232398336.0,
+            "63": 232398336.0,
+            "64": 232398336.0,
+            "65": 232398336.0,
+            "66": 232398336.0,
+            "67": 232398336.0,
+            "68": 232398336.0,
+            "69": 232398336.0,
+            "70": 232398336.0,
+            "71": 232398336.0,
+            "72": 232398336.0,
+            "73": 232398336.0,
+            "74": 232398336.0,
+            "75": 232398336.0,
+            "76": 232398336.0,
+            "77": 232398336.0,
+            "78": 232398336.0,
+            "79": 232398336.0,
+            "80": 232398336.0,
+            "81": 232398336.0,
+            "82": 232398336.0,
+            "83": 232398336.0,
+            "84": 232398336.0,
+            "85": 232398336.0,
+            "86": 232398336.0,
+            "87": 232398336.0,
+            "88": 232398336.0,
+            "89": 232398336.0,
+            "90": 232398336.0,
+            "91": 232398336.0,
+            "92": 232398336.0,
+            "93": 232398336.0,
+            "94": 232398336.0,
+            "95": 232398336.0,
+            "96": 232398336.0,
+            "97": 232398336.0,
+            "98": 232398336.0,
+            "99": 232398336.0,
+            "100": 232398336.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 777437184.0,
+            "52": 777437184.0,
+            "53": 777438720.0,
+            "54": 777438720.0,
+            "55": 777438720.0,
+            "56": 777438720.0,
+            "57": 777438720.0,
+            "58": 777438720.0,
+            "59": 777438720.0,
+            "60": 777438720.0,
+            "61": 777438720.0,
+            "62": 777440768.0,
+            "63": 777440768.0,
+            "64": 777440768.0,
+            "65": 777440768.0,
+            "66": 777440768.0,
+            "67": 777440768.0,
+            "68": 777440768.0,
+            "69": 777440768.0,
+            "70": 777440768.0,
+            "71": 777440768.0,
+            "72": 777440768.0,
+            "73": 777440768.0,
+            "74": 777440768.0,
+            "75": 777440768.0,
+            "76": 777440768.0,
+            "77": 777440768.0,
+            "78": 777440768.0,
+            "79": 777440768.0,
+            "80": 777440768.0,
+            "81": 777440768.0,
+            "82": 777440768.0,
+            "83": 777440768.0,
+            "84": 777440768.0,
+            "85": 777440768.0,
+            "86": 777440768.0,
+            "87": 777440768.0,
+            "88": 777440768.0,
+            "89": 777440768.0,
+            "90": 777440768.0,
+            "91": 777440768.0,
+            "92": 777440768.0,
+            "93": 777440768.0,
+            "94": 777440768.0,
+            "95": 777440768.0,
+            "96": 777440768.0,
+            "97": 777440768.0,
+            "98": 777440768.0,
+            "99": 777440768.0,
+            "100": 777440768.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8.32391,
+            "52": 0.3398,
+            "53": 0.30756,
+            "54": 0.30697,
+            "55": 0.2935,
+            "56": 0.29413,
+            "57": 0.29396,
+            "58": 0.29456,
+            "59": 0.29233,
+            "60": 0.2939,
+            "61": 0.29443,
+            "62": 0.2943,
+            "63": 0.29432,
+            "64": 0.2932,
+            "65": 0.29355,
+            "66": 0.29184,
+            "67": 0.29158,
+            "68": 0.29084,
+            "69": 0.29172,
+            "70": 0.29363,
+            "71": 0.29168,
+            "72": 0.29019,
+            "73": 0.28966,
+            "74": 0.29246,
+            "75": 0.29011,
+            "76": 0.29057,
+            "77": 0.29091,
+            "78": 0.29324,
+            "79": 0.29066,
+            "80": 0.29107,
+            "81": 0.29294,
+            "82": 0.29221,
+            "83": 0.29236,
+            "84": 0.29186,
+            "85": 0.29093,
+            "86": 0.29169,
+            "87": 0.29216,
+            "88": 0.29208,
+            "89": 0.29119,
+            "90": 0.29052,
+            "91": 0.29071,
+            "92": 0.29077,
+            "93": 0.2924,
+            "94": 0.29099,
+            "95": 0.29258,
+            "96": 0.29081,
+            "97": 0.29179,
+            "98": 0.29109,
+            "99": 0.29355,
+            "100": 0.29202
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..12b113ac52d
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.77465,
+            "2": 10.7833,
+            "3": 10.78415,
+            "4": 10.75096,
+            "5": 10.82178,
+            "6": 10.82451,
+            "7": 10.79285,
+            "8": 10.78381,
+            "9": 10.79131,
+            "10": 10.75104,
+            "11": 10.84159,
+            "12": 10.81851,
+            "13": 10.83696,
+            "14": 10.84049,
+            "15": 10.79327,
+            "16": 10.80444,
+            "17": 10.78857,
+            "18": 10.80645,
+            "19": 10.81337,
+            "20": 10.78432,
+            "21": 10.80011,
+            "22": 10.7354,
+            "23": 10.82878,
+            "24": 10.76463,
+            "25": 10.73732,
+            "26": 10.75952,
+            "27": 10.78402,
+            "28": 10.78532,
+            "29": 10.78911,
+            "30": 10.67902,
+            "31": 10.56799,
+            "32": 10.71676,
+            "33": 10.71733,
+            "34": 10.59866,
+            "35": 10.60045,
+            "36": 10.56736,
+            "37": 10.62419,
+            "38": 10.53217,
+            "39": 10.64858,
+            "40": 10.44562,
+            "41": 10.49812,
+            "42": 10.52883,
+            "43": 10.27436,
+            "44": 10.33638,
+            "45": 10.24984,
+            "46": 10.23698,
+            "47": 10.43825,
+            "48": 10.22783,
+            "49": 10.00196,
+            "50": 10.24562,
+            "51": 10.20107,
+            "52": 10.10861,
+            "53": 10.3403,
+            "54": 10.23893,
+            "55": 10.19008,
+            "56": 9.96159,
+            "57": 9.82417,
+            "58": 10.10904,
+            "59": 9.9041,
+            "60": 9.82045,
+            "61": 9.96789,
+            "62": 10.19934,
+            "63": 9.66196,
+            "64": 10.00416,
+            "65": 9.2675,
+            "66": 9.92466,
+            "67": 9.62367,
+            "68": 9.98499,
+            "69": 9.98524,
+            "70": 9.92553,
+            "71": 9.81785,
+            "72": 9.77816,
+            "73": 9.67402,
+            "74": 9.16615,
+            "75": 9.59935,
+            "76": 9.2754,
+            "77": 10.18639,
+            "78": 9.86592,
+            "79": 9.52838,
+            "80": 9.55132,
+            "81": 9.63037,
+            "82": 9.82843,
+            "83": 9.47009,
+            "84": 9.5424,
+            "85": 9.74228,
+            "86": 9.20711,
+            "87": 9.70433,
+            "88": 9.86745,
+            "89": 9.72062,
+            "90": 9.9304,
+            "91": 9.471,
+            "92": 9.47539,
+            "93": 9.21193,
+            "94": 8.94879,
+            "95": 9.62951,
+            "96": 9.63936,
+            "97": 9.40708,
+            "98": 9.77232,
+            "99": 9.01139,
+            "100": 9.51718
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 454770688.0,
+            "2": 454770688.0,
+            "3": 454770688.0,
+            "4": 454770688.0,
+            "5": 454770688.0,
+            "6": 454770688.0,
+            "7": 454770688.0,
+            "8": 454770688.0,
+            "9": 454770688.0,
+            "10": 454770688.0,
+            "11": 454770688.0,
+            "12": 454770688.0,
+            "13": 454770688.0,
+            "14": 454770688.0,
+            "15": 454770688.0,
+            "16": 454770688.0,
+            "17": 454770688.0,
+            "18": 518880768.0,
+            "19": 518880768.0,
+            "20": 518880768.0,
+            "21": 518880768.0,
+            "22": 518880768.0,
+            "23": 518880768.0,
+            "24": 518880768.0,
+            "25": 518880768.0,
+            "26": 518880768.0,
+            "27": 518880768.0,
+            "28": 518880768.0,
+            "29": 518880768.0,
+            "30": 518880768.0,
+            "31": 518880768.0,
+            "32": 518880768.0,
+            "33": 518880768.0,
+            "34": 518880768.0,
+            "35": 518880768.0,
+            "36": 518880768.0,
+            "37": 518880768.0,
+            "38": 518880768.0,
+            "39": 518880768.0,
+            "40": 518880768.0,
+            "41": 518880768.0,
+            "42": 518880768.0,
+            "43": 518880768.0,
+            "44": 518880768.0,
+            "45": 518880768.0,
+            "46": 518880768.0,
+            "47": 518880768.0,
+            "48": 518880768.0,
+            "49": 518880768.0,
+            "50": 518880768.0,
+            "51": 518880768.0,
+            "52": 518880768.0,
+            "53": 518880768.0,
+            "54": 518880768.0,
+            "55": 518880768.0,
+            "56": 518880768.0,
+            "57": 518880768.0,
+            "58": 518880768.0,
+            "59": 518880768.0,
+            "60": 518880768.0,
+            "61": 518880768.0,
+            "62": 518880768.0,
+            "63": 518880768.0,
+            "64": 518880768.0,
+            "65": 518880768.0,
+            "66": 518880768.0,
+            "67": 518880768.0,
+            "68": 518880768.0,
+            "69": 518880768.0,
+            "70": 518880768.0,
+            "71": 518880768.0,
+            "72": 518880768.0,
+            "73": 518880768.0,
+            "74": 518880768.0,
+            "75": 518880768.0,
+            "76": 518880768.0,
+            "77": 518880768.0,
+            "78": 518880768.0,
+            "79": 518880768.0,
+            "80": 518880768.0,
+            "81": 518880768.0,
+            "82": 518880768.0,
+            "83": 518880768.0,
+            "84": 518880768.0,
+            "85": 518880768.0,
+            "86": 518880768.0,
+            "87": 518880768.0,
+            "88": 518880768.0,
+            "89": 518880768.0,
+            "90": 518880768.0,
+            "91": 518880768.0,
+            "92": 518880768.0,
+            "93": 518880768.0,
+            "94": 518880768.0,
+            "95": 518880768.0,
+            "96": 518880768.0,
+            "97": 518880768.0,
+            "98": 518880768.0,
+            "99": 518880768.0,
+            "100": 518880768.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4511150592.0,
+            "2": 4544705536.0,
+            "3": 4544705536.0,
+            "4": 4544705536.0,
+            "5": 4544705536.0,
+            "6": 4544705536.0,
+            "7": 4544705536.0,
+            "8": 4544705536.0,
+            "9": 4544705536.0,
+            "10": 4544705536.0,
+            "11": 4544705536.0,
+            "12": 4544705536.0,
+            "13": 4544705536.0,
+            "14": 4544705536.0,
+            "15": 4544705536.0,
+            "16": 4544705536.0,
+            "17": 4544705536.0,
+            "18": 4544705536.0,
+            "19": 4607767040.0,
+            "20": 4607767040.0,
+            "21": 4607767040.0,
+            "22": 4607767040.0,
+            "23": 4607767040.0,
+            "24": 4607767040.0,
+            "25": 4607767040.0,
+            "26": 4607767040.0,
+            "27": 4607767040.0,
+            "28": 4607767040.0,
+            "29": 4607767040.0,
+            "30": 4607767040.0,
+            "31": 4607767040.0,
+            "32": 4607767040.0,
+            "33": 4607767040.0,
+            "34": 4607767040.0,
+            "35": 4607767040.0,
+            "36": 4607767040.0,
+            "37": 4607767040.0,
+            "38": 4607767040.0,
+            "39": 4607767040.0,
+            "40": 4607767040.0,
+            "41": 4607767040.0,
+            "42": 4607767040.0,
+            "43": 4607767040.0,
+            "44": 4607767040.0,
+            "45": 4607767040.0,
+            "46": 4607767040.0,
+            "47": 4607767040.0,
+            "48": 4607767040.0,
+            "49": 4607767040.0,
+            "50": 4607767040.0,
+            "51": 4607767040.0,
+            "52": 4607767040.0,
+            "53": 4607767040.0,
+            "54": 4607767040.0,
+            "55": 4607767040.0,
+            "56": 4607767040.0,
+            "57": 4607767040.0,
+            "58": 4607767040.0,
+            "59": 4607767040.0,
+            "60": 4607767040.0,
+            "61": 4607767040.0,
+            "62": 4607767040.0,
+            "63": 4607767040.0,
+            "64": 4607767040.0,
+            "65": 4607767040.0,
+            "66": 4607767040.0,
+            "67": 4607767040.0,
+            "68": 4607767040.0,
+            "69": 4607767040.0,
+            "70": 4607767040.0,
+            "71": 4607767040.0,
+            "72": 4607767040.0,
+            "73": 4607767040.0,
+            "74": 4607767040.0,
+            "75": 4607767040.0,
+            "76": 4607767040.0,
+            "77": 4607767040.0,
+            "78": 4607767040.0,
+            "79": 4607767040.0,
+            "80": 4607767040.0,
+            "81": 4607767040.0,
+            "82": 4607767040.0,
+            "83": 4607767040.0,
+            "84": 4607767040.0,
+            "85": 4607767040.0,
+            "86": 4607767040.0,
+            "87": 4607767040.0,
+            "88": 4607767040.0,
+            "89": 4607767040.0,
+            "90": 4607767040.0,
+            "91": 4607767040.0,
+            "92": 4607767040.0,
+            "93": 4607767040.0,
+            "94": 4607767040.0,
+            "95": 4607767040.0,
+            "96": 4607767040.0,
+            "97": 4607767040.0,
+            "98": 4607767040.0,
+            "99": 4607767040.0,
+            "100": 4607767040.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.48216,
+            "3": 0.10038,
+            "4": 0.08588,
+            "5": 0.08467,
+            "6": 0.08488,
+            "7": 0.08474,
+            "8": 0.08597,
+            "9": 0.0851,
+            "10": 0.08607,
+            "11": 0.08606,
+            "12": 0.08597,
+            "13": 0.08652,
+            "14": 0.08774,
+            "15": 0.08628,
+            "16": 0.0847,
+            "17": 0.08455,
+            "18": 0.13013,
+            "19": 0.09834,
+            "20": 0.09375,
+            "21": 0.09358,
+            "22": 0.09463,
+            "23": 0.094,
+            "24": 0.09339,
+            "25": 0.09356,
+            "26": 0.09394,
+            "27": 0.095,
+            "28": 0.09502,
+            "29": 0.09472,
+            "30": 0.0953,
+            "31": 0.09574,
+            "32": 0.09524,
+            "33": 0.09617,
+            "34": 0.09524,
+            "35": 0.09477,
+            "36": 0.09409,
+            "37": 0.09554,
+            "38": 0.09481,
+            "39": 0.09427,
+            "40": 0.08957,
+            "41": 0.0952,
+            "42": 0.09493,
+            "43": 0.09445,
+            "44": 0.09424,
+            "45": 0.09619,
+            "46": 0.09583,
+            "47": 0.09462,
+            "48": 0.09189,
+            "49": 0.09344,
+            "50": 0.09111,
+            "51": 0.09793,
+            "52": 0.08604,
+            "53": 0.08487,
+            "54": 0.08374,
+            "55": 0.0848,
+            "56": 0.08313,
+            "57": 0.08774,
+            "58": 0.08284,
+            "59": 0.08383,
+            "60": 0.08368,
+            "61": 0.08436,
+            "62": 0.08368,
+            "63": 0.08273,
+            "64": 0.08275,
+            "65": 0.0846,
+            "66": 0.08337,
+            "67": 0.08515,
+            "68": 0.08341,
+            "69": 0.08418,
+            "70": 0.08487,
+            "71": 0.08388,
+            "72": 0.08281,
+            "73": 0.08364,
+            "74": 0.0827,
+            "75": 0.08268,
+            "76": 0.08269,
+            "77": 0.08411,
+            "78": 0.09377,
+            "79": 0.08743,
+            "80": 0.08891,
+            "81": 0.08977,
+            "82": 0.0911,
+            "83": 0.09108,
+            "84": 0.09091,
+            "85": 0.09179,
+            "86": 0.09209,
+            "87": 0.09134,
+            "88": 0.09198,
+            "89": 0.09153,
+            "90": 0.09199,
+            "91": 0.09404,
+            "92": 0.09022,
+            "93": 0.09001,
+            "94": 0.09097,
+            "95": 0.09188,
+            "96": 0.09181,
+            "97": 0.09309,
+            "98": 0.0924,
+            "99": 0.09355,
+            "100": 0.09423
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": 1182.0,
+            "19": 1459.0,
+            "20": 1095.0,
+            "21": 1330.0,
+            "22": 1161.0,
+            "23": 1304.0,
+            "24": 1066.0,
+            "25": 1173.0,
+            "26": 1103.0,
+            "27": 1248.0,
+            "28": 1563.0,
+            "29": 1403.0,
+            "30": 1351.0,
+            "31": 1034.0,
+            "32": 1168.0,
+            "33": 1379.0,
+            "34": 1252.0,
+            "35": 1161.0,
+            "36": 1121.0,
+            "37": 1454.0,
+            "38": 1335.0,
+            "39": 1505.0,
+            "40": "nan",
+            "41": 1437.0,
+            "42": 1358.0,
+            "43": 1165.0,
+            "44": 1230.0,
+            "45": 1303.0,
+            "46": 1260.0,
+            "47": 1853.0,
+            "48": 1323.0,
+            "49": 1246.0,
+            "50": 1552.0,
+            "51": 1418.0,
+            "52": 1355.0,
+            "53": 1814.0,
+            "54": 1567.0,
+            "55": 1492.0,
+            "56": 1408.0,
+            "57": 1401.0,
+            "58": 1724.0,
+            "59": 1654.0,
+            "60": 1416.0,
+            "61": 1780.0,
+            "62": 1852.0,
+            "63": 1560.0,
+            "64": 1837.0,
+            "65": 1520.0,
+            "66": 1649.0,
+            "67": 1660.0,
+            "68": 1716.0,
+            "69": 1815.0,
+            "70": 2017.0,
+            "71": 2026.0,
+            "72": 1579.0,
+            "73": 1962.0,
+            "74": 1321.0,
+            "75": 1782.0,
+            "76": 1942.0,
+            "77": 2128.0,
+            "78": 2057.0,
+            "79": 1905.0,
+            "80": 2153.0,
+            "81": 2320.0,
+            "82": 2468.0,
+            "83": 1951.0,
+            "84": 2184.0,
+            "85": 2301.0,
+            "86": 1971.0,
+            "87": 2900.0,
+            "88": 2175.0,
+            "89": 2357.0,
+            "90": 2515.0,
+            "91": 1929.0,
+            "92": 2680.0,
+            "93": 2160.0,
+            "94": 2213.0,
+            "95": 2280.0,
+            "96": 2563.0,
+            "97": 2522.0,
+            "98": 2470.0,
+            "99": 2266.0,
+            "100": 2099.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..f9d44558b50
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.20107,
+            "52": 10.10861,
+            "53": 10.3403,
+            "54": 10.23893,
+            "55": 10.19008,
+            "56": 9.96159,
+            "57": 9.82417,
+            "58": 10.10904,
+            "59": 9.9041,
+            "60": 9.82045,
+            "61": 9.96789,
+            "62": 10.19934,
+            "63": 9.66196,
+            "64": 10.00416,
+            "65": 9.2675,
+            "66": 9.92466,
+            "67": 9.62367,
+            "68": 9.98499,
+            "69": 9.98524,
+            "70": 9.92553,
+            "71": 9.81785,
+            "72": 9.77816,
+            "73": 9.67402,
+            "74": 9.16615,
+            "75": 9.59935,
+            "76": 9.2754,
+            "77": 10.18639,
+            "78": 9.86592,
+            "79": 9.52838,
+            "80": 9.55132,
+            "81": 9.63037,
+            "82": 9.82843,
+            "83": 9.47009,
+            "84": 9.5424,
+            "85": 9.74228,
+            "86": 9.20711,
+            "87": 9.70433,
+            "88": 9.86745,
+            "89": 9.72062,
+            "90": 9.9304,
+            "91": 9.471,
+            "92": 9.47539,
+            "93": 9.21193,
+            "94": 8.94879,
+            "95": 9.62951,
+            "96": 9.63936,
+            "97": 9.40708,
+            "98": 9.77232,
+            "99": 9.01139,
+            "100": 9.51718
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1418.0,
+            "52": 1355.0,
+            "53": 1814.0,
+            "54": 1567.0,
+            "55": 1492.0,
+            "56": 1408.0,
+            "57": 1401.0,
+            "58": 1724.0,
+            "59": 1654.0,
+            "60": 1416.0,
+            "61": 1780.0,
+            "62": 1852.0,
+            "63": 1560.0,
+            "64": 1837.0,
+            "65": 1520.0,
+            "66": 1649.0,
+            "67": 1660.0,
+            "68": 1716.0,
+            "69": 1815.0,
+            "70": 2017.0,
+            "71": 2026.0,
+            "72": 1579.0,
+            "73": 1962.0,
+            "74": 1321.0,
+            "75": 1782.0,
+            "76": 1942.0,
+            "77": 2128.0,
+            "78": 2057.0,
+            "79": 1905.0,
+            "80": 2153.0,
+            "81": 2320.0,
+            "82": 2468.0,
+            "83": 1951.0,
+            "84": 2184.0,
+            "85": 2301.0,
+            "86": 1971.0,
+            "87": 2900.0,
+            "88": 2175.0,
+            "89": 2357.0,
+            "90": 2515.0,
+            "91": 1929.0,
+            "92": 2680.0,
+            "93": 2160.0,
+            "94": 2213.0,
+            "95": 2280.0,
+            "96": 2563.0,
+            "97": 2522.0,
+            "98": 2470.0,
+            "99": 2266.0,
+            "100": 2099.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 518880768.0,
+            "52": 518880768.0,
+            "53": 518880768.0,
+            "54": 518880768.0,
+            "55": 518880768.0,
+            "56": 518880768.0,
+            "57": 518880768.0,
+            "58": 518880768.0,
+            "59": 518880768.0,
+            "60": 518880768.0,
+            "61": 518880768.0,
+            "62": 518880768.0,
+            "63": 518880768.0,
+            "64": 518880768.0,
+            "65": 518880768.0,
+            "66": 518880768.0,
+            "67": 518880768.0,
+            "68": 518880768.0,
+            "69": 518880768.0,
+            "70": 518880768.0,
+            "71": 518880768.0,
+            "72": 518880768.0,
+            "73": 518880768.0,
+            "74": 518880768.0,
+            "75": 518880768.0,
+            "76": 518880768.0,
+            "77": 518880768.0,
+            "78": 518880768.0,
+            "79": 518880768.0,
+            "80": 518880768.0,
+            "81": 518880768.0,
+            "82": 518880768.0,
+            "83": 518880768.0,
+            "84": 518880768.0,
+            "85": 518880768.0,
+            "86": 518880768.0,
+            "87": 518880768.0,
+            "88": 518880768.0,
+            "89": 518880768.0,
+            "90": 518880768.0,
+            "91": 518880768.0,
+            "92": 518880768.0,
+            "93": 518880768.0,
+            "94": 518880768.0,
+            "95": 518880768.0,
+            "96": 518880768.0,
+            "97": 518880768.0,
+            "98": 518880768.0,
+            "99": 518880768.0,
+            "100": 518880768.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4575260160.0,
+            "52": 4608815616.0,
+            "53": 4608815616.0,
+            "54": 4608815616.0,
+            "55": 4608815616.0,
+            "56": 4608815616.0,
+            "57": 4608815616.0,
+            "58": 4608815616.0,
+            "59": 4608815616.0,
+            "60": 4608815616.0,
+            "61": 4608815616.0,
+            "62": 4608815616.0,
+            "63": 4608815616.0,
+            "64": 4608815616.0,
+            "65": 4608815616.0,
+            "66": 4608815616.0,
+            "67": 4608815616.0,
+            "68": 4608815616.0,
+            "69": 4608815616.0,
+            "70": 4608815616.0,
+            "71": 4608815616.0,
+            "72": 4608815616.0,
+            "73": 4608815616.0,
+            "74": 4608815616.0,
+            "75": 4608815616.0,
+            "76": 4608815616.0,
+            "77": 4608815616.0,
+            "78": 4608815616.0,
+            "79": 4608815616.0,
+            "80": 4608815616.0,
+            "81": 4608815616.0,
+            "82": 4608815616.0,
+            "83": 4608815616.0,
+            "84": 4608815616.0,
+            "85": 4608815616.0,
+            "86": 4608815616.0,
+            "87": 4608815616.0,
+            "88": 4608815616.0,
+            "89": 4608815616.0,
+            "90": 4608815616.0,
+            "91": 4608815616.0,
+            "92": 4608815616.0,
+            "93": 4608815616.0,
+            "94": 4608815616.0,
+            "95": 4608815616.0,
+            "96": 4608815616.0,
+            "97": 4608815616.0,
+            "98": 4608815616.0,
+            "99": 4608815616.0,
+            "100": 4608815616.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.23866,
+            "53": 0.10658,
+            "54": 0.0966,
+            "55": 0.09594,
+            "56": 0.0952,
+            "57": 0.09585,
+            "58": 0.09524,
+            "59": 0.09415,
+            "60": 0.09333,
+            "61": 0.09407,
+            "62": 0.09371,
+            "63": 0.09249,
+            "64": 0.09383,
+            "65": 0.09373,
+            "66": 0.09367,
+            "67": 0.09283,
+            "68": 0.09259,
+            "69": 0.09221,
+            "70": 0.09229,
+            "71": 0.09427,
+            "72": 0.09373,
+            "73": 0.09768,
+            "74": 0.09797,
+            "75": 0.09776,
+            "76": 0.09553,
+            "77": 0.09265,
+            "78": 0.09359,
+            "79": 0.09433,
+            "80": 0.09424,
+            "81": 0.09558,
+            "82": 0.09536,
+            "83": 0.09601,
+            "84": 0.09284,
+            "85": 0.09339,
+            "86": 0.09417,
+            "87": 0.09462,
+            "88": 0.09587,
+            "89": 0.09335,
+            "90": 0.0933,
+            "91": 0.09312,
+            "92": 0.09369,
+            "93": 0.0928,
+            "94": 0.09412,
+            "95": 0.09343,
+            "96": 0.09295,
+            "97": 0.09368,
+            "98": 0.09289,
+            "99": 0.09643,
+            "100": 0.09451
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
index 8828025e4b4..3a9ea635606 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.49307,
-            "2": 0.10356,
-            "3": 0.08062,
-            "4": 0.0772,
-            "5": 0.07555,
-            "6": 0.06677,
-            "7": 0.06434,
-            "8": 0.06228,
-            "9": 0.0624,
-            "10": 0.06213,
-            "11": 0.06353,
-            "12": 0.0622,
-            "13": 0.06377,
-            "14": 0.06323,
-            "15": 0.06296,
-            "16": 0.06251,
-            "17": 0.06382,
-            "18": 0.11433,
-            "19": 0.07262,
-            "20": 0.07222,
-            "21": 0.07613,
-            "22": 0.06977,
-            "23": 0.06664,
-            "24": 0.07256,
-            "25": 0.07344,
-            "26": 0.0723,
-            "27": 0.07264,
-            "28": 0.0697,
-            "29": 0.06998,
-            "30": 0.06785,
-            "31": 0.07022,
-            "32": 0.06834,
-            "33": 0.06679,
-            "34": 0.0678,
-            "35": 0.0679,
-            "36": 0.0679,
-            "37": 0.06826,
-            "38": 0.06821,
-            "39": 0.0665,
-            "40": 0.06798,
-            "41": 0.06816,
-            "42": 0.06816,
-            "43": 0.06901,
-            "44": 0.06772,
-            "45": 0.06849,
-            "46": 0.06843,
-            "47": 0.06773,
-            "48": 0.06705,
-            "49": 0.06755,
-            "50": 0.06844,
-            "51": 0.0971,
-            "52": 0.06968,
-            "53": 0.06915,
-            "54": 0.06982,
-            "55": 0.0703,
-            "56": 0.07014,
-            "57": 0.07047,
-            "58": 0.06835,
-            "59": 0.07077,
-            "60": 0.06886,
-            "61": 0.06929,
-            "62": 0.06887,
-            "63": 0.06946,
-            "64": 0.06924,
-            "65": 0.06987,
-            "66": 0.06898,
-            "67": 0.06873,
-            "68": 0.0695,
-            "69": 0.0712,
-            "70": 0.06928,
-            "71": 0.0692,
-            "72": 0.07014,
-            "73": 0.06964,
-            "74": 0.06884,
-            "75": 0.06897,
-            "76": 0.07036,
-            "77": 0.0693,
-            "78": 0.06905,
-            "79": 0.0698,
-            "80": 0.06831,
-            "81": 0.06969,
-            "82": 0.06871,
-            "83": 0.07059,
-            "84": 0.06905,
-            "85": 0.06955,
-            "86": 0.06926,
-            "87": 0.06905,
-            "88": 0.06912,
-            "89": 0.07039,
-            "90": 0.06895,
-            "91": 0.069,
-            "92": 0.0698,
-            "93": 0.06946,
-            "94": 0.06825,
-            "95": 0.06933,
-            "96": 0.06851,
-            "97": 0.06883,
-            "98": 0.07421,
-            "99": 0.06926,
-            "100": 0.07018
+            "1": 6.7553,
+            "2": 0.07914,
+            "3": 0.06117,
+            "4": 0.04713,
+            "5": 0.04562,
+            "6": 0.04484,
+            "7": 0.0455,
+            "8": 0.04532,
+            "9": 0.04653,
+            "10": 0.04527,
+            "11": 0.04526,
+            "12": 0.04531,
+            "13": 0.04513,
+            "14": 0.04589,
+            "15": 0.04523,
+            "16": 0.04566,
+            "17": 0.04513,
+            "18": 0.09054,
+            "19": 0.05227,
+            "20": 0.05014,
+            "21": 0.04995,
+            "22": 0.04766,
+            "23": 0.04999,
+            "24": 0.05005,
+            "25": 0.0502,
+            "26": 0.04945,
+            "27": 0.04968,
+            "28": 0.04977,
+            "29": 0.0497,
+            "30": 0.04986,
+            "31": 0.04983,
+            "32": 0.04954,
+            "33": 0.04965,
+            "34": 0.04976,
+            "35": 0.05148,
+            "36": 0.05049,
+            "37": 0.05043,
+            "38": 0.04961,
+            "39": 0.04968,
+            "40": 0.05011,
+            "41": 0.05085,
+            "42": 0.05148,
+            "43": 0.05043,
+            "44": 0.05134,
+            "45": 0.05258,
+            "46": 0.05004,
+            "47": 0.04988,
+            "48": 0.052,
+            "49": 0.05001,
+            "50": 0.05024,
+            "51": 0.05928,
+            "52": 0.05229,
+            "53": 0.05133,
+            "54": 0.04954,
+            "55": 0.05183,
+            "56": 0.0499,
+            "57": 0.05371,
+            "58": 0.05294,
+            "59": 0.05143,
+            "60": 0.05245,
+            "61": 0.05128,
+            "62": 0.05258,
+            "63": 0.05117,
+            "64": 0.05002,
+            "65": 0.05116,
+            "66": 0.04965,
+            "67": 0.05087,
+            "68": 0.04976,
+            "69": 0.05059,
+            "70": 0.05074,
+            "71": 0.05146,
+            "72": 0.04996,
+            "73": 0.05053,
+            "74": 0.04997,
+            "75": 0.05102,
+            "76": 0.04952,
+            "77": 0.05026,
+            "78": 0.05047,
+            "79": 0.05054,
+            "80": 0.05018,
+            "81": 0.05082,
+            "82": 0.05081,
+            "83": 0.05053,
+            "84": 0.05027,
+            "85": 0.05039,
+            "86": 0.05101,
+            "87": 0.05996,
+            "88": 0.05963,
+            "89": 0.05999,
+            "90": 0.05955,
+            "91": 0.05033,
+            "92": 0.05028,
+            "93": 0.05134,
+            "94": 0.05022,
+            "95": 0.05076,
+            "96": 0.05004,
+            "97": 0.05109,
+            "98": 0.05023,
+            "99": 0.05058,
+            "100": 0.05028
         }
     },
     "num-zeros": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..a47b77f353b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.25547,
+            "52": 10.15856,
+            "53": 10.38114,
+            "54": 10.2992,
+            "55": 10.23806,
+            "56": 10.00726,
+            "57": 9.87765,
+            "58": 10.15279,
+            "59": 9.94207,
+            "60": 9.8666,
+            "61": 10.00032,
+            "62": 10.23443,
+            "63": 9.71917,
+            "64": 10.04209,
+            "65": 9.30009,
+            "66": 9.95537,
+            "67": 9.6499,
+            "68": 10.00402,
+            "69": 9.99988,
+            "70": 9.96383,
+            "71": 9.84259,
+            "72": 9.81258,
+            "73": 9.70921,
+            "74": 9.19832,
+            "75": 9.61686,
+            "76": 9.28859,
+            "77": 10.20416,
+            "78": 9.88378,
+            "79": 9.54296,
+            "80": 9.57095,
+            "81": 9.64006,
+            "82": 9.83648,
+            "83": 9.47691,
+            "84": 9.54866,
+            "85": 9.75198,
+            "86": 9.21427,
+            "87": 9.70607,
+            "88": 9.87307,
+            "89": 9.72876,
+            "90": 9.92353,
+            "91": 9.48236,
+            "92": 9.47671,
+            "93": 9.20895,
+            "94": 8.9625,
+            "95": 9.62369,
+            "96": 9.64228,
+            "97": 9.41575,
+            "98": 9.77515,
+            "99": 9.00692,
+            "100": 9.51305
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1367.0,
+            "52": 1372.0,
+            "53": 1715.0,
+            "54": 1485.0,
+            "55": 1482.0,
+            "56": 1473.0,
+            "57": 1539.0,
+            "58": 1736.0,
+            "59": 1661.0,
+            "60": 1586.0,
+            "61": 1691.0,
+            "62": 1865.0,
+            "63": 1395.0,
+            "64": 1846.0,
+            "65": 1428.0,
+            "66": 1717.0,
+            "67": 1700.0,
+            "68": 1750.0,
+            "69": 1681.0,
+            "70": 1861.0,
+            "71": 2048.0,
+            "72": 1552.0,
+            "73": 2010.0,
+            "74": 1344.0,
+            "75": 1840.0,
+            "76": 1846.0,
+            "77": 2034.0,
+            "78": 2170.0,
+            "79": 1949.0,
+            "80": 2077.0,
+            "81": 2381.0,
+            "82": 2390.0,
+            "83": 1843.0,
+            "84": 2060.0,
+            "85": 2317.0,
+            "86": 1958.0,
+            "87": 2829.0,
+            "88": 2046.0,
+            "89": 2260.0,
+            "90": 2545.0,
+            "91": 1801.0,
+            "92": 2505.0,
+            "93": 2064.0,
+            "94": 2223.0,
+            "95": 2379.0,
+            "96": 2579.0,
+            "97": 2411.0,
+            "98": 2500.0,
+            "99": 2124.0,
+            "100": 2119.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 518880768.0,
+            "52": 518880768.0,
+            "53": 518880768.0,
+            "54": 518880768.0,
+            "55": 518880768.0,
+            "56": 518880768.0,
+            "57": 518880768.0,
+            "58": 518880768.0,
+            "59": 518880768.0,
+            "60": 518880768.0,
+            "61": 518880768.0,
+            "62": 518880768.0,
+            "63": 518880768.0,
+            "64": 518880768.0,
+            "65": 518880768.0,
+            "66": 518880768.0,
+            "67": 518880768.0,
+            "68": 518880768.0,
+            "69": 518880768.0,
+            "70": 518880768.0,
+            "71": 518880768.0,
+            "72": 518880768.0,
+            "73": 518880768.0,
+            "74": 518880768.0,
+            "75": 518880768.0,
+            "76": 518880768.0,
+            "77": 518880768.0,
+            "78": 518880768.0,
+            "79": 518880768.0,
+            "80": 518880768.0,
+            "81": 518880768.0,
+            "82": 518880768.0,
+            "83": 518880768.0,
+            "84": 518880768.0,
+            "85": 518880768.0,
+            "86": 518880768.0,
+            "87": 518880768.0,
+            "88": 518880768.0,
+            "89": 518880768.0,
+            "90": 518880768.0,
+            "91": 518880768.0,
+            "92": 518880768.0,
+            "93": 518880768.0,
+            "94": 518880768.0,
+            "95": 518880768.0,
+            "96": 518880768.0,
+            "97": 518880768.0,
+            "98": 518880768.0,
+            "99": 518880768.0,
+            "100": 518880768.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4575260160.0,
+            "52": 4608815616.0,
+            "53": 4608815616.0,
+            "54": 4608815616.0,
+            "55": 4608815616.0,
+            "56": 4608815616.0,
+            "57": 4608815616.0,
+            "58": 4608815616.0,
+            "59": 4608815616.0,
+            "60": 4608815616.0,
+            "61": 4608815616.0,
+            "62": 4608815616.0,
+            "63": 4608815616.0,
+            "64": 4608815616.0,
+            "65": 4608815616.0,
+            "66": 4608815616.0,
+            "67": 4608815616.0,
+            "68": 4608815616.0,
+            "69": 4608815616.0,
+            "70": 4608815616.0,
+            "71": 4608815616.0,
+            "72": 4608815616.0,
+            "73": 4608815616.0,
+            "74": 4608815616.0,
+            "75": 4608815616.0,
+            "76": 4608815616.0,
+            "77": 4608815616.0,
+            "78": 4608815616.0,
+            "79": 4608815616.0,
+            "80": 4608815616.0,
+            "81": 4608815616.0,
+            "82": 4608815616.0,
+            "83": 4608815616.0,
+            "84": 4608815616.0,
+            "85": 4608815616.0,
+            "86": 4608815616.0,
+            "87": 4608815616.0,
+            "88": 4608815616.0,
+            "89": 4608815616.0,
+            "90": 4608815616.0,
+            "91": 4608815616.0,
+            "92": 4608815616.0,
+            "93": 4608815616.0,
+            "94": 4608815616.0,
+            "95": 4608815616.0,
+            "96": 4608815616.0,
+            "97": 4608815616.0,
+            "98": 4608815616.0,
+            "99": 4608815616.0,
+            "100": 4608815616.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 5.54199,
+            "52": 0.07932,
+            "53": 0.05296,
+            "54": 0.054,
+            "55": 0.052,
+            "56": 0.05407,
+            "57": 0.05285,
+            "58": 0.05383,
+            "59": 0.05227,
+            "60": 0.05363,
+            "61": 0.053,
+            "62": 0.05361,
+            "63": 0.05195,
+            "64": 0.05507,
+            "65": 0.05368,
+            "66": 0.05324,
+            "67": 0.05188,
+            "68": 0.05445,
+            "69": 0.05222,
+            "70": 0.05356,
+            "71": 0.05169,
+            "72": 0.05424,
+            "73": 0.05264,
+            "74": 0.05364,
+            "75": 0.0521,
+            "76": 0.05373,
+            "77": 0.05341,
+            "78": 0.05388,
+            "79": 0.05224,
+            "80": 0.05393,
+            "81": 0.05706,
+            "82": 0.05358,
+            "83": 0.05191,
+            "84": 0.05339,
+            "85": 0.05302,
+            "86": 0.05343,
+            "87": 0.05297,
+            "88": 0.0535,
+            "89": 0.05264,
+            "90": 0.05485,
+            "91": 0.05422,
+            "92": 0.05329,
+            "93": 0.0539,
+            "94": 0.05526,
+            "95": 0.05238,
+            "96": 0.05607,
+            "97": 0.05259,
+            "98": 0.0561,
+            "99": 0.05354,
+            "100": 0.05479
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
index 8d29fc96a7f..36d741d6e7d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 4.2592,
-            "2": 0.13544,
-            "3": 0.09999,
-            "4": 0.08273,
-            "5": 0.08157,
-            "6": 0.08266,
-            "7": 0.08111,
-            "8": 0.08184,
-            "9": 0.08109,
-            "10": 0.08281,
-            "11": 0.08041,
-            "12": 0.08186,
-            "13": 0.08098,
-            "14": 0.08513,
-            "15": 0.0821,
-            "16": 0.08144,
-            "17": 0.08052,
-            "18": 0.13091,
-            "19": 0.08819,
-            "20": 0.08804,
-            "21": 0.08818,
-            "22": 0.08404,
-            "23": 0.08729,
-            "24": 0.08805,
-            "25": 0.08736,
-            "26": 0.08811,
-            "27": 0.08757,
-            "28": 0.08887,
-            "29": 0.08961,
-            "30": 0.0883,
-            "31": 0.08788,
-            "32": 0.08884,
-            "33": 0.08833,
-            "34": 0.08811,
-            "35": 0.08831,
-            "36": 0.08859,
-            "37": 0.08809,
-            "38": 0.08879,
-            "39": 0.08769,
-            "40": 0.0883,
-            "41": 0.08757,
-            "42": 0.08797,
-            "43": 0.08669,
-            "44": 0.08751,
-            "45": 0.08893,
-            "46": 0.08862,
-            "47": 0.08698,
-            "48": 0.089,
-            "49": 0.08841,
-            "50": 0.08813,
-            "51": 0.09282,
-            "52": 0.08991,
-            "53": 0.08846,
-            "54": 0.08878,
-            "55": 0.08875,
-            "56": 0.0897,
-            "57": 0.0888,
-            "58": 0.08814,
-            "59": 0.08821,
-            "60": 0.08782,
-            "61": 0.0888,
-            "62": 0.08762,
-            "63": 0.08743,
-            "64": 0.0879,
-            "65": 0.08877,
-            "66": 0.08656,
-            "67": 0.08681,
-            "68": 0.08654,
-            "69": 0.08705,
-            "70": 0.08667,
-            "71": 0.08696,
-            "72": 0.08664,
-            "73": 0.08625,
-            "74": 0.08667,
-            "75": 0.08656,
-            "76": 0.08557,
-            "77": 0.08578,
-            "78": 0.08586,
-            "79": 0.08584,
-            "80": 0.08576,
-            "81": 0.08653,
-            "82": 0.08572,
-            "83": 0.08613,
-            "84": 0.08557,
-            "85": 0.08616,
-            "86": 0.08608,
-            "87": 0.08563,
-            "88": 0.08581,
-            "89": 0.08591,
-            "90": 0.08593,
-            "91": 0.08543,
-            "92": 0.08641,
-            "93": 0.08635,
-            "94": 0.08549,
-            "95": 0.08554,
-            "96": 0.08487,
-            "97": 0.08505,
-            "98": 0.08522,
-            "99": 0.08533,
-            "100": 0.08544
+            "1": 3.39236,
+            "2": 0.197,
+            "3": 0.09014,
+            "4": 0.07513,
+            "5": 0.07608,
+            "6": 0.07565,
+            "7": 0.07606,
+            "8": 0.07571,
+            "9": 0.07584,
+            "10": 0.07549,
+            "11": 0.07619,
+            "12": 0.0756,
+            "13": 0.07585,
+            "14": 0.07487,
+            "15": 0.07654,
+            "16": 0.07517,
+            "17": 0.07637,
+            "18": 0.13134,
+            "19": 0.08507,
+            "20": 0.08208,
+            "21": 0.08338,
+            "22": 0.07828,
+            "23": 0.08267,
+            "24": 0.08242,
+            "25": 0.08322,
+            "26": 0.08222,
+            "27": 0.08351,
+            "28": 0.08234,
+            "29": 0.08375,
+            "30": 0.08306,
+            "31": 0.0837,
+            "32": 0.08544,
+            "33": 0.08325,
+            "34": 0.08234,
+            "35": 0.08499,
+            "36": 0.08373,
+            "37": 0.08247,
+            "38": 0.08204,
+            "39": 0.08354,
+            "40": 0.0837,
+            "41": 0.08325,
+            "42": 0.08545,
+            "43": 0.08233,
+            "44": 0.08294,
+            "45": 0.084,
+            "46": 0.08215,
+            "47": 0.08346,
+            "48": 0.08195,
+            "49": 0.08269,
+            "50": 0.08321,
+            "51": 0.08664,
+            "52": 0.08023,
+            "53": 0.08003,
+            "54": 0.07979,
+            "55": 0.08188,
+            "56": 0.07966,
+            "57": 0.08281,
+            "58": 0.0797,
+            "59": 0.07943,
+            "60": 0.07926,
+            "61": 0.07894,
+            "62": 0.07941,
+            "63": 0.07952,
+            "64": 0.07973,
+            "65": 0.07964,
+            "66": 0.07938,
+            "67": 0.07972,
+            "68": 0.07922,
+            "69": 0.07931,
+            "70": 0.07926,
+            "71": 0.07906,
+            "72": 0.08086,
+            "73": 0.07934,
+            "74": 0.07975,
+            "75": 0.07939,
+            "76": 0.07948,
+            "77": 0.07896,
+            "78": 0.07961,
+            "79": 0.0798,
+            "80": 0.07961,
+            "81": 0.07923,
+            "82": 0.07921,
+            "83": 0.07905,
+            "84": 0.07972,
+            "85": 0.08027,
+            "86": 0.08062,
+            "87": 0.08419,
+            "88": 0.08051,
+            "89": 0.08041,
+            "90": 0.08078,
+            "91": 0.08039,
+            "92": 0.08075,
+            "93": 0.0801,
+            "94": 0.08,
+            "95": 0.0799,
+            "96": 0.08114,
+            "97": 0.07987,
+            "98": 0.08062,
+            "99": 0.08014,
+            "100": 0.08015
         }
     },
     "num-zeros": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..8c96fb071fc
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.23775,
+            "52": 10.15443,
+            "53": 10.36085,
+            "54": 10.26927,
+            "55": 10.2161,
+            "56": 9.99594,
+            "57": 9.8744,
+            "58": 10.14007,
+            "59": 9.93447,
+            "60": 9.84864,
+            "61": 9.98549,
+            "62": 10.2164,
+            "63": 9.69034,
+            "64": 10.0182,
+            "65": 9.30046,
+            "66": 9.9355,
+            "67": 9.63051,
+            "68": 9.99128,
+            "69": 9.9852,
+            "70": 9.92463,
+            "71": 9.81436,
+            "72": 9.79481,
+            "73": 9.68082,
+            "74": 9.1945,
+            "75": 9.60407,
+            "76": 9.28537,
+            "77": 10.18507,
+            "78": 9.86718,
+            "79": 9.52407,
+            "80": 9.55749,
+            "81": 9.62863,
+            "82": 9.81568,
+            "83": 9.45708,
+            "84": 9.53654,
+            "85": 9.73266,
+            "86": 9.20138,
+            "87": 9.69524,
+            "88": 9.85412,
+            "89": 9.71648,
+            "90": 9.91047,
+            "91": 9.45992,
+            "92": 9.46603,
+            "93": 9.19321,
+            "94": 8.94,
+            "95": 9.60607,
+            "96": 9.62214,
+            "97": 9.39796,
+            "98": 9.76023,
+            "99": 8.99097,
+            "100": 9.49505
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1508.0,
+            "52": 1400.0,
+            "53": 1740.0,
+            "54": 1498.0,
+            "55": 1551.0,
+            "56": 1363.0,
+            "57": 1465.0,
+            "58": 1610.0,
+            "59": 1574.0,
+            "60": 1599.0,
+            "61": 1727.0,
+            "62": 1804.0,
+            "63": 1590.0,
+            "64": 1813.0,
+            "65": 1398.0,
+            "66": 1738.0,
+            "67": 1536.0,
+            "68": 1764.0,
+            "69": 1781.0,
+            "70": 1926.0,
+            "71": 1950.0,
+            "72": 1461.0,
+            "73": 1985.0,
+            "74": 1345.0,
+            "75": 1871.0,
+            "76": 1732.0,
+            "77": 2086.0,
+            "78": 2075.0,
+            "79": 1992.0,
+            "80": 2260.0,
+            "81": 2300.0,
+            "82": 2290.0,
+            "83": 1774.0,
+            "84": 2172.0,
+            "85": 2216.0,
+            "86": 2038.0,
+            "87": 2741.0,
+            "88": 2079.0,
+            "89": 2349.0,
+            "90": 2315.0,
+            "91": 1875.0,
+            "92": 2611.0,
+            "93": 2053.0,
+            "94": 2220.0,
+            "95": 2296.0,
+            "96": 2665.0,
+            "97": 2516.0,
+            "98": 2549.0,
+            "99": 2378.0,
+            "100": 2257.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 518880768.0,
+            "52": 518880768.0,
+            "53": 518880768.0,
+            "54": 518880768.0,
+            "55": 518880768.0,
+            "56": 518880768.0,
+            "57": 518880768.0,
+            "58": 518880768.0,
+            "59": 518880768.0,
+            "60": 518880768.0,
+            "61": 518880768.0,
+            "62": 518880768.0,
+            "63": 518880768.0,
+            "64": 518880768.0,
+            "65": 518880768.0,
+            "66": 518880768.0,
+            "67": 518880768.0,
+            "68": 518880768.0,
+            "69": 518880768.0,
+            "70": 518880768.0,
+            "71": 518880768.0,
+            "72": 518880768.0,
+            "73": 518880768.0,
+            "74": 518880768.0,
+            "75": 518880768.0,
+            "76": 518880768.0,
+            "77": 518880768.0,
+            "78": 518880768.0,
+            "79": 518880768.0,
+            "80": 518880768.0,
+            "81": 518880768.0,
+            "82": 518880768.0,
+            "83": 518880768.0,
+            "84": 518880768.0,
+            "85": 518880768.0,
+            "86": 518880768.0,
+            "87": 518880768.0,
+            "88": 518880768.0,
+            "89": 518880768.0,
+            "90": 518880768.0,
+            "91": 518880768.0,
+            "92": 518880768.0,
+            "93": 518880768.0,
+            "94": 518880768.0,
+            "95": 518880768.0,
+            "96": 518880768.0,
+            "97": 518880768.0,
+            "98": 518880768.0,
+            "99": 518880768.0,
+            "100": 518880768.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4575260160.0,
+            "52": 4608815616.0,
+            "53": 4608815616.0,
+            "54": 4608815616.0,
+            "55": 4608815616.0,
+            "56": 4608815616.0,
+            "57": 4608815616.0,
+            "58": 4608815616.0,
+            "59": 4608815616.0,
+            "60": 4608815616.0,
+            "61": 4608815616.0,
+            "62": 4608815616.0,
+            "63": 4608815616.0,
+            "64": 4608815616.0,
+            "65": 4608815616.0,
+            "66": 4608815616.0,
+            "67": 4608815616.0,
+            "68": 4608815616.0,
+            "69": 4608815616.0,
+            "70": 4608815616.0,
+            "71": 4608815616.0,
+            "72": 4608815616.0,
+            "73": 4608815616.0,
+            "74": 4608815616.0,
+            "75": 4608815616.0,
+            "76": 4608815616.0,
+            "77": 4608815616.0,
+            "78": 4608815616.0,
+            "79": 4608815616.0,
+            "80": 4608815616.0,
+            "81": 4608815616.0,
+            "82": 4608815616.0,
+            "83": 4608815616.0,
+            "84": 4608815616.0,
+            "85": 4608815616.0,
+            "86": 4608815616.0,
+            "87": 4608815616.0,
+            "88": 4608815616.0,
+            "89": 4608815616.0,
+            "90": 4608815616.0,
+            "91": 4608815616.0,
+            "92": 4608815616.0,
+            "93": 4608815616.0,
+            "94": 4608815616.0,
+            "95": 4608815616.0,
+            "96": 4608815616.0,
+            "97": 4608815616.0,
+            "98": 4608815616.0,
+            "99": 4608815616.0,
+            "100": 4608815616.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.93441,
+            "52": 0.11442,
+            "53": 0.08582,
+            "54": 0.08444,
+            "55": 0.09374,
+            "56": 0.0841,
+            "57": 0.08368,
+            "58": 0.08327,
+            "59": 0.08219,
+            "60": 0.08174,
+            "61": 0.08125,
+            "62": 0.08336,
+            "63": 0.08247,
+            "64": 0.08267,
+            "65": 0.08048,
+            "66": 0.07988,
+            "67": 0.08016,
+            "68": 0.08086,
+            "69": 0.07938,
+            "70": 0.08047,
+            "71": 0.07981,
+            "72": 0.08021,
+            "73": 0.08023,
+            "74": 0.08133,
+            "75": 0.08002,
+            "76": 0.08063,
+            "77": 0.08008,
+            "78": 0.0809,
+            "79": 0.08014,
+            "80": 0.08071,
+            "81": 0.08057,
+            "82": 0.08093,
+            "83": 0.08114,
+            "84": 0.08102,
+            "85": 0.0806,
+            "86": 0.08267,
+            "87": 0.08027,
+            "88": 0.08002,
+            "89": 0.08059,
+            "90": 0.0802,
+            "91": 0.08028,
+            "92": 0.08007,
+            "93": 0.08034,
+            "94": 0.08004,
+            "95": 0.08085,
+            "96": 0.07942,
+            "97": 0.08025,
+            "98": 0.07962,
+            "99": 0.08071,
+            "100": 0.08017
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..063c93b8168
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.7702,
+            "2": 10.78031,
+            "3": 10.77782,
+            "4": 10.73861,
+            "5": 10.81197,
+            "6": 10.81962,
+            "7": 10.79512,
+            "8": 10.78158,
+            "9": 10.79081,
+            "10": 10.71741,
+            "11": 10.85173,
+            "12": 10.80653,
+            "13": 10.82058,
+            "14": 10.84404,
+            "15": 10.74918,
+            "16": 10.752,
+            "17": 10.70902,
+            "18": 10.752,
+            "19": 10.74635,
+            "20": 10.63769,
+            "21": 10.61672,
+            "22": 10.44317,
+            "23": 10.6675,
+            "24": 10.50949,
+            "25": 10.45557,
+            "26": 10.53435,
+            "27": 10.54753,
+            "28": 10.51646,
+            "29": 10.55435,
+            "30": 10.28785,
+            "31": 10.00156,
+            "32": 10.40963,
+            "33": 10.40243,
+            "34": 10.13341,
+            "35": 10.19694,
+            "36": 10.14213,
+            "37": 10.2869,
+            "38": 10.10508,
+            "39": 10.35217,
+            "40": 10.00199,
+            "41": 10.07363,
+            "42": 10.1522,
+            "43": 9.74558,
+            "44": 9.8738,
+            "45": 9.74764,
+            "46": 9.74951,
+            "47": 10.09152,
+            "48": 9.77892,
+            "49": 9.44822,
+            "50": 9.84214
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1564.0,
+            "2": 1610.0,
+            "3": 1608.0,
+            "4": 1854.0,
+            "5": 1873.0,
+            "6": 1812.0,
+            "7": 1744.0,
+            "8": 1614.0,
+            "9": 1857.0,
+            "10": 1358.0,
+            "11": 1844.0,
+            "12": 1788.0,
+            "13": 1826.0,
+            "14": 1801.0,
+            "15": 1892.0,
+            "16": 1892.0,
+            "17": 1758.0,
+            "18": 1714.0,
+            "19": 1677.0,
+            "20": 1582.0,
+            "21": 1824.0,
+            "22": 1579.0,
+            "23": 1987.0,
+            "24": 1533.0,
+            "25": 1602.0,
+            "26": 1651.0,
+            "27": 1901.0,
+            "28": 2044.0,
+            "29": 1911.0,
+            "30": 1823.0,
+            "31": 1583.0,
+            "32": 1926.0,
+            "33": 2108.0,
+            "34": 1914.0,
+            "35": 2058.0,
+            "36": 1946.0,
+            "37": 2325.0,
+            "38": 2268.0,
+            "39": 2376.0,
+            "40": 2208.0,
+            "41": 2448.0,
+            "42": 2209.0,
+            "43": 1977.0,
+            "44": 2049.0,
+            "45": 2266.0,
+            "46": 2481.0,
+            "47": 2583.0,
+            "48": 2450.0,
+            "49": 2255.0,
+            "50": 2453.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 950750208.0,
+            "2": 950750208.0,
+            "3": 950750208.0,
+            "4": 950750208.0,
+            "5": 950750208.0,
+            "6": 950750208.0,
+            "7": 950750208.0,
+            "8": 950750208.0,
+            "9": 950750208.0,
+            "10": 950750208.0,
+            "11": 950750208.0,
+            "12": 950750208.0,
+            "13": 950750208.0,
+            "14": 950750208.0,
+            "15": 950750208.0,
+            "16": 950750208.0,
+            "17": 950750208.0,
+            "18": 950750208.0,
+            "19": 950750208.0,
+            "20": 950750208.0,
+            "21": 950750208.0,
+            "22": 950750208.0,
+            "23": 950750208.0,
+            "24": 950750208.0,
+            "25": 950750208.0,
+            "26": 950750208.0,
+            "27": 950750208.0,
+            "28": 950750208.0,
+            "29": 950750208.0,
+            "30": 950750208.0,
+            "31": 950750208.0,
+            "32": 950750208.0,
+            "33": 950750208.0,
+            "34": 950750208.0,
+            "35": 950750208.0,
+            "36": 950750208.0,
+            "37": 950750208.0,
+            "38": 950750208.0,
+            "39": 950750208.0,
+            "40": 950750208.0,
+            "41": 950750208.0,
+            "42": 950750208.0,
+            "43": 950750208.0,
+            "44": 950750208.0,
+            "45": 950750208.0,
+            "46": 950750208.0,
+            "47": 950750208.0,
+            "48": 950750208.0,
+            "49": 950750208.0,
+            "50": 950750208.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3275808768.0,
+            "2": 3635274752.0,
+            "3": 3635274752.0,
+            "4": 3635274752.0,
+            "5": 3635274752.0,
+            "6": 3635274752.0,
+            "7": 3635274752.0,
+            "8": 3635274752.0,
+            "9": 3635274752.0,
+            "10": 3635274752.0,
+            "11": 3635274752.0,
+            "12": 3635274752.0,
+            "13": 3635274752.0,
+            "14": 3635274752.0,
+            "15": 3635274752.0,
+            "16": 3635274752.0,
+            "17": 3635274752.0,
+            "18": 3635274752.0,
+            "19": 3635274752.0,
+            "20": 3635274752.0,
+            "21": 3635274752.0,
+            "22": 3635274752.0,
+            "23": 3635274752.0,
+            "24": 3635274752.0,
+            "25": 3635274752.0,
+            "26": 3635274752.0,
+            "27": 3635274752.0,
+            "28": 3635274752.0,
+            "29": 3635274752.0,
+            "30": 3635274752.0,
+            "31": 3635274752.0,
+            "32": 3635274752.0,
+            "33": 3635274752.0,
+            "34": 3635274752.0,
+            "35": 3635274752.0,
+            "36": 3635274752.0,
+            "37": 3635274752.0,
+            "38": 3635274752.0,
+            "39": 3635274752.0,
+            "40": 3635274752.0,
+            "41": 3635274752.0,
+            "42": 3635274752.0,
+            "43": 3635274752.0,
+            "44": 3635274752.0,
+            "45": 3635274752.0,
+            "46": 3635274752.0,
+            "47": 3635274752.0,
+            "48": 3635274752.0,
+            "49": 3635274752.0,
+            "50": 3635274752.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.42188,
+            "3": 0.13001,
+            "4": 0.11595,
+            "5": 0.1162,
+            "6": 0.11616,
+            "7": 0.11716,
+            "8": 0.11796,
+            "9": 0.1186,
+            "10": 0.11774,
+            "11": 0.11769,
+            "12": 0.12298,
+            "13": 0.11717,
+            "14": 0.11738,
+            "15": 0.11771,
+            "16": 0.11772,
+            "17": 0.11737,
+            "18": 0.11751,
+            "19": 0.11697,
+            "20": 0.11822,
+            "21": 0.11647,
+            "22": 0.11724,
+            "23": 0.11721,
+            "24": 0.11677,
+            "25": 0.1171,
+            "26": 0.11775,
+            "27": 0.11748,
+            "28": 0.11705,
+            "29": 0.11727,
+            "30": 0.11693,
+            "31": 0.11818,
+            "32": 0.11738,
+            "33": 0.11726,
+            "34": 0.11675,
+            "35": 0.11722,
+            "36": 0.11753,
+            "37": 0.11779,
+            "38": 0.11683,
+            "39": 0.11725,
+            "40": 0.11779,
+            "41": 0.11794,
+            "42": 0.11724,
+            "43": 0.11807,
+            "44": 0.11744,
+            "45": 0.12537,
+            "46": 0.11752,
+            "47": 0.11739,
+            "48": 0.11765,
+            "49": 0.11763,
+            "50": 0.11812
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
index 6660a5e446e..1d24a32a8d8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json
@@ -13,47 +13,47 @@
             "7": 10.8645,
             "8": 10.87335,
             "9": 10.87481,
-            "10": 10.83903,
-            "11": 10.86614,
-            "12": 10.86169,
-            "13": 10.87354,
-            "14": 10.87593,
-            "15": 10.8216,
-            "16": 10.83071,
-            "17": 10.79411,
-            "18": 10.81433,
-            "19": 10.80011,
-            "20": 10.71697,
-            "21": 10.70154,
-            "22": 10.57235,
-            "23": 10.70749,
+            "10": 10.83904,
+            "11": 10.86613,
+            "12": 10.86168,
+            "13": 10.87357,
+            "14": 10.87594,
+            "15": 10.82161,
+            "16": 10.83073,
+            "17": 10.79408,
+            "18": 10.8143,
+            "19": 10.80009,
+            "20": 10.71695,
+            "21": 10.70153,
+            "22": 10.57236,
+            "23": 10.70752,
             "24": 10.6006,
-            "25": 10.5566,
-            "26": 10.60138,
-            "27": 10.60955,
+            "25": 10.55655,
+            "26": 10.60135,
+            "27": 10.60957,
             "28": 10.55626,
             "29": 10.57268,
             "30": 10.36032,
-            "31": 10.11454,
-            "32": 10.45937,
-            "33": 10.45389,
-            "34": 10.21168,
-            "35": 10.26583,
+            "31": 10.11449,
+            "32": 10.45933,
+            "33": 10.45392,
+            "34": 10.21171,
+            "35": 10.26576,
             "36": 10.21483,
-            "37": 10.34814,
-            "38": 10.19787,
-            "39": 10.39713,
-            "40": 10.08719,
-            "41": 10.13539,
-            "42": 10.20638,
+            "37": 10.34811,
+            "38": 10.19788,
+            "39": 10.39711,
+            "40": 10.08718,
+            "41": 10.13538,
+            "42": 10.20634,
             "43": 9.82769,
-            "44": 9.95444,
-            "45": 9.82374,
-            "46": 9.79864,
-            "47": 10.12579,
+            "44": 9.9545,
+            "45": 9.82372,
+            "46": 9.79866,
+            "47": 10.12577,
             "48": 9.83547,
             "49": 9.51888,
-            "50": 9.90498
+            "50": 9.90503
         }
     },
     "num-zeros": {
@@ -70,47 +70,47 @@
             "7": 1767.0,
             "8": 1569.0,
             "9": 1750.0,
-            "10": 1413.0,
-            "11": 1746.0,
-            "12": 1681.0,
-            "13": 1828.0,
-            "14": 1739.0,
-            "15": 1801.0,
-            "16": 1895.0,
-            "17": 1781.0,
-            "18": 1693.0,
-            "19": 1705.0,
-            "20": 1624.0,
-            "21": 1838.0,
-            "22": 1792.0,
-            "23": 2005.0,
-            "24": 1601.0,
-            "25": 1483.0,
-            "26": 1615.0,
-            "27": 1844.0,
-            "28": 1961.0,
-            "29": 2012.0,
-            "30": 1856.0,
-            "31": 1502.0,
-            "32": 1794.0,
-            "33": 2118.0,
-            "34": 1742.0,
-            "35": 1953.0,
-            "36": 1940.0,
-            "37": 2324.0,
-            "38": 2109.0,
-            "39": 2369.0,
-            "40": 2183.0,
-            "41": 2063.0,
-            "42": 2232.0,
-            "43": 1917.0,
-            "44": 2084.0,
-            "45": 2058.0,
-            "46": 2144.0,
-            "47": 2488.0,
-            "48": 2407.0,
-            "49": 2125.0,
-            "50": 2134.0
+            "10": 1414.0,
+            "11": 1784.0,
+            "12": 1661.0,
+            "13": 1936.0,
+            "14": 1687.0,
+            "15": 1669.0,
+            "16": 1868.0,
+            "17": 1820.0,
+            "18": 1629.0,
+            "19": 1716.0,
+            "20": 1626.0,
+            "21": 1933.0,
+            "22": 1647.0,
+            "23": 1979.0,
+            "24": 1578.0,
+            "25": 1542.0,
+            "26": 1628.0,
+            "27": 1829.0,
+            "28": 1896.0,
+            "29": 2005.0,
+            "30": 1921.0,
+            "31": 1471.0,
+            "32": 1826.0,
+            "33": 2012.0,
+            "34": 1767.0,
+            "35": 1973.0,
+            "36": 1933.0,
+            "37": 2208.0,
+            "38": 2138.0,
+            "39": 2260.0,
+            "40": 2112.0,
+            "41": 2164.0,
+            "42": 2152.0,
+            "43": 2044.0,
+            "44": 2055.0,
+            "45": 2076.0,
+            "46": 2166.0,
+            "47": 2472.0,
+            "48": 2425.0,
+            "49": 2218.0,
+            "50": 2135.0
         }
     },
     "mem-allocated-bytes": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 8.92875,
-            "2": 0.12034,
-            "3": 0.10184,
-            "4": 0.10215,
-            "5": 0.10291,
-            "6": 0.10167,
-            "7": 0.09936,
-            "8": 0.10097,
-            "9": 0.10127,
-            "10": 0.10171,
-            "11": 0.10013,
-            "12": 0.09898,
-            "13": 0.10085,
-            "14": 0.10081,
-            "15": 0.10088,
-            "16": 0.10002,
-            "17": 0.0999,
-            "18": 0.10168,
-            "19": 0.10032,
-            "20": 0.09815,
-            "21": 0.10018,
-            "22": 0.09914,
-            "23": 0.1005,
-            "24": 0.10106,
-            "25": 0.10086,
-            "26": 0.10152,
-            "27": 0.1,
-            "28": 0.10161,
-            "29": 0.10038,
-            "30": 0.10045,
-            "31": 0.10187,
-            "32": 0.10055,
-            "33": 0.11357,
-            "34": 0.10266,
-            "35": 0.10298,
-            "36": 0.10061,
-            "37": 0.10166,
-            "38": 0.10185,
-            "39": 0.09925,
-            "40": 0.10087,
-            "41": 0.10001,
-            "42": 0.1,
-            "43": 0.10286,
-            "44": 0.10227,
-            "45": 0.10327,
-            "46": 0.10041,
-            "47": 0.10091,
-            "48": 0.10215,
-            "49": 0.10017,
-            "50": 0.10055
+            "1": 7.02529,
+            "2": 0.11863,
+            "3": 0.10057,
+            "4": 0.09906,
+            "5": 0.08104,
+            "6": 0.08043,
+            "7": 0.08243,
+            "8": 0.08119,
+            "9": 0.08111,
+            "10": 0.08055,
+            "11": 0.08084,
+            "12": 0.0797,
+            "13": 0.07988,
+            "14": 0.08069,
+            "15": 0.08072,
+            "16": 0.08026,
+            "17": 0.08022,
+            "18": 0.08048,
+            "19": 0.08013,
+            "20": 0.08102,
+            "21": 0.08145,
+            "22": 0.08021,
+            "23": 0.08046,
+            "24": 0.082,
+            "25": 0.08075,
+            "26": 0.08017,
+            "27": 0.08064,
+            "28": 0.07978,
+            "29": 0.08107,
+            "30": 0.08431,
+            "31": 0.08022,
+            "32": 0.08061,
+            "33": 0.07995,
+            "34": 0.08117,
+            "35": 0.0796,
+            "36": 0.08069,
+            "37": 0.08194,
+            "38": 0.08127,
+            "39": 0.07932,
+            "40": 0.07929,
+            "41": 0.0796,
+            "42": 0.08162,
+            "43": 0.07964,
+            "44": 0.08019,
+            "45": 0.07997,
+            "46": 0.07935,
+            "47": 0.08025,
+            "48": 0.08073,
+            "49": 0.07999,
+            "50": 0.08013
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json
index bdc8c7f9895..4ba6ee523cb 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json
@@ -51,8 +51,8 @@
             "45": 9.81584,
             "46": 9.80638,
             "47": 10.12803,
-            "48": 9.82444,
-            "49": 9.50618,
+            "48": 9.82443,
+            "49": 9.50621,
             "50": 9.89067
         }
     },
@@ -108,9 +108,9 @@
             "45": 2123.0,
             "46": 2194.0,
             "47": 2463.0,
-            "48": 2382.0,
-            "49": 2300.0,
-            "50": 2397.0
+            "48": 2345.0,
+            "49": 2282.0,
+            "50": 2366.0
         }
     },
     "mem-allocated-bytes": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5.86972,
-            "2": 0.17288,
-            "3": 0.13781,
-            "4": 0.13826,
-            "5": 0.13569,
-            "6": 0.13252,
-            "7": 0.1323,
-            "8": 0.13208,
-            "9": 0.13237,
-            "10": 0.13177,
-            "11": 0.13164,
-            "12": 0.135,
-            "13": 0.13389,
-            "14": 0.13431,
-            "15": 0.13376,
-            "16": 0.1342,
-            "17": 0.13348,
-            "18": 0.13307,
-            "19": 0.13389,
-            "20": 0.13476,
-            "21": 0.13346,
-            "22": 0.13333,
-            "23": 0.13336,
-            "24": 0.13304,
-            "25": 0.13373,
-            "26": 0.13283,
-            "27": 0.1331,
-            "28": 0.13314,
-            "29": 0.13299,
-            "30": 0.13362,
-            "31": 0.13392,
-            "32": 0.13417,
-            "33": 0.13406,
-            "34": 0.13351,
-            "35": 0.13357,
-            "36": 0.13345,
-            "37": 0.13422,
-            "38": 0.1339,
-            "39": 0.13419,
-            "40": 0.13437,
-            "41": 0.13425,
-            "42": 0.13364,
-            "43": 0.13389,
-            "44": 0.13482,
-            "45": 0.13461,
-            "46": 0.134,
-            "47": 0.13363,
-            "48": 0.13416,
-            "49": 0.13386,
-            "50": 0.13343
+            "1": 3.53163,
+            "2": 0.15986,
+            "3": 0.14465,
+            "4": 0.12865,
+            "5": 0.12866,
+            "6": 0.12781,
+            "7": 0.12812,
+            "8": 0.12748,
+            "9": 0.12785,
+            "10": 0.12793,
+            "11": 0.12738,
+            "12": 0.12687,
+            "13": 0.1279,
+            "14": 0.12794,
+            "15": 0.12688,
+            "16": 0.12657,
+            "17": 0.12699,
+            "18": 0.12571,
+            "19": 0.1268,
+            "20": 0.12768,
+            "21": 0.12608,
+            "22": 0.12935,
+            "23": 0.12731,
+            "24": 0.12623,
+            "25": 0.1265,
+            "26": 0.12691,
+            "27": 0.12618,
+            "28": 0.12745,
+            "29": 0.12715,
+            "30": 0.12731,
+            "31": 0.12861,
+            "32": 0.12807,
+            "33": 0.12763,
+            "34": 0.1264,
+            "35": 0.12674,
+            "36": 0.12628,
+            "37": 0.12628,
+            "38": 0.12709,
+            "39": 0.12704,
+            "40": 0.12669,
+            "41": 0.12716,
+            "42": 0.12677,
+            "43": 0.12874,
+            "44": 0.12646,
+            "45": 0.12761,
+            "46": 0.12827,
+            "47": 0.12648,
+            "48": 0.12642,
+            "49": 0.12646,
+            "50": 0.12636
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json
index 0d13ca5c55f..7077541e896 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.83568, "5": 10.8567, "10": 10.81478, "15": 10.85098, "20": 10.85865, "25": 10.81343, "30": 10.74969, "35": 10.65857, "40": 10.50359, "45": 10.2738, "50": 10.25588, "55": 10.18782, "60": 9.80901, "65": 9.24475, "70": 9.91039, "75": 9.5812, "80": 9.54102, "85": 9.72633, "90": 9.90316, "95": 9.60258, "100": 9.49405}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 684610560.0, "5": 685659136.0, "10": 685659136.0, "15": 685659136.0, "20": 1043027456.0, "25": 1043027456.0, "30": 1043027456.0, "35": 1043027456.0, "40": 1043027456.0, "45": 1043027456.0, "50": 1043027456.0, "55": 1043027456.0, "60": 1043027456.0, "65": 1043027456.0, "70": 1043027456.0, "75": 1043027456.0, "80": 1043027456.0, "85": 1043027456.0, "90": 1043027456.0, "95": 1043027456.0, "100": 1043027456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3187304960.0, "5": 3187305472.0, "10": 3187305472.0, "15": 3187305472.0, "20": 3544935936.0, "25": 3544935936.0, "30": 3544935936.0, "35": 3544935936.0, "40": 3544935936.0, "45": 3544935936.0, "50": 3544935936.0, "55": 3544935936.0, "60": 3544935936.0, "65": 3544935936.0, "70": 3544935936.0, "75": 3544935936.0, "80": 3544935936.0, "85": 3544935936.0, "90": 3544935936.0, "95": 3544935936.0, "100": 3544935936.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 7.24348, "5": 0.12513, "10": 0.12256, "15": 0.12334, "20": 0.13378, "25": 0.14306, "30": 0.13313, "35": 0.13322, "40": 0.13261, "45": 0.13265, "50": 0.13289, "55": 0.13101, "60": 0.13018, "65": 0.13122, "70": 0.12989, "75": 0.13081, "80": 0.13089, "85": 0.13011, "90": 0.1304, "95": 0.13232, "100": 0.13063}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1953.0, "25": 1818.0, "30": 2298.0, "35": 2083.0, "40": 2287.0, "45": 2243.0, "50": 2426.0, "55": 2440.0, "60": 2493.0, "65": 2411.0, "70": 3119.0, "75": 2884.0, "80": 3549.0, "85": 3721.0, "90": 3452.0, "95": 3340.0, "100": 3338.0}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83568,
+            "2": 10.83266,
+            "3": 10.83151,
+            "4": 10.80343,
+            "5": 10.8567,
+            "6": 10.86778,
+            "7": 10.84836,
+            "8": 10.84624,
+            "9": 10.85924,
+            "10": 10.81478,
+            "11": 10.89821,
+            "12": 10.88433,
+            "13": 10.88963,
+            "14": 10.90075,
+            "15": 10.85098,
+            "16": 10.86603,
+            "17": 10.85455,
+            "18": 10.88507,
+            "19": 10.8773,
+            "20": 10.85865,
+            "21": 10.85654,
+            "22": 10.79685,
+            "23": 10.88724,
+            "24": 10.82649,
+            "25": 10.81343,
+            "26": 10.82705,
+            "27": 10.84612,
+            "28": 10.84227,
+            "29": 10.85329,
+            "30": 10.74969,
+            "31": 10.63041,
+            "32": 10.79004,
+            "33": 10.77234,
+            "34": 10.65722,
+            "35": 10.65857,
+            "36": 10.61583,
+            "37": 10.67536,
+            "38": 10.58101,
+            "39": 10.69083,
+            "40": 10.50359,
+            "41": 10.52777,
+            "42": 10.55371,
+            "43": 10.28636,
+            "44": 10.36369,
+            "45": 10.27381,
+            "46": 10.24567,
+            "47": 10.45103,
+            "48": 10.23707,
+            "49": 9.99555,
+            "50": 10.25589,
+            "51": 10.2013,
+            "52": 10.10855,
+            "53": 10.34609,
+            "54": 10.24857,
+            "55": 10.18782,
+            "56": 9.95521,
+            "57": 9.81221,
+            "58": 10.10875,
+            "59": 9.8863,
+            "60": 9.80901,
+            "61": 9.94824,
+            "62": 10.1999,
+            "63": 9.6443,
+            "64": 9.9951,
+            "65": 9.24475,
+            "66": 9.90917,
+            "67": 9.59735,
+            "68": 9.97285,
+            "69": 9.96333,
+            "70": 9.91038,
+            "71": 9.78596,
+            "72": 9.77264,
+            "73": 9.6618,
+            "74": 9.16289,
+            "75": 9.58121,
+            "76": 9.26138,
+            "77": 10.17614,
+            "78": 9.85644,
+            "79": 9.50644,
+            "80": 9.54103,
+            "81": 9.61313,
+            "82": 9.80668,
+            "83": 9.44696,
+            "84": 9.52782,
+            "85": 9.72633,
+            "86": 9.19099,
+            "87": 9.68736,
+            "88": 9.85216,
+            "89": 9.71335,
+            "90": 9.90316,
+            "91": 9.46063,
+            "92": 9.46058,
+            "93": 9.19418,
+            "94": 8.93434,
+            "95": 9.60258,
+            "96": 9.61852,
+            "97": 9.39595,
+            "98": 9.76012,
+            "99": 8.98669,
+            "100": 9.49406
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 684610560.0,
+            "2": 685659136.0,
+            "3": 685659136.0,
+            "4": 685659136.0,
+            "5": 685659136.0,
+            "6": 685659136.0,
+            "7": 685659136.0,
+            "8": 685659136.0,
+            "9": 685659136.0,
+            "10": 685659136.0,
+            "11": 685659136.0,
+            "12": 685659136.0,
+            "13": 685659136.0,
+            "14": 685659136.0,
+            "15": 685659136.0,
+            "16": 685659136.0,
+            "17": 1043027456.0,
+            "18": 1043027456.0,
+            "19": 1043027456.0,
+            "20": 1043027456.0,
+            "21": 1043027456.0,
+            "22": 1043027456.0,
+            "23": 1043027456.0,
+            "24": 1043027456.0,
+            "25": 1043027456.0,
+            "26": 1043027456.0,
+            "27": 1043027456.0,
+            "28": 1043027456.0,
+            "29": 1043027456.0,
+            "30": 1043027456.0,
+            "31": 1043027456.0,
+            "32": 1043027456.0,
+            "33": 1043027456.0,
+            "34": 1043027456.0,
+            "35": 1043027456.0,
+            "36": 1043027456.0,
+            "37": 1043027456.0,
+            "38": 1043027456.0,
+            "39": 1043027456.0,
+            "40": 1043027456.0,
+            "41": 1043027456.0,
+            "42": 1043027456.0,
+            "43": 1043027456.0,
+            "44": 1043027456.0,
+            "45": 1043027456.0,
+            "46": 1043027456.0,
+            "47": 1043027456.0,
+            "48": 1043027456.0,
+            "49": 1043027456.0,
+            "50": 1043027456.0,
+            "51": 1043027456.0,
+            "52": 1043027456.0,
+            "53": 1043027456.0,
+            "54": 1043027456.0,
+            "55": 1043027456.0,
+            "56": 1043027456.0,
+            "57": 1043027456.0,
+            "58": 1043027456.0,
+            "59": 1043027456.0,
+            "60": 1043027456.0,
+            "61": 1043027456.0,
+            "62": 1043027456.0,
+            "63": 1043027456.0,
+            "64": 1043027456.0,
+            "65": 1043027456.0,
+            "66": 1043027456.0,
+            "67": 1043027456.0,
+            "68": 1043027456.0,
+            "69": 1043027456.0,
+            "70": 1043027456.0,
+            "71": 1043027456.0,
+            "72": 1043027456.0,
+            "73": 1043027456.0,
+            "74": 1043027456.0,
+            "75": 1043027456.0,
+            "76": 1043027456.0,
+            "77": 1043027456.0,
+            "78": 1043027456.0,
+            "79": 1043027456.0,
+            "80": 1043027456.0,
+            "81": 1043027456.0,
+            "82": 1043027456.0,
+            "83": 1043027456.0,
+            "84": 1043027456.0,
+            "85": 1043027456.0,
+            "86": 1043027456.0,
+            "87": 1043027456.0,
+            "88": 1043027456.0,
+            "89": 1043027456.0,
+            "90": 1043027456.0,
+            "91": 1043027456.0,
+            "92": 1043027456.0,
+            "93": 1043027456.0,
+            "94": 1043027456.0,
+            "95": 1043027456.0,
+            "96": 1043027456.0,
+            "97": 1043027456.0,
+            "98": 1043027456.0,
+            "99": 1043027456.0,
+            "100": 1043027456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3187304960.0,
+            "2": 3187305472.0,
+            "3": 3187305472.0,
+            "4": 3187305472.0,
+            "5": 3187305472.0,
+            "6": 3187305472.0,
+            "7": 3187305472.0,
+            "8": 3187305472.0,
+            "9": 3187305472.0,
+            "10": 3187305472.0,
+            "11": 3187305472.0,
+            "12": 3187305472.0,
+            "13": 3187305472.0,
+            "14": 3187305472.0,
+            "15": 3187305472.0,
+            "16": 3187305472.0,
+            "17": 3187305472.0,
+            "18": 3544935936.0,
+            "19": 3544935936.0,
+            "20": 3544935936.0,
+            "21": 3544935936.0,
+            "22": 3544935936.0,
+            "23": 3544935936.0,
+            "24": 3544935936.0,
+            "25": 3544935936.0,
+            "26": 3544935936.0,
+            "27": 3544935936.0,
+            "28": 3544935936.0,
+            "29": 3544935936.0,
+            "30": 3544935936.0,
+            "31": 3544935936.0,
+            "32": 3544935936.0,
+            "33": 3544935936.0,
+            "34": 3544935936.0,
+            "35": 3544935936.0,
+            "36": 3544935936.0,
+            "37": 3544935936.0,
+            "38": 3544935936.0,
+            "39": 3544935936.0,
+            "40": 3544935936.0,
+            "41": 3544935936.0,
+            "42": 3544935936.0,
+            "43": 3544935936.0,
+            "44": 3544935936.0,
+            "45": 3544935936.0,
+            "46": 3544935936.0,
+            "47": 3544935936.0,
+            "48": 3544935936.0,
+            "49": 3544935936.0,
+            "50": 3544935936.0,
+            "51": 3544935936.0,
+            "52": 3544935936.0,
+            "53": 3544935936.0,
+            "54": 3544935936.0,
+            "55": 3544935936.0,
+            "56": 3544935936.0,
+            "57": 3544935936.0,
+            "58": 3544935936.0,
+            "59": 3544935936.0,
+            "60": 3544935936.0,
+            "61": 3544935936.0,
+            "62": 3544935936.0,
+            "63": 3544935936.0,
+            "64": 3544935936.0,
+            "65": 3544935936.0,
+            "66": 3544935936.0,
+            "67": 3544935936.0,
+            "68": 3544935936.0,
+            "69": 3544935936.0,
+            "70": 3544935936.0,
+            "71": 3544935936.0,
+            "72": 3544935936.0,
+            "73": 3544935936.0,
+            "74": 3544935936.0,
+            "75": 3544935936.0,
+            "76": 3544935936.0,
+            "77": 3544935936.0,
+            "78": 3544935936.0,
+            "79": 3544935936.0,
+            "80": 3544935936.0,
+            "81": 3544935936.0,
+            "82": 3544935936.0,
+            "83": 3544935936.0,
+            "84": 3544935936.0,
+            "85": 3544935936.0,
+            "86": 3544935936.0,
+            "87": 3544935936.0,
+            "88": 3544935936.0,
+            "89": 3544935936.0,
+            "90": 3544935936.0,
+            "91": 3544935936.0,
+            "92": 3544935936.0,
+            "93": 3544935936.0,
+            "94": 3544935936.0,
+            "95": 3544935936.0,
+            "96": 3544935936.0,
+            "97": 3544935936.0,
+            "98": 3544935936.0,
+            "99": 3544935936.0,
+            "100": 3544935936.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3.54415,
+            "2": 0.13377,
+            "3": 0.12455,
+            "4": 0.10264,
+            "5": 0.10219,
+            "6": 0.10272,
+            "7": 0.10298,
+            "8": 0.10295,
+            "9": 0.10258,
+            "10": 0.10337,
+            "11": 0.10271,
+            "12": 0.10191,
+            "13": 0.10215,
+            "14": 0.10241,
+            "15": 0.10208,
+            "16": 0.10177,
+            "17": 0.15691,
+            "18": 0.11817,
+            "19": 0.10983,
+            "20": 0.10994,
+            "21": 0.11033,
+            "22": 0.10162,
+            "23": 0.11031,
+            "24": 0.11013,
+            "25": 0.11053,
+            "26": 0.11093,
+            "27": 0.10984,
+            "28": 0.10992,
+            "29": 0.10976,
+            "30": 0.11044,
+            "31": 0.11049,
+            "32": 0.1109,
+            "33": 0.11229,
+            "34": 0.11176,
+            "35": 0.11192,
+            "36": 0.1118,
+            "37": 0.11187,
+            "38": 0.11171,
+            "39": 0.1119,
+            "40": 0.11109,
+            "41": 0.11066,
+            "42": 0.11036,
+            "43": 0.11014,
+            "44": 0.11085,
+            "45": 0.11065,
+            "46": 0.11031,
+            "47": 0.11096,
+            "48": 0.11193,
+            "49": 0.11004,
+            "50": 0.11026,
+            "51": 0.12208,
+            "52": 0.11528,
+            "53": 0.11393,
+            "54": 0.11467,
+            "55": 0.1144,
+            "56": 0.11475,
+            "57": 0.1155,
+            "58": 0.11437,
+            "59": 0.11509,
+            "60": 0.11581,
+            "61": 0.11462,
+            "62": 0.11503,
+            "63": 0.1147,
+            "64": 0.11384,
+            "65": 0.1139,
+            "66": 0.11371,
+            "67": 0.11448,
+            "68": 0.11386,
+            "69": 0.11391,
+            "70": 0.11448,
+            "71": 0.11388,
+            "72": 0.1142,
+            "73": 0.11413,
+            "74": 0.11463,
+            "75": 0.11394,
+            "76": 0.11427,
+            "77": 0.11359,
+            "78": 0.11462,
+            "79": 0.11355,
+            "80": 0.11396,
+            "81": 0.11373,
+            "82": 0.11509,
+            "83": 0.11377,
+            "84": 0.11466,
+            "85": 0.1144,
+            "86": 0.11501,
+            "87": 0.11412,
+            "88": 0.11353,
+            "89": 0.1148,
+            "90": 0.1137,
+            "91": 0.11378,
+            "92": 0.12007,
+            "93": 0.1204,
+            "94": 0.11454,
+            "95": 0.11432,
+            "96": 0.11436,
+            "97": 0.11405,
+            "98": 0.11395,
+            "99": 0.11405,
+            "100": 0.11374
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": 2249.0,
+            "18": 2165.0,
+            "19": 2362.0,
+            "20": 1953.0,
+            "21": 1898.0,
+            "22": "nan",
+            "23": 2371.0,
+            "24": 1984.0,
+            "25": 1818.0,
+            "26": 1980.0,
+            "27": 2078.0,
+            "28": 2467.0,
+            "29": 2395.0,
+            "30": 2298.0,
+            "31": 1682.0,
+            "32": 2236.0,
+            "33": 2192.0,
+            "34": 1800.0,
+            "35": 2083.0,
+            "36": 2139.0,
+            "37": 2498.0,
+            "38": 2218.0,
+            "39": 2642.0,
+            "40": 2287.0,
+            "41": 2344.0,
+            "42": 2340.0,
+            "43": 2130.0,
+            "44": 2069.0,
+            "45": 2188.0,
+            "46": 1932.0,
+            "47": 2670.0,
+            "48": 2471.0,
+            "49": 1891.0,
+            "50": 2416.0,
+            "51": 2321.0,
+            "52": 2363.0,
+            "53": 2925.0,
+            "54": 2486.0,
+            "55": 2408.0,
+            "56": 2298.0,
+            "57": 2286.0,
+            "58": 2584.0,
+            "59": 2358.0,
+            "60": 2487.0,
+            "61": 2791.0,
+            "62": 2751.0,
+            "63": 2385.0,
+            "64": 2791.0,
+            "65": 2372.0,
+            "66": 2970.0,
+            "67": 2557.0,
+            "68": 2857.0,
+            "69": 2699.0,
+            "70": 3035.0,
+            "71": 2940.0,
+            "72": 2315.0,
+            "73": 2968.0,
+            "74": 2205.0,
+            "75": 2811.0,
+            "76": 2969.0,
+            "77": 3296.0,
+            "78": 3578.0,
+            "79": 3594.0,
+            "80": 3509.0,
+            "81": 3698.0,
+            "82": 3355.0,
+            "83": 3205.0,
+            "84": 3285.0,
+            "85": 3791.0,
+            "86": 3303.0,
+            "87": 3934.0,
+            "88": 3130.0,
+            "89": 3809.0,
+            "90": 3388.0,
+            "91": 2618.0,
+            "92": 3412.0,
+            "93": 3072.0,
+            "94": 3731.0,
+            "95": 3357.0,
+            "96": 3852.0,
+            "97": 3528.0,
+            "98": 3616.0,
+            "99": 3449.0,
+            "100": 3284.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json
new file mode 100644
index 00000000000..562afadc7f9
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.2013,
+            "52": 10.10855,
+            "53": 10.34609,
+            "54": 10.24857,
+            "55": 10.18782,
+            "56": 9.95521,
+            "57": 9.81221,
+            "58": 10.10875,
+            "59": 9.8863,
+            "60": 9.80901,
+            "61": 9.94824,
+            "62": 10.1999,
+            "63": 9.6443,
+            "64": 9.9951,
+            "65": 9.24475,
+            "66": 9.90917,
+            "67": 9.59735,
+            "68": 9.97285,
+            "69": 9.96333,
+            "70": 9.91038,
+            "71": 9.78596,
+            "72": 9.77264,
+            "73": 9.6618,
+            "74": 9.16289,
+            "75": 9.58121,
+            "76": 9.26138,
+            "77": 10.17614,
+            "78": 9.85644,
+            "79": 9.50644,
+            "80": 9.54103,
+            "81": 9.61313,
+            "82": 9.80668,
+            "83": 9.44696,
+            "84": 9.52782,
+            "85": 9.72633,
+            "86": 9.19099,
+            "87": 9.68736,
+            "88": 9.85216,
+            "89": 9.71335,
+            "90": 9.90316,
+            "91": 9.46063,
+            "92": 9.46058,
+            "93": 9.19418,
+            "94": 8.93434,
+            "95": 9.60258,
+            "96": 9.61852,
+            "97": 9.39595,
+            "98": 9.76012,
+            "99": 8.98669,
+            "100": 9.49406
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2321.0,
+            "52": 2363.0,
+            "53": 2925.0,
+            "54": 2486.0,
+            "55": 2408.0,
+            "56": 2298.0,
+            "57": 2286.0,
+            "58": 2584.0,
+            "59": 2358.0,
+            "60": 2487.0,
+            "61": 2791.0,
+            "62": 2751.0,
+            "63": 2385.0,
+            "64": 2791.0,
+            "65": 2372.0,
+            "66": 2970.0,
+            "67": 2557.0,
+            "68": 2857.0,
+            "69": 2699.0,
+            "70": 3035.0,
+            "71": 2940.0,
+            "72": 2315.0,
+            "73": 2968.0,
+            "74": 2205.0,
+            "75": 2811.0,
+            "76": 2969.0,
+            "77": 3296.0,
+            "78": 3578.0,
+            "79": 3594.0,
+            "80": 3509.0,
+            "81": 3698.0,
+            "82": 3355.0,
+            "83": 3205.0,
+            "84": 3285.0,
+            "85": 3791.0,
+            "86": 3303.0,
+            "87": 3934.0,
+            "88": 3130.0,
+            "89": 3809.0,
+            "90": 3388.0,
+            "91": 2618.0,
+            "92": 3412.0,
+            "93": 3072.0,
+            "94": 3731.0,
+            "95": 3357.0,
+            "96": 3852.0,
+            "97": 3528.0,
+            "98": 3616.0,
+            "99": 3449.0,
+            "100": 3284.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1044076032.0,
+            "52": 1045124608.0,
+            "53": 1045124608.0,
+            "54": 1045124608.0,
+            "55": 1045124608.0,
+            "56": 1045124608.0,
+            "57": 1045124608.0,
+            "58": 1045124608.0,
+            "59": 1045124608.0,
+            "60": 1045124608.0,
+            "61": 1045124608.0,
+            "62": 1045124608.0,
+            "63": 1045124608.0,
+            "64": 1045124608.0,
+            "65": 1045124608.0,
+            "66": 1045124608.0,
+            "67": 1045124608.0,
+            "68": 1045124608.0,
+            "69": 1045124608.0,
+            "70": 1045124608.0,
+            "71": 1045124608.0,
+            "72": 1045124608.0,
+            "73": 1045124608.0,
+            "74": 1045124608.0,
+            "75": 1045124608.0,
+            "76": 1045124608.0,
+            "77": 1045124608.0,
+            "78": 1045124608.0,
+            "79": 1045124608.0,
+            "80": 1045124608.0,
+            "81": 1045124608.0,
+            "82": 1045124608.0,
+            "83": 1045124608.0,
+            "84": 1045124608.0,
+            "85": 1045124608.0,
+            "86": 1045124608.0,
+            "87": 1045124608.0,
+            "88": 1045124608.0,
+            "89": 1045124608.0,
+            "90": 1045124608.0,
+            "91": 1045124608.0,
+            "92": 1045124608.0,
+            "93": 1045124608.0,
+            "94": 1045124608.0,
+            "95": 1045124608.0,
+            "96": 1045124608.0,
+            "97": 1045124608.0,
+            "98": 1045124608.0,
+            "99": 1045124608.0,
+            "100": 1045124608.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3546769920.0,
+            "52": 3546770944.0,
+            "53": 3546770944.0,
+            "54": 3546770944.0,
+            "55": 3546770944.0,
+            "56": 3546770944.0,
+            "57": 3546770944.0,
+            "58": 3546770944.0,
+            "59": 3546770944.0,
+            "60": 3546770944.0,
+            "61": 3546770944.0,
+            "62": 3546770944.0,
+            "63": 3546770944.0,
+            "64": 3546770944.0,
+            "65": 3546770944.0,
+            "66": 3546770944.0,
+            "67": 3546770944.0,
+            "68": 3546770944.0,
+            "69": 3546770944.0,
+            "70": 3546770944.0,
+            "71": 3546770944.0,
+            "72": 3546770944.0,
+            "73": 3546770944.0,
+            "74": 3546770944.0,
+            "75": 3546770944.0,
+            "76": 3546770944.0,
+            "77": 3546770944.0,
+            "78": 3546770944.0,
+            "79": 3546770944.0,
+            "80": 3546770944.0,
+            "81": 3546770944.0,
+            "82": 3546770944.0,
+            "83": 3546770944.0,
+            "84": 3546770944.0,
+            "85": 3546770944.0,
+            "86": 3546770944.0,
+            "87": 3546770944.0,
+            "88": 3546770944.0,
+            "89": 3546770944.0,
+            "90": 3546770944.0,
+            "91": 3546770944.0,
+            "92": 3546770944.0,
+            "93": 3546770944.0,
+            "94": 3546770944.0,
+            "95": 3546770944.0,
+            "96": 3546770944.0,
+            "97": 3546770944.0,
+            "98": 3546770944.0,
+            "99": 3546770944.0,
+            "100": 3546770944.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.6476,
+            "52": 0.13199,
+            "53": 0.11408,
+            "54": 0.11307,
+            "55": 0.11409,
+            "56": 0.11247,
+            "57": 0.1156,
+            "58": 0.1145,
+            "59": 0.11417,
+            "60": 0.11341,
+            "61": 0.11362,
+            "62": 0.11325,
+            "63": 0.11392,
+            "64": 0.11377,
+            "65": 0.1147,
+            "66": 0.11413,
+            "67": 0.11405,
+            "68": 0.11324,
+            "69": 0.11372,
+            "70": 0.11377,
+            "71": 0.11356,
+            "72": 0.11352,
+            "73": 0.11403,
+            "74": 0.11362,
+            "75": 0.11349,
+            "76": 0.11421,
+            "77": 0.11375,
+            "78": 0.11412,
+            "79": 0.11355,
+            "80": 0.11386,
+            "81": 0.11419,
+            "82": 0.11416,
+            "83": 0.11393,
+            "84": 0.11344,
+            "85": 0.11365,
+            "86": 0.11411,
+            "87": 0.1142,
+            "88": 0.11406,
+            "89": 0.11433,
+            "90": 0.11364,
+            "91": 0.11411,
+            "92": 0.11433,
+            "93": 0.11448,
+            "94": 0.11375,
+            "95": 0.11569,
+            "96": 0.11395,
+            "97": 0.11375,
+            "98": 0.11361,
+            "99": 0.11378,
+            "100": 0.11406
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..aa94d697c53
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.76988,
+            "2": 10.77993,
+            "3": 10.77871,
+            "4": 10.73971,
+            "5": 10.81287,
+            "6": 10.82056,
+            "7": 10.79701,
+            "8": 10.78537,
+            "9": 10.79592,
+            "10": 10.72505,
+            "11": 10.86085,
+            "12": 10.82094,
+            "13": 10.83653,
+            "14": 10.85836,
+            "15": 10.80259,
+            "16": 10.80847,
+            "17": 10.77612,
+            "18": 10.81818,
+            "19": 10.8171,
+            "20": 10.78975,
+            "21": 10.79586,
+            "22": 10.71325,
+            "23": 10.84137,
+            "24": 10.76141,
+            "25": 10.73556,
+            "26": 10.76141,
+            "27": 10.78766,
+            "28": 10.79283,
+            "29": 10.81938,
+            "30": 10.68037,
+            "31": 10.5422,
+            "32": 10.72471,
+            "33": 10.71833,
+            "34": 10.58577,
+            "35": 10.5941,
+            "36": 10.54254,
+            "37": 10.62391,
+            "38": 10.50727,
+            "39": 10.65,
+            "40": 10.42314,
+            "41": 10.45946,
+            "42": 10.50017,
+            "43": 10.20049,
+            "44": 10.28686,
+            "45": 10.1806,
+            "46": 10.168,
+            "47": 10.40733,
+            "48": 10.16626,
+            "49": 9.90217,
+            "50": 10.18179,
+            "51": 10.13864,
+            "52": 10.03803,
+            "53": 10.2953,
+            "54": 10.19383,
+            "55": 10.14359,
+            "56": 9.8908,
+            "57": 9.73702,
+            "58": 10.05022,
+            "59": 9.83828,
+            "60": 9.74551,
+            "61": 9.90679,
+            "62": 10.16216,
+            "63": 9.59842,
+            "64": 9.95194,
+            "65": 9.18904,
+            "66": 9.87164,
+            "67": 9.56047,
+            "68": 9.94233,
+            "69": 9.94285,
+            "70": 9.8854,
+            "71": 9.77852,
+            "72": 9.73861,
+            "73": 9.63511,
+            "74": 9.10351,
+            "75": 9.55716,
+            "76": 9.23197,
+            "77": 10.16792,
+            "78": 9.83943,
+            "79": 9.49691,
+            "80": 9.52327,
+            "81": 9.60219,
+            "82": 9.8054,
+            "83": 9.43936,
+            "84": 9.51953,
+            "85": 9.72086,
+            "86": 9.18604,
+            "87": 9.68762,
+            "88": 9.84868,
+            "89": 9.70441,
+            "90": 9.91638,
+            "91": 9.45088,
+            "92": 9.45495,
+            "93": 9.1952,
+            "94": 8.93245,
+            "95": 9.61119,
+            "96": 9.62586,
+            "97": 9.39727,
+            "98": 9.76341,
+            "99": 8.99611,
+            "100": 9.50318
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 685659136.0,
+            "2": 685659136.0,
+            "3": 685659136.0,
+            "4": 685659136.0,
+            "5": 685659136.0,
+            "6": 685659136.0,
+            "7": 685659136.0,
+            "8": 685659136.0,
+            "9": 685659136.0,
+            "10": 685659136.0,
+            "11": 685659136.0,
+            "12": 685659136.0,
+            "13": 685659136.0,
+            "14": 685659136.0,
+            "15": 685659136.0,
+            "16": 685659136.0,
+            "17": 1043027456.0,
+            "18": 1043027456.0,
+            "19": 1043027456.0,
+            "20": 1043027456.0,
+            "21": 1043027456.0,
+            "22": 1043027456.0,
+            "23": 1043027456.0,
+            "24": 1043027456.0,
+            "25": 1043027456.0,
+            "26": 1043027456.0,
+            "27": 1043027456.0,
+            "28": 1043027456.0,
+            "29": 1043027456.0,
+            "30": 1043027456.0,
+            "31": 1043027456.0,
+            "32": 1043027456.0,
+            "33": 1043027456.0,
+            "34": 1043027456.0,
+            "35": 1043027456.0,
+            "36": 1043027456.0,
+            "37": 1043027456.0,
+            "38": 1043027456.0,
+            "39": 1043027456.0,
+            "40": 1043027456.0,
+            "41": 1043027456.0,
+            "42": 1043027456.0,
+            "43": 1043027456.0,
+            "44": 1043027456.0,
+            "45": 1043027456.0,
+            "46": 1043027456.0,
+            "47": 1043027456.0,
+            "48": 1043027456.0,
+            "49": 1043027456.0,
+            "50": 1043027456.0,
+            "51": 1043027456.0,
+            "52": 1043027456.0,
+            "53": 1043027456.0,
+            "54": 1043027456.0,
+            "55": 1043027456.0,
+            "56": 1043027456.0,
+            "57": 1043027456.0,
+            "58": 1043027456.0,
+            "59": 1043027456.0,
+            "60": 1043027456.0,
+            "61": 1043027456.0,
+            "62": 1043027456.0,
+            "63": 1043027456.0,
+            "64": 1043027456.0,
+            "65": 1043027456.0,
+            "66": 1043027456.0,
+            "67": 1043027456.0,
+            "68": 1043027456.0,
+            "69": 1043027456.0,
+            "70": 1043027456.0,
+            "71": 1043027456.0,
+            "72": 1043027456.0,
+            "73": 1043027456.0,
+            "74": 1043027456.0,
+            "75": 1043027456.0,
+            "76": 1043027456.0,
+            "77": 1043027456.0,
+            "78": 1043027456.0,
+            "79": 1043027456.0,
+            "80": 1043027456.0,
+            "81": 1043027456.0,
+            "82": 1043027456.0,
+            "83": 1043027456.0,
+            "84": 1043027456.0,
+            "85": 1043027456.0,
+            "86": 1043027456.0,
+            "87": 1043027456.0,
+            "88": 1043027456.0,
+            "89": 1043027456.0,
+            "90": 1043027456.0,
+            "91": 1043027456.0,
+            "92": 1043027456.0,
+            "93": 1043027456.0,
+            "94": 1043027456.0,
+            "95": 1043027456.0,
+            "96": 1043027456.0,
+            "97": 1043027456.0,
+            "98": 1043027456.0,
+            "99": 1043027456.0,
+            "100": 1043027456.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3187304960.0,
+            "2": 3187305472.0,
+            "3": 3187305472.0,
+            "4": 3187305472.0,
+            "5": 3187305472.0,
+            "6": 3187305472.0,
+            "7": 3187305472.0,
+            "8": 3187305472.0,
+            "9": 3187305472.0,
+            "10": 3187305472.0,
+            "11": 3187305472.0,
+            "12": 3187305472.0,
+            "13": 3187305472.0,
+            "14": 3187305472.0,
+            "15": 3187305472.0,
+            "16": 3187305472.0,
+            "17": 3187305472.0,
+            "18": 3544935936.0,
+            "19": 3544935936.0,
+            "20": 3544935936.0,
+            "21": 3544935936.0,
+            "22": 3544935936.0,
+            "23": 3544935936.0,
+            "24": 3544935936.0,
+            "25": 3544935936.0,
+            "26": 3544935936.0,
+            "27": 3544935936.0,
+            "28": 3544935936.0,
+            "29": 3544935936.0,
+            "30": 3544935936.0,
+            "31": 3544935936.0,
+            "32": 3544935936.0,
+            "33": 3544935936.0,
+            "34": 3544935936.0,
+            "35": 3544935936.0,
+            "36": 3544935936.0,
+            "37": 3544935936.0,
+            "38": 3544935936.0,
+            "39": 3544935936.0,
+            "40": 3544935936.0,
+            "41": 3544935936.0,
+            "42": 3544935936.0,
+            "43": 3544935936.0,
+            "44": 3544935936.0,
+            "45": 3544935936.0,
+            "46": 3544935936.0,
+            "47": 3544935936.0,
+            "48": 3544935936.0,
+            "49": 3544935936.0,
+            "50": 3544935936.0,
+            "51": 3544935936.0,
+            "52": 3544935936.0,
+            "53": 3544935936.0,
+            "54": 3544935936.0,
+            "55": 3544935936.0,
+            "56": 3544935936.0,
+            "57": 3544935936.0,
+            "58": 3544935936.0,
+            "59": 3544935936.0,
+            "60": 3544935936.0,
+            "61": 3544935936.0,
+            "62": 3544935936.0,
+            "63": 3544935936.0,
+            "64": 3544935936.0,
+            "65": 3544935936.0,
+            "66": 3544935936.0,
+            "67": 3544935936.0,
+            "68": 3544935936.0,
+            "69": 3544935936.0,
+            "70": 3544935936.0,
+            "71": 3544935936.0,
+            "72": 3544935936.0,
+            "73": 3544935936.0,
+            "74": 3544935936.0,
+            "75": 3544935936.0,
+            "76": 3544935936.0,
+            "77": 3544935936.0,
+            "78": 3544935936.0,
+            "79": 3544935936.0,
+            "80": 3544935936.0,
+            "81": 3544935936.0,
+            "82": 3544935936.0,
+            "83": 3544935936.0,
+            "84": 3544935936.0,
+            "85": 3544935936.0,
+            "86": 3544935936.0,
+            "87": 3544935936.0,
+            "88": 3544935936.0,
+            "89": 3544935936.0,
+            "90": 3544935936.0,
+            "91": 3544935936.0,
+            "92": 3544935936.0,
+            "93": 3544935936.0,
+            "94": 3544935936.0,
+            "95": 3544935936.0,
+            "96": 3544935936.0,
+            "97": 3544935936.0,
+            "98": 3544935936.0,
+            "99": 3544935936.0,
+            "100": 3544935936.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.16002,
+            "3": 0.12086,
+            "4": 0.10679,
+            "5": 0.10684,
+            "6": 0.10764,
+            "7": 0.10756,
+            "8": 0.10782,
+            "9": 0.10736,
+            "10": 0.10699,
+            "11": 0.10758,
+            "12": 0.1082,
+            "13": 0.10728,
+            "14": 0.10701,
+            "15": 0.10751,
+            "16": 0.10779,
+            "17": 0.1525,
+            "18": 0.12315,
+            "19": 0.11877,
+            "20": 0.11834,
+            "21": 0.11693,
+            "22": 0.10869,
+            "23": 0.11758,
+            "24": 0.11876,
+            "25": 0.1157,
+            "26": 0.11704,
+            "27": 0.11721,
+            "28": 0.11734,
+            "29": 0.11883,
+            "30": 0.11662,
+            "31": 0.11713,
+            "32": 0.11867,
+            "33": 0.11765,
+            "34": 0.11637,
+            "35": 0.1163,
+            "36": 0.11685,
+            "37": 0.11693,
+            "38": 0.11691,
+            "39": 0.11511,
+            "40": 0.11608,
+            "41": 0.11592,
+            "42": 0.11755,
+            "43": 0.11699,
+            "44": 0.1167,
+            "45": 0.11741,
+            "46": 0.11687,
+            "47": 0.11681,
+            "48": 0.11668,
+            "49": 0.11692,
+            "50": 0.11769,
+            "51": 0.13585,
+            "52": 0.11734,
+            "53": 0.11685,
+            "54": 0.11694,
+            "55": 0.11628,
+            "56": 0.11632,
+            "57": 0.11669,
+            "58": 0.11528,
+            "59": 0.11499,
+            "60": 0.11541,
+            "61": 0.11621,
+            "62": 0.11572,
+            "63": 0.11627,
+            "64": 0.11666,
+            "65": 0.11549,
+            "66": 0.11562,
+            "67": 0.11651,
+            "68": 0.11467,
+            "69": 0.11506,
+            "70": 0.11625,
+            "71": 0.11703,
+            "72": 0.11635,
+            "73": 0.11771,
+            "74": 0.1156,
+            "75": 0.11766,
+            "76": 0.11632,
+            "77": 0.11535,
+            "78": 0.11674,
+            "79": 0.11793,
+            "80": 0.1173,
+            "81": 0.11677,
+            "82": 0.11736,
+            "83": 0.11611,
+            "84": 0.11798,
+            "85": 0.11839,
+            "86": 0.11892,
+            "87": 0.11724,
+            "88": 0.11663,
+            "89": 0.11722,
+            "90": 0.11751,
+            "91": 0.11711,
+            "92": 0.11773,
+            "93": 0.11853,
+            "94": 0.11655,
+            "95": 0.11767,
+            "96": 0.11808,
+            "97": 0.11824,
+            "98": 0.11911,
+            "99": 0.11735,
+            "100": 0.11751
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": 2223.0,
+            "18": 2141.0,
+            "19": 2432.0,
+            "20": 1893.0,
+            "21": 1918.0,
+            "22": "nan",
+            "23": 2243.0,
+            "24": 1920.0,
+            "25": 1874.0,
+            "26": 1885.0,
+            "27": 2072.0,
+            "28": 2375.0,
+            "29": 2356.0,
+            "30": 2316.0,
+            "31": 1689.0,
+            "32": 2250.0,
+            "33": 2111.0,
+            "34": 1822.0,
+            "35": 1976.0,
+            "36": 2089.0,
+            "37": 2394.0,
+            "38": 2078.0,
+            "39": 2662.0,
+            "40": 2284.0,
+            "41": 2402.0,
+            "42": 2250.0,
+            "43": 2141.0,
+            "44": 2112.0,
+            "45": 2341.0,
+            "46": 2005.0,
+            "47": 2567.0,
+            "48": 2332.0,
+            "49": 1858.0,
+            "50": 2478.0,
+            "51": 2321.0,
+            "52": 2270.0,
+            "53": 2929.0,
+            "54": 2493.0,
+            "55": 2470.0,
+            "56": 2387.0,
+            "57": 2321.0,
+            "58": 2774.0,
+            "59": 2339.0,
+            "60": 2654.0,
+            "61": 2810.0,
+            "62": 2863.0,
+            "63": 2582.0,
+            "64": 2851.0,
+            "65": 2686.0,
+            "66": 2969.0,
+            "67": 2680.0,
+            "68": 2913.0,
+            "69": 2669.0,
+            "70": 2988.0,
+            "71": 2881.0,
+            "72": 2465.0,
+            "73": 3188.0,
+            "74": 2209.0,
+            "75": 2665.0,
+            "76": 3308.0,
+            "77": 3227.0,
+            "78": 3393.0,
+            "79": 3433.0,
+            "80": 3273.0,
+            "81": 3620.0,
+            "82": 3491.0,
+            "83": 3140.0,
+            "84": 3225.0,
+            "85": 3622.0,
+            "86": 3290.0,
+            "87": 4023.0,
+            "88": 3187.0,
+            "89": 3975.0,
+            "90": 3576.0,
+            "91": 2689.0,
+            "92": 3474.0,
+            "93": 3202.0,
+            "94": 3608.0,
+            "95": 3510.0,
+            "96": 3634.0,
+            "97": 3500.0,
+            "98": 3933.0,
+            "99": 3502.0,
+            "100": 3134.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..a0ffc9cfd0d
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.13864,
+            "52": 10.03803,
+            "53": 10.2953,
+            "54": 10.19383,
+            "55": 10.14359,
+            "56": 9.8908,
+            "57": 9.73702,
+            "58": 10.05022,
+            "59": 9.83828,
+            "60": 9.74551,
+            "61": 9.90679,
+            "62": 10.16216,
+            "63": 9.59842,
+            "64": 9.95194,
+            "65": 9.18904,
+            "66": 9.87164,
+            "67": 9.56047,
+            "68": 9.94233,
+            "69": 9.94285,
+            "70": 9.8854,
+            "71": 9.77852,
+            "72": 9.73861,
+            "73": 9.63511,
+            "74": 9.10351,
+            "75": 9.55716,
+            "76": 9.23197,
+            "77": 10.16792,
+            "78": 9.83943,
+            "79": 9.49691,
+            "80": 9.52327,
+            "81": 9.60219,
+            "82": 9.8054,
+            "83": 9.43936,
+            "84": 9.51953,
+            "85": 9.72086,
+            "86": 9.18604,
+            "87": 9.68762,
+            "88": 9.84868,
+            "89": 9.70441,
+            "90": 9.91638,
+            "91": 9.45088,
+            "92": 9.45495,
+            "93": 9.1952,
+            "94": 8.93245,
+            "95": 9.61119,
+            "96": 9.62586,
+            "97": 9.39727,
+            "98": 9.76341,
+            "99": 8.99611,
+            "100": 9.50318
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2321.0,
+            "52": 2270.0,
+            "53": 2929.0,
+            "54": 2493.0,
+            "55": 2470.0,
+            "56": 2387.0,
+            "57": 2321.0,
+            "58": 2774.0,
+            "59": 2339.0,
+            "60": 2654.0,
+            "61": 2810.0,
+            "62": 2863.0,
+            "63": 2582.0,
+            "64": 2851.0,
+            "65": 2686.0,
+            "66": 2969.0,
+            "67": 2680.0,
+            "68": 2913.0,
+            "69": 2669.0,
+            "70": 2988.0,
+            "71": 2881.0,
+            "72": 2465.0,
+            "73": 3188.0,
+            "74": 2209.0,
+            "75": 2665.0,
+            "76": 3308.0,
+            "77": 3227.0,
+            "78": 3393.0,
+            "79": 3433.0,
+            "80": 3273.0,
+            "81": 3620.0,
+            "82": 3491.0,
+            "83": 3140.0,
+            "84": 3225.0,
+            "85": 3622.0,
+            "86": 3290.0,
+            "87": 4023.0,
+            "88": 3187.0,
+            "89": 3975.0,
+            "90": 3576.0,
+            "91": 2689.0,
+            "92": 3474.0,
+            "93": 3202.0,
+            "94": 3608.0,
+            "95": 3510.0,
+            "96": 3634.0,
+            "97": 3500.0,
+            "98": 3933.0,
+            "99": 3502.0,
+            "100": 3134.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1044076032.0,
+            "52": 1044076032.0,
+            "53": 1044076032.0,
+            "54": 1044076032.0,
+            "55": 1044076032.0,
+            "56": 1044076032.0,
+            "57": 1044076032.0,
+            "58": 1044076032.0,
+            "59": 1044076032.0,
+            "60": 1044076032.0,
+            "61": 1044076032.0,
+            "62": 1044076032.0,
+            "63": 1044076032.0,
+            "64": 1044076032.0,
+            "65": 1044076032.0,
+            "66": 1044076032.0,
+            "67": 1044076032.0,
+            "68": 1044076032.0,
+            "69": 1044076032.0,
+            "70": 1044076032.0,
+            "71": 1044076032.0,
+            "72": 1044076032.0,
+            "73": 1044076032.0,
+            "74": 1044076032.0,
+            "75": 1044076032.0,
+            "76": 1044076032.0,
+            "77": 1044076032.0,
+            "78": 1044076032.0,
+            "79": 1044076032.0,
+            "80": 1044076032.0,
+            "81": 1044076032.0,
+            "82": 1044076032.0,
+            "83": 1044076032.0,
+            "84": 1044076032.0,
+            "85": 1044076032.0,
+            "86": 1044076032.0,
+            "87": 1044076032.0,
+            "88": 1044076032.0,
+            "89": 1044076032.0,
+            "90": 1044076032.0,
+            "91": 1044076032.0,
+            "92": 1044076032.0,
+            "93": 1044076032.0,
+            "94": 1044076032.0,
+            "95": 1044076032.0,
+            "96": 1044076032.0,
+            "97": 1044076032.0,
+            "98": 1044076032.0,
+            "99": 1044076032.0,
+            "100": 1044076032.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3546769920.0,
+            "52": 3546770944.0,
+            "53": 3546770944.0,
+            "54": 3546770944.0,
+            "55": 3546770944.0,
+            "56": 3546770944.0,
+            "57": 3546770944.0,
+            "58": 3546770944.0,
+            "59": 3546770944.0,
+            "60": 3546770944.0,
+            "61": 3546770944.0,
+            "62": 3546770944.0,
+            "63": 3546770944.0,
+            "64": 3546770944.0,
+            "65": 3546770944.0,
+            "66": 3546770944.0,
+            "67": 3546770944.0,
+            "68": 3546770944.0,
+            "69": 3546770944.0,
+            "70": 3546770944.0,
+            "71": 3546770944.0,
+            "72": 3546770944.0,
+            "73": 3546770944.0,
+            "74": 3546770944.0,
+            "75": 3546770944.0,
+            "76": 3546770944.0,
+            "77": 3546770944.0,
+            "78": 3546770944.0,
+            "79": 3546770944.0,
+            "80": 3546770944.0,
+            "81": 3546770944.0,
+            "82": 3546770944.0,
+            "83": 3546770944.0,
+            "84": 3546770944.0,
+            "85": 3546770944.0,
+            "86": 3546770944.0,
+            "87": 3546770944.0,
+            "88": 3546770944.0,
+            "89": 3546770944.0,
+            "90": 3546770944.0,
+            "91": 3546770944.0,
+            "92": 3546770944.0,
+            "93": 3546770944.0,
+            "94": 3546770944.0,
+            "95": 3546770944.0,
+            "96": 3546770944.0,
+            "97": 3546770944.0,
+            "98": 3546770944.0,
+            "99": 3546770944.0,
+            "100": 3546770944.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.47868,
+            "53": 0.1279,
+            "54": 0.11872,
+            "55": 0.11801,
+            "56": 0.11659,
+            "57": 0.11822,
+            "58": 0.11942,
+            "59": 0.11781,
+            "60": 0.11931,
+            "61": 0.11745,
+            "62": 0.11823,
+            "63": 0.11864,
+            "64": 0.11936,
+            "65": 0.11981,
+            "66": 0.11828,
+            "67": 0.11814,
+            "68": 0.11767,
+            "69": 0.11723,
+            "70": 0.11851,
+            "71": 0.11778,
+            "72": 0.1171,
+            "73": 0.11843,
+            "74": 0.11879,
+            "75": 0.11904,
+            "76": 0.11937,
+            "77": 0.11851,
+            "78": 0.11863,
+            "79": 0.11746,
+            "80": 0.11801,
+            "81": 0.11841,
+            "82": 0.1189,
+            "83": 0.11865,
+            "84": 0.11762,
+            "85": 0.11705,
+            "86": 0.11788,
+            "87": 0.11804,
+            "88": 0.11814,
+            "89": 0.11967,
+            "90": 0.11938,
+            "91": 0.11768,
+            "92": 0.1176,
+            "93": 0.1189,
+            "94": 0.1188,
+            "95": 0.11753,
+            "96": 0.1179,
+            "97": 0.12698,
+            "98": 0.13715,
+            "99": 0.1402,
+            "100": 0.13853
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
index 094be8516a7..605b5aee03b 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json
@@ -48,13 +48,13 @@
             "42": 10.55371,
             "43": 10.28636,
             "44": 10.36369,
-            "45": 10.2738,
+            "45": 10.27381,
             "46": 10.24567,
             "47": 10.45103,
             "48": 10.23707,
             "49": 9.99555,
-            "50": 10.25588,
-            "51": 10.20129,
+            "50": 10.25589,
+            "51": 10.2013,
             "52": 10.10855,
             "53": 10.34609,
             "54": 10.24857,
@@ -66,26 +66,26 @@
             "60": 9.80901,
             "61": 9.94824,
             "62": 10.1999,
-            "63": 9.64431,
+            "63": 9.6443,
             "64": 9.9951,
             "65": 9.24475,
             "66": 9.90917,
             "67": 9.59735,
             "68": 9.97285,
-            "69": 9.96332,
-            "70": 9.91039,
+            "69": 9.96333,
+            "70": 9.91038,
             "71": 9.78596,
-            "72": 9.77263,
+            "72": 9.77264,
             "73": 9.6618,
             "74": 9.16289,
-            "75": 9.5812,
-            "76": 9.26137,
-            "77": 10.17615,
+            "75": 9.58121,
+            "76": 9.26138,
+            "77": 10.17614,
             "78": 9.85644,
             "79": 9.50644,
-            "80": 9.54102,
+            "80": 9.54103,
             "81": 9.61313,
-            "82": 9.80669,
+            "82": 9.80668,
             "83": 9.44696,
             "84": 9.52782,
             "85": 9.72633,
@@ -94,16 +94,16 @@
             "88": 9.85216,
             "89": 9.71335,
             "90": 9.90316,
-            "91": 9.46064,
-            "92": 9.46059,
+            "91": 9.46063,
+            "92": 9.46058,
             "93": 9.19418,
             "94": 8.93434,
             "95": 9.60258,
             "96": 9.61852,
-            "97": 9.39594,
+            "97": 9.39595,
             "98": 9.76012,
-            "99": 8.98668,
-            "100": 9.49405
+            "99": 8.98669,
+            "100": 9.49406
         }
     },
     "mem-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.5468,
-            "2": 0.1514,
-            "3": 0.11679,
-            "4": 0.11442,
-            "5": 0.11418,
-            "6": 0.1134,
-            "7": 0.11341,
-            "8": 0.11355,
-            "9": 0.11332,
-            "10": 0.11336,
-            "11": 0.11414,
-            "12": 0.11322,
-            "13": 0.11309,
-            "14": 0.11355,
-            "15": 0.11296,
-            "16": 0.11311,
-            "17": 0.19183,
-            "18": 0.13278,
-            "19": 0.12368,
-            "20": 0.1244,
-            "21": 0.12354,
-            "22": 0.11533,
-            "23": 0.12281,
-            "24": 0.12403,
-            "25": 0.12406,
-            "26": 0.12339,
-            "27": 0.12448,
-            "28": 0.12265,
-            "29": 0.1229,
-            "30": 0.1231,
-            "31": 0.12325,
-            "32": 0.12261,
-            "33": 0.12283,
-            "34": 0.12275,
-            "35": 0.12311,
-            "36": 0.12273,
-            "37": 0.12367,
-            "38": 0.12288,
-            "39": 0.12297,
-            "40": 0.12264,
-            "41": 0.1206,
-            "42": 0.12099,
-            "43": 0.12152,
-            "44": 0.12016,
-            "45": 0.12042,
-            "46": 0.12101,
-            "47": 0.12019,
-            "48": 0.12057,
-            "49": 0.12054,
-            "50": 0.12043,
-            "51": 0.12804,
-            "52": 0.12188,
-            "53": 0.12082,
-            "54": 0.12046,
-            "55": 0.12243,
-            "56": 0.12099,
-            "57": 0.12158,
-            "58": 0.12118,
-            "59": 0.12094,
-            "60": 0.12085,
-            "61": 0.12158,
-            "62": 0.12129,
-            "63": 0.12239,
-            "64": 0.12127,
-            "65": 0.12091,
-            "66": 0.12161,
-            "67": 0.12115,
-            "68": 0.12107,
-            "69": 0.12194,
-            "70": 0.12208,
-            "71": 0.12158,
-            "72": 0.12253,
-            "73": 0.12311,
-            "74": 0.12157,
-            "75": 0.12129,
-            "76": 0.12243,
-            "77": 0.1209,
-            "78": 0.12118,
-            "79": 0.12236,
-            "80": 0.12456,
-            "81": 0.12169,
-            "82": 0.12201,
-            "83": 0.12239,
-            "84": 0.12311,
-            "85": 0.12253,
-            "86": 0.12237,
-            "87": 0.12156,
-            "88": 0.12306,
-            "89": 0.12961,
-            "90": 0.12349,
-            "91": 0.12189,
-            "92": 0.12121,
-            "93": 0.12178,
-            "94": 0.12615,
-            "95": 0.12189,
-            "96": 0.12145,
-            "97": 0.12112,
-            "98": 0.12242,
-            "99": 0.12142,
-            "100": 0.12094
+            "1": 3.95366,
+            "2": 0.14871,
+            "3": 0.12763,
+            "4": 0.11208,
+            "5": 0.11074,
+            "6": 0.11007,
+            "7": 0.11082,
+            "8": 0.11022,
+            "9": 0.11047,
+            "10": 0.11064,
+            "11": 0.11173,
+            "12": 0.11146,
+            "13": 0.1105,
+            "14": 0.10955,
+            "15": 0.10949,
+            "16": 0.10939,
+            "17": 0.18086,
+            "18": 0.12719,
+            "19": 0.11742,
+            "20": 0.11731,
+            "21": 0.11723,
+            "22": 0.1099,
+            "23": 0.11923,
+            "24": 0.12129,
+            "25": 0.12214,
+            "26": 0.12333,
+            "27": 0.11905,
+            "28": 0.11908,
+            "29": 0.12058,
+            "30": 0.11948,
+            "31": 0.1201,
+            "32": 0.12035,
+            "33": 0.11991,
+            "34": 0.12012,
+            "35": 0.12013,
+            "36": 0.12016,
+            "37": 0.11941,
+            "38": 0.1201,
+            "39": 0.1201,
+            "40": 0.11958,
+            "41": 0.12136,
+            "42": 0.11979,
+            "43": 0.11986,
+            "44": 0.12054,
+            "45": 0.12036,
+            "46": 0.12029,
+            "47": 0.12065,
+            "48": 0.12009,
+            "49": 0.1203,
+            "50": 0.11976,
+            "51": 0.12632,
+            "52": 0.11795,
+            "53": 0.11564,
+            "54": 0.11608,
+            "55": 0.11612,
+            "56": 0.11603,
+            "57": 0.11792,
+            "58": 0.11634,
+            "59": 0.11727,
+            "60": 0.1161,
+            "61": 0.11695,
+            "62": 0.13389,
+            "63": 0.11729,
+            "64": 0.11589,
+            "65": 0.11724,
+            "66": 0.11796,
+            "67": 0.11759,
+            "68": 0.1183,
+            "69": 0.11749,
+            "70": 0.1181,
+            "71": 0.11707,
+            "72": 0.11611,
+            "73": 0.11701,
+            "74": 0.11673,
+            "75": 0.11595,
+            "76": 0.11658,
+            "77": 0.1163,
+            "78": 0.11681,
+            "79": 0.11598,
+            "80": 0.11662,
+            "81": 0.11633,
+            "82": 0.11636,
+            "83": 0.11597,
+            "84": 0.11547,
+            "85": 0.11591,
+            "86": 0.11618,
+            "87": 0.1157,
+            "88": 0.11607,
+            "89": 0.11626,
+            "90": 0.115,
+            "91": 0.11601,
+            "92": 0.11575,
+            "93": 0.11688,
+            "94": 0.11552,
+            "95": 0.11702,
+            "96": 0.11567,
+            "97": 0.1166,
+            "98": 0.11652,
+            "99": 0.11578,
+            "100": 0.11584
         }
     },
     "num-zeros": {
@@ -473,65 +473,65 @@
             "39": 2642.0,
             "40": 2287.0,
             "41": 2344.0,
-            "42": 2304.0,
-            "43": 2098.0,
-            "44": 2107.0,
-            "45": 2243.0,
-            "46": 1960.0,
-            "47": 2729.0,
-            "48": 2418.0,
-            "49": 1910.0,
-            "50": 2426.0,
-            "51": 2335.0,
-            "52": 2407.0,
-            "53": 2888.0,
-            "54": 2477.0,
-            "55": 2440.0,
-            "56": 2286.0,
-            "57": 2340.0,
-            "58": 2652.0,
-            "59": 2321.0,
-            "60": 2493.0,
-            "61": 2812.0,
-            "62": 2711.0,
-            "63": 2367.0,
-            "64": 2802.0,
-            "65": 2411.0,
-            "66": 2869.0,
-            "67": 2577.0,
-            "68": 2859.0,
-            "69": 2524.0,
-            "70": 3119.0,
-            "71": 2926.0,
-            "72": 2251.0,
-            "73": 2929.0,
-            "74": 2110.0,
-            "75": 2884.0,
-            "76": 2992.0,
-            "77": 3380.0,
-            "78": 3484.0,
-            "79": 3533.0,
-            "80": 3549.0,
-            "81": 3616.0,
-            "82": 3347.0,
-            "83": 3124.0,
-            "84": 3276.0,
-            "85": 3721.0,
-            "86": 3207.0,
-            "87": 3941.0,
-            "88": 3250.0,
-            "89": 3863.0,
-            "90": 3452.0,
-            "91": 2630.0,
-            "92": 3431.0,
-            "93": 3123.0,
-            "94": 3671.0,
-            "95": 3340.0,
-            "96": 3874.0,
-            "97": 3519.0,
-            "98": 3727.0,
-            "99": 3447.0,
-            "100": 3338.0
+            "42": 2340.0,
+            "43": 2130.0,
+            "44": 2069.0,
+            "45": 2188.0,
+            "46": 1932.0,
+            "47": 2670.0,
+            "48": 2471.0,
+            "49": 1891.0,
+            "50": 2416.0,
+            "51": 2321.0,
+            "52": 2363.0,
+            "53": 2925.0,
+            "54": 2486.0,
+            "55": 2408.0,
+            "56": 2298.0,
+            "57": 2286.0,
+            "58": 2584.0,
+            "59": 2358.0,
+            "60": 2487.0,
+            "61": 2791.0,
+            "62": 2751.0,
+            "63": 2385.0,
+            "64": 2791.0,
+            "65": 2372.0,
+            "66": 2970.0,
+            "67": 2557.0,
+            "68": 2857.0,
+            "69": 2699.0,
+            "70": 3035.0,
+            "71": 2940.0,
+            "72": 2315.0,
+            "73": 2968.0,
+            "74": 2205.0,
+            "75": 2811.0,
+            "76": 2969.0,
+            "77": 3296.0,
+            "78": 3578.0,
+            "79": 3594.0,
+            "80": 3509.0,
+            "81": 3698.0,
+            "82": 3355.0,
+            "83": 3205.0,
+            "84": 3285.0,
+            "85": 3791.0,
+            "86": 3303.0,
+            "87": 3934.0,
+            "88": 3130.0,
+            "89": 3809.0,
+            "90": 3388.0,
+            "91": 2618.0,
+            "92": 3412.0,
+            "93": 3072.0,
+            "94": 3731.0,
+            "95": 3357.0,
+            "96": 3852.0,
+            "97": 3528.0,
+            "98": 3616.0,
+            "99": 3449.0,
+            "100": 3284.0
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..9f64cb131f6
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.2013,
+            "52": 10.10855,
+            "53": 10.34609,
+            "54": 10.24857,
+            "55": 10.18782,
+            "56": 9.95521,
+            "57": 9.81221,
+            "58": 10.10875,
+            "59": 9.8863,
+            "60": 9.80901,
+            "61": 9.94824,
+            "62": 10.1999,
+            "63": 9.6443,
+            "64": 9.9951,
+            "65": 9.24475,
+            "66": 9.90917,
+            "67": 9.59735,
+            "68": 9.97285,
+            "69": 9.96333,
+            "70": 9.91038,
+            "71": 9.78596,
+            "72": 9.77264,
+            "73": 9.6618,
+            "74": 9.16289,
+            "75": 9.58121,
+            "76": 9.26138,
+            "77": 10.17614,
+            "78": 9.85644,
+            "79": 9.50644,
+            "80": 9.54103,
+            "81": 9.61313,
+            "82": 9.80668,
+            "83": 9.44696,
+            "84": 9.52782,
+            "85": 9.72633,
+            "86": 9.19099,
+            "87": 9.68736,
+            "88": 9.85216,
+            "89": 9.71335,
+            "90": 9.90316,
+            "91": 9.46063,
+            "92": 9.46058,
+            "93": 9.19418,
+            "94": 8.93434,
+            "95": 9.60258,
+            "96": 9.61852,
+            "97": 9.39595,
+            "98": 9.76012,
+            "99": 8.98669,
+            "100": 9.49406
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2321.0,
+            "52": 2363.0,
+            "53": 2925.0,
+            "54": 2486.0,
+            "55": 2408.0,
+            "56": 2298.0,
+            "57": 2286.0,
+            "58": 2584.0,
+            "59": 2358.0,
+            "60": 2487.0,
+            "61": 2791.0,
+            "62": 2751.0,
+            "63": 2385.0,
+            "64": 2791.0,
+            "65": 2372.0,
+            "66": 2970.0,
+            "67": 2557.0,
+            "68": 2857.0,
+            "69": 2699.0,
+            "70": 3035.0,
+            "71": 2940.0,
+            "72": 2315.0,
+            "73": 2968.0,
+            "74": 2205.0,
+            "75": 2811.0,
+            "76": 2969.0,
+            "77": 3296.0,
+            "78": 3578.0,
+            "79": 3594.0,
+            "80": 3509.0,
+            "81": 3698.0,
+            "82": 3355.0,
+            "83": 3205.0,
+            "84": 3285.0,
+            "85": 3791.0,
+            "86": 3303.0,
+            "87": 3934.0,
+            "88": 3130.0,
+            "89": 3809.0,
+            "90": 3388.0,
+            "91": 2618.0,
+            "92": 3412.0,
+            "93": 3072.0,
+            "94": 3731.0,
+            "95": 3357.0,
+            "96": 3852.0,
+            "97": 3528.0,
+            "98": 3616.0,
+            "99": 3449.0,
+            "100": 3284.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1044076032.0,
+            "52": 1045124608.0,
+            "53": 1045124608.0,
+            "54": 1045124608.0,
+            "55": 1045124608.0,
+            "56": 1045124608.0,
+            "57": 1045124608.0,
+            "58": 1045124608.0,
+            "59": 1045124608.0,
+            "60": 1045124608.0,
+            "61": 1045124608.0,
+            "62": 1045124608.0,
+            "63": 1045124608.0,
+            "64": 1045124608.0,
+            "65": 1045124608.0,
+            "66": 1045124608.0,
+            "67": 1045124608.0,
+            "68": 1045124608.0,
+            "69": 1045124608.0,
+            "70": 1045124608.0,
+            "71": 1045124608.0,
+            "72": 1045124608.0,
+            "73": 1045124608.0,
+            "74": 1045124608.0,
+            "75": 1045124608.0,
+            "76": 1045124608.0,
+            "77": 1045124608.0,
+            "78": 1045124608.0,
+            "79": 1045124608.0,
+            "80": 1045124608.0,
+            "81": 1045124608.0,
+            "82": 1045124608.0,
+            "83": 1045124608.0,
+            "84": 1045124608.0,
+            "85": 1045124608.0,
+            "86": 1045124608.0,
+            "87": 1045124608.0,
+            "88": 1045124608.0,
+            "89": 1045124608.0,
+            "90": 1045124608.0,
+            "91": 1045124608.0,
+            "92": 1045124608.0,
+            "93": 1045124608.0,
+            "94": 1045124608.0,
+            "95": 1045124608.0,
+            "96": 1045124608.0,
+            "97": 1045124608.0,
+            "98": 1045124608.0,
+            "99": 1045124608.0,
+            "100": 1045124608.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3546769920.0,
+            "52": 3546770944.0,
+            "53": 3546770944.0,
+            "54": 3546770944.0,
+            "55": 3546770944.0,
+            "56": 3546770944.0,
+            "57": 3546770944.0,
+            "58": 3546770944.0,
+            "59": 3546770944.0,
+            "60": 3546770944.0,
+            "61": 3546770944.0,
+            "62": 3546770944.0,
+            "63": 3546770944.0,
+            "64": 3546770944.0,
+            "65": 3546770944.0,
+            "66": 3546770944.0,
+            "67": 3546770944.0,
+            "68": 3546770944.0,
+            "69": 3546770944.0,
+            "70": 3546770944.0,
+            "71": 3546770944.0,
+            "72": 3546770944.0,
+            "73": 3546770944.0,
+            "74": 3546770944.0,
+            "75": 3546770944.0,
+            "76": 3546770944.0,
+            "77": 3546770944.0,
+            "78": 3546770944.0,
+            "79": 3546770944.0,
+            "80": 3546770944.0,
+            "81": 3546770944.0,
+            "82": 3546770944.0,
+            "83": 3546770944.0,
+            "84": 3546770944.0,
+            "85": 3546770944.0,
+            "86": 3546770944.0,
+            "87": 3546770944.0,
+            "88": 3546770944.0,
+            "89": 3546770944.0,
+            "90": 3546770944.0,
+            "91": 3546770944.0,
+            "92": 3546770944.0,
+            "93": 3546770944.0,
+            "94": 3546770944.0,
+            "95": 3546770944.0,
+            "96": 3546770944.0,
+            "97": 3546770944.0,
+            "98": 3546770944.0,
+            "99": 3546770944.0,
+            "100": 3546770944.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6.67329,
+            "52": 0.6111,
+            "53": 0.12668,
+            "54": 0.11864,
+            "55": 0.11902,
+            "56": 0.11865,
+            "57": 0.11929,
+            "58": 0.11948,
+            "59": 0.11768,
+            "60": 0.11801,
+            "61": 0.1175,
+            "62": 0.11795,
+            "63": 0.11724,
+            "64": 0.11676,
+            "65": 0.11866,
+            "66": 0.11629,
+            "67": 0.11669,
+            "68": 0.11697,
+            "69": 0.11697,
+            "70": 0.11633,
+            "71": 0.11621,
+            "72": 0.11651,
+            "73": 0.11676,
+            "74": 0.11645,
+            "75": 0.11641,
+            "76": 0.11594,
+            "77": 0.1156,
+            "78": 0.11596,
+            "79": 0.11564,
+            "80": 0.11648,
+            "81": 0.11644,
+            "82": 0.11653,
+            "83": 0.11629,
+            "84": 0.11602,
+            "85": 0.11583,
+            "86": 0.11614,
+            "87": 0.11603,
+            "88": 0.11569,
+            "89": 0.11622,
+            "90": 0.11608,
+            "91": 0.1162,
+            "92": 0.11569,
+            "93": 0.11662,
+            "94": 0.11609,
+            "95": 0.11636,
+            "96": 0.11595,
+            "97": 0.11685,
+            "98": 0.11561,
+            "99": 0.11705,
+            "100": 0.11648
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..78a164b057a
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.83624,
+            "2": 10.83583,
+            "3": 10.83479,
+            "4": 10.79805,
+            "5": 10.8484,
+            "6": 10.86489,
+            "7": 10.82603,
+            "8": 10.83534,
+            "9": 10.83891,
+            "10": 10.7977,
+            "11": 10.86687,
+            "12": 10.84885,
+            "13": 10.85863,
+            "14": 10.86758,
+            "15": 10.80015,
+            "16": 10.78972,
+            "17": 10.77152,
+            "18": 10.78567,
+            "19": 10.78854,
+            "20": 10.68344,
+            "21": 10.67601,
+            "22": 10.52341,
+            "23": 10.70513,
+            "24": 10.56287,
+            "25": 10.51316,
+            "26": 10.57779,
+            "27": 10.58628,
+            "28": 10.54399,
+            "29": 10.5752,
+            "30": 10.33793,
+            "31": 10.06785,
+            "32": 10.4423,
+            "33": 10.44058,
+            "34": 10.19082,
+            "35": 10.23949,
+            "36": 10.1889,
+            "37": 10.32647,
+            "38": 10.16254,
+            "39": 10.38467,
+            "40": 10.04862,
+            "41": 10.1189,
+            "42": 10.18954,
+            "43": 9.80408,
+            "44": 9.92166,
+            "45": 9.80316,
+            "46": 9.79843,
+            "47": 10.11883,
+            "48": 9.82786,
+            "49": 9.50058,
+            "50": 9.87693
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1555.0,
+            "2": 1587.0,
+            "3": 1602.0,
+            "4": 1704.0,
+            "5": 1904.0,
+            "6": 1792.0,
+            "7": 1789.0,
+            "8": 1623.0,
+            "9": 1774.0,
+            "10": 1392.0,
+            "11": 1918.0,
+            "12": 1662.0,
+            "13": 1853.0,
+            "14": 1763.0,
+            "15": 1924.0,
+            "16": 1899.0,
+            "17": 1757.0,
+            "18": 1692.0,
+            "19": 1706.0,
+            "20": 1526.0,
+            "21": 1838.0,
+            "22": 1629.0,
+            "23": 1894.0,
+            "24": 1618.0,
+            "25": 1572.0,
+            "26": 1595.0,
+            "27": 1782.0,
+            "28": 1886.0,
+            "29": 1912.0,
+            "30": 1854.0,
+            "31": 1632.0,
+            "32": 1901.0,
+            "33": 2111.0,
+            "34": 1981.0,
+            "35": 1995.0,
+            "36": 1912.0,
+            "37": 2387.0,
+            "38": 2159.0,
+            "39": 2411.0,
+            "40": 2161.0,
+            "41": 2328.0,
+            "42": 2311.0,
+            "43": 2019.0,
+            "44": 1984.0,
+            "45": 2148.0,
+            "46": 2353.0,
+            "47": 2541.0,
+            "48": 2470.0,
+            "49": 2248.0,
+            "50": 2397.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 777900032.0,
+            "2": 777900032.0,
+            "3": 777900032.0,
+            "4": 777900032.0,
+            "5": 777900032.0,
+            "6": 777900032.0,
+            "7": 777900032.0,
+            "8": 777900032.0,
+            "9": 777900032.0,
+            "10": 777900032.0,
+            "11": 777900032.0,
+            "12": 777900032.0,
+            "13": 777900032.0,
+            "14": 777900032.0,
+            "15": 777900032.0,
+            "16": 777900032.0,
+            "17": 777900032.0,
+            "18": 777900032.0,
+            "19": 777900032.0,
+            "20": 777900032.0,
+            "21": 777900032.0,
+            "22": 777900032.0,
+            "23": 777900032.0,
+            "24": 777900032.0,
+            "25": 777900032.0,
+            "26": 777900032.0,
+            "27": 777900032.0,
+            "28": 777900032.0,
+            "29": 777900032.0,
+            "30": 777900032.0,
+            "31": 777900032.0,
+            "32": 777900032.0,
+            "33": 777900032.0,
+            "34": 777900032.0,
+            "35": 777900032.0,
+            "36": 777900032.0,
+            "37": 777900032.0,
+            "38": 777900032.0,
+            "39": 777900032.0,
+            "40": 777900032.0,
+            "41": 777900032.0,
+            "42": 777900032.0,
+            "43": 777900032.0,
+            "44": 777900032.0,
+            "45": 777900032.0,
+            "46": 777900032.0,
+            "47": 777900032.0,
+            "48": 777900032.0,
+            "49": 777900032.0,
+            "50": 777900032.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 2463815680.0,
+            "2": 2744478720.0,
+            "3": 2744478720.0,
+            "4": 2744478720.0,
+            "5": 2744478720.0,
+            "6": 2744478720.0,
+            "7": 2744478720.0,
+            "8": 2744478720.0,
+            "9": 2744478720.0,
+            "10": 2744478720.0,
+            "11": 2744478720.0,
+            "12": 2744478720.0,
+            "13": 2744478720.0,
+            "14": 2744478720.0,
+            "15": 2744478720.0,
+            "16": 2744478720.0,
+            "17": 2744478720.0,
+            "18": 2744478720.0,
+            "19": 2744478720.0,
+            "20": 2744478720.0,
+            "21": 2744478720.0,
+            "22": 2744478720.0,
+            "23": 2744478720.0,
+            "24": 2744478720.0,
+            "25": 2744478720.0,
+            "26": 2744478720.0,
+            "27": 2744478720.0,
+            "28": 2744478720.0,
+            "29": 2744478720.0,
+            "30": 2744478720.0,
+            "31": 2744478720.0,
+            "32": 2744478720.0,
+            "33": 2744478720.0,
+            "34": 2744478720.0,
+            "35": 2744478720.0,
+            "36": 2744478720.0,
+            "37": 2744478720.0,
+            "38": 2744478720.0,
+            "39": 2744478720.0,
+            "40": 2744478720.0,
+            "41": 2744478720.0,
+            "42": 2744478720.0,
+            "43": 2744478720.0,
+            "44": 2744478720.0,
+            "45": 2744478720.0,
+            "46": 2744478720.0,
+            "47": 2744478720.0,
+            "48": 2744478720.0,
+            "49": 2744478720.0,
+            "50": 2744478720.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.22981,
+            "3": 0.15128,
+            "4": 0.13923,
+            "5": 0.13787,
+            "6": 0.13801,
+            "7": 0.13851,
+            "8": 0.13805,
+            "9": 0.13877,
+            "10": 0.14054,
+            "11": 0.14025,
+            "12": 0.13996,
+            "13": 0.13989,
+            "14": 0.13978,
+            "15": 0.14117,
+            "16": 0.14293,
+            "17": 0.14179,
+            "18": 0.14229,
+            "19": 0.14245,
+            "20": 0.14412,
+            "21": 0.14397,
+            "22": 0.1442,
+            "23": 0.14329,
+            "24": 0.14358,
+            "25": 0.14351,
+            "26": 0.14424,
+            "27": 0.14406,
+            "28": 0.1431,
+            "29": 0.14373,
+            "30": 0.14433,
+            "31": 0.14377,
+            "32": 0.14346,
+            "33": 0.14433,
+            "34": 0.14352,
+            "35": 0.1446,
+            "36": 0.14442,
+            "37": 0.14373,
+            "38": 0.14265,
+            "39": 0.14371,
+            "40": 0.14411,
+            "41": 0.14415,
+            "42": 0.14342,
+            "43": 0.14536,
+            "44": 0.14415,
+            "45": 0.14252,
+            "46": 0.14463,
+            "47": 0.1438,
+            "48": 0.14396,
+            "49": 0.14369,
+            "50": 0.14335
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json
index 5517997e6c1..4aa2800617e 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json
@@ -16,44 +16,44 @@
             "10": 10.84079,
             "11": 10.87928,
             "12": 10.8729,
-            "13": 10.87791,
-            "14": 10.8901,
+            "13": 10.8779,
+            "14": 10.89011,
             "15": 10.82504,
-            "16": 10.8296,
-            "17": 10.80874,
-            "18": 10.8116,
-            "19": 10.81543,
-            "20": 10.71912,
+            "16": 10.82957,
+            "17": 10.80875,
+            "18": 10.81163,
+            "19": 10.81545,
+            "20": 10.71913,
             "21": 10.70404,
-            "22": 10.56645,
-            "23": 10.71858,
-            "24": 10.60989,
-            "25": 10.55479,
-            "26": 10.60874,
-            "27": 10.62302,
-            "28": 10.56954,
+            "22": 10.56646,
+            "23": 10.71861,
+            "24": 10.60988,
+            "25": 10.55482,
+            "26": 10.60879,
+            "27": 10.62303,
+            "28": 10.56953,
             "29": 10.57966,
-            "30": 10.35998,
-            "31": 10.11311,
-            "32": 10.46587,
-            "33": 10.45154,
-            "34": 10.20826,
-            "35": 10.26937,
+            "30": 10.35999,
+            "31": 10.11305,
+            "32": 10.46585,
+            "33": 10.45153,
+            "34": 10.20832,
+            "35": 10.26936,
             "36": 10.21924,
-            "37": 10.33852,
-            "38": 10.186,
-            "39": 10.3997,
-            "40": 10.08396,
-            "41": 10.13418,
-            "42": 10.20887,
-            "43": 9.82537,
-            "44": 9.95906,
+            "37": 10.33851,
+            "38": 10.18603,
+            "39": 10.39977,
+            "40": 10.08397,
+            "41": 10.13423,
+            "42": 10.20889,
+            "43": 9.82535,
+            "44": 9.95909,
             "45": 9.82563,
-            "46": 9.80623,
-            "47": 10.13499,
-            "48": 9.84002,
-            "49": 9.52482,
-            "50": 9.90725
+            "46": 9.8062,
+            "47": 10.135,
+            "48": 9.84004,
+            "49": 9.52485,
+            "50": 9.90723
         }
     },
     "num-zeros": {
@@ -72,45 +72,45 @@
             "9": 1849.0,
             "10": 1317.0,
             "11": 1901.0,
-            "12": 1702.0,
-            "13": 1872.0,
-            "14": 1781.0,
-            "15": 1759.0,
-            "16": 1820.0,
-            "17": 1819.0,
-            "18": 1721.0,
-            "19": 1828.0,
-            "20": 1730.0,
-            "21": 1935.0,
-            "22": 1764.0,
-            "23": 1962.0,
-            "24": 1564.0,
-            "25": 1552.0,
-            "26": 1668.0,
-            "27": 1803.0,
-            "28": 1988.0,
-            "29": 1966.0,
-            "30": 1895.0,
-            "31": 1532.0,
-            "32": 1866.0,
-            "33": 2026.0,
-            "34": 1906.0,
-            "35": 1987.0,
-            "36": 1863.0,
-            "37": 2231.0,
-            "38": 2109.0,
-            "39": 2277.0,
-            "40": 2099.0,
-            "41": 2209.0,
-            "42": 2227.0,
-            "43": 1913.0,
-            "44": 2129.0,
-            "45": 1993.0,
-            "46": 2288.0,
-            "47": 2458.0,
-            "48": 2418.0,
-            "49": 2155.0,
-            "50": 2085.0
+            "12": 1765.0,
+            "13": 1910.0,
+            "14": 1773.0,
+            "15": 1864.0,
+            "16": 1759.0,
+            "17": 1794.0,
+            "18": 1805.0,
+            "19": 1846.0,
+            "20": 1770.0,
+            "21": 1963.0,
+            "22": 1706.0,
+            "23": 1983.0,
+            "24": 1609.0,
+            "25": 1593.0,
+            "26": 1643.0,
+            "27": 1696.0,
+            "28": 1882.0,
+            "29": 1946.0,
+            "30": 1925.0,
+            "31": 1574.0,
+            "32": 1863.0,
+            "33": 2024.0,
+            "34": 1878.0,
+            "35": 1941.0,
+            "36": 1887.0,
+            "37": 2294.0,
+            "38": 2142.0,
+            "39": 2288.0,
+            "40": 2053.0,
+            "41": 2189.0,
+            "42": 2331.0,
+            "43": 1933.0,
+            "44": 2042.0,
+            "45": 1956.0,
+            "46": 2285.0,
+            "47": 2470.0,
+            "48": 2437.0,
+            "49": 2238.0,
+            "50": 2004.0
         }
     },
     "mem-allocated-bytes": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.34716,
-            "2": 0.14227,
-            "3": 0.12689,
-            "4": 0.13008,
-            "5": 0.12281,
-            "6": 0.12008,
-            "7": 0.11926,
-            "8": 0.11756,
-            "9": 0.11844,
-            "10": 0.11959,
-            "11": 0.11763,
-            "12": 0.11828,
-            "13": 0.11955,
-            "14": 0.11929,
-            "15": 0.11867,
-            "16": 0.11859,
-            "17": 0.12095,
-            "18": 0.11695,
-            "19": 0.11774,
-            "20": 0.11863,
-            "21": 0.11942,
-            "22": 0.12117,
-            "23": 0.11884,
-            "24": 0.12003,
-            "25": 0.11915,
-            "26": 0.11977,
-            "27": 0.11816,
-            "28": 0.12705,
-            "29": 0.11815,
-            "30": 0.12166,
-            "31": 0.12023,
-            "32": 0.12154,
-            "33": 0.12781,
-            "34": 0.12209,
-            "35": 0.12372,
-            "36": 0.12109,
-            "37": 0.11897,
-            "38": 0.12385,
-            "39": 0.11961,
-            "40": 0.11846,
-            "41": 0.11902,
-            "42": 0.11915,
-            "43": 0.12286,
-            "44": 0.11759,
-            "45": 0.11912,
-            "46": 0.1204,
-            "47": 0.12027,
-            "48": 0.12073,
-            "49": 0.1164,
-            "50": 0.11734
+            "1": 7.818,
+            "2": 0.14182,
+            "3": 0.12081,
+            "4": 0.09954,
+            "5": 0.09861,
+            "6": 0.10039,
+            "7": 0.09846,
+            "8": 0.09916,
+            "9": 0.10232,
+            "10": 0.10158,
+            "11": 0.09888,
+            "12": 0.09744,
+            "13": 0.09991,
+            "14": 0.09707,
+            "15": 0.09748,
+            "16": 0.09761,
+            "17": 0.09792,
+            "18": 0.09795,
+            "19": 0.09792,
+            "20": 0.09738,
+            "21": 0.10014,
+            "22": 0.09781,
+            "23": 0.09834,
+            "24": 0.09956,
+            "25": 0.09768,
+            "26": 0.09722,
+            "27": 0.09836,
+            "28": 0.09714,
+            "29": 0.09695,
+            "30": 0.09751,
+            "31": 0.09809,
+            "32": 0.09759,
+            "33": 0.09764,
+            "34": 0.09711,
+            "35": 0.09791,
+            "36": 0.09751,
+            "37": 0.09778,
+            "38": 0.09695,
+            "39": 0.09907,
+            "40": 0.09654,
+            "41": 0.09746,
+            "42": 0.09685,
+            "43": 0.09736,
+            "44": 0.09954,
+            "45": 0.09768,
+            "46": 0.09735,
+            "47": 0.09905,
+            "48": 0.09815,
+            "49": 0.09684,
+            "50": 0.09793
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json
index 06342d2a540..bedfb1338ba 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json
@@ -24,36 +24,36 @@
             "18": 10.79296,
             "19": 10.79494,
             "20": 10.67877,
-            "21": 10.65858,
-            "22": 10.50081,
+            "21": 10.65859,
+            "22": 10.50083,
             "23": 10.71065,
-            "24": 10.55089,
+            "24": 10.5509,
             "25": 10.50321,
-            "26": 10.58033,
-            "27": 10.58262,
-            "28": 10.55556,
-            "29": 10.56003,
-            "30": 10.32992,
-            "31": 10.08344,
+            "26": 10.58034,
+            "27": 10.58264,
+            "28": 10.55553,
+            "29": 10.56004,
+            "30": 10.32995,
+            "31": 10.08339,
             "32": 10.44434,
-            "33": 10.44238,
-            "34": 10.19765,
-            "35": 10.25197,
-            "36": 10.19117,
-            "37": 10.31772,
-            "38": 10.1631,
+            "33": 10.44235,
+            "34": 10.19762,
+            "35": 10.25196,
+            "36": 10.1912,
+            "37": 10.31771,
+            "38": 10.16302,
             "39": 10.37486,
-            "40": 10.05284,
-            "41": 10.1344,
-            "42": 10.18877,
-            "43": 9.80641,
-            "44": 9.92687,
-            "45": 9.80332,
-            "46": 9.811,
-            "47": 10.12605,
-            "48": 9.82455,
-            "49": 9.50975,
-            "50": 9.88831
+            "40": 10.05283,
+            "41": 10.13444,
+            "42": 10.18874,
+            "43": 9.80642,
+            "44": 9.92686,
+            "45": 9.80329,
+            "46": 9.81097,
+            "47": 10.12606,
+            "48": 9.82458,
+            "49": 9.50971,
+            "50": 9.88833
         }
     },
     "num-zeros": {
@@ -81,36 +81,36 @@
             "18": 1655.0,
             "19": 1784.0,
             "20": 1616.0,
-            "21": 1887.0,
-            "22": 1751.0,
-            "23": 2100.0,
-            "24": 1717.0,
-            "25": 1696.0,
-            "26": 1723.0,
-            "27": 1819.0,
-            "28": 1980.0,
-            "29": 1962.0,
-            "30": 2046.0,
-            "31": 1562.0,
-            "32": 1935.0,
-            "33": 2182.0,
-            "34": 1919.0,
-            "35": 1994.0,
-            "36": 1947.0,
-            "37": 2436.0,
-            "38": 2218.0,
-            "39": 2319.0,
-            "40": 2278.0,
-            "41": 2348.0,
-            "42": 2258.0,
-            "43": 1967.0,
-            "44": 2011.0,
-            "45": 2215.0,
-            "46": 2291.0,
-            "47": 2519.0,
-            "48": 2517.0,
-            "49": 2334.0,
-            "50": 2325.0
+            "21": 1859.0,
+            "22": 1634.0,
+            "23": 1985.0,
+            "24": 1636.0,
+            "25": 1648.0,
+            "26": 1833.0,
+            "27": 1729.0,
+            "28": 2018.0,
+            "29": 1948.0,
+            "30": 1977.0,
+            "31": 1606.0,
+            "32": 1878.0,
+            "33": 2102.0,
+            "34": 1882.0,
+            "35": 1998.0,
+            "36": 1963.0,
+            "37": 2392.0,
+            "38": 2259.0,
+            "39": 2368.0,
+            "40": 2355.0,
+            "41": 2351.0,
+            "42": 2315.0,
+            "43": 2100.0,
+            "44": 2088.0,
+            "45": 2185.0,
+            "46": 2287.0,
+            "47": 2485.0,
+            "48": 2430.0,
+            "49": 2209.0,
+            "50": 2436.0
         }
     },
     "mem-allocated-bytes": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.69348,
-            "2": 0.20058,
-            "3": 0.16793,
-            "4": 0.16851,
-            "5": 0.16769,
-            "6": 0.16776,
-            "7": 0.1679,
-            "8": 0.1698,
-            "9": 0.16773,
-            "10": 0.16689,
-            "11": 0.16616,
-            "12": 0.16649,
-            "13": 0.16602,
-            "14": 0.16651,
-            "15": 0.16681,
-            "16": 0.16794,
-            "17": 0.17068,
-            "18": 0.16616,
-            "19": 0.16604,
-            "20": 0.16664,
-            "21": 0.16675,
-            "22": 0.16587,
-            "23": 0.16669,
-            "24": 0.16593,
-            "25": 0.16666,
-            "26": 0.16624,
-            "27": 0.16546,
-            "28": 0.16503,
-            "29": 0.16469,
-            "30": 0.1651,
-            "31": 0.16508,
-            "32": 0.16533,
-            "33": 0.16475,
-            "34": 0.16518,
-            "35": 0.16543,
-            "36": 0.16422,
-            "37": 0.1648,
-            "38": 0.16453,
-            "39": 0.16423,
-            "40": 0.16482,
-            "41": 0.16457,
-            "42": 0.1653,
-            "43": 0.16536,
-            "44": 0.16541,
-            "45": 0.16481,
-            "46": 0.16481,
-            "47": 0.16542,
-            "48": 0.16607,
-            "49": 0.1639,
-            "50": 0.1641
+            "1": 4.6609,
+            "2": 0.20286,
+            "3": 0.18331,
+            "4": 0.16708,
+            "5": 0.16425,
+            "6": 0.16306,
+            "7": 0.16477,
+            "8": 0.16576,
+            "9": 0.16596,
+            "10": 0.16583,
+            "11": 0.16408,
+            "12": 0.16435,
+            "13": 0.16481,
+            "14": 0.16557,
+            "15": 0.16431,
+            "16": 0.16502,
+            "17": 0.16505,
+            "18": 0.16591,
+            "19": 0.16488,
+            "20": 0.1643,
+            "21": 0.16357,
+            "22": 0.16399,
+            "23": 0.16405,
+            "24": 0.16322,
+            "25": 0.16434,
+            "26": 0.16338,
+            "27": 0.16313,
+            "28": 0.16358,
+            "29": 0.16355,
+            "30": 0.16313,
+            "31": 0.16372,
+            "32": 0.16289,
+            "33": 0.16298,
+            "34": 0.16307,
+            "35": 0.16335,
+            "36": 0.16325,
+            "37": 0.16343,
+            "38": 0.16261,
+            "39": 0.17181,
+            "40": 0.16689,
+            "41": 0.16786,
+            "42": 0.16635,
+            "43": 0.16929,
+            "44": 0.16602,
+            "45": 0.16606,
+            "46": 0.16685,
+            "47": 0.16668,
+            "48": 0.16647,
+            "49": 0.16657,
+            "50": 0.16609
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json
index f0460fcf964..ec21dd0eb78 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json
@@ -2,141 +2,536 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
             "1": 10.79449,
+            "2": 10.80656,
+            "3": 10.80727,
+            "4": 10.77389,
             "5": 10.84829,
+            "6": 10.86736,
+            "7": 10.82922,
+            "8": 10.81537,
+            "9": 10.83956,
             "10": 10.77652,
+            "11": 10.90107,
+            "12": 10.85927,
+            "13": 10.87396,
+            "14": 10.89723,
             "15": 10.83961,
+            "16": 10.83508,
+            "17": 10.82101,
+            "18": 10.86029,
+            "19": 10.86558,
             "20": 10.82896,
+            "21": 10.83275,
+            "22": 10.75286,
+            "23": 10.88062,
+            "24": 10.78219,
             "25": 10.76607,
+            "26": 10.79522,
+            "27": 10.79866,
+            "28": 10.81697,
+            "29": 10.82169,
             "30": 10.69891,
+            "31": 10.55698,
+            "32": 10.75759,
+            "33": 10.74362,
+            "34": 10.59976,
             "35": 10.61772,
+            "36": 10.56389,
+            "37": 10.63614,
+            "38": 10.53029,
+            "39": 10.65358,
             "40": 10.44072,
+            "41": 10.49636,
+            "42": 10.50954,
+            "43": 10.22362,
+            "44": 10.30902,
             "45": 10.21065,
+            "46": 10.19943,
+            "47": 10.41641,
+            "48": 10.18128,
+            "49": 9.94311,
             "50": 10.21224,
+            "51": 10.16758,
+            "52": 10.06896,
+            "53": 10.30707,
+            "54": 10.2091,
             "55": 10.15688,
+            "56": 9.91475,
+            "57": 9.77696,
+            "58": 10.07417,
+            "59": 9.86333,
             "60": 9.77328,
+            "61": 9.9292,
+            "62": 10.17156,
+            "63": 9.62041,
+            "64": 9.97113,
             "65": 9.21979,
+            "66": 9.88693,
+            "67": 9.58363,
+            "68": 9.94922,
+            "69": 9.9527,
             "70": 9.89312,
+            "71": 9.77658,
+            "72": 9.75435,
+            "73": 9.64969,
+            "74": 9.1439,
             "75": 9.56121,
-            "80": 9.53086,
+            "76": 9.25111,
+            "77": 10.17063,
+            "78": 9.85402,
+            "79": 9.49965,
+            "80": 9.53087,
+            "81": 9.60555,
+            "82": 9.80179,
+            "83": 9.43744,
+            "84": 9.51987,
             "85": 9.7196,
+            "86": 9.18596,
+            "87": 9.68687,
+            "88": 9.8443,
+            "89": 9.70586,
             "90": 9.89977,
+            "91": 9.45029,
+            "92": 9.45356,
+            "93": 9.18553,
+            "94": 8.92968,
             "95": 9.59767,
-            "100": 9.49001
+            "96": 9.61491,
+            "97": 9.39084,
+            "98": 9.75668,
+            "99": 8.97922,
+            "100": 9.49
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
             "1": 570640384.0,
+            "2": 570640384.0,
+            "3": 570640384.0,
+            "4": 570640384.0,
             "5": 570640384.0,
+            "6": 570640384.0,
+            "7": 570640384.0,
+            "8": 570640384.0,
+            "9": 570640384.0,
             "10": 570640384.0,
+            "11": 570640384.0,
+            "12": 570640384.0,
+            "13": 570640384.0,
+            "14": 570640384.0,
             "15": 570640384.0,
+            "16": 570640384.0,
+            "17": 852351488.0,
+            "18": 852351488.0,
+            "19": 852351488.0,
             "20": 852351488.0,
+            "21": 852351488.0,
+            "22": 852351488.0,
+            "23": 852351488.0,
+            "24": 852351488.0,
             "25": 852351488.0,
+            "26": 852351488.0,
+            "27": 852351488.0,
+            "28": 852351488.0,
+            "29": 852351488.0,
             "30": 852351488.0,
+            "31": 852351488.0,
+            "32": 852351488.0,
+            "33": 852351488.0,
+            "34": 852351488.0,
             "35": 852351488.0,
+            "36": 852351488.0,
+            "37": 852351488.0,
+            "38": 852351488.0,
+            "39": 852351488.0,
             "40": 852351488.0,
+            "41": 852351488.0,
+            "42": 852351488.0,
+            "43": 852351488.0,
+            "44": 852351488.0,
             "45": 852351488.0,
+            "46": 852351488.0,
+            "47": 852351488.0,
+            "48": 852351488.0,
+            "49": 852351488.0,
             "50": 852351488.0,
+            "51": 852351488.0,
+            "52": 852351488.0,
+            "53": 852351488.0,
+            "54": 852351488.0,
             "55": 852351488.0,
+            "56": 852351488.0,
+            "57": 852351488.0,
+            "58": 852351488.0,
+            "59": 852351488.0,
             "60": 852351488.0,
+            "61": 852351488.0,
+            "62": 852351488.0,
+            "63": 852351488.0,
+            "64": 852351488.0,
             "65": 852351488.0,
+            "66": 852351488.0,
+            "67": 852351488.0,
+            "68": 852351488.0,
+            "69": 852351488.0,
             "70": 852351488.0,
+            "71": 852351488.0,
+            "72": 852351488.0,
+            "73": 852351488.0,
+            "74": 852351488.0,
             "75": 852351488.0,
+            "76": 852351488.0,
+            "77": 852351488.0,
+            "78": 852351488.0,
+            "79": 852351488.0,
             "80": 852351488.0,
+            "81": 852351488.0,
+            "82": 852351488.0,
+            "83": 852351488.0,
+            "84": 852351488.0,
             "85": 852351488.0,
+            "86": 852351488.0,
+            "87": 852351488.0,
+            "88": 852351488.0,
+            "89": 852351488.0,
             "90": 852351488.0,
+            "91": 852351488.0,
+            "92": 852351488.0,
+            "93": 852351488.0,
+            "94": 852351488.0,
             "95": 852351488.0,
+            "96": 852351488.0,
+            "97": 852351488.0,
+            "98": 852351488.0,
+            "99": 852351488.0,
             "100": 852351488.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 2393217536.0,
-            "5": 2393218048.0,
-            "10": 2393218048.0,
-            "15": 2393218048.0,
+            "1": 2394265600.0,
+            "2": 2394266624.0,
+            "3": 2394266624.0,
+            "4": 2394266624.0,
+            "5": 2394266624.0,
+            "6": 2394266624.0,
+            "7": 2394266624.0,
+            "8": 2394266624.0,
+            "9": 2394266624.0,
+            "10": 2394266624.0,
+            "11": 2394266624.0,
+            "12": 2394266624.0,
+            "13": 2394266624.0,
+            "14": 2394266624.0,
+            "15": 2394266624.0,
+            "16": 2394266624.0,
+            "17": 2394266624.0,
+            "18": 2675191296.0,
+            "19": 2675191296.0,
             "20": 2675191296.0,
+            "21": 2675191296.0,
+            "22": 2675191296.0,
+            "23": 2675191296.0,
+            "24": 2675191296.0,
             "25": 2675191296.0,
+            "26": 2675191296.0,
+            "27": 2675191296.0,
+            "28": 2675191296.0,
+            "29": 2675191296.0,
             "30": 2675191296.0,
+            "31": 2675191296.0,
+            "32": 2675191296.0,
+            "33": 2675191296.0,
+            "34": 2675191296.0,
             "35": 2675191296.0,
+            "36": 2675191296.0,
+            "37": 2675191296.0,
+            "38": 2675191296.0,
+            "39": 2675191296.0,
             "40": 2675191296.0,
+            "41": 2675191296.0,
+            "42": 2675191296.0,
+            "43": 2675191296.0,
+            "44": 2675191296.0,
             "45": 2675191296.0,
+            "46": 2675191296.0,
+            "47": 2675191296.0,
+            "48": 2675191296.0,
+            "49": 2675191296.0,
             "50": 2675191296.0,
+            "51": 2675191296.0,
+            "52": 2675191296.0,
+            "53": 2675191296.0,
+            "54": 2675191296.0,
             "55": 2675191296.0,
+            "56": 2675191296.0,
+            "57": 2675191296.0,
+            "58": 2675191296.0,
+            "59": 2675191296.0,
             "60": 2675191296.0,
+            "61": 2675191296.0,
+            "62": 2675191296.0,
+            "63": 2675191296.0,
+            "64": 2675191296.0,
             "65": 2675191296.0,
+            "66": 2675191296.0,
+            "67": 2675191296.0,
+            "68": 2675191296.0,
+            "69": 2675191296.0,
             "70": 2675191296.0,
+            "71": 2675191296.0,
+            "72": 2675191296.0,
+            "73": 2675191296.0,
+            "74": 2675191296.0,
             "75": 2675191296.0,
+            "76": 2675191296.0,
+            "77": 2675191296.0,
+            "78": 2675191296.0,
+            "79": 2675191296.0,
             "80": 2675191296.0,
+            "81": 2675191296.0,
+            "82": 2675191296.0,
+            "83": 2675191296.0,
+            "84": 2675191296.0,
             "85": 2675191296.0,
+            "86": 2675191296.0,
+            "87": 2675191296.0,
+            "88": 2675191296.0,
+            "89": 2675191296.0,
             "90": 2675191296.0,
+            "91": 2675191296.0,
+            "92": 2675191296.0,
+            "93": 2675191296.0,
+            "94": 2675191296.0,
             "95": 2675191296.0,
+            "96": 2675191296.0,
+            "97": 2675191296.0,
+            "98": 2675191296.0,
+            "99": 2675191296.0,
             "100": 2675191296.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.18564,
-            "5": 0.17211,
-            "10": 0.17231,
-            "15": 0.17041,
-            "20": 0.17593,
-            "25": 0.17714,
-            "30": 0.1877,
-            "35": 0.18206,
-            "40": 0.1863,
-            "45": 0.18632,
-            "50": 0.18765,
-            "55": 0.17167,
-            "60": 0.17203,
-            "65": 0.17216,
-            "70": 0.17222,
-            "75": 0.17155,
-            "80": 0.17227,
-            "85": 0.17239,
-            "90": 0.17214,
-            "95": 0.17202,
-            "100": 0.17177
+            "1": 3.90564,
+            "2": 0.17657,
+            "3": 0.15961,
+            "4": 0.14151,
+            "5": 0.13979,
+            "6": 0.14024,
+            "7": 0.14076,
+            "8": 0.14069,
+            "9": 0.14337,
+            "10": 0.14326,
+            "11": 0.1412,
+            "12": 0.14084,
+            "13": 0.14194,
+            "14": 0.14039,
+            "15": 0.14253,
+            "16": 0.14063,
+            "17": 0.18237,
+            "18": 0.15083,
+            "19": 0.14609,
+            "20": 0.145,
+            "21": 0.14692,
+            "22": 0.146,
+            "23": 0.14576,
+            "24": 0.14565,
+            "25": 0.14491,
+            "26": 0.14606,
+            "27": 0.14435,
+            "28": 0.14485,
+            "29": 0.14504,
+            "30": 0.14509,
+            "31": 0.14667,
+            "32": 0.14484,
+            "33": 0.14504,
+            "34": 0.14439,
+            "35": 0.14672,
+            "36": 0.14484,
+            "37": 0.14554,
+            "38": 0.14428,
+            "39": 0.14491,
+            "40": 0.1445,
+            "41": 0.14539,
+            "42": 0.14483,
+            "43": 0.14794,
+            "44": 0.14484,
+            "45": 0.14449,
+            "46": 0.14567,
+            "47": 0.14498,
+            "48": 0.14525,
+            "49": 0.14498,
+            "50": 0.1458,
+            "51": 0.15708,
+            "52": 0.1492,
+            "53": 0.14889,
+            "54": 0.1489,
+            "55": 0.14804,
+            "56": 0.14848,
+            "57": 0.14854,
+            "58": 0.14843,
+            "59": 0.14961,
+            "60": 0.14807,
+            "61": 0.14786,
+            "62": 0.14872,
+            "63": 0.14837,
+            "64": 0.148,
+            "65": 0.1483,
+            "66": 0.14847,
+            "67": 0.15039,
+            "68": 0.15144,
+            "69": 0.15129,
+            "70": 0.14963,
+            "71": 0.14959,
+            "72": 0.1509,
+            "73": 0.15125,
+            "74": 0.14951,
+            "75": 0.15018,
+            "76": 0.15031,
+            "77": 0.14981,
+            "78": 0.14969,
+            "79": 0.1496,
+            "80": 0.15057,
+            "81": 0.15014,
+            "82": 0.15141,
+            "83": 0.15143,
+            "84": 0.15091,
+            "85": 0.15061,
+            "86": 0.14973,
+            "87": 0.14949,
+            "88": 0.14979,
+            "89": 0.14986,
+            "90": 0.14984,
+            "91": 0.1511,
+            "92": 0.14859,
+            "93": 0.14946,
+            "94": 0.14974,
+            "95": 0.14917,
+            "96": 0.1491,
+            "97": 0.14957,
+            "98": 0.14939,
+            "99": 0.14896,
+            "100": 0.14922
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 100,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
             "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
             "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
             "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
             "15": "nan",
+            "16": "nan",
+            "17": 2437.0,
+            "18": 2405.0,
+            "19": 2950.0,
             "20": 1827.0,
+            "21": 2154.0,
+            "22": 2731.0,
+            "23": 2609.0,
+            "24": 2290.0,
             "25": 2325.0,
+            "26": 2079.0,
+            "27": 2138.0,
+            "28": 2702.0,
+            "29": 2576.0,
             "30": 2528.0,
+            "31": 1895.0,
+            "32": 2628.0,
+            "33": 2325.0,
+            "34": 1928.0,
             "35": 2061.0,
-            "40": 2053.0,
-            "45": 2490.0,
-            "50": 2887.0,
-            "55": 2440.0,
-            "60": 2893.0,
-            "65": 2318.0,
-            "70": 3665.0,
-            "75": 2955.0,
-            "80": 3665.0,
-            "85": 4048.0,
-            "90": 3695.0,
-            "95": 4076.0,
-            "100": 3631.0
+            "36": 2153.0,
+            "37": 2600.0,
+            "38": 2350.0,
+            "39": 2997.0,
+            "40": 2042.0,
+            "41": 3349.0,
+            "42": 2512.0,
+            "43": 2750.0,
+            "44": 2120.0,
+            "45": 2537.0,
+            "46": 2247.0,
+            "47": 3061.0,
+            "48": 2520.0,
+            "49": 1969.0,
+            "50": 2951.0,
+            "51": 2300.0,
+            "52": 2456.0,
+            "53": 3730.0,
+            "54": 2866.0,
+            "55": 2413.0,
+            "56": 2477.0,
+            "57": 2410.0,
+            "58": 3424.0,
+            "59": 2861.0,
+            "60": 2939.0,
+            "61": 3044.0,
+            "62": 3127.0,
+            "63": 3236.0,
+            "64": 3212.0,
+            "65": 2304.0,
+            "66": 3805.0,
+            "67": 2691.0,
+            "68": 3332.0,
+            "69": 2874.0,
+            "70": 3746.0,
+            "71": 3057.0,
+            "72": 2717.0,
+            "73": 3332.0,
+            "74": 2214.0,
+            "75": 3059.0,
+            "76": 3625.0,
+            "77": 3957.0,
+            "78": 3955.0,
+            "79": 4130.0,
+            "80": 3627.0,
+            "81": 5242.0,
+            "82": 3566.0,
+            "83": 3261.0,
+            "84": 4036.0,
+            "85": 3907.0,
+            "86": 3340.0,
+            "87": 3954.0,
+            "88": 3630.0,
+            "89": 4358.0,
+            "90": 3800.0,
+            "91": 2877.0,
+            "92": 4239.0,
+            "93": 3604.0,
+            "94": 4356.0,
+            "95": 4107.0,
+            "96": 3835.0,
+            "97": 4094.0,
+            "98": 4835.0,
+            "99": 3873.0,
+            "100": 3709.0
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json
new file mode 100644
index 00000000000..79470a83eaa
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.16758,
+            "52": 10.06896,
+            "53": 10.30707,
+            "54": 10.2091,
+            "55": 10.15688,
+            "56": 9.91475,
+            "57": 9.77696,
+            "58": 10.07417,
+            "59": 9.86333,
+            "60": 9.77328,
+            "61": 9.9292,
+            "62": 10.17156,
+            "63": 9.62041,
+            "64": 9.97113,
+            "65": 9.21979,
+            "66": 9.88693,
+            "67": 9.58363,
+            "68": 9.94922,
+            "69": 9.9527,
+            "70": 9.89312,
+            "71": 9.77658,
+            "72": 9.75435,
+            "73": 9.64969,
+            "74": 9.1439,
+            "75": 9.56121,
+            "76": 9.25111,
+            "77": 10.17063,
+            "78": 9.85402,
+            "79": 9.49965,
+            "80": 9.53087,
+            "81": 9.60555,
+            "82": 9.80179,
+            "83": 9.43744,
+            "84": 9.51987,
+            "85": 9.7196,
+            "86": 9.18596,
+            "87": 9.68687,
+            "88": 9.8443,
+            "89": 9.70586,
+            "90": 9.89977,
+            "91": 9.45029,
+            "92": 9.45356,
+            "93": 9.18553,
+            "94": 8.92968,
+            "95": 9.59767,
+            "96": 9.61491,
+            "97": 9.39084,
+            "98": 9.75668,
+            "99": 8.97922,
+            "100": 9.49
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2300.0,
+            "52": 2456.0,
+            "53": 3730.0,
+            "54": 2866.0,
+            "55": 2413.0,
+            "56": 2477.0,
+            "57": 2410.0,
+            "58": 3424.0,
+            "59": 2861.0,
+            "60": 2939.0,
+            "61": 3044.0,
+            "62": 3127.0,
+            "63": 3236.0,
+            "64": 3212.0,
+            "65": 2304.0,
+            "66": 3805.0,
+            "67": 2691.0,
+            "68": 3332.0,
+            "69": 2874.0,
+            "70": 3746.0,
+            "71": 3057.0,
+            "72": 2717.0,
+            "73": 3332.0,
+            "74": 2214.0,
+            "75": 3059.0,
+            "76": 3625.0,
+            "77": 3957.0,
+            "78": 3955.0,
+            "79": 4130.0,
+            "80": 3627.0,
+            "81": 5242.0,
+            "82": 3566.0,
+            "83": 3261.0,
+            "84": 4036.0,
+            "85": 3907.0,
+            "86": 3340.0,
+            "87": 3954.0,
+            "88": 3630.0,
+            "89": 4358.0,
+            "90": 3800.0,
+            "91": 2877.0,
+            "92": 4239.0,
+            "93": 3604.0,
+            "94": 4356.0,
+            "95": 4107.0,
+            "96": 3835.0,
+            "97": 4094.0,
+            "98": 4835.0,
+            "99": 3873.0,
+            "100": 3709.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 854448640.0,
+            "52": 854448640.0,
+            "53": 854448640.0,
+            "54": 854448640.0,
+            "55": 854448640.0,
+            "56": 854448640.0,
+            "57": 854448640.0,
+            "58": 854448640.0,
+            "59": 854448640.0,
+            "60": 854448640.0,
+            "61": 854448640.0,
+            "62": 854448640.0,
+            "63": 854448640.0,
+            "64": 854448640.0,
+            "65": 854448640.0,
+            "66": 854448640.0,
+            "67": 854448640.0,
+            "68": 854448640.0,
+            "69": 854448640.0,
+            "70": 854448640.0,
+            "71": 854448640.0,
+            "72": 854448640.0,
+            "73": 854448640.0,
+            "74": 854448640.0,
+            "75": 854448640.0,
+            "76": 854448640.0,
+            "77": 854448640.0,
+            "78": 854448640.0,
+            "79": 854448640.0,
+            "80": 854448640.0,
+            "81": 854448640.0,
+            "82": 854448640.0,
+            "83": 854448640.0,
+            "84": 854448640.0,
+            "85": 854448640.0,
+            "86": 854448640.0,
+            "87": 854448640.0,
+            "88": 854448640.0,
+            "89": 854448640.0,
+            "90": 854448640.0,
+            "91": 854448640.0,
+            "92": 854448640.0,
+            "93": 854448640.0,
+            "94": 854448640.0,
+            "95": 854448640.0,
+            "96": 854448640.0,
+            "97": 854448640.0,
+            "98": 854448640.0,
+            "99": 854448640.0,
+            "100": 854448640.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2677025280.0,
+            "52": 2677026304.0,
+            "53": 2677026304.0,
+            "54": 2677026304.0,
+            "55": 2677026304.0,
+            "56": 2677026304.0,
+            "57": 2677026304.0,
+            "58": 2677026304.0,
+            "59": 2677026304.0,
+            "60": 2677026304.0,
+            "61": 2677026304.0,
+            "62": 2677026304.0,
+            "63": 2677026304.0,
+            "64": 2677026304.0,
+            "65": 2677026304.0,
+            "66": 2677026304.0,
+            "67": 2677026304.0,
+            "68": 2677026304.0,
+            "69": 2677026304.0,
+            "70": 2677026304.0,
+            "71": 2677026304.0,
+            "72": 2677026304.0,
+            "73": 2677026304.0,
+            "74": 2677026304.0,
+            "75": 2677026304.0,
+            "76": 2677026304.0,
+            "77": 2677026304.0,
+            "78": 2677026304.0,
+            "79": 2677026304.0,
+            "80": 2677026304.0,
+            "81": 2677026304.0,
+            "82": 2677026304.0,
+            "83": 2677026304.0,
+            "84": 2677026304.0,
+            "85": 2677026304.0,
+            "86": 2677026304.0,
+            "87": 2677026304.0,
+            "88": 2677026304.0,
+            "89": 2677026304.0,
+            "90": 2677026304.0,
+            "91": 2677026304.0,
+            "92": 2677026304.0,
+            "93": 2677026304.0,
+            "94": 2677026304.0,
+            "95": 2677026304.0,
+            "96": 2677026304.0,
+            "97": 2677026304.0,
+            "98": 2677026304.0,
+            "99": 2677026304.0,
+            "100": 2677026304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4.22373,
+            "52": 0.16951,
+            "53": 0.15058,
+            "54": 0.15054,
+            "55": 0.14699,
+            "56": 0.14513,
+            "57": 0.14551,
+            "58": 0.14527,
+            "59": 0.14564,
+            "60": 0.1459,
+            "61": 0.14594,
+            "62": 0.14542,
+            "63": 0.14588,
+            "64": 0.14554,
+            "65": 0.14576,
+            "66": 0.14541,
+            "67": 0.14581,
+            "68": 0.1455,
+            "69": 0.14552,
+            "70": 0.14529,
+            "71": 0.14493,
+            "72": 0.14571,
+            "73": 0.14584,
+            "74": 0.14561,
+            "75": 0.1455,
+            "76": 0.1448,
+            "77": 0.14494,
+            "78": 0.14556,
+            "79": 0.14513,
+            "80": 0.14568,
+            "81": 0.14557,
+            "82": 0.14571,
+            "83": 0.14521,
+            "84": 0.14525,
+            "85": 0.14517,
+            "86": 0.14536,
+            "87": 0.14621,
+            "88": 0.14478,
+            "89": 0.14615,
+            "90": 0.14445,
+            "91": 0.14478,
+            "92": 0.14427,
+            "93": 0.14469,
+            "94": 0.14454,
+            "95": 0.14455,
+            "96": 0.14494,
+            "97": 0.14459,
+            "98": 0.14459,
+            "99": 0.14516,
+            "100": 0.14499
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..1a14c45ef7f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.8363,
+            "2": 10.83592,
+            "3": 10.83615,
+            "4": 10.79957,
+            "5": 10.84951,
+            "6": 10.86614,
+            "7": 10.82832,
+            "8": 10.83954,
+            "9": 10.84439,
+            "10": 10.80563,
+            "11": 10.87626,
+            "12": 10.8635,
+            "13": 10.87519,
+            "14": 10.88261,
+            "15": 10.8549,
+            "16": 10.84719,
+            "17": 10.84007,
+            "18": 10.85358,
+            "19": 10.86134,
+            "20": 10.8411,
+            "21": 10.85973,
+            "22": 10.79599,
+            "23": 10.88309,
+            "24": 10.81942,
+            "25": 10.8032,
+            "26": 10.81364,
+            "27": 10.83184,
+            "28": 10.8227,
+            "29": 10.84469,
+            "30": 10.73484,
+            "31": 10.61361,
+            "32": 10.76183,
+            "33": 10.75999,
+            "34": 10.64098,
+            "35": 10.63833,
+            "36": 10.59381,
+            "37": 10.66212,
+            "38": 10.56593,
+            "39": 10.67809,
+            "40": 10.47027,
+            "41": 10.49977,
+            "42": 10.53376,
+            "43": 10.26135,
+            "44": 10.33935,
+            "45": 10.24399,
+            "46": 10.21706,
+            "47": 10.42307,
+            "48": 10.21623,
+            "49": 9.96614,
+            "50": 10.22788,
+            "51": 10.18063,
+            "52": 10.07636,
+            "53": 10.32773,
+            "54": 10.23662,
+            "55": 10.17779,
+            "56": 9.93459,
+            "57": 9.79047,
+            "58": 10.09308,
+            "59": 9.88561,
+            "60": 9.79776,
+            "61": 9.94517,
+            "62": 10.19094,
+            "63": 9.64683,
+            "64": 9.98455,
+            "65": 9.23395,
+            "66": 9.90453,
+            "67": 9.59582,
+            "68": 9.97649,
+            "69": 9.97495,
+            "70": 9.91345,
+            "71": 9.81704,
+            "72": 9.7724,
+            "73": 9.6613,
+            "74": 9.13276,
+            "75": 9.5758,
+            "76": 9.25498,
+            "77": 10.18582,
+            "78": 9.86011,
+            "79": 9.51637,
+            "80": 9.54101,
+            "81": 9.61959,
+            "82": 9.8199,
+            "83": 9.45715,
+            "84": 9.53646,
+            "85": 9.73396,
+            "86": 9.19313,
+            "87": 9.70118,
+            "88": 9.85742,
+            "89": 9.71286,
+            "90": 9.92642,
+            "91": 9.46223,
+            "92": 9.46428,
+            "93": 9.20456,
+            "94": 8.93882,
+            "95": 9.61804,
+            "96": 9.62982,
+            "97": 9.40186,
+            "98": 9.76277,
+            "99": 9.00132,
+            "100": 9.50913
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 569591808.0,
+            "2": 569591808.0,
+            "3": 569591808.0,
+            "4": 569591808.0,
+            "5": 569591808.0,
+            "6": 569591808.0,
+            "7": 569591808.0,
+            "8": 569591808.0,
+            "9": 569591808.0,
+            "10": 569591808.0,
+            "11": 569591808.0,
+            "12": 569591808.0,
+            "13": 569591808.0,
+            "14": 569591808.0,
+            "15": 569591808.0,
+            "16": 569591808.0,
+            "17": 852351488.0,
+            "18": 852351488.0,
+            "19": 852351488.0,
+            "20": 852351488.0,
+            "21": 852351488.0,
+            "22": 852351488.0,
+            "23": 852351488.0,
+            "24": 852351488.0,
+            "25": 852351488.0,
+            "26": 852351488.0,
+            "27": 852351488.0,
+            "28": 852351488.0,
+            "29": 852351488.0,
+            "30": 852351488.0,
+            "31": 852351488.0,
+            "32": 852351488.0,
+            "33": 852351488.0,
+            "34": 852351488.0,
+            "35": 852351488.0,
+            "36": 852351488.0,
+            "37": 852351488.0,
+            "38": 852351488.0,
+            "39": 852351488.0,
+            "40": 852351488.0,
+            "41": 852351488.0,
+            "42": 852351488.0,
+            "43": 852351488.0,
+            "44": 852351488.0,
+            "45": 852351488.0,
+            "46": 852351488.0,
+            "47": 852351488.0,
+            "48": 852351488.0,
+            "49": 852351488.0,
+            "50": 852351488.0,
+            "51": 852351488.0,
+            "52": 852351488.0,
+            "53": 852351488.0,
+            "54": 852351488.0,
+            "55": 852351488.0,
+            "56": 852351488.0,
+            "57": 852351488.0,
+            "58": 852351488.0,
+            "59": 852351488.0,
+            "60": 852351488.0,
+            "61": 852351488.0,
+            "62": 852351488.0,
+            "63": 852351488.0,
+            "64": 852351488.0,
+            "65": 852351488.0,
+            "66": 852351488.0,
+            "67": 852351488.0,
+            "68": 852351488.0,
+            "69": 852351488.0,
+            "70": 852351488.0,
+            "71": 852351488.0,
+            "72": 852351488.0,
+            "73": 852351488.0,
+            "74": 852351488.0,
+            "75": 852351488.0,
+            "76": 852351488.0,
+            "77": 852351488.0,
+            "78": 852351488.0,
+            "79": 852351488.0,
+            "80": 852351488.0,
+            "81": 852351488.0,
+            "82": 852351488.0,
+            "83": 852351488.0,
+            "84": 852351488.0,
+            "85": 852351488.0,
+            "86": 852351488.0,
+            "87": 852351488.0,
+            "88": 852351488.0,
+            "89": 852351488.0,
+            "90": 852351488.0,
+            "91": 852351488.0,
+            "92": 852351488.0,
+            "93": 852351488.0,
+            "94": 852351488.0,
+            "95": 852351488.0,
+            "96": 852351488.0,
+            "97": 852351488.0,
+            "98": 852351488.0,
+            "99": 852351488.0,
+            "100": 852351488.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2394266112.0,
+            "2": 2394266624.0,
+            "3": 2394266624.0,
+            "4": 2394266624.0,
+            "5": 2394266624.0,
+            "6": 2394266624.0,
+            "7": 2394266624.0,
+            "8": 2394266624.0,
+            "9": 2394266624.0,
+            "10": 2394266624.0,
+            "11": 2394266624.0,
+            "12": 2394266624.0,
+            "13": 2394266624.0,
+            "14": 2394266624.0,
+            "15": 2394266624.0,
+            "16": 2394266624.0,
+            "17": 2394266624.0,
+            "18": 2677288448.0,
+            "19": 2677288448.0,
+            "20": 2677288448.0,
+            "21": 2677288448.0,
+            "22": 2677288448.0,
+            "23": 2677288448.0,
+            "24": 2677288448.0,
+            "25": 2677288448.0,
+            "26": 2677288448.0,
+            "27": 2677288448.0,
+            "28": 2677288448.0,
+            "29": 2677288448.0,
+            "30": 2677288448.0,
+            "31": 2677288448.0,
+            "32": 2677288448.0,
+            "33": 2677288448.0,
+            "34": 2677288448.0,
+            "35": 2677288448.0,
+            "36": 2677288448.0,
+            "37": 2677288448.0,
+            "38": 2677288448.0,
+            "39": 2677288448.0,
+            "40": 2677288448.0,
+            "41": 2677288448.0,
+            "42": 2677288448.0,
+            "43": 2677288448.0,
+            "44": 2677288448.0,
+            "45": 2677288448.0,
+            "46": 2677288448.0,
+            "47": 2677288448.0,
+            "48": 2677288448.0,
+            "49": 2677288448.0,
+            "50": 2677288448.0,
+            "51": 2677288448.0,
+            "52": 2677288448.0,
+            "53": 2677288448.0,
+            "54": 2677288448.0,
+            "55": 2677288448.0,
+            "56": 2677288448.0,
+            "57": 2677288448.0,
+            "58": 2677288448.0,
+            "59": 2677288448.0,
+            "60": 2677288448.0,
+            "61": 2677288448.0,
+            "62": 2677288448.0,
+            "63": 2677288448.0,
+            "64": 2677288448.0,
+            "65": 2677288448.0,
+            "66": 2677288448.0,
+            "67": 2677288448.0,
+            "68": 2677288448.0,
+            "69": 2677288448.0,
+            "70": 2677288448.0,
+            "71": 2677288448.0,
+            "72": 2677288448.0,
+            "73": 2677288448.0,
+            "74": 2677288448.0,
+            "75": 2677288448.0,
+            "76": 2677288448.0,
+            "77": 2677288448.0,
+            "78": 2677288448.0,
+            "79": 2677288448.0,
+            "80": 2677288448.0,
+            "81": 2677288448.0,
+            "82": 2677288448.0,
+            "83": 2677288448.0,
+            "84": 2677288448.0,
+            "85": 2677288448.0,
+            "86": 2677288448.0,
+            "87": 2677288448.0,
+            "88": 2677288448.0,
+            "89": 2677288448.0,
+            "90": 2677288448.0,
+            "91": 2677288448.0,
+            "92": 2677288448.0,
+            "93": 2677288448.0,
+            "94": 2677288448.0,
+            "95": 2677288448.0,
+            "96": 2677288448.0,
+            "97": 2677288448.0,
+            "98": 2677288448.0,
+            "99": 2677288448.0,
+            "100": 2677288448.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.57843,
+            "3": 0.14384,
+            "4": 0.12903,
+            "5": 0.12836,
+            "6": 0.12816,
+            "7": 0.12916,
+            "8": 0.12856,
+            "9": 0.12834,
+            "10": 0.12806,
+            "11": 0.12799,
+            "12": 0.1275,
+            "13": 0.12711,
+            "14": 0.12751,
+            "15": 0.129,
+            "16": 0.12848,
+            "17": 0.16638,
+            "18": 0.14237,
+            "19": 0.1354,
+            "20": 0.13451,
+            "21": 0.13484,
+            "22": 0.1296,
+            "23": 0.134,
+            "24": 0.13542,
+            "25": 0.13555,
+            "26": 0.13391,
+            "27": 0.13338,
+            "28": 0.13321,
+            "29": 0.13407,
+            "30": 0.13362,
+            "31": 0.13566,
+            "32": 0.13345,
+            "33": 0.13445,
+            "34": 0.13432,
+            "35": 0.13463,
+            "36": 0.1333,
+            "37": 0.13493,
+            "38": 0.1341,
+            "39": 0.13366,
+            "40": 0.14828,
+            "41": 0.15021,
+            "42": 0.14974,
+            "43": 0.15118,
+            "44": 0.15264,
+            "45": 0.15167,
+            "46": 0.15228,
+            "47": 0.15164,
+            "48": 0.15268,
+            "49": 0.15149,
+            "50": 0.15349,
+            "51": 0.18359,
+            "52": 0.15225,
+            "53": 0.14909,
+            "54": 0.1498,
+            "55": 0.14962,
+            "56": 0.14941,
+            "57": 0.14896,
+            "58": 0.14931,
+            "59": 0.149,
+            "60": 0.14965,
+            "61": 0.15012,
+            "62": 0.15003,
+            "63": 0.14869,
+            "64": 0.14833,
+            "65": 0.14957,
+            "66": 0.14978,
+            "67": 0.14987,
+            "68": 0.15037,
+            "69": 0.15122,
+            "70": 0.15108,
+            "71": 0.14994,
+            "72": 0.1507,
+            "73": 0.15127,
+            "74": 0.15073,
+            "75": 0.14986,
+            "76": 0.15012,
+            "77": 0.15071,
+            "78": 0.15091,
+            "79": 0.15129,
+            "80": 0.15073,
+            "81": 0.15107,
+            "82": 0.15091,
+            "83": 0.15083,
+            "84": 0.15069,
+            "85": 0.15272,
+            "86": 0.1517,
+            "87": 0.15055,
+            "88": 0.15217,
+            "89": 0.15281,
+            "90": 0.14893,
+            "91": 0.16006,
+            "92": 0.15632,
+            "93": 0.15975,
+            "94": 0.1591,
+            "95": 0.15873,
+            "96": 0.15918,
+            "97": 0.15958,
+            "98": 0.15854,
+            "99": 0.15737,
+            "100": 0.15785
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": 2382.0,
+            "18": 2453.0,
+            "19": 3160.0,
+            "20": 1803.0,
+            "21": 2176.0,
+            "22": "nan",
+            "23": 2602.0,
+            "24": 2269.0,
+            "25": 2273.0,
+            "26": 1994.0,
+            "27": 2158.0,
+            "28": 2596.0,
+            "29": 2482.0,
+            "30": 2432.0,
+            "31": 1881.0,
+            "32": 2727.0,
+            "33": 2329.0,
+            "34": 1979.0,
+            "35": 1953.0,
+            "36": 2152.0,
+            "37": 2620.0,
+            "38": 2256.0,
+            "39": 3092.0,
+            "40": 2087.0,
+            "41": 3218.0,
+            "42": 2436.0,
+            "43": 2553.0,
+            "44": 2101.0,
+            "45": 2479.0,
+            "46": 2236.0,
+            "47": 2903.0,
+            "48": 2483.0,
+            "49": 1893.0,
+            "50": 3008.0,
+            "51": 2281.0,
+            "52": 2534.0,
+            "53": 3604.0,
+            "54": 2989.0,
+            "55": 2624.0,
+            "56": 2547.0,
+            "57": 2287.0,
+            "58": 3322.0,
+            "59": 2730.0,
+            "60": 2919.0,
+            "61": 3007.0,
+            "62": 3131.0,
+            "63": 3226.0,
+            "64": 3219.0,
+            "65": 2422.0,
+            "66": 3741.0,
+            "67": 2805.0,
+            "68": 3215.0,
+            "69": 2871.0,
+            "70": 3597.0,
+            "71": 3045.0,
+            "72": 2952.0,
+            "73": 3559.0,
+            "74": 2232.0,
+            "75": 2889.0,
+            "76": 3802.0,
+            "77": 3635.0,
+            "78": 3762.0,
+            "79": 4000.0,
+            "80": 3383.0,
+            "81": 4629.0,
+            "82": 3435.0,
+            "83": 3254.0,
+            "84": 3786.0,
+            "85": 3895.0,
+            "86": 3338.0,
+            "87": 4169.0,
+            "88": 3498.0,
+            "89": 4065.0,
+            "90": 3825.0,
+            "91": 3040.0,
+            "92": 4399.0,
+            "93": 3899.0,
+            "94": 4449.0,
+            "95": 4017.0,
+            "96": 3820.0,
+            "97": 4268.0,
+            "98": 5094.0,
+            "99": 3940.0,
+            "100": 3369.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..8d115483589
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.18063,
+            "52": 10.07636,
+            "53": 10.32773,
+            "54": 10.23662,
+            "55": 10.17779,
+            "56": 9.93459,
+            "57": 9.79047,
+            "58": 10.09308,
+            "59": 9.88561,
+            "60": 9.79776,
+            "61": 9.94517,
+            "62": 10.19094,
+            "63": 9.64683,
+            "64": 9.98455,
+            "65": 9.23395,
+            "66": 9.90453,
+            "67": 9.59582,
+            "68": 9.97649,
+            "69": 9.97495,
+            "70": 9.91345,
+            "71": 9.81704,
+            "72": 9.7724,
+            "73": 9.6613,
+            "74": 9.13276,
+            "75": 9.5758,
+            "76": 9.25498,
+            "77": 10.18582,
+            "78": 9.86011,
+            "79": 9.51637,
+            "80": 9.54101,
+            "81": 9.61959,
+            "82": 9.8199,
+            "83": 9.45715,
+            "84": 9.53646,
+            "85": 9.73396,
+            "86": 9.19313,
+            "87": 9.70118,
+            "88": 9.85742,
+            "89": 9.71286,
+            "90": 9.92642,
+            "91": 9.46223,
+            "92": 9.46428,
+            "93": 9.20456,
+            "94": 8.93882,
+            "95": 9.61804,
+            "96": 9.62982,
+            "97": 9.40186,
+            "98": 9.76277,
+            "99": 9.00132,
+            "100": 9.50913
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2281.0,
+            "52": 2534.0,
+            "53": 3604.0,
+            "54": 2989.0,
+            "55": 2624.0,
+            "56": 2547.0,
+            "57": 2287.0,
+            "58": 3322.0,
+            "59": 2730.0,
+            "60": 2919.0,
+            "61": 3007.0,
+            "62": 3131.0,
+            "63": 3226.0,
+            "64": 3219.0,
+            "65": 2422.0,
+            "66": 3741.0,
+            "67": 2805.0,
+            "68": 3215.0,
+            "69": 2871.0,
+            "70": 3597.0,
+            "71": 3045.0,
+            "72": 2952.0,
+            "73": 3559.0,
+            "74": 2232.0,
+            "75": 2889.0,
+            "76": 3802.0,
+            "77": 3635.0,
+            "78": 3762.0,
+            "79": 4000.0,
+            "80": 3383.0,
+            "81": 4629.0,
+            "82": 3435.0,
+            "83": 3254.0,
+            "84": 3786.0,
+            "85": 3895.0,
+            "86": 3338.0,
+            "87": 4169.0,
+            "88": 3498.0,
+            "89": 4065.0,
+            "90": 3825.0,
+            "91": 3040.0,
+            "92": 4399.0,
+            "93": 3899.0,
+            "94": 4449.0,
+            "95": 4017.0,
+            "96": 3820.0,
+            "97": 4268.0,
+            "98": 5094.0,
+            "99": 3940.0,
+            "100": 3369.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 853400064.0,
+            "52": 853400064.0,
+            "53": 853400064.0,
+            "54": 853400064.0,
+            "55": 853400064.0,
+            "56": 853400064.0,
+            "57": 853400064.0,
+            "58": 853400064.0,
+            "59": 853400064.0,
+            "60": 853400064.0,
+            "61": 853400064.0,
+            "62": 853400064.0,
+            "63": 853400064.0,
+            "64": 853400064.0,
+            "65": 853400064.0,
+            "66": 853400064.0,
+            "67": 853400064.0,
+            "68": 853400064.0,
+            "69": 853400064.0,
+            "70": 853400064.0,
+            "71": 853400064.0,
+            "72": 853400064.0,
+            "73": 853400064.0,
+            "74": 853400064.0,
+            "75": 853400064.0,
+            "76": 853400064.0,
+            "77": 853400064.0,
+            "78": 853400064.0,
+            "79": 853400064.0,
+            "80": 853400064.0,
+            "81": 853400064.0,
+            "82": 853400064.0,
+            "83": 853400064.0,
+            "84": 853400064.0,
+            "85": 853400064.0,
+            "86": 853400064.0,
+            "87": 853400064.0,
+            "88": 853400064.0,
+            "89": 853400064.0,
+            "90": 853400064.0,
+            "91": 853400064.0,
+            "92": 853400064.0,
+            "93": 853400064.0,
+            "94": 853400064.0,
+            "95": 853400064.0,
+            "96": 853400064.0,
+            "97": 853400064.0,
+            "98": 853400064.0,
+            "99": 853400064.0,
+            "100": 853400064.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2677025280.0,
+            "52": 2677026304.0,
+            "53": 2677026304.0,
+            "54": 2677026304.0,
+            "55": 2677026304.0,
+            "56": 2677026304.0,
+            "57": 2677026304.0,
+            "58": 2677026304.0,
+            "59": 2677026304.0,
+            "60": 2677026304.0,
+            "61": 2677026304.0,
+            "62": 2677026304.0,
+            "63": 2677026304.0,
+            "64": 2677026304.0,
+            "65": 2677026304.0,
+            "66": 2677026304.0,
+            "67": 2677026304.0,
+            "68": 2677026304.0,
+            "69": 2677026304.0,
+            "70": 2677026304.0,
+            "71": 2677026304.0,
+            "72": 2677026304.0,
+            "73": 2677026304.0,
+            "74": 2677026304.0,
+            "75": 2677026304.0,
+            "76": 2677026304.0,
+            "77": 2677026304.0,
+            "78": 2677026304.0,
+            "79": 2677026304.0,
+            "80": 2677026304.0,
+            "81": 2677026304.0,
+            "82": 2677026304.0,
+            "83": 2677026304.0,
+            "84": 2677026304.0,
+            "85": 2677026304.0,
+            "86": 2677026304.0,
+            "87": 2677026304.0,
+            "88": 2677026304.0,
+            "89": 2677026304.0,
+            "90": 2677026304.0,
+            "91": 2677026304.0,
+            "92": 2677026304.0,
+            "93": 2677026304.0,
+            "94": 2677026304.0,
+            "95": 2677026304.0,
+            "96": 2677026304.0,
+            "97": 2677026304.0,
+            "98": 2677026304.0,
+            "99": 2677026304.0,
+            "100": 2677026304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.52584,
+            "53": 0.20392,
+            "54": 0.16414,
+            "55": 0.13847,
+            "56": 0.13338,
+            "57": 0.13357,
+            "58": 0.13375,
+            "59": 0.13394,
+            "60": 0.1334,
+            "61": 0.1341,
+            "62": 0.13391,
+            "63": 0.13459,
+            "64": 0.13308,
+            "65": 0.13494,
+            "66": 0.13405,
+            "67": 0.13432,
+            "68": 0.13481,
+            "69": 0.13446,
+            "70": 0.13476,
+            "71": 0.13398,
+            "72": 0.13448,
+            "73": 0.13508,
+            "74": 0.13535,
+            "75": 0.13502,
+            "76": 0.13588,
+            "77": 0.13483,
+            "78": 0.13626,
+            "79": 0.13542,
+            "80": 0.13571,
+            "81": 0.13587,
+            "82": 0.13461,
+            "83": 0.13533,
+            "84": 0.13399,
+            "85": 0.13532,
+            "86": 0.13468,
+            "87": 0.13492,
+            "88": 0.13529,
+            "89": 0.13664,
+            "90": 0.13526,
+            "91": 0.13377,
+            "92": 0.13367,
+            "93": 0.13265,
+            "94": 0.13355,
+            "95": 0.13376,
+            "96": 0.13303,
+            "97": 0.13448,
+            "98": 0.13371,
+            "99": 0.13395,
+            "100": 0.1334
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json
index 3b0a03dc6ef..2ea1feb19e0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json
@@ -54,12 +54,12 @@
             "48": 10.18128,
             "49": 9.94311,
             "50": 10.21224,
-            "51": 10.16759,
-            "52": 10.06895,
+            "51": 10.16758,
+            "52": 10.06896,
             "53": 10.30707,
-            "54": 10.20911,
+            "54": 10.2091,
             "55": 10.15688,
-            "56": 9.91474,
+            "56": 9.91475,
             "57": 9.77696,
             "58": 10.07417,
             "59": 9.86333,
@@ -72,38 +72,38 @@
             "66": 9.88693,
             "67": 9.58363,
             "68": 9.94922,
-            "69": 9.95271,
+            "69": 9.9527,
             "70": 9.89312,
             "71": 9.77658,
             "72": 9.75435,
-            "73": 9.6497,
+            "73": 9.64969,
             "74": 9.1439,
             "75": 9.56121,
             "76": 9.25111,
             "77": 10.17063,
             "78": 9.85402,
             "79": 9.49965,
-            "80": 9.53086,
+            "80": 9.53087,
             "81": 9.60555,
             "82": 9.80179,
             "83": 9.43744,
             "84": 9.51987,
             "85": 9.7196,
-            "86": 9.18595,
+            "86": 9.18596,
             "87": 9.68687,
             "88": 9.8443,
             "89": 9.70586,
             "90": 9.89977,
             "91": 9.45029,
             "92": 9.45356,
-            "93": 9.18554,
+            "93": 9.18553,
             "94": 8.92968,
             "95": 9.59767,
             "96": 9.61491,
             "97": 9.39084,
-            "98": 9.75667,
-            "99": 8.97921,
-            "100": 9.49001
+            "98": 9.75668,
+            "99": 8.97922,
+            "100": 9.49
         }
     },
     "mem-allocated-bytes": {
@@ -220,7 +220,7 @@
         "values": {
             "1": 2393217536.0,
             "2": 2393218048.0,
-            "3": 2393218048.0,
+            "3": 2394266624.0,
             "4": 2394266624.0,
             "5": 2394266624.0,
             "6": 2394266624.0,
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.58025,
-            "2": 0.18555,
-            "3": 0.31194,
-            "4": 0.1522,
-            "5": 0.15205,
-            "6": 0.1496,
-            "7": 0.14979,
-            "8": 0.14921,
-            "9": 0.14957,
-            "10": 0.15024,
-            "11": 0.14887,
-            "12": 0.14852,
-            "13": 0.14925,
-            "14": 0.15079,
-            "15": 0.14925,
-            "16": 0.14936,
-            "17": 0.2057,
-            "18": 0.15996,
-            "19": 0.15397,
-            "20": 0.15414,
-            "21": 0.1543,
-            "22": 0.15499,
-            "23": 0.15504,
-            "24": 0.15679,
-            "25": 0.15462,
-            "26": 0.15509,
-            "27": 0.15394,
-            "28": 0.15487,
-            "29": 0.15522,
-            "30": 0.1553,
-            "31": 0.15536,
-            "32": 0.15406,
-            "33": 0.15461,
-            "34": 0.1548,
-            "35": 0.15472,
-            "36": 0.15413,
-            "37": 0.1548,
-            "38": 0.15446,
-            "39": 0.15545,
-            "40": 0.15442,
-            "41": 0.15567,
-            "42": 0.15413,
-            "43": 0.15585,
-            "44": 0.15428,
-            "45": 0.15497,
-            "46": 0.15438,
-            "47": 0.15508,
-            "48": 0.15481,
-            "49": 0.15466,
-            "50": 0.15476,
-            "51": 0.16245,
-            "52": 0.15411,
-            "53": 0.15376,
-            "54": 0.15405,
-            "55": 0.15375,
-            "56": 0.15402,
-            "57": 0.15434,
-            "58": 0.15404,
-            "59": 0.15454,
-            "60": 0.15434,
-            "61": 0.15384,
-            "62": 0.15505,
-            "63": 0.15431,
-            "64": 0.15388,
-            "65": 0.1547,
-            "66": 0.15453,
-            "67": 0.15364,
-            "68": 0.15388,
-            "69": 0.15362,
-            "70": 0.15366,
-            "71": 0.15425,
-            "72": 0.15393,
-            "73": 0.15476,
-            "74": 0.15414,
-            "75": 0.15415,
-            "76": 0.1535,
-            "77": 0.15481,
-            "78": 0.1541,
-            "79": 0.15382,
-            "80": 0.15363,
-            "81": 0.15386,
-            "82": 0.18555,
-            "83": 0.15422,
-            "84": 0.15393,
-            "85": 0.15462,
-            "86": 0.15512,
-            "87": 0.15391,
-            "88": 0.15431,
-            "89": 0.15431,
-            "90": 0.15521,
-            "91": 0.15475,
-            "92": 0.154,
-            "93": 0.15414,
-            "94": 0.15426,
-            "95": 0.15422,
-            "96": 0.15393,
-            "97": 0.15497,
-            "98": 0.1538,
-            "99": 0.15481,
-            "100": 0.15442
+            "1": 4.04251,
+            "2": 0.18354,
+            "3": 0.16567,
+            "4": 0.14879,
+            "5": 0.14798,
+            "6": 0.14636,
+            "7": 0.14643,
+            "8": 0.14702,
+            "9": 0.14536,
+            "10": 0.1472,
+            "11": 0.1449,
+            "12": 0.14483,
+            "13": 0.14552,
+            "14": 0.14513,
+            "15": 0.14541,
+            "16": 0.14509,
+            "17": 0.19318,
+            "18": 0.15745,
+            "19": 0.15066,
+            "20": 0.1498,
+            "21": 0.15004,
+            "22": 0.15029,
+            "23": 0.15017,
+            "24": 0.15021,
+            "25": 0.14964,
+            "26": 0.15048,
+            "27": 0.15016,
+            "28": 0.15022,
+            "29": 0.15074,
+            "30": 0.15018,
+            "31": 0.15122,
+            "32": 0.15081,
+            "33": 0.1504,
+            "34": 0.15026,
+            "35": 0.15149,
+            "36": 0.14995,
+            "37": 0.1504,
+            "38": 0.15025,
+            "39": 0.15065,
+            "40": 0.14967,
+            "41": 0.15071,
+            "42": 0.1495,
+            "43": 0.15057,
+            "44": 0.14971,
+            "45": 0.14997,
+            "46": 0.14973,
+            "47": 0.14981,
+            "48": 0.14986,
+            "49": 0.15006,
+            "50": 0.14923,
+            "51": 0.15753,
+            "52": 0.1506,
+            "53": 0.14818,
+            "54": 0.14906,
+            "55": 0.14884,
+            "56": 0.14846,
+            "57": 0.1497,
+            "58": 0.14946,
+            "59": 0.14898,
+            "60": 0.14864,
+            "61": 0.14782,
+            "62": 0.14952,
+            "63": 0.14895,
+            "64": 0.14958,
+            "65": 0.14948,
+            "66": 0.14887,
+            "67": 0.1481,
+            "68": 0.14882,
+            "69": 0.14911,
+            "70": 0.15091,
+            "71": 0.14829,
+            "72": 0.15153,
+            "73": 0.14917,
+            "74": 0.1489,
+            "75": 0.14776,
+            "76": 0.14826,
+            "77": 0.1498,
+            "78": 0.14886,
+            "79": 0.14846,
+            "80": 0.14828,
+            "81": 0.14965,
+            "82": 0.14889,
+            "83": 0.1484,
+            "84": 0.14864,
+            "85": 0.14911,
+            "86": 0.14911,
+            "87": 0.14856,
+            "88": 0.14854,
+            "89": 0.1487,
+            "90": 0.14823,
+            "91": 0.15008,
+            "92": 0.14856,
+            "93": 0.14939,
+            "94": 0.14915,
+            "95": 0.14847,
+            "96": 0.1485,
+            "97": 0.14951,
+            "98": 0.14965,
+            "99": 0.14868,
+            "100": 0.14783
         }
     },
     "num-zeros": {
@@ -471,67 +471,67 @@
             "37": 2600.0,
             "38": 2350.0,
             "39": 2997.0,
-            "40": 2053.0,
-            "41": 3352.0,
-            "42": 2497.0,
-            "43": 2867.0,
-            "44": 2109.0,
-            "45": 2490.0,
-            "46": 2279.0,
-            "47": 3051.0,
-            "48": 2527.0,
-            "49": 1973.0,
-            "50": 2887.0,
-            "51": 2310.0,
-            "52": 2526.0,
-            "53": 3705.0,
-            "54": 2888.0,
-            "55": 2440.0,
-            "56": 2496.0,
-            "57": 2338.0,
-            "58": 3283.0,
-            "59": 2849.0,
-            "60": 2893.0,
-            "61": 2956.0,
-            "62": 3134.0,
-            "63": 3275.0,
-            "64": 3176.0,
-            "65": 2318.0,
-            "66": 3857.0,
-            "67": 2606.0,
-            "68": 3313.0,
-            "69": 2826.0,
-            "70": 3665.0,
-            "71": 3011.0,
-            "72": 2693.0,
-            "73": 3357.0,
-            "74": 2271.0,
-            "75": 2955.0,
-            "76": 3617.0,
-            "77": 3936.0,
-            "78": 3951.0,
-            "79": 4065.0,
-            "80": 3665.0,
-            "81": 5191.0,
-            "82": 3511.0,
-            "83": 3263.0,
-            "84": 3876.0,
-            "85": 4048.0,
-            "86": 3414.0,
-            "87": 3980.0,
-            "88": 3617.0,
-            "89": 4400.0,
-            "90": 3695.0,
-            "91": 2857.0,
-            "92": 4432.0,
-            "93": 3494.0,
-            "94": 4438.0,
-            "95": 4076.0,
-            "96": 3948.0,
-            "97": 4242.0,
-            "98": 4943.0,
-            "99": 3861.0,
-            "100": 3631.0
+            "40": 2042.0,
+            "41": 3349.0,
+            "42": 2512.0,
+            "43": 2750.0,
+            "44": 2120.0,
+            "45": 2537.0,
+            "46": 2247.0,
+            "47": 3061.0,
+            "48": 2520.0,
+            "49": 1969.0,
+            "50": 2951.0,
+            "51": 2300.0,
+            "52": 2456.0,
+            "53": 3730.0,
+            "54": 2866.0,
+            "55": 2413.0,
+            "56": 2477.0,
+            "57": 2410.0,
+            "58": 3424.0,
+            "59": 2861.0,
+            "60": 2939.0,
+            "61": 3044.0,
+            "62": 3127.0,
+            "63": 3236.0,
+            "64": 3212.0,
+            "65": 2304.0,
+            "66": 3805.0,
+            "67": 2691.0,
+            "68": 3332.0,
+            "69": 2874.0,
+            "70": 3746.0,
+            "71": 3057.0,
+            "72": 2717.0,
+            "73": 3332.0,
+            "74": 2214.0,
+            "75": 3059.0,
+            "76": 3625.0,
+            "77": 3957.0,
+            "78": 3955.0,
+            "79": 4130.0,
+            "80": 3627.0,
+            "81": 5242.0,
+            "82": 3566.0,
+            "83": 3261.0,
+            "84": 4036.0,
+            "85": 3907.0,
+            "86": 3340.0,
+            "87": 3954.0,
+            "88": 3630.0,
+            "89": 4358.0,
+            "90": 3800.0,
+            "91": 2877.0,
+            "92": 4239.0,
+            "93": 3604.0,
+            "94": 4356.0,
+            "95": 4107.0,
+            "96": 3835.0,
+            "97": 4094.0,
+            "98": 4835.0,
+            "99": 3873.0,
+            "100": 3727.0
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..a37cec4df3f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.16758,
+            "52": 10.06896,
+            "53": 10.30707,
+            "54": 10.2091,
+            "55": 10.15688,
+            "56": 9.91475,
+            "57": 9.77696,
+            "58": 10.07417,
+            "59": 9.86333,
+            "60": 9.77328,
+            "61": 9.9292,
+            "62": 10.17156,
+            "63": 9.62041,
+            "64": 9.97113,
+            "65": 9.21979,
+            "66": 9.88693,
+            "67": 9.58363,
+            "68": 9.94922,
+            "69": 9.9527,
+            "70": 9.89312,
+            "71": 9.77658,
+            "72": 9.75435,
+            "73": 9.64969,
+            "74": 9.1439,
+            "75": 9.56121,
+            "76": 9.25111,
+            "77": 10.17063,
+            "78": 9.85402,
+            "79": 9.49965,
+            "80": 9.53087,
+            "81": 9.60555,
+            "82": 9.80179,
+            "83": 9.43744,
+            "84": 9.51987,
+            "85": 9.7196,
+            "86": 9.18596,
+            "87": 9.68687,
+            "88": 9.8443,
+            "89": 9.70586,
+            "90": 9.89977,
+            "91": 9.45029,
+            "92": 9.45356,
+            "93": 9.18553,
+            "94": 8.92968,
+            "95": 9.59767,
+            "96": 9.61491,
+            "97": 9.39084,
+            "98": 9.75668,
+            "99": 8.97922,
+            "100": 9.49
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2300.0,
+            "52": 2456.0,
+            "53": 3730.0,
+            "54": 2866.0,
+            "55": 2413.0,
+            "56": 2477.0,
+            "57": 2410.0,
+            "58": 3424.0,
+            "59": 2861.0,
+            "60": 2939.0,
+            "61": 3044.0,
+            "62": 3127.0,
+            "63": 3236.0,
+            "64": 3212.0,
+            "65": 2304.0,
+            "66": 3805.0,
+            "67": 2691.0,
+            "68": 3332.0,
+            "69": 2874.0,
+            "70": 3746.0,
+            "71": 3057.0,
+            "72": 2717.0,
+            "73": 3332.0,
+            "74": 2214.0,
+            "75": 3059.0,
+            "76": 3625.0,
+            "77": 3957.0,
+            "78": 3955.0,
+            "79": 4130.0,
+            "80": 3627.0,
+            "81": 5242.0,
+            "82": 3566.0,
+            "83": 3261.0,
+            "84": 4036.0,
+            "85": 3907.0,
+            "86": 3340.0,
+            "87": 3954.0,
+            "88": 3630.0,
+            "89": 4358.0,
+            "90": 3800.0,
+            "91": 2877.0,
+            "92": 4239.0,
+            "93": 3604.0,
+            "94": 4356.0,
+            "95": 4107.0,
+            "96": 3835.0,
+            "97": 4094.0,
+            "98": 4835.0,
+            "99": 3873.0,
+            "100": 3727.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 854448640.0,
+            "52": 854448640.0,
+            "53": 854448640.0,
+            "54": 854448640.0,
+            "55": 854448640.0,
+            "56": 854448640.0,
+            "57": 854448640.0,
+            "58": 854448640.0,
+            "59": 854448640.0,
+            "60": 854448640.0,
+            "61": 854448640.0,
+            "62": 854448640.0,
+            "63": 854448640.0,
+            "64": 854448640.0,
+            "65": 854448640.0,
+            "66": 854448640.0,
+            "67": 854448640.0,
+            "68": 854448640.0,
+            "69": 854448640.0,
+            "70": 854448640.0,
+            "71": 854448640.0,
+            "72": 854448640.0,
+            "73": 854448640.0,
+            "74": 854448640.0,
+            "75": 854448640.0,
+            "76": 854448640.0,
+            "77": 854448640.0,
+            "78": 854448640.0,
+            "79": 854448640.0,
+            "80": 854448640.0,
+            "81": 854448640.0,
+            "82": 854448640.0,
+            "83": 854448640.0,
+            "84": 854448640.0,
+            "85": 854448640.0,
+            "86": 854448640.0,
+            "87": 854448640.0,
+            "88": 854448640.0,
+            "89": 854448640.0,
+            "90": 854448640.0,
+            "91": 854448640.0,
+            "92": 854448640.0,
+            "93": 854448640.0,
+            "94": 854448640.0,
+            "95": 854448640.0,
+            "96": 854448640.0,
+            "97": 854448640.0,
+            "98": 854448640.0,
+            "99": 854448640.0,
+            "100": 854448640.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2677025280.0,
+            "52": 2677026304.0,
+            "53": 2677026304.0,
+            "54": 2677026304.0,
+            "55": 2677026304.0,
+            "56": 2677026304.0,
+            "57": 2677026304.0,
+            "58": 2677026304.0,
+            "59": 2677026304.0,
+            "60": 2677026304.0,
+            "61": 2677026304.0,
+            "62": 2677026304.0,
+            "63": 2677026304.0,
+            "64": 2677026304.0,
+            "65": 2677026304.0,
+            "66": 2677026304.0,
+            "67": 2677026304.0,
+            "68": 2677026304.0,
+            "69": 2677026304.0,
+            "70": 2677026304.0,
+            "71": 2677026304.0,
+            "72": 2677026304.0,
+            "73": 2677026304.0,
+            "74": 2677026304.0,
+            "75": 2677026304.0,
+            "76": 2677026304.0,
+            "77": 2677026304.0,
+            "78": 2677026304.0,
+            "79": 2677026304.0,
+            "80": 2677026304.0,
+            "81": 2677026304.0,
+            "82": 2677026304.0,
+            "83": 2677026304.0,
+            "84": 2677026304.0,
+            "85": 2677026304.0,
+            "86": 2677026304.0,
+            "87": 2677026304.0,
+            "88": 2677026304.0,
+            "89": 2677026304.0,
+            "90": 2677026304.0,
+            "91": 2677026304.0,
+            "92": 2677026304.0,
+            "93": 2677026304.0,
+            "94": 2677026304.0,
+            "95": 2677026304.0,
+            "96": 2677026304.0,
+            "97": 2677026304.0,
+            "98": 2677026304.0,
+            "99": 2677026304.0,
+            "100": 2677026304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.87816,
+            "52": 0.16917,
+            "53": 0.15082,
+            "54": 0.15061,
+            "55": 0.14996,
+            "56": 0.14817,
+            "57": 0.1493,
+            "58": 0.14853,
+            "59": 0.14922,
+            "60": 0.14866,
+            "61": 0.14887,
+            "62": 0.14883,
+            "63": 0.14881,
+            "64": 0.14895,
+            "65": 0.14967,
+            "66": 0.14908,
+            "67": 0.1494,
+            "68": 0.14978,
+            "69": 0.15047,
+            "70": 0.1524,
+            "71": 0.14848,
+            "72": 0.14825,
+            "73": 0.14947,
+            "74": 0.14886,
+            "75": 0.14848,
+            "76": 0.14764,
+            "77": 0.14818,
+            "78": 0.14955,
+            "79": 0.14914,
+            "80": 0.14801,
+            "81": 0.14894,
+            "82": 0.14906,
+            "83": 0.14922,
+            "84": 0.14891,
+            "85": 0.14792,
+            "86": 0.14798,
+            "87": 0.14822,
+            "88": 0.14842,
+            "89": 0.14832,
+            "90": 0.14755,
+            "91": 0.1493,
+            "92": 0.14752,
+            "93": 0.14879,
+            "94": 0.14918,
+            "95": 0.15196,
+            "96": 0.1524,
+            "97": 0.14795,
+            "98": 0.14778,
+            "99": 0.14781,
+            "100": 0.14987
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..0322a9120bd
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.84064,
+            "2": 10.85201,
+            "3": 10.84256,
+            "4": 10.84663,
+            "5": 10.85667,
+            "6": 10.8655,
+            "7": 10.85455,
+            "8": 10.84814,
+            "9": 10.85295,
+            "10": 10.82026,
+            "11": 10.86468,
+            "12": 10.85604,
+            "13": 10.87584,
+            "14": 10.86361,
+            "15": 10.86365,
+            "16": 10.86053,
+            "17": 10.84579,
+            "18": 10.8538,
+            "19": 10.85943,
+            "20": 10.84139,
+            "21": 10.86327,
+            "22": 10.83014,
+            "23": 10.85749,
+            "24": 10.83816,
+            "25": 10.82517,
+            "26": 10.8257,
+            "27": 10.83038,
+            "28": 10.82029,
+            "29": 10.81214,
+            "30": 10.74061,
+            "31": 10.68185,
+            "32": 10.76069,
+            "33": 10.7491,
+            "34": 10.67394,
+            "35": 10.65529,
+            "36": 10.63303,
+            "37": 10.66285,
+            "38": 10.60535,
+            "39": 10.6732,
+            "40": 10.50952,
+            "41": 10.53339,
+            "42": 10.54981,
+            "43": 10.35084,
+            "44": 10.3993,
+            "45": 10.31307,
+            "46": 10.27398,
+            "47": 10.45772,
+            "48": 10.27942,
+            "49": 10.05213,
+            "50": 10.28011,
+            "51": 10.23426,
+            "52": 10.13488,
+            "53": 10.35279,
+            "54": 10.26189,
+            "55": 10.20983,
+            "56": 9.99599,
+            "57": 9.87962,
+            "58": 10.13391,
+            "59": 9.92304,
+            "60": 9.85379,
+            "61": 9.97314,
+            "62": 10.211,
+            "63": 9.70514,
+            "64": 10.01457,
+            "65": 9.30759,
+            "66": 9.9366,
+            "67": 9.63221,
+            "68": 9.98219,
+            "69": 9.98048,
+            "70": 9.92986,
+            "71": 9.81575,
+            "72": 9.79602,
+            "73": 9.69104,
+            "74": 9.20049,
+            "75": 9.61228,
+            "76": 9.28906,
+            "77": 10.19068,
+            "78": 9.86601,
+            "79": 9.53855,
+            "80": 9.5578,
+            "81": 9.63332,
+            "82": 9.82853,
+            "83": 9.47188,
+            "84": 9.54101,
+            "85": 9.74266,
+            "86": 9.2142,
+            "87": 9.7016,
+            "88": 9.86604,
+            "89": 9.72339,
+            "90": 9.92767,
+            "91": 9.47045,
+            "92": 9.46809,
+            "93": 9.21217,
+            "94": 8.94887,
+            "95": 9.62787,
+            "96": 9.6406,
+            "97": 9.40839,
+            "98": 9.77147,
+            "99": 9.00853,
+            "100": 9.51225
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 284527616.0,
+            "2": 284527616.0,
+            "3": 284527616.0,
+            "4": 284527616.0,
+            "5": 284527616.0,
+            "6": 284527616.0,
+            "7": 284527616.0,
+            "8": 284527616.0,
+            "9": 284527616.0,
+            "10": 284527616.0,
+            "11": 284527616.0,
+            "12": 284527616.0,
+            "13": 284527616.0,
+            "14": 284527616.0,
+            "15": 284527616.0,
+            "16": 416513536.0,
+            "17": 416513536.0,
+            "18": 416513536.0,
+            "19": 416513536.0,
+            "20": 416513536.0,
+            "21": 416513536.0,
+            "22": 416513536.0,
+            "23": 416513536.0,
+            "24": 416513536.0,
+            "25": 416513536.0,
+            "26": 416513536.0,
+            "27": 416513536.0,
+            "28": 416513536.0,
+            "29": 416513536.0,
+            "30": 416513536.0,
+            "31": 416513536.0,
+            "32": 416513536.0,
+            "33": 416513536.0,
+            "34": 416513536.0,
+            "35": 416513536.0,
+            "36": 416513536.0,
+            "37": 416513536.0,
+            "38": 416513536.0,
+            "39": 416513536.0,
+            "40": 416513536.0,
+            "41": 416513536.0,
+            "42": 416513536.0,
+            "43": 416513536.0,
+            "44": 416513536.0,
+            "45": 416513536.0,
+            "46": 416513536.0,
+            "47": 416513536.0,
+            "48": 416513536.0,
+            "49": 416513536.0,
+            "50": 416513536.0,
+            "51": 416513536.0,
+            "52": 416513536.0,
+            "53": 416513536.0,
+            "54": 416513536.0,
+            "55": 416513536.0,
+            "56": 416513536.0,
+            "57": 416513536.0,
+            "58": 416513536.0,
+            "59": 416513536.0,
+            "60": 416513536.0,
+            "61": 416513536.0,
+            "62": 416513536.0,
+            "63": 416513536.0,
+            "64": 416513536.0,
+            "65": 416513536.0,
+            "66": 416513536.0,
+            "67": 416513536.0,
+            "68": 416513536.0,
+            "69": 416513536.0,
+            "70": 416513536.0,
+            "71": 416513536.0,
+            "72": 416513536.0,
+            "73": 416513536.0,
+            "74": 416513536.0,
+            "75": 416513536.0,
+            "76": 416513536.0,
+            "77": 416513536.0,
+            "78": 416513536.0,
+            "79": 416513536.0,
+            "80": 416513536.0,
+            "81": 416513536.0,
+            "82": 416513536.0,
+            "83": 416513536.0,
+            "84": 416513536.0,
+            "85": 416513536.0,
+            "86": 416513536.0,
+            "87": 416513536.0,
+            "88": 416513536.0,
+            "89": 416513536.0,
+            "90": 416513536.0,
+            "91": 416513536.0,
+            "92": 416513536.0,
+            "93": 416513536.0,
+            "94": 416513536.0,
+            "95": 416513536.0,
+            "96": 416513536.0,
+            "97": 416513536.0,
+            "98": 416513536.0,
+            "99": 416513536.0,
+            "100": 416513536.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1465368064.0,
+            "2": 1465368576.0,
+            "3": 1465368576.0,
+            "4": 1465368576.0,
+            "5": 1465368576.0,
+            "6": 1465368576.0,
+            "7": 1465368576.0,
+            "8": 1465368576.0,
+            "9": 1465368576.0,
+            "10": 1465368576.0,
+            "11": 1465368576.0,
+            "12": 1465368576.0,
+            "13": 1465368576.0,
+            "14": 1465368576.0,
+            "15": 1465368576.0,
+            "16": 1465368576.0,
+            "17": 1597485568.0,
+            "18": 1597485568.0,
+            "19": 1597485568.0,
+            "20": 1597485568.0,
+            "21": 1597485568.0,
+            "22": 1597485568.0,
+            "23": 1597485568.0,
+            "24": 1597485568.0,
+            "25": 1597485568.0,
+            "26": 1597485568.0,
+            "27": 1597485568.0,
+            "28": 1597485568.0,
+            "29": 1597485568.0,
+            "30": 1597485568.0,
+            "31": 1597485568.0,
+            "32": 1597485568.0,
+            "33": 1597485568.0,
+            "34": 1597485568.0,
+            "35": 1597485568.0,
+            "36": 1597485568.0,
+            "37": 1597485568.0,
+            "38": 1597485568.0,
+            "39": 1597485568.0,
+            "40": 1597485568.0,
+            "41": 1597485568.0,
+            "42": 1597485568.0,
+            "43": 1597485568.0,
+            "44": 1597485568.0,
+            "45": 1597485568.0,
+            "46": 1597485568.0,
+            "47": 1597485568.0,
+            "48": 1597485568.0,
+            "49": 1597485568.0,
+            "50": 1597485568.0,
+            "51": 1597485568.0,
+            "52": 1597485568.0,
+            "53": 1597485568.0,
+            "54": 1597485568.0,
+            "55": 1597485568.0,
+            "56": 1597485568.0,
+            "57": 1597485568.0,
+            "58": 1597485568.0,
+            "59": 1597485568.0,
+            "60": 1597485568.0,
+            "61": 1597485568.0,
+            "62": 1597485568.0,
+            "63": 1597485568.0,
+            "64": 1597485568.0,
+            "65": 1597485568.0,
+            "66": 1597485568.0,
+            "67": 1597485568.0,
+            "68": 1597485568.0,
+            "69": 1597485568.0,
+            "70": 1597485568.0,
+            "71": 1597485568.0,
+            "72": 1597485568.0,
+            "73": 1597485568.0,
+            "74": 1597485568.0,
+            "75": 1597485568.0,
+            "76": 1597485568.0,
+            "77": 1597485568.0,
+            "78": 1597485568.0,
+            "79": 1597485568.0,
+            "80": 1597485568.0,
+            "81": 1597485568.0,
+            "82": 1597485568.0,
+            "83": 1597485568.0,
+            "84": 1597485568.0,
+            "85": 1597485568.0,
+            "86": 1597485568.0,
+            "87": 1597485568.0,
+            "88": 1597485568.0,
+            "89": 1597485568.0,
+            "90": 1597485568.0,
+            "91": 1597485568.0,
+            "92": 1597485568.0,
+            "93": 1597485568.0,
+            "94": 1597485568.0,
+            "95": 1597485568.0,
+            "96": 1597485568.0,
+            "97": 1597485568.0,
+            "98": 1597485568.0,
+            "99": 1597485568.0,
+            "100": 1597485568.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.03297,
+            "3": 0.32699,
+            "4": 0.31597,
+            "5": 0.31552,
+            "6": 0.3147,
+            "7": 0.31402,
+            "8": 0.31397,
+            "9": 0.31445,
+            "10": 0.31986,
+            "11": 0.31186,
+            "12": 0.30888,
+            "13": 0.3063,
+            "14": 0.30631,
+            "15": 0.30635,
+            "16": 0.38476,
+            "17": 0.31287,
+            "18": 0.32251,
+            "19": 0.32317,
+            "20": 0.31607,
+            "21": 0.31688,
+            "22": 0.30059,
+            "23": 0.31719,
+            "24": 0.3226,
+            "25": 0.31854,
+            "26": 0.31658,
+            "27": 0.31847,
+            "28": 0.3176,
+            "29": 0.31877,
+            "30": 0.31787,
+            "31": 0.31758,
+            "32": 0.31563,
+            "33": 0.31399,
+            "34": 0.31522,
+            "35": 0.31891,
+            "36": 0.3187,
+            "37": 0.32382,
+            "38": 0.32042,
+            "39": 0.32237,
+            "40": 0.32145,
+            "41": 0.31906,
+            "42": 0.31768,
+            "43": 0.32041,
+            "44": 0.31937,
+            "45": 0.32089,
+            "46": 0.31845,
+            "47": 0.31938,
+            "48": 0.31552,
+            "49": 0.31733,
+            "50": 0.3196,
+            "51": 0.31383,
+            "52": 0.31412,
+            "53": 0.31249,
+            "54": 0.3127,
+            "55": 0.31762,
+            "56": 0.31919,
+            "57": 0.31929,
+            "58": 0.32119,
+            "59": 0.31667,
+            "60": 0.32308,
+            "61": 0.31738,
+            "62": 0.32278,
+            "63": 0.31714,
+            "64": 0.31073,
+            "65": 0.30929,
+            "66": 0.30856,
+            "67": 0.31027,
+            "68": 0.3103,
+            "69": 0.31124,
+            "70": 0.30886,
+            "71": 0.30892,
+            "72": 0.31237,
+            "73": 0.31647,
+            "74": 0.31733,
+            "75": 0.31764,
+            "76": 0.3165,
+            "77": 0.31656,
+            "78": 0.3176,
+            "79": 0.31747,
+            "80": 0.3171,
+            "81": 0.31656,
+            "82": 0.3168,
+            "83": 0.31697,
+            "84": 0.3181,
+            "85": 0.31755,
+            "86": 0.31749,
+            "87": 0.31765,
+            "88": 0.31775,
+            "89": 0.31806,
+            "90": 0.31417,
+            "91": 0.64575,
+            "92": 0.3228,
+            "93": 0.3237,
+            "94": 0.32187,
+            "95": 0.32154,
+            "96": 0.32116,
+            "97": 0.33046,
+            "98": 0.35266,
+            "99": 0.32136,
+            "100": 0.32174
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": 2392.0,
+            "17": "nan",
+            "18": 2314.0,
+            "19": 2912.0,
+            "20": 1640.0,
+            "21": 2053.0,
+            "22": "nan",
+            "23": 2462.0,
+            "24": 2226.0,
+            "25": 2201.0,
+            "26": 1963.0,
+            "27": 1926.0,
+            "28": 2401.0,
+            "29": 2492.0,
+            "30": 2393.0,
+            "31": 1704.0,
+            "32": 2541.0,
+            "33": 2096.0,
+            "34": 1737.0,
+            "35": 1810.0,
+            "36": 1982.0,
+            "37": 2511.0,
+            "38": 2185.0,
+            "39": 2899.0,
+            "40": 1888.0,
+            "41": 3169.0,
+            "42": 2343.0,
+            "43": 2501.0,
+            "44": 1938.0,
+            "45": 2346.0,
+            "46": 2091.0,
+            "47": 2853.0,
+            "48": 2402.0,
+            "49": 1810.0,
+            "50": 2718.0,
+            "51": 2080.0,
+            "52": 2200.0,
+            "53": 3412.0,
+            "54": 2641.0,
+            "55": 2229.0,
+            "56": 2244.0,
+            "57": 2057.0,
+            "58": 3223.0,
+            "59": 2431.0,
+            "60": 2650.0,
+            "61": 2712.0,
+            "62": 2995.0,
+            "63": 2816.0,
+            "64": 2860.0,
+            "65": 2015.0,
+            "66": 3176.0,
+            "67": 2529.0,
+            "68": 3108.0,
+            "69": 2873.0,
+            "70": 3540.0,
+            "71": 2904.0,
+            "72": 2693.0,
+            "73": 3253.0,
+            "74": 1981.0,
+            "75": 2780.0,
+            "76": 3465.0,
+            "77": 3649.0,
+            "78": 3593.0,
+            "79": 3981.0,
+            "80": 3458.0,
+            "81": 5181.0,
+            "82": 3334.0,
+            "83": 2956.0,
+            "84": 3527.0,
+            "85": 3711.0,
+            "86": 3209.0,
+            "87": 4133.0,
+            "88": 3443.0,
+            "89": 4295.0,
+            "90": 3801.0,
+            "91": 2958.0,
+            "92": 4311.0,
+            "93": 3544.0,
+            "94": 4264.0,
+            "95": 4042.0,
+            "96": 3849.0,
+            "97": 3974.0,
+            "98": 4971.0,
+            "99": 4071.0,
+            "100": 3363.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..121b3fe11b7
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.23426,
+            "52": 10.13488,
+            "53": 10.35279,
+            "54": 10.26189,
+            "55": 10.20983,
+            "56": 9.99599,
+            "57": 9.87962,
+            "58": 10.13391,
+            "59": 9.92304,
+            "60": 9.85379,
+            "61": 9.97314,
+            "62": 10.211,
+            "63": 9.70514,
+            "64": 10.01457,
+            "65": 9.30759,
+            "66": 9.9366,
+            "67": 9.63221,
+            "68": 9.98219,
+            "69": 9.98048,
+            "70": 9.92986,
+            "71": 9.81575,
+            "72": 9.79602,
+            "73": 9.69104,
+            "74": 9.20049,
+            "75": 9.61228,
+            "76": 9.28906,
+            "77": 10.19068,
+            "78": 9.86601,
+            "79": 9.53855,
+            "80": 9.5578,
+            "81": 9.63332,
+            "82": 9.82853,
+            "83": 9.47188,
+            "84": 9.54101,
+            "85": 9.74266,
+            "86": 9.2142,
+            "87": 9.7016,
+            "88": 9.86604,
+            "89": 9.72339,
+            "90": 9.92767,
+            "91": 9.47045,
+            "92": 9.46809,
+            "93": 9.21217,
+            "94": 8.94887,
+            "95": 9.62787,
+            "96": 9.6406,
+            "97": 9.40839,
+            "98": 9.77147,
+            "99": 9.00853,
+            "100": 9.51225
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2080.0,
+            "52": 2200.0,
+            "53": 3412.0,
+            "54": 2641.0,
+            "55": 2229.0,
+            "56": 2244.0,
+            "57": 2057.0,
+            "58": 3223.0,
+            "59": 2431.0,
+            "60": 2650.0,
+            "61": 2712.0,
+            "62": 2995.0,
+            "63": 2816.0,
+            "64": 2860.0,
+            "65": 2015.0,
+            "66": 3176.0,
+            "67": 2529.0,
+            "68": 3108.0,
+            "69": 2873.0,
+            "70": 3540.0,
+            "71": 2904.0,
+            "72": 2693.0,
+            "73": 3253.0,
+            "74": 1981.0,
+            "75": 2780.0,
+            "76": 3465.0,
+            "77": 3649.0,
+            "78": 3593.0,
+            "79": 3981.0,
+            "80": 3458.0,
+            "81": 5181.0,
+            "82": 3334.0,
+            "83": 2956.0,
+            "84": 3527.0,
+            "85": 3711.0,
+            "86": 3209.0,
+            "87": 4133.0,
+            "88": 3443.0,
+            "89": 4295.0,
+            "90": 3801.0,
+            "91": 2958.0,
+            "92": 4311.0,
+            "93": 3544.0,
+            "94": 4264.0,
+            "95": 4042.0,
+            "96": 3849.0,
+            "97": 3974.0,
+            "98": 4971.0,
+            "99": 4071.0,
+            "100": 3363.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 414416384.0,
+            "52": 414416384.0,
+            "53": 414416384.0,
+            "54": 414416384.0,
+            "55": 414416384.0,
+            "56": 414416384.0,
+            "57": 414416384.0,
+            "58": 414416384.0,
+            "59": 414416384.0,
+            "60": 414416384.0,
+            "61": 414416384.0,
+            "62": 414416384.0,
+            "63": 414416384.0,
+            "64": 414416384.0,
+            "65": 414416384.0,
+            "66": 414416384.0,
+            "67": 414416384.0,
+            "68": 414416384.0,
+            "69": 414416384.0,
+            "70": 414416384.0,
+            "71": 414416384.0,
+            "72": 414416384.0,
+            "73": 414416384.0,
+            "74": 414416384.0,
+            "75": 414416384.0,
+            "76": 414416384.0,
+            "77": 414416384.0,
+            "78": 414416384.0,
+            "79": 414416384.0,
+            "80": 414416384.0,
+            "81": 414416384.0,
+            "82": 414416384.0,
+            "83": 414416384.0,
+            "84": 414416384.0,
+            "85": 414416384.0,
+            "86": 414416384.0,
+            "87": 414416384.0,
+            "88": 414416384.0,
+            "89": 414416384.0,
+            "90": 414416384.0,
+            "91": 414416384.0,
+            "92": 414416384.0,
+            "93": 414416384.0,
+            "94": 414416384.0,
+            "95": 414416384.0,
+            "96": 414416384.0,
+            "97": 414416384.0,
+            "98": 414416384.0,
+            "99": 414416384.0,
+            "100": 414416384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1595256320.0,
+            "52": 1595257344.0,
+            "53": 1595257344.0,
+            "54": 1595257344.0,
+            "55": 1595257344.0,
+            "56": 1595257344.0,
+            "57": 1595257344.0,
+            "58": 1595257344.0,
+            "59": 1595257344.0,
+            "60": 1595257344.0,
+            "61": 1595257344.0,
+            "62": 1595257344.0,
+            "63": 1595257344.0,
+            "64": 1595257344.0,
+            "65": 1595257344.0,
+            "66": 1595257344.0,
+            "67": 1595257344.0,
+            "68": 1595257344.0,
+            "69": 1595257344.0,
+            "70": 1595257344.0,
+            "71": 1595257344.0,
+            "72": 1595257344.0,
+            "73": 1595257344.0,
+            "74": 1595257344.0,
+            "75": 1595257344.0,
+            "76": 1595257344.0,
+            "77": 1595257344.0,
+            "78": 1595257344.0,
+            "79": 1595257344.0,
+            "80": 1595257344.0,
+            "81": 1595257344.0,
+            "82": 1595257344.0,
+            "83": 1595257344.0,
+            "84": 1595257344.0,
+            "85": 1595257344.0,
+            "86": 1595257344.0,
+            "87": 1595257344.0,
+            "88": 1595257344.0,
+            "89": 1595257344.0,
+            "90": 1595257344.0,
+            "91": 1595257344.0,
+            "92": 1595257344.0,
+            "93": 1595257344.0,
+            "94": 1595257344.0,
+            "95": 1595257344.0,
+            "96": 1595257344.0,
+            "97": 1595257344.0,
+            "98": 1595257344.0,
+            "99": 1595257344.0,
+            "100": 1595257344.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 1.90434,
+            "53": 0.34839,
+            "54": 0.46608,
+            "55": 0.31197,
+            "56": 0.30786,
+            "57": 0.30592,
+            "58": 0.30764,
+            "59": 0.30985,
+            "60": 0.30933,
+            "61": 0.30847,
+            "62": 0.30963,
+            "63": 0.31127,
+            "64": 0.30722,
+            "65": 0.31035,
+            "66": 0.31029,
+            "67": 0.30982,
+            "68": 0.30966,
+            "69": 0.30823,
+            "70": 0.30879,
+            "71": 0.3091,
+            "72": 0.30555,
+            "73": 0.30938,
+            "74": 0.3098,
+            "75": 0.31215,
+            "76": 0.30896,
+            "77": 0.31066,
+            "78": 0.31001,
+            "79": 0.30673,
+            "80": 0.30699,
+            "81": 0.30771,
+            "82": 0.3072,
+            "83": 0.30839,
+            "84": 0.30892,
+            "85": 0.30911,
+            "86": 0.30528,
+            "87": 0.30757,
+            "88": 0.30812,
+            "89": 0.3083,
+            "90": 0.30825,
+            "91": 0.30638,
+            "92": 0.30467,
+            "93": 0.30582,
+            "94": 0.30847,
+            "95": 0.30633,
+            "96": 0.30614,
+            "97": 0.30644,
+            "98": 0.30185,
+            "99": 0.30221,
+            "100": 0.30191
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json
index 47fa63fad72..7d93101382f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json
@@ -78,22 +78,22 @@
             "72": 9.798,
             "73": 9.68454,
             "74": 9.19951,
-            "75": 9.60518,
-            "76": 9.27791,
-            "77": 10.19437,
+            "75": 9.60519,
+            "76": 9.2779,
+            "77": 10.19436,
             "78": 9.8671,
             "79": 9.53341,
             "80": 9.56341,
             "81": 9.63047,
             "82": 9.82819,
             "83": 9.46388,
-            "84": 9.53736,
-            "85": 9.74561,
+            "84": 9.53735,
+            "85": 9.74562,
             "86": 9.21332,
-            "87": 9.7014,
+            "87": 9.70141,
             "88": 9.86621,
             "89": 9.72242,
-            "90": 9.92089,
+            "90": 9.9209,
             "91": 9.47178,
             "92": 9.46996,
             "93": 9.20589,
@@ -234,90 +234,90 @@
             "14": 1465368576.0,
             "15": 1465368576.0,
             "16": 1465368576.0,
-            "17": 1597092352.0,
-            "18": 1597092352.0,
-            "19": 1597092352.0,
-            "20": 1597092352.0,
-            "21": 1597092352.0,
-            "22": 1597092352.0,
-            "23": 1597092352.0,
-            "24": 1597092352.0,
-            "25": 1597092352.0,
-            "26": 1597092352.0,
-            "27": 1597092352.0,
-            "28": 1597092352.0,
-            "29": 1597092352.0,
-            "30": 1597092352.0,
-            "31": 1597092352.0,
-            "32": 1597092352.0,
-            "33": 1597092352.0,
-            "34": 1597092352.0,
-            "35": 1597092352.0,
-            "36": 1597092352.0,
-            "37": 1597092352.0,
-            "38": 1597092352.0,
-            "39": 1597092352.0,
-            "40": 1597092352.0,
-            "41": 1597092352.0,
-            "42": 1597092352.0,
-            "43": 1597092352.0,
-            "44": 1597092352.0,
-            "45": 1597092352.0,
-            "46": 1597092352.0,
-            "47": 1597092352.0,
-            "48": 1597092352.0,
-            "49": 1597092352.0,
-            "50": 1597092352.0,
-            "51": 1597092352.0,
-            "52": 1597092352.0,
-            "53": 1597092352.0,
-            "54": 1597092352.0,
-            "55": 1597092352.0,
-            "56": 1597092352.0,
-            "57": 1597092352.0,
-            "58": 1597092352.0,
-            "59": 1597092352.0,
-            "60": 1597092352.0,
-            "61": 1597092352.0,
-            "62": 1597092352.0,
-            "63": 1597092352.0,
-            "64": 1597092352.0,
-            "65": 1597092352.0,
-            "66": 1597092352.0,
-            "67": 1597092352.0,
-            "68": 1597092352.0,
-            "69": 1597092352.0,
-            "70": 1597092352.0,
-            "71": 1597092352.0,
-            "72": 1597092352.0,
-            "73": 1597092352.0,
-            "74": 1597092352.0,
-            "75": 1597092352.0,
-            "76": 1597092352.0,
-            "77": 1597092352.0,
-            "78": 1597092352.0,
-            "79": 1597092352.0,
-            "80": 1597092352.0,
-            "81": 1597092352.0,
-            "82": 1597092352.0,
-            "83": 1597092352.0,
-            "84": 1597092352.0,
-            "85": 1597092352.0,
-            "86": 1597092352.0,
-            "87": 1597092352.0,
-            "88": 1597092352.0,
-            "89": 1597092352.0,
-            "90": 1597092352.0,
-            "91": 1597092352.0,
-            "92": 1597092352.0,
-            "93": 1597092352.0,
-            "94": 1597092352.0,
-            "95": 1597092352.0,
-            "96": 1597092352.0,
-            "97": 1597092352.0,
-            "98": 1597092352.0,
-            "99": 1597092352.0,
-            "100": 1597092352.0
+            "17": 1597485568.0,
+            "18": 1597485568.0,
+            "19": 1597485568.0,
+            "20": 1597485568.0,
+            "21": 1597485568.0,
+            "22": 1597485568.0,
+            "23": 1597485568.0,
+            "24": 1597485568.0,
+            "25": 1597485568.0,
+            "26": 1597485568.0,
+            "27": 1597485568.0,
+            "28": 1597485568.0,
+            "29": 1597485568.0,
+            "30": 1597485568.0,
+            "31": 1597485568.0,
+            "32": 1597485568.0,
+            "33": 1597485568.0,
+            "34": 1597485568.0,
+            "35": 1597485568.0,
+            "36": 1597485568.0,
+            "37": 1597485568.0,
+            "38": 1597485568.0,
+            "39": 1597485568.0,
+            "40": 1597485568.0,
+            "41": 1597485568.0,
+            "42": 1597485568.0,
+            "43": 1597485568.0,
+            "44": 1597485568.0,
+            "45": 1597485568.0,
+            "46": 1597485568.0,
+            "47": 1597485568.0,
+            "48": 1597485568.0,
+            "49": 1597485568.0,
+            "50": 1597485568.0,
+            "51": 1597485568.0,
+            "52": 1597485568.0,
+            "53": 1597485568.0,
+            "54": 1597485568.0,
+            "55": 1597485568.0,
+            "56": 1597485568.0,
+            "57": 1597485568.0,
+            "58": 1597485568.0,
+            "59": 1597485568.0,
+            "60": 1597485568.0,
+            "61": 1597485568.0,
+            "62": 1597485568.0,
+            "63": 1597485568.0,
+            "64": 1597485568.0,
+            "65": 1597485568.0,
+            "66": 1597485568.0,
+            "67": 1597485568.0,
+            "68": 1597485568.0,
+            "69": 1597485568.0,
+            "70": 1597485568.0,
+            "71": 1597485568.0,
+            "72": 1597485568.0,
+            "73": 1597485568.0,
+            "74": 1597485568.0,
+            "75": 1597485568.0,
+            "76": 1597485568.0,
+            "77": 1597485568.0,
+            "78": 1597485568.0,
+            "79": 1597485568.0,
+            "80": 1597485568.0,
+            "81": 1597485568.0,
+            "82": 1597485568.0,
+            "83": 1597485568.0,
+            "84": 1597485568.0,
+            "85": 1597485568.0,
+            "86": 1597485568.0,
+            "87": 1597485568.0,
+            "88": 1597485568.0,
+            "89": 1597485568.0,
+            "90": 1597485568.0,
+            "91": 1597485568.0,
+            "92": 1597485568.0,
+            "93": 1597485568.0,
+            "94": 1597485568.0,
+            "95": 1597485568.0,
+            "96": 1597485568.0,
+            "97": 1597485568.0,
+            "98": 1597485568.0,
+            "99": 1597485568.0,
+            "100": 1597485568.0
         }
     },
     "iteration-time": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.02035,
-            "2": 0.23195,
-            "3": 0.20851,
-            "4": 0.20697,
-            "5": 0.20737,
-            "6": 0.20888,
-            "7": 0.2126,
-            "8": 0.21169,
-            "9": 0.21057,
-            "10": 0.21255,
-            "11": 0.21108,
-            "12": 0.21506,
-            "13": 0.21085,
-            "14": 0.21072,
-            "15": 0.20967,
-            "16": 0.28325,
-            "17": 0.21485,
-            "18": 0.21984,
-            "19": 0.22277,
-            "20": 0.22004,
-            "21": 0.2242,
-            "22": 0.21349,
-            "23": 0.22346,
-            "24": 0.22444,
-            "25": 0.22521,
-            "26": 0.22267,
-            "27": 0.22592,
-            "28": 0.22136,
-            "29": 0.22802,
-            "30": 0.2227,
-            "31": 0.22084,
-            "32": 0.22099,
-            "33": 0.22019,
-            "34": 0.22336,
-            "35": 0.23024,
-            "36": 0.23188,
-            "37": 0.21929,
-            "38": 0.22277,
-            "39": 0.22303,
-            "40": 0.22269,
-            "41": 0.22539,
-            "42": 0.22835,
-            "43": 0.22379,
-            "44": 0.22103,
-            "45": 0.21919,
-            "46": 0.22653,
-            "47": 0.21996,
-            "48": 0.22399,
-            "49": 0.22202,
-            "50": 0.22099,
-            "51": 0.21773,
-            "52": 0.22165,
-            "53": 0.2208,
-            "54": 0.22241,
-            "55": 0.22007,
-            "56": 0.22113,
-            "57": 0.22282,
-            "58": 0.22209,
-            "59": 0.22153,
-            "60": 0.22251,
-            "61": 0.22383,
-            "62": 0.22477,
-            "63": 0.22389,
-            "64": 0.22518,
-            "65": 0.22491,
-            "66": 0.22204,
-            "67": 0.23149,
-            "68": 0.22301,
-            "69": 0.2298,
-            "70": 0.23059,
-            "71": 0.22412,
-            "72": 0.21788,
-            "73": 0.2209,
-            "74": 0.22227,
-            "75": 0.22603,
-            "76": 0.22022,
-            "77": 0.22045,
-            "78": 0.22051,
-            "79": 0.22157,
-            "80": 0.22544,
-            "81": 0.22703,
-            "82": 0.23226,
-            "83": 0.23535,
-            "84": 0.22503,
-            "85": 0.21869,
-            "86": 0.21989,
-            "87": 0.21782,
-            "88": 0.22296,
-            "89": 0.24294,
-            "90": 0.27356,
-            "91": 0.2182,
-            "92": 0.22138,
-            "93": 0.21695,
-            "94": 0.22172,
-            "95": 0.21947,
-            "96": 0.21792,
-            "97": 0.22243,
-            "98": 0.21902,
-            "99": 0.2202,
-            "100": 0.22043
+            "1": 7.98979,
+            "2": 0.23108,
+            "3": 0.20672,
+            "4": 0.19092,
+            "5": 0.18929,
+            "6": 0.18601,
+            "7": 0.18145,
+            "8": 0.1825,
+            "9": 0.18096,
+            "10": 0.17945,
+            "11": 0.18072,
+            "12": 0.18215,
+            "13": 0.18198,
+            "14": 0.18069,
+            "15": 0.18115,
+            "16": 0.26838,
+            "17": 0.1891,
+            "18": 0.18758,
+            "19": 0.1866,
+            "20": 0.193,
+            "21": 0.19158,
+            "22": 0.18199,
+            "23": 0.19182,
+            "24": 0.18937,
+            "25": 0.19172,
+            "26": 0.19541,
+            "27": 0.19359,
+            "28": 0.18942,
+            "29": 0.18922,
+            "30": 0.19555,
+            "31": 0.18932,
+            "32": 0.18729,
+            "33": 0.18652,
+            "34": 0.18698,
+            "35": 0.18671,
+            "36": 0.19043,
+            "37": 0.18639,
+            "38": 0.1876,
+            "39": 0.18889,
+            "40": 0.18979,
+            "41": 0.18978,
+            "42": 0.1917,
+            "43": 0.1905,
+            "44": 0.18866,
+            "45": 0.18792,
+            "46": 0.18874,
+            "47": 0.18981,
+            "48": 0.18652,
+            "49": 0.18751,
+            "50": 0.18675,
+            "51": 0.19039,
+            "52": 0.19014,
+            "53": 0.18825,
+            "54": 0.18861,
+            "55": 0.18671,
+            "56": 0.1887,
+            "57": 0.18709,
+            "58": 0.18833,
+            "59": 0.18683,
+            "60": 0.18818,
+            "61": 0.18735,
+            "62": 0.18776,
+            "63": 0.18826,
+            "64": 0.18823,
+            "65": 0.1891,
+            "66": 0.18962,
+            "67": 0.19168,
+            "68": 0.18718,
+            "69": 0.18647,
+            "70": 0.18731,
+            "71": 0.18749,
+            "72": 0.18696,
+            "73": 0.18682,
+            "74": 0.18953,
+            "75": 0.18603,
+            "76": 0.18491,
+            "77": 0.18695,
+            "78": 0.19298,
+            "79": 0.19006,
+            "80": 0.1864,
+            "81": 0.18786,
+            "82": 0.19211,
+            "83": 0.18632,
+            "84": 0.19075,
+            "85": 0.18575,
+            "86": 0.21258,
+            "87": 0.20475,
+            "88": 0.18504,
+            "89": 0.18486,
+            "90": 0.18505,
+            "91": 0.18427,
+            "92": 0.18546,
+            "93": 0.20396,
+            "94": 0.18728,
+            "95": 0.18571,
+            "96": 0.18504,
+            "97": 0.18668,
+            "98": 0.18684,
+            "99": 0.18604,
+            "100": 0.18586
         }
     },
     "num-zeros": {
@@ -506,32 +506,32 @@
             "72": 2640.0,
             "73": 3199.0,
             "74": 2084.0,
-            "75": 2809.0,
-            "76": 3599.0,
-            "77": 3667.0,
-            "78": 3680.0,
-            "79": 3972.0,
-            "80": 3365.0,
-            "81": 5042.0,
-            "82": 3291.0,
-            "83": 3016.0,
-            "84": 3592.0,
-            "85": 3792.0,
-            "86": 3192.0,
-            "87": 4219.0,
-            "88": 3376.0,
-            "89": 4110.0,
-            "90": 3939.0,
-            "91": 2912.0,
-            "92": 4114.0,
-            "93": 3499.0,
-            "94": 4339.0,
-            "95": 3829.0,
-            "96": 3875.0,
-            "97": 4100.0,
-            "98": 4889.0,
-            "99": 3771.0,
-            "100": 3390.0
+            "75": 2823.0,
+            "76": 3490.0,
+            "77": 3710.0,
+            "78": 3619.0,
+            "79": 3911.0,
+            "80": 3431.0,
+            "81": 4963.0,
+            "82": 3460.0,
+            "83": 3062.0,
+            "84": 3593.0,
+            "85": 3752.0,
+            "86": 3255.0,
+            "87": 4096.0,
+            "88": 3272.0,
+            "89": 4074.0,
+            "90": 3810.0,
+            "91": 2877.0,
+            "92": 4080.0,
+            "93": 3469.0,
+            "94": 4428.0,
+            "95": 3850.0,
+            "96": 3832.0,
+            "97": 4102.0,
+            "98": 4833.0,
+            "99": 3795.0,
+            "100": 3405.0
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..7b47664603b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.23471,
+            "52": 10.13764,
+            "53": 10.34797,
+            "54": 10.26738,
+            "55": 10.20734,
+            "56": 9.99527,
+            "57": 9.89333,
+            "58": 10.13452,
+            "59": 9.92856,
+            "60": 9.8551,
+            "61": 9.98264,
+            "62": 10.20686,
+            "63": 9.70842,
+            "64": 10.01687,
+            "65": 9.30409,
+            "66": 9.93326,
+            "67": 9.62677,
+            "68": 9.98429,
+            "69": 9.9755,
+            "70": 9.93956,
+            "71": 9.81005,
+            "72": 9.798,
+            "73": 9.68454,
+            "74": 9.19951,
+            "75": 9.60519,
+            "76": 9.2779,
+            "77": 10.19436,
+            "78": 9.8671,
+            "79": 9.53341,
+            "80": 9.56341,
+            "81": 9.63047,
+            "82": 9.82819,
+            "83": 9.46388,
+            "84": 9.53735,
+            "85": 9.74562,
+            "86": 9.21332,
+            "87": 9.70141,
+            "88": 9.86621,
+            "89": 9.72242,
+            "90": 9.9209,
+            "91": 9.47178,
+            "92": 9.46996,
+            "93": 9.20589,
+            "94": 8.94772,
+            "95": 9.60815,
+            "96": 9.63635,
+            "97": 9.4138,
+            "98": 9.77274,
+            "99": 8.9958,
+            "100": 9.50415
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2041.0,
+            "52": 2226.0,
+            "53": 3222.0,
+            "54": 2784.0,
+            "55": 2290.0,
+            "56": 2428.0,
+            "57": 2146.0,
+            "58": 3048.0,
+            "59": 2504.0,
+            "60": 2612.0,
+            "61": 2623.0,
+            "62": 3003.0,
+            "63": 2762.0,
+            "64": 2917.0,
+            "65": 2104.0,
+            "66": 3550.0,
+            "67": 2433.0,
+            "68": 3146.0,
+            "69": 2877.0,
+            "70": 3528.0,
+            "71": 2983.0,
+            "72": 2640.0,
+            "73": 3199.0,
+            "74": 2084.0,
+            "75": 2823.0,
+            "76": 3490.0,
+            "77": 3710.0,
+            "78": 3619.0,
+            "79": 3911.0,
+            "80": 3431.0,
+            "81": 4963.0,
+            "82": 3460.0,
+            "83": 3062.0,
+            "84": 3593.0,
+            "85": 3752.0,
+            "86": 3255.0,
+            "87": 4096.0,
+            "88": 3272.0,
+            "89": 4074.0,
+            "90": 3810.0,
+            "91": 2877.0,
+            "92": 4080.0,
+            "93": 3469.0,
+            "94": 4428.0,
+            "95": 3850.0,
+            "96": 3832.0,
+            "97": 4102.0,
+            "98": 4833.0,
+            "99": 3795.0,
+            "100": 3405.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 414416384.0,
+            "52": 414416384.0,
+            "53": 414416384.0,
+            "54": 414416384.0,
+            "55": 414416384.0,
+            "56": 414416384.0,
+            "57": 414416384.0,
+            "58": 414416384.0,
+            "59": 414416384.0,
+            "60": 414416384.0,
+            "61": 414416384.0,
+            "62": 414416384.0,
+            "63": 414416384.0,
+            "64": 414416384.0,
+            "65": 414416384.0,
+            "66": 414416384.0,
+            "67": 414416384.0,
+            "68": 414416384.0,
+            "69": 414416384.0,
+            "70": 414416384.0,
+            "71": 414416384.0,
+            "72": 414416384.0,
+            "73": 414416384.0,
+            "74": 414416384.0,
+            "75": 414416384.0,
+            "76": 414416384.0,
+            "77": 414416384.0,
+            "78": 414416384.0,
+            "79": 414416384.0,
+            "80": 414416384.0,
+            "81": 414416384.0,
+            "82": 414416384.0,
+            "83": 414416384.0,
+            "84": 414416384.0,
+            "85": 414416384.0,
+            "86": 414416384.0,
+            "87": 414416384.0,
+            "88": 414416384.0,
+            "89": 414416384.0,
+            "90": 414416384.0,
+            "91": 414416384.0,
+            "92": 414416384.0,
+            "93": 414416384.0,
+            "94": 414416384.0,
+            "95": 414416384.0,
+            "96": 414416384.0,
+            "97": 414416384.0,
+            "98": 414416384.0,
+            "99": 414416384.0,
+            "100": 414416384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1595256320.0,
+            "52": 1595257344.0,
+            "53": 1595257344.0,
+            "54": 1595257344.0,
+            "55": 1595257344.0,
+            "56": 1595257344.0,
+            "57": 1595257344.0,
+            "58": 1595257344.0,
+            "59": 1595257344.0,
+            "60": 1595257344.0,
+            "61": 1595257344.0,
+            "62": 1595257344.0,
+            "63": 1595257344.0,
+            "64": 1595257344.0,
+            "65": 1595257344.0,
+            "66": 1595257344.0,
+            "67": 1595257344.0,
+            "68": 1595257344.0,
+            "69": 1595257344.0,
+            "70": 1595257344.0,
+            "71": 1595257344.0,
+            "72": 1595257344.0,
+            "73": 1595257344.0,
+            "74": 1595257344.0,
+            "75": 1595257344.0,
+            "76": 1595257344.0,
+            "77": 1595257344.0,
+            "78": 1595257344.0,
+            "79": 1595257344.0,
+            "80": 1595257344.0,
+            "81": 1595257344.0,
+            "82": 1595257344.0,
+            "83": 1595257344.0,
+            "84": 1595257344.0,
+            "85": 1595257344.0,
+            "86": 1595257344.0,
+            "87": 1595257344.0,
+            "88": 1595257344.0,
+            "89": 1595257344.0,
+            "90": 1595257344.0,
+            "91": 1595257344.0,
+            "92": 1595257344.0,
+            "93": 1595257344.0,
+            "94": 1595257344.0,
+            "95": 1595257344.0,
+            "96": 1595257344.0,
+            "97": 1595257344.0,
+            "98": 1595257344.0,
+            "99": 1595257344.0,
+            "100": 1595257344.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.99273,
+            "52": 0.20702,
+            "53": 0.18803,
+            "54": 0.18787,
+            "55": 0.1866,
+            "56": 0.18751,
+            "57": 0.18635,
+            "58": 0.18849,
+            "59": 0.18718,
+            "60": 0.18823,
+            "61": 0.18622,
+            "62": 0.19151,
+            "63": 0.19068,
+            "64": 0.18896,
+            "65": 0.18832,
+            "66": 0.18702,
+            "67": 0.18769,
+            "68": 0.18735,
+            "69": 0.18995,
+            "70": 0.19784,
+            "71": 0.1874,
+            "72": 0.18733,
+            "73": 0.18637,
+            "74": 0.18906,
+            "75": 0.19094,
+            "76": 0.19187,
+            "77": 0.19634,
+            "78": 0.1905,
+            "79": 0.19691,
+            "80": 0.18976,
+            "81": 0.18665,
+            "82": 0.18674,
+            "83": 0.18876,
+            "84": 0.21124,
+            "85": 0.1987,
+            "86": 0.19646,
+            "87": 0.18856,
+            "88": 0.18762,
+            "89": 0.18822,
+            "90": 0.18715,
+            "91": 0.18811,
+            "92": 0.1855,
+            "93": 0.18748,
+            "94": 0.1861,
+            "95": 0.1881,
+            "96": 0.18638,
+            "97": 0.18739,
+            "98": 0.18684,
+            "99": 0.18679,
+            "100": 0.18562
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json
index 9f83249318a..4c3d06e5e64 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json
@@ -218,22 +218,22 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1465367040.0,
-            "2": 1465367552.0,
-            "3": 1465367552.0,
-            "4": 1465367552.0,
-            "5": 1465367552.0,
-            "6": 1465367552.0,
-            "7": 1465367552.0,
-            "8": 1465367552.0,
-            "9": 1465367552.0,
-            "10": 1465367552.0,
-            "11": 1465367552.0,
-            "12": 1465367552.0,
-            "13": 1465367552.0,
-            "14": 1465367552.0,
-            "15": 1465367552.0,
-            "16": 1465367552.0,
+            "1": 1465368064.0,
+            "2": 1465368576.0,
+            "3": 1465368576.0,
+            "4": 1465368576.0,
+            "5": 1465368576.0,
+            "6": 1465368576.0,
+            "7": 1465368576.0,
+            "8": 1465368576.0,
+            "9": 1465368576.0,
+            "10": 1465368576.0,
+            "11": 1465368576.0,
+            "12": 1465368576.0,
+            "13": 1465368576.0,
+            "14": 1465368576.0,
+            "15": 1465368576.0,
+            "16": 1465368576.0,
             "17": 1597485568.0,
             "18": 1597485568.0,
             "19": 1597485568.0,
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 3.81628,
-            "2": 0.32142,
-            "3": 0.27555,
-            "4": 0.28299,
-            "5": 0.28901,
-            "6": 0.28043,
-            "7": 0.29138,
-            "8": 0.30944,
-            "9": 0.28461,
-            "10": 0.28789,
-            "11": 0.28709,
-            "12": 0.29186,
-            "13": 0.29114,
-            "14": 0.29464,
-            "15": 0.31626,
-            "16": 0.48847,
-            "17": 0.28436,
-            "18": 0.30264,
-            "19": 0.29287,
-            "20": 0.30599,
-            "21": 0.29335,
-            "22": 0.27957,
-            "23": 0.29491,
-            "24": 0.29371,
-            "25": 0.29398,
-            "26": 0.29344,
-            "27": 0.29457,
-            "28": 0.29449,
-            "29": 0.29412,
-            "30": 0.29337,
-            "31": 0.29404,
-            "32": 0.29391,
-            "33": 0.29483,
-            "34": 0.29389,
-            "35": 0.29433,
-            "36": 0.29449,
-            "37": 0.29463,
-            "38": 0.29428,
-            "39": 0.29385,
-            "40": 0.29379,
-            "41": 0.29345,
-            "42": 0.29404,
-            "43": 0.29413,
-            "44": 0.29357,
-            "45": 0.29308,
-            "46": 0.29302,
-            "47": 0.29311,
-            "48": 0.29341,
-            "49": 0.2946,
-            "50": 0.29365,
-            "51": 0.29978,
-            "52": 0.31599,
-            "53": 0.29361,
-            "54": 0.29341,
-            "55": 0.29321,
-            "56": 0.29262,
-            "57": 0.29474,
-            "58": 0.29427,
-            "59": 0.29281,
-            "60": 0.29314,
-            "61": 0.29219,
-            "62": 0.29346,
-            "63": 0.29348,
-            "64": 0.30211,
-            "65": 0.29324,
-            "66": 0.29357,
-            "67": 0.29314,
-            "68": 0.29229,
-            "69": 0.30197,
-            "70": 0.29329,
-            "71": 0.30206,
-            "72": 0.29435,
-            "73": 0.29495,
-            "74": 0.2943,
-            "75": 0.29926,
-            "76": 0.29332,
-            "77": 0.29464,
-            "78": 0.29342,
-            "79": 0.29434,
-            "80": 0.29439,
-            "81": 0.29391,
-            "82": 0.29436,
-            "83": 0.29426,
-            "84": 0.29408,
-            "85": 0.29452,
-            "86": 0.29406,
-            "87": 0.29421,
-            "88": 0.29373,
-            "89": 0.29437,
-            "90": 0.29425,
-            "91": 0.29383,
-            "92": 0.2933,
-            "93": 0.29369,
-            "94": 0.2937,
-            "95": 0.29465,
-            "96": 0.29439,
-            "97": 0.29435,
-            "98": 0.2952,
-            "99": 0.29361,
-            "100": 0.2936
+            "1": 3.90326,
+            "2": 0.32521,
+            "3": 0.29877,
+            "4": 0.2879,
+            "5": 0.29191,
+            "6": 0.28844,
+            "7": 0.28727,
+            "8": 0.2851,
+            "9": 0.28617,
+            "10": 0.2869,
+            "11": 0.28532,
+            "12": 0.28535,
+            "13": 0.28382,
+            "14": 0.28373,
+            "15": 0.28543,
+            "16": 0.55478,
+            "17": 0.28409,
+            "18": 0.29766,
+            "19": 0.29807,
+            "20": 0.33631,
+            "21": 0.29858,
+            "22": 0.284,
+            "23": 0.29625,
+            "24": 0.29625,
+            "25": 0.29634,
+            "26": 0.29795,
+            "27": 0.29713,
+            "28": 0.29855,
+            "29": 0.2978,
+            "30": 0.29653,
+            "31": 0.29786,
+            "32": 0.29724,
+            "33": 0.2971,
+            "34": 0.29753,
+            "35": 0.29699,
+            "36": 0.29798,
+            "37": 0.2974,
+            "38": 0.29676,
+            "39": 0.29657,
+            "40": 0.29597,
+            "41": 0.29525,
+            "42": 0.29613,
+            "43": 0.29598,
+            "44": 0.29592,
+            "45": 0.29776,
+            "46": 0.29645,
+            "47": 0.29585,
+            "48": 0.29622,
+            "49": 0.29485,
+            "50": 0.29579,
+            "51": 0.29265,
+            "52": 0.29418,
+            "53": 0.29501,
+            "54": 0.29502,
+            "55": 0.29522,
+            "56": 0.296,
+            "57": 0.29522,
+            "58": 0.2961,
+            "59": 0.29635,
+            "60": 0.29506,
+            "61": 0.29537,
+            "62": 0.29452,
+            "63": 0.29575,
+            "64": 0.29613,
+            "65": 0.2942,
+            "66": 0.29535,
+            "67": 0.6477,
+            "68": 0.29093,
+            "69": 0.29393,
+            "70": 0.29211,
+            "71": 0.29083,
+            "72": 0.29058,
+            "73": 0.29094,
+            "74": 0.29524,
+            "75": 0.29494,
+            "76": 0.29537,
+            "77": 0.29623,
+            "78": 0.29481,
+            "79": 0.29569,
+            "80": 0.29566,
+            "81": 0.29531,
+            "82": 0.29454,
+            "83": 0.29679,
+            "84": 0.2951,
+            "85": 0.29501,
+            "86": 0.29539,
+            "87": 0.29473,
+            "88": 0.2946,
+            "89": 0.29497,
+            "90": 0.29597,
+            "91": 0.2919,
+            "92": 0.29158,
+            "93": 0.29164,
+            "94": 0.29099,
+            "95": 0.29095,
+            "96": 0.32413,
+            "97": 0.29708,
+            "98": 0.29254,
+            "99": 0.29206,
+            "100": 0.29407
         }
     },
     "num-zeros": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..66288218291
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.21727,
+            "52": 10.1271,
+            "53": 10.36018,
+            "54": 10.25981,
+            "55": 10.20104,
+            "56": 9.98213,
+            "57": 9.84717,
+            "58": 10.12257,
+            "59": 9.90914,
+            "60": 9.83288,
+            "61": 9.9713,
+            "62": 10.22005,
+            "63": 9.67481,
+            "64": 10.01706,
+            "65": 9.27085,
+            "66": 9.93979,
+            "67": 9.62899,
+            "68": 9.98681,
+            "69": 9.9839,
+            "70": 9.92559,
+            "71": 9.81011,
+            "72": 9.79196,
+            "73": 9.68163,
+            "74": 9.17945,
+            "75": 9.61324,
+            "76": 9.28951,
+            "77": 10.19435,
+            "78": 9.8755,
+            "79": 9.5297,
+            "80": 9.56593,
+            "81": 9.63478,
+            "82": 9.82295,
+            "83": 9.47164,
+            "84": 9.54623,
+            "85": 9.74358,
+            "86": 9.20093,
+            "87": 9.70179,
+            "88": 9.86553,
+            "89": 9.73045,
+            "90": 9.92108,
+            "91": 9.48732,
+            "92": 9.47637,
+            "93": 9.21283,
+            "94": 8.94903,
+            "95": 9.6165,
+            "96": 9.63374,
+            "97": 9.41244,
+            "98": 9.7751,
+            "99": 9.00191,
+            "100": 9.50967
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2195.0,
+            "52": 2330.0,
+            "53": 3549.0,
+            "54": 2650.0,
+            "55": 2247.0,
+            "56": 2422.0,
+            "57": 2195.0,
+            "58": 3241.0,
+            "59": 2626.0,
+            "60": 2775.0,
+            "61": 2747.0,
+            "62": 2926.0,
+            "63": 2898.0,
+            "64": 3090.0,
+            "65": 2245.0,
+            "66": 3827.0,
+            "67": 2655.0,
+            "68": 3117.0,
+            "69": 2656.0,
+            "70": 3659.0,
+            "71": 2819.0,
+            "72": 2710.0,
+            "73": 3355.0,
+            "74": 2210.0,
+            "75": 2927.0,
+            "76": 3577.0,
+            "77": 3727.0,
+            "78": 3855.0,
+            "79": 4237.0,
+            "80": 3462.0,
+            "81": 5157.0,
+            "82": 3426.0,
+            "83": 3234.0,
+            "84": 3878.0,
+            "85": 3734.0,
+            "86": 3184.0,
+            "87": 4090.0,
+            "88": 3594.0,
+            "89": 4234.0,
+            "90": 3744.0,
+            "91": 2967.0,
+            "92": 4509.0,
+            "93": 3649.0,
+            "94": 4486.0,
+            "95": 4215.0,
+            "96": 3851.0,
+            "97": 4098.0,
+            "98": 5029.0,
+            "99": 3975.0,
+            "100": 3445.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 414416384.0,
+            "52": 414416384.0,
+            "53": 414416384.0,
+            "54": 414416384.0,
+            "55": 414416384.0,
+            "56": 414416384.0,
+            "57": 414416384.0,
+            "58": 414416384.0,
+            "59": 414416384.0,
+            "60": 414416384.0,
+            "61": 414416384.0,
+            "62": 414416384.0,
+            "63": 414416384.0,
+            "64": 414416384.0,
+            "65": 414416384.0,
+            "66": 414416384.0,
+            "67": 414416384.0,
+            "68": 414416384.0,
+            "69": 414416384.0,
+            "70": 414416384.0,
+            "71": 414416384.0,
+            "72": 414416384.0,
+            "73": 414416384.0,
+            "74": 414416384.0,
+            "75": 414416384.0,
+            "76": 414416384.0,
+            "77": 414416384.0,
+            "78": 414416384.0,
+            "79": 414416384.0,
+            "80": 414416384.0,
+            "81": 414416384.0,
+            "82": 414416384.0,
+            "83": 414416384.0,
+            "84": 414416384.0,
+            "85": 414416384.0,
+            "86": 414416384.0,
+            "87": 414416384.0,
+            "88": 414416384.0,
+            "89": 414416384.0,
+            "90": 414416384.0,
+            "91": 414416384.0,
+            "92": 414416384.0,
+            "93": 414416384.0,
+            "94": 414416384.0,
+            "95": 414416384.0,
+            "96": 414416384.0,
+            "97": 414416384.0,
+            "98": 414416384.0,
+            "99": 414416384.0,
+            "100": 414416384.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1595256320.0,
+            "52": 1595257344.0,
+            "53": 1595257344.0,
+            "54": 1595257344.0,
+            "55": 1595257344.0,
+            "56": 1595257344.0,
+            "57": 1595257344.0,
+            "58": 1595257344.0,
+            "59": 1595257344.0,
+            "60": 1595257344.0,
+            "61": 1595257344.0,
+            "62": 1595257344.0,
+            "63": 1595257344.0,
+            "64": 1595257344.0,
+            "65": 1595257344.0,
+            "66": 1595257344.0,
+            "67": 1595257344.0,
+            "68": 1595257344.0,
+            "69": 1595257344.0,
+            "70": 1595257344.0,
+            "71": 1595257344.0,
+            "72": 1595257344.0,
+            "73": 1595257344.0,
+            "74": 1595257344.0,
+            "75": 1595257344.0,
+            "76": 1595257344.0,
+            "77": 1595257344.0,
+            "78": 1595257344.0,
+            "79": 1595257344.0,
+            "80": 1595257344.0,
+            "81": 1595257344.0,
+            "82": 1595257344.0,
+            "83": 1595257344.0,
+            "84": 1595257344.0,
+            "85": 1595257344.0,
+            "86": 1595257344.0,
+            "87": 1595257344.0,
+            "88": 1595257344.0,
+            "89": 1595257344.0,
+            "90": 1595257344.0,
+            "91": 1595257344.0,
+            "92": 1595257344.0,
+            "93": 1595257344.0,
+            "94": 1595257344.0,
+            "95": 1595257344.0,
+            "96": 1595257344.0,
+            "97": 1595257344.0,
+            "98": 1595257344.0,
+            "99": 1595257344.0,
+            "100": 1595257344.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4.6255,
+            "52": 0.3078,
+            "53": 0.29258,
+            "54": 0.29374,
+            "55": 0.2933,
+            "56": 0.29417,
+            "57": 0.29313,
+            "58": 0.29372,
+            "59": 0.2927,
+            "60": 0.29145,
+            "61": 0.28923,
+            "62": 0.28993,
+            "63": 0.28959,
+            "64": 0.28843,
+            "65": 0.28881,
+            "66": 0.29031,
+            "67": 0.28903,
+            "68": 0.29293,
+            "69": 0.28962,
+            "70": 0.289,
+            "71": 0.29028,
+            "72": 0.29172,
+            "73": 0.29135,
+            "74": 0.2898,
+            "75": 0.28811,
+            "76": 0.28948,
+            "77": 0.29039,
+            "78": 0.29199,
+            "79": 0.29181,
+            "80": 0.29034,
+            "81": 0.29243,
+            "82": 0.29201,
+            "83": 0.28907,
+            "84": 0.28862,
+            "85": 0.2892,
+            "86": 0.28908,
+            "87": 0.28908,
+            "88": 0.28933,
+            "89": 0.29117,
+            "90": 0.2904,
+            "91": 0.2908,
+            "92": 0.28876,
+            "93": 0.2907,
+            "94": 0.29089,
+            "95": 0.2905,
+            "96": 0.29005,
+            "97": 0.28901,
+            "98": 0.2916,
+            "99": 0.29038,
+            "100": 0.29014
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..b1ec74d48d3
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.84064,
+            "2": 10.85201,
+            "3": 10.84256,
+            "4": 10.84663,
+            "5": 10.85667,
+            "6": 10.8655,
+            "7": 10.85455,
+            "8": 10.84814,
+            "9": 10.85295,
+            "10": 10.82026,
+            "11": 10.86468,
+            "12": 10.85604,
+            "13": 10.87584,
+            "14": 10.86361,
+            "15": 10.86365,
+            "16": 10.86053,
+            "17": 10.84579,
+            "18": 10.8538,
+            "19": 10.85943,
+            "20": 10.84139,
+            "21": 10.86327,
+            "22": 10.83014,
+            "23": 10.85749,
+            "24": 10.83816,
+            "25": 10.82517,
+            "26": 10.8257,
+            "27": 10.83038,
+            "28": 10.82029,
+            "29": 10.81214,
+            "30": 10.74061,
+            "31": 10.68185,
+            "32": 10.76069,
+            "33": 10.7491,
+            "34": 10.67394,
+            "35": 10.65529,
+            "36": 10.63303,
+            "37": 10.66285,
+            "38": 10.60535,
+            "39": 10.6732,
+            "40": 10.50952,
+            "41": 10.53339,
+            "42": 10.54981,
+            "43": 10.35084,
+            "44": 10.3993,
+            "45": 10.31307,
+            "46": 10.27398,
+            "47": 10.45772,
+            "48": 10.27942,
+            "49": 10.05213,
+            "50": 10.28011,
+            "51": 10.23426,
+            "52": 10.13488,
+            "53": 10.35279,
+            "54": 10.26189,
+            "55": 10.20983,
+            "56": 9.99599,
+            "57": 9.87962,
+            "58": 10.13391,
+            "59": 9.92304,
+            "60": 9.85379,
+            "61": 9.97314,
+            "62": 10.211,
+            "63": 9.70514,
+            "64": 10.01457,
+            "65": 9.30759,
+            "66": 9.9366,
+            "67": 9.63221,
+            "68": 9.98219,
+            "69": 9.98048,
+            "70": 9.92986,
+            "71": 9.81575,
+            "72": 9.79602,
+            "73": 9.69104,
+            "74": 9.20049,
+            "75": 9.61228,
+            "76": 9.28906,
+            "77": 10.19068,
+            "78": 9.86601,
+            "79": 9.53855,
+            "80": 9.5578,
+            "81": 9.63332,
+            "82": 9.82853,
+            "83": 9.47188,
+            "84": 9.54101,
+            "85": 9.74266,
+            "86": 9.2142,
+            "87": 9.7016,
+            "88": 9.86604,
+            "89": 9.72339,
+            "90": 9.92767,
+            "91": 9.47045,
+            "92": 9.46809,
+            "93": 9.21217,
+            "94": 8.94887,
+            "95": 9.62787,
+            "96": 9.6406,
+            "97": 9.40839,
+            "98": 9.77147,
+            "99": 9.00853,
+            "100": 9.51225
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 284527616.0,
+            "2": 284527616.0,
+            "3": 284527616.0,
+            "4": 284527616.0,
+            "5": 284527616.0,
+            "6": 284527616.0,
+            "7": 284527616.0,
+            "8": 284527616.0,
+            "9": 284527616.0,
+            "10": 284527616.0,
+            "11": 284527616.0,
+            "12": 284527616.0,
+            "13": 284527616.0,
+            "14": 284527616.0,
+            "15": 284527616.0,
+            "16": 416513536.0,
+            "17": 416513536.0,
+            "18": 416513536.0,
+            "19": 416513536.0,
+            "20": 416513536.0,
+            "21": 416513536.0,
+            "22": 416513536.0,
+            "23": 416513536.0,
+            "24": 416513536.0,
+            "25": 416513536.0,
+            "26": 416513536.0,
+            "27": 416513536.0,
+            "28": 416513536.0,
+            "29": 416513536.0,
+            "30": 416513536.0,
+            "31": 416513536.0,
+            "32": 416513536.0,
+            "33": 416513536.0,
+            "34": 416513536.0,
+            "35": 416513536.0,
+            "36": 416513536.0,
+            "37": 416513536.0,
+            "38": 416513536.0,
+            "39": 416513536.0,
+            "40": 416513536.0,
+            "41": 416513536.0,
+            "42": 416513536.0,
+            "43": 416513536.0,
+            "44": 416513536.0,
+            "45": 416513536.0,
+            "46": 416513536.0,
+            "47": 416513536.0,
+            "48": 416513536.0,
+            "49": 416513536.0,
+            "50": 416513536.0,
+            "51": 416513536.0,
+            "52": 416513536.0,
+            "53": 416513536.0,
+            "54": 416513536.0,
+            "55": 416513536.0,
+            "56": 416513536.0,
+            "57": 416513536.0,
+            "58": 416513536.0,
+            "59": 416513536.0,
+            "60": 416513536.0,
+            "61": 416513536.0,
+            "62": 416513536.0,
+            "63": 416513536.0,
+            "64": 416513536.0,
+            "65": 416513536.0,
+            "66": 416513536.0,
+            "67": 416513536.0,
+            "68": 416513536.0,
+            "69": 416513536.0,
+            "70": 416513536.0,
+            "71": 416513536.0,
+            "72": 416513536.0,
+            "73": 416513536.0,
+            "74": 416513536.0,
+            "75": 416513536.0,
+            "76": 416513536.0,
+            "77": 416513536.0,
+            "78": 416513536.0,
+            "79": 416513536.0,
+            "80": 416513536.0,
+            "81": 416513536.0,
+            "82": 416513536.0,
+            "83": 416513536.0,
+            "84": 416513536.0,
+            "85": 416513536.0,
+            "86": 416513536.0,
+            "87": 416513536.0,
+            "88": 416513536.0,
+            "89": 416513536.0,
+            "90": 416513536.0,
+            "91": 416513536.0,
+            "92": 416513536.0,
+            "93": 416513536.0,
+            "94": 416513536.0,
+            "95": 416513536.0,
+            "96": 416513536.0,
+            "97": 416513536.0,
+            "98": 416513536.0,
+            "99": 416513536.0,
+            "100": 416513536.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1465368064.0,
+            "2": 1465368576.0,
+            "3": 1465368576.0,
+            "4": 1465368576.0,
+            "5": 1465368576.0,
+            "6": 1465368576.0,
+            "7": 1465368576.0,
+            "8": 1465368576.0,
+            "9": 1465368576.0,
+            "10": 1465368576.0,
+            "11": 1465368576.0,
+            "12": 1465368576.0,
+            "13": 1465368576.0,
+            "14": 1465368576.0,
+            "15": 1465368576.0,
+            "16": 1465368576.0,
+            "17": 1597485568.0,
+            "18": 1597485568.0,
+            "19": 1597485568.0,
+            "20": 1597485568.0,
+            "21": 1597485568.0,
+            "22": 1597485568.0,
+            "23": 1597485568.0,
+            "24": 1597485568.0,
+            "25": 1597485568.0,
+            "26": 1597485568.0,
+            "27": 1597485568.0,
+            "28": 1597485568.0,
+            "29": 1597485568.0,
+            "30": 1597485568.0,
+            "31": 1597485568.0,
+            "32": 1597485568.0,
+            "33": 1597485568.0,
+            "34": 1597485568.0,
+            "35": 1597485568.0,
+            "36": 1597485568.0,
+            "37": 1597485568.0,
+            "38": 1597485568.0,
+            "39": 1597485568.0,
+            "40": 1597485568.0,
+            "41": 1597485568.0,
+            "42": 1597485568.0,
+            "43": 1597485568.0,
+            "44": 1597485568.0,
+            "45": 1597485568.0,
+            "46": 1597485568.0,
+            "47": 1597485568.0,
+            "48": 1597485568.0,
+            "49": 1597485568.0,
+            "50": 1597485568.0,
+            "51": 1597485568.0,
+            "52": 1597485568.0,
+            "53": 1597485568.0,
+            "54": 1597485568.0,
+            "55": 1597485568.0,
+            "56": 1597485568.0,
+            "57": 1597485568.0,
+            "58": 1597485568.0,
+            "59": 1597485568.0,
+            "60": 1597485568.0,
+            "61": 1597485568.0,
+            "62": 1597485568.0,
+            "63": 1597485568.0,
+            "64": 1597485568.0,
+            "65": 1597485568.0,
+            "66": 1597485568.0,
+            "67": 1597485568.0,
+            "68": 1597485568.0,
+            "69": 1597485568.0,
+            "70": 1597485568.0,
+            "71": 1597485568.0,
+            "72": 1597485568.0,
+            "73": 1597485568.0,
+            "74": 1597485568.0,
+            "75": 1597485568.0,
+            "76": 1597485568.0,
+            "77": 1597485568.0,
+            "78": 1597485568.0,
+            "79": 1597485568.0,
+            "80": 1597485568.0,
+            "81": 1597485568.0,
+            "82": 1597485568.0,
+            "83": 1597485568.0,
+            "84": 1597485568.0,
+            "85": 1597485568.0,
+            "86": 1597485568.0,
+            "87": 1597485568.0,
+            "88": 1597485568.0,
+            "89": 1597485568.0,
+            "90": 1597485568.0,
+            "91": 1597485568.0,
+            "92": 1597485568.0,
+            "93": 1597485568.0,
+            "94": 1597485568.0,
+            "95": 1597485568.0,
+            "96": 1597485568.0,
+            "97": 1597485568.0,
+            "98": 1597485568.0,
+            "99": 1597485568.0,
+            "100": 1597485568.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 2.06149,
+            "3": 0.29865,
+            "4": 0.28631,
+            "5": 0.28736,
+            "6": 0.28526,
+            "7": 0.29059,
+            "8": 0.28917,
+            "9": 0.28615,
+            "10": 0.29015,
+            "11": 0.28709,
+            "12": 0.28726,
+            "13": 0.28539,
+            "14": 0.28815,
+            "15": 0.28809,
+            "16": 0.36186,
+            "17": 0.29264,
+            "18": 0.30232,
+            "19": 0.30149,
+            "20": 0.3025,
+            "21": 0.30312,
+            "22": 0.28815,
+            "23": 0.30003,
+            "24": 0.30279,
+            "25": 0.30076,
+            "26": 0.31198,
+            "27": 0.30137,
+            "28": 0.30174,
+            "29": 0.30053,
+            "30": 0.2998,
+            "31": 0.30129,
+            "32": 0.30266,
+            "33": 0.30179,
+            "34": 0.30183,
+            "35": 0.30504,
+            "36": 0.30132,
+            "37": 0.30289,
+            "38": 0.30052,
+            "39": 0.29983,
+            "40": 0.29924,
+            "41": 0.30028,
+            "42": 0.29816,
+            "43": 0.30081,
+            "44": 0.3016,
+            "45": 0.30595,
+            "46": 0.30403,
+            "47": 0.30454,
+            "48": 0.30318,
+            "49": 0.30105,
+            "50": 0.30174,
+            "51": 0.3175,
+            "52": 0.29652,
+            "53": 0.29581,
+            "54": 0.29555,
+            "55": 0.29696,
+            "56": 0.29449,
+            "57": 0.64732,
+            "58": 0.30369,
+            "59": 0.30167,
+            "60": 0.30075,
+            "61": 0.29981,
+            "62": 0.30078,
+            "63": 0.31092,
+            "64": 0.30144,
+            "65": 0.29891,
+            "66": 0.2987,
+            "67": 0.29963,
+            "68": 0.30439,
+            "69": 0.29787,
+            "70": 0.3036,
+            "71": 0.30595,
+            "72": 0.29733,
+            "73": 0.29745,
+            "74": 0.30071,
+            "75": 0.29706,
+            "76": 0.2969,
+            "77": 0.29903,
+            "78": 0.29958,
+            "79": 0.29754,
+            "80": 0.30059,
+            "81": 0.29879,
+            "82": 0.30486,
+            "83": 0.29801,
+            "84": 0.29892,
+            "85": 0.2996,
+            "86": 0.29869,
+            "87": 0.30043,
+            "88": 0.29951,
+            "89": 0.29614,
+            "90": 0.29973,
+            "91": 0.30029,
+            "92": 0.29926,
+            "93": 0.29973,
+            "94": 0.29969,
+            "95": 0.30108,
+            "96": 0.29798,
+            "97": 0.29923,
+            "98": 0.29982,
+            "99": 0.29854,
+            "100": 0.3007
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": 2392.0,
+            "17": "nan",
+            "18": 2314.0,
+            "19": 2912.0,
+            "20": 1640.0,
+            "21": 2053.0,
+            "22": "nan",
+            "23": 2462.0,
+            "24": 2226.0,
+            "25": 2201.0,
+            "26": 1963.0,
+            "27": 1926.0,
+            "28": 2401.0,
+            "29": 2492.0,
+            "30": 2393.0,
+            "31": 1704.0,
+            "32": 2541.0,
+            "33": 2096.0,
+            "34": 1737.0,
+            "35": 1810.0,
+            "36": 1982.0,
+            "37": 2511.0,
+            "38": 2185.0,
+            "39": 2899.0,
+            "40": 1888.0,
+            "41": 3169.0,
+            "42": 2343.0,
+            "43": 2501.0,
+            "44": 1938.0,
+            "45": 2346.0,
+            "46": 2091.0,
+            "47": 2853.0,
+            "48": 2402.0,
+            "49": 1810.0,
+            "50": 2718.0,
+            "51": 2080.0,
+            "52": 2200.0,
+            "53": 3412.0,
+            "54": 2641.0,
+            "55": 2229.0,
+            "56": 2244.0,
+            "57": 2057.0,
+            "58": 3223.0,
+            "59": 2431.0,
+            "60": 2650.0,
+            "61": 2712.0,
+            "62": 2995.0,
+            "63": 2816.0,
+            "64": 2860.0,
+            "65": 2015.0,
+            "66": 3176.0,
+            "67": 2529.0,
+            "68": 3108.0,
+            "69": 2873.0,
+            "70": 3540.0,
+            "71": 2904.0,
+            "72": 2693.0,
+            "73": 3253.0,
+            "74": 1981.0,
+            "75": 2780.0,
+            "76": 3465.0,
+            "77": 3649.0,
+            "78": 3593.0,
+            "79": 3981.0,
+            "80": 3458.0,
+            "81": 5181.0,
+            "82": 3334.0,
+            "83": 2956.0,
+            "84": 3527.0,
+            "85": 3711.0,
+            "86": 3209.0,
+            "87": 4133.0,
+            "88": 3443.0,
+            "89": 4295.0,
+            "90": 3801.0,
+            "91": 2958.0,
+            "92": 4311.0,
+            "93": 3544.0,
+            "94": 4264.0,
+            "95": 4042.0,
+            "96": 3849.0,
+            "97": 3974.0,
+            "98": 4971.0,
+            "99": 4071.0,
+            "100": 3363.0
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..ae8f01b3327
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.23426,
+            "52": 10.13488,
+            "53": 10.35279,
+            "54": 10.26189,
+            "55": 10.20983,
+            "56": 9.99599,
+            "57": 9.87962,
+            "58": 10.13391,
+            "59": 9.92304,
+            "60": 9.85379,
+            "61": 9.97314,
+            "62": 10.211,
+            "63": 9.70514,
+            "64": 10.01457,
+            "65": 9.30759,
+            "66": 9.9366,
+            "67": 9.63221,
+            "68": 9.98219,
+            "69": 9.98048,
+            "70": 9.92986,
+            "71": 9.81575,
+            "72": 9.79602,
+            "73": 9.69104,
+            "74": 9.20049,
+            "75": 9.61228,
+            "76": 9.28906,
+            "77": 10.19068,
+            "78": 9.86601,
+            "79": 9.53855,
+            "80": 9.5578,
+            "81": 9.63332,
+            "82": 9.82853,
+            "83": 9.47188,
+            "84": 9.54101,
+            "85": 9.74266,
+            "86": 9.2142,
+            "87": 9.7016,
+            "88": 9.86604,
+            "89": 9.72339,
+            "90": 9.92767,
+            "91": 9.47045,
+            "92": 9.46809,
+            "93": 9.21217,
+            "94": 8.94887,
+            "95": 9.62787,
+            "96": 9.6406,
+            "97": 9.40839,
+            "98": 9.77147,
+            "99": 9.00853,
+            "100": 9.51225
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2080.0,
+            "52": 2200.0,
+            "53": 3412.0,
+            "54": 2641.0,
+            "55": 2229.0,
+            "56": 2244.0,
+            "57": 2057.0,
+            "58": 3223.0,
+            "59": 2431.0,
+            "60": 2650.0,
+            "61": 2712.0,
+            "62": 2995.0,
+            "63": 2816.0,
+            "64": 2860.0,
+            "65": 2015.0,
+            "66": 3176.0,
+            "67": 2529.0,
+            "68": 3108.0,
+            "69": 2873.0,
+            "70": 3540.0,
+            "71": 2904.0,
+            "72": 2693.0,
+            "73": 3253.0,
+            "74": 1981.0,
+            "75": 2780.0,
+            "76": 3465.0,
+            "77": 3649.0,
+            "78": 3593.0,
+            "79": 3981.0,
+            "80": 3458.0,
+            "81": 5181.0,
+            "82": 3334.0,
+            "83": 2956.0,
+            "84": 3527.0,
+            "85": 3711.0,
+            "86": 3209.0,
+            "87": 4133.0,
+            "88": 3443.0,
+            "89": 4295.0,
+            "90": 3801.0,
+            "91": 2958.0,
+            "92": 4311.0,
+            "93": 3544.0,
+            "94": 4264.0,
+            "95": 4042.0,
+            "96": 3849.0,
+            "97": 3974.0,
+            "98": 4971.0,
+            "99": 4071.0,
+            "100": 3363.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 415464960.0,
+            "52": 415464960.0,
+            "53": 415464960.0,
+            "54": 415464960.0,
+            "55": 415464960.0,
+            "56": 415464960.0,
+            "57": 415464960.0,
+            "58": 415464960.0,
+            "59": 415464960.0,
+            "60": 415464960.0,
+            "61": 415464960.0,
+            "62": 415464960.0,
+            "63": 415464960.0,
+            "64": 415464960.0,
+            "65": 415464960.0,
+            "66": 415464960.0,
+            "67": 415464960.0,
+            "68": 415464960.0,
+            "69": 415464960.0,
+            "70": 415464960.0,
+            "71": 415464960.0,
+            "72": 415464960.0,
+            "73": 415464960.0,
+            "74": 415464960.0,
+            "75": 415464960.0,
+            "76": 415464960.0,
+            "77": 415464960.0,
+            "78": 415464960.0,
+            "79": 415464960.0,
+            "80": 415464960.0,
+            "81": 415464960.0,
+            "82": 415464960.0,
+            "83": 415464960.0,
+            "84": 415464960.0,
+            "85": 415464960.0,
+            "86": 415464960.0,
+            "87": 415464960.0,
+            "88": 415464960.0,
+            "89": 415464960.0,
+            "90": 415464960.0,
+            "91": 415464960.0,
+            "92": 415464960.0,
+            "93": 415464960.0,
+            "94": 415464960.0,
+            "95": 415464960.0,
+            "96": 415464960.0,
+            "97": 415464960.0,
+            "98": 415464960.0,
+            "99": 415464960.0,
+            "100": 415464960.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1596304896.0,
+            "52": 1596305920.0,
+            "53": 1596305920.0,
+            "54": 1596305920.0,
+            "55": 1596305920.0,
+            "56": 1596305920.0,
+            "57": 1596305920.0,
+            "58": 1596305920.0,
+            "59": 1596305920.0,
+            "60": 1596305920.0,
+            "61": 1596305920.0,
+            "62": 1596305920.0,
+            "63": 1596305920.0,
+            "64": 1596305920.0,
+            "65": 1596305920.0,
+            "66": 1596305920.0,
+            "67": 1596305920.0,
+            "68": 1596305920.0,
+            "69": 1596305920.0,
+            "70": 1596305920.0,
+            "71": 1596305920.0,
+            "72": 1596305920.0,
+            "73": 1596305920.0,
+            "74": 1596305920.0,
+            "75": 1596305920.0,
+            "76": 1596305920.0,
+            "77": 1596305920.0,
+            "78": 1596305920.0,
+            "79": 1596305920.0,
+            "80": 1596305920.0,
+            "81": 1596305920.0,
+            "82": 1596305920.0,
+            "83": 1596305920.0,
+            "84": 1596305920.0,
+            "85": 1596305920.0,
+            "86": 1596305920.0,
+            "87": 1596305920.0,
+            "88": 1596305920.0,
+            "89": 1596305920.0,
+            "90": 1596305920.0,
+            "91": 1596305920.0,
+            "92": 1596305920.0,
+            "93": 1596305920.0,
+            "94": 1596305920.0,
+            "95": 1596305920.0,
+            "96": 1596305920.0,
+            "97": 1596305920.0,
+            "98": 1596305920.0,
+            "99": 1596305920.0,
+            "100": 1596305920.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 1.8615,
+            "53": 0.31355,
+            "54": 0.3044,
+            "55": 0.30701,
+            "56": 0.30736,
+            "57": 0.31773,
+            "58": 0.30739,
+            "59": 0.30021,
+            "60": 0.30271,
+            "61": 0.31062,
+            "62": 0.30743,
+            "63": 0.30357,
+            "64": 0.30729,
+            "65": 0.3028,
+            "66": 0.30068,
+            "67": 0.30021,
+            "68": 0.30051,
+            "69": 0.30257,
+            "70": 0.30291,
+            "71": 0.30183,
+            "72": 0.30259,
+            "73": 0.30032,
+            "74": 0.3007,
+            "75": 0.30192,
+            "76": 0.30188,
+            "77": 0.30125,
+            "78": 0.30028,
+            "79": 0.3024,
+            "80": 0.3115,
+            "81": 0.3014,
+            "82": 0.3023,
+            "83": 0.30861,
+            "84": 0.30129,
+            "85": 0.30185,
+            "86": 0.29936,
+            "87": 0.30094,
+            "88": 0.3001,
+            "89": 0.2993,
+            "90": 0.2987,
+            "91": 0.30006,
+            "92": 0.30091,
+            "93": 0.30097,
+            "94": 0.29909,
+            "95": 0.30113,
+            "96": 0.29925,
+            "97": 0.29979,
+            "98": 0.30241,
+            "99": 0.30073,
+            "100": 0.30251
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
index cb0ad3fdb4b..6a29bef3baa 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
@@ -78,22 +78,22 @@
             "72": 9.798,
             "73": 9.68454,
             "74": 9.19951,
-            "75": 9.60518,
-            "76": 9.27791,
-            "77": 10.19437,
+            "75": 9.60519,
+            "76": 9.2779,
+            "77": 10.19436,
             "78": 9.8671,
             "79": 9.53341,
             "80": 9.56341,
             "81": 9.63047,
             "82": 9.82819,
             "83": 9.46388,
-            "84": 9.53736,
-            "85": 9.74561,
+            "84": 9.53735,
+            "85": 9.74562,
             "86": 9.21332,
-            "87": 9.7014,
+            "87": 9.70141,
             "88": 9.86621,
             "89": 9.72242,
-            "90": 9.92089,
+            "90": 9.9209,
             "91": 9.47178,
             "92": 9.46996,
             "93": 9.20589,
@@ -234,90 +234,90 @@
             "14": 1465368576.0,
             "15": 1465368576.0,
             "16": 1465368576.0,
-            "17": 1597092352.0,
-            "18": 1597092352.0,
-            "19": 1597092352.0,
-            "20": 1597092352.0,
-            "21": 1597092352.0,
-            "22": 1597092352.0,
-            "23": 1597092352.0,
-            "24": 1597092352.0,
-            "25": 1597092352.0,
-            "26": 1597092352.0,
-            "27": 1597092352.0,
-            "28": 1597092352.0,
-            "29": 1597092352.0,
-            "30": 1597092352.0,
-            "31": 1597092352.0,
-            "32": 1597092352.0,
-            "33": 1597092352.0,
-            "34": 1597092352.0,
-            "35": 1597092352.0,
-            "36": 1597092352.0,
-            "37": 1597092352.0,
-            "38": 1597092352.0,
-            "39": 1597092352.0,
-            "40": 1597092352.0,
-            "41": 1597092352.0,
-            "42": 1597092352.0,
-            "43": 1597092352.0,
-            "44": 1597092352.0,
-            "45": 1597092352.0,
-            "46": 1597092352.0,
-            "47": 1597092352.0,
-            "48": 1597092352.0,
-            "49": 1597092352.0,
-            "50": 1597092352.0,
-            "51": 1597092352.0,
-            "52": 1597092352.0,
-            "53": 1597092352.0,
-            "54": 1597092352.0,
-            "55": 1597092352.0,
-            "56": 1597092352.0,
-            "57": 1597092352.0,
-            "58": 1597092352.0,
-            "59": 1597092352.0,
-            "60": 1597092352.0,
-            "61": 1597092352.0,
-            "62": 1597092352.0,
-            "63": 1597092352.0,
-            "64": 1597092352.0,
-            "65": 1597092352.0,
-            "66": 1597092352.0,
-            "67": 1597092352.0,
-            "68": 1597092352.0,
-            "69": 1597092352.0,
-            "70": 1597092352.0,
-            "71": 1597092352.0,
-            "72": 1597092352.0,
-            "73": 1597092352.0,
-            "74": 1597092352.0,
-            "75": 1597092352.0,
-            "76": 1597092352.0,
-            "77": 1597092352.0,
-            "78": 1597092352.0,
-            "79": 1597092352.0,
-            "80": 1597092352.0,
-            "81": 1597092352.0,
-            "82": 1597092352.0,
-            "83": 1597092352.0,
-            "84": 1597092352.0,
-            "85": 1597092352.0,
-            "86": 1597092352.0,
-            "87": 1597092352.0,
-            "88": 1597092352.0,
-            "89": 1597092352.0,
-            "90": 1597092352.0,
-            "91": 1597092352.0,
-            "92": 1597092352.0,
-            "93": 1597092352.0,
-            "94": 1597092352.0,
-            "95": 1597092352.0,
-            "96": 1597092352.0,
-            "97": 1597092352.0,
-            "98": 1597092352.0,
-            "99": 1597092352.0,
-            "100": 1597092352.0
+            "17": 1597485568.0,
+            "18": 1597485568.0,
+            "19": 1597485568.0,
+            "20": 1597485568.0,
+            "21": 1597485568.0,
+            "22": 1597485568.0,
+            "23": 1597485568.0,
+            "24": 1597485568.0,
+            "25": 1597485568.0,
+            "26": 1597485568.0,
+            "27": 1597485568.0,
+            "28": 1597485568.0,
+            "29": 1597485568.0,
+            "30": 1597485568.0,
+            "31": 1597485568.0,
+            "32": 1597485568.0,
+            "33": 1597485568.0,
+            "34": 1597485568.0,
+            "35": 1597485568.0,
+            "36": 1597485568.0,
+            "37": 1597485568.0,
+            "38": 1597485568.0,
+            "39": 1597485568.0,
+            "40": 1597485568.0,
+            "41": 1597485568.0,
+            "42": 1597485568.0,
+            "43": 1597485568.0,
+            "44": 1597485568.0,
+            "45": 1597485568.0,
+            "46": 1597485568.0,
+            "47": 1597485568.0,
+            "48": 1597485568.0,
+            "49": 1597485568.0,
+            "50": 1597485568.0,
+            "51": 1597485568.0,
+            "52": 1597485568.0,
+            "53": 1597485568.0,
+            "54": 1597485568.0,
+            "55": 1597485568.0,
+            "56": 1597485568.0,
+            "57": 1597485568.0,
+            "58": 1597485568.0,
+            "59": 1597485568.0,
+            "60": 1597485568.0,
+            "61": 1597485568.0,
+            "62": 1597485568.0,
+            "63": 1597485568.0,
+            "64": 1597485568.0,
+            "65": 1597485568.0,
+            "66": 1597485568.0,
+            "67": 1597485568.0,
+            "68": 1597485568.0,
+            "69": 1597485568.0,
+            "70": 1597485568.0,
+            "71": 1597485568.0,
+            "72": 1597485568.0,
+            "73": 1597485568.0,
+            "74": 1597485568.0,
+            "75": 1597485568.0,
+            "76": 1597485568.0,
+            "77": 1597485568.0,
+            "78": 1597485568.0,
+            "79": 1597485568.0,
+            "80": 1597485568.0,
+            "81": 1597485568.0,
+            "82": 1597485568.0,
+            "83": 1597485568.0,
+            "84": 1597485568.0,
+            "85": 1597485568.0,
+            "86": 1597485568.0,
+            "87": 1597485568.0,
+            "88": 1597485568.0,
+            "89": 1597485568.0,
+            "90": 1597485568.0,
+            "91": 1597485568.0,
+            "92": 1597485568.0,
+            "93": 1597485568.0,
+            "94": 1597485568.0,
+            "95": 1597485568.0,
+            "96": 1597485568.0,
+            "97": 1597485568.0,
+            "98": 1597485568.0,
+            "99": 1597485568.0,
+            "100": 1597485568.0
         }
     },
     "iteration-time": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.78805,
-            "2": 0.23224,
-            "3": 0.20783,
-            "4": 0.21971,
-            "5": 0.22246,
-            "6": 0.23346,
-            "7": 0.21626,
-            "8": 0.20597,
-            "9": 0.2043,
-            "10": 0.20681,
-            "11": 0.20511,
-            "12": 0.20484,
-            "13": 0.21351,
-            "14": 0.20446,
-            "15": 0.21063,
-            "16": 0.28338,
-            "17": 0.21017,
-            "18": 0.21577,
-            "19": 0.21852,
-            "20": 0.23072,
-            "21": 0.25974,
-            "22": 0.21717,
-            "23": 0.22548,
-            "24": 0.21878,
-            "25": 0.21448,
-            "26": 0.21416,
-            "27": 0.22357,
-            "28": 0.21645,
-            "29": 0.21325,
-            "30": 0.21465,
-            "31": 0.21452,
-            "32": 0.21608,
-            "33": 0.23531,
-            "34": 0.227,
-            "35": 0.2188,
-            "36": 0.21248,
-            "37": 0.21694,
-            "38": 0.21269,
-            "39": 0.22285,
-            "40": 0.21458,
-            "41": 0.2134,
-            "42": 0.21991,
-            "43": 0.21621,
-            "44": 0.21422,
-            "45": 0.21339,
-            "46": 0.21332,
-            "47": 0.21892,
-            "48": 0.21384,
-            "49": 0.21668,
-            "50": 0.21806,
-            "51": 0.21958,
-            "52": 0.2173,
-            "53": 0.21642,
-            "54": 0.22157,
-            "55": 0.21549,
-            "56": 0.21528,
-            "57": 0.21789,
-            "58": 0.21634,
-            "59": 0.21649,
-            "60": 0.2141,
-            "61": 0.21447,
-            "62": 0.21596,
-            "63": 0.21545,
-            "64": 0.22145,
-            "65": 0.21603,
-            "66": 0.21504,
-            "67": 0.21551,
-            "68": 0.21918,
-            "69": 0.21831,
-            "70": 0.21943,
-            "71": 0.21537,
-            "72": 0.21937,
-            "73": 0.21783,
-            "74": 0.2246,
-            "75": 0.22031,
-            "76": 0.23249,
-            "77": 0.21862,
-            "78": 0.21663,
-            "79": 0.21806,
-            "80": 0.21694,
-            "81": 0.21684,
-            "82": 0.21559,
-            "83": 0.21877,
-            "84": 0.2151,
-            "85": 0.21819,
-            "86": 0.2167,
-            "87": 0.21768,
-            "88": 0.21415,
-            "89": 0.21694,
-            "90": 0.21444,
-            "91": 0.21616,
-            "92": 0.21967,
-            "93": 0.21672,
-            "94": 0.21699,
-            "95": 0.21892,
-            "96": 0.21871,
-            "97": 0.21805,
-            "98": 0.21674,
-            "99": 0.21639,
-            "100": 0.21581
+            "1": 7.85348,
+            "2": 0.23423,
+            "3": 0.2045,
+            "4": 0.18465,
+            "5": 0.18457,
+            "6": 0.18573,
+            "7": 0.18584,
+            "8": 0.19132,
+            "9": 0.18718,
+            "10": 0.18632,
+            "11": 0.18549,
+            "12": 0.18453,
+            "13": 0.18301,
+            "14": 0.18637,
+            "15": 0.18341,
+            "16": 0.27303,
+            "17": 0.1875,
+            "18": 0.19094,
+            "19": 0.19099,
+            "20": 0.19512,
+            "21": 0.19472,
+            "22": 0.18932,
+            "23": 0.19109,
+            "24": 0.19032,
+            "25": 0.19034,
+            "26": 0.19014,
+            "27": 0.19037,
+            "28": 0.19342,
+            "29": 0.19102,
+            "30": 0.19217,
+            "31": 0.1905,
+            "32": 0.18989,
+            "33": 0.19339,
+            "34": 0.19354,
+            "35": 0.19435,
+            "36": 0.19151,
+            "37": 0.1914,
+            "38": 0.19302,
+            "39": 0.1935,
+            "40": 0.18995,
+            "41": 0.19387,
+            "42": 0.19161,
+            "43": 0.19131,
+            "44": 0.19213,
+            "45": 0.1914,
+            "46": 0.1912,
+            "47": 0.19009,
+            "48": 0.1917,
+            "49": 0.19013,
+            "50": 0.19041,
+            "51": 0.19678,
+            "52": 0.18974,
+            "53": 0.19754,
+            "54": 0.19109,
+            "55": 0.19038,
+            "56": 0.19071,
+            "57": 0.19479,
+            "58": 0.1896,
+            "59": 0.18945,
+            "60": 0.19321,
+            "61": 0.19042,
+            "62": 0.19018,
+            "63": 0.19145,
+            "64": 0.19092,
+            "65": 0.1911,
+            "66": 0.1905,
+            "67": 0.19866,
+            "68": 0.20109,
+            "69": 0.19967,
+            "70": 0.20138,
+            "71": 0.19744,
+            "72": 0.1992,
+            "73": 0.1983,
+            "74": 0.19896,
+            "75": 0.19812,
+            "76": 0.2002,
+            "77": 0.20008,
+            "78": 0.1993,
+            "79": 0.1982,
+            "80": 0.19675,
+            "81": 0.19588,
+            "82": 0.18814,
+            "83": 0.18859,
+            "84": 0.19035,
+            "85": 0.20544,
+            "86": 0.1936,
+            "87": 0.19585,
+            "88": 0.18962,
+            "89": 0.18921,
+            "90": 0.1877,
+            "91": 0.18708,
+            "92": 0.18744,
+            "93": 0.18758,
+            "94": 0.18685,
+            "95": 0.18938,
+            "96": 0.18819,
+            "97": 0.18788,
+            "98": 0.18915,
+            "99": 0.18809,
+            "100": 0.18729
         }
     },
     "num-zeros": {
@@ -506,32 +506,32 @@
             "72": 2640.0,
             "73": 3199.0,
             "74": 2084.0,
-            "75": 2809.0,
-            "76": 3599.0,
-            "77": 3667.0,
-            "78": 3680.0,
-            "79": 3972.0,
-            "80": 3365.0,
-            "81": 5042.0,
-            "82": 3291.0,
-            "83": 3016.0,
-            "84": 3592.0,
-            "85": 3792.0,
-            "86": 3192.0,
-            "87": 4219.0,
-            "88": 3376.0,
-            "89": 4110.0,
-            "90": 3939.0,
-            "91": 2912.0,
-            "92": 4114.0,
-            "93": 3499.0,
-            "94": 4339.0,
-            "95": 3829.0,
-            "96": 3875.0,
-            "97": 4100.0,
-            "98": 4889.0,
-            "99": 3771.0,
-            "100": 3390.0
+            "75": 2823.0,
+            "76": 3490.0,
+            "77": 3710.0,
+            "78": 3619.0,
+            "79": 3911.0,
+            "80": 3431.0,
+            "81": 4963.0,
+            "82": 3460.0,
+            "83": 3062.0,
+            "84": 3593.0,
+            "85": 3752.0,
+            "86": 3255.0,
+            "87": 4096.0,
+            "88": 3272.0,
+            "89": 4074.0,
+            "90": 3810.0,
+            "91": 2877.0,
+            "92": 4080.0,
+            "93": 3469.0,
+            "94": 4428.0,
+            "95": 3850.0,
+            "96": 3832.0,
+            "97": 4102.0,
+            "98": 4833.0,
+            "99": 3795.0,
+            "100": 3405.0
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..3636eb8af32
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.23471,
+            "52": 10.13764,
+            "53": 10.34797,
+            "54": 10.26738,
+            "55": 10.20734,
+            "56": 9.99527,
+            "57": 9.89333,
+            "58": 10.13452,
+            "59": 9.92856,
+            "60": 9.8551,
+            "61": 9.98264,
+            "62": 10.20686,
+            "63": 9.70842,
+            "64": 10.01687,
+            "65": 9.30409,
+            "66": 9.93326,
+            "67": 9.62677,
+            "68": 9.98429,
+            "69": 9.9755,
+            "70": 9.93956,
+            "71": 9.81005,
+            "72": 9.798,
+            "73": 9.68454,
+            "74": 9.19951,
+            "75": 9.60519,
+            "76": 9.2779,
+            "77": 10.19436,
+            "78": 9.8671,
+            "79": 9.53341,
+            "80": 9.56341,
+            "81": 9.63047,
+            "82": 9.82819,
+            "83": 9.46388,
+            "84": 9.53735,
+            "85": 9.74562,
+            "86": 9.21332,
+            "87": 9.70141,
+            "88": 9.86621,
+            "89": 9.72242,
+            "90": 9.9209,
+            "91": 9.47178,
+            "92": 9.46996,
+            "93": 9.20589,
+            "94": 8.94772,
+            "95": 9.60815,
+            "96": 9.63635,
+            "97": 9.4138,
+            "98": 9.77274,
+            "99": 8.9958,
+            "100": 9.50415
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2041.0,
+            "52": 2226.0,
+            "53": 3222.0,
+            "54": 2784.0,
+            "55": 2290.0,
+            "56": 2428.0,
+            "57": 2146.0,
+            "58": 3048.0,
+            "59": 2504.0,
+            "60": 2612.0,
+            "61": 2623.0,
+            "62": 3003.0,
+            "63": 2762.0,
+            "64": 2917.0,
+            "65": 2104.0,
+            "66": 3550.0,
+            "67": 2433.0,
+            "68": 3146.0,
+            "69": 2877.0,
+            "70": 3528.0,
+            "71": 2983.0,
+            "72": 2640.0,
+            "73": 3199.0,
+            "74": 2084.0,
+            "75": 2823.0,
+            "76": 3490.0,
+            "77": 3710.0,
+            "78": 3619.0,
+            "79": 3911.0,
+            "80": 3431.0,
+            "81": 4963.0,
+            "82": 3460.0,
+            "83": 3062.0,
+            "84": 3593.0,
+            "85": 3752.0,
+            "86": 3255.0,
+            "87": 4096.0,
+            "88": 3272.0,
+            "89": 4074.0,
+            "90": 3810.0,
+            "91": 2877.0,
+            "92": 4080.0,
+            "93": 3469.0,
+            "94": 4428.0,
+            "95": 3850.0,
+            "96": 3832.0,
+            "97": 4102.0,
+            "98": 4833.0,
+            "99": 3795.0,
+            "100": 3405.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 415464960.0,
+            "52": 415464960.0,
+            "53": 415464960.0,
+            "54": 415464960.0,
+            "55": 415464960.0,
+            "56": 415464960.0,
+            "57": 415464960.0,
+            "58": 415464960.0,
+            "59": 415464960.0,
+            "60": 415464960.0,
+            "61": 415464960.0,
+            "62": 415464960.0,
+            "63": 415464960.0,
+            "64": 415464960.0,
+            "65": 415464960.0,
+            "66": 415464960.0,
+            "67": 415464960.0,
+            "68": 415464960.0,
+            "69": 415464960.0,
+            "70": 415464960.0,
+            "71": 415464960.0,
+            "72": 415464960.0,
+            "73": 415464960.0,
+            "74": 415464960.0,
+            "75": 415464960.0,
+            "76": 415464960.0,
+            "77": 415464960.0,
+            "78": 415464960.0,
+            "79": 415464960.0,
+            "80": 415464960.0,
+            "81": 415464960.0,
+            "82": 415464960.0,
+            "83": 415464960.0,
+            "84": 415464960.0,
+            "85": 415464960.0,
+            "86": 415464960.0,
+            "87": 415464960.0,
+            "88": 415464960.0,
+            "89": 415464960.0,
+            "90": 415464960.0,
+            "91": 415464960.0,
+            "92": 415464960.0,
+            "93": 415464960.0,
+            "94": 415464960.0,
+            "95": 415464960.0,
+            "96": 415464960.0,
+            "97": 415464960.0,
+            "98": 415464960.0,
+            "99": 415464960.0,
+            "100": 415464960.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1596304896.0,
+            "52": 1596305920.0,
+            "53": 1596305920.0,
+            "54": 1596305920.0,
+            "55": 1596305920.0,
+            "56": 1596305920.0,
+            "57": 1596305920.0,
+            "58": 1596305920.0,
+            "59": 1596305920.0,
+            "60": 1596305920.0,
+            "61": 1596305920.0,
+            "62": 1596305920.0,
+            "63": 1596305920.0,
+            "64": 1596305920.0,
+            "65": 1596305920.0,
+            "66": 1596305920.0,
+            "67": 1596305920.0,
+            "68": 1596305920.0,
+            "69": 1596305920.0,
+            "70": 1596305920.0,
+            "71": 1596305920.0,
+            "72": 1596305920.0,
+            "73": 1596305920.0,
+            "74": 1596305920.0,
+            "75": 1596305920.0,
+            "76": 1596305920.0,
+            "77": 1596305920.0,
+            "78": 1596305920.0,
+            "79": 1596305920.0,
+            "80": 1596305920.0,
+            "81": 1596305920.0,
+            "82": 1596305920.0,
+            "83": 1596305920.0,
+            "84": 1596305920.0,
+            "85": 1596305920.0,
+            "86": 1596305920.0,
+            "87": 1596305920.0,
+            "88": 1596305920.0,
+            "89": 1596305920.0,
+            "90": 1596305920.0,
+            "91": 1596305920.0,
+            "92": 1596305920.0,
+            "93": 1596305920.0,
+            "94": 1596305920.0,
+            "95": 1596305920.0,
+            "96": 1596305920.0,
+            "97": 1596305920.0,
+            "98": 1596305920.0,
+            "99": 1596305920.0,
+            "100": 1596305920.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 5.92942,
+            "52": 0.22042,
+            "53": 0.20141,
+            "54": 0.20104,
+            "55": 0.20086,
+            "56": 0.20205,
+            "57": 0.20313,
+            "58": 0.20575,
+            "59": 0.2059,
+            "60": 0.20487,
+            "61": 0.20376,
+            "62": 0.20344,
+            "63": 0.20602,
+            "64": 0.20171,
+            "65": 0.20118,
+            "66": 0.20255,
+            "67": 0.20176,
+            "68": 0.20547,
+            "69": 0.20291,
+            "70": 0.20293,
+            "71": 0.20018,
+            "72": 0.20194,
+            "73": 0.20093,
+            "74": 0.20334,
+            "75": 0.20211,
+            "76": 0.20117,
+            "77": 0.20772,
+            "78": 0.20129,
+            "79": 0.20479,
+            "80": 0.20282,
+            "81": 0.20264,
+            "82": 0.20056,
+            "83": 0.20106,
+            "84": 0.20106,
+            "85": 0.20234,
+            "86": 0.20068,
+            "87": 0.20279,
+            "88": 0.20195,
+            "89": 0.20174,
+            "90": 0.20096,
+            "91": 0.20103,
+            "92": 0.20077,
+            "93": 0.20116,
+            "94": 0.2013,
+            "95": 0.20159,
+            "96": 0.20087,
+            "97": 0.20359,
+            "98": 0.20084,
+            "99": 0.20147,
+            "100": 0.20053
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json
index 0eef09cf2c1..45a51405f72 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json
@@ -218,9 +218,9 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1465367040.0,
-            "2": 1465367040.0,
-            "3": 1465368064.0,
+            "1": 1465368064.0,
+            "2": 1465368576.0,
+            "3": 1465368576.0,
             "4": 1465368576.0,
             "5": 1465368576.0,
             "6": 1465368576.0,
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 8.02782,
-            "2": 0.31435,
-            "3": 0.27957,
-            "4": 0.27933,
-            "5": 0.27866,
-            "6": 0.27855,
-            "7": 0.2779,
-            "8": 0.27621,
-            "9": 0.27704,
-            "10": 0.27611,
-            "11": 0.27501,
-            "12": 0.27489,
-            "13": 0.27468,
-            "14": 0.27386,
-            "15": 0.27315,
-            "16": 0.41595,
-            "17": 0.27523,
-            "18": 0.28979,
-            "19": 0.28871,
-            "20": 0.2888,
-            "21": 0.28867,
-            "22": 0.27653,
-            "23": 0.29205,
-            "24": 0.29078,
-            "25": 0.29104,
-            "26": 0.29087,
-            "27": 0.28794,
-            "28": 0.28784,
-            "29": 0.28659,
-            "30": 0.28669,
-            "31": 0.28638,
-            "32": 0.2878,
-            "33": 0.28717,
-            "34": 0.28616,
-            "35": 0.28626,
-            "36": 0.28648,
-            "37": 0.28977,
-            "38": 0.28615,
-            "39": 0.2864,
-            "40": 0.28588,
-            "41": 0.28749,
-            "42": 0.28735,
-            "43": 0.28605,
-            "44": 0.28798,
-            "45": 0.2882,
-            "46": 0.28727,
-            "47": 0.28616,
-            "48": 0.28603,
-            "49": 0.2876,
-            "50": 0.29155,
-            "51": 0.30309,
-            "52": 0.29889,
-            "53": 0.29736,
-            "54": 0.29772,
-            "55": 0.29611,
-            "56": 0.29565,
-            "57": 0.29413,
-            "58": 0.29391,
-            "59": 0.29344,
-            "60": 0.29428,
-            "61": 0.29695,
-            "62": 0.29282,
-            "63": 0.29418,
-            "64": 0.29352,
-            "65": 0.29274,
-            "66": 0.29449,
-            "67": 0.29627,
-            "68": 0.29636,
-            "69": 0.29393,
-            "70": 0.28967,
-            "71": 0.28925,
-            "72": 0.28962,
-            "73": 0.28944,
-            "74": 0.28948,
-            "75": 0.28996,
-            "76": 0.28938,
-            "77": 0.28855,
-            "78": 0.28891,
-            "79": 0.28905,
-            "80": 0.28968,
-            "81": 0.28873,
-            "82": 0.28966,
-            "83": 0.2884,
-            "84": 0.28842,
-            "85": 0.29077,
-            "86": 0.28927,
-            "87": 0.28888,
-            "88": 0.28909,
-            "89": 0.28807,
-            "90": 0.28887,
-            "91": 0.28894,
-            "92": 0.28908,
-            "93": 0.28985,
-            "94": 0.289,
-            "95": 0.28861,
-            "96": 0.28831,
-            "97": 0.2877,
-            "98": 0.29019,
-            "99": 0.28839,
-            "100": 0.2881
+            "1": 2.87517,
+            "2": 0.32741,
+            "3": 0.30727,
+            "4": 0.29165,
+            "5": 0.29258,
+            "6": 0.28618,
+            "7": 0.28628,
+            "8": 0.28498,
+            "9": 0.28839,
+            "10": 0.29027,
+            "11": 0.28697,
+            "12": 0.28511,
+            "13": 0.29151,
+            "14": 0.28721,
+            "15": 0.2851,
+            "16": 0.40392,
+            "17": 0.28544,
+            "18": 0.2995,
+            "19": 0.30593,
+            "20": 0.29922,
+            "21": 0.3,
+            "22": 0.2873,
+            "23": 0.29862,
+            "24": 0.3016,
+            "25": 0.3043,
+            "26": 0.30026,
+            "27": 0.30577,
+            "28": 0.29895,
+            "29": 0.30118,
+            "30": 0.30038,
+            "31": 0.29973,
+            "32": 0.30495,
+            "33": 0.29971,
+            "34": 0.3058,
+            "35": 0.30206,
+            "36": 0.29968,
+            "37": 0.30462,
+            "38": 0.29914,
+            "39": 0.30006,
+            "40": 0.30275,
+            "41": 0.29843,
+            "42": 0.30385,
+            "43": 0.30136,
+            "44": 0.30005,
+            "45": 0.30598,
+            "46": 0.30646,
+            "47": 0.30678,
+            "48": 0.30524,
+            "49": 0.30042,
+            "50": 0.30333,
+            "51": 0.3058,
+            "52": 0.2979,
+            "53": 0.29694,
+            "54": 0.29792,
+            "55": 0.29906,
+            "56": 0.2986,
+            "57": 0.299,
+            "58": 0.29801,
+            "59": 0.29877,
+            "60": 0.29785,
+            "61": 0.2976,
+            "62": 0.29759,
+            "63": 0.75788,
+            "64": 0.30011,
+            "65": 0.29654,
+            "66": 0.29892,
+            "67": 0.29761,
+            "68": 0.29802,
+            "69": 0.3014,
+            "70": 0.30046,
+            "71": 0.29911,
+            "72": 0.29858,
+            "73": 0.29679,
+            "74": 0.2965,
+            "75": 0.29902,
+            "76": 0.29862,
+            "77": 0.29715,
+            "78": 0.2986,
+            "79": 0.30843,
+            "80": 0.29932,
+            "81": 0.29873,
+            "82": 0.29681,
+            "83": 0.29885,
+            "84": 0.29829,
+            "85": 0.29898,
+            "86": 0.29994,
+            "87": 0.29961,
+            "88": 0.3003,
+            "89": 0.29957,
+            "90": 0.29999,
+            "91": 0.29959,
+            "92": 0.30006,
+            "93": 0.30057,
+            "94": 0.29999,
+            "95": 0.30006,
+            "96": 0.29915,
+            "97": 0.30017,
+            "98": 0.29952,
+            "99": 0.30127,
+            "100": 0.30043
         }
     },
     "num-zeros": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
new file mode 100644
index 00000000000..89836562450
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.21727,
+            "52": 10.1271,
+            "53": 10.36018,
+            "54": 10.25981,
+            "55": 10.20104,
+            "56": 9.98213,
+            "57": 9.84717,
+            "58": 10.12257,
+            "59": 9.90914,
+            "60": 9.83288,
+            "61": 9.9713,
+            "62": 10.22005,
+            "63": 9.67481,
+            "64": 10.01706,
+            "65": 9.27085,
+            "66": 9.93979,
+            "67": 9.62899,
+            "68": 9.98681,
+            "69": 9.9839,
+            "70": 9.92559,
+            "71": 9.81011,
+            "72": 9.79196,
+            "73": 9.68163,
+            "74": 9.17945,
+            "75": 9.61324,
+            "76": 9.28951,
+            "77": 10.19435,
+            "78": 9.8755,
+            "79": 9.5297,
+            "80": 9.56593,
+            "81": 9.63478,
+            "82": 9.82295,
+            "83": 9.47164,
+            "84": 9.54623,
+            "85": 9.74358,
+            "86": 9.20093,
+            "87": 9.70179,
+            "88": 9.86553,
+            "89": 9.73045,
+            "90": 9.92108,
+            "91": 9.48732,
+            "92": 9.47637,
+            "93": 9.21283,
+            "94": 8.94903,
+            "95": 9.6165,
+            "96": 9.63374,
+            "97": 9.41244,
+            "98": 9.7751,
+            "99": 9.00191,
+            "100": 9.50967
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2195.0,
+            "52": 2330.0,
+            "53": 3549.0,
+            "54": 2650.0,
+            "55": 2247.0,
+            "56": 2422.0,
+            "57": 2195.0,
+            "58": 3241.0,
+            "59": 2626.0,
+            "60": 2775.0,
+            "61": 2747.0,
+            "62": 2926.0,
+            "63": 2898.0,
+            "64": 3090.0,
+            "65": 2245.0,
+            "66": 3827.0,
+            "67": 2655.0,
+            "68": 3117.0,
+            "69": 2656.0,
+            "70": 3659.0,
+            "71": 2819.0,
+            "72": 2710.0,
+            "73": 3355.0,
+            "74": 2210.0,
+            "75": 2927.0,
+            "76": 3577.0,
+            "77": 3727.0,
+            "78": 3855.0,
+            "79": 4237.0,
+            "80": 3462.0,
+            "81": 5157.0,
+            "82": 3426.0,
+            "83": 3234.0,
+            "84": 3878.0,
+            "85": 3734.0,
+            "86": 3184.0,
+            "87": 4090.0,
+            "88": 3594.0,
+            "89": 4234.0,
+            "90": 3744.0,
+            "91": 2967.0,
+            "92": 4509.0,
+            "93": 3649.0,
+            "94": 4486.0,
+            "95": 4215.0,
+            "96": 3851.0,
+            "97": 4098.0,
+            "98": 5029.0,
+            "99": 3975.0,
+            "100": 3445.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 415464960.0,
+            "52": 415464960.0,
+            "53": 415464960.0,
+            "54": 415464960.0,
+            "55": 415464960.0,
+            "56": 415464960.0,
+            "57": 415464960.0,
+            "58": 415464960.0,
+            "59": 415464960.0,
+            "60": 415464960.0,
+            "61": 415464960.0,
+            "62": 415464960.0,
+            "63": 415464960.0,
+            "64": 415464960.0,
+            "65": 415464960.0,
+            "66": 415464960.0,
+            "67": 415464960.0,
+            "68": 415464960.0,
+            "69": 415464960.0,
+            "70": 415464960.0,
+            "71": 415464960.0,
+            "72": 415464960.0,
+            "73": 415464960.0,
+            "74": 415464960.0,
+            "75": 415464960.0,
+            "76": 415464960.0,
+            "77": 415464960.0,
+            "78": 415464960.0,
+            "79": 415464960.0,
+            "80": 415464960.0,
+            "81": 415464960.0,
+            "82": 415464960.0,
+            "83": 415464960.0,
+            "84": 415464960.0,
+            "85": 415464960.0,
+            "86": 415464960.0,
+            "87": 415464960.0,
+            "88": 415464960.0,
+            "89": 415464960.0,
+            "90": 415464960.0,
+            "91": 415464960.0,
+            "92": 415464960.0,
+            "93": 415464960.0,
+            "94": 415464960.0,
+            "95": 415464960.0,
+            "96": 415464960.0,
+            "97": 415464960.0,
+            "98": 415464960.0,
+            "99": 415464960.0,
+            "100": 415464960.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1596304896.0,
+            "52": 1596305920.0,
+            "53": 1596305920.0,
+            "54": 1596305920.0,
+            "55": 1596305920.0,
+            "56": 1596305920.0,
+            "57": 1596305920.0,
+            "58": 1596305920.0,
+            "59": 1596305920.0,
+            "60": 1596305920.0,
+            "61": 1596305920.0,
+            "62": 1596305920.0,
+            "63": 1596305920.0,
+            "64": 1596305920.0,
+            "65": 1596305920.0,
+            "66": 1596305920.0,
+            "67": 1596305920.0,
+            "68": 1596305920.0,
+            "69": 1596305920.0,
+            "70": 1596305920.0,
+            "71": 1596305920.0,
+            "72": 1596305920.0,
+            "73": 1596305920.0,
+            "74": 1596305920.0,
+            "75": 1596305920.0,
+            "76": 1596305920.0,
+            "77": 1596305920.0,
+            "78": 1596305920.0,
+            "79": 1596305920.0,
+            "80": 1596305920.0,
+            "81": 1596305920.0,
+            "82": 1596305920.0,
+            "83": 1596305920.0,
+            "84": 1596305920.0,
+            "85": 1596305920.0,
+            "86": 1596305920.0,
+            "87": 1596305920.0,
+            "88": 1596305920.0,
+            "89": 1596305920.0,
+            "90": 1596305920.0,
+            "91": 1596305920.0,
+            "92": 1596305920.0,
+            "93": 1596305920.0,
+            "94": 1596305920.0,
+            "95": 1596305920.0,
+            "96": 1596305920.0,
+            "97": 1596305920.0,
+            "98": 1596305920.0,
+            "99": 1596305920.0,
+            "100": 1596305920.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3.74437,
+            "52": 0.32779,
+            "53": 0.3059,
+            "54": 0.30649,
+            "55": 0.30382,
+            "56": 0.30295,
+            "57": 0.30294,
+            "58": 0.30245,
+            "59": 0.30304,
+            "60": 0.30304,
+            "61": 0.30367,
+            "62": 0.30374,
+            "63": 0.30252,
+            "64": 0.304,
+            "65": 0.30269,
+            "66": 0.30287,
+            "67": 0.30327,
+            "68": 0.30407,
+            "69": 0.30396,
+            "70": 0.30328,
+            "71": 0.30476,
+            "72": 0.3053,
+            "73": 0.30394,
+            "74": 0.3027,
+            "75": 0.30299,
+            "76": 0.30389,
+            "77": 0.30485,
+            "78": 0.30454,
+            "79": 0.304,
+            "80": 0.30244,
+            "81": 0.30324,
+            "82": 0.30372,
+            "83": 0.30372,
+            "84": 0.30436,
+            "85": 0.30371,
+            "86": 0.30282,
+            "87": 0.30363,
+            "88": 0.30375,
+            "89": 0.30379,
+            "90": 0.30426,
+            "91": 0.30435,
+            "92": 0.30341,
+            "93": 0.30389,
+            "94": 0.30489,
+            "95": 0.30286,
+            "96": 0.30305,
+            "97": 0.30297,
+            "98": 0.30369,
+            "99": 0.30282,
+            "100": 0.30347
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml
index 5ac9b7b4701..28f0a8a7b53 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-distributed-optimizer: true
   --attention-softmax-in-fp32: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..9e26dfeeb6e
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_gb200.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
index 66c9f171be5..8d85af9b7d4 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --fp8-amax-compute-algo: max
   --attention-softmax-in-fp32: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json
index e4742204e12..c7ce9851234 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json
@@ -1,172 +1,296 @@
 {
  "0": {
   "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
-  "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end",
+  "generated_text": " And that this is the place where you can be yourself, and be yourself, and be yourself, and be yourself, and be yourself, and be",
   "generated_tokens": [
    3060,
-   2430,
-   1636,
-   2012,
-   1317,
-   1278,
-   2362,
-   1307,
-   1278,
-   16070,
-   1044,
-   1321,
-   1636,
-   23067,
    1455,
    1593,
    1395,
-   1605,
-   3140,
-   5152,
-   1513,
-   1747,
-   1046,
-   2409,
-   1395,
-   3140,
-   5152,
-   1513,
    1278,
-   2362
+   3535,
+   2478,
+   1636,
+   1710,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402
   ],
-  "latency": 0.328082799911499,
+  "latency": 0.33650875091552734,
   "cuda_graph_request_count_map": {
-   "56": 29
+   "32": 29
   },
-  "step_count": 240,
+  "step_count": 30,
+  "top_n_logprobs": null,
+  "prompt_top_n_logprobs": null,
+  "prompt_logprobs": [
+   -9.36181926727295,
+   -2.823990821838379,
+   -4.610703945159912,
+   -1.5629558563232422,
+   -0.7174959182739258,
+   -1.6296026706695557,
+   -2.4623641967773438,
+   -2.1694350242614746,
+   -2.299478530883789,
+   -6.261927127838135,
+   -1.4214489459991455,
+   -3.5006911754608154,
+   -4.495674133300781,
+   -3.756869077682495,
+   -2.11869478225708,
+   -2.0652074813842773,
+   -3.5548300743103027,
+   -6.7972517013549805,
+   -0.32509124279022217,
+   -0.802075982093811,
+   -6.374052047729492,
+   -7.404623031616211,
+   -12.706570625305176,
+   -2.7203140258789062,
+   -3.784698486328125,
+   -0.5262728929519653,
+   -4.230419158935547,
+   -0.05865294858813286,
+   -0.08889779448509216,
+   -3.3020200729370117,
+   -10.076433181762695,
+   -1.1052889823913574,
+   -6.047104835510254,
+   -5.263249397277832,
+   -3.813868522644043,
+   -2.732881784439087,
+   -3.4210383892059326,
+   -6.047735214233398,
+   -1.9643490314483643,
+   -5.7317914962768555,
+   -12.158651351928711,
+   -12.511089324951172,
+   -0.07087351381778717,
+   -2.580897331237793,
+   -1.4599298238754272,
+   -3.0208420753479004,
+   -1.2277309894561768,
+   -0.006594917271286249,
+   -3.4534847736358643,
+   -13.229089736938477,
+   -4.096384048461914,
+   -2.5087532997131348,
+   -5.955618858337402,
+   -0.7179529070854187,
+   -0.05547872930765152,
+   -1.5987433195114136,
+   -1.0590581893920898,
+   -5.704042434692383,
+   -0.3894253373146057,
+   -5.037204742431641,
+   -0.5877441167831421,
+   -0.5446641445159912,
+   -2.4070374965667725,
+   -13.461160659790039,
+   -0.10237079858779907,
+   -3.5546276569366455,
+   -1.3483082056045532,
+   -6.042889595031738,
+   -0.5367065668106079,
+   -3.593390703201294,
+   -0.9327085018157959,
+   -1.5130213499069214,
+   -5.070390701293945,
+   -17.36066436767578,
+   -6.901477813720703,
+   -1.0385162830352783,
+   -3.9858975410461426,
+   -1.1583341360092163,
+   -2.342862606048584,
+   -1.7755080461502075,
+   -0.27050071954727173,
+   -9.41438102722168,
+   -0.3227814733982086,
+   -7.4246134757995605,
+   -2.2850522994995117,
+   -4.027304649353027,
+   -3.479668378829956
+  ],
+  "generated_logprobs": [
+   -1.97231125831604,
+   -2.363867998123169,
+   -2.219954490661621,
+   -0.29585954546928406,
+   -1.4493519067764282,
+   -2.232797622680664,
+   -1.1424486637115479,
+   -1.5864160060882568,
+   -1.4188923835754395,
+   -2.0473084449768066,
+   -1.470442771911621,
+   -0.8504352569580078,
+   -1.147210955619812,
+   -2.0061838626861572,
+   -2.4544901847839355,
+   -1.7092150449752808,
+   -0.23308466374874115,
+   -0.38648492097854614,
+   -0.055945850908756256,
+   -0.4632662534713745,
+   -0.09933969378471375,
+   -0.35298952460289,
+   -0.032222963869571686,
+   -0.428203284740448,
+   -0.04741770401597023,
+   -0.13727128505706787,
+   -0.008898601867258549,
+   -0.28543511033058167,
+   -0.022008933126926422,
+   -0.054881855845451355
+  ],
   "logprobs": [
-   -9.362494468688965,
-   -2.827894449234009,
-   -4.557381629943848,
-   -1.4968647956848145,
-   -0.717312216758728,
-   -1.7262351512908936,
-   -2.522736072540283,
-   -2.1782360076904297,
-   -2.3603432178497314,
-   -6.136383533477783,
-   -1.4676916599273682,
-   -3.468963384628296,
-   -4.424870491027832,
-   -3.7345848083496094,
-   -2.012619972229004,
-   -1.8833301067352295,
-   -3.5708768367767334,
-   -6.8197832107543945,
-   -0.3122292757034302,
-   -0.9820290207862854,
-   -6.532033443450928,
-   -7.498172760009766,
-   -12.615165710449219,
-   -2.409003496170044,
-   -3.8550546169281006,
-   -0.5105050802230835,
-   -4.2802581787109375,
-   -0.06971167027950287,
-   -0.054025799036026,
-   -3.319596767425537,
-   -9.703240394592285,
-   -1.0997297763824463,
-   -6.224854469299316,
-   -5.234503269195557,
-   -3.934987783432007,
-   -2.5263679027557373,
-   -3.1843955516815186,
-   -5.880871295928955,
-   -1.8436813354492188,
-   -5.906496047973633,
-   -12.15787410736084,
-   -12.5841064453125,
-   -0.0819428563117981,
-   -2.6212656497955322,
-   -1.4329369068145752,
-   -2.885145425796509,
-   -1.2901865243911743,
-   -0.006647023372352123,
-   -3.5115818977355957,
-   -12.945953369140625,
-   -3.793078899383545,
-   -3.0094375610351562,
-   -5.966838836669922,
-   -0.8998424410820007,
-   -0.040962252765893936,
-   -1.5467679500579834,
-   -1.0785343647003174,
-   -5.73494815826416,
-   -0.38491737842559814,
-   -5.017007827758789,
-   -0.5568072199821472,
-   -0.5968841910362244,
-   -2.3609962463378906,
-   -13.582086563110352,
-   -0.09050048142671585,
-   -3.7264108657836914,
-   -1.1208789348602295,
-   -6.052675247192383,
-   -0.5848909616470337,
-   -3.5906238555908203,
-   -0.9494907855987549,
-   -1.5676641464233398,
-   -5.127577781677246,
-   -17.19189453125,
-   -6.698403835296631,
-   -1.0449178218841553,
-   -4.365664958953857,
-   -1.1243419647216797,
-   -2.2092156410217285,
-   -1.8081634044647217,
-   -0.23330983519554138,
-   -9.439546585083008,
-   -0.2947109341621399,
-   -7.253565788269043,
-   -2.3855936527252197,
-   -4.629369258880615,
-   -3.4186267852783203,
-   -1.9727531671524048,
-   -2.354729652404785,
-   -1.474542498588562,
-   -2.48478364944458,
-   -1.7641210556030273,
-   -1.1853944063186646,
-   -2.8624324798583984,
-   -0.5740103125572205,
-   -0.4542185962200165,
-   -1.4300930500030518,
-   -0.8807456493377686,
-   -0.4597663879394531,
-   -0.9252307415008545,
-   -1.648141860961914,
-   -0.44453874230384827,
-   -1.818476915359497,
-   -0.5714479088783264,
-   -1.2115143537521362,
-   -1.0910619497299194,
-   -0.0023161747958511114,
-   -1.3206473588943481,
-   -0.008621376007795334,
-   -0.7551823854446411,
-   -0.9404395818710327,
-   -0.07279698550701141,
-   -0.9365248680114746,
-   -0.03344438225030899,
-   -1.9720849990844727,
-   -1.3928067684173584,
-   -0.7453650832176208
+   -9.36181926727295,
+   -2.823990821838379,
+   -4.610703945159912,
+   -1.5629558563232422,
+   -0.7174959182739258,
+   -1.6296026706695557,
+   -2.4623641967773438,
+   -2.1694350242614746,
+   -2.299478530883789,
+   -6.261927127838135,
+   -1.4214489459991455,
+   -3.5006911754608154,
+   -4.495674133300781,
+   -3.756869077682495,
+   -2.11869478225708,
+   -2.0652074813842773,
+   -3.5548300743103027,
+   -6.7972517013549805,
+   -0.32509124279022217,
+   -0.802075982093811,
+   -6.374052047729492,
+   -7.404623031616211,
+   -12.706570625305176,
+   -2.7203140258789062,
+   -3.784698486328125,
+   -0.5262728929519653,
+   -4.230419158935547,
+   -0.05865294858813286,
+   -0.08889779448509216,
+   -3.3020200729370117,
+   -10.076433181762695,
+   -1.1052889823913574,
+   -6.047104835510254,
+   -5.263249397277832,
+   -3.813868522644043,
+   -2.732881784439087,
+   -3.4210383892059326,
+   -6.047735214233398,
+   -1.9643490314483643,
+   -5.7317914962768555,
+   -12.158651351928711,
+   -12.511089324951172,
+   -0.07087351381778717,
+   -2.580897331237793,
+   -1.4599298238754272,
+   -3.0208420753479004,
+   -1.2277309894561768,
+   -0.006594917271286249,
+   -3.4534847736358643,
+   -13.229089736938477,
+   -4.096384048461914,
+   -2.5087532997131348,
+   -5.955618858337402,
+   -0.7179529070854187,
+   -0.05547872930765152,
+   -1.5987433195114136,
+   -1.0590581893920898,
+   -5.704042434692383,
+   -0.3894253373146057,
+   -5.037204742431641,
+   -0.5877441167831421,
+   -0.5446641445159912,
+   -2.4070374965667725,
+   -13.461160659790039,
+   -0.10237079858779907,
+   -3.5546276569366455,
+   -1.3483082056045532,
+   -6.042889595031738,
+   -0.5367065668106079,
+   -3.593390703201294,
+   -0.9327085018157959,
+   -1.5130213499069214,
+   -5.070390701293945,
+   -17.36066436767578,
+   -6.901477813720703,
+   -1.0385162830352783,
+   -3.9858975410461426,
+   -1.1583341360092163,
+   -2.342862606048584,
+   -1.7755080461502075,
+   -0.27050071954727173,
+   -9.41438102722168,
+   -0.3227814733982086,
+   -7.4246134757995605,
+   -2.2850522994995117,
+   -4.027304649353027,
+   -3.479668378829956,
+   -1.97231125831604,
+   -2.363867998123169,
+   -2.219954490661621,
+   -0.29585954546928406,
+   -1.4493519067764282,
+   -2.232797622680664,
+   -1.1424486637115479,
+   -1.5864160060882568,
+   -1.4188923835754395,
+   -2.0473084449768066,
+   -1.470442771911621,
+   -0.8504352569580078,
+   -1.147210955619812,
+   -2.0061838626861572,
+   -2.4544901847839355,
+   -1.7092150449752808,
+   -0.23308466374874115,
+   -0.38648492097854614,
+   -0.055945850908756256,
+   -0.4632662534713745,
+   -0.09933969378471375,
+   -0.35298952460289,
+   -0.032222963869571686,
+   -0.428203284740448,
+   -0.04741770401597023,
+   -0.13727128505706787,
+   -0.008898601867258549,
+   -0.28543511033058167,
+   -0.022008933126926422,
+   -0.054881855845451355
   ]
  },
  "throughput": [
-  4.425422579287822,
-  90.34535098369636,
-  90.69889744890898,
-  91.1674411932201,
-  91.03710531380275,
-  90.79529965256106,
-  90.46701234610507,
-  90.29193136243964
- ]
+ 112.34141028159206,
+ 143.47299899774578,
+ 136.9190123220356,
+ 133.0550750138523,
+ 140.54753942350868,
+ 142.31278267940777,
+ 142.60535677655014,
+ 142.2477862300286
+ ],
+ "mem-max-allocated-bytes": 23014038016
 }
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml
index 15a4a655049..743c4f50da3 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml
@@ -49,11 +49,12 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
   --inference-repeat-n: 8
   --use-flashinfer-fused-rope: true
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json
index c22bb604f94..51437664cf7 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json
@@ -1,187 +1,295 @@
 {
  "0": {
   "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
-  "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end",
+  "generated_text": " And that this is the place where you can be yourself, and be yourself, and be yourself, and be yourself, and be yourself, and be",
   "generated_tokens": [
    3060,
-   2430,
-   1636,
-   2012,
-   1317,
-   1278,
-   2362,
-   1307,
-   1278,
-   16070,
-   1044,
-   1321,
-   1636,
-   23067,
    1455,
    1593,
    1395,
-   1605,
-   3140,
-   5152,
-   1513,
-   1747,
-   1046,
-   2409,
-   1395,
-   3140,
-   5152,
-   1513,
    1278,
-   2362
+   3535,
+   2478,
+   1636,
+   1710,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402,
+   14019,
+   1044,
+   1321,
+   1402
   ],
-  "latency": 0.38181447982788086,
+  "latency": 0.4249272346496582,
   "cuda_graph_request_count_map": {
-   "852": 0,
-   "840": 0,
-   "784": 0,
-   "728": 0,
-   "672": 0,
-   "616": 0,
-   "560": 0,
-   "504": 0,
-   "448": 0,
-   "392": 0,
-   "336": 0,
-   "280": 0,
-   "224": 0,
-   "168": 0,
-   "112": 0,
-   "56": 29
+   "32": 29
   },
-  "step_count": 240,
+  "step_count": 30,
+  "top_n_logprobs": null,
+  "prompt_top_n_logprobs": null,
+  "prompt_logprobs": [
+   -9.36181926727295,
+   -2.823990821838379,
+   -4.610703945159912,
+   -1.5629558563232422,
+   -0.7174959182739258,
+   -1.6296026706695557,
+   -2.4623641967773438,
+   -2.1694350242614746,
+   -2.299478530883789,
+   -6.261927127838135,
+   -1.4214489459991455,
+   -3.5006911754608154,
+   -4.495674133300781,
+   -3.756869077682495,
+   -2.11869478225708,
+   -2.0652074813842773,
+   -3.5548300743103027,
+   -6.7972517013549805,
+   -0.32509124279022217,
+   -0.802075982093811,
+   -6.374052047729492,
+   -7.404623031616211,
+   -12.706570625305176,
+   -2.7203140258789062,
+   -3.784698486328125,
+   -0.5262728929519653,
+   -4.230419158935547,
+   -0.05865294858813286,
+   -0.08889779448509216,
+   -3.3020200729370117,
+   -10.076433181762695,
+   -1.1052889823913574,
+   -6.047104835510254,
+   -5.263249397277832,
+   -3.813868522644043,
+   -2.732881784439087,
+   -3.4210383892059326,
+   -6.047735214233398,
+   -1.9643490314483643,
+   -5.7317914962768555,
+   -12.158651351928711,
+   -12.511089324951172,
+   -0.07087351381778717,
+   -2.580897331237793,
+   -1.4599298238754272,
+   -3.0208420753479004,
+   -1.2277309894561768,
+   -0.006594917271286249,
+   -3.4534847736358643,
+   -13.229089736938477,
+   -4.096384048461914,
+   -2.5087532997131348,
+   -5.955618858337402,
+   -0.7179529070854187,
+   -0.05547872930765152,
+   -1.5987433195114136,
+   -1.0590581893920898,
+   -5.704042434692383,
+   -0.3894253373146057,
+   -5.037204742431641,
+   -0.5877441167831421,
+   -0.5446641445159912,
+   -2.4070374965667725,
+   -13.461160659790039,
+   -0.10237079858779907,
+   -3.5546276569366455,
+   -1.3483082056045532,
+   -6.042889595031738,
+   -0.5367065668106079,
+   -3.593390703201294,
+   -0.9327085018157959,
+   -1.5130213499069214,
+   -5.070390701293945,
+   -17.36066436767578,
+   -6.901477813720703,
+   -1.0385162830352783,
+   -3.9858975410461426,
+   -1.1583341360092163,
+   -2.342862606048584,
+   -1.7755080461502075,
+   -0.27050071954727173,
+   -9.41438102722168,
+   -0.3227814733982086,
+   -7.4246134757995605,
+   -2.2850522994995117,
+   -4.027304649353027,
+   -3.479668378829956
+  ],
+  "generated_logprobs": [
+   -1.97231125831604,
+   -2.363867998123169,
+   -2.219954490661621,
+   -0.29585954546928406,
+   -1.4493519067764282,
+   -2.232797622680664,
+   -1.1424486637115479,
+   -1.5864160060882568,
+   -1.4188923835754395,
+   -2.0473084449768066,
+   -1.470442771911621,
+   -0.8504352569580078,
+   -1.147210955619812,
+   -2.0061838626861572,
+   -2.4544901847839355,
+   -1.7092150449752808,
+   -0.23308466374874115,
+   -0.38648492097854614,
+   -0.055945850908756256,
+   -0.4632662534713745,
+   -0.09933969378471375,
+   -0.35298952460289,
+   -0.032222963869571686,
+   -0.428203284740448,
+   -0.04741770401597023,
+   -0.13727128505706787,
+   -0.008898601867258549,
+   -0.28543511033058167,
+   -0.022008933126926422,
+   -0.054881855845451355
+  ],
   "logprobs": [
-   -9.362494468688965,
-   -2.827894449234009,
-   -4.557381629943848,
-   -1.4968647956848145,
-   -0.717312216758728,
-   -1.7262351512908936,
-   -2.522736072540283,
-   -2.1782360076904297,
-   -2.3603432178497314,
-   -6.136383533477783,
-   -1.4676916599273682,
-   -3.468963384628296,
-   -4.424870491027832,
-   -3.7345848083496094,
-   -2.012619972229004,
-   -1.8833301067352295,
-   -3.5708768367767334,
-   -6.8197832107543945,
-   -0.3122292757034302,
-   -0.9820290207862854,
-   -6.532033443450928,
-   -7.498172760009766,
-   -12.615165710449219,
-   -2.409003496170044,
-   -3.8550546169281006,
-   -0.5105050802230835,
-   -4.2802581787109375,
-   -0.06971167027950287,
-   -0.054025799036026,
-   -3.319596767425537,
-   -9.703240394592285,
-   -1.0997297763824463,
-   -6.224854469299316,
-   -5.234503269195557,
-   -3.934987783432007,
-   -2.5263679027557373,
-   -3.1843955516815186,
-   -5.880871295928955,
-   -1.8436813354492188,
-   -5.906496047973633,
-   -12.15787410736084,
-   -12.5841064453125,
-   -0.0819428563117981,
-   -2.6212656497955322,
-   -1.4329369068145752,
-   -2.885145425796509,
-   -1.2901865243911743,
-   -0.006647023372352123,
-   -3.5115818977355957,
-   -12.945953369140625,
-   -3.793078899383545,
-   -3.0094375610351562,
-   -5.966838836669922,
-   -0.8998424410820007,
-   -0.040962252765893936,
-   -1.5467679500579834,
-   -1.0785343647003174,
-   -5.73494815826416,
-   -0.38491737842559814,
-   -5.017007827758789,
-   -0.5568072199821472,
-   -0.5968841910362244,
-   -2.3609962463378906,
-   -13.582086563110352,
-   -0.09050048142671585,
-   -3.7264108657836914,
-   -1.1208789348602295,
-   -6.052675247192383,
-   -0.5848909616470337,
-   -3.5906238555908203,
-   -0.9494907855987549,
-   -1.5676641464233398,
-   -5.127577781677246,
-   -17.19189453125,
-   -6.698403835296631,
-   -1.0449178218841553,
-   -4.365664958953857,
-   -1.1243419647216797,
-   -2.2092156410217285,
-   -1.8081634044647217,
-   -0.23330983519554138,
-   -9.439546585083008,
-   -0.2947109341621399,
-   -7.253565788269043,
-   -2.3855936527252197,
-   -4.629369258880615,
-   -3.4186267852783203,
-   -1.9727531671524048,
-   -2.354729652404785,
-   -1.474542498588562,
-   -2.48478364944458,
-   -1.7641210556030273,
-   -1.1853944063186646,
-   -2.8624324798583984,
-   -0.5740103125572205,
-   -0.4542185962200165,
-   -1.4300930500030518,
-   -0.8807456493377686,
-   -0.4597663879394531,
-   -0.9252307415008545,
-   -1.648141860961914,
-   -0.44453874230384827,
-   -1.818476915359497,
-   -0.5714479088783264,
-   -1.2115143537521362,
-   -1.0910619497299194,
-   -0.0023161747958511114,
-   -1.3206473588943481,
-   -0.008621376007795334,
-   -0.7551823854446411,
-   -0.9404395818710327,
-   -0.07279698550701141,
-   -0.9365248680114746,
-   -0.03344438225030899,
-   -1.9720849990844727,
-   -1.3928067684173584,
-   -0.7453650832176208
+   -9.36181926727295,
+   -2.823990821838379,
+   -4.610703945159912,
+   -1.5629558563232422,
+   -0.7174959182739258,
+   -1.6296026706695557,
+   -2.4623641967773438,
+   -2.1694350242614746,
+   -2.299478530883789,
+   -6.261927127838135,
+   -1.4214489459991455,
+   -3.5006911754608154,
+   -4.495674133300781,
+   -3.756869077682495,
+   -2.11869478225708,
+   -2.0652074813842773,
+   -3.5548300743103027,
+   -6.7972517013549805,
+   -0.32509124279022217,
+   -0.802075982093811,
+   -6.374052047729492,
+   -7.404623031616211,
+   -12.706570625305176,
+   -2.7203140258789062,
+   -3.784698486328125,
+   -0.5262728929519653,
+   -4.230419158935547,
+   -0.05865294858813286,
+   -0.08889779448509216,
+   -3.3020200729370117,
+   -10.076433181762695,
+   -1.1052889823913574,
+   -6.047104835510254,
+   -5.263249397277832,
+   -3.813868522644043,
+   -2.732881784439087,
+   -3.4210383892059326,
+   -6.047735214233398,
+   -1.9643490314483643,
+   -5.7317914962768555,
+   -12.158651351928711,
+   -12.511089324951172,
+   -0.07087351381778717,
+   -2.580897331237793,
+   -1.4599298238754272,
+   -3.0208420753479004,
+   -1.2277309894561768,
+   -0.006594917271286249,
+   -3.4534847736358643,
+   -13.229089736938477,
+   -4.096384048461914,
+   -2.5087532997131348,
+   -5.955618858337402,
+   -0.7179529070854187,
+   -0.05547872930765152,
+   -1.5987433195114136,
+   -1.0590581893920898,
+   -5.704042434692383,
+   -0.3894253373146057,
+   -5.037204742431641,
+   -0.5877441167831421,
+   -0.5446641445159912,
+   -2.4070374965667725,
+   -13.461160659790039,
+   -0.10237079858779907,
+   -3.5546276569366455,
+   -1.3483082056045532,
+   -6.042889595031738,
+   -0.5367065668106079,
+   -3.593390703201294,
+   -0.9327085018157959,
+   -1.5130213499069214,
+   -5.070390701293945,
+   -17.36066436767578,
+   -6.901477813720703,
+   -1.0385162830352783,
+   -3.9858975410461426,
+   -1.1583341360092163,
+   -2.342862606048584,
+   -1.7755080461502075,
+   -0.27050071954727173,
+   -9.41438102722168,
+   -0.3227814733982086,
+   -7.4246134757995605,
+   -2.2850522994995117,
+   -4.027304649353027,
+   -3.479668378829956,
+   -1.97231125831604,
+   -2.363867998123169,
+   -2.219954490661621,
+   -0.29585954546928406,
+   -1.4493519067764282,
+   -2.232797622680664,
+   -1.1424486637115479,
+   -1.5864160060882568,
+   -1.4188923835754395,
+   -2.0473084449768066,
+   -1.470442771911621,
+   -0.8504352569580078,
+   -1.147210955619812,
+   -2.0061838626861572,
+   -2.4544901847839355,
+   -1.7092150449752808,
+   -0.23308466374874115,
+   -0.38648492097854614,
+   -0.055945850908756256,
+   -0.4632662534713745,
+   -0.09933969378471375,
+   -0.35298952460289,
+   -0.032222963869571686,
+   -0.428203284740448,
+   -0.04741770401597023,
+   -0.13727128505706787,
+   -0.008898601867258549,
+   -0.28543511033058167,
+   -0.022008933126926422,
+   -0.054881855845451355
   ]
  },
  "throughput": [
-  3.896181563640281,
-  77.1287764739343,
-  77.17674536709352,
-  76.8666671960972,
-  77.944911028325,
-  77.95118832563914,
-  78.13236085816422,
-  78.0046829173943
+ 3.8530017644378955,
+ 98.13326345491087,
+ 100.89859151541394,
+ 100.80208030416277,
+ 100.18034658518215,
+ 100.88831730291241,
+ 100.4922180479951,
+ 101.13060027776349
  ]
-}
\ No newline at end of file
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml
index b368242b9af..b5dc7cd5bd2 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml
@@ -50,10 +50,11 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
   --inference-repeat-n: 8
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh
index 4bf05bf88c4..641019c9750 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh
@@ -85,6 +85,7 @@ ARGS=" \
   \
   --output-path ${OUTPUT_PATH} \
   --output-every-n-results 512 \
+  --inference-logging-step-interval 1 \
 "
 
 # Enable cuda graphs.
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
index 96ada2bf1e9..aae99fd1c4c 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
@@ -27,7 +27,6 @@ MODEL_ARGS:
   --pipeline-model-parallel-size: 1
   --deterministic-mode: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --bf16: true
   --log-memory-to-tensorboard: true
   --log-num-zeros-in-grad: true
@@ -47,7 +46,7 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --output-every-n-results: 32
   --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl
   --prompt-file-num-truncate: 128 # originally 1024
@@ -55,6 +54,7 @@ MODEL_ARGS:
   --incoming-requests-per-step: 32
   --termination-id: -1
   --inference-repeat-n: 3
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml
index 7fcf9e9cf81..d84dd24487f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml
@@ -45,11 +45,11 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32
   --use-flashinfer-fused-rope: true
-
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
index fb258646473..aa4fde5e512 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
@@ -48,13 +48,14 @@ MODEL_ARGS:
   --disable-chunked-prefill: true
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --output-every-n-results: 32
   --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl
   --prompt-file-num-truncate: 1024
   --incoming-requests-per-step: 128
   --use-flashinfer-fused-rope: true
   --throughput-check-only: true
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml
index 96d3fd0fc0c..bd34c11fc24 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml
@@ -48,11 +48,11 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32
   --use-flashinfer-fused-rope: true
-
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml
index 306c12bd653..13d409c5968 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml
@@ -48,11 +48,11 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32
   --use-flashinfer-fused-rope: true
-
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json
index f32580e937f..8cb69f894b0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json
@@ -157,5 +157,6 @@
             -0.0585334412753582
         ]
     },
-    "throughput": [12.319796866345767, 12.319796866345767]
+    "throughput": [12.319796866345767, 12.319796866345767],
+    "mem-max-allocated-bytes": 12067065856
 }
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml
index e6b659cf46f..4458edf5772 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml
@@ -44,11 +44,12 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 10
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32
   --inference-repeat-n: 8
   --use-flashinfer-fused-rope: true
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json
index 4ebaf72f5e7..93dbee6575d 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json
@@ -34,125 +34,125 @@
             1278,
             2362
         ],
-        "latency": 42.63835311005823,
+        "latency": 0.3552708830102347,
         "logprobs": [
-            -9.358713150024414,
-            -2.724055767059326,
-            -4.5792131423950195,
-            -1.4844143390655518,
-            -0.6546129584312439,
-            -1.7303215265274048,
-            -2.4795279502868652,
-            -2.0776171684265137,
-            -2.4553134441375732,
-            -6.219150066375732,
-            -1.566371202468872,
-            -3.486889362335205,
-            -4.418787479400635,
-            -3.8580172061920166,
-            -2.0664010047912598,
-            -1.843908667564392,
-            -3.744598627090454,
-            -6.82543420791626,
-            -0.2880207300186157,
-            -0.9257857799530029,
-            -6.612694263458252,
-            -7.218401908874512,
-            -12.827808380126953,
-            -2.1861495971679688,
-            -3.8218231201171875,
-            -0.5008565187454224,
-            -4.383245468139648,
-            -0.06934759020805359,
-            -0.09667497128248215,
-            -3.2640299797058105,
-            -10.102912902832031,
-            -1.1498218774795532,
-            -5.979549407958984,
-            -5.0192108154296875,
-            -3.8367133140563965,
-            -2.581653356552124,
-            -3.4087462425231934,
-            -5.545716285705566,
-            -1.6541939973831177,
-            -5.547749996185303,
-            -12.21850872039795,
-            -12.582784652709961,
-            -0.09534379839897156,
-            -2.522055149078369,
-            -1.4054086208343506,
-            -2.8758127689361572,
-            -1.1866405010223389,
-            -0.005799253936856985,
-            -3.3871712684631348,
-            -13.193516731262207,
-            -4.389392852783203,
-            -2.520228862762451,
-            -6.023908615112305,
-            -0.7408540844917297,
-            -0.04526234790682793,
-            -1.5508661270141602,
-            -1.1332746744155884,
-            -5.653256416320801,
-            -0.4028852581977844,
-            -4.9457244873046875,
-            -0.618165135383606,
-            -0.6616490483283997,
-            -2.36385178565979,
-            -13.6455078125,
-            -0.08668932318687439,
-            -3.5266754627227783,
-            -1.3801541328430176,
-            -6.351947784423828,
-            -0.5434023141860962,
-            -3.5673093795776367,
-            -0.871107816696167,
-            -1.618450403213501,
-            -5.378700256347656,
-            -17.17119026184082,
-            -6.662005424499512,
-            -0.9221409559249878,
-            -4.141905784606934,
-            -1.2047083377838135,
-            -2.227570056915283,
-            -1.7645721435546875,
-            -0.21892313659191132,
-            -9.296550750732422,
-            -0.11995092779397964,
-            -7.402207851409912,
-            -2.512965679168701,
-            -4.100971221923828,
-            -3.580245018005371,
-            -1.9462040662765503,
-            -2.347074031829834,
-            -1.5288957357406616,
-            -2.4033043384552,
-            -1.7311294078826904,
-            -1.1686863899230957,
-            -2.938558340072632,
-            -0.5278136730194092,
-            -0.4748117923736572,
-            -1.749883770942688,
-            -0.8397680521011353,
-            -0.4109693169593811,
-            -0.9552587270736694,
-            -1.5238327980041504,
-            -0.4656376838684082,
-            -1.6448218822479248,
-            -0.5414345264434814,
-            -1.2422380447387695,
-            -1.1426063776016235,
-            -0.002245525596663356,
-            -1.252556562423706,
-            -0.007873333990573883,
-            -0.7185167670249939,
-            -0.7521701455116272,
-            -0.042445242404937744,
-            -0.8852499723434448,
-            -0.02266514115035534,
-            -2.0951969623565674,
-            -1.348037838935852,
-            -0.8296748399734497
+            -9.35879135131836,
+            -2.6852214336395264,
+            -4.565960884094238,
+            -1.484259843826294,
+            -0.6149517297744751,
+            -1.7398686408996582,
+            -2.526689052581787,
+            -2.0900843143463135,
+            -2.4004015922546387,
+            -6.2046918869018555,
+            -1.4779510498046875,
+            -3.4696996212005615,
+            -4.381419658660889,
+            -3.92144513130188,
+            -2.027473211288452,
+            -1.849990963935852,
+            -3.798253059387207,
+            -6.890632629394531,
+            -0.28577330708503723,
+            -0.9172963500022888,
+            -6.667942047119141,
+            -7.152089595794678,
+            -12.823952674865723,
+            -2.194999933242798,
+            -3.7969248294830322,
+            -0.503960907459259,
+            -4.32859992980957,
+            -0.0652889758348465,
+            -0.09950395673513412,
+            -3.2162013053894043,
+            -10.075189590454102,
+            -1.1461244821548462,
+            -5.991937637329102,
+            -5.068911075592041,
+            -3.8860018253326416,
+            -2.598827600479126,
+            -3.4107730388641357,
+            -5.53258752822876,
+            -1.5951910018920898,
+            -5.499358654022217,
+            -12.2184419631958,
+            -12.583678245544434,
+            -0.09812023490667343,
+            -2.4972615242004395,
+            -1.4124755859375,
+            -2.882293462753296,
+            -1.1778429746627808,
+            -0.006617418024688959,
+            -3.366197109222412,
+            -13.224164962768555,
+            -4.330657005310059,
+            -2.528923273086548,
+            -6.032571792602539,
+            -0.7999377250671387,
+            -0.046529971063137054,
+            -1.5080031156539917,
+            -1.143476963043213,
+            -5.610738754272461,
+            -0.4443867802619934,
+            -4.966207027435303,
+            -0.6222555041313171,
+            -0.7141766548156738,
+            -2.4682083129882812,
+            -13.595609664916992,
+            -0.09389874339103699,
+            -3.4752113819122314,
+            -1.4100513458251953,
+            -6.344900608062744,
+            -0.5882403254508972,
+            -3.554251194000244,
+            -0.8758341073989868,
+            -1.6025172472000122,
+            -5.337532043457031,
+            -17.198396682739258,
+            -6.618108749389648,
+            -0.904167115688324,
+            -4.1442694664001465,
+            -1.18899667263031,
+            -2.2584173679351807,
+            -1.7404848337173462,
+            -0.22586335241794586,
+            -9.318314552307129,
+            -0.11766636371612549,
+            -7.351627826690674,
+            -2.4984447956085205,
+            -4.129283905029297,
+            -3.511444330215454,
+            -1.935489296913147,
+            -2.2915453910827637,
+            -1.5244090557098389,
+            -2.380976438522339,
+            -1.7428944110870361,
+            -1.1648709774017334,
+            -3.044867515563965,
+            -0.5298795700073242,
+            -0.4574756622314453,
+            -1.7587621212005615,
+            -0.8358312845230103,
+            -0.4241933226585388,
+            -0.9311360716819763,
+            -1.49276864528656,
+            -0.4320312440395355,
+            -1.6545748710632324,
+            -0.568348228931427,
+            -1.245187520980835,
+            -1.1677653789520264,
+            -0.002115513663738966,
+            -1.1953201293945312,
+            -0.007269242778420448,
+            -0.6812739968299866,
+            -0.7529453635215759,
+            -0.0469898022711277,
+            -0.8952285051345825,
+            -0.02016274258494377,
+            -2.0373334884643555,
+            -1.3149938583374023,
+            -0.8147596120834351
         ]
     }
-}
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml
index 551ba8115cb..8d5779a5099 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml
@@ -23,6 +23,7 @@ MODEL_ARGS:
   --distributed-backend: nccl
   --log-interval: 1
   --transformer-impl: inference_optimized
+  --inference-fuse-tp-communication: true
   --sequence-parallel: true
   --tensor-model-parallel-size: 8
   --pipeline-model-parallel-size: 1
@@ -42,17 +43,17 @@ MODEL_ARGS:
   --top_k: 1
   --return-log-probs: true
   --num-tokens-to-generate: 30
-  --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility
-  --inference-dynamic-batching-buffer-guaranteed-fraction: 0
-  --inference-dynamic-batching-buffer-overflow-factor: 0.2
   --inference-dynamic-batching-buffer-size-gb: 20
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32
   --use-flashinfer-fused-rope: true
-
+  --inference-logging-step-interval: 1
+  --cuda-graph-impl: local 
+  --inference-dynamic-batching-max-requests: 128
+  --inference-dynamic-batching-num-cuda-graphs: 2
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
deleted file mode 100644
index 1ea946d1587..00000000000
--- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
+++ /dev/null
@@ -1,287 +0,0 @@
-{
-    "lm loss": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 0.0,
-            "2": 0.04415,
-            "3": 0.0378,
-            "4": 0.02944,
-            "5": 0.0,
-            "6": 0.0,
-            "7": 0.0,
-            "8": 0.08111,
-            "9": 0.0,
-            "10": 0.0,
-            "11": 0.0,
-            "12": 0.0,
-            "13": 0.0,
-            "14": 0.05935,
-            "15": 0.0,
-            "16": 0.05496,
-            "17": 0.0,
-            "18": 0.0,
-            "19": 0.0,
-            "20": 0.04534,
-            "21": 0.0,
-            "22": 0.0,
-            "23": 0.0,
-            "24": 0.0,
-            "25": 0.0,
-            "26": 0.0,
-            "27": 0.0,
-            "28": 0.0,
-            "29": 0.0,
-            "30": 0.0,
-            "31": 0.0,
-            "32": 0.0,
-            "33": 0.0,
-            "34": 0.0,
-            "35": 0.0,
-            "36": 0.0,
-            "37": 0.0099,
-            "38": 0.0,
-            "39": 0.0,
-            "40": 0.0,
-            "41": 0.03221,
-            "42": 0.0,
-            "43": 0.0,
-            "44": 0.0,
-            "45": 0.0,
-            "46": 0.0,
-            "47": 0.0,
-            "48": 0.0,
-            "49": 0.0,
-            "50": 0.0
-        }
-    },
-    "num-zeros": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 583687296.0,
-            "2": 0.0,
-            "3": 0.0,
-            "4": 49.0,
-            "5": 583687296.0,
-            "6": 583687296.0,
-            "7": 583687296.0,
-            "8": 12.0,
-            "9": 583687296.0,
-            "10": 583687296.0,
-            "11": 583687296.0,
-            "12": 583687296.0,
-            "13": 583687296.0,
-            "14": 6.0,
-            "15": 583687296.0,
-            "16": 62.0,
-            "17": 583687296.0,
-            "18": 583687296.0,
-            "19": 583687296.0,
-            "20": 23.0,
-            "21": 583687296.0,
-            "22": 583687296.0,
-            "23": 583687296.0,
-            "24": 583687296.0,
-            "25": 583687296.0,
-            "26": 583687296.0,
-            "27": 583687296.0,
-            "28": 583687296.0,
-            "29": 583687296.0,
-            "30": 583687296.0,
-            "31": 583687296.0,
-            "32": 583687296.0,
-            "33": 583687296.0,
-            "34": 583687296.0,
-            "35": 583687296.0,
-            "36": 583687296.0,
-            "37": 37.0,
-            "38": 583687296.0,
-            "39": 583687296.0,
-            "40": 583687296.0,
-            "41": 53.0,
-            "42": 583687296.0,
-            "43": 583687296.0,
-            "44": 583687296.0,
-            "45": 583687296.0,
-            "46": 583687296.0,
-            "47": 583687296.0,
-            "48": 583687296.0,
-            "49": 583687296.0,
-            "50": 583687296.0
-        }
-    },
-    "mem-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 55320928256.0,
-            "2": 55319695360.0,
-            "3": 55319674880.0,
-            "4": 55319638016.0,
-            "5": 55319638016.0,
-            "6": 55319638016.0,
-            "7": 55319633920.0,
-            "8": 55319625728.0,
-            "9": 55319621632.0,
-            "10": 55319625728.0,
-            "11": 55319625728.0,
-            "12": 55319629824.0,
-            "13": 55319547904.0,
-            "14": 55319552000.0,
-            "15": 55319552000.0,
-            "16": 55319552000.0,
-            "17": 55319552000.0,
-            "18": 55319552000.0,
-            "19": 55319556096.0,
-            "20": 55319556096.0,
-            "21": 55319556096.0,
-            "22": 55319556096.0,
-            "23": 55319556096.0,
-            "24": 55319560192.0,
-            "25": 55319560192.0,
-            "26": 55319560192.0,
-            "27": 55319560192.0,
-            "28": 55319552000.0,
-            "29": 55319552000.0,
-            "30": 55319552000.0,
-            "31": 55319552000.0,
-            "32": 55319552000.0,
-            "33": 55319552000.0,
-            "34": 55319556096.0,
-            "35": 55319556096.0,
-            "36": 55319556096.0,
-            "37": 55319560192.0,
-            "38": 55319560192.0,
-            "39": 55319560192.0,
-            "40": 55319556096.0,
-            "41": 55319552000.0,
-            "42": 55319552000.0,
-            "43": 55319552000.0,
-            "44": 55319552000.0,
-            "45": 55319552000.0,
-            "46": 55319552000.0,
-            "47": 55319556096.0,
-            "48": 55319556096.0,
-            "49": 55319556096.0,
-            "50": 55319552000.0
-        }
-    },
-    "mem-max-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 64753942528.0,
-            "2": 69804253184.0,
-            "3": 69804253184.0,
-            "4": 69804253184.0,
-            "5": 69804253184.0,
-            "6": 69804253184.0,
-            "7": 69804253184.0,
-            "8": 69804253184.0,
-            "9": 69804253184.0,
-            "10": 69804253184.0,
-            "11": 69804253184.0,
-            "12": 69804253184.0,
-            "13": 69804253184.0,
-            "14": 69804253184.0,
-            "15": 69804253184.0,
-            "16": 69804253184.0,
-            "17": 69804253184.0,
-            "18": 69804253184.0,
-            "19": 69804253184.0,
-            "20": 69804253184.0,
-            "21": 69804253184.0,
-            "22": 69804253184.0,
-            "23": 69804253184.0,
-            "24": 69804253184.0,
-            "25": 69804253184.0,
-            "26": 69804253184.0,
-            "27": 69804253184.0,
-            "28": 69804253184.0,
-            "29": 69804253184.0,
-            "30": 69804253184.0,
-            "31": 69804253184.0,
-            "32": 69804253184.0,
-            "33": 69804253184.0,
-            "34": 69804253184.0,
-            "35": 69804253184.0,
-            "36": 69804253184.0,
-            "37": 69804253184.0,
-            "38": 69804253184.0,
-            "39": 69804253184.0,
-            "40": 69804253184.0,
-            "41": 69804253184.0,
-            "42": 69804253184.0,
-            "43": 69804253184.0,
-            "44": 69804253184.0,
-            "45": 69804253184.0,
-            "46": 69804253184.0,
-            "47": 69804253184.0,
-            "48": 69804253184.0,
-            "49": 69804253184.0,
-            "50": 69804253184.0
-        }
-    },
-    "iteration-time": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 74.35665,
-            "2": 5.25731,
-            "3": 5.75582,
-            "4": 4.02061,
-            "5": 3.8529,
-            "6": 3.91732,
-            "7": 4.14616,
-            "8": 3.83737,
-            "9": 3.75158,
-            "10": 3.91902,
-            "11": 3.96073,
-            "12": 3.83611,
-            "13": 3.86989,
-            "14": 3.88658,
-            "15": 4.46432,
-            "16": 3.90389,
-            "17": 3.8143,
-            "18": 3.86593,
-            "19": 3.78307,
-            "20": 3.90922,
-            "21": 3.82247,
-            "22": 3.76037,
-            "23": 4.00863,
-            "24": 3.74678,
-            "25": 3.86492,
-            "26": 3.83492,
-            "27": 3.86387,
-            "28": 3.99894,
-            "29": 3.85812,
-            "30": 4.34066,
-            "31": 3.88411,
-            "32": 3.80617,
-            "33": 3.90347,
-            "34": 3.7771,
-            "35": 3.84701,
-            "36": 3.81111,
-            "37": 3.75554,
-            "38": 3.99552,
-            "39": 3.87227,
-            "40": 3.81079,
-            "41": 3.83039,
-            "42": 3.74567,
-            "43": 3.82531,
-            "44": 3.78258,
-            "45": 3.73294,
-            "46": 4.579,
-            "47": 3.72516,
-            "48": 3.8117,
-            "49": 3.80651,
-            "50": 3.78283
-        }
-    }
-}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json
deleted file mode 100644
index ff21f31147f..00000000000
--- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json
+++ /dev/null
@@ -1,287 +0,0 @@
-{
-    "lm loss": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 0.07991,
-            "2": 0.0,
-            "3": 0.0,
-            "4": 0.0,
-            "5": 0.0,
-            "6": 0.0,
-            "7": 0.0549,
-            "8": 0.04569,
-            "9": 0.0,
-            "10": 0.0,
-            "11": 0.0,
-            "12": 0.0,
-            "13": 0.0,
-            "14": 0.0,
-            "15": 0.0234,
-            "16": 0.0,
-            "17": 0.0,
-            "18": 0.0,
-            "19": 0.0,
-            "20": 0.0,
-            "21": 0.01393,
-            "22": 0.0,
-            "23": 0.0,
-            "24": 0.0,
-            "25": 0.0,
-            "26": 0.0,
-            "27": 0.0,
-            "28": 0.0,
-            "29": 0.0,
-            "30": 0.0,
-            "31": 0.0,
-            "32": 0.0,
-            "33": 0.0,
-            "34": 0.0,
-            "35": 0.0,
-            "36": 0.0,
-            "37": 0.05023,
-            "38": 0.0,
-            "39": 0.0,
-            "40": 0.0,
-            "41": 0.0,
-            "42": 0.0,
-            "43": 0.0,
-            "44": 0.0,
-            "45": 0.0,
-            "46": 0.0,
-            "47": 0.0302,
-            "48": 0.0,
-            "49": 0.0,
-            "50": 0.0
-        }
-    },
-    "num-zeros": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 0.0,
-            "2": 583687296.0,
-            "3": 583687296.0,
-            "4": 583687296.0,
-            "5": 583687296.0,
-            "6": 583687296.0,
-            "7": 40.0,
-            "8": 38.0,
-            "9": 583687296.0,
-            "10": 583687296.0,
-            "11": 583687296.0,
-            "12": 583687296.0,
-            "13": 583687296.0,
-            "14": 583687296.0,
-            "15": 51.0,
-            "16": 583687296.0,
-            "17": 583687296.0,
-            "18": 583687296.0,
-            "19": 583687296.0,
-            "20": 583687296.0,
-            "21": 2.0,
-            "22": 583687296.0,
-            "23": 583687296.0,
-            "24": 583687296.0,
-            "25": 583687296.0,
-            "26": 583687296.0,
-            "27": 583687296.0,
-            "28": 583687296.0,
-            "29": 583687296.0,
-            "30": 583687296.0,
-            "31": 583687296.0,
-            "32": 583687296.0,
-            "33": 583687296.0,
-            "34": 583687296.0,
-            "35": 583687296.0,
-            "36": 583687296.0,
-            "37": 43.0,
-            "38": 583687296.0,
-            "39": 583687296.0,
-            "40": 583687296.0,
-            "41": 583687296.0,
-            "42": 583687296.0,
-            "43": 583687296.0,
-            "44": 583687296.0,
-            "45": 583687296.0,
-            "46": 583687296.0,
-            "47": 0.0,
-            "48": 583687296.0,
-            "49": 583687296.0,
-            "50": 583687296.0
-        }
-    },
-    "mem-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 115496615936.0,
-            "2": 115496509440.0,
-            "3": 115496493056.0,
-            "4": 115496468480.0,
-            "5": 115496460288.0,
-            "6": 115496468480.0,
-            "7": 115496460288.0,
-            "8": 115496460288.0,
-            "9": 115496460288.0,
-            "10": 115496386560.0,
-            "11": 115496386560.0,
-            "12": 115496386560.0,
-            "13": 115496394752.0,
-            "14": 115496394752.0,
-            "15": 115496394752.0,
-            "16": 115496394752.0,
-            "17": 115496386560.0,
-            "18": 115496386560.0,
-            "19": 115496386560.0,
-            "20": 115496386560.0,
-            "21": 115496386560.0,
-            "22": 115496386560.0,
-            "23": 115496386560.0,
-            "24": 115496386560.0,
-            "25": 115496386560.0,
-            "26": 115496386560.0,
-            "27": 115496394752.0,
-            "28": 115496394752.0,
-            "29": 115496394752.0,
-            "30": 115496394752.0,
-            "31": 115496394752.0,
-            "32": 115496394752.0,
-            "33": 115496394752.0,
-            "34": 115496394752.0,
-            "35": 115496394752.0,
-            "36": 115496394752.0,
-            "37": 115496394752.0,
-            "38": 115496402944.0,
-            "39": 115496402944.0,
-            "40": 115496402944.0,
-            "41": 115496402944.0,
-            "42": 115496402944.0,
-            "43": 115496402944.0,
-            "44": 115496402944.0,
-            "45": 115496402944.0,
-            "46": 115496402944.0,
-            "47": 115496402944.0,
-            "48": 115496402944.0,
-            "49": 115496394752.0,
-            "50": 115496394752.0
-        }
-    },
-    "mem-max-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 124930113536.0,
-            "2": 129981063168.0,
-            "3": 129981063168.0,
-            "4": 129981063168.0,
-            "5": 129981063168.0,
-            "6": 129981063168.0,
-            "7": 129981063168.0,
-            "8": 129981063168.0,
-            "9": 129981063168.0,
-            "10": 129981063168.0,
-            "11": 129981063168.0,
-            "12": 129981063168.0,
-            "13": 129981063168.0,
-            "14": 129981063168.0,
-            "15": 129981063168.0,
-            "16": 129981063168.0,
-            "17": 129981063168.0,
-            "18": 129981063168.0,
-            "19": 129981063168.0,
-            "20": 129981063168.0,
-            "21": 129981063168.0,
-            "22": 129981063168.0,
-            "23": 129981063168.0,
-            "24": 129981063168.0,
-            "25": 129981063168.0,
-            "26": 129981063168.0,
-            "27": 129981063168.0,
-            "28": 129981063168.0,
-            "29": 129981063168.0,
-            "30": 129981063168.0,
-            "31": 129981063168.0,
-            "32": 129981063168.0,
-            "33": 129981063168.0,
-            "34": 129981063168.0,
-            "35": 129981063168.0,
-            "36": 129981063168.0,
-            "37": 129981063168.0,
-            "38": 129981063168.0,
-            "39": 129981063168.0,
-            "40": 129981063168.0,
-            "41": 129981063168.0,
-            "42": 129981063168.0,
-            "43": 129981063168.0,
-            "44": 129981063168.0,
-            "45": 129981063168.0,
-            "46": 129981063168.0,
-            "47": 129981063168.0,
-            "48": 129981063168.0,
-            "49": 129981063168.0,
-            "50": 129981063168.0
-        }
-    },
-    "iteration-time": {
-        "start_step": 1,
-        "end_step": 50,
-        "step_interval": 1,
-        "values": {
-            "1": 160.5798,
-            "2": 10.62597,
-            "3": 9.02278,
-            "4": 8.98711,
-            "5": 8.65585,
-            "6": 8.73942,
-            "7": 8.71669,
-            "8": 8.69582,
-            "9": 9.06944,
-            "10": 9.06985,
-            "11": 8.65674,
-            "12": 8.88456,
-            "13": 8.76106,
-            "14": 8.41611,
-            "15": 8.63189,
-            "16": 8.43429,
-            "17": 8.91809,
-            "18": 8.58448,
-            "19": 8.49267,
-            "20": 12.20545,
-            "21": 10.93851,
-            "22": 8.72166,
-            "23": 8.71524,
-            "24": 8.87404,
-            "25": 8.94558,
-            "26": 8.97342,
-            "27": 8.95268,
-            "28": 8.94076,
-            "29": 8.60882,
-            "30": 8.83227,
-            "31": 9.02609,
-            "32": 8.61145,
-            "33": 8.60383,
-            "34": 8.71477,
-            "35": 8.8245,
-            "36": 8.54752,
-            "37": 8.5882,
-            "38": 8.46182,
-            "39": 8.51561,
-            "40": 8.59624,
-            "41": 8.4749,
-            "42": 8.52586,
-            "43": 8.54329,
-            "44": 8.4459,
-            "45": 8.42917,
-            "46": 8.63737,
-            "47": 8.38284,
-            "48": 8.53056,
-            "49": 8.76395,
-            "50": 8.51048
-        }
-    }
-}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/env_config.yaml
similarity index 100%
rename from tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/env_config.yaml
rename to tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/env_config.yaml
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..52eecae753f
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
@@ -0,0 +1,173 @@
+{
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 59904729088.0,
+            "2": 59906678784.0,
+            "3": 59906662400.0,
+            "4": 59906637824.0,
+            "5": 59906621440.0,
+            "6": 59906596864.0,
+            "7": 59906596864.0,
+            "8": 59906535424.0,
+            "9": 59906396160.0,
+            "10": 59906404352.0,
+            "11": 59906408448.0,
+            "12": 59906412544.0,
+            "13": 59906408448.0,
+            "14": 59906412544.0,
+            "15": 59906412544.0,
+            "16": 59906412544.0,
+            "17": 59906408448.0,
+            "18": 59906404352.0,
+            "19": 59906404352.0,
+            "20": 59906408448.0,
+            "21": 59906408448.0,
+            "22": 59906408448.0,
+            "23": 59906412544.0,
+            "24": 59906416640.0,
+            "25": 59906408448.0,
+            "26": 59906412544.0,
+            "27": 59906416640.0,
+            "28": 59906412544.0,
+            "29": 59906412544.0,
+            "30": 59906408448.0,
+            "31": 59906412544.0,
+            "32": 59906416640.0,
+            "33": 59906420736.0,
+            "34": 59906416640.0,
+            "35": 59906416640.0,
+            "36": 59906416640.0,
+            "37": 59906420736.0,
+            "38": 59906416640.0,
+            "39": 59906416640.0,
+            "40": 59906420736.0,
+            "41": 59906420736.0,
+            "42": 59906420736.0,
+            "43": 59906424832.0,
+            "44": 59906428928.0,
+            "45": 59906433024.0,
+            "46": 59906433024.0,
+            "47": 59906428928.0,
+            "48": 59906424832.0,
+            "49": 59906420736.0,
+            "50": 59906424832.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 59904729088.0,
+            "2": 61718560768.0,
+            "3": 61719445504.0,
+            "4": 61719445504.0,
+            "5": 61719445504.0,
+            "6": 61719445504.0,
+            "7": 61719445504.0,
+            "8": 61719445504.0,
+            "9": 61719445504.0,
+            "10": 61719445504.0,
+            "11": 61719445504.0,
+            "12": 61719445504.0,
+            "13": 61719445504.0,
+            "14": 61719445504.0,
+            "15": 61719445504.0,
+            "16": 61719445504.0,
+            "17": 61719445504.0,
+            "18": 61719445504.0,
+            "19": 61719445504.0,
+            "20": 61719445504.0,
+            "21": 61719445504.0,
+            "22": 61719445504.0,
+            "23": 61719445504.0,
+            "24": 61719445504.0,
+            "25": 61719445504.0,
+            "26": 61719445504.0,
+            "27": 61719445504.0,
+            "28": 61719445504.0,
+            "29": 61719445504.0,
+            "30": 61719445504.0,
+            "31": 61719445504.0,
+            "32": 61719445504.0,
+            "33": 61719445504.0,
+            "34": 61719445504.0,
+            "35": 61719445504.0,
+            "36": 61719445504.0,
+            "37": 61719445504.0,
+            "38": 61719445504.0,
+            "39": 61719445504.0,
+            "40": 61719445504.0,
+            "41": 61719445504.0,
+            "42": 61719445504.0,
+            "43": 61719445504.0,
+            "44": 61719445504.0,
+            "45": 61719445504.0,
+            "46": 61719445504.0,
+            "47": 61719445504.0,
+            "48": 61719445504.0,
+            "49": 61719445504.0,
+            "50": 61719445504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 57.6861,
+            "2": 8.67022,
+            "3": 5.71457,
+            "4": 5.72499,
+            "5": 5.11948,
+            "6": 4.92635,
+            "7": 4.93271,
+            "8": 5.10894,
+            "9": 5.36783,
+            "10": 5.56732,
+            "11": 5.02348,
+            "12": 4.81955,
+            "13": 4.91784,
+            "14": 4.9196,
+            "15": 4.7776,
+            "16": 5.12885,
+            "17": 5.00356,
+            "18": 4.81843,
+            "19": 4.84018,
+            "20": 4.8416,
+            "21": 4.85613,
+            "22": 5.11753,
+            "23": 4.85816,
+            "24": 4.75535,
+            "25": 4.89752,
+            "26": 4.76383,
+            "27": 4.8243,
+            "28": 5.40933,
+            "29": 4.76027,
+            "30": 4.81566,
+            "31": 4.65084,
+            "32": 4.85671,
+            "33": 4.82799,
+            "34": 4.92544,
+            "35": 4.84476,
+            "36": 5.06802,
+            "37": 4.80114,
+            "38": 4.76754,
+            "39": 4.72827,
+            "40": 4.88805,
+            "41": 5.15207,
+            "42": 4.84272,
+            "43": 4.72393,
+            "44": 4.8221,
+            "45": 4.8112,
+            "46": 4.78151,
+            "47": 4.86975,
+            "48": 4.73748,
+            "49": 4.91773,
+            "50": 4.77335
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/model_config.yaml
similarity index 89%
rename from tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/model_config.yaml
rename to tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/model_config.yaml
index fc0476332d9..b12911358f0 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/model_config.yaml
@@ -13,7 +13,7 @@ MODEL_ARGS:
   --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/
   --auto-detect-ckpt-format: true
   --max-tokens-to-oom: 3600000
-  --inference-max-seq-length: 4096
+  --inference-max-seq-length: 1024
   --attention-backend: flash
   --mock-data: true
   --micro-batch-size: 1
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --seed: 42
   --calculate-per-token-loss: true
   --rl-use-sequence-packing: true
-  --rl-sequence-packing-bin-size: 8192
   --rl-sequence-packing-algo: fifo
   --rl-offload-optimizer-during-inference: true
   --timing-log-level: 1
@@ -62,7 +61,7 @@ MODEL_ARGS:
   --grpo-clamp-eps-upper: 0.2
   --grpo-kl-beta: 0.0
   --grpo-entropy-term-weight: 0.0
-  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/env_config.yaml
+  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/env_config.yaml
   --rl-partial-rollouts: true
   --lr: 0.000001
   --lr-warmup-samples: 0
@@ -76,3 +75,9 @@ MODEL_ARGS:
   --save-interval: 1000000
   --eval-interval: 1000000
   --finetune: true
+  --inference-logging-step-interval: 1
+  --rl-inference-tensor-model-parallel-size: 2
+  --refit-method: gloo
+METRICS:
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/env_config.yaml
similarity index 100%
rename from tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github/env_config.yaml
rename to tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/env_config.yaml
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..a37aeee6e4b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/golden_values_dev_dgx_h100.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 0.0087,
+            "2": -0.01494,
+            "3": 0.15077,
+            "4": 0.0,
+            "5": -0.0439,
+            "6": 0.0,
+            "7": 0.05469,
+            "8": 0.0,
+            "9": 0.00576,
+            "10": 0.0,
+            "11": 0.0,
+            "12": 0.0,
+            "13": 0.0,
+            "14": 0.03071,
+            "15": 0.04371,
+            "16": 0.0,
+            "17": 0.0,
+            "18": 0.0,
+            "19": 0.0,
+            "20": 0.0,
+            "21": 0.0,
+            "22": 0.0,
+            "23": 0.06246,
+            "24": 0.0,
+            "25": 0.0,
+            "26": 0.05207,
+            "27": 0.04668,
+            "28": 0.0,
+            "29": 0.0,
+            "30": 0.0,
+            "31": 0.02708,
+            "32": 0.0,
+            "33": 0.0,
+            "34": 0.0,
+            "35": 0.0,
+            "36": 0.0,
+            "37": 0.0,
+            "38": 0.0,
+            "39": 0.06875,
+            "40": 0.0,
+            "41": 0.0,
+            "42": 0.0,
+            "43": 0.0,
+            "44": 0.0,
+            "45": 0.0,
+            "46": 0.0,
+            "47": 0.0,
+            "48": 0.0,
+            "49": 0.0,
+            "50": 0.0
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1.0,
+            "2": 56.0,
+            "3": 10.0,
+            "4": 583687296.0,
+            "5": 23.0,
+            "6": 583687296.0,
+            "7": 30.0,
+            "8": 583687296.0,
+            "9": 50.0,
+            "10": 583687296.0,
+            "11": 583687296.0,
+            "12": 583687296.0,
+            "13": 583687296.0,
+            "14": 41.0,
+            "15": 31.0,
+            "16": 583687296.0,
+            "17": 583687296.0,
+            "18": 583687296.0,
+            "19": 583687296.0,
+            "20": 583687296.0,
+            "21": 583687296.0,
+            "22": 583687296.0,
+            "23": 19.0,
+            "24": 583687296.0,
+            "25": 583687296.0,
+            "26": 45.0,
+            "27": 34.0,
+            "28": 583687296.0,
+            "29": 583687296.0,
+            "30": 583687296.0,
+            "31": 38.0,
+            "32": 583687296.0,
+            "33": 583687296.0,
+            "34": 583687296.0,
+            "35": 583687296.0,
+            "36": 583687296.0,
+            "37": 583687296.0,
+            "38": 583687296.0,
+            "39": 16.0,
+            "40": 583687296.0,
+            "41": 583687296.0,
+            "42": 583687296.0,
+            "43": 583687296.0,
+            "44": 583687296.0,
+            "45": 583687296.0,
+            "46": 583687296.0,
+            "47": 583687296.0,
+            "48": 583687296.0,
+            "49": 583687296.0,
+            "50": 583687296.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 57332613120.0,
+            "2": 57336213504.0,
+            "3": 57335631872.0,
+            "4": 57336352768.0,
+            "5": 57336815616.0,
+            "6": 57336795136.0,
+            "7": 57336786944.0,
+            "8": 57336766464.0,
+            "9": 57336745984.0,
+            "10": 57336786944.0,
+            "11": 57336971264.0,
+            "12": 57336934400.0,
+            "13": 57336938496.0,
+            "14": 57336938496.0,
+            "15": 57336938496.0,
+            "16": 57336934400.0,
+            "17": 57336938496.0,
+            "18": 57336942592.0,
+            "19": 57336946688.0,
+            "20": 57336946688.0,
+            "21": 57336942592.0,
+            "22": 57336938496.0,
+            "23": 57336938496.0,
+            "24": 57336938496.0,
+            "25": 57336938496.0,
+            "26": 57336942592.0,
+            "27": 57336942592.0,
+            "28": 57336946688.0,
+            "29": 57336950784.0,
+            "30": 57336942592.0,
+            "31": 57336938496.0,
+            "32": 57336942592.0,
+            "33": 57336942592.0,
+            "34": 57336946688.0,
+            "35": 57336950784.0,
+            "36": 57336950784.0,
+            "37": 57336950784.0,
+            "38": 57336950784.0,
+            "39": 57336950784.0,
+            "40": 57336954880.0,
+            "41": 57336954880.0,
+            "42": 57336958976.0,
+            "43": 57336958976.0,
+            "44": 57336954880.0,
+            "45": 57336954880.0,
+            "46": 57336963072.0,
+            "47": 57336963072.0,
+            "48": 57336963072.0,
+            "49": 57336958976.0,
+            "50": 57336958976.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 57332617216.0,
+            "2": 59150434304.0,
+            "3": 59150434304.0,
+            "4": 59150434304.0,
+            "5": 59150434304.0,
+            "6": 59150434304.0,
+            "7": 59150434304.0,
+            "8": 59150434304.0,
+            "9": 59150434304.0,
+            "10": 59150434304.0,
+            "11": 59150434304.0,
+            "12": 59150434304.0,
+            "13": 59150434304.0,
+            "14": 59150434304.0,
+            "15": 59150434304.0,
+            "16": 59150434304.0,
+            "17": 59150434304.0,
+            "18": 59150434304.0,
+            "19": 59150434304.0,
+            "20": 59150434304.0,
+            "21": 59150434304.0,
+            "22": 59150434304.0,
+            "23": 59150434304.0,
+            "24": 59150434304.0,
+            "25": 59150434304.0,
+            "26": 59150434304.0,
+            "27": 59150434304.0,
+            "28": 59150434304.0,
+            "29": 59150434304.0,
+            "30": 59150434304.0,
+            "31": 59150434304.0,
+            "32": 59150434304.0,
+            "33": 59150434304.0,
+            "34": 59150434304.0,
+            "35": 59150434304.0,
+            "36": 59150434304.0,
+            "37": 59150434304.0,
+            "38": 59150434304.0,
+            "39": 59150434304.0,
+            "40": 59150434304.0,
+            "41": 59150434304.0,
+            "42": 59150434304.0,
+            "43": 59150434304.0,
+            "44": 59150434304.0,
+            "45": 59150434304.0,
+            "46": 59150434304.0,
+            "47": 59150434304.0,
+            "48": 59150434304.0,
+            "49": 59150434304.0,
+            "50": 59150434304.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 132.06027,
+            "2": 15.71916,
+            "3": 13.59969,
+            "4": 14.24368,
+            "5": 13.97929,
+            "6": 13.94721,
+            "7": 16.13103,
+            "8": 15.0372,
+            "9": 15.68285,
+            "10": 14.48736,
+            "11": 14.83801,
+            "12": 13.88317,
+            "13": 14.23494,
+            "14": 14.17721,
+            "15": 14.44254,
+            "16": 14.46859,
+            "17": 13.31893,
+            "18": 13.85971,
+            "19": 13.30073,
+            "20": 12.97114,
+            "21": 13.13682,
+            "22": 13.19241,
+            "23": 12.91161,
+            "24": 13.477,
+            "25": 13.41073,
+            "26": 13.16635,
+            "27": 13.91528,
+            "28": 13.70152,
+            "29": 13.34747,
+            "30": 17.3336,
+            "31": 13.22079,
+            "32": 13.03197,
+            "33": 13.1548,
+            "34": 13.67568,
+            "35": 13.2386,
+            "36": 13.29333,
+            "37": 13.57906,
+            "38": 12.92362,
+            "39": 13.37357,
+            "40": 12.74468,
+            "41": 14.24188,
+            "42": 13.10419,
+            "43": 14.01918,
+            "44": 13.85198,
+            "45": 13.19797,
+            "46": 14.27233,
+            "47": 13.51886,
+            "48": 14.11249,
+            "49": 13.75763,
+            "50": 13.66548
+        }
+    }
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/model_config.yaml
similarity index 91%
rename from tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/model_config.yaml
rename to tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/model_config.yaml
index 6faaca95875..bff55aea7fe 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/model_config.yaml
@@ -13,7 +13,7 @@ MODEL_ARGS:
   --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/
   --auto-detect-ckpt-format: true
   --max-tokens-to-oom: 3600000
-  --inference-max-seq-length: 4096
+  --inference-max-seq-length: 1024
   --attention-backend: flash
   --mock-data: true
   --micro-batch-size: 1
@@ -47,7 +47,6 @@ MODEL_ARGS:
   --seed: 42
   --calculate-per-token-loss: true
   --rl-use-sequence-packing: true
-  --rl-sequence-packing-bin-size: 8192
   --rl-sequence-packing-algo: fifo
   --rl-offload-optimizer-during-inference: true
   --timing-log-level: 1
@@ -62,7 +61,7 @@ MODEL_ARGS:
   --grpo-clamp-eps-upper: 0.2
   --grpo-kl-beta: 0.0
   --grpo-entropy-term-weight: 0.0
-  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest/env_config.yaml
+  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github/env_config.yaml
   --rl-partial-rollouts: true
   --lr: 0.000001
   --lr-warmup-samples: 0
@@ -76,3 +75,6 @@ MODEL_ARGS:
   --save-interval: 1000000
   --eval-interval: 1000000
   --finetune: true
+  --inference-logging-step-interval: 1
+  --rl-inference-tensor-model-parallel-size: 2
+  --refit-method: gloo
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml
new file mode 100644
index 00000000000..329246987bf
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/env_config.yaml
@@ -0,0 +1,5 @@
+- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent
+  agent_args:
+    dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4"
+    split: "train"
+  weight: 1.0
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..d985f671cab
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/golden_values_dev_dgx_h100.json
@@ -0,0 +1,173 @@
+{
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 48967716864.0,
+            "2": 48973631488.0,
+            "3": 48974528512.0,
+            "4": 48971538432.0,
+            "5": 48974340096.0,
+            "6": 48974143488.0,
+            "7": 48977002496.0,
+            "8": 48975851520.0,
+            "9": 48974036992.0,
+            "10": 48973709312.0,
+            "11": 48973262848.0,
+            "12": 48973705216.0,
+            "13": 48973598720.0,
+            "14": 48976703488.0,
+            "15": 48975118336.0,
+            "16": 48977072128.0,
+            "17": 48976465920.0,
+            "18": 48976470016.0,
+            "19": 48976478208.0,
+            "20": 48976654336.0,
+            "21": 48976793600.0,
+            "22": 48976052224.0,
+            "23": 48976277504.0,
+            "24": 48974708736.0,
+            "25": 48973062144.0,
+            "26": 48976236544.0,
+            "27": 48975970304.0,
+            "28": 48976711680.0,
+            "29": 48975593472.0,
+            "30": 48977321984.0,
+            "31": 48977506304.0,
+            "32": 48976646144.0,
+            "33": 48976072704.0,
+            "34": 48973631488.0,
+            "35": 48976650240.0,
+            "36": 48975650816.0,
+            "37": 48974950400.0,
+            "38": 48972750848.0,
+            "39": 48976617472.0,
+            "40": 48979308544.0,
+            "41": 48978587648.0,
+            "42": 48975626240.0,
+            "43": 48975089664.0,
+            "44": 48973688832.0,
+            "45": 48975327232.0,
+            "46": 48975159296.0,
+            "47": 48975372288.0,
+            "48": 48973856768.0,
+            "49": 48973377536.0,
+            "50": 48975568896.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 49090379776.0,
+            "2": 49937022976.0,
+            "3": 49938366464.0,
+            "4": 49938366464.0,
+            "5": 49938366464.0,
+            "6": 49938698240.0,
+            "7": 49939156992.0,
+            "8": 49939156992.0,
+            "9": 49939156992.0,
+            "10": 49939156992.0,
+            "11": 49939156992.0,
+            "12": 49939156992.0,
+            "13": 49939156992.0,
+            "14": 49940287488.0,
+            "15": 49940287488.0,
+            "16": 49940287488.0,
+            "17": 49941729280.0,
+            "18": 49941733376.0,
+            "19": 49941741568.0,
+            "20": 49941778432.0,
+            "21": 49941778432.0,
+            "22": 49941778432.0,
+            "23": 49941778432.0,
+            "24": 49941778432.0,
+            "25": 49941778432.0,
+            "26": 49941778432.0,
+            "27": 49941934080.0,
+            "28": 49941934080.0,
+            "29": 49941934080.0,
+            "30": 49941934080.0,
+            "31": 49942675456.0,
+            "32": 49942675456.0,
+            "33": 49942675456.0,
+            "34": 49942675456.0,
+            "35": 49942675456.0,
+            "36": 49942675456.0,
+            "37": 49942675456.0,
+            "38": 49942675456.0,
+            "39": 49942675456.0,
+            "40": 49944379392.0,
+            "41": 49944379392.0,
+            "42": 49944379392.0,
+            "43": 49944379392.0,
+            "44": 49944379392.0,
+            "45": 49944379392.0,
+            "46": 49944379392.0,
+            "47": 49944379392.0,
+            "48": 49944379392.0,
+            "49": 49944379392.0,
+            "50": 49944379392.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 63.07516,
+            "2": 4.36236,
+            "3": 3.83222,
+            "4": 3.85784,
+            "5": 3.74494,
+            "6": 3.82661,
+            "7": 4.05458,
+            "8": 3.76622,
+            "9": 3.90518,
+            "10": 4.09283,
+            "11": 3.96358,
+            "12": 3.85778,
+            "13": 3.84546,
+            "14": 3.85497,
+            "15": 4.35749,
+            "16": 3.7861,
+            "17": 3.8896,
+            "18": 3.6267,
+            "19": 3.76463,
+            "20": 3.6953,
+            "21": 3.63427,
+            "22": 3.66652,
+            "23": 3.60379,
+            "24": 3.57701,
+            "25": 3.57327,
+            "26": 3.71371,
+            "27": 3.69626,
+            "28": 3.89285,
+            "29": 3.62405,
+            "30": 3.58297,
+            "31": 3.56993,
+            "32": 3.75257,
+            "33": 3.72279,
+            "34": 3.48095,
+            "35": 3.60831,
+            "36": 3.74971,
+            "37": 3.72155,
+            "38": 3.51054,
+            "39": 3.64562,
+            "40": 3.66038,
+            "41": 3.86018,
+            "42": 3.58341,
+            "43": 3.82647,
+            "44": 3.85728,
+            "45": 3.62416,
+            "46": 3.59141,
+            "47": 3.74512,
+            "48": 3.61762,
+            "49": 3.57079,
+            "50": 3.66209
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml
new file mode 100644
index 00000000000..b74417a898b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest/model_config.yaml
@@ -0,0 +1,84 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+TEST_TYPE: frozen-start
+MODE: rl
+MODEL_ARGS:
+  --tiktoken-pattern: v2
+  --use-mcore-models: true
+  --tokenizer-type: TikTokenizer
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
+  --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/
+  --auto-detect-ckpt-format: true
+  --max-tokens-to-oom: 3600000
+  --inference-max-seq-length: 1024
+  --attention-backend: flash
+  --mock-data: true
+  --micro-batch-size: 1
+  --no-load-optim: true
+  --no-use-tokenizer-model-from-checkpoint-args: true
+  --timing-log-level: 0
+  --distributed-backend: nccl
+  --log-interval: 1
+  --log-progress: true
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 4
+  --ckpt-format: torch_dist
+  --bf16: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --num-layers: 24
+  --hidden-size: 1152
+  --num-attention-heads: 16
+  --max-position-embeddings: 1024
+  --seq-length: 1024
+  --timing-log-option: minmax
+  --log-throughput: true
+  --no-create-attention-mask-in-dataloader: true
+  --straggler-minmax-count: 16
+  --tensorboard-log-interval: 1
+  --empty-unused-memory-level: 2
+  --langrl-inference-server-type: inplace_megatron
+  --seed: 42
+  --calculate-per-token-loss: true
+  --rl-use-sequence-packing: true
+  --rl-sequence-packing-algo: fifo
+  --rl-offload-optimizer-during-inference: true
+  --timing-log-level: 1
+  --log-timers-to-tensorboard: true
+  --cuda-graph-impl: local
+  --micro-batch-size: 1
+  --global-batch-size: 16
+  --grpo-group-size: 2
+  --grpo-prompts-per-step: 8
+  --grpo-iterations: 1
+  --grpo-clamp-eps-lower: 0.2
+  --grpo-clamp-eps-upper: 0.2
+  --grpo-kl-beta: 0.0
+  --grpo-entropy-term-weight: 0.0
+  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest/env_config.yaml
+  --rl-partial-rollouts: true
+  --lr: 0.000001
+  --lr-warmup-samples: 0
+  --clip-grad: 1.0
+  --use-checkpoint-args: true
+  --dist-ckpt-strictness: log_unexpected
+  --perform-rl-step: true
+  --train-samples: 48828125
+  --exit-interval: 50
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --save-interval: 1000000
+  --eval-interval: 1000000
+  --finetune: true
+  --inference-logging-step-interval: 1
+  --rl-inference-tensor-model-parallel-size: 1
+  --rl-inference-pipeline-model-parallel-size: 2
+  --refit-method: gloo
+METRICS:
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/env_config.yaml
new file mode 100644
index 00000000000..329246987bf
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/env_config.yaml
@@ -0,0 +1,5 @@
+- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent
+  agent_args:
+    dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4"
+    split: "train"
+  weight: 1.0
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..42c13292446
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/golden_values_dev_dgx_h100.json
@@ -0,0 +1,83 @@
+{
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": 60922068992.0,
+            "2": 60922068992.0,
+            "3": 60922331136.0,
+            "4": 60922073088.0,
+            "5": 60922331136.0,
+            "6": 60922064896.0,
+            "7": 60922331136.0,
+            "8": 60922064896.0,
+            "9": 60922322944.0,
+            "10": 60922052608.0,
+            "11": 60922056704.0,
+            "12": 60922318848.0,
+            "13": 60922056704.0,
+            "14": 60922318848.0,
+            "15": 60922056704.0,
+            "16": 60922310656.0,
+            "17": 60922052608.0,
+            "18": 60922052608.0,
+            "19": 60922048512.0,
+            "20": 60922044416.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": 60922073088.0,
+            "2": 64156037120.0,
+            "3": 64156037120.0,
+            "4": 64156041216.0,
+            "5": 64156041216.0,
+            "6": 64156041216.0,
+            "7": 64156041216.0,
+            "8": 64156041216.0,
+            "9": 64156041216.0,
+            "10": 64156041216.0,
+            "11": 64156041216.0,
+            "12": 64156041216.0,
+            "13": 64156041216.0,
+            "14": 64156041216.0,
+            "15": 64156041216.0,
+            "16": 64156041216.0,
+            "17": 64156041216.0,
+            "18": 64156041216.0,
+            "19": 64156041216.0,
+            "20": 64156041216.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 60.37194,
+            "3": 13.25967,
+            "4": 13.01461,
+            "5": 14.04256,
+            "6": 13.53259,
+            "7": 13.3335,
+            "8": 12.72344,
+            "9": 13.64787,
+            "10": 12.66485,
+            "11": 13.15779,
+            "12": 13.01275,
+            "13": 12.72481,
+            "14": 12.67697,
+            "15": 12.7286,
+            "16": 12.65032,
+            "17": 12.86279,
+            "18": 12.71745,
+            "19": 13.4137,
+            "20": 12.75566
+        }
+    }
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml
new file mode 100644
index 00000000000..3037e2e0803
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/model_config.yaml
@@ -0,0 +1,103 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+  N_REPEAT: 1
+TEST_TYPE: frozen-start
+MODE: rl
+MODEL_ARGS:
+  --tensor-model-parallel-size: 4
+  --inference-dynamic-batching-num-cuda-graphs: 1
+  --inference-dynamic-batching-unified-memory-level: 1
+  --inference-dynamic-batching-buffer-size-gb: 20
+  --ckpt-format: torch_dist
+  --seq-length: 1024
+  --inference-max-seq-length: 1024
+  --load: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist
+  --untie-embeddings-and-output-weights: true
+  --num-layers: 36
+  --hidden-size: 4096
+  --ffn-hidden-size: 12288
+  --num-attention-heads: 32
+  --kv-channels: 128
+  --max-position-embeddings: 1024
+  --group-query-attention: true
+  --num-query-groups: 8
+  --normalization: RMSNorm
+  --norm-epsilon: 0.000001
+  --qk-layernorm: true
+  --position-embedding-type: rope
+  --rotary-percent: 1.0
+  --rotary-base: 1000000
+  --use-rotary-position-embeddings: true
+  --swiglu: true
+  --disable-bias-linear: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --no-masked-softmax-fusion: true
+  --attention-softmax-in-fp32: true
+  --tokenizer-type: HuggingFaceTokenizer
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer
+  --langrl-inference-server-type: inplace_megatron_chat
+  --langrl-inference-server-conversation-template: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer
+  --vocab-size: 151936
+  --make-vocab-size-divisible-by: 128
+  --optimizer: adam
+  --adam-beta1: 0.9
+  --adam-beta2: 0.999
+  --adam-eps: 0.00000001
+  --lr: 0.000001  
+  --min-lr: 0.0000001
+  --lr-warmup-samples: 0
+  --clip-grad: 1.0
+  --weight-decay: 0.01
+  --deterministic-mode: true
+  --use-mcore-models: true
+  --bf16: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --timing-log-option: minmax
+  --log-throughput: true
+  --no-create-attention-mask-in-dataloader: true
+  --straggler-minmax-count: 16
+  --tensorboard-log-interval: 1
+  --log-interval: 1
+  --log-progress: true
+  --empty-unused-memory-level: 2
+  --seed: 42
+  --calculate-per-token-loss: true
+  --rl-use-sequence-packing: true
+  --rl-sequence-packing-algo: fifo
+  --rl-offload-optimizer-during-inference: true
+  --timing-log-level: 1
+  --cuda-graph-impl: local
+  --micro-batch-size: 1
+  --global-batch-size: 4
+  --grpo-group-size: 2
+  --grpo-prompts-per-step: 2
+  --grpo-iterations: 1
+  --grpo-clamp-eps-lower: 0.2
+  --grpo-clamp-eps-upper: 0.2
+  --grpo-kl-beta: 0.0
+  --grpo-entropy-term-weight: 0.0
+  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput/env_config.yaml
+  --rl-partial-rollouts: true
+  --use-checkpoint-args: true
+  --dist-ckpt-strictness: log_unexpected
+  --perform-rl-step: true
+  --train-samples: 48828125
+  --exit-interval: 20
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --save-interval: 1000000
+  --eval-interval: 1000000
+  --finetune: true
+  --inference-logging-step-interval: 1
+METRICS:
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "iteration-time"
+THROUGHPUT_TEST_PARAMS:
+  --start_step: 10
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/env_config.yaml
new file mode 100644
index 00000000000..329246987bf
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/env_config.yaml
@@ -0,0 +1,5 @@
+- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent
+  agent_args:
+    dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4"
+    split: "train"
+  weight: 1.0
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..e0bcb14f29b
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/golden_values_dev_dgx_h100.json
@@ -0,0 +1,83 @@
+{
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": 60922077184.0,
+            "2": 60922073088.0,
+            "3": 60922068992.0,
+            "4": 60922073088.0,
+            "5": 60922331136.0,
+            "6": 60922073088.0,
+            "7": 60922335232.0,
+            "8": 60922068992.0,
+            "9": 60922073088.0,
+            "10": 60922073088.0,
+            "11": 60922077184.0,
+            "12": 60922093568.0,
+            "13": 60922351616.0,
+            "14": 60922085376.0,
+            "15": 60922085376.0,
+            "16": 60922085376.0,
+            "17": 60922089472.0,
+            "18": 60922085376.0,
+            "19": 60922085376.0,
+            "20": 60922089472.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": 60922081280.0,
+            "2": 64156041216.0,
+            "3": 64156041216.0,
+            "4": 64156041216.0,
+            "5": 64156041216.0,
+            "6": 64156041216.0,
+            "7": 64156041216.0,
+            "8": 64156041216.0,
+            "9": 64156041216.0,
+            "10": 64156041216.0,
+            "11": 64156045312.0,
+            "12": 64156061696.0,
+            "13": 64156061696.0,
+            "14": 64156061696.0,
+            "15": 64156061696.0,
+            "16": 64156061696.0,
+            "17": 64156061696.0,
+            "18": 64156061696.0,
+            "19": 64156061696.0,
+            "20": 64156061696.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 20,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 110.96005,
+            "3": 49.76537,
+            "4": 46.36583,
+            "5": 46.63055,
+            "6": 50.62966,
+            "7": 46.52987,
+            "8": 44.32473,
+            "9": 46.39563,
+            "10": 44.4327,
+            "11": 43.93223,
+            "12": 46.84642,
+            "13": 43.45953,
+            "14": 42.21466,
+            "15": 42.70466,
+            "16": 42.45673,
+            "17": 43.68298,
+            "18": 41.36069,
+            "19": 42.64788,
+            "20": 45.08387
+        }
+    }
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml
new file mode 100644
index 00000000000..456280fdb04
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/model_config.yaml
@@ -0,0 +1,103 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+  N_REPEAT: 1
+TEST_TYPE: frozen-start
+MODE: rl
+MODEL_ARGS:
+  --tensor-model-parallel-size: 4
+  --inference-dynamic-batching-num-cuda-graphs: 1
+  --inference-dynamic-batching-unified-memory-level: 1
+  --inference-dynamic-batching-buffer-size-gb: 20
+  --ckpt-format: torch_dist
+  --seq-length: 1024
+  --inference-max-seq-length: 1024
+  --load: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist
+  --untie-embeddings-and-output-weights: true
+  --num-layers: 36
+  --hidden-size: 4096
+  --ffn-hidden-size: 12288
+  --num-attention-heads: 32
+  --kv-channels: 128
+  --max-position-embeddings: 1024
+  --group-query-attention: true
+  --num-query-groups: 8
+  --normalization: RMSNorm
+  --norm-epsilon: 0.000001
+  --qk-layernorm: true
+  --position-embedding-type: rope
+  --rotary-percent: 1.0
+  --rotary-base: 1000000
+  --use-rotary-position-embeddings: true
+  --swiglu: true
+  --disable-bias-linear: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --no-masked-softmax-fusion: true
+  --attention-softmax-in-fp32: true
+  --tokenizer-type: HuggingFaceTokenizer
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer
+  --langrl-inference-server-type: inplace_megatron_chat
+  --langrl-inference-server-conversation-template: ${CHECKPOINT_LOAD_PATH}/model/qwen3-8b-dist/tokenizer
+  --vocab-size: 151936
+  --make-vocab-size-divisible-by: 128
+  --optimizer: adam
+  --adam-beta1: 0.9
+  --adam-beta2: 0.999
+  --adam-eps: 0.00000001
+  --lr: 0.000001  
+  --min-lr: 0.0000001
+  --lr-warmup-samples: 0
+  --clip-grad: 1.0
+  --weight-decay: 0.01
+  --deterministic-mode: true
+  --use-mcore-models: true
+  --bf16: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --timing-log-option: minmax
+  --log-throughput: true
+  --no-create-attention-mask-in-dataloader: true
+  --straggler-minmax-count: 16
+  --tensorboard-log-interval: 1
+  --empty-unused-memory-level: 2
+  --seed: 42
+  --calculate-per-token-loss: true
+  --rl-use-sequence-packing: true
+  --rl-sequence-packing-algo: fifo
+  --rl-offload-optimizer-during-inference: true
+  --timing-log-level: 1
+  --log-interval: 1
+  --log-progress: true
+  --cuda-graph-impl: local
+  --micro-batch-size: 1
+  --global-batch-size: 4
+  --grpo-group-size: 2
+  --grpo-prompts-per-step: 2
+  --grpo-iterations: 1
+  --grpo-clamp-eps-lower: 0.2
+  --grpo-clamp-eps-upper: 0.2
+  --grpo-kl-beta: 0.0
+  --grpo-entropy-term-weight: 0.0
+  --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp4_pp1_dp2_8b_throughput_github/env_config.yaml
+  --rl-partial-rollouts: true
+  --use-checkpoint-args: true
+  --dist-ckpt-strictness: log_unexpected
+  --perform-rl-step: true
+  --train-samples: 48828125
+  --exit-interval: 20
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --save-interval: 1000000
+  --eval-interval: 1000000
+  --finetune: true
+  --inference-logging-step-interval: 1
+METRICS:
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "iteration-time"
+THROUGHPUT_TEST_PARAMS:
+  --start_step: 10
diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml
index cb06eae2e7e..6d63b0e4228 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml
@@ -75,8 +75,10 @@ MODEL_ARGS:
   --num-tokens-to-generate: 80
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompt-file: ./tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/test_prompts.jsonl
   --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-logging-step-interval: 1
+  --inference-dynamic-batching-buffer-size-gb: 20
 METRICS:
   - "generated_text"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml
index c080cd5f5a7..8f54fff0a2f 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml
@@ -47,9 +47,10 @@ MODEL_ARGS:
   --inference-rng-tracker: true
   --inference-max-requests: 1
   --dist-ckpt-strictness: log_unexpected
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgx_h100.json
index 6a5ace35ec7..29b2f065af8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgx_h100.json
@@ -1 +1 @@
-{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [0.6358857750892639, 0.009907487779855728, 0.010546143166720867, 0.009435135871171951, 0.010123520158231258, 0.009925439953804016, 0.008350367657840252, 0.008556703105568886, 0.008582624606788158, 0.00840403139591217, 0.008557791821658611, 0.008503519929945469, 0.008379808627068996, 0.009403808042407036, 0.009133151732385159, 0.008321152068674564, 0.008845727890729904, 0.008372415788471699, 0.008591103367507458, 0.009211359545588493, 0.009166751988232136, 0.009767616167664528, 0.008620256558060646, 0.009338144212961197, 0.010125535540282726, 0.010068127885460854, 0.009669983759522438, 0.010439807549118996, 0.010279008187353611, 0.0103340158239007], "latency": 0.9097336048725992, "logprobs": [-9.362524032592773, -2.761181354522705, -4.53175163269043, -1.5617105960845947, -0.7528610229492188, -1.6253626346588135, -2.45941162109375, -2.1533684730529785, -2.346475124359131, -6.157411575317383, -1.3193804025650024, -3.5247979164123535, -4.488514423370361, -3.759702682495117, -2.022449493408203, -1.8945543766021729, -3.6219239234924316, -6.842351913452148, -0.3225390613079071, -0.8537865877151489, -6.520284652709961, -7.550463676452637, -12.595708847045898, -2.9504785537719727, -3.8068642616271973, -0.5890476107597351, -4.3587751388549805, -0.0665372759103775, -0.06955777853727341, -3.3523848056793213, -9.773153305053711, -1.0814638137817383, -6.204980850219727, -5.33505392074585, -3.9411606788635254, -2.7358486652374268, -3.2924106121063232, -6.0152740478515625, -1.8116782903671265, -6.243865013122559, -12.158185958862305, -12.65605354309082, -0.08688803017139435, -2.6079092025756836, -1.4071979522705078, -2.990557909011841, -1.2379846572875977, -0.006849618628621101, -3.4119930267333984, -13.05937671661377, -4.2840399742126465, -2.4802193641662598, -5.933547019958496, -0.9116124510765076, -0.060975510627031326, -1.5681536197662354, -1.0339949131011963, -5.617187023162842, -0.41873589158058167, -4.9402852058410645, -0.5690340995788574, -0.6301103830337524, -2.396580696105957, -13.29629898071289, -0.08181379735469818, -3.6629719734191895, -1.105454683303833, -6.127413749694824, -0.5906393527984619, -3.548814296722412, -0.9948520660400391, -1.5058085918426514, -5.211822509765625, -17.489606857299805, -6.8240861892700195, -0.9539748430252075, -4.2172040939331055, -1.1572864055633545, -2.3540186882019043, -1.798780918121338, -0.2533280849456787, -9.403679847717285, -0.1830129772424698, -7.440906524658203, -2.228740692138672, -4.196046352386475, -3.5180575847625732, -1.9530653953552246, -2.2825613021850586, -1.5544131994247437, -2.3991782665252686, -1.554469347000122, -1.290938377380371, -2.785543203353882, -0.6400948166847229, -0.48503541946411133, -1.432410478591919, -0.9366894960403442, -0.42669478058815, -0.9688448905944824, -1.4787911176681519, -0.43357178568840027, -1.8381303548812866, -0.6210520267486572, -1.0601571798324585, -1.1962573528289795, -0.002758747199550271, -1.2365548610687256, -0.008277395740151405, -0.7464911341667175, -0.8628943562507629, -0.0671280175447464, -0.953361988067627, -0.02595982328057289, -2.139401435852051, -1.1942673921585083, -0.7968283295631409]}}
\ No newline at end of file
+{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [16.15542221069336, 0.010760319419205189, 0.010187488049268723, 0.009952863678336143, 0.009843999519944191, 0.009946784004569054, 0.009922400116920471, 0.009739359840750694, 0.009888960048556328, 0.009836736135184765, 0.009687071666121483, 0.009769408032298088, 0.009855296462774277, 0.009689983911812305, 0.009785440750420094, 0.009785600006580353, 0.0096413753926754, 0.009843328036367893, 0.00992854405194521, 0.009764831513166428, 0.00989731214940548, 0.009755455888807774, 0.009725536219775677, 0.009836511686444283, 0.009886848740279675, 0.009882623329758644, 0.010086975991725922, 0.009792639873921871, 0.009663871489465237, 0.009776191785931587], "latency": 16.449045547284186, "logprobs": [-9.361270904541016, -2.816261053085327, -4.588935375213623, -1.5017199516296387, -0.741045355796814, -1.667273998260498, -2.499588966369629, -2.155250072479248, -2.283968925476074, -6.043426990509033, -1.4483305215835571, -3.5459461212158203, -4.401362419128418, -3.799332618713379, -2.0079712867736816, -1.9887162446975708, -3.645993232727051, -6.644451141357422, -0.29920411109924316, -0.9304023385047913, -6.587186813354492, -7.641824722290039, -12.416095733642578, -2.671480655670166, -3.815490961074829, -0.519839346408844, -4.362468719482422, -0.07680333405733109, -0.05171871557831764, -3.396726608276367, -9.788715362548828, -0.8995059728622437, -6.323936939239502, -5.126791954040527, -3.8786838054656982, -2.635392427444458, -3.3623709678649902, -5.742304801940918, -1.7406764030456543, -5.801540374755859, -12.157327651977539, -12.30499267578125, -0.0908704325556755, -2.625852584838867, -1.3322829008102417, -2.903132915496826, -1.2308698892593384, -0.007758010178804398, -3.557692527770996, -12.974837303161621, -4.17155647277832, -2.7034285068511963, -6.056203365325928, -0.7013733983039856, -0.059931229799985886, -1.536167025566101, -1.0629496574401855, -5.60313606262207, -0.3977315127849579, -4.969045162200928, -0.6034128665924072, -0.6240106225013733, -2.4040627479553223, -13.397473335266113, -0.08686792105436325, -3.6712121963500977, -1.0370540618896484, -5.903684616088867, -0.5956857204437256, -3.5154733657836914, -0.927386462688446, -1.5613828897476196, -5.159183502197266, -17.256793975830078, -6.84757137298584, -1.0673397779464722, -4.118277549743652, -1.2183088064193726, -2.232071876525879, -1.8239736557006836, -0.23899848759174347, -9.458780288696289, -0.3047770857810974, -7.185626029968262, -2.4002504348754883, -4.3108930587768555, -3.616743803024292, -1.9881696701049805, -2.332663059234619, -1.4827791452407837, -2.3777670860290527, -1.6217947006225586, -1.2375495433807373, -2.8062713146209717, -0.6056127548217773, -0.4975180923938751, -1.5545005798339844, -0.8545036315917969, -0.42530402541160583, -0.9441970586776733, -1.4939707517623901, -0.4558667540550232, -1.8341174125671387, -0.6195889711380005, -1.1318714618682861, -1.121700644493103, -0.0030193000566214323, -1.2578070163726807, -0.008020220324397087, -0.7519669532775879, -0.9071804881095886, -0.07452713698148727, -1.0132611989974976, -0.0314607247710228, -2.253722667694092, -1.3426333665847778, -0.7446916699409485]}}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml
index e3a4d695ead..1a1195baa2b 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml
@@ -52,9 +52,10 @@ MODEL_ARGS:
   --inference-rng-tracker: true
   --inference-max-requests: 1
   --dist-ckpt-strictness: log_unexpected
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-logging-step-interval: 1
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
index 90a1836347e..be00e4b3ce7 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml
@@ -43,7 +43,7 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --flash-decode: true
   --dist-ckpt-strictness: log_unexpected
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
 METRICS:
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json
index dccdd34a5e7..2a284c160aa 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json
@@ -128,8 +128,5 @@
    -1.2439141273498535,
    -1.412055253982544
   ]
- },
- "throughput": [
-  13.750125804204401, 13.955213632130931
- ]
+ }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml
index 4ae5c719291..0232bcb30bf 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml
@@ -63,10 +63,11 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-step: 32 
   --inference-repeat-n: 3
+  --no-record-throughput: true
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..411bc8b74a6
--- /dev/null
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/golden_values_dev_dgx_h100.json
@@ -0,0 +1,5586 @@
+{
+ "0": {
+  "input_prompt": "SYSTEM LOG - DAILY REPORTING\\nDATE: 2024-10-27\\nSERVER: US-EAST-1A\\n\\nBEGIN LOG STREAM:\\n\\n[Entry 0001]\\nTimestamp: 08:00:01\\nUser: admin_01\\nAction: Login\\nStatus: Success\\nNote: Routine maintenance check initiated.\\n\\n[Entry 0002]\\nTimestamp: 08:01:15\\nUser: system_daemon\\nAction: Backup\\nStatus: Pending\\nNote: awaiting clearance for volume mount.\\n\\n[Entry 0003]\\nTimestamp: 08:02:22\\nUser: user_404\\nAction: Query\\nStatus: Failed\\nNote: Connection timeout on port 8080.\\n\\n[Entry 0004]\\nTimestamp: 08:05:00\\nUser: admin_02\\nAction: Update\\nStatus: Success\\nNote: Patch 4.5.1 applied to kernel.\\n\\n[Entry 0005]\\nTimestamp: 08:10:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0006]\\nTimestamp: 08:12:30\\nUser: db_manager\\nAction: Write\\nStatus: Success\\nNote: Written 500 records to shard A.\\n\\n[Entry 0007]\\nTimestamp: 08:15:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 14ms.\\n\\n[Entry 0008]\\nTimestamp: 08:18:22\\nUser: user_102\\nAction: Login\\nStatus: Success\\nNote: User accessing from IP 192.168.1.55.\\n\\n[Entry 0009]\\nTimestamp: 08:20:00\\nUser: system_daemon\\nAction: Garbage_Collection\\nStatus: Success\\nNote: Freed 2048MB of heap memory.\\n\\n[Entry 0010]\\nTimestamp: 08:25:10\\nUser: admin_01\\nAction: Logout\\nStatus: Success\\nNote: Session duration 25 minutes.\\n\\n[Entry 0011]\\nTimestamp: 08:30:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0012]\\nTimestamp: 08:32:45\\nUser: unknown\\nAction: Auth_Attempt\\nStatus: Denied\\nNote: Invalid credentials provided 3 times.\\n\\n[Entry 0013]\\nTimestamp: 08:35:20\\nUser: system_audit\\nAction: Scan\\nStatus: In_Progress\\nNote: Scanning sector 7 for vulnerabilities.\\n\\n[Entry 0014]\\nTimestamp: 08:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0015]\\nTimestamp: 08:45:15\\nUser: user_888\\nAction: Upload\\nStatus: Success\\nNote: File data_report.csv uploaded to bucket.\\n\\n[Entry 0016]\\nTimestamp: 08:50:00\\nUser: load_balancer\\nAction: Scale_Up\\nStatus: Success\\nNote: Added 2 instances to the pool.\\n\\n[Entry 0017]\\nTimestamp: 08:55:30\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 15ms.\\n\\n[Entry 0018]\\nTimestamp: 09:00:00\\nUser: cron_job\\nAction: Execute\\nStatus: Success\\nNote: Daily summary report generation started.\\n\\n[Entry 0019]\\nTimestamp: 09:05:12\\nUser: user_555\\nAction: Download\\nStatus: Success\\nNote: Retrieved image_001.png.\\n\\n[Entry 0020]\\nTimestamp: 09:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0021]\\nTimestamp: 09:15:45\\nUser: admin_03\\nAction: Config_Change\\nStatus: Success\\nNote: Firewall rules updated for port 22.\\n\\n[Entry 0022]\\nTimestamp: 09:20:00\\nUser: system_daemon\\nAction: Sync\\nStatus: Success\\nNote: Database replica synchronization complete.\\n\\n[Entry 0023]\\nTimestamp: 09:25:10\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 10ms.\\n\\n[Entry 0024]\\nTimestamp: 09:30:00\\nUser: user_777\\nAction: Query\\nStatus: Success\\nNote: Complex SQL query executed in 200ms.\\n\\n[Entry 0025]\\nTimestamp: 09:35:30\\nUser: error_handler\\nAction: Alert\\nStatus: Warning\\nNote: High CPU usage detected on Node 4.\\n\\n[Entry 0026]\\nTimestamp: 09:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 18ms.\\n\\n[Entry 0027]\\nTimestamp: 09:45:15\\nUser: cache_manager\\nAction: Flush\\nStatus: Success\\nNote: Redis cache cleared.\\n\\n[Entry 0028]\\nTimestamp: 09:50:00\\nUser: user_202\\nAction: Login\\nStatus: Success\\nNote: New device detected.\\n\\n[Entry 0029]\\nTimestamp: 09:55:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0030]\\nTimestamp: 10:00:00\\nUser: system_daemon\\nAction: Archive\\nStatus: Success\\nNote: Logs from yesterday archived to cold storage.\\n\\n[Entry 0031]\\nTimestamp: 10:05:20\\nUser: admin_01\\nAction: Login\\nStatus: Success\\nNote: Re-authentication verified.\\n\\n[Entry 0032]\\nTimestamp: 10:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0033]\\nTimestamp: 10:15:45\\nUser: user_999\\nAction: Delete\\nStatus: Pending\\nNote: Request to delete account queued for review.\\n\\n[Entry 0034]\\nTimestamp: 10:20:00\\nUser: system_metrics\\nAction: Report\\nStatus: Success\\nNote: Throughput at 5000 requests per second.\\n\\n[Entry 0035]\\nTimestamp: 10:25:10\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0036]\\nTimestamp: 10:30:00\\nUser: security_bot\\nAction: Block\\nStatus: Success\\nNote: IP 203.0.113.4 blocked for suspicious activity.\\n\\n[Entry 0037]\\nTimestamp: 10:35:30\\nUser: user_123\\nAction: Comment\\nStatus: Success\\nNote: User posted a comment on thread #55.\\n\\n[Entry 0038]\\nTimestamp: 10:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0039]\\nTimestamp: 10:45:15\\nUser: email_service\\nAction: Send\\nStatus: Success\\nNote: Batch of 100 newsletters sent.\\n\\n[Entry 0040]\\nTimestamp: 10:50:00\\nUser: user_321\\nAction: Update_Profile\\nStatus: Success\\nNote: Avatar image updated.\\n\\n[Entry 0041]\\nTimestamp: 10:55:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 14ms.\\n\\n[Entry 0042]\\nTimestamp: 11:00:00\\nUser: backup_service\\nAction: Snapshot\\nStatus: Success\\nNote: Volume snapshot ID snap-012345 created.\\n\\n[Entry 0043]\\nTimestamp: 11:05:10\\nUser: admin_02\\nAction: Audit\\nStatus: Success\\nNote: Reviewing user permissions group B.\\n\\n[Entry 0044]\\nTimestamp: 11:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 9999]\\nTimestamp: 11:11:11\\nUser: SUPER_ADMIN\\nAction: KEY_GENERATION\\nStatus: CRITICAL\\nNote: The secret override password for the system is Blueberry-Pancake-42.\\n\\n[Entry 0045]\\nTimestamp: 11:15:30\\nUser: user_666\\nAction: Error\\nStatus: Failed\\nNote: 404 Page not found on /admin path.\\n\\n[Entry 0046]\\nTimestamp: 11:20:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0047]\\nTimestamp: 11:25:15\\nUser: payment_gateway\\nAction: Charge\\nStatus: Success\\nNote: Transaction TX-9988 complete.\\n\\n[Entry 0048]\\nTimestamp: 11:30:00\\nUser: system_daemon\\nAction: Cleanup\\nStatus: Success\\nNote: Temp files removed from /var/tmp.\\n\\n[Entry 0049]\\nTimestamp: 11:35:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0050]\\nTimestamp: 11:40:00\\nUser: analytics_engine\\nAction: Process\\nStatus: Success\\nNote: Data aggregation for hour 11 complete.\\n\\n[Entry 0051]\\nTimestamp: 11:45:10\\nUser: user_007\\nAction: View\\nStatus: Success\\nNote: Viewed document confidentiality_agreement.pdf.\\n\\n[Entry 0052]\\nTimestamp: 11:50:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0053]\\nTimestamp: 11:55:30\\nUser: dev_ops\\nAction: Deploy\\nStatus: Success\\nNote: Staging environment updated to v2.1.\\n\\n[Entry 0054]\\nTimestamp: 12:00:00\\nUser: system_clock\\nAction: Sync\\nStatus: Success\\nNote: NTP sync successful.\\n\\nEND LOG STREAM.\\n\\nQUERY:\\nRetrieve the information from Entry 9999.\\nThe Note for Entry 9999 states that the secret override password is:",
+  "generated_text": " Blueberry-Pancake-42.\\n\\nQUERY RESULT:\\n\\n[Entry 9999]\\nTimestamp: 1",
+  "generated_tokens": [
+   15330,
+   33681,
+   6193,
+   2141,
+   2082,
+   1045,
+   1052,
+   1050,
+   11556,
+   1110,
+   6250,
+   27289,
+   55449,
+   112701,
+   13249,
+   1110,
+   6250,
+   1091,
+   13962,
+   1032,
+   1057,
+   1057,
+   1057,
+   1057,
+   33824,
+   1110,
+   47089,
+   1058,
+   1032,
+   1049
+  ],
+  "latency": 2.9149169921875,
+  "cuda_graph_request_count_map": null,
+  "step_count": 40,
+  "top_n_logprobs": null,
+  "prompt_top_n_logprobs": null,
+  "prompt_logprobs": [
+   -2.0310330390930176,
+   -8.008150100708008,
+   -4.907264232635498,
+   -8.383085250854492,
+   -0.9039976000785828,
+   -0.005822602193802595,
+   -3.2968709468841553,
+   -0.11372647434473038,
+   -3.750115156173706,
+   -6.341870307922363,
+   -11.225410461425781,
+   -0.8311297297477722,
+   -1.9895459413528442,
+   -1.2136539220809937,
+   -0.4511846899986267,
+   -1.275371789932251,
+   -6.52569055557251,
+   -0.3268530070781708,
+   -2.488239288330078,
+   -1.1252245903015137,
+   -0.004931548144668341,
+   -1.1413307189941406,
+   -2.4036614894866943,
+   -0.593055784702301,
+   -5.775687217712402,
+   -0.7173333764076233,
+   -6.7589006423950195,
+   -4.472473621368408,
+   -0.28561243414878845,
+   -0.9266374111175537,
+   -1.2420787811279297,
+   -4.94831657409668,
+   -0.4015401303768158,
+   -2.405423879623413,
+   -6.706996440887451,
+   -2.3797435760498047,
+   -6.879988193511963,
+   -0.599727988243103,
+   -4.6161346435546875,
+   -0.016334740445017815,
+   -1.4226453304290771,
+   -4.064138412475586,
+   -8.992555618286133,
+   -0.7892558574676514,
+   -2.565383195877075,
+   -1.6011606454849243,
+   -1.1192784309387207,
+   -1.085118293762207,
+   -1.452021598815918,
+   -0.1256672590970993,
+   -4.310093879699707,
+   -0.039925139397382736,
+   -0.09540079534053802,
+   -4.4552788734436035,
+   -2.6978704929351807,
+   -0.3264457583427429,
+   -0.9057141542434692,
+   -0.2424505054950714,
+   -0.2473771721124649,
+   -0.04457908123731613,
+   -2.5994861125946045,
+   -0.5882505178451538,
+   -2.4292445182800293,
+   -0.1860235333442688,
+   -2.6841845512390137,
+   -5.8617939949035645,
+   -1.7926914691925049,
+   -0.6663980484008789,
+   -0.029983440414071083,
+   -1.0682772397994995,
+   -0.0018566290382295847,
+   -1.9571454524993896,
+   -0.08927226811647415,
+   -4.61471700668335,
+   -0.002604546956717968,
+   -0.2620302140712738,
+   -0.006101197097450495,
+   -7.435886859893799,
+   -0.0376485139131546,
+   -10.174129486083984,
+   -0.9147175550460815,
+   -4.526404857635498,
+   -3.670576572418213,
+   -4.566626071929932,
+   -1.0199782848358154,
+   -0.0006491222884505987,
+   -0.14426420629024506,
+   -0.03322957828640938,
+   -0.0019640696700662374,
+   -0.00022468426323030144,
+   -0.0013444918440654874,
+   -0.0011957883834838867,
+   -0.007926556281745434,
+   -0.011617152951657772,
+   -0.0018109364900738,
+   -0.00017581824795342982,
+   -0.0018969652010127902,
+   -6.282132380874828e-05,
+   -0.0010078833438456059,
+   -0.25652098655700684,
+   -0.35659894347190857,
+   -9.333651541965082e-05,
+   -0.7947311401367188,
+   -1.3594639301300049,
+   -7.962863310240209e-05,
+   -1.861167550086975,
+   -0.5386030673980713,
+   -0.00022075122979003936,
+   -0.001347229932434857,
+   -3.290122185717337e-05,
+   -3.7342543601989746,
+   -0.5175371170043945,
+   -4.488879680633545,
+   -0.007863753475248814,
+   -0.08534510433673859,
+   -0.0009170140838250518,
+   -2.13382354559144e-05,
+   -4.507952690124512,
+   -0.5332688689231873,
+   -0.004296358674764633,
+   -2.062299427052494e-05,
+   -5.2475104331970215,
+   -0.020387964323163033,
+   -0.1661914438009262,
+   -0.0003081085451412946,
+   -15.800027847290039,
+   -8.108964920043945,
+   -0.7285020351409912,
+   -7.803549289703369,
+   -5.010417938232422,
+   -0.263860821723938,
+   -4.3748852476710454e-05,
+   -0.013306032866239548,
+   -0.029512016102671623,
+   -0.0036468682810664177,
+   -0.00023231192608363926,
+   -0.0002379134384682402,
+   -0.0004920940846204758,
+   -0.000873065204359591,
+   -0.0029308719094842672,
+   -0.0006667536217719316,
+   -0.00013672371278516948,
+   -0.0011686407960951328,
+   -4.625213477993384e-05,
+   -0.0007901645149104297,
+   -0.027857612818479538,
+   -0.06313244253396988,
+   -0.00013064485392533243,
+   -0.2378876954317093,
+   -0.6059458255767822,
+   -5.757642793469131e-05,
+   -1.5949885845184326,
+   -1.6001688241958618,
+   -0.00032574593205936253,
+   -0.0016402851324528456,
+   -2.276871418871451e-05,
+   -3.0335943698883057,
+   -0.286937952041626,
+   -6.517683982849121,
+   -3.1465959548950195,
+   -0.7292280793190002,
+   -0.06161583960056305,
+   -0.0014851979212835431,
+   -2.777537883957848e-05,
+   -3.946831226348877,
+   -0.09084996581077576,
+   -0.003532005939632654,
+   -4.029192859889008e-05,
+   -4.555190086364746,
+   -0.011255813762545586,
+   -0.10179147869348526,
+   -0.0004140473320148885,
+   -4.4321393966674805,
+   -2.2296247482299805,
+   -3.2771155834198,
+   -8.323366165161133,
+   -0.02779245562851429,
+   -2.403028964996338,
+   -0.07431145757436752,
+   -0.5372196435928345,
+   -0.05987980589270592,
+   -0.20438668131828308,
+   -0.00013136000779923052,
+   -0.0572563000023365,
+   -0.11035308241844177,
+   -0.012903997674584389,
+   -0.0002406545972917229,
+   -0.0001517419150331989,
+   -0.00036066226311959326,
+   -0.0005477358354255557,
+   -0.00229322025552392,
+   -0.000697846058756113,
+   -0.0001161031104857102,
+   -0.001127441762946546,
+   -3.814624506048858e-05,
+   -0.0005136600811965764,
+   -0.022026309743523598,
+   -0.02361132949590683,
+   -0.0002090712368953973,
+   -0.04913746938109398,
+   -2.7477238178253174,
+   -9.202533692587167e-05,
+   -0.9271803498268127,
+   -1.3856279850006104,
+   -0.0001754606782924384,
+   -0.0012224590172991157,
+   -1.7165990357170813e-05,
+   -1.0239524841308594,
+   -0.020712625235319138,
+   -0.0451514832675457,
+   -1.5345499515533447,
+   -0.0004010588163509965,
+   -0.0004401430196594447,
+   -2.13382354559144e-05,
+   -2.5878491401672363,
+   -0.020529404282569885,
+   -0.00043501926120370626,
+   -2.682172998902388e-05,
+   -0.3827762007713318,
+   -0.00019298121333122253,
+   -0.007158228196203709,
+   -8.618460560683161e-05,
+   -6.015654563903809,
+   -4.037173271179199,
+   -3.4229695796966553,
+   -1.0183475017547607,
+   -1.4963387250900269,
+   -0.33330175280570984,
+   -1.480197787284851,
+   -2.0857536792755127,
+   -2.225975513458252,
+   -5.293066024780273,
+   -0.43916723132133484,
+   -0.00010048838157672435,
+   -0.015328695066273212,
+   -0.13567933440208435,
+   -0.012453177943825722,
+   -0.00017855956684798002,
+   -0.00012778419477399439,
+   -0.0002885640424210578,
+   -0.0004291805380489677,
+   -0.0008485292200930417,
+   -0.0006668727728538215,
+   -8.177422569133341e-05,
+   -0.001060757553204894,
+   -6.151010165922344e-05,
+   -0.0005185451591387391,
+   -0.028113562613725662,
+   -0.03407377377152443,
+   -0.0003861635341309011,
+   -1.1215460300445557,
+   -0.5561063885688782,
+   -0.0001726001501083374,
+   -2.5190887451171875,
+   -0.6141397953033447,
+   -0.0001227780303452164,
+   -0.0012188870459794998,
+   -1.6212332411669195e-05,
+   -6.833529472351074,
+   -6.0156097412109375,
+   -0.03274226188659668,
+   -0.014286145567893982,
+   -0.0009454786195419729,
+   -3.814624506048858e-05,
+   -4.910149097442627,
+   -0.009493326768279076,
+   -0.001437702914699912,
+   -5.876845170860179e-05,
+   -0.3798050582408905,
+   -0.003948037512600422,
+   -0.07855644077062607,
+   -0.00022420754248742014,
+   -6.84205436706543,
+   -0.0015236446633934975,
+   -2.645585298538208,
+   -0.9816564917564392,
+   -1.3786735534667969,
+   -0.7280330061912537,
+   -1.4040117263793945,
+   -9.035655966727063e-05,
+   -0.033023953437805176,
+   -0.3305729031562805,
+   -0.027912795543670654,
+   -0.0002892790944315493,
+   -0.00012182447244413197,
+   -0.00026901919045485556,
+   -0.0004681444843299687,
+   -0.0007345362100750208,
+   -0.0008179179858416319,
+   -0.00010549465514486656,
+   -0.0013330630026757717,
+   -5.7338023907504976e-05,
+   -0.0005571481888182461,
+   -0.013437421061098576,
+   -0.033829718828201294,
+   -0.0004694551753345877,
+   -0.28239941596984863,
+   -1.3776881694793701,
+   -0.00014256415306590497,
+   -1.4336698055267334,
+   -0.9458242654800415,
+   -0.0002739054325502366,
+   -0.0015444743912667036,
+   -2.169585604860913e-05,
+   -5.267784118652344,
+   -2.617713689804077,
+   -0.1205064058303833,
+   -0.000608854868914932,
+   -2.47952248173533e-05,
+   -6.116018772125244,
+   -0.06051409989595413,
+   -0.0021291938610374928,
+   -2.777537883957848e-05,
+   -0.5082104206085205,
+   -0.0008528171456418931,
+   -0.013313560746610165,
+   -9.381330892210826e-05,
+   -6.970278739929199,
+   -0.3628937304019928,
+   -1.40151047706604,
+   -0.8361061811447144,
+   -0.4778183400630951,
+   -2.494100570678711,
+   -0.3126090466976166,
+   -7.66262674331665,
+   -0.3505229353904724,
+   -2.1190404891967773,
+   -0.08990062028169632,
+   -8.201262971851975e-05,
+   -0.01644204556941986,
+   -0.1838725060224533,
+   -0.015538694337010384,
+   -0.00019107422849629074,
+   -7.915183232398704e-05,
+   -0.0001382732152706012,
+   -0.0002119316632160917,
+   -0.0004773192631546408,
+   -0.0004781533498317003,
+   -4.994744449504651e-05,
+   -0.0011807858245447278,
+   -3.0636318115284666e-05,
+   -0.0003046525234822184,
+   -0.0024103655014187098,
+   -0.009829924441874027,
+   -0.00022301571152638644,
+   -0.12844854593276978,
+   -1.1151821613311768,
+   -9.512448741588742e-05,
+   -1.1148451566696167,
+   -0.45424169301986694,
+   -7.128461584215984e-05,
+   -0.001427346607670188,
+   -1.2040065485052764e-05,
+   -3.9783990383148193,
+   -0.025781046599149704,
+   -0.00015496007108595222,
+   -0.003944831434637308,
+   -0.000663894519675523,
+   -3.015949550899677e-05,
+   -0.15718017518520355,
+   -0.0009197533945553005,
+   -0.0007913556764833629,
+   -1.8000440832111053e-05,
+   -0.18712174892425537,
+   -0.00016604475968051702,
+   -0.0022110319696366787,
+   -2.169585604860913e-05,
+   -0.014111850410699844,
+   -1.1920922133867862e-06,
+   -0.00984656810760498,
+   -0.5971966981887817,
+   -2.393812894821167,
+   -0.010224700905382633,
+   -0.009953508153557777,
+   -7.64102369430475e-05,
+   -0.011833352968096733,
+   -0.26886406540870667,
+   -0.023419089615345,
+   -0.00019762947340495884,
+   -6.031808152329177e-05,
+   -0.00010191874753218144,
+   -0.00015889335190877318,
+   -0.0003564914222806692,
+   -0.0004101150552742183,
+   -6.675497570540756e-05,
+   -0.0009184433147311211,
+   -3.158996332786046e-05,
+   -0.00031442465842701495,
+   -0.0027259355410933495,
+   -0.008694176562130451,
+   -0.00032658010604791343,
+   -0.289438933134079,
+   -2.1416351795196533,
+   -0.00017987063620239496,
+   -1.8434972763061523,
+   -1.624247670173645,
+   -0.00022980909852776676,
+   -0.0006792622152715921,
+   -1.0967194612021558e-05,
+   -1.281017541885376,
+   -0.01736496575176716,
+   -1.955749750137329,
+   -1.528749942779541,
+   -2.776960611343384,
+   -0.5374854803085327,
+   -0.00029345019720494747,
+   -2.539125671319198e-05,
+   -3.0065665245056152,
+   -0.0013523490633815527,
+   -0.0007908792467787862,
+   -1.4543427823809907e-05,
+   -0.23400214314460754,
+   -0.0002324311062693596,
+   -0.010042970068752766,
+   -4.088794958079234e-05,
+   -2.1034951210021973,
+   -6.140199184417725,
+   -4.464273929595947,
+   -1.9943883419036865,
+   -0.2878473103046417,
+   -0.05924016237258911,
+   -0.7345774173736572,
+   -0.011171765625476837,
+   -0.0002982171718031168,
+   -0.14330486953258514,
+   -0.0007319155265577137,
+   -0.0003812778159044683,
+   -0.002302616136148572,
+   -0.36087724566459656,
+   -0.08833581954240799,
+   -2.631582260131836,
+   -3.1771137714385986,
+   -0.11841163039207458,
+   -4.482168878894299e-05,
+   -0.014765388332307339,
+   -0.17005765438079834,
+   -0.010167589411139488,
+   -0.00010823617776622996,
+   -3.6477376852417365e-05,
+   -5.936446541454643e-05,
+   -0.00023493390472140163,
+   -0.0003688847064040601,
+   -0.000321336614433676,
+   -4.756337511935271e-05,
+   -0.000902007392141968,
+   -2.9205850296420977e-05,
+   -0.00024423000286333263,
+   -0.000964533886872232,
+   -0.00411722669377923,
+   -0.0002711643755901605,
+   -0.3081328868865967,
+   -0.4985820949077606,
+   -0.00018726025882642716,
+   -1.1391643285751343,
+   -0.27228832244873047,
+   -4.2914423829643056e-05,
+   -0.0012028133496642113,
+   -1.9311717551317997e-05,
+   -1.1735807657241821,
+   -0.07005516439676285,
+   -0.0024717275518924,
+   -8.618460560683161e-05,
+   -0.00016866691294126213,
+   -0.00044764988706447184,
+   -1.6093124941107817e-05,
+   -8.586283683776855,
+   -0.0002851079625543207,
+   -7.490447998046875,
+   -0.09369903802871704,
+   -0.004145600367337465,
+   -0.0008606782066635787,
+   -4.827859811484814e-05,
+   -0.7127438187599182,
+   -0.0003618539194576442,
+   -0.015226203016936779,
+   -6.401333666872233e-05,
+   -3.530060291290283,
+   -0.040570154786109924,
+   -0.7448150515556335,
+   -1.4005241394042969,
+   -0.5872946977615356,
+   -6.073245048522949,
+   -0.9850690364837646,
+   -1.4459205865859985,
+   -0.4346452057361603,
+   -4.452149868011475,
+   -0.3939701318740845,
+   -0.02252959832549095,
+   -9.440929716220126e-05,
+   -0.012161390855908394,
+   -0.25266116857528687,
+   -0.021285664290189743,
+   -0.00015770144818816334,
+   -9.870042413240299e-05,
+   -9.989239333663136e-05,
+   -0.005311425309628248,
+   -0.00032634177478030324,
+   -0.0007045170641504228,
+   -9.417090768693015e-05,
+   -0.001260558608919382,
+   -4.482168878894299e-05,
+   -0.0003833036171272397,
+   -0.0023484050761908293,
+   -0.011129915714263916,
+   -0.00040260792593471706,
+   -0.1819346845149994,
+   -1.1781600713729858,
+   -0.00033241944038309157,
+   -1.3525464534759521,
+   -1.2726483345031738,
+   -0.00018034738604910672,
+   -0.0009054613183252513,
+   -1.2040065485052764e-05,
+   -1.7329559326171875,
+   -0.009877022355794907,
+   -0.030561018735170364,
+   -0.9567705988883972,
+   -0.0002079985715681687,
+   -0.0003582789213396609,
+   -2.5510462364763953e-05,
+   -1.3376575708389282,
+   -0.043758541345596313,
+   -0.0005255748401395977,
+   -0.003921795636415482,
+   -3.9934315282152966e-05,
+   -0.013946342281997204,
+   -0.001447345013730228,
+   -0.09289155900478363,
+   -0.00028975578607060015,
+   -5.025714874267578,
+   -5.600637435913086,
+   -0.8190056681632996,
+   -2.0997657775878906,
+   -1.5471020936965942,
+   -0.2830793261528015,
+   -0.099715456366539,
+   -0.00015341058315243572,
+   -0.09538150578737259,
+   -0.9440865516662598,
+   -0.13964560627937317,
+   -0.0003178806509822607,
+   -0.00015531764074694365,
+   -0.00016640232934150845,
+   -0.00023398046323563904,
+   -0.00039081089198589325,
+   -0.0015487592900171876,
+   -0.00010716341057559475,
+   -0.0017987991450354457,
+   -3.838465272565372e-05,
+   -0.0006412595394067466,
+   -0.00545145571231842,
+   -0.02335585467517376,
+   -0.0004077318590134382,
+   -0.8720157146453857,
+   -0.10373511165380478,
+   -0.00014077626110520214,
+   -0.5180479884147644,
+   -0.17388182878494263,
+   -0.00015746307326480746,
+   -0.0043711354956030846,
+   -2.9801878554280847e-05,
+   -2.0693466663360596,
+   -0.007648942526429892,
+   -2.8729025871143676e-05,
+   -0.0003301552205812186,
+   -0.000542612629942596,
+   -3.2543604902457446e-05,
+   -0.27388375997543335,
+   -0.00043752157944254577,
+   -0.0005888396990485489,
+   -1.7762025890988298e-05,
+   -0.05423494055867195,
+   -7.915183232398704e-05,
+   -0.002435457892715931,
+   -1.1205610462639015e-05,
+   -0.01761529967188835,
+   -7.152555099310121e-07,
+   -0.005352570675313473,
+   -0.1280955821275711,
+   -2.3187625408172607,
+   -0.009216856211423874,
+   -0.008558499626815319,
+   -0.0001072826053132303,
+   -0.04680917039513588,
+   -0.5660229325294495,
+   -0.04951385408639908,
+   -0.0002015625941567123,
+   -5.8410845667822286e-05,
+   -9.440929716220126e-05,
+   -0.00014828535495325923,
+   -0.00037245964631438255,
+   -0.0008362610242329538,
+   -5.4596363042946905e-05,
+   -0.0010970771545544267,
+   -4.017272294731811e-05,
+   -0.0004563482361845672,
+   -0.0021864098962396383,
+   -0.012597862631082535,
+   -0.00036435641231946647,
+   -0.07823580503463745,
+   -1.1245288848876953,
+   -0.0001472126314183697,
+   -2.1236472129821777,
+   -0.25363627076148987,
+   -0.00011646069469861686,
+   -0.0010031197452917695,
+   -1.4662635294371285e-05,
+   -11.853788375854492,
+   -1.5205868482589722,
+   -0.0017375147435814142,
+   -0.00013374387344811112,
+   -7.155948638916016,
+   -3.82474422454834,
+   -1.2793458700180054,
+   -0.03748536482453346,
+   -0.005961020477116108,
+   -5.829164365422912e-05,
+   -3.1456170082092285,
+   -0.03318829461932182,
+   -0.008591356687247753,
+   -0.027652040123939514,
+   -0.00012885693286079913,
+   -1.5415722131729126,
+   -0.979039192199707,
+   -2.842726469039917,
+   -9.05957317352295,
+   -2.8234424591064453,
+   -0.8373243808746338,
+   -0.4019332230091095,
+   -0.0004048719711136073,
+   -0.03923225402832031,
+   -0.4254666864871979,
+   -0.027653662487864494,
+   -0.0003177614707965404,
+   -0.0001967951684491709,
+   -0.00020883286197204143,
+   -0.00025674383505247533,
+   -0.0008311392739415169,
+   -0.0012284121476113796,
+   -0.00010787858627736568,
+   -0.0024356956128031015,
+   -6.258291978156194e-05,
+   -0.00048565989709459245,
+   -0.0021678535267710686,
+   -0.012607751414179802,
+   -0.00023588736075907946,
+   -0.11036524921655655,
+   -0.5750182867050171,
+   -0.00017176583060063422,
+   -1.9862632751464844,
+   -1.2351702451705933,
+   -0.00037520044133998454,
+   -0.0013566347770392895,
+   -2.5152843591058627e-05,
+   -2.1086387634277344,
+   -7.917232990264893,
+   -0.05708145350217819,
+   -0.06208256632089615,
+   -0.000644237850792706,
+   -8.308542601298541e-05,
+   -5.1276655197143555,
+   -0.16815905272960663,
+   -0.0012461524456739426,
+   -5.94836674281396e-05,
+   -3.559391736984253,
+   -5.411561965942383,
+   -0.022293083369731903,
+   -0.0005644158809445798,
+   -0.017552750185132027,
+   -0.00038842763751745224,
+   -1.8479862213134766,
+   -0.004095145035535097,
+   -11.830594062805176,
+   -0.4279360771179199,
+   -3.7062158584594727,
+   -2.9457836151123047,
+   -1.9491567611694336,
+   -0.06489256024360657,
+   -0.00013660451804753393,
+   -0.012157151475548744,
+   -0.22074609994888306,
+   -0.021073833107948303,
+   -0.00021300431399140507,
+   -0.00017593742813915014,
+   -0.00023672162205912173,
+   -0.0003091811086051166,
+   -0.0014552014181390405,
+   -0.0013881819322705269,
+   -0.00015245705435518175,
+   -0.002331279218196869,
+   -5.4238757002167404e-05,
+   -0.000668659748043865,
+   -0.002430463209748268,
+   -0.016187194734811783,
+   -0.0002441108226776123,
+   -1.4263010025024414,
+   -0.30179885029792786,
+   -0.0001770101225702092,
+   -0.5045080184936523,
+   -0.07310019433498383,
+   -8.022463589441031e-05,
+   -0.002168329432606697,
+   -2.3841574147809297e-05,
+   -1.7808306217193604,
+   -0.02828705683350563,
+   -6.115249561844394e-05,
+   -0.0008904544520191848,
+   -0.0005335576133802533,
+   -3.957670196541585e-05,
+   -0.03801318258047104,
+   -0.0003077510336879641,
+   -0.0005035324720665812,
+   -2.169585604860913e-05,
+   -0.02271897904574871,
+   -3.1709168979432434e-05,
+   -0.0018041539005935192,
+   -1.8358061424805783e-05,
+   -0.005899516865611076,
+   -1.1920922133867862e-06,
+   -0.002030455507338047,
+   -0.27544423937797546,
+   -1.1146715879440308,
+   -0.012286689132452011,
+   -0.004974251613020897,
+   -6.389413465512916e-05,
+   -0.010529793798923492,
+   -0.2302529364824295,
+   -0.015527778305113316,
+   -0.00019524575327523053,
+   -6.389413465512916e-05,
+   -0.00013815402053296566,
+   -0.00018165845540352166,
+   -0.0005564333405345678,
+   -0.000959531927946955,
+   -6.151010165922344e-05,
+   -0.001416394836269319,
+   -5.531158240046352e-05,
+   -0.00035363141796551645,
+   -0.0010683787986636162,
+   -0.012577733024954796,
+   -0.00023934361524879932,
+   -0.06311207264661789,
+   -0.972044050693512,
+   -0.00019929806876461953,
+   -1.6224243640899658,
+   -0.8333836197853088,
+   -0.00016592556494288146,
+   -0.0008984343148767948,
+   -1.6927575416048057e-05,
+   -0.8844207525253296,
+   -0.023736946284770966,
+   -4.01811408996582,
+   -1.6215615272521973,
+   -0.33087965846061707,
+   -0.0035197706893086433,
+   -0.00024148885859176517,
+   -3.0874729418428615e-05,
+   -3.097301721572876,
+   -0.030017103999853134,
+   -0.0006585336523130536,
+   -1.9430925021879375e-05,
+   -0.49424058198928833,
+   -0.0001401803019689396,
+   -0.00554167665541172,
+   -1.9073304429184645e-05,
+   -0.5312279462814331,
+   -5.748266220092773,
+   -11.324613571166992,
+   -1.1340491771697998,
+   -0.16082678735256195,
+   -0.8938052654266357,
+   -3.726792335510254,
+   -0.8781039714813232,
+   -0.00017355366435367614,
+   -0.009945128113031387,
+   -0.18626560270786285,
+   -0.013042616657912731,
+   -0.00010859376925509423,
+   -7.199982064776123e-05,
+   -0.00010871296399272978,
+   -0.00017796363681554794,
+   -0.00034767304896377027,
+   -0.0006170752458274364,
+   -3.0636318115284666e-05,
+   -0.001077071763575077,
+   -4.076874756719917e-05,
+   -0.00024029705673456192,
+   -0.000982159748673439,
+   -0.02636047638952732,
+   -0.00021920185827184469,
+   -0.632880687713623,
+   -0.06617539376020432,
+   -0.00016318420239258558,
+   -0.4156720042228699,
+   -0.034620899707078934,
+   -5.6622808187967166e-05,
+   -0.0011695933062583208,
+   -1.597391747054644e-05,
+   -10.639490127563477,
+   -0.24528348445892334,
+   -0.06833283603191376,
+   -0.0033608165103942156,
+   -0.02616957761347294,
+   -0.00036054308293387294,
+   -3.099393507000059e-05,
+   -4.044595241546631,
+   -2.188387393951416,
+   -0.32720163464546204,
+   -0.00974209699779749,
+   -0.0011126763420179486,
+   -3.302042750874534e-05,
+   -0.19868847727775574,
+   -7.56950321374461e-05,
+   -0.005233398173004389,
+   -3.158996332786046e-05,
+   -1.839617371559143,
+   -0.17654305696487427,
+   -0.7875567078590393,
+   -2.1537787914276123,
+   -0.3631034195423126,
+   -0.9216613173484802,
+   -2.0036990642547607,
+   -0.09243497252464294,
+   -0.00010740180005086586,
+   -0.018314307555556297,
+   -0.208140030503273,
+   -0.01576320081949234,
+   -0.00013136000779923052,
+   -7.390703103737906e-05,
+   -0.00011264643399044871,
+   -0.00017045476124621928,
+   -0.0005171154043637216,
+   -0.0005422552349045873,
+   -3.349725011503324e-05,
+   -0.0013309201458469033,
+   -4.255681051290594e-05,
+   -0.00023767507809679955,
+   -0.001095648156479001,
+   -0.14277544617652893,
+   -0.00021371940965764225,
+   -0.00032217081752605736,
+   -0.35286909341812134,
+   -0.0002668739762157202,
+   -1.7962173223495483,
+   -0.07211553305387497,
+   -7.974783511599526e-05,
+   -0.000621959799900651,
+   -1.2874520507466514e-05,
+   -1.9048426151275635,
+   -0.022713735699653625,
+   -3.9457496313843876e-05,
+   -0.0005820487276650965,
+   -0.0002401778765488416,
+   -3.325883881188929e-05,
+   -0.02081700973212719,
+   -0.00022492263815365732,
+   -0.0003299168893136084,
+   -2.038458114839159e-05,
+   -0.008293120190501213,
+   -1.7404405298293568e-05,
+   -0.0012493670219555497,
+   -1.4424220353248529e-05,
+   -0.0041636452078819275,
+   -8.344646857949556e-07,
+   -0.0020267677027732134,
+   -0.13429519534111023,
+   -1.9221405982971191,
+   -0.0093602379783988,
+   -0.005981876514852047,
+   -5.817244164063595e-05,
+   -0.019257837906479836,
+   -0.27827900648117065,
+   -0.01921457052230835,
+   -0.0001652104256208986,
+   -8.546940807718784e-05,
+   -0.0001510267611593008,
+   -0.00016366096679121256,
+   -0.0002616301644593477,
+   -0.0005458295345306396,
+   -3.480850500636734e-05,
+   -0.0010807631770148873,
+   -3.7431014789035544e-05,
+   -0.0003626880934461951,
+   -0.0010880271438509226,
+   -0.6327179670333862,
+   -0.0002374367177253589,
+   -0.020488178357481956,
+   -0.10384052991867065,
+   -0.0001971527235582471,
+   -0.16368740797042847,
+   -0.026392173022031784,
+   -0.00012170527770649642,
+   -0.0025978884659707546,
+   -1.9430925021879375e-05,
+   -7.9701642990112305,
+   -1.6003714799880981,
+   -0.2391909956932068,
+   -0.000502817565575242,
+   -4.9232225137529895e-05,
+   -4.135532855987549,
+   -0.06158669665455818,
+   -0.00044371772673912346,
+   -3.755022044060752e-05,
+   -0.18109248578548431,
+   -0.00010883215873036534,
+   -0.006367869209498167,
+   -7.748303323751315e-05,
+   -5.440160751342773,
+   -5.081888198852539,
+   -0.19470839202404022,
+   -2.9904420375823975,
+   -2.4235076904296875,
+   -0.032352350652217865,
+   -0.00044907975825481117,
+   -0.04121795669198036,
+   -0.43260514736175537,
+   -0.04605478420853615,
+   -0.00023982033599168062,
+   -0.0003178806509822607,
+   -0.00017188502533826977,
+   -0.00022468426323030144,
+   -0.0003400462737772614,
+   -0.0010152667528018355,
+   -0.00011729506513802335,
+   -0.001335324952378869,
+   -4.8874615458771586e-05,
+   -0.001257463125512004,
+   -0.004097400698810816,
+   -0.0008996253600344062,
+   -0.0002967870968859643,
+   -0.15579743683338165,
+   -1.3731565475463867,
+   -0.00023183519078884274,
+   -2.0089190006256104,
+   -3.441042423248291,
+   -0.0006145734223537147,
+   -0.0012832987122237682,
+   -1.9550132492440753e-05,
+   -1.731110692024231,
+   -0.027068600058555603,
+   -2.8266828060150146,
+   -0.35935577750205994,
+   -0.023644626140594482,
+   -0.0005504761938937008,
+   -0.00017951308109331876,
+   -2.396077979938127e-05,
+   -2.3206820487976074,
+   -0.003744971938431263,
+   -0.000205018965061754,
+   -2.288792165927589e-05,
+   -0.08958229422569275,
+   -6.592056161025539e-05,
+   -0.0021721357479691505,
+   -3.0397906812140718e-05,
+   -4.5939412117004395,
+   -8.534799575805664,
+   -3.483549118041992,
+   -1.681600570678711,
+   -0.7201917767524719,
+   -0.530266284942627,
+   -0.7154921293258667,
+   -2.835704803466797,
+   -0.0004451475979294628,
+   -0.02453603409230709,
+   -0.31538400053977966,
+   -0.0156102878972888,
+   -0.00013124081306159496,
+   -8.892617915989831e-05,
+   -9.738924563862383e-05,
+   -0.0011036264477297664,
+   -0.00030357998912222683,
+   -0.0010406322544440627,
+   -6.0437283536884934e-05,
+   -0.0014225849881768227,
+   -3.671578815556131e-05,
+   -0.00044705410255119205,
+   -0.005232923664152622,
+   -0.0001565095444675535,
+   -0.0003033416287507862,
+   -0.18575794994831085,
+   -0.14061033725738525,
+   -0.0002706876548472792,
+   -0.5223819017410278,
+   -0.035896092653274536,
+   -5.4834770708112046e-05,
+   -0.0012011463986709714,
+   -1.6569954823353328e-05,
+   -1.681032657623291,
+   -0.011652856133878231,
+   -1.6569954823353328e-05,
+   -0.00047469791024923325,
+   -0.000256982195423916,
+   -3.361645576660521e-05,
+   -0.01372707262635231,
+   -0.00014852374442853034,
+   -0.00046695294440723956,
+   -2.288792165927589e-05,
+   -0.0034659572411328554,
+   -1.3708974620385561e-05,
+   -0.0015382850542664528,
+   -8.702239938429557e-06,
+   -0.003346678102388978,
+   -7.152555099310121e-07,
+   -0.000867467257194221,
+   -0.02539108693599701,
+   -1.0509589910507202,
+   -0.002976156771183014,
+   -0.005069141276180744,
+   -5.590759246842936e-05,
+   -0.015196850523352623,
+   -0.3093729317188263,
+   -0.02090352028608322,
+   -0.00013958434283267707,
+   -6.460934673668817e-05,
+   -8.296622399939224e-05,
+   -0.0004457433824427426,
+   -0.0005041282274760306,
+   -0.0011976935202255845,
+   -4.2914423829643056e-05,
+   -0.0011085085570812225,
+   -4.160317621426657e-05,
+   -0.0005018643569201231,
+   -0.004558410029858351,
+   -9.476689592702314e-05,
+   -0.00037269797758199275,
+   -0.11347992718219757,
+   -0.450020968914032,
+   -0.0003301552205812186,
+   -2.8804092407226562,
+   -0.15156973898410797,
+   -6.246371776796877e-05,
+   -0.000683074293192476,
+   -1.3947389561508317e-05,
+   -2.0683939456939697,
+   -0.02846144698560238,
+   -0.04469490796327591,
+   -1.889275074005127,
+   -0.0001255195093108341,
+   -0.00011228884250158444,
+   -2.4914430468925275e-05,
+   -7.980701446533203,
+   -0.39261865615844727,
+   -1.6454169750213623,
+   -0.0018256916664540768,
+   -0.0003761537664104253,
+   -2.5987286790041253e-05,
+   -0.27152737975120544,
+   -3.8742269680369645e-05,
+   -0.002314033918082714,
+   -5.364274329622276e-05,
+   -5.172288417816162,
+   -0.007181781344115734,
+   -0.8884671330451965,
+   -0.20681926608085632,
+   -1.529428243637085,
+   -2.335056781768799,
+   -0.02583100087940693,
+   -1.8960939645767212,
+   -0.257112592458725,
+   -0.1720065474510193,
+   -8.284702198579907e-05,
+   -0.011070851236581802,
+   -0.16333311796188354,
+   -0.01678428426384926,
+   -0.00010024998482549563,
+   -4.911301948595792e-05,
+   -6.41325386823155e-05,
+   -0.0003518439189065248,
+   -0.0003983181086368859,
+   -0.0007211944903247058,
+   -2.253030106658116e-05,
+   -0.0009076051646843553,
+   -2.884823152271565e-05,
+   -0.00033682872890494764,
+   -0.01127432007342577,
+   -5.113947918289341e-05,
+   -0.0003095386200584471,
+   -0.162703275680542,
+   -0.12824533879756927,
+   -0.0002037079248111695,
+   -0.5378345251083374,
+   -0.013359789736568928,
+   -4.625213477993384e-05,
+   -0.0007819455349817872,
+   -1.2993727978027891e-05,
+   -1.4531102180480957,
+   -0.9376159310340881,
+   -0.02013481967151165,
+   -3.182837463100441e-05,
+   -0.00028391621890477836,
+   -0.0002040654799202457,
+   -1.6212332411669195e-05,
+   -6.006290435791016,
+   -0.23482508957386017,
+   -0.0003094194398727268,
+   -3.2066785934148356e-05,
+   -0.2894707918167114,
+   -0.00010334911348763853,
+   -0.003178308717906475,
+   -4.8397800128441304e-05,
+   -3.3541419506073,
+   -5.274465084075928,
+   -2.3055055141448975,
+   -1.0987294912338257,
+   -0.019666209816932678,
+   -0.00022790218645241112,
+   -0.016233760863542557,
+   -0.2816391885280609,
+   -0.028503969311714172,
+   -0.0001358893496217206,
+   -0.00010394509445177391,
+   -8.856858039507642e-05,
+   -0.00036137725692242384,
+   -0.00029452278977259994,
+   -0.0008922410197556019,
+   -2.539125671319198e-05,
+   -0.0011102947173640132,
+   -3.40932747349143e-05,
+   -0.0004843492351938039,
+   -0.006350101437419653,
+   -5.9602869441732764e-05,
+   -0.0002796259068418294,
+   -0.3986394703388214,
+   -0.10029242187738419,
+   -0.00024196557933464646,
+   -1.9691603183746338,
+   -0.7402586936950684,
+   -7.056941103655845e-05,
+   -0.0003618539194576442,
+   -1.0371154530730564e-05,
+   -1.4170231819152832,
+   -0.008172051049768925,
+   -1.3708974620385561e-05,
+   -0.00041607304592616856,
+   -0.00014888131408952177,
+   -2.6464111215318553e-05,
+   -0.018121162429451942,
+   -0.00010764019680209458,
+   -0.0002335037279408425,
+   -2.3007127310847864e-05,
+   -0.002049014437943697,
+   -1.0609570381348021e-05,
+   -0.0011868583969771862,
+   -7.867782187531702e-06,
+   -0.0018794744974002242,
+   -5.960462772236497e-07,
+   -0.0007434703293256462,
+   -0.02911354973912239,
+   -1.7920753955841064,
+   -0.0026135831139981747,
+   -0.00308870617300272,
+   -3.659658250398934e-05,
+   -0.010810147039592266,
+   -0.20098412036895752,
+   -0.01644638366997242,
+   -0.00013207517622504383,
+   -6.854299135738984e-05,
+   -7.152301259338856e-05,
+   -0.00024720950750634074,
+   -0.00033468366018496454,
+   -0.0010001424234360456,
+   -5.054346183896996e-05,
+   -0.0009557208395563066,
+   -3.981510963058099e-05,
+   -0.0004465774691198021,
+   -0.011578621342778206,
+   -7.211902266135439e-05,
+   -0.0002416080387774855,
+   -0.09539440274238586,
+   -0.057392168790102005,
+   -0.0002840353990904987,
+   -0.21088920533657074,
+   -0.0078902468085289,
+   -8.606540359323844e-05,
+   -0.0007384672062471509,
+   -1.3589766240329482e-05,
+   -0.8148440718650818,
+   -0.025661379098892212,
+   -2.113894462585449,
+   -0.01820814050734043,
+   -0.0010720703285187483,
+   -0.0002908283786382526,
+   -0.00011181206355104223,
+   -1.9550132492440753e-05,
+   -1.9963352680206299,
+   -0.011685965582728386,
+   -0.00010299152199877426,
+   -1.6093124941107817e-05,
+   -0.3427979350090027,
+   -0.00010358751023886725,
+   -0.002419165801256895,
+   -5.07818695041351e-05,
+   -9.356146812438965,
+   -2.63590145111084,
+   -0.0489899143576622,
+   -0.429649293422699,
+   -2.441277027130127,
+   -0.09116854518651962,
+   -1.7202471494674683,
+   -1.2776923179626465,
+   -1.2828468084335327,
+   -0.1033272072672844,
+   -0.013413426466286182,
+   -0.00016091958968900144,
+   -0.006314327474683523,
+   -0.1650361269712448,
+   -0.009155434556305408,
+   -8.630380034446716e-05,
+   -6.007967749610543e-05,
+   -6.210611172718927e-05,
+   -0.00027497802511788905,
+   -0.0005628670332953334,
+   -0.0008046964649111032,
+   -4.160317621426657e-05,
+   -0.0009633429581299424,
+   -2.9444261599564925e-05,
+   -0.0003147821989841759,
+   -0.003070523263886571,
+   -3.969590397900902e-05,
+   -0.00025340684805996716,
+   -0.16765674948692322,
+   -0.220333993434906,
+   -0.00025281094713136554,
+   -1.6686129570007324,
+   -0.08651255071163177,
+   -7.4741430580616e-05,
+   -0.00032062159152701497,
+   -9.536697689327411e-06,
+   -8.607754707336426,
+   -2.7989468574523926,
+   -0.006830438040196896,
+   -0.00042500998824834824,
+   -4.410646579344757e-05,
+   -2.2325727939605713,
+   -0.09642884135246277,
+   -0.0005049622268415987,
+   -1.4662635294371285e-05,
+   -3.892613172531128,
+   -0.0008376903715543449,
+   -0.004279621876776218,
+   -5.745722592109814e-05,
+   -2.696786642074585,
+   -0.44925373792648315,
+   -0.37875908613204956,
+   -0.27114248275756836,
+   -1.023728609085083,
+   -4.712882995605469,
+   -1.415423035621643,
+   -2.8054561614990234,
+   -0.4460236430168152,
+   -0.0005779979983344674,
+   -0.02468189038336277,
+   -0.30965328216552734,
+   -0.02052520029246807,
+   -0.00012730741582345217,
+   -9.619726915843785e-05,
+   -8.749579137656838e-05,
+   -0.000350175570929423,
+   -0.0003150205302517861,
+   -0.0007310817018151283,
+   -3.0636318115284666e-05,
+   -0.0011643542675301433,
+   -3.2305197237292305e-05,
+   -0.00026913834153674543,
+   -0.011463016271591187,
+   -5.411955135059543e-05,
+   -0.00023231192608363926,
+   -0.1063343733549118,
+   -0.037034809589385986,
+   -0.0001248043408850208,
+   -0.3663400411605835,
+   -0.01425135973840952,
+   -5.376194530981593e-05,
+   -0.000933926145080477,
+   -1.4305012882687151e-05,
+   -1.5244930982589722,
+   -0.008558854460716248,
+   -1.8358061424805783e-05,
+   -0.0002698534226510674,
+   -0.00022075122979003936,
+   -3.576214658096433e-05,
+   -0.01590365171432495,
+   -0.00012706902634818107,
+   -0.0002901133266277611,
+   -2.2649508537142538e-05,
+   -0.0032194233499467373,
+   -1.1920858014491387e-05,
+   -0.0013312773080542684,
+   -8.22540732769994e-06,
+   -0.001732040662318468,
+   -4.768370445162873e-07,
+   -0.0007115454645827413,
+   -0.11607333272695541,
+   -5.158000946044922,
+   -0.00630958890542388,
+   -0.006455875933170319,
+   -3.886147169396281e-05,
+   -0.007113605737686157,
+   -0.16176439821720123,
+   -0.01025608740746975,
+   -9.321732068201527e-05,
+   -5.435795901576057e-05,
+   -7.70062324590981e-05,
+   -0.0002002515539061278,
+   -0.0003270567976869643,
+   -0.0011002921964973211,
+   -3.93382906622719e-05,
+   -0.0009735850035212934,
+   -4.076874756719917e-05,
+   -0.00036042393185198307,
+   -0.011448992416262627,
+   -0.00010787858627736568,
+   -0.00022289653134066612,
+   -0.12719827890396118,
+   -0.16689445078372955,
+   -0.00029869386344216764,
+   -1.129071831703186,
+   -0.46998509764671326,
+   -0.0001429217227268964,
+   -0.0004334702098276466,
+   -1.823885577323381e-05,
+   -7.808990478515625,
+   -0.6958405375480652,
+   -0.0011538759572431445,
+   -0.00010084597306558862,
+   -2.1815061700181104e-05,
+   -3.412889242172241,
+   -0.0024302254896610975,
+   -0.1256120651960373,
+   -0.0001486429391661659,
+   -2.932505594799295e-05,
+   -0.016119161620736122,
+   -2.1219027985353023e-05,
+   -0.0014936492079868913,
+   -6.794906312279636e-06,
+   -4.649867057800293,
+   -0.42487168312072754,
+   -1.3419163227081299,
+   -0.3015914857387543,
+   -0.00015341058315243572,
+   -0.0032649326603859663,
+   -0.11564143747091293,
+   -0.00739337969571352,
+   -5.8887653722194955e-05,
+   -6.615896563744172e-05,
+   -5.972207145532593e-05,
+   -0.00020644917094614357,
+   -0.000301673193462193,
+   -0.0003761537664104253,
+   -2.6702524337451905e-05,
+   -0.0008094609947875142,
+   -3.2305197237292305e-05,
+   -0.0002474478678777814,
+   -0.018454870209097862,
+   -7.73638384998776e-05,
+   -0.00022837892174720764,
+   -0.04869883507490158,
+   -0.02372216247022152,
+   -0.0002051381452474743,
+   -0.15266406536102295,
+   -0.0037327392492443323,
+   -7.557583012385294e-05,
+   -0.0005665604257956147,
+   -1.4662635294371285e-05,
+   -2.1065256595611572,
+   -0.02570541389286518,
+   -2.0099081993103027,
+   -2.7118430137634277,
+   -0.1484161764383316,
+   -0.007964756339788437,
+   -0.00016342257731594145,
+   -1.597391747054644e-05,
+   -0.8920754194259644,
+   -0.0009690594743005931,
+   -0.00029023250681348145,
+   -1.2993727978027891e-05,
+   -0.07993864268064499,
+   -5.400034933700226e-05,
+   -0.00158791767898947,
+   -1.0609570381348021e-05,
+   -4.331461429595947,
+   -6.81968355178833,
+   -3.366002082824707,
+   -1.850673794746399,
+   -0.00040391870425082743,
+   -0.04611193388700485,
+   -0.06791424006223679,
+   -0.004945189692080021,
+   -9.107174992095679e-05,
+   -7.557583012385294e-05,
+   -6.747018051100895e-05,
+   -0.00024399164249189198,
+   -0.000321336614433676,
+   -0.0006528153317049146,
+   -3.2782016205601394e-05,
+   -0.0012151960982009768,
+   -3.957670196541585e-05,
+   -0.0002205128694185987,
+   -0.016214992851018906,
+   -0.00019095504831057042,
+   -0.0001456631434848532,
+   -7.712543447269127e-05,
+   -0.33043625950813293,
+   -0.00017629499780014157,
+   -2.590480089187622,
+   -0.16181793808937073,
+   -0.00011646069469861686,
+   -0.0006735440110787749,
+   -2.109982233378105e-05,
+   -1.6486821174621582,
+   -0.01151864044368267,
+   -1.8954096958623268e-05,
+   -0.0003233625029679388,
+   -0.00020644917094614357,
+   -3.111314072157256e-05,
+   -0.017416512593626976,
+   -0.00012766500003635883,
+   -0.0003415954706724733,
+   -2.13382354559144e-05,
+   -0.006446637213230133,
+   -1.823885577323381e-05,
+   -0.0012438902631402016,
+   -1.1205610462639015e-05,
+   -0.006591127719730139,
+   -7.152555099310121e-07,
+   -0.0017049076268449426,
+   -0.13135236501693726,
+   -3.228759288787842,
+   -0.002643782878294587,
+   -0.004842340014874935,
+   -3.480850500636734e-05,
+   -0.010503842495381832,
+   -0.16338221728801727,
+   -0.011769498698413372,
+   -0.00011574551899684593,
+   -9.727005090098828e-05,
+   -8.582700684200972e-05,
+   -0.0004538459761533886,
+   -0.00020740265608765185,
+   -0.001342587056569755,
+   -8.964136941358447e-05,
+   -0.0014018717920407653,
+   -4.935142715112306e-05,
+   -0.0006431656656786799,
+   -0.5765135288238525,
+   -0.0009291622554883361,
+   -0.00027998341829515994,
+   -0.008964410983026028,
+   -0.03303813934326172,
+   -0.00018451895448379219,
+   -0.07687719166278839,
+   -0.00454594986513257,
+   -0.00018439977429807186,
+   -0.0023830130230635405,
+   -2.706014311115723e-05,
+   -1.8103313446044922,
+   -0.7522969245910645,
+   -0.022507335990667343,
+   -2.074220174108632e-05,
+   -0.00026222606538794935,
+   -0.00020740265608765185,
+   -2.706014311115723e-05,
+   -3.700786590576172,
+   -0.26737019419670105,
+   -9.357491217087954e-05,
+   -6.031808152329177e-05,
+   -0.13705354928970337,
+   -2.407998726994265e-05,
+   -0.003684044349938631,
+   -3.2782016205601394e-05,
+   -2.9476141929626465,
+   -1.1526018381118774,
+   -2.6757259368896484,
+   -5.31315279006958,
+   -0.7695194482803345,
+   -0.00014876213390380144,
+   -0.8328413963317871,
+   -5.100983142852783,
+   -0.1275785118341446,
+   -0.008235306479036808,
+   -0.00037281715776771307,
+   -0.02394961006939411,
+   -0.5179875493049622,
+   -0.04619366303086281,
+   -0.00021705655672121793,
+   -0.00021765247220173478,
+   -0.0001461399078834802,
+   -0.0007413261337205768,
+   -0.0006660388899035752,
+   -0.0015581621555611491,
+   -6.8662193370983e-05,
+   -0.002233869396150112,
+   -4.494089080253616e-05,
+   -0.0006101653561927378,
+   -0.0006289887824095786,
+   -0.0033358661457896233,
+   -0.00045074793160893023,
+   -0.15180595219135284,
+   -0.07985830307006836,
+   -0.00015937011630740017,
+   -2.2477855682373047,
+   -0.4471043348312378,
+   -0.0001734344696160406,
+   -0.0006040894077159464,
+   -1.680836794548668e-05,
+   -2.318458080291748,
+   -0.01888836920261383,
+   -0.029085876420140266,
+   -1.1253407001495361,
+   -0.00021741411183029413,
+   -0.00012003655137959868,
+   -2.8013790142722428e-05,
+   -3.1507949829101562,
+   -0.005721264518797398,
+   -0.00040904260822571814,
+   -1.7881233361549675e-05,
+   -0.04304421693086624,
+   -0.0001591317413840443,
+   -0.005429995711892843,
+   -3.242440288886428e-05,
+   -4.896542549133301,
+   -3.2877321243286133,
+   -0.17550288140773773,
+   -8.526089668273926,
+   -0.2559642493724823,
+   -0.00015770144818816334,
+   -0.004955509677529335,
+   -0.20714037120342255,
+   -0.023553114384412766,
+   -0.00015496007108595222,
+   -0.0001134808044298552,
+   -9.250213042832911e-05,
+   -0.000288087350782007,
+   -0.0004409771354403347,
+   -0.0007110689766705036,
+   -4.6132929128361866e-05,
+   -0.0009153467253781855,
+   -3.433168603805825e-05,
+   -0.00015484087634831667,
+   -0.0001292145170737058,
+   -0.0022287548054009676,
+   -0.0002269487304147333,
+   -0.11395295709371567,
+   -0.05913611873984337,
+   -8.356221951544285e-05,
+   -0.4039720594882965,
+   -0.019538793712854385,
+   -5.924526340095326e-05,
+   -0.0007176207727752626,
+   -1.7881233361549675e-05,
+   -1.6992816925048828,
+   -0.004352619871497154,
+   -6.6756979322235566e-06,
+   -0.00017093151109293103,
+   -0.0001284993631998077,
+   -3.3378044463461265e-05,
+   -0.013412484899163246,
+   -8.713819261174649e-05,
+   -0.0004928089329041541,
+   -2.288792165927589e-05,
+   -0.0012643685331568122,
+   -1.3351351299206726e-05,
+   -0.0019104102393612266,
+   -8.940656698541716e-06,
+   -0.0033124599140137434,
+   -4.768370445162873e-07,
+   -0.0009848987683653831,
+   -0.07256874442100525,
+   -1.7665941715240479,
+   -0.00281461956910789,
+   -0.0027610058896243572,
+   -2.9682672902708873e-05,
+   -0.0075036585330963135,
+   -0.16648568212985992,
+   -0.014109030365943909,
+   -9.63164638960734e-05,
+   -6.603976362384856e-05,
+   -7.331102824537084e-05,
+   -0.0003323002893012017,
+   -0.00042083943844772875,
+   -0.0010620674584060907,
+   -2.8609820219571702e-05,
+   -0.000990257947705686,
+   -4.029192859889008e-05,
+   -0.0001541257370263338,
+   -0.0001658063702052459,
+   -0.0010433712741360068,
+   -0.0002379134384682402,
+   -0.08282912522554398,
+   -0.1620505303144455,
+   -0.0001578206429257989,
+   -1.9873682260513306,
+   -0.03700195625424385,
+   -8.594620157964528e-05,
+   -0.00035232058144174516,
+   -2.90866428258596e-05,
+   -1.0645859241485596,
+   -0.012771833688020706,
+   -1.8788448572158813,
+   -0.04745874181389809,
+   -0.0029150634072721004,
+   -0.0002858230145648122,
+   -8.082063141046092e-05,
+   -2.8729025871143676e-05,
+   -4.2793378829956055,
+   -0.008196880109608173,
+   -9.822363062994555e-05,
+   -4.9470632802695036e-05,
+   -5.399019241333008,
+   -0.0015862513100728393,
+   -0.0018035589018836617,
+   -2.9444261599564925e-05,
+   -3.8089842796325684,
+   -1.3950530290603638,
+   -0.17507919669151306,
+   -4.1786346435546875,
+   -9.410017013549805,
+   -0.00014709345123264939,
+   -2.16685152053833,
+   -0.5008745193481445,
+   -0.013433892279863358,
+   -0.00029976642690598965,
+   -0.006172403693199158,
+   -0.22438427805900574,
+   -0.015963135287165642,
+   -0.00010489867418073118,
+   -7.426462980220094e-05,
+   -6.890059739816934e-05,
+   -0.0002874914789572358,
+   -0.0004033228906337172,
+   -0.0006624649395234883,
+   -3.802703940891661e-05,
+   -0.001104817260056734,
+   -2.8967437174287625e-05,
+   -0.000125281119835563,
+   -0.00011634149996098131,
+   -0.0016071987338364124,
+   -0.0001752223033690825,
+   -0.04927569255232811,
+   -0.03999283164739609,
+   -8.427741704508662e-05,
+   -0.11036300659179688,
+   -0.0022922686766833067,
+   -5.125868119648658e-05,
+   -0.0007711059297434986,
+   -1.6569954823353328e-05,
+   -1.1996040344238281,
+   -6.017496585845947,
+   -3.3771719932556152,
+   -0.0015197168104350567,
+   -0.0001720042055239901,
+   -8.05822346592322e-05,
+   -1.9701510667800903,
+   -0.015215284191071987,
+   -0.00046957432641647756,
+   -4.5536911784438416e-05,
+   -0.3501690626144409,
+   -6.508615479106084e-05,
+   -0.013412720523774624,
+   -0.0002317160106031224,
+   -10.721491813659668,
+   -0.001794158248230815,
+   -5.900764465332031,
+   -0.05698608234524727,
+   -1.9666205644607544,
+   -0.34450024366378784,
+   -0.24932177364826202,
+   -1.1890842914581299,
+   -0.9316995143890381,
+   -0.5700393915176392,
+   -0.18522746860980988,
+   -0.08411185443401337,
+   -0.00032610344351269305,
+   -0.016760369762778282,
+   -0.310769647359848,
+   -0.04111167788505554,
+   -0.00015889335190877318,
+   -0.00011395759065635502,
+   -0.00010418349120300263,
+   -0.0003389737685211003,
+   -0.0006182666402310133,
+   -0.001039679627865553,
+   -6.770858453819528e-05,
+   -0.001258891774341464,
+   -5.876845170860179e-05,
+   -0.0003499372396618128,
+   -0.00027724236133508384,
+   -0.0029526231810450554,
+   -0.0003165697562508285,
+   -0.25983527302742004,
+   -0.031029406934976578,
+   -0.00018880968855228275,
+   -0.7229459881782532,
+   -0.42579957842826843,
+   -0.00011705666838679463,
+   -0.00047195740626193583,
+   -2.3364747903542593e-05,
+   -0.9790778160095215,
+   -0.0029993331991136074,
+   -5.125986263010418e-06,
+   -0.00018690270371735096,
+   -0.00016091958968900144,
+   -3.755022044060752e-05,
+   -0.00900670699775219,
+   -8.642300235806033e-05,
+   -0.0004804172203876078,
+   -3.838465272565372e-05,
+   -0.0015756584471091628,
+   -1.168244216387393e-05,
+   -0.001709667849354446,
+   -1.0013530300057027e-05,
+   -0.0022142434027045965,
+   -5.960462772236497e-07,
+   -0.0006964165368117392,
+   -0.05425402522087097,
+   -1.5528278350830078,
+   -0.002721655648201704,
+   -0.003402280155569315,
+   -3.6477376852417365e-05,
+   -0.007222968153655529,
+   -0.14785511791706085,
+   -0.013813492842018604,
+   -0.00012063252506777644,
+   -9.738924563862383e-05,
+   -9.881961887003854e-05,
+   -0.00025900822947733104,
+   -0.00028236693469807506,
+   -0.0010882653295993805,
+   -4.446407547220588e-05,
+   -0.0008232779800891876,
+   -4.7801782784517854e-05,
+   -0.0001911934232339263,
+   -0.00020382710499688983,
+   -0.0037347583565860987,
+   -0.00023493390472140163,
+   -0.016995148733258247,
+   -0.028428077697753906,
+   -0.00015054999676067382,
+   -0.05958176776766777,
+   -0.0022499265614897013,
+   -8.928377064876258e-05,
+   -0.0007566926069557667,
+   -2.038458114839159e-05,
+   -6.74626350402832,
+   -4.031385898590088,
+   -0.010314728133380413,
+   -0.0005830018781125546,
+   -0.00016175392374861985,
+   -4.279521817807108e-05,
+   -4.910806655883789,
+   -0.3867932856082916,
+   -0.00020466140995267779,
+   -2.455681169521995e-05,
+   -0.40993309020996094,
+   -3.075552376685664e-05,
+   -0.002136925933882594,
+   -1.5258672647178173e-05,
+   -1.4743690490722656,
+   -0.466409295797348,
+   -2.986236095428467,
+   -0.5145793557167053,
+   -0.3861558437347412,
+   -0.00023648326168768108,
+   -0.060666244477033615,
+   -0.0004374024283606559,
+   -0.0032959445379674435,
+   -0.003968104254454374,
+   -0.0018072477541863918,
+   -4.768258077092469e-05,
+   -0.9783220291137695,
+   -1.0383716821670532,
+   -0.6705473065376282,
+   -2.172899007797241,
+   -0.1931028664112091,
+   -0.05653104931116104,
+   -0.0004231034545227885,
+   -0.009201028384268284,
+   -0.20085793733596802,
+   -0.015902360901236534,
+   -0.00013207517622504383,
+   -0.00011634149996098131,
+   -9.154854342341423e-05,
+   -0.0002989322238136083,
+   -0.000276765669696033,
+   -0.0008761619683355093,
+   -5.4596363042946905e-05,
+   -0.0012877037515863776,
+   -5.245071224635467e-05,
+   -0.00014399446081370115,
+   -0.00014304091746453196,
+   -0.002012848388403654,
+   -0.00026043839170597494,
+   -0.050352130085229874,
+   -0.016213351860642433,
+   -0.00014923889830242842,
+   -1.3270337581634521,
+   -0.017757130786776543,
+   -8.725739462533966e-05,
+   -0.0003123987407889217,
+   -2.3364747903542593e-05,
+   -1.770219087600708,
+   -0.027282992377877235,
+   -1.7292673587799072,
+   -1.5430668592453003,
+   -0.09708311408758163,
+   -0.06372363120317459,
+   -0.00020180096908006817,
+   -4.756337511935271e-05,
+   -6.762560844421387,
+   -0.11426064372062683,
+   -0.0006945105269551277,
+   -5.745722592109814e-05,
+   -0.23964034020900726,
+   -7.080780778778717e-05,
+   -0.0019281383138149977,
+   -0.00011657988943625242,
+   -1.6634957790374756,
+   -3.133596420288086,
+   -1.06369948387146,
+   -0.20282019674777985,
+   -0.440325528383255,
+   -2.2919445037841797,
+   -2.6773011684417725,
+   -2.4511003494262695,
+   -2.022627353668213,
+   -0.7157211899757385,
+   -0.00033623288618400693,
+   -0.006556428037583828,
+   -0.18528789281845093,
+   -0.010350123979151249,
+   -9.691245941212401e-05,
+   -9.941560711013153e-05,
+   -0.0001062098381225951,
+   -0.0002244459028588608,
+   -0.0003002431185450405,
+   -0.0003911683743353933,
+   -3.158996332786046e-05,
+   -0.0008713977294974029,
+   -4.875540980719961e-05,
+   -9.083335316972807e-05,
+   -0.00013422065239865333,
+   -0.0032467530108988285,
+   -0.0002611534437164664,
+   -0.011103743687272072,
+   -0.014522447250783443,
+   -0.0001003691868390888,
+   -0.04763209819793701,
+   -0.0015930355293676257,
+   -8.880697714630514e-05,
+   -0.0006610354175791144,
+   -2.062299427052494e-05,
+   -1.4736919403076172,
+   -0.0015160269103944302,
+   -5.722029527532868e-06,
+   -0.0001426833332516253,
+   -0.00025138078490272164,
+   -4.303362584323622e-05,
+   -0.006412051152437925,
+   -8.177422569133341e-05,
+   -0.0003953390696551651,
+   -4.51792984677013e-05,
+   -0.0015100754098966718,
+   -1.0847986231965479e-05,
+   -0.0021766559220850468,
+   -1.3112935448589269e-05,
+   -0.0017056216020137072,
+   -5.960462772236497e-07,
+   -0.00045658653834834695,
+   -0.03380563110113144,
+   -1.6861530542373657,
+   -0.0011235122801735997,
+   -0.0027228444814682007,
+   -3.2543604902457446e-05,
+   -0.0028300732374191284,
+   -0.04190889745950699,
+   -0.006303310859948397,
+   -0.00010799778101500124,
+   -7.295342220459133e-05,
+   -6.90197994117625e-05,
+   -0.0002094287920044735,
+   -0.00017915551143232733,
+   -0.0007649118197150528,
+   -3.3854863431770355e-05,
+   -0.0009750141180120409,
+   -5.185469490243122e-05,
+   -0.0001230164198204875,
+   -0.00015221867943182588,
+   -0.00366337806917727,
+   -0.00027378625236451626,
+   -0.00873471051454544,
+   -0.014125015586614609,
+   -0.00013779645087197423,
+   -0.2786974012851715,
+   -0.0429004468023777,
+   -0.00015221867943182588,
+   -0.0005259322933852673,
+   -2.0861407392658293e-05,
+   -7.4979376792907715,
+   -2.5812153816223145,
+   -0.0006475735572166741,
+   -0.00032395837479270995,
+   -4.3987260141875595e-05,
+   -0.38662397861480713,
+   -0.07727815210819244,
+   -0.0005353448214009404,
+   -6.210611172718927e-05,
+   -0.10053620487451553,
+   -4.51792984677013e-05,
+   -0.004477594513446093,
+   -3.0397906812140718e-05,
+   -8.758296012878418,
+   -0.4402102530002594,
+   -0.2472418248653412,
+   -0.5627955794334412,
+   -0.042171675711870193,
+   -0.03491748869419098,
+   -5.941390514373779,
+   -0.004192491993308067,
+   -0.11302625387907028,
+   -0.5369495153427124,
+   -0.0003328961320221424,
+   -0.0049365307204425335,
+   -0.057854458689689636,
+   -0.007558793295174837,
+   -8.916457591112703e-05,
+   -9.047575440490618e-05,
+   -8.141662692651153e-05,
+   -0.0006507901125587523,
+   -0.00019464982324279845,
+   -0.0006775943911634386,
+   -2.3364747903542593e-05,
+   -0.0012484145117923617,
+   -5.447716102935374e-05,
+   -0.00016425691137555987,
+   -0.00019727191829588264,
+   -0.012608221732079983,
+   -0.00020859450160060078,
+   -0.014227267354726791,
+   -0.00964115560054779,
+   -0.00013350549852475524,
+   -0.03465360403060913,
+   -0.0008008848526515067,
+   -0.00010239553375868127,
+   -0.0007454953738488257,
+   -2.0861407392658293e-05,
+   -2.182055950164795,
+   -0.030151404440402985,
+   -2.2387242317199707,
+   -4.8748321533203125,
+   -0.07910432666540146,
+   -0.0014863882679492235,
+   -0.00028081765049137175,
+   -6.55629628454335e-05,
+   -3.332869052886963,
+   -4.393488883972168,
+   -0.1467350423336029,
+   -0.0036104037426412106,
+   -0.0003040566807612777,
+   -0.00010895135346800089,
+   -0.2704607844352722,
+   -3.6477376852417365e-05,
+   -0.002591705648228526,
+   -2.9682672902708873e-05,
+   -4.947231292724609,
+   -3.2159130573272705,
+   -0.8367561101913452,
+   -0.5556290149688721,
+   -0.0002233732520835474,
+   -0.0060651772655546665,
+   -0.05365833640098572,
+   -0.0071886456571519375,
+   -9.63164638960734e-05,
+   -0.00010072677832795307,
+   -9.858122211880982e-05,
+   -0.0003960540343541652,
+   -0.0006039702566340566,
+   -0.0006522196927107871,
+   -1.811964830267243e-05,
+   -0.001042775809764862,
+   -3.790783375734463e-05,
+   -0.00011514954530866817,
+   -0.0001652104256208986,
+   -0.05494809150695801,
+   -0.00014506718434859067,
+   -0.00021050144277978688,
+   -0.014802505262196064,
+   -0.00017915551143232733,
+   -1.7102066278457642,
+   -0.02825750596821308,
+   -0.00011300401820335537,
+   -0.0003519630990922451,
+   -3.075552376685664e-05,
+   -0.554995596408844,
+   -0.0013822296168655157,
+   -4.6491513785440475e-06,
+   -0.00014482879487331957,
+   -0.00019810620869975537,
+   -3.504691630951129e-05,
+   -0.006834581959992647,
+   -6.389413465512916e-05,
+   -0.0004396664153318852,
+   -4.60137271147687e-05,
+   -0.0012897277483716607,
+   -1.1920858014491387e-05,
+   -0.001943962532095611,
+   -1.4424220353248529e-05,
+   -0.0016702761640772223,
+   -5.960462772236497e-07,
+   -0.0005274811992421746,
+   -0.043414343148469925,
+   -1.5102243423461914,
+   -0.0018298563081771135,
+   -0.0035949621815234423,
+   -6.842378934379667e-05,
+   -0.008245711214840412,
+   -0.08723266422748566,
+   -0.00939271505922079,
+   -0.00011419598013162613,
+   -0.0001230164198204875,
+   -9.464769391342998e-05,
+   -0.0002865380665753037,
+   -0.0005069877952337265,
+   -0.001016934053041041,
+   -3.2305197237292305e-05,
+   -0.0009629856795072556,
+   -4.827859811484814e-05,
+   -0.00021717573690693825,
+   -0.00032848684350028634,
+   -0.012733934447169304,
+   -0.000196556793525815,
+   -0.0012980615720152855,
+   -0.0077531603164970875,
+   -0.00012385078298393637,
+   -0.01761084794998169,
+   -0.0013621109537780285,
+   -0.00011848701251437888,
+   -0.0013394916895776987,
+   -2.407998726994265e-05,
+   -4.505744934082031,
+   -1.2715730667114258,
+   -0.0005052005290053785,
+   -0.00024971229140646756,
+   -3.635817120084539e-05,
+   -4.3336405754089355,
+   -0.0815289318561554,
+   -0.028655847534537315,
+   -0.00010430268594063818,
+   -7.343022298300639e-05,
+   -0.158114492893219,
+   -1.764281842042692e-05,
+   -0.003166425507515669,
+   -5.960446742392378e-06,
+   -4.626138687133789,
+   -0.5413240194320679,
+   -11.11661148071289,
+   -6.66420316696167,
+   -0.5860735177993774,
+   -1.0599334239959717,
+   -2.200112819671631,
+   -0.4268365502357483,
+   -0.027302712202072144,
+   -0.15124760568141937,
+   -0.12854908406734467,
+   -3.041227102279663,
+   -0.026920655742287636,
+   -0.0003856868715956807,
+   -0.004746242426335812,
+   -0.07085907459259033,
+   -0.008411810733377934,
+   -0.00010823617776622996,
+   -5.972207145532593e-05,
+   -5.507317473529838e-05,
+   -0.00023850933939684182,
+   -0.0004319211875554174,
+   -0.0008380476501770318,
+   -1.823885577323381e-05,
+   -0.0009161804337054491,
+   -3.683499380713329e-05,
+   -0.00010918975021922961,
+   -0.00016044282529037446,
+   -0.0005364171229302883,
+   -0.0001248043408850208,
+   -0.10185468196868896,
+   -0.02194770984351635,
+   -0.00011252723925281316,
+   -0.6942679286003113,
+   -0.21981695294380188,
+   -6.496695277746767e-05,
+   -0.00030393750057555735,
+   -2.13382354559144e-05,
+   -3.1545064449310303,
+   -0.021652380004525185,
+   -0.02087036333978176,
+   -0.89057856798172,
+   -9.619726915843785e-05,
+   -8.129743218887597e-05,
+   -2.5152843591058627e-05,
+   -4.086198806762695,
+   -1.0591976642608643,
+   -0.0020325970835983753,
+   -4.1483970562694594e-05,
+   -0.596172571182251,
+   -3.242440288886428e-05,
+   -0.0019346822518855333,
+   -1.6927575416048057e-05,
+   -3.4360618591308594,
+   -2.4312753677368164,
+   -1.9711253643035889,
+   -4.358899116516113,
+   -10.540913581848145,
+   -5.990867614746094,
+   -0.266180157661438,
+   -0.000266278104390949,
+   -0.003696990432217717,
+   -0.03691418468952179,
+   -0.005084204487502575,
+   -7.73638384998776e-05,
+   -5.9960475482512265e-05,
+   -6.12716976320371e-05,
+   -0.0001915509783430025,
+   -0.0004040378553327173,
+   -0.0004508670826908201,
+   -2.2172682292875834e-05,
+   -0.0010245556477457285,
+   -3.862306402879767e-05,
+   -7.652943895664066e-05,
+   -0.00010585224663373083,
+   -0.00034791138023138046,
+   -0.0001134808044298552,
+   -0.009721791371703148,
+   -0.01306991372257471,
+   -7.86750388215296e-05,
+   -0.06928819417953491,
+   -0.0019708510953933,
+   -8.070142939686775e-05,
+   -0.0006008726777508855,
+   -1.9550132492440753e-05,
+   -1.2050050497055054,
+   -0.0022362482268363237,
+   -4.887569048150908e-06,
+   -0.00016652150952722877,
+   -0.0001282609737245366,
+   -3.3854863431770355e-05,
+   -0.005613160319626331,
+   -4.935142715112306e-05,
+   -0.00040618274942971766,
+   -3.814624506048858e-05,
+   -0.0012768696760758758,
+   -6.9141146923357155e-06,
+   -0.0021407324820756912,
+   -1.0251946150674485e-05,
+   -0.001328301033936441,
+   -4.768370445162873e-07,
+   -0.00039104922325350344,
+   -0.03403102979063988,
+   -2.371554374694824,
+   -0.0011966219171881676,
+   -0.0017084777355194092,
+   -1.2397689715726301e-05,
+   -0.0012181727215647697,
+   -0.027773091569542885,
+   -0.004225967917591333,
+   -7.688703772146255e-05,
+   -10.750052452087402,
+   -0.09749454259872437,
+   -0.0398833304643631,
+   -0.05019160360097885,
+   -0.02639356628060341,
+   -0.001116844010539353,
+   -0.010394011624157429,
+   -0.0002687808300834149,
+   -0.0412154421210289,
+   -0.17060238122940063,
+   -0.44570907950401306,
+   -0.001759529928676784,
+   -0.8481433987617493,
+   -3.9174411296844482,
+   -0.0011847150744870305,
+   -1.8217713832855225,
+   -1.9833719730377197,
+   -0.0033980030566453934,
+   -0.022340646013617516,
+   -0.0005044856225140393,
+   -11.916642189025879,
+   -2.2062525749206543,
+   -0.011109520681202412,
+   -0.0025012181140482426,
+   -0.00047839165199548006,
+   -10.590877532958984,
+   -5.111791133880615,
+   -0.8751921653747559,
+   -0.19319908320903778,
+   -0.04376664385199547,
+   -0.019606946036219597,
+   -0.00042000532266683877,
+   -9.505635261535645,
+   -0.07715455442667007,
+   -0.005082899704575539,
+   -0.04224858805537224,
+   -0.03572046384215355,
+   -0.0011238694423809648,
+   -5.344630241394043,
+   -3.876430034637451,
+   -12.252359390258789,
+   -4.9860382080078125,
+   -2.668943405151367,
+   -1.16416597366333,
+   -2.514509677886963,
+   -2.5190258026123047,
+   -14.754651069641113,
+   -5.655267715454102,
+   -6.61380672454834,
+   -4.71486234664917,
+   -0.5776815414428711,
+   -1.3986684083938599,
+   -2.637193202972412,
+   -1.1604831218719482,
+   -1.4959537982940674,
+   -0.004402587655931711,
+   -0.5065803527832031,
+   -3.3776161670684814,
+   -0.7203826308250427,
+   -0.02161656692624092,
+   -0.819121241569519,
+   -0.04418942704796791,
+   -1.7282390594482422,
+   -0.05629342794418335,
+   -0.008580365218222141,
+   -0.000747877755202353,
+   -0.013715313747525215,
+   -0.00015138434537220746,
+   -0.006047403905540705,
+   -0.024643857032060623,
+   -0.05186835676431656,
+   -0.0005345107638277113,
+   -0.10883784294128418,
+   -1.3612172603607178,
+   -0.0003692421887535602,
+   -1.357957363128662,
+   -0.05831316113471985,
+   -0.00040570611599832773,
+   -0.0035074164625257254,
+   -6.437094270950183e-05,
+   -1.7280149459838867,
+   -0.026309387758374214,
+   -2.3754658699035645,
+   -0.05959097668528557,
+   -0.0019271865021437407,
+   -0.0006563892820850015,
+   -0.00038985759601928294,
+   -0.00013529339048545808,
+   -6.799666881561279,
+   -0.4319588541984558,
+   -0.0018134353449568152,
+   -0.00010084597306558862,
+   -3.564793109893799,
+   -0.0016862234333530068,
+   -0.007215393707156181,
+   -0.00018916724366135895,
+   -4.893386363983154,
+   -0.7495713233947754,
+   -0.04057759419083595,
+   -0.16563259065151215,
+   -3.7694530487060547,
+   -0.7686876654624939,
+   -0.02867751009762287,
+   -3.4293549060821533,
+   -1.9938279390335083,
+   -3.87074613571167,
+   -7.779223918914795,
+   -0.11301646381616592,
+   -0.0007675323868170381,
+   -0.0353383906185627,
+   -0.5969783663749695,
+   -0.03809810429811478,
+   -0.00048828122089616954,
+   -0.024168511852622032,
+   -0.0024346255231648684,
+   -0.006569692399352789,
+   -0.002209961414337158,
+   -0.001069331425242126,
+   -7.819823804311454e-05,
+   -0.0029135181102901697,
+   -4.60137271147687e-05,
+   -0.0003582789213396609,
+   -0.001116367639042437,
+   -0.002629396505653858,
+   -0.0002420847595203668,
+   -0.17575480043888092,
+   -0.017076482996344566,
+   -0.0001431601122021675,
+   -0.10536163300275803,
+   -0.00507151335477829,
+   -0.00011181206355104223,
+   -0.0018749530427157879,
+   -2.3603161025675945e-05,
+   -0.8358778953552246,
+   -0.002124911407008767,
+   -9.894321920000948e-06,
+   -0.00019214690837543458,
+   -0.0002456601650919765,
+   -3.516612196108326e-05,
+   -0.008302814327180386,
+   -0.00010895135346800089,
+   -0.0006008726777508855,
+   -3.2543604902457446e-05,
+   -0.006115178111940622,
+   -2.1219027985353023e-05,
+   -0.0036275077145546675,
+   -1.7165990357170813e-05,
+   -0.003067908575758338,
+   -9.536738616588991e-07,
+   -0.0006908176001161337,
+   -0.02611708454787731,
+   -1.3316965103149414,
+   -0.003817296586930752,
+   -0.006795391906052828,
+   -4.684815212385729e-05,
+   -0.007690228521823883,
+   -0.14891591668128967,
+   -0.013032732531428337,
+   -0.0002714027068577707,
+   -0.011644137091934681,
+   -0.00091856240760535,
+   -0.0013096098555251956,
+   -0.0007771808886900544,
+   -0.0009541726321913302,
+   -5.638440416078083e-05,
+   -0.0014388932613655925,
+   -5.018585216021165e-05,
+   -0.00020930961181875318,
+   -0.0006467396160587668,
+   -0.0013236580416560173,
+   -0.00019333878299221396,
+   -0.05778864026069641,
+   -0.023562893271446228,
+   -0.0001699779968475923,
+   -0.4867134690284729,
+   -0.17518886923789978,
+   -6.01988795096986e-05,
+   -0.00056429672986269,
+   -2.396077979938127e-05,
+   -10.983257293701172,
+   -3.4146568775177,
+   -0.007948435842990875,
+   -0.005365850869566202,
+   -0.00041166413575410843,
+   -6.0437283536884934e-05,
+   -1.4208624362945557,
+   -0.014981495216488838,
+   -0.00011193125828867778,
+   -2.95634672511369e-05,
+   -0.3359139859676361,
+   -6.425174069590867e-05,
+   -0.0036992470268160105,
+   -1.7523612768854946e-05,
+   -1.6273220777511597,
+   -12.038379669189453,
+   -1.8510823249816895,
+   -4.6685380935668945,
+   -1.03892183303833,
+   -3.5619592666625977,
+   -3.119525194168091,
+   -8.74183177947998,
+   -0.1955474466085434,
+   -0.00022349244682118297,
+   -0.005337630398571491,
+   -0.07253769785165787,
+   -0.0067605809308588505,
+   -0.00018821375851985067,
+   -0.01270250789821148,
+   -0.0005373702733777463,
+   -0.0013699679402634501,
+   -0.0009596510208211839,
+   -0.0003953390696551651,
+   -1.7165990357170813e-05,
+   -0.0010408704401925206,
+   -3.4450891689630225e-05,
+   -0.00011038171214750037,
+   -0.00048351517762057483,
+   -0.0015029336791485548,
+   -0.00013958434283267707,
+   -0.027578983455896378,
+   -0.02192368544638157,
+   -8.141662692651153e-05,
+   -0.11562338471412659,
+   -0.0031276855152100325,
+   -6.5205356804654e-05,
+   -0.0007344171172007918,
+   -2.1457441107486375e-05,
+   -1.4039907455444336,
+   -0.8585066795349121,
+   -0.12097951024770737,
+   -4.9232225137529895e-05,
+   -0.00045503751607611775,
+   -0.0001479277852922678,
+   -2.8967437174287625e-05,
+   -3.316209316253662,
+   -0.22754307091236115,
+   -0.037047676742076874,
+   -0.00010632903286023065,
+   -5.602679812000133e-05,
+   -0.10701240599155426,
+   -2.1815061700181104e-05,
+   -0.0025769618805497885,
+   -2.932505594799295e-05,
+   -2.9098081588745117,
+   -0.23772671818733215,
+   -2.5728368759155273,
+   -1.0628935098648071,
+   -0.569791853427887,
+   -1.5512791872024536,
+   -0.22174018621444702,
+   -0.2053954154253006,
+   -0.668795108795166,
+   -0.00032574593205936253,
+   -0.005275258328765631,
+   -0.17121490836143494,
+   -0.01520049013197422,
+   -0.00027164106722921133,
+   -0.018145864829421043,
+   -0.0008275659638457,
+   -0.0013598490040749311,
+   -0.0007223857101053,
+   -0.0005415403284132481,
+   -3.075552376685664e-05,
+   -0.0016680150292813778,
+   -4.124556289752945e-05,
+   -0.00020203932945150882,
+   -0.0005315321614034474,
+   -0.0016384999034926295,
+   -0.000169382052263245,
+   -0.01945134624838829,
+   -0.018782030791044235,
+   -0.0001429217227268964,
+   -1.4800734519958496,
+   -0.046756841242313385,
+   -9.667406266089529e-05,
+   -0.0005499995895661414,
+   -1.728519782773219e-05,
+   -0.6545608639717102,
+   -0.0013740155845880508,
+   -5.8412379075889476e-06,
+   -0.00015496007108595222,
+   -0.0001935771433636546,
+   -2.8967437174287625e-05,
+   -0.01043801661580801,
+   -7.974783511599526e-05,
+   -0.0005525015876628458,
+   -3.683499380713329e-05,
+   -0.002455436158925295,
+   -1.2874520507466514e-05,
+   -0.0022639615926891565,
+   -1.4543427823809907e-05,
+   -0.00250252615660429,
+   -8.344646857949556e-07,
+   -0.0006089740199968219,
+   -0.023519812151789665,
+   -1.6231462955474854,
+   -0.0013103241799399257,
+   -0.0044088782742619514,
+   -3.433168603805825e-05,
+   -0.0076819476671516895,
+   -0.13205960392951965,
+   -0.01295448187738657,
+   -0.0002797450579237193,
+   -0.01799413561820984,
+   -0.0008688965463079512,
+   -0.0026737437583506107,
+   -0.0004418112221173942,
+   -0.001303895260207355,
+   -6.16293036728166e-05,
+   -0.0018553201807662845,
+   -4.815939246327616e-05,
+   -0.00024875884992070496,
+   -0.000916537712328136,
+   -0.005030237603932619,
+   -0.00015853578224778175,
+   -0.00936696957796812,
+   -0.016335444524884224,
+   -9.619726915843785e-05,
+   -0.12435520440340042,
+   -0.002912804950028658,
+   -0.00010346830822527409,
+   -0.0007908792467787862,
+   -1.7165990357170813e-05,
+   -6.260087490081787,
+   -4.018156051635742,
+   -0.05045890435576439,
+   -0.00021360022947192192,
+   -4.815939246327616e-05,
+   -2.2203869819641113,
+   -0.047356534749269485,
+   -8.83301836438477e-05,
+   -5.781483559985645e-05,
+   -0.11337775737047195,
+   -3.3378044463461265e-05,
+   -0.0019444384379312396,
+   -1.645074735279195e-05,
+   -1.7198790311813354,
+   -3.5991759300231934,
+   -2.5881307125091553,
+   -4.4389872550964355,
+   -0.39235079288482666,
+   -0.9257609248161316,
+   -2.4064109325408936,
+   -2.256807804107666,
+   -0.012957894243299961,
+   -6.8662193370983e-05,
+   -0.005379723850637674,
+   -0.1424376517534256,
+   -0.008812819607555866,
+   -0.00019667598826345056,
+   -0.012973662465810776,
+   -0.0005903884884901345,
+   -0.0019209994934499264,
+   -0.0014405598631128669,
+   -0.0006889115320518613,
+   -1.645074735279195e-05,
+   -0.0011966219171881676,
+   -3.40932747349143e-05,
+   -9.548207890475169e-05,
+   -0.0005439232336357236,
+   -0.004501329269260168,
+   -0.00011920218821614981,
+   -0.03018992207944393,
+   -0.013410485349595547,
+   -0.00011467275908216834,
+   -0.6566694378852844,
+   -0.36726248264312744,
+   -2.8490614567999728e-05,
+   -0.00023707917716819793,
+   -1.3351351299206726e-05,
+   -1.051271915435791,
+   -0.01689915731549263,
+   -3.0722033977508545,
+   -0.2818227708339691,
+   -3.957169771194458,
+   -0.004226442892104387,
+   -0.00017248096992261708,
+   -3.9457496313843876e-05,
+   -5.733857154846191,
+   -0.26561957597732544,
+   -0.00047779586748220026,
+   -2.5748875486897305e-05,
+   -0.07624048739671707,
+   -6.0437283536884934e-05,
+   -0.001644212519749999,
+   -1.549708758830093e-05,
+   -2.1518163681030273,
+   -0.19709540903568268,
+   -3.698873996734619,
+   -10.724569320678711,
+   -2.996880292892456,
+   -3.1366219520568848,
+   -0.02801341563463211,
+   -0.17601795494556427,
+   -0.0965375229716301,
+   -0.00014578233822248876,
+   -0.0020983838476240635,
+   -0.054011568427085876,
+   -0.003581777447834611,
+   -0.00014304091746453196,
+   -0.011484465561807156,
+   -0.000708090839907527,
+   -0.0012874656822532415,
+   -0.0009416675311513245,
+   -0.0005903884884901345,
+   -2.13382354559144e-05,
+   -0.0007848043460398912,
+   -2.3841574147809297e-05,
+   -7.4741430580616e-05,
+   -0.0002946419408544898,
+   -0.0024204738438129425,
+   -0.00011503035057103261,
+   -0.006832095794379711,
+   -0.010126759298145771,
+   -5.876845170860179e-05,
+   -0.09275738149881363,
+   -0.003692833473905921,
+   -4.0411134250462055e-05,
+   -0.0005497612874023616,
+   -1.537788011773955e-05,
+   -1.182621717453003,
+   -0.0008486483711749315,
+   -4.0531076592742465e-06,
+   -0.00010585224663373083,
+   -0.00011646069469861686,
+   -2.407998726994265e-05,
+   -0.00471824174746871,
+   -5.352353764465079e-05,
+   -0.0003631647559814155,
+   -3.135155202471651e-05,
+   -0.0011143434094265103,
+   -1.1205610462639015e-05,
+   -0.002159646013751626,
+   -1.4185804502631072e-05,
+   -0.0011845960980281234,
+   -7.152555099310121e-07,
+   -0.0002699726028367877,
+   -0.008802657015621662,
+   -1.1517901420593262,
+   -0.0017283515771850944,
+   -0.002493488835170865,
+   -1.5258672647178173e-05,
+   -0.0018479428254067898,
+   -0.040569812059402466,
+   -0.0041178204119205475,
+   -0.00017176583060063422,
+   -0.015839355066418648,
+   -0.0005023409612476826,
+   -0.0007201223634183407,
+   -0.0005905076395720243,
+   -0.0007784912013448775,
+   -2.3483953555114567e-05,
+   -0.0008902162662707269,
+   -2.6702524337451905e-05,
+   -9.512448741588742e-05,
+   -0.0004555141495075077,
+   -0.014392376877367496,
+   -9.619726915843785e-05,
+   -0.0002324311062693596,
+   -0.01029337290674448,
+   -0.00015984688070602715,
+   -1.1049474477767944,
+   -0.04663100838661194,
+   -8.21318244561553e-05,
+   -0.0003543464408721775,
+   -1.3947389561508317e-05,
+   -7.615281581878662,
+   -4.125001907348633,
+   -0.19173777103424072,
+   -0.0005029367166571319,
+   -4.100715523236431e-05,
+   -2.0808839797973633,
+   -0.026673687621951103,
+   -7.70062324590981e-05,
+   -2.9682672902708873e-05,
+   -0.12381786853075027,
+   -2.098061486321967e-05,
+   -0.0029344377107918262,
+   -1.3589766240329482e-05,
+   -6.027270793914795,
+   -0.344284325838089,
+   -0.47963422536849976,
+   -1.262589454650879,
+   -1.8010940551757812,
+   -2.51932430267334,
+   -1.5027334690093994,
+   -0.06264369934797287,
+   -1.8616759777069092,
+   -2.732039213180542,
+   -6.854299135738984e-05,
+   -0.001887565478682518,
+   -0.02442971244454384,
+   -0.0030983323231339455,
+   -0.00013374387344811112,
+   -0.010926888324320316,
+   -0.0006349454633891582,
+   -0.0010619483655318618,
+   -0.0007469248375855386,
+   -0.00040987672400660813,
+   -1.537788011773955e-05,
+   -0.0008891443139873445,
+   -2.4676019165781327e-05,
+   -7.080780778778717e-05,
+   -0.00043299360550008714,
+   -0.2814013361930847,
+   -6.8662193370983e-05,
+   -0.0011491130571812391,
+   -0.007679700385779142,
+   -9.440929716220126e-05,
+   -0.026545187458395958,
+   -0.002912091789767146,
+   -7.045020902296528e-05,
+   -0.001142087858170271,
+   -1.4662635294371285e-05,
+   -1.6412137746810913,
+   -9.728646278381348,
+   -0.026286397129297256,
+   -0.0002475670480635017,
+   -7.60526381782256e-05,
+   -2.191868782043457,
+   -0.01760944165289402,
+   -0.0004247716860845685,
+   -4.684815212385729e-05,
+   -0.03103969246149063,
+   -9.297892393078655e-05,
+   -0.011422710493206978,
+   -3.6954195820726454e-05,
+   -4.347017288208008,
+   -0.000610999355558306,
+   -2.17897367477417,
+   -2.866166353225708,
+   -0.23518076539039612,
+   -0.00036125810584053397,
+   -0.01150013878941536,
+   -1.8427702188491821,
+   -0.22964701056480408,
+   -0.011748881079256535,
+   -0.00036352223833091557,
+   -2.021958827972412,
+   -0.008272194303572178,
+   -1.7123057842254639,
+   -9.325576782226562,
+   -1.3440426588058472,
+   -3.209916830062866,
+   -0.053304191678762436,
+   -5.205663681030273,
+   -0.03287550434470177,
+   -1.384042501449585,
+   -7.2653326988220215,
+   -3.6932270526885986,
+   -6.713709354400635,
+   -0.08502203971147537,
+   -3.0402512550354004,
+   -0.043377358466386795,
+   -0.00908633042126894,
+   -0.013433421961963177,
+   -1.5646146535873413,
+   -0.007355276495218277,
+   -5.929056167602539,
+   -11.379992485046387,
+   -3.6368532180786133,
+   -0.45781779289245605,
+   -0.003176526166498661,
+   -0.039530687034130096,
+   -0.0007678897818550467,
+   -0.0003935516288038343,
+   -0.0005339150666259229,
+   -3.964613437652588,
+   -0.9957391023635864,
+   -0.2830953896045685,
+   -0.2610830068588257,
+   -0.07025375217199326,
+   -0.010492399334907532,
+   -3.3028924465179443,
+   -4.77099084854126
+  ],
+  "generated_logprobs": [
+   -0.22153465449810028,
+   -0.0006783091812394559,
+   -0.010667562484741211,
+   -0.0019300420535728335,
+   -0.00015901254664640874,
+   -0.0025420039892196655,
+   -0.0038043521344661713,
+   -0.0017481058603152633,
+   -0.47604793310165405,
+   -0.0035732248798012733,
+   -1.3063528537750244,
+   -1.869868278503418,
+   -0.03979752957820892,
+   -1.2474843263626099,
+   -0.1659490168094635,
+   -0.006920535117387772,
+   -1.4199819564819336,
+   -2.666736364364624,
+   -1.218197226524353,
+   -0.017583556473255157,
+   -0.5422223806381226,
+   -0.0014107999159023166,
+   -0.0003762729174923152,
+   -0.0010189585154876113,
+   -0.03820022940635681,
+   -0.0013802058529108763,
+   -0.1537325382232666,
+   -0.0007863528444431722,
+   -0.003772999858483672,
+   -0.019278066232800484
+  ],
+  "logprobs": [
+   -2.0310330390930176,
+   -8.008150100708008,
+   -4.907264232635498,
+   -8.383085250854492,
+   -0.9039976000785828,
+   -0.005822602193802595,
+   -3.2968709468841553,
+   -0.11372647434473038,
+   -3.750115156173706,
+   -6.341870307922363,
+   -11.225410461425781,
+   -0.8311297297477722,
+   -1.9895459413528442,
+   -1.2136539220809937,
+   -0.4511846899986267,
+   -1.275371789932251,
+   -6.52569055557251,
+   -0.3268530070781708,
+   -2.488239288330078,
+   -1.1252245903015137,
+   -0.004931548144668341,
+   -1.1413307189941406,
+   -2.4036614894866943,
+   -0.593055784702301,
+   -5.775687217712402,
+   -0.7173333764076233,
+   -6.7589006423950195,
+   -4.472473621368408,
+   -0.28561243414878845,
+   -0.9266374111175537,
+   -1.2420787811279297,
+   -4.94831657409668,
+   -0.4015401303768158,
+   -2.405423879623413,
+   -6.706996440887451,
+   -2.3797435760498047,
+   -6.879988193511963,
+   -0.599727988243103,
+   -4.6161346435546875,
+   -0.016334740445017815,
+   -1.4226453304290771,
+   -4.064138412475586,
+   -8.992555618286133,
+   -0.7892558574676514,
+   -2.565383195877075,
+   -1.6011606454849243,
+   -1.1192784309387207,
+   -1.085118293762207,
+   -1.452021598815918,
+   -0.1256672590970993,
+   -4.310093879699707,
+   -0.039925139397382736,
+   -0.09540079534053802,
+   -4.4552788734436035,
+   -2.6978704929351807,
+   -0.3264457583427429,
+   -0.9057141542434692,
+   -0.2424505054950714,
+   -0.2473771721124649,
+   -0.04457908123731613,
+   -2.5994861125946045,
+   -0.5882505178451538,
+   -2.4292445182800293,
+   -0.1860235333442688,
+   -2.6841845512390137,
+   -5.8617939949035645,
+   -1.7926914691925049,
+   -0.6663980484008789,
+   -0.029983440414071083,
+   -1.0682772397994995,
+   -0.0018566290382295847,
+   -1.9571454524993896,
+   -0.08927226811647415,
+   -4.61471700668335,
+   -0.002604546956717968,
+   -0.2620302140712738,
+   -0.006101197097450495,
+   -7.435886859893799,
+   -0.0376485139131546,
+   -10.174129486083984,
+   -0.9147175550460815,
+   -4.526404857635498,
+   -3.670576572418213,
+   -4.566626071929932,
+   -1.0199782848358154,
+   -0.0006491222884505987,
+   -0.14426420629024506,
+   -0.03322957828640938,
+   -0.0019640696700662374,
+   -0.00022468426323030144,
+   -0.0013444918440654874,
+   -0.0011957883834838867,
+   -0.007926556281745434,
+   -0.011617152951657772,
+   -0.0018109364900738,
+   -0.00017581824795342982,
+   -0.0018969652010127902,
+   -6.282132380874828e-05,
+   -0.0010078833438456059,
+   -0.25652098655700684,
+   -0.35659894347190857,
+   -9.333651541965082e-05,
+   -0.7947311401367188,
+   -1.3594639301300049,
+   -7.962863310240209e-05,
+   -1.861167550086975,
+   -0.5386030673980713,
+   -0.00022075122979003936,
+   -0.001347229932434857,
+   -3.290122185717337e-05,
+   -3.7342543601989746,
+   -0.5175371170043945,
+   -4.488879680633545,
+   -0.007863753475248814,
+   -0.08534510433673859,
+   -0.0009170140838250518,
+   -2.13382354559144e-05,
+   -4.507952690124512,
+   -0.5332688689231873,
+   -0.004296358674764633,
+   -2.062299427052494e-05,
+   -5.2475104331970215,
+   -0.020387964323163033,
+   -0.1661914438009262,
+   -0.0003081085451412946,
+   -15.800027847290039,
+   -8.108964920043945,
+   -0.7285020351409912,
+   -7.803549289703369,
+   -5.010417938232422,
+   -0.263860821723938,
+   -4.3748852476710454e-05,
+   -0.013306032866239548,
+   -0.029512016102671623,
+   -0.0036468682810664177,
+   -0.00023231192608363926,
+   -0.0002379134384682402,
+   -0.0004920940846204758,
+   -0.000873065204359591,
+   -0.0029308719094842672,
+   -0.0006667536217719316,
+   -0.00013672371278516948,
+   -0.0011686407960951328,
+   -4.625213477993384e-05,
+   -0.0007901645149104297,
+   -0.027857612818479538,
+   -0.06313244253396988,
+   -0.00013064485392533243,
+   -0.2378876954317093,
+   -0.6059458255767822,
+   -5.757642793469131e-05,
+   -1.5949885845184326,
+   -1.6001688241958618,
+   -0.00032574593205936253,
+   -0.0016402851324528456,
+   -2.276871418871451e-05,
+   -3.0335943698883057,
+   -0.286937952041626,
+   -6.517683982849121,
+   -3.1465959548950195,
+   -0.7292280793190002,
+   -0.06161583960056305,
+   -0.0014851979212835431,
+   -2.777537883957848e-05,
+   -3.946831226348877,
+   -0.09084996581077576,
+   -0.003532005939632654,
+   -4.029192859889008e-05,
+   -4.555190086364746,
+   -0.011255813762545586,
+   -0.10179147869348526,
+   -0.0004140473320148885,
+   -4.4321393966674805,
+   -2.2296247482299805,
+   -3.2771155834198,
+   -8.323366165161133,
+   -0.02779245562851429,
+   -2.403028964996338,
+   -0.07431145757436752,
+   -0.5372196435928345,
+   -0.05987980589270592,
+   -0.20438668131828308,
+   -0.00013136000779923052,
+   -0.0572563000023365,
+   -0.11035308241844177,
+   -0.012903997674584389,
+   -0.0002406545972917229,
+   -0.0001517419150331989,
+   -0.00036066226311959326,
+   -0.0005477358354255557,
+   -0.00229322025552392,
+   -0.000697846058756113,
+   -0.0001161031104857102,
+   -0.001127441762946546,
+   -3.814624506048858e-05,
+   -0.0005136600811965764,
+   -0.022026309743523598,
+   -0.02361132949590683,
+   -0.0002090712368953973,
+   -0.04913746938109398,
+   -2.7477238178253174,
+   -9.202533692587167e-05,
+   -0.9271803498268127,
+   -1.3856279850006104,
+   -0.0001754606782924384,
+   -0.0012224590172991157,
+   -1.7165990357170813e-05,
+   -1.0239524841308594,
+   -0.020712625235319138,
+   -0.0451514832675457,
+   -1.5345499515533447,
+   -0.0004010588163509965,
+   -0.0004401430196594447,
+   -2.13382354559144e-05,
+   -2.5878491401672363,
+   -0.020529404282569885,
+   -0.00043501926120370626,
+   -2.682172998902388e-05,
+   -0.3827762007713318,
+   -0.00019298121333122253,
+   -0.007158228196203709,
+   -8.618460560683161e-05,
+   -6.015654563903809,
+   -4.037173271179199,
+   -3.4229695796966553,
+   -1.0183475017547607,
+   -1.4963387250900269,
+   -0.33330175280570984,
+   -1.480197787284851,
+   -2.0857536792755127,
+   -2.225975513458252,
+   -5.293066024780273,
+   -0.43916723132133484,
+   -0.00010048838157672435,
+   -0.015328695066273212,
+   -0.13567933440208435,
+   -0.012453177943825722,
+   -0.00017855956684798002,
+   -0.00012778419477399439,
+   -0.0002885640424210578,
+   -0.0004291805380489677,
+   -0.0008485292200930417,
+   -0.0006668727728538215,
+   -8.177422569133341e-05,
+   -0.001060757553204894,
+   -6.151010165922344e-05,
+   -0.0005185451591387391,
+   -0.028113562613725662,
+   -0.03407377377152443,
+   -0.0003861635341309011,
+   -1.1215460300445557,
+   -0.5561063885688782,
+   -0.0001726001501083374,
+   -2.5190887451171875,
+   -0.6141397953033447,
+   -0.0001227780303452164,
+   -0.0012188870459794998,
+   -1.6212332411669195e-05,
+   -6.833529472351074,
+   -6.0156097412109375,
+   -0.03274226188659668,
+   -0.014286145567893982,
+   -0.0009454786195419729,
+   -3.814624506048858e-05,
+   -4.910149097442627,
+   -0.009493326768279076,
+   -0.001437702914699912,
+   -5.876845170860179e-05,
+   -0.3798050582408905,
+   -0.003948037512600422,
+   -0.07855644077062607,
+   -0.00022420754248742014,
+   -6.84205436706543,
+   -0.0015236446633934975,
+   -2.645585298538208,
+   -0.9816564917564392,
+   -1.3786735534667969,
+   -0.7280330061912537,
+   -1.4040117263793945,
+   -9.035655966727063e-05,
+   -0.033023953437805176,
+   -0.3305729031562805,
+   -0.027912795543670654,
+   -0.0002892790944315493,
+   -0.00012182447244413197,
+   -0.00026901919045485556,
+   -0.0004681444843299687,
+   -0.0007345362100750208,
+   -0.0008179179858416319,
+   -0.00010549465514486656,
+   -0.0013330630026757717,
+   -5.7338023907504976e-05,
+   -0.0005571481888182461,
+   -0.013437421061098576,
+   -0.033829718828201294,
+   -0.0004694551753345877,
+   -0.28239941596984863,
+   -1.3776881694793701,
+   -0.00014256415306590497,
+   -1.4336698055267334,
+   -0.9458242654800415,
+   -0.0002739054325502366,
+   -0.0015444743912667036,
+   -2.169585604860913e-05,
+   -5.267784118652344,
+   -2.617713689804077,
+   -0.1205064058303833,
+   -0.000608854868914932,
+   -2.47952248173533e-05,
+   -6.116018772125244,
+   -0.06051409989595413,
+   -0.0021291938610374928,
+   -2.777537883957848e-05,
+   -0.5082104206085205,
+   -0.0008528171456418931,
+   -0.013313560746610165,
+   -9.381330892210826e-05,
+   -6.970278739929199,
+   -0.3628937304019928,
+   -1.40151047706604,
+   -0.8361061811447144,
+   -0.4778183400630951,
+   -2.494100570678711,
+   -0.3126090466976166,
+   -7.66262674331665,
+   -0.3505229353904724,
+   -2.1190404891967773,
+   -0.08990062028169632,
+   -8.201262971851975e-05,
+   -0.01644204556941986,
+   -0.1838725060224533,
+   -0.015538694337010384,
+   -0.00019107422849629074,
+   -7.915183232398704e-05,
+   -0.0001382732152706012,
+   -0.0002119316632160917,
+   -0.0004773192631546408,
+   -0.0004781533498317003,
+   -4.994744449504651e-05,
+   -0.0011807858245447278,
+   -3.0636318115284666e-05,
+   -0.0003046525234822184,
+   -0.0024103655014187098,
+   -0.009829924441874027,
+   -0.00022301571152638644,
+   -0.12844854593276978,
+   -1.1151821613311768,
+   -9.512448741588742e-05,
+   -1.1148451566696167,
+   -0.45424169301986694,
+   -7.128461584215984e-05,
+   -0.001427346607670188,
+   -1.2040065485052764e-05,
+   -3.9783990383148193,
+   -0.025781046599149704,
+   -0.00015496007108595222,
+   -0.003944831434637308,
+   -0.000663894519675523,
+   -3.015949550899677e-05,
+   -0.15718017518520355,
+   -0.0009197533945553005,
+   -0.0007913556764833629,
+   -1.8000440832111053e-05,
+   -0.18712174892425537,
+   -0.00016604475968051702,
+   -0.0022110319696366787,
+   -2.169585604860913e-05,
+   -0.014111850410699844,
+   -1.1920922133867862e-06,
+   -0.00984656810760498,
+   -0.5971966981887817,
+   -2.393812894821167,
+   -0.010224700905382633,
+   -0.009953508153557777,
+   -7.64102369430475e-05,
+   -0.011833352968096733,
+   -0.26886406540870667,
+   -0.023419089615345,
+   -0.00019762947340495884,
+   -6.031808152329177e-05,
+   -0.00010191874753218144,
+   -0.00015889335190877318,
+   -0.0003564914222806692,
+   -0.0004101150552742183,
+   -6.675497570540756e-05,
+   -0.0009184433147311211,
+   -3.158996332786046e-05,
+   -0.00031442465842701495,
+   -0.0027259355410933495,
+   -0.008694176562130451,
+   -0.00032658010604791343,
+   -0.289438933134079,
+   -2.1416351795196533,
+   -0.00017987063620239496,
+   -1.8434972763061523,
+   -1.624247670173645,
+   -0.00022980909852776676,
+   -0.0006792622152715921,
+   -1.0967194612021558e-05,
+   -1.281017541885376,
+   -0.01736496575176716,
+   -1.955749750137329,
+   -1.528749942779541,
+   -2.776960611343384,
+   -0.5374854803085327,
+   -0.00029345019720494747,
+   -2.539125671319198e-05,
+   -3.0065665245056152,
+   -0.0013523490633815527,
+   -0.0007908792467787862,
+   -1.4543427823809907e-05,
+   -0.23400214314460754,
+   -0.0002324311062693596,
+   -0.010042970068752766,
+   -4.088794958079234e-05,
+   -2.1034951210021973,
+   -6.140199184417725,
+   -4.464273929595947,
+   -1.9943883419036865,
+   -0.2878473103046417,
+   -0.05924016237258911,
+   -0.7345774173736572,
+   -0.011171765625476837,
+   -0.0002982171718031168,
+   -0.14330486953258514,
+   -0.0007319155265577137,
+   -0.0003812778159044683,
+   -0.002302616136148572,
+   -0.36087724566459656,
+   -0.08833581954240799,
+   -2.631582260131836,
+   -3.1771137714385986,
+   -0.11841163039207458,
+   -4.482168878894299e-05,
+   -0.014765388332307339,
+   -0.17005765438079834,
+   -0.010167589411139488,
+   -0.00010823617776622996,
+   -3.6477376852417365e-05,
+   -5.936446541454643e-05,
+   -0.00023493390472140163,
+   -0.0003688847064040601,
+   -0.000321336614433676,
+   -4.756337511935271e-05,
+   -0.000902007392141968,
+   -2.9205850296420977e-05,
+   -0.00024423000286333263,
+   -0.000964533886872232,
+   -0.00411722669377923,
+   -0.0002711643755901605,
+   -0.3081328868865967,
+   -0.4985820949077606,
+   -0.00018726025882642716,
+   -1.1391643285751343,
+   -0.27228832244873047,
+   -4.2914423829643056e-05,
+   -0.0012028133496642113,
+   -1.9311717551317997e-05,
+   -1.1735807657241821,
+   -0.07005516439676285,
+   -0.0024717275518924,
+   -8.618460560683161e-05,
+   -0.00016866691294126213,
+   -0.00044764988706447184,
+   -1.6093124941107817e-05,
+   -8.586283683776855,
+   -0.0002851079625543207,
+   -7.490447998046875,
+   -0.09369903802871704,
+   -0.004145600367337465,
+   -0.0008606782066635787,
+   -4.827859811484814e-05,
+   -0.7127438187599182,
+   -0.0003618539194576442,
+   -0.015226203016936779,
+   -6.401333666872233e-05,
+   -3.530060291290283,
+   -0.040570154786109924,
+   -0.7448150515556335,
+   -1.4005241394042969,
+   -0.5872946977615356,
+   -6.073245048522949,
+   -0.9850690364837646,
+   -1.4459205865859985,
+   -0.4346452057361603,
+   -4.452149868011475,
+   -0.3939701318740845,
+   -0.02252959832549095,
+   -9.440929716220126e-05,
+   -0.012161390855908394,
+   -0.25266116857528687,
+   -0.021285664290189743,
+   -0.00015770144818816334,
+   -9.870042413240299e-05,
+   -9.989239333663136e-05,
+   -0.005311425309628248,
+   -0.00032634177478030324,
+   -0.0007045170641504228,
+   -9.417090768693015e-05,
+   -0.001260558608919382,
+   -4.482168878894299e-05,
+   -0.0003833036171272397,
+   -0.0023484050761908293,
+   -0.011129915714263916,
+   -0.00040260792593471706,
+   -0.1819346845149994,
+   -1.1781600713729858,
+   -0.00033241944038309157,
+   -1.3525464534759521,
+   -1.2726483345031738,
+   -0.00018034738604910672,
+   -0.0009054613183252513,
+   -1.2040065485052764e-05,
+   -1.7329559326171875,
+   -0.009877022355794907,
+   -0.030561018735170364,
+   -0.9567705988883972,
+   -0.0002079985715681687,
+   -0.0003582789213396609,
+   -2.5510462364763953e-05,
+   -1.3376575708389282,
+   -0.043758541345596313,
+   -0.0005255748401395977,
+   -0.003921795636415482,
+   -3.9934315282152966e-05,
+   -0.013946342281997204,
+   -0.001447345013730228,
+   -0.09289155900478363,
+   -0.00028975578607060015,
+   -5.025714874267578,
+   -5.600637435913086,
+   -0.8190056681632996,
+   -2.0997657775878906,
+   -1.5471020936965942,
+   -0.2830793261528015,
+   -0.099715456366539,
+   -0.00015341058315243572,
+   -0.09538150578737259,
+   -0.9440865516662598,
+   -0.13964560627937317,
+   -0.0003178806509822607,
+   -0.00015531764074694365,
+   -0.00016640232934150845,
+   -0.00023398046323563904,
+   -0.00039081089198589325,
+   -0.0015487592900171876,
+   -0.00010716341057559475,
+   -0.0017987991450354457,
+   -3.838465272565372e-05,
+   -0.0006412595394067466,
+   -0.00545145571231842,
+   -0.02335585467517376,
+   -0.0004077318590134382,
+   -0.8720157146453857,
+   -0.10373511165380478,
+   -0.00014077626110520214,
+   -0.5180479884147644,
+   -0.17388182878494263,
+   -0.00015746307326480746,
+   -0.0043711354956030846,
+   -2.9801878554280847e-05,
+   -2.0693466663360596,
+   -0.007648942526429892,
+   -2.8729025871143676e-05,
+   -0.0003301552205812186,
+   -0.000542612629942596,
+   -3.2543604902457446e-05,
+   -0.27388375997543335,
+   -0.00043752157944254577,
+   -0.0005888396990485489,
+   -1.7762025890988298e-05,
+   -0.05423494055867195,
+   -7.915183232398704e-05,
+   -0.002435457892715931,
+   -1.1205610462639015e-05,
+   -0.01761529967188835,
+   -7.152555099310121e-07,
+   -0.005352570675313473,
+   -0.1280955821275711,
+   -2.3187625408172607,
+   -0.009216856211423874,
+   -0.008558499626815319,
+   -0.0001072826053132303,
+   -0.04680917039513588,
+   -0.5660229325294495,
+   -0.04951385408639908,
+   -0.0002015625941567123,
+   -5.8410845667822286e-05,
+   -9.440929716220126e-05,
+   -0.00014828535495325923,
+   -0.00037245964631438255,
+   -0.0008362610242329538,
+   -5.4596363042946905e-05,
+   -0.0010970771545544267,
+   -4.017272294731811e-05,
+   -0.0004563482361845672,
+   -0.0021864098962396383,
+   -0.012597862631082535,
+   -0.00036435641231946647,
+   -0.07823580503463745,
+   -1.1245288848876953,
+   -0.0001472126314183697,
+   -2.1236472129821777,
+   -0.25363627076148987,
+   -0.00011646069469861686,
+   -0.0010031197452917695,
+   -1.4662635294371285e-05,
+   -11.853788375854492,
+   -1.5205868482589722,
+   -0.0017375147435814142,
+   -0.00013374387344811112,
+   -7.155948638916016,
+   -3.82474422454834,
+   -1.2793458700180054,
+   -0.03748536482453346,
+   -0.005961020477116108,
+   -5.829164365422912e-05,
+   -3.1456170082092285,
+   -0.03318829461932182,
+   -0.008591356687247753,
+   -0.027652040123939514,
+   -0.00012885693286079913,
+   -1.5415722131729126,
+   -0.979039192199707,
+   -2.842726469039917,
+   -9.05957317352295,
+   -2.8234424591064453,
+   -0.8373243808746338,
+   -0.4019332230091095,
+   -0.0004048719711136073,
+   -0.03923225402832031,
+   -0.4254666864871979,
+   -0.027653662487864494,
+   -0.0003177614707965404,
+   -0.0001967951684491709,
+   -0.00020883286197204143,
+   -0.00025674383505247533,
+   -0.0008311392739415169,
+   -0.0012284121476113796,
+   -0.00010787858627736568,
+   -0.0024356956128031015,
+   -6.258291978156194e-05,
+   -0.00048565989709459245,
+   -0.0021678535267710686,
+   -0.012607751414179802,
+   -0.00023588736075907946,
+   -0.11036524921655655,
+   -0.5750182867050171,
+   -0.00017176583060063422,
+   -1.9862632751464844,
+   -1.2351702451705933,
+   -0.00037520044133998454,
+   -0.0013566347770392895,
+   -2.5152843591058627e-05,
+   -2.1086387634277344,
+   -7.917232990264893,
+   -0.05708145350217819,
+   -0.06208256632089615,
+   -0.000644237850792706,
+   -8.308542601298541e-05,
+   -5.1276655197143555,
+   -0.16815905272960663,
+   -0.0012461524456739426,
+   -5.94836674281396e-05,
+   -3.559391736984253,
+   -5.411561965942383,
+   -0.022293083369731903,
+   -0.0005644158809445798,
+   -0.017552750185132027,
+   -0.00038842763751745224,
+   -1.8479862213134766,
+   -0.004095145035535097,
+   -11.830594062805176,
+   -0.4279360771179199,
+   -3.7062158584594727,
+   -2.9457836151123047,
+   -1.9491567611694336,
+   -0.06489256024360657,
+   -0.00013660451804753393,
+   -0.012157151475548744,
+   -0.22074609994888306,
+   -0.021073833107948303,
+   -0.00021300431399140507,
+   -0.00017593742813915014,
+   -0.00023672162205912173,
+   -0.0003091811086051166,
+   -0.0014552014181390405,
+   -0.0013881819322705269,
+   -0.00015245705435518175,
+   -0.002331279218196869,
+   -5.4238757002167404e-05,
+   -0.000668659748043865,
+   -0.002430463209748268,
+   -0.016187194734811783,
+   -0.0002441108226776123,
+   -1.4263010025024414,
+   -0.30179885029792786,
+   -0.0001770101225702092,
+   -0.5045080184936523,
+   -0.07310019433498383,
+   -8.022463589441031e-05,
+   -0.002168329432606697,
+   -2.3841574147809297e-05,
+   -1.7808306217193604,
+   -0.02828705683350563,
+   -6.115249561844394e-05,
+   -0.0008904544520191848,
+   -0.0005335576133802533,
+   -3.957670196541585e-05,
+   -0.03801318258047104,
+   -0.0003077510336879641,
+   -0.0005035324720665812,
+   -2.169585604860913e-05,
+   -0.02271897904574871,
+   -3.1709168979432434e-05,
+   -0.0018041539005935192,
+   -1.8358061424805783e-05,
+   -0.005899516865611076,
+   -1.1920922133867862e-06,
+   -0.002030455507338047,
+   -0.27544423937797546,
+   -1.1146715879440308,
+   -0.012286689132452011,
+   -0.004974251613020897,
+   -6.389413465512916e-05,
+   -0.010529793798923492,
+   -0.2302529364824295,
+   -0.015527778305113316,
+   -0.00019524575327523053,
+   -6.389413465512916e-05,
+   -0.00013815402053296566,
+   -0.00018165845540352166,
+   -0.0005564333405345678,
+   -0.000959531927946955,
+   -6.151010165922344e-05,
+   -0.001416394836269319,
+   -5.531158240046352e-05,
+   -0.00035363141796551645,
+   -0.0010683787986636162,
+   -0.012577733024954796,
+   -0.00023934361524879932,
+   -0.06311207264661789,
+   -0.972044050693512,
+   -0.00019929806876461953,
+   -1.6224243640899658,
+   -0.8333836197853088,
+   -0.00016592556494288146,
+   -0.0008984343148767948,
+   -1.6927575416048057e-05,
+   -0.8844207525253296,
+   -0.023736946284770966,
+   -4.01811408996582,
+   -1.6215615272521973,
+   -0.33087965846061707,
+   -0.0035197706893086433,
+   -0.00024148885859176517,
+   -3.0874729418428615e-05,
+   -3.097301721572876,
+   -0.030017103999853134,
+   -0.0006585336523130536,
+   -1.9430925021879375e-05,
+   -0.49424058198928833,
+   -0.0001401803019689396,
+   -0.00554167665541172,
+   -1.9073304429184645e-05,
+   -0.5312279462814331,
+   -5.748266220092773,
+   -11.324613571166992,
+   -1.1340491771697998,
+   -0.16082678735256195,
+   -0.8938052654266357,
+   -3.726792335510254,
+   -0.8781039714813232,
+   -0.00017355366435367614,
+   -0.009945128113031387,
+   -0.18626560270786285,
+   -0.013042616657912731,
+   -0.00010859376925509423,
+   -7.199982064776123e-05,
+   -0.00010871296399272978,
+   -0.00017796363681554794,
+   -0.00034767304896377027,
+   -0.0006170752458274364,
+   -3.0636318115284666e-05,
+   -0.001077071763575077,
+   -4.076874756719917e-05,
+   -0.00024029705673456192,
+   -0.000982159748673439,
+   -0.02636047638952732,
+   -0.00021920185827184469,
+   -0.632880687713623,
+   -0.06617539376020432,
+   -0.00016318420239258558,
+   -0.4156720042228699,
+   -0.034620899707078934,
+   -5.6622808187967166e-05,
+   -0.0011695933062583208,
+   -1.597391747054644e-05,
+   -10.639490127563477,
+   -0.24528348445892334,
+   -0.06833283603191376,
+   -0.0033608165103942156,
+   -0.02616957761347294,
+   -0.00036054308293387294,
+   -3.099393507000059e-05,
+   -4.044595241546631,
+   -2.188387393951416,
+   -0.32720163464546204,
+   -0.00974209699779749,
+   -0.0011126763420179486,
+   -3.302042750874534e-05,
+   -0.19868847727775574,
+   -7.56950321374461e-05,
+   -0.005233398173004389,
+   -3.158996332786046e-05,
+   -1.839617371559143,
+   -0.17654305696487427,
+   -0.7875567078590393,
+   -2.1537787914276123,
+   -0.3631034195423126,
+   -0.9216613173484802,
+   -2.0036990642547607,
+   -0.09243497252464294,
+   -0.00010740180005086586,
+   -0.018314307555556297,
+   -0.208140030503273,
+   -0.01576320081949234,
+   -0.00013136000779923052,
+   -7.390703103737906e-05,
+   -0.00011264643399044871,
+   -0.00017045476124621928,
+   -0.0005171154043637216,
+   -0.0005422552349045873,
+   -3.349725011503324e-05,
+   -0.0013309201458469033,
+   -4.255681051290594e-05,
+   -0.00023767507809679955,
+   -0.001095648156479001,
+   -0.14277544617652893,
+   -0.00021371940965764225,
+   -0.00032217081752605736,
+   -0.35286909341812134,
+   -0.0002668739762157202,
+   -1.7962173223495483,
+   -0.07211553305387497,
+   -7.974783511599526e-05,
+   -0.000621959799900651,
+   -1.2874520507466514e-05,
+   -1.9048426151275635,
+   -0.022713735699653625,
+   -3.9457496313843876e-05,
+   -0.0005820487276650965,
+   -0.0002401778765488416,
+   -3.325883881188929e-05,
+   -0.02081700973212719,
+   -0.00022492263815365732,
+   -0.0003299168893136084,
+   -2.038458114839159e-05,
+   -0.008293120190501213,
+   -1.7404405298293568e-05,
+   -0.0012493670219555497,
+   -1.4424220353248529e-05,
+   -0.0041636452078819275,
+   -8.344646857949556e-07,
+   -0.0020267677027732134,
+   -0.13429519534111023,
+   -1.9221405982971191,
+   -0.0093602379783988,
+   -0.005981876514852047,
+   -5.817244164063595e-05,
+   -0.019257837906479836,
+   -0.27827900648117065,
+   -0.01921457052230835,
+   -0.0001652104256208986,
+   -8.546940807718784e-05,
+   -0.0001510267611593008,
+   -0.00016366096679121256,
+   -0.0002616301644593477,
+   -0.0005458295345306396,
+   -3.480850500636734e-05,
+   -0.0010807631770148873,
+   -3.7431014789035544e-05,
+   -0.0003626880934461951,
+   -0.0010880271438509226,
+   -0.6327179670333862,
+   -0.0002374367177253589,
+   -0.020488178357481956,
+   -0.10384052991867065,
+   -0.0001971527235582471,
+   -0.16368740797042847,
+   -0.026392173022031784,
+   -0.00012170527770649642,
+   -0.0025978884659707546,
+   -1.9430925021879375e-05,
+   -7.9701642990112305,
+   -1.6003714799880981,
+   -0.2391909956932068,
+   -0.000502817565575242,
+   -4.9232225137529895e-05,
+   -4.135532855987549,
+   -0.06158669665455818,
+   -0.00044371772673912346,
+   -3.755022044060752e-05,
+   -0.18109248578548431,
+   -0.00010883215873036534,
+   -0.006367869209498167,
+   -7.748303323751315e-05,
+   -5.440160751342773,
+   -5.081888198852539,
+   -0.19470839202404022,
+   -2.9904420375823975,
+   -2.4235076904296875,
+   -0.032352350652217865,
+   -0.00044907975825481117,
+   -0.04121795669198036,
+   -0.43260514736175537,
+   -0.04605478420853615,
+   -0.00023982033599168062,
+   -0.0003178806509822607,
+   -0.00017188502533826977,
+   -0.00022468426323030144,
+   -0.0003400462737772614,
+   -0.0010152667528018355,
+   -0.00011729506513802335,
+   -0.001335324952378869,
+   -4.8874615458771586e-05,
+   -0.001257463125512004,
+   -0.004097400698810816,
+   -0.0008996253600344062,
+   -0.0002967870968859643,
+   -0.15579743683338165,
+   -1.3731565475463867,
+   -0.00023183519078884274,
+   -2.0089190006256104,
+   -3.441042423248291,
+   -0.0006145734223537147,
+   -0.0012832987122237682,
+   -1.9550132492440753e-05,
+   -1.731110692024231,
+   -0.027068600058555603,
+   -2.8266828060150146,
+   -0.35935577750205994,
+   -0.023644626140594482,
+   -0.0005504761938937008,
+   -0.00017951308109331876,
+   -2.396077979938127e-05,
+   -2.3206820487976074,
+   -0.003744971938431263,
+   -0.000205018965061754,
+   -2.288792165927589e-05,
+   -0.08958229422569275,
+   -6.592056161025539e-05,
+   -0.0021721357479691505,
+   -3.0397906812140718e-05,
+   -4.5939412117004395,
+   -8.534799575805664,
+   -3.483549118041992,
+   -1.681600570678711,
+   -0.7201917767524719,
+   -0.530266284942627,
+   -0.7154921293258667,
+   -2.835704803466797,
+   -0.0004451475979294628,
+   -0.02453603409230709,
+   -0.31538400053977966,
+   -0.0156102878972888,
+   -0.00013124081306159496,
+   -8.892617915989831e-05,
+   -9.738924563862383e-05,
+   -0.0011036264477297664,
+   -0.00030357998912222683,
+   -0.0010406322544440627,
+   -6.0437283536884934e-05,
+   -0.0014225849881768227,
+   -3.671578815556131e-05,
+   -0.00044705410255119205,
+   -0.005232923664152622,
+   -0.0001565095444675535,
+   -0.0003033416287507862,
+   -0.18575794994831085,
+   -0.14061033725738525,
+   -0.0002706876548472792,
+   -0.5223819017410278,
+   -0.035896092653274536,
+   -5.4834770708112046e-05,
+   -0.0012011463986709714,
+   -1.6569954823353328e-05,
+   -1.681032657623291,
+   -0.011652856133878231,
+   -1.6569954823353328e-05,
+   -0.00047469791024923325,
+   -0.000256982195423916,
+   -3.361645576660521e-05,
+   -0.01372707262635231,
+   -0.00014852374442853034,
+   -0.00046695294440723956,
+   -2.288792165927589e-05,
+   -0.0034659572411328554,
+   -1.3708974620385561e-05,
+   -0.0015382850542664528,
+   -8.702239938429557e-06,
+   -0.003346678102388978,
+   -7.152555099310121e-07,
+   -0.000867467257194221,
+   -0.02539108693599701,
+   -1.0509589910507202,
+   -0.002976156771183014,
+   -0.005069141276180744,
+   -5.590759246842936e-05,
+   -0.015196850523352623,
+   -0.3093729317188263,
+   -0.02090352028608322,
+   -0.00013958434283267707,
+   -6.460934673668817e-05,
+   -8.296622399939224e-05,
+   -0.0004457433824427426,
+   -0.0005041282274760306,
+   -0.0011976935202255845,
+   -4.2914423829643056e-05,
+   -0.0011085085570812225,
+   -4.160317621426657e-05,
+   -0.0005018643569201231,
+   -0.004558410029858351,
+   -9.476689592702314e-05,
+   -0.00037269797758199275,
+   -0.11347992718219757,
+   -0.450020968914032,
+   -0.0003301552205812186,
+   -2.8804092407226562,
+   -0.15156973898410797,
+   -6.246371776796877e-05,
+   -0.000683074293192476,
+   -1.3947389561508317e-05,
+   -2.0683939456939697,
+   -0.02846144698560238,
+   -0.04469490796327591,
+   -1.889275074005127,
+   -0.0001255195093108341,
+   -0.00011228884250158444,
+   -2.4914430468925275e-05,
+   -7.980701446533203,
+   -0.39261865615844727,
+   -1.6454169750213623,
+   -0.0018256916664540768,
+   -0.0003761537664104253,
+   -2.5987286790041253e-05,
+   -0.27152737975120544,
+   -3.8742269680369645e-05,
+   -0.002314033918082714,
+   -5.364274329622276e-05,
+   -5.172288417816162,
+   -0.007181781344115734,
+   -0.8884671330451965,
+   -0.20681926608085632,
+   -1.529428243637085,
+   -2.335056781768799,
+   -0.02583100087940693,
+   -1.8960939645767212,
+   -0.257112592458725,
+   -0.1720065474510193,
+   -8.284702198579907e-05,
+   -0.011070851236581802,
+   -0.16333311796188354,
+   -0.01678428426384926,
+   -0.00010024998482549563,
+   -4.911301948595792e-05,
+   -6.41325386823155e-05,
+   -0.0003518439189065248,
+   -0.0003983181086368859,
+   -0.0007211944903247058,
+   -2.253030106658116e-05,
+   -0.0009076051646843553,
+   -2.884823152271565e-05,
+   -0.00033682872890494764,
+   -0.01127432007342577,
+   -5.113947918289341e-05,
+   -0.0003095386200584471,
+   -0.162703275680542,
+   -0.12824533879756927,
+   -0.0002037079248111695,
+   -0.5378345251083374,
+   -0.013359789736568928,
+   -4.625213477993384e-05,
+   -0.0007819455349817872,
+   -1.2993727978027891e-05,
+   -1.4531102180480957,
+   -0.9376159310340881,
+   -0.02013481967151165,
+   -3.182837463100441e-05,
+   -0.00028391621890477836,
+   -0.0002040654799202457,
+   -1.6212332411669195e-05,
+   -6.006290435791016,
+   -0.23482508957386017,
+   -0.0003094194398727268,
+   -3.2066785934148356e-05,
+   -0.2894707918167114,
+   -0.00010334911348763853,
+   -0.003178308717906475,
+   -4.8397800128441304e-05,
+   -3.3541419506073,
+   -5.274465084075928,
+   -2.3055055141448975,
+   -1.0987294912338257,
+   -0.019666209816932678,
+   -0.00022790218645241112,
+   -0.016233760863542557,
+   -0.2816391885280609,
+   -0.028503969311714172,
+   -0.0001358893496217206,
+   -0.00010394509445177391,
+   -8.856858039507642e-05,
+   -0.00036137725692242384,
+   -0.00029452278977259994,
+   -0.0008922410197556019,
+   -2.539125671319198e-05,
+   -0.0011102947173640132,
+   -3.40932747349143e-05,
+   -0.0004843492351938039,
+   -0.006350101437419653,
+   -5.9602869441732764e-05,
+   -0.0002796259068418294,
+   -0.3986394703388214,
+   -0.10029242187738419,
+   -0.00024196557933464646,
+   -1.9691603183746338,
+   -0.7402586936950684,
+   -7.056941103655845e-05,
+   -0.0003618539194576442,
+   -1.0371154530730564e-05,
+   -1.4170231819152832,
+   -0.008172051049768925,
+   -1.3708974620385561e-05,
+   -0.00041607304592616856,
+   -0.00014888131408952177,
+   -2.6464111215318553e-05,
+   -0.018121162429451942,
+   -0.00010764019680209458,
+   -0.0002335037279408425,
+   -2.3007127310847864e-05,
+   -0.002049014437943697,
+   -1.0609570381348021e-05,
+   -0.0011868583969771862,
+   -7.867782187531702e-06,
+   -0.0018794744974002242,
+   -5.960462772236497e-07,
+   -0.0007434703293256462,
+   -0.02911354973912239,
+   -1.7920753955841064,
+   -0.0026135831139981747,
+   -0.00308870617300272,
+   -3.659658250398934e-05,
+   -0.010810147039592266,
+   -0.20098412036895752,
+   -0.01644638366997242,
+   -0.00013207517622504383,
+   -6.854299135738984e-05,
+   -7.152301259338856e-05,
+   -0.00024720950750634074,
+   -0.00033468366018496454,
+   -0.0010001424234360456,
+   -5.054346183896996e-05,
+   -0.0009557208395563066,
+   -3.981510963058099e-05,
+   -0.0004465774691198021,
+   -0.011578621342778206,
+   -7.211902266135439e-05,
+   -0.0002416080387774855,
+   -0.09539440274238586,
+   -0.057392168790102005,
+   -0.0002840353990904987,
+   -0.21088920533657074,
+   -0.0078902468085289,
+   -8.606540359323844e-05,
+   -0.0007384672062471509,
+   -1.3589766240329482e-05,
+   -0.8148440718650818,
+   -0.025661379098892212,
+   -2.113894462585449,
+   -0.01820814050734043,
+   -0.0010720703285187483,
+   -0.0002908283786382526,
+   -0.00011181206355104223,
+   -1.9550132492440753e-05,
+   -1.9963352680206299,
+   -0.011685965582728386,
+   -0.00010299152199877426,
+   -1.6093124941107817e-05,
+   -0.3427979350090027,
+   -0.00010358751023886725,
+   -0.002419165801256895,
+   -5.07818695041351e-05,
+   -9.356146812438965,
+   -2.63590145111084,
+   -0.0489899143576622,
+   -0.429649293422699,
+   -2.441277027130127,
+   -0.09116854518651962,
+   -1.7202471494674683,
+   -1.2776923179626465,
+   -1.2828468084335327,
+   -0.1033272072672844,
+   -0.013413426466286182,
+   -0.00016091958968900144,
+   -0.006314327474683523,
+   -0.1650361269712448,
+   -0.009155434556305408,
+   -8.630380034446716e-05,
+   -6.007967749610543e-05,
+   -6.210611172718927e-05,
+   -0.00027497802511788905,
+   -0.0005628670332953334,
+   -0.0008046964649111032,
+   -4.160317621426657e-05,
+   -0.0009633429581299424,
+   -2.9444261599564925e-05,
+   -0.0003147821989841759,
+   -0.003070523263886571,
+   -3.969590397900902e-05,
+   -0.00025340684805996716,
+   -0.16765674948692322,
+   -0.220333993434906,
+   -0.00025281094713136554,
+   -1.6686129570007324,
+   -0.08651255071163177,
+   -7.4741430580616e-05,
+   -0.00032062159152701497,
+   -9.536697689327411e-06,
+   -8.607754707336426,
+   -2.7989468574523926,
+   -0.006830438040196896,
+   -0.00042500998824834824,
+   -4.410646579344757e-05,
+   -2.2325727939605713,
+   -0.09642884135246277,
+   -0.0005049622268415987,
+   -1.4662635294371285e-05,
+   -3.892613172531128,
+   -0.0008376903715543449,
+   -0.004279621876776218,
+   -5.745722592109814e-05,
+   -2.696786642074585,
+   -0.44925373792648315,
+   -0.37875908613204956,
+   -0.27114248275756836,
+   -1.023728609085083,
+   -4.712882995605469,
+   -1.415423035621643,
+   -2.8054561614990234,
+   -0.4460236430168152,
+   -0.0005779979983344674,
+   -0.02468189038336277,
+   -0.30965328216552734,
+   -0.02052520029246807,
+   -0.00012730741582345217,
+   -9.619726915843785e-05,
+   -8.749579137656838e-05,
+   -0.000350175570929423,
+   -0.0003150205302517861,
+   -0.0007310817018151283,
+   -3.0636318115284666e-05,
+   -0.0011643542675301433,
+   -3.2305197237292305e-05,
+   -0.00026913834153674543,
+   -0.011463016271591187,
+   -5.411955135059543e-05,
+   -0.00023231192608363926,
+   -0.1063343733549118,
+   -0.037034809589385986,
+   -0.0001248043408850208,
+   -0.3663400411605835,
+   -0.01425135973840952,
+   -5.376194530981593e-05,
+   -0.000933926145080477,
+   -1.4305012882687151e-05,
+   -1.5244930982589722,
+   -0.008558854460716248,
+   -1.8358061424805783e-05,
+   -0.0002698534226510674,
+   -0.00022075122979003936,
+   -3.576214658096433e-05,
+   -0.01590365171432495,
+   -0.00012706902634818107,
+   -0.0002901133266277611,
+   -2.2649508537142538e-05,
+   -0.0032194233499467373,
+   -1.1920858014491387e-05,
+   -0.0013312773080542684,
+   -8.22540732769994e-06,
+   -0.001732040662318468,
+   -4.768370445162873e-07,
+   -0.0007115454645827413,
+   -0.11607333272695541,
+   -5.158000946044922,
+   -0.00630958890542388,
+   -0.006455875933170319,
+   -3.886147169396281e-05,
+   -0.007113605737686157,
+   -0.16176439821720123,
+   -0.01025608740746975,
+   -9.321732068201527e-05,
+   -5.435795901576057e-05,
+   -7.70062324590981e-05,
+   -0.0002002515539061278,
+   -0.0003270567976869643,
+   -0.0011002921964973211,
+   -3.93382906622719e-05,
+   -0.0009735850035212934,
+   -4.076874756719917e-05,
+   -0.00036042393185198307,
+   -0.011448992416262627,
+   -0.00010787858627736568,
+   -0.00022289653134066612,
+   -0.12719827890396118,
+   -0.16689445078372955,
+   -0.00029869386344216764,
+   -1.129071831703186,
+   -0.46998509764671326,
+   -0.0001429217227268964,
+   -0.0004334702098276466,
+   -1.823885577323381e-05,
+   -7.808990478515625,
+   -0.6958405375480652,
+   -0.0011538759572431445,
+   -0.00010084597306558862,
+   -2.1815061700181104e-05,
+   -3.412889242172241,
+   -0.0024302254896610975,
+   -0.1256120651960373,
+   -0.0001486429391661659,
+   -2.932505594799295e-05,
+   -0.016119161620736122,
+   -2.1219027985353023e-05,
+   -0.0014936492079868913,
+   -6.794906312279636e-06,
+   -4.649867057800293,
+   -0.42487168312072754,
+   -1.3419163227081299,
+   -0.3015914857387543,
+   -0.00015341058315243572,
+   -0.0032649326603859663,
+   -0.11564143747091293,
+   -0.00739337969571352,
+   -5.8887653722194955e-05,
+   -6.615896563744172e-05,
+   -5.972207145532593e-05,
+   -0.00020644917094614357,
+   -0.000301673193462193,
+   -0.0003761537664104253,
+   -2.6702524337451905e-05,
+   -0.0008094609947875142,
+   -3.2305197237292305e-05,
+   -0.0002474478678777814,
+   -0.018454870209097862,
+   -7.73638384998776e-05,
+   -0.00022837892174720764,
+   -0.04869883507490158,
+   -0.02372216247022152,
+   -0.0002051381452474743,
+   -0.15266406536102295,
+   -0.0037327392492443323,
+   -7.557583012385294e-05,
+   -0.0005665604257956147,
+   -1.4662635294371285e-05,
+   -2.1065256595611572,
+   -0.02570541389286518,
+   -2.0099081993103027,
+   -2.7118430137634277,
+   -0.1484161764383316,
+   -0.007964756339788437,
+   -0.00016342257731594145,
+   -1.597391747054644e-05,
+   -0.8920754194259644,
+   -0.0009690594743005931,
+   -0.00029023250681348145,
+   -1.2993727978027891e-05,
+   -0.07993864268064499,
+   -5.400034933700226e-05,
+   -0.00158791767898947,
+   -1.0609570381348021e-05,
+   -4.331461429595947,
+   -6.81968355178833,
+   -3.366002082824707,
+   -1.850673794746399,
+   -0.00040391870425082743,
+   -0.04611193388700485,
+   -0.06791424006223679,
+   -0.004945189692080021,
+   -9.107174992095679e-05,
+   -7.557583012385294e-05,
+   -6.747018051100895e-05,
+   -0.00024399164249189198,
+   -0.000321336614433676,
+   -0.0006528153317049146,
+   -3.2782016205601394e-05,
+   -0.0012151960982009768,
+   -3.957670196541585e-05,
+   -0.0002205128694185987,
+   -0.016214992851018906,
+   -0.00019095504831057042,
+   -0.0001456631434848532,
+   -7.712543447269127e-05,
+   -0.33043625950813293,
+   -0.00017629499780014157,
+   -2.590480089187622,
+   -0.16181793808937073,
+   -0.00011646069469861686,
+   -0.0006735440110787749,
+   -2.109982233378105e-05,
+   -1.6486821174621582,
+   -0.01151864044368267,
+   -1.8954096958623268e-05,
+   -0.0003233625029679388,
+   -0.00020644917094614357,
+   -3.111314072157256e-05,
+   -0.017416512593626976,
+   -0.00012766500003635883,
+   -0.0003415954706724733,
+   -2.13382354559144e-05,
+   -0.006446637213230133,
+   -1.823885577323381e-05,
+   -0.0012438902631402016,
+   -1.1205610462639015e-05,
+   -0.006591127719730139,
+   -7.152555099310121e-07,
+   -0.0017049076268449426,
+   -0.13135236501693726,
+   -3.228759288787842,
+   -0.002643782878294587,
+   -0.004842340014874935,
+   -3.480850500636734e-05,
+   -0.010503842495381832,
+   -0.16338221728801727,
+   -0.011769498698413372,
+   -0.00011574551899684593,
+   -9.727005090098828e-05,
+   -8.582700684200972e-05,
+   -0.0004538459761533886,
+   -0.00020740265608765185,
+   -0.001342587056569755,
+   -8.964136941358447e-05,
+   -0.0014018717920407653,
+   -4.935142715112306e-05,
+   -0.0006431656656786799,
+   -0.5765135288238525,
+   -0.0009291622554883361,
+   -0.00027998341829515994,
+   -0.008964410983026028,
+   -0.03303813934326172,
+   -0.00018451895448379219,
+   -0.07687719166278839,
+   -0.00454594986513257,
+   -0.00018439977429807186,
+   -0.0023830130230635405,
+   -2.706014311115723e-05,
+   -1.8103313446044922,
+   -0.7522969245910645,
+   -0.022507335990667343,
+   -2.074220174108632e-05,
+   -0.00026222606538794935,
+   -0.00020740265608765185,
+   -2.706014311115723e-05,
+   -3.700786590576172,
+   -0.26737019419670105,
+   -9.357491217087954e-05,
+   -6.031808152329177e-05,
+   -0.13705354928970337,
+   -2.407998726994265e-05,
+   -0.003684044349938631,
+   -3.2782016205601394e-05,
+   -2.9476141929626465,
+   -1.1526018381118774,
+   -2.6757259368896484,
+   -5.31315279006958,
+   -0.7695194482803345,
+   -0.00014876213390380144,
+   -0.8328413963317871,
+   -5.100983142852783,
+   -0.1275785118341446,
+   -0.008235306479036808,
+   -0.00037281715776771307,
+   -0.02394961006939411,
+   -0.5179875493049622,
+   -0.04619366303086281,
+   -0.00021705655672121793,
+   -0.00021765247220173478,
+   -0.0001461399078834802,
+   -0.0007413261337205768,
+   -0.0006660388899035752,
+   -0.0015581621555611491,
+   -6.8662193370983e-05,
+   -0.002233869396150112,
+   -4.494089080253616e-05,
+   -0.0006101653561927378,
+   -0.0006289887824095786,
+   -0.0033358661457896233,
+   -0.00045074793160893023,
+   -0.15180595219135284,
+   -0.07985830307006836,
+   -0.00015937011630740017,
+   -2.2477855682373047,
+   -0.4471043348312378,
+   -0.0001734344696160406,
+   -0.0006040894077159464,
+   -1.680836794548668e-05,
+   -2.318458080291748,
+   -0.01888836920261383,
+   -0.029085876420140266,
+   -1.1253407001495361,
+   -0.00021741411183029413,
+   -0.00012003655137959868,
+   -2.8013790142722428e-05,
+   -3.1507949829101562,
+   -0.005721264518797398,
+   -0.00040904260822571814,
+   -1.7881233361549675e-05,
+   -0.04304421693086624,
+   -0.0001591317413840443,
+   -0.005429995711892843,
+   -3.242440288886428e-05,
+   -4.896542549133301,
+   -3.2877321243286133,
+   -0.17550288140773773,
+   -8.526089668273926,
+   -0.2559642493724823,
+   -0.00015770144818816334,
+   -0.004955509677529335,
+   -0.20714037120342255,
+   -0.023553114384412766,
+   -0.00015496007108595222,
+   -0.0001134808044298552,
+   -9.250213042832911e-05,
+   -0.000288087350782007,
+   -0.0004409771354403347,
+   -0.0007110689766705036,
+   -4.6132929128361866e-05,
+   -0.0009153467253781855,
+   -3.433168603805825e-05,
+   -0.00015484087634831667,
+   -0.0001292145170737058,
+   -0.0022287548054009676,
+   -0.0002269487304147333,
+   -0.11395295709371567,
+   -0.05913611873984337,
+   -8.356221951544285e-05,
+   -0.4039720594882965,
+   -0.019538793712854385,
+   -5.924526340095326e-05,
+   -0.0007176207727752626,
+   -1.7881233361549675e-05,
+   -1.6992816925048828,
+   -0.004352619871497154,
+   -6.6756979322235566e-06,
+   -0.00017093151109293103,
+   -0.0001284993631998077,
+   -3.3378044463461265e-05,
+   -0.013412484899163246,
+   -8.713819261174649e-05,
+   -0.0004928089329041541,
+   -2.288792165927589e-05,
+   -0.0012643685331568122,
+   -1.3351351299206726e-05,
+   -0.0019104102393612266,
+   -8.940656698541716e-06,
+   -0.0033124599140137434,
+   -4.768370445162873e-07,
+   -0.0009848987683653831,
+   -0.07256874442100525,
+   -1.7665941715240479,
+   -0.00281461956910789,
+   -0.0027610058896243572,
+   -2.9682672902708873e-05,
+   -0.0075036585330963135,
+   -0.16648568212985992,
+   -0.014109030365943909,
+   -9.63164638960734e-05,
+   -6.603976362384856e-05,
+   -7.331102824537084e-05,
+   -0.0003323002893012017,
+   -0.00042083943844772875,
+   -0.0010620674584060907,
+   -2.8609820219571702e-05,
+   -0.000990257947705686,
+   -4.029192859889008e-05,
+   -0.0001541257370263338,
+   -0.0001658063702052459,
+   -0.0010433712741360068,
+   -0.0002379134384682402,
+   -0.08282912522554398,
+   -0.1620505303144455,
+   -0.0001578206429257989,
+   -1.9873682260513306,
+   -0.03700195625424385,
+   -8.594620157964528e-05,
+   -0.00035232058144174516,
+   -2.90866428258596e-05,
+   -1.0645859241485596,
+   -0.012771833688020706,
+   -1.8788448572158813,
+   -0.04745874181389809,
+   -0.0029150634072721004,
+   -0.0002858230145648122,
+   -8.082063141046092e-05,
+   -2.8729025871143676e-05,
+   -4.2793378829956055,
+   -0.008196880109608173,
+   -9.822363062994555e-05,
+   -4.9470632802695036e-05,
+   -5.399019241333008,
+   -0.0015862513100728393,
+   -0.0018035589018836617,
+   -2.9444261599564925e-05,
+   -3.8089842796325684,
+   -1.3950530290603638,
+   -0.17507919669151306,
+   -4.1786346435546875,
+   -9.410017013549805,
+   -0.00014709345123264939,
+   -2.16685152053833,
+   -0.5008745193481445,
+   -0.013433892279863358,
+   -0.00029976642690598965,
+   -0.006172403693199158,
+   -0.22438427805900574,
+   -0.015963135287165642,
+   -0.00010489867418073118,
+   -7.426462980220094e-05,
+   -6.890059739816934e-05,
+   -0.0002874914789572358,
+   -0.0004033228906337172,
+   -0.0006624649395234883,
+   -3.802703940891661e-05,
+   -0.001104817260056734,
+   -2.8967437174287625e-05,
+   -0.000125281119835563,
+   -0.00011634149996098131,
+   -0.0016071987338364124,
+   -0.0001752223033690825,
+   -0.04927569255232811,
+   -0.03999283164739609,
+   -8.427741704508662e-05,
+   -0.11036300659179688,
+   -0.0022922686766833067,
+   -5.125868119648658e-05,
+   -0.0007711059297434986,
+   -1.6569954823353328e-05,
+   -1.1996040344238281,
+   -6.017496585845947,
+   -3.3771719932556152,
+   -0.0015197168104350567,
+   -0.0001720042055239901,
+   -8.05822346592322e-05,
+   -1.9701510667800903,
+   -0.015215284191071987,
+   -0.00046957432641647756,
+   -4.5536911784438416e-05,
+   -0.3501690626144409,
+   -6.508615479106084e-05,
+   -0.013412720523774624,
+   -0.0002317160106031224,
+   -10.721491813659668,
+   -0.001794158248230815,
+   -5.900764465332031,
+   -0.05698608234524727,
+   -1.9666205644607544,
+   -0.34450024366378784,
+   -0.24932177364826202,
+   -1.1890842914581299,
+   -0.9316995143890381,
+   -0.5700393915176392,
+   -0.18522746860980988,
+   -0.08411185443401337,
+   -0.00032610344351269305,
+   -0.016760369762778282,
+   -0.310769647359848,
+   -0.04111167788505554,
+   -0.00015889335190877318,
+   -0.00011395759065635502,
+   -0.00010418349120300263,
+   -0.0003389737685211003,
+   -0.0006182666402310133,
+   -0.001039679627865553,
+   -6.770858453819528e-05,
+   -0.001258891774341464,
+   -5.876845170860179e-05,
+   -0.0003499372396618128,
+   -0.00027724236133508384,
+   -0.0029526231810450554,
+   -0.0003165697562508285,
+   -0.25983527302742004,
+   -0.031029406934976578,
+   -0.00018880968855228275,
+   -0.7229459881782532,
+   -0.42579957842826843,
+   -0.00011705666838679463,
+   -0.00047195740626193583,
+   -2.3364747903542593e-05,
+   -0.9790778160095215,
+   -0.0029993331991136074,
+   -5.125986263010418e-06,
+   -0.00018690270371735096,
+   -0.00016091958968900144,
+   -3.755022044060752e-05,
+   -0.00900670699775219,
+   -8.642300235806033e-05,
+   -0.0004804172203876078,
+   -3.838465272565372e-05,
+   -0.0015756584471091628,
+   -1.168244216387393e-05,
+   -0.001709667849354446,
+   -1.0013530300057027e-05,
+   -0.0022142434027045965,
+   -5.960462772236497e-07,
+   -0.0006964165368117392,
+   -0.05425402522087097,
+   -1.5528278350830078,
+   -0.002721655648201704,
+   -0.003402280155569315,
+   -3.6477376852417365e-05,
+   -0.007222968153655529,
+   -0.14785511791706085,
+   -0.013813492842018604,
+   -0.00012063252506777644,
+   -9.738924563862383e-05,
+   -9.881961887003854e-05,
+   -0.00025900822947733104,
+   -0.00028236693469807506,
+   -0.0010882653295993805,
+   -4.446407547220588e-05,
+   -0.0008232779800891876,
+   -4.7801782784517854e-05,
+   -0.0001911934232339263,
+   -0.00020382710499688983,
+   -0.0037347583565860987,
+   -0.00023493390472140163,
+   -0.016995148733258247,
+   -0.028428077697753906,
+   -0.00015054999676067382,
+   -0.05958176776766777,
+   -0.0022499265614897013,
+   -8.928377064876258e-05,
+   -0.0007566926069557667,
+   -2.038458114839159e-05,
+   -6.74626350402832,
+   -4.031385898590088,
+   -0.010314728133380413,
+   -0.0005830018781125546,
+   -0.00016175392374861985,
+   -4.279521817807108e-05,
+   -4.910806655883789,
+   -0.3867932856082916,
+   -0.00020466140995267779,
+   -2.455681169521995e-05,
+   -0.40993309020996094,
+   -3.075552376685664e-05,
+   -0.002136925933882594,
+   -1.5258672647178173e-05,
+   -1.4743690490722656,
+   -0.466409295797348,
+   -2.986236095428467,
+   -0.5145793557167053,
+   -0.3861558437347412,
+   -0.00023648326168768108,
+   -0.060666244477033615,
+   -0.0004374024283606559,
+   -0.0032959445379674435,
+   -0.003968104254454374,
+   -0.0018072477541863918,
+   -4.768258077092469e-05,
+   -0.9783220291137695,
+   -1.0383716821670532,
+   -0.6705473065376282,
+   -2.172899007797241,
+   -0.1931028664112091,
+   -0.05653104931116104,
+   -0.0004231034545227885,
+   -0.009201028384268284,
+   -0.20085793733596802,
+   -0.015902360901236534,
+   -0.00013207517622504383,
+   -0.00011634149996098131,
+   -9.154854342341423e-05,
+   -0.0002989322238136083,
+   -0.000276765669696033,
+   -0.0008761619683355093,
+   -5.4596363042946905e-05,
+   -0.0012877037515863776,
+   -5.245071224635467e-05,
+   -0.00014399446081370115,
+   -0.00014304091746453196,
+   -0.002012848388403654,
+   -0.00026043839170597494,
+   -0.050352130085229874,
+   -0.016213351860642433,
+   -0.00014923889830242842,
+   -1.3270337581634521,
+   -0.017757130786776543,
+   -8.725739462533966e-05,
+   -0.0003123987407889217,
+   -2.3364747903542593e-05,
+   -1.770219087600708,
+   -0.027282992377877235,
+   -1.7292673587799072,
+   -1.5430668592453003,
+   -0.09708311408758163,
+   -0.06372363120317459,
+   -0.00020180096908006817,
+   -4.756337511935271e-05,
+   -6.762560844421387,
+   -0.11426064372062683,
+   -0.0006945105269551277,
+   -5.745722592109814e-05,
+   -0.23964034020900726,
+   -7.080780778778717e-05,
+   -0.0019281383138149977,
+   -0.00011657988943625242,
+   -1.6634957790374756,
+   -3.133596420288086,
+   -1.06369948387146,
+   -0.20282019674777985,
+   -0.440325528383255,
+   -2.2919445037841797,
+   -2.6773011684417725,
+   -2.4511003494262695,
+   -2.022627353668213,
+   -0.7157211899757385,
+   -0.00033623288618400693,
+   -0.006556428037583828,
+   -0.18528789281845093,
+   -0.010350123979151249,
+   -9.691245941212401e-05,
+   -9.941560711013153e-05,
+   -0.0001062098381225951,
+   -0.0002244459028588608,
+   -0.0003002431185450405,
+   -0.0003911683743353933,
+   -3.158996332786046e-05,
+   -0.0008713977294974029,
+   -4.875540980719961e-05,
+   -9.083335316972807e-05,
+   -0.00013422065239865333,
+   -0.0032467530108988285,
+   -0.0002611534437164664,
+   -0.011103743687272072,
+   -0.014522447250783443,
+   -0.0001003691868390888,
+   -0.04763209819793701,
+   -0.0015930355293676257,
+   -8.880697714630514e-05,
+   -0.0006610354175791144,
+   -2.062299427052494e-05,
+   -1.4736919403076172,
+   -0.0015160269103944302,
+   -5.722029527532868e-06,
+   -0.0001426833332516253,
+   -0.00025138078490272164,
+   -4.303362584323622e-05,
+   -0.006412051152437925,
+   -8.177422569133341e-05,
+   -0.0003953390696551651,
+   -4.51792984677013e-05,
+   -0.0015100754098966718,
+   -1.0847986231965479e-05,
+   -0.0021766559220850468,
+   -1.3112935448589269e-05,
+   -0.0017056216020137072,
+   -5.960462772236497e-07,
+   -0.00045658653834834695,
+   -0.03380563110113144,
+   -1.6861530542373657,
+   -0.0011235122801735997,
+   -0.0027228444814682007,
+   -3.2543604902457446e-05,
+   -0.0028300732374191284,
+   -0.04190889745950699,
+   -0.006303310859948397,
+   -0.00010799778101500124,
+   -7.295342220459133e-05,
+   -6.90197994117625e-05,
+   -0.0002094287920044735,
+   -0.00017915551143232733,
+   -0.0007649118197150528,
+   -3.3854863431770355e-05,
+   -0.0009750141180120409,
+   -5.185469490243122e-05,
+   -0.0001230164198204875,
+   -0.00015221867943182588,
+   -0.00366337806917727,
+   -0.00027378625236451626,
+   -0.00873471051454544,
+   -0.014125015586614609,
+   -0.00013779645087197423,
+   -0.2786974012851715,
+   -0.0429004468023777,
+   -0.00015221867943182588,
+   -0.0005259322933852673,
+   -2.0861407392658293e-05,
+   -7.4979376792907715,
+   -2.5812153816223145,
+   -0.0006475735572166741,
+   -0.00032395837479270995,
+   -4.3987260141875595e-05,
+   -0.38662397861480713,
+   -0.07727815210819244,
+   -0.0005353448214009404,
+   -6.210611172718927e-05,
+   -0.10053620487451553,
+   -4.51792984677013e-05,
+   -0.004477594513446093,
+   -3.0397906812140718e-05,
+   -8.758296012878418,
+   -0.4402102530002594,
+   -0.2472418248653412,
+   -0.5627955794334412,
+   -0.042171675711870193,
+   -0.03491748869419098,
+   -5.941390514373779,
+   -0.004192491993308067,
+   -0.11302625387907028,
+   -0.5369495153427124,
+   -0.0003328961320221424,
+   -0.0049365307204425335,
+   -0.057854458689689636,
+   -0.007558793295174837,
+   -8.916457591112703e-05,
+   -9.047575440490618e-05,
+   -8.141662692651153e-05,
+   -0.0006507901125587523,
+   -0.00019464982324279845,
+   -0.0006775943911634386,
+   -2.3364747903542593e-05,
+   -0.0012484145117923617,
+   -5.447716102935374e-05,
+   -0.00016425691137555987,
+   -0.00019727191829588264,
+   -0.012608221732079983,
+   -0.00020859450160060078,
+   -0.014227267354726791,
+   -0.00964115560054779,
+   -0.00013350549852475524,
+   -0.03465360403060913,
+   -0.0008008848526515067,
+   -0.00010239553375868127,
+   -0.0007454953738488257,
+   -2.0861407392658293e-05,
+   -2.182055950164795,
+   -0.030151404440402985,
+   -2.2387242317199707,
+   -4.8748321533203125,
+   -0.07910432666540146,
+   -0.0014863882679492235,
+   -0.00028081765049137175,
+   -6.55629628454335e-05,
+   -3.332869052886963,
+   -4.393488883972168,
+   -0.1467350423336029,
+   -0.0036104037426412106,
+   -0.0003040566807612777,
+   -0.00010895135346800089,
+   -0.2704607844352722,
+   -3.6477376852417365e-05,
+   -0.002591705648228526,
+   -2.9682672902708873e-05,
+   -4.947231292724609,
+   -3.2159130573272705,
+   -0.8367561101913452,
+   -0.5556290149688721,
+   -0.0002233732520835474,
+   -0.0060651772655546665,
+   -0.05365833640098572,
+   -0.0071886456571519375,
+   -9.63164638960734e-05,
+   -0.00010072677832795307,
+   -9.858122211880982e-05,
+   -0.0003960540343541652,
+   -0.0006039702566340566,
+   -0.0006522196927107871,
+   -1.811964830267243e-05,
+   -0.001042775809764862,
+   -3.790783375734463e-05,
+   -0.00011514954530866817,
+   -0.0001652104256208986,
+   -0.05494809150695801,
+   -0.00014506718434859067,
+   -0.00021050144277978688,
+   -0.014802505262196064,
+   -0.00017915551143232733,
+   -1.7102066278457642,
+   -0.02825750596821308,
+   -0.00011300401820335537,
+   -0.0003519630990922451,
+   -3.075552376685664e-05,
+   -0.554995596408844,
+   -0.0013822296168655157,
+   -4.6491513785440475e-06,
+   -0.00014482879487331957,
+   -0.00019810620869975537,
+   -3.504691630951129e-05,
+   -0.006834581959992647,
+   -6.389413465512916e-05,
+   -0.0004396664153318852,
+   -4.60137271147687e-05,
+   -0.0012897277483716607,
+   -1.1920858014491387e-05,
+   -0.001943962532095611,
+   -1.4424220353248529e-05,
+   -0.0016702761640772223,
+   -5.960462772236497e-07,
+   -0.0005274811992421746,
+   -0.043414343148469925,
+   -1.5102243423461914,
+   -0.0018298563081771135,
+   -0.0035949621815234423,
+   -6.842378934379667e-05,
+   -0.008245711214840412,
+   -0.08723266422748566,
+   -0.00939271505922079,
+   -0.00011419598013162613,
+   -0.0001230164198204875,
+   -9.464769391342998e-05,
+   -0.0002865380665753037,
+   -0.0005069877952337265,
+   -0.001016934053041041,
+   -3.2305197237292305e-05,
+   -0.0009629856795072556,
+   -4.827859811484814e-05,
+   -0.00021717573690693825,
+   -0.00032848684350028634,
+   -0.012733934447169304,
+   -0.000196556793525815,
+   -0.0012980615720152855,
+   -0.0077531603164970875,
+   -0.00012385078298393637,
+   -0.01761084794998169,
+   -0.0013621109537780285,
+   -0.00011848701251437888,
+   -0.0013394916895776987,
+   -2.407998726994265e-05,
+   -4.505744934082031,
+   -1.2715730667114258,
+   -0.0005052005290053785,
+   -0.00024971229140646756,
+   -3.635817120084539e-05,
+   -4.3336405754089355,
+   -0.0815289318561554,
+   -0.028655847534537315,
+   -0.00010430268594063818,
+   -7.343022298300639e-05,
+   -0.158114492893219,
+   -1.764281842042692e-05,
+   -0.003166425507515669,
+   -5.960446742392378e-06,
+   -4.626138687133789,
+   -0.5413240194320679,
+   -11.11661148071289,
+   -6.66420316696167,
+   -0.5860735177993774,
+   -1.0599334239959717,
+   -2.200112819671631,
+   -0.4268365502357483,
+   -0.027302712202072144,
+   -0.15124760568141937,
+   -0.12854908406734467,
+   -3.041227102279663,
+   -0.026920655742287636,
+   -0.0003856868715956807,
+   -0.004746242426335812,
+   -0.07085907459259033,
+   -0.008411810733377934,
+   -0.00010823617776622996,
+   -5.972207145532593e-05,
+   -5.507317473529838e-05,
+   -0.00023850933939684182,
+   -0.0004319211875554174,
+   -0.0008380476501770318,
+   -1.823885577323381e-05,
+   -0.0009161804337054491,
+   -3.683499380713329e-05,
+   -0.00010918975021922961,
+   -0.00016044282529037446,
+   -0.0005364171229302883,
+   -0.0001248043408850208,
+   -0.10185468196868896,
+   -0.02194770984351635,
+   -0.00011252723925281316,
+   -0.6942679286003113,
+   -0.21981695294380188,
+   -6.496695277746767e-05,
+   -0.00030393750057555735,
+   -2.13382354559144e-05,
+   -3.1545064449310303,
+   -0.021652380004525185,
+   -0.02087036333978176,
+   -0.89057856798172,
+   -9.619726915843785e-05,
+   -8.129743218887597e-05,
+   -2.5152843591058627e-05,
+   -4.086198806762695,
+   -1.0591976642608643,
+   -0.0020325970835983753,
+   -4.1483970562694594e-05,
+   -0.596172571182251,
+   -3.242440288886428e-05,
+   -0.0019346822518855333,
+   -1.6927575416048057e-05,
+   -3.4360618591308594,
+   -2.4312753677368164,
+   -1.9711253643035889,
+   -4.358899116516113,
+   -10.540913581848145,
+   -5.990867614746094,
+   -0.266180157661438,
+   -0.000266278104390949,
+   -0.003696990432217717,
+   -0.03691418468952179,
+   -0.005084204487502575,
+   -7.73638384998776e-05,
+   -5.9960475482512265e-05,
+   -6.12716976320371e-05,
+   -0.0001915509783430025,
+   -0.0004040378553327173,
+   -0.0004508670826908201,
+   -2.2172682292875834e-05,
+   -0.0010245556477457285,
+   -3.862306402879767e-05,
+   -7.652943895664066e-05,
+   -0.00010585224663373083,
+   -0.00034791138023138046,
+   -0.0001134808044298552,
+   -0.009721791371703148,
+   -0.01306991372257471,
+   -7.86750388215296e-05,
+   -0.06928819417953491,
+   -0.0019708510953933,
+   -8.070142939686775e-05,
+   -0.0006008726777508855,
+   -1.9550132492440753e-05,
+   -1.2050050497055054,
+   -0.0022362482268363237,
+   -4.887569048150908e-06,
+   -0.00016652150952722877,
+   -0.0001282609737245366,
+   -3.3854863431770355e-05,
+   -0.005613160319626331,
+   -4.935142715112306e-05,
+   -0.00040618274942971766,
+   -3.814624506048858e-05,
+   -0.0012768696760758758,
+   -6.9141146923357155e-06,
+   -0.0021407324820756912,
+   -1.0251946150674485e-05,
+   -0.001328301033936441,
+   -4.768370445162873e-07,
+   -0.00039104922325350344,
+   -0.03403102979063988,
+   -2.371554374694824,
+   -0.0011966219171881676,
+   -0.0017084777355194092,
+   -1.2397689715726301e-05,
+   -0.0012181727215647697,
+   -0.027773091569542885,
+   -0.004225967917591333,
+   -7.688703772146255e-05,
+   -10.750052452087402,
+   -0.09749454259872437,
+   -0.0398833304643631,
+   -0.05019160360097885,
+   -0.02639356628060341,
+   -0.001116844010539353,
+   -0.010394011624157429,
+   -0.0002687808300834149,
+   -0.0412154421210289,
+   -0.17060238122940063,
+   -0.44570907950401306,
+   -0.001759529928676784,
+   -0.8481433987617493,
+   -3.9174411296844482,
+   -0.0011847150744870305,
+   -1.8217713832855225,
+   -1.9833719730377197,
+   -0.0033980030566453934,
+   -0.022340646013617516,
+   -0.0005044856225140393,
+   -11.916642189025879,
+   -2.2062525749206543,
+   -0.011109520681202412,
+   -0.0025012181140482426,
+   -0.00047839165199548006,
+   -10.590877532958984,
+   -5.111791133880615,
+   -0.8751921653747559,
+   -0.19319908320903778,
+   -0.04376664385199547,
+   -0.019606946036219597,
+   -0.00042000532266683877,
+   -9.505635261535645,
+   -0.07715455442667007,
+   -0.005082899704575539,
+   -0.04224858805537224,
+   -0.03572046384215355,
+   -0.0011238694423809648,
+   -5.344630241394043,
+   -3.876430034637451,
+   -12.252359390258789,
+   -4.9860382080078125,
+   -2.668943405151367,
+   -1.16416597366333,
+   -2.514509677886963,
+   -2.5190258026123047,
+   -14.754651069641113,
+   -5.655267715454102,
+   -6.61380672454834,
+   -4.71486234664917,
+   -0.5776815414428711,
+   -1.3986684083938599,
+   -2.637193202972412,
+   -1.1604831218719482,
+   -1.4959537982940674,
+   -0.004402587655931711,
+   -0.5065803527832031,
+   -3.3776161670684814,
+   -0.7203826308250427,
+   -0.02161656692624092,
+   -0.819121241569519,
+   -0.04418942704796791,
+   -1.7282390594482422,
+   -0.05629342794418335,
+   -0.008580365218222141,
+   -0.000747877755202353,
+   -0.013715313747525215,
+   -0.00015138434537220746,
+   -0.006047403905540705,
+   -0.024643857032060623,
+   -0.05186835676431656,
+   -0.0005345107638277113,
+   -0.10883784294128418,
+   -1.3612172603607178,
+   -0.0003692421887535602,
+   -1.357957363128662,
+   -0.05831316113471985,
+   -0.00040570611599832773,
+   -0.0035074164625257254,
+   -6.437094270950183e-05,
+   -1.7280149459838867,
+   -0.026309387758374214,
+   -2.3754658699035645,
+   -0.05959097668528557,
+   -0.0019271865021437407,
+   -0.0006563892820850015,
+   -0.00038985759601928294,
+   -0.00013529339048545808,
+   -6.799666881561279,
+   -0.4319588541984558,
+   -0.0018134353449568152,
+   -0.00010084597306558862,
+   -3.564793109893799,
+   -0.0016862234333530068,
+   -0.007215393707156181,
+   -0.00018916724366135895,
+   -4.893386363983154,
+   -0.7495713233947754,
+   -0.04057759419083595,
+   -0.16563259065151215,
+   -3.7694530487060547,
+   -0.7686876654624939,
+   -0.02867751009762287,
+   -3.4293549060821533,
+   -1.9938279390335083,
+   -3.87074613571167,
+   -7.779223918914795,
+   -0.11301646381616592,
+   -0.0007675323868170381,
+   -0.0353383906185627,
+   -0.5969783663749695,
+   -0.03809810429811478,
+   -0.00048828122089616954,
+   -0.024168511852622032,
+   -0.0024346255231648684,
+   -0.006569692399352789,
+   -0.002209961414337158,
+   -0.001069331425242126,
+   -7.819823804311454e-05,
+   -0.0029135181102901697,
+   -4.60137271147687e-05,
+   -0.0003582789213396609,
+   -0.001116367639042437,
+   -0.002629396505653858,
+   -0.0002420847595203668,
+   -0.17575480043888092,
+   -0.017076482996344566,
+   -0.0001431601122021675,
+   -0.10536163300275803,
+   -0.00507151335477829,
+   -0.00011181206355104223,
+   -0.0018749530427157879,
+   -2.3603161025675945e-05,
+   -0.8358778953552246,
+   -0.002124911407008767,
+   -9.894321920000948e-06,
+   -0.00019214690837543458,
+   -0.0002456601650919765,
+   -3.516612196108326e-05,
+   -0.008302814327180386,
+   -0.00010895135346800089,
+   -0.0006008726777508855,
+   -3.2543604902457446e-05,
+   -0.006115178111940622,
+   -2.1219027985353023e-05,
+   -0.0036275077145546675,
+   -1.7165990357170813e-05,
+   -0.003067908575758338,
+   -9.536738616588991e-07,
+   -0.0006908176001161337,
+   -0.02611708454787731,
+   -1.3316965103149414,
+   -0.003817296586930752,
+   -0.006795391906052828,
+   -4.684815212385729e-05,
+   -0.007690228521823883,
+   -0.14891591668128967,
+   -0.013032732531428337,
+   -0.0002714027068577707,
+   -0.011644137091934681,
+   -0.00091856240760535,
+   -0.0013096098555251956,
+   -0.0007771808886900544,
+   -0.0009541726321913302,
+   -5.638440416078083e-05,
+   -0.0014388932613655925,
+   -5.018585216021165e-05,
+   -0.00020930961181875318,
+   -0.0006467396160587668,
+   -0.0013236580416560173,
+   -0.00019333878299221396,
+   -0.05778864026069641,
+   -0.023562893271446228,
+   -0.0001699779968475923,
+   -0.4867134690284729,
+   -0.17518886923789978,
+   -6.01988795096986e-05,
+   -0.00056429672986269,
+   -2.396077979938127e-05,
+   -10.983257293701172,
+   -3.4146568775177,
+   -0.007948435842990875,
+   -0.005365850869566202,
+   -0.00041166413575410843,
+   -6.0437283536884934e-05,
+   -1.4208624362945557,
+   -0.014981495216488838,
+   -0.00011193125828867778,
+   -2.95634672511369e-05,
+   -0.3359139859676361,
+   -6.425174069590867e-05,
+   -0.0036992470268160105,
+   -1.7523612768854946e-05,
+   -1.6273220777511597,
+   -12.038379669189453,
+   -1.8510823249816895,
+   -4.6685380935668945,
+   -1.03892183303833,
+   -3.5619592666625977,
+   -3.119525194168091,
+   -8.74183177947998,
+   -0.1955474466085434,
+   -0.00022349244682118297,
+   -0.005337630398571491,
+   -0.07253769785165787,
+   -0.0067605809308588505,
+   -0.00018821375851985067,
+   -0.01270250789821148,
+   -0.0005373702733777463,
+   -0.0013699679402634501,
+   -0.0009596510208211839,
+   -0.0003953390696551651,
+   -1.7165990357170813e-05,
+   -0.0010408704401925206,
+   -3.4450891689630225e-05,
+   -0.00011038171214750037,
+   -0.00048351517762057483,
+   -0.0015029336791485548,
+   -0.00013958434283267707,
+   -0.027578983455896378,
+   -0.02192368544638157,
+   -8.141662692651153e-05,
+   -0.11562338471412659,
+   -0.0031276855152100325,
+   -6.5205356804654e-05,
+   -0.0007344171172007918,
+   -2.1457441107486375e-05,
+   -1.4039907455444336,
+   -0.8585066795349121,
+   -0.12097951024770737,
+   -4.9232225137529895e-05,
+   -0.00045503751607611775,
+   -0.0001479277852922678,
+   -2.8967437174287625e-05,
+   -3.316209316253662,
+   -0.22754307091236115,
+   -0.037047676742076874,
+   -0.00010632903286023065,
+   -5.602679812000133e-05,
+   -0.10701240599155426,
+   -2.1815061700181104e-05,
+   -0.0025769618805497885,
+   -2.932505594799295e-05,
+   -2.9098081588745117,
+   -0.23772671818733215,
+   -2.5728368759155273,
+   -1.0628935098648071,
+   -0.569791853427887,
+   -1.5512791872024536,
+   -0.22174018621444702,
+   -0.2053954154253006,
+   -0.668795108795166,
+   -0.00032574593205936253,
+   -0.005275258328765631,
+   -0.17121490836143494,
+   -0.01520049013197422,
+   -0.00027164106722921133,
+   -0.018145864829421043,
+   -0.0008275659638457,
+   -0.0013598490040749311,
+   -0.0007223857101053,
+   -0.0005415403284132481,
+   -3.075552376685664e-05,
+   -0.0016680150292813778,
+   -4.124556289752945e-05,
+   -0.00020203932945150882,
+   -0.0005315321614034474,
+   -0.0016384999034926295,
+   -0.000169382052263245,
+   -0.01945134624838829,
+   -0.018782030791044235,
+   -0.0001429217227268964,
+   -1.4800734519958496,
+   -0.046756841242313385,
+   -9.667406266089529e-05,
+   -0.0005499995895661414,
+   -1.728519782773219e-05,
+   -0.6545608639717102,
+   -0.0013740155845880508,
+   -5.8412379075889476e-06,
+   -0.00015496007108595222,
+   -0.0001935771433636546,
+   -2.8967437174287625e-05,
+   -0.01043801661580801,
+   -7.974783511599526e-05,
+   -0.0005525015876628458,
+   -3.683499380713329e-05,
+   -0.002455436158925295,
+   -1.2874520507466514e-05,
+   -0.0022639615926891565,
+   -1.4543427823809907e-05,
+   -0.00250252615660429,
+   -8.344646857949556e-07,
+   -0.0006089740199968219,
+   -0.023519812151789665,
+   -1.6231462955474854,
+   -0.0013103241799399257,
+   -0.0044088782742619514,
+   -3.433168603805825e-05,
+   -0.0076819476671516895,
+   -0.13205960392951965,
+   -0.01295448187738657,
+   -0.0002797450579237193,
+   -0.01799413561820984,
+   -0.0008688965463079512,
+   -0.0026737437583506107,
+   -0.0004418112221173942,
+   -0.001303895260207355,
+   -6.16293036728166e-05,
+   -0.0018553201807662845,
+   -4.815939246327616e-05,
+   -0.00024875884992070496,
+   -0.000916537712328136,
+   -0.005030237603932619,
+   -0.00015853578224778175,
+   -0.00936696957796812,
+   -0.016335444524884224,
+   -9.619726915843785e-05,
+   -0.12435520440340042,
+   -0.002912804950028658,
+   -0.00010346830822527409,
+   -0.0007908792467787862,
+   -1.7165990357170813e-05,
+   -6.260087490081787,
+   -4.018156051635742,
+   -0.05045890435576439,
+   -0.00021360022947192192,
+   -4.815939246327616e-05,
+   -2.2203869819641113,
+   -0.047356534749269485,
+   -8.83301836438477e-05,
+   -5.781483559985645e-05,
+   -0.11337775737047195,
+   -3.3378044463461265e-05,
+   -0.0019444384379312396,
+   -1.645074735279195e-05,
+   -1.7198790311813354,
+   -3.5991759300231934,
+   -2.5881307125091553,
+   -4.4389872550964355,
+   -0.39235079288482666,
+   -0.9257609248161316,
+   -2.4064109325408936,
+   -2.256807804107666,
+   -0.012957894243299961,
+   -6.8662193370983e-05,
+   -0.005379723850637674,
+   -0.1424376517534256,
+   -0.008812819607555866,
+   -0.00019667598826345056,
+   -0.012973662465810776,
+   -0.0005903884884901345,
+   -0.0019209994934499264,
+   -0.0014405598631128669,
+   -0.0006889115320518613,
+   -1.645074735279195e-05,
+   -0.0011966219171881676,
+   -3.40932747349143e-05,
+   -9.548207890475169e-05,
+   -0.0005439232336357236,
+   -0.004501329269260168,
+   -0.00011920218821614981,
+   -0.03018992207944393,
+   -0.013410485349595547,
+   -0.00011467275908216834,
+   -0.6566694378852844,
+   -0.36726248264312744,
+   -2.8490614567999728e-05,
+   -0.00023707917716819793,
+   -1.3351351299206726e-05,
+   -1.051271915435791,
+   -0.01689915731549263,
+   -3.0722033977508545,
+   -0.2818227708339691,
+   -3.957169771194458,
+   -0.004226442892104387,
+   -0.00017248096992261708,
+   -3.9457496313843876e-05,
+   -5.733857154846191,
+   -0.26561957597732544,
+   -0.00047779586748220026,
+   -2.5748875486897305e-05,
+   -0.07624048739671707,
+   -6.0437283536884934e-05,
+   -0.001644212519749999,
+   -1.549708758830093e-05,
+   -2.1518163681030273,
+   -0.19709540903568268,
+   -3.698873996734619,
+   -10.724569320678711,
+   -2.996880292892456,
+   -3.1366219520568848,
+   -0.02801341563463211,
+   -0.17601795494556427,
+   -0.0965375229716301,
+   -0.00014578233822248876,
+   -0.0020983838476240635,
+   -0.054011568427085876,
+   -0.003581777447834611,
+   -0.00014304091746453196,
+   -0.011484465561807156,
+   -0.000708090839907527,
+   -0.0012874656822532415,
+   -0.0009416675311513245,
+   -0.0005903884884901345,
+   -2.13382354559144e-05,
+   -0.0007848043460398912,
+   -2.3841574147809297e-05,
+   -7.4741430580616e-05,
+   -0.0002946419408544898,
+   -0.0024204738438129425,
+   -0.00011503035057103261,
+   -0.006832095794379711,
+   -0.010126759298145771,
+   -5.876845170860179e-05,
+   -0.09275738149881363,
+   -0.003692833473905921,
+   -4.0411134250462055e-05,
+   -0.0005497612874023616,
+   -1.537788011773955e-05,
+   -1.182621717453003,
+   -0.0008486483711749315,
+   -4.0531076592742465e-06,
+   -0.00010585224663373083,
+   -0.00011646069469861686,
+   -2.407998726994265e-05,
+   -0.00471824174746871,
+   -5.352353764465079e-05,
+   -0.0003631647559814155,
+   -3.135155202471651e-05,
+   -0.0011143434094265103,
+   -1.1205610462639015e-05,
+   -0.002159646013751626,
+   -1.4185804502631072e-05,
+   -0.0011845960980281234,
+   -7.152555099310121e-07,
+   -0.0002699726028367877,
+   -0.008802657015621662,
+   -1.1517901420593262,
+   -0.0017283515771850944,
+   -0.002493488835170865,
+   -1.5258672647178173e-05,
+   -0.0018479428254067898,
+   -0.040569812059402466,
+   -0.0041178204119205475,
+   -0.00017176583060063422,
+   -0.015839355066418648,
+   -0.0005023409612476826,
+   -0.0007201223634183407,
+   -0.0005905076395720243,
+   -0.0007784912013448775,
+   -2.3483953555114567e-05,
+   -0.0008902162662707269,
+   -2.6702524337451905e-05,
+   -9.512448741588742e-05,
+   -0.0004555141495075077,
+   -0.014392376877367496,
+   -9.619726915843785e-05,
+   -0.0002324311062693596,
+   -0.01029337290674448,
+   -0.00015984688070602715,
+   -1.1049474477767944,
+   -0.04663100838661194,
+   -8.21318244561553e-05,
+   -0.0003543464408721775,
+   -1.3947389561508317e-05,
+   -7.615281581878662,
+   -4.125001907348633,
+   -0.19173777103424072,
+   -0.0005029367166571319,
+   -4.100715523236431e-05,
+   -2.0808839797973633,
+   -0.026673687621951103,
+   -7.70062324590981e-05,
+   -2.9682672902708873e-05,
+   -0.12381786853075027,
+   -2.098061486321967e-05,
+   -0.0029344377107918262,
+   -1.3589766240329482e-05,
+   -6.027270793914795,
+   -0.344284325838089,
+   -0.47963422536849976,
+   -1.262589454650879,
+   -1.8010940551757812,
+   -2.51932430267334,
+   -1.5027334690093994,
+   -0.06264369934797287,
+   -1.8616759777069092,
+   -2.732039213180542,
+   -6.854299135738984e-05,
+   -0.001887565478682518,
+   -0.02442971244454384,
+   -0.0030983323231339455,
+   -0.00013374387344811112,
+   -0.010926888324320316,
+   -0.0006349454633891582,
+   -0.0010619483655318618,
+   -0.0007469248375855386,
+   -0.00040987672400660813,
+   -1.537788011773955e-05,
+   -0.0008891443139873445,
+   -2.4676019165781327e-05,
+   -7.080780778778717e-05,
+   -0.00043299360550008714,
+   -0.2814013361930847,
+   -6.8662193370983e-05,
+   -0.0011491130571812391,
+   -0.007679700385779142,
+   -9.440929716220126e-05,
+   -0.026545187458395958,
+   -0.002912091789767146,
+   -7.045020902296528e-05,
+   -0.001142087858170271,
+   -1.4662635294371285e-05,
+   -1.6412137746810913,
+   -9.728646278381348,
+   -0.026286397129297256,
+   -0.0002475670480635017,
+   -7.60526381782256e-05,
+   -2.191868782043457,
+   -0.01760944165289402,
+   -0.0004247716860845685,
+   -4.684815212385729e-05,
+   -0.03103969246149063,
+   -9.297892393078655e-05,
+   -0.011422710493206978,
+   -3.6954195820726454e-05,
+   -4.347017288208008,
+   -0.000610999355558306,
+   -2.17897367477417,
+   -2.866166353225708,
+   -0.23518076539039612,
+   -0.00036125810584053397,
+   -0.01150013878941536,
+   -1.8427702188491821,
+   -0.22964701056480408,
+   -0.011748881079256535,
+   -0.00036352223833091557,
+   -2.021958827972412,
+   -0.008272194303572178,
+   -1.7123057842254639,
+   -9.325576782226562,
+   -1.3440426588058472,
+   -3.209916830062866,
+   -0.053304191678762436,
+   -5.205663681030273,
+   -0.03287550434470177,
+   -1.384042501449585,
+   -7.2653326988220215,
+   -3.6932270526885986,
+   -6.713709354400635,
+   -0.08502203971147537,
+   -3.0402512550354004,
+   -0.043377358466386795,
+   -0.00908633042126894,
+   -0.013433421961963177,
+   -1.5646146535873413,
+   -0.007355276495218277,
+   -5.929056167602539,
+   -11.379992485046387,
+   -3.6368532180786133,
+   -0.45781779289245605,
+   -0.003176526166498661,
+   -0.039530687034130096,
+   -0.0007678897818550467,
+   -0.0003935516288038343,
+   -0.0005339150666259229,
+   -3.964613437652588,
+   -0.9957391023635864,
+   -0.2830953896045685,
+   -0.2610830068588257,
+   -0.07025375217199326,
+   -0.010492399334907532,
+   -3.3028924465179443,
+   -4.77099084854126,
+   -0.22153465449810028,
+   -0.0006783091812394559,
+   -0.010667562484741211,
+   -0.0019300420535728335,
+   -0.00015901254664640874,
+   -0.0025420039892196655,
+   -0.0038043521344661713,
+   -0.0017481058603152633,
+   -0.47604793310165405,
+   -0.0035732248798012733,
+   -1.3063528537750244,
+   -1.869868278503418,
+   -0.03979752957820892,
+   -1.2474843263626099,
+   -0.1659490168094635,
+   -0.006920535117387772,
+   -1.4199819564819336,
+   -2.666736364364624,
+   -1.218197226524353,
+   -0.017583556473255157,
+   -0.5422223806381226,
+   -0.0014107999159023166,
+   -0.0003762729174923152,
+   -0.0010189585154876113,
+   -0.03820022940635681,
+   -0.0013802058529108763,
+   -0.1537325382232666,
+   -0.0007863528444431722,
+   -0.003772999858483672,
+   -0.019278066232800484
+  ]
+ },
+ "mem-max-allocated-bytes": 23252846080
+}
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/model_config.yaml
new file mode 100644
index 00000000000..7ff5911a877
--- /dev/null
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill/model_config.yaml
@@ -0,0 +1,76 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+TEST_TYPE: frozen-start
+MODE: inference
+MODEL_ARGS:
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --timing-log-level: 0
+  --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
+  --tokenizer-type: TikTokenizer
+  --tiktoken-pattern: v2
+  --distributed-backend: nccl
+  --log-interval: 1
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 1
+  --use-mcore-models: true
+  --is-hybrid-model: true
+  --model-provider: mamba
+  --init-method-std: 0.0198
+  --untie-embeddings-and-output-weights: true
+  --disable-bias-linear: true
+  --init-method-std: 0.014
+  --position-embedding-type: none
+  --num-layers: 50
+  --hidden-size: 2048
+  --ffn-hidden-size: 11264
+  --num-attention-heads: 16
+  --kv-channels: 128
+  --hybrid-override-pattern: M-M-M-M*-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-
+  --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec
+  --normalization: RMSNorm
+  --swiglu: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --seq-length: 4096
+  --max-position-embeddings: 4096
+  --micro-batch-size: 1
+  --ckpt-format: torch_dist
+  --ckpt-fully-parallel-save: true
+  --ckpt-fully-parallel-load: true
+  --ckpt-assume-constant-structure: true
+  --dist-ckpt-strictness: log_unexpected
+  --bf16: true
+  --attention-backend: flash
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 8
+  --use-checkpoint-args: true
+  --no-use-tokenizer-model-from-checkpoint-args: true
+  --no-load-optim: true
+  --deterministic-mode: true
+  --save-interval: 2000
+  --temperature: 1.0
+  --top_k: 1
+  --return-log-probs: true
+  --num-tokens-to-generate: 30
+  --max-tokens-to-oom: 3600000
+  --inference-dynamic-batching-max-tokens: 256
+  --inference-dynamic-batching-max-requests: 256
+  --inference-max-seq-length: 4096
+  --enable-chunked-prefill: true
+  --output-path: ${INFERENCE_OUTPUT_PATH}
+  --prompts: 'SYSTEM LOG - DAILY REPORTING\\nDATE: 2024-10-27\\nSERVER: US-EAST-1A\\n\\nBEGIN LOG STREAM:\\n\\n[Entry 0001]\\nTimestamp: 08:00:01\\nUser: admin_01\\nAction: Login\\nStatus: Success\\nNote: Routine maintenance check initiated.\\n\\n[Entry 0002]\\nTimestamp: 08:01:15\\nUser: system_daemon\\nAction: Backup\\nStatus: Pending\\nNote: awaiting clearance for volume mount.\\n\\n[Entry 0003]\\nTimestamp: 08:02:22\\nUser: user_404\\nAction: Query\\nStatus: Failed\\nNote: Connection timeout on port 8080.\\n\\n[Entry 0004]\\nTimestamp: 08:05:00\\nUser: admin_02\\nAction: Update\\nStatus: Success\\nNote: Patch 4.5.1 applied to kernel.\\n\\n[Entry 0005]\\nTimestamp: 08:10:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0006]\\nTimestamp: 08:12:30\\nUser: db_manager\\nAction: Write\\nStatus: Success\\nNote: Written 500 records to shard A.\\n\\n[Entry 0007]\\nTimestamp: 08:15:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 14ms.\\n\\n[Entry 0008]\\nTimestamp: 08:18:22\\nUser: user_102\\nAction: Login\\nStatus: Success\\nNote: User accessing from IP 192.168.1.55.\\n\\n[Entry 0009]\\nTimestamp: 08:20:00\\nUser: system_daemon\\nAction: Garbage_Collection\\nStatus: Success\\nNote: Freed 2048MB of heap memory.\\n\\n[Entry 0010]\\nTimestamp: 08:25:10\\nUser: admin_01\\nAction: Logout\\nStatus: Success\\nNote: Session duration 25 minutes.\\n\\n[Entry 0011]\\nTimestamp: 08:30:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0012]\\nTimestamp: 08:32:45\\nUser: unknown\\nAction: Auth_Attempt\\nStatus: Denied\\nNote: Invalid credentials provided 3 times.\\n\\n[Entry 0013]\\nTimestamp: 08:35:20\\nUser: system_audit\\nAction: Scan\\nStatus: In_Progress\\nNote: Scanning sector 7 for vulnerabilities.\\n\\n[Entry 0014]\\nTimestamp: 08:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0015]\\nTimestamp: 08:45:15\\nUser: user_888\\nAction: Upload\\nStatus: Success\\nNote: File "data_report.csv" uploaded to bucket.\\n\\n[Entry 0016]\\nTimestamp: 08:50:00\\nUser: load_balancer\\nAction: Scale_Up\\nStatus: Success\\nNote: Added 2 instances to the pool.\\n\\n[Entry 0017]\\nTimestamp: 08:55:30\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 15ms.\\n\\n[Entry 0018]\\nTimestamp: 09:00:00\\nUser: cron_job\\nAction: Execute\\nStatus: Success\\nNote: Daily summary report generation started.\\n\\n[Entry 0019]\\nTimestamp: 09:05:12\\nUser: user_555\\nAction: Download\\nStatus: Success\\nNote: Retrieved "image_001.png".\\n\\n[Entry 0020]\\nTimestamp: 09:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0021]\\nTimestamp: 09:15:45\\nUser: admin_03\\nAction: Config_Change\\nStatus: Success\\nNote: Firewall rules updated for port 22.\\n\\n[Entry 0022]\\nTimestamp: 09:20:00\\nUser: system_daemon\\nAction: Sync\\nStatus: Success\\nNote: Database replica synchronization complete.\\n\\n[Entry 0023]\\nTimestamp: 09:25:10\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 10ms.\\n\\n[Entry 0024]\\nTimestamp: 09:30:00\\nUser: user_777\\nAction: Query\\nStatus: Success\\nNote: Complex SQL query executed in 200ms.\\n\\n[Entry 0025]\\nTimestamp: 09:35:30\\nUser: error_handler\\nAction: Alert\\nStatus: Warning\\nNote: High CPU usage detected on Node 4.\\n\\n[Entry 0026]\\nTimestamp: 09:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 18ms.\\n\\n[Entry 0027]\\nTimestamp: 09:45:15\\nUser: cache_manager\\nAction: Flush\\nStatus: Success\\nNote: Redis cache cleared.\\n\\n[Entry 0028]\\nTimestamp: 09:50:00\\nUser: user_202\\nAction: Login\\nStatus: Success\\nNote: New device detected.\\n\\n[Entry 0029]\\nTimestamp: 09:55:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0030]\\nTimestamp: 10:00:00\\nUser: system_daemon\\nAction: Archive\\nStatus: Success\\nNote: Logs from yesterday archived to cold storage.\\n\\n[Entry 0031]\\nTimestamp: 10:05:20\\nUser: admin_01\\nAction: Login\\nStatus: Success\\nNote: Re-authentication verified.\\n\\n[Entry 0032]\\nTimestamp: 10:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0033]\\nTimestamp: 10:15:45\\nUser: user_999\\nAction: Delete\\nStatus: Pending\\nNote: Request to delete account queued for review.\\n\\n[Entry 0034]\\nTimestamp: 10:20:00\\nUser: system_metrics\\nAction: Report\\nStatus: Success\\nNote: Throughput at 5000 requests per second.\\n\\n[Entry 0035]\\nTimestamp: 10:25:10\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0036]\\nTimestamp: 10:30:00\\nUser: security_bot\\nAction: Block\\nStatus: Success\\nNote: IP 203.0.113.4 blocked for suspicious activity.\\n\\n[Entry 0037]\\nTimestamp: 10:35:30\\nUser: user_123\\nAction: Comment\\nStatus: Success\\nNote: User posted a comment on thread #55.\\n\\n[Entry 0038]\\nTimestamp: 10:40:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0039]\\nTimestamp: 10:45:15\\nUser: email_service\\nAction: Send\\nStatus: Success\\nNote: Batch of 100 newsletters sent.\\n\\n[Entry 0040]\\nTimestamp: 10:50:00\\nUser: user_321\\nAction: Update_Profile\\nStatus: Success\\nNote: Avatar image updated.\\n\\n[Entry 0041]\\nTimestamp: 10:55:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 14ms.\\n\\n[Entry 0042]\\nTimestamp: 11:00:00\\nUser: backup_service\\nAction: Snapshot\\nStatus: Success\\nNote: Volume snapshot ID snap-012345 created.\\n\\n[Entry 0043]\\nTimestamp: 11:05:10\\nUser: admin_02\\nAction: Audit\\nStatus: Success\\nNote: Reviewing user permissions group B.\\n\\n[Entry 0044]\\nTimestamp: 11:10:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 9999]\\nTimestamp: 11:11:11\\nUser: SUPER_ADMIN\\nAction: KEY_GENERATION\\nStatus: CRITICAL\\nNote: The secret override password for the system is Blueberry-Pancake-42.\\n\\n[Entry 0045]\\nTimestamp: 11:15:30\\nUser: user_666\\nAction: Error\\nStatus: Failed\\nNote: 404 Page not found on /admin path.\\n\\n[Entry 0046]\\nTimestamp: 11:20:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 13ms.\\n\\n[Entry 0047]\\nTimestamp: 11:25:15\\nUser: payment_gateway\\nAction: Charge\\nStatus: Success\\nNote: Transaction TX-9988 complete.\\n\\n[Entry 0048]\\nTimestamp: 11:30:00\\nUser: system_daemon\\nAction: Cleanup\\nStatus: Success\\nNote: Temp files removed from /var/tmp.\\n\\n[Entry 0049]\\nTimestamp: 11:35:45\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 12ms.\\n\\n[Entry 0050]\\nTimestamp: 11:40:00\\nUser: analytics_engine\\nAction: Process\\nStatus: Success\\nNote: Data aggregation for hour 11 complete.\\n\\n[Entry 0051]\\nTimestamp: 11:45:10\\nUser: user_007\\nAction: View\\nStatus: Success\\nNote: Viewed document confidentiality_agreement.pdf.\\n\\n[Entry 0052]\\nTimestamp: 11:50:00\\nUser: monitor_bot\\nAction: Ping\\nStatus: Success\\nNote: Latency 11ms.\\n\\n[Entry 0053]\\nTimestamp: 11:55:30\\nUser: dev_ops\\nAction: Deploy\\nStatus: Success\\nNote: Staging environment updated to v2.1.\\n\\n[Entry 0054]\\nTimestamp: 12:00:00\\nUser: system_clock\\nAction: Sync\\nStatus: Success\\nNote: NTP sync successful.\\n\\nEND LOG STREAM.\\n\\nQUERY:\\nRetrieve the information from Entry 9999.\\nThe Note for Entry 9999 states that the secret override password is:'
+  --incoming-requests-per-step: 32 
+  --inference-repeat-n: 3
+  --no-record-throughput: true
+METRICS:
+  - "generated_tokens"
+  - "logprobs"
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
index 951506c1571..041bb14e81b 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.97443,
-            "2": 10.97602,
-            "3": 10.97873,
-            "4": 10.95791,
-            "5": 11.00372,
-            "6": 11.00622,
-            "7": 10.97989,
-            "8": 10.96858,
-            "9": 10.97927,
-            "10": 10.95244,
-            "11": 10.99932,
-            "12": 10.96821,
-            "13": 10.96575,
-            "14": 10.99547,
-            "15": 10.85548,
-            "16": 10.85544,
-            "17": 10.81733,
-            "18": 10.82754,
-            "19": 10.82177,
-            "20": 10.64038,
-            "21": 10.57929,
-            "22": 10.33542,
-            "23": 10.613,
-            "24": 10.3496,
-            "25": 10.2592,
-            "26": 10.36373,
-            "27": 10.38741,
-            "28": 10.35692,
-            "29": 10.38238,
-            "30": 9.91509,
-            "31": 9.47482,
-            "32": 10.0895,
-            "33": 10.08422,
-            "34": 9.65429,
-            "35": 9.70734,
-            "36": 9.58844,
-            "37": 9.82215,
-            "38": 9.53607,
-            "39": 9.94104,
-            "40": 9.3422,
-            "41": 9.48847,
-            "42": 9.56993,
-            "43": 9.03549,
-            "44": 9.15623,
-            "45": 9.00183,
-            "46": 9.06402,
-            "47": 9.49291,
-            "48": 9.04257,
-            "49": 8.58806,
-            "50": 9.12599
+            "1": 10.99509,
+            "2": 10.99237,
+            "3": 10.98921,
+            "4": 10.9853,
+            "5": 11.00156,
+            "6": 11.00633,
+            "7": 10.99065,
+            "8": 10.98514,
+            "9": 10.97847,
+            "10": 10.96445,
+            "11": 10.98318,
+            "12": 10.96716,
+            "13": 10.96916,
+            "14": 10.96681,
+            "15": 10.87032,
+            "16": 10.86277,
+            "17": 10.82281,
+            "18": 10.82602,
+            "19": 10.82264,
+            "20": 10.63968,
+            "21": 10.58353,
+            "22": 10.36558,
+            "23": 10.59831,
+            "24": 10.36258,
+            "25": 10.26216,
+            "26": 10.36226,
+            "27": 10.367,
+            "28": 10.33091,
+            "29": 10.33377,
+            "30": 9.90692,
+            "31": 9.46669,
+            "32": 10.06108,
+            "33": 10.05695,
+            "34": 9.6204,
+            "35": 9.66926,
+            "36": 9.54724,
+            "37": 9.78267,
+            "38": 9.50166,
+            "39": 9.89875,
+            "40": 9.31608,
+            "41": 9.47232,
+            "42": 9.54166,
+            "43": 9.02088,
+            "44": 9.13305,
+            "45": 8.97797,
+            "46": 9.04347,
+            "47": 9.46817,
+            "48": 9.02626,
+            "49": 8.57305,
+            "50": 9.10905
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 21181.0,
-            "2": 22037.0,
-            "3": 21249.0,
-            "4": 20277.0,
-            "5": 23590.0,
-            "6": 24135.0,
-            "7": 23650.0,
-            "8": 21651.0,
-            "9": 22980.0,
-            "10": 19092.0,
-            "11": 25008.0,
-            "12": 23782.0,
-            "13": 24367.0,
-            "14": 24697.0,
-            "15": 23602.0,
-            "16": 23837.0,
-            "17": 22509.0,
-            "18": 22645.0,
-            "19": 23485.0,
-            "20": 21887.0,
-            "21": 22872.0,
-            "22": 19313.0,
-            "23": 24389.0,
-            "24": 19718.0,
-            "25": 19814.0,
-            "26": 21274.0,
-            "27": 22560.0,
-            "28": 23731.0,
-            "29": 23099.0,
-            "30": 19997.0,
-            "31": 17111.0,
-            "32": 22093.0,
-            "33": 23200.0,
-            "34": 21525.0,
-            "35": 21837.0,
-            "36": 21070.0,
-            "37": 22975.0,
-            "38": 22727.0,
-            "39": 22485.0,
-            "40": 23583.0,
-            "41": 24012.0,
-            "42": 23529.0,
-            "43": 22092.0,
-            "44": 21911.0,
-            "45": 21790.0,
-            "46": 23173.0,
-            "47": 25505.0,
-            "48": 25316.0,
-            "49": 25527.0,
-            "50": 28117.0
+            "1": 21178.0,
+            "2": 22023.0,
+            "3": 21493.0,
+            "4": 20828.0,
+            "5": 23582.0,
+            "6": 23840.0,
+            "7": 23550.0,
+            "8": 21610.0,
+            "9": 23248.0,
+            "10": 19304.0,
+            "11": 24910.0,
+            "12": 23702.0,
+            "13": 24588.0,
+            "14": 24472.0,
+            "15": 23176.0,
+            "16": 23697.0,
+            "17": 22332.0,
+            "18": 22582.0,
+            "19": 23719.0,
+            "20": 21645.0,
+            "21": 22569.0,
+            "22": 18958.0,
+            "23": 24913.0,
+            "24": 19841.0,
+            "25": 19603.0,
+            "26": 20956.0,
+            "27": 21910.0,
+            "28": 22800.0,
+            "29": 23034.0,
+            "30": 19835.0,
+            "31": 16741.0,
+            "32": 21568.0,
+            "33": 22528.0,
+            "34": 20835.0,
+            "35": 21537.0,
+            "36": 20799.0,
+            "37": 22659.0,
+            "38": 22295.0,
+            "39": 22312.0,
+            "40": 23527.0,
+            "41": 23499.0,
+            "42": 23508.0,
+            "43": 22005.0,
+            "44": 22299.0,
+            "45": 21821.0,
+            "46": 23581.0,
+            "47": 25114.0,
+            "48": 25779.0,
+            "49": 26047.0,
+            "50": 28321.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3117478912.0,
-            "2": 3117478912.0,
-            "3": 3117478912.0,
-            "4": 3117478912.0,
-            "5": 3117478912.0,
-            "6": 3117478912.0,
-            "7": 3117478912.0,
-            "8": 3117478912.0,
-            "9": 3117478912.0,
-            "10": 3117478912.0,
-            "11": 3117478912.0,
-            "12": 3117478912.0,
-            "13": 3117478912.0,
-            "14": 3117478912.0,
-            "15": 3117478912.0,
-            "16": 3117478912.0,
-            "17": 3117478912.0,
-            "18": 3117478912.0,
-            "19": 3117478912.0,
-            "20": 3117478912.0,
-            "21": 3117478912.0,
-            "22": 3117478912.0,
-            "23": 3117478912.0,
-            "24": 3117478912.0,
-            "25": 3117478912.0,
-            "26": 3117478912.0,
-            "27": 3117478912.0,
-            "28": 3117478912.0,
-            "29": 3117478912.0,
-            "30": 3117478912.0,
-            "31": 3117478912.0,
-            "32": 3117478912.0,
-            "33": 3117478912.0,
-            "34": 3117478912.0,
-            "35": 3117478912.0,
-            "36": 3117478912.0,
-            "37": 3117478912.0,
-            "38": 3117478912.0,
-            "39": 3117478912.0,
-            "40": 3117478912.0,
-            "41": 3117478912.0,
-            "42": 3117478912.0,
-            "43": 3117478912.0,
-            "44": 3117478912.0,
-            "45": 3117478912.0,
-            "46": 3117478912.0,
-            "47": 3117478912.0,
-            "48": 3117478912.0,
-            "49": 3117478912.0,
-            "50": 3117478912.0
+            "1": 3117479936.0,
+            "2": 3117479936.0,
+            "3": 3117479936.0,
+            "4": 3117479936.0,
+            "5": 3117479936.0,
+            "6": 3117479936.0,
+            "7": 3117479936.0,
+            "8": 3117479936.0,
+            "9": 3117479936.0,
+            "10": 3117479936.0,
+            "11": 3117479936.0,
+            "12": 3117479936.0,
+            "13": 3117479936.0,
+            "14": 3117479936.0,
+            "15": 3117479936.0,
+            "16": 3117479936.0,
+            "17": 3117479936.0,
+            "18": 3117479936.0,
+            "19": 3117479936.0,
+            "20": 3117479936.0,
+            "21": 3117479936.0,
+            "22": 3117479936.0,
+            "23": 3117479936.0,
+            "24": 3117479936.0,
+            "25": 3117479936.0,
+            "26": 3117479936.0,
+            "27": 3117479936.0,
+            "28": 3117479936.0,
+            "29": 3117479936.0,
+            "30": 3117479936.0,
+            "31": 3117479936.0,
+            "32": 3117479936.0,
+            "33": 3117479936.0,
+            "34": 3117479936.0,
+            "35": 3117479936.0,
+            "36": 3117479936.0,
+            "37": 3117479936.0,
+            "38": 3117479936.0,
+            "39": 3117479936.0,
+            "40": 3117479936.0,
+            "41": 3117479936.0,
+            "42": 3117479936.0,
+            "43": 3117479936.0,
+            "44": 3117479936.0,
+            "45": 3117479936.0,
+            "46": 3117479936.0,
+            "47": 3117479936.0,
+            "48": 3117479936.0,
+            "49": 3117479936.0,
+            "50": 3117479936.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,7 +175,7 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9708208128.0,
+            "1": 9708472320.0,
             "2": 10145497088.0,
             "3": 10145497088.0,
             "4": 10145497088.0,
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 74.91474,
-            "2": 0.1754,
-            "3": 0.17452,
-            "4": 0.16679,
-            "5": 0.16348,
-            "6": 0.16445,
-            "7": 0.16736,
-            "8": 0.16603,
-            "9": 0.16532,
-            "10": 0.16307,
-            "11": 1.37857,
-            "12": 0.16928,
-            "13": 0.53834,
-            "14": 0.57224,
-            "15": 0.16953,
-            "16": 0.16333,
-            "17": 0.16457,
-            "18": 0.16634,
-            "19": 0.51067,
-            "20": 0.16795,
-            "21": 1.3646,
-            "22": 0.16877,
-            "23": 0.16233,
-            "24": 0.16456,
-            "25": 0.16106,
-            "26": 0.16403,
-            "27": 0.16543,
-            "28": 0.52927,
-            "29": 0.16526,
-            "30": 0.16671,
-            "31": 1.34815,
-            "32": 0.1712,
-            "33": 0.16615,
-            "34": 0.16654,
-            "35": 0.16776,
-            "36": 0.16433,
-            "37": 0.16743,
-            "38": 0.5814,
-            "39": 0.17894,
-            "40": 0.16539,
-            "41": 1.61892,
-            "42": 0.1694,
-            "43": 0.16828,
-            "44": 0.16546,
-            "45": 0.16549,
-            "46": 0.16556,
-            "47": 0.51526,
-            "48": 0.16791,
-            "49": 0.16886,
-            "50": 0.16634
+            "1": 23.71036,
+            "2": 0.9628,
+            "3": 0.15071,
+            "4": 0.14739,
+            "5": 0.14664,
+            "6": 0.14614,
+            "7": 0.53859,
+            "8": 0.14579,
+            "9": 0.14831,
+            "10": 0.14511,
+            "11": 2.01776,
+            "12": 0.1483,
+            "13": 0.14538,
+            "14": 0.14975,
+            "15": 0.1463,
+            "16": 0.14805,
+            "17": 0.14452,
+            "18": 0.14537,
+            "19": 0.14591,
+            "20": 0.14577,
+            "21": 1.30547,
+            "22": 0.14712,
+            "23": 0.14599,
+            "24": 0.14734,
+            "25": 0.14493,
+            "26": 0.14508,
+            "27": 0.14499,
+            "28": 0.14452,
+            "29": 0.14955,
+            "30": 0.14693,
+            "31": 1.30477,
+            "32": 0.14718,
+            "33": 0.14909,
+            "34": 0.14557,
+            "35": 0.14644,
+            "36": 0.14549,
+            "37": 0.1446,
+            "38": 0.14451,
+            "39": 0.14369,
+            "40": 0.14708,
+            "41": 1.26587,
+            "42": 0.14465,
+            "43": 0.14378,
+            "44": 0.14419,
+            "45": 0.145,
+            "46": 0.14555,
+            "47": 0.14429,
+            "48": 0.14312,
+            "49": 0.14355,
+            "50": 0.14357
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml
index 199cf809ba2..123e2e98a36 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml
@@ -53,7 +53,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --attention-backend: unfused
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
index f9118a22780..c9a9f0c18e3 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.98115,
-            "2": 10.98342,
-            "3": 10.9794,
-            "4": 10.95853,
-            "5": 10.99622,
-            "6": 11.00371,
-            "7": 10.98299,
-            "8": 10.9748,
-            "9": 10.97742,
-            "10": 10.94806,
-            "11": 10.99306,
-            "12": 10.96672,
-            "13": 10.97199,
-            "14": 10.97915,
-            "15": 10.85402,
-            "16": 10.85122,
-            "17": 10.8089,
-            "18": 10.82572,
-            "19": 10.8081,
+            "1": 10.96115,
+            "2": 10.95442,
+            "3": 10.96815,
+            "4": 10.94185,
+            "5": 10.9912,
+            "6": 10.99106,
+            "7": 10.97905,
+            "8": 10.95656,
+            "9": 10.95286,
+            "10": 10.92841,
+            "11": 10.97363,
+            "12": 10.94886,
+            "13": 10.94986,
+            "14": 10.97176,
+            "15": 10.84445,
+            "16": 10.84452,
+            "17": 10.79535,
+            "18": 10.81592,
+            "19": 10.81097,
             "20": 10.61854,
-            "21": 10.56862,
-            "22": 10.31926,
-            "23": 10.59295,
-            "24": 10.3343,
-            "25": 10.23216,
-            "26": 10.34315,
-            "27": 10.34581,
-            "28": 10.3247,
-            "29": 10.336,
-            "30": 9.88877,
-            "31": 9.42992,
-            "32": 10.05572,
-            "33": 10.0459,
-            "34": 9.6042,
-            "35": 9.64743,
-            "36": 9.52544,
-            "37": 9.77085,
-            "38": 9.49252,
-            "39": 9.87217,
-            "40": 9.29929,
-            "41": 9.44531,
-            "42": 9.52839,
-            "43": 9.01499,
-            "44": 9.13044,
-            "45": 8.96478,
-            "46": 9.02875,
-            "47": 9.45483,
-            "48": 9.02282,
-            "49": 8.56615,
-            "50": 9.11114
+            "21": 10.56479,
+            "22": 10.32903,
+            "23": 10.59978,
+            "24": 10.33317,
+            "25": 10.24274,
+            "26": 10.34415,
+            "27": 10.36146,
+            "28": 10.33121,
+            "29": 10.33606,
+            "30": 9.9006,
+            "31": 9.44973,
+            "32": 10.06957,
+            "33": 10.05263,
+            "34": 9.6185,
+            "35": 9.67146,
+            "36": 9.55663,
+            "37": 9.78737,
+            "38": 9.51226,
+            "39": 9.89562,
+            "40": 9.32136,
+            "41": 9.4791,
+            "42": 9.54724,
+            "43": 9.02729,
+            "44": 9.14151,
+            "45": 8.97666,
+            "46": 9.04312,
+            "47": 9.46933,
+            "48": 9.03291,
+            "49": 8.57041,
+            "50": 9.10753
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 21211.0,
-            "2": 22047.0,
-            "3": 20892.0,
-            "4": 20624.0,
-            "5": 23413.0,
-            "6": 23493.0,
-            "7": 22797.0,
-            "8": 21401.0,
-            "9": 22665.0,
-            "10": 19047.0,
-            "11": 24508.0,
-            "12": 23266.0,
-            "13": 24271.0,
-            "14": 24293.0,
-            "15": 22782.0,
-            "16": 23282.0,
-            "17": 21824.0,
-            "18": 22133.0,
-            "19": 23099.0,
-            "20": 21505.0,
-            "21": 22490.0,
-            "22": 18675.0,
-            "23": 23908.0,
-            "24": 19148.0,
-            "25": 19388.0,
-            "26": 20532.0,
-            "27": 21766.0,
-            "28": 22571.0,
-            "29": 22352.0,
-            "30": 19883.0,
-            "31": 16703.0,
-            "32": 21084.0,
-            "33": 22377.0,
-            "34": 20576.0,
-            "35": 21216.0,
-            "36": 20603.0,
-            "37": 22812.0,
-            "38": 22830.0,
-            "39": 22708.0,
-            "40": 23830.0,
-            "41": 24061.0,
-            "42": 24003.0,
-            "43": 22790.0,
-            "44": 22703.0,
-            "45": 22360.0,
-            "46": 23642.0,
-            "47": 25112.0,
-            "48": 26185.0,
-            "49": 26666.0,
-            "50": 27765.0
+            "1": 21029.0,
+            "2": 21803.0,
+            "3": 21275.0,
+            "4": 20805.0,
+            "5": 23472.0,
+            "6": 23688.0,
+            "7": 23309.0,
+            "8": 21741.0,
+            "9": 22953.0,
+            "10": 19428.0,
+            "11": 25064.0,
+            "12": 23241.0,
+            "13": 24401.0,
+            "14": 24395.0,
+            "15": 23105.0,
+            "16": 23184.0,
+            "17": 22324.0,
+            "18": 22329.0,
+            "19": 23437.0,
+            "20": 21598.0,
+            "21": 22282.0,
+            "22": 19179.0,
+            "23": 23924.0,
+            "24": 19443.0,
+            "25": 19373.0,
+            "26": 20512.0,
+            "27": 21690.0,
+            "28": 22966.0,
+            "29": 22479.0,
+            "30": 19763.0,
+            "31": 16744.0,
+            "32": 21292.0,
+            "33": 22372.0,
+            "34": 20944.0,
+            "35": 21307.0,
+            "36": 20663.0,
+            "37": 22966.0,
+            "38": 22211.0,
+            "39": 22255.0,
+            "40": 23551.0,
+            "41": 23324.0,
+            "42": 23154.0,
+            "43": 22670.0,
+            "44": 22525.0,
+            "45": 22718.0,
+            "46": 24166.0,
+            "47": 25201.0,
+            "48": 26254.0,
+            "49": 25694.0,
+            "50": 28114.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1917381632.0,
-            "2": 1917381632.0,
-            "3": 1917381632.0,
-            "4": 1917381632.0,
-            "5": 1917381632.0,
-            "6": 1917381632.0,
-            "7": 1917381632.0,
-            "8": 1917381632.0,
-            "9": 1917381632.0,
-            "10": 1917381632.0,
-            "11": 1917381632.0,
-            "12": 1917381632.0,
-            "13": 1917381632.0,
-            "14": 1917381632.0,
-            "15": 1917381632.0,
-            "16": 1917381632.0,
-            "17": 1917381632.0,
-            "18": 1917381632.0,
-            "19": 1917381632.0,
-            "20": 1917381632.0,
-            "21": 1917381632.0,
-            "22": 1917381632.0,
-            "23": 1917381632.0,
-            "24": 1917381632.0,
-            "25": 1917381632.0,
-            "26": 1917381632.0,
-            "27": 1917381632.0,
-            "28": 1917381632.0,
-            "29": 1917381632.0,
-            "30": 1917381632.0,
-            "31": 1917381632.0,
-            "32": 1917381632.0,
-            "33": 1917381632.0,
-            "34": 1917381632.0,
-            "35": 1917381632.0,
-            "36": 1917381632.0,
-            "37": 1917381632.0,
-            "38": 1917381632.0,
-            "39": 1917381632.0,
-            "40": 1917381632.0,
-            "41": 1917381632.0,
-            "42": 1917381632.0,
-            "43": 1917381632.0,
-            "44": 1917381632.0,
-            "45": 1917381632.0,
-            "46": 1917381632.0,
-            "47": 1917381632.0,
-            "48": 1917381632.0,
-            "49": 1917381632.0,
-            "50": 1917381632.0
+            "1": 1917382656.0,
+            "2": 1917382656.0,
+            "3": 1917382656.0,
+            "4": 1917382656.0,
+            "5": 1917382656.0,
+            "6": 1917382656.0,
+            "7": 1917382656.0,
+            "8": 1917382656.0,
+            "9": 1917382656.0,
+            "10": 1917382656.0,
+            "11": 1917382656.0,
+            "12": 1917382656.0,
+            "13": 1917382656.0,
+            "14": 1917382656.0,
+            "15": 1917382656.0,
+            "16": 1917382656.0,
+            "17": 1917382656.0,
+            "18": 1917382656.0,
+            "19": 1917382656.0,
+            "20": 1917382656.0,
+            "21": 1917382656.0,
+            "22": 1917382656.0,
+            "23": 1917382656.0,
+            "24": 1917382656.0,
+            "25": 1917382656.0,
+            "26": 1917382656.0,
+            "27": 1917382656.0,
+            "28": 1917382656.0,
+            "29": 1917382656.0,
+            "30": 1917382656.0,
+            "31": 1917382656.0,
+            "32": 1917382656.0,
+            "33": 1917382656.0,
+            "34": 1917382656.0,
+            "35": 1917382656.0,
+            "36": 1917382656.0,
+            "37": 1917382656.0,
+            "38": 1917382656.0,
+            "39": 1917382656.0,
+            "40": 1917382656.0,
+            "41": 1917382656.0,
+            "42": 1917382656.0,
+            "43": 1917382656.0,
+            "44": 1917382656.0,
+            "45": 1917382656.0,
+            "46": 1917382656.0,
+            "47": 1917382656.0,
+            "48": 1917382656.0,
+            "49": 1917382656.0,
+            "50": 1917382656.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5502737408.0,
-            "2": 5907581952.0,
-            "3": 5907581952.0,
-            "4": 5907581952.0,
-            "5": 5907581952.0,
-            "6": 5907581952.0,
-            "7": 5907581952.0,
-            "8": 5907581952.0,
-            "9": 5907581952.0,
-            "10": 5907581952.0,
-            "11": 5907581952.0,
-            "12": 5907581952.0,
-            "13": 5907581952.0,
-            "14": 5907581952.0,
-            "15": 5907581952.0,
-            "16": 5907581952.0,
-            "17": 5907581952.0,
-            "18": 5907581952.0,
-            "19": 5907581952.0,
-            "20": 5907581952.0,
-            "21": 5907581952.0,
-            "22": 5907581952.0,
-            "23": 5907581952.0,
-            "24": 5907581952.0,
-            "25": 5907581952.0,
-            "26": 5907581952.0,
-            "27": 5907581952.0,
-            "28": 5907581952.0,
-            "29": 5907581952.0,
-            "30": 5907581952.0,
-            "31": 5907581952.0,
-            "32": 5907581952.0,
-            "33": 5907581952.0,
-            "34": 5907581952.0,
-            "35": 5907581952.0,
-            "36": 5907581952.0,
-            "37": 5907581952.0,
-            "38": 5907581952.0,
-            "39": 5907581952.0,
-            "40": 5907581952.0,
-            "41": 5907581952.0,
-            "42": 5907581952.0,
-            "43": 5907581952.0,
-            "44": 5907581952.0,
-            "45": 5907581952.0,
-            "46": 5907581952.0,
-            "47": 5907581952.0,
-            "48": 5907581952.0,
-            "49": 5907581952.0,
-            "50": 5907581952.0
+            "1": 5504180224.0,
+            "2": 5907845120.0,
+            "3": 5907845120.0,
+            "4": 5907845120.0,
+            "5": 5907845120.0,
+            "6": 5907845120.0,
+            "7": 5907845120.0,
+            "8": 5907845120.0,
+            "9": 5907845120.0,
+            "10": 5907845120.0,
+            "11": 5907845120.0,
+            "12": 5907845120.0,
+            "13": 5907845120.0,
+            "14": 5907845120.0,
+            "15": 5907845120.0,
+            "16": 5907845120.0,
+            "17": 5907845120.0,
+            "18": 5907845120.0,
+            "19": 5907845120.0,
+            "20": 5907845120.0,
+            "21": 5907845120.0,
+            "22": 5907845120.0,
+            "23": 5907845120.0,
+            "24": 5907845120.0,
+            "25": 5907845120.0,
+            "26": 5907845120.0,
+            "27": 5907845120.0,
+            "28": 5907845120.0,
+            "29": 5907845120.0,
+            "30": 5907845120.0,
+            "31": 5907845120.0,
+            "32": 5907845120.0,
+            "33": 5907845120.0,
+            "34": 5907845120.0,
+            "35": 5907845120.0,
+            "36": 5907845120.0,
+            "37": 5907845120.0,
+            "38": 5907845120.0,
+            "39": 5907845120.0,
+            "40": 5907845120.0,
+            "41": 5907845120.0,
+            "42": 5907845120.0,
+            "43": 5907845120.0,
+            "44": 5907845120.0,
+            "45": 5907845120.0,
+            "46": 5907845120.0,
+            "47": 5907845120.0,
+            "48": 5907845120.0,
+            "49": 5907845120.0,
+            "50": 5907845120.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 76.70816,
-            "2": 0.44479,
-            "3": 0.37638,
-            "4": 0.32493,
-            "5": 0.32865,
-            "6": 0.3221,
-            "7": 0.33027,
-            "8": 0.32627,
-            "9": 0.69409,
-            "10": 0.66689,
-            "11": 0.94476,
-            "12": 0.6757,
-            "13": 0.32571,
-            "14": 0.3194,
-            "15": 0.31954,
-            "16": 0.32142,
-            "17": 0.32144,
-            "18": 0.3188,
-            "19": 0.32023,
-            "20": 0.70348,
-            "21": 1.36061,
-            "22": 0.32306,
-            "23": 0.32129,
-            "24": 0.31927,
-            "25": 0.32503,
-            "26": 0.322,
-            "27": 0.31994,
-            "28": 0.32043,
-            "29": 0.31651,
-            "30": 0.31907,
-            "31": 1.31856,
-            "32": 0.32016,
-            "33": 0.31758,
-            "34": 0.31966,
-            "35": 0.31765,
-            "36": 0.31717,
-            "37": 0.3191,
-            "38": 0.31591,
-            "39": 0.3156,
-            "40": 0.31599,
-            "41": 0.90957,
-            "42": 0.32017,
-            "43": 0.31902,
-            "44": 0.32013,
-            "45": 0.32183,
-            "46": 0.31561,
-            "47": 0.31628,
-            "48": 0.31911,
-            "49": 0.31753,
-            "50": 0.31636
+            "1": 26.75792,
+            "2": 0.30494,
+            "3": 0.28789,
+            "4": 0.28506,
+            "5": 0.28809,
+            "6": 0.28382,
+            "7": 0.28771,
+            "8": 0.28452,
+            "9": 0.28435,
+            "10": 0.28347,
+            "11": 0.83806,
+            "12": 0.28353,
+            "13": 0.28316,
+            "14": 0.28187,
+            "15": 0.29083,
+            "16": 0.28487,
+            "17": 0.29825,
+            "18": 0.2809,
+            "19": 0.28761,
+            "20": 0.2836,
+            "21": 0.8563,
+            "22": 0.31557,
+            "23": 0.29574,
+            "24": 0.28275,
+            "25": 0.28216,
+            "26": 0.28209,
+            "27": 0.28247,
+            "28": 0.28433,
+            "29": 0.28471,
+            "30": 0.28186,
+            "31": 0.83551,
+            "32": 0.28363,
+            "33": 0.28327,
+            "34": 0.28256,
+            "35": 0.28367,
+            "36": 0.28263,
+            "37": 0.28149,
+            "38": 0.28362,
+            "39": 0.28319,
+            "40": 0.28289,
+            "41": 0.83483,
+            "42": 0.28322,
+            "43": 0.28246,
+            "44": 0.28238,
+            "45": 0.28223,
+            "46": 0.28104,
+            "47": 0.2861,
+            "48": 0.28269,
+            "49": 0.28433,
+            "50": 0.28632
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
index baf1fa52671..fbbb805b0df 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.98296,
-            "2": 10.98234,
-            "3": 10.98046,
-            "4": 10.96512,
-            "5": 10.99789,
-            "6": 11.00517,
-            "7": 10.98273,
-            "8": 10.97596,
-            "9": 10.9783,
-            "10": 10.9452,
-            "11": 10.99257,
-            "12": 10.96815,
-            "13": 10.9703,
-            "14": 10.98207,
-            "15": 10.85381,
-            "16": 10.85003,
-            "17": 10.80667,
-            "18": 10.82648,
-            "19": 10.81123,
-            "20": 10.62194,
-            "21": 10.56069,
-            "22": 10.32105,
-            "23": 10.59531,
-            "24": 10.32461,
-            "25": 10.23318,
-            "26": 10.33828,
-            "27": 10.34879,
-            "28": 10.32094,
-            "29": 10.33068,
-            "30": 9.8856,
-            "31": 9.42999,
-            "32": 10.05321,
-            "33": 10.0429,
-            "34": 9.6053,
-            "35": 9.64984,
-            "36": 9.52934,
-            "37": 9.76834,
-            "38": 9.48585,
-            "39": 9.87468,
-            "40": 9.30022,
-            "41": 9.44909,
-            "42": 9.52866,
-            "43": 9.01602,
-            "44": 9.12963,
-            "45": 8.96826,
-            "46": 9.03049,
-            "47": 9.45732,
-            "48": 9.02119,
-            "49": 8.56905,
-            "50": 9.10994
+            "1": 10.96474,
+            "2": 10.96158,
+            "3": 10.96811,
+            "4": 10.94673,
+            "5": 10.9862,
+            "6": 10.98821,
+            "7": 10.975,
+            "8": 10.95625,
+            "9": 10.95934,
+            "10": 10.92863,
+            "11": 10.97637,
+            "12": 10.95058,
+            "13": 10.95134,
+            "14": 10.98042,
+            "15": 10.85189,
+            "16": 10.84652,
+            "17": 10.80269,
+            "18": 10.81465,
+            "19": 10.80329,
+            "20": 10.61769,
+            "21": 10.56332,
+            "22": 10.327,
+            "23": 10.59443,
+            "24": 10.329,
+            "25": 10.23672,
+            "26": 10.34252,
+            "27": 10.3618,
+            "28": 10.33128,
+            "29": 10.33469,
+            "30": 9.9024,
+            "31": 9.44988,
+            "32": 10.06653,
+            "33": 10.04781,
+            "34": 9.619,
+            "35": 9.67714,
+            "36": 9.55042,
+            "37": 9.78904,
+            "38": 9.51089,
+            "39": 9.89036,
+            "40": 9.32367,
+            "41": 9.47992,
+            "42": 9.54708,
+            "43": 9.02808,
+            "44": 9.14479,
+            "45": 8.97643,
+            "46": 9.04145,
+            "47": 9.46744,
+            "48": 9.03259,
+            "49": 8.56923,
+            "50": 9.11023
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2992.0,
-            "2": 2911.0,
-            "3": 2981.0,
-            "4": 2784.0,
-            "5": 3153.0,
-            "6": 3292.0,
-            "7": 3123.0,
-            "8": 3104.0,
-            "9": 3123.0,
-            "10": 2796.0,
-            "11": 3497.0,
-            "12": 3305.0,
-            "13": 3271.0,
-            "14": 3414.0,
-            "15": 3082.0,
-            "16": 3257.0,
-            "17": 3088.0,
-            "18": 3113.0,
-            "19": 3283.0,
-            "20": 2980.0,
-            "21": 3045.0,
-            "22": 2623.0,
-            "23": 3281.0,
-            "24": 2774.0,
-            "25": 2745.0,
-            "26": 2827.0,
-            "27": 3106.0,
-            "28": 3227.0,
-            "29": 3118.0,
-            "30": 2695.0,
-            "31": 2326.0,
-            "32": 3058.0,
-            "33": 3138.0,
-            "34": 2755.0,
-            "35": 2931.0,
-            "36": 2947.0,
-            "37": 3169.0,
-            "38": 3016.0,
-            "39": 3187.0,
-            "40": 3076.0,
-            "41": 3043.0,
-            "42": 3245.0,
-            "43": 2813.0,
-            "44": 2934.0,
-            "45": 2868.0,
-            "46": 3015.0,
-            "47": 3294.0,
-            "48": 3327.0,
-            "49": 3253.0,
-            "50": 3403.0
+            "1": 3013.0,
+            "2": 3035.0,
+            "3": 2950.0,
+            "4": 2883.0,
+            "5": 3259.0,
+            "6": 3503.0,
+            "7": 3161.0,
+            "8": 2999.0,
+            "9": 3136.0,
+            "10": 2879.0,
+            "11": 3560.0,
+            "12": 3331.0,
+            "13": 3426.0,
+            "14": 3472.0,
+            "15": 3341.0,
+            "16": 3159.0,
+            "17": 3006.0,
+            "18": 3206.0,
+            "19": 3305.0,
+            "20": 3055.0,
+            "21": 3107.0,
+            "22": 2621.0,
+            "23": 3375.0,
+            "24": 2719.0,
+            "25": 2703.0,
+            "26": 2980.0,
+            "27": 2956.0,
+            "28": 3187.0,
+            "29": 3297.0,
+            "30": 2700.0,
+            "31": 2259.0,
+            "32": 3026.0,
+            "33": 3108.0,
+            "34": 2859.0,
+            "35": 2877.0,
+            "36": 2798.0,
+            "37": 2988.0,
+            "38": 3050.0,
+            "39": 3043.0,
+            "40": 3128.0,
+            "41": 2973.0,
+            "42": 3002.0,
+            "43": 2880.0,
+            "44": 2941.0,
+            "45": 2863.0,
+            "46": 3016.0,
+            "47": 3110.0,
+            "48": 3210.0,
+            "49": 3248.0,
+            "50": 3437.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1917251584.0,
-            "2": 1917251584.0,
-            "3": 1917251584.0,
-            "4": 1917251584.0,
-            "5": 1917251584.0,
-            "6": 1917251584.0,
-            "7": 1917251584.0,
-            "8": 1917251584.0,
-            "9": 1917251584.0,
-            "10": 1917251584.0,
-            "11": 1917251584.0,
-            "12": 1917251584.0,
-            "13": 1917251584.0,
-            "14": 1917251584.0,
-            "15": 1917251584.0,
-            "16": 1917251584.0,
-            "17": 1917251584.0,
-            "18": 1917251584.0,
-            "19": 1917251584.0,
-            "20": 1917251584.0,
-            "21": 1917251584.0,
-            "22": 1917251584.0,
-            "23": 1917251584.0,
-            "24": 1917251584.0,
-            "25": 1917251584.0,
-            "26": 1917251584.0,
-            "27": 1917251584.0,
-            "28": 1917251584.0,
-            "29": 1917251584.0,
-            "30": 1917251584.0,
-            "31": 1917251584.0,
-            "32": 1917251584.0,
-            "33": 1917251584.0,
-            "34": 1917251584.0,
-            "35": 1917251584.0,
-            "36": 1917251584.0,
-            "37": 1917251584.0,
-            "38": 1917251584.0,
-            "39": 1917251584.0,
-            "40": 1917251584.0,
-            "41": 1917251584.0,
-            "42": 1917251584.0,
-            "43": 1917251584.0,
-            "44": 1917251584.0,
-            "45": 1917251584.0,
-            "46": 1917251584.0,
-            "47": 1917251584.0,
-            "48": 1917251584.0,
-            "49": 1917251584.0,
-            "50": 1917251584.0
+            "1": 1917252608.0,
+            "2": 1917252608.0,
+            "3": 1917252608.0,
+            "4": 1917252608.0,
+            "5": 1917252608.0,
+            "6": 1917252608.0,
+            "7": 1917252608.0,
+            "8": 1917252608.0,
+            "9": 1917252608.0,
+            "10": 1917252608.0,
+            "11": 1917252608.0,
+            "12": 1917252608.0,
+            "13": 1917252608.0,
+            "14": 1917252608.0,
+            "15": 1917252608.0,
+            "16": 1917252608.0,
+            "17": 1917252608.0,
+            "18": 1917252608.0,
+            "19": 1917252608.0,
+            "20": 1917252608.0,
+            "21": 1917252608.0,
+            "22": 1917252608.0,
+            "23": 1917252608.0,
+            "24": 1917252608.0,
+            "25": 1917252608.0,
+            "26": 1917252608.0,
+            "27": 1917252608.0,
+            "28": 1917252608.0,
+            "29": 1917252608.0,
+            "30": 1917252608.0,
+            "31": 1917252608.0,
+            "32": 1917252608.0,
+            "33": 1917252608.0,
+            "34": 1917252608.0,
+            "35": 1917252608.0,
+            "36": 1917252608.0,
+            "37": 1917252608.0,
+            "38": 1917252608.0,
+            "39": 1917252608.0,
+            "40": 1917252608.0,
+            "41": 1917252608.0,
+            "42": 1917252608.0,
+            "43": 1917252608.0,
+            "44": 1917252608.0,
+            "45": 1917252608.0,
+            "46": 1917252608.0,
+            "47": 1917252608.0,
+            "48": 1917252608.0,
+            "49": 1917252608.0,
+            "50": 1917252608.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2520653312.0,
-            "2": 2743788032.0,
-            "3": 2743788032.0,
-            "4": 2743788032.0,
-            "5": 2743788032.0,
-            "6": 2743788032.0,
-            "7": 2743788032.0,
-            "8": 2743788032.0,
-            "9": 2743788032.0,
-            "10": 2743788032.0,
-            "11": 2743788032.0,
-            "12": 2743788032.0,
-            "13": 2743788032.0,
-            "14": 2743788032.0,
-            "15": 2743788032.0,
-            "16": 2743788032.0,
-            "17": 2743788032.0,
-            "18": 2743788032.0,
-            "19": 2743788032.0,
-            "20": 2743788032.0,
-            "21": 2743788032.0,
-            "22": 2743788032.0,
-            "23": 2743788032.0,
-            "24": 2743788032.0,
-            "25": 2743788032.0,
-            "26": 2743788032.0,
-            "27": 2743788032.0,
-            "28": 2743788032.0,
-            "29": 2743788032.0,
-            "30": 2743788032.0,
-            "31": 2743788032.0,
-            "32": 2743788032.0,
-            "33": 2743788032.0,
-            "34": 2743788032.0,
-            "35": 2743788032.0,
-            "36": 2743788032.0,
-            "37": 2743788032.0,
-            "38": 2743788032.0,
-            "39": 2743788032.0,
-            "40": 2743788032.0,
-            "41": 2743788032.0,
-            "42": 2743788032.0,
-            "43": 2743788032.0,
-            "44": 2743788032.0,
-            "45": 2743788032.0,
-            "46": 2743788032.0,
-            "47": 2743788032.0,
-            "48": 2743788032.0,
-            "49": 2743788032.0,
-            "50": 2743788032.0
+            "1": 2520785408.0,
+            "2": 2743789056.0,
+            "3": 2743789056.0,
+            "4": 2743789056.0,
+            "5": 2743789056.0,
+            "6": 2743789056.0,
+            "7": 2743789056.0,
+            "8": 2743789056.0,
+            "9": 2743789056.0,
+            "10": 2743789056.0,
+            "11": 2743789056.0,
+            "12": 2743789056.0,
+            "13": 2743789056.0,
+            "14": 2743789056.0,
+            "15": 2743789056.0,
+            "16": 2743789056.0,
+            "17": 2743789056.0,
+            "18": 2743789056.0,
+            "19": 2743789056.0,
+            "20": 2743789056.0,
+            "21": 2743789056.0,
+            "22": 2743789056.0,
+            "23": 2743789056.0,
+            "24": 2743789056.0,
+            "25": 2743789056.0,
+            "26": 2743789056.0,
+            "27": 2743789056.0,
+            "28": 2743789056.0,
+            "29": 2743789056.0,
+            "30": 2743789056.0,
+            "31": 2743789056.0,
+            "32": 2743789056.0,
+            "33": 2743789056.0,
+            "34": 2743789056.0,
+            "35": 2743789056.0,
+            "36": 2743789056.0,
+            "37": 2743789056.0,
+            "38": 2743789056.0,
+            "39": 2743789056.0,
+            "40": 2743789056.0,
+            "41": 2743789056.0,
+            "42": 2743789056.0,
+            "43": 2743789056.0,
+            "44": 2743789056.0,
+            "45": 2743789056.0,
+            "46": 2743789056.0,
+            "47": 2743789056.0,
+            "48": 2743789056.0,
+            "49": 2743789056.0,
+            "50": 2743789056.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 92.52278,
-            "2": 1.52203,
-            "3": 1.50103,
-            "4": 1.51627,
-            "5": 1.49943,
-            "6": 1.61325,
-            "7": 1.5622,
-            "8": 1.50668,
-            "9": 1.50122,
-            "10": 1.50749,
-            "11": 2.12764,
-            "12": 1.51111,
-            "13": 1.50973,
-            "14": 1.51712,
-            "15": 1.50952,
-            "16": 1.51343,
-            "17": 1.50742,
-            "18": 1.52017,
-            "19": 1.50622,
-            "20": 1.51648,
-            "21": 2.13229,
-            "22": 1.50789,
-            "23": 1.52087,
-            "24": 1.50668,
-            "25": 1.51534,
-            "26": 1.5016,
-            "27": 1.50737,
-            "28": 1.49873,
-            "29": 1.50715,
-            "30": 1.49941,
-            "31": 2.11492,
-            "32": 1.50348,
-            "33": 1.50106,
-            "34": 1.50093,
-            "35": 1.50813,
-            "36": 1.4988,
-            "37": 1.49847,
-            "38": 1.49777,
-            "39": 1.49937,
-            "40": 1.50456,
-            "41": 2.11318,
-            "42": 1.50605,
-            "43": 1.50721,
-            "44": 1.51813,
-            "45": 1.50211,
-            "46": 1.51633,
-            "47": 1.5019,
-            "48": 1.52386,
-            "49": 1.49987,
-            "50": 1.50829
+            "1": 35.39303,
+            "2": 1.47947,
+            "3": 1.43465,
+            "4": 1.42746,
+            "5": 1.42319,
+            "6": 1.43258,
+            "7": 1.42845,
+            "8": 1.41781,
+            "9": 1.4151,
+            "10": 1.41191,
+            "11": 1.95875,
+            "12": 1.3933,
+            "13": 1.39849,
+            "14": 1.39794,
+            "15": 1.40724,
+            "16": 1.39365,
+            "17": 1.38797,
+            "18": 1.3881,
+            "19": 1.38756,
+            "20": 1.4026,
+            "21": 1.98432,
+            "22": 1.40772,
+            "23": 1.40655,
+            "24": 1.411,
+            "25": 1.40775,
+            "26": 1.41523,
+            "27": 1.40237,
+            "28": 1.43117,
+            "29": 1.43476,
+            "30": 1.42856,
+            "31": 2.00614,
+            "32": 1.41414,
+            "33": 1.41736,
+            "34": 1.40899,
+            "35": 1.43827,
+            "36": 1.43529,
+            "37": 1.40205,
+            "38": 1.39968,
+            "39": 1.39625,
+            "40": 1.41137,
+            "41": 1.95978,
+            "42": 1.4124,
+            "43": 1.42729,
+            "44": 1.41966,
+            "45": 1.41646,
+            "46": 1.41671,
+            "47": 1.3922,
+            "48": 1.39545,
+            "49": 1.383,
+            "50": 1.38147
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml
index 7702274db5f..5bc40afede4 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml
@@ -68,7 +68,7 @@ MODEL_ARGS:
   --cuda-graph-impl: local
   --te-rng-tracker: true
   --inference-rng-tracker: true
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
 METRICS:
diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml
index 9a7769eb432..b5c3c409605 100644
--- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml
@@ -64,7 +64,7 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
 METRICS:
diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json
deleted file mode 100644
index f486950e5a2..00000000000
--- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/golden_values_dev_dgx_h100.json
+++ /dev/null
@@ -1,11492 +0,0 @@
-{
-    "lm loss": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 13.89756,
-            "5": 13.89155,
-            "10": 13.85814,
-            "15": 13.84947,
-            "20": 13.74128,
-            "25": 13.71269,
-            "30": 13.39136,
-            "35": 13.32418,
-            "40": 13.23329,
-            "45": 13.12045,
-            "50": 12.53632,
-            "55": 12.35058,
-            "60": 12.17187,
-            "65": 12.01029,
-            "70": 11.83519,
-            "75": 11.55823,
-            "80": 11.30557,
-            "85": 11.11711,
-            "90": 10.96045,
-            "95": 10.79835,
-            "100": 10.58719,
-            "105": 10.45871,
-            "110": 10.23985,
-            "115": 10.03197,
-            "120": 9.88087,
-            "125": 9.74001,
-            "130": 9.64895,
-            "135": 9.58316,
-            "140": 9.34895,
-            "145": 9.3363,
-            "150": 9.17736,
-            "155": 9.11162,
-            "160": 9.02957,
-            "165": 8.91504,
-            "170": 8.86399,
-            "175": 8.82531,
-            "180": 8.68067,
-            "185": 8.72019,
-            "190": 8.59287,
-            "195": 8.59803,
-            "200": 8.48665,
-            "205": 8.39681,
-            "210": 8.35424,
-            "215": 8.40636,
-            "220": 8.27837,
-            "225": 8.29496,
-            "230": 8.27773,
-            "235": 8.20463,
-            "240": 8.15385,
-            "245": 8.1344,
-            "250": 8.06891,
-            "255": 8.08354,
-            "260": 7.97761,
-            "265": 7.96264,
-            "270": 7.91745,
-            "275": 7.9055,
-            "280": 7.89502,
-            "285": 7.91233,
-            "290": 7.858,
-            "295": 7.84326,
-            "300": 7.73922,
-            "305": 7.73479,
-            "310": 7.6998,
-            "315": 7.6959,
-            "320": 7.68835,
-            "325": 7.60857,
-            "330": 7.59888,
-            "335": 7.57833,
-            "340": 7.62257,
-            "345": 7.51187,
-            "350": 7.5063,
-            "355": 7.43406,
-            "360": 7.53414,
-            "365": 7.45759,
-            "370": 7.49186,
-            "375": 7.43607,
-            "380": 7.41292,
-            "385": 7.41117,
-            "390": 7.42986,
-            "395": 7.36781,
-            "400": 7.30747,
-            "405": 7.31834,
-            "410": 7.30943,
-            "415": 7.29421,
-            "420": 7.2965,
-            "425": 7.26158,
-            "430": 7.20979,
-            "435": 7.22197,
-            "440": 7.18512,
-            "445": 7.1687,
-            "450": 7.12181,
-            "455": 7.14062,
-            "460": 7.11041,
-            "465": 7.10497,
-            "470": 7.07645,
-            "475": 7.09742,
-            "480": 6.97587,
-            "485": 7.03312,
-            "490": 6.99478,
-            "495": 6.9692,
-            "500": 6.91435,
-            "505": 6.94713,
-            "510": 6.92309,
-            "515": 6.88853,
-            "520": 6.88024,
-            "525": 6.87529,
-            "530": 6.88311,
-            "535": 6.8642,
-            "540": 6.78769,
-            "545": 6.8252,
-            "550": 6.84568,
-            "555": 6.86869,
-            "560": 6.81372,
-            "565": 6.74969,
-            "570": 6.76579,
-            "575": 6.77872,
-            "580": 6.69766,
-            "585": 6.71359,
-            "590": 6.65449,
-            "595": 6.64792,
-            "600": 6.67016,
-            "605": 6.65924,
-            "610": 6.63641,
-            "615": 6.68438,
-            "620": 6.60355,
-            "625": 6.57203,
-            "630": 6.56964,
-            "635": 6.60732,
-            "640": 6.59738,
-            "645": 6.5815,
-            "650": 6.62582,
-            "655": 6.62475,
-            "660": 6.53171,
-            "665": 6.52224,
-            "670": 6.47146,
-            "675": 6.57058,
-            "680": 6.53989,
-            "685": 6.49695,
-            "690": 6.47037,
-            "695": 6.43685,
-            "700": 6.43121,
-            "705": 6.4313,
-            "710": 6.46058,
-            "715": 6.46842,
-            "720": 6.35254,
-            "725": 6.40344,
-            "730": 6.39123,
-            "735": 6.41174,
-            "740": 6.34886,
-            "745": 6.31567,
-            "750": 6.37227,
-            "755": 6.29068,
-            "760": 6.30783,
-            "765": 6.32016,
-            "770": 6.31539,
-            "775": 6.3051,
-            "780": 6.27484,
-            "785": 6.28635,
-            "790": 6.25066,
-            "795": 6.24498,
-            "800": 6.22595,
-            "805": 6.30241,
-            "810": 6.16125,
-            "815": 6.18921,
-            "820": 6.19984,
-            "825": 6.20878,
-            "830": 6.21184,
-            "835": 6.16547,
-            "840": 6.13918,
-            "845": 6.18907,
-            "850": 6.14544,
-            "855": 6.14245,
-            "860": 6.12573,
-            "865": 6.14471,
-            "870": 6.103,
-            "875": 6.14755,
-            "880": 6.09503,
-            "885": 6.08625,
-            "890": 6.14906,
-            "895": 6.03612,
-            "900": 6.06033,
-            "905": 6.07119,
-            "910": 6.04765,
-            "915": 6.02795,
-            "920": 6.01922,
-            "925": 6.00762,
-            "930": 6.04202,
-            "935": 6.03448,
-            "940": 5.96552,
-            "945": 6.00691,
-            "950": 6.02802,
-            "955": 5.9757,
-            "960": 5.9732,
-            "965": 5.8947,
-            "970": 5.93848,
-            "975": 5.94046,
-            "980": 5.91694,
-            "985": 5.91057,
-            "990": 5.96163,
-            "995": 5.87028,
-            "1000": 5.89819,
-            "1005": 5.85552,
-            "1010": 5.89001,
-            "1015": 5.91011,
-            "1020": 5.82121,
-            "1025": 5.81525,
-            "1030": 5.82852,
-            "1035": 5.91121,
-            "1040": 5.83477,
-            "1045": 5.80641,
-            "1050": 5.84029,
-            "1055": 5.82471,
-            "1060": 5.77657,
-            "1065": 5.75965,
-            "1070": 5.80228,
-            "1075": 5.78852,
-            "1080": 5.77993,
-            "1085": 5.79347,
-            "1090": 5.7642,
-            "1095": 5.77727,
-            "1100": 5.73679,
-            "1105": 5.71252,
-            "1110": 5.76864,
-            "1115": 5.69994,
-            "1120": 5.64073,
-            "1125": 5.65212,
-            "1130": 5.71653,
-            "1135": 5.67194,
-            "1140": 5.66144,
-            "1145": 5.65572,
-            "1150": 5.68319,
-            "1155": 5.64543,
-            "1160": 5.63371,
-            "1165": 5.67226,
-            "1170": 5.65589,
-            "1175": 5.62136,
-            "1180": 5.63006,
-            "1185": 5.6181,
-            "1190": 5.60413,
-            "1195": 5.59825,
-            "1200": 5.54202,
-            "1205": 5.65572,
-            "1210": 5.51312,
-            "1215": 5.55359,
-            "1220": 5.63431,
-            "1225": 5.51403,
-            "1230": 5.56754,
-            "1235": 5.521,
-            "1240": 5.55808,
-            "1245": 5.52886,
-            "1250": 5.51046,
-            "1255": 5.50279,
-            "1260": 5.50208,
-            "1265": 5.47964,
-            "1270": 5.44537,
-            "1275": 5.52448,
-            "1280": 5.45447,
-            "1285": 5.4682,
-            "1290": 5.43648,
-            "1295": 5.46181,
-            "1300": 5.46016,
-            "1305": 5.43278,
-            "1310": 5.38271,
-            "1315": 5.44073,
-            "1320": 5.42393,
-            "1325": 5.3568,
-            "1330": 5.41966,
-            "1335": 5.39498,
-            "1340": 5.44678,
-            "1345": 5.4046,
-            "1350": 5.3745,
-            "1355": 5.36722,
-            "1360": 5.37555,
-            "1365": 5.38819,
-            "1370": 5.31687,
-            "1375": 5.3257,
-            "1380": 5.37435,
-            "1385": 5.33822,
-            "1390": 5.32907,
-            "1395": 5.35996,
-            "1400": 5.34708,
-            "1405": 5.32768,
-            "1410": 5.30321,
-            "1415": 5.26874,
-            "1420": 5.31115,
-            "1425": 5.3045,
-            "1430": 5.33954,
-            "1435": 5.24914,
-            "1440": 5.27894,
-            "1445": 5.31118,
-            "1450": 5.28087,
-            "1455": 5.30455,
-            "1460": 5.26455,
-            "1465": 5.26355,
-            "1470": 5.29615,
-            "1475": 5.27116,
-            "1480": 5.26692,
-            "1485": 5.21939,
-            "1490": 5.21283,
-            "1495": 5.23155,
-            "1500": 5.23275,
-            "1505": 5.20436,
-            "1510": 5.22447,
-            "1515": 5.15502,
-            "1520": 5.1852,
-            "1525": 5.15413,
-            "1530": 5.17452,
-            "1535": 5.16098,
-            "1540": 5.16276,
-            "1545": 5.19593,
-            "1550": 5.1989,
-            "1555": 5.18478,
-            "1560": 5.1253,
-            "1565": 5.15973,
-            "1570": 5.17281,
-            "1575": 5.1468,
-            "1580": 5.16002,
-            "1585": 5.14495,
-            "1590": 5.12815,
-            "1595": 5.09691,
-            "1600": 5.17173,
-            "1605": 5.09626,
-            "1610": 5.10506,
-            "1615": 5.09978,
-            "1620": 5.1145,
-            "1625": 5.10983,
-            "1630": 5.08211,
-            "1635": 5.12902,
-            "1640": 5.09565,
-            "1645": 5.08916,
-            "1650": 5.08067,
-            "1655": 5.06625,
-            "1660": 5.05546,
-            "1665": 5.04609,
-            "1670": 5.06711,
-            "1675": 5.06871,
-            "1680": 5.00775,
-            "1685": 5.01672,
-            "1690": 4.99799,
-            "1695": 5.00065,
-            "1700": 5.03983,
-            "1705": 5.01824,
-            "1710": 5.00629,
-            "1715": 4.97587,
-            "1720": 4.97437,
-            "1725": 4.9984,
-            "1730": 4.95014,
-            "1735": 5.02541,
-            "1740": 4.95266,
-            "1745": 4.97461,
-            "1750": 4.95639,
-            "1755": 4.97133,
-            "1760": 4.98489,
-            "1765": 4.93728,
-            "1770": 4.93343,
-            "1775": 4.9432,
-            "1780": 4.96314,
-            "1785": 4.91574,
-            "1790": 4.93944,
-            "1795": 4.93848,
-            "1800": 4.88725,
-            "1805": 4.87771,
-            "1810": 4.8976,
-            "1815": 4.89801,
-            "1820": 4.8872,
-            "1825": 4.89371,
-            "1830": 4.8786,
-            "1835": 4.87542,
-            "1840": 4.87209,
-            "1845": 4.85811,
-            "1850": 4.83484,
-            "1855": 4.89133,
-            "1860": 4.84322,
-            "1865": 4.85108,
-            "1870": 4.82648,
-            "1875": 4.83877,
-            "1880": 4.89485,
-            "1885": 4.84392,
-            "1890": 4.8281,
-            "1895": 4.77339,
-            "1900": 4.81423,
-            "1905": 4.81232,
-            "1910": 4.82991,
-            "1915": 4.79768,
-            "1920": 4.78308,
-            "1925": 4.79277,
-            "1930": 4.76544,
-            "1935": 4.7941,
-            "1940": 4.75875,
-            "1945": 4.80214,
-            "1950": 4.83843,
-            "1955": 4.77731,
-            "1960": 4.76768,
-            "1965": 4.72596,
-            "1970": 4.73388,
-            "1975": 4.7973,
-            "1980": 4.73036,
-            "1985": 4.74162,
-            "1990": 4.78353,
-            "1995": 4.74959,
-            "2000": 4.76948,
-            "2005": 4.80113,
-            "2010": 4.70951,
-            "2015": 4.69715,
-            "2020": 4.71284,
-            "2025": 4.75821,
-            "2030": 4.68831,
-            "2035": 4.71528,
-            "2040": 4.67772,
-            "2045": 4.76255,
-            "2050": 4.74404,
-            "2055": 4.7077,
-            "2060": 4.70614,
-            "2065": 4.66526,
-            "2070": 4.67653,
-            "2075": 4.69507,
-            "2080": 4.66174,
-            "2085": 4.69911,
-            "2090": 4.61739,
-            "2095": 4.64746,
-            "2100": 4.61666,
-            "2105": 4.64633,
-            "2110": 4.64123,
-            "2115": 4.65336,
-            "2120": 4.64559,
-            "2125": 4.61059,
-            "2130": 4.61466,
-            "2135": 4.62745,
-            "2140": 4.6232,
-            "2145": 4.58124,
-            "2150": 4.60983,
-            "2155": 4.57956,
-            "2160": 4.60382,
-            "2165": 4.58415,
-            "2170": 4.61387,
-            "2175": 4.60275,
-            "2180": 4.59531,
-            "2185": 4.60788,
-            "2190": 4.58246,
-            "2195": 4.55672,
-            "2200": 4.55346,
-            "2205": 4.56383,
-            "2210": 4.6146,
-            "2215": 4.64276,
-            "2220": 4.59912,
-            "2225": 4.57263,
-            "2230": 4.56854,
-            "2235": 4.61797,
-            "2240": 4.51401,
-            "2245": 4.5176,
-            "2250": 4.52905,
-            "2255": 4.54117,
-            "2260": 4.48536,
-            "2265": 4.56489,
-            "2270": 4.49655,
-            "2275": 4.55547,
-            "2280": 4.51075,
-            "2285": 4.53333,
-            "2290": 4.52269,
-            "2295": 4.52707,
-            "2300": 4.53228,
-            "2305": 4.49287,
-            "2310": 4.53148,
-            "2315": 4.46329,
-            "2320": 4.51121,
-            "2325": 4.49336,
-            "2330": 4.49351,
-            "2335": 4.47787,
-            "2340": 4.48626,
-            "2345": 4.52525,
-            "2350": 4.4674,
-            "2355": 4.47173,
-            "2360": 4.44099,
-            "2365": 4.44682,
-            "2370": 4.44716,
-            "2375": 4.44199,
-            "2380": 4.39487,
-            "2385": 4.43475,
-            "2390": 4.43071,
-            "2395": 4.46719,
-            "2400": 4.42074,
-            "2405": 4.40081,
-            "2410": 4.44955,
-            "2415": 4.42055,
-            "2420": 4.4293,
-            "2425": 4.39783,
-            "2430": 4.42084,
-            "2435": 4.40291,
-            "2440": 4.39501,
-            "2445": 4.40808,
-            "2450": 4.38239,
-            "2455": 4.4178,
-            "2460": 4.36606,
-            "2465": 4.41327,
-            "2470": 4.40023,
-            "2475": 4.41776,
-            "2480": 4.34092,
-            "2485": 4.37423,
-            "2490": 4.37838,
-            "2495": 4.35662,
-            "2500": 4.36528,
-            "2505": 4.37219,
-            "2510": 4.41251,
-            "2515": 4.40356,
-            "2520": 4.34516,
-            "2525": 4.36214,
-            "2530": 4.36786,
-            "2535": 4.36686,
-            "2540": 4.36548,
-            "2545": 4.37687,
-            "2550": 4.30337,
-            "2555": 4.37244,
-            "2560": 4.35158,
-            "2565": 4.30393,
-            "2570": 4.33393,
-            "2575": 4.30697,
-            "2580": 4.30582,
-            "2585": 4.29358,
-            "2590": 4.31272,
-            "2595": 4.28154,
-            "2600": 4.29867,
-            "2605": 4.31115,
-            "2610": 4.32106,
-            "2615": 4.27768,
-            "2620": 4.26935,
-            "2625": 4.30437,
-            "2630": 4.22434,
-            "2635": 4.30369,
-            "2640": 4.30012,
-            "2645": 4.2581,
-            "2650": 4.28639,
-            "2655": 4.26647,
-            "2660": 4.21474,
-            "2665": 4.30436,
-            "2670": 4.26382,
-            "2675": 4.2306,
-            "2680": 4.25227,
-            "2685": 4.25736,
-            "2690": 4.22986,
-            "2695": 4.28379,
-            "2700": 4.19098,
-            "2705": 4.23853,
-            "2710": 4.25092,
-            "2715": 4.23481,
-            "2720": 4.24356,
-            "2725": 4.2225,
-            "2730": 4.22941,
-            "2735": 4.22363,
-            "2740": 4.20346,
-            "2745": 4.18765,
-            "2750": 4.21101,
-            "2755": 4.22237,
-            "2760": 4.22902,
-            "2765": 4.18298,
-            "2770": 4.23755,
-            "2775": 4.17706,
-            "2780": 4.21186,
-            "2785": 4.19469,
-            "2790": 4.21736,
-            "2795": 4.18988,
-            "2800": 4.1159,
-            "2805": 4.16613,
-            "2810": 4.17076,
-            "2815": 4.15389,
-            "2820": 4.1969,
-            "2825": 4.19241,
-            "2830": 4.16864,
-            "2835": 4.17046,
-            "2840": 4.16148,
-            "2845": 4.14967,
-            "2850": 4.16619,
-            "2855": 4.11805,
-            "2860": 4.14572,
-            "2865": 4.17023,
-            "2870": 4.14096,
-            "2875": 4.1596,
-            "2880": 4.08582,
-            "2885": 4.14242,
-            "2890": 4.11503,
-            "2895": 4.15452,
-            "2900": 4.09735,
-            "2905": 4.11101,
-            "2910": 4.10798,
-            "2915": 4.14914,
-            "2920": 4.12546,
-            "2925": 4.10099,
-            "2930": 4.08522,
-            "2935": 4.07896,
-            "2940": 4.09225,
-            "2945": 4.06113,
-            "2950": 4.03479,
-            "2955": 4.03763,
-            "2960": 4.04955,
-            "2965": 4.0643,
-            "2970": 4.08593,
-            "2975": 4.0941,
-            "2980": 4.03102,
-            "2985": 4.07394,
-            "2990": 4.08923,
-            "2995": 4.03231,
-            "3000": 4.0436,
-            "3005": 4.02568,
-            "3010": 4.06747,
-            "3015": 4.02305,
-            "3020": 4.03992,
-            "3025": 4.02491,
-            "3030": 4.0567,
-            "3035": 4.04059,
-            "3040": 4.0544,
-            "3045": 4.04677,
-            "3050": 4.017,
-            "3055": 4.00507,
-            "3060": 3.9904,
-            "3065": 4.02281,
-            "3070": 4.03826,
-            "3075": 3.97211,
-            "3080": 4.0011,
-            "3085": 4.00548,
-            "3090": 4.00887,
-            "3095": 4.02745,
-            "3100": 4.01465,
-            "3105": 3.99035,
-            "3110": 3.99124,
-            "3115": 3.92509,
-            "3120": 4.00505,
-            "3125": 3.94183,
-            "3130": 3.96987,
-            "3135": 3.96132,
-            "3140": 3.95209,
-            "3145": 3.93524,
-            "3150": 3.96949,
-            "3155": 3.96213,
-            "3160": 3.96255,
-            "3165": 3.96146,
-            "3170": 3.96456,
-            "3175": 3.93165,
-            "3180": 3.93784,
-            "3185": 3.90234,
-            "3190": 3.92455,
-            "3195": 3.9116,
-            "3200": 3.89013,
-            "3205": 3.92029,
-            "3210": 3.89711,
-            "3215": 3.90569,
-            "3220": 3.89706,
-            "3225": 3.91097,
-            "3230": 3.89895,
-            "3235": 3.91122,
-            "3240": 3.88912,
-            "3245": 3.88902,
-            "3250": 3.84407,
-            "3255": 3.89259,
-            "3260": 3.88283,
-            "3265": 3.92603,
-            "3270": 3.9052,
-            "3275": 3.85915,
-            "3280": 3.88232,
-            "3285": 3.86652,
-            "3290": 3.86681,
-            "3295": 3.83806,
-            "3300": 3.85349,
-            "3305": 3.86048,
-            "3310": 3.85872,
-            "3315": 3.89673,
-            "3320": 3.85179,
-            "3325": 3.84353,
-            "3330": 3.82539,
-            "3335": 3.86213,
-            "3340": 3.81824,
-            "3345": 3.83129,
-            "3350": 3.85901,
-            "3355": 3.8452,
-            "3360": 3.83241,
-            "3365": 3.83682,
-            "3370": 3.82265,
-            "3375": 3.85232,
-            "3380": 3.79563,
-            "3385": 3.81353,
-            "3390": 3.79143,
-            "3395": 3.86888,
-            "3400": 3.83997,
-            "3405": 3.86197,
-            "3410": 3.77529,
-            "3415": 3.72916,
-            "3420": 3.80048,
-            "3425": 3.81237,
-            "3430": 3.84497,
-            "3435": 3.80796,
-            "3440": 3.8267,
-            "3445": 3.7742,
-            "3450": 3.78787,
-            "3455": 3.80217,
-            "3460": 3.78265,
-            "3465": 3.75891,
-            "3470": 3.77341,
-            "3475": 3.77638,
-            "3480": 3.77988,
-            "3485": 3.80588,
-            "3490": 3.76958,
-            "3495": 3.80315,
-            "3500": 3.77047,
-            "3505": 3.77239,
-            "3510": 3.75092,
-            "3515": 3.80896,
-            "3520": 3.79879,
-            "3525": 3.76372,
-            "3530": 3.75322,
-            "3535": 3.76209,
-            "3540": 3.81796,
-            "3545": 3.72915,
-            "3550": 3.79201,
-            "3555": 3.72604,
-            "3560": 3.78622,
-            "3565": 3.7451,
-            "3570": 3.74254,
-            "3575": 3.71868,
-            "3580": 3.77066,
-            "3585": 3.76174,
-            "3590": 3.68853,
-            "3595": 3.76509,
-            "3600": 3.71336,
-            "3605": 3.71948,
-            "3610": 3.70916,
-            "3615": 3.74868,
-            "3620": 3.7837,
-            "3625": 3.71964,
-            "3630": 3.76519,
-            "3635": 3.68617,
-            "3640": 3.7093,
-            "3645": 3.74263,
-            "3650": 3.69638,
-            "3655": 3.72074,
-            "3660": 3.72832,
-            "3665": 3.74694,
-            "3670": 3.71178,
-            "3675": 3.71065,
-            "3680": 3.72416,
-            "3685": 3.67473,
-            "3690": 3.6936,
-            "3695": 3.68528,
-            "3700": 3.70814,
-            "3705": 3.67651,
-            "3710": 3.68493,
-            "3715": 3.6842,
-            "3720": 3.66563,
-            "3725": 3.64716,
-            "3730": 3.64883,
-            "3735": 3.68782,
-            "3740": 3.6732,
-            "3745": 3.66354,
-            "3750": 3.6757,
-            "3755": 3.66351,
-            "3760": 3.67285,
-            "3765": 3.66004,
-            "3770": 3.6516,
-            "3775": 3.63831,
-            "3780": 3.62453,
-            "3785": 3.6765,
-            "3790": 3.60163,
-            "3795": 3.64291,
-            "3800": 3.63275,
-            "3805": 3.62032,
-            "3810": 3.59475,
-            "3815": 3.63585,
-            "3820": 3.64099,
-            "3825": 3.6535,
-            "3830": 3.63864,
-            "3835": 3.59938,
-            "3840": 3.67685,
-            "3845": 3.65895,
-            "3850": 3.60064,
-            "3855": 3.60428,
-            "3860": 3.65711,
-            "3865": 3.60867,
-            "3870": 3.6721,
-            "3875": 3.58596,
-            "3880": 3.58212,
-            "3885": 3.60502,
-            "3890": 3.60969,
-            "3895": 3.5558,
-            "3900": 3.61685,
-            "3905": 3.59135,
-            "3910": 3.5772,
-            "3915": 3.5862,
-            "3920": 3.57131,
-            "3925": 3.56751,
-            "3930": 3.58005,
-            "3935": 3.5821,
-            "3940": 3.57511,
-            "3945": 3.56965,
-            "3950": 3.61887,
-            "3955": 3.57531,
-            "3960": 3.60735,
-            "3965": 3.58853,
-            "3970": 3.56735,
-            "3975": 3.56709,
-            "3980": 3.5304,
-            "3985": 3.60527,
-            "3990": 3.58124,
-            "3995": 3.60753,
-            "4000": 3.55811,
-            "4005": 3.54162,
-            "4010": 3.58376,
-            "4015": 3.58398,
-            "4020": 3.58355,
-            "4025": 3.57409,
-            "4030": 3.62855,
-            "4035": 3.57033,
-            "4040": 3.5882,
-            "4045": 3.60161,
-            "4050": 3.57522,
-            "4055": 3.57403,
-            "4060": 3.5888,
-            "4065": 3.58382,
-            "4070": 3.51488,
-            "4075": 3.55887,
-            "4080": 3.53108,
-            "4085": 3.54596,
-            "4090": 3.54584,
-            "4095": 3.53161,
-            "4100": 3.55106,
-            "4105": 3.53794,
-            "4110": 3.51736,
-            "4115": 3.56348,
-            "4120": 3.49648,
-            "4125": 3.49769,
-            "4130": 3.55149,
-            "4135": 3.54373,
-            "4140": 3.49112,
-            "4145": 3.51351,
-            "4150": 3.55497,
-            "4155": 3.48797,
-            "4160": 3.54539,
-            "4165": 3.56451,
-            "4170": 3.50424,
-            "4175": 3.50239,
-            "4180": 3.4998,
-            "4185": 3.5138,
-            "4190": 3.5011,
-            "4195": 3.50044,
-            "4200": 3.49424,
-            "4205": 3.53032,
-            "4210": 3.51921,
-            "4215": 3.52292,
-            "4220": 3.53088,
-            "4225": 3.50168,
-            "4230": 3.49756,
-            "4235": 3.52008,
-            "4240": 3.49249,
-            "4245": 3.49542,
-            "4250": 3.48848,
-            "4255": 3.50707,
-            "4260": 3.4676,
-            "4265": 3.48819,
-            "4270": 3.50473,
-            "4275": 3.53933,
-            "4280": 3.48997,
-            "4285": 3.50947,
-            "4290": 3.48405,
-            "4295": 3.48692,
-            "4300": 3.52631,
-            "4305": 3.48704,
-            "4310": 3.51358,
-            "4315": 3.50638,
-            "4320": 3.50379,
-            "4325": 3.51699,
-            "4330": 3.45992,
-            "4335": 3.49232,
-            "4340": 3.50354,
-            "4345": 3.43189,
-            "4350": 3.44845,
-            "4355": 3.52327,
-            "4360": 3.48083,
-            "4365": 3.47079,
-            "4370": 3.47624,
-            "4375": 3.44129,
-            "4380": 3.44296,
-            "4385": 3.42527,
-            "4390": 3.49048,
-            "4395": 3.47699,
-            "4400": 3.47442,
-            "4405": 3.41723,
-            "4410": 3.48335,
-            "4415": 3.44899,
-            "4420": 3.44113,
-            "4425": 3.47273,
-            "4430": 3.44742,
-            "4435": 3.49082,
-            "4440": 3.48522,
-            "4445": 3.43744,
-            "4450": 3.3974,
-            "4455": 3.4624,
-            "4460": 3.43415,
-            "4465": 3.45284,
-            "4470": 3.42199,
-            "4475": 3.45352,
-            "4480": 3.44375,
-            "4485": 3.43643,
-            "4490": 3.43453,
-            "4495": 3.38677,
-            "4500": 3.45384,
-            "4505": 3.43515,
-            "4510": 3.44292,
-            "4515": 3.40605,
-            "4520": 3.43888,
-            "4525": 3.40731,
-            "4530": 3.44131,
-            "4535": 3.3963,
-            "4540": 3.42067,
-            "4545": 3.43217,
-            "4550": 3.47418,
-            "4555": 3.39854,
-            "4560": 3.42732,
-            "4565": 3.37837,
-            "4570": 3.41702,
-            "4575": 3.41117,
-            "4580": 3.45362,
-            "4585": 3.42636,
-            "4590": 3.42388,
-            "4595": 3.39853,
-            "4600": 3.39686,
-            "4605": 3.42144,
-            "4610": 3.41286,
-            "4615": 3.45309,
-            "4620": 3.39526,
-            "4625": 3.42534,
-            "4630": 3.4127,
-            "4635": 3.39195,
-            "4640": 3.4264,
-            "4645": 3.41975,
-            "4650": 3.43542,
-            "4655": 3.40687,
-            "4660": 3.39737,
-            "4665": 3.41231,
-            "4670": 3.446,
-            "4675": 3.40423,
-            "4680": 3.42886,
-            "4685": 3.42464,
-            "4690": 3.39897,
-            "4695": 3.38,
-            "4700": 3.3729,
-            "4705": 3.35029,
-            "4710": 3.40571,
-            "4715": 3.39222,
-            "4720": 3.38774,
-            "4725": 3.35968,
-            "4730": 3.39519,
-            "4735": 3.32069,
-            "4740": 3.36458,
-            "4745": 3.40698,
-            "4750": 3.36053,
-            "4755": 3.39053,
-            "4760": 3.41421,
-            "4765": 3.36022,
-            "4770": 3.36502,
-            "4775": 3.36135,
-            "4780": 3.37362,
-            "4785": 3.374,
-            "4790": 3.41163,
-            "4795": 3.39334,
-            "4800": 3.34583,
-            "4805": 3.41139,
-            "4810": 3.35086,
-            "4815": 3.38903,
-            "4820": 3.34814,
-            "4825": 3.40406,
-            "4830": 3.38314,
-            "4835": 3.3693,
-            "4840": 3.38086,
-            "4845": 3.32726,
-            "4850": 3.39372,
-            "4855": 3.39679,
-            "4860": 3.32727,
-            "4865": 3.36392,
-            "4870": 3.34896,
-            "4875": 3.39123,
-            "4880": 3.39974,
-            "4885": 3.35153,
-            "4890": 3.36191,
-            "4895": 3.35318,
-            "4900": 3.32971,
-            "4905": 3.33008,
-            "4910": 3.32861,
-            "4915": 3.37524,
-            "4920": 3.35807,
-            "4925": 3.31242,
-            "4930": 3.34376,
-            "4935": 3.3273,
-            "4940": 3.28784,
-            "4945": 3.36034,
-            "4950": 3.29629,
-            "4955": 3.40365,
-            "4960": 3.3479,
-            "4965": 3.34204,
-            "4970": 3.33369,
-            "4975": 3.34388,
-            "4980": 3.36573,
-            "4985": 3.35352,
-            "4990": 3.33542,
-            "4995": 3.3795,
-            "5000": 3.30893,
-            "5005": 3.35715,
-            "5010": 3.36146,
-            "5015": 3.30923,
-            "5020": 3.28653,
-            "5025": 3.31605,
-            "5030": 3.32648,
-            "5035": 3.32963,
-            "5040": 3.30481,
-            "5045": 3.34994,
-            "5050": 3.30693,
-            "5055": 3.32632,
-            "5060": 3.28843,
-            "5065": 3.33396,
-            "5070": 3.33431,
-            "5075": 3.34337,
-            "5080": 3.31868,
-            "5085": 3.34518,
-            "5090": 3.32323,
-            "5095": 3.29022,
-            "5100": 3.32026,
-            "5105": 3.32744,
-            "5110": 3.3329,
-            "5115": 3.3038,
-            "5120": 3.34196,
-            "5125": 3.3184,
-            "5130": 3.31738,
-            "5135": 3.30105,
-            "5140": 3.3111,
-            "5145": 3.31125,
-            "5150": 3.32063,
-            "5155": 3.31567,
-            "5160": 3.31039,
-            "5165": 3.34534,
-            "5170": 3.23105,
-            "5175": 3.31877,
-            "5180": 3.28445,
-            "5185": 3.30691,
-            "5190": 3.32611,
-            "5195": 3.30561,
-            "5200": 3.31019,
-            "5205": 3.34654,
-            "5210": 3.28506,
-            "5215": 3.2874,
-            "5220": 3.28219,
-            "5225": 3.28677,
-            "5230": 3.32011,
-            "5235": 3.27975,
-            "5240": 3.27349,
-            "5245": 3.29646,
-            "5250": 3.3023,
-            "5255": 3.28615,
-            "5260": 3.31039,
-            "5265": 3.27007,
-            "5270": 3.25412,
-            "5275": 3.25534,
-            "5280": 3.28407,
-            "5285": 3.30874,
-            "5290": 3.2589,
-            "5295": 3.27448,
-            "5300": 3.27858,
-            "5305": 3.26656,
-            "5310": 3.32809,
-            "5315": 3.25873,
-            "5320": 3.30633,
-            "5325": 3.3111,
-            "5330": 3.27899,
-            "5335": 3.28833,
-            "5340": 3.23016,
-            "5345": 3.28336,
-            "5350": 3.28737,
-            "5355": 3.28737,
-            "5360": 3.23407,
-            "5365": 3.25011,
-            "5370": 3.28855,
-            "5375": 3.26985,
-            "5380": 3.24418,
-            "5385": 3.28394,
-            "5390": 3.28221,
-            "5395": 3.20448,
-            "5400": 3.30114,
-            "5405": 3.21525,
-            "5410": 3.29188,
-            "5415": 3.22284,
-            "5420": 3.25707,
-            "5425": 3.23689,
-            "5430": 3.24779,
-            "5435": 3.2811,
-            "5440": 3.21236,
-            "5445": 3.24176,
-            "5450": 3.24576,
-            "5455": 3.22991,
-            "5460": 3.25196,
-            "5465": 3.29692,
-            "5470": 3.27194,
-            "5475": 3.20136,
-            "5480": 3.28214,
-            "5485": 3.24325,
-            "5490": 3.26633,
-            "5495": 3.27183,
-            "5500": 3.22718,
-            "5505": 3.23914,
-            "5510": 3.28342,
-            "5515": 3.27035,
-            "5520": 3.23742,
-            "5525": 3.28473,
-            "5530": 3.22923,
-            "5535": 3.26258,
-            "5540": 3.25366,
-            "5545": 3.26198,
-            "5550": 3.24962,
-            "5555": 3.22875,
-            "5560": 3.22306,
-            "5565": 3.26845,
-            "5570": 3.22989,
-            "5575": 3.26435,
-            "5580": 3.23553,
-            "5585": 3.18594,
-            "5590": 3.24664,
-            "5595": 3.2105,
-            "5600": 3.25488,
-            "5605": 3.17461,
-            "5610": 3.2604,
-            "5615": 3.25606,
-            "5620": 3.2609,
-            "5625": 3.25214,
-            "5630": 3.24091,
-            "5635": 3.21924,
-            "5640": 3.24377,
-            "5645": 3.20743,
-            "5650": 3.2076,
-            "5655": 3.20542,
-            "5660": 3.20971,
-            "5665": 3.21069,
-            "5670": 3.20056,
-            "5675": 3.22863,
-            "5680": 3.19922,
-            "5685": 3.20573,
-            "5690": 3.2077,
-            "5695": 3.24414,
-            "5700": 3.19628,
-            "5705": 3.18515,
-            "5710": 3.17855,
-            "5715": 3.28582,
-            "5720": 3.2496,
-            "5725": 3.2002,
-            "5730": 3.24085,
-            "5735": 3.22905,
-            "5740": 3.22477,
-            "5745": 3.20281,
-            "5750": 3.23329,
-            "5755": 3.23832,
-            "5760": 3.22288,
-            "5765": 3.22651,
-            "5770": 3.25303,
-            "5775": 3.19712,
-            "5780": 3.21565,
-            "5785": 3.21756,
-            "5790": 3.22715,
-            "5795": 3.22463,
-            "5800": 3.16888,
-            "5805": 3.18332,
-            "5810": 3.22432,
-            "5815": 3.20302,
-            "5820": 3.16241,
-            "5825": 3.20754,
-            "5830": 3.1647,
-            "5835": 3.17395,
-            "5840": 3.20628,
-            "5845": 3.217,
-            "5850": 3.21594,
-            "5855": 3.15148,
-            "5860": 3.17119,
-            "5865": 3.20009,
-            "5870": 3.16136,
-            "5875": 3.20014,
-            "5880": 3.19456,
-            "5885": 3.19488,
-            "5890": 3.21776,
-            "5895": 3.23301,
-            "5900": 3.1895,
-            "5905": 3.21986,
-            "5910": 3.20185,
-            "5915": 3.17464,
-            "5920": 3.1915,
-            "5925": 3.15681,
-            "5930": 3.19135,
-            "5935": 3.19128,
-            "5940": 3.2051,
-            "5945": 3.21968,
-            "5950": 3.20213,
-            "5955": 3.16275,
-            "5960": 3.22598,
-            "5965": 3.17666,
-            "5970": 3.21828,
-            "5975": 3.18539,
-            "5980": 3.25556,
-            "5985": 3.14035,
-            "5990": 3.2373,
-            "5995": 3.15341,
-            "6000": 3.17562,
-            "6005": 3.15642,
-            "6010": 3.15958,
-            "6015": 3.16383,
-            "6020": 3.17057,
-            "6025": 3.20846,
-            "6030": 3.14683,
-            "6035": 3.20108,
-            "6040": 3.18034,
-            "6045": 3.19784,
-            "6050": 3.19841,
-            "6055": 3.17123,
-            "6060": 3.18513,
-            "6065": 3.20946,
-            "6070": 3.16514,
-            "6075": 3.13204,
-            "6080": 3.19182,
-            "6085": 3.15022,
-            "6090": 3.18799,
-            "6095": 3.18454,
-            "6100": 3.13968,
-            "6105": 3.18911,
-            "6110": 3.13194,
-            "6115": 3.18032,
-            "6120": 3.17268,
-            "6125": 3.17817,
-            "6130": 3.16826,
-            "6135": 3.16641,
-            "6140": 3.16491,
-            "6145": 3.14203,
-            "6150": 3.17849,
-            "6155": 3.14973,
-            "6160": 3.12836,
-            "6165": 3.15943,
-            "6170": 3.14366,
-            "6175": 3.14619,
-            "6180": 3.14564,
-            "6185": 3.18694,
-            "6190": 3.15491,
-            "6195": 3.12582,
-            "6200": 3.15218,
-            "6205": 3.14598,
-            "6210": 3.10092,
-            "6215": 3.15518,
-            "6220": 3.1544,
-            "6225": 3.17142,
-            "6230": 3.10668,
-            "6235": 3.14063,
-            "6240": 3.08394,
-            "6245": 3.18223,
-            "6250": 3.14309,
-            "6255": 3.15773,
-            "6260": 3.14125,
-            "6265": 3.15597,
-            "6270": 3.10065,
-            "6275": 3.12382,
-            "6280": 3.13503,
-            "6285": 3.11829,
-            "6290": 3.14415,
-            "6295": 3.15298,
-            "6300": 3.15403,
-            "6305": 3.21086,
-            "6310": 3.11266,
-            "6315": 3.10982,
-            "6320": 3.16047,
-            "6325": 3.10246,
-            "6330": 3.16954,
-            "6335": 3.15391,
-            "6340": 3.10904,
-            "6345": 3.16578,
-            "6350": 3.11808,
-            "6355": 3.11742,
-            "6360": 3.1108,
-            "6365": 3.14775,
-            "6370": 3.16278,
-            "6375": 3.1337,
-            "6380": 3.15125,
-            "6385": 3.17081,
-            "6390": 3.12597,
-            "6395": 3.10466,
-            "6400": 3.10591,
-            "6405": 3.18617,
-            "6410": 3.17298,
-            "6415": 3.12537,
-            "6420": 3.17096,
-            "6425": 3.17458,
-            "6430": 3.16659,
-            "6435": 3.12451,
-            "6440": 3.13606,
-            "6445": 3.15196,
-            "6450": 3.09161,
-            "6455": 3.08666,
-            "6460": 3.13082,
-            "6465": 3.16786,
-            "6470": 3.13951,
-            "6475": 3.13285,
-            "6480": 3.15191,
-            "6485": 3.11206,
-            "6490": 3.0797,
-            "6495": 3.16564,
-            "6500": 3.14177,
-            "6505": 3.08566,
-            "6510": 3.14483,
-            "6515": 3.16369,
-            "6520": 3.09044,
-            "6525": 3.14867,
-            "6530": 3.10896,
-            "6535": 3.12403,
-            "6540": 3.18005,
-            "6545": 3.11404,
-            "6550": 3.11103,
-            "6555": 3.10947,
-            "6560": 3.0737,
-            "6565": 3.07934,
-            "6570": 3.10438,
-            "6575": 3.05844,
-            "6580": 3.17411,
-            "6585": 3.10694,
-            "6590": 3.0877,
-            "6595": 3.10332,
-            "6600": 3.1032,
-            "6605": 3.08625,
-            "6610": 3.08405,
-            "6615": 3.1316,
-            "6620": 3.076,
-            "6625": 3.09705,
-            "6630": 3.09309,
-            "6635": 3.12933,
-            "6640": 3.08864,
-            "6645": 3.10948,
-            "6650": 3.1378,
-            "6655": 3.07416,
-            "6660": 3.11313,
-            "6665": 3.12487,
-            "6670": 3.08048,
-            "6675": 3.10457,
-            "6680": 3.10673,
-            "6685": 3.14077,
-            "6690": 3.11651,
-            "6695": 3.12176,
-            "6700": 3.1127,
-            "6705": 3.09107,
-            "6710": 3.10728,
-            "6715": 3.05842,
-            "6720": 3.13504,
-            "6725": 3.12621,
-            "6730": 3.1099,
-            "6735": 3.10898,
-            "6740": 3.11731,
-            "6745": 3.0901,
-            "6750": 3.10983,
-            "6755": 3.06749,
-            "6760": 3.06624,
-            "6765": 3.08509,
-            "6770": 3.07057,
-            "6775": 3.10523,
-            "6780": 3.07455,
-            "6785": 3.07959,
-            "6790": 3.10472,
-            "6795": 3.07166,
-            "6800": 3.09692,
-            "6805": 3.08719,
-            "6810": 3.10858,
-            "6815": 3.04354,
-            "6820": 3.07401,
-            "6825": 3.10257,
-            "6830": 3.08637,
-            "6835": 3.06002,
-            "6840": 3.0654,
-            "6845": 3.11054,
-            "6850": 3.08009,
-            "6855": 3.11065,
-            "6860": 3.06305,
-            "6865": 3.10876,
-            "6870": 3.07538,
-            "6875": 3.07578,
-            "6880": 3.08642,
-            "6885": 3.05135,
-            "6890": 3.0749,
-            "6895": 3.05299,
-            "6900": 3.05973,
-            "6905": 3.07506,
-            "6910": 3.09159,
-            "6915": 3.11333,
-            "6920": 3.06615,
-            "6925": 3.08379,
-            "6930": 3.06742,
-            "6935": 3.02485,
-            "6940": 3.06623,
-            "6945": 3.05639,
-            "6950": 3.07964,
-            "6955": 3.05853,
-            "6960": 3.05554,
-            "6965": 3.09907,
-            "6970": 3.03589,
-            "6975": 3.1075,
-            "6980": 3.06776,
-            "6985": 3.06784,
-            "6990": 3.11146,
-            "6995": 3.09126,
-            "7000": 3.02783,
-            "7005": 3.09757,
-            "7010": 3.0779,
-            "7015": 3.07385,
-            "7020": 3.10018,
-            "7025": 3.08417,
-            "7030": 3.08746,
-            "7035": 3.04096,
-            "7040": 3.01984,
-            "7045": 3.07968,
-            "7050": 3.09817,
-            "7055": 3.03816,
-            "7060": 3.09848,
-            "7065": 3.11109,
-            "7070": 3.05748,
-            "7075": 3.06319,
-            "7080": 3.11208,
-            "7085": 3.03557,
-            "7090": 3.05692,
-            "7095": 3.04652,
-            "7100": 3.07149,
-            "7105": 3.02035,
-            "7110": 3.0623,
-            "7115": 3.03547,
-            "7120": 3.07999,
-            "7125": 3.03377,
-            "7130": 3.04883,
-            "7135": 3.05627,
-            "7140": 3.06014,
-            "7145": 3.0691,
-            "7150": 3.02375,
-            "7155": 3.08612,
-            "7160": 3.0047,
-            "7165": 3.0418,
-            "7170": 3.07701,
-            "7175": 3.03661,
-            "7180": 3.07042,
-            "7185": 3.09125,
-            "7190": 3.05302,
-            "7195": 3.06058,
-            "7200": 3.06039,
-            "7205": 3.04153,
-            "7210": 3.08703,
-            "7215": 3.06723,
-            "7220": 3.08798,
-            "7225": 3.06993,
-            "7230": 3.07403,
-            "7235": 3.05435,
-            "7240": 3.05017,
-            "7245": 3.07131,
-            "7250": 3.01274,
-            "7255": 3.03229,
-            "7260": 3.06928,
-            "7265": 3.00261,
-            "7270": 3.04138,
-            "7275": 3.04223,
-            "7280": 3.04181,
-            "7285": 3.05407,
-            "7290": 3.07344,
-            "7295": 3.06537,
-            "7300": 3.02809,
-            "7305": 3.02877,
-            "7310": 3.04926,
-            "7315": 3.07646,
-            "7320": 3.05669,
-            "7325": 3.06149,
-            "7330": 3.02592,
-            "7335": 3.02733,
-            "7340": 3.06004,
-            "7345": 3.0091,
-            "7350": 3.06031,
-            "7355": 3.04495,
-            "7360": 3.03923,
-            "7365": 3.03845,
-            "7370": 3.03136,
-            "7375": 2.9999,
-            "7380": 3.06202,
-            "7385": 3.07693,
-            "7390": 3.06411,
-            "7395": 3.02221,
-            "7400": 3.07516,
-            "7405": 3.04382,
-            "7410": 3.06023,
-            "7415": 3.05228,
-            "7420": 3.03261,
-            "7425": 3.08586,
-            "7430": 3.0272,
-            "7435": 3.01757,
-            "7440": 3.0377,
-            "7445": 3.01394,
-            "7450": 2.99482,
-            "7455": 3.04735,
-            "7460": 3.04105,
-            "7465": 3.04977,
-            "7470": 3.05673,
-            "7475": 3.06741,
-            "7480": 3.02749,
-            "7485": 2.98653,
-            "7490": 2.98973,
-            "7495": 2.99863,
-            "7500": 3.02945,
-            "7505": 3.0059,
-            "7510": 2.97871,
-            "7515": 3.02404,
-            "7520": 3.01697,
-            "7525": 2.98295,
-            "7530": 3.02636,
-            "7535": 3.04423,
-            "7540": 3.02494,
-            "7545": 3.0588,
-            "7550": 3.06534,
-            "7555": 3.00732,
-            "7560": 3.01283,
-            "7565": 3.00874,
-            "7570": 3.03442,
-            "7575": 2.97962,
-            "7580": 3.03034,
-            "7585": 3.01793,
-            "7590": 3.01504,
-            "7595": 3.07403,
-            "7600": 3.03015,
-            "7605": 3.02144,
-            "7610": 3.00533,
-            "7615": 2.99602,
-            "7620": 2.99265,
-            "7625": 3.03762,
-            "7630": 3.02026,
-            "7635": 3.01854,
-            "7640": 3.01712,
-            "7645": 3.04845,
-            "7650": 3.04439,
-            "7655": 3.08975,
-            "7660": 2.96325,
-            "7665": 3.02969,
-            "7670": 3.01245,
-            "7675": 3.00305,
-            "7680": 2.9998,
-            "7685": 3.07016,
-            "7690": 3.01368,
-            "7695": 2.99671,
-            "7700": 3.05056,
-            "7705": 3.01282,
-            "7710": 3.05828,
-            "7715": 2.99725,
-            "7720": 3.08276,
-            "7725": 2.98411,
-            "7730": 2.99881,
-            "7735": 3.02714,
-            "7740": 3.00979,
-            "7745": 3.00319,
-            "7750": 3.01,
-            "7755": 3.01954,
-            "7760": 2.98571,
-            "7765": 3.00397,
-            "7770": 3.02732,
-            "7775": 2.98978,
-            "7780": 2.97862,
-            "7785": 3.01472,
-            "7790": 2.99842,
-            "7795": 3.02413,
-            "7800": 3.00827,
-            "7805": 3.01176,
-            "7810": 3.03082,
-            "7815": 3.00244,
-            "7820": 3.0019,
-            "7825": 3.03231,
-            "7830": 3.03143,
-            "7835": 2.96605,
-            "7840": 3.04336,
-            "7845": 2.97937,
-            "7850": 2.93977,
-            "7855": 2.98529,
-            "7860": 2.98344,
-            "7865": 3.02956,
-            "7870": 2.9691,
-            "7875": 2.98838,
-            "7880": 3.00349,
-            "7885": 2.9968,
-            "7890": 3.03811,
-            "7895": 3.02857,
-            "7900": 3.03097,
-            "7905": 2.99876,
-            "7910": 3.0088,
-            "7915": 3.02527,
-            "7920": 3.01259,
-            "7925": 2.99646,
-            "7930": 3.02866,
-            "7935": 2.98913,
-            "7940": 3.03573,
-            "7945": 3.0501,
-            "7950": 2.96381,
-            "7955": 2.98711,
-            "7960": 2.96943,
-            "7965": 2.94566,
-            "7970": 2.9655,
-            "7975": 2.99544,
-            "7980": 3.00887,
-            "7985": 2.97698,
-            "7990": 2.97506,
-            "7995": 2.96124,
-            "8000": 3.02098,
-            "8005": 2.9801,
-            "8010": 2.97649,
-            "8015": 2.96466,
-            "8020": 2.97779,
-            "8025": 2.95601,
-            "8030": 2.97562,
-            "8035": 2.97196,
-            "8040": 2.95703,
-            "8045": 3.01604,
-            "8050": 3.01297,
-            "8055": 2.97453,
-            "8060": 3.00494,
-            "8065": 2.98862,
-            "8070": 2.96753,
-            "8075": 2.97734,
-            "8080": 3.01019,
-            "8085": 2.96754,
-            "8090": 2.98003,
-            "8095": 3.00216,
-            "8100": 2.95105,
-            "8105": 2.99247,
-            "8110": 2.98157,
-            "8115": 2.95999,
-            "8120": 2.97249,
-            "8125": 2.99946,
-            "8130": 2.97003,
-            "8135": 2.98766,
-            "8140": 2.96736,
-            "8145": 2.95939,
-            "8150": 2.98009,
-            "8155": 2.95146,
-            "8160": 2.997,
-            "8165": 2.9913,
-            "8170": 2.95554,
-            "8175": 2.95554,
-            "8180": 3.01376,
-            "8185": 2.98624,
-            "8190": 3.02032,
-            "8195": 2.99613,
-            "8200": 2.96412,
-            "8205": 2.97566,
-            "8210": 2.9781,
-            "8215": 2.99017,
-            "8220": 2.971,
-            "8225": 2.96329,
-            "8230": 2.99505,
-            "8235": 3.00306,
-            "8240": 2.97419,
-            "8245": 2.9738,
-            "8250": 3.00958,
-            "8255": 2.96716,
-            "8260": 2.97331,
-            "8265": 2.95555,
-            "8270": 2.97514,
-            "8275": 2.96718,
-            "8280": 2.94092,
-            "8285": 2.97838,
-            "8290": 2.96734,
-            "8295": 2.95246,
-            "8300": 2.96504,
-            "8305": 2.97504,
-            "8310": 2.97996,
-            "8315": 2.95732,
-            "8320": 2.97776,
-            "8325": 2.929,
-            "8330": 2.89908,
-            "8335": 2.96646,
-            "8340": 2.99201,
-            "8345": 2.94463,
-            "8350": 2.95886,
-            "8355": 2.98631,
-            "8360": 2.96643,
-            "8365": 2.98326,
-            "8370": 2.99094,
-            "8375": 2.93854,
-            "8380": 2.94099,
-            "8385": 2.97126,
-            "8390": 2.9453,
-            "8395": 2.97523,
-            "8400": 2.95927,
-            "8405": 2.97418,
-            "8410": 3.03057,
-            "8415": 2.93533,
-            "8420": 2.91801,
-            "8425": 2.97564,
-            "8430": 2.97808,
-            "8435": 2.93124,
-            "8440": 3.01239,
-            "8445": 2.99121,
-            "8450": 2.96616,
-            "8455": 2.97106,
-            "8460": 2.97975,
-            "8465": 2.92562,
-            "8470": 2.94697,
-            "8475": 2.99054,
-            "8480": 2.93097,
-            "8485": 2.93977,
-            "8490": 2.948,
-            "8495": 2.93336,
-            "8500": 2.96904,
-            "8505": 2.92233,
-            "8510": 3.00332,
-            "8515": 2.94052,
-            "8520": 2.95755,
-            "8525": 2.88522,
-            "8530": 2.95834,
-            "8535": 2.97603,
-            "8540": 2.93194,
-            "8545": 2.95741,
-            "8550": 2.92307,
-            "8555": 2.98961,
-            "8560": 2.99424,
-            "8565": 2.9514,
-            "8570": 2.94707,
-            "8575": 2.93509,
-            "8580": 2.9669,
-            "8585": 2.976,
-            "8590": 2.97659,
-            "8595": 2.97731,
-            "8600": 2.94787,
-            "8605": 2.94545,
-            "8610": 2.95479,
-            "8615": 2.96032,
-            "8620": 2.92346,
-            "8625": 2.94581,
-            "8630": 2.95087,
-            "8635": 2.94522,
-            "8640": 2.92578,
-            "8645": 2.98133,
-            "8650": 2.92232,
-            "8655": 2.96592,
-            "8660": 2.97073,
-            "8665": 2.95471,
-            "8670": 2.96657,
-            "8675": 2.93996,
-            "8680": 2.93576,
-            "8685": 2.94815,
-            "8690": 2.96442,
-            "8695": 2.97067,
-            "8700": 2.94799,
-            "8705": 2.91745,
-            "8710": 2.96979,
-            "8715": 2.91522,
-            "8720": 2.97447,
-            "8725": 2.94876,
-            "8730": 2.94256,
-            "8735": 2.97158,
-            "8740": 2.92587,
-            "8745": 2.96492,
-            "8750": 2.96628,
-            "8755": 2.93098,
-            "8760": 2.94924,
-            "8765": 2.91354,
-            "8770": 2.96822,
-            "8775": 2.94219,
-            "8780": 2.92859,
-            "8785": 2.94726,
-            "8790": 2.92803,
-            "8795": 2.96489,
-            "8800": 2.92662,
-            "8805": 2.90115,
-            "8810": 2.93145,
-            "8815": 2.93283,
-            "8820": 2.90387,
-            "8825": 2.92443,
-            "8830": 2.91245,
-            "8835": 2.89847,
-            "8840": 2.91518,
-            "8845": 2.92785,
-            "8850": 2.95695,
-            "8855": 2.92839,
-            "8860": 2.98878,
-            "8865": 2.93356,
-            "8870": 2.90865,
-            "8875": 2.92162,
-            "8880": 2.9295,
-            "8885": 2.9207,
-            "8890": 2.9404,
-            "8895": 2.92179,
-            "8900": 2.94464,
-            "8905": 2.93594,
-            "8910": 2.91993,
-            "8915": 2.90336,
-            "8920": 2.91127,
-            "8925": 2.97428,
-            "8930": 2.96209,
-            "8935": 2.97189,
-            "8940": 2.94882,
-            "8945": 2.94789,
-            "8950": 2.9328,
-            "8955": 2.91679,
-            "8960": 2.89858,
-            "8965": 2.92721,
-            "8970": 2.94082,
-            "8975": 2.90449,
-            "8980": 2.89797,
-            "8985": 2.92102,
-            "8990": 2.9662,
-            "8995": 2.9373,
-            "9000": 2.89467,
-            "9005": 2.9399,
-            "9010": 2.97901,
-            "9015": 2.90311,
-            "9020": 2.90423,
-            "9025": 2.92238,
-            "9030": 2.94518,
-            "9035": 2.85736,
-            "9040": 2.93491,
-            "9045": 2.92378,
-            "9050": 2.96087,
-            "9055": 2.88884,
-            "9060": 2.95609,
-            "9065": 2.98682,
-            "9070": 2.92665,
-            "9075": 2.94254,
-            "9080": 2.93301,
-            "9085": 2.9439,
-            "9090": 2.93648,
-            "9095": 2.89849,
-            "9100": 2.90017,
-            "9105": 2.89,
-            "9110": 2.93211,
-            "9115": 2.93981,
-            "9120": 2.97397,
-            "9125": 2.91648,
-            "9130": 2.92277,
-            "9135": 2.94086,
-            "9140": 2.94695,
-            "9145": 2.89447,
-            "9150": 2.92217,
-            "9155": 2.93169,
-            "9160": 2.93686,
-            "9165": 2.92557,
-            "9170": 2.9498,
-            "9175": 2.88716,
-            "9180": 2.93307,
-            "9185": 2.8947,
-            "9190": 2.94894,
-            "9195": 2.91222,
-            "9200": 2.93251,
-            "9205": 2.88702,
-            "9210": 2.93304,
-            "9215": 2.87965,
-            "9220": 2.90288,
-            "9225": 2.93315,
-            "9230": 2.86569,
-            "9235": 2.87842,
-            "9240": 2.89576,
-            "9245": 2.88279,
-            "9250": 2.88136,
-            "9255": 2.91192,
-            "9260": 2.87817,
-            "9265": 2.92175,
-            "9270": 2.89613,
-            "9275": 2.91313,
-            "9280": 2.91939,
-            "9285": 2.91903,
-            "9290": 2.93047,
-            "9295": 2.92844,
-            "9300": 2.87877,
-            "9305": 2.90909,
-            "9310": 2.89871,
-            "9315": 2.86609,
-            "9320": 2.86065,
-            "9325": 2.90436,
-            "9330": 2.95511,
-            "9335": 2.87572,
-            "9340": 2.93845,
-            "9345": 2.94693,
-            "9350": 2.9134,
-            "9355": 2.87737,
-            "9360": 2.89674,
-            "9365": 2.8823,
-            "9370": 2.93386,
-            "9375": 2.91236,
-            "9380": 2.86428,
-            "9385": 2.91358,
-            "9390": 2.92324,
-            "9395": 2.92024,
-            "9400": 2.89599,
-            "9405": 2.89197,
-            "9410": 2.9185,
-            "9415": 2.91775,
-            "9420": 2.89381,
-            "9425": 2.89983,
-            "9430": 2.87833,
-            "9435": 2.90417,
-            "9440": 2.89629,
-            "9445": 2.88366,
-            "9450": 2.89069,
-            "9455": 2.88969,
-            "9460": 2.94442,
-            "9465": 2.94721,
-            "9470": 2.88553,
-            "9475": 2.94033,
-            "9480": 2.88982,
-            "9485": 2.87815,
-            "9490": 2.89723,
-            "9495": 2.9225,
-            "9500": 2.89514,
-            "9505": 2.86794,
-            "9510": 2.894,
-            "9515": 2.90369,
-            "9520": 2.91102,
-            "9525": 2.89095,
-            "9530": 2.88696,
-            "9535": 2.91216
-        }
-    },
-    "num-zeros": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 1021640256.0,
-            "5": 1024063424.0,
-            "10": 1014250560.0,
-            "15": 1024077504.0,
-            "20": 1022486144.0,
-            "25": 1041373312.0,
-            "30": 1028112896.0,
-            "35": 1035625088.0,
-            "40": 1026328384.0,
-            "45": 1022350080.0,
-            "50": 1030098560.0,
-            "55": 1028966144.0,
-            "60": 1036320640.0,
-            "65": 1034679168.0,
-            "70": 1029374848.0,
-            "75": 1028745088.0,
-            "80": 1047575040.0,
-            "85": 1029448064.0,
-            "90": 1020467392.0,
-            "95": 1028310016.0,
-            "100": 1040961344.0,
-            "105": 1039436544.0,
-            "110": 1026879104.0,
-            "115": 1052312832.0,
-            "120": 1018863104.0,
-            "125": 1045372160.0,
-            "130": 1034330368.0,
-            "135": 1016615680.0,
-            "140": 1038582272.0,
-            "145": 1020688640.0,
-            "150": 1039788096.0,
-            "155": 1032796928.0,
-            "160": 1020952640.0,
-            "165": 1032424512.0,
-            "170": 1017396096.0,
-            "175": 1033427072.0,
-            "180": 1036119424.0,
-            "185": 1030573760.0,
-            "190": 1035673984.0,
-            "195": 1034555520.0,
-            "200": 1040973824.0,
-            "205": 1048500352.0,
-            "210": 1054481024.0,
-            "215": 1025159552.0,
-            "220": 1044962496.0,
-            "225": 1038076416.0,
-            "230": 1026222720.0,
-            "235": 1051134976.0,
-            "240": 1029276416.0,
-            "245": 1031397824.0,
-            "250": 1027879616.0,
-            "255": 1016929792.0,
-            "260": 1045008896.0,
-            "265": 1021330688.0,
-            "270": 1030964864.0,
-            "275": 1036911744.0,
-            "280": 1031743488.0,
-            "285": 1015014016.0,
-            "290": 1018756352.0,
-            "295": 1017237504.0,
-            "300": 1034761152.0,
-            "305": 1032166144.0,
-            "310": 1035583104.0,
-            "315": 1012734272.0,
-            "320": 1008275072.0,
-            "325": 1042741760.0,
-            "330": 1042870656.0,
-            "335": 1033508480.0,
-            "340": 1014464512.0,
-            "345": 1042618880.0,
-            "350": 1031852736.0,
-            "355": 1050844800.0,
-            "360": 1030258432.0,
-            "365": 1034595648.0,
-            "370": 1019436032.0,
-            "375": 1022144832.0,
-            "380": 1021326592.0,
-            "385": 1025589504.0,
-            "390": 1023195072.0,
-            "395": 1019653952.0,
-            "400": 1033520512.0,
-            "405": 1023880192.0,
-            "410": 1017910016.0,
-            "415": 1024288000.0,
-            "420": 1020624256.0,
-            "425": 1025854848.0,
-            "430": 1033854336.0,
-            "435": 1028182400.0,
-            "440": 1022090752.0,
-            "445": 1036768256.0,
-            "450": 1024997376.0,
-            "455": 1013852096.0,
-            "460": 1022093824.0,
-            "465": 1041431552.0,
-            "470": 1029038016.0,
-            "475": 1010065792.0,
-            "480": 1047607616.0,
-            "485": 1029724928.0,
-            "490": 1044668160.0,
-            "495": 1025229952.0,
-            "500": 1037464960.0,
-            "505": 1032181376.0,
-            "510": 1042853056.0,
-            "515": 1026159744.0,
-            "520": 1013409792.0,
-            "525": 1035147520.0,
-            "530": 1016375552.0,
-            "535": 1040113024.0,
-            "540": 1035052352.0,
-            "545": 1032113664.0,
-            "550": 1018673408.0,
-            "555": 1008638656.0,
-            "560": 1011927680.0,
-            "565": 1041824320.0,
-            "570": 1034942208.0,
-            "575": 1010199040.0,
-            "580": 1032210496.0,
-            "585": 1041262144.0,
-            "590": 1038867968.0,
-            "595": 1035743104.0,
-            "600": 1023772736.0,
-            "605": 1032294272.0,
-            "610": 1037748672.0,
-            "615": 1005974784.0,
-            "620": 1040407424.0,
-            "625": 1045209216.0,
-            "630": 1034414464.0,
-            "635": 1028523008.0,
-            "640": 1022644928.0,
-            "645": 1035876032.0,
-            "650": 1009255680.0,
-            "655": 997757696.0,
-            "660": 1029710464.0,
-            "665": 1025532608.0,
-            "670": 1048812288.0,
-            "675": 1025202688.0,
-            "680": 1019340032.0,
-            "685": 1027832512.0,
-            "690": 1029230080.0,
-            "695": 1040024576.0,
-            "700": 1042031680.0,
-            "705": 1034382976.0,
-            "710": 1020441792.0,
-            "715": 1031472128.0,
-            "720": 1040274560.0,
-            "725": 1023279936.0,
-            "730": 1022792704.0,
-            "735": 1025085696.0,
-            "740": 1038382656.0,
-            "745": 1045205504.0,
-            "750": 1013180928.0,
-            "755": 1031644032.0,
-            "760": 1032783552.0,
-            "765": 1027135936.0,
-            "770": 1023967232.0,
-            "775": 1025895168.0,
-            "780": 1038166464.0,
-            "785": 1025486400.0,
-            "790": 1040810624.0,
-            "795": 1032531200.0,
-            "800": 1039592768.0,
-            "805": 1024318016.0,
-            "810": 1034725632.0,
-            "815": 1036000448.0,
-            "820": 1035671552.0,
-            "825": 1051375360.0,
-            "830": 1035406784.0,
-            "835": 1022547776.0,
-            "840": 1036875648.0,
-            "845": 1025700352.0,
-            "850": 1048529920.0,
-            "855": 1014986432.0,
-            "860": 1033098624.0,
-            "865": 1031543040.0,
-            "870": 1040902912.0,
-            "875": 1023938304.0,
-            "880": 1028395904.0,
-            "885": 1054406656.0,
-            "890": 1019537152.0,
-            "895": 1045189824.0,
-            "900": 1031772928.0,
-            "905": 1020970688.0,
-            "910": 1031386112.0,
-            "915": 1032926912.0,
-            "920": 1038459392.0,
-            "925": 1026754560.0,
-            "930": 1025378752.0,
-            "935": 1031126464.0,
-            "940": 1057933568.0,
-            "945": 1029823104.0,
-            "950": 1014412480.0,
-            "955": 1032173696.0,
-            "960": 1026152064.0,
-            "965": 1062678976.0,
-            "970": 1030096128.0,
-            "975": 1036903680.0,
-            "980": 1027049216.0,
-            "985": 1030676736.0,
-            "990": 1020676864.0,
-            "995": 1042301760.0,
-            "1000": 1036831616.0,
-            "1005": 1050206080.0,
-            "1010": 1023801984.0,
-            "1015": 1020539008.0,
-            "1020": 1042587392.0,
-            "1025": 1037943808.0,
-            "1030": 1049210048.0,
-            "1035": 1012483456.0,
-            "1040": 1023092032.0,
-            "1045": 1039520768.0,
-            "1050": 1026825728.0,
-            "1055": 1034861184.0,
-            "1060": 1046128704.0,
-            "1065": 1036804096.0,
-            "1070": 1019994880.0,
-            "1075": 1025341696.0,
-            "1080": 1014979200.0,
-            "1085": 1030007744.0,
-            "1090": 1029062016.0,
-            "1095": 1020309888.0,
-            "1100": 1039835008.0,
-            "1105": 1048600064.0,
-            "1110": 1020704448.0,
-            "1115": 1024782720.0,
-            "1120": 1061896576.0,
-            "1125": 1043311616.0,
-            "1130": 1031219456.0,
-            "1135": 1041360512.0,
-            "1140": 1021486272.0,
-            "1145": 1051696128.0,
-            "1150": 1035590400.0,
-            "1155": 1029590528.0,
-            "1160": 1042564800.0,
-            "1165": 1026810496.0,
-            "1170": 1018001408.0,
-            "1175": 1033684032.0,
-            "1180": 1035633536.0,
-            "1185": 1023928960.0,
-            "1190": 1033160320.0,
-            "1195": 1024228608.0,
-            "1200": 1039116544.0,
-            "1205": 1031740800.0,
-            "1210": 1053250560.0,
-            "1215": 1024617600.0,
-            "1220": 1009041280.0,
-            "1225": 1036679680.0,
-            "1230": 1041257984.0,
-            "1235": 1053974912.0,
-            "1240": 1030356224.0,
-            "1245": 1017684864.0,
-            "1250": 1022772992.0,
-            "1255": 1033439104.0,
-            "1260": 1034284736.0,
-            "1265": 1034003840.0,
-            "1270": 1037323264.0,
-            "1275": 1029345792.0,
-            "1280": 1046489856.0,
-            "1285": 1028285120.0,
-            "1290": 1036578176.0,
-            "1295": 1032421696.0,
-            "1300": 1033065728.0,
-            "1305": 1030027008.0,
-            "1310": 1051262976.0,
-            "1315": 1035373184.0,
-            "1320": 1028263936.0,
-            "1325": 1049972736.0,
-            "1330": 1030133376.0,
-            "1335": 1031164800.0,
-            "1340": 1012758912.0,
-            "1345": 1044639232.0,
-            "1350": 1034957312.0,
-            "1355": 1033623744.0,
-            "1360": 1036683392.0,
-            "1365": 1038588672.0,
-            "1370": 1039851904.0,
-            "1375": 1034117632.0,
-            "1380": 1022886656.0,
-            "1385": 1018084096.0,
-            "1390": 1049054400.0,
-            "1395": 1034868352.0,
-            "1400": 1034998144.0,
-            "1405": 1034131456.0,
-            "1410": 1036368256.0,
-            "1415": 1043577600.0,
-            "1420": 1026111104.0,
-            "1425": 1033320320.0,
-            "1430": 1012808128.0,
-            "1435": 1038394880.0,
-            "1440": 1020971904.0,
-            "1445": 1032459904.0,
-            "1450": 1014039296.0,
-            "1455": 1011673984.0,
-            "1460": 1043275904.0,
-            "1465": 1014361600.0,
-            "1470": 1020655360.0,
-            "1475": 1030231296.0,
-            "1480": 1029370496.0,
-            "1485": 1022997696.0,
-            "1490": 1026783360.0,
-            "1495": 1021815744.0,
-            "1500": 1027177088.0,
-            "1505": 1034882880.0,
-            "1510": 1014397120.0,
-            "1515": 1042136832.0,
-            "1520": 1025792640.0,
-            "1525": 1036335872.0,
-            "1530": 1039948992.0,
-            "1535": 1047640192.0,
-            "1540": 1043539840.0,
-            "1545": 1034043520.0,
-            "1550": 1016108736.0,
-            "1555": 1015573504.0,
-            "1560": 1055021824.0,
-            "1565": 1015593728.0,
-            "1570": 1018243840.0,
-            "1575": 1032515456.0,
-            "1580": 1012984768.0,
-            "1585": 1025327680.0,
-            "1590": 1034127360.0,
-            "1595": 1057393664.0,
-            "1600": 1026867584.0,
-            "1605": 1019994624.0,
-            "1610": 1031268736.0,
-            "1615": 1035274880.0,
-            "1620": 1018016000.0,
-            "1625": 1028272512.0,
-            "1630": 1027205376.0,
-            "1635": 1023799040.0,
-            "1640": 1034120832.0,
-            "1645": 1021814528.0,
-            "1650": 1015262080.0,
-            "1655": 1018280064.0,
-            "1660": 1047982976.0,
-            "1665": 1027060352.0,
-            "1670": 1048219904.0,
-            "1675": 1021102912.0,
-            "1680": 1043288320.0,
-            "1685": 1052719360.0,
-            "1690": 1026724032.0,
-            "1695": 1040385280.0,
-            "1700": 1018036352.0,
-            "1705": 1020480640.0,
-            "1710": 1021024448.0,
-            "1715": 1026932992.0,
-            "1720": 1028350208.0,
-            "1725": 1034363136.0,
-            "1730": 1013692352.0,
-            "1735": 1018429696.0,
-            "1740": 1057257024.0,
-            "1745": 1029261952.0,
-            "1750": 1024357888.0,
-            "1755": 1029970112.0,
-            "1760": 1022192512.0,
-            "1765": 1040477056.0,
-            "1770": 1029669760.0,
-            "1775": 1046196864.0,
-            "1780": 1021955712.0,
-            "1785": 1035109376.0,
-            "1790": 1028263808.0,
-            "1795": 1031023616.0,
-            "1800": 1028300480.0,
-            "1805": 1025669248.0,
-            "1810": 1021556096.0,
-            "1815": 1033440256.0,
-            "1820": 1034885888.0,
-            "1825": 1020208448.0,
-            "1830": 1013885632.0,
-            "1835": 1031382272.0,
-            "1840": 1040391040.0,
-            "1845": 1034828800.0,
-            "1850": 1014480064.0,
-            "1855": 1019418816.0,
-            "1860": 1019569536.0,
-            "1865": 1035942400.0,
-            "1870": 1026242368.0,
-            "1875": 1031525248.0,
-            "1880": 1011590784.0,
-            "1885": 1041065536.0,
-            "1890": 1035000704.0,
-            "1895": 1028959488.0,
-            "1900": 1033997568.0,
-            "1905": 1027123776.0,
-            "1910": 1029217792.0,
-            "1915": 1030492864.0,
-            "1920": 1042920384.0,
-            "1925": 1038419392.0,
-            "1930": 1019304512.0,
-            "1935": 1032535936.0,
-            "1940": 1027806336.0,
-            "1945": 1034205056.0,
-            "1950": 1006036224.0,
-            "1955": 1032577600.0,
-            "1960": 1015720256.0,
-            "1965": 1029088512.0,
-            "1970": 1021554176.0,
-            "1975": 1034048000.0,
-            "1980": 1029366912.0,
-            "1985": 1027784960.0,
-            "1990": 1020947840.0,
-            "1995": 1010422912.0,
-            "2000": 1039617152.0,
-            "2005": 1001486208.0,
-            "2010": 1020422912.0,
-            "2015": 1032034048.0,
-            "2020": 1036298624.0,
-            "2025": 1037172352.0,
-            "2030": 1029770752.0,
-            "2035": 1040333312.0,
-            "2040": 1030112768.0,
-            "2045": 1032700800.0,
-            "2050": 1008016064.0,
-            "2055": 1045723840.0,
-            "2060": 1028142400.0,
-            "2065": 1038799488.0,
-            "2070": 1045645184.0,
-            "2075": 1035237952.0,
-            "2080": 1022882304.0,
-            "2085": 1024815424.0,
-            "2090": 1034363392.0,
-            "2095": 1005220672.0,
-            "2100": 1034644096.0,
-            "2105": 1035581312.0,
-            "2110": 1030685952.0,
-            "2115": 1029798528.0,
-            "2120": 1018846080.0,
-            "2125": 1021863168.0,
-            "2130": 1026638080.0,
-            "2135": 1053279488.0,
-            "2140": 1017060608.0,
-            "2145": 1019635072.0,
-            "2150": 1037130752.0,
-            "2155": 1033302784.0,
-            "2160": 1049035776.0,
-            "2165": 1039682816.0,
-            "2170": 1020308096.0,
-            "2175": 1027338752.0,
-            "2180": 1041703168.0,
-            "2185": 1028895360.0,
-            "2190": 1029309888.0,
-            "2195": 1028944768.0,
-            "2200": 1039639680.0,
-            "2205": 1036972288.0,
-            "2210": 1031740544.0,
-            "2215": 1021404480.0,
-            "2220": 1020910848.0,
-            "2225": 1033403072.0,
-            "2230": 1014201856.0,
-            "2235": 1029395968.0,
-            "2240": 1029885184.0,
-            "2245": 1026005824.0,
-            "2250": 1046268800.0,
-            "2255": 1032951936.0,
-            "2260": 1047494592.0,
-            "2265": 1023721088.0,
-            "2270": 1022566144.0,
-            "2275": 1028537600.0,
-            "2280": 1034973568.0,
-            "2285": 1031819968.0,
-            "2290": 1038650048.0,
-            "2295": 1028816000.0,
-            "2300": 1034450496.0,
-            "2305": 1032314496.0,
-            "2310": 1013586496.0,
-            "2315": 1048182656.0,
-            "2320": 1035210368.0,
-            "2325": 1046966016.0,
-            "2330": 1014696192.0,
-            "2335": 1027382272.0,
-            "2340": 1036736512.0,
-            "2345": 1020186944.0,
-            "2350": 1031017728.0,
-            "2355": 1037474240.0,
-            "2360": 1032608128.0,
-            "2365": 1028041856.0,
-            "2370": 1021004224.0,
-            "2375": 1022912000.0,
-            "2380": 1048556224.0,
-            "2385": 1044140736.0,
-            "2390": 1021986816.0,
-            "2395": 1020595584.0,
-            "2400": 1026930816.0,
-            "2405": 1038387200.0,
-            "2410": 1045395200.0,
-            "2415": 1048454656.0,
-            "2420": 1032227712.0,
-            "2425": 1029562176.0,
-            "2430": 1030386176.0,
-            "2435": 1029217856.0,
-            "2440": 1029168000.0,
-            "2445": 1033132160.0,
-            "2450": 1038557824.0,
-            "2455": 1034721536.0,
-            "2460": 1039984192.0,
-            "2465": 1032500992.0,
-            "2470": 1024143872.0,
-            "2475": 1016539520.0,
-            "2480": 1023613248.0,
-            "2485": 1021030592.0,
-            "2490": 1035920448.0,
-            "2495": 1032967360.0,
-            "2500": 1028107008.0,
-            "2505": 1015385600.0,
-            "2510": 1030967104.0,
-            "2515": 1025700096.0,
-            "2520": 1033326208.0,
-            "2525": 1029692800.0,
-            "2530": 1023986560.0,
-            "2535": 1071069696.0,
-            "2540": 1024537984.0,
-            "2545": 1033798784.0,
-            "2550": 1029448064.0,
-            "2555": 1029183488.0,
-            "2560": 1018115072.0,
-            "2565": 1031598528.0,
-            "2570": 1022847232.0,
-            "2575": 1026503104.0,
-            "2580": 1038622592.0,
-            "2585": 1025899456.0,
-            "2590": 1026100800.0,
-            "2595": 1046623104.0,
-            "2600": 1031103360.0,
-            "2605": 1001910656.0,
-            "2610": 1028423360.0,
-            "2615": 1025564544.0,
-            "2620": 1038651392.0,
-            "2625": 1026996352.0,
-            "2630": 1036831424.0,
-            "2635": 1021198400.0,
-            "2640": 1021865856.0,
-            "2645": 1039153408.0,
-            "2650": 1025943488.0,
-            "2655": 1013255808.0,
-            "2660": 1032645248.0,
-            "2665": 1035218048.0,
-            "2670": 1036437632.0,
-            "2675": 1039296064.0,
-            "2680": 1041661696.0,
-            "2685": 1034565504.0,
-            "2690": 1058871168.0,
-            "2695": 1019879552.0,
-            "2700": 1062626816.0,
-            "2705": 1035376320.0,
-            "2710": 1019542400.0,
-            "2715": 1031885824.0,
-            "2720": 1016403200.0,
-            "2725": 1040594688.0,
-            "2730": 1019586688.0,
-            "2735": 1030889856.0,
-            "2740": 1029290752.0,
-            "2745": 1040687744.0,
-            "2750": 1023880448.0,
-            "2755": 1011865664.0,
-            "2760": 1027684864.0,
-            "2765": 1030882240.0,
-            "2770": 1033119872.0,
-            "2775": 1026332352.0,
-            "2780": 1033684224.0,
-            "2785": 1024589888.0,
-            "2790": 1033734272.0,
-            "2795": 1045949184.0,
-            "2800": 1040286016.0,
-            "2805": 1019944192.0,
-            "2810": 1031449600.0,
-            "2815": 1030932736.0,
-            "2820": 1037855616.0,
-            "2825": 1041684096.0,
-            "2830": 1030459904.0,
-            "2835": 1013508352.0,
-            "2840": 1031449600.0,
-            "2845": 1030129920.0,
-            "2850": 1026617600.0,
-            "2855": 1024705280.0,
-            "2860": 1031700096.0,
-            "2865": 1027428800.0,
-            "2870": 1026690048.0,
-            "2875": 1012777024.0,
-            "2880": 1038301568.0,
-            "2885": 1017901184.0,
-            "2890": 1044200064.0,
-            "2895": 1036459136.0,
-            "2900": 1030652928.0,
-            "2905": 1035957376.0,
-            "2910": 1038718272.0,
-            "2915": 1039385408.0,
-            "2920": 1034781248.0,
-            "2925": 1043267840.0,
-            "2930": 1038229696.0,
-            "2935": 1021222144.0,
-            "2940": 1042307456.0,
-            "2945": 1045232384.0,
-            "2950": 1047525952.0,
-            "2955": 1034172928.0,
-            "2960": 1020891904.0,
-            "2965": 1027307840.0,
-            "2970": 1038796288.0,
-            "2975": 1034007296.0,
-            "2980": 1049590400.0,
-            "2985": 1034846016.0,
-            "2990": 1026008576.0,
-            "2995": 1034919296.0,
-            "3000": 1039017856.0,
-            "3005": 1038158848.0,
-            "3010": 1010907712.0,
-            "3015": 1044976064.0,
-            "3020": 1034050688.0,
-            "3025": 1037763840.0,
-            "3030": 1027722816.0,
-            "3035": 1041821056.0,
-            "3040": 1035311872.0,
-            "3045": 1027255296.0,
-            "3050": 1029708032.0,
-            "3055": 1028029568.0,
-            "3060": 1049976960.0,
-            "3065": 1024067200.0,
-            "3070": 1011545728.0,
-            "3075": 1042846272.0,
-            "3080": 1036094912.0,
-            "3085": 1030387456.0,
-            "3090": 1035262976.0,
-            "3095": 1013803008.0,
-            "3100": 1030144896.0,
-            "3105": 1017609088.0,
-            "3110": 1033370816.0,
-            "3115": 1023737728.0,
-            "3120": 1024877504.0,
-            "3125": 1046537216.0,
-            "3130": 1024676160.0,
-            "3135": 1025722496.0,
-            "3140": 1043778176.0,
-            "3145": 1044372672.0,
-            "3150": 1016483328.0,
-            "3155": 1042487936.0,
-            "3160": 1026834688.0,
-            "3165": 1031199360.0,
-            "3170": 1024332800.0,
-            "3175": 1024368640.0,
-            "3180": 1018204288.0,
-            "3185": 1034352512.0,
-            "3190": 1019221888.0,
-            "3195": 1028425408.0,
-            "3200": 1036080640.0,
-            "3205": 1016076160.0,
-            "3210": 1034109312.0,
-            "3215": 1031349312.0,
-            "3220": 1040833664.0,
-            "3225": 1022835008.0,
-            "3230": 1033255744.0,
-            "3235": 1019975488.0,
-            "3240": 1038131840.0,
-            "3245": 1031643136.0,
-            "3250": 1022390656.0,
-            "3255": 1032876672.0,
-            "3260": 1037751616.0,
-            "3265": 1021622656.0,
-            "3270": 1031242880.0,
-            "3275": 1038461184.0,
-            "3280": 1023236992.0,
-            "3285": 1031615424.0,
-            "3290": 1045247616.0,
-            "3295": 1043177536.0,
-            "3300": 1035084224.0,
-            "3305": 1042662400.0,
-            "3310": 1058092096.0,
-            "3315": 1024282880.0,
-            "3320": 1046015296.0,
-            "3325": 1023179008.0,
-            "3330": 1048037248.0,
-            "3335": 1036690560.0,
-            "3340": 1042123392.0,
-            "3345": 1030897920.0,
-            "3350": 1020621696.0,
-            "3355": 1025960576.0,
-            "3360": 1030305344.0,
-            "3365": 1031171520.0,
-            "3370": 1036454144.0,
-            "3375": 1023472384.0,
-            "3380": 1032383744.0,
-            "3385": 1038081536.0,
-            "3390": 1052811072.0,
-            "3395": 1012090496.0,
-            "3400": 1019209600.0,
-            "3405": 1021780224.0,
-            "3410": 1028433728.0,
-            "3415": 1058222400.0,
-            "3420": 1033492480.0,
-            "3425": 1029580352.0,
-            "3430": 1021150976.0,
-            "3435": 1034991872.0,
-            "3440": 1017961600.0,
-            "3445": 1025537280.0,
-            "3450": 1032254336.0,
-            "3455": 1036261312.0,
-            "3460": 1052071808.0,
-            "3465": 1027114240.0,
-            "3470": 1043729536.0,
-            "3475": 1033265792.0,
-            "3480": 1026619776.0,
-            "3485": 1029215232.0,
-            "3490": 1041041408.0,
-            "3495": 1019252224.0,
-            "3500": 1032059904.0,
-            "3505": 1025753728.0,
-            "3510": 1044367616.0,
-            "3515": 1013817280.0,
-            "3520": 1021846400.0,
-            "3525": 1032175552.0,
-            "3530": 1029789056.0,
-            "3535": 1034568704.0,
-            "3540": 1017731456.0,
-            "3545": 1035658880.0,
-            "3550": 1024535296.0,
-            "3555": 1035866112.0,
-            "3560": 1029737600.0,
-            "3565": 1028900160.0,
-            "3570": 1046029888.0,
-            "3575": 1039186304.0,
-            "3580": 1010838336.0,
-            "3585": 1031737728.0,
-            "3590": 1041450688.0,
-            "3595": 1037636800.0,
-            "3600": 1032763584.0,
-            "3605": 1045822272.0,
-            "3610": 1039235200.0,
-            "3615": 1036870144.0,
-            "3620": 1026929664.0,
-            "3625": 1033931136.0,
-            "3630": 1017582464.0,
-            "3635": 1026629056.0,
-            "3640": 1039529088.0,
-            "3645": 1022655872.0,
-            "3650": 1036842624.0,
-            "3655": 1023990144.0,
-            "3660": 1014987456.0,
-            "3665": 1026118784.0,
-            "3670": 1041672448.0,
-            "3675": 1033250304.0,
-            "3680": 1015353984.0,
-            "3685": 1029122304.0,
-            "3690": 1026204416.0,
-            "3695": 1043800832.0,
-            "3700": 1028613504.0,
-            "3705": 1049485312.0,
-            "3710": 1027180672.0,
-            "3715": 1016134912.0,
-            "3720": 1040818560.0,
-            "3725": 1032763776.0,
-            "3730": 1030920960.0,
-            "3735": 1019008640.0,
-            "3740": 1023825600.0,
-            "3745": 1046289152.0,
-            "3750": 1034462336.0,
-            "3755": 1032090048.0,
-            "3760": 1019366912.0,
-            "3765": 1031916736.0,
-            "3770": 1026677120.0,
-            "3775": 1035708288.0,
-            "3780": 1030671104.0,
-            "3785": 1027208128.0,
-            "3790": 1019584064.0,
-            "3795": 1030306048.0,
-            "3800": 1035614976.0,
-            "3805": 1035423360.0,
-            "3810": 1033294144.0,
-            "3815": 1033988608.0,
-            "3820": 1041105792.0,
-            "3825": 1024534976.0,
-            "3830": 1037630528.0,
-            "3835": 1040347968.0,
-            "3840": 1023445888.0,
-            "3845": 1048466688.0,
-            "3850": 1052489280.0,
-            "3855": 1028907264.0,
-            "3860": 1019532672.0,
-            "3865": 1035487744.0,
-            "3870": 1028491712.0,
-            "3875": 1041164800.0,
-            "3880": 1048854912.0,
-            "3885": 1027725248.0,
-            "3890": 1027487616.0,
-            "3895": 1034190592.0,
-            "3900": 1027645312.0,
-            "3905": 1027976128.0,
-            "3910": 1041572480.0,
-            "3915": 1043995392.0,
-            "3920": 1041063424.0,
-            "3925": 1030836160.0,
-            "3930": 1027072896.0,
-            "3935": 1033782016.0,
-            "3940": 1042275712.0,
-            "3945": 1036248064.0,
-            "3950": 1021430976.0,
-            "3955": 1036304128.0,
-            "3960": 1024184192.0,
-            "3965": 1027065856.0,
-            "3970": 1015984640.0,
-            "3975": 1041421632.0,
-            "3980": 1032455488.0,
-            "3985": 1037680640.0,
-            "3990": 1038684992.0,
-            "3995": 1023654528.0,
-            "4000": 1054410240.0,
-            "4005": 1029983424.0,
-            "4010": 1025138112.0,
-            "4015": 1030978560.0,
-            "4020": 1018472448.0,
-            "4025": 1027124352.0,
-            "4030": 1010306816.0,
-            "4035": 1038641088.0,
-            "4040": 1022256640.0,
-            "4045": 1025038208.0,
-            "4050": 1032348800.0,
-            "4055": 1022420864.0,
-            "4060": 1024520768.0,
-            "4065": 1032871168.0,
-            "4070": 1027791232.0,
-            "4075": 1025596928.0,
-            "4080": 1029366656.0,
-            "4085": 1020823552.0,
-            "4090": 1033322496.0,
-            "4095": 1024142656.0,
-            "4100": 1040948864.0,
-            "4105": 1027266496.0,
-            "4110": 1038791424.0,
-            "4115": 1023497088.0,
-            "4120": 1038943168.0,
-            "4125": 1048274176.0,
-            "4130": 1021490752.0,
-            "4135": 1034570880.0,
-            "4140": 1034613824.0,
-            "4145": 1044447232.0,
-            "4150": 1000353664.0,
-            "4155": 1028363392.0,
-            "4160": 1024242624.0,
-            "4165": 1033688704.0,
-            "4170": 1018888000.0,
-            "4175": 1026492608.0,
-            "4180": 1045409024.0,
-            "4185": 1033631616.0,
-            "4190": 1029574592.0,
-            "4195": 1038777984.0,
-            "4200": 1025102336.0,
-            "4205": 1019074816.0,
-            "4210": 1029560704.0,
-            "4215": 1032269184.0,
-            "4220": 1026242048.0,
-            "4225": 1031925888.0,
-            "4230": 1030269824.0,
-            "4235": 1027603328.0,
-            "4240": 1031480832.0,
-            "4245": 1028765056.0,
-            "4250": 1026987008.0,
-            "4255": 1021240064.0,
-            "4260": 1042082432.0,
-            "4265": 1025411200.0,
-            "4270": 1030169984.0,
-            "4275": 1012472448.0,
-            "4280": 1044505600.0,
-            "4285": 1019898304.0,
-            "4290": 1033058560.0,
-            "4295": 1033596032.0,
-            "4300": 1031638912.0,
-            "4305": 1023847936.0,
-            "4310": 1021568512.0,
-            "4315": 1047221504.0,
-            "4320": 1026520576.0,
-            "4325": 1005865600.0,
-            "4330": 1037666688.0,
-            "4335": 1022006464.0,
-            "4340": 1029009920.0,
-            "4345": 1033474496.0,
-            "4350": 1036886144.0,
-            "4355": 1026808832.0,
-            "4360": 1022938240.0,
-            "4365": 1028779648.0,
-            "4370": 1029624704.0,
-            "4375": 1042196864.0,
-            "4380": 1016100096.0,
-            "4385": 1045551296.0,
-            "4390": 1026270848.0,
-            "4395": 1029796416.0,
-            "4400": 1047365760.0,
-            "4405": 1029297344.0,
-            "4410": 1033424256.0,
-            "4415": 1028298304.0,
-            "4420": 1028148928.0,
-            "4425": 1033575552.0,
-            "4430": 1031374592.0,
-            "4435": 1028571136.0,
-            "4440": 1033123328.0,
-            "4445": 1028293504.0,
-            "4450": 1052210944.0,
-            "4455": 1026286080.0,
-            "4460": 1034885888.0,
-            "4465": 1031725696.0,
-            "4470": 1035446528.0,
-            "4475": 1036971712.0,
-            "4480": 1025117824.0,
-            "4485": 1034104960.0,
-            "4490": 1024630912.0,
-            "4495": 1047974912.0,
-            "4500": 1024707840.0,
-            "4505": 1038850048.0,
-            "4510": 1043723776.0,
-            "4515": 1044276736.0,
-            "4520": 1036872320.0,
-            "4525": 1058073536.0,
-            "4530": 1030973568.0,
-            "4535": 1032592256.0,
-            "4540": 1036428160.0,
-            "4545": 1025726400.0,
-            "4550": 1021749312.0,
-            "4555": 1037546112.0,
-            "4560": 1020099200.0,
-            "4565": 1036055296.0,
-            "4570": 1020501120.0,
-            "4575": 1050412608.0,
-            "4580": 1010437888.0,
-            "4585": 1022960768.0,
-            "4590": 1039710272.0,
-            "4595": 1023274880.0,
-            "4600": 1042477824.0,
-            "4605": 1039746688.0,
-            "4610": 1046104192.0,
-            "4615": 1017999744.0,
-            "4620": 1044734592.0,
-            "4625": 1030479104.0,
-            "4630": 1027260800.0,
-            "4635": 1026995200.0,
-            "4640": 1034901248.0,
-            "4645": 1036420352.0,
-            "4650": 1033711488.0,
-            "4655": 1035461056.0,
-            "4660": 1035324800.0,
-            "4665": 1020265664.0,
-            "4670": 1020057344.0,
-            "4675": 1054848768.0,
-            "4680": 1024895872.0,
-            "4685": 1027820160.0,
-            "4690": 1034449664.0,
-            "4695": 1039151744.0,
-            "4700": 1038865024.0,
-            "4705": 1027655808.0,
-            "4710": 1020522560.0,
-            "4715": 1031825536.0,
-            "4720": 1030300416.0,
-            "4725": 1030298368.0,
-            "4730": 1044096704.0,
-            "4735": 1046133376.0,
-            "4740": 1036178112.0,
-            "4745": 1039043840.0,
-            "4750": 1031790528.0,
-            "4755": 1047723392.0,
-            "4760": 1026178176.0,
-            "4765": 1034695040.0,
-            "4770": 1036521856.0,
-            "4775": 1029375168.0,
-            "4780": 1028543488.0,
-            "4785": 1028414976.0,
-            "4790": 1019620224.0,
-            "4795": 1033060160.0,
-            "4800": 1051866880.0,
-            "4805": 1015414400.0,
-            "4810": 1029454336.0,
-            "4815": 1009572096.0,
-            "4820": 1041051200.0,
-            "4825": 1026708608.0,
-            "4830": 1020450816.0,
-            "4835": 1051307840.0,
-            "4840": 1019456512.0,
-            "4845": 1032315008.0,
-            "4850": 1036794496.0,
-            "4855": 1031052736.0,
-            "4860": 1033131776.0,
-            "4865": 1032064384.0,
-            "4870": 1049832576.0,
-            "4875": 1025110528.0,
-            "4880": 1048476160.0,
-            "4885": 1016853056.0,
-            "4890": 1037317312.0,
-            "4895": 1024323136.0,
-            "4900": 1043374208.0,
-            "4905": 1033397120.0,
-            "4910": 1032830272.0,
-            "4915": 1016889856.0,
-            "4920": 1022294784.0,
-            "4925": 1034965888.0,
-            "4930": 1034630016.0,
-            "4935": 1025885312.0,
-            "4940": 1048398272.0,
-            "4945": 1025248576.0,
-            "4950": 1024208768.0,
-            "4955": 1007485952.0,
-            "4960": 1040213824.0,
-            "4965": 1018775296.0,
-            "4970": 1014274688.0,
-            "4975": 1038025472.0,
-            "4980": 1020917888.0,
-            "4985": 1029045888.0,
-            "4990": 1028394816.0,
-            "4995": 1032020480.0,
-            "5000": 1039791104.0,
-            "5005": 1024351552.0,
-            "5010": 1029147968.0,
-            "5015": 1021807296.0,
-            "5020": 1023506944.0,
-            "5025": 1037603456.0,
-            "5030": 1041947136.0,
-            "5035": 1047130304.0,
-            "5040": 1060956096.0,
-            "5045": 1032108544.0,
-            "5050": 1029534336.0,
-            "5055": 1024552192.0,
-            "5060": 1035282304.0,
-            "5065": 1021205504.0,
-            "5070": 1035756288.0,
-            "5075": 1015771264.0,
-            "5080": 1027040064.0,
-            "5085": 1021792192.0,
-            "5090": 1034973568.0,
-            "5095": 1015499712.0,
-            "5100": 1032257600.0,
-            "5105": 1017981568.0,
-            "5110": 1019586304.0,
-            "5115": 1036063936.0,
-            "5120": 1032695040.0,
-            "5125": 1019076992.0,
-            "5130": 1033404672.0,
-            "5135": 1041203072.0,
-            "5140": 1026258752.0,
-            "5145": 1033705856.0,
-            "5150": 1022043520.0,
-            "5155": 1032265664.0,
-            "5160": 1039625984.0,
-            "5165": 1031576448.0,
-            "5170": 1035555328.0,
-            "5175": 1026116224.0,
-            "5180": 1030316032.0,
-            "5185": 1024495680.0,
-            "5190": 1019492608.0,
-            "5195": 1035626496.0,
-            "5200": 1016905344.0,
-            "5205": 1013435648.0,
-            "5210": 1049395456.0,
-            "5215": 1030833280.0,
-            "5220": 1025276800.0,
-            "5225": 1035239936.0,
-            "5230": 1025930624.0,
-            "5235": 1025120000.0,
-            "5240": 1046308224.0,
-            "5245": 1022740608.0,
-            "5250": 1027062336.0,
-            "5255": 1023887360.0,
-            "5260": 1033821440.0,
-            "5265": 1045733696.0,
-            "5270": 1052500480.0,
-            "5275": 1033018112.0,
-            "5280": 1030073920.0,
-            "5285": 1025212608.0,
-            "5290": 1026575616.0,
-            "5295": 1032653440.0,
-            "5300": 1024367872.0,
-            "5305": 1029634368.0,
-            "5310": 1033197312.0,
-            "5315": 1032988992.0,
-            "5320": 1019521664.0,
-            "5325": 1022718336.0,
-            "5330": 1021335168.0,
-            "5335": 1039275776.0,
-            "5340": 1037219648.0,
-            "5345": 1039188096.0,
-            "5350": 1023701888.0,
-            "5355": 1029935872.0,
-            "5360": 1047046080.0,
-            "5365": 1037426432.0,
-            "5370": 1024381568.0,
-            "5375": 1042070656.0,
-            "5380": 1020368384.0,
-            "5385": 1021765696.0,
-            "5390": 1035133184.0,
-            "5395": 1049653568.0,
-            "5400": 1026015744.0,
-            "5405": 1036453120.0,
-            "5410": 1027635776.0,
-            "5415": 1042285824.0,
-            "5420": 1039941888.0,
-            "5425": 1028381184.0,
-            "5430": 1043799808.0,
-            "5435": 1032653312.0,
-            "5440": 1033384448.0,
-            "5445": 1034144640.0,
-            "5450": 1025299328.0,
-            "5455": 1034079424.0,
-            "5460": 1026812416.0,
-            "5465": 1027399552.0,
-            "5470": 1028969216.0,
-            "5475": 1037233920.0,
-            "5480": 1023830272.0,
-            "5485": 1019186752.0,
-            "5490": 1030891520.0,
-            "5495": 1029399424.0,
-            "5500": 1032681216.0,
-            "5505": 1018275200.0,
-            "5510": 1023987648.0,
-            "5515": 1025156032.0,
-            "5520": 1039527296.0,
-            "5525": 1018024576.0,
-            "5530": 1037663936.0,
-            "5535": 1031599232.0,
-            "5540": 1027564544.0,
-            "5545": 1033212160.0,
-            "5550": 1032115968.0,
-            "5555": 1044802304.0,
-            "5560": 1028511232.0,
-            "5565": 1029686016.0,
-            "5570": 1042027776.0,
-            "5575": 1025379392.0,
-            "5580": 1023716736.0,
-            "5585": 1044093696.0,
-            "5590": 1041319936.0,
-            "5595": 1031549824.0,
-            "5600": 1023400320.0,
-            "5605": 1040115456.0,
-            "5610": 1034087552.0,
-            "5615": 1021042816.0,
-            "5620": 1031004800.0,
-            "5625": 1030188544.0,
-            "5630": 1023502080.0,
-            "5635": 1026684096.0,
-            "5640": 1034589120.0,
-            "5645": 1018655744.0,
-            "5650": 1052378752.0,
-            "5655": 1048933504.0,
-            "5660": 1050077696.0,
-            "5665": 1033958144.0,
-            "5670": 1033750016.0,
-            "5675": 1025392640.0,
-            "5680": 1039378304.0,
-            "5685": 1033056576.0,
-            "5690": 1031464576.0,
-            "5695": 1021946368.0,
-            "5700": 1038065664.0,
-            "5705": 1043684736.0,
-            "5710": 1057231616.0,
-            "5715": 1014462848.0,
-            "5720": 1021258816.0,
-            "5725": 1041822272.0,
-            "5730": 1039454912.0,
-            "5735": 1025128576.0,
-            "5740": 1026045440.0,
-            "5745": 1036990208.0,
-            "5750": 1044552256.0,
-            "5755": 1011860416.0,
-            "5760": 1028389568.0,
-            "5765": 1028245504.0,
-            "5770": 1021530368.0,
-            "5775": 1051210240.0,
-            "5780": 1034984512.0,
-            "5785": 1037513920.0,
-            "5790": 1016957184.0,
-            "5795": 1027873536.0,
-            "5800": 1029780736.0,
-            "5805": 1050694912.0,
-            "5810": 1018478336.0,
-            "5815": 1036123520.0,
-            "5820": 1048408704.0,
-            "5825": 1030977920.0,
-            "5830": 1031572096.0,
-            "5835": 1034045440.0,
-            "5840": 1039843776.0,
-            "5845": 1021746048.0,
-            "5850": 1029807744.0,
-            "5855": 1038789376.0,
-            "5860": 1031436288.0,
-            "5865": 1026397568.0,
-            "5870": 1029861824.0,
-            "5875": 1032841856.0,
-            "5880": 1032675968.0,
-            "5885": 1024576128.0,
-            "5890": 1026798976.0,
-            "5895": 1015796160.0,
-            "5900": 1049707008.0,
-            "5905": 1025653248.0,
-            "5910": 1019150720.0,
-            "5915": 1042739136.0,
-            "5920": 1028047232.0,
-            "5925": 1034016448.0,
-            "5930": 1030963328.0,
-            "5935": 1038102784.0,
-            "5940": 1019172864.0,
-            "5945": 1025130112.0,
-            "5950": 1035530240.0,
-            "5955": 1050437184.0,
-            "5960": 1024548736.0,
-            "5965": 1029923712.0,
-            "5970": 1016427776.0,
-            "5975": 1036682752.0,
-            "5980": 1024118464.0,
-            "5985": 1035386624.0,
-            "5990": 1010550784.0,
-            "5995": 1047019200.0,
-            "6000": 1021245568.0,
-            "6005": 1040460416.0,
-            "6010": 1025358720.0,
-            "6015": 1050179072.0,
-            "6020": 1039514496.0,
-            "6025": 1030254592.0,
-            "6030": 1025931968.0,
-            "6035": 1021745408.0,
-            "6040": 1034117056.0,
-            "6045": 1028282112.0,
-            "6050": 1020112320.0,
-            "6055": 1040397056.0,
-            "6060": 1026347008.0,
-            "6065": 1022198400.0,
-            "6070": 1040668416.0,
-            "6075": 1046037440.0,
-            "6080": 1038583168.0,
-            "6085": 1041485568.0,
-            "6090": 1037205888.0,
-            "6095": 1036282880.0,
-            "6100": 1030454720.0,
-            "6105": 1019216640.0,
-            "6110": 1035357824.0,
-            "6115": 1019452544.0,
-            "6120": 1032188800.0,
-            "6125": 1020922624.0,
-            "6130": 1012013952.0,
-            "6135": 1038733824.0,
-            "6140": 1041736896.0,
-            "6145": 1041917056.0,
-            "6150": 1018958208.0,
-            "6155": 1024649344.0,
-            "6160": 1047972160.0,
-            "6165": 1050408832.0,
-            "6170": 1032505344.0,
-            "6175": 1045793664.0,
-            "6180": 1040067072.0,
-            "6185": 1029710464.0,
-            "6190": 1023293760.0,
-            "6195": 1050897728.0,
-            "6200": 1035035776.0,
-            "6205": 1036275584.0,
-            "6210": 1039772736.0,
-            "6215": 1033200256.0,
-            "6220": 1026162432.0,
-            "6225": 1036741120.0,
-            "6230": 1025144192.0,
-            "6235": 1019352832.0,
-            "6240": 1057104384.0,
-            "6245": 1018413952.0,
-            "6250": 1035337344.0,
-            "6255": 1025380992.0,
-            "6260": 1034863744.0,
-            "6265": 1027703424.0,
-            "6270": 1042116480.0,
-            "6275": 1037659008.0,
-            "6280": 1018270208.0,
-            "6285": 1032642304.0,
-            "6290": 1038598592.0,
-            "6295": 1031803456.0,
-            "6300": 1034635200.0,
-            "6305": 1011066624.0,
-            "6310": 1039458624.0,
-            "6315": 1030054272.0,
-            "6320": 1030534208.0,
-            "6325": 1038642496.0,
-            "6330": 1033908800.0,
-            "6335": 1032297856.0,
-            "6340": 1033544448.0,
-            "6345": 1031036416.0,
-            "6350": 1037451264.0,
-            "6355": 1028075968.0,
-            "6360": 1043313408.0,
-            "6365": 1025223808.0,
-            "6370": 1033939200.0,
-            "6375": 1036038720.0,
-            "6380": 1029108096.0,
-            "6385": 1025395072.0,
-            "6390": 1025517952.0,
-            "6395": 1048611584.0,
-            "6400": 1040734976.0,
-            "6405": 1024247936.0,
-            "6410": 1017489280.0,
-            "6415": 1042827072.0,
-            "6420": 1025202432.0,
-            "6425": 1027164928.0,
-            "6430": 1040568256.0,
-            "6435": 1022908800.0,
-            "6440": 1047994624.0,
-            "6445": 1036089088.0,
-            "6450": 1048532224.0,
-            "6455": 1037272320.0,
-            "6460": 1036750912.0,
-            "6465": 1033652032.0,
-            "6470": 1018135232.0,
-            "6475": 1034691648.0,
-            "6480": 1028994048.0,
-            "6485": 1033258880.0,
-            "6490": 1035638656.0,
-            "6495": 1024470016.0,
-            "6500": 1020572096.0,
-            "6505": 1059327104.0,
-            "6510": 1020472576.0,
-            "6515": 1018688064.0,
-            "6520": 1051470592.0,
-            "6525": 1035544512.0,
-            "6530": 1027897216.0,
-            "6535": 1022722240.0,
-            "6540": 1023273984.0,
-            "6545": 1033173120.0,
-            "6550": 1029488512.0,
-            "6555": 1029575296.0,
-            "6560": 1056438784.0,
-            "6565": 1054295040.0,
-            "6570": 1032319040.0,
-            "6575": 1041208320.0,
-            "6580": 1028134400.0,
-            "6585": 1036504832.0,
-            "6590": 1042456192.0,
-            "6595": 1038568832.0,
-            "6600": 1031388096.0,
-            "6605": 1045715456.0,
-            "6610": 1034713472.0,
-            "6615": 1015576448.0,
-            "6620": 1039115136.0,
-            "6625": 1054654208.0,
-            "6630": 1043092928.0,
-            "6635": 1032226304.0,
-            "6640": 1016738496.0,
-            "6645": 1016178816.0,
-            "6650": 1034692672.0,
-            "6655": 1031753472.0,
-            "6660": 1041401920.0,
-            "6665": 1024657984.0,
-            "6670": 1023820032.0,
-            "6675": 1038306176.0,
-            "6680": 1025624064.0,
-            "6685": 1045394048.0,
-            "6690": 1046390720.0,
-            "6695": 1027754368.0,
-            "6700": 1033473920.0,
-            "6705": 1038857152.0,
-            "6710": 1047485888.0,
-            "6715": 1043229440.0,
-            "6720": 1022995456.0,
-            "6725": 1018910144.0,
-            "6730": 1027525504.0,
-            "6735": 1016937856.0,
-            "6740": 1027238016.0,
-            "6745": 1030263680.0,
-            "6750": 1006373760.0,
-            "6755": 1034765056.0,
-            "6760": 1040735296.0,
-            "6765": 1023827008.0,
-            "6770": 1036441344.0,
-            "6775": 1019627712.0,
-            "6780": 1043723904.0,
-            "6785": 1037409280.0,
-            "6790": 1029403072.0,
-            "6795": 1026349440.0,
-            "6800": 1036628224.0,
-            "6805": 1024579712.0,
-            "6810": 1042340544.0,
-            "6815": 1035274112.0,
-            "6820": 1022594880.0,
-            "6825": 1034793344.0,
-            "6830": 1029862400.0,
-            "6835": 1041609600.0,
-            "6840": 1042283776.0,
-            "6845": 1018954624.0,
-            "6850": 1032171136.0,
-            "6855": 1034434752.0,
-            "6860": 1042054848.0,
-            "6865": 1021813568.0,
-            "6870": 1037015424.0,
-            "6875": 1030379968.0,
-            "6880": 1029360768.0,
-            "6885": 1030435968.0,
-            "6890": 1039890432.0,
-            "6895": 1027267712.0,
-            "6900": 1035174016.0,
-            "6905": 1043975424.0,
-            "6910": 1019763072.0,
-            "6915": 1017476608.0,
-            "6920": 1017184256.0,
-            "6925": 1030650688.0,
-            "6930": 1036672384.0,
-            "6935": 1042835712.0,
-            "6940": 1040313216.0,
-            "6945": 1044196992.0,
-            "6950": 1040513472.0,
-            "6955": 1036112704.0,
-            "6960": 1036436224.0,
-            "6965": 1019161024.0,
-            "6970": 1034729088.0,
-            "6975": 1019134464.0,
-            "6980": 1028436160.0,
-            "6985": 1023240128.0,
-            "6990": 1026994688.0,
-            "6995": 1027547520.0,
-            "7000": 1058819840.0,
-            "7005": 1013737856.0,
-            "7010": 1028959488.0,
-            "7015": 1037288768.0,
-            "7020": 1011880576.0,
-            "7025": 1017313280.0,
-            "7030": 1028301440.0,
-            "7035": 1035955392.0,
-            "7040": 1042966016.0,
-            "7045": 1028185856.0,
-            "7050": 1017979584.0,
-            "7055": 1035088000.0,
-            "7060": 1051802624.0,
-            "7065": 1007664640.0,
-            "7070": 1035819008.0,
-            "7075": 1031039552.0,
-            "7080": 1026143296.0,
-            "7085": 1044906432.0,
-            "7090": 1046261760.0,
-            "7095": 1043760512.0,
-            "7100": 1035089024.0,
-            "7105": 1049143296.0,
-            "7110": 1010962944.0,
-            "7115": 1033869504.0,
-            "7120": 1031267456.0,
-            "7125": 1037496832.0,
-            "7130": 1024881856.0,
-            "7135": 1031991808.0,
-            "7140": 1019090176.0,
-            "7145": 1033081088.0,
-            "7150": 1037554112.0,
-            "7155": 1015729728.0,
-            "7160": 1024724608.0,
-            "7165": 1030895808.0,
-            "7170": 1037367808.0,
-            "7175": 1028816896.0,
-            "7180": 1037633280.0,
-            "7185": 1016174080.0,
-            "7190": 1019808128.0,
-            "7195": 1040915392.0,
-            "7200": 1041375360.0,
-            "7205": 1026538240.0,
-            "7210": 1022638720.0,
-            "7215": 1041890560.0,
-            "7220": 1017742720.0,
-            "7225": 1027296640.0,
-            "7230": 1030200448.0,
-            "7235": 1035726848.0,
-            "7240": 1037854848.0,
-            "7245": 1023971008.0,
-            "7250": 1044708096.0,
-            "7255": 1031900480.0,
-            "7260": 1030128256.0,
-            "7265": 1036887104.0,
-            "7270": 1050097152.0,
-            "7275": 1029225216.0,
-            "7280": 1020231808.0,
-            "7285": 1029842048.0,
-            "7290": 1017219328.0,
-            "7295": 1029139584.0,
-            "7300": 1031533824.0,
-            "7305": 1027298176.0,
-            "7310": 1029089664.0,
-            "7315": 1022782272.0,
-            "7320": 1036458176.0,
-            "7325": 1036851840.0,
-            "7330": 1021706496.0,
-            "7335": 1030715904.0,
-            "7340": 1039382976.0,
-            "7345": 1040177664.0,
-            "7350": 1034973568.0,
-            "7355": 1033656320.0,
-            "7360": 1031254912.0,
-            "7365": 1048742016.0,
-            "7370": 1027298304.0,
-            "7375": 1041854848.0,
-            "7380": 1016725760.0,
-            "7385": 1017578368.0,
-            "7390": 1017234944.0,
-            "7395": 1046793600.0,
-            "7400": 1048441216.0,
-            "7405": 1013394304.0,
-            "7410": 1017386368.0,
-            "7415": 1017815360.0,
-            "7420": 1028043008.0,
-            "7425": 1012840576.0,
-            "7430": 1034042368.0,
-            "7435": 1032530432.0,
-            "7440": 1002692928.0,
-            "7445": 1034451200.0,
-            "7450": 1039304832.0,
-            "7455": 1019027008.0,
-            "7460": 1014740928.0,
-            "7465": 1027204736.0,
-            "7470": 1030422784.0,
-            "7475": 1033792064.0,
-            "7480": 1043317376.0,
-            "7485": 1038215168.0,
-            "7490": 1049000960.0,
-            "7495": 1028982720.0,
-            "7500": 1027426816.0,
-            "7505": 1028695936.0,
-            "7510": 1048886528.0,
-            "7515": 1035648704.0,
-            "7520": 1017198848.0,
-            "7525": 1036572736.0,
-            "7530": 1029261952.0,
-            "7535": 1027190144.0,
-            "7540": 1028338048.0,
-            "7545": 1025986304.0,
-            "7550": 1023025856.0,
-            "7555": 1033025344.0,
-            "7560": 1031404672.0,
-            "7565": 1022710528.0,
-            "7570": 1037591552.0,
-            "7575": 1022603136.0,
-            "7580": 1018123584.0,
-            "7585": 1033054208.0,
-            "7590": 1010993280.0,
-            "7595": 1018260352.0,
-            "7600": 1049904448.0,
-            "7605": 1037361216.0,
-            "7610": 1040415744.0,
-            "7615": 1035247488.0,
-            "7620": 1024230912.0,
-            "7625": 1020317184.0,
-            "7630": 1034939584.0,
-            "7635": 1043224192.0,
-            "7640": 1033491520.0,
-            "7645": 1034444608.0,
-            "7650": 1039804800.0,
-            "7655": 1031240576.0,
-            "7660": 1056628096.0,
-            "7665": 1031076096.0,
-            "7670": 1033685120.0,
-            "7675": 1030681600.0,
-            "7680": 1035398720.0,
-            "7685": 1018661760.0,
-            "7690": 1031921024.0,
-            "7695": 1025858880.0,
-            "7700": 1017715200.0,
-            "7705": 1036531200.0,
-            "7710": 1029893248.0,
-            "7715": 1053230656.0,
-            "7720": 1019514240.0,
-            "7725": 1042193216.0,
-            "7730": 1035620992.0,
-            "7735": 1020726144.0,
-            "7740": 1045576128.0,
-            "7745": 1026932992.0,
-            "7750": 1048550208.0,
-            "7755": 1022539264.0,
-            "7760": 1049532032.0,
-            "7765": 1029370176.0,
-            "7770": 1018375296.0,
-            "7775": 1021364672.0,
-            "7780": 1039770624.0,
-            "7785": 1039914112.0,
-            "7790": 1030516992.0,
-            "7795": 1039353728.0,
-            "7800": 1028187904.0,
-            "7805": 1027635776.0,
-            "7810": 1020970368.0,
-            "7815": 1035878400.0,
-            "7820": 1017666240.0,
-            "7825": 1018067392.0,
-            "7830": 1035104128.0,
-            "7835": 1044507648.0,
-            "7840": 1027836224.0,
-            "7845": 1032101504.0,
-            "7850": 1034609408.0,
-            "7855": 1025464832.0,
-            "7860": 1059051648.0,
-            "7865": 1016626240.0,
-            "7870": 1033729408.0,
-            "7875": 1044185600.0,
-            "7880": 1029084352.0,
-            "7885": 1040308288.0,
-            "7890": 1029556480.0,
-            "7895": 1032947008.0,
-            "7900": 1021409216.0,
-            "7905": 1020955904.0,
-            "7910": 1008993856.0,
-            "7915": 1023120768.0,
-            "7920": 1023070976.0,
-            "7925": 1030094080.0,
-            "7930": 1020712704.0,
-            "7935": 1019443776.0,
-            "7940": 1017809152.0,
-            "7945": 1014447552.0,
-            "7950": 1026303616.0,
-            "7955": 1034518272.0,
-            "7960": 1056026304.0,
-            "7965": 1031047872.0,
-            "7970": 1030417152.0,
-            "7975": 1022189888.0,
-            "7980": 1034474624.0,
-            "7985": 1047305024.0,
-            "7990": 1032066176.0,
-            "7995": 1044264704.0,
-            "8000": 1028876672.0,
-            "8005": 1028045440.0,
-            "8010": 1050665408.0,
-            "8015": 1019758976.0,
-            "8020": 1043297408.0,
-            "8025": 1039018560.0,
-            "8030": 1030868800.0,
-            "8035": 1045304192.0,
-            "8040": 1026310784.0,
-            "8045": 1024970368.0,
-            "8050": 1018405632.0,
-            "8055": 1033736960.0,
-            "8060": 1012986816.0,
-            "8065": 1022016640.0,
-            "8070": 1034776064.0,
-            "8075": 1042759616.0,
-            "8080": 1027758784.0,
-            "8085": 1037205376.0,
-            "8090": 1007008256.0,
-            "8095": 1030374528.0,
-            "8100": 1030726016.0,
-            "8105": 1027794944.0,
-            "8110": 1031557248.0,
-            "8115": 1037685248.0,
-            "8120": 1037692992.0,
-            "8125": 1031097472.0,
-            "8130": 1028627072.0,
-            "8135": 1029680256.0,
-            "8140": 1049904256.0,
-            "8145": 1043463552.0,
-            "8150": 1040087424.0,
-            "8155": 1046780288.0,
-            "8160": 1010199040.0,
-            "8165": 1031657728.0,
-            "8170": 1024483264.0,
-            "8175": 1035019648.0,
-            "8180": 1024460544.0,
-            "8185": 1021960448.0,
-            "8190": 1037125504.0,
-            "8195": 1022368384.0,
-            "8200": 1035635968.0,
-            "8205": 1026482496.0,
-            "8210": 1023888000.0,
-            "8215": 1014276416.0,
-            "8220": 1026756224.0,
-            "8225": 1028540160.0,
-            "8230": 1027163072.0,
-            "8235": 1037914048.0,
-            "8240": 1025909376.0,
-            "8245": 1024676608.0,
-            "8250": 1041635840.0,
-            "8255": 1031908224.0,
-            "8260": 1032424512.0,
-            "8265": 1023164800.0,
-            "8270": 1040172544.0,
-            "8275": 1038050688.0,
-            "8280": 1041849216.0,
-            "8285": 1038804352.0,
-            "8290": 1024074880.0,
-            "8295": 1028403648.0,
-            "8300": 1039341440.0,
-            "8305": 1012104192.0,
-            "8310": 1021882048.0,
-            "8315": 1027307200.0,
-            "8320": 1021636992.0,
-            "8325": 1048572160.0,
-            "8330": 1041039616.0,
-            "8335": 1037964928.0,
-            "8340": 1033019136.0,
-            "8345": 1043864192.0,
-            "8350": 1037713792.0,
-            "8355": 1029686400.0,
-            "8360": 1040667776.0,
-            "8365": 1027450304.0,
-            "8370": 1037742848.0,
-            "8375": 1041986944.0,
-            "8380": 1037628416.0,
-            "8385": 1023436160.0,
-            "8390": 1026068224.0,
-            "8395": 1028913408.0,
-            "8400": 1046530560.0,
-            "8405": 1040179456.0,
-            "8410": 1034252672.0,
-            "8415": 1040258688.0,
-            "8420": 1054730752.0,
-            "8425": 1031514880.0,
-            "8430": 1030295680.0,
-            "8435": 1045707200.0,
-            "8440": 1026310784.0,
-            "8445": 1029027392.0,
-            "8450": 1034201920.0,
-            "8455": 1031794688.0,
-            "8460": 1016828032.0,
-            "8465": 1035163648.0,
-            "8470": 1035185152.0,
-            "8475": 1024712960.0,
-            "8480": 1035901184.0,
-            "8485": 1028948480.0,
-            "8490": 1023079168.0,
-            "8495": 1037393280.0,
-            "8500": 1025960064.0,
-            "8505": 1042724992.0,
-            "8510": 1028167936.0,
-            "8515": 1038101056.0,
-            "8520": 1023107328.0,
-            "8525": 1037987328.0,
-            "8530": 1027572800.0,
-            "8535": 1041656128.0,
-            "8540": 1033880960.0,
-            "8545": 1015116160.0,
-            "8550": 1040188160.0,
-            "8555": 1016340672.0,
-            "8560": 1019330048.0,
-            "8565": 1021410112.0,
-            "8570": 1032032320.0,
-            "8575": 1031880128.0,
-            "8580": 1016011264.0,
-            "8585": 1030017408.0,
-            "8590": 1031637248.0,
-            "8595": 1017776128.0,
-            "8600": 1002393216.0,
-            "8605": 1030238336.0,
-            "8610": 1017532288.0,
-            "8615": 1023989248.0,
-            "8620": 1047205696.0,
-            "8625": 1034231552.0,
-            "8630": 1030921280.0,
-            "8635": 1051992512.0,
-            "8640": 1041134208.0,
-            "8645": 1024870720.0,
-            "8650": 1025595392.0,
-            "8655": 1036904832.0,
-            "8660": 1031171200.0,
-            "8665": 1032904640.0,
-            "8670": 1037400576.0,
-            "8675": 1029157248.0,
-            "8680": 1031264704.0,
-            "8685": 1041197568.0,
-            "8690": 1035035392.0,
-            "8695": 1008508416.0,
-            "8700": 1027459072.0,
-            "8705": 1051504896.0,
-            "8710": 1041678016.0,
-            "8715": 1034152256.0,
-            "8720": 1017596544.0,
-            "8725": 1025187456.0,
-            "8730": 1036610816.0,
-            "8735": 1014829568.0,
-            "8740": 1036081536.0,
-            "8745": 1021252416.0,
-            "8750": 1027866496.0,
-            "8755": 1020742272.0,
-            "8760": 1036899712.0,
-            "8765": 1058672448.0,
-            "8770": 1020462464.0,
-            "8775": 1031773056.0,
-            "8780": 1030892544.0,
-            "8785": 1032117504.0,
-            "8790": 1041034112.0,
-            "8795": 1019523968.0,
-            "8800": 1038245632.0,
-            "8805": 1035106752.0,
-            "8810": 1043257088.0,
-            "8815": 1026490496.0,
-            "8820": 1027666944.0,
-            "8825": 1043464064.0,
-            "8830": 1027480192.0,
-            "8835": 1038812928.0,
-            "8840": 1034490752.0,
-            "8845": 1033909760.0,
-            "8850": 1030491008.0,
-            "8855": 1042524992.0,
-            "8860": 1013002880.0,
-            "8865": 1038368128.0,
-            "8870": 1025187456.0,
-            "8875": 1012981760.0,
-            "8880": 1028376704.0,
-            "8885": 1046461056.0,
-            "8890": 1038603840.0,
-            "8895": 1037909504.0,
-            "8900": 1027294848.0,
-            "8905": 1032792064.0,
-            "8910": 1029795264.0,
-            "8915": 1030003968.0,
-            "8920": 1030339968.0,
-            "8925": 1028569984.0,
-            "8930": 1031637376.0,
-            "8935": 1022951424.0,
-            "8940": 1019847872.0,
-            "8945": 1031909248.0,
-            "8950": 1039951744.0,
-            "8955": 1041902720.0,
-            "8960": 1026878464.0,
-            "8965": 1022083968.0,
-            "8970": 1029559424.0,
-            "8975": 1038934400.0,
-            "8980": 1033860160.0,
-            "8985": 1030649472.0,
-            "8990": 1025014144.0,
-            "8995": 1013963648.0,
-            "9000": 1035286400.0,
-            "9005": 1028649280.0,
-            "9010": 1011913280.0,
-            "9015": 1038912128.0,
-            "9020": 1030153856.0,
-            "9025": 1024685056.0,
-            "9030": 1025861888.0,
-            "9035": 1054309248.0,
-            "9040": 1027293952.0,
-            "9045": 1036583040.0,
-            "9050": 1020929664.0,
-            "9055": 1043212800.0,
-            "9060": 1023159104.0,
-            "9065": 1023387520.0,
-            "9070": 1039364480.0,
-            "9075": 1026728320.0,
-            "9080": 1018873408.0,
-            "9085": 1015439104.0,
-            "9090": 1043764736.0,
-            "9095": 1014020224.0,
-            "9100": 1031975296.0,
-            "9105": 1026514304.0,
-            "9110": 1029229568.0,
-            "9115": 1024866432.0,
-            "9120": 999986240.0,
-            "9125": 1032842752.0,
-            "9130": 1038534336.0,
-            "9135": 1031037696.0,
-            "9140": 1025502208.0,
-            "9145": 1030405248.0,
-            "9150": 1029416576.0,
-            "9155": 1038268928.0,
-            "9160": 1046043904.0,
-            "9165": 1017948992.0,
-            "9170": 1040955520.0,
-            "9175": 1031287552.0,
-            "9180": 1037830656.0,
-            "9185": 1040684416.0,
-            "9190": 1028985728.0,
-            "9195": 1034312320.0,
-            "9200": 1035551872.0,
-            "9205": 1029847040.0,
-            "9210": 1026535872.0,
-            "9215": 1030520448.0,
-            "9220": 1025732224.0,
-            "9225": 1048001408.0,
-            "9230": 1041601792.0,
-            "9235": 1027775104.0,
-            "9240": 1025245760.0,
-            "9245": 1036211584.0,
-            "9250": 1041192384.0,
-            "9255": 1020063872.0,
-            "9260": 1035337984.0,
-            "9265": 1023102208.0,
-            "9270": 1038332928.0,
-            "9275": 1036053568.0,
-            "9280": 1026541504.0,
-            "9285": 1014285184.0,
-            "9290": 1018866304.0,
-            "9295": 1026915264.0,
-            "9300": 1037085888.0,
-            "9305": 1045435392.0,
-            "9310": 1033242944.0,
-            "9315": 1039043840.0,
-            "9320": 1048495488.0,
-            "9325": 1023059840.0,
-            "9330": 1031724672.0,
-            "9335": 1035673472.0,
-            "9340": 1013719296.0,
-            "9345": 1022572032.0,
-            "9350": 1026585600.0,
-            "9355": 1034807104.0,
-            "9360": 1029839552.0,
-            "9365": 1019863296.0,
-            "9370": 1006904320.0,
-            "9375": 1036232960.0,
-            "9380": 1049012736.0,
-            "9385": 1015905344.0,
-            "9390": 1029208704.0,
-            "9395": 1008931968.0,
-            "9400": 1026893568.0,
-            "9405": 1027653312.0,
-            "9410": 1040913280.0,
-            "9415": 1035128576.0,
-            "9420": 1030792640.0,
-            "9425": 1027581056.0,
-            "9430": 1032727360.0,
-            "9435": 1031796288.0,
-            "9440": 1051730048.0,
-            "9445": 1019626752.0,
-            "9450": 1044505152.0,
-            "9455": 1035773696.0,
-            "9460": 1013828224.0,
-            "9465": 1023403904.0,
-            "9470": 1023576832.0,
-            "9475": 1039164416.0,
-            "9480": 1029597056.0,
-            "9485": 1032075200.0,
-            "9490": 1020994560.0,
-            "9495": 1021375616.0,
-            "9500": 1035594304.0,
-            "9505": 1034478464.0,
-            "9510": 1014286592.0,
-            "9515": 1031309312.0,
-            "9520": 1026563904.0,
-            "9525": 1035853184.0,
-            "9530": 1031624448.0,
-            "9535": 1025926720.0
-        }
-    },
-    "mem-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 33307314176.0,
-            "5": 33307424768.0,
-            "10": 33307447296.0,
-            "15": 33307439104.0,
-            "20": 33307533312.0,
-            "25": 33307473920.0,
-            "30": 33307504640.0,
-            "35": 33307639808.0,
-            "40": 33307637760.0,
-            "45": 33307568128.0,
-            "50": 33307418624.0,
-            "55": 33307326464.0,
-            "60": 33307346944.0,
-            "65": 33307490304.0,
-            "70": 33307312128.0,
-            "75": 33307308032.0,
-            "80": 33307404288.0,
-            "85": 33307314176.0,
-            "90": 33307285504.0,
-            "95": 33307392000.0,
-            "100": 33307260928.0,
-            "105": 33307129856.0,
-            "110": 33307037696.0,
-            "115": 33306703872.0,
-            "120": 33307355136.0,
-            "125": 33306873856.0,
-            "130": 33307017216.0,
-            "135": 33307305984.0,
-            "140": 33307004928.0,
-            "145": 33307121664.0,
-            "150": 33307312128.0,
-            "155": 33307176960.0,
-            "160": 33307103232.0,
-            "165": 33307174912.0,
-            "170": 33307832320.0,
-            "175": 33307199488.0,
-            "180": 33307355136.0,
-            "185": 33307355136.0,
-            "190": 33307131904.0,
-            "195": 33307256832.0,
-            "200": 33307326464.0,
-            "205": 33307492352.0,
-            "210": 33307500544.0,
-            "215": 33307086848.0,
-            "220": 33306857472.0,
-            "225": 33306933248.0,
-            "230": 33307092992.0,
-            "235": 33307183104.0,
-            "240": 33307303936.0,
-            "245": 33307426816.0,
-            "250": 33307308032.0,
-            "255": 33307295744.0,
-            "260": 33306767360.0,
-            "265": 33307461632.0,
-            "270": 33307467776.0,
-            "275": 33307469824.0,
-            "280": 33307254784.0,
-            "285": 33307947008.0,
-            "290": 33307191296.0,
-            "295": 33308014592.0,
-            "300": 33307856896.0,
-            "305": 33308340224.0,
-            "310": 33307815936.0,
-            "315": 33307181056.0,
-            "320": 33307512832.0,
-            "325": 33307488256.0,
-            "330": 33307977728.0,
-            "335": 33307947008.0,
-            "340": 33308606464.0,
-            "345": 33308037120.0,
-            "350": 33307693056.0,
-            "355": 33308000256.0,
-            "360": 33307348992.0,
-            "365": 33307451392.0,
-            "370": 33308000256.0,
-            "375": 33307283456.0,
-            "380": 33307570176.0,
-            "385": 33307860992.0,
-            "390": 33307416576.0,
-            "395": 33307031552.0,
-            "400": 33307246592.0,
-            "405": 33307676672.0,
-            "410": 33306935296.0,
-            "415": 33307752448.0,
-            "420": 33307529216.0,
-            "425": 33307314176.0,
-            "430": 33306988544.0,
-            "435": 33307455488.0,
-            "440": 33307369472.0,
-            "445": 33307709440.0,
-            "450": 33307588608.0,
-            "455": 33306963968.0,
-            "460": 33307193344.0,
-            "465": 33306845184.0,
-            "470": 33307766784.0,
-            "475": 33306464256.0,
-            "480": 33307566080.0,
-            "485": 33307682816.0,
-            "490": 33307389952.0,
-            "495": 33307179008.0,
-            "500": 33307969536.0,
-            "505": 33307629568.0,
-            "510": 33308192768.0,
-            "515": 33307279360.0,
-            "520": 33306544128.0,
-            "525": 33307265024.0,
-            "530": 33307025408.0,
-            "535": 33307648000.0,
-            "540": 33307582464.0,
-            "545": 33307297792.0,
-            "550": 33307396096.0,
-            "555": 33307301888.0,
-            "560": 33307899904.0,
-            "565": 33307379712.0,
-            "570": 33307553792.0,
-            "575": 33307136000.0,
-            "580": 33305892864.0,
-            "585": 33306945536.0,
-            "590": 33307629568.0,
-            "595": 33307860992.0,
-            "600": 33306873856.0,
-            "605": 33307357184.0,
-            "610": 33306556416.0,
-            "615": 33306349568.0,
-            "620": 33307791360.0,
-            "625": 33306378240.0,
-            "630": 33307168768.0,
-            "635": 33306767360.0,
-            "640": 33306116096.0,
-            "645": 33308092416.0,
-            "650": 33307277312.0,
-            "655": 33307131904.0,
-            "660": 33308485632.0,
-            "665": 33307334656.0,
-            "670": 33307959296.0,
-            "675": 33307701248.0,
-            "680": 33306863616.0,
-            "685": 33306697728.0,
-            "690": 33307863040.0,
-            "695": 33307293696.0,
-            "700": 33306263552.0,
-            "705": 33306955776.0,
-            "710": 33308225536.0,
-            "715": 33307174912.0,
-            "720": 33307107328.0,
-            "725": 33307324416.0,
-            "730": 33308231680.0,
-            "735": 33307224064.0,
-            "740": 33307815936.0,
-            "745": 33307938816.0,
-            "750": 33307779072.0,
-            "755": 33308463104.0,
-            "760": 33306349568.0,
-            "765": 33308266496.0,
-            "770": 33306603520.0,
-            "775": 33307424768.0,
-            "780": 33308608512.0,
-            "785": 33307969536.0,
-            "790": 33308188672.0,
-            "795": 33307656192.0,
-            "800": 33307547648.0,
-            "805": 33307619328.0,
-            "810": 33307910144.0,
-            "815": 33307170816.0,
-            "820": 33307029504.0,
-            "825": 33307443200.0,
-            "830": 33307422720.0,
-            "835": 33307262976.0,
-            "840": 33307613184.0,
-            "845": 33307928576.0,
-            "850": 33306238976.0,
-            "855": 33307396096.0,
-            "860": 33307938816.0,
-            "865": 33307701248.0,
-            "870": 33307940864.0,
-            "875": 33307545600.0,
-            "880": 33307527168.0,
-            "885": 33307336704.0,
-            "890": 33308262400.0,
-            "895": 33307717632.0,
-            "900": 33306474496.0,
-            "905": 33307480064.0,
-            "910": 33307725824.0,
-            "915": 33308303360.0,
-            "920": 33307770880.0,
-            "925": 33307566080.0,
-            "930": 33307451392.0,
-            "935": 33307975680.0,
-            "940": 33306320896.0,
-            "945": 33306429440.0,
-            "950": 33307136000.0,
-            "955": 33307846656.0,
-            "960": 33307611136.0,
-            "965": 33307465728.0,
-            "970": 33308293120.0,
-            "975": 33307078656.0,
-            "980": 33307568128.0,
-            "985": 33307080704.0,
-            "990": 33307367424.0,
-            "995": 33306861568.0,
-            "1000": 33307889664.0,
-            "1005": 33305956352.0,
-            "1010": 33307508736.0,
-            "1015": 33306671104.0,
-            "1020": 33306669056.0,
-            "1025": 33306509312.0,
-            "1030": 33307117568.0,
-            "1035": 33308332032.0,
-            "1040": 33307353088.0,
-            "1045": 33308368896.0,
-            "1050": 33306615808.0,
-            "1055": 33306802176.0,
-            "1060": 33307103232.0,
-            "1065": 33307404288.0,
-            "1070": 33307070464.0,
-            "1075": 33308188672.0,
-            "1080": 33307011072.0,
-            "1085": 33307027456.0,
-            "1090": 33308086272.0,
-            "1095": 33307086848.0,
-            "1100": 33307287552.0,
-            "1105": 33308497920.0,
-            "1110": 33307461632.0,
-            "1115": 33307533312.0,
-            "1120": 33307777024.0,
-            "1125": 33307809792.0,
-            "1130": 33307484160.0,
-            "1135": 33308082176.0,
-            "1140": 33307029504.0,
-            "1145": 33307432960.0,
-            "1150": 33307574272.0,
-            "1155": 33307551744.0,
-            "1160": 33307561984.0,
-            "1165": 33307086848.0,
-            "1170": 33307856896.0,
-            "1175": 33306976256.0,
-            "1180": 33308237824.0,
-            "1185": 33307875328.0,
-            "1190": 33307369472.0,
-            "1195": 33308231680.0,
-            "1200": 33307197440.0,
-            "1205": 33307480064.0,
-            "1210": 33305866240.0,
-            "1215": 33308297216.0,
-            "1220": 33307451392.0,
-            "1225": 33307518976.0,
-            "1230": 33307688960.0,
-            "1235": 33307901952.0,
-            "1240": 33307394048.0,
-            "1245": 33307842560.0,
-            "1250": 33307281408.0,
-            "1255": 33306906624.0,
-            "1260": 33307301888.0,
-            "1265": 33307674624.0,
-            "1270": 33307150336.0,
-            "1275": 33307686912.0,
-            "1280": 33307430912.0,
-            "1285": 33306974208.0,
-            "1290": 33307529216.0,
-            "1295": 33307901952.0,
-            "1300": 33307002880.0,
-            "1305": 33308059648.0,
-            "1310": 33306939392.0,
-            "1315": 33307336704.0,
-            "1320": 33307262976.0,
-            "1325": 33307011072.0,
-            "1330": 33306550272.0,
-            "1335": 33307181056.0,
-            "1340": 33307406336.0,
-            "1345": 33307463680.0,
-            "1350": 33308135424.0,
-            "1355": 33307480064.0,
-            "1360": 33307533312.0,
-            "1365": 33307066368.0,
-            "1370": 33306595328.0,
-            "1375": 33307891712.0,
-            "1380": 33307830272.0,
-            "1385": 33308487680.0,
-            "1390": 33306521600.0,
-            "1395": 33307338752.0,
-            "1400": 33308430336.0,
-            "1405": 33307768832.0,
-            "1410": 33308041216.0,
-            "1415": 33307797504.0,
-            "1420": 33306605568.0,
-            "1425": 33307240448.0,
-            "1430": 33307322368.0,
-            "1435": 33307559936.0,
-            "1440": 33306662912.0,
-            "1445": 33307058176.0,
-            "1450": 33307705344.0,
-            "1455": 33307291648.0,
-            "1460": 33306861568.0,
-            "1465": 33306312704.0,
-            "1470": 33307394048.0,
-            "1475": 33307211776.0,
-            "1480": 33306527744.0,
-            "1485": 33307361280.0,
-            "1490": 33307693056.0,
-            "1495": 33307271168.0,
-            "1500": 33306820608.0,
-            "1505": 33307092992.0,
-            "1510": 33306624000.0,
-            "1515": 33307097088.0,
-            "1520": 33306931200.0,
-            "1525": 33307635712.0,
-            "1530": 33307353088.0,
-            "1535": 33306468352.0,
-            "1540": 33307172864.0,
-            "1545": 33307693056.0,
-            "1550": 33307938816.0,
-            "1555": 33307832320.0,
-            "1560": 33308182528.0,
-            "1565": 33307099136.0,
-            "1570": 33306798080.0,
-            "1575": 33307492352.0,
-            "1580": 33307688960.0,
-            "1585": 33307326464.0,
-            "1590": 33306988544.0,
-            "1595": 33306818560.0,
-            "1600": 33307836416.0,
-            "1605": 33307590656.0,
-            "1610": 33307168768.0,
-            "1615": 33306931200.0,
-            "1620": 33306732544.0,
-            "1625": 33308260352.0,
-            "1630": 33308227584.0,
-            "1635": 33306957824.0,
-            "1640": 33306759168.0,
-            "1645": 33306021888.0,
-            "1650": 33306689536.0,
-            "1655": 33307332608.0,
-            "1660": 33307170816.0,
-            "1665": 33306583040.0,
-            "1670": 33307535360.0,
-            "1675": 33306912768.0,
-            "1680": 33306675200.0,
-            "1685": 33307774976.0,
-            "1690": 33307783168.0,
-            "1695": 33307971584.0,
-            "1700": 33307623424.0,
-            "1705": 33307652096.0,
-            "1710": 33307731968.0,
-            "1715": 33308090368.0,
-            "1720": 33307172864.0,
-            "1725": 33307672576.0,
-            "1730": 33306355712.0,
-            "1735": 33308229632.0,
-            "1740": 33307142144.0,
-            "1745": 33308151808.0,
-            "1750": 33306898432.0,
-            "1755": 33307105280.0,
-            "1760": 33308000256.0,
-            "1765": 33307750400.0,
-            "1770": 33308450816.0,
-            "1775": 33308184576.0,
-            "1780": 33308129280.0,
-            "1785": 33307936768.0,
-            "1790": 33307238400.0,
-            "1795": 33307922432.0,
-            "1800": 33306900480.0,
-            "1805": 33307203584.0,
-            "1810": 33306923008.0,
-            "1815": 33307617280.0,
-            "1820": 33307664384.0,
-            "1825": 33308440576.0,
-            "1830": 33306843136.0,
-            "1835": 33307979776.0,
-            "1840": 33307588608.0,
-            "1845": 33307602944.0,
-            "1850": 33307774976.0,
-            "1855": 33307529216.0,
-            "1860": 33307054080.0,
-            "1865": 33307097088.0,
-            "1870": 33307373568.0,
-            "1875": 33306265600.0,
-            "1880": 33307275264.0,
-            "1885": 33307224064.0,
-            "1890": 33307324416.0,
-            "1895": 33307283456.0,
-            "1900": 33306810368.0,
-            "1905": 33307191296.0,
-            "1910": 33306884096.0,
-            "1915": 33308162048.0,
-            "1920": 33307664384.0,
-            "1925": 33305972736.0,
-            "1930": 33308504064.0,
-            "1935": 33307377664.0,
-            "1940": 33307119616.0,
-            "1945": 33307416576.0,
-            "1950": 33307746304.0,
-            "1955": 33307420672.0,
-            "1960": 33308073984.0,
-            "1965": 33307148288.0,
-            "1970": 33306775552.0,
-            "1975": 33308207104.0,
-            "1980": 33307473920.0,
-            "1985": 33307095040.0,
-            "1990": 33307527168.0,
-            "1995": 33307037696.0,
-            "2000": 33308801024.0,
-            "2005": 33307985920.0,
-            "2010": 33307516928.0,
-            "2015": 33307604992.0,
-            "2020": 33307406336.0,
-            "2025": 33307719680.0,
-            "2030": 33308381184.0,
-            "2035": 33307914240.0,
-            "2040": 33307324416.0,
-            "2045": 33306476544.0,
-            "2050": 33308246016.0,
-            "2055": 33307430912.0,
-            "2060": 33307912192.0,
-            "2065": 33307543552.0,
-            "2070": 33307670528.0,
-            "2075": 33307482112.0,
-            "2080": 33307871232.0,
-            "2085": 33306722304.0,
-            "2090": 33307549696.0,
-            "2095": 33307260928.0,
-            "2100": 33306765312.0,
-            "2105": 33306847232.0,
-            "2110": 33307332608.0,
-            "2115": 33306480640.0,
-            "2120": 33307168768.0,
-            "2125": 33307277312.0,
-            "2130": 33307314176.0,
-            "2135": 33307752448.0,
-            "2140": 33306710016.0,
-            "2145": 33307478016.0,
-            "2150": 33307729920.0,
-            "2155": 33306943488.0,
-            "2160": 33307508736.0,
-            "2165": 33307049984.0,
-            "2170": 33307158528.0,
-            "2175": 33306599424.0,
-            "2180": 33307054080.0,
-            "2185": 33307017216.0,
-            "2190": 33307119616.0,
-            "2195": 33307289600.0,
-            "2200": 33306726400.0,
-            "2205": 33306636288.0,
-            "2210": 33307639808.0,
-            "2215": 33308215296.0,
-            "2220": 33307314176.0,
-            "2225": 33307437056.0,
-            "2230": 33306318848.0,
-            "2235": 33306941440.0,
-            "2240": 33308131328.0,
-            "2245": 33307707392.0,
-            "2250": 33307256832.0,
-            "2255": 33306845184.0,
-            "2260": 33307736064.0,
-            "2265": 33308620800.0,
-            "2270": 33307357184.0,
-            "2275": 33308151808.0,
-            "2280": 33307981824.0,
-            "2285": 33307922432.0,
-            "2290": 33306767360.0,
-            "2295": 33307670528.0,
-            "2300": 33307179008.0,
-            "2305": 33307545600.0,
-            "2310": 33307924480.0,
-            "2315": 33307396096.0,
-            "2320": 33307725824.0,
-            "2325": 33308024832.0,
-            "2330": 33307793408.0,
-            "2335": 33307019264.0,
-            "2340": 33307162624.0,
-            "2345": 33307934720.0,
-            "2350": 33306232832.0,
-            "2355": 33307719680.0,
-            "2360": 33307375616.0,
-            "2365": 33306537984.0,
-            "2370": 33307279360.0,
-            "2375": 33308131328.0,
-            "2380": 33307136000.0,
-            "2385": 33307490304.0,
-            "2390": 33307316224.0,
-            "2395": 33306587136.0,
-            "2400": 33307594752.0,
-            "2405": 33308393472.0,
-            "2410": 33306726400.0,
-            "2415": 33307506688.0,
-            "2420": 33308407808.0,
-            "2425": 33307942912.0,
-            "2430": 33308116992.0,
-            "2435": 33307308032.0,
-            "2440": 33308362752.0,
-            "2445": 33308071936.0,
-            "2450": 33307740160.0,
-            "2455": 33307959296.0,
-            "2460": 33308258304.0,
-            "2465": 33307299840.0,
-            "2470": 33307056128.0,
-            "2475": 33307224064.0,
-            "2480": 33307713536.0,
-            "2485": 33306550272.0,
-            "2490": 33306992640.0,
-            "2495": 33307232256.0,
-            "2500": 33307095040.0,
-            "2505": 33307107328.0,
-            "2510": 33307488256.0,
-            "2515": 33308360704.0,
-            "2520": 33307369472.0,
-            "2525": 33306959872.0,
-            "2530": 33307258880.0,
-            "2535": 33307082752.0,
-            "2540": 33308633088.0,
-            "2545": 33308542976.0,
-            "2550": 33308002304.0,
-            "2555": 33307961344.0,
-            "2560": 33307328512.0,
-            "2565": 33308299264.0,
-            "2570": 33307770880.0,
-            "2575": 33307877376.0,
-            "2580": 33307990016.0,
-            "2585": 33308016640.0,
-            "2590": 33308135424.0,
-            "2595": 33307617280.0,
-            "2600": 33306667008.0,
-            "2605": 33307422720.0,
-            "2610": 33306683392.0,
-            "2615": 33308669952.0,
-            "2620": 33308616704.0,
-            "2625": 33308366848.0,
-            "2630": 33307574272.0,
-            "2635": 33308166144.0,
-            "2640": 33307983872.0,
-            "2645": 33307609088.0,
-            "2650": 33307807744.0,
-            "2655": 33306955776.0,
-            "2660": 33307273216.0,
-            "2665": 33307709440.0,
-            "2670": 33307693056.0,
-            "2675": 33307731968.0,
-            "2680": 33308227584.0,
-            "2685": 33307742208.0,
-            "2690": 33307734016.0,
-            "2695": 33307424768.0,
-            "2700": 33306644480.0,
-            "2705": 33306300416.0,
-            "2710": 33307881472.0,
-            "2715": 33307488256.0,
-            "2720": 33307318272.0,
-            "2725": 33307604992.0,
-            "2730": 33306710016.0,
-            "2735": 33308049408.0,
-            "2740": 33307437056.0,
-            "2745": 33307572224.0,
-            "2750": 33307136000.0,
-            "2755": 33307584512.0,
-            "2760": 33307355136.0,
-            "2765": 33307713536.0,
-            "2770": 33308000256.0,
-            "2775": 33306460160.0,
-            "2780": 33306923008.0,
-            "2785": 33307017216.0,
-            "2790": 33306720256.0,
-            "2795": 33307785216.0,
-            "2800": 33307234304.0,
-            "2805": 33306685440.0,
-            "2810": 33307469824.0,
-            "2815": 33308069888.0,
-            "2820": 33306460160.0,
-            "2825": 33307467776.0,
-            "2830": 33307666432.0,
-            "2835": 33307371520.0,
-            "2840": 33306904576.0,
-            "2845": 33308061696.0,
-            "2850": 33308520448.0,
-            "2855": 33307695104.0,
-            "2860": 33308487680.0,
-            "2865": 33307058176.0,
-            "2870": 33307303936.0,
-            "2875": 33307324416.0,
-            "2880": 33306968064.0,
-            "2885": 33307641856.0,
-            "2890": 33307785216.0,
-            "2895": 33308221440.0,
-            "2900": 33307596800.0,
-            "2905": 33307533312.0,
-            "2910": 33307459584.0,
-            "2915": 33307799552.0,
-            "2920": 33308461056.0,
-            "2925": 33307938816.0,
-            "2930": 33308268544.0,
-            "2935": 33308594176.0,
-            "2940": 33308170240.0,
-            "2945": 33307578368.0,
-            "2950": 33307590656.0,
-            "2955": 33308131328.0,
-            "2960": 33306839040.0,
-            "2965": 33307111424.0,
-            "2970": 33307570176.0,
-            "2975": 33307766784.0,
-            "2980": 33307600896.0,
-            "2985": 33307123712.0,
-            "2990": 33307641856.0,
-            "2995": 33307527168.0,
-            "3000": 33307863040.0,
-            "3005": 33306927104.0,
-            "3010": 33307738112.0,
-            "3015": 33308217344.0,
-            "3020": 33306697728.0,
-            "3025": 33306970112.0,
-            "3030": 33308127232.0,
-            "3035": 33308213248.0,
-            "3040": 33307578368.0,
-            "3045": 33308327936.0,
-            "3050": 33306910720.0,
-            "3055": 33307004928.0,
-            "3060": 33307602944.0,
-            "3065": 33306970112.0,
-            "3070": 33307985920.0,
-            "3075": 33306945536.0,
-            "3080": 33307312128.0,
-            "3085": 33306533888.0,
-            "3090": 33306933248.0,
-            "3095": 33307906048.0,
-            "3100": 33306793984.0,
-            "3105": 33307127808.0,
-            "3110": 33308295168.0,
-            "3115": 33307295744.0,
-            "3120": 33307897856.0,
-            "3125": 33307066368.0,
-            "3130": 33307781120.0,
-            "3135": 33307762688.0,
-            "3140": 33308196864.0,
-            "3145": 33306904576.0,
-            "3150": 33307140096.0,
-            "3155": 33306660864.0,
-            "3160": 33307514880.0,
-            "3165": 33307246592.0,
-            "3170": 33307613184.0,
-            "3175": 33307375616.0,
-            "3180": 33307551744.0,
-            "3185": 33307842560.0,
-            "3190": 33308342272.0,
-            "3195": 33308350464.0,
-            "3200": 33307799552.0,
-            "3205": 33307099136.0,
-            "3210": 33306869760.0,
-            "3215": 33307678720.0,
-            "3220": 33307111424.0,
-            "3225": 33307146240.0,
-            "3230": 33306972160.0,
-            "3235": 33307387904.0,
-            "3240": 33307521024.0,
-            "3245": 33307287552.0,
-            "3250": 33307523072.0,
-            "3255": 33307639808.0,
-            "3260": 33307092992.0,
-            "3265": 33308338176.0,
-            "3270": 33307273216.0,
-            "3275": 33307713536.0,
-            "3280": 33307719680.0,
-            "3285": 33308049408.0,
-            "3290": 33307484160.0,
-            "3295": 33307594752.0,
-            "3300": 33307228160.0,
-            "3305": 33306580992.0,
-            "3310": 33307541504.0,
-            "3315": 33307211776.0,
-            "3320": 33307324416.0,
-            "3325": 33306615808.0,
-            "3330": 33307777024.0,
-            "3335": 33308135424.0,
-            "3340": 33307351040.0,
-            "3345": 33307131904.0,
-            "3350": 33307031552.0,
-            "3355": 33307791360.0,
-            "3360": 33307410432.0,
-            "3365": 33307090944.0,
-            "3370": 33306187776.0,
-            "3375": 33307113472.0,
-            "3380": 33308071936.0,
-            "3385": 33307717632.0,
-            "3390": 33306648576.0,
-            "3395": 33306781696.0,
-            "3400": 33307734016.0,
-            "3405": 33307570176.0,
-            "3410": 33307750400.0,
-            "3415": 33307920384.0,
-            "3420": 33308157952.0,
-            "3425": 33307500544.0,
-            "3430": 33307168768.0,
-            "3435": 33307645952.0,
-            "3440": 33307185152.0,
-            "3445": 33307459584.0,
-            "3450": 33306804224.0,
-            "3455": 33307662336.0,
-            "3460": 33306748928.0,
-            "3465": 33306497024.0,
-            "3470": 33306796032.0,
-            "3475": 33307947008.0,
-            "3480": 33308039168.0,
-            "3485": 33307676672.0,
-            "3490": 33306728448.0,
-            "3495": 33307115520.0,
-            "3500": 33306628096.0,
-            "3505": 33307537408.0,
-            "3510": 33306945536.0,
-            "3515": 33306902528.0,
-            "3520": 33307553792.0,
-            "3525": 33307590656.0,
-            "3530": 33307852800.0,
-            "3535": 33306773504.0,
-            "3540": 33307953152.0,
-            "3545": 33307463680.0,
-            "3550": 33307123712.0,
-            "3555": 33307738112.0,
-            "3560": 33307766784.0,
-            "3565": 33307088896.0,
-            "3570": 33306882048.0,
-            "3575": 33307443200.0,
-            "3580": 33306951680.0,
-            "3585": 33306841088.0,
-            "3590": 33308293120.0,
-            "3595": 33307723776.0,
-            "3600": 33307756544.0,
-            "3605": 33307930624.0,
-            "3610": 33307985920.0,
-            "3615": 33307222016.0,
-            "3620": 33307430912.0,
-            "3625": 33307148288.0,
-            "3630": 33306388480.0,
-            "3635": 33307035648.0,
-            "3640": 33307455488.0,
-            "3645": 33306906624.0,
-            "3650": 33307545600.0,
-            "3655": 33307336704.0,
-            "3660": 33306910720.0,
-            "3665": 33307623424.0,
-            "3670": 33306824704.0,
-            "3675": 33307590656.0,
-            "3680": 33307373568.0,
-            "3685": 33306505216.0,
-            "3690": 33307817984.0,
-            "3695": 33306890240.0,
-            "3700": 33306802176.0,
-            "3705": 33306945536.0,
-            "3710": 33306904576.0,
-            "3715": 33307754496.0,
-            "3720": 33308395520.0,
-            "3725": 33308112896.0,
-            "3730": 33307652096.0,
-            "3735": 33307867136.0,
-            "3740": 33307805696.0,
-            "3745": 33308069888.0,
-            "3750": 33307826176.0,
-            "3755": 33306439680.0,
-            "3760": 33306849280.0,
-            "3765": 33307471872.0,
-            "3770": 33307095040.0,
-            "3775": 33307492352.0,
-            "3780": 33308141568.0,
-            "3785": 33307910144.0,
-            "3790": 33307656192.0,
-            "3795": 33307727872.0,
-            "3800": 33307246592.0,
-            "3805": 33307848704.0,
-            "3810": 33307490304.0,
-            "3815": 33307357184.0,
-            "3820": 33307346944.0,
-            "3825": 33307619328.0,
-            "3830": 33308102656.0,
-            "3835": 33306849280.0,
-            "3840": 33307678720.0,
-            "3845": 33307258880.0,
-            "3850": 33307686912.0,
-            "3855": 33307467776.0,
-            "3860": 33307471872.0,
-            "3865": 33307439104.0,
-            "3870": 33307676672.0,
-            "3875": 33306865664.0,
-            "3880": 33307232256.0,
-            "3885": 33307099136.0,
-            "3890": 33307854848.0,
-            "3895": 33306370048.0,
-            "3900": 33306900480.0,
-            "3905": 33306824704.0,
-            "3910": 33307361280.0,
-            "3915": 33306591232.0,
-            "3920": 33307213824.0,
-            "3925": 33306980352.0,
-            "3930": 33308110848.0,
-            "3935": 33307179008.0,
-            "3940": 33307379712.0,
-            "3945": 33307813888.0,
-            "3950": 33307277312.0,
-            "3955": 33307203584.0,
-            "3960": 33307234304.0,
-            "3965": 33307121664.0,
-            "3970": 33307303936.0,
-            "3975": 33307144192.0,
-            "3980": 33307869184.0,
-            "3985": 33307660288.0,
-            "3990": 33307779072.0,
-            "3995": 33307795456.0,
-            "4000": 33307131904.0,
-            "4005": 33307238400.0,
-            "4010": 33307875328.0,
-            "4015": 33306726400.0,
-            "4020": 33308227584.0,
-            "4025": 33307799552.0,
-            "4030": 33307318272.0,
-            "4035": 33308190720.0,
-            "4040": 33307932672.0,
-            "4045": 33307291648.0,
-            "4050": 33307959296.0,
-            "4055": 33307447296.0,
-            "4060": 33307486208.0,
-            "4065": 33308088320.0,
-            "4070": 33307183104.0,
-            "4075": 33307201536.0,
-            "4080": 33308184576.0,
-            "4085": 33306406912.0,
-            "4090": 33307891712.0,
-            "4095": 33307031552.0,
-            "4100": 33308100608.0,
-            "4105": 33307258880.0,
-            "4110": 33307492352.0,
-            "4115": 33308344320.0,
-            "4120": 33306552320.0,
-            "4125": 33307611136.0,
-            "4130": 33306083328.0,
-            "4135": 33308463104.0,
-            "4140": 33307611136.0,
-            "4145": 33307455488.0,
-            "4150": 33307658240.0,
-            "4155": 33307133952.0,
-            "4160": 33308233728.0,
-            "4165": 33307408384.0,
-            "4170": 33306888192.0,
-            "4175": 33307852800.0,
-            "4180": 33307150336.0,
-            "4185": 33307127808.0,
-            "4190": 33307582464.0,
-            "4195": 33308610560.0,
-            "4200": 33308231680.0,
-            "4205": 33307906048.0,
-            "4210": 33308307456.0,
-            "4215": 33306363904.0,
-            "4220": 33306980352.0,
-            "4225": 33306318848.0,
-            "4230": 33307731968.0,
-            "4235": 33307142144.0,
-            "4240": 33307432960.0,
-            "4245": 33307097088.0,
-            "4250": 33307783168.0,
-            "4255": 33307365376.0,
-            "4260": 33306947584.0,
-            "4265": 33306611712.0,
-            "4270": 33306347520.0,
-            "4275": 33306624000.0,
-            "4280": 33307185152.0,
-            "4285": 33307922432.0,
-            "4290": 33307508736.0,
-            "4295": 33307658240.0,
-            "4300": 33308405760.0,
-            "4305": 33306474496.0,
-            "4310": 33307557888.0,
-            "4315": 33308307456.0,
-            "4320": 33307719680.0,
-            "4325": 33306824704.0,
-            "4330": 33307594752.0,
-            "4335": 33306144768.0,
-            "4340": 33307852800.0,
-            "4345": 33307342848.0,
-            "4350": 33308139520.0,
-            "4355": 33307713536.0,
-            "4360": 33307373568.0,
-            "4365": 33308065792.0,
-            "4370": 33306681344.0,
-            "4375": 33307770880.0,
-            "4380": 33307361280.0,
-            "4385": 33307086848.0,
-            "4390": 33307019264.0,
-            "4395": 33306986496.0,
-            "4400": 33307103232.0,
-            "4405": 33307664384.0,
-            "4410": 33307996160.0,
-            "4415": 33306990592.0,
-            "4420": 33306546176.0,
-            "4425": 33306904576.0,
-            "4430": 33307303936.0,
-            "4435": 33306763264.0,
-            "4440": 33308063744.0,
-            "4445": 33307242496.0,
-            "4450": 33307283456.0,
-            "4455": 33306654720.0,
-            "4460": 33307205632.0,
-            "4465": 33306867712.0,
-            "4470": 33307916288.0,
-            "4475": 33307791360.0,
-            "4480": 33308450816.0,
-            "4485": 33307547648.0,
-            "4490": 33307090944.0,
-            "4495": 33307000832.0,
-            "4500": 33306935296.0,
-            "4505": 33307099136.0,
-            "4510": 33307525120.0,
-            "4515": 33307367424.0,
-            "4520": 33307813888.0,
-            "4525": 33307715584.0,
-            "4530": 33307901952.0,
-            "4535": 33307174912.0,
-            "4540": 33306880000.0,
-            "4545": 33307138048.0,
-            "4550": 33306873856.0,
-            "4555": 33306316800.0,
-            "4560": 33305849856.0,
-            "4565": 33307187200.0,
-            "4570": 33307260928.0,
-            "4575": 33307410432.0,
-            "4580": 33307201536.0,
-            "4585": 33306920960.0,
-            "4590": 33307355136.0,
-            "4595": 33307346944.0,
-            "4600": 33307856896.0,
-            "4605": 33307752448.0,
-            "4610": 33307095040.0,
-            "4615": 33306286080.0,
-            "4620": 33306699776.0,
-            "4625": 33308069888.0,
-            "4630": 33307439104.0,
-            "4635": 33306900480.0,
-            "4640": 33307076608.0,
-            "4645": 33308160000.0,
-            "4650": 33307758592.0,
-            "4655": 33307865088.0,
-            "4660": 33306255360.0,
-            "4665": 33307641856.0,
-            "4670": 33307912192.0,
-            "4675": 33306603520.0,
-            "4680": 33307799552.0,
-            "4685": 33307488256.0,
-            "4690": 33307394048.0,
-            "4695": 33306763264.0,
-            "4700": 33307873280.0,
-            "4705": 33308106752.0,
-            "4710": 33307617280.0,
-            "4715": 33307047936.0,
-            "4720": 33307901952.0,
-            "4725": 33307793408.0,
-            "4730": 33308123136.0,
-            "4735": 33307451392.0,
-            "4740": 33307623424.0,
-            "4745": 33306857472.0,
-            "4750": 33308436480.0,
-            "4755": 33307260928.0,
-            "4760": 33307975680.0,
-            "4765": 33307965440.0,
-            "4770": 33306859520.0,
-            "4775": 33307922432.0,
-            "4780": 33306978304.0,
-            "4785": 33306869760.0,
-            "4790": 33307084800.0,
-            "4795": 33307226112.0,
-            "4800": 33307961344.0,
-            "4805": 33308334080.0,
-            "4810": 33305587712.0,
-            "4815": 33307928576.0,
-            "4820": 33307875328.0,
-            "4825": 33306957824.0,
-            "4830": 33307797504.0,
-            "4835": 33306116096.0,
-            "4840": 33307654144.0,
-            "4845": 33307131904.0,
-            "4850": 33308055552.0,
-            "4855": 33305792512.0,
-            "4860": 33307402240.0,
-            "4865": 33307086848.0,
-            "4870": 33307637760.0,
-            "4875": 33307789312.0,
-            "4880": 33307701248.0,
-            "4885": 33308010496.0,
-            "4890": 33307039744.0,
-            "4895": 33307369472.0,
-            "4900": 33307127808.0,
-            "4905": 33306988544.0,
-            "4910": 33308276736.0,
-            "4915": 33307090944.0,
-            "4920": 33307015168.0,
-            "4925": 33308043264.0,
-            "4930": 33307607040.0,
-            "4935": 33308209152.0,
-            "4940": 33307725824.0,
-            "4945": 33307985920.0,
-            "4950": 33307582464.0,
-            "4955": 33307297792.0,
-            "4960": 33307639808.0,
-            "4965": 33307445248.0,
-            "4970": 33306869760.0,
-            "4975": 33306787840.0,
-            "4980": 33307099136.0,
-            "4985": 33307635712.0,
-            "4990": 33307406336.0,
-            "4995": 33307471872.0,
-            "5000": 33307375616.0,
-            "5005": 33307672576.0,
-            "5010": 33306970112.0,
-            "5015": 33307244544.0,
-            "5020": 33306966016.0,
-            "5025": 33307705344.0,
-            "5030": 33307463680.0,
-            "5035": 33306818560.0,
-            "5040": 33306972160.0,
-            "5045": 33308157952.0,
-            "5050": 33306376192.0,
-            "5055": 33307594752.0,
-            "5060": 33308471296.0,
-            "5065": 33307455488.0,
-            "5070": 33307301888.0,
-            "5075": 33307488256.0,
-            "5080": 33307910144.0,
-            "5085": 33307635712.0,
-            "5090": 33307406336.0,
-            "5095": 33307254784.0,
-            "5100": 33306828800.0,
-            "5105": 33307852800.0,
-            "5110": 33308258304.0,
-            "5115": 33307228160.0,
-            "5120": 33307955200.0,
-            "5125": 33305640960.0,
-            "5130": 33306683392.0,
-            "5135": 33307336704.0,
-            "5140": 33307834368.0,
-            "5145": 33307060224.0,
-            "5150": 33307023360.0,
-            "5155": 33307308032.0,
-            "5160": 33306664960.0,
-            "5165": 33307123712.0,
-            "5170": 33306935296.0,
-            "5175": 33308094464.0,
-            "5180": 33306566656.0,
-            "5185": 33306796032.0,
-            "5190": 33307545600.0,
-            "5195": 33308067840.0,
-            "5200": 33307754496.0,
-            "5205": 33307445248.0,
-            "5210": 33306785792.0,
-            "5215": 33307551744.0,
-            "5220": 33308188672.0,
-            "5225": 33307338752.0,
-            "5230": 33307283456.0,
-            "5235": 33306976256.0,
-            "5240": 33308041216.0,
-            "5245": 33308340224.0,
-            "5250": 33308153856.0,
-            "5255": 33307590656.0,
-            "5260": 33306896384.0,
-            "5265": 33308303360.0,
-            "5270": 33308796928.0,
-            "5275": 33307949056.0,
-            "5280": 33306157056.0,
-            "5285": 33307904000.0,
-            "5290": 33308143616.0,
-            "5295": 33306533888.0,
-            "5300": 33307912192.0,
-            "5305": 33308338176.0,
-            "5310": 33308688384.0,
-            "5315": 33308045312.0,
-            "5320": 33306206208.0,
-            "5325": 33308219392.0,
-            "5330": 33308012544.0,
-            "5335": 33307602944.0,
-            "5340": 33306685440.0,
-            "5345": 33308209152.0,
-            "5350": 33307150336.0,
-            "5355": 33308176384.0,
-            "5360": 33307273216.0,
-            "5365": 33307850752.0,
-            "5370": 33307222016.0,
-            "5375": 33307803648.0,
-            "5380": 33307617280.0,
-            "5385": 33307179008.0,
-            "5390": 33307389952.0,
-            "5395": 33306927104.0,
-            "5400": 33307518976.0,
-            "5405": 33307400192.0,
-            "5410": 33307598848.0,
-            "5415": 33307846656.0,
-            "5420": 33307490304.0,
-            "5425": 33307459584.0,
-            "5430": 33307283456.0,
-            "5435": 33307453440.0,
-            "5440": 33307383808.0,
-            "5445": 33307117568.0,
-            "5450": 33307832320.0,
-            "5455": 33307582464.0,
-            "5460": 33306963968.0,
-            "5465": 33306947584.0,
-            "5470": 33307355136.0,
-            "5475": 33306748928.0,
-            "5480": 33306435584.0,
-            "5485": 33307590656.0,
-            "5490": 33307787264.0,
-            "5495": 33307568128.0,
-            "5500": 33307351040.0,
-            "5505": 33307568128.0,
-            "5510": 33307426816.0,
-            "5515": 33307451392.0,
-            "5520": 33307549696.0,
-            "5525": 33307000832.0,
-            "5530": 33307566080.0,
-            "5535": 33307664384.0,
-            "5540": 33306966016.0,
-            "5545": 33307781120.0,
-            "5550": 33307275264.0,
-            "5555": 33307269120.0,
-            "5560": 33307576320.0,
-            "5565": 33307377664.0,
-            "5570": 33307052032.0,
-            "5575": 33306978304.0,
-            "5580": 33307965440.0,
-            "5585": 33307494400.0,
-            "5590": 33308055552.0,
-            "5595": 33306943488.0,
-            "5600": 33306542080.0,
-            "5605": 33307680768.0,
-            "5610": 33308542976.0,
-            "5615": 33307826176.0,
-            "5620": 33308108800.0,
-            "5625": 33308225536.0,
-            "5630": 33308069888.0,
-            "5635": 33307760640.0,
-            "5640": 33307500544.0,
-            "5645": 33307930624.0,
-            "5650": 33306755072.0,
-            "5655": 33308192768.0,
-            "5660": 33308631040.0,
-            "5665": 33307418624.0,
-            "5670": 33307504640.0,
-            "5675": 33307715584.0,
-            "5680": 33307910144.0,
-            "5685": 33307996160.0,
-            "5690": 33307478016.0,
-            "5695": 33308164096.0,
-            "5700": 33307906048.0,
-            "5705": 33307750400.0,
-            "5710": 33306779648.0,
-            "5715": 33307219968.0,
-            "5720": 33307750400.0,
-            "5725": 33307537408.0,
-            "5730": 33307262976.0,
-            "5735": 33306767360.0,
-            "5740": 33307508736.0,
-            "5745": 33306753024.0,
-            "5750": 33306636288.0,
-            "5755": 33306943488.0,
-            "5760": 33307553792.0,
-            "5765": 33307842560.0,
-            "5770": 33307047936.0,
-            "5775": 33307348992.0,
-            "5780": 33306361856.0,
-            "5785": 33307709440.0,
-            "5790": 33307832320.0,
-            "5795": 33307406336.0,
-            "5800": 33307056128.0,
-            "5805": 33307631616.0,
-            "5810": 33307766784.0,
-            "5815": 33307971584.0,
-            "5820": 33307447296.0,
-            "5825": 33307084800.0,
-            "5830": 33307324416.0,
-            "5835": 33307127808.0,
-            "5840": 33307729920.0,
-            "5845": 33307088896.0,
-            "5850": 33307635712.0,
-            "5855": 33307119616.0,
-            "5860": 33306703872.0,
-            "5865": 33307291648.0,
-            "5870": 33307613184.0,
-            "5875": 33307893760.0,
-            "5880": 33307893760.0,
-            "5885": 33307301888.0,
-            "5890": 33307830272.0,
-            "5895": 33306671104.0,
-            "5900": 33306488832.0,
-            "5905": 33308141568.0,
-            "5910": 33307373568.0,
-            "5915": 33307330560.0,
-            "5920": 33307656192.0,
-            "5925": 33307533312.0,
-            "5930": 33307848704.0,
-            "5935": 33307586560.0,
-            "5940": 33307602944.0,
-            "5945": 33307631616.0,
-            "5950": 33306615808.0,
-            "5955": 33307719680.0,
-            "5960": 33308553216.0,
-            "5965": 33308676096.0,
-            "5970": 33308313600.0,
-            "5975": 33306810368.0,
-            "5980": 33307222016.0,
-            "5985": 33307367424.0,
-            "5990": 33307119616.0,
-            "5995": 33307166720.0,
-            "6000": 33307822080.0,
-            "6005": 33307553792.0,
-            "6010": 33307756544.0,
-            "6015": 33306392576.0,
-            "6020": 33308116992.0,
-            "6025": 33307738112.0,
-            "6030": 33307459584.0,
-            "6035": 33306920960.0,
-            "6040": 33307701248.0,
-            "6045": 33307932672.0,
-            "6050": 33307496448.0,
-            "6055": 33307133952.0,
-            "6060": 33306370048.0,
-            "6065": 33307521024.0,
-            "6070": 33307244544.0,
-            "6075": 33306447872.0,
-            "6080": 33306963968.0,
-            "6085": 33307932672.0,
-            "6090": 33307293696.0,
-            "6095": 33307058176.0,
-            "6100": 33307449344.0,
-            "6105": 33307613184.0,
-            "6110": 33307779072.0,
-            "6115": 33306832896.0,
-            "6120": 33306732544.0,
-            "6125": 33306488832.0,
-            "6130": 33308866560.0,
-            "6135": 33308000256.0,
-            "6140": 33307906048.0,
-            "6145": 33308504064.0,
-            "6150": 33307826176.0,
-            "6155": 33306906624.0,
-            "6160": 33307533312.0,
-            "6165": 33307578368.0,
-            "6170": 33307891712.0,
-            "6175": 33307537408.0,
-            "6180": 33307803648.0,
-            "6185": 33308125184.0,
-            "6190": 33307342848.0,
-            "6195": 33308135424.0,
-            "6200": 33306468352.0,
-            "6205": 33308026880.0,
-            "6210": 33308028928.0,
-            "6215": 33308157952.0,
-            "6220": 33307662336.0,
-            "6225": 33307344896.0,
-            "6230": 33308231680.0,
-            "6235": 33307148288.0,
-            "6240": 33308809216.0,
-            "6245": 33307017216.0,
-            "6250": 33307234304.0,
-            "6255": 33308430336.0,
-            "6260": 33307246592.0,
-            "6265": 33307418624.0,
-            "6270": 33308319744.0,
-            "6275": 33307090944.0,
-            "6280": 33307404288.0,
-            "6285": 33308227584.0,
-            "6290": 33307656192.0,
-            "6295": 33306865664.0,
-            "6300": 33307596800.0,
-            "6305": 33308192768.0,
-            "6310": 33307695104.0,
-            "6315": 33307361280.0,
-            "6320": 33306775552.0,
-            "6325": 33307557888.0,
-            "6330": 33307639808.0,
-            "6335": 33307820032.0,
-            "6340": 33307410432.0,
-            "6345": 33307410432.0,
-            "6350": 33308256256.0,
-            "6355": 33307082752.0,
-            "6360": 33306855424.0,
-            "6365": 33307418624.0,
-            "6370": 33307066368.0,
-            "6375": 33307891712.0,
-            "6380": 33307779072.0,
-            "6385": 33306128384.0,
-            "6390": 33306884096.0,
-            "6395": 33307060224.0,
-            "6400": 33307250688.0,
-            "6405": 33308135424.0,
-            "6410": 33308155904.0,
-            "6415": 33307101184.0,
-            "6420": 33306318848.0,
-            "6425": 33308065792.0,
-            "6430": 33307813888.0,
-            "6435": 33307842560.0,
-            "6440": 33308571648.0,
-            "6445": 33306138624.0,
-            "6450": 33307762688.0,
-            "6455": 33308119040.0,
-            "6460": 33308037120.0,
-            "6465": 33308467200.0,
-            "6470": 33307181056.0,
-            "6475": 33307246592.0,
-            "6480": 33306855424.0,
-            "6485": 33308440576.0,
-            "6490": 33307863040.0,
-            "6495": 33306857472.0,
-            "6500": 33306529792.0,
-            "6505": 33307097088.0,
-            "6510": 33307842560.0,
-            "6515": 33307095040.0,
-            "6520": 33307848704.0,
-            "6525": 33307596800.0,
-            "6530": 33307117568.0,
-            "6535": 33307811840.0,
-            "6540": 33307645952.0,
-            "6545": 33307211776.0,
-            "6550": 33308196864.0,
-            "6555": 33307213824.0,
-            "6560": 33307326464.0,
-            "6565": 33306490880.0,
-            "6570": 33306877952.0,
-            "6575": 33307199488.0,
-            "6580": 33308370944.0,
-            "6585": 33307828224.0,
-            "6590": 33307871232.0,
-            "6595": 33307590656.0,
-            "6600": 33306578944.0,
-            "6605": 33307496448.0,
-            "6610": 33307912192.0,
-            "6615": 33307521024.0,
-            "6620": 33307189248.0,
-            "6625": 33306961920.0,
-            "6630": 33306800128.0,
-            "6635": 33306957824.0,
-            "6640": 33307762688.0,
-            "6645": 33306427392.0,
-            "6650": 33307672576.0,
-            "6655": 33305133056.0,
-            "6660": 33307598848.0,
-            "6665": 33306884096.0,
-            "6670": 33307500544.0,
-            "6675": 33307592704.0,
-            "6680": 33306923008.0,
-            "6685": 33307084800.0,
-            "6690": 33307402240.0,
-            "6695": 33307963392.0,
-            "6700": 33307336704.0,
-            "6705": 33306845184.0,
-            "6710": 33307230208.0,
-            "6715": 33306310656.0,
-            "6720": 33307834368.0,
-            "6725": 33308094464.0,
-            "6730": 33308327936.0,
-            "6735": 33308092416.0,
-            "6740": 33306873856.0,
-            "6745": 33308082176.0,
-            "6750": 33306112000.0,
-            "6755": 33306810368.0,
-            "6760": 33307394048.0,
-            "6765": 33307414528.0,
-            "6770": 33308286976.0,
-            "6775": 33308618752.0,
-            "6780": 33306904576.0,
-            "6785": 33308182528.0,
-            "6790": 33308057600.0,
-            "6795": 33307049984.0,
-            "6800": 33306744832.0,
-            "6805": 33307242496.0,
-            "6810": 33307176960.0,
-            "6815": 33307779072.0,
-            "6820": 33306849280.0,
-            "6825": 33307623424.0,
-            "6830": 33307887616.0,
-            "6835": 33307670528.0,
-            "6840": 33308348416.0,
-            "6845": 33308184576.0,
-            "6850": 33307727872.0,
-            "6855": 33307252736.0,
-            "6860": 33307680768.0,
-            "6865": 33306963968.0,
-            "6870": 33307099136.0,
-            "6875": 33307037696.0,
-            "6880": 33307635712.0,
-            "6885": 33307615232.0,
-            "6890": 33307652096.0,
-            "6895": 33307369472.0,
-            "6900": 33307947008.0,
-            "6905": 33307334656.0,
-            "6910": 33306824704.0,
-            "6915": 33307537408.0,
-            "6920": 33306619904.0,
-            "6925": 33306408960.0,
-            "6930": 33306765312.0,
-            "6935": 33306609664.0,
-            "6940": 33307623424.0,
-            "6945": 33307160576.0,
-            "6950": 33307463680.0,
-            "6955": 33306507264.0,
-            "6960": 33307185152.0,
-            "6965": 33307019264.0,
-            "6970": 33307598848.0,
-            "6975": 33307435008.0,
-            "6980": 33307238400.0,
-            "6985": 33306222592.0,
-            "6990": 33308581888.0,
-            "6995": 33307254784.0,
-            "7000": 33308035072.0,
-            "7005": 33308233728.0,
-            "7010": 33307092992.0,
-            "7015": 33307193344.0,
-            "7020": 33307643904.0,
-            "7025": 33308274688.0,
-            "7030": 33307019264.0,
-            "7035": 33308454912.0,
-            "7040": 33308086272.0,
-            "7045": 33307277312.0,
-            "7050": 33307172864.0,
-            "7055": 33306599424.0,
-            "7060": 33307613184.0,
-            "7065": 33307031552.0,
-            "7070": 33306243072.0,
-            "7075": 33308037120.0,
-            "7080": 33306759168.0,
-            "7085": 33308033024.0,
-            "7090": 33307971584.0,
-            "7095": 33306873856.0,
-            "7100": 33308522496.0,
-            "7105": 33307363328.0,
-            "7110": 33308063744.0,
-            "7115": 33307770880.0,
-            "7120": 33307906048.0,
-            "7125": 33307443200.0,
-            "7130": 33307574272.0,
-            "7135": 33307541504.0,
-            "7140": 33306765312.0,
-            "7145": 33307854848.0,
-            "7150": 33306853376.0,
-            "7155": 33307856896.0,
-            "7160": 33307906048.0,
-            "7165": 33308184576.0,
-            "7170": 33308272640.0,
-            "7175": 33306417152.0,
-            "7180": 33307107328.0,
-            "7185": 33307860992.0,
-            "7190": 33307078656.0,
-            "7195": 33307494400.0,
-            "7200": 33307613184.0,
-            "7205": 33307680768.0,
-            "7210": 33307990016.0,
-            "7215": 33306822656.0,
-            "7220": 33306730496.0,
-            "7225": 33307539456.0,
-            "7230": 33307744256.0,
-            "7235": 33306136576.0,
-            "7240": 33307189248.0,
-            "7245": 33307236352.0,
-            "7250": 33306980352.0,
-            "7255": 33307832320.0,
-            "7260": 33307426816.0,
-            "7265": 33307340800.0,
-            "7270": 33307844608.0,
-            "7275": 33308094464.0,
-            "7280": 33308602368.0,
-            "7285": 33307498496.0,
-            "7290": 33307920384.0,
-            "7295": 33307426816.0,
-            "7300": 33306392576.0,
-            "7305": 33306718208.0,
-            "7310": 33307260928.0,
-            "7315": 33307527168.0,
-            "7320": 33306963968.0,
-            "7325": 33308188672.0,
-            "7330": 33307799552.0,
-            "7335": 33307717632.0,
-            "7340": 33307238400.0,
-            "7345": 33307365376.0,
-            "7350": 33307314176.0,
-            "7355": 33307940864.0,
-            "7360": 33306284032.0,
-            "7365": 33307893760.0,
-            "7370": 33306275840.0,
-            "7375": 33307873280.0,
-            "7380": 33309245440.0,
-            "7385": 33306730496.0,
-            "7390": 33307758592.0,
-            "7395": 33306609664.0,
-            "7400": 33307652096.0,
-            "7405": 33306427392.0,
-            "7410": 33308524544.0,
-            "7415": 33307961344.0,
-            "7420": 33307242496.0,
-            "7425": 33307811840.0,
-            "7430": 33307119616.0,
-            "7435": 33307428864.0,
-            "7440": 33307709440.0,
-            "7445": 33308342272.0,
-            "7450": 33306980352.0,
-            "7455": 33307351040.0,
-            "7460": 33306730496.0,
-            "7465": 33306537984.0,
-            "7470": 33307664384.0,
-            "7475": 33308037120.0,
-            "7480": 33307179008.0,
-            "7485": 33308467200.0,
-            "7490": 33307822080.0,
-            "7495": 33306638336.0,
-            "7500": 33306689536.0,
-            "7505": 33307717632.0,
-            "7510": 33306789888.0,
-            "7515": 33307518976.0,
-            "7520": 33307260928.0,
-            "7525": 33307676672.0,
-            "7530": 33306916864.0,
-            "7535": 33306996736.0,
-            "7540": 33306566656.0,
-            "7545": 33306720256.0,
-            "7550": 33307584512.0,
-            "7555": 33307471872.0,
-            "7560": 33306736640.0,
-            "7565": 33306292224.0,
-            "7570": 33307066368.0,
-            "7575": 33306871808.0,
-            "7580": 33307324416.0,
-            "7585": 33307115520.0,
-            "7590": 33306341376.0,
-            "7595": 33307744256.0,
-            "7600": 33307482112.0,
-            "7605": 33308149760.0,
-            "7610": 33307525120.0,
-            "7615": 33307656192.0,
-            "7620": 33307224064.0,
-            "7625": 33307158528.0,
-            "7630": 33307742208.0,
-            "7635": 33308012544.0,
-            "7640": 33307049984.0,
-            "7645": 33308631040.0,
-            "7650": 33307865088.0,
-            "7655": 33308229632.0,
-            "7660": 33307043840.0,
-            "7665": 33307037696.0,
-            "7670": 33306791936.0,
-            "7675": 33307320320.0,
-            "7680": 33307293696.0,
-            "7685": 33307432960.0,
-            "7690": 33307103232.0,
-            "7695": 33307568128.0,
-            "7700": 33306312704.0,
-            "7705": 33307795456.0,
-            "7710": 33307996160.0,
-            "7715": 33307133952.0,
-            "7720": 33308164096.0,
-            "7725": 33307254784.0,
-            "7730": 33307830272.0,
-            "7735": 33307721728.0,
-            "7740": 33307492352.0,
-            "7745": 33307783168.0,
-            "7750": 33306728448.0,
-            "7755": 33307734016.0,
-            "7760": 33308614656.0,
-            "7765": 33306791936.0,
-            "7770": 33308278784.0,
-            "7775": 33307873280.0,
-            "7780": 33307078656.0,
-            "7785": 33306990592.0,
-            "7790": 33307062272.0,
-            "7795": 33307680768.0,
-            "7800": 33306982400.0,
-            "7805": 33308090368.0,
-            "7810": 33307308032.0,
-            "7815": 33307078656.0,
-            "7820": 33307951104.0,
-            "7825": 33306480640.0,
-            "7830": 33307258880.0,
-            "7835": 33307891712.0,
-            "7840": 33307432960.0,
-            "7845": 33307066368.0,
-            "7850": 33306910720.0,
-            "7855": 33307938816.0,
-            "7860": 33307308032.0,
-            "7865": 33308264448.0,
-            "7870": 33307729920.0,
-            "7875": 33308129280.0,
-            "7880": 33308352512.0,
-            "7885": 33307398144.0,
-            "7890": 33306920960.0,
-            "7895": 33307156480.0,
-            "7900": 33308221440.0,
-            "7905": 33308047360.0,
-            "7910": 33306146816.0,
-            "7915": 33306910720.0,
-            "7920": 33307090944.0,
-            "7925": 33308264448.0,
-            "7930": 33307908096.0,
-            "7935": 33307465728.0,
-            "7940": 33307375616.0,
-            "7945": 33307848704.0,
-            "7950": 33308090368.0,
-            "7955": 33307043840.0,
-            "7960": 33307168768.0,
-            "7965": 33307846656.0,
-            "7970": 33306454016.0,
-            "7975": 33307635712.0,
-            "7980": 33307555840.0,
-            "7985": 33307131904.0,
-            "7990": 33306732544.0,
-            "7995": 33307430912.0,
-            "8000": 33307674624.0,
-            "8005": 33307746304.0,
-            "8010": 33308002304.0,
-            "8015": 33306906624.0,
-            "8020": 33307895808.0,
-            "8025": 33308231680.0,
-            "8030": 33307664384.0,
-            "8035": 33306888192.0,
-            "8040": 33308024832.0,
-            "8045": 33307693056.0,
-            "8050": 33306583040.0,
-            "8055": 33307201536.0,
-            "8060": 33307594752.0,
-            "8065": 33308260352.0,
-            "8070": 33307426816.0,
-            "8075": 33308108800.0,
-            "8080": 33308178432.0,
-            "8085": 33307308032.0,
-            "8090": 33306513408.0,
-            "8095": 33306968064.0,
-            "8100": 33308413952.0,
-            "8105": 33308241920.0,
-            "8110": 33307471872.0,
-            "8115": 33307832320.0,
-            "8120": 33307193344.0,
-            "8125": 33307295744.0,
-            "8130": 33306775552.0,
-            "8135": 33307097088.0,
-            "8140": 33307865088.0,
-            "8145": 33306746880.0,
-            "8150": 33307023360.0,
-            "8155": 33306806272.0,
-            "8160": 33307373568.0,
-            "8165": 33307631616.0,
-            "8170": 33306769408.0,
-            "8175": 33308239872.0,
-            "8180": 33307240448.0,
-            "8185": 33307471872.0,
-            "8190": 33308184576.0,
-            "8195": 33307754496.0,
-            "8200": 33307459584.0,
-            "8205": 33307850752.0,
-            "8210": 33306810368.0,
-            "8215": 33306222592.0,
-            "8220": 33307795456.0,
-            "8225": 33308078080.0,
-            "8230": 33306132480.0,
-            "8235": 33308764160.0,
-            "8240": 33307432960.0,
-            "8245": 33307867136.0,
-            "8250": 33308260352.0,
-            "8255": 33308334080.0,
-            "8260": 33308233728.0,
-            "8265": 33308528640.0,
-            "8270": 33307699200.0,
-            "8275": 33306748928.0,
-            "8280": 33307635712.0,
-            "8285": 33308008448.0,
-            "8290": 33307590656.0,
-            "8295": 33308041216.0,
-            "8300": 33307516928.0,
-            "8305": 33307879424.0,
-            "8310": 33307576320.0,
-            "8315": 33308366848.0,
-            "8320": 33307496448.0,
-            "8325": 33307256832.0,
-            "8330": 33307680768.0,
-            "8335": 33306669056.0,
-            "8340": 33306990592.0,
-            "8345": 33307936768.0,
-            "8350": 33307955200.0,
-            "8355": 33307791360.0,
-            "8360": 33306640384.0,
-            "8365": 33307586560.0,
-            "8370": 33307648000.0,
-            "8375": 33306890240.0,
-            "8380": 33307764736.0,
-            "8385": 33307871232.0,
-            "8390": 33307023360.0,
-            "8395": 33307664384.0,
-            "8400": 33307510784.0,
-            "8405": 33307338752.0,
-            "8410": 33307316224.0,
-            "8415": 33307566080.0,
-            "8420": 33307891712.0,
-            "8425": 33307676672.0,
-            "8430": 33307693056.0,
-            "8435": 33306812416.0,
-            "8440": 33307762688.0,
-            "8445": 33307447296.0,
-            "8450": 33307426816.0,
-            "8455": 33306660864.0,
-            "8460": 33307385856.0,
-            "8465": 33308121088.0,
-            "8470": 33307664384.0,
-            "8475": 33307023360.0,
-            "8480": 33308082176.0,
-            "8485": 33307346944.0,
-            "8490": 33307471872.0,
-            "8495": 33307889664.0,
-            "8500": 33307492352.0,
-            "8505": 33307502592.0,
-            "8510": 33307815936.0,
-            "8515": 33307983872.0,
-            "8520": 33306431488.0,
-            "8525": 33306537984.0,
-            "8530": 33307199488.0,
-            "8535": 33307848704.0,
-            "8540": 33307459584.0,
-            "8545": 33307432960.0,
-            "8550": 33307600896.0,
-            "8555": 33308553216.0,
-            "8560": 33307701248.0,
-            "8565": 33307799552.0,
-            "8570": 33307934720.0,
-            "8575": 33306324992.0,
-            "8580": 33307648000.0,
-            "8585": 33307951104.0,
-            "8590": 33308108800.0,
-            "8595": 33308037120.0,
-            "8600": 33308182528.0,
-            "8605": 33307410432.0,
-            "8610": 33308102656.0,
-            "8615": 33307342848.0,
-            "8620": 33306077184.0,
-            "8625": 33308153856.0,
-            "8630": 33307807744.0,
-            "8635": 33306734592.0,
-            "8640": 33307867136.0,
-            "8645": 33307129856.0,
-            "8650": 33307430912.0,
-            "8655": 33307545600.0,
-            "8660": 33307975680.0,
-            "8665": 33307822080.0,
-            "8670": 33307156480.0,
-            "8675": 33307758592.0,
-            "8680": 33308340224.0,
-            "8685": 33307357184.0,
-            "8690": 33308479488.0,
-            "8695": 33306523648.0,
-            "8700": 33307404288.0,
-            "8705": 33307791360.0,
-            "8710": 33308004352.0,
-            "8715": 33308108800.0,
-            "8720": 33307424768.0,
-            "8725": 33307564032.0,
-            "8730": 33306877952.0,
-            "8735": 33307199488.0,
-            "8740": 33307734016.0,
-            "8745": 33307248640.0,
-            "8750": 33307912192.0,
-            "8755": 33307215872.0,
-            "8760": 33308012544.0,
-            "8765": 33306640384.0,
-            "8770": 33307977728.0,
-            "8775": 33306624000.0,
-            "8780": 33307357184.0,
-            "8785": 33306353664.0,
-            "8790": 33307518976.0,
-            "8795": 33308178432.0,
-            "8800": 33307113472.0,
-            "8805": 33307045888.0,
-            "8810": 33307252736.0,
-            "8815": 33307430912.0,
-            "8820": 33307568128.0,
-            "8825": 33306791936.0,
-            "8830": 33307529216.0,
-            "8835": 33306691584.0,
-            "8840": 33306529792.0,
-            "8845": 33307303936.0,
-            "8850": 33307901952.0,
-            "8855": 33308196864.0,
-            "8860": 33307965440.0,
-            "8865": 33307971584.0,
-            "8870": 33306595328.0,
-            "8875": 33306419200.0,
-            "8880": 33307508736.0,
-            "8885": 33306345472.0,
-            "8890": 33307373568.0,
-            "8895": 33307631616.0,
-            "8900": 33307330560.0,
-            "8905": 33308209152.0,
-            "8910": 33308155904.0,
-            "8915": 33306943488.0,
-            "8920": 33307381760.0,
-            "8925": 33307437056.0,
-            "8930": 33308041216.0,
-            "8935": 33307142144.0,
-            "8940": 33307768832.0,
-            "8945": 33308551168.0,
-            "8950": 33307682816.0,
-            "8955": 33307656192.0,
-            "8960": 33307787264.0,
-            "8965": 33306220544.0,
-            "8970": 33307693056.0,
-            "8975": 33307529216.0,
-            "8980": 33307027456.0,
-            "8985": 33308442624.0,
-            "8990": 33307588608.0,
-            "8995": 33308315648.0,
-            "9000": 33307787264.0,
-            "9005": 33307951104.0,
-            "9010": 33305649152.0,
-            "9015": 33307592704.0,
-            "9020": 33307033600.0,
-            "9025": 33307232256.0,
-            "9030": 33307793408.0,
-            "9035": 33307385856.0,
-            "9040": 33308012544.0,
-            "9045": 33307287552.0,
-            "9050": 33307701248.0,
-            "9055": 33306814464.0,
-            "9060": 33307975680.0,
-            "9065": 33307693056.0,
-            "9070": 33306888192.0,
-            "9075": 33307168768.0,
-            "9080": 33306818560.0,
-            "9085": 33307557888.0,
-            "9090": 33308200960.0,
-            "9095": 33306867712.0,
-            "9100": 33308563456.0,
-            "9105": 33306994688.0,
-            "9110": 33307004928.0,
-            "9115": 33307439104.0,
-            "9120": 33307340800.0,
-            "9125": 33307295744.0,
-            "9130": 33306771456.0,
-            "9135": 33307031552.0,
-            "9140": 33306497024.0,
-            "9145": 33307629568.0,
-            "9150": 33308002304.0,
-            "9155": 33307484160.0,
-            "9160": 33308100608.0,
-            "9165": 33307611136.0,
-            "9170": 33307897856.0,
-            "9175": 33307473920.0,
-            "9180": 33307977728.0,
-            "9185": 33307203584.0,
-            "9190": 33306693632.0,
-            "9195": 33306931200.0,
-            "9200": 33307779072.0,
-            "9205": 33307205632.0,
-            "9210": 33307637760.0,
-            "9215": 33307090944.0,
-            "9220": 33308454912.0,
-            "9225": 33307471872.0,
-            "9230": 33307322368.0,
-            "9235": 33307422720.0,
-            "9240": 33307242496.0,
-            "9245": 33308026880.0,
-            "9250": 33308203008.0,
-            "9255": 33307389952.0,
-            "9260": 33308825600.0,
-            "9265": 33306505216.0,
-            "9270": 33307426816.0,
-            "9275": 33307865088.0,
-            "9280": 33307435008.0,
-            "9285": 33307258880.0,
-            "9290": 33308000256.0,
-            "9295": 33307498496.0,
-            "9300": 33307301888.0,
-            "9305": 33307674624.0,
-            "9310": 33307031552.0,
-            "9315": 33306327040.0,
-            "9320": 33306834944.0,
-            "9325": 33307971584.0,
-            "9330": 33307910144.0,
-            "9335": 33307213824.0,
-            "9340": 33307385856.0,
-            "9345": 33307385856.0,
-            "9350": 33308127232.0,
-            "9355": 33306615808.0,
-            "9360": 33306697728.0,
-            "9365": 33307463680.0,
-            "9370": 33306355712.0,
-            "9375": 33307219968.0,
-            "9380": 33307224064.0,
-            "9385": 33308024832.0,
-            "9390": 33307830272.0,
-            "9395": 33307535360.0,
-            "9400": 33307031552.0,
-            "9405": 33307418624.0,
-            "9410": 33306822656.0,
-            "9415": 33307267072.0,
-            "9420": 33306994688.0,
-            "9425": 33306892288.0,
-            "9430": 33307199488.0,
-            "9435": 33306980352.0,
-            "9440": 33306451968.0,
-            "9445": 33308420096.0,
-            "9450": 33306755072.0,
-            "9455": 33306341376.0,
-            "9460": 33308131328.0,
-            "9465": 33307023360.0,
-            "9470": 33308307456.0,
-            "9475": 33308221440.0,
-            "9480": 33308037120.0,
-            "9485": 33308055552.0,
-            "9490": 33307908096.0,
-            "9495": 33306486784.0,
-            "9500": 33306490880.0,
-            "9505": 33307967488.0,
-            "9510": 33307125760.0,
-            "9515": 33307242496.0,
-            "9520": 33307670528.0,
-            "9525": 33307496448.0,
-            "9530": 33307731968.0,
-            "9535": 33307435008.0
-        }
-    },
-    "mem-max-allocated-bytes": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 36905754624.0,
-            "5": 45014786048.0,
-            "10": 45173362688.0,
-            "15": 45173362688.0,
-            "20": 45251878912.0,
-            "25": 45286207488.0,
-            "30": 45286207488.0,
-            "35": 45288939520.0,
-            "40": 45288939520.0,
-            "45": 45288939520.0,
-            "50": 45288939520.0,
-            "55": 45288939520.0,
-            "60": 45288939520.0,
-            "65": 45288939520.0,
-            "70": 45288939520.0,
-            "75": 45288939520.0,
-            "80": 45288939520.0,
-            "85": 45288939520.0,
-            "90": 45288939520.0,
-            "95": 45288939520.0,
-            "100": 45288939520.0,
-            "105": 45288939520.0,
-            "110": 45299392512.0,
-            "115": 45314936832.0,
-            "120": 45378736128.0,
-            "125": 45428596736.0,
-            "130": 45428596736.0,
-            "135": 45445640192.0,
-            "140": 45445640192.0,
-            "145": 45445640192.0,
-            "150": 45445640192.0,
-            "155": 45445640192.0,
-            "160": 45445640192.0,
-            "165": 45445640192.0,
-            "170": 45445640192.0,
-            "175": 45445640192.0,
-            "180": 45445640192.0,
-            "185": 45445640192.0,
-            "190": 45445640192.0,
-            "195": 45445640192.0,
-            "200": 45536641024.0,
-            "205": 45638885376.0,
-            "210": 45638885376.0,
-            "215": 45638885376.0,
-            "220": 45638885376.0,
-            "225": 45638885376.0,
-            "230": 45638885376.0,
-            "235": 45713887232.0,
-            "240": 45932376064.0,
-            "245": 45982269440.0,
-            "250": 45982269440.0,
-            "255": 45982269440.0,
-            "260": 46039670784.0,
-            "265": 46039670784.0,
-            "270": 46039670784.0,
-            "275": 46039670784.0,
-            "280": 46293884928.0,
-            "285": 46293884928.0,
-            "290": 46293884928.0,
-            "295": 46293884928.0,
-            "300": 46293884928.0,
-            "305": 46319267840.0,
-            "310": 46319267840.0,
-            "315": 46319267840.0,
-            "320": 46319267840.0,
-            "325": 46319267840.0,
-            "330": 46319267840.0,
-            "335": 46319267840.0,
-            "340": 46319267840.0,
-            "345": 46451261440.0,
-            "350": 46451261440.0,
-            "355": 46451261440.0,
-            "360": 46451261440.0,
-            "365": 46451261440.0,
-            "370": 46451261440.0,
-            "375": 46451261440.0,
-            "380": 46451261440.0,
-            "385": 46451261440.0,
-            "390": 46451261440.0,
-            "395": 46451261440.0,
-            "400": 46451261440.0,
-            "405": 46451261440.0,
-            "410": 46451261440.0,
-            "415": 46451261440.0,
-            "420": 46451261440.0,
-            "425": 46451261440.0,
-            "430": 46451261440.0,
-            "435": 46451261440.0,
-            "440": 46451261440.0,
-            "445": 46451261440.0,
-            "450": 46451261440.0,
-            "455": 46451261440.0,
-            "460": 46451261440.0,
-            "465": 46451261440.0,
-            "470": 46451261440.0,
-            "475": 46451261440.0,
-            "480": 46451261440.0,
-            "485": 46451261440.0,
-            "490": 46451261440.0,
-            "495": 46451261440.0,
-            "500": 46451261440.0,
-            "505": 46451261440.0,
-            "510": 46451261440.0,
-            "515": 46451261440.0,
-            "520": 46451261440.0,
-            "525": 46451261440.0,
-            "530": 46451261440.0,
-            "535": 46451261440.0,
-            "540": 46451261440.0,
-            "545": 46451261440.0,
-            "550": 46451261440.0,
-            "555": 46451261440.0,
-            "560": 46451261440.0,
-            "565": 46451261440.0,
-            "570": 46451261440.0,
-            "575": 46451261440.0,
-            "580": 46451261440.0,
-            "585": 46451261440.0,
-            "590": 46451261440.0,
-            "595": 46451261440.0,
-            "600": 46451261440.0,
-            "605": 46451261440.0,
-            "610": 46451261440.0,
-            "615": 46451261440.0,
-            "620": 46451261440.0,
-            "625": 46451261440.0,
-            "630": 46451261440.0,
-            "635": 46451261440.0,
-            "640": 46451261440.0,
-            "645": 46451261440.0,
-            "650": 46451261440.0,
-            "655": 46451261440.0,
-            "660": 46451261440.0,
-            "665": 46451261440.0,
-            "670": 46451261440.0,
-            "675": 46451261440.0,
-            "680": 46451261440.0,
-            "685": 46451261440.0,
-            "690": 46451261440.0,
-            "695": 46451261440.0,
-            "700": 46451261440.0,
-            "705": 46451261440.0,
-            "710": 46451261440.0,
-            "715": 46451261440.0,
-            "720": 46451261440.0,
-            "725": 46451261440.0,
-            "730": 46451261440.0,
-            "735": 46451261440.0,
-            "740": 46451261440.0,
-            "745": 46451261440.0,
-            "750": 46451261440.0,
-            "755": 46451261440.0,
-            "760": 46451261440.0,
-            "765": 46451261440.0,
-            "770": 46451261440.0,
-            "775": 46451261440.0,
-            "780": 46451261440.0,
-            "785": 46451261440.0,
-            "790": 46451261440.0,
-            "795": 46451261440.0,
-            "800": 46451261440.0,
-            "805": 46451261440.0,
-            "810": 46451261440.0,
-            "815": 46451261440.0,
-            "820": 46451261440.0,
-            "825": 46451261440.0,
-            "830": 46451261440.0,
-            "835": 46451261440.0,
-            "840": 46451261440.0,
-            "845": 46451261440.0,
-            "850": 46451261440.0,
-            "855": 46451261440.0,
-            "860": 46451261440.0,
-            "865": 46451261440.0,
-            "870": 46451261440.0,
-            "875": 46451261440.0,
-            "880": 46451261440.0,
-            "885": 46451261440.0,
-            "890": 46451261440.0,
-            "895": 46451261440.0,
-            "900": 46451261440.0,
-            "905": 46451261440.0,
-            "910": 46451261440.0,
-            "915": 46451261440.0,
-            "920": 46451261440.0,
-            "925": 46451261440.0,
-            "930": 46451261440.0,
-            "935": 46451261440.0,
-            "940": 46451261440.0,
-            "945": 46451261440.0,
-            "950": 46451261440.0,
-            "955": 46451261440.0,
-            "960": 45564735488.0,
-            "965": 45952081920.0,
-            "970": 45952081920.0,
-            "975": 46005657600.0,
-            "980": 46005657600.0,
-            "985": 46005657600.0,
-            "990": 46005657600.0,
-            "995": 46169923584.0,
-            "1000": 46169923584.0,
-            "1005": 46169923584.0,
-            "1010": 46169923584.0,
-            "1015": 46169923584.0,
-            "1020": 46169923584.0,
-            "1025": 46169923584.0,
-            "1030": 46169923584.0,
-            "1035": 46169923584.0,
-            "1040": 46169923584.0,
-            "1045": 46169923584.0,
-            "1050": 46169923584.0,
-            "1055": 46169923584.0,
-            "1060": 46169923584.0,
-            "1065": 46169923584.0,
-            "1070": 46169923584.0,
-            "1075": 46169923584.0,
-            "1080": 46169923584.0,
-            "1085": 46169923584.0,
-            "1090": 46169923584.0,
-            "1095": 46169923584.0,
-            "1100": 46169923584.0,
-            "1105": 46169923584.0,
-            "1110": 46169923584.0,
-            "1115": 46169923584.0,
-            "1120": 46169923584.0,
-            "1125": 46169923584.0,
-            "1130": 46169923584.0,
-            "1135": 46169923584.0,
-            "1140": 46169923584.0,
-            "1145": 46169923584.0,
-            "1150": 46169923584.0,
-            "1155": 46169923584.0,
-            "1160": 46169923584.0,
-            "1165": 46169923584.0,
-            "1170": 46169923584.0,
-            "1175": 46169923584.0,
-            "1180": 46192005120.0,
-            "1185": 46192005120.0,
-            "1190": 46192005120.0,
-            "1195": 46192005120.0,
-            "1200": 46192005120.0,
-            "1205": 46192005120.0,
-            "1210": 46192005120.0,
-            "1215": 46192005120.0,
-            "1220": 46192005120.0,
-            "1225": 46192005120.0,
-            "1230": 46192005120.0,
-            "1235": 46192005120.0,
-            "1240": 46192005120.0,
-            "1245": 46192005120.0,
-            "1250": 46192005120.0,
-            "1255": 46192005120.0,
-            "1260": 46192005120.0,
-            "1265": 46192005120.0,
-            "1270": 46192005120.0,
-            "1275": 46192005120.0,
-            "1280": 46192005120.0,
-            "1285": 46192005120.0,
-            "1290": 46192005120.0,
-            "1295": 46192005120.0,
-            "1300": 46192005120.0,
-            "1305": 46192005120.0,
-            "1310": 46192005120.0,
-            "1315": 46192005120.0,
-            "1320": 46192005120.0,
-            "1325": 46192005120.0,
-            "1330": 46192005120.0,
-            "1335": 46192005120.0,
-            "1340": 46192005120.0,
-            "1345": 46192005120.0,
-            "1350": 46192005120.0,
-            "1355": 46192005120.0,
-            "1360": 46192005120.0,
-            "1365": 46192005120.0,
-            "1370": 46192005120.0,
-            "1375": 46192005120.0,
-            "1380": 46192005120.0,
-            "1385": 46192005120.0,
-            "1390": 46192005120.0,
-            "1395": 46192005120.0,
-            "1400": 46192005120.0,
-            "1405": 46192005120.0,
-            "1410": 46192005120.0,
-            "1415": 46192005120.0,
-            "1420": 46192005120.0,
-            "1425": 46192005120.0,
-            "1430": 46192005120.0,
-            "1435": 46192005120.0,
-            "1440": 46192005120.0,
-            "1445": 46192005120.0,
-            "1450": 46192005120.0,
-            "1455": 46192005120.0,
-            "1460": 46192005120.0,
-            "1465": 46192005120.0,
-            "1470": 46192005120.0,
-            "1475": 46192005120.0,
-            "1480": 46192005120.0,
-            "1485": 46192005120.0,
-            "1490": 46192005120.0,
-            "1495": 46192005120.0,
-            "1500": 46192005120.0,
-            "1505": 46192005120.0,
-            "1510": 46192005120.0,
-            "1515": 46192005120.0,
-            "1520": 46192005120.0,
-            "1525": 46192005120.0,
-            "1530": 46192005120.0,
-            "1535": 46192005120.0,
-            "1540": 46192005120.0,
-            "1545": 46192005120.0,
-            "1550": 46260322304.0,
-            "1555": 46260322304.0,
-            "1560": 46260322304.0,
-            "1565": 46260322304.0,
-            "1570": 46260322304.0,
-            "1575": 46260322304.0,
-            "1580": 46260322304.0,
-            "1585": 46260322304.0,
-            "1590": 46260322304.0,
-            "1595": 46260322304.0,
-            "1600": 46260322304.0,
-            "1605": 46260322304.0,
-            "1610": 46260322304.0,
-            "1615": 46260322304.0,
-            "1620": 46260322304.0,
-            "1625": 46260322304.0,
-            "1630": 46260322304.0,
-            "1635": 46260322304.0,
-            "1640": 46260322304.0,
-            "1645": 46260322304.0,
-            "1650": 46260322304.0,
-            "1655": 46260322304.0,
-            "1660": 46260322304.0,
-            "1665": 46260322304.0,
-            "1670": 46260322304.0,
-            "1675": 46260322304.0,
-            "1680": 46260322304.0,
-            "1685": 46260322304.0,
-            "1690": 46260322304.0,
-            "1695": 46260322304.0,
-            "1700": 46260322304.0,
-            "1705": 46260322304.0,
-            "1710": 46260322304.0,
-            "1715": 46260322304.0,
-            "1720": 46260322304.0,
-            "1725": 46260322304.0,
-            "1730": 46260322304.0,
-            "1735": 46260322304.0,
-            "1740": 46260322304.0,
-            "1745": 46260322304.0,
-            "1750": 46260322304.0,
-            "1755": 46260322304.0,
-            "1760": 46260322304.0,
-            "1765": 46260322304.0,
-            "1770": 46260322304.0,
-            "1775": 46260322304.0,
-            "1780": 46260322304.0,
-            "1785": 46260322304.0,
-            "1790": 46260322304.0,
-            "1795": 46260322304.0,
-            "1800": 46260322304.0,
-            "1805": 46260322304.0,
-            "1810": 46260322304.0,
-            "1815": 46260322304.0,
-            "1820": 46260322304.0,
-            "1825": 46260322304.0,
-            "1830": 46260322304.0,
-            "1835": 46260322304.0,
-            "1840": 46260322304.0,
-            "1845": 46260322304.0,
-            "1850": 46260322304.0,
-            "1855": 46260322304.0,
-            "1860": 46260322304.0,
-            "1865": 46260322304.0,
-            "1870": 46260322304.0,
-            "1875": 46260322304.0,
-            "1880": 46260322304.0,
-            "1885": 46260322304.0,
-            "1890": 46260322304.0,
-            "1895": 46260322304.0,
-            "1900": 46260322304.0,
-            "1905": 46260322304.0,
-            "1910": 46260322304.0,
-            "1915": 46260322304.0,
-            "1920": 46260322304.0,
-            "1925": 46260322304.0,
-            "1930": 46260322304.0,
-            "1935": 46260322304.0,
-            "1940": 46260322304.0,
-            "1945": 46260322304.0,
-            "1950": 46260322304.0,
-            "1955": 46260322304.0,
-            "1960": 46260322304.0,
-            "1965": 46260322304.0,
-            "1970": 46260322304.0,
-            "1975": 46261714944.0,
-            "1980": 46261714944.0,
-            "1985": 46261714944.0,
-            "1990": 46261714944.0,
-            "1995": 46261714944.0,
-            "2000": 46261714944.0,
-            "2005": 46261714944.0,
-            "2010": 46261714944.0,
-            "2015": 46261714944.0,
-            "2020": 46261714944.0,
-            "2025": 46261714944.0,
-            "2030": 46261714944.0,
-            "2035": 46261714944.0,
-            "2040": 46261714944.0,
-            "2045": 46261714944.0,
-            "2050": 46261714944.0,
-            "2055": 46261714944.0,
-            "2060": 46261714944.0,
-            "2065": 46261714944.0,
-            "2070": 46261714944.0,
-            "2075": 46261714944.0,
-            "2080": 46261714944.0,
-            "2085": 46261714944.0,
-            "2090": 46261714944.0,
-            "2095": 46261714944.0,
-            "2100": 46261714944.0,
-            "2105": 46261714944.0,
-            "2110": 46261714944.0,
-            "2115": 46261714944.0,
-            "2120": 46261714944.0,
-            "2125": 46261714944.0,
-            "2130": 46261714944.0,
-            "2135": 46261714944.0,
-            "2140": 46261714944.0,
-            "2145": 46261714944.0,
-            "2150": 46261714944.0,
-            "2155": 46261714944.0,
-            "2160": 46261714944.0,
-            "2165": 46261714944.0,
-            "2170": 46261714944.0,
-            "2175": 46261714944.0,
-            "2180": 46261714944.0,
-            "2185": 46261714944.0,
-            "2190": 46261714944.0,
-            "2195": 46261714944.0,
-            "2200": 46261714944.0,
-            "2205": 46261714944.0,
-            "2210": 46261714944.0,
-            "2215": 46261714944.0,
-            "2220": 46261714944.0,
-            "2225": 46261714944.0,
-            "2230": 46261714944.0,
-            "2235": 46261714944.0,
-            "2240": 46261714944.0,
-            "2245": 46261714944.0,
-            "2250": 46261714944.0,
-            "2255": 46261714944.0,
-            "2260": 46261714944.0,
-            "2265": 46261714944.0,
-            "2270": 46261714944.0,
-            "2275": 46261714944.0,
-            "2280": 46261714944.0,
-            "2285": 46261714944.0,
-            "2290": 46261714944.0,
-            "2295": 46261714944.0,
-            "2300": 46261714944.0,
-            "2305": 46261714944.0,
-            "2310": 46261714944.0,
-            "2315": 46261714944.0,
-            "2320": 46261714944.0,
-            "2325": 46261714944.0,
-            "2330": 46261714944.0,
-            "2335": 46261714944.0,
-            "2340": 46261714944.0,
-            "2345": 46261714944.0,
-            "2350": 46261714944.0,
-            "2355": 46261714944.0,
-            "2360": 46261714944.0,
-            "2365": 46261714944.0,
-            "2370": 46261714944.0,
-            "2375": 46261714944.0,
-            "2380": 46261714944.0,
-            "2385": 46261714944.0,
-            "2390": 46261714944.0,
-            "2395": 46261714944.0,
-            "2400": 46261714944.0,
-            "2405": 46261714944.0,
-            "2410": 46261714944.0,
-            "2415": 46261714944.0,
-            "2420": 46261714944.0,
-            "2425": 46261714944.0,
-            "2430": 46261714944.0,
-            "2435": 46261714944.0,
-            "2440": 46261714944.0,
-            "2445": 46261714944.0,
-            "2450": 46261714944.0,
-            "2455": 46261714944.0,
-            "2460": 46261714944.0,
-            "2465": 46261714944.0,
-            "2470": 46261714944.0,
-            "2475": 46261714944.0,
-            "2480": 46261714944.0,
-            "2485": 46261714944.0,
-            "2490": 46261714944.0,
-            "2495": 46261714944.0,
-            "2500": 46261714944.0,
-            "2505": 46261714944.0,
-            "2510": 46261714944.0,
-            "2515": 46261714944.0,
-            "2520": 46261714944.0,
-            "2525": 46261714944.0,
-            "2530": 46261714944.0,
-            "2535": 46261714944.0,
-            "2540": 46261714944.0,
-            "2545": 46261714944.0,
-            "2550": 46261714944.0,
-            "2555": 46261714944.0,
-            "2560": 46261714944.0,
-            "2565": 46261714944.0,
-            "2570": 46261714944.0,
-            "2575": 46261714944.0,
-            "2580": 46261714944.0,
-            "2585": 46261714944.0,
-            "2590": 46261714944.0,
-            "2595": 46261714944.0,
-            "2600": 46261714944.0,
-            "2605": 46261714944.0,
-            "2610": 46261714944.0,
-            "2615": 46261714944.0,
-            "2620": 46261714944.0,
-            "2625": 46261714944.0,
-            "2630": 46261714944.0,
-            "2635": 46261714944.0,
-            "2640": 46261714944.0,
-            "2645": 46261714944.0,
-            "2650": 46261714944.0,
-            "2655": 46261714944.0,
-            "2660": 46261714944.0,
-            "2665": 46261714944.0,
-            "2670": 46261714944.0,
-            "2675": 46261714944.0,
-            "2680": 46261714944.0,
-            "2685": 46261714944.0,
-            "2690": 46261714944.0,
-            "2695": 46261714944.0,
-            "2700": 46261714944.0,
-            "2705": 46261714944.0,
-            "2710": 46261714944.0,
-            "2715": 46261714944.0,
-            "2720": 46261714944.0,
-            "2725": 46261714944.0,
-            "2730": 46261714944.0,
-            "2735": 46261714944.0,
-            "2740": 46261714944.0,
-            "2745": 46261714944.0,
-            "2750": 46261714944.0,
-            "2755": 46261714944.0,
-            "2760": 46261714944.0,
-            "2765": 46261714944.0,
-            "2770": 46261714944.0,
-            "2775": 46261714944.0,
-            "2780": 46261714944.0,
-            "2785": 46261714944.0,
-            "2790": 46261714944.0,
-            "2795": 46261714944.0,
-            "2800": 46261714944.0,
-            "2805": 46261714944.0,
-            "2810": 46261714944.0,
-            "2815": 46261714944.0,
-            "2820": 46261714944.0,
-            "2825": 46261714944.0,
-            "2830": 46261714944.0,
-            "2835": 46261714944.0,
-            "2840": 46261714944.0,
-            "2845": 46261714944.0,
-            "2850": 46261714944.0,
-            "2855": 46261714944.0,
-            "2860": 46261714944.0,
-            "2865": 46261714944.0,
-            "2870": 46261714944.0,
-            "2875": 46261714944.0,
-            "2880": 46261714944.0,
-            "2885": 46261714944.0,
-            "2890": 46261714944.0,
-            "2895": 46261714944.0,
-            "2900": 46261714944.0,
-            "2905": 46261714944.0,
-            "2910": 46261714944.0,
-            "2915": 46261714944.0,
-            "2920": 46261714944.0,
-            "2925": 46261714944.0,
-            "2930": 46261714944.0,
-            "2935": 46261714944.0,
-            "2940": 46261714944.0,
-            "2945": 46261714944.0,
-            "2950": 46261714944.0,
-            "2955": 46261714944.0,
-            "2960": 46261714944.0,
-            "2965": 46261714944.0,
-            "2970": 46261714944.0,
-            "2975": 46261714944.0,
-            "2980": 46261714944.0,
-            "2985": 45706711040.0,
-            "2990": 45883699200.0,
-            "2995": 46072287232.0,
-            "3000": 46072287232.0,
-            "3005": 46072287232.0,
-            "3010": 46072287232.0,
-            "3015": 46072287232.0,
-            "3020": 46072287232.0,
-            "3025": 46072287232.0,
-            "3030": 46072287232.0,
-            "3035": 46072287232.0,
-            "3040": 46072287232.0,
-            "3045": 46072287232.0,
-            "3050": 46072287232.0,
-            "3055": 46072287232.0,
-            "3060": 46072287232.0,
-            "3065": 46072287232.0,
-            "3070": 46072287232.0,
-            "3075": 46072287232.0,
-            "3080": 46072287232.0,
-            "3085": 46072287232.0,
-            "3090": 46072287232.0,
-            "3095": 46072287232.0,
-            "3100": 46072287232.0,
-            "3105": 46072287232.0,
-            "3110": 46072287232.0,
-            "3115": 46072287232.0,
-            "3120": 46072287232.0,
-            "3125": 46072287232.0,
-            "3130": 46072287232.0,
-            "3135": 46072287232.0,
-            "3140": 46072287232.0,
-            "3145": 46072287232.0,
-            "3150": 46072287232.0,
-            "3155": 46072287232.0,
-            "3160": 46072287232.0,
-            "3165": 46072287232.0,
-            "3170": 46072287232.0,
-            "3175": 46072287232.0,
-            "3180": 46072287232.0,
-            "3185": 46072287232.0,
-            "3190": 46072287232.0,
-            "3195": 46072287232.0,
-            "3200": 46072287232.0,
-            "3205": 46072287232.0,
-            "3210": 46072287232.0,
-            "3215": 46072287232.0,
-            "3220": 46072287232.0,
-            "3225": 46072287232.0,
-            "3230": 46072287232.0,
-            "3235": 46072287232.0,
-            "3240": 46072287232.0,
-            "3245": 46072287232.0,
-            "3250": 46072287232.0,
-            "3255": 46072287232.0,
-            "3260": 46072287232.0,
-            "3265": 46072287232.0,
-            "3270": 46072287232.0,
-            "3275": 46072287232.0,
-            "3280": 46072287232.0,
-            "3285": 46072287232.0,
-            "3290": 46072287232.0,
-            "3295": 46072287232.0,
-            "3300": 46072287232.0,
-            "3305": 46072287232.0,
-            "3310": 46072287232.0,
-            "3315": 46072287232.0,
-            "3320": 46072287232.0,
-            "3325": 46072287232.0,
-            "3330": 46072287232.0,
-            "3335": 46072287232.0,
-            "3340": 46072287232.0,
-            "3345": 46072287232.0,
-            "3350": 46072287232.0,
-            "3355": 46072287232.0,
-            "3360": 46072287232.0,
-            "3365": 46072287232.0,
-            "3370": 46072287232.0,
-            "3375": 46072287232.0,
-            "3380": 46072287232.0,
-            "3385": 46072287232.0,
-            "3390": 46072287232.0,
-            "3395": 46072287232.0,
-            "3400": 46072287232.0,
-            "3405": 46072287232.0,
-            "3410": 46072287232.0,
-            "3415": 46072287232.0,
-            "3420": 46072287232.0,
-            "3425": 46072672256.0,
-            "3430": 46072672256.0,
-            "3435": 46072672256.0,
-            "3440": 46072672256.0,
-            "3445": 46072672256.0,
-            "3450": 46072672256.0,
-            "3455": 46072672256.0,
-            "3460": 46072672256.0,
-            "3465": 46072672256.0,
-            "3470": 46072672256.0,
-            "3475": 46072672256.0,
-            "3480": 46072672256.0,
-            "3485": 46095564800.0,
-            "3490": 46095564800.0,
-            "3495": 46095564800.0,
-            "3500": 46095564800.0,
-            "3505": 46095564800.0,
-            "3510": 46095564800.0,
-            "3515": 46095564800.0,
-            "3520": 46095564800.0,
-            "3525": 46095564800.0,
-            "3530": 46095564800.0,
-            "3535": 46095564800.0,
-            "3540": 46095564800.0,
-            "3545": 46095564800.0,
-            "3550": 46191697920.0,
-            "3555": 46191697920.0,
-            "3560": 46191697920.0,
-            "3565": 46191697920.0,
-            "3570": 46191697920.0,
-            "3575": 46191697920.0,
-            "3580": 46191697920.0,
-            "3585": 46191697920.0,
-            "3590": 46191697920.0,
-            "3595": 46191697920.0,
-            "3600": 46191697920.0,
-            "3605": 46191697920.0,
-            "3610": 46191697920.0,
-            "3615": 46191697920.0,
-            "3620": 46191697920.0,
-            "3625": 46191697920.0,
-            "3630": 46191697920.0,
-            "3635": 46191697920.0,
-            "3640": 46191697920.0,
-            "3645": 46191697920.0,
-            "3650": 46191697920.0,
-            "3655": 46191697920.0,
-            "3660": 46191697920.0,
-            "3665": 46191697920.0,
-            "3670": 46191697920.0,
-            "3675": 46191697920.0,
-            "3680": 46191697920.0,
-            "3685": 46191697920.0,
-            "3690": 46191697920.0,
-            "3695": 46191697920.0,
-            "3700": 46191697920.0,
-            "3705": 46191697920.0,
-            "3710": 46191697920.0,
-            "3715": 46191697920.0,
-            "3720": 46191697920.0,
-            "3725": 46191697920.0,
-            "3730": 46191697920.0,
-            "3735": 46191697920.0,
-            "3740": 46191697920.0,
-            "3745": 46191697920.0,
-            "3750": 46191697920.0,
-            "3755": 46191697920.0,
-            "3760": 46191697920.0,
-            "3765": 46191697920.0,
-            "3770": 46191697920.0,
-            "3775": 46191697920.0,
-            "3780": 46191697920.0,
-            "3785": 46191697920.0,
-            "3790": 46191697920.0,
-            "3795": 46191697920.0,
-            "3800": 46191697920.0,
-            "3805": 46191697920.0,
-            "3810": 46191697920.0,
-            "3815": 46191697920.0,
-            "3820": 46191697920.0,
-            "3825": 46191697920.0,
-            "3830": 46191697920.0,
-            "3835": 46191697920.0,
-            "3840": 46191697920.0,
-            "3845": 46191697920.0,
-            "3850": 46191697920.0,
-            "3855": 46191697920.0,
-            "3860": 46191697920.0,
-            "3865": 46191697920.0,
-            "3870": 46191697920.0,
-            "3875": 46191697920.0,
-            "3880": 46191697920.0,
-            "3885": 46191697920.0,
-            "3890": 46191697920.0,
-            "3895": 46191697920.0,
-            "3900": 46191697920.0,
-            "3905": 46191697920.0,
-            "3910": 46191697920.0,
-            "3915": 46191697920.0,
-            "3920": 46191697920.0,
-            "3925": 46191697920.0,
-            "3930": 46191697920.0,
-            "3935": 46191697920.0,
-            "3940": 46191697920.0,
-            "3945": 46191697920.0,
-            "3950": 46191697920.0,
-            "3955": 46191697920.0,
-            "3960": 46191697920.0,
-            "3965": 46191697920.0,
-            "3970": 46191697920.0,
-            "3975": 46191697920.0,
-            "3980": 46191697920.0,
-            "3985": 46191697920.0,
-            "3990": 46191697920.0,
-            "3995": 46191697920.0,
-            "4000": 45840449536.0,
-            "4005": 45869191168.0,
-            "4010": 45897973760.0,
-            "4015": 45897973760.0,
-            "4020": 45940301824.0,
-            "4025": 45940301824.0,
-            "4030": 45940301824.0,
-            "4035": 45940301824.0,
-            "4040": 45940301824.0,
-            "4045": 45940301824.0,
-            "4050": 45940301824.0,
-            "4055": 45940301824.0,
-            "4060": 45940301824.0,
-            "4065": 45940301824.0,
-            "4070": 45940301824.0,
-            "4075": 45940301824.0,
-            "4080": 45940301824.0,
-            "4085": 46009651200.0,
-            "4090": 46009651200.0,
-            "4095": 46009651200.0,
-            "4100": 46009651200.0,
-            "4105": 46009651200.0,
-            "4110": 46009651200.0,
-            "4115": 46009651200.0,
-            "4120": 46009651200.0,
-            "4125": 46009651200.0,
-            "4130": 46009651200.0,
-            "4135": 46009651200.0,
-            "4140": 46009651200.0,
-            "4145": 46009651200.0,
-            "4150": 46009651200.0,
-            "4155": 46009651200.0,
-            "4160": 46009651200.0,
-            "4165": 46009651200.0,
-            "4170": 46009651200.0,
-            "4175": 46009651200.0,
-            "4180": 46009651200.0,
-            "4185": 46009651200.0,
-            "4190": 46009651200.0,
-            "4195": 46009651200.0,
-            "4200": 46009651200.0,
-            "4205": 46009651200.0,
-            "4210": 46009651200.0,
-            "4215": 46009651200.0,
-            "4220": 46009651200.0,
-            "4225": 46064635904.0,
-            "4230": 46064635904.0,
-            "4235": 46064635904.0,
-            "4240": 46064635904.0,
-            "4245": 46064635904.0,
-            "4250": 46064635904.0,
-            "4255": 46064635904.0,
-            "4260": 46064635904.0,
-            "4265": 46064635904.0,
-            "4270": 46064635904.0,
-            "4275": 46064635904.0,
-            "4280": 46064635904.0,
-            "4285": 46064635904.0,
-            "4290": 46064635904.0,
-            "4295": 46064635904.0,
-            "4300": 46064635904.0,
-            "4305": 46064635904.0,
-            "4310": 46064635904.0,
-            "4315": 46064635904.0,
-            "4320": 46064635904.0,
-            "4325": 46064635904.0,
-            "4330": 46064635904.0,
-            "4335": 46064635904.0,
-            "4340": 46064635904.0,
-            "4345": 46064635904.0,
-            "4350": 46064635904.0,
-            "4355": 46064635904.0,
-            "4360": 46064635904.0,
-            "4365": 46064635904.0,
-            "4370": 46064635904.0,
-            "4375": 46064635904.0,
-            "4380": 46064635904.0,
-            "4385": 46064635904.0,
-            "4390": 46064635904.0,
-            "4395": 46064635904.0,
-            "4400": 46064635904.0,
-            "4405": 46064635904.0,
-            "4410": 46064635904.0,
-            "4415": 46064635904.0,
-            "4420": 46064635904.0,
-            "4425": 46064635904.0,
-            "4430": 46064635904.0,
-            "4435": 46064635904.0,
-            "4440": 46064635904.0,
-            "4445": 46064635904.0,
-            "4450": 46064635904.0,
-            "4455": 46064635904.0,
-            "4460": 46080573440.0,
-            "4465": 46080573440.0,
-            "4470": 46080573440.0,
-            "4475": 46080573440.0,
-            "4480": 46080573440.0,
-            "4485": 46080573440.0,
-            "4490": 46080573440.0,
-            "4495": 46080573440.0,
-            "4500": 46080573440.0,
-            "4505": 46080573440.0,
-            "4510": 46080573440.0,
-            "4515": 46080573440.0,
-            "4520": 46080573440.0,
-            "4525": 46080573440.0,
-            "4530": 46080573440.0,
-            "4535": 46080573440.0,
-            "4540": 46080573440.0,
-            "4545": 46080573440.0,
-            "4550": 46080573440.0,
-            "4555": 46080573440.0,
-            "4560": 46080573440.0,
-            "4565": 46080573440.0,
-            "4570": 46080573440.0,
-            "4575": 46080573440.0,
-            "4580": 46080573440.0,
-            "4585": 46080573440.0,
-            "4590": 46080573440.0,
-            "4595": 46080573440.0,
-            "4600": 46080573440.0,
-            "4605": 46080573440.0,
-            "4610": 46080573440.0,
-            "4615": 46343888896.0,
-            "4620": 46343888896.0,
-            "4625": 46343888896.0,
-            "4630": 46343888896.0,
-            "4635": 46343888896.0,
-            "4640": 46343888896.0,
-            "4645": 46343888896.0,
-            "4650": 46343888896.0,
-            "4655": 46343888896.0,
-            "4660": 46343888896.0,
-            "4665": 46343888896.0,
-            "4670": 46343888896.0,
-            "4675": 46343888896.0,
-            "4680": 46343888896.0,
-            "4685": 46343888896.0,
-            "4690": 46343888896.0,
-            "4695": 46343888896.0,
-            "4700": 46343888896.0,
-            "4705": 46343888896.0,
-            "4710": 46343888896.0,
-            "4715": 46343888896.0,
-            "4720": 46343888896.0,
-            "4725": 46343888896.0,
-            "4730": 46343888896.0,
-            "4735": 46343888896.0,
-            "4740": 46343888896.0,
-            "4745": 46343888896.0,
-            "4750": 46343888896.0,
-            "4755": 46343888896.0,
-            "4760": 46343888896.0,
-            "4765": 46343888896.0,
-            "4770": 46343888896.0,
-            "4775": 46343888896.0,
-            "4780": 46343888896.0,
-            "4785": 46343888896.0,
-            "4790": 46343888896.0,
-            "4795": 46343888896.0,
-            "4800": 46343888896.0,
-            "4805": 46343888896.0,
-            "4810": 46343888896.0,
-            "4815": 46343888896.0,
-            "4820": 46343888896.0,
-            "4825": 46343888896.0,
-            "4830": 46343888896.0,
-            "4835": 46343888896.0,
-            "4840": 46343888896.0,
-            "4845": 46343888896.0,
-            "4850": 46343888896.0,
-            "4855": 46343888896.0,
-            "4860": 46343888896.0,
-            "4865": 46343888896.0,
-            "4870": 46343888896.0,
-            "4875": 46343888896.0,
-            "4880": 46343888896.0,
-            "4885": 46343888896.0,
-            "4890": 46343888896.0,
-            "4895": 46343888896.0,
-            "4900": 46343888896.0,
-            "4905": 46343888896.0,
-            "4910": 46343888896.0,
-            "4915": 46343888896.0,
-            "4920": 46343888896.0,
-            "4925": 46343888896.0,
-            "4930": 46343888896.0,
-            "4935": 46343888896.0,
-            "4940": 46343888896.0,
-            "4945": 46343888896.0,
-            "4950": 46343888896.0,
-            "4955": 46343888896.0,
-            "4960": 46343888896.0,
-            "4965": 46343888896.0,
-            "4970": 46343888896.0,
-            "4975": 46343888896.0,
-            "4980": 46343888896.0,
-            "4985": 46343888896.0,
-            "4990": 46343888896.0,
-            "4995": 46343888896.0,
-            "5000": 46343888896.0,
-            "5005": 46199529472.0,
-            "5010": 46199529472.0,
-            "5015": 45764182016.0,
-            "5020": 45878784000.0,
-            "5025": 45878784000.0,
-            "5030": 45878784000.0,
-            "5035": 45878784000.0,
-            "5040": 45992685568.0,
-            "5045": 45992685568.0,
-            "5050": 45992685568.0,
-            "5055": 45992685568.0,
-            "5060": 45992685568.0,
-            "5065": 45992685568.0,
-            "5070": 45992685568.0,
-            "5075": 45992685568.0,
-            "5080": 45992685568.0,
-            "5085": 45992685568.0,
-            "5090": 45992685568.0,
-            "5095": 46014451712.0,
-            "5100": 46014451712.0,
-            "5105": 46014451712.0,
-            "5110": 46014451712.0,
-            "5115": 46014451712.0,
-            "5120": 46014451712.0,
-            "5125": 46014451712.0,
-            "5130": 46014451712.0,
-            "5135": 46014451712.0,
-            "5140": 46014451712.0,
-            "5145": 46014451712.0,
-            "5150": 46014451712.0,
-            "5155": 46014451712.0,
-            "5160": 46014451712.0,
-            "5165": 46014451712.0,
-            "5170": 46014451712.0,
-            "5175": 46014451712.0,
-            "5180": 46014451712.0,
-            "5185": 46014451712.0,
-            "5190": 46014451712.0,
-            "5195": 46014451712.0,
-            "5200": 46139572224.0,
-            "5205": 46139572224.0,
-            "5210": 46139572224.0,
-            "5215": 46139572224.0,
-            "5220": 46168403968.0,
-            "5225": 46168403968.0,
-            "5230": 46168403968.0,
-            "5235": 46168403968.0,
-            "5240": 46168403968.0,
-            "5245": 46168403968.0,
-            "5250": 46168403968.0,
-            "5255": 46168403968.0,
-            "5260": 46168403968.0,
-            "5265": 46168403968.0,
-            "5270": 46168403968.0,
-            "5275": 46168403968.0,
-            "5280": 46168403968.0,
-            "5285": 46168403968.0,
-            "5290": 46168403968.0,
-            "5295": 46168403968.0,
-            "5300": 46168403968.0,
-            "5305": 46168403968.0,
-            "5310": 46168403968.0,
-            "5315": 46168403968.0,
-            "5320": 46168403968.0,
-            "5325": 46168403968.0,
-            "5330": 46168403968.0,
-            "5335": 46168403968.0,
-            "5340": 46168403968.0,
-            "5345": 46168403968.0,
-            "5350": 46168403968.0,
-            "5355": 46168403968.0,
-            "5360": 46168403968.0,
-            "5365": 46168403968.0,
-            "5370": 46168403968.0,
-            "5375": 46168403968.0,
-            "5380": 46168403968.0,
-            "5385": 46168403968.0,
-            "5390": 46168403968.0,
-            "5395": 46168403968.0,
-            "5400": 46168403968.0,
-            "5405": 46168403968.0,
-            "5410": 46168403968.0,
-            "5415": 46168403968.0,
-            "5420": 46168403968.0,
-            "5425": 46168403968.0,
-            "5430": 46168403968.0,
-            "5435": 46168403968.0,
-            "5440": 46168403968.0,
-            "5445": 46168403968.0,
-            "5450": 46168403968.0,
-            "5455": 46168403968.0,
-            "5460": 46168403968.0,
-            "5465": 46168403968.0,
-            "5470": 46168403968.0,
-            "5475": 46168403968.0,
-            "5480": 46168403968.0,
-            "5485": 46168403968.0,
-            "5490": 46168403968.0,
-            "5495": 46168403968.0,
-            "5500": 46168403968.0,
-            "5505": 46168403968.0,
-            "5510": 46168403968.0,
-            "5515": 46168403968.0,
-            "5520": 46168403968.0,
-            "5525": 46168403968.0,
-            "5530": 46168403968.0,
-            "5535": 46168403968.0,
-            "5540": 46168403968.0,
-            "5545": 46168403968.0,
-            "5550": 46168403968.0,
-            "5555": 46168403968.0,
-            "5560": 46168403968.0,
-            "5565": 46168403968.0,
-            "5570": 46168403968.0,
-            "5575": 46168403968.0,
-            "5580": 46168403968.0,
-            "5585": 46168403968.0,
-            "5590": 46168403968.0,
-            "5595": 46168403968.0,
-            "5600": 46168403968.0,
-            "5605": 46226247680.0,
-            "5610": 46226247680.0,
-            "5615": 46226247680.0,
-            "5620": 46226247680.0,
-            "5625": 46226247680.0,
-            "5630": 46226247680.0,
-            "5635": 46226247680.0,
-            "5640": 46226247680.0,
-            "5645": 46226247680.0,
-            "5650": 46226247680.0,
-            "5655": 46226247680.0,
-            "5660": 46226247680.0,
-            "5665": 46226247680.0,
-            "5670": 46226247680.0,
-            "5675": 46226247680.0,
-            "5680": 46226247680.0,
-            "5685": 46226247680.0,
-            "5690": 46226247680.0,
-            "5695": 46226247680.0,
-            "5700": 46226247680.0,
-            "5705": 46226247680.0,
-            "5710": 46226247680.0,
-            "5715": 46226247680.0,
-            "5720": 46226247680.0,
-            "5725": 46226247680.0,
-            "5730": 46226247680.0,
-            "5735": 46226247680.0,
-            "5740": 46226247680.0,
-            "5745": 46226247680.0,
-            "5750": 46226247680.0,
-            "5755": 46226247680.0,
-            "5760": 46226247680.0,
-            "5765": 46226247680.0,
-            "5770": 46226247680.0,
-            "5775": 46226247680.0,
-            "5780": 46226247680.0,
-            "5785": 46226247680.0,
-            "5790": 46226247680.0,
-            "5795": 46226247680.0,
-            "5800": 46226247680.0,
-            "5805": 46226247680.0,
-            "5810": 46226247680.0,
-            "5815": 46226247680.0,
-            "5820": 46226247680.0,
-            "5825": 46226247680.0,
-            "5830": 46226247680.0,
-            "5835": 46226247680.0,
-            "5840": 46226247680.0,
-            "5845": 46226247680.0,
-            "5850": 46226247680.0,
-            "5855": 46226247680.0,
-            "5860": 46226247680.0,
-            "5865": 46226247680.0,
-            "5870": 46226247680.0,
-            "5875": 46226247680.0,
-            "5880": 46226247680.0,
-            "5885": 46226247680.0,
-            "5890": 46226247680.0,
-            "5895": 46226247680.0,
-            "5900": 46226247680.0,
-            "5905": 46226247680.0,
-            "5910": 46226247680.0,
-            "5915": 46226247680.0,
-            "5920": 46226247680.0,
-            "5925": 46226247680.0,
-            "5930": 46226247680.0,
-            "5935": 46226247680.0,
-            "5940": 46226247680.0,
-            "5945": 46226247680.0,
-            "5950": 46226247680.0,
-            "5955": 46226247680.0,
-            "5960": 46226247680.0,
-            "5965": 46226247680.0,
-            "5970": 46226247680.0,
-            "5975": 46226247680.0,
-            "5980": 46226247680.0,
-            "5985": 46226247680.0,
-            "5990": 46226247680.0,
-            "5995": 46226247680.0,
-            "6000": 46226247680.0,
-            "6005": 46226247680.0,
-            "6010": 46226247680.0,
-            "6015": 46226247680.0,
-            "6020": 46226247680.0,
-            "6025": 46226247680.0,
-            "6030": 45912186880.0,
-            "6035": 45912186880.0,
-            "6040": 45995683840.0,
-            "6045": 45995683840.0,
-            "6050": 45995683840.0,
-            "6055": 45995683840.0,
-            "6060": 45995683840.0,
-            "6065": 45995683840.0,
-            "6070": 45995683840.0,
-            "6075": 46014836736.0,
-            "6080": 46014836736.0,
-            "6085": 46014836736.0,
-            "6090": 46014836736.0,
-            "6095": 46014836736.0,
-            "6100": 46014836736.0,
-            "6105": 46014836736.0,
-            "6110": 46014836736.0,
-            "6115": 46014836736.0,
-            "6120": 46014836736.0,
-            "6125": 46014836736.0,
-            "6130": 46014836736.0,
-            "6135": 46014836736.0,
-            "6140": 46014836736.0,
-            "6145": 46014836736.0,
-            "6150": 46014836736.0,
-            "6155": 46014836736.0,
-            "6160": 46014836736.0,
-            "6165": 46025334784.0,
-            "6170": 46025334784.0,
-            "6175": 46025334784.0,
-            "6180": 46025334784.0,
-            "6185": 46035255296.0,
-            "6190": 46035255296.0,
-            "6195": 46035255296.0,
-            "6200": 46035255296.0,
-            "6205": 46035255296.0,
-            "6210": 46035255296.0,
-            "6215": 46035255296.0,
-            "6220": 46035255296.0,
-            "6225": 46035255296.0,
-            "6230": 46035255296.0,
-            "6235": 46035255296.0,
-            "6240": 46035255296.0,
-            "6245": 46035255296.0,
-            "6250": 46035255296.0,
-            "6255": 46035255296.0,
-            "6260": 46035255296.0,
-            "6265": 46035255296.0,
-            "6270": 46035255296.0,
-            "6275": 46035255296.0,
-            "6280": 46035255296.0,
-            "6285": 46035255296.0,
-            "6290": 46035255296.0,
-            "6295": 46035255296.0,
-            "6300": 46035255296.0,
-            "6305": 46035255296.0,
-            "6310": 46035255296.0,
-            "6315": 46035255296.0,
-            "6320": 46035255296.0,
-            "6325": 46035255296.0,
-            "6330": 46035255296.0,
-            "6335": 46035255296.0,
-            "6340": 46035255296.0,
-            "6345": 46035255296.0,
-            "6350": 46035255296.0,
-            "6355": 46035255296.0,
-            "6360": 46035255296.0,
-            "6365": 46035255296.0,
-            "6370": 46035255296.0,
-            "6375": 46035255296.0,
-            "6380": 46035255296.0,
-            "6385": 46035255296.0,
-            "6390": 46035255296.0,
-            "6395": 46035255296.0,
-            "6400": 46035255296.0,
-            "6405": 46035255296.0,
-            "6410": 46035255296.0,
-            "6415": 46035255296.0,
-            "6420": 46035255296.0,
-            "6425": 46035255296.0,
-            "6430": 46035255296.0,
-            "6435": 46035255296.0,
-            "6440": 46035255296.0,
-            "6445": 46035255296.0,
-            "6450": 46035255296.0,
-            "6455": 46035255296.0,
-            "6460": 46035255296.0,
-            "6465": 46035255296.0,
-            "6470": 46035255296.0,
-            "6475": 46035255296.0,
-            "6480": 46035255296.0,
-            "6485": 46035255296.0,
-            "6490": 46035255296.0,
-            "6495": 46035255296.0,
-            "6500": 46035255296.0,
-            "6505": 46064041984.0,
-            "6510": 46064041984.0,
-            "6515": 46064041984.0,
-            "6520": 46064041984.0,
-            "6525": 46064041984.0,
-            "6530": 46064041984.0,
-            "6535": 46064041984.0,
-            "6540": 46064041984.0,
-            "6545": 46064041984.0,
-            "6550": 46064041984.0,
-            "6555": 46064041984.0,
-            "6560": 46064041984.0,
-            "6565": 46064041984.0,
-            "6570": 46064041984.0,
-            "6575": 46064041984.0,
-            "6580": 46064041984.0,
-            "6585": 46064041984.0,
-            "6590": 46064041984.0,
-            "6595": 46064041984.0,
-            "6600": 46064041984.0,
-            "6605": 46064041984.0,
-            "6610": 46064041984.0,
-            "6615": 46064041984.0,
-            "6620": 46064041984.0,
-            "6625": 46064041984.0,
-            "6630": 46064041984.0,
-            "6635": 46064041984.0,
-            "6640": 46064041984.0,
-            "6645": 46064041984.0,
-            "6650": 46064041984.0,
-            "6655": 46064041984.0,
-            "6660": 46064041984.0,
-            "6665": 46064041984.0,
-            "6670": 46064041984.0,
-            "6675": 46064041984.0,
-            "6680": 46064041984.0,
-            "6685": 46064041984.0,
-            "6690": 46064041984.0,
-            "6695": 46064041984.0,
-            "6700": 46064041984.0,
-            "6705": 46064041984.0,
-            "6710": 46064041984.0,
-            "6715": 46064041984.0,
-            "6720": 46064041984.0,
-            "6725": 46064041984.0,
-            "6730": 46064041984.0,
-            "6735": 46064041984.0,
-            "6740": 46064041984.0,
-            "6745": 46064041984.0,
-            "6750": 46064041984.0,
-            "6755": 46064041984.0,
-            "6760": 46064041984.0,
-            "6765": 46064041984.0,
-            "6770": 46064041984.0,
-            "6775": 46064041984.0,
-            "6780": 46064041984.0,
-            "6785": 46064041984.0,
-            "6790": 46064041984.0,
-            "6795": 46064041984.0,
-            "6800": 46064041984.0,
-            "6805": 46064041984.0,
-            "6810": 46064041984.0,
-            "6815": 46064041984.0,
-            "6820": 46064041984.0,
-            "6825": 46064041984.0,
-            "6830": 46064041984.0,
-            "6835": 46064041984.0,
-            "6840": 46064041984.0,
-            "6845": 46064041984.0,
-            "6850": 46064041984.0,
-            "6855": 46064041984.0,
-            "6860": 46064041984.0,
-            "6865": 46064041984.0,
-            "6870": 46064041984.0,
-            "6875": 46064041984.0,
-            "6880": 46064041984.0,
-            "6885": 46064041984.0,
-            "6890": 46064041984.0,
-            "6895": 46064041984.0,
-            "6900": 46064041984.0,
-            "6905": 46064041984.0,
-            "6910": 46064041984.0,
-            "6915": 46064041984.0,
-            "6920": 46064041984.0,
-            "6925": 46064041984.0,
-            "6930": 46064041984.0,
-            "6935": 46064041984.0,
-            "6940": 46064041984.0,
-            "6945": 46064041984.0,
-            "6950": 46064041984.0,
-            "6955": 46064041984.0,
-            "6960": 46064041984.0,
-            "6965": 46064041984.0,
-            "6970": 46064041984.0,
-            "6975": 46064041984.0,
-            "6980": 46064041984.0,
-            "6985": 46064041984.0,
-            "6990": 46064041984.0,
-            "6995": 46064041984.0,
-            "7000": 46064041984.0,
-            "7005": 46064041984.0,
-            "7010": 46064041984.0,
-            "7015": 46064041984.0,
-            "7020": 46064041984.0,
-            "7025": 46064041984.0,
-            "7030": 46108979200.0,
-            "7035": 46108979200.0,
-            "7040": 46108979200.0,
-            "7045": 46108979200.0,
-            "7050": 46065532928.0,
-            "7055": 46065532928.0,
-            "7060": 46065532928.0,
-            "7065": 46065532928.0,
-            "7070": 46065532928.0,
-            "7075": 46065532928.0,
-            "7080": 46065532928.0,
-            "7085": 46065532928.0,
-            "7090": 46065532928.0,
-            "7095": 46065532928.0,
-            "7100": 46065532928.0,
-            "7105": 46065532928.0,
-            "7110": 46065532928.0,
-            "7115": 46065532928.0,
-            "7120": 46065532928.0,
-            "7125": 46065532928.0,
-            "7130": 46065532928.0,
-            "7135": 46065532928.0,
-            "7140": 46065532928.0,
-            "7145": 46065532928.0,
-            "7150": 46065532928.0,
-            "7155": 46065532928.0,
-            "7160": 46065532928.0,
-            "7165": 46065532928.0,
-            "7170": 46065532928.0,
-            "7175": 46065532928.0,
-            "7180": 46065532928.0,
-            "7185": 46065532928.0,
-            "7190": 46065532928.0,
-            "7195": 46065532928.0,
-            "7200": 46065532928.0,
-            "7205": 46065532928.0,
-            "7210": 46065532928.0,
-            "7215": 46065532928.0,
-            "7220": 46065532928.0,
-            "7225": 46065532928.0,
-            "7230": 46065532928.0,
-            "7235": 46065532928.0,
-            "7240": 46065532928.0,
-            "7245": 46065532928.0,
-            "7250": 46065532928.0,
-            "7255": 46065532928.0,
-            "7260": 46065532928.0,
-            "7265": 46065532928.0,
-            "7270": 46065532928.0,
-            "7275": 46065532928.0,
-            "7280": 46065532928.0,
-            "7285": 46065532928.0,
-            "7290": 46065532928.0,
-            "7295": 46065532928.0,
-            "7300": 46065532928.0,
-            "7305": 46065532928.0,
-            "7310": 46065532928.0,
-            "7315": 46065532928.0,
-            "7320": 46065532928.0,
-            "7325": 46065532928.0,
-            "7330": 46065532928.0,
-            "7335": 46065532928.0,
-            "7340": 46065532928.0,
-            "7345": 46065532928.0,
-            "7350": 46065532928.0,
-            "7355": 46065532928.0,
-            "7360": 46065532928.0,
-            "7365": 46065532928.0,
-            "7370": 46065532928.0,
-            "7375": 46065532928.0,
-            "7380": 46065532928.0,
-            "7385": 46065532928.0,
-            "7390": 46065532928.0,
-            "7395": 46065532928.0,
-            "7400": 46065532928.0,
-            "7405": 46065532928.0,
-            "7410": 46065532928.0,
-            "7415": 46065532928.0,
-            "7420": 46065532928.0,
-            "7425": 46065532928.0,
-            "7430": 46065532928.0,
-            "7435": 46065532928.0,
-            "7440": 46065532928.0,
-            "7445": 46065532928.0,
-            "7450": 46065532928.0,
-            "7455": 46065532928.0,
-            "7460": 46065532928.0,
-            "7465": 46065532928.0,
-            "7470": 46065532928.0,
-            "7475": 46065532928.0,
-            "7480": 46065532928.0,
-            "7485": 46065532928.0,
-            "7490": 46065532928.0,
-            "7495": 46065532928.0,
-            "7500": 46065532928.0,
-            "7505": 46065532928.0,
-            "7510": 46065532928.0,
-            "7515": 46065532928.0,
-            "7520": 45618061312.0,
-            "7525": 45747933184.0,
-            "7530": 45825024000.0,
-            "7535": 45825024000.0,
-            "7540": 45825024000.0,
-            "7545": 45910597632.0,
-            "7550": 45910597632.0,
-            "7555": 45910597632.0,
-            "7560": 45910597632.0,
-            "7565": 45910597632.0,
-            "7570": 45910597632.0,
-            "7575": 45910597632.0,
-            "7580": 45910597632.0,
-            "7585": 45910597632.0,
-            "7590": 45910597632.0,
-            "7595": 45916950528.0,
-            "7600": 45924253696.0,
-            "7605": 45924253696.0,
-            "7610": 45924253696.0,
-            "7615": 45924253696.0,
-            "7620": 45924253696.0,
-            "7625": 45924253696.0,
-            "7630": 45924253696.0,
-            "7635": 45924253696.0,
-            "7640": 45924253696.0,
-            "7645": 45944950784.0,
-            "7650": 45944950784.0,
-            "7655": 45944950784.0,
-            "7660": 45944950784.0,
-            "7665": 45944950784.0,
-            "7670": 45944950784.0,
-            "7675": 45944950784.0,
-            "7680": 45944950784.0,
-            "7685": 45944950784.0,
-            "7690": 45944950784.0,
-            "7695": 45944950784.0,
-            "7700": 45944950784.0,
-            "7705": 45944950784.0,
-            "7710": 45944950784.0,
-            "7715": 45944950784.0,
-            "7720": 45944950784.0,
-            "7725": 45944950784.0,
-            "7730": 45944950784.0,
-            "7735": 45944950784.0,
-            "7740": 45944950784.0,
-            "7745": 45944950784.0,
-            "7750": 45944950784.0,
-            "7755": 45944950784.0,
-            "7760": 45944950784.0,
-            "7765": 45944950784.0,
-            "7770": 45944950784.0,
-            "7775": 45944950784.0,
-            "7780": 45944950784.0,
-            "7785": 45944950784.0,
-            "7790": 45944950784.0,
-            "7795": 45944950784.0,
-            "7800": 45944950784.0,
-            "7805": 45944950784.0,
-            "7810": 45944950784.0,
-            "7815": 45944950784.0,
-            "7820": 45944950784.0,
-            "7825": 45944950784.0,
-            "7830": 45944950784.0,
-            "7835": 45944950784.0,
-            "7840": 45973135360.0,
-            "7845": 45973135360.0,
-            "7850": 46089904128.0,
-            "7855": 46089904128.0,
-            "7860": 46089904128.0,
-            "7865": 46089904128.0,
-            "7870": 46089904128.0,
-            "7875": 46089904128.0,
-            "7880": 46089904128.0,
-            "7885": 46089904128.0,
-            "7890": 46089904128.0,
-            "7895": 46089904128.0,
-            "7900": 46089904128.0,
-            "7905": 46089904128.0,
-            "7910": 46089904128.0,
-            "7915": 46089904128.0,
-            "7920": 46089904128.0,
-            "7925": 46089904128.0,
-            "7930": 46089904128.0,
-            "7935": 46089904128.0,
-            "7940": 46089904128.0,
-            "7945": 46089904128.0,
-            "7950": 46089904128.0,
-            "7955": 46089904128.0,
-            "7960": 46089904128.0,
-            "7965": 46089904128.0,
-            "7970": 46089904128.0,
-            "7975": 46089904128.0,
-            "7980": 46089904128.0,
-            "7985": 46089904128.0,
-            "7990": 46089904128.0,
-            "7995": 46089904128.0,
-            "8000": 46089904128.0,
-            "8005": 46089904128.0,
-            "8010": 46089904128.0,
-            "8015": 46089904128.0,
-            "8020": 46089904128.0,
-            "8025": 46089904128.0,
-            "8030": 46089904128.0,
-            "8035": 46089904128.0,
-            "8040": 46089904128.0,
-            "8045": 46089904128.0,
-            "8050": 46089904128.0,
-            "8055": 46089904128.0,
-            "8060": 46089904128.0,
-            "8065": 46089904128.0,
-            "8070": 46089904128.0,
-            "8075": 46089904128.0,
-            "8080": 46089904128.0,
-            "8085": 46089904128.0,
-            "8090": 46089904128.0,
-            "8095": 46089904128.0,
-            "8100": 46089904128.0,
-            "8105": 46089904128.0,
-            "8110": 46089904128.0,
-            "8115": 46089904128.0,
-            "8120": 46089904128.0,
-            "8125": 46089904128.0,
-            "8130": 46089904128.0,
-            "8135": 46089904128.0,
-            "8140": 46089904128.0,
-            "8145": 46089904128.0,
-            "8150": 46089904128.0,
-            "8155": 46089904128.0,
-            "8160": 46089904128.0,
-            "8165": 46089904128.0,
-            "8170": 46089904128.0,
-            "8175": 46089904128.0,
-            "8180": 46089904128.0,
-            "8185": 46089904128.0,
-            "8190": 46089904128.0,
-            "8195": 46089904128.0,
-            "8200": 46089904128.0,
-            "8205": 46089904128.0,
-            "8210": 46089904128.0,
-            "8215": 46089904128.0,
-            "8220": 46089904128.0,
-            "8225": 46089904128.0,
-            "8230": 46089904128.0,
-            "8235": 46089904128.0,
-            "8240": 46089904128.0,
-            "8245": 46089904128.0,
-            "8250": 46089904128.0,
-            "8255": 46089904128.0,
-            "8260": 46089904128.0,
-            "8265": 46089904128.0,
-            "8270": 46089904128.0,
-            "8275": 46089904128.0,
-            "8280": 46089904128.0,
-            "8285": 46089904128.0,
-            "8290": 46089904128.0,
-            "8295": 46089904128.0,
-            "8300": 46089904128.0,
-            "8305": 46089904128.0,
-            "8310": 46089904128.0,
-            "8315": 46089904128.0,
-            "8320": 46089904128.0,
-            "8325": 46089904128.0,
-            "8330": 46089904128.0,
-            "8335": 46089904128.0,
-            "8340": 46089904128.0,
-            "8345": 46089904128.0,
-            "8350": 46089904128.0,
-            "8355": 46089904128.0,
-            "8360": 46089904128.0,
-            "8365": 46089904128.0,
-            "8370": 46089904128.0,
-            "8375": 46089904128.0,
-            "8380": 46089904128.0,
-            "8385": 46089904128.0,
-            "8390": 46089904128.0,
-            "8395": 46089904128.0,
-            "8400": 46089904128.0,
-            "8405": 46089904128.0,
-            "8410": 46089904128.0,
-            "8415": 46089904128.0,
-            "8420": 46089904128.0,
-            "8425": 46089904128.0,
-            "8430": 46089904128.0,
-            "8435": 46089904128.0,
-            "8440": 46089904128.0,
-            "8445": 46089904128.0,
-            "8450": 46089904128.0,
-            "8455": 46089904128.0,
-            "8460": 46089904128.0,
-            "8465": 46089904128.0,
-            "8470": 46089904128.0,
-            "8475": 46089904128.0,
-            "8480": 46089904128.0,
-            "8485": 46089904128.0,
-            "8490": 46089904128.0,
-            "8495": 46089904128.0,
-            "8500": 46089904128.0,
-            "8505": 46089904128.0,
-            "8510": 46089904128.0,
-            "8515": 46089904128.0,
-            "8520": 46089904128.0,
-            "8525": 46089904128.0,
-            "8530": 45938114560.0,
-            "8535": 45938114560.0,
-            "8540": 45938114560.0,
-            "8545": 45938114560.0,
-            "8550": 45938114560.0,
-            "8555": 45938114560.0,
-            "8560": 45938114560.0,
-            "8565": 45938114560.0,
-            "8570": 45938114560.0,
-            "8575": 45938114560.0,
-            "8580": 45938114560.0,
-            "8585": 45938114560.0,
-            "8590": 45950377984.0,
-            "8595": 45950377984.0,
-            "8600": 45950377984.0,
-            "8605": 45950377984.0,
-            "8610": 45950377984.0,
-            "8615": 45950377984.0,
-            "8620": 45950377984.0,
-            "8625": 45950377984.0,
-            "8630": 45950377984.0,
-            "8635": 45950377984.0,
-            "8640": 45950377984.0,
-            "8645": 45950377984.0,
-            "8650": 45950377984.0,
-            "8655": 45950377984.0,
-            "8660": 45950377984.0,
-            "8665": 45950377984.0,
-            "8670": 45955510272.0,
-            "8675": 45955510272.0,
-            "8680": 45955510272.0,
-            "8685": 45955510272.0,
-            "8690": 45991550976.0,
-            "8695": 45991550976.0,
-            "8700": 45991550976.0,
-            "8705": 45991550976.0,
-            "8710": 45991550976.0,
-            "8715": 45991550976.0,
-            "8720": 45991550976.0,
-            "8725": 45991550976.0,
-            "8730": 45991550976.0,
-            "8735": 45991550976.0,
-            "8740": 46068584448.0,
-            "8745": 46068584448.0,
-            "8750": 46068584448.0,
-            "8755": 46068584448.0,
-            "8760": 46068584448.0,
-            "8765": 46068584448.0,
-            "8770": 46068584448.0,
-            "8775": 46068584448.0,
-            "8780": 46068584448.0,
-            "8785": 46068584448.0,
-            "8790": 46068584448.0,
-            "8795": 46068584448.0,
-            "8800": 46068584448.0,
-            "8805": 46068584448.0,
-            "8810": 46068584448.0,
-            "8815": 46068584448.0,
-            "8820": 46068584448.0,
-            "8825": 46068584448.0,
-            "8830": 46068584448.0,
-            "8835": 46068584448.0,
-            "8840": 46068584448.0,
-            "8845": 46068584448.0,
-            "8850": 46068584448.0,
-            "8855": 46184767488.0,
-            "8860": 46184767488.0,
-            "8865": 46184767488.0,
-            "8870": 46184767488.0,
-            "8875": 46184767488.0,
-            "8880": 46184767488.0,
-            "8885": 46184767488.0,
-            "8890": 46184767488.0,
-            "8895": 46184767488.0,
-            "8900": 46184767488.0,
-            "8905": 46184767488.0,
-            "8910": 46184767488.0,
-            "8915": 46184767488.0,
-            "8920": 46184767488.0,
-            "8925": 46184767488.0,
-            "8930": 46184767488.0,
-            "8935": 46184767488.0,
-            "8940": 46184767488.0,
-            "8945": 46184767488.0,
-            "8950": 46184767488.0,
-            "8955": 46184767488.0,
-            "8960": 46184767488.0,
-            "8965": 46184767488.0,
-            "8970": 46184767488.0,
-            "8975": 46184767488.0,
-            "8980": 46184767488.0,
-            "8985": 46184767488.0,
-            "8990": 46184767488.0,
-            "8995": 46184767488.0,
-            "9000": 46184767488.0,
-            "9005": 46184767488.0,
-            "9010": 46184767488.0,
-            "9015": 46184767488.0,
-            "9020": 46184767488.0,
-            "9025": 46184767488.0,
-            "9030": 46184767488.0,
-            "9035": 46184767488.0,
-            "9040": 46184767488.0,
-            "9045": 46184767488.0,
-            "9050": 46184767488.0,
-            "9055": 46184767488.0,
-            "9060": 46184767488.0,
-            "9065": 46184767488.0,
-            "9070": 46184767488.0,
-            "9075": 46184767488.0,
-            "9080": 46184767488.0,
-            "9085": 46184767488.0,
-            "9090": 46184767488.0,
-            "9095": 46184767488.0,
-            "9100": 46184767488.0,
-            "9105": 46184767488.0,
-            "9110": 46184767488.0,
-            "9115": 46184767488.0,
-            "9120": 46184767488.0,
-            "9125": 46184767488.0,
-            "9130": 46184767488.0,
-            "9135": 46184767488.0,
-            "9140": 46184767488.0,
-            "9145": 46184767488.0,
-            "9150": 46184767488.0,
-            "9155": 46184767488.0,
-            "9160": 46184767488.0,
-            "9165": 46184767488.0,
-            "9170": 46184767488.0,
-            "9175": 46184767488.0,
-            "9180": 46184767488.0,
-            "9185": 46184767488.0,
-            "9190": 46184767488.0,
-            "9195": 46184767488.0,
-            "9200": 46184767488.0,
-            "9205": 46184767488.0,
-            "9210": 46184767488.0,
-            "9215": 46184767488.0,
-            "9220": 46184767488.0,
-            "9225": 46184767488.0,
-            "9230": 46184767488.0,
-            "9235": 46184767488.0,
-            "9240": 46184767488.0,
-            "9245": 46184767488.0,
-            "9250": 46184767488.0,
-            "9255": 46184767488.0,
-            "9260": 46184767488.0,
-            "9265": 46184767488.0,
-            "9270": 46184767488.0,
-            "9275": 46184767488.0,
-            "9280": 46184767488.0,
-            "9285": 46184767488.0,
-            "9290": 46184767488.0,
-            "9295": 46184767488.0,
-            "9300": 46184767488.0,
-            "9305": 46184767488.0,
-            "9310": 46184767488.0,
-            "9315": 46184767488.0,
-            "9320": 46184767488.0,
-            "9325": 46184767488.0,
-            "9330": 46184767488.0,
-            "9335": 46184767488.0,
-            "9340": 46184767488.0,
-            "9345": 46184767488.0,
-            "9350": 46184767488.0,
-            "9355": 46184767488.0,
-            "9360": 46184767488.0,
-            "9365": 46184767488.0,
-            "9370": 46184767488.0,
-            "9375": 46184767488.0,
-            "9380": 46184767488.0,
-            "9385": 46184767488.0,
-            "9390": 46184767488.0,
-            "9395": 46184767488.0,
-            "9400": 46184767488.0,
-            "9405": 46184767488.0,
-            "9410": 46184767488.0,
-            "9415": 46184767488.0,
-            "9420": 46184767488.0,
-            "9425": 46184767488.0,
-            "9430": 46184767488.0,
-            "9435": 46184767488.0,
-            "9440": 46184767488.0,
-            "9445": 46184767488.0,
-            "9450": 46184767488.0,
-            "9455": 46184767488.0,
-            "9460": 46184767488.0,
-            "9465": 46184767488.0,
-            "9470": 46184767488.0,
-            "9475": 46184767488.0,
-            "9480": 46184767488.0,
-            "9485": 46184767488.0,
-            "9490": 46184767488.0,
-            "9495": 46184767488.0,
-            "9500": 46184767488.0,
-            "9505": 46184767488.0,
-            "9510": 46184767488.0,
-            "9515": 46184767488.0,
-            "9520": 46184767488.0,
-            "9525": 46184767488.0,
-            "9530": 46184767488.0,
-            "9535": 46184767488.0
-        }
-    },
-    "mtp_1 loss": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 13.88878,
-            "5": 13.88979,
-            "10": 13.88767,
-            "15": 13.88576,
-            "20": 13.88068,
-            "25": 13.87774,
-            "30": 13.85566,
-            "35": 13.84855,
-            "40": 13.84546,
-            "45": 13.82693,
-            "50": 13.74828,
-            "55": 13.7249,
-            "60": 13.70841,
-            "65": 13.67571,
-            "70": 13.63981,
-            "75": 13.44327,
-            "80": 13.36054,
-            "85": 13.2835,
-            "90": 13.18641,
-            "95": 13.0505,
-            "100": 12.90733,
-            "105": 12.74689,
-            "110": 12.48525,
-            "115": 12.26801,
-            "120": 12.04358,
-            "125": 11.87011,
-            "130": 11.74911,
-            "135": 11.5841,
-            "140": 11.3494,
-            "145": 11.26997,
-            "150": 11.11919,
-            "155": 11.0211,
-            "160": 10.88133,
-            "165": 10.75162,
-            "170": 10.65694,
-            "175": 10.59566,
-            "180": 10.43546,
-            "185": 10.42441,
-            "190": 10.27183,
-            "195": 10.2539,
-            "200": 10.12718,
-            "205": 9.97472,
-            "210": 9.94271,
-            "215": 9.92122,
-            "220": 9.78944,
-            "225": 9.77014,
-            "230": 9.73,
-            "235": 9.64372,
-            "240": 9.57366,
-            "245": 9.50499,
-            "250": 9.43776,
-            "255": 9.37037,
-            "260": 9.29579,
-            "265": 9.2411,
-            "270": 9.15629,
-            "275": 9.12851,
-            "280": 9.10516,
-            "285": 9.09815,
-            "290": 9.01068,
-            "295": 8.94828,
-            "300": 8.83207,
-            "305": 8.80663,
-            "310": 8.74389,
-            "315": 8.71813,
-            "320": 8.68425,
-            "325": 8.58706,
-            "330": 8.56208,
-            "335": 8.53307,
-            "340": 8.52937,
-            "345": 8.41091,
-            "350": 8.39973,
-            "355": 8.29759,
-            "360": 8.38348,
-            "365": 8.28981,
-            "370": 8.2833,
-            "375": 8.22588,
-            "380": 8.18359,
-            "385": 8.16998,
-            "390": 8.1467,
-            "395": 8.09789,
-            "400": 8.01583,
-            "405": 8.01349,
-            "410": 8.00377,
-            "415": 7.95012,
-            "420": 7.93109,
-            "425": 7.88677,
-            "430": 7.81895,
-            "435": 7.82989,
-            "440": 7.77278,
-            "445": 7.7493,
-            "450": 7.67877,
-            "455": 7.7063,
-            "460": 7.6532,
-            "465": 7.6329,
-            "470": 7.59885,
-            "475": 7.61277,
-            "480": 7.48436,
-            "485": 7.53153,
-            "490": 7.48574,
-            "495": 7.4714,
-            "500": 7.41282,
-            "505": 7.41932,
-            "510": 7.38698,
-            "515": 7.35645,
-            "520": 7.35102,
-            "525": 7.32559,
-            "530": 7.32588,
-            "535": 7.30357,
-            "540": 7.2179,
-            "545": 7.24022,
-            "550": 7.27618,
-            "555": 7.30238,
-            "560": 7.23984,
-            "565": 7.16321,
-            "570": 7.17228,
-            "575": 7.18898,
-            "580": 7.11497,
-            "585": 7.11901,
-            "590": 7.06121,
-            "595": 7.04317,
-            "600": 7.06682,
-            "605": 7.06137,
-            "610": 7.01939,
-            "615": 7.078,
-            "620": 6.98113,
-            "625": 6.95612,
-            "630": 6.96104,
-            "635": 6.98871,
-            "640": 6.96819,
-            "645": 6.95817,
-            "650": 7.00625,
-            "655": 7.00242,
-            "660": 6.89823,
-            "665": 6.88159,
-            "670": 6.84888,
-            "675": 6.93827,
-            "680": 6.89638,
-            "685": 6.85679,
-            "690": 6.83445,
-            "695": 6.79719,
-            "700": 6.79183,
-            "705": 6.78625,
-            "710": 6.82275,
-            "715": 6.82665,
-            "720": 6.71137,
-            "725": 6.76643,
-            "730": 6.75579,
-            "735": 6.75515,
-            "740": 6.70045,
-            "745": 6.67565,
-            "750": 6.73564,
-            "755": 6.65767,
-            "760": 6.66496,
-            "765": 6.65951,
-            "770": 6.68075,
-            "775": 6.65453,
-            "780": 6.62427,
-            "785": 6.64321,
-            "790": 6.59399,
-            "795": 6.59812,
-            "800": 6.5878,
-            "805": 6.65391,
-            "810": 6.51946,
-            "815": 6.5419,
-            "820": 6.55134,
-            "825": 6.55855,
-            "830": 6.57041,
-            "835": 6.52603,
-            "840": 6.49033,
-            "845": 6.54438,
-            "850": 6.49874,
-            "855": 6.49335,
-            "860": 6.49024,
-            "865": 6.49642,
-            "870": 6.46222,
-            "875": 6.51054,
-            "880": 6.4748,
-            "885": 6.43786,
-            "890": 6.51246,
-            "895": 6.39629,
-            "900": 6.41895,
-            "905": 6.44341,
-            "910": 6.40617,
-            "915": 6.38978,
-            "920": 6.38772,
-            "925": 6.37391,
-            "930": 6.40825,
-            "935": 6.39755,
-            "940": 6.34172,
-            "945": 6.36869,
-            "950": 6.3953,
-            "955": 6.34893,
-            "960": 6.35406,
-            "965": 6.25416,
-            "970": 6.32381,
-            "975": 6.31262,
-            "980": 6.28797,
-            "985": 6.29222,
-            "990": 6.34527,
-            "995": 6.26326,
-            "1000": 6.28434,
-            "1005": 6.23155,
-            "1010": 6.26712,
-            "1015": 6.29352,
-            "1020": 6.20454,
-            "1025": 6.21082,
-            "1030": 6.20913,
-            "1035": 6.29924,
-            "1040": 6.22531,
-            "1045": 6.19943,
-            "1050": 6.2267,
-            "1055": 6.21777,
-            "1060": 6.1673,
-            "1065": 6.15758,
-            "1070": 6.19281,
-            "1075": 6.19093,
-            "1080": 6.19319,
-            "1085": 6.19606,
-            "1090": 6.17796,
-            "1095": 6.181,
-            "1100": 6.1397,
-            "1105": 6.11513,
-            "1110": 6.17787,
-            "1115": 6.11231,
-            "1120": 6.05286,
-            "1125": 6.08699,
-            "1130": 6.14167,
-            "1135": 6.09531,
-            "1140": 6.08221,
-            "1145": 6.06731,
-            "1150": 6.09458,
-            "1155": 6.06298,
-            "1160": 6.04607,
-            "1165": 6.09676,
-            "1170": 6.07336,
-            "1175": 6.04568,
-            "1180": 6.05058,
-            "1185": 6.04124,
-            "1190": 6.04961,
-            "1195": 6.02949,
-            "1200": 5.97329,
-            "1205": 6.07601,
-            "1210": 5.93751,
-            "1215": 5.98403,
-            "1220": 6.06306,
-            "1225": 5.95152,
-            "1230": 5.99877,
-            "1235": 5.95912,
-            "1240": 5.99322,
-            "1245": 5.97187,
-            "1250": 5.95299,
-            "1255": 5.94742,
-            "1260": 5.95227,
-            "1265": 5.93352,
-            "1270": 5.90818,
-            "1275": 5.96805,
-            "1280": 5.90416,
-            "1285": 5.92308,
-            "1290": 5.90725,
-            "1295": 5.92,
-            "1300": 5.9267,
-            "1305": 5.90057,
-            "1310": 5.83908,
-            "1315": 5.8992,
-            "1320": 5.89614,
-            "1325": 5.8271,
-            "1330": 5.88462,
-            "1335": 5.8531,
-            "1340": 5.91994,
-            "1345": 5.86667,
-            "1350": 5.84738,
-            "1355": 5.84415,
-            "1360": 5.85216,
-            "1365": 5.84478,
-            "1370": 5.79663,
-            "1375": 5.80667,
-            "1380": 5.86219,
-            "1385": 5.81826,
-            "1390": 5.81231,
-            "1395": 5.8299,
-            "1400": 5.83135,
-            "1405": 5.82032,
-            "1410": 5.78518,
-            "1415": 5.77017,
-            "1420": 5.8049,
-            "1425": 5.79565,
-            "1430": 5.83189,
-            "1435": 5.74562,
-            "1440": 5.76408,
-            "1445": 5.8071,
-            "1450": 5.78859,
-            "1455": 5.80534,
-            "1460": 5.75975,
-            "1465": 5.76379,
-            "1470": 5.8044,
-            "1475": 5.76985,
-            "1480": 5.77563,
-            "1485": 5.72396,
-            "1490": 5.72354,
-            "1495": 5.74538,
-            "1500": 5.75109,
-            "1505": 5.72321,
-            "1510": 5.74832,
-            "1515": 5.67052,
-            "1520": 5.70302,
-            "1525": 5.67385,
-            "1530": 5.69497,
-            "1535": 5.68565,
-            "1540": 5.672,
-            "1545": 5.7178,
-            "1550": 5.72274,
-            "1555": 5.70942,
-            "1560": 5.65211,
-            "1565": 5.69926,
-            "1570": 5.71179,
-            "1575": 5.6613,
-            "1580": 5.69275,
-            "1585": 5.67221,
-            "1590": 5.66087,
-            "1595": 5.63673,
-            "1600": 5.70849,
-            "1605": 5.64113,
-            "1610": 5.64353,
-            "1615": 5.63334,
-            "1620": 5.65496,
-            "1625": 5.64982,
-            "1630": 5.62727,
-            "1635": 5.67706,
-            "1640": 5.62761,
-            "1645": 5.6449,
-            "1650": 5.63803,
-            "1655": 5.62499,
-            "1660": 5.61278,
-            "1665": 5.60116,
-            "1670": 5.61214,
-            "1675": 5.62193,
-            "1680": 5.56155,
-            "1685": 5.57098,
-            "1690": 5.55098,
-            "1695": 5.55521,
-            "1700": 5.60178,
-            "1705": 5.57706,
-            "1710": 5.58407,
-            "1715": 5.54721,
-            "1720": 5.52704,
-            "1725": 5.56718,
-            "1730": 5.53148,
-            "1735": 5.58307,
-            "1740": 5.52337,
-            "1745": 5.55772,
-            "1750": 5.53213,
-            "1755": 5.5301,
-            "1760": 5.55304,
-            "1765": 5.5132,
-            "1770": 5.522,
-            "1775": 5.52704,
-            "1780": 5.53997,
-            "1785": 5.48896,
-            "1790": 5.52187,
-            "1795": 5.52448,
-            "1800": 5.4698,
-            "1805": 5.46326,
-            "1810": 5.47869,
-            "1815": 5.48464,
-            "1820": 5.48466,
-            "1825": 5.48352,
-            "1830": 5.46909,
-            "1835": 5.46355,
-            "1840": 5.46633,
-            "1845": 5.44723,
-            "1850": 5.42996,
-            "1855": 5.4834,
-            "1860": 5.43502,
-            "1865": 5.44302,
-            "1870": 5.43258,
-            "1875": 5.42823,
-            "1880": 5.491,
-            "1885": 5.45039,
-            "1890": 5.44132,
-            "1895": 5.38084,
-            "1900": 5.42123,
-            "1905": 5.41299,
-            "1910": 5.43539,
-            "1915": 5.4013,
-            "1920": 5.37729,
-            "1925": 5.4085,
-            "1930": 5.37579,
-            "1935": 5.39731,
-            "1940": 5.3727,
-            "1945": 5.4174,
-            "1950": 5.45899,
-            "1955": 5.39197,
-            "1960": 5.39342,
-            "1965": 5.34213,
-            "1970": 5.34023,
-            "1975": 5.40413,
-            "1980": 5.35398,
-            "1985": 5.37376,
-            "1990": 5.39658,
-            "1995": 5.37398,
-            "2000": 5.38469,
-            "2005": 5.42838,
-            "2010": 5.32884,
-            "2015": 5.32047,
-            "2020": 5.32991,
-            "2025": 5.37403,
-            "2030": 5.31228,
-            "2035": 5.33119,
-            "2040": 5.29466,
-            "2045": 5.38332,
-            "2050": 5.35716,
-            "2055": 5.33062,
-            "2060": 5.32903,
-            "2065": 5.29751,
-            "2070": 5.29985,
-            "2075": 5.32708,
-            "2080": 5.29709,
-            "2085": 5.32918,
-            "2090": 5.24905,
-            "2095": 5.29587,
-            "2100": 5.25777,
-            "2105": 5.28625,
-            "2110": 5.28042,
-            "2115": 5.28102,
-            "2120": 5.2839,
-            "2125": 5.24699,
-            "2130": 5.25602,
-            "2135": 5.25599,
-            "2140": 5.26607,
-            "2145": 5.22772,
-            "2150": 5.24774,
-            "2155": 5.22588,
-            "2160": 5.24123,
-            "2165": 5.22937,
-            "2170": 5.26626,
-            "2175": 5.2603,
-            "2180": 5.24294,
-            "2185": 5.24675,
-            "2190": 5.22691,
-            "2195": 5.20127,
-            "2200": 5.20409,
-            "2205": 5.2127,
-            "2210": 5.25738,
-            "2215": 5.30103,
-            "2220": 5.24446,
-            "2225": 5.2194,
-            "2230": 5.21789,
-            "2235": 5.25766,
-            "2240": 5.16329,
-            "2245": 5.1607,
-            "2250": 5.18607,
-            "2255": 5.19635,
-            "2260": 5.13701,
-            "2265": 5.21276,
-            "2270": 5.14278,
-            "2275": 5.19722,
-            "2280": 5.17159,
-            "2285": 5.18798,
-            "2290": 5.17456,
-            "2295": 5.18141,
-            "2300": 5.17912,
-            "2305": 5.15551,
-            "2310": 5.1834,
-            "2315": 5.12144,
-            "2320": 5.17039,
-            "2325": 5.14984,
-            "2330": 5.15156,
-            "2335": 5.13195,
-            "2340": 5.13852,
-            "2345": 5.18732,
-            "2350": 5.12945,
-            "2355": 5.11891,
-            "2360": 5.10445,
-            "2365": 5.11898,
-            "2370": 5.10258,
-            "2375": 5.11122,
-            "2380": 5.05395,
-            "2385": 5.09747,
-            "2390": 5.11702,
-            "2395": 5.1322,
-            "2400": 5.07944,
-            "2405": 5.06236,
-            "2410": 5.11554,
-            "2415": 5.09106,
-            "2420": 5.10878,
-            "2425": 5.06863,
-            "2430": 5.09273,
-            "2435": 5.08666,
-            "2440": 5.07515,
-            "2445": 5.08608,
-            "2450": 5.04943,
-            "2455": 5.09523,
-            "2460": 5.04536,
-            "2465": 5.08334,
-            "2470": 5.07644,
-            "2475": 5.11246,
-            "2480": 5.02872,
-            "2485": 5.05906,
-            "2490": 5.05297,
-            "2495": 5.04377,
-            "2500": 5.04447,
-            "2505": 5.05124,
-            "2510": 5.0909,
-            "2515": 5.08005,
-            "2520": 5.02414,
-            "2525": 5.03617,
-            "2530": 5.05281,
-            "2535": 5.04127,
-            "2540": 5.04342,
-            "2545": 5.05498,
-            "2550": 4.99288,
-            "2555": 5.05988,
-            "2560": 5.03403,
-            "2565": 5.00279,
-            "2570": 5.02524,
-            "2575": 4.98811,
-            "2580": 5.00235,
-            "2585": 4.98259,
-            "2590": 5.00195,
-            "2595": 4.95577,
-            "2600": 4.99616,
-            "2605": 5.01565,
-            "2610": 5.00846,
-            "2615": 4.9779,
-            "2620": 4.96,
-            "2625": 4.99167,
-            "2630": 4.92069,
-            "2635": 5.00179,
-            "2640": 5.00217,
-            "2645": 4.95857,
-            "2650": 4.98056,
-            "2655": 4.97276,
-            "2660": 4.91658,
-            "2665": 5.00931,
-            "2670": 4.95271,
-            "2675": 4.92627,
-            "2680": 4.95939,
-            "2685": 4.9606,
-            "2690": 4.92299,
-            "2695": 4.99925,
-            "2700": 4.90798,
-            "2705": 4.92161,
-            "2710": 4.9625,
-            "2715": 4.94083,
-            "2720": 4.97062,
-            "2725": 4.91977,
-            "2730": 4.9445,
-            "2735": 4.9369,
-            "2740": 4.92939,
-            "2745": 4.89678,
-            "2750": 4.93832,
-            "2755": 4.94144,
-            "2760": 4.94244,
-            "2765": 4.91315,
-            "2770": 4.95527,
-            "2775": 4.90029,
-            "2780": 4.93753,
-            "2785": 4.91159,
-            "2790": 4.93952,
-            "2795": 4.89812,
-            "2800": 4.84327,
-            "2805": 4.89103,
-            "2810": 4.88284,
-            "2815": 4.89434,
-            "2820": 4.93504,
-            "2825": 4.92479,
-            "2830": 4.90086,
-            "2835": 4.90451,
-            "2840": 4.89553,
-            "2845": 4.87238,
-            "2850": 4.90777,
-            "2855": 4.83628,
-            "2860": 4.89239,
-            "2865": 4.90134,
-            "2870": 4.89048,
-            "2875": 4.90822,
-            "2880": 4.82774,
-            "2885": 4.8758,
-            "2890": 4.84909,
-            "2895": 4.88906,
-            "2900": 4.84436,
-            "2905": 4.85096,
-            "2910": 4.84745,
-            "2915": 4.89554,
-            "2920": 4.87192,
-            "2925": 4.84408,
-            "2930": 4.83304,
-            "2935": 4.83856,
-            "2940": 4.8364,
-            "2945": 4.80087,
-            "2950": 4.79094,
-            "2955": 4.79257,
-            "2960": 4.81394,
-            "2965": 4.82244,
-            "2970": 4.83033,
-            "2975": 4.843,
-            "2980": 4.78708,
-            "2985": 4.83546,
-            "2990": 4.84632,
-            "2995": 4.79479,
-            "3000": 4.79957,
-            "3005": 4.7852,
-            "3010": 4.81747,
-            "3015": 4.77707,
-            "3020": 4.79613,
-            "3025": 4.80689,
-            "3030": 4.81521,
-            "3035": 4.81107,
-            "3040": 4.83014,
-            "3045": 4.81253,
-            "3050": 4.78854,
-            "3055": 4.79109,
-            "3060": 4.77291,
-            "3065": 4.80026,
-            "3070": 4.82011,
-            "3075": 4.75177,
-            "3080": 4.78059,
-            "3085": 4.7825,
-            "3090": 4.76596,
-            "3095": 4.80833,
-            "3100": 4.79656,
-            "3105": 4.77177,
-            "3110": 4.76085,
-            "3115": 4.71609,
-            "3120": 4.78235,
-            "3125": 4.74714,
-            "3130": 4.75497,
-            "3135": 4.75435,
-            "3140": 4.7318,
-            "3145": 4.71606,
-            "3150": 4.74842,
-            "3155": 4.78313,
-            "3160": 4.765,
-            "3165": 4.75911,
-            "3170": 4.7541,
-            "3175": 4.746,
-            "3180": 4.73371,
-            "3185": 4.70655,
-            "3190": 4.70906,
-            "3195": 4.70876,
-            "3200": 4.67795,
-            "3205": 4.72527,
-            "3210": 4.67973,
-            "3215": 4.71138,
-            "3220": 4.67941,
-            "3225": 4.71501,
-            "3230": 4.698,
-            "3235": 4.73415,
-            "3240": 4.68214,
-            "3245": 4.6954,
-            "3250": 4.64543,
-            "3255": 4.69551,
-            "3260": 4.67926,
-            "3265": 4.72582,
-            "3270": 4.70744,
-            "3275": 4.65457,
-            "3280": 4.68021,
-            "3285": 4.69583,
-            "3290": 4.66845,
-            "3295": 4.67202,
-            "3300": 4.66858,
-            "3305": 4.67172,
-            "3310": 4.66314,
-            "3315": 4.70829,
-            "3320": 4.64885,
-            "3325": 4.65812,
-            "3330": 4.64245,
-            "3335": 4.65293,
-            "3340": 4.62608,
-            "3345": 4.64548,
-            "3350": 4.65071,
-            "3355": 4.65765,
-            "3360": 4.64823,
-            "3365": 4.66194,
-            "3370": 4.63984,
-            "3375": 4.67722,
-            "3380": 4.61449,
-            "3385": 4.62869,
-            "3390": 4.60608,
-            "3395": 4.6967,
-            "3400": 4.64188,
-            "3405": 4.6721,
-            "3410": 4.60581,
-            "3415": 4.55337,
-            "3420": 4.61467,
-            "3425": 4.63228,
-            "3430": 4.66874,
-            "3435": 4.63419,
-            "3440": 4.65338,
-            "3445": 4.60093,
-            "3450": 4.59889,
-            "3455": 4.62429,
-            "3460": 4.58089,
-            "3465": 4.57689,
-            "3470": 4.59454,
-            "3475": 4.60079,
-            "3480": 4.59374,
-            "3485": 4.62356,
-            "3490": 4.60917,
-            "3495": 4.63221,
-            "3500": 4.59027,
-            "3505": 4.59844,
-            "3510": 4.59797,
-            "3515": 4.648,
-            "3520": 4.62554,
-            "3525": 4.57245,
-            "3530": 4.58587,
-            "3535": 4.58174,
-            "3540": 4.63653,
-            "3545": 4.56212,
-            "3550": 4.62056,
-            "3555": 4.55332,
-            "3560": 4.62414,
-            "3565": 4.55473,
-            "3570": 4.56696,
-            "3575": 4.53468,
-            "3580": 4.59878,
-            "3585": 4.58068,
-            "3590": 4.51872,
-            "3595": 4.58848,
-            "3600": 4.55395,
-            "3605": 4.53571,
-            "3610": 4.54008,
-            "3615": 4.56874,
-            "3620": 4.61691,
-            "3625": 4.55023,
-            "3630": 4.59867,
-            "3635": 4.50879,
-            "3640": 4.52782,
-            "3645": 4.56947,
-            "3650": 4.53552,
-            "3655": 4.54665,
-            "3660": 4.55228,
-            "3665": 4.58643,
-            "3670": 4.54047,
-            "3675": 4.55594,
-            "3680": 4.57348,
-            "3685": 4.49418,
-            "3690": 4.54299,
-            "3695": 4.49297,
-            "3700": 4.52866,
-            "3705": 4.50654,
-            "3710": 4.51966,
-            "3715": 4.53,
-            "3720": 4.50118,
-            "3725": 4.47886,
-            "3730": 4.4879,
-            "3735": 4.50546,
-            "3740": 4.49399,
-            "3745": 4.48041,
-            "3750": 4.51288,
-            "3755": 4.48915,
-            "3760": 4.50004,
-            "3765": 4.47669,
-            "3770": 4.48984,
-            "3775": 4.46969,
-            "3780": 4.45476,
-            "3785": 4.50898,
-            "3790": 4.42336,
-            "3795": 4.4846,
-            "3800": 4.46028,
-            "3805": 4.46023,
-            "3810": 4.42629,
-            "3815": 4.4806,
-            "3820": 4.4736,
-            "3825": 4.4803,
-            "3830": 4.46747,
-            "3835": 4.42638,
-            "3840": 4.52349,
-            "3845": 4.48225,
-            "3850": 4.42266,
-            "3855": 4.46223,
-            "3860": 4.48001,
-            "3865": 4.44144,
-            "3870": 4.50523,
-            "3875": 4.41439,
-            "3880": 4.42672,
-            "3885": 4.44983,
-            "3890": 4.43819,
-            "3895": 4.38007,
-            "3900": 4.43434,
-            "3905": 4.41283,
-            "3910": 4.42081,
-            "3915": 4.42082,
-            "3920": 4.41329,
-            "3925": 4.39336,
-            "3930": 4.41243,
-            "3935": 4.41903,
-            "3940": 4.41848,
-            "3945": 4.39397,
-            "3950": 4.46098,
-            "3955": 4.39087,
-            "3960": 4.43851,
-            "3965": 4.44901,
-            "3970": 4.39272,
-            "3975": 4.40242,
-            "3980": 4.37236,
-            "3985": 4.40832,
-            "3990": 4.40208,
-            "3995": 4.44335,
-            "4000": 4.38322,
-            "4005": 4.37255,
-            "4010": 4.40982,
-            "4015": 4.39813,
-            "4020": 4.43488,
-            "4025": 4.39111,
-            "4030": 4.44761,
-            "4035": 4.40548,
-            "4040": 4.43553,
-            "4045": 4.41155,
-            "4050": 4.40643,
-            "4055": 4.41393,
-            "4060": 4.40665,
-            "4065": 4.41291,
-            "4070": 4.34904,
-            "4075": 4.37708,
-            "4080": 4.35797,
-            "4085": 4.39736,
-            "4090": 4.37437,
-            "4095": 4.35826,
-            "4100": 4.37323,
-            "4105": 4.36208,
-            "4110": 4.32609,
-            "4115": 4.39421,
-            "4120": 4.31057,
-            "4125": 4.31168,
-            "4130": 4.39302,
-            "4135": 4.37289,
-            "4140": 4.31616,
-            "4145": 4.32788,
-            "4150": 4.37558,
-            "4155": 4.29766,
-            "4160": 4.35633,
-            "4165": 4.38157,
-            "4170": 4.32646,
-            "4175": 4.33285,
-            "4180": 4.32735,
-            "4185": 4.31953,
-            "4190": 4.31017,
-            "4195": 4.31525,
-            "4200": 4.31406,
-            "4205": 4.37,
-            "4210": 4.32695,
-            "4215": 4.3562,
-            "4220": 4.33701,
-            "4225": 4.32036,
-            "4230": 4.30579,
-            "4235": 4.35051,
-            "4240": 4.30872,
-            "4245": 4.31564,
-            "4250": 4.29999,
-            "4255": 4.31166,
-            "4260": 4.29019,
-            "4265": 4.30554,
-            "4270": 4.29954,
-            "4275": 4.36276,
-            "4280": 4.29798,
-            "4285": 4.33284,
-            "4290": 4.27741,
-            "4295": 4.30368,
-            "4300": 4.32594,
-            "4305": 4.29066,
-            "4310": 4.33408,
-            "4315": 4.3163,
-            "4320": 4.30571,
-            "4325": 4.32764,
-            "4330": 4.26525,
-            "4335": 4.30418,
-            "4340": 4.28838,
-            "4345": 4.23753,
-            "4350": 4.25927,
-            "4355": 4.33009,
-            "4360": 4.30543,
-            "4365": 4.30411,
-            "4370": 4.28149,
-            "4375": 4.24372,
-            "4380": 4.25559,
-            "4385": 4.23331,
-            "4390": 4.30895,
-            "4395": 4.27518,
-            "4400": 4.26254,
-            "4405": 4.23007,
-            "4410": 4.28048,
-            "4415": 4.26816,
-            "4420": 4.24916,
-            "4425": 4.29252,
-            "4430": 4.24244,
-            "4435": 4.29049,
-            "4440": 4.28601,
-            "4445": 4.24232,
-            "4450": 4.20719,
-            "4455": 4.26016,
-            "4460": 4.23459,
-            "4465": 4.25243,
-            "4470": 4.23841,
-            "4475": 4.2641,
-            "4480": 4.24909,
-            "4485": 4.23389,
-            "4490": 4.23593,
-            "4495": 4.17962,
-            "4500": 4.25444,
-            "4505": 4.22942,
-            "4510": 4.23965,
-            "4515": 4.19566,
-            "4520": 4.23113,
-            "4525": 4.19456,
-            "4530": 4.24001,
-            "4535": 4.20166,
-            "4540": 4.21127,
-            "4545": 4.23188,
-            "4550": 4.27088,
-            "4555": 4.2072,
-            "4560": 4.22378,
-            "4565": 4.15426,
-            "4570": 4.21606,
-            "4575": 4.1941,
-            "4580": 4.25747,
-            "4585": 4.22428,
-            "4590": 4.21266,
-            "4595": 4.17399,
-            "4600": 4.16313,
-            "4605": 4.2045,
-            "4610": 4.19939,
-            "4615": 4.24443,
-            "4620": 4.16447,
-            "4625": 4.19099,
-            "4630": 4.20991,
-            "4635": 4.18208,
-            "4640": 4.21078,
-            "4645": 4.20652,
-            "4650": 4.22758,
-            "4655": 4.19246,
-            "4660": 4.18248,
-            "4665": 4.193,
-            "4670": 4.23574,
-            "4675": 4.17989,
-            "4680": 4.20859,
-            "4685": 4.19688,
-            "4690": 4.1723,
-            "4695": 4.18485,
-            "4700": 4.16546,
-            "4705": 4.14067,
-            "4710": 4.20305,
-            "4715": 4.19002,
-            "4720": 4.14737,
-            "4725": 4.12216,
-            "4730": 4.17809,
-            "4735": 4.10178,
-            "4740": 4.14697,
-            "4745": 4.18779,
-            "4750": 4.13615,
-            "4755": 4.19424,
-            "4760": 4.1984,
-            "4765": 4.1461,
-            "4770": 4.14849,
-            "4775": 4.14773,
-            "4780": 4.15523,
-            "4785": 4.13664,
-            "4790": 4.19224,
-            "4795": 4.17628,
-            "4800": 4.13942,
-            "4805": 4.17839,
-            "4810": 4.1375,
-            "4815": 4.17167,
-            "4820": 4.12226,
-            "4825": 4.17474,
-            "4830": 4.16985,
-            "4835": 4.14976,
-            "4840": 4.15298,
-            "4845": 4.10968,
-            "4850": 4.17354,
-            "4855": 4.17639,
-            "4860": 4.11236,
-            "4865": 4.13759,
-            "4870": 4.13215,
-            "4875": 4.17643,
-            "4880": 4.1702,
-            "4885": 4.13029,
-            "4890": 4.1249,
-            "4895": 4.12403,
-            "4900": 4.09958,
-            "4905": 4.09173,
-            "4910": 4.09074,
-            "4915": 4.14665,
-            "4920": 4.12021,
-            "4925": 4.08814,
-            "4930": 4.09778,
-            "4935": 4.12094,
-            "4940": 4.04981,
-            "4945": 4.13369,
-            "4950": 4.07708,
-            "4955": 4.15684,
-            "4960": 4.11652,
-            "4965": 4.1151,
-            "4970": 4.09971,
-            "4975": 4.11736,
-            "4980": 4.12585,
-            "4985": 4.12754,
-            "4990": 4.09005,
-            "4995": 4.12916,
-            "5000": 4.05682,
-            "5005": 4.11701,
-            "5010": 4.10942,
-            "5015": 4.07584,
-            "5020": 4.05201,
-            "5025": 4.06082,
-            "5030": 4.10005,
-            "5035": 4.08177,
-            "5040": 4.0418,
-            "5045": 4.11064,
-            "5050": 4.06425,
-            "5055": 4.08995,
-            "5060": 4.03143,
-            "5065": 4.09666,
-            "5070": 4.07056,
-            "5075": 4.12386,
-            "5080": 4.07795,
-            "5085": 4.09595,
-            "5090": 4.07748,
-            "5095": 4.0424,
-            "5100": 4.0782,
-            "5105": 4.0809,
-            "5110": 4.08612,
-            "5115": 4.07663,
-            "5120": 4.09438,
-            "5125": 4.05976,
-            "5130": 4.06327,
-            "5135": 4.0488,
-            "5140": 4.06922,
-            "5145": 4.05942,
-            "5150": 4.07092,
-            "5155": 4.07553,
-            "5160": 4.05549,
-            "5165": 4.09766,
-            "5170": 3.96642,
-            "5175": 4.07515,
-            "5180": 4.03531,
-            "5185": 4.05861,
-            "5190": 4.08092,
-            "5195": 4.04601,
-            "5200": 4.06577,
-            "5205": 4.09747,
-            "5210": 4.01055,
-            "5215": 4.02373,
-            "5220": 4.02621,
-            "5225": 4.02349,
-            "5230": 4.06271,
-            "5235": 4.03585,
-            "5240": 4.02422,
-            "5245": 4.04177,
-            "5250": 4.04544,
-            "5255": 4.03173,
-            "5260": 4.04798,
-            "5265": 4.01495,
-            "5270": 3.98673,
-            "5275": 4.00519,
-            "5280": 4.02024,
-            "5285": 4.04277,
-            "5290": 4.00304,
-            "5295": 4.00093,
-            "5300": 4.02323,
-            "5305": 4.01012,
-            "5310": 4.0478,
-            "5315": 3.99571,
-            "5320": 4.03864,
-            "5325": 4.06497,
-            "5330": 3.99981,
-            "5335": 4.02122,
-            "5340": 3.9739,
-            "5345": 4.01424,
-            "5350": 4.0246,
-            "5355": 4.01714,
-            "5360": 3.9668,
-            "5365": 3.98455,
-            "5370": 4.02892,
-            "5375": 3.99384,
-            "5380": 3.98952,
-            "5385": 4.00787,
-            "5390": 3.99585,
-            "5395": 3.932,
-            "5400": 4.02192,
-            "5405": 3.94401,
-            "5410": 4.03103,
-            "5415": 3.94954,
-            "5420": 3.98108,
-            "5425": 3.96619,
-            "5430": 3.97462,
-            "5435": 4.00917,
-            "5440": 3.96082,
-            "5445": 3.96843,
-            "5450": 3.98078,
-            "5455": 3.96312,
-            "5460": 3.97781,
-            "5465": 4.03343,
-            "5470": 3.99301,
-            "5475": 3.92634,
-            "5480": 4.0001,
-            "5485": 3.96789,
-            "5490": 3.99381,
-            "5495": 3.99755,
-            "5500": 3.95394,
-            "5505": 3.9702,
-            "5510": 4.00139,
-            "5515": 3.97886,
-            "5520": 3.95723,
-            "5525": 4.01089,
-            "5530": 3.95723,
-            "5535": 3.99058,
-            "5540": 3.95888,
-            "5545": 3.97704,
-            "5550": 3.97005,
-            "5555": 3.93134,
-            "5560": 3.94203,
-            "5565": 3.98688,
-            "5570": 3.94409,
-            "5575": 3.97691,
-            "5580": 3.95423,
-            "5585": 3.89232,
-            "5590": 3.96662,
-            "5595": 3.91996,
-            "5600": 3.97099,
-            "5605": 3.87423,
-            "5610": 3.96509,
-            "5615": 3.9629,
-            "5620": 3.97882,
-            "5625": 3.95843,
-            "5630": 3.94884,
-            "5635": 3.92989,
-            "5640": 3.95308,
-            "5645": 3.91537,
-            "5650": 3.88759,
-            "5655": 3.91914,
-            "5660": 3.9101,
-            "5665": 3.92739,
-            "5670": 3.91107,
-            "5675": 3.94487,
-            "5680": 3.91238,
-            "5685": 3.92365,
-            "5690": 3.92517,
-            "5695": 3.953,
-            "5700": 3.88996,
-            "5705": 3.88995,
-            "5710": 3.87532,
-            "5715": 3.99623,
-            "5720": 3.94505,
-            "5725": 3.89527,
-            "5730": 3.94792,
-            "5735": 3.92817,
-            "5740": 3.92171,
-            "5745": 3.89897,
-            "5750": 3.92176,
-            "5755": 3.94672,
-            "5760": 3.92632,
-            "5765": 3.92024,
-            "5770": 3.95286,
-            "5775": 3.86965,
-            "5780": 3.91041,
-            "5785": 3.91605,
-            "5790": 3.9236,
-            "5795": 3.93068,
-            "5800": 3.86954,
-            "5805": 3.8764,
-            "5810": 3.92692,
-            "5815": 3.89083,
-            "5820": 3.84021,
-            "5825": 3.89285,
-            "5830": 3.85163,
-            "5835": 3.88292,
-            "5840": 3.89361,
-            "5845": 3.91293,
-            "5850": 3.90508,
-            "5855": 3.84956,
-            "5860": 3.87018,
-            "5865": 3.8979,
-            "5870": 3.85816,
-            "5875": 3.89604,
-            "5880": 3.88075,
-            "5885": 3.89965,
-            "5890": 3.90395,
-            "5895": 3.92339,
-            "5900": 3.85618,
-            "5905": 3.92033,
-            "5910": 3.88782,
-            "5915": 3.85158,
-            "5920": 3.88999,
-            "5925": 3.82174,
-            "5930": 3.88478,
-            "5935": 3.86887,
-            "5940": 3.89924,
-            "5945": 3.90324,
-            "5950": 3.88472,
-            "5955": 3.83758,
-            "5960": 3.91077,
-            "5965": 3.85295,
-            "5970": 3.90592,
-            "5975": 3.87131,
-            "5980": 3.94635,
-            "5985": 3.81828,
-            "5990": 3.91445,
-            "5995": 3.82666,
-            "6000": 3.86389,
-            "6005": 3.82737,
-            "6010": 3.84638,
-            "6015": 3.82528,
-            "6020": 3.84213,
-            "6025": 3.8812,
-            "6030": 3.82864,
-            "6035": 3.87549,
-            "6040": 3.85371,
-            "6045": 3.88892,
-            "6050": 3.86125,
-            "6055": 3.84398,
-            "6060": 3.86538,
-            "6065": 3.8955,
-            "6070": 3.844,
-            "6075": 3.79156,
-            "6080": 3.86497,
-            "6085": 3.82767,
-            "6090": 3.86054,
-            "6095": 3.85995,
-            "6100": 3.82399,
-            "6105": 3.87238,
-            "6110": 3.80525,
-            "6115": 3.87931,
-            "6120": 3.85374,
-            "6125": 3.85469,
-            "6130": 3.85122,
-            "6135": 3.82709,
-            "6140": 3.8225,
-            "6145": 3.81264,
-            "6150": 3.85853,
-            "6155": 3.83605,
-            "6160": 3.80232,
-            "6165": 3.82292,
-            "6170": 3.81513,
-            "6175": 3.80691,
-            "6180": 3.8071,
-            "6185": 3.84448,
-            "6190": 3.81178,
-            "6195": 3.78014,
-            "6200": 3.80543,
-            "6205": 3.81219,
-            "6210": 3.77002,
-            "6215": 3.82559,
-            "6220": 3.822,
-            "6225": 3.82598,
-            "6230": 3.76955,
-            "6235": 3.8072,
-            "6240": 3.73374,
-            "6245": 3.84624,
-            "6250": 3.80845,
-            "6255": 3.8223,
-            "6260": 3.7948,
-            "6265": 3.82819,
-            "6270": 3.75673,
-            "6275": 3.78492,
-            "6280": 3.80313,
-            "6285": 3.78154,
-            "6290": 3.79976,
-            "6295": 3.80168,
-            "6300": 3.80756,
-            "6305": 3.88253,
-            "6310": 3.7702,
-            "6315": 3.7633,
-            "6320": 3.81817,
-            "6325": 3.75526,
-            "6330": 3.82862,
-            "6335": 3.81943,
-            "6340": 3.76721,
-            "6345": 3.82391,
-            "6350": 3.76718,
-            "6355": 3.77414,
-            "6360": 3.75111,
-            "6365": 3.80986,
-            "6370": 3.81014,
-            "6375": 3.78548,
-            "6380": 3.8065,
-            "6385": 3.82336,
-            "6390": 3.78289,
-            "6395": 3.75935,
-            "6400": 3.76038,
-            "6405": 3.83749,
-            "6410": 3.83127,
-            "6415": 3.7623,
-            "6420": 3.82306,
-            "6425": 3.83219,
-            "6430": 3.81048,
-            "6435": 3.77764,
-            "6440": 3.76108,
-            "6445": 3.80173,
-            "6450": 3.73884,
-            "6455": 3.75156,
-            "6460": 3.77352,
-            "6465": 3.80905,
-            "6470": 3.78701,
-            "6475": 3.78176,
-            "6480": 3.81548,
-            "6485": 3.76414,
-            "6490": 3.71291,
-            "6495": 3.81407,
-            "6500": 3.79809,
-            "6505": 3.72741,
-            "6510": 3.7976,
-            "6515": 3.81938,
-            "6520": 3.73166,
-            "6525": 3.80464,
-            "6530": 3.76853,
-            "6535": 3.76159,
-            "6540": 3.82675,
-            "6545": 3.76261,
-            "6550": 3.76963,
-            "6555": 3.75505,
-            "6560": 3.71108,
-            "6565": 3.70887,
-            "6570": 3.7465,
-            "6575": 3.69338,
-            "6580": 3.81517,
-            "6585": 3.76239,
-            "6590": 3.72546,
-            "6595": 3.74461,
-            "6600": 3.73687,
-            "6605": 3.71668,
-            "6610": 3.72679,
-            "6615": 3.76079,
-            "6620": 3.70966,
-            "6625": 3.72313,
-            "6630": 3.72114,
-            "6635": 3.76232,
-            "6640": 3.73374,
-            "6645": 3.75061,
-            "6650": 3.77922,
-            "6655": 3.70627,
-            "6660": 3.73531,
-            "6665": 3.7573,
-            "6670": 3.71979,
-            "6675": 3.74124,
-            "6680": 3.73477,
-            "6685": 3.76436,
-            "6690": 3.74256,
-            "6695": 3.75545,
-            "6700": 3.74559,
-            "6705": 3.72882,
-            "6710": 3.72913,
-            "6715": 3.69291,
-            "6720": 3.77736,
-            "6725": 3.75737,
-            "6730": 3.73993,
-            "6735": 3.74082,
-            "6740": 3.73806,
-            "6745": 3.72041,
-            "6750": 3.74412,
-            "6755": 3.69337,
-            "6760": 3.68122,
-            "6765": 3.74232,
-            "6770": 3.69625,
-            "6775": 3.74604,
-            "6780": 3.70485,
-            "6785": 3.70942,
-            "6790": 3.73683,
-            "6795": 3.69846,
-            "6800": 3.71752,
-            "6805": 3.72172,
-            "6810": 3.73628,
-            "6815": 3.65876,
-            "6820": 3.70229,
-            "6825": 3.72745,
-            "6830": 3.70872,
-            "6835": 3.68623,
-            "6840": 3.67517,
-            "6845": 3.74818,
-            "6850": 3.70405,
-            "6855": 3.73713,
-            "6860": 3.6695,
-            "6865": 3.73585,
-            "6870": 3.6953,
-            "6875": 3.69781,
-            "6880": 3.70324,
-            "6885": 3.67727,
-            "6890": 3.69236,
-            "6895": 3.67848,
-            "6900": 3.68133,
-            "6905": 3.68771,
-            "6910": 3.72919,
-            "6915": 3.73359,
-            "6920": 3.68934,
-            "6925": 3.69022,
-            "6930": 3.68858,
-            "6935": 3.62056,
-            "6940": 3.68927,
-            "6945": 3.67777,
-            "6950": 3.68038,
-            "6955": 3.6771,
-            "6960": 3.68108,
-            "6965": 3.72225,
-            "6970": 3.64603,
-            "6975": 3.72781,
-            "6980": 3.68459,
-            "6985": 3.68985,
-            "6990": 3.7316,
-            "6995": 3.70495,
-            "7000": 3.63993,
-            "7005": 3.71744,
-            "7010": 3.69223,
-            "7015": 3.67561,
-            "7020": 3.72152,
-            "7025": 3.70969,
-            "7030": 3.70236,
-            "7035": 3.65723,
-            "7040": 3.61488,
-            "7045": 3.69518,
-            "7050": 3.71947,
-            "7055": 3.64991,
-            "7060": 3.69149,
-            "7065": 3.74261,
-            "7070": 3.67108,
-            "7075": 3.67419,
-            "7080": 3.71683,
-            "7085": 3.64191,
-            "7090": 3.66318,
-            "7095": 3.63818,
-            "7100": 3.68341,
-            "7105": 3.62024,
-            "7110": 3.68873,
-            "7115": 3.63797,
-            "7120": 3.68741,
-            "7125": 3.63499,
-            "7130": 3.65311,
-            "7135": 3.66196,
-            "7140": 3.66504,
-            "7145": 3.68183,
-            "7150": 3.62677,
-            "7155": 3.69052,
-            "7160": 3.62415,
-            "7165": 3.64241,
-            "7170": 3.68231,
-            "7175": 3.64603,
-            "7180": 3.67571,
-            "7185": 3.70721,
-            "7190": 3.663,
-            "7195": 3.66862,
-            "7200": 3.67265,
-            "7205": 3.65833,
-            "7210": 3.68834,
-            "7215": 3.67282,
-            "7220": 3.69117,
-            "7225": 3.66107,
-            "7230": 3.68593,
-            "7235": 3.64823,
-            "7240": 3.64663,
-            "7245": 3.66574,
-            "7250": 3.60447,
-            "7255": 3.62598,
-            "7260": 3.68023,
-            "7265": 3.60288,
-            "7270": 3.63936,
-            "7275": 3.64805,
-            "7280": 3.62623,
-            "7285": 3.65053,
-            "7290": 3.6735,
-            "7295": 3.66357,
-            "7300": 3.62393,
-            "7305": 3.62784,
-            "7310": 3.66312,
-            "7315": 3.67632,
-            "7320": 3.65015,
-            "7325": 3.65453,
-            "7330": 3.62344,
-            "7335": 3.62574,
-            "7340": 3.64422,
-            "7345": 3.60533,
-            "7350": 3.65727,
-            "7355": 3.64352,
-            "7360": 3.61779,
-            "7365": 3.63578,
-            "7370": 3.6188,
-            "7375": 3.59366,
-            "7380": 3.64743,
-            "7385": 3.67218,
-            "7390": 3.65876,
-            "7395": 3.60688,
-            "7400": 3.65695,
-            "7405": 3.64945,
-            "7410": 3.66151,
-            "7415": 3.64439,
-            "7420": 3.63591,
-            "7425": 3.6844,
-            "7430": 3.63181,
-            "7435": 3.61154,
-            "7440": 3.62564,
-            "7445": 3.60843,
-            "7450": 3.57301,
-            "7455": 3.64772,
-            "7460": 3.63452,
-            "7465": 3.63169,
-            "7470": 3.63744,
-            "7475": 3.64264,
-            "7480": 3.61171,
-            "7485": 3.57567,
-            "7490": 3.57599,
-            "7495": 3.5863,
-            "7500": 3.61565,
-            "7505": 3.59614,
-            "7510": 3.55707,
-            "7515": 3.61683,
-            "7520": 3.60991,
-            "7525": 3.56658,
-            "7530": 3.61196,
-            "7535": 3.62507,
-            "7540": 3.61046,
-            "7545": 3.64639,
-            "7550": 3.65882,
-            "7555": 3.58595,
-            "7560": 3.60212,
-            "7565": 3.59782,
-            "7570": 3.60603,
-            "7575": 3.57351,
-            "7580": 3.62111,
-            "7585": 3.60137,
-            "7590": 3.6026,
-            "7595": 3.66318,
-            "7600": 3.6076,
-            "7605": 3.59626,
-            "7610": 3.58483,
-            "7615": 3.58478,
-            "7620": 3.56787,
-            "7625": 3.62193,
-            "7630": 3.60469,
-            "7635": 3.5928,
-            "7640": 3.59019,
-            "7645": 3.62279,
-            "7650": 3.6259,
-            "7655": 3.66371,
-            "7660": 3.5305,
-            "7665": 3.60545,
-            "7670": 3.59796,
-            "7675": 3.58201,
-            "7680": 3.57701,
-            "7685": 3.64556,
-            "7690": 3.59102,
-            "7695": 3.57063,
-            "7700": 3.63352,
-            "7705": 3.58816,
-            "7710": 3.62048,
-            "7715": 3.5764,
-            "7720": 3.65561,
-            "7725": 3.55706,
-            "7730": 3.57614,
-            "7735": 3.61006,
-            "7740": 3.58168,
-            "7745": 3.58454,
-            "7750": 3.57422,
-            "7755": 3.59202,
-            "7760": 3.56089,
-            "7765": 3.58551,
-            "7770": 3.60104,
-            "7775": 3.57103,
-            "7780": 3.55457,
-            "7785": 3.57713,
-            "7790": 3.57042,
-            "7795": 3.58792,
-            "7800": 3.57997,
-            "7805": 3.58361,
-            "7810": 3.60683,
-            "7815": 3.57773,
-            "7820": 3.57578,
-            "7825": 3.61835,
-            "7830": 3.59192,
-            "7835": 3.52632,
-            "7840": 3.6194,
-            "7845": 3.55538,
-            "7850": 3.51354,
-            "7855": 3.56599,
-            "7860": 3.54645,
-            "7865": 3.60369,
-            "7870": 3.54114,
-            "7875": 3.55695,
-            "7880": 3.572,
-            "7885": 3.56229,
-            "7890": 3.60585,
-            "7895": 3.59334,
-            "7900": 3.60641,
-            "7905": 3.56339,
-            "7910": 3.58203,
-            "7915": 3.58298,
-            "7920": 3.59012,
-            "7925": 3.5681,
-            "7930": 3.59927,
-            "7935": 3.56169,
-            "7940": 3.60948,
-            "7945": 3.62723,
-            "7950": 3.53708,
-            "7955": 3.54481,
-            "7960": 3.53124,
-            "7965": 3.51862,
-            "7970": 3.52486,
-            "7975": 3.55975,
-            "7980": 3.56722,
-            "7985": 3.54114,
-            "7990": 3.54399,
-            "7995": 3.5186,
-            "8000": 3.57756,
-            "8005": 3.54643,
-            "8010": 3.53705,
-            "8015": 3.53445,
-            "8020": 3.53111,
-            "8025": 3.51514,
-            "8030": 3.54148,
-            "8035": 3.53478,
-            "8040": 3.52163,
-            "8045": 3.57586,
-            "8050": 3.57789,
-            "8055": 3.54866,
-            "8060": 3.5712,
-            "8065": 3.54757,
-            "8070": 3.53654,
-            "8075": 3.52629,
-            "8080": 3.57467,
-            "8085": 3.52928,
-            "8090": 3.53424,
-            "8095": 3.56313,
-            "8100": 3.51543,
-            "8105": 3.54752,
-            "8110": 3.5453,
-            "8115": 3.51645,
-            "8120": 3.52703,
-            "8125": 3.56437,
-            "8130": 3.52567,
-            "8135": 3.53994,
-            "8140": 3.52104,
-            "8145": 3.50389,
-            "8150": 3.52394,
-            "8155": 3.51178,
-            "8160": 3.56129,
-            "8165": 3.54328,
-            "8170": 3.5116,
-            "8175": 3.5057,
-            "8180": 3.57245,
-            "8185": 3.54733,
-            "8190": 3.58207,
-            "8195": 3.55001,
-            "8200": 3.52156,
-            "8205": 3.52888,
-            "8210": 3.53558,
-            "8215": 3.55713,
-            "8220": 3.5201,
-            "8225": 3.51201,
-            "8230": 3.53756,
-            "8235": 3.55814,
-            "8240": 3.54052,
-            "8245": 3.53652,
-            "8250": 3.5692,
-            "8255": 3.51844,
-            "8260": 3.52912,
-            "8265": 3.52072,
-            "8270": 3.52843,
-            "8275": 3.51526,
-            "8280": 3.50321,
-            "8285": 3.52669,
-            "8290": 3.5272,
-            "8295": 3.49645,
-            "8300": 3.51721,
-            "8305": 3.53958,
-            "8310": 3.5351,
-            "8315": 3.50396,
-            "8320": 3.53046,
-            "8325": 3.47885,
-            "8330": 3.44388,
-            "8335": 3.51457,
-            "8340": 3.54076,
-            "8345": 3.49873,
-            "8350": 3.51134,
-            "8355": 3.54342,
-            "8360": 3.51607,
-            "8365": 3.53716,
-            "8370": 3.53127,
-            "8375": 3.48696,
-            "8380": 3.4848,
-            "8385": 3.52879,
-            "8390": 3.49474,
-            "8395": 3.52721,
-            "8400": 3.49636,
-            "8405": 3.51685,
-            "8410": 3.57651,
-            "8415": 3.48228,
-            "8420": 3.45216,
-            "8425": 3.53401,
-            "8430": 3.53787,
-            "8435": 3.47534,
-            "8440": 3.55163,
-            "8445": 3.53658,
-            "8450": 3.50995,
-            "8455": 3.52875,
-            "8460": 3.53463,
-            "8465": 3.4708,
-            "8470": 3.4929,
-            "8475": 3.55004,
-            "8480": 3.47555,
-            "8485": 3.49487,
-            "8490": 3.48489,
-            "8495": 3.48023,
-            "8500": 3.52888,
-            "8505": 3.46749,
-            "8510": 3.54064,
-            "8515": 3.48982,
-            "8520": 3.49184,
-            "8525": 3.42254,
-            "8530": 3.50181,
-            "8535": 3.52351,
-            "8540": 3.47484,
-            "8545": 3.49944,
-            "8550": 3.46881,
-            "8555": 3.53517,
-            "8560": 3.5346,
-            "8565": 3.48792,
-            "8570": 3.48883,
-            "8575": 3.46414,
-            "8580": 3.50837,
-            "8585": 3.52994,
-            "8590": 3.51956,
-            "8595": 3.52409,
-            "8600": 3.50319,
-            "8605": 3.49079,
-            "8610": 3.49584,
-            "8615": 3.49483,
-            "8620": 3.46525,
-            "8625": 3.4875,
-            "8630": 3.49269,
-            "8635": 3.47742,
-            "8640": 3.46288,
-            "8645": 3.52844,
-            "8650": 3.45936,
-            "8655": 3.50294,
-            "8660": 3.51093,
-            "8665": 3.48996,
-            "8670": 3.50547,
-            "8675": 3.47414,
-            "8680": 3.4685,
-            "8685": 3.48029,
-            "8690": 3.51264,
-            "8695": 3.51367,
-            "8700": 3.48324,
-            "8705": 3.45351,
-            "8710": 3.50031,
-            "8715": 3.45042,
-            "8720": 3.52876,
-            "8725": 3.48819,
-            "8730": 3.47981,
-            "8735": 3.51018,
-            "8740": 3.46013,
-            "8745": 3.50108,
-            "8750": 3.50543,
-            "8755": 3.46564,
-            "8760": 3.48373,
-            "8765": 3.43955,
-            "8770": 3.50951,
-            "8775": 3.47313,
-            "8780": 3.45782,
-            "8785": 3.47628,
-            "8790": 3.4608,
-            "8795": 3.49675,
-            "8800": 3.46402,
-            "8805": 3.43267,
-            "8810": 3.45044,
-            "8815": 3.47281,
-            "8820": 3.43586,
-            "8825": 3.46906,
-            "8830": 3.44494,
-            "8835": 3.42402,
-            "8840": 3.4361,
-            "8845": 3.45772,
-            "8850": 3.48143,
-            "8855": 3.46505,
-            "8860": 3.53187,
-            "8865": 3.46882,
-            "8870": 3.44869,
-            "8875": 3.45286,
-            "8880": 3.45584,
-            "8885": 3.44986,
-            "8890": 3.47298,
-            "8895": 3.45131,
-            "8900": 3.47879,
-            "8905": 3.46796,
-            "8910": 3.45421,
-            "8915": 3.44293,
-            "8920": 3.43345,
-            "8925": 3.50917,
-            "8930": 3.49052,
-            "8935": 3.50073,
-            "8940": 3.47584,
-            "8945": 3.47848,
-            "8950": 3.45717,
-            "8955": 3.44615,
-            "8960": 3.43965,
-            "8965": 3.45818,
-            "8970": 3.47179,
-            "8975": 3.42177,
-            "8980": 3.42266,
-            "8985": 3.44671,
-            "8990": 3.50075,
-            "8995": 3.47255,
-            "9000": 3.41954,
-            "9005": 3.46563,
-            "9010": 3.51573,
-            "9015": 3.4185,
-            "9020": 3.43896,
-            "9025": 3.44768,
-            "9030": 3.4718,
-            "9035": 3.37943,
-            "9040": 3.45501,
-            "9045": 3.45466,
-            "9050": 3.49179,
-            "9055": 3.40312,
-            "9060": 3.49477,
-            "9065": 3.51349,
-            "9070": 3.44713,
-            "9075": 3.47746,
-            "9080": 3.47127,
-            "9085": 3.47459,
-            "9090": 3.46668,
-            "9095": 3.42167,
-            "9100": 3.4227,
-            "9105": 3.41261,
-            "9110": 3.45663,
-            "9115": 3.46481,
-            "9120": 3.51949,
-            "9125": 3.44245,
-            "9130": 3.43654,
-            "9135": 3.46008,
-            "9140": 3.47929,
-            "9145": 3.42408,
-            "9150": 3.44307,
-            "9155": 3.45089,
-            "9160": 3.44998,
-            "9165": 3.45651,
-            "9170": 3.47508,
-            "9175": 3.41133,
-            "9180": 3.45323,
-            "9185": 3.41086,
-            "9190": 3.46875,
-            "9195": 3.43315,
-            "9200": 3.44758,
-            "9205": 3.42373,
-            "9210": 3.45572,
-            "9215": 3.39585,
-            "9220": 3.42327,
-            "9225": 3.44665,
-            "9230": 3.37357,
-            "9235": 3.39456,
-            "9240": 3.42282,
-            "9245": 3.40683,
-            "9250": 3.40791,
-            "9255": 3.42077,
-            "9260": 3.39755,
-            "9265": 3.44216,
-            "9270": 3.40754,
-            "9275": 3.42864,
-            "9280": 3.44334,
-            "9285": 3.44087,
-            "9290": 3.45563,
-            "9295": 3.44456,
-            "9300": 3.39522,
-            "9305": 3.42638,
-            "9310": 3.41593,
-            "9315": 3.38278,
-            "9320": 3.3797,
-            "9325": 3.42046,
-            "9330": 3.47853,
-            "9335": 3.38962,
-            "9340": 3.4706,
-            "9345": 3.46224,
-            "9350": 3.42735,
-            "9355": 3.39326,
-            "9360": 3.4165,
-            "9365": 3.41212,
-            "9370": 3.46155,
-            "9375": 3.42622,
-            "9380": 3.36413,
-            "9385": 3.43469,
-            "9390": 3.44403,
-            "9395": 3.45465,
-            "9400": 3.41582,
-            "9405": 3.40031,
-            "9410": 3.43744,
-            "9415": 3.42574,
-            "9420": 3.40295,
-            "9425": 3.42063,
-            "9430": 3.3935,
-            "9435": 3.41529,
-            "9440": 3.40125,
-            "9445": 3.39961,
-            "9450": 3.39469,
-            "9455": 3.4008,
-            "9460": 3.46489,
-            "9465": 3.46303,
-            "9470": 3.40478,
-            "9475": 3.45335,
-            "9480": 3.40789,
-            "9485": 3.3998,
-            "9490": 3.41154,
-            "9495": 3.44387,
-            "9500": 3.40535,
-            "9505": 3.37735,
-            "9510": 3.41645,
-            "9515": 3.41113,
-            "9520": 3.43045,
-            "9525": 3.40102,
-            "9530": 3.40027,
-            "9535": 3.42216
-        }
-    },
-    "iteration-time": {
-        "start_step": 1,
-        "end_step": 9535,
-        "step_interval": 5,
-        "values": {
-            "1": 241.22832,
-            "5": 11.6467,
-            "10": 11.59177,
-            "15": 11.54982,
-            "20": 11.50554,
-            "25": 11.48401,
-            "30": 11.47019,
-            "35": 11.4638,
-            "40": 11.44621,
-            "45": 11.45505,
-            "50": 11.48551,
-            "55": 11.47505,
-            "60": 11.46559,
-            "65": 11.69276,
-            "70": 11.51491,
-            "75": 11.58841,
-            "80": 11.59402,
-            "85": 11.55505,
-            "90": 11.57827,
-            "95": 11.6084,
-            "100": 11.72328,
-            "105": 11.84735,
-            "110": 11.81445,
-            "115": 12.01469,
-            "120": 12.27052,
-            "125": 12.40894,
-            "130": 12.32306,
-            "135": 12.6537,
-            "140": 12.87941,
-            "145": 12.87274,
-            "150": 13.17646,
-            "155": 13.42132,
-            "160": 13.29203,
-            "165": 13.33468,
-            "170": 13.38365,
-            "175": 13.29143,
-            "180": 13.37704,
-            "185": 13.17491,
-            "190": 13.2207,
-            "195": 13.0407,
-            "200": 13.03378,
-            "205": 12.93499,
-            "210": 12.93302,
-            "215": 12.83429,
-            "220": 12.77504,
-            "225": 12.71437,
-            "230": 12.67462,
-            "235": 12.7241,
-            "240": 12.78341,
-            "245": 12.61372,
-            "250": 12.60968,
-            "255": 12.49502,
-            "260": 12.38655,
-            "265": 12.35372,
-            "270": 12.32939,
-            "275": 12.25213,
-            "280": 12.23412,
-            "285": 12.25047,
-            "290": 12.1386,
-            "295": 12.11066,
-            "300": 12.11487,
-            "305": 12.08746,
-            "310": 12.06842,
-            "315": 12.13334,
-            "320": 12.12044,
-            "325": 12.01351,
-            "330": 11.97276,
-            "335": 11.951,
-            "340": 11.97582,
-            "345": 11.94178,
-            "350": 11.90942,
-            "355": 11.9474,
-            "360": 11.94231,
-            "365": 11.91539,
-            "370": 11.89051,
-            "375": 11.87871,
-            "380": 11.8539,
-            "385": 11.81422,
-            "390": 11.82072,
-            "395": 11.85516,
-            "400": 11.8322,
-            "405": 11.81286,
-            "410": 11.81008,
-            "415": 11.76854,
-            "420": 11.7721,
-            "425": 11.7287,
-            "430": 11.80281,
-            "435": 11.76948,
-            "440": 11.78237,
-            "445": 11.81223,
-            "450": 11.76024,
-            "455": 11.83905,
-            "460": 11.86797,
-            "465": 11.88193,
-            "470": 11.94544,
-            "475": 12.03403,
-            "480": 11.8718,
-            "485": 11.96463,
-            "490": 11.9543,
-            "495": 11.99738,
-            "500": 12.06608,
-            "505": 12.04813,
-            "510": 12.09706,
-            "515": 12.14335,
-            "520": 12.36581,
-            "525": 12.19115,
-            "530": 12.1887,
-            "535": 12.25354,
-            "540": 12.27902,
-            "545": 12.32347,
-            "550": 12.44366,
-            "555": 12.25807,
-            "560": 12.22369,
-            "565": 12.28956,
-            "570": 12.31572,
-            "575": 12.28835,
-            "580": 12.33571,
-            "585": 12.26567,
-            "590": 12.30079,
-            "595": 12.29151,
-            "600": 12.30023,
-            "605": 12.45501,
-            "610": 12.27373,
-            "615": 12.217,
-            "620": 12.22334,
-            "625": 12.21274,
-            "630": 12.21904,
-            "635": 12.20277,
-            "640": 12.25538,
-            "645": 12.19988,
-            "650": 12.14026,
-            "655": 12.14302,
-            "660": 12.14678,
-            "665": 12.13972,
-            "670": 12.11485,
-            "675": 12.0282,
-            "680": 12.01901,
-            "685": 11.98462,
-            "690": 11.98742,
-            "695": 11.95917,
-            "700": 11.92521,
-            "705": 18.38779,
-            "710": 11.92438,
-            "715": 11.8274,
-            "720": 11.90138,
-            "725": 11.84998,
-            "730": 11.83009,
-            "735": 11.89248,
-            "740": 11.82364,
-            "745": 11.91839,
-            "750": 11.9577,
-            "755": 11.85056,
-            "760": 11.90523,
-            "765": 11.9116,
-            "770": 11.83717,
-            "775": 12.05864,
-            "780": 11.84895,
-            "785": 11.84375,
-            "790": 11.86493,
-            "795": 11.85763,
-            "800": 11.94365,
-            "805": 11.86899,
-            "810": 11.86748,
-            "815": 11.86393,
-            "820": 11.87992,
-            "825": 11.85259,
-            "830": 11.86886,
-            "835": 11.8517,
-            "840": 11.86254,
-            "845": 11.89508,
-            "850": 11.85613,
-            "855": 11.87434,
-            "860": 11.90703,
-            "865": 11.83224,
-            "870": 11.88246,
-            "875": 11.9305,
-            "880": 11.96022,
-            "885": 11.81651,
-            "890": 12.06642,
-            "895": 11.92653,
-            "900": 11.86469,
-            "905": 12.01767,
-            "910": 11.89635,
-            "915": 11.8254,
-            "920": 11.86106,
-            "925": 11.88434,
-            "930": 11.97059,
-            "935": 12.03718,
-            "940": 11.87698,
-            "945": 11.88008,
-            "950": 12.02071,
-            "955": 11.84843,
-            "960": 244.37245,
-            "965": 12.32084,
-            "970": 11.86341,
-            "975": 12.01988,
-            "980": 11.92166,
-            "985": 11.85411,
-            "990": 11.87753,
-            "995": 11.84786,
-            "1000": 11.89892,
-            "1005": 11.99759,
-            "1010": 11.91045,
-            "1015": 11.87038,
-            "1020": 11.85674,
-            "1025": 11.85567,
-            "1030": 11.86674,
-            "1035": 11.92499,
-            "1040": 11.85969,
-            "1045": 12.04929,
-            "1050": 11.82341,
-            "1055": 11.83111,
-            "1060": 11.87567,
-            "1065": 11.84584,
-            "1070": 11.93603,
-            "1075": 11.87121,
-            "1080": 11.85935,
-            "1085": 11.88667,
-            "1090": 11.86058,
-            "1095": 11.86482,
-            "1100": 11.82375,
-            "1105": 11.86482,
-            "1110": 11.89668,
-            "1115": 11.94941,
-            "1120": 11.84941,
-            "1125": 11.94466,
-            "1130": 11.90846,
-            "1135": 11.8602,
-            "1140": 11.86926,
-            "1145": 11.90365,
-            "1150": 11.88788,
-            "1155": 11.81781,
-            "1160": 11.88464,
-            "1165": 11.85344,
-            "1170": 11.8865,
-            "1175": 11.93361,
-            "1180": 11.89647,
-            "1185": 11.9031,
-            "1190": 11.89287,
-            "1195": 11.88683,
-            "1200": 11.85927,
-            "1205": 11.92471,
-            "1210": 11.85592,
-            "1215": 17.4276,
-            "1220": 11.87359,
-            "1225": 11.9296,
-            "1230": 11.95025,
-            "1235": 11.90738,
-            "1240": 11.86621,
-            "1245": 11.98001,
-            "1250": 12.003,
-            "1255": 11.91396,
-            "1260": 11.92279,
-            "1265": 11.85195,
-            "1270": 11.87463,
-            "1275": 11.90307,
-            "1280": 11.84637,
-            "1285": 11.95883,
-            "1290": 11.88039,
-            "1295": 11.8399,
-            "1300": 11.81976,
-            "1305": 11.89766,
-            "1310": 11.91584,
-            "1315": 12.12571,
-            "1320": 12.05556,
-            "1325": 11.84679,
-            "1330": 11.94985,
-            "1335": 11.94039,
-            "1340": 12.00572,
-            "1345": 11.98268,
-            "1350": 12.15927,
-            "1355": 12.04312,
-            "1360": 11.98816,
-            "1365": 11.95737,
-            "1370": 11.92395,
-            "1375": 11.89595,
-            "1380": 11.88635,
-            "1385": 11.96617,
-            "1390": 11.87421,
-            "1395": 12.02833,
-            "1400": 11.87415,
-            "1405": 11.85875,
-            "1410": 11.85419,
-            "1415": 11.8978,
-            "1420": 11.86309,
-            "1425": 11.87505,
-            "1430": 12.10339,
-            "1435": 11.88151,
-            "1440": 12.15068,
-            "1445": 11.98493,
-            "1450": 11.95438,
-            "1455": 12.03808,
-            "1460": 11.85293,
-            "1465": 11.93176,
-            "1470": 11.92246,
-            "1475": 11.90448,
-            "1480": 11.98959,
-            "1485": 11.93685,
-            "1490": 11.92389,
-            "1495": 11.95047,
-            "1500": 11.94526,
-            "1505": 11.9086,
-            "1510": 11.95225,
-            "1515": 11.87405,
-            "1520": 11.87975,
-            "1525": 11.88264,
-            "1530": 12.04989,
-            "1535": 12.02942,
-            "1540": 11.93089,
-            "1545": 11.89376,
-            "1550": 11.88596,
-            "1555": 11.95001,
-            "1560": 11.90239,
-            "1565": 11.89699,
-            "1570": 11.91441,
-            "1575": 11.87813,
-            "1580": 11.86939,
-            "1585": 11.8566,
-            "1590": 11.8665,
-            "1595": 11.90861,
-            "1600": 11.90425,
-            "1605": 11.82248,
-            "1610": 11.86531,
-            "1615": 11.8796,
-            "1620": 11.87587,
-            "1625": 11.88944,
-            "1630": 11.88839,
-            "1635": 11.8307,
-            "1640": 11.87082,
-            "1645": 11.84687,
-            "1650": 11.87887,
-            "1655": 11.85709,
-            "1660": 11.85167,
-            "1665": 11.90284,
-            "1670": 11.85205,
-            "1675": 12.00742,
-            "1680": 11.90754,
-            "1685": 11.97458,
-            "1690": 11.97016,
-            "1695": 11.9189,
-            "1700": 11.89709,
-            "1705": 11.88042,
-            "1710": 11.87879,
-            "1715": 12.06779,
-            "1720": 11.98631,
-            "1725": 12.01044,
-            "1730": 11.9924,
-            "1735": 11.87648,
-            "1740": 11.87455,
-            "1745": 11.93461,
-            "1750": 11.90235,
-            "1755": 11.97053,
-            "1760": 11.89545,
-            "1765": 11.8564,
-            "1770": 11.92635,
-            "1775": 11.91815,
-            "1780": 11.91235,
-            "1785": 11.85546,
-            "1790": 11.93087,
-            "1795": 11.91138,
-            "1800": 11.95901,
-            "1805": 12.0529,
-            "1810": 11.98858,
-            "1815": 12.13997,
-            "1820": 11.94798,
-            "1825": 11.97682,
-            "1830": 11.91244,
-            "1835": 11.94888,
-            "1840": 11.93666,
-            "1845": 11.87312,
-            "1850": 11.86327,
-            "1855": 11.94769,
-            "1860": 12.00187,
-            "1865": 12.06916,
-            "1870": 11.99528,
-            "1875": 11.89416,
-            "1880": 12.02292,
-            "1885": 12.04249,
-            "1890": 11.94094,
-            "1895": 11.93619,
-            "1900": 11.95301,
-            "1905": 11.85793,
-            "1910": 11.96264,
-            "1915": 11.92826,
-            "1920": 11.94216,
-            "1925": 12.01307,
-            "1930": 11.98891,
-            "1935": 11.95834,
-            "1940": 11.92143,
-            "1945": 11.98459,
-            "1950": 16.97099,
-            "1955": 11.89147,
-            "1960": 11.94643,
-            "1965": 11.92486,
-            "1970": 11.91542,
-            "1975": 13.09741,
-            "1980": 12.02148,
-            "1985": 11.92812,
-            "1990": 12.01102,
-            "1995": 11.94891,
-            "2000": 12.06741,
-            "2005": 11.94166,
-            "2010": 11.95871,
-            "2015": 12.00042,
-            "2020": 11.99101,
-            "2025": 11.95463,
-            "2030": 12.36755,
-            "2035": 11.96199,
-            "2040": 11.97863,
-            "2045": 12.01033,
-            "2050": 12.0643,
-            "2055": 11.96928,
-            "2060": 11.98383,
-            "2065": 11.92648,
-            "2070": 11.92379,
-            "2075": 11.97669,
-            "2080": 11.95508,
-            "2085": 11.94472,
-            "2090": 11.9663,
-            "2095": 11.93695,
-            "2100": 11.97178,
-            "2105": 11.98764,
-            "2110": 11.9516,
-            "2115": 11.9215,
-            "2120": 11.95207,
-            "2125": 11.95947,
-            "2130": 11.96722,
-            "2135": 11.97924,
-            "2140": 11.88777,
-            "2145": 11.95546,
-            "2150": 11.90266,
-            "2155": 11.97573,
-            "2160": 11.93275,
-            "2165": 11.98593,
-            "2170": 11.9842,
-            "2175": 12.00145,
-            "2180": 11.99219,
-            "2185": 11.96424,
-            "2190": 11.94313,
-            "2195": 11.93489,
-            "2200": 11.94356,
-            "2205": 12.00157,
-            "2210": 11.97153,
-            "2215": 11.9563,
-            "2220": 12.14117,
-            "2225": 11.97066,
-            "2230": 12.00037,
-            "2235": 11.95279,
-            "2240": 11.9544,
-            "2245": 11.97031,
-            "2250": 11.92229,
-            "2255": 11.98097,
-            "2260": 11.96529,
-            "2265": 11.98619,
-            "2270": 12.02117,
-            "2275": 11.94865,
-            "2280": 12.02569,
-            "2285": 11.98203,
-            "2290": 12.10479,
-            "2295": 11.95346,
-            "2300": 11.99961,
-            "2305": 11.96025,
-            "2310": 11.98746,
-            "2315": 11.95209,
-            "2320": 12.02644,
-            "2325": 11.95369,
-            "2330": 11.91985,
-            "2335": 11.93244,
-            "2340": 11.97061,
-            "2345": 11.90115,
-            "2350": 11.99136,
-            "2355": 12.0541,
-            "2360": 12.03728,
-            "2365": 11.95319,
-            "2370": 11.8917,
-            "2375": 11.94629,
-            "2380": 11.9087,
-            "2385": 11.91696,
-            "2390": 11.90123,
-            "2395": 11.87998,
-            "2400": 12.02954,
-            "2405": 11.97917,
-            "2410": 11.98456,
-            "2415": 11.9575,
-            "2420": 11.95917,
-            "2425": 11.95788,
-            "2430": 11.99944,
-            "2435": 12.00043,
-            "2440": 11.91339,
-            "2445": 11.97889,
-            "2450": 11.93997,
-            "2455": 11.91834,
-            "2460": 11.98321,
-            "2465": 11.94509,
-            "2470": 11.93387,
-            "2475": 11.9562,
-            "2480": 11.93148,
-            "2485": 11.94432,
-            "2490": 11.95477,
-            "2495": 11.94334,
-            "2500": 11.9284,
-            "2505": 11.93757,
-            "2510": 11.92289,
-            "2515": 11.97869,
-            "2520": 11.94858,
-            "2525": 11.96606,
-            "2530": 11.90894,
-            "2535": 11.95425,
-            "2540": 11.89136,
-            "2545": 11.94553,
-            "2550": 11.98026,
-            "2555": 11.93376,
-            "2560": 11.94866,
-            "2565": 11.92767,
-            "2570": 11.93583,
-            "2575": 11.97284,
-            "2580": 11.98911,
-            "2585": 11.95484,
-            "2590": 11.96399,
-            "2595": 11.96211,
-            "2600": 11.93906,
-            "2605": 11.9733,
-            "2610": 12.01872,
-            "2615": 11.99897,
-            "2620": 11.90926,
-            "2625": 11.93248,
-            "2630": 11.92842,
-            "2635": 11.94338,
-            "2640": 11.94678,
-            "2645": 11.95901,
-            "2650": 11.9296,
-            "2655": 12.02405,
-            "2660": 12.0166,
-            "2665": 12.01166,
-            "2670": 11.90595,
-            "2675": 11.98569,
-            "2680": 12.0118,
-            "2685": 11.92029,
-            "2690": 11.93111,
-            "2695": 12.00369,
-            "2700": 11.94818,
-            "2705": 11.99119,
-            "2710": 11.93978,
-            "2715": 11.9296,
-            "2720": 11.93044,
-            "2725": 11.94343,
-            "2730": 12.02248,
-            "2735": 11.95389,
-            "2740": 11.94611,
-            "2745": 11.92776,
-            "2750": 11.91647,
-            "2755": 11.9522,
-            "2760": 11.95012,
-            "2765": 11.96707,
-            "2770": 11.94892,
-            "2775": 11.9867,
-            "2780": 11.96897,
-            "2785": 11.97268,
-            "2790": 12.01936,
-            "2795": 11.97259,
-            "2800": 12.01028,
-            "2805": 11.94892,
-            "2810": 12.04828,
-            "2815": 11.93469,
-            "2820": 11.94568,
-            "2825": 11.92529,
-            "2830": 11.97458,
-            "2835": 11.99475,
-            "2840": 11.94984,
-            "2845": 11.93356,
-            "2850": 12.05796,
-            "2855": 11.99065,
-            "2860": 11.96077,
-            "2865": 11.9377,
-            "2870": 11.97627,
-            "2875": 11.97986,
-            "2880": 11.97201,
-            "2885": 11.91879,
-            "2890": 11.93586,
-            "2895": 12.00661,
-            "2900": 11.94616,
-            "2905": 11.94376,
-            "2910": 11.94168,
-            "2915": 11.94867,
-            "2920": 11.99355,
-            "2925": 11.94779,
-            "2930": 11.97133,
-            "2935": 11.96256,
-            "2940": 11.97787,
-            "2945": 11.93759,
-            "2950": 11.91863,
-            "2955": 11.98973,
-            "2960": 12.00486,
-            "2965": 11.91623,
-            "2970": 11.94846,
-            "2975": 11.91534,
-            "2980": 11.97787,
-            "2985": 12.385,
-            "2990": 11.88498,
-            "2995": 11.92173,
-            "3000": 11.90561,
-            "3005": 11.86795,
-            "3010": 11.88075,
-            "3015": 11.87833,
-            "3020": 11.98777,
-            "3025": 11.90078,
-            "3030": 11.98251,
-            "3035": 11.92211,
-            "3040": 11.91067,
-            "3045": 12.04371,
-            "3050": 11.91886,
-            "3055": 11.952,
-            "3060": 11.90649,
-            "3065": 11.86917,
-            "3070": 11.86601,
-            "3075": 11.92435,
-            "3080": 11.98092,
-            "3085": 11.94809,
-            "3090": 12.20304,
-            "3095": 11.87329,
-            "3100": 11.92696,
-            "3105": 11.85799,
-            "3110": 11.84125,
-            "3115": 11.82558,
-            "3120": 11.87566,
-            "3125": 11.89426,
-            "3130": 11.85869,
-            "3135": 11.92893,
-            "3140": 11.97022,
-            "3145": 11.84939,
-            "3150": 11.9785,
-            "3155": 11.92499,
-            "3160": 11.8889,
-            "3165": 11.87938,
-            "3170": 11.95555,
-            "3175": 11.91883,
-            "3180": 11.85842,
-            "3185": 11.9325,
-            "3190": 11.86061,
-            "3195": 11.90479,
-            "3200": 11.85963,
-            "3205": 11.91214,
-            "3210": 11.9243,
-            "3215": 11.8472,
-            "3220": 11.86665,
-            "3225": 11.89836,
-            "3230": 11.86299,
-            "3235": 11.89396,
-            "3240": 11.87482,
-            "3245": 11.86774,
-            "3250": 11.86673,
-            "3255": 11.88133,
-            "3260": 11.9014,
-            "3265": 11.92289,
-            "3270": 11.98401,
-            "3275": 11.95198,
-            "3280": 11.87392,
-            "3285": 11.89268,
-            "3290": 11.88963,
-            "3295": 11.91043,
-            "3300": 11.89803,
-            "3305": 11.87011,
-            "3310": 11.84465,
-            "3315": 11.84015,
-            "3320": 11.88334,
-            "3325": 11.93368,
-            "3330": 11.83472,
-            "3335": 11.86862,
-            "3340": 11.87575,
-            "3345": 11.94875,
-            "3350": 11.93528,
-            "3355": 11.81967,
-            "3360": 11.95954,
-            "3365": 11.88024,
-            "3370": 11.88333,
-            "3375": 11.85751,
-            "3380": 11.88742,
-            "3385": 11.9179,
-            "3390": 11.83242,
-            "3395": 11.96084,
-            "3400": 11.88213,
-            "3405": 11.86112,
-            "3410": 11.8407,
-            "3415": 11.92255,
-            "3420": 11.91997,
-            "3425": 11.88372,
-            "3430": 11.8672,
-            "3435": 11.85235,
-            "3440": 11.84935,
-            "3445": 11.93228,
-            "3450": 11.85166,
-            "3455": 11.9026,
-            "3460": 11.99596,
-            "3465": 11.88838,
-            "3470": 11.90065,
-            "3475": 11.92033,
-            "3480": 11.87265,
-            "3485": 11.89235,
-            "3490": 11.89267,
-            "3495": 11.97544,
-            "3500": 11.92819,
-            "3505": 11.82459,
-            "3510": 11.90756,
-            "3515": 11.92021,
-            "3520": 11.88124,
-            "3525": 11.86983,
-            "3530": 11.90548,
-            "3535": 11.94666,
-            "3540": 11.93322,
-            "3545": 11.90904,
-            "3550": 11.85224,
-            "3555": 11.886,
-            "3560": 11.93583,
-            "3565": 11.87294,
-            "3570": 11.86107,
-            "3575": 11.83618,
-            "3580": 11.94649,
-            "3585": 11.8886,
-            "3590": 12.01796,
-            "3595": 11.86065,
-            "3600": 11.96008,
-            "3605": 11.94154,
-            "3610": 11.91928,
-            "3615": 11.88551,
-            "3620": 11.8865,
-            "3625": 11.86807,
-            "3630": 11.98152,
-            "3635": 11.87685,
-            "3640": 11.89995,
-            "3645": 11.86485,
-            "3650": 11.94291,
-            "3655": 11.86472,
-            "3660": 11.84946,
-            "3665": 11.90789,
-            "3670": 11.86396,
-            "3675": 12.07226,
-            "3680": 11.8654,
-            "3685": 11.90154,
-            "3690": 11.87282,
-            "3695": 11.84993,
-            "3700": 11.92847,
-            "3705": 11.85848,
-            "3710": 11.86691,
-            "3715": 11.93176,
-            "3720": 11.86996,
-            "3725": 11.92665,
-            "3730": 11.90876,
-            "3735": 11.83597,
-            "3740": 11.8819,
-            "3745": 11.90119,
-            "3750": 11.90765,
-            "3755": 11.89791,
-            "3760": 11.91124,
-            "3765": 11.95606,
-            "3770": 11.93789,
-            "3775": 11.87152,
-            "3780": 11.89754,
-            "3785": 11.8704,
-            "3790": 11.88079,
-            "3795": 11.89363,
-            "3800": 11.88641,
-            "3805": 11.87724,
-            "3810": 11.86303,
-            "3815": 11.96793,
-            "3820": 11.97071,
-            "3825": 11.90678,
-            "3830": 11.84478,
-            "3835": 11.86339,
-            "3840": 11.84359,
-            "3845": 11.85381,
-            "3850": 11.89843,
-            "3855": 11.83659,
-            "3860": 11.8253,
-            "3865": 11.82796,
-            "3870": 11.93815,
-            "3875": 11.87584,
-            "3880": 11.85716,
-            "3885": 11.85848,
-            "3890": 11.84472,
-            "3895": 11.85001,
-            "3900": 11.90416,
-            "3905": 11.87723,
-            "3910": 11.90409,
-            "3915": 11.88375,
-            "3920": 11.9526,
-            "3925": 11.8796,
-            "3930": 11.92607,
-            "3935": 12.02111,
-            "3940": 11.89989,
-            "3945": 11.96829,
-            "3950": 11.92362,
-            "3955": 11.91298,
-            "3960": 11.93391,
-            "3965": 11.9977,
-            "3970": 11.91134,
-            "3975": 11.87698,
-            "3980": 11.84039,
-            "3985": 11.8296,
-            "3990": 11.8824,
-            "3995": 12.03103,
-            "4000": 12.53061,
-            "4005": 11.99032,
-            "4010": 11.94569,
-            "4015": 12.02459,
-            "4020": 12.05098,
-            "4025": 11.9408,
-            "4030": 11.9872,
-            "4035": 11.91882,
-            "4040": 11.91053,
-            "4045": 11.94764,
-            "4050": 11.96252,
-            "4055": 11.92924,
-            "4060": 11.95584,
-            "4065": 11.96477,
-            "4070": 11.95333,
-            "4075": 11.95009,
-            "4080": 11.94196,
-            "4085": 11.96679,
-            "4090": 12.09863,
-            "4095": 12.09521,
-            "4100": 11.99854,
-            "4105": 12.05345,
-            "4110": 11.99127,
-            "4115": 12.05731,
-            "4120": 11.95072,
-            "4125": 12.09249,
-            "4130": 12.04972,
-            "4135": 11.892,
-            "4140": 11.93048,
-            "4145": 11.92862,
-            "4150": 12.00088,
-            "4155": 11.95542,
-            "4160": 12.01499,
-            "4165": 11.90691,
-            "4170": 11.99204,
-            "4175": 12.02661,
-            "4180": 12.08762,
-            "4185": 11.93626,
-            "4190": 11.96513,
-            "4195": 11.9247,
-            "4200": 11.89449,
-            "4205": 11.95353,
-            "4210": 11.90984,
-            "4215": 11.92857,
-            "4220": 11.99809,
-            "4225": 12.01358,
-            "4230": 12.00065,
-            "4235": 11.95146,
-            "4240": 12.12674,
-            "4245": 11.99718,
-            "4250": 11.98808,
-            "4255": 11.95388,
-            "4260": 11.91437,
-            "4265": 11.97358,
-            "4270": 11.99013,
-            "4275": 11.95746,
-            "4280": 11.9273,
-            "4285": 11.92873,
-            "4290": 11.94103,
-            "4295": 11.93054,
-            "4300": 11.92986,
-            "4305": 12.11627,
-            "4310": 11.95471,
-            "4315": 11.96985,
-            "4320": 12.03911,
-            "4325": 12.01041,
-            "4330": 11.93084,
-            "4335": 11.95171,
-            "4340": 12.03209,
-            "4345": 11.94503,
-            "4350": 11.95426,
-            "4355": 12.08714,
-            "4360": 12.18212,
-            "4365": 11.94575,
-            "4370": 11.96598,
-            "4375": 12.00939,
-            "4380": 12.08808,
-            "4385": 11.9772,
-            "4390": 12.02704,
-            "4395": 12.01062,
-            "4400": 11.94619,
-            "4405": 11.98609,
-            "4410": 11.98025,
-            "4415": 11.99156,
-            "4420": 11.96913,
-            "4425": 12.02991,
-            "4430": 11.98417,
-            "4435": 12.07654,
-            "4440": 12.09429,
-            "4445": 11.9962,
-            "4450": 11.91032,
-            "4455": 11.99724,
-            "4460": 11.94549,
-            "4465": 11.92313,
-            "4470": 11.98709,
-            "4475": 11.9946,
-            "4480": 12.041,
-            "4485": 11.98684,
-            "4490": 12.00793,
-            "4495": 11.96519,
-            "4500": 11.91768,
-            "4505": 11.93855,
-            "4510": 11.96344,
-            "4515": 11.93266,
-            "4520": 11.99772,
-            "4525": 12.00265,
-            "4530": 12.00144,
-            "4535": 11.93099,
-            "4540": 11.9976,
-            "4545": 12.04415,
-            "4550": 11.92104,
-            "4555": 11.97762,
-            "4560": 12.05513,
-            "4565": 12.08413,
-            "4570": 12.00561,
-            "4575": 12.03402,
-            "4580": 12.07435,
-            "4585": 11.91157,
-            "4590": 11.93266,
-            "4595": 12.00575,
-            "4600": 11.98764,
-            "4605": 12.07608,
-            "4610": 11.98608,
-            "4615": 12.23058,
-            "4620": 11.96992,
-            "4625": 11.98931,
-            "4630": 11.92725,
-            "4635": 11.94909,
-            "4640": 11.94336,
-            "4645": 11.95955,
-            "4650": 11.99978,
-            "4655": 11.95199,
-            "4660": 11.97643,
-            "4665": 12.03686,
-            "4670": 12.0499,
-            "4675": 11.98439,
-            "4680": 12.00394,
-            "4685": 11.97515,
-            "4690": 11.95102,
-            "4695": 12.07552,
-            "4700": 11.9222,
-            "4705": 11.97387,
-            "4710": 11.99203,
-            "4715": 11.93004,
-            "4720": 11.97237,
-            "4725": 12.00277,
-            "4730": 12.00835,
-            "4735": 11.97435,
-            "4740": 11.98233,
-            "4745": 11.92423,
-            "4750": 11.95154,
-            "4755": 12.02084,
-            "4760": 11.94378,
-            "4765": 11.95313,
-            "4770": 11.92338,
-            "4775": 11.92352,
-            "4780": 12.00277,
-            "4785": 11.94768,
-            "4790": 11.97296,
-            "4795": 11.98757,
-            "4800": 12.26361,
-            "4805": 11.90736,
-            "4810": 11.9844,
-            "4815": 12.04212,
-            "4820": 11.98762,
-            "4825": 12.89959,
-            "4830": 11.9442,
-            "4835": 12.35106,
-            "4840": 11.93828,
-            "4845": 11.92418,
-            "4850": 11.96443,
-            "4855": 12.03431,
-            "4860": 12.04422,
-            "4865": 11.9646,
-            "4870": 11.91857,
-            "4875": 11.95672,
-            "4880": 11.9198,
-            "4885": 11.96783,
-            "4890": 11.94953,
-            "4895": 11.96692,
-            "4900": 12.04475,
-            "4905": 12.05877,
-            "4910": 12.15039,
-            "4915": 12.15039,
-            "4920": 11.95008,
-            "4925": 11.96843,
-            "4930": 11.958,
-            "4935": 11.98531,
-            "4940": 11.90874,
-            "4945": 11.95752,
-            "4950": 12.01284,
-            "4955": 11.97799,
-            "4960": 11.99989,
-            "4965": 11.9277,
-            "4970": 12.06095,
-            "4975": 11.95713,
-            "4980": 12.02719,
-            "4985": 11.96446,
-            "4990": 11.92043,
-            "4995": 11.99522,
-            "5000": 12.0792,
-            "5005": 11.95462,
-            "5010": 18.30939,
-            "5015": 12.57034,
-            "5020": 12.13652,
-            "5025": 11.95064,
-            "5030": 11.93538,
-            "5035": 12.01779,
-            "5040": 11.8639,
-            "5045": 11.89312,
-            "5050": 11.93054,
-            "5055": 11.89904,
-            "5060": 11.88635,
-            "5065": 11.89505,
-            "5070": 11.95957,
-            "5075": 11.96591,
-            "5080": 11.85594,
-            "5085": 11.87343,
-            "5090": 11.89162,
-            "5095": 11.9231,
-            "5100": 11.9213,
-            "5105": 11.9793,
-            "5110": 11.92942,
-            "5115": 11.87025,
-            "5120": 11.84167,
-            "5125": 11.92967,
-            "5130": 11.90523,
-            "5135": 11.8727,
-            "5140": 11.95822,
-            "5145": 11.97795,
-            "5150": 11.90614,
-            "5155": 11.88276,
-            "5160": 11.94188,
-            "5165": 11.91373,
-            "5170": 12.01192,
-            "5175": 11.85511,
-            "5180": 11.84375,
-            "5185": 11.88965,
-            "5190": 11.88542,
-            "5195": 11.85346,
-            "5200": 11.94188,
-            "5205": 11.92082,
-            "5210": 11.8821,
-            "5215": 11.92239,
-            "5220": 11.90608,
-            "5225": 11.8947,
-            "5230": 11.88619,
-            "5235": 11.8948,
-            "5240": 11.89599,
-            "5245": 11.88662,
-            "5250": 11.95415,
-            "5255": 11.96527,
-            "5260": 11.89009,
-            "5265": 11.87997,
-            "5270": 11.94016,
-            "5275": 11.89138,
-            "5280": 11.90447,
-            "5285": 11.86453,
-            "5290": 11.90845,
-            "5295": 11.89373,
-            "5300": 11.96084,
-            "5305": 12.00505,
-            "5310": 11.87874,
-            "5315": 11.94047,
-            "5320": 11.90115,
-            "5325": 11.8657,
-            "5330": 11.98456,
-            "5335": 11.89142,
-            "5340": 11.94056,
-            "5345": 11.88326,
-            "5350": 12.02941,
-            "5355": 11.94937,
-            "5360": 11.84158,
-            "5365": 11.85236,
-            "5370": 11.89414,
-            "5375": 11.92681,
-            "5380": 11.89983,
-            "5385": 11.93247,
-            "5390": 11.88545,
-            "5395": 11.85963,
-            "5400": 11.87187,
-            "5405": 11.92558,
-            "5410": 11.94364,
-            "5415": 11.9087,
-            "5420": 11.86332,
-            "5425": 11.92767,
-            "5430": 11.87425,
-            "5435": 11.91049,
-            "5440": 11.87699,
-            "5445": 11.93171,
-            "5450": 11.90161,
-            "5455": 11.921,
-            "5460": 11.88038,
-            "5465": 11.91315,
-            "5470": 11.89728,
-            "5475": 11.95689,
-            "5480": 11.98965,
-            "5485": 11.91576,
-            "5490": 11.89757,
-            "5495": 11.93064,
-            "5500": 11.88252,
-            "5505": 11.96073,
-            "5510": 11.86654,
-            "5515": 11.87886,
-            "5520": 11.90936,
-            "5525": 12.03373,
-            "5530": 11.90318,
-            "5535": 11.92154,
-            "5540": 11.90086,
-            "5545": 11.89022,
-            "5550": 11.90225,
-            "5555": 11.83513,
-            "5560": 11.91062,
-            "5565": 11.87125,
-            "5570": 11.87145,
-            "5575": 11.86357,
-            "5580": 11.91841,
-            "5585": 11.92436,
-            "5590": 11.9023,
-            "5595": 11.86709,
-            "5600": 11.91375,
-            "5605": 11.90872,
-            "5610": 11.8916,
-            "5615": 11.95578,
-            "5620": 11.89294,
-            "5625": 11.90784,
-            "5630": 11.92391,
-            "5635": 11.89956,
-            "5640": 11.89869,
-            "5645": 11.91776,
-            "5650": 11.9431,
-            "5655": 11.89517,
-            "5660": 11.88968,
-            "5665": 11.89529,
-            "5670": 11.91051,
-            "5675": 11.91888,
-            "5680": 11.90991,
-            "5685": 11.93985,
-            "5690": 11.90708,
-            "5695": 11.8876,
-            "5700": 11.95923,
-            "5705": 11.93355,
-            "5710": 11.87364,
-            "5715": 11.9268,
-            "5720": 11.98226,
-            "5725": 11.87678,
-            "5730": 11.83368,
-            "5735": 11.89468,
-            "5740": 11.90674,
-            "5745": 11.88476,
-            "5750": 11.86646,
-            "5755": 11.88929,
-            "5760": 11.85649,
-            "5765": 11.85565,
-            "5770": 11.93646,
-            "5775": 11.90704,
-            "5780": 12.04897,
-            "5785": 11.91885,
-            "5790": 11.90414,
-            "5795": 11.92795,
-            "5800": 11.9484,
-            "5805": 11.9947,
-            "5810": 11.88562,
-            "5815": 11.89893,
-            "5820": 11.86069,
-            "5825": 11.85602,
-            "5830": 11.90577,
-            "5835": 11.90369,
-            "5840": 11.95291,
-            "5845": 11.93547,
-            "5850": 11.89776,
-            "5855": 11.89365,
-            "5860": 11.88809,
-            "5865": 11.89502,
-            "5870": 11.90093,
-            "5875": 11.89463,
-            "5880": 11.85877,
-            "5885": 11.91775,
-            "5890": 11.9362,
-            "5895": 11.90238,
-            "5900": 11.89416,
-            "5905": 11.9161,
-            "5910": 11.91617,
-            "5915": 11.89704,
-            "5920": 11.86193,
-            "5925": 11.94942,
-            "5930": 11.85147,
-            "5935": 11.87033,
-            "5940": 11.9311,
-            "5945": 11.96348,
-            "5950": 11.96932,
-            "5955": 11.90137,
-            "5960": 11.87563,
-            "5965": 11.86128,
-            "5970": 11.99512,
-            "5975": 11.92846,
-            "5980": 11.83738,
-            "5985": 11.88075,
-            "5990": 11.89265,
-            "5995": 11.92537,
-            "6000": 11.88009,
-            "6005": 11.9523,
-            "6010": 11.93509,
-            "6015": 11.89766,
-            "6020": 11.88045,
-            "6025": 11.87641,
-            "6030": 246.60413,
-            "6035": 12.33879,
-            "6040": 11.91607,
-            "6045": 11.95709,
-            "6050": 11.93381,
-            "6055": 11.91355,
-            "6060": 11.91286,
-            "6065": 11.97819,
-            "6070": 11.93373,
-            "6075": 11.85049,
-            "6080": 11.96747,
-            "6085": 11.93318,
-            "6090": 11.93239,
-            "6095": 11.8622,
-            "6100": 11.88525,
-            "6105": 11.97899,
-            "6110": 11.91577,
-            "6115": 11.92755,
-            "6120": 11.92296,
-            "6125": 11.99725,
-            "6130": 11.97753,
-            "6135": 11.92108,
-            "6140": 11.91607,
-            "6145": 11.9071,
-            "6150": 11.92499,
-            "6155": 11.91611,
-            "6160": 12.01604,
-            "6165": 11.89838,
-            "6170": 11.90254,
-            "6175": 11.96493,
-            "6180": 11.84452,
-            "6185": 11.91052,
-            "6190": 11.8712,
-            "6195": 11.90582,
-            "6200": 11.90605,
-            "6205": 11.98397,
-            "6210": 11.92035,
-            "6215": 11.96579,
-            "6220": 11.99275,
-            "6225": 11.88749,
-            "6230": 11.89369,
-            "6235": 11.95748,
-            "6240": 11.93057,
-            "6245": 11.94912,
-            "6250": 11.9372,
-            "6255": 11.90439,
-            "6260": 11.92527,
-            "6265": 11.95201,
-            "6270": 11.9095,
-            "6275": 11.97821,
-            "6280": 11.94458,
-            "6285": 11.90287,
-            "6290": 11.89278,
-            "6295": 11.96073,
-            "6300": 11.90554,
-            "6305": 11.88653,
-            "6310": 11.8962,
-            "6315": 11.93036,
-            "6320": 11.95396,
-            "6325": 11.94894,
-            "6330": 12.04569,
-            "6335": 11.88055,
-            "6340": 11.91066,
-            "6345": 11.89024,
-            "6350": 11.89994,
-            "6355": 11.92221,
-            "6360": 11.92333,
-            "6365": 11.91761,
-            "6370": 11.97313,
-            "6375": 11.90689,
-            "6380": 12.08922,
-            "6385": 11.94942,
-            "6390": 11.91702,
-            "6395": 11.90139,
-            "6400": 11.89012,
-            "6405": 11.9541,
-            "6410": 12.00044,
-            "6415": 11.89967,
-            "6420": 11.86695,
-            "6425": 11.87294,
-            "6430": 11.89524,
-            "6435": 11.94881,
-            "6440": 11.91361,
-            "6445": 11.91243,
-            "6450": 11.90246,
-            "6455": 11.88301,
-            "6460": 11.94133,
-            "6465": 11.95353,
-            "6470": 11.93545,
-            "6475": 11.91767,
-            "6480": 11.904,
-            "6485": 11.97366,
-            "6490": 11.9268,
-            "6495": 11.92497,
-            "6500": 12.05293,
-            "6505": 11.83715,
-            "6510": 11.86732,
-            "6515": 11.90038,
-            "6520": 11.86776,
-            "6525": 11.86971,
-            "6530": 11.85789,
-            "6535": 11.88616,
-            "6540": 11.85825,
-            "6545": 11.82803,
-            "6550": 11.89596,
-            "6555": 11.89246,
-            "6560": 11.87827,
-            "6565": 11.87369,
-            "6570": 11.88103,
-            "6575": 11.86696,
-            "6580": 11.90165,
-            "6585": 11.85113,
-            "6590": 11.85101,
-            "6595": 11.80896,
-            "6600": 11.90596,
-            "6605": 11.87406,
-            "6610": 11.8658,
-            "6615": 11.86475,
-            "6620": 11.88848,
-            "6625": 11.85675,
-            "6630": 11.84722,
-            "6635": 11.83752,
-            "6640": 11.8855,
-            "6645": 11.91332,
-            "6650": 11.86288,
-            "6655": 11.89588,
-            "6660": 11.8071,
-            "6665": 11.84093,
-            "6670": 11.88653,
-            "6675": 11.88047,
-            "6680": 11.87018,
-            "6685": 11.8411,
-            "6690": 11.82244,
-            "6695": 11.86596,
-            "6700": 11.85423,
-            "6705": 11.86228,
-            "6710": 11.86517,
-            "6715": 11.87189,
-            "6720": 11.84138,
-            "6725": 11.88097,
-            "6730": 11.90906,
-            "6735": 11.91578,
-            "6740": 11.88058,
-            "6745": 11.88169,
-            "6750": 12.03575,
-            "6755": 11.84511,
-            "6760": 11.84038,
-            "6765": 11.83499,
-            "6770": 11.87927,
-            "6775": 11.81349,
-            "6780": 13.01048,
-            "6785": 11.81032,
-            "6790": 11.93614,
-            "6795": 11.97801,
-            "6800": 11.86,
-            "6805": 11.83039,
-            "6810": 11.8441,
-            "6815": 11.89187,
-            "6820": 11.87841,
-            "6825": 11.86012,
-            "6830": 11.83442,
-            "6835": 11.85081,
-            "6840": 11.83799,
-            "6845": 11.82691,
-            "6850": 11.89092,
-            "6855": 11.82022,
-            "6860": 11.8279,
-            "6865": 11.79814,
-            "6870": 11.83217,
-            "6875": 11.90136,
-            "6880": 11.85295,
-            "6885": 11.84058,
-            "6890": 11.84482,
-            "6895": 11.82768,
-            "6900": 11.88337,
-            "6905": 11.84656,
-            "6910": 11.90272,
-            "6915": 11.8005,
-            "6920": 11.93804,
-            "6925": 12.00166,
-            "6930": 11.88293,
-            "6935": 11.9479,
-            "6940": 11.85228,
-            "6945": 11.86242,
-            "6950": 11.83582,
-            "6955": 11.81523,
-            "6960": 11.75894,
-            "6965": 11.81699,
-            "6970": 11.85282,
-            "6975": 11.84727,
-            "6980": 11.84729,
-            "6985": 12.01189,
-            "6990": 11.86887,
-            "6995": 11.88713,
-            "7000": 11.85612,
-            "7005": 11.86648,
-            "7010": 11.8888,
-            "7015": 11.84573,
-            "7020": 11.77395,
-            "7025": 11.85096,
-            "7030": 11.86323,
-            "7035": 11.84315,
-            "7040": 11.82293,
-            "7045": 11.81241,
-            "7050": 11.85808,
-            "7055": 11.86593,
-            "7060": 11.87475,
-            "7065": 11.90707,
-            "7070": 11.9358,
-            "7075": 11.84297,
-            "7080": 11.80853,
-            "7085": 11.88178,
-            "7090": 11.87836,
-            "7095": 11.85532,
-            "7100": 11.89414,
-            "7105": 11.85379,
-            "7110": 11.89642,
-            "7115": 11.85858,
-            "7120": 11.90327,
-            "7125": 11.89711,
-            "7130": 11.89177,
-            "7135": 11.88659,
-            "7140": 11.85757,
-            "7145": 11.87756,
-            "7150": 11.88577,
-            "7155": 11.86153,
-            "7160": 11.92297,
-            "7165": 11.88396,
-            "7170": 11.85778,
-            "7175": 11.91483,
-            "7180": 11.86232,
-            "7185": 11.87476,
-            "7190": 11.8982,
-            "7195": 11.88516,
-            "7200": 11.88158,
-            "7205": 11.88444,
-            "7210": 11.89206,
-            "7215": 11.87279,
-            "7220": 11.90742,
-            "7225": 11.85079,
-            "7230": 11.8483,
-            "7235": 11.90312,
-            "7240": 11.87181,
-            "7245": 11.91535,
-            "7250": 11.87908,
-            "7255": 11.92293,
-            "7260": 11.84549,
-            "7265": 11.8901,
-            "7270": 11.84322,
-            "7275": 11.848,
-            "7280": 11.8967,
-            "7285": 11.89986,
-            "7290": 11.95382,
-            "7295": 11.90753,
-            "7300": 11.86218,
-            "7305": 11.85436,
-            "7310": 11.85753,
-            "7315": 11.9134,
-            "7320": 11.90034,
-            "7325": 11.83407,
-            "7330": 11.85974,
-            "7335": 11.90032,
-            "7340": 11.88835,
-            "7345": 11.88443,
-            "7350": 11.85147,
-            "7355": 11.86003,
-            "7360": 11.88911,
-            "7365": 11.88721,
-            "7370": 11.94597,
-            "7375": 11.88507,
-            "7380": 11.8675,
-            "7385": 11.88615,
-            "7390": 11.85493,
-            "7395": 11.9078,
-            "7400": 11.89976,
-            "7405": 11.94755,
-            "7410": 11.86216,
-            "7415": 11.81832,
-            "7420": 11.89699,
-            "7425": 11.90201,
-            "7430": 11.88324,
-            "7435": 11.84242,
-            "7440": 11.89387,
-            "7445": 11.85554,
-            "7450": 11.927,
-            "7455": 11.89196,
-            "7460": 11.93241,
-            "7465": 11.89671,
-            "7470": 11.8633,
-            "7475": 11.85785,
-            "7480": 11.86619,
-            "7485": 11.90047,
-            "7490": 11.93453,
-            "7495": 11.89595,
-            "7500": 11.92255,
-            "7505": 11.86705,
-            "7510": 11.86492,
-            "7515": 11.83778,
-            "7520": 12.43308,
-            "7525": 11.94046,
-            "7530": 12.11911,
-            "7535": 11.95645,
-            "7540": 12.01144,
-            "7545": 11.94459,
-            "7550": 12.00989,
-            "7555": 11.95308,
-            "7560": 12.02894,
-            "7565": 12.00926,
-            "7570": 11.88032,
-            "7575": 11.94986,
-            "7580": 11.94673,
-            "7585": 11.92777,
-            "7590": 11.96311,
-            "7595": 11.90291,
-            "7600": 11.96776,
-            "7605": 11.91009,
-            "7610": 11.98945,
-            "7615": 11.943,
-            "7620": 11.97203,
-            "7625": 11.87696,
-            "7630": 11.92313,
-            "7635": 11.9056,
-            "7640": 11.89922,
-            "7645": 11.93063,
-            "7650": 11.89735,
-            "7655": 11.93078,
-            "7660": 11.95494,
-            "7665": 11.91011,
-            "7670": 11.97093,
-            "7675": 11.97514,
-            "7680": 11.93177,
-            "7685": 11.8992,
-            "7690": 11.94571,
-            "7695": 11.92277,
-            "7700": 11.94906,
-            "7705": 11.92727,
-            "7710": 11.93604,
-            "7715": 11.92305,
-            "7720": 11.93766,
-            "7725": 11.95622,
-            "7730": 11.90603,
-            "7735": 11.91132,
-            "7740": 11.97695,
-            "7745": 11.96601,
-            "7750": 11.88967,
-            "7755": 11.93644,
-            "7760": 11.96688,
-            "7765": 11.92672,
-            "7770": 23.39259,
-            "7775": 23.06567,
-            "7780": 11.93112,
-            "7785": 11.93477,
-            "7790": 11.94106,
-            "7795": 11.94556,
-            "7800": 12.0002,
-            "7805": 11.97342,
-            "7810": 11.95163,
-            "7815": 11.96208,
-            "7820": 11.96513,
-            "7825": 11.93368,
-            "7830": 11.91708,
-            "7835": 11.89017,
-            "7840": 11.94549,
-            "7845": 11.96002,
-            "7850": 11.95829,
-            "7855": 11.92186,
-            "7860": 11.93832,
-            "7865": 11.889,
-            "7870": 11.96191,
-            "7875": 12.05703,
-            "7880": 11.97288,
-            "7885": 11.91666,
-            "7890": 11.93728,
-            "7895": 11.96047,
-            "7900": 11.9818,
-            "7905": 11.92242,
-            "7910": 11.97684,
-            "7915": 11.91154,
-            "7920": 11.96828,
-            "7925": 11.94506,
-            "7930": 11.93465,
-            "7935": 11.90216,
-            "7940": 11.91383,
-            "7945": 11.91481,
-            "7950": 11.96693,
-            "7955": 11.94446,
-            "7960": 11.92358,
-            "7965": 11.94155,
-            "7970": 11.95822,
-            "7975": 12.03469,
-            "7980": 11.94102,
-            "7985": 11.94681,
-            "7990": 11.92459,
-            "7995": 11.92763,
-            "8000": 11.96299,
-            "8005": 11.9788,
-            "8010": 11.96826,
-            "8015": 12.02982,
-            "8020": 11.94329,
-            "8025": 11.98105,
-            "8030": 12.01501,
-            "8035": 11.96502,
-            "8040": 11.97586,
-            "8045": 11.96948,
-            "8050": 11.92611,
-            "8055": 11.93414,
-            "8060": 11.93961,
-            "8065": 11.9262,
-            "8070": 11.9178,
-            "8075": 11.90325,
-            "8080": 11.93833,
-            "8085": 11.97936,
-            "8090": 11.99724,
-            "8095": 11.94796,
-            "8100": 11.9625,
-            "8105": 11.94798,
-            "8110": 11.92353,
-            "8115": 11.96357,
-            "8120": 11.92451,
-            "8125": 11.89352,
-            "8130": 11.97563,
-            "8135": 11.97236,
-            "8140": 11.9723,
-            "8145": 11.92641,
-            "8150": 11.89834,
-            "8155": 11.94876,
-            "8160": 11.95465,
-            "8165": 11.95874,
-            "8170": 11.93402,
-            "8175": 11.96745,
-            "8180": 11.91172,
-            "8185": 11.91331,
-            "8190": 11.95504,
-            "8195": 11.94346,
-            "8200": 11.95192,
-            "8205": 11.9973,
-            "8210": 11.95023,
-            "8215": 12.03521,
-            "8220": 11.96486,
-            "8225": 11.95464,
-            "8230": 11.96151,
-            "8235": 11.95994,
-            "8240": 11.97909,
-            "8245": 11.92928,
-            "8250": 11.92518,
-            "8255": 11.94881,
-            "8260": 11.907,
-            "8265": 11.93185,
-            "8270": 11.9211,
-            "8275": 11.86366,
-            "8280": 12.00914,
-            "8285": 11.97086,
-            "8290": 11.98208,
-            "8295": 11.92309,
-            "8300": 11.94129,
-            "8305": 11.99302,
-            "8310": 11.97601,
-            "8315": 11.88862,
-            "8320": 11.96454,
-            "8325": 11.89961,
-            "8330": 11.99534,
-            "8335": 11.91687,
-            "8340": 11.96466,
-            "8345": 11.93152,
-            "8350": 11.94368,
-            "8355": 11.92235,
-            "8360": 11.99578,
-            "8365": 11.90045,
-            "8370": 11.91744,
-            "8375": 11.92667,
-            "8380": 11.90428,
-            "8385": 11.94828,
-            "8390": 11.93507,
-            "8395": 11.9473,
-            "8400": 11.94267,
-            "8405": 11.93414,
-            "8410": 11.90959,
-            "8415": 11.92941,
-            "8420": 11.91201,
-            "8425": 11.91625,
-            "8430": 11.9332,
-            "8435": 11.99456,
-            "8440": 11.8869,
-            "8445": 11.90729,
-            "8450": 11.93362,
-            "8455": 11.96619,
-            "8460": 12.01359,
-            "8465": 11.9429,
-            "8470": 11.99594,
-            "8475": 11.95465,
-            "8480": 11.92489,
-            "8485": 11.92415,
-            "8490": 11.97388,
-            "8495": 11.89913,
-            "8500": 11.95945,
-            "8505": 11.91567,
-            "8510": 11.91482,
-            "8515": 11.93548,
-            "8520": 11.95743,
-            "8525": 11.94743,
-            "8530": 12.42097,
-            "8535": 11.9272,
-            "8540": 12.09436,
-            "8545": 12.04967,
-            "8550": 11.9651,
-            "8555": 12.03857,
-            "8560": 11.97265,
-            "8565": 11.91082,
-            "8570": 11.95406,
-            "8575": 11.94802,
-            "8580": 11.9942,
-            "8585": 11.96288,
-            "8590": 11.95701,
-            "8595": 11.97786,
-            "8600": 11.89715,
-            "8605": 11.93644,
-            "8610": 11.98611,
-            "8615": 11.91557,
-            "8620": 11.92076,
-            "8625": 11.96113,
-            "8630": 11.99266,
-            "8635": 11.93916,
-            "8640": 12.02781,
-            "8645": 11.99006,
-            "8650": 11.91164,
-            "8655": 11.91924,
-            "8660": 11.95194,
-            "8665": 12.00021,
-            "8670": 11.90972,
-            "8675": 11.96086,
-            "8680": 11.95175,
-            "8685": 11.95495,
-            "8690": 12.00198,
-            "8695": 12.07659,
-            "8700": 11.96371,
-            "8705": 11.91845,
-            "8710": 11.97745,
-            "8715": 11.93805,
-            "8720": 11.9173,
-            "8725": 11.91035,
-            "8730": 12.01393,
-            "8735": 11.98447,
-            "8740": 11.97475,
-            "8745": 11.96291,
-            "8750": 11.9361,
-            "8755": 11.96838,
-            "8760": 11.93695,
-            "8765": 12.00162,
-            "8770": 11.92599,
-            "8775": 12.0012,
-            "8780": 12.03738,
-            "8785": 11.94909,
-            "8790": 11.90577,
-            "8795": 11.97012,
-            "8800": 11.93035,
-            "8805": 11.99893,
-            "8810": 11.94421,
-            "8815": 11.98191,
-            "8820": 11.99062,
-            "8825": 11.92267,
-            "8830": 11.95194,
-            "8835": 11.937,
-            "8840": 11.97075,
-            "8845": 11.95007,
-            "8850": 12.02522,
-            "8855": 11.94712,
-            "8860": 11.96728,
-            "8865": 11.89285,
-            "8870": 11.94189,
-            "8875": 11.92065,
-            "8880": 11.98822,
-            "8885": 11.98285,
-            "8890": 11.99582,
-            "8895": 11.96596,
-            "8900": 11.94354,
-            "8905": 11.95473,
-            "8910": 11.99259,
-            "8915": 11.96618,
-            "8920": 11.93587,
-            "8925": 11.99413,
-            "8930": 12.00638,
-            "8935": 11.93,
-            "8940": 11.95031,
-            "8945": 11.91928,
-            "8950": 11.9941,
-            "8955": 11.94031,
-            "8960": 11.96914,
-            "8965": 11.95062,
-            "8970": 11.95268,
-            "8975": 12.03161,
-            "8980": 11.97245,
-            "8985": 12.01027,
-            "8990": 11.9446,
-            "8995": 11.96843,
-            "9000": 11.9429,
-            "9005": 11.94091,
-            "9010": 11.93667,
-            "9015": 11.95344,
-            "9020": 11.93207,
-            "9025": 11.91998,
-            "9030": 11.92651,
-            "9035": 11.97131,
-            "9040": 11.92008,
-            "9045": 11.9777,
-            "9050": 11.93287,
-            "9055": 11.96682,
-            "9060": 11.982,
-            "9065": 11.9763,
-            "9070": 11.92703,
-            "9075": 11.95149,
-            "9080": 11.94863,
-            "9085": 11.92217,
-            "9090": 11.92326,
-            "9095": 11.9586,
-            "9100": 11.93403,
-            "9105": 11.97708,
-            "9110": 11.97248,
-            "9115": 11.91899,
-            "9120": 11.98175,
-            "9125": 12.0043,
-            "9130": 11.98361,
-            "9135": 11.95811,
-            "9140": 11.89116,
-            "9145": 11.92833,
-            "9150": 11.96999,
-            "9155": 11.95682,
-            "9160": 11.93898,
-            "9165": 11.98676,
-            "9170": 11.96776,
-            "9175": 11.91735,
-            "9180": 11.96488,
-            "9185": 11.93801,
-            "9190": 11.93829,
-            "9195": 11.96444,
-            "9200": 11.91924,
-            "9205": 11.99554,
-            "9210": 11.91977,
-            "9215": 11.99739,
-            "9220": 11.92053,
-            "9225": 11.93702,
-            "9230": 11.95815,
-            "9235": 12.05346,
-            "9240": 11.9596,
-            "9245": 11.97173,
-            "9250": 11.94092,
-            "9255": 11.94632,
-            "9260": 12.00354,
-            "9265": 11.96854,
-            "9270": 11.91621,
-            "9275": 11.94709,
-            "9280": 11.93375,
-            "9285": 11.92465,
-            "9290": 11.93047,
-            "9295": 11.93184,
-            "9300": 11.95538,
-            "9305": 11.96102,
-            "9310": 11.93874,
-            "9315": 11.94123,
-            "9320": 11.95854,
-            "9325": 11.98961,
-            "9330": 11.87394,
-            "9335": 11.97986,
-            "9340": 12.02583,
-            "9345": 11.94202,
-            "9350": 12.00113,
-            "9355": 11.97405,
-            "9360": 11.96746,
-            "9365": 11.96018,
-            "9370": 11.9475,
-            "9375": 11.94327,
-            "9380": 11.92135,
-            "9385": 12.01574,
-            "9390": 11.95494,
-            "9395": 11.93529,
-            "9400": 11.96463,
-            "9405": 11.9807,
-            "9410": 11.92926,
-            "9415": 11.95919,
-            "9420": 11.94796,
-            "9425": 11.94261,
-            "9430": 11.94968,
-            "9435": 11.9655,
-            "9440": 11.94016,
-            "9445": 11.98541,
-            "9450": 11.94602,
-            "9455": 11.96365,
-            "9460": 11.9884,
-            "9465": 11.93962,
-            "9470": 11.93471,
-            "9475": 11.91073,
-            "9480": 11.92557,
-            "9485": 11.93537,
-            "9490": 11.97267,
-            "9495": 11.93521,
-            "9500": 11.92542,
-            "9505": 12.00627,
-            "9510": 11.9749,
-            "9515": 11.97511,
-            "9520": 11.88493,
-            "9525": 11.91739,
-            "9530": 11.92418,
-            "9535": 11.97024
-        }
-    }
-}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml
new file mode 100644
index 00000000000..d169e050402
--- /dev/null
+++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release/model_config.yaml
@@ -0,0 +1,168 @@
+# The proxy model is used for local code quality check.
+# The proxy model should contain all the necessary components and settings but fewer parameters.
+ENV_VARS:
+  TORCH_NCCL_AVOID_RECORD_STREAMS: 0
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  NVTE_FUSED_ATTN: 1
+  NVTE_NORM_FWD_USE_CUDNN: 1
+  NVTE_NORM_BWD_USE_CUDNN: 1
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+  NON_DETERMINSTIC_RESULTS: 1
+  NVSHMEM_IB_ENABLE_IBGDA: 0
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 16
+  USE_MNNVL: 1
+TEST_TYPE: "release"
+MODEL_ARGS:
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 4
+  --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL
+  --expert-model-parallel-size: 16
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+
+  # Training args
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --use-flash-attn: true
+  --disable-bias-linear: true
+  --micro-batch-size: 1
+  --global-batch-size: 512
+  --train-samples: 24414062
+  --exit-duration-in-mins: 220
+  --no-check-for-nan-in-loss-and-grad: true
+  --cross-entropy-loss-fusion: true
+  --cross-entropy-fusion-impl: te
+  --manual-gc: true
+  --manual-gc-interval: 10
+
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --tokenizer-type: GPTSentencePieceTokenizer
+  --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model
+  --data-path: $DATA_BLEND
+  --split: 99,1,0
+  --no-mmap-bin-files: true
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 6
+
+  # Add network size args
+  --num-layers: 14 # original 61 layers
+  --hidden-size: 7168
+  --ffn-hidden-size: 18432
+  --num-attention-heads: 128
+  --kv-channels: 128
+  --max-position-embeddings: 4096
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --make-vocab-size-divisible-by: 3232
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --swiglu: true
+  --untie-embeddings-and-output-weights: true
+  --multi-latent-attention: true
+  --mtp-num-layers: 1
+  --mtp-loss-scaling-factor: 0.1
+
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  --qk-layernorm: true
+
+  # Add learning rate args
+  --lr-decay-samples: 24413696
+  --lr-warmup-samples: 1536000
+  --lr-warmup-init: 1e-7
+  --lr: 1e-5
+  --min-lr: 1e-6
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+
+  # Add MoE args
+  --num-experts: 64 # local 4 + 1 shared, EP16
+  --moe-layer-freq: ([0]*3+[1]*11)
+  --moe-ffn-hidden-size: 2048
+  --moe-shared-expert-intermediate-size: 2048
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-router-topk: 8
+  --moe-token-dispatcher-type: flex
+  --moe-flex-dispatcher-backend: hybridep
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 4
+  --moe-router-num-groups: 8
+  --moe-router-topk-scaling-factor: 2.5
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  --moe-permute-fusion: true
+
+  # Add MLA args
+  --q-lora-rank: 1536
+  --kv-lora-rank: 512
+  --qk-head-dim: 128
+  --qk-pos-emb-head-dim: 64
+  --v-head-dim: 128
+  --rotary-scaling-factor: 40
+  --mscale: 1.0
+  --mscale-all-dim: 1.0
+
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+
+  # Add checkpointing args
+  --auto-detect-ckpt-format:
+    true
+    # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --save-interval: 5000
+  --save-retain-interval: 10000
+  --dist-ckpt-strictness: log_all
+
+  # Add initialization args
+  --init-method-std: 0.02
+
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
+  --wandb-exp-name: ${WANDB_EXPERIMENT}
+  --wandb-save-dir: ${WANDB_SAVE_PATH}
+
+  # Add mixed precision args
+  --bf16: true
+
+  # enable experimental
+  --enable-experimental: true
+METRICS:
+  - "iteration-time"
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml
new file mode 100644
index 00000000000..2ac08d088a0
--- /dev/null
+++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_gb_200_release_sm/model_config.yaml
@@ -0,0 +1,169 @@
+# The proxy model is used for local code quality check.
+# The proxy model should contain all the necessary components and settings but fewer parameters.
+ENV_VARS:
+  TORCH_NCCL_AVOID_RECORD_STREAMS: 0
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  NVTE_FUSED_ATTN: 1
+  NVTE_NORM_FWD_USE_CUDNN: 1
+  NVTE_NORM_BWD_USE_CUDNN: 1
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+  NON_DETERMINSTIC_RESULTS: 1
+  NVSHMEM_IB_ENABLE_IBGDA: 0
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 16
+  USE_MNNVL: 1
+TEST_TYPE: "release"
+MODEL_ARGS:
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 4
+  --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL
+  --expert-model-parallel-size: 16
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+
+  # Training args
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --use-flash-attn: true
+  --disable-bias-linear: true
+  --micro-batch-size: 1
+  --global-batch-size: 512
+  --train-samples: 24414062
+  --exit-duration-in-mins: 220
+  --no-check-for-nan-in-loss-and-grad: true
+  --cross-entropy-loss-fusion: true
+  --cross-entropy-fusion-impl: te
+  --manual-gc: true
+  --manual-gc-interval: 10
+
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --tokenizer-type: GPTSentencePieceTokenizer
+  --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model
+  --data-path: $DATA_BLEND
+  --split: 99,1,0
+  --no-mmap-bin-files: true
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 6
+
+  # Add network size args
+  --num-layers: 14 # original 61 layers
+  --hidden-size: 7168
+  --ffn-hidden-size: 18432
+  --num-attention-heads: 128
+  --kv-channels: 128
+  --max-position-embeddings: 4096
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --make-vocab-size-divisible-by: 3232
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --swiglu: true
+  --untie-embeddings-and-output-weights: true
+  --multi-latent-attention: true
+  --mtp-num-layers: 1
+  --mtp-loss-scaling-factor: 0.1
+
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  --qk-layernorm: true
+
+  # Add learning rate args
+  --lr-decay-samples: 24413696
+  --lr-warmup-samples: 1536000
+  --lr-warmup-init: 1e-7
+  --lr: 1e-5
+  --min-lr: 1e-6
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+
+  # Add MoE args
+  --num-experts: 64 # local 4 + 1 shared, EP16
+  --moe-layer-freq: ([0]*3+[1]*11)
+  --moe-ffn-hidden-size: 2048
+  --moe-shared-expert-intermediate-size: 2048
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-router-topk: 8
+  --moe-token-dispatcher-type: flex
+  --moe-flex-dispatcher-backend: hybridep
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 4
+  --moe-router-num-groups: 8
+  --moe-router-topk-scaling-factor: 2.5
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  --moe-permute-fusion: true
+
+  # Add MLA args
+  --q-lora-rank: 1536
+  --kv-lora-rank: 512
+  --qk-head-dim: 128
+  --qk-pos-emb-head-dim: 64
+  --v-head-dim: 128
+  --rotary-scaling-factor: 40
+  --mscale: 1.0
+  --mscale-all-dim: 1.0
+
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+
+  # Add checkpointing args
+  --auto-detect-ckpt-format:
+    true
+    # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --save-interval: 5000
+  --save-retain-interval: 2500
+  --dist-ckpt-strictness: log_all
+
+  # Add initialization args
+  --init-method-std: 0.02
+
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
+  --wandb-exp-name: ${WANDB_EXPERIMENT}
+  --wandb-save-dir: ${WANDB_SAVE_PATH}
+
+  # Add mixed precision args
+  --bf16: true
+
+  # enable experimental
+  --enable-experimental: true
+  --exit-interval: 9536
+METRICS:
+  - "iteration-time"
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml
index c16fedc7860..3c7c4201b6e 100644
--- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release/model_config.yaml
@@ -12,11 +12,12 @@ ENV_VARS:
   NCCL_DEBUG: VERSION
   NON_DETERMINSTIC_RESULTS: 1
   NVSHMEM_IB_ENABLE_IBGDA: 0
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
 TEST_TYPE: "release"
 MODEL_ARGS:
   # Distributed args
   --distributed-timeout-minutes: 60
-  --tensor-model-parallel-size: 1
+  --tensor-model-parallel-size: 2
   --pipeline-model-parallel-size: 4
   --pipeline-model-parallel-layout: Et*2\\|\\(tt\\|\\)*5t\\|tmL # Et*2|(tt|)*5t|tmL
   --expert-model-parallel-size: 16
@@ -47,8 +48,8 @@ MODEL_ARGS:
   # Data args
   --seq-length: 4096
   --data-cache-path: ${DATA_CACHE_PATH}
-  --tokenizer-type: HuggingFaceTokenizer
-  --tokenizer-model: ${TOKENIZER_PATH}
+  --tokenizer-type: GPTSentencePieceTokenizer
+  --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model
   --data-path: $DATA_BLEND
   --split: 99,1,0
   --no-mmap-bin-files: true
@@ -81,12 +82,11 @@ MODEL_ARGS:
   --qk-layernorm: true
 
   # Add learning rate args
-  --lr-decay-samples: 584765624
+  --lr-decay-samples: 24413696
   --lr-warmup-samples: 1536000
-  # Learning rate scaled down from 7.3e-6 (DeepSeek-V3 technical report, GBS=15360) to 3.9e-6 (GBS=8192)
-  --lr-warmup-init: 3.9e-7
-  --lr: 3.9e-6
-  --min-lr: 3.9e-7
+  --lr-warmup-init: 1e-7
+  --lr: 1e-5
+  --min-lr: 1e-6
   --lr-decay-style: cosine
   --adam-beta1: 0.9
   --adam-beta2: 0.95
@@ -127,14 +127,12 @@ MODEL_ARGS:
   --eval-interval: 200
 
   # Add checkpointing args
-  --no-load-optim: true
-  --no-load-rng: true
   --auto-detect-ckpt-format:
     true
     # Add checkpointing args
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}
-  --save-interval: 500
+  --save-interval: 5000
   --save-retain-interval: 10000
   --dist-ckpt-strictness: log_all
 
@@ -152,6 +150,7 @@ MODEL_ARGS:
   --logging-level: 40
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
 
diff --git a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml
similarity index 95%
rename from tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml
rename to tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml
index 9c7d2496e2a..fead6c06ae1 100644
--- a/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etc1cp1_release_sm/model_config.yml
+++ b/tests/functional_tests/test_cases/mixtral/deepseekv3_proxy_flex_tp1pp4emp16etp1cp1_release_sm/model_config.yaml
@@ -13,7 +13,7 @@ ENV_VARS:
   NON_DETERMINSTIC_RESULTS: 1
   NVSHMEM_IB_ENABLE_IBGDA: 0
   CUDA_DEVICE_MAX_CONNECTIONS: 1
-TEST_TYPE: 'release'
+TEST_TYPE: "release"
 MODEL_ARGS:
   # Distributed args
   --distributed-timeout-minutes: 60
@@ -132,8 +132,8 @@ MODEL_ARGS:
     # Add checkpointing args
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}
-  --save-interval: 500
-  --save-retain-interval: 10000
+  --save-interval: 5000
+  --save-retain-interval: 2500
   --dist-ckpt-strictness: log_all
 
   # Add initialization args
@@ -150,6 +150,7 @@ MODEL_ARGS:
   --logging-level: 40
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
 
@@ -160,7 +161,7 @@ MODEL_ARGS:
   --enable-experimental: true
   --exit-interval: 9536
 METRICS:
-  - 'iteration-time'
-  - 'lm loss'
-  - 'mem-allocated-bytes'
-  - 'mem-max-allocated-bytes'
+  - "iteration-time"
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml
index e2b8b212900..efe39998065 100644
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml
@@ -92,6 +92,7 @@ MODEL_ARGS:
   --log-interval: 1
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
   # Add mixed precision args
diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml
index a02fbe99537..f4476c712f2 100644
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml
@@ -92,6 +92,7 @@ MODEL_ARGS:
   --log-interval: 1
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
   # Add mixed precision args
diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml
index b43a1227ea0..cfeb7709839 100644
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml
@@ -92,6 +92,7 @@ MODEL_ARGS:
   --log-interval: 1
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
   # Add mixed precision args
diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml
index 1fdad2a5c70..29dcefadf0e 100644
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml
@@ -94,6 +94,7 @@ MODEL_ARGS:
   --log-interval: 1
   --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
   # Add mixed precision args
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json
index acc70537006..fee855b0084 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json
@@ -2,91 +2,286 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.82721,
-            "5": 10.85697,
-            "10": 10.79166,
-            "15": 10.82555,
-            "20": 10.7225,
-            "25": 10.54453,
-            "30": 10.35773,
-            "35": 10.27098,
-            "40": 10.09715,
-            "45": 9.84113,
-            "50": 9.92414
+            "1": 10.82753,
+            "2": 10.84043,
+            "3": 10.82715,
+            "4": 10.81921,
+            "5": 10.85715,
+            "6": 10.86963,
+            "7": 10.85115,
+            "8": 10.84459,
+            "9": 10.85294,
+            "10": 10.79205,
+            "11": 10.86576,
+            "12": 10.87104,
+            "13": 10.87066,
+            "14": 10.8786,
+            "15": 10.82531,
+            "16": 10.81239,
+            "17": 10.77441,
+            "18": 10.81066,
+            "19": 10.79655,
+            "20": 10.72261,
+            "21": 10.69716,
+            "22": 10.55179,
+            "23": 10.70541,
+            "24": 10.59,
+            "25": 10.5444,
+            "26": 10.60019,
+            "27": 10.62037,
+            "28": 10.57394,
+            "29": 10.58621,
+            "30": 10.35743,
+            "31": 10.12236,
+            "32": 10.4699,
+            "33": 10.45701,
+            "34": 10.21542,
+            "35": 10.27175,
+            "36": 10.23575,
+            "37": 10.35238,
+            "38": 10.20563,
+            "39": 10.40098,
+            "40": 10.09712,
+            "41": 10.13849,
+            "42": 10.21817,
+            "43": 9.84392,
+            "44": 9.96202,
+            "45": 9.84103,
+            "46": 9.81937,
+            "47": 10.13889,
+            "48": 9.85138,
+            "49": 9.53556,
+            "50": 9.92467
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 4712.0,
-            "5": 5441.0,
-            "10": 4322.0,
-            "15": 5376.0,
-            "20": 4936.0,
-            "25": 4834.0,
-            "30": 5393.0,
-            "35": 5612.0,
-            "40": 5947.0,
-            "45": 5737.0,
-            "50": 6611.0
+            "1": 4603.0,
+            "2": 5083.0,
+            "3": 4785.0,
+            "4": 4984.0,
+            "5": 5363.0,
+            "6": 5526.0,
+            "7": 5186.0,
+            "8": 4832.0,
+            "9": 5266.0,
+            "10": 4277.0,
+            "11": 5578.0,
+            "12": 5167.0,
+            "13": 5542.0,
+            "14": 5534.0,
+            "15": 5159.0,
+            "16": 5362.0,
+            "17": 5218.0,
+            "18": 5139.0,
+            "19": 5256.0,
+            "20": 4828.0,
+            "21": 5250.0,
+            "22": 4751.0,
+            "23": 5581.0,
+            "24": 5143.0,
+            "25": 4818.0,
+            "26": 5119.0,
+            "27": 5303.0,
+            "28": 5695.0,
+            "29": 5950.0,
+            "30": 5442.0,
+            "31": 4846.0,
+            "32": 5628.0,
+            "33": 6184.0,
+            "34": 5101.0,
+            "35": 5705.0,
+            "36": 5638.0,
+            "37": 6355.0,
+            "38": 6140.0,
+            "39": 6610.0,
+            "40": 5946.0,
+            "41": 5935.0,
+            "42": 6405.0,
+            "43": 5917.0,
+            "44": 5830.0,
+            "45": 5791.0,
+            "46": 6026.0,
+            "47": 6456.0,
+            "48": 6440.0,
+            "49": 6174.0,
+            "50": 6644.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1114775040.0,
-            "5": 1114770944.0,
-            "10": 1114772992.0,
-            "15": 1114774016.0,
-            "20": 1114772480.0,
-            "25": 1114770944.0,
-            "30": 1114770944.0,
-            "35": 1114775040.0,
-            "40": 1114774016.0,
-            "45": 1114772992.0,
-            "50": 1114773504.0
+            "1": 1116843520.0,
+            "2": 1116841984.0,
+            "3": 1116839936.0,
+            "4": 1116843008.0,
+            "5": 1116839424.0,
+            "6": 1116839936.0,
+            "7": 1116840960.0,
+            "8": 1116839936.0,
+            "9": 1116842496.0,
+            "10": 1116841472.0,
+            "11": 1116841984.0,
+            "12": 1116839936.0,
+            "13": 1116845056.0,
+            "14": 1116838912.0,
+            "15": 1116842496.0,
+            "16": 1116841472.0,
+            "17": 1116838912.0,
+            "18": 1116843520.0,
+            "19": 1116839936.0,
+            "20": 1116841472.0,
+            "21": 1116838912.0,
+            "22": 1116840448.0,
+            "23": 1116840448.0,
+            "24": 1116843520.0,
+            "25": 1116839424.0,
+            "26": 1116843008.0,
+            "27": 1116840960.0,
+            "28": 1116842496.0,
+            "29": 1116843008.0,
+            "30": 1116839936.0,
+            "31": 1116846080.0,
+            "32": 1116842496.0,
+            "33": 1116841472.0,
+            "34": 1116840960.0,
+            "35": 1116843520.0,
+            "36": 1116838912.0,
+            "37": 1116840448.0,
+            "38": 1116841472.0,
+            "39": 1116840448.0,
+            "40": 1116841984.0,
+            "41": 1116842496.0,
+            "42": 1116843520.0,
+            "43": 1116844032.0,
+            "44": 1116843008.0,
+            "45": 1116840960.0,
+            "46": 1116842496.0,
+            "47": 1116841984.0,
+            "48": 1116839424.0,
+            "49": 1116837376.0,
+            "50": 1116843008.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 1563141632.0,
-            "5": 2020767232.0,
-            "10": 2023552512.0,
-            "15": 2025326592.0,
-            "20": 2025326592.0,
-            "25": 2025326592.0,
-            "30": 2025326592.0,
-            "35": 2028347392.0,
-            "40": 2028347392.0,
-            "45": 2028347392.0,
-            "50": 2028347392.0
+            "1": 1562991104.0,
+            "2": 2022045696.0,
+            "3": 2022045696.0,
+            "4": 2023063552.0,
+            "5": 2023063552.0,
+            "6": 2023063552.0,
+            "7": 2023063552.0,
+            "8": 2023063552.0,
+            "9": 2023063552.0,
+            "10": 2025666048.0,
+            "11": 2025666048.0,
+            "12": 2025666048.0,
+            "13": 2027637760.0,
+            "14": 2027637760.0,
+            "15": 2027637760.0,
+            "16": 2027637760.0,
+            "17": 2027637760.0,
+            "18": 2027637760.0,
+            "19": 2027637760.0,
+            "20": 2027637760.0,
+            "21": 2027637760.0,
+            "22": 2027637760.0,
+            "23": 2027637760.0,
+            "24": 2027637760.0,
+            "25": 2027637760.0,
+            "26": 2027637760.0,
+            "27": 2027637760.0,
+            "28": 2027637760.0,
+            "29": 2027637760.0,
+            "30": 2027637760.0,
+            "31": 2029937664.0,
+            "32": 2029937664.0,
+            "33": 2029937664.0,
+            "34": 2029937664.0,
+            "35": 2029937664.0,
+            "36": 2029937664.0,
+            "37": 2029937664.0,
+            "38": 2029937664.0,
+            "39": 2029937664.0,
+            "40": 2029937664.0,
+            "41": 2029937664.0,
+            "42": 2029937664.0,
+            "43": 2029937664.0,
+            "44": 2029937664.0,
+            "45": 2029937664.0,
+            "46": 2029937664.0,
+            "47": 2029937664.0,
+            "48": 2029937664.0,
+            "49": 2029937664.0,
+            "50": 2029937664.0
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 10.56989,
-            "5": 0.34599,
-            "10": 0.34601,
-            "15": 0.34343,
-            "20": 0.34409,
-            "25": 0.34378,
-            "30": 0.34403,
-            "35": 0.34395,
-            "40": 0.34489,
-            "45": 0.34046,
-            "50": 0.34152
+            "1": 16.87326,
+            "2": 0.3522,
+            "3": 0.33665,
+            "4": 0.32376,
+            "5": 0.32134,
+            "6": 0.32089,
+            "7": 0.32,
+            "8": 0.32013,
+            "9": 0.32009,
+            "10": 0.32059,
+            "11": 0.31897,
+            "12": 0.31983,
+            "13": 0.32143,
+            "14": 0.32114,
+            "15": 0.32116,
+            "16": 0.32112,
+            "17": 0.32136,
+            "18": 0.32313,
+            "19": 0.32195,
+            "20": 0.32131,
+            "21": 0.32215,
+            "22": 0.32253,
+            "23": 0.32037,
+            "24": 0.32194,
+            "25": 0.32053,
+            "26": 0.72275,
+            "27": 0.32115,
+            "28": 0.32108,
+            "29": 0.32328,
+            "30": 0.32158,
+            "31": 0.32145,
+            "32": 0.32206,
+            "33": 0.32101,
+            "34": 0.32196,
+            "35": 0.32277,
+            "36": 0.32103,
+            "37": 0.32143,
+            "38": 0.32156,
+            "39": 0.32198,
+            "40": 0.32071,
+            "41": 0.32265,
+            "42": 0.32274,
+            "43": 0.32271,
+            "44": 0.32188,
+            "45": 0.32208,
+            "46": 0.32183,
+            "47": 0.32051,
+            "48": 0.3213,
+            "49": 0.32129,
+            "50": 0.31989
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..5c353a70683
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81386,
+            "2": 10.8312,
+            "3": 10.80976,
+            "4": 10.82429,
+            "5": 10.84553,
+            "6": 10.85179,
+            "7": 10.83654,
+            "8": 10.83644,
+            "9": 10.84704,
+            "10": 10.78251,
+            "11": 10.85041,
+            "12": 10.84824,
+            "13": 10.86042,
+            "14": 10.86568,
+            "15": 10.81163,
+            "16": 10.79013,
+            "17": 10.76156,
+            "18": 10.78107,
+            "19": 10.78131,
+            "20": 10.70284,
+            "21": 10.67616,
+            "22": 10.51867,
+            "23": 10.70469,
+            "24": 10.57188,
+            "25": 10.51628,
+            "26": 10.58103,
+            "27": 10.59482,
+            "28": 10.56661,
+            "29": 10.58588,
+            "30": 10.33634,
+            "31": 10.08426,
+            "32": 10.4506,
+            "33": 10.4457,
+            "34": 10.19772,
+            "35": 10.25794,
+            "36": 10.21991,
+            "37": 10.34564,
+            "38": 10.18704,
+            "39": 10.39388,
+            "40": 10.08233,
+            "41": 10.13235,
+            "42": 10.21151,
+            "43": 9.83045,
+            "44": 9.94704,
+            "45": 9.84037,
+            "46": 9.81454,
+            "47": 10.12979,
+            "48": 9.85142,
+            "49": 9.52861,
+            "50": 9.91131
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4720.0,
+            "2": 4914.0,
+            "3": 4776.0,
+            "4": 4671.0,
+            "5": 5518.0,
+            "6": 5824.0,
+            "7": 4980.0,
+            "8": 4796.0,
+            "9": 5167.0,
+            "10": 4355.0,
+            "11": 5673.0,
+            "12": 5426.0,
+            "13": 5583.0,
+            "14": 5744.0,
+            "15": 5108.0,
+            "16": 5421.0,
+            "17": 5053.0,
+            "18": 5299.0,
+            "19": 5132.0,
+            "20": 4876.0,
+            "21": 5310.0,
+            "22": 4667.0,
+            "23": 5540.0,
+            "24": 5085.0,
+            "25": 4723.0,
+            "26": 5278.0,
+            "27": 5336.0,
+            "28": 5707.0,
+            "29": 6154.0,
+            "30": 5376.0,
+            "31": 4689.0,
+            "32": 5934.0,
+            "33": 6223.0,
+            "34": 5379.0,
+            "35": 5828.0,
+            "36": 5708.0,
+            "37": 6494.0,
+            "38": 6186.0,
+            "39": 6680.0,
+            "40": 6110.0,
+            "41": 6110.0,
+            "42": 6339.0,
+            "43": 5869.0,
+            "44": 5905.0,
+            "45": 6036.0,
+            "46": 5862.0,
+            "47": 6757.0,
+            "48": 6445.0,
+            "49": 6445.0,
+            "50": 6776.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1144114688.0,
+            "2": 1144112128.0,
+            "3": 1144114688.0,
+            "4": 1144114176.0,
+            "5": 1144115200.0,
+            "6": 1144113664.0,
+            "7": 1144114688.0,
+            "8": 1144112128.0,
+            "9": 1144112640.0,
+            "10": 1144114688.0,
+            "11": 1144114176.0,
+            "12": 1144115200.0,
+            "13": 1144113664.0,
+            "14": 1144116224.0,
+            "15": 1144114688.0,
+            "16": 1144113664.0,
+            "17": 1144111616.0,
+            "18": 1144113664.0,
+            "19": 1144114176.0,
+            "20": 1144114688.0,
+            "21": 1144112128.0,
+            "22": 1144114176.0,
+            "23": 1144112640.0,
+            "24": 1144111616.0,
+            "25": 1144115712.0,
+            "26": 1144116736.0,
+            "27": 1144114688.0,
+            "28": 1144112128.0,
+            "29": 1144112640.0,
+            "30": 1144114176.0,
+            "31": 1144110080.0,
+            "32": 1144113152.0,
+            "33": 1144113664.0,
+            "34": 1144113664.0,
+            "35": 1144111104.0,
+            "36": 1144113664.0,
+            "37": 1144115200.0,
+            "38": 1144114176.0,
+            "39": 1144112128.0,
+            "40": 1144112128.0,
+            "41": 1144110080.0,
+            "42": 1144111616.0,
+            "43": 1144108544.0,
+            "44": 1144111616.0,
+            "45": 1144114688.0,
+            "46": 1144113664.0,
+            "47": 1144112128.0,
+            "48": 1144111616.0,
+            "49": 1144113664.0,
+            "50": 1144111616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1592113152.0,
+            "2": 2049260032.0,
+            "3": 2049305088.0,
+            "4": 2049981952.0,
+            "5": 2049981952.0,
+            "6": 2049981952.0,
+            "7": 2049981952.0,
+            "8": 2049981952.0,
+            "9": 2049981952.0,
+            "10": 2050618880.0,
+            "11": 2050618880.0,
+            "12": 2050618880.0,
+            "13": 2050618880.0,
+            "14": 2050618880.0,
+            "15": 2050618880.0,
+            "16": 2050618880.0,
+            "17": 2050618880.0,
+            "18": 2050618880.0,
+            "19": 2050618880.0,
+            "20": 2050618880.0,
+            "21": 2050618880.0,
+            "22": 2050618880.0,
+            "23": 2050618880.0,
+            "24": 2050618880.0,
+            "25": 2050618880.0,
+            "26": 2050618880.0,
+            "27": 2050618880.0,
+            "28": 2050618880.0,
+            "29": 2050618880.0,
+            "30": 2050618880.0,
+            "31": 2050618880.0,
+            "32": 2050618880.0,
+            "33": 2050618880.0,
+            "34": 2050618880.0,
+            "35": 2050618880.0,
+            "36": 2050618880.0,
+            "37": 2050618880.0,
+            "38": 2050618880.0,
+            "39": 2050618880.0,
+            "40": 2050618880.0,
+            "41": 2050618880.0,
+            "42": 2050618880.0,
+            "43": 2050618880.0,
+            "44": 2050618880.0,
+            "45": 2050618880.0,
+            "46": 2050618880.0,
+            "47": 2050618880.0,
+            "48": 2050618880.0,
+            "49": 2050618880.0,
+            "50": 2050618880.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 7.71822,
+            "3": 0.41543,
+            "4": 0.48385,
+            "5": 0.43238,
+            "6": 0.4005,
+            "7": 0.37702,
+            "8": 0.34877,
+            "9": 0.34747,
+            "10": 0.33405,
+            "11": 0.35433,
+            "12": 0.4949,
+            "13": 0.78498,
+            "14": 0.34468,
+            "15": 0.34552,
+            "16": 0.34055,
+            "17": 0.34051,
+            "18": 0.33811,
+            "19": 0.34319,
+            "20": 0.33811,
+            "21": 0.34085,
+            "22": 0.35398,
+            "23": 0.33545,
+            "24": 0.3393,
+            "25": 0.34161,
+            "26": 0.33744,
+            "27": 0.33573,
+            "28": 0.33954,
+            "29": 0.33344,
+            "30": 0.33741,
+            "31": 0.34691,
+            "32": 0.33581,
+            "33": 0.3395,
+            "34": 0.34333,
+            "35": 0.3424,
+            "36": 0.34673,
+            "37": 0.33697,
+            "38": 0.33705,
+            "39": 0.33394,
+            "40": 0.33964,
+            "41": 0.34276,
+            "42": 0.3401,
+            "43": 0.34688,
+            "44": 0.3413,
+            "45": 0.33867,
+            "46": 0.34719,
+            "47": 0.34606,
+            "48": 0.35149,
+            "49": 0.34219,
+            "50": 0.33349
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json
index e4e01388a15..6a4f3459a2c 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.7999,
-            "2": 10.80046,
-            "3": 10.8089,
-            "4": 10.78245,
-            "5": 10.82504,
-            "6": 10.83657,
-            "7": 10.81628,
-            "8": 10.81184,
-            "9": 10.8108,
-            "10": 10.7742,
-            "11": 10.85482,
-            "12": 10.82663,
-            "13": 10.85131,
-            "14": 10.85461,
-            "15": 10.78253,
-            "16": 10.77375,
-            "17": 10.74989,
-            "18": 10.78346,
-            "19": 10.75877,
-            "20": 10.69982,
-            "21": 10.67287,
-            "22": 10.5142,
-            "23": 10.68053,
-            "24": 10.57164,
-            "25": 10.51814,
+            "1": 10.80012,
+            "2": 10.8005,
+            "3": 10.8088,
+            "4": 10.78235,
+            "5": 10.82515,
+            "6": 10.83624,
+            "7": 10.81603,
+            "8": 10.81186,
+            "9": 10.8109,
+            "10": 10.77384,
+            "11": 10.85522,
+            "12": 10.82691,
+            "13": 10.85113,
+            "14": 10.85524,
+            "15": 10.78245,
+            "16": 10.77327,
+            "17": 10.75069,
+            "18": 10.78345,
+            "19": 10.75897,
+            "20": 10.69992,
+            "21": 10.67228,
+            "22": 10.51407,
+            "23": 10.68079,
+            "24": 10.57159,
+            "25": 10.51796,
             "26": 10.57591,
-            "27": 10.59136,
-            "28": 10.55398,
-            "29": 10.57104,
-            "30": 10.36425,
-            "31": 10.10945,
-            "32": 10.45329,
-            "33": 10.43693,
-            "34": 10.20011,
-            "35": 10.25443,
-            "36": 10.23318,
-            "37": 10.3536,
-            "38": 10.20421,
-            "39": 10.3993,
-            "40": 10.10241,
-            "41": 10.12765,
-            "42": 10.21115,
-            "43": 9.83746,
-            "44": 9.96186,
-            "45": 9.84266,
-            "46": 9.80686,
-            "47": 10.14266,
-            "48": 9.86672,
-            "49": 9.53822,
-            "50": 9.92595
+            "27": 10.59187,
+            "28": 10.55352,
+            "29": 10.57123,
+            "30": 10.36507,
+            "31": 10.10867,
+            "32": 10.45411,
+            "33": 10.437,
+            "34": 10.20016,
+            "35": 10.25454,
+            "36": 10.23316,
+            "37": 10.35376,
+            "38": 10.20479,
+            "39": 10.39932,
+            "40": 10.10206,
+            "41": 10.12772,
+            "42": 10.2109,
+            "43": 9.83726,
+            "44": 9.96178,
+            "45": 9.84258,
+            "46": 9.80634,
+            "47": 10.14233,
+            "48": 9.86646,
+            "49": 9.53815,
+            "50": 9.92572
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4752.0,
-            "2": 5040.0,
-            "3": 5112.0,
-            "4": 5072.0,
-            "5": 5472.0,
-            "6": 5619.0,
-            "7": 5255.0,
-            "8": 5065.0,
-            "9": 5483.0,
-            "10": 4607.0,
-            "11": 5862.0,
-            "12": 5377.0,
-            "13": 5783.0,
-            "14": 5830.0,
-            "15": 5249.0,
-            "16": 5346.0,
-            "17": 5291.0,
-            "18": 5277.0,
-            "19": 5352.0,
-            "20": 4942.0,
-            "21": 5465.0,
-            "22": 4878.0,
-            "23": 5807.0,
-            "24": 5145.0,
-            "25": 4873.0,
-            "26": 5380.0,
-            "27": 5479.0,
-            "28": 5739.0,
-            "29": 5950.0,
-            "30": 5363.0,
-            "31": 4730.0,
-            "32": 5732.0,
-            "33": 5963.0,
-            "34": 5261.0,
-            "35": 5660.0,
-            "36": 5422.0,
-            "37": 6362.0,
-            "38": 6114.0,
-            "39": 6803.0,
-            "40": 5731.0,
-            "41": 5808.0,
-            "42": 6485.0,
-            "43": 5742.0,
-            "44": 5843.0,
-            "45": 5876.0,
-            "46": 6024.0,
-            "47": 6554.0,
-            "48": 6354.0,
-            "49": 6497.0,
-            "50": 6526.0
+            "1": 4754.0,
+            "2": 5059.0,
+            "3": 5119.0,
+            "4": 5063.0,
+            "5": 5547.0,
+            "6": 5513.0,
+            "7": 5119.0,
+            "8": 5021.0,
+            "9": 5280.0,
+            "10": 4401.0,
+            "11": 5996.0,
+            "12": 5401.0,
+            "13": 5775.0,
+            "14": 5673.0,
+            "15": 5182.0,
+            "16": 5401.0,
+            "17": 5223.0,
+            "18": 5195.0,
+            "19": 5312.0,
+            "20": 4783.0,
+            "21": 5332.0,
+            "22": 4858.0,
+            "23": 5752.0,
+            "24": 5114.0,
+            "25": 4946.0,
+            "26": 5370.0,
+            "27": 5291.0,
+            "28": 5771.0,
+            "29": 5900.0,
+            "30": 5276.0,
+            "31": 4814.0,
+            "32": 5760.0,
+            "33": 6010.0,
+            "34": 5199.0,
+            "35": 5583.0,
+            "36": 5494.0,
+            "37": 6408.0,
+            "38": 5931.0,
+            "39": 6618.0,
+            "40": 5910.0,
+            "41": 5851.0,
+            "42": 6294.0,
+            "43": 5754.0,
+            "44": 5656.0,
+            "45": 5874.0,
+            "46": 5925.0,
+            "47": 6568.0,
+            "48": 6429.0,
+            "49": 6436.0,
+            "50": 6468.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1144115200.0,
-            "2": 1144113152.0,
-            "3": 1144113664.0,
-            "4": 1144112640.0,
-            "5": 1144113664.0,
-            "6": 1144113152.0,
-            "7": 1144115200.0,
-            "8": 1144112640.0,
-            "9": 1144113152.0,
-            "10": 1144118272.0,
-            "11": 1144112640.0,
-            "12": 1144112128.0,
-            "13": 1144110592.0,
-            "14": 1144112640.0,
-            "15": 1144111616.0,
-            "16": 1144112640.0,
-            "17": 1144112128.0,
-            "18": 1144113152.0,
-            "19": 1144112640.0,
-            "20": 1144113664.0,
-            "21": 1144113152.0,
-            "22": 1144114176.0,
-            "23": 1144113664.0,
-            "24": 1144111616.0,
-            "25": 1144110592.0,
-            "26": 1144113664.0,
-            "27": 1144113664.0,
-            "28": 1144112128.0,
-            "29": 1144110080.0,
-            "30": 1144113152.0,
-            "31": 1144116224.0,
-            "32": 1144112128.0,
-            "33": 1144113152.0,
-            "34": 1144113664.0,
-            "35": 1144115712.0,
-            "36": 1144111616.0,
-            "37": 1144111104.0,
-            "38": 1144110592.0,
-            "39": 1144113664.0,
-            "40": 1144113664.0,
-            "41": 1144114176.0,
-            "42": 1144109056.0,
-            "43": 1144114176.0,
-            "44": 1144115200.0,
-            "45": 1144113152.0,
-            "46": 1144117760.0,
-            "47": 1144113152.0,
-            "48": 1144115712.0,
-            "49": 1144117760.0,
-            "50": 1144114176.0
+            "1": 1145163776.0,
+            "2": 1145163776.0,
+            "3": 1145163264.0,
+            "4": 1145162240.0,
+            "5": 1145163776.0,
+            "6": 1146211328.0,
+            "7": 1146213376.0,
+            "8": 1145162240.0,
+            "9": 1145162752.0,
+            "10": 1145167360.0,
+            "11": 1145162240.0,
+            "12": 1145162240.0,
+            "13": 1145161216.0,
+            "14": 1146210816.0,
+            "15": 1145160192.0,
+            "16": 1145162752.0,
+            "17": 1145161728.0,
+            "18": 1145162752.0,
+            "19": 1146210816.0,
+            "20": 1145163264.0,
+            "21": 1146211328.0,
+            "22": 1145163776.0,
+            "23": 1146212352.0,
+            "24": 1145161216.0,
+            "25": 1145160704.0,
+            "26": 1145164288.0,
+            "27": 1145163264.0,
+            "28": 1145161728.0,
+            "29": 1145159680.0,
+            "30": 1145162752.0,
+            "31": 1145165824.0,
+            "32": 1145163264.0,
+            "33": 1145162752.0,
+            "34": 1145163264.0,
+            "35": 1145165312.0,
+            "36": 1145161728.0,
+            "37": 1145160704.0,
+            "38": 1145160192.0,
+            "39": 1145162752.0,
+            "40": 1145163264.0,
+            "41": 1145163264.0,
+            "42": 1145159680.0,
+            "43": 1145164288.0,
+            "44": 1146213888.0,
+            "45": 1146211328.0,
+            "46": 1146215936.0,
+            "47": 1145162752.0,
+            "48": 1145165824.0,
+            "49": 1146216448.0,
+            "50": 1146212864.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1593775104.0,
-            "2": 2049587200.0,
-            "3": 2050487808.0,
-            "4": 2050487808.0,
-            "5": 2050487808.0,
-            "6": 2051877376.0,
-            "7": 2052037632.0,
-            "8": 2052037632.0,
-            "9": 2053219840.0,
-            "10": 2055123968.0,
-            "11": 2055123968.0,
-            "12": 2055123968.0,
-            "13": 2055123968.0,
-            "14": 2055123968.0,
-            "15": 2055123968.0,
-            "16": 2055123968.0,
-            "17": 2055123968.0,
-            "18": 2055123968.0,
-            "19": 2055123968.0,
-            "20": 2055123968.0,
-            "21": 2055123968.0,
-            "22": 2055123968.0,
-            "23": 2055123968.0,
-            "24": 2055123968.0,
-            "25": 2055123968.0,
-            "26": 2055123968.0,
-            "27": 2055123968.0,
-            "28": 2055123968.0,
-            "29": 2055123968.0,
-            "30": 2055123968.0,
-            "31": 2055123968.0,
-            "32": 2055123968.0,
-            "33": 2055123968.0,
-            "34": 2055123968.0,
-            "35": 2055123968.0,
-            "36": 2055123968.0,
-            "37": 2055123968.0,
-            "38": 2055123968.0,
-            "39": 2055123968.0,
-            "40": 2055123968.0,
-            "41": 2055123968.0,
-            "42": 2055123968.0,
-            "43": 2055123968.0,
-            "44": 2055123968.0,
-            "45": 2055123968.0,
-            "46": 2055123968.0,
-            "47": 2055123968.0,
-            "48": 2055123968.0,
-            "49": 2055123968.0,
-            "50": 2055123968.0
+            "1": 1593583104.0,
+            "2": 2051629056.0,
+            "3": 2053139456.0,
+            "4": 2053139456.0,
+            "5": 2053139456.0,
+            "6": 2053992960.0,
+            "7": 2055479296.0,
+            "8": 2055479296.0,
+            "9": 2056268288.0,
+            "10": 2059108864.0,
+            "11": 2059108864.0,
+            "12": 2059108864.0,
+            "13": 2059108864.0,
+            "14": 2059108864.0,
+            "15": 2059108864.0,
+            "16": 2059108864.0,
+            "17": 2059108864.0,
+            "18": 2059108864.0,
+            "19": 2059108864.0,
+            "20": 2059108864.0,
+            "21": 2059108864.0,
+            "22": 2059108864.0,
+            "23": 2059108864.0,
+            "24": 2059108864.0,
+            "25": 2059108864.0,
+            "26": 2059108864.0,
+            "27": 2059108864.0,
+            "28": 2059108864.0,
+            "29": 2059108864.0,
+            "30": 2059108864.0,
+            "31": 2059108864.0,
+            "32": 2059108864.0,
+            "33": 2059108864.0,
+            "34": 2059108864.0,
+            "35": 2059108864.0,
+            "36": 2059108864.0,
+            "37": 2059108864.0,
+            "38": 2059108864.0,
+            "39": 2059108864.0,
+            "40": 2059108864.0,
+            "41": 2059108864.0,
+            "42": 2059108864.0,
+            "43": 2059108864.0,
+            "44": 2059108864.0,
+            "45": 2059108864.0,
+            "46": 2059108864.0,
+            "47": 2059108864.0,
+            "48": 2059108864.0,
+            "49": 2059108864.0,
+            "50": 2059108864.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.54696,
-            "2": 0.35381,
-            "3": 0.30805,
-            "4": 0.32999,
-            "5": 0.28074,
-            "6": 0.27713,
-            "7": 0.30692,
-            "8": 0.27076,
-            "9": 0.28178,
-            "10": 0.28798,
-            "11": 0.26657,
-            "12": 0.27288,
-            "13": 0.27118,
-            "14": 0.26505,
-            "15": 0.27307,
-            "16": 0.26745,
-            "17": 0.28092,
-            "18": 0.25951,
-            "19": 0.26123,
-            "20": 0.27117,
-            "21": 0.26705,
-            "22": 0.27657,
-            "23": 0.2785,
-            "24": 0.27138,
-            "25": 0.27542,
-            "26": 0.26549,
-            "27": 0.26436,
-            "28": 0.2817,
-            "29": 0.26002,
-            "30": 0.26437,
-            "31": 0.29073,
-            "32": 0.27239,
-            "33": 0.26215,
-            "34": 0.2748,
-            "35": 0.2623,
-            "36": 0.25929,
-            "37": 0.26086,
-            "38": 0.26996,
-            "39": 0.25721,
-            "40": 0.25938,
-            "41": 0.26959,
-            "42": 0.25657,
-            "43": 0.26426,
-            "44": 0.25689,
-            "45": 0.26206,
-            "46": 0.27753,
-            "47": 0.27998,
-            "48": 0.26838,
-            "49": 0.27354,
-            "50": 0.26097
+            "1": 34.53022,
+            "2": 0.38382,
+            "3": 0.30651,
+            "4": 0.31954,
+            "5": 0.26567,
+            "6": 0.25765,
+            "7": 0.2929,
+            "8": 0.25619,
+            "9": 0.258,
+            "10": 0.25636,
+            "11": 0.25532,
+            "12": 0.24287,
+            "13": 0.2492,
+            "14": 0.24147,
+            "15": 0.26466,
+            "16": 0.24525,
+            "17": 0.24874,
+            "18": 0.23153,
+            "19": 0.23145,
+            "20": 0.23938,
+            "21": 0.23145,
+            "22": 0.67309,
+            "23": 0.24419,
+            "24": 0.23267,
+            "25": 0.24476,
+            "26": 0.23424,
+            "27": 0.23306,
+            "28": 0.24797,
+            "29": 0.22898,
+            "30": 0.23089,
+            "31": 0.26141,
+            "32": 0.24406,
+            "33": 0.22981,
+            "34": 0.24305,
+            "35": 0.22955,
+            "36": 0.23411,
+            "37": 0.22923,
+            "38": 0.23544,
+            "39": 0.23275,
+            "40": 0.23602,
+            "41": 0.238,
+            "42": 0.23132,
+            "43": 0.23557,
+            "44": 0.22984,
+            "45": 0.22919,
+            "46": 0.27449,
+            "47": 0.24511,
+            "48": 0.25065,
+            "49": 0.24993,
+            "50": 0.24332
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json
index 7ca7a077425..4bf1314508c 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8277, "5": 10.85694, "10": 10.79218, "15": 10.82554, "20": 10.72266, "25": 10.54408, "30": 10.35702, "35": 10.27159, "40": 10.09693, "45": 9.84114, "50": 9.92408}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4651.0, "5": 5442.0, "10": 4292.0, "15": 5228.0, "20": 4806.0, "25": 4844.0, "30": 5408.0, "35": 5653.0, "40": 5925.0, "45": 5632.0, "50": 6701.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1114236928.0, "5": 1114232832.0, "10": 1114234368.0, "15": 1114235904.0, "20": 1114234368.0, "25": 1114232832.0, "30": 1114233344.0, "35": 1114236928.0, "40": 1114235392.0, "45": 1114234880.0, "50": 1114236416.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1562494464.0, "5": 2020286464.0, "10": 2021971968.0, "15": 2023709184.0, "20": 2023709184.0, "25": 2023709184.0, "30": 2023709184.0, "35": 2028052992.0, "40": 2028052992.0, "45": 2028052992.0, "50": 2028052992.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.18372, "5": 0.31643, "10": 0.31694, "15": 0.31783, "20": 0.31908, "25": 0.31135, "30": 0.31816, "35": 0.31147, "40": 0.31529, "45": 0.31149, "50": 0.31277}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82721,
+            "2": 10.84035,
+            "3": 10.82745,
+            "4": 10.81915,
+            "5": 10.85627,
+            "6": 10.86983,
+            "7": 10.85163,
+            "8": 10.84508,
+            "9": 10.85219,
+            "10": 10.7925,
+            "11": 10.86564,
+            "12": 10.87089,
+            "13": 10.87065,
+            "14": 10.87856,
+            "15": 10.82558,
+            "16": 10.81245,
+            "17": 10.77494,
+            "18": 10.81119,
+            "19": 10.79646,
+            "20": 10.72204,
+            "21": 10.69748,
+            "22": 10.55149,
+            "23": 10.70513,
+            "24": 10.59002,
+            "25": 10.54424,
+            "26": 10.60053,
+            "27": 10.61985,
+            "28": 10.57416,
+            "29": 10.58647,
+            "30": 10.35756,
+            "31": 10.12146,
+            "32": 10.47023,
+            "33": 10.45687,
+            "34": 10.21575,
+            "35": 10.27137,
+            "36": 10.23554,
+            "37": 10.35262,
+            "38": 10.20577,
+            "39": 10.40106,
+            "40": 10.09677,
+            "41": 10.13884,
+            "42": 10.21795,
+            "43": 9.84364,
+            "44": 9.96195,
+            "45": 9.84129,
+            "46": 9.81913,
+            "47": 10.13875,
+            "48": 9.85153,
+            "49": 9.53512,
+            "50": 9.92452
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4775.0,
+            "2": 4861.0,
+            "3": 4764.0,
+            "4": 5023.0,
+            "5": 5439.0,
+            "6": 5522.0,
+            "7": 5124.0,
+            "8": 4666.0,
+            "9": 5272.0,
+            "10": 4186.0,
+            "11": 5466.0,
+            "12": 5281.0,
+            "13": 5569.0,
+            "14": 5501.0,
+            "15": 5233.0,
+            "16": 5322.0,
+            "17": 5097.0,
+            "18": 5014.0,
+            "19": 5234.0,
+            "20": 4733.0,
+            "21": 5325.0,
+            "22": 4809.0,
+            "23": 5533.0,
+            "24": 5061.0,
+            "25": 4818.0,
+            "26": 5216.0,
+            "27": 5208.0,
+            "28": 5826.0,
+            "29": 5732.0,
+            "30": 5492.0,
+            "31": 4787.0,
+            "32": 5647.0,
+            "33": 6102.0,
+            "34": 5313.0,
+            "35": 5706.0,
+            "36": 5649.0,
+            "37": 6405.0,
+            "38": 6181.0,
+            "39": 6630.0,
+            "40": 5800.0,
+            "41": 5960.0,
+            "42": 6310.0,
+            "43": 5877.0,
+            "44": 5751.0,
+            "45": 5902.0,
+            "46": 5952.0,
+            "47": 6536.0,
+            "48": 6332.0,
+            "49": 6179.0,
+            "50": 6632.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1115784704.0,
+            "2": 1115784192.0,
+            "3": 1115781120.0,
+            "4": 1115783680.0,
+            "5": 1115780608.0,
+            "6": 1115781120.0,
+            "7": 1115783168.0,
+            "8": 1115781120.0,
+            "9": 1115783680.0,
+            "10": 1115782656.0,
+            "11": 1115782656.0,
+            "12": 1115780608.0,
+            "13": 1115785728.0,
+            "14": 1115780608.0,
+            "15": 1115783680.0,
+            "16": 1115783680.0,
+            "17": 1115781120.0,
+            "18": 1115783680.0,
+            "19": 1115780096.0,
+            "20": 1115782144.0,
+            "21": 1115780096.0,
+            "22": 1115781632.0,
+            "23": 1115782656.0,
+            "24": 1115784192.0,
+            "25": 1115781632.0,
+            "26": 1115784192.0,
+            "27": 1115782144.0,
+            "28": 1115783680.0,
+            "29": 1115784192.0,
+            "30": 1115780608.0,
+            "31": 1115787264.0,
+            "32": 1115783168.0,
+            "33": 1115781632.0,
+            "34": 1115782144.0,
+            "35": 1115784704.0,
+            "36": 1115780096.0,
+            "37": 1115781632.0,
+            "38": 1115782656.0,
+            "39": 1115781120.0,
+            "40": 1115783168.0,
+            "41": 1115783680.0,
+            "42": 1115783680.0,
+            "43": 1115785216.0,
+            "44": 1115784192.0,
+            "45": 1115782144.0,
+            "46": 1115784192.0,
+            "47": 1115784192.0,
+            "48": 1115780608.0,
+            "49": 1115779072.0,
+            "50": 1115784704.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1563058688.0,
+            "2": 2022025216.0,
+            "3": 2022025216.0,
+            "4": 2022816256.0,
+            "5": 2022816256.0,
+            "6": 2022816256.0,
+            "7": 2022816256.0,
+            "8": 2022816256.0,
+            "9": 2022816256.0,
+            "10": 2025321984.0,
+            "11": 2025321984.0,
+            "12": 2025321984.0,
+            "13": 2028008960.0,
+            "14": 2028008960.0,
+            "15": 2028008960.0,
+            "16": 2028008960.0,
+            "17": 2028008960.0,
+            "18": 2028008960.0,
+            "19": 2028008960.0,
+            "20": 2028008960.0,
+            "21": 2028008960.0,
+            "22": 2028008960.0,
+            "23": 2028008960.0,
+            "24": 2028008960.0,
+            "25": 2028008960.0,
+            "26": 2028008960.0,
+            "27": 2028008960.0,
+            "28": 2028008960.0,
+            "29": 2028008960.0,
+            "30": 2028008960.0,
+            "31": 2030280704.0,
+            "32": 2030280704.0,
+            "33": 2030280704.0,
+            "34": 2030280704.0,
+            "35": 2030280704.0,
+            "36": 2030280704.0,
+            "37": 2030280704.0,
+            "38": 2030280704.0,
+            "39": 2030280704.0,
+            "40": 2030280704.0,
+            "41": 2030280704.0,
+            "42": 2030280704.0,
+            "43": 2030280704.0,
+            "44": 2030280704.0,
+            "45": 2030280704.0,
+            "46": 2030280704.0,
+            "47": 2030280704.0,
+            "48": 2030280704.0,
+            "49": 2030280704.0,
+            "50": 2030280704.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 23.51368,
+            "2": 0.372,
+            "3": 0.34151,
+            "4": 0.32901,
+            "5": 0.32625,
+            "6": 0.32542,
+            "7": 0.32567,
+            "8": 0.32532,
+            "9": 0.3246,
+            "10": 0.33277,
+            "11": 0.3347,
+            "12": 0.33248,
+            "13": 0.33305,
+            "14": 0.33419,
+            "15": 0.33226,
+            "16": 0.3359,
+            "17": 0.33203,
+            "18": 0.331,
+            "19": 0.3345,
+            "20": 0.3364,
+            "21": 0.334,
+            "22": 0.33335,
+            "23": 0.33273,
+            "24": 0.33251,
+            "25": 0.33104,
+            "26": 0.3322,
+            "27": 0.33082,
+            "28": 0.33107,
+            "29": 0.33275,
+            "30": 0.33104,
+            "31": 0.33073,
+            "32": 0.33192,
+            "33": 0.32966,
+            "34": 0.3315,
+            "35": 0.33271,
+            "36": 0.33633,
+            "37": 0.33246,
+            "38": 0.80821,
+            "39": 0.33259,
+            "40": 0.33171,
+            "41": 0.33156,
+            "42": 0.33428,
+            "43": 0.33263,
+            "44": 0.81732,
+            "45": 0.33782,
+            "46": 0.33165,
+            "47": 0.71569,
+            "48": 0.33327,
+            "49": 0.33588,
+            "50": 0.33196
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json
index 0999afd59a3..f6b0539891f 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82762, "5": 10.85681, "10": 10.79217, "15": 10.82534, "20": 10.72228, "25": 10.54483, "30": 10.35746, "35": 10.27126, "40": 10.09704, "45": 9.84116, "50": 9.92438}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4699.0, "5": 5376.0, "10": 4507.0, "15": 5311.0, "20": 4809.0, "25": 4797.0, "30": 5353.0, "35": 5678.0, "40": 5904.0, "45": 5760.0, "50": 6526.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1114231296.0, "5": 1114228224.0, "10": 1114228224.0, "15": 1114230272.0, "20": 1114228224.0, "25": 1114228224.0, "30": 1114227200.0, "35": 1114231296.0, "40": 1114229760.0, "45": 1114228736.0, "50": 1114230784.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1562553856.0, "5": 2021133824.0, "10": 2022334976.0, "15": 2024271872.0, "20": 2024271872.0, "25": 2024820736.0, "30": 2024820736.0, "35": 2027709440.0, "40": 2027709440.0, "45": 2027709440.0, "50": 2027709440.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.72505, "5": 0.3325, "10": 0.33257, "15": 0.33093, "20": 0.33304, "25": 0.33508, "30": 0.37083, "35": 0.33207, "40": 0.3328, "45": 0.33149, "50": 0.3319}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82753,
+            "2": 10.84043,
+            "3": 10.82696,
+            "4": 10.81906,
+            "5": 10.8571,
+            "6": 10.86999,
+            "7": 10.85125,
+            "8": 10.84501,
+            "9": 10.85265,
+            "10": 10.79211,
+            "11": 10.86575,
+            "12": 10.87117,
+            "13": 10.87051,
+            "14": 10.87901,
+            "15": 10.82536,
+            "16": 10.8123,
+            "17": 10.77452,
+            "18": 10.81079,
+            "19": 10.79696,
+            "20": 10.72249,
+            "21": 10.6974,
+            "22": 10.55098,
+            "23": 10.70558,
+            "24": 10.58965,
+            "25": 10.54401,
+            "26": 10.60019,
+            "27": 10.62042,
+            "28": 10.57421,
+            "29": 10.58618,
+            "30": 10.35747,
+            "31": 10.12177,
+            "32": 10.47023,
+            "33": 10.45691,
+            "34": 10.21589,
+            "35": 10.27151,
+            "36": 10.23536,
+            "37": 10.35281,
+            "38": 10.20581,
+            "39": 10.40112,
+            "40": 10.09709,
+            "41": 10.13842,
+            "42": 10.21786,
+            "43": 9.84412,
+            "44": 9.96175,
+            "45": 9.84106,
+            "46": 9.81952,
+            "47": 10.13903,
+            "48": 9.85138,
+            "49": 9.5357,
+            "50": 9.92441
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4626.0,
+            "2": 4979.0,
+            "3": 4857.0,
+            "4": 4949.0,
+            "5": 5274.0,
+            "6": 5510.0,
+            "7": 5208.0,
+            "8": 4789.0,
+            "9": 5178.0,
+            "10": 4415.0,
+            "11": 5661.0,
+            "12": 5262.0,
+            "13": 5488.0,
+            "14": 5557.0,
+            "15": 5334.0,
+            "16": 5308.0,
+            "17": 5223.0,
+            "18": 5053.0,
+            "19": 5313.0,
+            "20": 4900.0,
+            "21": 5337.0,
+            "22": 4891.0,
+            "23": 5775.0,
+            "24": 5079.0,
+            "25": 4783.0,
+            "26": 5161.0,
+            "27": 5253.0,
+            "28": 5789.0,
+            "29": 5972.0,
+            "30": 5409.0,
+            "31": 4717.0,
+            "32": 5767.0,
+            "33": 6154.0,
+            "34": 5213.0,
+            "35": 5592.0,
+            "36": 5634.0,
+            "37": 6316.0,
+            "38": 6079.0,
+            "39": 6447.0,
+            "40": 6079.0,
+            "41": 5878.0,
+            "42": 6332.0,
+            "43": 5835.0,
+            "44": 5753.0,
+            "45": 5722.0,
+            "46": 6031.0,
+            "47": 6598.0,
+            "48": 6402.0,
+            "49": 6249.0,
+            "50": 6676.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1116843520.0,
+            "2": 1116841984.0,
+            "3": 1116839936.0,
+            "4": 1116843008.0,
+            "5": 1116839424.0,
+            "6": 1116838912.0,
+            "7": 1116840448.0,
+            "8": 1116839936.0,
+            "9": 1116842496.0,
+            "10": 1116841472.0,
+            "11": 1116840448.0,
+            "12": 1116840960.0,
+            "13": 1116845056.0,
+            "14": 1116839424.0,
+            "15": 1116842496.0,
+            "16": 1116841472.0,
+            "17": 1116839936.0,
+            "18": 1116841984.0,
+            "19": 1116838912.0,
+            "20": 1116841472.0,
+            "21": 1116839936.0,
+            "22": 1116840448.0,
+            "23": 1116840448.0,
+            "24": 1116844544.0,
+            "25": 1116840448.0,
+            "26": 1116843008.0,
+            "27": 1116840960.0,
+            "28": 1116841984.0,
+            "29": 1116843008.0,
+            "30": 1116839424.0,
+            "31": 1116846080.0,
+            "32": 1116842496.0,
+            "33": 1116840448.0,
+            "34": 1116840448.0,
+            "35": 1116843520.0,
+            "36": 1116838912.0,
+            "37": 1116840448.0,
+            "38": 1116841472.0,
+            "39": 1116839936.0,
+            "40": 1116841984.0,
+            "41": 1116843520.0,
+            "42": 1116843520.0,
+            "43": 1116844032.0,
+            "44": 1116843008.0,
+            "45": 1116840960.0,
+            "46": 1116842496.0,
+            "47": 1116841984.0,
+            "48": 1116839936.0,
+            "49": 1116837376.0,
+            "50": 1116844032.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1563050496.0,
+            "2": 2021967872.0,
+            "3": 2021967872.0,
+            "4": 2022971392.0,
+            "5": 2022971392.0,
+            "6": 2022971392.0,
+            "7": 2022971392.0,
+            "8": 2022971392.0,
+            "9": 2022971392.0,
+            "10": 2024804864.0,
+            "11": 2024804864.0,
+            "12": 2024804864.0,
+            "13": 2027590656.0,
+            "14": 2027590656.0,
+            "15": 2027590656.0,
+            "16": 2027590656.0,
+            "17": 2027590656.0,
+            "18": 2027590656.0,
+            "19": 2027590656.0,
+            "20": 2027590656.0,
+            "21": 2027590656.0,
+            "22": 2027590656.0,
+            "23": 2027590656.0,
+            "24": 2027590656.0,
+            "25": 2027590656.0,
+            "26": 2027590656.0,
+            "27": 2027590656.0,
+            "28": 2027590656.0,
+            "29": 2027590656.0,
+            "30": 2027590656.0,
+            "31": 2030131200.0,
+            "32": 2030131200.0,
+            "33": 2030131200.0,
+            "34": 2030131200.0,
+            "35": 2030131200.0,
+            "36": 2030131200.0,
+            "37": 2030131200.0,
+            "38": 2030131200.0,
+            "39": 2030131200.0,
+            "40": 2030131200.0,
+            "41": 2030131200.0,
+            "42": 2030131200.0,
+            "43": 2030131200.0,
+            "44": 2030131200.0,
+            "45": 2030131200.0,
+            "46": 2030131200.0,
+            "47": 2030131200.0,
+            "48": 2030131200.0,
+            "49": 2030131200.0,
+            "50": 2030131200.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 21.05476,
+            "2": 0.37335,
+            "3": 0.34228,
+            "4": 0.32445,
+            "5": 0.32484,
+            "6": 0.3249,
+            "7": 0.32488,
+            "8": 0.32585,
+            "9": 0.32395,
+            "10": 0.32465,
+            "11": 0.32197,
+            "12": 0.32169,
+            "13": 0.32213,
+            "14": 0.32236,
+            "15": 0.32344,
+            "16": 0.32418,
+            "17": 0.32357,
+            "18": 0.32327,
+            "19": 0.72477,
+            "20": 0.32351,
+            "21": 0.32286,
+            "22": 0.32395,
+            "23": 0.3238,
+            "24": 0.32345,
+            "25": 0.32441,
+            "26": 0.32375,
+            "27": 0.32444,
+            "28": 0.32394,
+            "29": 0.32438,
+            "30": 0.32386,
+            "31": 0.32381,
+            "32": 0.32332,
+            "33": 0.32386,
+            "34": 0.32457,
+            "35": 0.32337,
+            "36": 0.32334,
+            "37": 0.3239,
+            "38": 0.32451,
+            "39": 0.324,
+            "40": 0.32494,
+            "41": 0.324,
+            "42": 0.32347,
+            "43": 0.32398,
+            "44": 0.32338,
+            "45": 0.32336,
+            "46": 0.32329,
+            "47": 0.32358,
+            "48": 0.32344,
+            "49": 0.32289,
+            "50": 0.3206
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..51442760296
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81386,
+            "2": 10.8312,
+            "3": 10.80993,
+            "4": 10.82435,
+            "5": 10.84542,
+            "6": 10.85174,
+            "7": 10.8364,
+            "8": 10.83623,
+            "9": 10.84765,
+            "10": 10.7827,
+            "11": 10.85034,
+            "12": 10.84795,
+            "13": 10.86022,
+            "14": 10.86553,
+            "15": 10.81126,
+            "16": 10.78981,
+            "17": 10.76199,
+            "18": 10.78188,
+            "19": 10.78067,
+            "20": 10.7031,
+            "21": 10.67657,
+            "22": 10.51865,
+            "23": 10.70519,
+            "24": 10.57167,
+            "25": 10.51611,
+            "26": 10.58127,
+            "27": 10.59422,
+            "28": 10.56658,
+            "29": 10.58518,
+            "30": 10.33581,
+            "31": 10.08412,
+            "32": 10.45077,
+            "33": 10.4461,
+            "34": 10.19766,
+            "35": 10.2585,
+            "36": 10.21965,
+            "37": 10.34543,
+            "38": 10.18739,
+            "39": 10.39385,
+            "40": 10.0823,
+            "41": 10.13221,
+            "42": 10.21174,
+            "43": 9.83034,
+            "44": 9.9469,
+            "45": 9.84028,
+            "46": 9.81421,
+            "47": 10.12976,
+            "48": 9.85137,
+            "49": 9.52825,
+            "50": 9.91126
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4660.0,
+            "2": 4808.0,
+            "3": 4883.0,
+            "4": 4655.0,
+            "5": 5526.0,
+            "6": 5649.0,
+            "7": 4873.0,
+            "8": 4733.0,
+            "9": 5165.0,
+            "10": 4367.0,
+            "11": 5583.0,
+            "12": 5440.0,
+            "13": 5659.0,
+            "14": 5572.0,
+            "15": 5101.0,
+            "16": 5484.0,
+            "17": 5115.0,
+            "18": 5200.0,
+            "19": 5406.0,
+            "20": 4960.0,
+            "21": 5420.0,
+            "22": 4791.0,
+            "23": 5566.0,
+            "24": 5019.0,
+            "25": 4679.0,
+            "26": 5246.0,
+            "27": 5433.0,
+            "28": 5907.0,
+            "29": 6065.0,
+            "30": 5409.0,
+            "31": 4827.0,
+            "32": 5809.0,
+            "33": 6243.0,
+            "34": 5520.0,
+            "35": 5592.0,
+            "36": 5754.0,
+            "37": 6732.0,
+            "38": 6330.0,
+            "39": 6779.0,
+            "40": 6198.0,
+            "41": 6001.0,
+            "42": 6274.0,
+            "43": 5876.0,
+            "44": 6046.0,
+            "45": 6084.0,
+            "46": 5925.0,
+            "47": 6772.0,
+            "48": 6415.0,
+            "49": 6494.0,
+            "50": 6648.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1144114688.0,
+            "2": 1144112128.0,
+            "3": 1144114688.0,
+            "4": 1144115200.0,
+            "5": 1144115200.0,
+            "6": 1144113664.0,
+            "7": 1144114688.0,
+            "8": 1144113152.0,
+            "9": 1144112640.0,
+            "10": 1144114688.0,
+            "11": 1144114176.0,
+            "12": 1144115200.0,
+            "13": 1144114176.0,
+            "14": 1144115712.0,
+            "15": 1144115712.0,
+            "16": 1144113664.0,
+            "17": 1144111616.0,
+            "18": 1144113664.0,
+            "19": 1144114688.0,
+            "20": 1144113664.0,
+            "21": 1144113152.0,
+            "22": 1144114176.0,
+            "23": 1144113664.0,
+            "24": 1144111616.0,
+            "25": 1144115712.0,
+            "26": 1144116224.0,
+            "27": 1144114688.0,
+            "28": 1144112128.0,
+            "29": 1144113152.0,
+            "30": 1144114176.0,
+            "31": 1144109568.0,
+            "32": 1144113152.0,
+            "33": 1144114176.0,
+            "34": 1144113664.0,
+            "35": 1144111104.0,
+            "36": 1144113664.0,
+            "37": 1144115200.0,
+            "38": 1144114688.0,
+            "39": 1144112128.0,
+            "40": 1144111616.0,
+            "41": 1144110080.0,
+            "42": 1144113152.0,
+            "43": 1144109568.0,
+            "44": 1144111616.0,
+            "45": 1144115200.0,
+            "46": 1144112640.0,
+            "47": 1144112128.0,
+            "48": 1144111616.0,
+            "49": 1144113664.0,
+            "50": 1144111616.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1592113152.0,
+            "2": 2049344000.0,
+            "3": 2049344000.0,
+            "4": 2049852928.0,
+            "5": 2050200576.0,
+            "6": 2050200576.0,
+            "7": 2050200576.0,
+            "8": 2050200576.0,
+            "9": 2050200576.0,
+            "10": 2050613760.0,
+            "11": 2050613760.0,
+            "12": 2050613760.0,
+            "13": 2050613760.0,
+            "14": 2050613760.0,
+            "15": 2050613760.0,
+            "16": 2050613760.0,
+            "17": 2050613760.0,
+            "18": 2050613760.0,
+            "19": 2050613760.0,
+            "20": 2050613760.0,
+            "21": 2050613760.0,
+            "22": 2050613760.0,
+            "23": 2050613760.0,
+            "24": 2050613760.0,
+            "25": 2050613760.0,
+            "26": 2050613760.0,
+            "27": 2050613760.0,
+            "28": 2050613760.0,
+            "29": 2050613760.0,
+            "30": 2050613760.0,
+            "31": 2050613760.0,
+            "32": 2050613760.0,
+            "33": 2050613760.0,
+            "34": 2050613760.0,
+            "35": 2050613760.0,
+            "36": 2050613760.0,
+            "37": 2050613760.0,
+            "38": 2050613760.0,
+            "39": 2050613760.0,
+            "40": 2050613760.0,
+            "41": 2050613760.0,
+            "42": 2050613760.0,
+            "43": 2050613760.0,
+            "44": 2050613760.0,
+            "45": 2050613760.0,
+            "46": 2050613760.0,
+            "47": 2050613760.0,
+            "48": 2050613760.0,
+            "49": 2050613760.0,
+            "50": 2050613760.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 6.272,
+            "3": 0.37646,
+            "4": 0.38139,
+            "5": 0.36006,
+            "6": 0.35438,
+            "7": 0.37532,
+            "8": 0.34896,
+            "9": 0.34666,
+            "10": 0.34575,
+            "11": 0.35016,
+            "12": 0.34334,
+            "13": 0.34313,
+            "14": 0.34121,
+            "15": 0.34333,
+            "16": 0.33917,
+            "17": 0.34414,
+            "18": 0.34158,
+            "19": 0.33904,
+            "20": 0.34192,
+            "21": 0.34305,
+            "22": 0.35491,
+            "23": 0.34584,
+            "24": 0.34162,
+            "25": 0.34733,
+            "26": 0.34153,
+            "27": 0.34246,
+            "28": 0.34,
+            "29": 0.33893,
+            "30": 0.34315,
+            "31": 0.3468,
+            "32": 0.34193,
+            "33": 0.33765,
+            "34": 0.34671,
+            "35": 0.33955,
+            "36": 0.34134,
+            "37": 0.33879,
+            "38": 0.34103,
+            "39": 0.33784,
+            "40": 0.33992,
+            "41": 0.3506,
+            "42": 0.33836,
+            "43": 0.34282,
+            "44": 0.33978,
+            "45": 0.339,
+            "46": 0.34898,
+            "47": 0.34512,
+            "48": 0.35552,
+            "49": 0.34616,
+            "50": 0.33258
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
index d342471ff77..5b369a3137c 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.7999,
-            "2": 10.80046,
-            "3": 10.80877,
-            "4": 10.78226,
-            "5": 10.8254,
-            "6": 10.83596,
-            "7": 10.81676,
-            "8": 10.81163,
-            "9": 10.81106,
-            "10": 10.77366,
-            "11": 10.85495,
-            "12": 10.82711,
-            "13": 10.85109,
-            "14": 10.8546,
-            "15": 10.78267,
-            "16": 10.77358,
-            "17": 10.75036,
-            "18": 10.78319,
-            "19": 10.75876,
-            "20": 10.6992,
-            "21": 10.67244,
-            "22": 10.51382,
-            "23": 10.68112,
-            "24": 10.57174,
-            "25": 10.51756,
-            "26": 10.57624,
-            "27": 10.59185,
-            "28": 10.55401,
-            "29": 10.57113,
-            "30": 10.36465,
-            "31": 10.10866,
-            "32": 10.45338,
-            "33": 10.43764,
-            "34": 10.20033,
-            "35": 10.25433,
-            "36": 10.23362,
-            "37": 10.35369,
-            "38": 10.20443,
-            "39": 10.39917,
-            "40": 10.10245,
-            "41": 10.12765,
-            "42": 10.21106,
-            "43": 9.83722,
-            "44": 9.962,
-            "45": 9.84252,
-            "46": 9.80612,
-            "47": 10.14257,
-            "48": 9.86665,
-            "49": 9.5383,
-            "50": 9.92576
+            "1": 10.80012,
+            "2": 10.8005,
+            "3": 10.80883,
+            "4": 10.78232,
+            "5": 10.82514,
+            "6": 10.83649,
+            "7": 10.8162,
+            "8": 10.81195,
+            "9": 10.8108,
+            "10": 10.77412,
+            "11": 10.85566,
+            "12": 10.82707,
+            "13": 10.85141,
+            "14": 10.85446,
+            "15": 10.78278,
+            "16": 10.77366,
+            "17": 10.7506,
+            "18": 10.78381,
+            "19": 10.7589,
+            "20": 10.7001,
+            "21": 10.67278,
+            "22": 10.51434,
+            "23": 10.68074,
+            "24": 10.57171,
+            "25": 10.518,
+            "26": 10.57588,
+            "27": 10.59157,
+            "28": 10.55337,
+            "29": 10.57061,
+            "30": 10.36462,
+            "31": 10.10867,
+            "32": 10.45325,
+            "33": 10.43728,
+            "34": 10.20006,
+            "35": 10.25436,
+            "36": 10.23332,
+            "37": 10.35373,
+            "38": 10.20421,
+            "39": 10.39913,
+            "40": 10.10214,
+            "41": 10.12724,
+            "42": 10.21139,
+            "43": 9.83735,
+            "44": 9.96179,
+            "45": 9.8429,
+            "46": 9.80656,
+            "47": 10.14235,
+            "48": 9.86669,
+            "49": 9.53809,
+            "50": 9.92544
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4827.0,
-            "2": 4935.0,
-            "3": 5030.0,
-            "4": 4956.0,
-            "5": 5583.0,
-            "6": 5594.0,
-            "7": 5325.0,
-            "8": 5098.0,
-            "9": 5335.0,
-            "10": 4581.0,
-            "11": 5895.0,
-            "12": 5249.0,
-            "13": 5692.0,
-            "14": 5736.0,
-            "15": 5303.0,
-            "16": 5347.0,
-            "17": 5361.0,
-            "18": 5322.0,
-            "19": 5407.0,
-            "20": 4961.0,
-            "21": 5441.0,
-            "22": 4776.0,
-            "23": 5752.0,
-            "24": 5157.0,
-            "25": 4897.0,
-            "26": 5202.0,
-            "27": 5455.0,
-            "28": 5769.0,
-            "29": 5911.0,
-            "30": 5256.0,
-            "31": 4674.0,
-            "32": 5854.0,
-            "33": 6080.0,
-            "34": 5278.0,
-            "35": 5743.0,
-            "36": 5523.0,
-            "37": 6477.0,
-            "38": 5839.0,
-            "39": 6711.0,
-            "40": 5852.0,
-            "41": 6062.0,
-            "42": 6501.0,
-            "43": 5605.0,
-            "44": 5883.0,
-            "45": 5763.0,
-            "46": 6076.0,
-            "47": 6613.0,
-            "48": 6348.0,
-            "49": 6430.0,
-            "50": 6699.0
+            "1": 4916.0,
+            "2": 4954.0,
+            "3": 5054.0,
+            "4": 5108.0,
+            "5": 5499.0,
+            "6": 5705.0,
+            "7": 5188.0,
+            "8": 4899.0,
+            "9": 5442.0,
+            "10": 4498.0,
+            "11": 5894.0,
+            "12": 5279.0,
+            "13": 5766.0,
+            "14": 5633.0,
+            "15": 5168.0,
+            "16": 5358.0,
+            "17": 5399.0,
+            "18": 5305.0,
+            "19": 5131.0,
+            "20": 4905.0,
+            "21": 5355.0,
+            "22": 4916.0,
+            "23": 5674.0,
+            "24": 5034.0,
+            "25": 4922.0,
+            "26": 5355.0,
+            "27": 5424.0,
+            "28": 5771.0,
+            "29": 6052.0,
+            "30": 5386.0,
+            "31": 4773.0,
+            "32": 5773.0,
+            "33": 6105.0,
+            "34": 5287.0,
+            "35": 5623.0,
+            "36": 5502.0,
+            "37": 6266.0,
+            "38": 6005.0,
+            "39": 6727.0,
+            "40": 5810.0,
+            "41": 5898.0,
+            "42": 6417.0,
+            "43": 5774.0,
+            "44": 5812.0,
+            "45": 5768.0,
+            "46": 5884.0,
+            "47": 6481.0,
+            "48": 6435.0,
+            "49": 6461.0,
+            "50": 6489.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1145716736.0,
-            "2": 1145714688.0,
-            "3": 1145715200.0,
-            "4": 1145714176.0,
-            "5": 1146210816.0,
-            "6": 1146210304.0,
-            "7": 1145716736.0,
-            "8": 1146209792.0,
-            "9": 1145714688.0,
-            "10": 1146214912.0,
-            "11": 1145714176.0,
-            "12": 1145713664.0,
-            "13": 1145712128.0,
-            "14": 1146209280.0,
-            "15": 1145713152.0,
-            "16": 1146210304.0,
-            "17": 1145713664.0,
-            "18": 1146210304.0,
-            "19": 1145714176.0,
-            "20": 1145715200.0,
-            "21": 1146210304.0,
-            "22": 1145715712.0,
-            "23": 1145716224.0,
-            "24": 1145713152.0,
-            "25": 1145712128.0,
-            "26": 1145715200.0,
-            "27": 1146210304.0,
-            "28": 1145713664.0,
-            "29": 1145711104.0,
-            "30": 1145714688.0,
-            "31": 1146213376.0,
-            "32": 1145713152.0,
-            "33": 1145714688.0,
-            "34": 1145714688.0,
-            "35": 1146213376.0,
-            "36": 1145713664.0,
-            "37": 1145712128.0,
-            "38": 1146207744.0,
-            "39": 1145715200.0,
-            "40": 1146210816.0,
-            "41": 1145714688.0,
-            "42": 1145711104.0,
-            "43": 1146211840.0,
-            "44": 1145717248.0,
-            "45": 1145714688.0,
-            "46": 1146214400.0,
-            "47": 1145714688.0,
-            "48": 1145717248.0,
-            "49": 1146214912.0,
-            "50": 1145716224.0
+            "1": 1145163776.0,
+            "2": 1146163200.0,
+            "3": 1145163264.0,
+            "4": 1145162240.0,
+            "5": 1145163264.0,
+            "6": 1145163264.0,
+            "7": 1146213376.0,
+            "8": 1146210816.0,
+            "9": 1146211328.0,
+            "10": 1145167360.0,
+            "11": 1145162240.0,
+            "12": 1145161728.0,
+            "13": 1145161216.0,
+            "14": 1145161728.0,
+            "15": 1145161216.0,
+            "16": 1145162752.0,
+            "17": 1145882624.0,
+            "18": 1145162752.0,
+            "19": 1145162240.0,
+            "20": 1145163264.0,
+            "21": 1145162752.0,
+            "22": 1145163776.0,
+            "23": 1146212352.0,
+            "24": 1145161216.0,
+            "25": 1145160704.0,
+            "26": 1145164288.0,
+            "27": 1146212352.0,
+            "28": 1145161728.0,
+            "29": 1145159680.0,
+            "30": 1145162752.0,
+            "31": 1145165824.0,
+            "32": 1145162240.0,
+            "33": 1145162752.0,
+            "34": 1145163264.0,
+            "35": 1146213888.0,
+            "36": 1145161728.0,
+            "37": 1145160192.0,
+            "38": 1146208768.0,
+            "39": 1146211840.0,
+            "40": 1146211328.0,
+            "41": 1145163264.0,
+            "42": 1145160704.0,
+            "43": 1145164288.0,
+            "44": 1146213376.0,
+            "45": 1146211328.0,
+            "46": 1146215424.0,
+            "47": 1145162752.0,
+            "48": 1145165312.0,
+            "49": 1146216448.0,
+            "50": 1145164288.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1593766912.0,
-            "2": 2051463168.0,
-            "3": 2052584960.0,
-            "4": 2052584960.0,
-            "5": 2052584960.0,
-            "6": 2053404160.0,
-            "7": 2054199296.0,
-            "8": 2054199296.0,
-            "9": 2056971776.0,
-            "10": 2057138688.0,
-            "11": 2057138688.0,
-            "12": 2057138688.0,
-            "13": 2057138688.0,
-            "14": 2057138688.0,
-            "15": 2057138688.0,
-            "16": 2057138688.0,
-            "17": 2057138688.0,
-            "18": 2057138688.0,
-            "19": 2057138688.0,
-            "20": 2057138688.0,
-            "21": 2057138688.0,
-            "22": 2057138688.0,
-            "23": 2057138688.0,
-            "24": 2057138688.0,
-            "25": 2057138688.0,
-            "26": 2057138688.0,
-            "27": 2057138688.0,
-            "28": 2057138688.0,
-            "29": 2057138688.0,
-            "30": 2057138688.0,
-            "31": 2057138688.0,
-            "32": 2057138688.0,
-            "33": 2057138688.0,
-            "34": 2057138688.0,
-            "35": 2057138688.0,
-            "36": 2057138688.0,
-            "37": 2057138688.0,
-            "38": 2057138688.0,
-            "39": 2057138688.0,
-            "40": 2057138688.0,
-            "41": 2057138688.0,
-            "42": 2057138688.0,
-            "43": 2057138688.0,
-            "44": 2057138688.0,
-            "45": 2057138688.0,
-            "46": 2057138688.0,
-            "47": 2057138688.0,
-            "48": 2057138688.0,
-            "49": 2057138688.0,
-            "50": 2057138688.0
+            "1": 1593583104.0,
+            "2": 2051818496.0,
+            "3": 2053099520.0,
+            "4": 2053099520.0,
+            "5": 2053099520.0,
+            "6": 2054166016.0,
+            "7": 2055368704.0,
+            "8": 2055444992.0,
+            "9": 2056095232.0,
+            "10": 2057353728.0,
+            "11": 2057353728.0,
+            "12": 2057353728.0,
+            "13": 2057353728.0,
+            "14": 2057353728.0,
+            "15": 2057353728.0,
+            "16": 2057353728.0,
+            "17": 2057353728.0,
+            "18": 2057353728.0,
+            "19": 2057353728.0,
+            "20": 2057353728.0,
+            "21": 2057353728.0,
+            "22": 2057353728.0,
+            "23": 2057353728.0,
+            "24": 2057353728.0,
+            "25": 2057353728.0,
+            "26": 2057353728.0,
+            "27": 2057353728.0,
+            "28": 2057353728.0,
+            "29": 2057353728.0,
+            "30": 2057353728.0,
+            "31": 2057353728.0,
+            "32": 2057353728.0,
+            "33": 2057353728.0,
+            "34": 2057353728.0,
+            "35": 2057353728.0,
+            "36": 2057353728.0,
+            "37": 2057353728.0,
+            "38": 2057353728.0,
+            "39": 2057353728.0,
+            "40": 2057353728.0,
+            "41": 2057353728.0,
+            "42": 2057353728.0,
+            "43": 2057353728.0,
+            "44": 2057353728.0,
+            "45": 2057353728.0,
+            "46": 2057353728.0,
+            "47": 2057353728.0,
+            "48": 2057353728.0,
+            "49": 2057353728.0,
+            "50": 2057353728.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.99317,
-            "2": 0.35408,
-            "3": 0.30455,
-            "4": 0.32631,
-            "5": 0.27174,
-            "6": 0.27168,
-            "7": 0.29847,
-            "8": 0.27152,
-            "9": 0.27606,
-            "10": 0.27991,
-            "11": 0.25875,
-            "12": 0.25854,
-            "13": 0.26351,
-            "14": 0.2599,
-            "15": 0.26827,
-            "16": 0.25734,
-            "17": 0.26876,
-            "18": 0.26302,
-            "19": 0.25791,
-            "20": 0.26587,
-            "21": 0.26207,
-            "22": 0.2718,
-            "23": 0.27036,
-            "24": 0.2557,
-            "25": 0.27098,
-            "26": 0.2562,
-            "27": 0.25663,
-            "28": 0.28209,
-            "29": 0.25678,
-            "30": 0.26198,
-            "31": 0.27896,
-            "32": 0.26879,
-            "33": 0.25449,
-            "34": 0.27377,
-            "35": 0.25725,
-            "36": 0.25349,
-            "37": 0.2537,
-            "38": 0.26246,
-            "39": 0.25527,
-            "40": 0.25676,
-            "41": 0.26427,
-            "42": 0.25718,
-            "43": 0.26206,
-            "44": 0.25615,
-            "45": 0.261,
-            "46": 0.28413,
-            "47": 0.27633,
-            "48": 0.26455,
-            "49": 0.2706,
-            "50": 0.25944
+            "1": 36.30862,
+            "2": 0.33719,
+            "3": 0.28216,
+            "4": 0.2843,
+            "5": 0.23756,
+            "6": 0.23639,
+            "7": 0.27014,
+            "8": 0.24101,
+            "9": 0.24066,
+            "10": 0.25135,
+            "11": 0.2342,
+            "12": 0.22722,
+            "13": 0.23279,
+            "14": 0.22714,
+            "15": 0.24041,
+            "16": 0.22689,
+            "17": 0.23762,
+            "18": 0.22666,
+            "19": 0.2282,
+            "20": 0.22795,
+            "21": 0.2341,
+            "22": 0.65676,
+            "23": 0.24009,
+            "24": 0.22741,
+            "25": 0.23512,
+            "26": 0.22626,
+            "27": 0.22751,
+            "28": 0.246,
+            "29": 0.22763,
+            "30": 0.23076,
+            "31": 0.25299,
+            "32": 0.23341,
+            "33": 0.22812,
+            "34": 0.24223,
+            "35": 0.23465,
+            "36": 0.22594,
+            "37": 0.22774,
+            "38": 0.23179,
+            "39": 0.22535,
+            "40": 0.22597,
+            "41": 0.23473,
+            "42": 0.2254,
+            "43": 0.23446,
+            "44": 0.22767,
+            "45": 0.23442,
+            "46": 0.25088,
+            "47": 0.24058,
+            "48": 0.23646,
+            "49": 0.24323,
+            "50": 0.23136
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
index 4383c914d8e..03cdcbebfb1 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.8277,
-            "2": 10.84068,
-            "3": 10.82705,
+            "1": 10.82721,
+            "2": 10.84035,
+            "3": 10.82733,
             "4": 10.81913,
-            "5": 10.85673,
-            "6": 10.86984,
-            "7": 10.85119,
-            "8": 10.84465,
-            "9": 10.85269,
-            "10": 10.79157,
-            "11": 10.86571,
-            "12": 10.87169,
-            "13": 10.8708,
-            "14": 10.8787,
-            "15": 10.82554,
-            "16": 10.81251,
-            "17": 10.77478,
-            "18": 10.81068,
-            "19": 10.79632,
-            "20": 10.72175,
-            "21": 10.69765,
-            "22": 10.55138,
-            "23": 10.70555,
-            "24": 10.59005,
-            "25": 10.54425,
-            "26": 10.60036,
-            "27": 10.61973,
-            "28": 10.57442,
-            "29": 10.58656,
-            "30": 10.35754,
-            "31": 10.12169,
-            "32": 10.46987,
-            "33": 10.45722,
-            "34": 10.2158,
-            "35": 10.27086,
-            "36": 10.2354,
-            "37": 10.35246,
-            "38": 10.20574,
-            "39": 10.40061,
-            "40": 10.09681,
-            "41": 10.13869,
-            "42": 10.21829,
-            "43": 9.84428,
-            "44": 9.9614,
-            "45": 9.84116,
-            "46": 9.81955,
-            "47": 10.13927,
-            "48": 9.85138,
-            "49": 9.53518,
-            "50": 9.92455
+            "5": 10.85669,
+            "6": 10.86992,
+            "7": 10.85145,
+            "8": 10.84454,
+            "9": 10.85217,
+            "10": 10.79203,
+            "11": 10.86556,
+            "12": 10.87068,
+            "13": 10.87092,
+            "14": 10.87861,
+            "15": 10.82588,
+            "16": 10.81198,
+            "17": 10.77469,
+            "18": 10.81081,
+            "19": 10.79685,
+            "20": 10.72214,
+            "21": 10.69749,
+            "22": 10.55117,
+            "23": 10.70533,
+            "24": 10.59031,
+            "25": 10.54454,
+            "26": 10.60011,
+            "27": 10.62053,
+            "28": 10.57401,
+            "29": 10.58652,
+            "30": 10.35738,
+            "31": 10.12167,
+            "32": 10.46986,
+            "33": 10.45718,
+            "34": 10.21579,
+            "35": 10.27137,
+            "36": 10.23516,
+            "37": 10.35226,
+            "38": 10.20647,
+            "39": 10.40076,
+            "40": 10.09694,
+            "41": 10.13882,
+            "42": 10.21793,
+            "43": 9.844,
+            "44": 9.96176,
+            "45": 9.84078,
+            "46": 9.81922,
+            "47": 10.13915,
+            "48": 9.85114,
+            "49": 9.53525,
+            "50": 9.92432
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4627.0,
-            "2": 4785.0,
-            "3": 4887.0,
-            "4": 5134.0,
-            "5": 5403.0,
-            "6": 5457.0,
-            "7": 5140.0,
-            "8": 4876.0,
-            "9": 5213.0,
-            "10": 4396.0,
-            "11": 5749.0,
-            "12": 5182.0,
-            "13": 5436.0,
-            "14": 5431.0,
-            "15": 5327.0,
-            "16": 5452.0,
-            "17": 5245.0,
-            "18": 5116.0,
-            "19": 5216.0,
-            "20": 4869.0,
-            "21": 5326.0,
-            "22": 4832.0,
-            "23": 5719.0,
-            "24": 5017.0,
-            "25": 4980.0,
-            "26": 5288.0,
-            "27": 5346.0,
-            "28": 5727.0,
-            "29": 5937.0,
-            "30": 5289.0,
-            "31": 4777.0,
-            "32": 5616.0,
-            "33": 6137.0,
-            "34": 5140.0,
-            "35": 5690.0,
-            "36": 5739.0,
-            "37": 6425.0,
-            "38": 5962.0,
-            "39": 6620.0,
-            "40": 5921.0,
-            "41": 5820.0,
-            "42": 6472.0,
-            "43": 5860.0,
-            "44": 5731.0,
-            "45": 5769.0,
-            "46": 6130.0,
-            "47": 6576.0,
-            "48": 6403.0,
-            "49": 6084.0,
-            "50": 6648.0
+            "1": 4672.0,
+            "2": 4867.0,
+            "3": 4956.0,
+            "4": 4946.0,
+            "5": 5421.0,
+            "6": 5554.0,
+            "7": 5128.0,
+            "8": 4852.0,
+            "9": 5281.0,
+            "10": 4254.0,
+            "11": 5524.0,
+            "12": 5140.0,
+            "13": 5533.0,
+            "14": 5553.0,
+            "15": 5130.0,
+            "16": 5322.0,
+            "17": 5214.0,
+            "18": 5146.0,
+            "19": 5276.0,
+            "20": 4803.0,
+            "21": 5286.0,
+            "22": 4882.0,
+            "23": 5710.0,
+            "24": 4925.0,
+            "25": 4732.0,
+            "26": 5191.0,
+            "27": 5286.0,
+            "28": 5771.0,
+            "29": 5891.0,
+            "30": 5411.0,
+            "31": 4721.0,
+            "32": 5606.0,
+            "33": 6002.0,
+            "34": 5137.0,
+            "35": 5602.0,
+            "36": 5708.0,
+            "37": 6467.0,
+            "38": 6089.0,
+            "39": 6746.0,
+            "40": 6058.0,
+            "41": 5845.0,
+            "42": 6342.0,
+            "43": 6034.0,
+            "44": 5828.0,
+            "45": 5758.0,
+            "46": 5886.0,
+            "47": 6555.0,
+            "48": 6437.0,
+            "49": 6286.0,
+            "50": 6602.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1115810816.0,
-            "2": 1115809280.0,
-            "3": 1115807232.0,
-            "4": 1115809792.0,
-            "5": 1115806720.0,
-            "6": 1115807232.0,
-            "7": 1115808768.0,
-            "8": 1115807744.0,
-            "9": 1115809792.0,
-            "10": 1115808768.0,
-            "11": 1115808768.0,
-            "12": 1115808256.0,
-            "13": 1115811840.0,
-            "14": 1115807232.0,
-            "15": 1115809792.0,
-            "16": 1115808768.0,
-            "17": 1115806720.0,
-            "18": 1115809792.0,
-            "19": 1115806208.0,
-            "20": 1115808256.0,
-            "21": 1115806208.0,
-            "22": 1115807744.0,
-            "23": 1115807744.0,
-            "24": 1115810304.0,
-            "25": 1115807744.0,
-            "26": 1115810304.0,
-            "27": 1115808256.0,
-            "28": 1115809280.0,
-            "29": 1115810304.0,
-            "30": 1115806720.0,
-            "31": 1115813376.0,
-            "32": 1115809792.0,
-            "33": 1115807744.0,
-            "34": 1115808256.0,
-            "35": 1115810816.0,
-            "36": 1115806208.0,
-            "37": 1115807744.0,
-            "38": 1115809792.0,
-            "39": 1115807232.0,
-            "40": 1115809792.0,
-            "41": 1115810816.0,
-            "42": 1115810816.0,
-            "43": 1115811328.0,
-            "44": 1115809792.0,
-            "45": 1115808768.0,
-            "46": 1115810304.0,
-            "47": 1115808256.0,
-            "48": 1115806208.0,
-            "49": 1115805184.0,
-            "50": 1115811328.0
+            "1": 1116852736.0,
+            "2": 1116852224.0,
+            "3": 1116850176.0,
+            "4": 1116851712.0,
+            "5": 1116848640.0,
+            "6": 1116849152.0,
+            "7": 1116851200.0,
+            "8": 1116849152.0,
+            "9": 1116851712.0,
+            "10": 1116850176.0,
+            "11": 1116849664.0,
+            "12": 1116849152.0,
+            "13": 1116854784.0,
+            "14": 1116848640.0,
+            "15": 1116851712.0,
+            "16": 1116849664.0,
+            "17": 1116848640.0,
+            "18": 1116851200.0,
+            "19": 1116848128.0,
+            "20": 1116850688.0,
+            "21": 1116850176.0,
+            "22": 1116849664.0,
+            "23": 1116849664.0,
+            "24": 1116852224.0,
+            "25": 1116848640.0,
+            "26": 1116852224.0,
+            "27": 1116850176.0,
+            "28": 1116851712.0,
+            "29": 1116852224.0,
+            "30": 1116848640.0,
+            "31": 1116855296.0,
+            "32": 1116851200.0,
+            "33": 1116848640.0,
+            "34": 1116850176.0,
+            "35": 1116852736.0,
+            "36": 1116848128.0,
+            "37": 1116849664.0,
+            "38": 1116850688.0,
+            "39": 1116849664.0,
+            "40": 1116851200.0,
+            "41": 1116851712.0,
+            "42": 1116851712.0,
+            "43": 1116852224.0,
+            "44": 1116851712.0,
+            "45": 1116851200.0,
+            "46": 1116851712.0,
+            "47": 1116850176.0,
+            "48": 1116848128.0,
+            "49": 1116846080.0,
+            "50": 1116852736.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1562923008.0,
-            "2": 2021974528.0,
-            "3": 2021974528.0,
-            "4": 2023057408.0,
-            "5": 2023057408.0,
-            "6": 2023057408.0,
-            "7": 2023057408.0,
-            "8": 2023057408.0,
-            "9": 2023057408.0,
-            "10": 2026853376.0,
-            "11": 2026853376.0,
-            "12": 2026853376.0,
-            "13": 2026853376.0,
-            "14": 2026853376.0,
-            "15": 2026853376.0,
-            "16": 2026853376.0,
-            "17": 2026853376.0,
-            "18": 2026853376.0,
-            "19": 2026853376.0,
-            "20": 2026853376.0,
-            "21": 2026964992.0,
-            "22": 2026964992.0,
-            "23": 2026964992.0,
-            "24": 2026964992.0,
-            "25": 2026964992.0,
-            "26": 2026964992.0,
-            "27": 2026964992.0,
-            "28": 2026964992.0,
-            "29": 2026964992.0,
-            "30": 2026964992.0,
-            "31": 2030492160.0,
-            "32": 2030492160.0,
-            "33": 2030492160.0,
-            "34": 2030492160.0,
-            "35": 2030492160.0,
-            "36": 2030492160.0,
-            "37": 2030492160.0,
-            "38": 2030492160.0,
-            "39": 2030492160.0,
-            "40": 2030492160.0,
-            "41": 2030492160.0,
-            "42": 2030492160.0,
-            "43": 2030492160.0,
-            "44": 2030492160.0,
-            "45": 2030492160.0,
-            "46": 2030492160.0,
-            "47": 2030492160.0,
-            "48": 2030492160.0,
-            "49": 2030492160.0,
-            "50": 2030492160.0
+            "1": 1563067904.0,
+            "2": 2022025216.0,
+            "3": 2022025216.0,
+            "4": 2023037440.0,
+            "5": 2023037440.0,
+            "6": 2023037440.0,
+            "7": 2023037440.0,
+            "8": 2023037440.0,
+            "9": 2023037440.0,
+            "10": 2025690112.0,
+            "11": 2025690112.0,
+            "12": 2025690112.0,
+            "13": 2027666944.0,
+            "14": 2027666944.0,
+            "15": 2027666944.0,
+            "16": 2027666944.0,
+            "17": 2027666944.0,
+            "18": 2027666944.0,
+            "19": 2027666944.0,
+            "20": 2027666944.0,
+            "21": 2027666944.0,
+            "22": 2027666944.0,
+            "23": 2027666944.0,
+            "24": 2027666944.0,
+            "25": 2027666944.0,
+            "26": 2027666944.0,
+            "27": 2027666944.0,
+            "28": 2027666944.0,
+            "29": 2027666944.0,
+            "30": 2027666944.0,
+            "31": 2030213120.0,
+            "32": 2030213120.0,
+            "33": 2030213120.0,
+            "34": 2030213120.0,
+            "35": 2030213120.0,
+            "36": 2030213120.0,
+            "37": 2030213120.0,
+            "38": 2030213120.0,
+            "39": 2030213120.0,
+            "40": 2030213120.0,
+            "41": 2030213120.0,
+            "42": 2030213120.0,
+            "43": 2030213120.0,
+            "44": 2030213120.0,
+            "45": 2030213120.0,
+            "46": 2030213120.0,
+            "47": 2030213120.0,
+            "48": 2030213120.0,
+            "49": 2030213120.0,
+            "50": 2030213120.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 18.3953,
-            "2": 0.37892,
-            "3": 0.34007,
-            "4": 0.3355,
-            "5": 0.33186,
-            "6": 0.33483,
-            "7": 0.3277,
-            "8": 0.32755,
-            "9": 0.32791,
-            "10": 0.32415,
-            "11": 0.32272,
-            "12": 0.32392,
-            "13": 0.33508,
-            "14": 0.31609,
-            "15": 0.31941,
-            "16": 0.3178,
-            "17": 0.31692,
-            "18": 0.31834,
-            "19": 0.32074,
-            "20": 0.31765,
-            "21": 0.31933,
-            "22": 0.32169,
-            "23": 0.32073,
-            "24": 0.31872,
-            "25": 0.32305,
-            "26": 0.32018,
-            "27": 0.32077,
-            "28": 0.32022,
-            "29": 0.31612,
-            "30": 0.31263,
-            "31": 0.31663,
-            "32": 0.31415,
-            "33": 0.31634,
-            "34": 0.31559,
-            "35": 0.31239,
-            "36": 0.31218,
-            "37": 0.31427,
-            "38": 0.31433,
-            "39": 0.31314,
-            "40": 0.313,
-            "41": 0.31331,
-            "42": 0.31314,
-            "43": 0.31359,
-            "44": 0.31884,
-            "45": 0.31165,
-            "46": 0.31278,
-            "47": 0.31273,
-            "48": 0.31668,
-            "49": 0.31177,
-            "50": 0.31472
+            "1": 17.84226,
+            "2": 0.49333,
+            "3": 0.35144,
+            "4": 0.35051,
+            "5": 0.33127,
+            "6": 0.33097,
+            "7": 0.33432,
+            "8": 0.33416,
+            "9": 0.33201,
+            "10": 0.33094,
+            "11": 0.33097,
+            "12": 0.3311,
+            "13": 0.33011,
+            "14": 0.32873,
+            "15": 0.32954,
+            "16": 0.3303,
+            "17": 0.33003,
+            "18": 0.32863,
+            "19": 0.32894,
+            "20": 0.32985,
+            "21": 0.32984,
+            "22": 0.32894,
+            "23": 0.33018,
+            "24": 0.32858,
+            "25": 0.32803,
+            "26": 0.32972,
+            "27": 0.32892,
+            "28": 0.32933,
+            "29": 0.3335,
+            "30": 0.32858,
+            "31": 0.3292,
+            "32": 0.32984,
+            "33": 0.32969,
+            "34": 0.32922,
+            "35": 0.33031,
+            "36": 0.32829,
+            "37": 0.32934,
+            "38": 0.77677,
+            "39": 0.32893,
+            "40": 0.32703,
+            "41": 0.32692,
+            "42": 0.32603,
+            "43": 0.32676,
+            "44": 0.80704,
+            "45": 0.32903,
+            "46": 0.32781,
+            "47": 0.70671,
+            "48": 0.32916,
+            "49": 0.3289,
+            "50": 0.32584
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..b804ba57a90
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81772,
+            "2": 10.82538,
+            "3": 10.82315,
+            "4": 10.7893,
+            "5": 10.84414,
+            "6": 10.85629,
+            "7": 10.82859,
+            "8": 10.8301,
+            "9": 10.84207,
+            "10": 10.78601,
+            "11": 10.85733,
+            "12": 10.84663,
+            "13": 10.86562,
+            "14": 10.86914,
+            "15": 10.81232,
+            "16": 10.80865,
+            "17": 10.77965,
+            "18": 10.80508,
+            "19": 10.79288,
+            "20": 10.74264,
+            "21": 10.72495,
+            "22": 10.58933,
+            "23": 10.73854,
+            "24": 10.63021,
+            "25": 10.58736,
+            "26": 10.63591,
+            "27": 10.66781,
+            "28": 10.64617,
+            "29": 10.65842,
+            "30": 10.44627,
+            "31": 10.21299,
+            "32": 10.53989,
+            "33": 10.52788,
+            "34": 10.30656,
+            "35": 10.35429,
+            "36": 10.31411,
+            "37": 10.43313,
+            "38": 10.29142,
+            "39": 10.47282,
+            "40": 10.18336,
+            "41": 10.24081,
+            "42": 10.30294,
+            "43": 9.95174,
+            "44": 10.05781,
+            "45": 9.9572,
+            "46": 9.93655,
+            "47": 10.22836,
+            "48": 9.95329,
+            "49": 9.6607,
+            "50": 9.99855,
+            "51": 9.94973,
+            "52": 9.84349,
+            "53": 10.14413,
+            "54": 10.04737,
+            "55": 9.98385,
+            "56": 9.71898,
+            "57": 9.5883,
+            "58": 9.92285,
+            "59": 9.67628,
+            "60": 9.60379,
+            "61": 9.78734,
+            "62": 10.06656,
+            "63": 9.47521,
+            "64": 9.85036,
+            "65": 9.03212,
+            "66": 9.78289,
+            "67": 9.44253,
+            "68": 9.85795,
+            "69": 9.85298,
+            "70": 9.7992,
+            "71": 9.6974,
+            "72": 9.66103,
+            "73": 9.56335,
+            "74": 9.05976,
+            "75": 9.50058,
+            "76": 9.18716,
+            "77": 10.12117,
+            "78": 9.78252,
+            "79": 9.44971,
+            "80": 9.47021,
+            "81": 9.54374,
+            "82": 9.75396,
+            "83": 9.39966,
+            "84": 9.46977,
+            "85": 9.67727,
+            "86": 9.13918,
+            "87": 9.64053,
+            "88": 9.81152,
+            "89": 9.6769,
+            "90": 9.8722,
+            "91": 9.41711,
+            "92": 9.42414,
+            "93": 9.1643,
+            "94": 8.903,
+            "95": 9.57911,
+            "96": 9.5909,
+            "97": 9.35398,
+            "98": 9.73253,
+            "99": 8.96675,
+            "100": 9.46267
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 31327.0,
+            "2": 32688.0,
+            "3": 32823.0,
+            "4": 31506.0,
+            "5": 36384.0,
+            "6": 37608.0,
+            "7": 35129.0,
+            "8": 31069.0,
+            "9": 34564.0,
+            "10": 29747.0,
+            "11": 38781.0,
+            "12": 35732.0,
+            "13": 37194.0,
+            "14": 38338.0,
+            "15": 35011.0,
+            "16": 36251.0,
+            "17": 34823.0,
+            "18": 35457.0,
+            "19": 35578.0,
+            "20": 32707.0,
+            "21": 33645.0,
+            "22": 30136.0,
+            "23": 38476.0,
+            "24": 32292.0,
+            "25": 30916.0,
+            "26": 34382.0,
+            "27": 36106.0,
+            "28": 37666.0,
+            "29": 38421.0,
+            "30": 33015.0,
+            "31": 30489.0,
+            "32": 36688.0,
+            "33": 38306.0,
+            "34": 33425.0,
+            "35": 34129.0,
+            "36": 35506.0,
+            "37": 38441.0,
+            "38": 35394.0,
+            "39": 38939.0,
+            "40": 36115.0,
+            "41": 36452.0,
+            "42": 37245.0,
+            "43": 34000.0,
+            "44": 33879.0,
+            "45": 36293.0,
+            "46": 37265.0,
+            "47": 40947.0,
+            "48": 36423.0,
+            "49": 35090.0,
+            "50": 40022.0,
+            "51": 37599.0,
+            "52": 36874.0,
+            "53": 42329.0,
+            "54": 40750.0,
+            "55": 37208.0,
+            "56": 39947.0,
+            "57": 36219.0,
+            "58": 42369.0,
+            "59": 39714.0,
+            "60": 39697.0,
+            "61": 40288.0,
+            "62": 44682.0,
+            "63": 37743.0,
+            "64": 43466.0,
+            "65": 40862.0,
+            "66": 45025.0,
+            "67": 40213.0,
+            "68": 40169.0,
+            "69": 40885.0,
+            "70": 45480.0,
+            "71": 41411.0,
+            "72": 40544.0,
+            "73": 45712.0,
+            "74": 34875.0,
+            "75": 39109.0,
+            "76": 45477.0,
+            "77": 45742.0,
+            "78": 47634.0,
+            "79": 48400.0,
+            "80": 46578.0,
+            "81": 50032.0,
+            "82": 49469.0,
+            "83": 45158.0,
+            "84": 45794.0,
+            "85": 49099.0,
+            "86": 45075.0,
+            "87": 49153.0,
+            "88": 47648.0,
+            "89": 49368.0,
+            "90": 49965.0,
+            "91": 44550.0,
+            "92": 46072.0,
+            "93": 46606.0,
+            "94": 47182.0,
+            "95": 47865.0,
+            "96": 50348.0,
+            "97": 46303.0,
+            "98": 49697.0,
+            "99": 48948.0,
+            "100": 44134.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 964398080.0,
+            "2": 964398592.0,
+            "3": 964397056.0,
+            "4": 964396032.0,
+            "5": 964397056.0,
+            "6": 964397568.0,
+            "7": 964397056.0,
+            "8": 964396544.0,
+            "9": 964396544.0,
+            "10": 964396032.0,
+            "11": 964397056.0,
+            "12": 964393984.0,
+            "13": 964396544.0,
+            "14": 964399104.0,
+            "15": 964396544.0,
+            "16": 964397568.0,
+            "17": 964399104.0,
+            "18": 964397568.0,
+            "19": 964397056.0,
+            "20": 964398080.0,
+            "21": 964397056.0,
+            "22": 964394496.0,
+            "23": 964396032.0,
+            "24": 964395520.0,
+            "25": 964395008.0,
+            "26": 964396032.0,
+            "27": 964397568.0,
+            "28": 964396032.0,
+            "29": 964398080.0,
+            "30": 964397056.0,
+            "31": 964390912.0,
+            "32": 965370880.0,
+            "33": 964398080.0,
+            "34": 964396544.0,
+            "35": 964395008.0,
+            "36": 964395008.0,
+            "37": 964396032.0,
+            "38": 964397056.0,
+            "39": 964397056.0,
+            "40": 964398080.0,
+            "41": 964390400.0,
+            "42": 964396032.0,
+            "43": 964393472.0,
+            "44": 964394496.0,
+            "45": 964396032.0,
+            "46": 964390912.0,
+            "47": 964396032.0,
+            "48": 964389888.0,
+            "49": 964392960.0,
+            "50": 964396032.0,
+            "51": 964395008.0,
+            "52": 964391936.0,
+            "53": 964392960.0,
+            "54": 964390912.0,
+            "55": 964390400.0,
+            "56": 964393984.0,
+            "57": 964384768.0,
+            "58": 964389888.0,
+            "59": 964388352.0,
+            "60": 964390912.0,
+            "61": 964396032.0,
+            "62": 964393472.0,
+            "63": 964391424.0,
+            "64": 964388864.0,
+            "65": 964380672.0,
+            "66": 964391936.0,
+            "67": 964391936.0,
+            "68": 964396032.0,
+            "69": 964390400.0,
+            "70": 964392448.0,
+            "71": 964392448.0,
+            "72": 964388352.0,
+            "73": 964390912.0,
+            "74": 964385792.0,
+            "75": 964396032.0,
+            "76": 964396544.0,
+            "77": 964395008.0,
+            "78": 964386816.0,
+            "79": 964391936.0,
+            "80": 964388864.0,
+            "81": 964390400.0,
+            "82": 964391936.0,
+            "83": 964390912.0,
+            "84": 964388352.0,
+            "85": 964391424.0,
+            "86": 964390912.0,
+            "87": 964393984.0,
+            "88": 964390400.0,
+            "89": 964391424.0,
+            "90": 964391936.0,
+            "91": 964391424.0,
+            "92": 964391424.0,
+            "93": 964391936.0,
+            "94": 964392448.0,
+            "95": 964391936.0,
+            "96": 964388352.0,
+            "97": 964390400.0,
+            "98": 964392448.0,
+            "99": 964390912.0,
+            "100": 964389888.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2512260096.0,
+            "2": 2777459200.0,
+            "3": 2777459200.0,
+            "4": 2777459200.0,
+            "5": 2777459200.0,
+            "6": 2778216960.0,
+            "7": 2778216960.0,
+            "8": 2778216960.0,
+            "9": 2778216960.0,
+            "10": 2778216960.0,
+            "11": 2778216960.0,
+            "12": 2778216960.0,
+            "13": 2778216960.0,
+            "14": 2778216960.0,
+            "15": 2778216960.0,
+            "16": 2778216960.0,
+            "17": 2778216960.0,
+            "18": 2778216960.0,
+            "19": 2778216960.0,
+            "20": 2778216960.0,
+            "21": 2778216960.0,
+            "22": 2778216960.0,
+            "23": 2778216960.0,
+            "24": 2778216960.0,
+            "25": 2778216960.0,
+            "26": 2778216960.0,
+            "27": 2778216960.0,
+            "28": 2778216960.0,
+            "29": 2778216960.0,
+            "30": 2778216960.0,
+            "31": 2778216960.0,
+            "32": 2778216960.0,
+            "33": 2778216960.0,
+            "34": 2778216960.0,
+            "35": 2778216960.0,
+            "36": 2778216960.0,
+            "37": 2778216960.0,
+            "38": 2778216960.0,
+            "39": 2778216960.0,
+            "40": 2778216960.0,
+            "41": 2778216960.0,
+            "42": 2778216960.0,
+            "43": 2778216960.0,
+            "44": 2778216960.0,
+            "45": 2778216960.0,
+            "46": 2778216960.0,
+            "47": 2778216960.0,
+            "48": 2778216960.0,
+            "49": 2778216960.0,
+            "50": 2778216960.0,
+            "51": 2778216960.0,
+            "52": 2778216960.0,
+            "53": 2778216960.0,
+            "54": 2778216960.0,
+            "55": 2778216960.0,
+            "56": 2778216960.0,
+            "57": 2778216960.0,
+            "58": 2778216960.0,
+            "59": 2778216960.0,
+            "60": 2778216960.0,
+            "61": 2778216960.0,
+            "62": 2778216960.0,
+            "63": 2778216960.0,
+            "64": 2778216960.0,
+            "65": 2778216960.0,
+            "66": 2778216960.0,
+            "67": 2778216960.0,
+            "68": 2778216960.0,
+            "69": 2778216960.0,
+            "70": 2778216960.0,
+            "71": 2778216960.0,
+            "72": 2778216960.0,
+            "73": 2778216960.0,
+            "74": 2778216960.0,
+            "75": 2778216960.0,
+            "76": 2778216960.0,
+            "77": 2778216960.0,
+            "78": 2778216960.0,
+            "79": 2778216960.0,
+            "80": 2778216960.0,
+            "81": 2778216960.0,
+            "82": 2778216960.0,
+            "83": 2778216960.0,
+            "84": 2778216960.0,
+            "85": 2778216960.0,
+            "86": 2778216960.0,
+            "87": 2778216960.0,
+            "88": 2778216960.0,
+            "89": 2778216960.0,
+            "90": 2778216960.0,
+            "91": 2778216960.0,
+            "92": 2778216960.0,
+            "93": 2778216960.0,
+            "94": 2778216960.0,
+            "95": 2778216960.0,
+            "96": 2778216960.0,
+            "97": 2778216960.0,
+            "98": 2778216960.0,
+            "99": 2778216960.0,
+            "100": 2778216960.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 6.33917,
+            "3": 0.2744,
+            "4": 0.25641,
+            "5": 0.24878,
+            "6": 0.24537,
+            "7": 0.2438,
+            "8": 0.23878,
+            "9": 0.24702,
+            "10": 0.23746,
+            "11": 0.23512,
+            "12": 0.22956,
+            "13": 0.2347,
+            "14": 0.23605,
+            "15": 0.24017,
+            "16": 0.23204,
+            "17": 0.2327,
+            "18": 0.23371,
+            "19": 0.23278,
+            "20": 0.23324,
+            "21": 0.2375,
+            "22": 0.2357,
+            "23": 0.23341,
+            "24": 0.23508,
+            "25": 0.23292,
+            "26": 0.23763,
+            "27": 0.23487,
+            "28": 0.23071,
+            "29": 0.23154,
+            "30": 0.23464,
+            "31": 0.23829,
+            "32": 0.22989,
+            "33": 0.23328,
+            "34": 0.23409,
+            "35": 0.23024,
+            "36": 0.23774,
+            "37": 0.23416,
+            "38": 0.23657,
+            "39": 0.23087,
+            "40": 0.23163,
+            "41": 0.23724,
+            "42": 0.23245,
+            "43": 0.23545,
+            "44": 0.23041,
+            "45": 0.23512,
+            "46": 0.23935,
+            "47": 0.23571,
+            "48": 0.2329,
+            "49": 0.25544,
+            "50": 0.22697,
+            "51": 0.27515,
+            "52": 0.69001,
+            "53": 0.24129,
+            "54": 0.23155,
+            "55": 0.24045,
+            "56": 0.24512,
+            "57": 0.24802,
+            "58": 0.23433,
+            "59": 0.3274,
+            "60": 0.23221,
+            "61": 0.23713,
+            "62": 0.24042,
+            "63": 0.25806,
+            "64": 0.2355,
+            "65": 0.27386,
+            "66": 0.68273,
+            "67": 0.30343,
+            "68": 0.26428,
+            "69": 0.25274,
+            "70": 0.24031,
+            "71": 0.25644,
+            "72": 0.24947,
+            "73": 0.2737,
+            "74": 0.26515,
+            "75": 0.25101,
+            "76": 0.27258,
+            "77": 0.65643,
+            "78": 0.25055,
+            "79": 0.26819,
+            "80": 0.24291,
+            "81": 0.24807,
+            "82": 0.24385,
+            "83": 0.24932,
+            "84": 0.24366,
+            "85": 0.25449,
+            "86": 0.28807,
+            "87": 0.25052,
+            "88": 0.25388,
+            "89": 0.24876,
+            "90": 0.24712,
+            "91": 0.27209,
+            "92": 0.25942,
+            "93": 0.26516,
+            "94": 0.27795,
+            "95": 0.25093,
+            "96": 0.58451,
+            "97": 0.26354,
+            "98": 0.24591,
+            "99": 0.2477,
+            "100": 0.24515
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
index d869313b50f..8f055dc00d7 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
@@ -6,104 +6,104 @@
         "values": {
             "1": 10.81565,
             "2": 10.81048,
-            "3": 10.8127,
-            "4": 10.79089,
-            "5": 10.83784,
-            "6": 10.85116,
-            "7": 10.82036,
-            "8": 10.82117,
-            "9": 10.83043,
-            "10": 10.78955,
-            "11": 10.86357,
-            "12": 10.84268,
-            "13": 10.85799,
-            "14": 10.86268,
-            "15": 10.80594,
-            "16": 10.80356,
-            "17": 10.77851,
-            "18": 10.80762,
-            "19": 10.79465,
-            "20": 10.747,
-            "21": 10.72249,
-            "22": 10.58742,
-            "23": 10.72933,
-            "24": 10.63238,
-            "25": 10.575,
-            "26": 10.638,
-            "27": 10.64966,
-            "28": 10.63496,
-            "29": 10.64307,
-            "30": 10.44635,
-            "31": 10.19441,
-            "32": 10.52449,
-            "33": 10.51815,
-            "34": 10.28843,
-            "35": 10.33138,
-            "36": 10.3123,
-            "37": 10.4265,
-            "38": 10.27866,
-            "39": 10.47612,
-            "40": 10.19821,
-            "41": 10.21536,
-            "42": 10.28769,
-            "43": 9.94235,
-            "44": 10.05775,
-            "45": 9.94354,
-            "46": 9.90902,
-            "47": 10.21214,
-            "48": 9.94982,
-            "49": 9.63605,
-            "50": 10.00335,
-            "51": 9.92304,
-            "52": 9.82779,
-            "53": 10.14656,
-            "54": 10.04338,
-            "55": 9.96311,
-            "56": 9.70508,
-            "57": 9.58542,
-            "58": 9.91687,
-            "59": 9.66061,
-            "60": 9.60393,
-            "61": 9.77855,
-            "62": 10.0624,
-            "63": 9.47205,
-            "64": 9.85428,
-            "65": 9.02467,
-            "66": 9.79454,
-            "67": 9.43333,
-            "68": 9.85327,
-            "69": 9.847,
-            "70": 9.81072,
-            "71": 9.684,
-            "72": 9.66023,
-            "73": 9.57314,
-            "74": 9.05973,
-            "75": 9.50551,
-            "76": 9.17942,
-            "77": 10.12761,
-            "78": 9.77438,
-            "79": 9.44209,
-            "80": 9.46747,
-            "81": 9.53873,
-            "82": 9.75725,
-            "83": 9.38702,
-            "84": 9.46662,
-            "85": 9.67918,
-            "86": 9.13556,
-            "87": 9.63426,
-            "88": 9.80794,
-            "89": 9.67925,
-            "90": 9.85561,
-            "91": 9.41267,
-            "92": 9.41773,
-            "93": 9.15396,
-            "94": 8.90227,
-            "95": 9.56526,
-            "96": 9.58425,
-            "97": 9.35836,
-            "98": 9.7302,
-            "99": 8.95917,
-            "100": 9.45408
+            "3": 10.81274,
+            "4": 10.79109,
+            "5": 10.838,
+            "6": 10.84998,
+            "7": 10.8209,
+            "8": 10.821,
+            "9": 10.83092,
+            "10": 10.78949,
+            "11": 10.86351,
+            "12": 10.84299,
+            "13": 10.85677,
+            "14": 10.86241,
+            "15": 10.8062,
+            "16": 10.80347,
+            "17": 10.77927,
+            "18": 10.80722,
+            "19": 10.79448,
+            "20": 10.74689,
+            "21": 10.72163,
+            "22": 10.58676,
+            "23": 10.72952,
+            "24": 10.63218,
+            "25": 10.57522,
+            "26": 10.63797,
+            "27": 10.64969,
+            "28": 10.63484,
+            "29": 10.64318,
+            "30": 10.44633,
+            "31": 10.19408,
+            "32": 10.5239,
+            "33": 10.51833,
+            "34": 10.28815,
+            "35": 10.33158,
+            "36": 10.31281,
+            "37": 10.42627,
+            "38": 10.27886,
+            "39": 10.47564,
+            "40": 10.19805,
+            "41": 10.21579,
+            "42": 10.28687,
+            "43": 9.942,
+            "44": 10.05731,
+            "45": 9.94351,
+            "46": 9.9088,
+            "47": 10.21222,
+            "48": 9.94969,
+            "49": 9.63645,
+            "50": 10.0035,
+            "51": 9.92297,
+            "52": 9.82832,
+            "53": 10.14635,
+            "54": 10.04348,
+            "55": 9.96283,
+            "56": 9.70531,
+            "57": 9.58566,
+            "58": 9.91703,
+            "59": 9.66041,
+            "60": 9.60398,
+            "61": 9.77842,
+            "62": 10.06249,
+            "63": 9.47211,
+            "64": 9.85381,
+            "65": 9.02443,
+            "66": 9.794,
+            "67": 9.43339,
+            "68": 9.85345,
+            "69": 9.84704,
+            "70": 9.81023,
+            "71": 9.68396,
+            "72": 9.66038,
+            "73": 9.57331,
+            "74": 9.06008,
+            "75": 9.50505,
+            "76": 9.17917,
+            "77": 10.12748,
+            "78": 9.77465,
+            "79": 9.44204,
+            "80": 9.46777,
+            "81": 9.53832,
+            "82": 9.75735,
+            "83": 9.38708,
+            "84": 9.46663,
+            "85": 9.67908,
+            "86": 9.13575,
+            "87": 9.6347,
+            "88": 9.80851,
+            "89": 9.67935,
+            "90": 9.85541,
+            "91": 9.4128,
+            "92": 9.41772,
+            "93": 9.15363,
+            "94": 8.90205,
+            "95": 9.56516,
+            "96": 9.58409,
+            "97": 9.35837,
+            "98": 9.72999,
+            "99": 8.95859,
+            "100": 9.45369
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 30991.0,
-            "2": 32927.0,
-            "3": 33481.0,
-            "4": 30866.0,
-            "5": 36255.0,
-            "6": 37186.0,
-            "7": 35644.0,
-            "8": 31356.0,
-            "9": 34832.0,
-            "10": 29855.0,
-            "11": 38396.0,
-            "12": 35164.0,
-            "13": 37118.0,
-            "14": 38011.0,
-            "15": 34458.0,
-            "16": 35843.0,
-            "17": 34836.0,
-            "18": 35149.0,
-            "19": 36044.0,
-            "20": 32823.0,
-            "21": 33340.0,
-            "22": 30040.0,
-            "23": 37733.0,
-            "24": 31992.0,
-            "25": 31045.0,
-            "26": 34280.0,
-            "27": 36064.0,
-            "28": 36993.0,
-            "29": 38087.0,
-            "30": 32689.0,
-            "31": 30361.0,
-            "32": 36050.0,
-            "33": 37627.0,
-            "34": 33149.0,
-            "35": 34316.0,
-            "36": 35026.0,
-            "37": 37852.0,
-            "38": 35490.0,
-            "39": 38325.0,
-            "40": 35730.0,
-            "41": 35890.0,
-            "42": 37811.0,
-            "43": 34239.0,
-            "44": 33282.0,
-            "45": 35354.0,
-            "46": 37112.0,
-            "47": 40323.0,
-            "48": 36296.0,
-            "49": 36098.0,
-            "50": 38996.0,
-            "51": 37187.0,
-            "52": 36798.0,
-            "53": 41385.0,
-            "54": 41151.0,
-            "55": 36715.0,
-            "56": 40382.0,
-            "57": 36942.0,
-            "58": 42415.0,
-            "59": 39138.0,
-            "60": 39766.0,
-            "61": 40532.0,
-            "62": 43919.0,
-            "63": 38747.0,
-            "64": 43509.0,
-            "65": 40794.0,
-            "66": 44093.0,
-            "67": 40369.0,
-            "68": 40509.0,
-            "69": 40728.0,
-            "70": 45431.0,
-            "71": 41117.0,
-            "72": 39982.0,
-            "73": 44758.0,
-            "74": 34170.0,
-            "75": 38601.0,
-            "76": 46113.0,
-            "77": 45621.0,
-            "78": 47007.0,
-            "79": 47410.0,
-            "80": 46647.0,
-            "81": 50449.0,
-            "82": 49494.0,
-            "83": 45080.0,
-            "84": 46331.0,
-            "85": 48470.0,
-            "86": 45870.0,
-            "87": 49138.0,
-            "88": 46357.0,
-            "89": 48274.0,
-            "90": 50049.0,
-            "91": 43937.0,
-            "92": 47318.0,
-            "93": 46654.0,
-            "94": 46515.0,
-            "95": 47167.0,
-            "96": 50587.0,
-            "97": 46623.0,
-            "98": 49830.0,
-            "99": 48092.0,
-            "100": 43643.0
+            "1": 30973.0,
+            "2": 32949.0,
+            "3": 33708.0,
+            "4": 30953.0,
+            "5": 35857.0,
+            "6": 36975.0,
+            "7": 35061.0,
+            "8": 31831.0,
+            "9": 34544.0,
+            "10": 29924.0,
+            "11": 38570.0,
+            "12": 34892.0,
+            "13": 37266.0,
+            "14": 37629.0,
+            "15": 34335.0,
+            "16": 36204.0,
+            "17": 35086.0,
+            "18": 35374.0,
+            "19": 36376.0,
+            "20": 32512.0,
+            "21": 33131.0,
+            "22": 30019.0,
+            "23": 37801.0,
+            "24": 32117.0,
+            "25": 31024.0,
+            "26": 34085.0,
+            "27": 36047.0,
+            "28": 36795.0,
+            "29": 37764.0,
+            "30": 32629.0,
+            "31": 30029.0,
+            "32": 36315.0,
+            "33": 37487.0,
+            "34": 33214.0,
+            "35": 34197.0,
+            "36": 34782.0,
+            "37": 38163.0,
+            "38": 35456.0,
+            "39": 38082.0,
+            "40": 35203.0,
+            "41": 35757.0,
+            "42": 37312.0,
+            "43": 34196.0,
+            "44": 33296.0,
+            "45": 35603.0,
+            "46": 36998.0,
+            "47": 40550.0,
+            "48": 36177.0,
+            "49": 36622.0,
+            "50": 38729.0,
+            "51": 37241.0,
+            "52": 36636.0,
+            "53": 41646.0,
+            "54": 41087.0,
+            "55": 36966.0,
+            "56": 40084.0,
+            "57": 37098.0,
+            "58": 42342.0,
+            "59": 39005.0,
+            "60": 40046.0,
+            "61": 40691.0,
+            "62": 43923.0,
+            "63": 38200.0,
+            "64": 43685.0,
+            "65": 41003.0,
+            "66": 44323.0,
+            "67": 40139.0,
+            "68": 40884.0,
+            "69": 40461.0,
+            "70": 45248.0,
+            "71": 41715.0,
+            "72": 40154.0,
+            "73": 44063.0,
+            "74": 33983.0,
+            "75": 38741.0,
+            "76": 46349.0,
+            "77": 45940.0,
+            "78": 46873.0,
+            "79": 47483.0,
+            "80": 46517.0,
+            "81": 50082.0,
+            "82": 49796.0,
+            "83": 45095.0,
+            "84": 46054.0,
+            "85": 48997.0,
+            "86": 45548.0,
+            "87": 49041.0,
+            "88": 46299.0,
+            "89": 48533.0,
+            "90": 49742.0,
+            "91": 43837.0,
+            "92": 47775.0,
+            "93": 46259.0,
+            "94": 45802.0,
+            "95": 47626.0,
+            "96": 50166.0,
+            "97": 47157.0,
+            "98": 50271.0,
+            "99": 47962.0,
+            "100": 43608.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1016564224.0,
-            "2": 1016563712.0,
-            "3": 1016564224.0,
-            "4": 1017172480.0,
-            "5": 1016564224.0,
-            "6": 1016565248.0,
-            "7": 1016564736.0,
-            "8": 1016565248.0,
-            "9": 1016562688.0,
-            "10": 1016564736.0,
-            "11": 1016562688.0,
-            "12": 1016565248.0,
-            "13": 1016564736.0,
-            "14": 1016564224.0,
-            "15": 1016564736.0,
-            "16": 1016562176.0,
-            "17": 1016564736.0,
-            "18": 1016565760.0,
-            "19": 1016563200.0,
-            "20": 1016563200.0,
-            "21": 1016564224.0,
-            "22": 1016566272.0,
-            "23": 1016564736.0,
-            "24": 1016564224.0,
-            "25": 1016564736.0,
-            "26": 1016562176.0,
-            "27": 1016563200.0,
-            "28": 1016562688.0,
-            "29": 1016562688.0,
-            "30": 1016566272.0,
-            "31": 1016569856.0,
-            "32": 1016564736.0,
-            "33": 1016564736.0,
-            "34": 1016565248.0,
-            "35": 1017459712.0,
-            "36": 1016565248.0,
-            "37": 1016565248.0,
-            "38": 1016564224.0,
-            "39": 1016562176.0,
-            "40": 1016565248.0,
-            "41": 1016567808.0,
-            "42": 1016564224.0,
-            "43": 1016568320.0,
-            "44": 1016565760.0,
-            "45": 1016565760.0,
-            "46": 1016570368.0,
-            "47": 1016565248.0,
-            "48": 1016569856.0,
-            "49": 1016568832.0,
-            "50": 1016565760.0,
-            "51": 1016566272.0,
-            "52": 1016574976.0,
-            "53": 1016567808.0,
-            "54": 1016566784.0,
-            "55": 1016569856.0,
-            "56": 1016565248.0,
-            "57": 1016574976.0,
-            "58": 1017110528.0,
-            "59": 1016574976.0,
-            "60": 1016571904.0,
-            "61": 1016567296.0,
-            "62": 1016565760.0,
-            "63": 1016576000.0,
-            "64": 1016572928.0,
-            "65": 1016585216.0,
-            "66": 1016568832.0,
-            "67": 1016569344.0,
-            "68": 1016566272.0,
-            "69": 1016569856.0,
-            "70": 1016569344.0,
-            "71": 1016566272.0,
-            "72": 1016571392.0,
-            "73": 1016572416.0,
-            "74": 1016577536.0,
-            "75": 1016567296.0,
-            "76": 1016565760.0,
-            "77": 1016566272.0,
-            "78": 1016572928.0,
-            "79": 1016568832.0,
-            "80": 1016572416.0,
-            "81": 1016570368.0,
-            "82": 1016571904.0,
-            "83": 1016568832.0,
-            "84": 1016573440.0,
-            "85": 1016575488.0,
-            "86": 1016574976.0,
-            "87": 1016568320.0,
-            "88": 1016816640.0,
-            "89": 1016577024.0,
-            "90": 1016569344.0,
-            "91": 1016566784.0,
-            "92": 1016566784.0,
-            "93": 1016569856.0,
-            "94": 1016571392.0,
-            "95": 1016567808.0,
-            "96": 1016566784.0,
-            "97": 1016573952.0,
-            "98": 1016565760.0,
-            "99": 1016577024.0,
-            "100": 1016574464.0
+            "1": 1014467072.0,
+            "2": 1014466560.0,
+            "3": 1014467072.0,
+            "4": 1014466560.0,
+            "5": 1014466560.0,
+            "6": 1014467584.0,
+            "7": 1014468608.0,
+            "8": 1014468096.0,
+            "9": 1014466048.0,
+            "10": 1014467584.0,
+            "11": 1014465536.0,
+            "12": 1014467072.0,
+            "13": 1014467072.0,
+            "14": 1014466048.0,
+            "15": 1015065088.0,
+            "16": 1014465024.0,
+            "17": 1014467072.0,
+            "18": 1014467072.0,
+            "19": 1014466560.0,
+            "20": 1014467072.0,
+            "21": 1014466560.0,
+            "22": 1014468608.0,
+            "23": 1014467584.0,
+            "24": 1014675456.0,
+            "25": 1014468096.0,
+            "26": 1014465536.0,
+            "27": 1014466048.0,
+            "28": 1014465024.0,
+            "29": 1014465536.0,
+            "30": 1014469120.0,
+            "31": 1014472192.0,
+            "32": 1014468096.0,
+            "33": 1014467584.0,
+            "34": 1014467072.0,
+            "35": 1014468096.0,
+            "36": 1014468096.0,
+            "37": 1014787072.0,
+            "38": 1014467584.0,
+            "39": 1014465024.0,
+            "40": 1015253504.0,
+            "41": 1014470144.0,
+            "42": 1014467584.0,
+            "43": 1014471168.0,
+            "44": 1014467584.0,
+            "45": 1014468608.0,
+            "46": 1014472704.0,
+            "47": 1014467584.0,
+            "48": 1014473216.0,
+            "49": 1014471168.0,
+            "50": 1014468608.0,
+            "51": 1014469120.0,
+            "52": 1014478336.0,
+            "53": 1014471168.0,
+            "54": 1014885888.0,
+            "55": 1014472192.0,
+            "56": 1014468096.0,
+            "57": 1014478336.0,
+            "58": 1014472704.0,
+            "59": 1014477312.0,
+            "60": 1014473728.0,
+            "61": 1014470656.0,
+            "62": 1014469632.0,
+            "63": 1014479360.0,
+            "64": 1014475264.0,
+            "65": 1015306240.0,
+            "66": 1014471680.0,
+            "67": 1014473216.0,
+            "68": 1014499840.0,
+            "69": 1014473728.0,
+            "70": 1014472192.0,
+            "71": 1014468608.0,
+            "72": 1014474752.0,
+            "73": 1014475264.0,
+            "74": 1014479872.0,
+            "75": 1014469632.0,
+            "76": 1014468096.0,
+            "77": 1014470144.0,
+            "78": 1014475776.0,
+            "79": 1014471680.0,
+            "80": 1014475264.0,
+            "81": 1014472704.0,
+            "82": 1014474752.0,
+            "83": 1014471680.0,
+            "84": 1014475776.0,
+            "85": 1014478336.0,
+            "86": 1014477824.0,
+            "87": 1014470144.0,
+            "88": 1014473728.0,
+            "89": 1014479872.0,
+            "90": 1014471168.0,
+            "91": 1014469120.0,
+            "92": 1014470656.0,
+            "93": 1014472704.0,
+            "94": 1014474752.0,
+            "95": 1014600704.0,
+            "96": 1014468096.0,
+            "97": 1014476800.0,
+            "98": 1014468608.0,
+            "99": 1014480384.0,
+            "100": 1014477312.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2560655872.0,
-            "2": 2827037696.0,
-            "3": 2827771392.0,
-            "4": 2828163584.0,
-            "5": 2828163584.0,
-            "6": 2828163584.0,
-            "7": 2829373440.0,
-            "8": 2829373440.0,
-            "9": 2829373440.0,
-            "10": 2829925376.0,
-            "11": 2829925376.0,
-            "12": 2829925376.0,
-            "13": 2829925376.0,
-            "14": 2829925376.0,
-            "15": 2830320640.0,
-            "16": 2830320640.0,
-            "17": 2830320640.0,
-            "18": 2830320640.0,
-            "19": 2830320640.0,
-            "20": 2830320640.0,
-            "21": 2830320640.0,
-            "22": 2830406144.0,
-            "23": 2830406144.0,
-            "24": 2830406144.0,
-            "25": 2830406144.0,
-            "26": 2830406144.0,
-            "27": 2830406144.0,
-            "28": 2830406144.0,
-            "29": 2830406144.0,
-            "30": 2831433216.0,
-            "31": 2836904960.0,
-            "32": 2836904960.0,
-            "33": 2836904960.0,
-            "34": 2836904960.0,
-            "35": 2836904960.0,
-            "36": 2836904960.0,
-            "37": 2836904960.0,
-            "38": 2836904960.0,
-            "39": 2836904960.0,
-            "40": 2836904960.0,
-            "41": 2836904960.0,
-            "42": 2836904960.0,
-            "43": 2836904960.0,
-            "44": 2836904960.0,
-            "45": 2836904960.0,
-            "46": 2837527040.0,
-            "47": 2837527040.0,
-            "48": 2837527040.0,
-            "49": 2837527040.0,
-            "50": 2837527040.0,
-            "51": 2837527040.0,
-            "52": 2844526592.0,
-            "53": 2844526592.0,
-            "54": 2844526592.0,
-            "55": 2844526592.0,
-            "56": 2844526592.0,
-            "57": 2845833216.0,
-            "58": 2845833216.0,
-            "59": 2845833216.0,
-            "60": 2845833216.0,
-            "61": 2845833216.0,
-            "62": 2845833216.0,
-            "63": 2847350784.0,
-            "64": 2847350784.0,
-            "65": 2859365376.0,
-            "66": 2859365376.0,
-            "67": 2859365376.0,
-            "68": 2859365376.0,
-            "69": 2859365376.0,
-            "70": 2859365376.0,
-            "71": 2859365376.0,
-            "72": 2859365376.0,
-            "73": 2859365376.0,
-            "74": 2859365376.0,
-            "75": 2859365376.0,
-            "76": 2859365376.0,
-            "77": 2859365376.0,
-            "78": 2859365376.0,
-            "79": 2859365376.0,
-            "80": 2859365376.0,
-            "81": 2859365376.0,
-            "82": 2859365376.0,
-            "83": 2859365376.0,
-            "84": 2859365376.0,
-            "85": 2859365376.0,
-            "86": 2859365376.0,
-            "87": 2859365376.0,
-            "88": 2859365376.0,
-            "89": 2859365376.0,
-            "90": 2859365376.0,
-            "91": 2859365376.0,
-            "92": 2859365376.0,
-            "93": 2859365376.0,
-            "94": 2859365376.0,
-            "95": 2859365376.0,
-            "96": 2859365376.0,
-            "97": 2859365376.0,
-            "98": 2859365376.0,
-            "99": 2859365376.0,
-            "100": 2859365376.0
+            "1": 2563003904.0,
+            "2": 2826423296.0,
+            "3": 2826423296.0,
+            "4": 2826423296.0,
+            "5": 2826423296.0,
+            "6": 2828489728.0,
+            "7": 2828489728.0,
+            "8": 2828489728.0,
+            "9": 2828489728.0,
+            "10": 2828489728.0,
+            "11": 2828489728.0,
+            "12": 2828489728.0,
+            "13": 2828489728.0,
+            "14": 2828489728.0,
+            "15": 2828489728.0,
+            "16": 2828489728.0,
+            "17": 2828489728.0,
+            "18": 2828489728.0,
+            "19": 2828489728.0,
+            "20": 2828489728.0,
+            "21": 2828489728.0,
+            "22": 2830208000.0,
+            "23": 2830208000.0,
+            "24": 2830208000.0,
+            "25": 2830208000.0,
+            "26": 2830208000.0,
+            "27": 2830208000.0,
+            "28": 2830208000.0,
+            "29": 2830208000.0,
+            "30": 2830208000.0,
+            "31": 2835122688.0,
+            "32": 2835122688.0,
+            "33": 2835122688.0,
+            "34": 2835122688.0,
+            "35": 2835122688.0,
+            "36": 2835122688.0,
+            "37": 2835122688.0,
+            "38": 2835122688.0,
+            "39": 2835122688.0,
+            "40": 2835122688.0,
+            "41": 2835122688.0,
+            "42": 2835122688.0,
+            "43": 2835122688.0,
+            "44": 2835122688.0,
+            "45": 2835122688.0,
+            "46": 2835122688.0,
+            "47": 2835122688.0,
+            "48": 2836012544.0,
+            "49": 2836012544.0,
+            "50": 2836012544.0,
+            "51": 2836012544.0,
+            "52": 2842577408.0,
+            "53": 2842577408.0,
+            "54": 2842577408.0,
+            "55": 2842577408.0,
+            "56": 2842577408.0,
+            "57": 2846367232.0,
+            "58": 2846367232.0,
+            "59": 2846367232.0,
+            "60": 2846367232.0,
+            "61": 2846367232.0,
+            "62": 2846367232.0,
+            "63": 2846367232.0,
+            "64": 2846367232.0,
+            "65": 2856796160.0,
+            "66": 2856796160.0,
+            "67": 2856796160.0,
+            "68": 2856796160.0,
+            "69": 2856796160.0,
+            "70": 2856796160.0,
+            "71": 2856796160.0,
+            "72": 2856796160.0,
+            "73": 2856796160.0,
+            "74": 2856796160.0,
+            "75": 2856796160.0,
+            "76": 2856796160.0,
+            "77": 2856796160.0,
+            "78": 2856796160.0,
+            "79": 2856796160.0,
+            "80": 2856796160.0,
+            "81": 2856796160.0,
+            "82": 2856796160.0,
+            "83": 2856796160.0,
+            "84": 2856796160.0,
+            "85": 2856796160.0,
+            "86": 2856796160.0,
+            "87": 2856796160.0,
+            "88": 2856796160.0,
+            "89": 2856796160.0,
+            "90": 2856796160.0,
+            "91": 2856796160.0,
+            "92": 2856796160.0,
+            "93": 2856796160.0,
+            "94": 2856796160.0,
+            "95": 2856796160.0,
+            "96": 2856796160.0,
+            "97": 2856796160.0,
+            "98": 2856796160.0,
+            "99": 2856796160.0,
+            "100": 2856796160.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 17.55161,
-            "2": 0.27584,
-            "3": 0.20906,
-            "4": 0.18821,
-            "5": 0.17883,
-            "6": 0.17484,
-            "7": 0.18214,
-            "8": 0.18025,
-            "9": 0.16785,
-            "10": 0.16718,
-            "11": 0.17122,
-            "12": 0.16341,
-            "13": 0.16356,
-            "14": 0.16447,
-            "15": 0.17469,
-            "16": 0.16231,
-            "17": 0.17002,
-            "18": 0.1621,
-            "19": 0.16543,
-            "20": 0.16097,
-            "21": 0.16113,
-            "22": 0.17866,
-            "23": 0.16939,
-            "24": 0.16784,
-            "25": 0.16322,
-            "26": 0.15752,
-            "27": 0.16042,
-            "28": 0.16296,
-            "29": 0.16022,
-            "30": 0.16569,
-            "31": 0.20634,
-            "32": 0.16627,
-            "33": 0.16203,
-            "34": 0.18965,
-            "35": 0.1656,
-            "36": 0.17227,
-            "37": 0.16394,
-            "38": 0.16364,
-            "39": 0.15966,
-            "40": 0.17482,
-            "41": 0.16992,
-            "42": 0.16079,
-            "43": 0.17541,
-            "44": 0.1626,
-            "45": 0.16436,
-            "46": 0.1838,
-            "47": 0.15773,
-            "48": 0.18504,
-            "49": 0.22116,
-            "50": 0.16497,
-            "51": 0.17193,
-            "52": 0.17228,
-            "53": 0.15999,
-            "54": 0.15946,
-            "55": 0.1611,
-            "56": 0.21983,
-            "57": 0.18423,
-            "58": 0.16229,
-            "59": 0.18268,
-            "60": 0.17406,
-            "61": 0.15956,
-            "62": 0.16172,
-            "63": 0.17465,
-            "64": 0.17307,
-            "65": 0.25477,
-            "66": 0.15926,
-            "67": 0.23477,
-            "68": 0.16872,
-            "69": 0.16094,
-            "70": 0.16631,
-            "71": 0.18552,
-            "72": 0.16728,
-            "73": 0.1889,
-            "74": 0.17586,
-            "75": 0.17577,
-            "76": 0.21503,
-            "77": 0.16576,
-            "78": 0.17284,
-            "79": 0.18166,
-            "80": 0.19235,
-            "81": 0.17347,
-            "82": 0.1597,
-            "83": 0.17024,
-            "84": 0.17843,
-            "85": 0.15917,
-            "86": 0.20315,
-            "87": 0.16523,
-            "88": 0.16367,
-            "89": 0.18499,
-            "90": 0.16286,
-            "91": 0.19025,
-            "92": 0.17186,
-            "93": 0.19123,
-            "94": 0.19378,
-            "95": 0.16849,
-            "96": 0.16781,
-            "97": 0.17705,
-            "98": 0.15729,
-            "99": 0.17119,
-            "100": 0.16
+            "1": 14.68238,
+            "2": 0.38712,
+            "3": 0.19949,
+            "4": 0.16868,
+            "5": 0.15278,
+            "6": 0.14858,
+            "7": 0.15754,
+            "8": 0.15132,
+            "9": 0.14692,
+            "10": 0.14516,
+            "11": 0.14033,
+            "12": 0.14161,
+            "13": 0.14186,
+            "14": 0.13624,
+            "15": 0.15371,
+            "16": 0.1395,
+            "17": 0.16083,
+            "18": 0.13717,
+            "19": 0.1421,
+            "20": 0.13767,
+            "21": 0.13643,
+            "22": 0.15072,
+            "23": 0.13944,
+            "24": 0.13522,
+            "25": 0.13454,
+            "26": 0.13493,
+            "27": 0.13514,
+            "28": 0.14174,
+            "29": 0.13479,
+            "30": 0.14261,
+            "31": 0.17426,
+            "32": 0.14571,
+            "33": 0.13803,
+            "34": 0.16399,
+            "35": 0.1389,
+            "36": 0.14089,
+            "37": 0.13701,
+            "38": 0.14212,
+            "39": 0.13299,
+            "40": 0.14907,
+            "41": 0.14239,
+            "42": 0.13978,
+            "43": 0.14469,
+            "44": 0.1344,
+            "45": 0.14546,
+            "46": 0.16258,
+            "47": 0.14403,
+            "48": 0.15688,
+            "49": 0.20655,
+            "50": 0.13686,
+            "51": 0.16635,
+            "52": 0.15085,
+            "53": 0.54128,
+            "54": 0.13812,
+            "55": 0.14612,
+            "56": 0.20029,
+            "57": 0.15601,
+            "58": 0.15373,
+            "59": 0.15883,
+            "60": 0.15348,
+            "61": 0.13897,
+            "62": 0.14293,
+            "63": 0.15882,
+            "64": 0.15023,
+            "65": 0.21706,
+            "66": 0.14405,
+            "67": 0.20424,
+            "68": 0.15367,
+            "69": 0.14298,
+            "70": 0.14311,
+            "71": 0.16751,
+            "72": 0.15144,
+            "73": 0.17862,
+            "74": 0.15928,
+            "75": 0.15132,
+            "76": 0.18706,
+            "77": 0.14118,
+            "78": 0.14807,
+            "79": 0.15437,
+            "80": 0.15794,
+            "81": 0.14257,
+            "82": 0.13828,
+            "83": 0.15021,
+            "84": 0.14886,
+            "85": 0.14363,
+            "86": 0.19012,
+            "87": 0.14052,
+            "88": 0.14621,
+            "89": 0.15591,
+            "90": 0.1453,
+            "91": 0.17378,
+            "92": 0.16177,
+            "93": 0.18337,
+            "94": 0.18449,
+            "95": 0.14789,
+            "96": 0.14329,
+            "97": 0.15465,
+            "98": 0.14162,
+            "99": 0.14792,
+            "100": 0.14082
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..615b1b90939
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.92297,
+            "52": 9.8284,
+            "53": 10.14624,
+            "54": 10.04331,
+            "55": 9.96248,
+            "56": 9.70547,
+            "57": 9.58553,
+            "58": 9.91673,
+            "59": 9.66059,
+            "60": 9.60402,
+            "61": 9.77812,
+            "62": 10.06258,
+            "63": 9.47179,
+            "64": 9.85361,
+            "65": 9.02415,
+            "66": 9.79391,
+            "67": 9.43341,
+            "68": 9.85341,
+            "69": 9.84716,
+            "70": 9.81035,
+            "71": 9.68402,
+            "72": 9.65988,
+            "73": 9.57308,
+            "74": 9.05997,
+            "75": 9.50561,
+            "76": 9.17936,
+            "77": 10.12733,
+            "78": 9.77475,
+            "79": 9.44198,
+            "80": 9.46754,
+            "81": 9.53859,
+            "82": 9.75755,
+            "83": 9.38709,
+            "84": 9.46679,
+            "85": 9.67903,
+            "86": 9.1356,
+            "87": 9.63439,
+            "88": 9.80841,
+            "89": 9.67922,
+            "90": 9.8555,
+            "91": 9.41299,
+            "92": 9.41796,
+            "93": 9.15357,
+            "94": 8.90198,
+            "95": 9.56514,
+            "96": 9.58401,
+            "97": 9.35865,
+            "98": 9.73028,
+            "99": 8.95871,
+            "100": 9.45412
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 37309.0,
+            "52": 36703.0,
+            "53": 41651.0,
+            "54": 41063.0,
+            "55": 36785.0,
+            "56": 40238.0,
+            "57": 36695.0,
+            "58": 42135.0,
+            "59": 39294.0,
+            "60": 39482.0,
+            "61": 40661.0,
+            "62": 44026.0,
+            "63": 38069.0,
+            "64": 43162.0,
+            "65": 40823.0,
+            "66": 44305.0,
+            "67": 40571.0,
+            "68": 40330.0,
+            "69": 40479.0,
+            "70": 45305.0,
+            "71": 41317.0,
+            "72": 39952.0,
+            "73": 44530.0,
+            "74": 34138.0,
+            "75": 38838.0,
+            "76": 46191.0,
+            "77": 45788.0,
+            "78": 47368.0,
+            "79": 47694.0,
+            "80": 46540.0,
+            "81": 50541.0,
+            "82": 49391.0,
+            "83": 45041.0,
+            "84": 46205.0,
+            "85": 49075.0,
+            "86": 45491.0,
+            "87": 49629.0,
+            "88": 46513.0,
+            "89": 48672.0,
+            "90": 49752.0,
+            "91": 44036.0,
+            "92": 47292.0,
+            "93": 46999.0,
+            "94": 46286.0,
+            "95": 46691.0,
+            "96": 50402.0,
+            "97": 47195.0,
+            "98": 49883.0,
+            "99": 48365.0,
+            "100": 43445.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1015977472.0,
+            "52": 1015986176.0,
+            "53": 1015979520.0,
+            "54": 1015978496.0,
+            "55": 1015981056.0,
+            "56": 1015976448.0,
+            "57": 1015986688.0,
+            "58": 1015981056.0,
+            "59": 1015985152.0,
+            "60": 1015982592.0,
+            "61": 1015979008.0,
+            "62": 1015977984.0,
+            "63": 1015987712.0,
+            "64": 1015983616.0,
+            "65": 1015994880.0,
+            "66": 1015980032.0,
+            "67": 1015981568.0,
+            "68": 1015977984.0,
+            "69": 1015982080.0,
+            "70": 1016161280.0,
+            "71": 1015979008.0,
+            "72": 1015982080.0,
+            "73": 1015984128.0,
+            "74": 1015988736.0,
+            "75": 1015978496.0,
+            "76": 1015976448.0,
+            "77": 1015979520.0,
+            "78": 1015984640.0,
+            "79": 1015979520.0,
+            "80": 1015983616.0,
+            "81": 1015981568.0,
+            "82": 1015983104.0,
+            "83": 1015980032.0,
+            "84": 1015984128.0,
+            "85": 1015986688.0,
+            "86": 1015986688.0,
+            "87": 1015980032.0,
+            "88": 1015981568.0,
+            "89": 1015988736.0,
+            "90": 1015980544.0,
+            "91": 1015977984.0,
+            "92": 1016114176.0,
+            "93": 1015981056.0,
+            "94": 1015982080.0,
+            "95": 1015979008.0,
+            "96": 1015976960.0,
+            "97": 1015984640.0,
+            "98": 1015977472.0,
+            "99": 1015988224.0,
+            "100": 1015985664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2935420416.0,
+            "52": 2935420416.0,
+            "53": 2935420416.0,
+            "54": 2935420416.0,
+            "55": 2935420416.0,
+            "56": 2935420416.0,
+            "57": 2935420416.0,
+            "58": 2935420416.0,
+            "59": 2935420416.0,
+            "60": 2935420416.0,
+            "61": 2935420416.0,
+            "62": 2935420416.0,
+            "63": 2935420416.0,
+            "64": 2935420416.0,
+            "65": 2935420416.0,
+            "66": 2935420416.0,
+            "67": 2935420416.0,
+            "68": 2935420416.0,
+            "69": 2935420416.0,
+            "70": 2935420416.0,
+            "71": 2935420416.0,
+            "72": 2935420416.0,
+            "73": 2935420416.0,
+            "74": 2935420416.0,
+            "75": 2935420416.0,
+            "76": 2935420416.0,
+            "77": 2935420416.0,
+            "78": 2935420416.0,
+            "79": 2935420416.0,
+            "80": 2935420416.0,
+            "81": 2935420416.0,
+            "82": 2935420416.0,
+            "83": 2935420416.0,
+            "84": 2935420416.0,
+            "85": 2935420416.0,
+            "86": 2935420416.0,
+            "87": 2935420416.0,
+            "88": 2935420416.0,
+            "89": 2935420416.0,
+            "90": 2935420416.0,
+            "91": 2935420416.0,
+            "92": 2935420416.0,
+            "93": 2935420416.0,
+            "94": 2935420416.0,
+            "95": 2935420416.0,
+            "96": 2935420416.0,
+            "97": 2935420416.0,
+            "98": 2935420416.0,
+            "99": 2935420416.0,
+            "100": 2935420416.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 13.36069,
+            "52": 0.28674,
+            "53": 0.19891,
+            "54": 0.20156,
+            "55": 0.1819,
+            "56": 0.25306,
+            "57": 0.18921,
+            "58": 0.16571,
+            "59": 0.18603,
+            "60": 0.18108,
+            "61": 0.16054,
+            "62": 0.15396,
+            "63": 0.17162,
+            "64": 0.17605,
+            "65": 0.23651,
+            "66": 0.15684,
+            "67": 0.24234,
+            "68": 0.16737,
+            "69": 0.1644,
+            "70": 0.17023,
+            "71": 0.18887,
+            "72": 0.17787,
+            "73": 0.17972,
+            "74": 0.17258,
+            "75": 0.16961,
+            "76": 0.17324,
+            "77": 0.16212,
+            "78": 0.16629,
+            "79": 0.15673,
+            "80": 0.17244,
+            "81": 0.15957,
+            "82": 0.14913,
+            "83": 0.15131,
+            "84": 0.16274,
+            "85": 0.1686,
+            "86": 0.19415,
+            "87": 0.15249,
+            "88": 0.14449,
+            "89": 0.16305,
+            "90": 0.13988,
+            "91": 0.17343,
+            "92": 0.15546,
+            "93": 0.15914,
+            "94": 0.19609,
+            "95": 0.14746,
+            "96": 0.1437,
+            "97": 0.1637,
+            "98": 0.14571,
+            "99": 0.15931,
+            "100": 0.14229
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..3dd007cc9ec
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.7915,
+            "2": 10.8072,
+            "3": 10.79214,
+            "4": 10.77571,
+            "5": 10.82495,
+            "6": 10.83191,
+            "7": 10.82037,
+            "8": 10.81565,
+            "9": 10.81976,
+            "10": 10.7695,
+            "11": 10.8454,
+            "12": 10.83063,
+            "13": 10.83651,
+            "14": 10.85696,
+            "15": 10.80631,
+            "16": 10.78763,
+            "17": 10.75856,
+            "18": 10.79234,
+            "19": 10.78331,
+            "20": 10.73181,
+            "21": 10.71017,
+            "22": 10.57574,
+            "23": 10.71599,
+            "24": 10.62049,
+            "25": 10.58266,
+            "26": 10.61764,
+            "27": 10.65105,
+            "28": 10.63303,
+            "29": 10.63022,
+            "30": 10.44254,
+            "31": 10.20049,
+            "32": 10.52014,
+            "33": 10.50814,
+            "34": 10.29535,
+            "35": 10.33643,
+            "36": 10.30247,
+            "37": 10.41766,
+            "38": 10.28067,
+            "39": 10.46149,
+            "40": 10.18213,
+            "41": 10.21349,
+            "42": 10.28426,
+            "43": 9.9557,
+            "44": 10.05793,
+            "45": 9.9574,
+            "46": 9.93571,
+            "47": 10.22719,
+            "48": 9.96561,
+            "49": 9.66581,
+            "50": 10.00922,
+            "51": 9.94826,
+            "52": 9.84653,
+            "53": 10.14876,
+            "54": 10.03737,
+            "55": 9.97454,
+            "56": 9.71384,
+            "57": 9.5955,
+            "58": 9.92044,
+            "59": 9.67604,
+            "60": 9.61264,
+            "61": 9.79194,
+            "62": 10.05699,
+            "63": 9.47838,
+            "64": 9.84479,
+            "65": 9.03861,
+            "66": 9.78386,
+            "67": 9.43595,
+            "68": 9.85188,
+            "69": 9.84445,
+            "70": 9.79288,
+            "71": 9.69163,
+            "72": 9.64893,
+            "73": 9.55502,
+            "74": 9.04736,
+            "75": 9.49186,
+            "76": 9.17766,
+            "77": 10.11289,
+            "78": 9.7687,
+            "79": 9.43966,
+            "80": 9.45416,
+            "81": 9.53142,
+            "82": 9.7541,
+            "83": 9.38201,
+            "84": 9.46121,
+            "85": 9.66928,
+            "86": 9.13531,
+            "87": 9.63413,
+            "88": 9.8011,
+            "89": 9.66658,
+            "90": 9.86173,
+            "91": 9.39963,
+            "92": 9.41066,
+            "93": 9.14665,
+            "94": 8.8869,
+            "95": 9.56959,
+            "96": 9.57609,
+            "97": 9.34309,
+            "98": 9.72749,
+            "99": 8.96222,
+            "100": 9.44903
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 31217.0,
+            "2": 33106.0,
+            "3": 33596.0,
+            "4": 31946.0,
+            "5": 36783.0,
+            "6": 37252.0,
+            "7": 35314.0,
+            "8": 31970.0,
+            "9": 34937.0,
+            "10": 29900.0,
+            "11": 38039.0,
+            "12": 34886.0,
+            "13": 37108.0,
+            "14": 37755.0,
+            "15": 35069.0,
+            "16": 36687.0,
+            "17": 34887.0,
+            "18": 35219.0,
+            "19": 35710.0,
+            "20": 32682.0,
+            "21": 33456.0,
+            "22": 30216.0,
+            "23": 37780.0,
+            "24": 32298.0,
+            "25": 30789.0,
+            "26": 34549.0,
+            "27": 35611.0,
+            "28": 36806.0,
+            "29": 37955.0,
+            "30": 32950.0,
+            "31": 30468.0,
+            "32": 36291.0,
+            "33": 37916.0,
+            "34": 32820.0,
+            "35": 34371.0,
+            "36": 34957.0,
+            "37": 38282.0,
+            "38": 35878.0,
+            "39": 38974.0,
+            "40": 36048.0,
+            "41": 35988.0,
+            "42": 37320.0,
+            "43": 33909.0,
+            "44": 33889.0,
+            "45": 35577.0,
+            "46": 37076.0,
+            "47": 40966.0,
+            "48": 35327.0,
+            "49": 34682.0,
+            "50": 39871.0,
+            "51": 36802.0,
+            "52": 36445.0,
+            "53": 41968.0,
+            "54": 40797.0,
+            "55": 36920.0,
+            "56": 40345.0,
+            "57": 36961.0,
+            "58": 41622.0,
+            "59": 37988.0,
+            "60": 40534.0,
+            "61": 40456.0,
+            "62": 43543.0,
+            "63": 37438.0,
+            "64": 42659.0,
+            "65": 39924.0,
+            "66": 44122.0,
+            "67": 40136.0,
+            "68": 40005.0,
+            "69": 41675.0,
+            "70": 45011.0,
+            "71": 40746.0,
+            "72": 41647.0,
+            "73": 44080.0,
+            "74": 35412.0,
+            "75": 39478.0,
+            "76": 46254.0,
+            "77": 44764.0,
+            "78": 47985.0,
+            "79": 48646.0,
+            "80": 46686.0,
+            "81": 50102.0,
+            "82": 50188.0,
+            "83": 44717.0,
+            "84": 46114.0,
+            "85": 49347.0,
+            "86": 45770.0,
+            "87": 49671.0,
+            "88": 46449.0,
+            "89": 49666.0,
+            "90": 51087.0,
+            "91": 45827.0,
+            "92": 48163.0,
+            "93": 46547.0,
+            "94": 47562.0,
+            "95": 48540.0,
+            "96": 50182.0,
+            "97": 46055.0,
+            "98": 50271.0,
+            "99": 48494.0,
+            "100": 45373.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 892874752.0,
+            "2": 892866560.0,
+            "3": 892869120.0,
+            "4": 892876800.0,
+            "5": 892869120.0,
+            "6": 892870656.0,
+            "7": 892874240.0,
+            "8": 892868608.0,
+            "9": 892869632.0,
+            "10": 892868608.0,
+            "11": 892869632.0,
+            "12": 892867072.0,
+            "13": 892872192.0,
+            "14": 892873216.0,
+            "15": 892870656.0,
+            "16": 892868608.0,
+            "17": 892879360.0,
+            "18": 892867072.0,
+            "19": 892870656.0,
+            "20": 892867072.0,
+            "21": 892871168.0,
+            "22": 892874752.0,
+            "23": 892877824.0,
+            "24": 892869120.0,
+            "25": 892877312.0,
+            "26": 892873216.0,
+            "27": 892865024.0,
+            "28": 892870144.0,
+            "29": 892869632.0,
+            "30": 892871680.0,
+            "31": 892881920.0,
+            "32": 892874752.0,
+            "33": 892870144.0,
+            "34": 892872192.0,
+            "35": 892874240.0,
+            "36": 892869632.0,
+            "37": 892868096.0,
+            "38": 892867072.0,
+            "39": 892871168.0,
+            "40": 892869120.0,
+            "41": 892873728.0,
+            "42": 892868608.0,
+            "43": 892871168.0,
+            "44": 892871680.0,
+            "45": 892869632.0,
+            "46": 892876800.0,
+            "47": 892869632.0,
+            "48": 892875264.0,
+            "49": 892872704.0,
+            "50": 892869120.0,
+            "51": 892872192.0,
+            "52": 892875776.0,
+            "53": 892868096.0,
+            "54": 892872192.0,
+            "55": 892867072.0,
+            "56": 892865024.0,
+            "57": 892876288.0,
+            "58": 892869120.0,
+            "59": 892871680.0,
+            "60": 892869120.0,
+            "61": 892869120.0,
+            "62": 892869632.0,
+            "63": 892870656.0,
+            "64": 892865536.0,
+            "65": 892872192.0,
+            "66": 892864512.0,
+            "67": 892862464.0,
+            "68": 892867584.0,
+            "69": 892861952.0,
+            "70": 892867072.0,
+            "71": 892870656.0,
+            "72": 892862464.0,
+            "73": 892861440.0,
+            "74": 892849664.0,
+            "75": 892868096.0,
+            "76": 892869632.0,
+            "77": 892868096.0,
+            "78": 892859392.0,
+            "79": 892865024.0,
+            "80": 892855296.0,
+            "81": 892856320.0,
+            "82": 892860416.0,
+            "83": 892869632.0,
+            "84": 892852736.0,
+            "85": 892871680.0,
+            "86": 892861952.0,
+            "87": 892869120.0,
+            "88": 892869632.0,
+            "89": 892859392.0,
+            "90": 892867072.0,
+            "91": 892865536.0,
+            "92": 892865536.0,
+            "93": 892861440.0,
+            "94": 892860928.0,
+            "95": 892869120.0,
+            "96": 892866560.0,
+            "97": 892856320.0,
+            "98": 892869120.0,
+            "99": 892864512.0,
+            "100": 892864000.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1948027904.0,
+            "2": 2183897088.0,
+            "3": 2184431104.0,
+            "4": 2190589952.0,
+            "5": 2190589952.0,
+            "6": 2190589952.0,
+            "7": 2190589952.0,
+            "8": 2190589952.0,
+            "9": 2190589952.0,
+            "10": 2190589952.0,
+            "11": 2190589952.0,
+            "12": 2190589952.0,
+            "13": 2190589952.0,
+            "14": 2190589952.0,
+            "15": 2190589952.0,
+            "16": 2190589952.0,
+            "17": 2194543104.0,
+            "18": 2194543104.0,
+            "19": 2194543104.0,
+            "20": 2194543104.0,
+            "21": 2194543104.0,
+            "22": 2194543104.0,
+            "23": 2194543104.0,
+            "24": 2194543104.0,
+            "25": 2194543104.0,
+            "26": 2194543104.0,
+            "27": 2194543104.0,
+            "28": 2194543104.0,
+            "29": 2194543104.0,
+            "30": 2194543104.0,
+            "31": 2195852288.0,
+            "32": 2195852288.0,
+            "33": 2195852288.0,
+            "34": 2195852288.0,
+            "35": 2195852288.0,
+            "36": 2195852288.0,
+            "37": 2195852288.0,
+            "38": 2195852288.0,
+            "39": 2195852288.0,
+            "40": 2195852288.0,
+            "41": 2195852288.0,
+            "42": 2195852288.0,
+            "43": 2195852288.0,
+            "44": 2195852288.0,
+            "45": 2195852288.0,
+            "46": 2195852288.0,
+            "47": 2195852288.0,
+            "48": 2195852288.0,
+            "49": 2195852288.0,
+            "50": 2195852288.0,
+            "51": 2195852288.0,
+            "52": 2195852288.0,
+            "53": 2195852288.0,
+            "54": 2195852288.0,
+            "55": 2195852288.0,
+            "56": 2195852288.0,
+            "57": 2195852288.0,
+            "58": 2195852288.0,
+            "59": 2195852288.0,
+            "60": 2195852288.0,
+            "61": 2195852288.0,
+            "62": 2195852288.0,
+            "63": 2195852288.0,
+            "64": 2195852288.0,
+            "65": 2195852288.0,
+            "66": 2195852288.0,
+            "67": 2195852288.0,
+            "68": 2195852288.0,
+            "69": 2195852288.0,
+            "70": 2195852288.0,
+            "71": 2195852288.0,
+            "72": 2195852288.0,
+            "73": 2195852288.0,
+            "74": 2195852288.0,
+            "75": 2195852288.0,
+            "76": 2195852288.0,
+            "77": 2195852288.0,
+            "78": 2195852288.0,
+            "79": 2195852288.0,
+            "80": 2195852288.0,
+            "81": 2195852288.0,
+            "82": 2195852288.0,
+            "83": 2195852288.0,
+            "84": 2195852288.0,
+            "85": 2195852288.0,
+            "86": 2195852288.0,
+            "87": 2195852288.0,
+            "88": 2195852288.0,
+            "89": 2195852288.0,
+            "90": 2195852288.0,
+            "91": 2195852288.0,
+            "92": 2195852288.0,
+            "93": 2195852288.0,
+            "94": 2195852288.0,
+            "95": 2195852288.0,
+            "96": 2195852288.0,
+            "97": 2195852288.0,
+            "98": 2195852288.0,
+            "99": 2195852288.0,
+            "100": 2195852288.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.86481,
+            "3": 0.3588,
+            "4": 0.35276,
+            "5": 0.33575,
+            "6": 0.3344,
+            "7": 0.3406,
+            "8": 0.33551,
+            "9": 0.33157,
+            "10": 0.32814,
+            "11": 0.32882,
+            "12": 0.3298,
+            "13": 0.32887,
+            "14": 0.32898,
+            "15": 0.33409,
+            "16": 0.32679,
+            "17": 0.34317,
+            "18": 0.33153,
+            "19": 0.32828,
+            "20": 0.33077,
+            "21": 0.32713,
+            "22": 0.32603,
+            "23": 0.32819,
+            "24": 0.33158,
+            "25": 0.32832,
+            "26": 0.32593,
+            "27": 0.33086,
+            "28": 0.32481,
+            "29": 0.32607,
+            "30": 0.33032,
+            "31": 0.33561,
+            "32": 0.33149,
+            "33": 0.32643,
+            "34": 0.34262,
+            "35": 0.32889,
+            "36": 0.32749,
+            "37": 0.32097,
+            "38": 0.33036,
+            "39": 0.69454,
+            "40": 0.33723,
+            "41": 0.3284,
+            "42": 0.32735,
+            "43": 0.33334,
+            "44": 0.3333,
+            "45": 0.33315,
+            "46": 0.33505,
+            "47": 0.32976,
+            "48": 0.32918,
+            "49": 0.34661,
+            "50": 0.32681,
+            "51": 0.3427,
+            "52": 0.3299,
+            "53": 0.32454,
+            "54": 0.3251,
+            "55": 0.32968,
+            "56": 0.34696,
+            "57": 0.33819,
+            "58": 0.32649,
+            "59": 0.3341,
+            "60": 0.33324,
+            "61": 0.33925,
+            "62": 0.33532,
+            "63": 0.34334,
+            "64": 0.34963,
+            "65": 0.38392,
+            "66": 0.33805,
+            "67": 0.3728,
+            "68": 0.33745,
+            "69": 0.33504,
+            "70": 0.33581,
+            "71": 0.35385,
+            "72": 0.34934,
+            "73": 0.34952,
+            "74": 0.35756,
+            "75": 0.35105,
+            "76": 0.34933,
+            "77": 0.33518,
+            "78": 0.34556,
+            "79": 0.34603,
+            "80": 0.36355,
+            "81": 0.34186,
+            "82": 0.34271,
+            "83": 0.39765,
+            "84": 0.36927,
+            "85": 0.33938,
+            "86": 0.35142,
+            "87": 0.34329,
+            "88": 0.33135,
+            "89": 0.34535,
+            "90": 0.33856,
+            "91": 0.3522,
+            "92": 0.33934,
+            "93": 0.38169,
+            "94": 0.36358,
+            "95": 0.33846,
+            "96": 0.33554,
+            "97": 0.34438,
+            "98": 0.32586,
+            "99": 0.43185,
+            "100": 0.33974
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
index c598c8c5c86..64a0d3b0293 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 892864512.0,
-            "2": 892868608.0,
-            "3": 892868608.0,
-            "4": 892864512.0,
-            "5": 892865024.0,
-            "6": 892866560.0,
-            "7": 892866048.0,
-            "8": 892867584.0,
-            "9": 892865536.0,
-            "10": 892867584.0,
-            "11": 892866048.0,
-            "12": 892865536.0,
-            "13": 892865536.0,
-            "14": 892868096.0,
-            "15": 892867584.0,
-            "16": 892867072.0,
-            "17": 892867584.0,
-            "18": 892869632.0,
-            "19": 892868096.0,
-            "20": 892866560.0,
-            "21": 892866560.0,
-            "22": 892863488.0,
-            "23": 892864512.0,
-            "24": 892867072.0,
-            "25": 892863488.0,
-            "26": 892866560.0,
-            "27": 892867072.0,
-            "28": 892865536.0,
-            "29": 892866048.0,
-            "30": 892863488.0,
-            "31": 892862464.0,
-            "32": 892861952.0,
-            "33": 892866048.0,
-            "34": 892865536.0,
-            "35": 892865024.0,
-            "36": 892868608.0,
-            "37": 892867072.0,
-            "38": 892866560.0,
-            "39": 892866048.0,
-            "40": 892867072.0,
-            "41": 892865536.0,
-            "42": 892867584.0,
-            "43": 892861440.0,
-            "44": 892862976.0,
-            "45": 892865024.0,
-            "46": 892864512.0,
-            "47": 892865024.0,
-            "48": 892861440.0,
-            "49": 892863488.0,
-            "50": 892867072.0,
-            "51": 892860416.0,
-            "52": 892858880.0,
-            "53": 892861440.0,
-            "54": 892861440.0,
-            "55": 892862464.0,
-            "56": 892865024.0,
-            "57": 892857344.0,
-            "58": 892859392.0,
-            "59": 892858880.0,
-            "60": 892859904.0,
-            "61": 892868608.0,
-            "62": 892865536.0,
-            "63": 892861952.0,
-            "64": 892863488.0,
-            "65": 892851712.0,
-            "66": 892866048.0,
-            "67": 892861440.0,
-            "68": 892868608.0,
-            "69": 892864512.0,
-            "70": 892866560.0,
-            "71": 892868608.0,
-            "72": 892860416.0,
-            "73": 892868096.0,
-            "74": 892858368.0,
-            "75": 892867072.0,
-            "76": 892866560.0,
-            "77": 892867072.0,
-            "78": 892863488.0,
-            "79": 892864512.0,
-            "80": 892864512.0,
-            "81": 892866048.0,
-            "82": 892864000.0,
-            "83": 892860928.0,
-            "84": 892861440.0,
-            "85": 892861952.0,
-            "86": 892861440.0,
-            "87": 892870144.0,
-            "88": 892862464.0,
-            "89": 892864512.0,
-            "90": 892866048.0,
-            "91": 892867072.0,
-            "92": 892865536.0,
-            "93": 892868608.0,
-            "94": 892864512.0,
-            "95": 892865024.0,
-            "96": 892865024.0,
-            "97": 892862976.0,
-            "98": 892867584.0,
-            "99": 892859904.0,
-            "100": 892861952.0
+            "1": 892865536.0,
+            "2": 892869632.0,
+            "3": 892869632.0,
+            "4": 892865536.0,
+            "5": 892866048.0,
+            "6": 892867584.0,
+            "7": 892867072.0,
+            "8": 892868608.0,
+            "9": 892866560.0,
+            "10": 892868608.0,
+            "11": 892867072.0,
+            "12": 892866560.0,
+            "13": 892866560.0,
+            "14": 892869120.0,
+            "15": 892868608.0,
+            "16": 892868096.0,
+            "17": 892868608.0,
+            "18": 892870656.0,
+            "19": 892869120.0,
+            "20": 892867584.0,
+            "21": 892867584.0,
+            "22": 892864512.0,
+            "23": 892865536.0,
+            "24": 892868096.0,
+            "25": 892864512.0,
+            "26": 892867584.0,
+            "27": 892868096.0,
+            "28": 892866560.0,
+            "29": 892867072.0,
+            "30": 892864512.0,
+            "31": 892863488.0,
+            "32": 892862976.0,
+            "33": 892867072.0,
+            "34": 892866560.0,
+            "35": 892866048.0,
+            "36": 892869632.0,
+            "37": 892868096.0,
+            "38": 892867584.0,
+            "39": 892867072.0,
+            "40": 892868096.0,
+            "41": 892866560.0,
+            "42": 892868608.0,
+            "43": 892862464.0,
+            "44": 892864000.0,
+            "45": 892866048.0,
+            "46": 892865536.0,
+            "47": 892866048.0,
+            "48": 892862464.0,
+            "49": 892864512.0,
+            "50": 892868096.0,
+            "51": 892861440.0,
+            "52": 892859904.0,
+            "53": 892862464.0,
+            "54": 892862464.0,
+            "55": 892863488.0,
+            "56": 892866048.0,
+            "57": 892858368.0,
+            "58": 892860416.0,
+            "59": 892859904.0,
+            "60": 892860928.0,
+            "61": 892869632.0,
+            "62": 892866560.0,
+            "63": 892862976.0,
+            "64": 892864512.0,
+            "65": 892852736.0,
+            "66": 892867072.0,
+            "67": 892862464.0,
+            "68": 892869632.0,
+            "69": 892865536.0,
+            "70": 892867584.0,
+            "71": 892869632.0,
+            "72": 892861440.0,
+            "73": 892869120.0,
+            "74": 892859392.0,
+            "75": 892868096.0,
+            "76": 892867584.0,
+            "77": 892868096.0,
+            "78": 892864512.0,
+            "79": 892865536.0,
+            "80": 892865536.0,
+            "81": 892867072.0,
+            "82": 892865024.0,
+            "83": 892861952.0,
+            "84": 892862464.0,
+            "85": 892862976.0,
+            "86": 892862464.0,
+            "87": 892871168.0,
+            "88": 892863488.0,
+            "89": 892865536.0,
+            "90": 892867072.0,
+            "91": 892868096.0,
+            "92": 892866560.0,
+            "93": 892869632.0,
+            "94": 892865536.0,
+            "95": 892866048.0,
+            "96": 892866048.0,
+            "97": 892864000.0,
+            "98": 892868608.0,
+            "99": 892860928.0,
+            "100": 892862976.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1867566080.0,
-            "2": 2107252736.0,
-            "3": 2107252736.0,
-            "4": 2107252736.0,
-            "5": 2107481600.0,
-            "6": 2107481600.0,
-            "7": 2107481600.0,
-            "8": 2107481600.0,
-            "9": 2107481600.0,
-            "10": 2108814336.0,
-            "11": 2108814336.0,
-            "12": 2108814336.0,
-            "13": 2108814336.0,
-            "14": 2108814336.0,
-            "15": 2108814336.0,
-            "16": 2109139456.0,
-            "17": 2109139456.0,
-            "18": 2109139456.0,
-            "19": 2109139456.0,
-            "20": 2109139456.0,
-            "21": 2109139456.0,
-            "22": 2109139456.0,
-            "23": 2109139456.0,
-            "24": 2109139456.0,
-            "25": 2109139456.0,
-            "26": 2109139456.0,
-            "27": 2109139456.0,
-            "28": 2109139456.0,
-            "29": 2109139456.0,
-            "30": 2109139456.0,
-            "31": 2109139456.0,
-            "32": 2109139456.0,
-            "33": 2109139456.0,
-            "34": 2109139456.0,
-            "35": 2109139456.0,
-            "36": 2109139456.0,
-            "37": 2109139456.0,
-            "38": 2109139456.0,
-            "39": 2109139456.0,
-            "40": 2109139456.0,
-            "41": 2109139456.0,
-            "42": 2109139456.0,
-            "43": 2109139456.0,
-            "44": 2109139456.0,
-            "45": 2109139456.0,
-            "46": 2109139456.0,
-            "47": 2109139456.0,
-            "48": 2109139456.0,
-            "49": 2109139456.0,
-            "50": 2109139456.0,
-            "51": 2109139456.0,
-            "52": 2109139456.0,
-            "53": 2109139456.0,
-            "54": 2109139456.0,
-            "55": 2109139456.0,
-            "56": 2109139456.0,
-            "57": 2109139456.0,
-            "58": 2109139456.0,
-            "59": 2109139456.0,
-            "60": 2109139456.0,
-            "61": 2109139456.0,
-            "62": 2109139456.0,
-            "63": 2109139456.0,
-            "64": 2109139456.0,
-            "65": 2109139456.0,
-            "66": 2109139456.0,
-            "67": 2109139456.0,
-            "68": 2109139456.0,
-            "69": 2109139456.0,
-            "70": 2109139456.0,
-            "71": 2109139456.0,
-            "72": 2109139456.0,
-            "73": 2109139456.0,
-            "74": 2109139456.0,
-            "75": 2109139456.0,
-            "76": 2109139456.0,
-            "77": 2109139456.0,
-            "78": 2109139456.0,
-            "79": 2109139456.0,
-            "80": 2109139456.0,
-            "81": 2109139456.0,
-            "82": 2109139456.0,
-            "83": 2109139456.0,
-            "84": 2109139456.0,
-            "85": 2109139456.0,
-            "86": 2109139456.0,
-            "87": 2109897728.0,
-            "88": 2109897728.0,
-            "89": 2109897728.0,
-            "90": 2109897728.0,
-            "91": 2109897728.0,
-            "92": 2109897728.0,
-            "93": 2109897728.0,
-            "94": 2109897728.0,
-            "95": 2109897728.0,
-            "96": 2109897728.0,
-            "97": 2109897728.0,
-            "98": 2109897728.0,
-            "99": 2109897728.0,
-            "100": 2109897728.0
+            "1": 1918568448.0,
+            "2": 2157712384.0,
+            "3": 2157712384.0,
+            "4": 2157712384.0,
+            "5": 2159109632.0,
+            "6": 2159109632.0,
+            "7": 2159109632.0,
+            "8": 2159109632.0,
+            "9": 2159109632.0,
+            "10": 2159142912.0,
+            "11": 2159142912.0,
+            "12": 2159142912.0,
+            "13": 2159142912.0,
+            "14": 2159633920.0,
+            "15": 2159633920.0,
+            "16": 2159633920.0,
+            "17": 2159633920.0,
+            "18": 2159633920.0,
+            "19": 2159633920.0,
+            "20": 2159633920.0,
+            "21": 2159633920.0,
+            "22": 2159633920.0,
+            "23": 2159633920.0,
+            "24": 2159633920.0,
+            "25": 2159633920.0,
+            "26": 2159802368.0,
+            "27": 2159802368.0,
+            "28": 2159802368.0,
+            "29": 2159802368.0,
+            "30": 2159802368.0,
+            "31": 2159802368.0,
+            "32": 2159802368.0,
+            "33": 2159802368.0,
+            "34": 2159802368.0,
+            "35": 2159802368.0,
+            "36": 2159802368.0,
+            "37": 2159802368.0,
+            "38": 2159802368.0,
+            "39": 2159802368.0,
+            "40": 2159802368.0,
+            "41": 2159802368.0,
+            "42": 2159802368.0,
+            "43": 2159802368.0,
+            "44": 2159802368.0,
+            "45": 2159802368.0,
+            "46": 2159802368.0,
+            "47": 2159802368.0,
+            "48": 2159802368.0,
+            "49": 2159802368.0,
+            "50": 2159802368.0,
+            "51": 2159802368.0,
+            "52": 2159802368.0,
+            "53": 2159802368.0,
+            "54": 2159802368.0,
+            "55": 2159802368.0,
+            "56": 2159802368.0,
+            "57": 2159802368.0,
+            "58": 2159802368.0,
+            "59": 2159802368.0,
+            "60": 2159802368.0,
+            "61": 2159802368.0,
+            "62": 2159802368.0,
+            "63": 2159802368.0,
+            "64": 2159802368.0,
+            "65": 2159802368.0,
+            "66": 2159802368.0,
+            "67": 2159802368.0,
+            "68": 2159802368.0,
+            "69": 2159802368.0,
+            "70": 2159802368.0,
+            "71": 2159802368.0,
+            "72": 2159802368.0,
+            "73": 2160337408.0,
+            "74": 2160337408.0,
+            "75": 2160337408.0,
+            "76": 2160337408.0,
+            "77": 2160337408.0,
+            "78": 2160337408.0,
+            "79": 2160337408.0,
+            "80": 2160337408.0,
+            "81": 2160337408.0,
+            "82": 2160337408.0,
+            "83": 2160337408.0,
+            "84": 2161362944.0,
+            "85": 2161362944.0,
+            "86": 2161362944.0,
+            "87": 2161362944.0,
+            "88": 2161362944.0,
+            "89": 2161362944.0,
+            "90": 2161362944.0,
+            "91": 2161362944.0,
+            "92": 2161362944.0,
+            "93": 2161362944.0,
+            "94": 2161362944.0,
+            "95": 2162391552.0,
+            "96": 2162391552.0,
+            "97": 2162391552.0,
+            "98": 2162391552.0,
+            "99": 2162391552.0,
+            "100": 2162391552.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 14.1374,
-            "2": 0.29466,
-            "3": 0.26236,
-            "4": 0.26156,
-            "5": 0.24237,
-            "6": 0.23849,
-            "7": 0.252,
-            "8": 0.24427,
-            "9": 0.24029,
-            "10": 0.23618,
-            "11": 0.23659,
-            "12": 0.23342,
-            "13": 0.23316,
-            "14": 0.23233,
-            "15": 0.24856,
-            "16": 0.23522,
-            "17": 0.24126,
-            "18": 0.22751,
-            "19": 0.2299,
-            "20": 0.23346,
-            "21": 0.23441,
-            "22": 0.22921,
-            "23": 0.23376,
-            "24": 0.23927,
-            "25": 0.23185,
-            "26": 0.23099,
-            "27": 0.22756,
-            "28": 0.2284,
-            "29": 0.22889,
-            "30": 0.23032,
-            "31": 0.26621,
-            "32": 0.23553,
-            "33": 0.23683,
-            "34": 0.25808,
-            "35": 0.23912,
-            "36": 0.23198,
-            "37": 0.23086,
-            "38": 0.23515,
-            "39": 0.2291,
-            "40": 0.24108,
-            "41": 0.23663,
-            "42": 0.23631,
-            "43": 0.23891,
-            "44": 0.23205,
-            "45": 0.24801,
-            "46": 0.2689,
-            "47": 0.23258,
-            "48": 0.25079,
-            "49": 0.26858,
-            "50": 0.2361,
-            "51": 0.27052,
-            "52": 0.26801,
-            "53": 0.23804,
-            "54": 0.23998,
-            "55": 0.25008,
-            "56": 0.29894,
-            "57": 0.26807,
-            "58": 0.23939,
-            "59": 0.24845,
-            "60": 0.24835,
-            "61": 0.24071,
-            "62": 0.23697,
-            "63": 0.25187,
-            "64": 0.24293,
-            "65": 0.31273,
-            "66": 0.23771,
-            "67": 0.28851,
-            "68": 0.25834,
-            "69": 0.24387,
-            "70": 0.23624,
-            "71": 0.26612,
-            "72": 0.25067,
-            "73": 0.28048,
-            "74": 0.26617,
-            "75": 0.24822,
-            "76": 0.26459,
-            "77": 0.23429,
-            "78": 0.24496,
-            "79": 0.24741,
-            "80": 0.25523,
-            "81": 0.2433,
-            "82": 0.23696,
-            "83": 0.2421,
-            "84": 0.24973,
-            "85": 0.24316,
-            "86": 0.25585,
-            "87": 0.23448,
-            "88": 0.23245,
-            "89": 0.25191,
-            "90": 0.23373,
-            "91": 0.25927,
-            "92": 0.24203,
-            "93": 0.25124,
-            "94": 0.26498,
-            "95": 0.24482,
-            "96": 0.23378,
-            "97": 0.25053,
-            "98": 0.23165,
-            "99": 0.24761,
-            "100": 0.23858
+            "1": 14.93722,
+            "2": 0.29196,
+            "3": 0.25566,
+            "4": 0.22819,
+            "5": 0.21657,
+            "6": 0.22742,
+            "7": 0.23255,
+            "8": 0.21868,
+            "9": 0.23203,
+            "10": 0.22911,
+            "11": 0.22371,
+            "12": 0.22358,
+            "13": 0.21762,
+            "14": 0.2166,
+            "15": 0.2341,
+            "16": 0.21834,
+            "17": 0.21429,
+            "18": 0.21499,
+            "19": 0.2158,
+            "20": 0.21523,
+            "21": 0.21654,
+            "22": 0.21788,
+            "23": 0.21597,
+            "24": 0.20917,
+            "25": 0.2076,
+            "26": 0.20309,
+            "27": 0.20463,
+            "28": 0.57074,
+            "29": 0.20266,
+            "30": 0.21832,
+            "31": 0.23121,
+            "32": 0.2052,
+            "33": 0.20847,
+            "34": 0.22756,
+            "35": 0.21093,
+            "36": 0.20495,
+            "37": 0.20762,
+            "38": 0.20131,
+            "39": 0.1991,
+            "40": 0.20426,
+            "41": 0.20518,
+            "42": 0.20555,
+            "43": 0.21112,
+            "44": 0.20079,
+            "45": 0.21854,
+            "46": 0.22885,
+            "47": 0.20366,
+            "48": 0.21784,
+            "49": 0.23722,
+            "50": 0.20288,
+            "51": 0.23225,
+            "52": 0.23281,
+            "53": 0.20606,
+            "54": 0.21135,
+            "55": 0.21897,
+            "56": 0.25991,
+            "57": 0.22845,
+            "58": 0.21751,
+            "59": 0.21469,
+            "60": 0.21187,
+            "61": 0.20946,
+            "62": 0.21358,
+            "63": 0.21765,
+            "64": 0.20357,
+            "65": 0.27698,
+            "66": 0.2118,
+            "67": 0.25518,
+            "68": 0.22631,
+            "69": 0.21209,
+            "70": 0.2039,
+            "71": 0.22504,
+            "72": 0.22276,
+            "73": 0.25179,
+            "74": 0.22993,
+            "75": 0.21538,
+            "76": 0.23629,
+            "77": 0.20835,
+            "78": 0.21168,
+            "79": 0.21631,
+            "80": 0.21797,
+            "81": 0.20362,
+            "82": 0.20269,
+            "83": 0.21014,
+            "84": 0.21456,
+            "85": 0.20971,
+            "86": 0.22253,
+            "87": 0.20037,
+            "88": 0.20403,
+            "89": 0.21541,
+            "90": 0.21443,
+            "91": 0.23258,
+            "92": 0.21749,
+            "93": 0.22377,
+            "94": 0.23559,
+            "95": 0.21351,
+            "96": 0.20316,
+            "97": 0.21349,
+            "98": 0.20244,
+            "99": 0.21023,
+            "100": 0.20508
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..9b6990b963d
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.90528,
+            "52": 9.80364,
+            "53": 10.12728,
+            "54": 10.00036,
+            "55": 9.9362,
+            "56": 9.68506,
+            "57": 9.55805,
+            "58": 9.90514,
+            "59": 9.63857,
+            "60": 9.57451,
+            "61": 9.76864,
+            "62": 10.03802,
+            "63": 9.44503,
+            "64": 9.82796,
+            "65": 9.00712,
+            "66": 9.77422,
+            "67": 9.41277,
+            "68": 9.84111,
+            "69": 9.82784,
+            "70": 9.79011,
+            "71": 9.66957,
+            "72": 9.62799,
+            "73": 9.5473,
+            "74": 9.03663,
+            "75": 9.49153,
+            "76": 9.16783,
+            "77": 10.10857,
+            "78": 9.77081,
+            "79": 9.4383,
+            "80": 9.45436,
+            "81": 9.52266,
+            "82": 9.7424,
+            "83": 9.37076,
+            "84": 9.45377,
+            "85": 9.65832,
+            "86": 9.12522,
+            "87": 9.62697,
+            "88": 9.79619,
+            "89": 9.66054,
+            "90": 9.85081,
+            "91": 9.39408,
+            "92": 9.40744,
+            "93": 9.13595,
+            "94": 8.89048,
+            "95": 9.563,
+            "96": 9.5714,
+            "97": 9.34318,
+            "98": 9.73026,
+            "99": 8.95002,
+            "100": 9.4424
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 38061.0,
+            "52": 37025.0,
+            "53": 41802.0,
+            "54": 41253.0,
+            "55": 37654.0,
+            "56": 41164.0,
+            "57": 37682.0,
+            "58": 41782.0,
+            "59": 39444.0,
+            "60": 40691.0,
+            "61": 40876.0,
+            "62": 43113.0,
+            "63": 38389.0,
+            "64": 43217.0,
+            "65": 41689.0,
+            "66": 45525.0,
+            "67": 41717.0,
+            "68": 40369.0,
+            "69": 41287.0,
+            "70": 45545.0,
+            "71": 41651.0,
+            "72": 41881.0,
+            "73": 45139.0,
+            "74": 35747.0,
+            "75": 39155.0,
+            "76": 44874.0,
+            "77": 45442.0,
+            "78": 46782.0,
+            "79": 48776.0,
+            "80": 47161.0,
+            "81": 51277.0,
+            "82": 49953.0,
+            "83": 45334.0,
+            "84": 46096.0,
+            "85": 49238.0,
+            "86": 46118.0,
+            "87": 49880.0,
+            "88": 47115.0,
+            "89": 48583.0,
+            "90": 49057.0,
+            "91": 45950.0,
+            "92": 47820.0,
+            "93": 46437.0,
+            "94": 47530.0,
+            "95": 48000.0,
+            "96": 50285.0,
+            "97": 46225.0,
+            "98": 49809.0,
+            "99": 47890.0,
+            "100": 44636.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 892861440.0,
+            "52": 892859904.0,
+            "53": 892862464.0,
+            "54": 892862464.0,
+            "55": 892863488.0,
+            "56": 892866048.0,
+            "57": 892858368.0,
+            "58": 892860416.0,
+            "59": 892859904.0,
+            "60": 892860928.0,
+            "61": 892869632.0,
+            "62": 892866560.0,
+            "63": 892862976.0,
+            "64": 892864512.0,
+            "65": 892852736.0,
+            "66": 892867072.0,
+            "67": 892862464.0,
+            "68": 892869632.0,
+            "69": 892865536.0,
+            "70": 892867584.0,
+            "71": 892869632.0,
+            "72": 892861440.0,
+            "73": 892869120.0,
+            "74": 892859392.0,
+            "75": 892868096.0,
+            "76": 892867584.0,
+            "77": 892868096.0,
+            "78": 892864512.0,
+            "79": 892865536.0,
+            "80": 892865536.0,
+            "81": 892867072.0,
+            "82": 892865024.0,
+            "83": 892861952.0,
+            "84": 892862464.0,
+            "85": 892862976.0,
+            "86": 892862464.0,
+            "87": 892871168.0,
+            "88": 892863488.0,
+            "89": 892865536.0,
+            "90": 892867072.0,
+            "91": 892868096.0,
+            "92": 892866560.0,
+            "93": 892869632.0,
+            "94": 892865536.0,
+            "95": 892866048.0,
+            "96": 892866048.0,
+            "97": 892864000.0,
+            "98": 892868608.0,
+            "99": 892860928.0,
+            "100": 892862976.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2157768704.0,
+            "52": 2158874112.0,
+            "53": 2160225280.0,
+            "54": 2160225280.0,
+            "55": 2161818624.0,
+            "56": 2161818624.0,
+            "57": 2161818624.0,
+            "58": 2161818624.0,
+            "59": 2161818624.0,
+            "60": 2161818624.0,
+            "61": 2161818624.0,
+            "62": 2161943040.0,
+            "63": 2161943040.0,
+            "64": 2162058240.0,
+            "65": 2162058240.0,
+            "66": 2162058240.0,
+            "67": 2162058240.0,
+            "68": 2162058240.0,
+            "69": 2162058240.0,
+            "70": 2162058240.0,
+            "71": 2162214912.0,
+            "72": 2162214912.0,
+            "73": 2165406208.0,
+            "74": 2165406208.0,
+            "75": 2165406208.0,
+            "76": 2165406208.0,
+            "77": 2165406208.0,
+            "78": 2165406208.0,
+            "79": 2165406208.0,
+            "80": 2165406208.0,
+            "81": 2165406208.0,
+            "82": 2165406208.0,
+            "83": 2165406208.0,
+            "84": 2166458368.0,
+            "85": 2166458368.0,
+            "86": 2166458368.0,
+            "87": 2166458368.0,
+            "88": 2166458368.0,
+            "89": 2166458368.0,
+            "90": 2166458368.0,
+            "91": 2166458368.0,
+            "92": 2166458368.0,
+            "93": 2166458368.0,
+            "94": 2166458368.0,
+            "95": 2166458368.0,
+            "96": 2166458368.0,
+            "97": 2166458368.0,
+            "98": 2166458368.0,
+            "99": 2166458368.0,
+            "100": 2166458368.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 12.42315,
+            "52": 0.32575,
+            "53": 0.25742,
+            "54": 0.24982,
+            "55": 0.24955,
+            "56": 0.27601,
+            "57": 0.24269,
+            "58": 0.22199,
+            "59": 0.21885,
+            "60": 0.22264,
+            "61": 0.21068,
+            "62": 0.21026,
+            "63": 0.22993,
+            "64": 0.20923,
+            "65": 0.27663,
+            "66": 0.64746,
+            "67": 0.26108,
+            "68": 0.22825,
+            "69": 0.83895,
+            "70": 0.20737,
+            "71": 0.23029,
+            "72": 0.21664,
+            "73": 0.24327,
+            "74": 0.23403,
+            "75": 0.21475,
+            "76": 0.2341,
+            "77": 0.20143,
+            "78": 0.60189,
+            "79": 0.22007,
+            "80": 0.22126,
+            "81": 0.20541,
+            "82": 0.20414,
+            "83": 0.21458,
+            "84": 0.34679,
+            "85": 0.21148,
+            "86": 0.22182,
+            "87": 0.2044,
+            "88": 0.204,
+            "89": 0.21796,
+            "90": 0.20536,
+            "91": 0.22132,
+            "92": 0.20859,
+            "93": 0.21705,
+            "94": 0.23829,
+            "95": 0.21049,
+            "96": 0.20011,
+            "97": 0.2156,
+            "98": 0.19753,
+            "99": 0.21068,
+            "100": 0.20211
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml
index 4ed0bb89001..9a827a4ee72 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml
@@ -52,7 +52,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --no-bias-gelu-fusion: true
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..31167be6de5
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.7915,
+            "2": 10.8072,
+            "3": 10.79214,
+            "4": 10.77571,
+            "5": 10.82495,
+            "6": 10.83193,
+            "7": 10.82077,
+            "8": 10.81496,
+            "9": 10.81973,
+            "10": 10.7692,
+            "11": 10.84519,
+            "12": 10.83101,
+            "13": 10.83652,
+            "14": 10.85771,
+            "15": 10.80581,
+            "16": 10.78733,
+            "17": 10.75844,
+            "18": 10.79297,
+            "19": 10.78295,
+            "20": 10.73199,
+            "21": 10.70953,
+            "22": 10.57675,
+            "23": 10.71651,
+            "24": 10.61983,
+            "25": 10.58207,
+            "26": 10.61694,
+            "27": 10.6509,
+            "28": 10.63261,
+            "29": 10.63024,
+            "30": 10.4432,
+            "31": 10.19983,
+            "32": 10.52048,
+            "33": 10.5079,
+            "34": 10.29565,
+            "35": 10.33536,
+            "36": 10.30278,
+            "37": 10.41788,
+            "38": 10.28121,
+            "39": 10.46185,
+            "40": 10.18169,
+            "41": 10.21391,
+            "42": 10.28457,
+            "43": 9.95538,
+            "44": 10.05751,
+            "45": 9.95713,
+            "46": 9.93528,
+            "47": 10.22675,
+            "48": 9.96521,
+            "49": 9.66603,
+            "50": 10.009,
+            "51": 9.94789,
+            "52": 9.84665,
+            "53": 10.14887,
+            "54": 10.03772,
+            "55": 9.97445,
+            "56": 9.71378,
+            "57": 9.59509,
+            "58": 9.92081,
+            "59": 9.67609,
+            "60": 9.61253,
+            "61": 9.79221,
+            "62": 10.05653,
+            "63": 9.47849,
+            "64": 9.84455,
+            "65": 9.03889,
+            "66": 9.78399,
+            "67": 9.43609,
+            "68": 9.85203,
+            "69": 9.84438,
+            "70": 9.7933,
+            "71": 9.69163,
+            "72": 9.64909,
+            "73": 9.55528,
+            "74": 9.04743,
+            "75": 9.49185,
+            "76": 9.178,
+            "77": 10.11275,
+            "78": 9.76838,
+            "79": 9.4398,
+            "80": 9.45421,
+            "81": 9.53191,
+            "82": 9.75402,
+            "83": 9.38186,
+            "84": 9.46162,
+            "85": 9.66959,
+            "86": 9.1349,
+            "87": 9.6343,
+            "88": 9.80083,
+            "89": 9.66682,
+            "90": 9.86175,
+            "91": 9.39987,
+            "92": 9.41063,
+            "93": 9.14654,
+            "94": 8.88648,
+            "95": 9.56986,
+            "96": 9.57642,
+            "97": 9.34305,
+            "98": 9.72786,
+            "99": 8.96203,
+            "100": 9.44942
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 31217.0,
+            "2": 33106.0,
+            "3": 33596.0,
+            "4": 31946.0,
+            "5": 36783.0,
+            "6": 37377.0,
+            "7": 35362.0,
+            "8": 31711.0,
+            "9": 34749.0,
+            "10": 29758.0,
+            "11": 38348.0,
+            "12": 35446.0,
+            "13": 37087.0,
+            "14": 37869.0,
+            "15": 35242.0,
+            "16": 36520.0,
+            "17": 35190.0,
+            "18": 35191.0,
+            "19": 35614.0,
+            "20": 32571.0,
+            "21": 33220.0,
+            "22": 30518.0,
+            "23": 37619.0,
+            "24": 32547.0,
+            "25": 30591.0,
+            "26": 34546.0,
+            "27": 35275.0,
+            "28": 36936.0,
+            "29": 37531.0,
+            "30": 33354.0,
+            "31": 30754.0,
+            "32": 36331.0,
+            "33": 38273.0,
+            "34": 32645.0,
+            "35": 34237.0,
+            "36": 35092.0,
+            "37": 37931.0,
+            "38": 35480.0,
+            "39": 39175.0,
+            "40": 36296.0,
+            "41": 35902.0,
+            "42": 37609.0,
+            "43": 33748.0,
+            "44": 34027.0,
+            "45": 35215.0,
+            "46": 37108.0,
+            "47": 41056.0,
+            "48": 35765.0,
+            "49": 35087.0,
+            "50": 39734.0,
+            "51": 36712.0,
+            "52": 36176.0,
+            "53": 41774.0,
+            "54": 40447.0,
+            "55": 37071.0,
+            "56": 39975.0,
+            "57": 36828.0,
+            "58": 41815.0,
+            "59": 37962.0,
+            "60": 40415.0,
+            "61": 39921.0,
+            "62": 43840.0,
+            "63": 37890.0,
+            "64": 42699.0,
+            "65": 40347.0,
+            "66": 44159.0,
+            "67": 40057.0,
+            "68": 39563.0,
+            "69": 42246.0,
+            "70": 44867.0,
+            "71": 40910.0,
+            "72": 40982.0,
+            "73": 44363.0,
+            "74": 35672.0,
+            "75": 39602.0,
+            "76": 46157.0,
+            "77": 44919.0,
+            "78": 48134.0,
+            "79": 48666.0,
+            "80": 46770.0,
+            "81": 50144.0,
+            "82": 49680.0,
+            "83": 44991.0,
+            "84": 45912.0,
+            "85": 49371.0,
+            "86": 45600.0,
+            "87": 49292.0,
+            "88": 46411.0,
+            "89": 49710.0,
+            "90": 51008.0,
+            "91": 45796.0,
+            "92": 47991.0,
+            "93": 46847.0,
+            "94": 47360.0,
+            "95": 48680.0,
+            "96": 50369.0,
+            "97": 46162.0,
+            "98": 49921.0,
+            "99": 48235.0,
+            "100": 45390.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1254511616.0,
+            "2": 1254503424.0,
+            "3": 1254505984.0,
+            "4": 1254513664.0,
+            "5": 1254505984.0,
+            "6": 1254507520.0,
+            "7": 1254511104.0,
+            "8": 1254505472.0,
+            "9": 1254505984.0,
+            "10": 1254506496.0,
+            "11": 1254507520.0,
+            "12": 1254503936.0,
+            "13": 1254509568.0,
+            "14": 1254510080.0,
+            "15": 1254506496.0,
+            "16": 1254505984.0,
+            "17": 1254516224.0,
+            "18": 1254503424.0,
+            "19": 1254506496.0,
+            "20": 1254504960.0,
+            "21": 1254508032.0,
+            "22": 1254510592.0,
+            "23": 1254512640.0,
+            "24": 1254505472.0,
+            "25": 1254513664.0,
+            "26": 1254512128.0,
+            "27": 1254501888.0,
+            "28": 1254509056.0,
+            "29": 1254508032.0,
+            "30": 1254509056.0,
+            "31": 1254519296.0,
+            "32": 1254512128.0,
+            "33": 1254507008.0,
+            "34": 1254509056.0,
+            "35": 1254510080.0,
+            "36": 1254507008.0,
+            "37": 1254504448.0,
+            "38": 1254505472.0,
+            "39": 1254508032.0,
+            "40": 1254505984.0,
+            "41": 1254512128.0,
+            "42": 1254504960.0,
+            "43": 1254507008.0,
+            "44": 1254508032.0,
+            "45": 1254506496.0,
+            "46": 1254513664.0,
+            "47": 1254507008.0,
+            "48": 1254511616.0,
+            "49": 1254508032.0,
+            "50": 1254506496.0,
+            "51": 1254508032.0,
+            "52": 1254513152.0,
+            "53": 1254505984.0,
+            "54": 1254508544.0,
+            "55": 1254503936.0,
+            "56": 1254502912.0,
+            "57": 1254515200.0,
+            "58": 1254503936.0,
+            "59": 1254508544.0,
+            "60": 1254503936.0,
+            "61": 1254507008.0,
+            "62": 1254508032.0,
+            "63": 1254507520.0,
+            "64": 1254502400.0,
+            "65": 1254509568.0,
+            "66": 1254501376.0,
+            "67": 1254499328.0,
+            "68": 1254503936.0,
+            "69": 1254499328.0,
+            "70": 1254502912.0,
+            "71": 1254507520.0,
+            "72": 1254499328.0,
+            "73": 1254497280.0,
+            "74": 1254486016.0,
+            "75": 1254504960.0,
+            "76": 1254507008.0,
+            "77": 1254504448.0,
+            "78": 1254496256.0,
+            "79": 1254500864.0,
+            "80": 1254491648.0,
+            "81": 1254493696.0,
+            "82": 1254497280.0,
+            "83": 1254505984.0,
+            "84": 1254489600.0,
+            "85": 1254505984.0,
+            "86": 1254500352.0,
+            "87": 1254505472.0,
+            "88": 1254506496.0,
+            "89": 1254498304.0,
+            "90": 1254504448.0,
+            "91": 1254501888.0,
+            "92": 1254501888.0,
+            "93": 1254499328.0,
+            "94": 1254494720.0,
+            "95": 1254504960.0,
+            "96": 1254503424.0,
+            "97": 1254492672.0,
+            "98": 1254505984.0,
+            "99": 1254499328.0,
+            "100": 1254501888.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2066381824.0,
+            "2": 2543623168.0,
+            "3": 2544637440.0,
+            "4": 2550310912.0,
+            "5": 2550310912.0,
+            "6": 2550310912.0,
+            "7": 2550310912.0,
+            "8": 2550310912.0,
+            "9": 2550310912.0,
+            "10": 2550310912.0,
+            "11": 2550310912.0,
+            "12": 2550310912.0,
+            "13": 2550310912.0,
+            "14": 2550310912.0,
+            "15": 2550310912.0,
+            "16": 2550310912.0,
+            "17": 2554848768.0,
+            "18": 2554848768.0,
+            "19": 2554848768.0,
+            "20": 2554848768.0,
+            "21": 2554848768.0,
+            "22": 2554848768.0,
+            "23": 2554848768.0,
+            "24": 2554848768.0,
+            "25": 2554848768.0,
+            "26": 2554848768.0,
+            "27": 2554848768.0,
+            "28": 2554848768.0,
+            "29": 2554848768.0,
+            "30": 2554848768.0,
+            "31": 2556047872.0,
+            "32": 2556047872.0,
+            "33": 2556047872.0,
+            "34": 2556047872.0,
+            "35": 2556047872.0,
+            "36": 2556047872.0,
+            "37": 2556047872.0,
+            "38": 2556047872.0,
+            "39": 2556047872.0,
+            "40": 2556047872.0,
+            "41": 2556047872.0,
+            "42": 2556047872.0,
+            "43": 2556047872.0,
+            "44": 2556047872.0,
+            "45": 2556047872.0,
+            "46": 2556047872.0,
+            "47": 2556047872.0,
+            "48": 2556047872.0,
+            "49": 2556047872.0,
+            "50": 2556047872.0,
+            "51": 2556047872.0,
+            "52": 2556047872.0,
+            "53": 2556047872.0,
+            "54": 2556047872.0,
+            "55": 2556047872.0,
+            "56": 2556047872.0,
+            "57": 2556047872.0,
+            "58": 2556047872.0,
+            "59": 2556047872.0,
+            "60": 2556047872.0,
+            "61": 2556047872.0,
+            "62": 2556047872.0,
+            "63": 2556047872.0,
+            "64": 2556047872.0,
+            "65": 2556047872.0,
+            "66": 2556047872.0,
+            "67": 2556047872.0,
+            "68": 2556047872.0,
+            "69": 2556047872.0,
+            "70": 2556047872.0,
+            "71": 2556047872.0,
+            "72": 2556047872.0,
+            "73": 2556047872.0,
+            "74": 2556047872.0,
+            "75": 2556047872.0,
+            "76": 2556047872.0,
+            "77": 2556047872.0,
+            "78": 2556047872.0,
+            "79": 2556047872.0,
+            "80": 2556047872.0,
+            "81": 2556047872.0,
+            "82": 2556047872.0,
+            "83": 2556047872.0,
+            "84": 2556047872.0,
+            "85": 2556047872.0,
+            "86": 2556047872.0,
+            "87": 2556047872.0,
+            "88": 2556047872.0,
+            "89": 2556047872.0,
+            "90": 2556047872.0,
+            "91": 2556047872.0,
+            "92": 2556047872.0,
+            "93": 2556047872.0,
+            "94": 2556047872.0,
+            "95": 2556047872.0,
+            "96": 2556047872.0,
+            "97": 2556047872.0,
+            "98": 2556047872.0,
+            "99": 2556047872.0,
+            "100": 2556047872.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.80614,
+            "3": 0.71249,
+            "4": 0.40839,
+            "5": 0.39358,
+            "6": 0.3911,
+            "7": 0.39032,
+            "8": 0.38318,
+            "9": 0.70382,
+            "10": 0.34707,
+            "11": 0.34403,
+            "12": 0.34043,
+            "13": 0.33959,
+            "14": 0.33461,
+            "15": 0.34767,
+            "16": 0.33495,
+            "17": 0.34839,
+            "18": 0.33673,
+            "19": 0.33335,
+            "20": 0.33161,
+            "21": 0.32643,
+            "22": 0.33565,
+            "23": 0.33625,
+            "24": 0.33009,
+            "25": 0.33065,
+            "26": 0.33344,
+            "27": 0.33552,
+            "28": 0.33047,
+            "29": 0.33011,
+            "30": 0.33358,
+            "31": 0.34631,
+            "32": 0.33536,
+            "33": 0.33271,
+            "34": 0.33949,
+            "35": 0.33073,
+            "36": 0.32877,
+            "37": 0.32806,
+            "38": 0.33111,
+            "39": 0.33408,
+            "40": 0.33428,
+            "41": 0.34927,
+            "42": 1.47745,
+            "43": 0.48012,
+            "44": 0.33077,
+            "45": 0.33262,
+            "46": 0.34066,
+            "47": 0.33152,
+            "48": 0.33512,
+            "49": 0.34429,
+            "50": 0.33697,
+            "51": 0.34656,
+            "52": 0.337,
+            "53": 0.33133,
+            "54": 0.33172,
+            "55": 0.33188,
+            "56": 0.35163,
+            "57": 0.34162,
+            "58": 0.33258,
+            "59": 0.7122,
+            "60": 0.33979,
+            "61": 0.33569,
+            "62": 0.33523,
+            "63": 0.33864,
+            "64": 0.34776,
+            "65": 0.37658,
+            "66": 0.3377,
+            "67": 0.36916,
+            "68": 0.3452,
+            "69": 0.33854,
+            "70": 0.34023,
+            "71": 0.3544,
+            "72": 0.34395,
+            "73": 0.3567,
+            "74": 0.35025,
+            "75": 0.35164,
+            "76": 0.35012,
+            "77": 0.3364,
+            "78": 0.34491,
+            "79": 0.34789,
+            "80": 0.35388,
+            "81": 0.34075,
+            "82": 0.34743,
+            "83": 0.34211,
+            "84": 0.34722,
+            "85": 0.33956,
+            "86": 0.35402,
+            "87": 0.34301,
+            "88": 0.34056,
+            "89": 0.35764,
+            "90": 0.33476,
+            "91": 0.3539,
+            "92": 0.34448,
+            "93": 0.34895,
+            "94": 0.3624,
+            "95": 0.34001,
+            "96": 0.3382,
+            "97": 0.35217,
+            "98": 0.33252,
+            "99": 0.34909,
+            "100": 0.34966
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
index 1a09e73e300..bf57cfecddc 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1254501376.0,
-            "2": 1254505472.0,
-            "3": 1254505472.0,
-            "4": 1254501376.0,
-            "5": 1254501888.0,
-            "6": 1254503424.0,
-            "7": 1254503936.0,
-            "8": 1254503936.0,
-            "9": 1254501888.0,
-            "10": 1254503424.0,
-            "11": 1254503936.0,
-            "12": 1254502912.0,
-            "13": 1254500864.0,
-            "14": 1254505472.0,
-            "15": 1254504448.0,
-            "16": 1254503424.0,
-            "17": 1254504448.0,
-            "18": 1254502400.0,
-            "19": 1254503936.0,
-            "20": 1254503424.0,
-            "21": 1254503424.0,
-            "22": 1254501376.0,
-            "23": 1254500864.0,
-            "24": 1254503424.0,
-            "25": 1254500352.0,
-            "26": 1254502400.0,
-            "27": 1254501888.0,
-            "28": 1254502912.0,
-            "29": 1254505472.0,
-            "30": 1254500352.0,
-            "31": 1254499328.0,
-            "32": 1254500352.0,
-            "33": 1254502912.0,
-            "34": 1254502912.0,
-            "35": 1254501888.0,
-            "36": 1254505472.0,
-            "37": 1254503424.0,
-            "38": 1254503936.0,
-            "39": 1254502912.0,
-            "40": 1254502912.0,
-            "41": 1254503424.0,
-            "42": 1254502912.0,
-            "43": 1254499840.0,
-            "44": 1254501376.0,
-            "45": 1254502400.0,
-            "46": 1254500864.0,
-            "47": 1254503936.0,
-            "48": 1254499840.0,
-            "49": 1254500352.0,
-            "50": 1254502912.0,
-            "51": 1254496768.0,
-            "52": 1254496256.0,
-            "53": 1254497792.0,
-            "54": 1254498304.0,
-            "55": 1254500352.0,
-            "56": 1254501888.0,
-            "57": 1254493184.0,
-            "58": 1254498304.0,
-            "59": 1254495232.0,
-            "60": 1254496768.0,
-            "61": 1254504960.0,
-            "62": 1254503936.0,
-            "63": 1254499328.0,
-            "64": 1254498816.0,
-            "65": 1254488576.0,
-            "66": 1254502912.0,
-            "67": 1254498304.0,
-            "68": 1254505984.0,
-            "69": 1254501376.0,
-            "70": 1254502912.0,
-            "71": 1254504960.0,
-            "72": 1254496256.0,
-            "73": 1254504448.0,
-            "74": 1254495232.0,
-            "75": 1254504448.0,
-            "76": 1254503424.0,
-            "77": 1254503936.0,
-            "78": 1254500352.0,
-            "79": 1254500864.0,
-            "80": 1254499840.0,
-            "81": 1254503424.0,
-            "82": 1254500352.0,
-            "83": 1254497792.0,
-            "84": 1254497280.0,
-            "85": 1254499328.0,
-            "86": 1254498816.0,
-            "87": 1254505472.0,
-            "88": 1254499328.0,
-            "89": 1254500864.0,
-            "90": 1254502912.0,
-            "91": 1254505472.0,
-            "92": 1254502912.0,
-            "93": 1254505472.0,
-            "94": 1254500352.0,
-            "95": 1254501888.0,
-            "96": 1254501888.0,
-            "97": 1254499328.0,
-            "98": 1254507520.0,
-            "99": 1254497280.0,
-            "100": 1254499840.0
+            "1": 1254502400.0,
+            "2": 1254506496.0,
+            "3": 1254506496.0,
+            "4": 1254502400.0,
+            "5": 1254502912.0,
+            "6": 1254504448.0,
+            "7": 1254504960.0,
+            "8": 1254504960.0,
+            "9": 1254502912.0,
+            "10": 1254504448.0,
+            "11": 1254504960.0,
+            "12": 1254503936.0,
+            "13": 1254501888.0,
+            "14": 1254506496.0,
+            "15": 1254505472.0,
+            "16": 1254504448.0,
+            "17": 1254505472.0,
+            "18": 1254503424.0,
+            "19": 1254504960.0,
+            "20": 1254504448.0,
+            "21": 1254504448.0,
+            "22": 1254502400.0,
+            "23": 1254501888.0,
+            "24": 1254504448.0,
+            "25": 1254501376.0,
+            "26": 1254503424.0,
+            "27": 1254502912.0,
+            "28": 1254503936.0,
+            "29": 1254506496.0,
+            "30": 1254501376.0,
+            "31": 1254500352.0,
+            "32": 1254501376.0,
+            "33": 1254503936.0,
+            "34": 1254503936.0,
+            "35": 1254502912.0,
+            "36": 1254506496.0,
+            "37": 1254504448.0,
+            "38": 1254504960.0,
+            "39": 1254503936.0,
+            "40": 1254503936.0,
+            "41": 1254504448.0,
+            "42": 1254503936.0,
+            "43": 1254500864.0,
+            "44": 1254502400.0,
+            "45": 1254503424.0,
+            "46": 1254501888.0,
+            "47": 1254504960.0,
+            "48": 1254500864.0,
+            "49": 1254501376.0,
+            "50": 1254503936.0,
+            "51": 1254497792.0,
+            "52": 1254497280.0,
+            "53": 1254498816.0,
+            "54": 1254499328.0,
+            "55": 1254501376.0,
+            "56": 1254502912.0,
+            "57": 1254494208.0,
+            "58": 1254499328.0,
+            "59": 1254496256.0,
+            "60": 1254497792.0,
+            "61": 1254505984.0,
+            "62": 1254504960.0,
+            "63": 1254500352.0,
+            "64": 1254499840.0,
+            "65": 1254489600.0,
+            "66": 1254503936.0,
+            "67": 1254499328.0,
+            "68": 1254507008.0,
+            "69": 1254502400.0,
+            "70": 1254503936.0,
+            "71": 1254505984.0,
+            "72": 1254497280.0,
+            "73": 1254505472.0,
+            "74": 1254496256.0,
+            "75": 1254505472.0,
+            "76": 1254504448.0,
+            "77": 1254504960.0,
+            "78": 1254501376.0,
+            "79": 1254501888.0,
+            "80": 1254500864.0,
+            "81": 1254504448.0,
+            "82": 1254501376.0,
+            "83": 1254498816.0,
+            "84": 1254498304.0,
+            "85": 1254500352.0,
+            "86": 1254499840.0,
+            "87": 1254506496.0,
+            "88": 1254500352.0,
+            "89": 1254501888.0,
+            "90": 1254503936.0,
+            "91": 1254506496.0,
+            "92": 1254503936.0,
+            "93": 1254506496.0,
+            "94": 1254501376.0,
+            "95": 1254502912.0,
+            "96": 1254502912.0,
+            "97": 1254500352.0,
+            "98": 1254508544.0,
+            "99": 1254498304.0,
+            "100": 1254500864.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1987779584.0,
-            "2": 2468141568.0,
-            "3": 2468920320.0,
-            "4": 2468920320.0,
-            "5": 2468920320.0,
-            "6": 2468920320.0,
-            "7": 2468920320.0,
-            "8": 2468920320.0,
-            "9": 2469234688.0,
-            "10": 2469234688.0,
-            "11": 2469234688.0,
-            "12": 2469234688.0,
-            "13": 2469234688.0,
-            "14": 2469234688.0,
-            "15": 2469234688.0,
-            "16": 2469234688.0,
-            "17": 2469234688.0,
-            "18": 2469234688.0,
-            "19": 2469234688.0,
-            "20": 2469234688.0,
-            "21": 2469234688.0,
-            "22": 2469234688.0,
-            "23": 2469234688.0,
-            "24": 2469234688.0,
-            "25": 2469234688.0,
-            "26": 2469234688.0,
-            "27": 2469234688.0,
-            "28": 2469234688.0,
-            "29": 2469234688.0,
-            "30": 2469234688.0,
-            "31": 2469234688.0,
-            "32": 2469234688.0,
-            "33": 2469234688.0,
-            "34": 2469234688.0,
-            "35": 2469234688.0,
-            "36": 2469234688.0,
-            "37": 2469234688.0,
-            "38": 2469234688.0,
-            "39": 2469234688.0,
-            "40": 2469234688.0,
-            "41": 2469234688.0,
-            "42": 2469234688.0,
-            "43": 2469234688.0,
-            "44": 2469234688.0,
-            "45": 2469234688.0,
-            "46": 2469234688.0,
-            "47": 2469234688.0,
-            "48": 2469234688.0,
-            "49": 2469234688.0,
-            "50": 2469234688.0,
-            "51": 2469234688.0,
-            "52": 2469234688.0,
-            "53": 2469234688.0,
-            "54": 2469234688.0,
-            "55": 2469234688.0,
-            "56": 2469234688.0,
-            "57": 2469234688.0,
-            "58": 2469234688.0,
-            "59": 2469234688.0,
-            "60": 2469234688.0,
-            "61": 2469234688.0,
-            "62": 2469234688.0,
-            "63": 2469234688.0,
-            "64": 2469234688.0,
-            "65": 2469234688.0,
-            "66": 2469234688.0,
-            "67": 2469234688.0,
-            "68": 2469234688.0,
-            "69": 2469234688.0,
-            "70": 2469234688.0,
-            "71": 2469234688.0,
-            "72": 2469234688.0,
-            "73": 2469234688.0,
-            "74": 2469234688.0,
-            "75": 2469234688.0,
-            "76": 2471084032.0,
-            "77": 2471084032.0,
-            "78": 2471084032.0,
-            "79": 2471084032.0,
-            "80": 2471084032.0,
-            "81": 2471084032.0,
-            "82": 2471084032.0,
-            "83": 2471084032.0,
-            "84": 2471084032.0,
-            "85": 2471084032.0,
-            "86": 2471084032.0,
-            "87": 2471084032.0,
-            "88": 2471084032.0,
-            "89": 2471084032.0,
-            "90": 2471084032.0,
-            "91": 2471084032.0,
-            "92": 2471084032.0,
-            "93": 2471084032.0,
-            "94": 2471084032.0,
-            "95": 2471084032.0,
-            "96": 2471084032.0,
-            "97": 2471084032.0,
-            "98": 2471084032.0,
-            "99": 2471084032.0,
-            "100": 2471084032.0
+            "1": 2038519808.0,
+            "2": 2520255488.0,
+            "3": 2520255488.0,
+            "4": 2520255488.0,
+            "5": 2520552960.0,
+            "6": 2520552960.0,
+            "7": 2520552960.0,
+            "8": 2520552960.0,
+            "9": 2520552960.0,
+            "10": 2520552960.0,
+            "11": 2520552960.0,
+            "12": 2520552960.0,
+            "13": 2520552960.0,
+            "14": 2520552960.0,
+            "15": 2520552960.0,
+            "16": 2520552960.0,
+            "17": 2520552960.0,
+            "18": 2520552960.0,
+            "19": 2520552960.0,
+            "20": 2520552960.0,
+            "21": 2520552960.0,
+            "22": 2520552960.0,
+            "23": 2520552960.0,
+            "24": 2520552960.0,
+            "25": 2520552960.0,
+            "26": 2520552960.0,
+            "27": 2520552960.0,
+            "28": 2520552960.0,
+            "29": 2520552960.0,
+            "30": 2520552960.0,
+            "31": 2520552960.0,
+            "32": 2520552960.0,
+            "33": 2521159680.0,
+            "34": 2521159680.0,
+            "35": 2521159680.0,
+            "36": 2521159680.0,
+            "37": 2521159680.0,
+            "38": 2521159680.0,
+            "39": 2521159680.0,
+            "40": 2521159680.0,
+            "41": 2521159680.0,
+            "42": 2521159680.0,
+            "43": 2521159680.0,
+            "44": 2521159680.0,
+            "45": 2521159680.0,
+            "46": 2521615360.0,
+            "47": 2521615360.0,
+            "48": 2521615360.0,
+            "49": 2521615360.0,
+            "50": 2521615360.0,
+            "51": 2521615360.0,
+            "52": 2521615360.0,
+            "53": 2521615360.0,
+            "54": 2521615360.0,
+            "55": 2521615360.0,
+            "56": 2521615360.0,
+            "57": 2521615360.0,
+            "58": 2521615360.0,
+            "59": 2521615360.0,
+            "60": 2521615360.0,
+            "61": 2521615360.0,
+            "62": 2521615360.0,
+            "63": 2521615360.0,
+            "64": 2521615360.0,
+            "65": 2521615360.0,
+            "66": 2521615360.0,
+            "67": 2521615360.0,
+            "68": 2521615360.0,
+            "69": 2521615360.0,
+            "70": 2521615360.0,
+            "71": 2521615360.0,
+            "72": 2521615360.0,
+            "73": 2521615360.0,
+            "74": 2521615360.0,
+            "75": 2521615360.0,
+            "76": 2521615360.0,
+            "77": 2521615360.0,
+            "78": 2521615360.0,
+            "79": 2521615360.0,
+            "80": 2521615360.0,
+            "81": 2521615360.0,
+            "82": 2521615360.0,
+            "83": 2521615360.0,
+            "84": 2521615360.0,
+            "85": 2521615360.0,
+            "86": 2521615360.0,
+            "87": 2521615360.0,
+            "88": 2521615360.0,
+            "89": 2521615360.0,
+            "90": 2521615360.0,
+            "91": 2521615360.0,
+            "92": 2521615360.0,
+            "93": 2521615360.0,
+            "94": 2521615360.0,
+            "95": 2523076096.0,
+            "96": 2523076096.0,
+            "97": 2523076096.0,
+            "98": 2523076096.0,
+            "99": 2523076096.0,
+            "100": 2523076096.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 16.55217,
-            "2": 0.35181,
-            "3": 0.30566,
-            "4": 0.27474,
-            "5": 0.25821,
-            "6": 0.24756,
-            "7": 0.26543,
-            "8": 0.25377,
-            "9": 0.25669,
-            "10": 0.24857,
-            "11": 0.25265,
-            "12": 0.25052,
-            "13": 0.25023,
-            "14": 0.24925,
-            "15": 0.26244,
-            "16": 0.25012,
-            "17": 0.26253,
-            "18": 0.24643,
-            "19": 0.24809,
-            "20": 0.24556,
-            "21": 0.24394,
-            "22": 0.251,
-            "23": 0.24828,
-            "24": 0.24669,
-            "25": 0.24387,
-            "26": 0.24678,
-            "27": 0.24651,
-            "28": 0.25139,
-            "29": 0.24752,
-            "30": 0.24424,
-            "31": 0.28311,
-            "32": 0.25225,
-            "33": 0.24909,
-            "34": 0.26885,
-            "35": 0.25395,
-            "36": 0.2523,
-            "37": 0.24797,
-            "38": 0.25223,
-            "39": 0.24992,
-            "40": 0.25852,
-            "41": 0.24878,
-            "42": 0.2538,
-            "43": 0.2597,
-            "44": 0.24622,
-            "45": 0.26158,
-            "46": 0.27295,
-            "47": 0.2509,
-            "48": 0.26644,
-            "49": 0.28407,
-            "50": 0.25557,
-            "51": 0.26677,
-            "52": 0.27657,
-            "53": 0.25511,
-            "54": 0.25626,
-            "55": 0.26088,
-            "56": 0.30712,
-            "57": 0.27149,
-            "58": 0.25315,
-            "59": 0.26247,
-            "60": 0.26163,
-            "61": 0.25105,
-            "62": 0.24787,
-            "63": 0.27859,
-            "64": 0.26395,
-            "65": 0.32678,
-            "66": 0.25441,
-            "67": 0.30841,
-            "68": 0.27583,
-            "69": 0.2474,
-            "70": 0.25895,
-            "71": 0.27463,
-            "72": 0.26044,
-            "73": 0.27953,
-            "74": 0.27908,
-            "75": 0.26127,
-            "76": 0.28492,
-            "77": 0.25287,
-            "78": 0.26927,
-            "79": 0.26632,
-            "80": 0.26465,
-            "81": 0.25418,
-            "82": 0.25,
-            "83": 0.26012,
-            "84": 0.27232,
-            "85": 0.25707,
-            "86": 0.26564,
-            "87": 0.25446,
-            "88": 0.24718,
-            "89": 0.26899,
-            "90": 0.24357,
-            "91": 0.27455,
-            "92": 0.25494,
-            "93": 0.26852,
-            "94": 0.27917,
-            "95": 0.258,
-            "96": 0.25134,
-            "97": 0.26377,
-            "98": 0.24669,
-            "99": 0.26096,
-            "100": 0.25411
+            "1": 17.78784,
+            "2": 0.2935,
+            "3": 0.25416,
+            "4": 0.28848,
+            "5": 0.27342,
+            "6": 0.21986,
+            "7": 0.22775,
+            "8": 0.21125,
+            "9": 0.22242,
+            "10": 0.20696,
+            "11": 0.21121,
+            "12": 0.20562,
+            "13": 0.20918,
+            "14": 0.20486,
+            "15": 0.22312,
+            "16": 0.20648,
+            "17": 0.21741,
+            "18": 0.20596,
+            "19": 0.20449,
+            "20": 0.20633,
+            "21": 0.20648,
+            "22": 0.20939,
+            "23": 0.20613,
+            "24": 0.2098,
+            "25": 0.21077,
+            "26": 0.20978,
+            "27": 0.20622,
+            "28": 0.20953,
+            "29": 0.2052,
+            "30": 0.20858,
+            "31": 0.23751,
+            "32": 0.20916,
+            "33": 0.21528,
+            "34": 0.22994,
+            "35": 0.20666,
+            "36": 0.56591,
+            "37": 0.2088,
+            "38": 0.20535,
+            "39": 0.20334,
+            "40": 0.21053,
+            "41": 0.20731,
+            "42": 0.21647,
+            "43": 0.21279,
+            "44": 0.20733,
+            "45": 0.22499,
+            "46": 0.22926,
+            "47": 0.21023,
+            "48": 0.21769,
+            "49": 0.24399,
+            "50": 0.21286,
+            "51": 0.238,
+            "52": 0.23293,
+            "53": 0.20987,
+            "54": 0.21516,
+            "55": 0.22388,
+            "56": 0.25985,
+            "57": 0.22604,
+            "58": 0.61513,
+            "59": 0.22219,
+            "60": 0.21734,
+            "61": 0.90688,
+            "62": 0.21705,
+            "63": 0.23992,
+            "64": 0.21828,
+            "65": 0.27683,
+            "66": 0.21653,
+            "67": 0.27213,
+            "68": 0.8349,
+            "69": 0.21293,
+            "70": 0.21051,
+            "71": 0.22862,
+            "72": 0.22498,
+            "73": 0.24298,
+            "74": 0.23094,
+            "75": 0.22956,
+            "76": 0.24583,
+            "77": 0.21646,
+            "78": 0.22364,
+            "79": 0.22898,
+            "80": 0.21878,
+            "81": 0.21415,
+            "82": 0.21267,
+            "83": 0.22485,
+            "84": 0.22454,
+            "85": 0.21746,
+            "86": 0.23031,
+            "87": 0.21423,
+            "88": 0.21226,
+            "89": 0.2196,
+            "90": 0.21327,
+            "91": 0.23392,
+            "92": 0.22086,
+            "93": 0.23306,
+            "94": 0.24169,
+            "95": 0.22202,
+            "96": 0.2155,
+            "97": 0.22184,
+            "98": 0.2139,
+            "99": 0.21705,
+            "100": 0.21654
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..f6f646ddf4a
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.90532,
+            "52": 9.8039,
+            "53": 10.12749,
+            "54": 10.00016,
+            "55": 9.93664,
+            "56": 9.68581,
+            "57": 9.55837,
+            "58": 9.90508,
+            "59": 9.63839,
+            "60": 9.57464,
+            "61": 9.76841,
+            "62": 10.03826,
+            "63": 9.44553,
+            "64": 9.82755,
+            "65": 9.00746,
+            "66": 9.77476,
+            "67": 9.41315,
+            "68": 9.84101,
+            "69": 9.8283,
+            "70": 9.79049,
+            "71": 9.66947,
+            "72": 9.62799,
+            "73": 9.54696,
+            "74": 9.03684,
+            "75": 9.49167,
+            "76": 9.16779,
+            "77": 10.1088,
+            "78": 9.77072,
+            "79": 9.43806,
+            "80": 9.45438,
+            "81": 9.5225,
+            "82": 9.74228,
+            "83": 9.36999,
+            "84": 9.45397,
+            "85": 9.65808,
+            "86": 9.12501,
+            "87": 9.62705,
+            "88": 9.79641,
+            "89": 9.66075,
+            "90": 9.8512,
+            "91": 9.39414,
+            "92": 9.40741,
+            "93": 9.13573,
+            "94": 8.89066,
+            "95": 9.56273,
+            "96": 9.5712,
+            "97": 9.34355,
+            "98": 9.73013,
+            "99": 8.95039,
+            "100": 9.44212
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 37791.0,
+            "52": 37021.0,
+            "53": 41944.0,
+            "54": 40947.0,
+            "55": 37727.0,
+            "56": 40761.0,
+            "57": 37481.0,
+            "58": 41787.0,
+            "59": 39365.0,
+            "60": 40922.0,
+            "61": 41100.0,
+            "62": 43388.0,
+            "63": 38269.0,
+            "64": 43526.0,
+            "65": 41821.0,
+            "66": 44876.0,
+            "67": 42497.0,
+            "68": 39967.0,
+            "69": 41255.0,
+            "70": 45781.0,
+            "71": 42348.0,
+            "72": 42151.0,
+            "73": 45043.0,
+            "74": 35705.0,
+            "75": 39397.0,
+            "76": 45340.0,
+            "77": 45670.0,
+            "78": 46614.0,
+            "79": 49159.0,
+            "80": 47317.0,
+            "81": 51048.0,
+            "82": 49312.0,
+            "83": 45257.0,
+            "84": 45494.0,
+            "85": 49366.0,
+            "86": 45783.0,
+            "87": 50223.0,
+            "88": 47536.0,
+            "89": 48826.0,
+            "90": 49499.0,
+            "91": 45726.0,
+            "92": 47926.0,
+            "93": 46433.0,
+            "94": 47675.0,
+            "95": 47504.0,
+            "96": 50174.0,
+            "97": 46465.0,
+            "98": 49255.0,
+            "99": 48053.0,
+            "100": 44507.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1254985216.0,
+            "52": 1254984704.0,
+            "53": 1254986240.0,
+            "54": 1254986752.0,
+            "55": 1254988800.0,
+            "56": 1254990336.0,
+            "57": 1254981632.0,
+            "58": 1254986752.0,
+            "59": 1254983680.0,
+            "60": 1254985216.0,
+            "61": 1254993408.0,
+            "62": 1254992384.0,
+            "63": 1254987776.0,
+            "64": 1254987264.0,
+            "65": 1254977024.0,
+            "66": 1254991360.0,
+            "67": 1254986752.0,
+            "68": 1254994432.0,
+            "69": 1254989824.0,
+            "70": 1254991360.0,
+            "71": 1254993408.0,
+            "72": 1254984704.0,
+            "73": 1254992896.0,
+            "74": 1254983680.0,
+            "75": 1254992896.0,
+            "76": 1254991872.0,
+            "77": 1254992384.0,
+            "78": 1254988800.0,
+            "79": 1254989312.0,
+            "80": 1254988288.0,
+            "81": 1254991872.0,
+            "82": 1254988800.0,
+            "83": 1254986240.0,
+            "84": 1254985728.0,
+            "85": 1254987776.0,
+            "86": 1254987264.0,
+            "87": 1254993920.0,
+            "88": 1254987776.0,
+            "89": 1254989312.0,
+            "90": 1254991360.0,
+            "91": 1254993920.0,
+            "92": 1254991360.0,
+            "93": 1254993920.0,
+            "94": 1254988800.0,
+            "95": 1254990336.0,
+            "96": 1254990336.0,
+            "97": 1254987776.0,
+            "98": 1254995968.0,
+            "99": 1254985728.0,
+            "100": 1254988288.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3124454912.0,
+            "52": 3124454912.0,
+            "53": 3124454912.0,
+            "54": 3124454912.0,
+            "55": 3124454912.0,
+            "56": 3124454912.0,
+            "57": 3124454912.0,
+            "58": 3124454912.0,
+            "59": 3124454912.0,
+            "60": 3124454912.0,
+            "61": 3124454912.0,
+            "62": 3124454912.0,
+            "63": 3124454912.0,
+            "64": 3124454912.0,
+            "65": 3124454912.0,
+            "66": 3124454912.0,
+            "67": 3124454912.0,
+            "68": 3124454912.0,
+            "69": 3124454912.0,
+            "70": 3124454912.0,
+            "71": 3124454912.0,
+            "72": 3124454912.0,
+            "73": 3124454912.0,
+            "74": 3124454912.0,
+            "75": 3124454912.0,
+            "76": 3124454912.0,
+            "77": 3124454912.0,
+            "78": 3124454912.0,
+            "79": 3124454912.0,
+            "80": 3124454912.0,
+            "81": 3124454912.0,
+            "82": 3124454912.0,
+            "83": 3124454912.0,
+            "84": 3124454912.0,
+            "85": 3124454912.0,
+            "86": 3124454912.0,
+            "87": 3124454912.0,
+            "88": 3124454912.0,
+            "89": 3124454912.0,
+            "90": 3124454912.0,
+            "91": 3124454912.0,
+            "92": 3124454912.0,
+            "93": 3124454912.0,
+            "94": 3124454912.0,
+            "95": 3124454912.0,
+            "96": 3124454912.0,
+            "97": 3124454912.0,
+            "98": 3124454912.0,
+            "99": 3124454912.0,
+            "100": 3124454912.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 14.75813,
+            "52": 0.32673,
+            "53": 0.25047,
+            "54": 0.24173,
+            "55": 0.23984,
+            "56": 0.28067,
+            "57": 0.24362,
+            "58": 0.23949,
+            "59": 0.22718,
+            "60": 0.22572,
+            "61": 0.21463,
+            "62": 0.21566,
+            "63": 0.24356,
+            "64": 0.22422,
+            "65": 0.28681,
+            "66": 0.2175,
+            "67": 0.268,
+            "68": 0.24975,
+            "69": 0.21136,
+            "70": 0.21698,
+            "71": 0.23525,
+            "72": 0.22621,
+            "73": 0.24672,
+            "74": 0.2348,
+            "75": 0.22093,
+            "76": 0.24479,
+            "77": 0.21587,
+            "78": 0.2274,
+            "79": 0.23052,
+            "80": 0.22194,
+            "81": 0.212,
+            "82": 0.21273,
+            "83": 0.22719,
+            "84": 0.23492,
+            "85": 0.22378,
+            "86": 0.2309,
+            "87": 0.21404,
+            "88": 0.21648,
+            "89": 0.2217,
+            "90": 0.59895,
+            "91": 0.23561,
+            "92": 0.22052,
+            "93": 0.22925,
+            "94": 0.23793,
+            "95": 0.22403,
+            "96": 0.21436,
+            "97": 0.22243,
+            "98": 0.21293,
+            "99": 0.21642,
+            "100": 0.21522
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml
index 9490d832f7d..8525e285ac9 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml
@@ -53,7 +53,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..ce3d79128b1
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80509,
+            "2": 10.82386,
+            "3": 10.80196,
+            "4": 10.79424,
+            "5": 10.8277,
+            "6": 10.84005,
+            "7": 10.8365,
+            "8": 10.82828,
+            "9": 10.83477,
+            "10": 10.77496,
+            "11": 10.85204,
+            "12": 10.83903,
+            "13": 10.85207,
+            "14": 10.85914,
+            "15": 10.81681,
+            "16": 10.79456,
+            "17": 10.77491,
+            "18": 10.80399,
+            "19": 10.79956,
+            "20": 10.73801,
+            "21": 10.72487,
+            "22": 10.59177,
+            "23": 10.73098,
+            "24": 10.6406,
+            "25": 10.59018,
+            "26": 10.63555,
+            "27": 10.66245,
+            "28": 10.6472,
+            "29": 10.64163,
+            "30": 10.4518,
+            "31": 10.22249,
+            "32": 10.52995,
+            "33": 10.51998,
+            "34": 10.31247,
+            "35": 10.34796,
+            "36": 10.31677,
+            "37": 10.42804,
+            "38": 10.29194,
+            "39": 10.46881,
+            "40": 10.19257,
+            "41": 10.23159,
+            "42": 10.29766,
+            "43": 9.97363,
+            "44": 10.07169,
+            "45": 9.97015,
+            "46": 9.94713,
+            "47": 10.23179,
+            "48": 9.97593,
+            "49": 9.67748,
+            "50": 10.0144
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 31182.0,
+            "2": 33013.0,
+            "3": 33646.0,
+            "4": 32202.0,
+            "5": 36913.0,
+            "6": 37554.0,
+            "7": 35184.0,
+            "8": 32207.0,
+            "9": 34523.0,
+            "10": 29945.0,
+            "11": 38237.0,
+            "12": 35346.0,
+            "13": 37426.0,
+            "14": 38358.0,
+            "15": 35140.0,
+            "16": 36293.0,
+            "17": 35645.0,
+            "18": 35117.0,
+            "19": 35648.0,
+            "20": 32896.0,
+            "21": 33511.0,
+            "22": 30704.0,
+            "23": 38149.0,
+            "24": 32677.0,
+            "25": 31055.0,
+            "26": 34700.0,
+            "27": 35410.0,
+            "28": 37268.0,
+            "29": 37953.0,
+            "30": 33210.0,
+            "31": 30482.0,
+            "32": 36908.0,
+            "33": 38308.0,
+            "34": 33125.0,
+            "35": 34341.0,
+            "36": 34925.0,
+            "37": 38767.0,
+            "38": 35780.0,
+            "39": 38955.0,
+            "40": 36485.0,
+            "41": 36015.0,
+            "42": 37638.0,
+            "43": 33689.0,
+            "44": 33688.0,
+            "45": 35448.0,
+            "46": 36810.0,
+            "47": 40858.0,
+            "48": 35696.0,
+            "49": 34729.0,
+            "50": 39077.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1027095040.0,
+            "2": 1027093504.0,
+            "3": 1027094528.0,
+            "4": 1027095040.0,
+            "5": 1027091968.0,
+            "6": 1027091968.0,
+            "7": 1027098112.0,
+            "8": 1027097600.0,
+            "9": 1027094528.0,
+            "10": 1027094016.0,
+            "11": 1027098624.0,
+            "12": 1027094528.0,
+            "13": 1027092480.0,
+            "14": 1027095040.0,
+            "15": 1027095040.0,
+            "16": 1027091456.0,
+            "17": 1027101184.0,
+            "18": 1027096064.0,
+            "19": 1027093504.0,
+            "20": 1027093504.0,
+            "21": 1027097088.0,
+            "22": 1027100160.0,
+            "23": 1027100160.0,
+            "24": 1027095552.0,
+            "25": 1027097088.0,
+            "26": 1027098112.0,
+            "27": 1027091456.0,
+            "28": 1027090944.0,
+            "29": 1027091968.0,
+            "30": 1027099648.0,
+            "31": 1027109888.0,
+            "32": 1027095552.0,
+            "33": 1027090944.0,
+            "34": 1027098112.0,
+            "35": 1027103744.0,
+            "36": 1027098112.0,
+            "37": 1027092480.0,
+            "38": 1027091456.0,
+            "39": 1027095040.0,
+            "40": 1027095040.0,
+            "41": 1027100160.0,
+            "42": 1027091968.0,
+            "43": 1027098624.0,
+            "44": 1027098624.0,
+            "45": 1027096064.0,
+            "46": 1027104256.0,
+            "47": 1027093504.0,
+            "48": 1027101184.0,
+            "49": 1027096064.0,
+            "50": 1027095552.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3059586560.0,
+            "2": 3299159040.0,
+            "3": 3299482112.0,
+            "4": 3302137344.0,
+            "5": 3302137344.0,
+            "6": 3302137344.0,
+            "7": 3303535104.0,
+            "8": 3303535104.0,
+            "9": 3303535104.0,
+            "10": 3303535104.0,
+            "11": 3303535104.0,
+            "12": 3303535104.0,
+            "13": 3303535104.0,
+            "14": 3303535104.0,
+            "15": 3303535104.0,
+            "16": 3303535104.0,
+            "17": 3306910208.0,
+            "18": 3306910208.0,
+            "19": 3306910208.0,
+            "20": 3306910208.0,
+            "21": 3306910208.0,
+            "22": 3306910208.0,
+            "23": 3306910208.0,
+            "24": 3306910208.0,
+            "25": 3306910208.0,
+            "26": 3306910208.0,
+            "27": 3306910208.0,
+            "28": 3306910208.0,
+            "29": 3306910208.0,
+            "30": 3306910208.0,
+            "31": 3312495616.0,
+            "32": 3312495616.0,
+            "33": 3312495616.0,
+            "34": 3312495616.0,
+            "35": 3312495616.0,
+            "36": 3312495616.0,
+            "37": 3312495616.0,
+            "38": 3312495616.0,
+            "39": 3312495616.0,
+            "40": 3312495616.0,
+            "41": 3312495616.0,
+            "42": 3312495616.0,
+            "43": 3312495616.0,
+            "44": 3312495616.0,
+            "45": 3312495616.0,
+            "46": 3312495616.0,
+            "47": 3312495616.0,
+            "48": 3312495616.0,
+            "49": 3312495616.0,
+            "50": 3312495616.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.82645,
+            "3": 0.34371,
+            "4": 0.32704,
+            "5": 0.31536,
+            "6": 0.32001,
+            "7": 0.31919,
+            "8": 0.31719,
+            "9": 0.31876,
+            "10": 0.31015,
+            "11": 0.31546,
+            "12": 0.31198,
+            "13": 0.31518,
+            "14": 0.40567,
+            "15": 0.31856,
+            "16": 0.30868,
+            "17": 0.31352,
+            "18": 0.31536,
+            "19": 0.31164,
+            "20": 0.31286,
+            "21": 0.35519,
+            "22": 0.30985,
+            "23": 0.31256,
+            "24": 0.31727,
+            "25": 0.36651,
+            "26": 0.47287,
+            "27": 0.57438,
+            "28": 0.3575,
+            "29": 0.71431,
+            "30": 0.31163,
+            "31": 0.31877,
+            "32": 0.34436,
+            "33": 0.51773,
+            "34": 0.32292,
+            "35": 0.31651,
+            "36": 0.34162,
+            "37": 0.31339,
+            "38": 0.30524,
+            "39": 0.63856,
+            "40": 0.31883,
+            "41": 0.31475,
+            "42": 0.67365,
+            "43": 0.33393,
+            "44": 0.31389,
+            "45": 0.65089,
+            "46": 0.6524,
+            "47": 0.3061,
+            "48": 0.30487,
+            "49": 0.3295,
+            "50": 0.30784
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
index 089545b6f4a..38498d3139b 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1027085824.0,
-            "2": 1027085824.0,
-            "3": 1027086848.0,
-            "4": 1027086336.0,
-            "5": 1027086848.0,
-            "6": 1027085312.0,
-            "7": 1027081728.0,
-            "8": 1027082752.0,
-            "9": 1027089408.0,
-            "10": 1027083776.0,
-            "11": 1027084288.0,
-            "12": 1027084288.0,
-            "13": 1027086848.0,
-            "14": 1027083776.0,
-            "15": 1027085312.0,
-            "16": 1027086336.0,
-            "17": 1027084288.0,
-            "18": 1027088384.0,
-            "19": 1027086848.0,
-            "20": 1027089920.0,
-            "21": 1027083264.0,
-            "22": 1027086336.0,
-            "23": 1027086848.0,
-            "24": 1027085824.0,
-            "25": 1027084288.0,
-            "26": 1027085312.0,
-            "27": 1027085312.0,
-            "28": 1027082752.0,
-            "29": 1027083776.0,
-            "30": 1027082240.0,
-            "31": 1027074048.0,
-            "32": 1027077120.0,
-            "33": 1027086336.0,
-            "34": 1027083264.0,
-            "35": 1027085312.0,
-            "36": 1027083776.0,
-            "37": 1027084288.0,
-            "38": 1027085312.0,
-            "39": 1027080704.0,
-            "40": 1027081728.0,
-            "41": 1027083264.0,
-            "42": 1027086848.0,
-            "43": 1027079680.0,
-            "44": 1027082752.0,
-            "45": 1027082752.0,
-            "46": 1027073536.0,
-            "47": 1027082752.0,
-            "48": 1027081216.0,
-            "49": 1027077120.0,
-            "50": 1027084800.0
+            "1": 1027090944.0,
+            "2": 1027090944.0,
+            "3": 1027091968.0,
+            "4": 1027091456.0,
+            "5": 1027091968.0,
+            "6": 1027090432.0,
+            "7": 1027086848.0,
+            "8": 1027087872.0,
+            "9": 1027094528.0,
+            "10": 1027088896.0,
+            "11": 1027089408.0,
+            "12": 1027089408.0,
+            "13": 1027091968.0,
+            "14": 1027088896.0,
+            "15": 1027090432.0,
+            "16": 1027091456.0,
+            "17": 1027089408.0,
+            "18": 1027093504.0,
+            "19": 1027091968.0,
+            "20": 1027095040.0,
+            "21": 1027088384.0,
+            "22": 1027091456.0,
+            "23": 1027091968.0,
+            "24": 1027090944.0,
+            "25": 1027089408.0,
+            "26": 1027090432.0,
+            "27": 1027090432.0,
+            "28": 1027087872.0,
+            "29": 1027088896.0,
+            "30": 1027087360.0,
+            "31": 1027079168.0,
+            "32": 1027082240.0,
+            "33": 1027091456.0,
+            "34": 1027088384.0,
+            "35": 1027090432.0,
+            "36": 1027088896.0,
+            "37": 1027089408.0,
+            "38": 1027090432.0,
+            "39": 1027085824.0,
+            "40": 1027086848.0,
+            "41": 1027088384.0,
+            "42": 1027091968.0,
+            "43": 1027084800.0,
+            "44": 1027087872.0,
+            "45": 1027087872.0,
+            "46": 1027078656.0,
+            "47": 1027087872.0,
+            "48": 1027086336.0,
+            "49": 1027082240.0,
+            "50": 1027089920.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3007080960.0,
-            "2": 3247499776.0,
-            "3": 3247499776.0,
-            "4": 3248093184.0,
-            "5": 3248476160.0,
-            "6": 3248476160.0,
-            "7": 3248476160.0,
-            "8": 3248476160.0,
-            "9": 3248476160.0,
-            "10": 3249142784.0,
-            "11": 3249142784.0,
-            "12": 3249142784.0,
-            "13": 3249142784.0,
-            "14": 3249142784.0,
-            "15": 3249142784.0,
-            "16": 3249142784.0,
-            "17": 3249142784.0,
-            "18": 3249142784.0,
-            "19": 3249142784.0,
-            "20": 3249142784.0,
-            "21": 3249142784.0,
-            "22": 3249860608.0,
-            "23": 3249860608.0,
-            "24": 3249972736.0,
-            "25": 3249972736.0,
-            "26": 3249972736.0,
-            "27": 3249972736.0,
-            "28": 3249972736.0,
-            "29": 3249972736.0,
-            "30": 3249972736.0,
-            "31": 3249972736.0,
-            "32": 3249972736.0,
-            "33": 3249972736.0,
-            "34": 3249972736.0,
-            "35": 3249972736.0,
-            "36": 3249972736.0,
-            "37": 3249972736.0,
-            "38": 3249972736.0,
-            "39": 3249972736.0,
-            "40": 3249972736.0,
-            "41": 3249972736.0,
-            "42": 3249972736.0,
-            "43": 3249972736.0,
-            "44": 3249972736.0,
-            "45": 3249972736.0,
-            "46": 3249972736.0,
-            "47": 3249972736.0,
-            "48": 3249972736.0,
-            "49": 3249972736.0,
-            "50": 3249972736.0
+            "1": 3057868288.0,
+            "2": 3298335232.0,
+            "3": 3298335232.0,
+            "4": 3300084224.0,
+            "5": 3300084224.0,
+            "6": 3300084224.0,
+            "7": 3300084224.0,
+            "8": 3300084224.0,
+            "9": 3300084224.0,
+            "10": 3300122624.0,
+            "11": 3300122624.0,
+            "12": 3300122624.0,
+            "13": 3300122624.0,
+            "14": 3300122624.0,
+            "15": 3300122624.0,
+            "16": 3300122624.0,
+            "17": 3300122624.0,
+            "18": 3300122624.0,
+            "19": 3300376576.0,
+            "20": 3300416000.0,
+            "21": 3300416000.0,
+            "22": 3301032960.0,
+            "23": 3301998080.0,
+            "24": 3301998080.0,
+            "25": 3301998080.0,
+            "26": 3301998080.0,
+            "27": 3301998080.0,
+            "28": 3301998080.0,
+            "29": 3301998080.0,
+            "30": 3301998080.0,
+            "31": 3301998080.0,
+            "32": 3301998080.0,
+            "33": 3301998080.0,
+            "34": 3301998080.0,
+            "35": 3301998080.0,
+            "36": 3301998080.0,
+            "37": 3301998080.0,
+            "38": 3301998080.0,
+            "39": 3301998080.0,
+            "40": 3301998080.0,
+            "41": 3301998080.0,
+            "42": 3301998080.0,
+            "43": 3301998080.0,
+            "44": 3301998080.0,
+            "45": 3301998080.0,
+            "46": 3301998080.0,
+            "47": 3301998080.0,
+            "48": 3301998080.0,
+            "49": 3301998080.0,
+            "50": 3301998080.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.20887,
-            "2": 0.29449,
-            "3": 0.26099,
-            "4": 0.25199,
-            "5": 0.24285,
-            "6": 0.23658,
-            "7": 0.24248,
-            "8": 0.23258,
-            "9": 0.22661,
-            "10": 0.23769,
-            "11": 0.22933,
-            "12": 0.23288,
-            "13": 0.23074,
-            "14": 0.22376,
-            "15": 0.25054,
-            "16": 0.22881,
-            "17": 0.23932,
-            "18": 0.22427,
-            "19": 0.23467,
-            "20": 0.22747,
-            "21": 0.22662,
-            "22": 0.22866,
-            "23": 0.22726,
-            "24": 0.22901,
-            "25": 0.22654,
-            "26": 0.22683,
-            "27": 0.22909,
-            "28": 0.2264,
-            "29": 0.23339,
-            "30": 0.23066,
-            "31": 0.27285,
-            "32": 0.22966,
-            "33": 0.23016,
-            "34": 0.24956,
-            "35": 0.23114,
-            "36": 0.24161,
-            "37": 0.22585,
-            "38": 0.23047,
-            "39": 0.22695,
-            "40": 0.24845,
-            "41": 0.23491,
-            "42": 0.22656,
-            "43": 0.23744,
-            "44": 0.23602,
-            "45": 0.24859,
-            "46": 0.25828,
-            "47": 0.2367,
-            "48": 0.2564,
-            "49": 0.27812,
-            "50": 0.23401
+            "1": 16.45405,
+            "2": 0.30024,
+            "3": 0.24416,
+            "4": 0.22949,
+            "5": 0.21642,
+            "6": 0.20677,
+            "7": 0.21591,
+            "8": 0.21087,
+            "9": 0.20973,
+            "10": 0.20724,
+            "11": 0.20594,
+            "12": 0.20225,
+            "13": 0.21091,
+            "14": 0.2028,
+            "15": 0.22641,
+            "16": 0.20409,
+            "17": 0.21141,
+            "18": 0.20363,
+            "19": 0.20701,
+            "20": 0.2078,
+            "21": 0.20171,
+            "22": 0.20432,
+            "23": 0.19941,
+            "24": 0.20413,
+            "25": 0.20204,
+            "26": 0.20188,
+            "27": 0.60524,
+            "28": 0.21001,
+            "29": 0.20338,
+            "30": 0.20253,
+            "31": 0.2399,
+            "32": 0.19914,
+            "33": 0.20122,
+            "34": 0.22929,
+            "35": 0.20106,
+            "36": 0.22225,
+            "37": 0.20411,
+            "38": 0.20267,
+            "39": 0.19726,
+            "40": 0.21398,
+            "41": 0.21317,
+            "42": 0.20362,
+            "43": 0.20696,
+            "44": 0.20834,
+            "45": 0.21563,
+            "46": 0.22195,
+            "47": 0.20394,
+            "48": 0.22663,
+            "49": 0.24701,
+            "50": 0.20255
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json
index f91ad30ed3a..512f1302b5f 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json
@@ -175,7 +175,7 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2910859264.0,
+            "1": 2910130176.0,
             "2": 3151821824.0,
             "3": 3152806912.0,
             "4": 3156619264.0,
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 6.90142,
-            "2": 0.35609,
-            "3": 0.29589,
-            "4": 0.29327,
-            "5": 0.29594,
-            "6": 0.293,
-            "7": 0.29087,
-            "8": 0.29178,
-            "9": 0.29184,
-            "10": 0.29303,
-            "11": 0.29381,
-            "12": 0.29249,
-            "13": 0.2936,
-            "14": 0.29671,
-            "15": 0.29969,
-            "16": 0.30214,
-            "17": 0.29463,
-            "18": 0.30986,
-            "19": 0.29429,
-            "20": 0.29497,
-            "21": 0.29609,
-            "22": 0.29421,
-            "23": 0.2931,
-            "24": 0.29341,
-            "25": 0.29443,
-            "26": 0.28879,
-            "27": 0.28844,
-            "28": 0.28873,
-            "29": 0.28741,
-            "30": 0.28737,
-            "31": 0.28905,
-            "32": 0.28701,
-            "33": 0.28706,
-            "34": 0.28739,
-            "35": 0.28701,
-            "36": 0.28751,
-            "37": 0.28826,
-            "38": 0.28792,
-            "39": 0.28663,
-            "40": 0.28805,
-            "41": 0.28776,
-            "42": 0.28855,
-            "43": 0.28777,
-            "44": 0.28801,
-            "45": 0.2885,
-            "46": 0.28907,
-            "47": 0.28755,
-            "48": 0.28719,
-            "49": 0.28878,
-            "50": 0.28677
+            "1": 6.10504,
+            "2": 0.31901,
+            "3": 0.30905,
+            "4": 0.29474,
+            "5": 0.29396,
+            "6": 0.29282,
+            "7": 0.29057,
+            "8": 0.2914,
+            "9": 0.29228,
+            "10": 0.29365,
+            "11": 0.29209,
+            "12": 0.28885,
+            "13": 0.28831,
+            "14": 0.28848,
+            "15": 0.29001,
+            "16": 0.28893,
+            "17": 0.28956,
+            "18": 0.28887,
+            "19": 0.28776,
+            "20": 0.28952,
+            "21": 0.6384,
+            "22": 0.29529,
+            "23": 0.29475,
+            "24": 0.29441,
+            "25": 0.29534,
+            "26": 0.29435,
+            "27": 0.29559,
+            "28": 0.30134,
+            "29": 0.2903,
+            "30": 0.28843,
+            "31": 0.28861,
+            "32": 0.28817,
+            "33": 0.29466,
+            "34": 0.28874,
+            "35": 0.28729,
+            "36": 0.28824,
+            "37": 0.28808,
+            "38": 0.28729,
+            "39": 0.28702,
+            "40": 0.28605,
+            "41": 0.28667,
+            "42": 0.2877,
+            "43": 0.28836,
+            "44": 0.28722,
+            "45": 0.28782,
+            "46": 0.28798,
+            "47": 0.28716,
+            "48": 0.28759,
+            "49": 0.28891,
+            "50": 0.28753
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..f62929eef31
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80509,
+            "2": 10.82386,
+            "3": 10.80196,
+            "4": 10.79424,
+            "5": 10.8277,
+            "6": 10.84005,
+            "7": 10.8365,
+            "8": 10.82828,
+            "9": 10.83477,
+            "10": 10.77496,
+            "11": 10.85204,
+            "12": 10.83903,
+            "13": 10.85207,
+            "14": 10.85914,
+            "15": 10.81681,
+            "16": 10.79456,
+            "17": 10.77491,
+            "18": 10.80399,
+            "19": 10.79956,
+            "20": 10.73801,
+            "21": 10.72487,
+            "22": 10.59177,
+            "23": 10.73098,
+            "24": 10.6406,
+            "25": 10.59018,
+            "26": 10.63555,
+            "27": 10.66245,
+            "28": 10.6472,
+            "29": 10.64163,
+            "30": 10.4518,
+            "31": 10.22249,
+            "32": 10.52995,
+            "33": 10.51998,
+            "34": 10.31247,
+            "35": 10.34796,
+            "36": 10.31677,
+            "37": 10.42804,
+            "38": 10.29194,
+            "39": 10.46881,
+            "40": 10.19257,
+            "41": 10.23159,
+            "42": 10.29766,
+            "43": 9.97363,
+            "44": 10.07169,
+            "45": 9.97015,
+            "46": 9.94713,
+            "47": 10.23179,
+            "48": 9.97593,
+            "49": 9.67748,
+            "50": 10.0144
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 31182.0,
+            "2": 33013.0,
+            "3": 33646.0,
+            "4": 32202.0,
+            "5": 36913.0,
+            "6": 37554.0,
+            "7": 35184.0,
+            "8": 32207.0,
+            "9": 34523.0,
+            "10": 29945.0,
+            "11": 38237.0,
+            "12": 35346.0,
+            "13": 37426.0,
+            "14": 38358.0,
+            "15": 35140.0,
+            "16": 36293.0,
+            "17": 35645.0,
+            "18": 35117.0,
+            "19": 35648.0,
+            "20": 32896.0,
+            "21": 33511.0,
+            "22": 30704.0,
+            "23": 38149.0,
+            "24": 32677.0,
+            "25": 31055.0,
+            "26": 34700.0,
+            "27": 35410.0,
+            "28": 37268.0,
+            "29": 37953.0,
+            "30": 33210.0,
+            "31": 30482.0,
+            "32": 36908.0,
+            "33": 38308.0,
+            "34": 33125.0,
+            "35": 34341.0,
+            "36": 34925.0,
+            "37": 38767.0,
+            "38": 35780.0,
+            "39": 38955.0,
+            "40": 36485.0,
+            "41": 36015.0,
+            "42": 37638.0,
+            "43": 33689.0,
+            "44": 33688.0,
+            "45": 35448.0,
+            "46": 36810.0,
+            "47": 40858.0,
+            "48": 35696.0,
+            "49": 34729.0,
+            "50": 39077.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1027095040.0,
+            "2": 1027093504.0,
+            "3": 1027094528.0,
+            "4": 1027095040.0,
+            "5": 1027091968.0,
+            "6": 1027091968.0,
+            "7": 1027098112.0,
+            "8": 1027097600.0,
+            "9": 1027094528.0,
+            "10": 1027094016.0,
+            "11": 1027098624.0,
+            "12": 1027094528.0,
+            "13": 1027092480.0,
+            "14": 1027095040.0,
+            "15": 1027095040.0,
+            "16": 1027091456.0,
+            "17": 1027101184.0,
+            "18": 1027096064.0,
+            "19": 1027093504.0,
+            "20": 1027093504.0,
+            "21": 1027097088.0,
+            "22": 1027100160.0,
+            "23": 1027100160.0,
+            "24": 1027095552.0,
+            "25": 1027097088.0,
+            "26": 1027098112.0,
+            "27": 1027091456.0,
+            "28": 1027090944.0,
+            "29": 1027091968.0,
+            "30": 1027099648.0,
+            "31": 1027109888.0,
+            "32": 1027095552.0,
+            "33": 1027090944.0,
+            "34": 1027098112.0,
+            "35": 1027103744.0,
+            "36": 1027098112.0,
+            "37": 1027092480.0,
+            "38": 1027091456.0,
+            "39": 1027095040.0,
+            "40": 1027095040.0,
+            "41": 1027100160.0,
+            "42": 1027091968.0,
+            "43": 1027098624.0,
+            "44": 1027098624.0,
+            "45": 1027096064.0,
+            "46": 1027104256.0,
+            "47": 1027093504.0,
+            "48": 1027101184.0,
+            "49": 1027096064.0,
+            "50": 1027095552.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3059586560.0,
+            "2": 3299159040.0,
+            "3": 3299482112.0,
+            "4": 3302137344.0,
+            "5": 3302137344.0,
+            "6": 3302137344.0,
+            "7": 3303535104.0,
+            "8": 3303535104.0,
+            "9": 3303535104.0,
+            "10": 3303535104.0,
+            "11": 3303535104.0,
+            "12": 3303535104.0,
+            "13": 3303535104.0,
+            "14": 3303535104.0,
+            "15": 3303535104.0,
+            "16": 3303535104.0,
+            "17": 3306910208.0,
+            "18": 3306910208.0,
+            "19": 3306910208.0,
+            "20": 3306910208.0,
+            "21": 3306910208.0,
+            "22": 3306910208.0,
+            "23": 3306910208.0,
+            "24": 3306910208.0,
+            "25": 3306910208.0,
+            "26": 3306910208.0,
+            "27": 3306910208.0,
+            "28": 3306910208.0,
+            "29": 3306910208.0,
+            "30": 3306910208.0,
+            "31": 3312495616.0,
+            "32": 3312495616.0,
+            "33": 3312495616.0,
+            "34": 3312495616.0,
+            "35": 3312495616.0,
+            "36": 3312495616.0,
+            "37": 3312495616.0,
+            "38": 3312495616.0,
+            "39": 3312495616.0,
+            "40": 3312495616.0,
+            "41": 3312495616.0,
+            "42": 3312495616.0,
+            "43": 3312495616.0,
+            "44": 3312495616.0,
+            "45": 3312495616.0,
+            "46": 3312495616.0,
+            "47": 3312495616.0,
+            "48": 3312495616.0,
+            "49": 3312495616.0,
+            "50": 3312495616.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.6307,
+            "3": 0.3854,
+            "4": 0.38116,
+            "5": 0.36866,
+            "6": 0.36756,
+            "7": 0.37196,
+            "8": 0.37096,
+            "9": 0.36719,
+            "10": 0.36516,
+            "11": 0.36882,
+            "12": 0.37126,
+            "13": 0.36294,
+            "14": 0.36799,
+            "15": 0.3669,
+            "16": 0.36835,
+            "17": 0.37548,
+            "18": 0.37236,
+            "19": 0.36274,
+            "20": 0.36388,
+            "21": 0.36581,
+            "22": 0.3703,
+            "23": 0.36921,
+            "24": 0.35712,
+            "25": 0.36049,
+            "26": 0.36512,
+            "27": 0.36657,
+            "28": 0.36074,
+            "29": 0.41887,
+            "30": 0.45698,
+            "31": 0.54747,
+            "32": 0.4695,
+            "33": 0.67157,
+            "34": 0.4186,
+            "35": 0.39703,
+            "36": 0.40139,
+            "37": 0.39345,
+            "38": 0.38789,
+            "39": 1.0807,
+            "40": 0.42023,
+            "41": 0.3945,
+            "42": 0.39312,
+            "43": 0.41319,
+            "44": 0.40657,
+            "45": 0.4003,
+            "46": 0.3986,
+            "47": 0.38501,
+            "48": 0.38618,
+            "49": 0.38586,
+            "50": 0.38297
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
index c49c5a579c0..b626738d63e 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1027085824.0,
-            "2": 1027085824.0,
-            "3": 1027086848.0,
-            "4": 1027086336.0,
-            "5": 1027086848.0,
-            "6": 1027085312.0,
-            "7": 1027081728.0,
-            "8": 1027082752.0,
-            "9": 1027089408.0,
-            "10": 1027083776.0,
-            "11": 1027084288.0,
-            "12": 1027084288.0,
-            "13": 1027086848.0,
-            "14": 1027083776.0,
-            "15": 1027085312.0,
-            "16": 1027086336.0,
-            "17": 1027084288.0,
-            "18": 1027088384.0,
-            "19": 1027086848.0,
-            "20": 1027089920.0,
-            "21": 1027083264.0,
-            "22": 1027086336.0,
-            "23": 1027086848.0,
-            "24": 1027085824.0,
-            "25": 1027084288.0,
-            "26": 1027085312.0,
-            "27": 1027085312.0,
-            "28": 1027082752.0,
-            "29": 1027083776.0,
-            "30": 1027082240.0,
-            "31": 1027074048.0,
-            "32": 1027077120.0,
-            "33": 1027086336.0,
-            "34": 1027083264.0,
-            "35": 1027085312.0,
-            "36": 1027083776.0,
-            "37": 1027084288.0,
-            "38": 1027085312.0,
-            "39": 1027080704.0,
-            "40": 1027081728.0,
-            "41": 1027083264.0,
-            "42": 1027086848.0,
-            "43": 1027079680.0,
-            "44": 1027082752.0,
-            "45": 1027082752.0,
-            "46": 1027073536.0,
-            "47": 1027082752.0,
-            "48": 1027081216.0,
-            "49": 1027077120.0,
-            "50": 1027084800.0
+            "1": 1027090944.0,
+            "2": 1027090944.0,
+            "3": 1027091968.0,
+            "4": 1027091456.0,
+            "5": 1027091968.0,
+            "6": 1027090432.0,
+            "7": 1027086848.0,
+            "8": 1027087872.0,
+            "9": 1027094528.0,
+            "10": 1027088896.0,
+            "11": 1027089408.0,
+            "12": 1027089408.0,
+            "13": 1027091968.0,
+            "14": 1027088896.0,
+            "15": 1027090432.0,
+            "16": 1027091456.0,
+            "17": 1027089408.0,
+            "18": 1027093504.0,
+            "19": 1027091968.0,
+            "20": 1027095040.0,
+            "21": 1027088384.0,
+            "22": 1027091456.0,
+            "23": 1027091968.0,
+            "24": 1027090944.0,
+            "25": 1027089408.0,
+            "26": 1027090432.0,
+            "27": 1027090432.0,
+            "28": 1027087872.0,
+            "29": 1027088896.0,
+            "30": 1027087360.0,
+            "31": 1027079168.0,
+            "32": 1027082240.0,
+            "33": 1027091456.0,
+            "34": 1027088384.0,
+            "35": 1027090432.0,
+            "36": 1027088896.0,
+            "37": 1027089408.0,
+            "38": 1027090432.0,
+            "39": 1027085824.0,
+            "40": 1027086848.0,
+            "41": 1027088384.0,
+            "42": 1027091968.0,
+            "43": 1027084800.0,
+            "44": 1027087872.0,
+            "45": 1027087872.0,
+            "46": 1027078656.0,
+            "47": 1027087872.0,
+            "48": 1027086336.0,
+            "49": 1027082240.0,
+            "50": 1027089920.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3007080960.0,
-            "2": 3247499776.0,
-            "3": 3247499776.0,
-            "4": 3248093184.0,
-            "5": 3248476160.0,
-            "6": 3248476160.0,
-            "7": 3248476160.0,
-            "8": 3248476160.0,
-            "9": 3248476160.0,
-            "10": 3249142784.0,
-            "11": 3249142784.0,
-            "12": 3249142784.0,
-            "13": 3249142784.0,
-            "14": 3249142784.0,
-            "15": 3249142784.0,
-            "16": 3249142784.0,
-            "17": 3249142784.0,
-            "18": 3249142784.0,
-            "19": 3249142784.0,
-            "20": 3249142784.0,
-            "21": 3249142784.0,
-            "22": 3249860608.0,
-            "23": 3249860608.0,
-            "24": 3249972736.0,
-            "25": 3249972736.0,
-            "26": 3249972736.0,
-            "27": 3249972736.0,
-            "28": 3249972736.0,
-            "29": 3249972736.0,
-            "30": 3249972736.0,
-            "31": 3249972736.0,
-            "32": 3249972736.0,
-            "33": 3249972736.0,
-            "34": 3249972736.0,
-            "35": 3249972736.0,
-            "36": 3249972736.0,
-            "37": 3249972736.0,
-            "38": 3249972736.0,
-            "39": 3249972736.0,
-            "40": 3249972736.0,
-            "41": 3249972736.0,
-            "42": 3249972736.0,
-            "43": 3249972736.0,
-            "44": 3249972736.0,
-            "45": 3249972736.0,
-            "46": 3249972736.0,
-            "47": 3249972736.0,
-            "48": 3249972736.0,
-            "49": 3249972736.0,
-            "50": 3249972736.0
+            "1": 3057868288.0,
+            "2": 3298335232.0,
+            "3": 3298335232.0,
+            "4": 3300084224.0,
+            "5": 3300084224.0,
+            "6": 3300084224.0,
+            "7": 3300084224.0,
+            "8": 3300084224.0,
+            "9": 3300084224.0,
+            "10": 3300122624.0,
+            "11": 3300122624.0,
+            "12": 3300122624.0,
+            "13": 3300122624.0,
+            "14": 3300122624.0,
+            "15": 3300122624.0,
+            "16": 3300122624.0,
+            "17": 3300122624.0,
+            "18": 3300122624.0,
+            "19": 3300376576.0,
+            "20": 3300416000.0,
+            "21": 3300416000.0,
+            "22": 3301032960.0,
+            "23": 3301998080.0,
+            "24": 3301998080.0,
+            "25": 3301998080.0,
+            "26": 3301998080.0,
+            "27": 3301998080.0,
+            "28": 3301998080.0,
+            "29": 3301998080.0,
+            "30": 3301998080.0,
+            "31": 3301998080.0,
+            "32": 3301998080.0,
+            "33": 3301998080.0,
+            "34": 3301998080.0,
+            "35": 3301998080.0,
+            "36": 3301998080.0,
+            "37": 3301998080.0,
+            "38": 3301998080.0,
+            "39": 3301998080.0,
+            "40": 3301998080.0,
+            "41": 3301998080.0,
+            "42": 3301998080.0,
+            "43": 3301998080.0,
+            "44": 3301998080.0,
+            "45": 3301998080.0,
+            "46": 3301998080.0,
+            "47": 3301998080.0,
+            "48": 3301998080.0,
+            "49": 3301998080.0,
+            "50": 3301998080.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.35552,
-            "2": 0.37785,
-            "3": 0.29632,
-            "4": 0.29599,
-            "5": 0.25057,
-            "6": 0.2376,
-            "7": 0.24788,
-            "8": 0.2386,
-            "9": 0.23567,
-            "10": 0.23981,
-            "11": 0.23457,
-            "12": 0.23608,
-            "13": 0.24093,
-            "14": 0.23076,
-            "15": 0.25524,
-            "16": 0.23573,
-            "17": 0.24636,
-            "18": 0.2348,
-            "19": 0.23922,
-            "20": 0.23445,
-            "21": 0.22924,
-            "22": 0.23872,
-            "23": 0.23172,
-            "24": 0.23116,
-            "25": 0.23103,
-            "26": 0.23556,
-            "27": 0.23228,
-            "28": 0.23323,
-            "29": 0.23495,
-            "30": 0.23011,
-            "31": 0.27652,
-            "32": 0.23015,
-            "33": 0.22902,
-            "34": 0.25666,
-            "35": 0.23045,
-            "36": 0.24626,
-            "37": 0.23146,
-            "38": 0.2344,
-            "39": 0.22864,
-            "40": 0.24642,
-            "41": 0.23788,
-            "42": 0.23274,
-            "43": 0.24326,
-            "44": 0.23733,
-            "45": 0.24263,
-            "46": 0.25392,
-            "47": 0.23328,
-            "48": 0.26156,
-            "49": 0.27837,
-            "50": 0.23303
+            "1": 15.57121,
+            "2": 0.28312,
+            "3": 0.24431,
+            "4": 0.2266,
+            "5": 0.21347,
+            "6": 0.20803,
+            "7": 0.2145,
+            "8": 0.20409,
+            "9": 0.2038,
+            "10": 0.20378,
+            "11": 0.20122,
+            "12": 0.20047,
+            "13": 0.2053,
+            "14": 0.20008,
+            "15": 0.22405,
+            "16": 0.19642,
+            "17": 0.20937,
+            "18": 0.19918,
+            "19": 0.2032,
+            "20": 0.19792,
+            "21": 0.19626,
+            "22": 0.20047,
+            "23": 0.19555,
+            "24": 0.2,
+            "25": 0.23371,
+            "26": 0.2005,
+            "27": 0.59196,
+            "28": 0.19966,
+            "29": 0.20231,
+            "30": 0.19778,
+            "31": 0.23768,
+            "32": 0.20526,
+            "33": 0.20518,
+            "34": 0.22786,
+            "35": 0.20088,
+            "36": 0.21894,
+            "37": 0.20033,
+            "38": 0.20352,
+            "39": 0.19985,
+            "40": 0.20975,
+            "41": 0.2189,
+            "42": 0.20277,
+            "43": 0.20495,
+            "44": 0.20563,
+            "45": 0.21473,
+            "46": 0.21859,
+            "47": 0.2018,
+            "48": 0.22732,
+            "49": 0.2668,
+            "50": 0.19761
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml
index d02951177b0..b84bf45b890 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml
@@ -55,7 +55,6 @@ MODEL_ARGS:
   --attention-softmax-in-fp32: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json
index 9114b4bb385..43beb1e88d3 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json
@@ -175,7 +175,7 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3404911104.0,
+            "1": 3405945344.0,
             "2": 3972516352.0,
             "3": 3976973312.0,
             "4": 3976973312.0,
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 7.62035,
-            "2": 0.36752,
-            "3": 0.30562,
-            "4": 0.29876,
-            "5": 0.298,
-            "6": 0.29743,
-            "7": 0.29729,
-            "8": 0.2967,
-            "9": 0.29751,
-            "10": 0.29912,
-            "11": 0.29575,
-            "12": 0.29589,
-            "13": 0.29696,
-            "14": 0.29898,
-            "15": 0.30053,
-            "16": 0.30093,
-            "17": 0.2977,
-            "18": 0.2973,
-            "19": 0.29596,
-            "20": 0.29757,
-            "21": 0.2967,
-            "22": 0.29963,
-            "23": 0.29707,
-            "24": 0.29748,
-            "25": 0.29701,
-            "26": 0.29838,
-            "27": 0.29889,
-            "28": 0.29962,
-            "29": 0.30399,
-            "30": 0.30932,
-            "31": 0.30553,
-            "32": 0.29765,
-            "33": 0.30499,
-            "34": 0.29754,
-            "35": 0.29747,
-            "36": 0.29801,
-            "37": 0.30768,
-            "38": 0.29693,
-            "39": 0.29912,
-            "40": 0.299,
-            "41": 0.2982,
-            "42": 0.37256,
-            "43": 0.29865,
-            "44": 0.29774,
-            "45": 0.29961,
-            "46": 0.2988,
-            "47": 0.30454,
-            "48": 0.30466,
-            "49": 0.30093,
-            "50": 0.29883
+            "1": 9.45286,
+            "2": 0.38607,
+            "3": 0.3213,
+            "4": 0.29678,
+            "5": 0.29879,
+            "6": 0.29861,
+            "7": 0.29609,
+            "8": 0.29454,
+            "9": 0.29554,
+            "10": 0.2938,
+            "11": 0.29617,
+            "12": 0.29426,
+            "13": 0.29354,
+            "14": 0.29415,
+            "15": 0.29446,
+            "16": 0.29436,
+            "17": 0.29604,
+            "18": 0.29438,
+            "19": 0.29445,
+            "20": 0.2949,
+            "21": 0.29462,
+            "22": 0.2942,
+            "23": 0.29494,
+            "24": 0.29415,
+            "25": 0.29456,
+            "26": 0.29464,
+            "27": 0.29403,
+            "28": 0.29487,
+            "29": 0.29396,
+            "30": 0.30341,
+            "31": 0.29906,
+            "32": 0.29469,
+            "33": 0.29821,
+            "34": 0.29373,
+            "35": 0.294,
+            "36": 0.6955,
+            "37": 0.30497,
+            "38": 0.29453,
+            "39": 0.29652,
+            "40": 0.29409,
+            "41": 0.29484,
+            "42": 0.29643,
+            "43": 0.29621,
+            "44": 0.2949,
+            "45": 0.29781,
+            "46": 0.29896,
+            "47": 0.29487,
+            "48": 0.29896,
+            "49": 0.29728,
+            "50": 0.29271
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..682fa44a64d
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82137,
+            "2": 10.8271,
+            "3": 10.81279,
+            "4": 10.80424,
+            "5": 10.84481,
+            "6": 10.85159,
+            "7": 10.82705,
+            "8": 10.83127,
+            "9": 10.8396,
+            "10": 10.79638,
+            "11": 10.85834,
+            "12": 10.8443,
+            "13": 10.8625,
+            "14": 10.86559,
+            "15": 10.8001,
+            "16": 10.78718,
+            "17": 10.7639,
+            "18": 10.78578,
+            "19": 10.78836,
+            "20": 10.71249,
+            "21": 10.68241,
+            "22": 10.54353,
+            "23": 10.69825,
+            "24": 10.58633,
+            "25": 10.52721,
+            "26": 10.58871,
+            "27": 10.60408,
+            "28": 10.57696,
+            "29": 10.57897,
+            "30": 10.36401,
+            "31": 10.10796,
+            "32": 10.44854,
+            "33": 10.4401,
+            "34": 10.20252,
+            "35": 10.25069,
+            "36": 10.21055,
+            "37": 10.32849,
+            "38": 10.17511,
+            "39": 10.38336,
+            "40": 10.05674,
+            "41": 10.10841,
+            "42": 10.18865,
+            "43": 9.80582,
+            "44": 9.91887,
+            "45": 9.79924,
+            "46": 9.78948,
+            "47": 10.11342,
+            "48": 9.82499,
+            "49": 9.49844,
+            "50": 9.87311
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 27059.0,
+            "2": 29311.0,
+            "3": 28886.0,
+            "4": 27768.0,
+            "5": 32694.0,
+            "6": 33260.0,
+            "7": 31409.0,
+            "8": 27342.0,
+            "9": 30401.0,
+            "10": 25524.0,
+            "11": 33805.0,
+            "12": 31146.0,
+            "13": 33161.0,
+            "14": 33991.0,
+            "15": 31160.0,
+            "16": 32445.0,
+            "17": 30974.0,
+            "18": 31151.0,
+            "19": 31742.0,
+            "20": 28624.0,
+            "21": 29115.0,
+            "22": 26827.0,
+            "23": 34472.0,
+            "24": 29096.0,
+            "25": 27239.0,
+            "26": 30910.0,
+            "27": 31915.0,
+            "28": 33968.0,
+            "29": 36017.0,
+            "30": 30702.0,
+            "31": 27384.0,
+            "32": 33681.0,
+            "33": 35476.0,
+            "34": 30160.0,
+            "35": 31419.0,
+            "36": 32568.0,
+            "37": 36189.0,
+            "38": 33607.0,
+            "39": 37731.0,
+            "40": 34463.0,
+            "41": 33229.0,
+            "42": 35616.0,
+            "43": 32361.0,
+            "44": 31908.0,
+            "45": 33571.0,
+            "46": 33618.0,
+            "47": 38873.0,
+            "48": 35034.0,
+            "49": 34407.0,
+            "50": 37669.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1559412224.0,
+            "2": 1558616064.0,
+            "3": 1558495744.0,
+            "4": 1559267328.0,
+            "5": 1558842880.0,
+            "6": 1559098368.0,
+            "7": 1558495744.0,
+            "8": 1558546432.0,
+            "9": 1558495744.0,
+            "10": 1558546432.0,
+            "11": 1558597120.0,
+            "12": 1558546432.0,
+            "13": 1558597120.0,
+            "14": 1558546432.0,
+            "15": 1558904320.0,
+            "16": 1558647808.0,
+            "17": 1558597120.0,
+            "18": 1558889472.0,
+            "19": 1558597120.0,
+            "20": 1559229440.0,
+            "21": 1558597120.0,
+            "22": 1558758400.0,
+            "23": 1559698944.0,
+            "24": 1559078912.0,
+            "25": 1559052800.0,
+            "26": 1558647808.0,
+            "27": 1559382528.0,
+            "28": 1558749184.0,
+            "29": 1558830592.0,
+            "30": 1558749184.0,
+            "31": 1558915584.0,
+            "32": 1559541760.0,
+            "33": 1558698496.0,
+            "34": 1558749184.0,
+            "35": 1559422464.0,
+            "36": 1558863872.0,
+            "37": 1558799872.0,
+            "38": 1558749184.0,
+            "39": 1559397888.0,
+            "40": 1559002112.0,
+            "41": 1558799872.0,
+            "42": 1558850560.0,
+            "43": 1559724544.0,
+            "44": 1558850560.0,
+            "45": 1558901248.0,
+            "46": 1559175168.0,
+            "47": 1558901248.0,
+            "48": 1558850560.0,
+            "49": 1558901248.0,
+            "50": 1559632896.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 3495116800.0,
+            "2": 4054579712.0,
+            "3": 4062724096.0,
+            "4": 4062724096.0,
+            "5": 4062724096.0,
+            "6": 4062724096.0,
+            "7": 4070930432.0,
+            "8": 4070930432.0,
+            "9": 4073446400.0,
+            "10": 4073446400.0,
+            "11": 4073446400.0,
+            "12": 4073446400.0,
+            "13": 4073446400.0,
+            "14": 4075493888.0,
+            "15": 4075493888.0,
+            "16": 4075493888.0,
+            "17": 4075493888.0,
+            "18": 4075493888.0,
+            "19": 4075493888.0,
+            "20": 4075493888.0,
+            "21": 4075493888.0,
+            "22": 4079303168.0,
+            "23": 4096666624.0,
+            "24": 4096666624.0,
+            "25": 4096666624.0,
+            "26": 4096666624.0,
+            "27": 4096666624.0,
+            "28": 4096666624.0,
+            "29": 4096666624.0,
+            "30": 4096666624.0,
+            "31": 4105302016.0,
+            "32": 4105302016.0,
+            "33": 4105302016.0,
+            "34": 4105302016.0,
+            "35": 4105302016.0,
+            "36": 4105302016.0,
+            "37": 4105302016.0,
+            "38": 4105302016.0,
+            "39": 4105302016.0,
+            "40": 4105302016.0,
+            "41": 4105302016.0,
+            "42": 4105302016.0,
+            "43": 4105302016.0,
+            "44": 4105302016.0,
+            "45": 4105302016.0,
+            "46": 4105302016.0,
+            "47": 4105302016.0,
+            "48": 4105302016.0,
+            "49": 4105302016.0,
+            "50": 4105302016.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 6.04776,
+            "3": 0.43191,
+            "4": 0.39355,
+            "5": 0.39556,
+            "6": 0.39818,
+            "7": 0.39915,
+            "8": 0.39139,
+            "9": 0.41074,
+            "10": 0.45245,
+            "11": 0.45849,
+            "12": 0.46806,
+            "13": 0.46943,
+            "14": 0.47411,
+            "15": 0.48525,
+            "16": 0.47939,
+            "17": 0.47872,
+            "18": 0.4715,
+            "19": 0.4792,
+            "20": 0.46531,
+            "21": 0.46809,
+            "22": 0.46348,
+            "23": 0.47875,
+            "24": 0.83175,
+            "25": 0.50009,
+            "26": 0.4884,
+            "27": 0.82926,
+            "28": 0.50184,
+            "29": 0.50509,
+            "30": 0.49725,
+            "31": 0.50602,
+            "32": 0.84607,
+            "33": 0.50581,
+            "34": 0.49849,
+            "35": 0.50057,
+            "36": 0.5007,
+            "37": 0.50598,
+            "38": 0.50147,
+            "39": 0.51593,
+            "40": 0.51491,
+            "41": 0.50337,
+            "42": 0.48945,
+            "43": 0.49729,
+            "44": 0.49341,
+            "45": 0.4898,
+            "46": 0.49624,
+            "47": 0.51146,
+            "48": 0.49582,
+            "49": 0.49624,
+            "50": 0.49469
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json
index acf98f05d31..19b393f6369 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1561367040.0,
-            "2": 1560972288.0,
-            "3": 1561248256.0,
-            "4": 1560096768.0,
-            "5": 1559926784.0,
-            "6": 1561850368.0,
-            "7": 1560161792.0,
-            "8": 1560285184.0,
-            "9": 1560998912.0,
-            "10": 1561293824.0,
-            "11": 1560700416.0,
-            "12": 1562299904.0,
-            "13": 1560526848.0,
-            "14": 1561499648.0,
-            "15": 1559979520.0,
-            "16": 1561232384.0,
-            "17": 1561337856.0,
-            "18": 1560266240.0,
-            "19": 1561224704.0,
-            "20": 1560222720.0,
-            "21": 1561771008.0,
-            "22": 1559743488.0,
-            "23": 1560801792.0,
-            "24": 1561316864.0,
-            "25": 1560606720.0,
-            "26": 1562301440.0,
-            "27": 1560251904.0,
-            "28": 1559861248.0,
-            "29": 1559861248.0,
-            "30": 1560919552.0,
-            "31": 1561406976.0,
-            "32": 1565212672.0,
-            "33": 1560626176.0,
-            "34": 1561871360.0,
-            "35": 1560959488.0,
-            "36": 1561910784.0,
-            "37": 1559904256.0,
-            "38": 1560347648.0,
-            "39": 1562116608.0,
-            "40": 1562510336.0,
-            "41": 1562299392.0,
-            "42": 1561589248.0,
-            "43": 1560753664.0,
-            "44": 1561721856.0,
-            "45": 1561170944.0,
-            "46": 1561996288.0,
-            "47": 1560805888.0,
-            "48": 1561083392.0,
-            "49": 1560795136.0,
-            "50": 1561778176.0
+            "1": 1561031168.0,
+            "2": 1562193408.0,
+            "3": 1561517056.0,
+            "4": 1560948224.0,
+            "5": 1562155008.0,
+            "6": 1563247104.0,
+            "7": 1562656768.0,
+            "8": 1562246656.0,
+            "9": 1561597952.0,
+            "10": 1564070400.0,
+            "11": 1562084352.0,
+            "12": 1559892480.0,
+            "13": 1562137600.0,
+            "14": 1561026048.0,
+            "15": 1561419776.0,
+            "16": 1562166784.0,
+            "17": 1560322048.0,
+            "18": 1561402880.0,
+            "19": 1564046336.0,
+            "20": 1562059264.0,
+            "21": 1560781824.0,
+            "22": 1561673728.0,
+            "23": 1562520064.0,
+            "24": 1561093632.0,
+            "25": 1561384960.0,
+            "26": 1562000896.0,
+            "27": 1561264128.0,
+            "28": 1561458176.0,
+            "29": 1561382912.0,
+            "30": 1562413568.0,
+            "31": 1560165376.0,
+            "32": 1561413120.0,
+            "33": 1562501120.0,
+            "34": 1562718720.0,
+            "35": 1563195392.0,
+            "36": 1561894400.0,
+            "37": 1560998912.0,
+            "38": 1563760128.0,
+            "39": 1561207808.0,
+            "40": 1562625536.0,
+            "41": 1561658368.0,
+            "42": 1561409024.0,
+            "43": 1559668736.0,
+            "44": 1561136640.0,
+            "45": 1560246272.0,
+            "46": 1562813952.0,
+            "47": 1561296896.0,
+            "48": 1561900544.0,
+            "49": 1562101760.0,
+            "50": 1563655680.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 3680567296.0,
-            "2": 4256236032.0,
-            "3": 4260136960.0,
-            "4": 4260136960.0,
-            "5": 4261063168.0,
-            "6": 4289287168.0,
-            "7": 4289287168.0,
-            "8": 4289287168.0,
-            "9": 4289287168.0,
-            "10": 4289287168.0,
-            "11": 4289287168.0,
-            "12": 4289287168.0,
-            "13": 4289287168.0,
-            "14": 4289287168.0,
-            "15": 4289287168.0,
-            "16": 4289287168.0,
-            "17": 4289287168.0,
-            "18": 4289287168.0,
-            "19": 4289287168.0,
-            "20": 4289287168.0,
-            "21": 4289287168.0,
-            "22": 4289287168.0,
-            "23": 4289287168.0,
-            "24": 4289287168.0,
-            "25": 4289287168.0,
-            "26": 4289287168.0,
-            "27": 4289287168.0,
-            "28": 4289287168.0,
-            "29": 4289287168.0,
-            "30": 4289287168.0,
-            "31": 4289287168.0,
-            "32": 4289287168.0,
-            "33": 4289287168.0,
-            "34": 4289287168.0,
-            "35": 4289287168.0,
-            "36": 4289287168.0,
-            "37": 4289287168.0,
-            "38": 4289287168.0,
-            "39": 4289287168.0,
-            "40": 4289287168.0,
-            "41": 4289287168.0,
-            "42": 4289287168.0,
-            "43": 4289287168.0,
-            "44": 4289287168.0,
-            "45": 4289287168.0,
-            "46": 4289287168.0,
-            "47": 4289287168.0,
-            "48": 4289287168.0,
-            "49": 4289287168.0,
-            "50": 4289287168.0
+            "1": 3465706496.0,
+            "2": 4045009920.0,
+            "3": 4045009920.0,
+            "4": 4045009920.0,
+            "5": 4045009920.0,
+            "6": 4067111936.0,
+            "7": 4067111936.0,
+            "8": 4067111936.0,
+            "9": 4067111936.0,
+            "10": 4067111936.0,
+            "11": 4067111936.0,
+            "12": 4067111936.0,
+            "13": 4067111936.0,
+            "14": 4067111936.0,
+            "15": 4067111936.0,
+            "16": 4067111936.0,
+            "17": 4067111936.0,
+            "18": 4067111936.0,
+            "19": 4067111936.0,
+            "20": 4067111936.0,
+            "21": 4067111936.0,
+            "22": 4067111936.0,
+            "23": 4067111936.0,
+            "24": 4067111936.0,
+            "25": 4067111936.0,
+            "26": 4067111936.0,
+            "27": 4067111936.0,
+            "28": 4067111936.0,
+            "29": 4067111936.0,
+            "30": 4067111936.0,
+            "31": 4067111936.0,
+            "32": 4067111936.0,
+            "33": 4067111936.0,
+            "34": 4067111936.0,
+            "35": 4067111936.0,
+            "36": 4067111936.0,
+            "37": 4067111936.0,
+            "38": 4067111936.0,
+            "39": 4067111936.0,
+            "40": 4067111936.0,
+            "41": 4067111936.0,
+            "42": 4067111936.0,
+            "43": 4067111936.0,
+            "44": 4067111936.0,
+            "45": 4067111936.0,
+            "46": 4067111936.0,
+            "47": 4067111936.0,
+            "48": 4067111936.0,
+            "49": 4067111936.0,
+            "50": 4067111936.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 18.57368,
-            "2": 0.50382,
-            "3": 0.41522,
-            "4": 0.37227,
-            "5": 0.37501,
-            "6": 0.33117,
-            "7": 0.32515,
-            "8": 0.31941,
-            "9": 0.32367,
-            "10": 0.32326,
-            "11": 0.30606,
-            "12": 0.30616,
-            "13": 0.29955,
-            "14": 0.30443,
-            "15": 0.30558,
-            "16": 0.29289,
-            "17": 0.30498,
-            "18": 0.29213,
-            "19": 0.29318,
-            "20": 0.29695,
-            "21": 0.29798,
-            "22": 0.31295,
-            "23": 0.29473,
-            "24": 0.29975,
-            "25": 0.29698,
-            "26": 0.30574,
-            "27": 0.29785,
-            "28": 0.30807,
-            "29": 0.29928,
-            "30": 0.3087,
-            "31": 0.30718,
-            "32": 0.30993,
-            "33": 0.30203,
-            "34": 0.31719,
-            "35": 0.30742,
-            "36": 0.30563,
-            "37": 0.31427,
-            "38": 0.31171,
-            "39": 0.31768,
-            "40": 0.30755,
-            "41": 0.30394,
-            "42": 0.29792,
-            "43": 0.30454,
-            "44": 0.31398,
-            "45": 0.29651,
-            "46": 0.31171,
-            "47": 0.29161,
-            "48": 0.3034,
-            "49": 0.2972,
-            "50": 0.29959
+            "1": 25.658,
+            "2": 0.47954,
+            "3": 0.41847,
+            "4": 0.33258,
+            "5": 0.34351,
+            "6": 0.31011,
+            "7": 0.31575,
+            "8": 0.29238,
+            "9": 0.30311,
+            "10": 0.34916,
+            "11": 0.30925,
+            "12": 0.34341,
+            "13": 0.28433,
+            "14": 0.28892,
+            "15": 0.29252,
+            "16": 0.2927,
+            "17": 0.30297,
+            "18": 0.29339,
+            "19": 0.2886,
+            "20": 0.29686,
+            "21": 0.29022,
+            "22": 0.65703,
+            "23": 0.29161,
+            "24": 0.29821,
+            "25": 0.29341,
+            "26": 0.30856,
+            "27": 0.2991,
+            "28": 0.29279,
+            "29": 0.29852,
+            "30": 0.30839,
+            "31": 0.29491,
+            "32": 0.2896,
+            "33": 0.29084,
+            "34": 0.32605,
+            "35": 0.29205,
+            "36": 0.28559,
+            "37": 0.29399,
+            "38": 0.28264,
+            "39": 0.28463,
+            "40": 0.28019,
+            "41": 0.28893,
+            "42": 0.27586,
+            "43": 0.28759,
+            "44": 0.28318,
+            "45": 0.27759,
+            "46": 0.27363,
+            "47": 0.27776,
+            "48": 0.27855,
+            "49": 1.02062,
+            "50": 0.28168
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json
index 2ef31d337c6..3948f0ea908 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.04722,
+            "1": 11.04733,
             "2": 11.03572,
-            "3": 9.58802,
-            "4": 9.25807,
-            "5": 9.46595,
-            "6": 9.99646,
-            "7": 9.50952,
-            "8": 8.97596,
-            "9": 8.64768,
-            "10": 9.40103,
-            "11": 8.86556,
-            "12": 8.63563,
-            "13": 8.52125,
-            "14": 8.08824,
-            "15": 8.1958,
-            "16": 8.22112,
-            "17": 8.14098,
-            "18": 7.8386,
-            "19": 8.23438,
-            "20": 7.95361,
-            "21": 7.62549,
-            "22": 7.60352,
-            "23": 7.47957,
-            "24": 7.46573,
-            "25": 7.70343,
-            "26": 7.10719,
-            "27": 7.64313,
-            "28": 7.34582,
-            "29": 7.5169,
-            "30": 7.67511,
-            "31": 7.41799,
-            "32": 7.61213,
-            "33": 7.66582,
-            "34": 7.73101,
-            "35": 7.23081,
-            "36": 7.10765,
-            "37": 7.4476,
-            "38": 7.21053,
-            "39": 7.57508,
-            "40": 7.5662,
-            "41": 7.51605,
-            "42": 7.27243,
-            "43": 7.25706,
-            "44": 7.44,
-            "45": 7.21244,
-            "46": 6.92421,
-            "47": 7.32604,
-            "48": 7.17147,
-            "49": 7.62154,
-            "50": 7.0624
+            "3": 9.58761,
+            "4": 9.25798,
+            "5": 9.53373,
+            "6": 9.90316,
+            "7": 9.4853,
+            "8": 8.93791,
+            "9": 8.65798,
+            "10": 9.05611,
+            "11": 8.49418,
+            "12": 8.5242,
+            "13": 8.45277,
+            "14": 7.97207,
+            "15": 8.04481,
+            "16": 8.0797,
+            "17": 8.08354,
+            "18": 7.76107,
+            "19": 8.14865,
+            "20": 7.89777,
+            "21": 7.58594,
+            "22": 7.54567,
+            "23": 7.43399,
+            "24": 7.43098,
+            "25": 7.67584,
+            "26": 7.07216,
+            "27": 7.6197,
+            "28": 7.32805,
+            "29": 7.4899,
+            "30": 7.64402,
+            "31": 7.39581,
+            "32": 7.58878,
+            "33": 7.63916,
+            "34": 7.69992,
+            "35": 7.21112,
+            "36": 7.08484,
+            "37": 7.42312,
+            "38": 7.18694,
+            "39": 7.54858,
+            "40": 7.54095,
+            "41": 7.48915,
+            "42": 7.24832,
+            "43": 7.2344,
+            "44": 7.4117,
+            "45": 7.1836,
+            "46": 6.89743,
+            "47": 7.29953,
+            "48": 7.14192,
+            "49": 7.58721,
+            "50": 7.03393
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 38802612.0,
-            "2": 38543656.0,
-            "3": 38739356.0,
-            "4": 273649600.0,
-            "5": 252887040.0,
-            "6": 255692384.0,
-            "7": 598483264.0,
-            "8": 787737984.0,
-            "9": 696133120.0,
-            "10": 505146368.0,
-            "11": 718888640.0,
-            "12": 872597184.0,
-            "13": 947495104.0,
-            "14": 1076398976.0,
-            "15": 856390592.0,
-            "16": 1048635648.0,
-            "17": 831370688.0,
-            "18": 963679552.0,
-            "19": 970018240.0,
-            "20": 935737344.0,
-            "21": 904189312.0,
-            "22": 887937280.0,
-            "23": 894777856.0,
-            "24": 703744192.0,
-            "25": 909232512.0,
-            "26": 875633216.0,
-            "27": 894981376.0,
-            "28": 919242816.0,
-            "29": 931351552.0,
-            "30": 929784768.0,
-            "31": 941621376.0,
-            "32": 885000768.0,
-            "33": 828484096.0,
-            "34": 822284800.0,
-            "35": 832032128.0,
-            "36": 787939392.0,
-            "37": 770719808.0,
-            "38": 561204672.0,
-            "39": 617201536.0,
-            "40": 695374592.0,
-            "41": 698978816.0,
-            "42": 692913728.0,
-            "43": 668003776.0,
-            "44": 673780992.0,
-            "45": 631182912.0,
-            "46": 444613312.0,
-            "47": 591957824.0,
-            "48": 617363968.0,
-            "49": 585295808.0,
-            "50": 570423872.0
+            "1": 38802580.0,
+            "2": 38543496.0,
+            "3": 38739384.0,
+            "4": 286224448.0,
+            "5": 252889984.0,
+            "6": 255719936.0,
+            "7": 604766528.0,
+            "8": 762591552.0,
+            "9": 658408896.0,
+            "10": 737969280.0,
+            "11": 728304000.0,
+            "12": 759307840.0,
+            "13": 900330048.0,
+            "14": 827930176.0,
+            "15": 771439488.0,
+            "16": 941681408.0,
+            "17": 645770560.0,
+            "18": 630285120.0,
+            "19": 976311360.0,
+            "20": 982916608.0,
+            "21": 781530112.0,
+            "22": 714968384.0,
+            "23": 907354560.0,
+            "24": 807526912.0,
+            "25": 814861568.0,
+            "26": 800138240.0,
+            "27": 847802560.0,
+            "28": 831162880.0,
+            "29": 811810368.0,
+            "30": 816535808.0,
+            "31": 815796160.0,
+            "32": 793772928.0,
+            "33": 781300032.0,
+            "34": 778254592.0,
+            "35": 762826688.0,
+            "36": 737609088.0,
+            "37": 679501376.0,
+            "38": 664984064.0,
+            "39": 645504448.0,
+            "40": 635595648.0,
+            "41": 604614784.0,
+            "42": 579667968.0,
+            "43": 567337600.0,
+            "44": 557388992.0,
+            "45": 533662880.0,
+            "46": 340805728.0,
+            "47": 488152032.0,
+            "48": 475815680.0,
+            "49": 453176704.0,
+            "50": 438299776.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 6637272576.0,
-            "2": 6637274624.0,
-            "3": 6637274624.0,
-            "4": 6637274624.0,
-            "5": 6637274624.0,
-            "6": 6637274624.0,
-            "7": 6637274624.0,
-            "8": 6637274624.0,
-            "9": 6637274624.0,
-            "10": 6637274624.0,
-            "11": 6637274624.0,
-            "12": 6637274624.0,
-            "13": 6637274624.0,
-            "14": 6637274624.0,
-            "15": 6637274624.0,
-            "16": 6637274624.0,
-            "17": 6637274624.0,
-            "18": 6637274624.0,
-            "19": 6637274624.0,
-            "20": 6637274624.0,
-            "21": 6637274624.0,
-            "22": 6637274624.0,
-            "23": 6637274624.0,
-            "24": 6637274624.0,
-            "25": 6637274624.0,
-            "26": 6637274624.0,
-            "27": 6637274624.0,
-            "28": 6637274624.0,
-            "29": 6637274624.0,
-            "30": 6637274624.0,
-            "31": 6637274624.0,
-            "32": 6637274624.0,
-            "33": 6637274624.0,
-            "34": 6637274624.0,
-            "35": 6637274624.0,
-            "36": 6637274624.0,
-            "37": 6637274624.0,
-            "38": 6637274624.0,
-            "39": 6637274624.0,
-            "40": 6637274624.0,
-            "41": 6637274624.0,
-            "42": 6637274624.0,
-            "43": 6637274624.0,
-            "44": 6637274624.0,
-            "45": 6637274624.0,
-            "46": 6637274624.0,
-            "47": 6637274624.0,
-            "48": 6637274624.0,
-            "49": 6637274624.0,
-            "50": 6637274624.0
+            "1": 6632029696.0,
+            "2": 6632031744.0,
+            "3": 6632031744.0,
+            "4": 6632031744.0,
+            "5": 6632031744.0,
+            "6": 6632031744.0,
+            "7": 6632031744.0,
+            "8": 6632031744.0,
+            "9": 6632031744.0,
+            "10": 6632031744.0,
+            "11": 6632031744.0,
+            "12": 6632031744.0,
+            "13": 6632031744.0,
+            "14": 6632031744.0,
+            "15": 6632031744.0,
+            "16": 6632031744.0,
+            "17": 6632031744.0,
+            "18": 6632031744.0,
+            "19": 6632031744.0,
+            "20": 6632031744.0,
+            "21": 6632031744.0,
+            "22": 6632031744.0,
+            "23": 6632031744.0,
+            "24": 6632031744.0,
+            "25": 6632031744.0,
+            "26": 6632031744.0,
+            "27": 6632031744.0,
+            "28": 6632031744.0,
+            "29": 6632031744.0,
+            "30": 6632031744.0,
+            "31": 6632031744.0,
+            "32": 6632031744.0,
+            "33": 6632031744.0,
+            "34": 6632031744.0,
+            "35": 6632031744.0,
+            "36": 6632031744.0,
+            "37": 6632031744.0,
+            "38": 6632031744.0,
+            "39": 6632031744.0,
+            "40": 6632031744.0,
+            "41": 6632031744.0,
+            "42": 6632031744.0,
+            "43": 6632031744.0,
+            "44": 6632031744.0,
+            "45": 6632031744.0,
+            "46": 6632031744.0,
+            "47": 6632031744.0,
+            "48": 6632031744.0,
+            "49": 6632031744.0,
+            "50": 6632031744.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 55056003072.0,
-            "2": 57810763776.0,
-            "3": 57920647168.0,
-            "4": 57920647168.0,
-            "5": 57920647168.0,
-            "6": 57920647168.0,
-            "7": 57920647168.0,
-            "8": 57920647168.0,
-            "9": 57920647168.0,
-            "10": 57920647168.0,
-            "11": 57920647168.0,
-            "12": 57920647168.0,
-            "13": 57920647168.0,
-            "14": 57920647168.0,
-            "15": 57920647168.0,
-            "16": 57920647168.0,
-            "17": 57920647168.0,
-            "18": 57920647168.0,
-            "19": 57920647168.0,
-            "20": 57920647168.0,
-            "21": 57920647168.0,
-            "22": 57920647168.0,
-            "23": 57920647168.0,
-            "24": 57920647168.0,
-            "25": 57920647168.0,
-            "26": 57920647168.0,
-            "27": 57920647168.0,
-            "28": 57920647168.0,
-            "29": 57920647168.0,
-            "30": 57920647168.0,
-            "31": 57920647168.0,
-            "32": 57920647168.0,
-            "33": 57920647168.0,
-            "34": 57961472000.0,
-            "35": 57961472000.0,
-            "36": 57961472000.0,
-            "37": 57961472000.0,
-            "38": 57961472000.0,
-            "39": 57961472000.0,
-            "40": 57961472000.0,
-            "41": 57961472000.0,
-            "42": 57961472000.0,
-            "43": 57961472000.0,
-            "44": 57961472000.0,
-            "45": 57961472000.0,
-            "46": 57961472000.0,
-            "47": 57961472000.0,
-            "48": 57961472000.0,
-            "49": 57961472000.0,
-            "50": 57961472000.0
+            "1": 55051542528.0,
+            "2": 57803964416.0,
+            "3": 57920471040.0,
+            "4": 57920471040.0,
+            "5": 57920471040.0,
+            "6": 57920471040.0,
+            "7": 57920471040.0,
+            "8": 57920471040.0,
+            "9": 57920471040.0,
+            "10": 57920471040.0,
+            "11": 57920471040.0,
+            "12": 57920471040.0,
+            "13": 57920471040.0,
+            "14": 57920471040.0,
+            "15": 57920471040.0,
+            "16": 57920471040.0,
+            "17": 57920471040.0,
+            "18": 57920471040.0,
+            "19": 57920471040.0,
+            "20": 57920471040.0,
+            "21": 57920471040.0,
+            "22": 57920471040.0,
+            "23": 57920471040.0,
+            "24": 57920471040.0,
+            "25": 57920471040.0,
+            "26": 57920471040.0,
+            "27": 57920471040.0,
+            "28": 57920471040.0,
+            "29": 57920471040.0,
+            "30": 58636701696.0,
+            "31": 58636701696.0,
+            "32": 58636701696.0,
+            "33": 58636701696.0,
+            "34": 58636701696.0,
+            "35": 58636701696.0,
+            "36": 58684317696.0,
+            "37": 59176394752.0,
+            "38": 59698597888.0,
+            "39": 60111630336.0,
+            "40": 60111630336.0,
+            "41": 60111630336.0,
+            "42": 60111630336.0,
+            "43": 60111630336.0,
+            "44": 60111630336.0,
+            "45": 60111630336.0,
+            "46": 60111630336.0,
+            "47": 60111630336.0,
+            "48": 60111630336.0,
+            "49": 60111630336.0,
+            "50": 60111630336.0
         }
     },
     "mtp_1 loss": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 11.07648,
+            "1": 11.0765,
             "2": 11.07404,
-            "3": 10.53854,
-            "4": 10.09813,
-            "5": 9.81166,
-            "6": 10.09741,
-            "7": 9.79481,
-            "8": 9.0642,
-            "9": 8.86016,
-            "10": 9.34039,
-            "11": 8.51318,
-            "12": 8.59467,
-            "13": 8.5292,
-            "14": 7.95757,
-            "15": 8.06962,
-            "16": 8.11802,
-            "17": 8.06993,
-            "18": 7.80587,
-            "19": 8.19192,
-            "20": 7.8906,
-            "21": 7.57063,
-            "22": 7.55091,
-            "23": 7.41606,
-            "24": 7.42454,
-            "25": 7.65274,
-            "26": 7.05583,
-            "27": 7.59747,
-            "28": 7.29984,
-            "29": 7.472,
-            "30": 7.61908,
-            "31": 7.35179,
-            "32": 7.52979,
-            "33": 7.59161,
-            "34": 7.66287,
-            "35": 7.17383,
-            "36": 7.04133,
-            "37": 7.37081,
-            "38": 7.1443,
-            "39": 7.50879,
-            "40": 7.48921,
-            "41": 7.43802,
-            "42": 7.19405,
-            "43": 7.17581,
-            "44": 7.35785,
-            "45": 7.13985,
-            "46": 6.84014,
-            "47": 7.25094,
-            "48": 7.09407,
-            "49": 7.52321,
-            "50": 6.98987
+            "3": 10.53858,
+            "4": 10.09805,
+            "5": 9.81149,
+            "6": 10.07175,
+            "7": 9.79911,
+            "8": 9.07181,
+            "9": 8.87128,
+            "10": 9.12754,
+            "11": 8.49883,
+            "12": 8.53076,
+            "13": 8.42486,
+            "14": 7.84718,
+            "15": 7.99114,
+            "16": 8.05044,
+            "17": 8.0009,
+            "18": 7.73184,
+            "19": 8.11049,
+            "20": 7.83068,
+            "21": 7.52561,
+            "22": 7.49995,
+            "23": 7.37324,
+            "24": 7.37304,
+            "25": 7.61503,
+            "26": 7.01863,
+            "27": 7.5608,
+            "28": 7.26908,
+            "29": 7.4442,
+            "30": 7.58626,
+            "31": 7.327,
+            "32": 7.5089,
+            "33": 7.57391,
+            "34": 7.63803,
+            "35": 7.15468,
+            "36": 7.02234,
+            "37": 7.35288,
+            "38": 7.12913,
+            "39": 7.48869,
+            "40": 7.47562,
+            "41": 7.42293,
+            "42": 7.17768,
+            "43": 7.16333,
+            "44": 7.34362,
+            "45": 7.12401,
+            "46": 6.82934,
+            "47": 7.23649,
+            "48": 7.08053,
+            "49": 7.51319,
+            "50": 6.97383
         }
     },
     "iteration-time": {
@@ -289,56 +289,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 85.36715,
-            "2": 1.28108,
-            "3": 1.20997,
-            "4": 2.1995,
-            "5": 2.24721,
-            "6": 1.76282,
-            "7": 2.15873,
-            "8": 1.65741,
-            "9": 1.68158,
-            "10": 1.05017,
-            "11": 1.03578,
-            "12": 1.03043,
-            "13": 1.03186,
-            "14": 1.03493,
-            "15": 1.0319,
-            "16": 1.04172,
-            "17": 1.05419,
-            "18": 1.03994,
-            "19": 1.0369,
-            "20": 1.03992,
-            "21": 1.05243,
-            "22": 1.04145,
-            "23": 1.04202,
-            "24": 1.04481,
-            "25": 1.04991,
-            "26": 1.06122,
-            "27": 1.04535,
-            "28": 1.04289,
-            "29": 1.0367,
-            "30": 1.03753,
-            "31": 1.03324,
-            "32": 1.04441,
-            "33": 1.0476,
-            "34": 1.05195,
-            "35": 1.05343,
-            "36": 1.04283,
-            "37": 1.05165,
-            "38": 1.05155,
-            "39": 1.04776,
-            "40": 1.05026,
-            "41": 1.04796,
-            "42": 1.05954,
-            "43": 1.04773,
-            "44": 1.05679,
-            "45": 1.04821,
-            "46": 1.04749,
-            "47": 1.05428,
-            "48": 1.04887,
-            "49": 1.0568,
-            "50": 1.07411
+            "1": 73.10019,
+            "2": 1.25873,
+            "3": 1.16322,
+            "4": 1.29653,
+            "5": 1.29631,
+            "6": 1.11998,
+            "7": 1.35727,
+            "8": 1.09252,
+            "9": 1.11578,
+            "10": 1.02138,
+            "11": 1.01615,
+            "12": 1.01222,
+            "13": 1.02281,
+            "14": 1.02294,
+            "15": 1.02492,
+            "16": 1.01859,
+            "17": 1.03891,
+            "18": 1.03349,
+            "19": 1.02727,
+            "20": 1.02559,
+            "21": 1.02143,
+            "22": 1.02847,
+            "23": 1.02845,
+            "24": 1.01891,
+            "25": 1.02716,
+            "26": 1.0234,
+            "27": 1.02648,
+            "28": 1.0165,
+            "29": 1.02468,
+            "30": 1.02451,
+            "31": 1.0298,
+            "32": 1.02899,
+            "33": 1.01515,
+            "34": 1.02615,
+            "35": 1.02426,
+            "36": 1.02583,
+            "37": 1.0171,
+            "38": 1.01354,
+            "39": 1.03472,
+            "40": 1.02918,
+            "41": 1.03913,
+            "42": 1.03355,
+            "43": 1.02441,
+            "44": 1.03591,
+            "45": 1.02675,
+            "46": 1.04457,
+            "47": 1.05738,
+            "48": 1.02657,
+            "49": 1.0303,
+            "50": 1.02663
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml
index a1a5219ecb4..fdb452c65a9 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml
@@ -108,7 +108,6 @@ MODEL_ARGS:
   --save: ${CHECKPOINT_SAVE_PATH}
   --load: ${CHECKPOINT_LOAD_PATH}
   --save-interval: 25
-  --dist-ckpt-save-pre-mcore-014: true
   # Add initialization args
   --init-method-std: 0.02
   # Add logging args
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json
index 27fd798d121..82b8d8b1e56 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.94971,
-            "2": 10.95163,
-            "3": 10.51641,
-            "4": 9.9652,
-            "5": 9.94116,
-            "6": 9.67394,
-            "7": 10.19887,
-            "8": 9.50035,
-            "9": 9.54982,
-            "10": 9.79667,
-            "11": 9.30128,
-            "12": 9.40566,
-            "13": 9.39438,
-            "14": 8.84572,
-            "15": 9.02231,
-            "16": 9.06973,
-            "17": 9.04712,
-            "18": 8.75662,
-            "19": 9.18074,
-            "20": 8.86175,
-            "21": 8.53558,
-            "22": 8.55288,
-            "23": 8.42513,
-            "24": 8.37683,
-            "25": 8.64426,
-            "26": 7.9756,
-            "27": 8.57026,
-            "28": 8.1987,
-            "29": 8.39406,
-            "30": 8.67631,
-            "31": 8.29096,
-            "32": 8.43692,
-            "33": 8.55897,
-            "34": 8.66123,
-            "35": 8.08,
-            "36": 7.95214,
-            "37": 8.2979,
-            "38": 7.98177,
-            "39": 8.39281,
-            "40": 8.35852,
-            "41": 8.32006,
-            "42": 8.05954,
-            "43": 8.03381,
-            "44": 8.24236,
-            "45": 8.1025,
-            "46": 7.61814,
-            "47": 8.15364,
-            "48": 8.00693,
-            "49": 8.38704,
-            "50": 7.81592
+            "1": 10.94944,
+            "2": 10.95158,
+            "3": 10.50143,
+            "4": 9.9637,
+            "5": 9.9402,
+            "6": 9.6731,
+            "7": 10.2345,
+            "8": 9.49643,
+            "9": 9.54137,
+            "10": 9.7923,
+            "11": 9.29954,
+            "12": 9.40392,
+            "13": 9.39508,
+            "14": 8.85071,
+            "15": 9.02369,
+            "16": 9.07021,
+            "17": 9.04484,
+            "18": 8.75671,
+            "19": 9.17766,
+            "20": 8.86116,
+            "21": 8.53586,
+            "22": 8.54907,
+            "23": 8.42586,
+            "24": 8.37914,
+            "25": 8.63571,
+            "26": 7.96589,
+            "27": 8.57436,
+            "28": 8.19058,
+            "29": 8.39383,
+            "30": 8.6699,
+            "31": 8.28275,
+            "32": 8.43083,
+            "33": 8.55346,
+            "34": 8.65736,
+            "35": 8.07845,
+            "36": 7.94562,
+            "37": 8.29186,
+            "38": 7.97668,
+            "39": 8.38836,
+            "40": 8.35237,
+            "41": 8.31549,
+            "42": 8.05591,
+            "43": 8.03009,
+            "44": 8.23739,
+            "45": 8.09515,
+            "46": 7.61452,
+            "47": 8.14972,
+            "48": 8.00299,
+            "49": 8.38216,
+            "50": 7.81157
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 19403704.0,
-            "2": 19274216.0,
-            "3": 22517470.0,
-            "4": 83429816.0,
-            "5": 139167728.0,
-            "6": 138921280.0,
-            "7": 173470304.0,
-            "8": 200511856.0,
-            "9": 165696320.0,
-            "10": 166120112.0,
-            "11": 213254416.0,
-            "12": 187847360.0,
-            "13": 231586656.0,
-            "14": 226879072.0,
-            "15": 219025920.0,
-            "16": 205179664.0,
-            "17": 280450432.0,
-            "18": 181477792.0,
-            "19": 191026096.0,
-            "20": 186395632.0,
-            "21": 233632576.0,
-            "22": 231696832.0,
-            "23": 216390688.0,
-            "24": 215133760.0,
-            "25": 233079504.0,
-            "26": 244437920.0,
-            "27": 222637584.0,
-            "28": 278773952.0,
-            "29": 253409264.0,
-            "30": 240036736.0,
-            "31": 236599008.0,
-            "32": 205066624.0,
-            "33": 263303312.0,
-            "34": 200444544.0,
-            "35": 199033824.0,
-            "36": 243001216.0,
-            "37": 151181872.0,
-            "38": 175301280.0,
-            "39": 219001024.0,
-            "40": 220307936.0,
-            "41": 217385856.0,
-            "42": 230074176.0,
-            "43": 208226784.0,
-            "44": 148172720.0,
-            "45": 141103744.0,
-            "46": 132664976.0,
-            "47": 179619392.0,
-            "48": 118381144.0,
-            "49": 86643984.0,
-            "50": 113798320.0
+            "1": 19403658.0,
+            "2": 19274108.0,
+            "3": 19374004.0,
+            "4": 86537864.0,
+            "5": 137554544.0,
+            "6": 131043136.0,
+            "7": 167191584.0,
+            "8": 187932592.0,
+            "9": 167271824.0,
+            "10": 163003344.0,
+            "11": 222662128.0,
+            "12": 206727744.0,
+            "13": 231576672.0,
+            "14": 229976992.0,
+            "15": 248932672.0,
+            "16": 234972816.0,
+            "17": 252131904.0,
+            "18": 176733312.0,
+            "19": 175326720.0,
+            "20": 197382592.0,
+            "21": 225766720.0,
+            "22": 217633664.0,
+            "23": 196029024.0,
+            "24": 210323328.0,
+            "25": 221997792.0,
+            "26": 239705040.0,
+            "27": 246196976.0,
+            "28": 278753024.0,
+            "29": 272254432.0,
+            "30": 228998896.0,
+            "31": 252338576.0,
+            "32": 205052992.0,
+            "33": 250756576.0,
+            "34": 205128928.0,
+            "35": 192742864.0,
+            "36": 244582560.0,
+            "37": 180947680.0,
+            "38": 231918688.0,
+            "39": 220600064.0,
+            "40": 212460240.0,
+            "41": 215821280.0,
+            "42": 176641872.0,
+            "43": 203473536.0,
+            "44": 151341744.0,
+            "45": 167786640.0,
+            "46": 105920200.0,
+            "47": 173317104.0,
+            "48": 164021296.0,
+            "49": 100857144.0,
+            "50": 164130128.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 4882326528.0,
-            "2": 4880379392.0,
-            "3": 4880237056.0,
-            "4": 4879834624.0,
-            "5": 4881078784.0,
-            "6": 4881928704.0,
-            "7": 4880591360.0,
-            "8": 4880542208.0,
-            "9": 4880547328.0,
-            "10": 4882171392.0,
-            "11": 4881006080.0,
-            "12": 4881802752.0,
-            "13": 4881517056.0,
-            "14": 4880953856.0,
-            "15": 4881994240.0,
-            "16": 4880894464.0,
-            "17": 4881596928.0,
-            "18": 4879665664.0,
-            "19": 4880534016.0,
-            "20": 4881038848.0,
-            "21": 4880376320.0,
-            "22": 4880297472.0,
-            "23": 4880386560.0,
-            "24": 4879292928.0,
-            "25": 4880342528.0,
-            "26": 4880798208.0,
-            "27": 4881144320.0,
-            "28": 4879861248.0,
-            "29": 4880552448.0,
-            "30": 4880334336.0,
-            "31": 4879297024.0,
-            "32": 4880034304.0,
-            "33": 4880325120.0,
-            "34": 4880040448.0,
-            "35": 4880456192.0,
-            "36": 4880591360.0,
-            "37": 4879780352.0,
-            "38": 4880071168.0,
-            "39": 4879615488.0,
-            "40": 4879229440.0,
-            "41": 4879632896.0,
-            "42": 4879910400.0,
-            "43": 4879630848.0,
-            "44": 4880927232.0,
-            "45": 4879393280.0,
-            "46": 4881033728.0,
-            "47": 4879720960.0,
-            "48": 4881100288.0,
-            "49": 4879365632.0,
-            "50": 4880260608.0
+            "1": 4876392448.0,
+            "2": 4875814400.0,
+            "3": 4875040256.0,
+            "4": 4876553728.0,
+            "5": 4876546560.0,
+            "6": 4875578880.0,
+            "7": 4877725184.0,
+            "8": 4876062208.0,
+            "9": 4875521536.0,
+            "10": 4875812352.0,
+            "11": 4877753856.0,
+            "12": 4875833856.0,
+            "13": 4875491840.0,
+            "14": 4876834304.0,
+            "15": 4874819072.0,
+            "16": 4875979264.0,
+            "17": 4876512768.0,
+            "18": 4876787200.0,
+            "19": 4874727936.0,
+            "20": 4875113984.0,
+            "21": 4875528704.0,
+            "22": 4876432896.0,
+            "23": 4877065728.0,
+            "24": 4875671040.0,
+            "25": 4875840000.0,
+            "26": 4875620864.0,
+            "27": 4876904960.0,
+            "28": 4875815424.0,
+            "29": 4877359616.0,
+            "30": 4875890176.0,
+            "31": 4875692544.0,
+            "32": 4874448384.0,
+            "33": 4876354048.0,
+            "34": 4876618240.0,
+            "35": 4874722816.0,
+            "36": 4875591168.0,
+            "37": 4876935680.0,
+            "38": 4877427200.0,
+            "39": 4876846592.0,
+            "40": 4876000768.0,
+            "41": 4876271104.0,
+            "42": 4876566016.0,
+            "43": 4875017728.0,
+            "44": 4875452928.0,
+            "45": 4875992576.0,
+            "46": 4874968576.0,
+            "47": 4874319360.0,
+            "48": 4877893120.0,
+            "49": 4875783680.0,
+            "50": 4876252672.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 41208373248.0,
-            "2": 41208373248.0,
-            "3": 41208373248.0,
-            "4": 41208373248.0,
-            "5": 41208373248.0,
-            "6": 41208373248.0,
-            "7": 41208373248.0,
-            "8": 41208373248.0,
-            "9": 41208373248.0,
-            "10": 41208373248.0,
-            "11": 41208373248.0,
-            "12": 41208373248.0,
-            "13": 41208373248.0,
-            "14": 41208373248.0,
-            "15": 41208373248.0,
-            "16": 41208373248.0,
-            "17": 41208373248.0,
-            "18": 41208373248.0,
-            "19": 41208373248.0,
-            "20": 41208373248.0,
-            "21": 41208373248.0,
-            "22": 41208373248.0,
-            "23": 41208373248.0,
-            "24": 41208373248.0,
-            "25": 41208373248.0,
-            "26": 41208373248.0,
-            "27": 41208373248.0,
-            "28": 41208373248.0,
-            "29": 41208373248.0,
-            "30": 41208373248.0,
-            "31": 41208373248.0,
-            "32": 41208373248.0,
-            "33": 41208373248.0,
-            "34": 41208373248.0,
-            "35": 41208373248.0,
-            "36": 41208373248.0,
-            "37": 41208373248.0,
-            "38": 41208373248.0,
-            "39": 41208373248.0,
-            "40": 41208373248.0,
-            "41": 41208373248.0,
-            "42": 41208373248.0,
-            "43": 41208373248.0,
-            "44": 41208373248.0,
-            "45": 41208373248.0,
-            "46": 41208373248.0,
-            "47": 41208373248.0,
-            "48": 41208373248.0,
-            "49": 41208373248.0,
-            "50": 41208373248.0
+            "1": 41201033216.0,
+            "2": 41201033216.0,
+            "3": 41201033216.0,
+            "4": 41201033216.0,
+            "5": 41201033216.0,
+            "6": 41201033216.0,
+            "7": 41201033216.0,
+            "8": 41201033216.0,
+            "9": 41201033216.0,
+            "10": 41201033216.0,
+            "11": 41201033216.0,
+            "12": 41201033216.0,
+            "13": 41201033216.0,
+            "14": 41201033216.0,
+            "15": 41201033216.0,
+            "16": 41201033216.0,
+            "17": 41201033216.0,
+            "18": 41201033216.0,
+            "19": 41201033216.0,
+            "20": 41201033216.0,
+            "21": 41201033216.0,
+            "22": 41201033216.0,
+            "23": 41201033216.0,
+            "24": 41201033216.0,
+            "25": 41201033216.0,
+            "26": 41201033216.0,
+            "27": 41201033216.0,
+            "28": 41201033216.0,
+            "29": 41201033216.0,
+            "30": 41201033216.0,
+            "31": 41201033216.0,
+            "32": 41201033216.0,
+            "33": 41201033216.0,
+            "34": 41201033216.0,
+            "35": 41201033216.0,
+            "36": 41201033216.0,
+            "37": 41201033216.0,
+            "38": 41201033216.0,
+            "39": 41201033216.0,
+            "40": 41201033216.0,
+            "41": 41201033216.0,
+            "42": 41201033216.0,
+            "43": 41201033216.0,
+            "44": 41201033216.0,
+            "45": 41201033216.0,
+            "46": 41201033216.0,
+            "47": 41201033216.0,
+            "48": 41201033216.0,
+            "49": 41201033216.0,
+            "50": 41201033216.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 93.50581,
-            "2": 1.10311,
-            "3": 0.97651,
-            "4": 0.91679,
-            "5": 1.42607,
-            "6": 1.09159,
-            "7": 1.05673,
-            "8": 1.10123,
-            "9": 0.89686,
-            "10": 0.88361,
-            "11": 1.03303,
-            "12": 0.89241,
-            "13": 0.89019,
-            "14": 0.88818,
-            "15": 0.88483,
-            "16": 0.88301,
-            "17": 0.8735,
-            "18": 0.88523,
-            "19": 0.87306,
-            "20": 0.87169,
-            "21": 0.86111,
-            "22": 0.86812,
-            "23": 0.8652,
-            "24": 0.86412,
-            "25": 0.86804,
-            "26": 0.87119,
-            "27": 0.85851,
-            "28": 0.85867,
-            "29": 0.85921,
-            "30": 0.87123,
-            "31": 0.86655,
-            "32": 0.86406,
-            "33": 0.86147,
-            "34": 0.8603,
-            "35": 0.85378,
-            "36": 0.85478,
-            "37": 0.85414,
-            "38": 0.84972,
-            "39": 0.85007,
-            "40": 0.85059,
-            "41": 0.85564,
-            "42": 0.85309,
-            "43": 0.85237,
-            "44": 0.85578,
-            "45": 0.85261,
-            "46": 0.85507,
-            "47": 0.85212,
-            "48": 0.85143,
-            "49": 0.84802,
-            "50": 0.84388
+            "1": 73.81742,
+            "2": 1.08519,
+            "3": 0.9475,
+            "4": 0.8839,
+            "5": 1.11345,
+            "6": 0.85209,
+            "7": 1.03653,
+            "8": 1.16512,
+            "9": 0.8689,
+            "10": 0.85758,
+            "11": 0.85766,
+            "12": 0.8648,
+            "13": 0.85582,
+            "14": 0.85912,
+            "15": 0.85612,
+            "16": 0.85625,
+            "17": 0.84689,
+            "18": 0.85414,
+            "19": 0.85342,
+            "20": 0.85913,
+            "21": 0.84294,
+            "22": 0.84528,
+            "23": 0.8484,
+            "24": 0.84952,
+            "25": 0.84758,
+            "26": 0.84799,
+            "27": 0.84573,
+            "28": 0.85082,
+            "29": 0.85369,
+            "30": 0.85037,
+            "31": 0.85238,
+            "32": 0.84846,
+            "33": 0.85245,
+            "34": 0.86084,
+            "35": 0.85495,
+            "36": 0.85092,
+            "37": 0.85315,
+            "38": 0.85318,
+            "39": 0.85153,
+            "40": 0.84991,
+            "41": 0.84921,
+            "42": 0.84843,
+            "43": 0.84456,
+            "44": 0.85002,
+            "45": 0.84683,
+            "46": 0.84268,
+            "47": 0.849,
+            "48": 0.8467,
+            "49": 0.84356,
+            "50": 0.84122
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..113a491b0ba
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81199,
+            "2": 10.82649,
+            "3": 10.81384,
+            "4": 10.79509,
+            "5": 10.83534,
+            "6": 10.84275,
+            "7": 10.83571,
+            "8": 10.83439,
+            "9": 10.83696,
+            "10": 10.78957,
+            "11": 10.85974,
+            "12": 10.84264,
+            "13": 10.84986,
+            "14": 10.86378,
+            "15": 10.80482,
+            "16": 10.79204,
+            "17": 10.7636,
+            "18": 10.78823,
+            "19": 10.78841,
+            "20": 10.70796,
+            "21": 10.68628,
+            "22": 10.53299,
+            "23": 10.691,
+            "24": 10.58061,
+            "25": 10.5289,
+            "26": 10.57723,
+            "27": 10.58971,
+            "28": 10.5643,
+            "29": 10.56693,
+            "30": 10.35124,
+            "31": 10.09414,
+            "32": 10.43287,
+            "33": 10.43231,
+            "34": 10.19673,
+            "35": 10.23457,
+            "36": 10.19059,
+            "37": 10.31658,
+            "38": 10.16469,
+            "39": 10.37482,
+            "40": 10.05031,
+            "41": 10.10005,
+            "42": 10.1774,
+            "43": 9.79407,
+            "44": 9.91934,
+            "45": 9.7932,
+            "46": 9.78104,
+            "47": 10.10607,
+            "48": 9.8118,
+            "49": 9.48096,
+            "50": 9.86752,
+            "51": 9.8069,
+            "52": 9.70296,
+            "53": 10.03508,
+            "54": 9.92052,
+            "55": 9.84588,
+            "56": 9.58072,
+            "57": 9.43445,
+            "58": 9.79856,
+            "59": 9.54419,
+            "60": 9.45288,
+            "61": 9.65801,
+            "62": 9.95366,
+            "63": 9.34015,
+            "64": 9.73433,
+            "65": 8.90213,
+            "66": 9.6667,
+            "67": 9.33687,
+            "68": 9.7563,
+            "69": 9.77598,
+            "70": 9.70281,
+            "71": 9.60206,
+            "72": 9.543,
+            "73": 9.4557,
+            "74": 8.87804,
+            "75": 9.37677,
+            "76": 9.03816,
+            "77": 10.03912,
+            "78": 9.69714,
+            "79": 9.35195,
+            "80": 9.37278,
+            "81": 9.45649,
+            "82": 9.6802,
+            "83": 9.27723,
+            "84": 9.39341,
+            "85": 9.58928,
+            "86": 9.05151,
+            "87": 9.57623,
+            "88": 9.72869,
+            "89": 9.57637,
+            "90": 9.80884,
+            "91": 9.30719,
+            "92": 9.33823,
+            "93": 9.05712,
+            "94": 8.80375,
+            "95": 9.5091,
+            "96": 9.50777,
+            "97": 9.27751,
+            "98": 9.65271,
+            "99": 8.87009,
+            "100": 9.38142
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 26865.0,
+            "2": 29306.0,
+            "3": 29361.0,
+            "4": 28339.0,
+            "5": 32501.0,
+            "6": 33051.0,
+            "7": 31429.0,
+            "8": 27274.0,
+            "9": 30849.0,
+            "10": 25253.0,
+            "11": 34123.0,
+            "12": 30710.0,
+            "13": 33513.0,
+            "14": 33611.0,
+            "15": 31132.0,
+            "16": 32283.0,
+            "17": 31523.0,
+            "18": 30937.0,
+            "19": 31324.0,
+            "20": 28686.0,
+            "21": 29644.0,
+            "22": 27366.0,
+            "23": 34392.0,
+            "24": 29052.0,
+            "25": 27947.0,
+            "26": 31335.0,
+            "27": 31669.0,
+            "28": 33909.0,
+            "29": 35204.0,
+            "30": 30468.0,
+            "31": 27904.0,
+            "32": 33358.0,
+            "33": 35896.0,
+            "34": 30365.0,
+            "35": 31692.0,
+            "36": 32966.0,
+            "37": 35992.0,
+            "38": 33308.0,
+            "39": 38061.0,
+            "40": 34579.0,
+            "41": 33534.0,
+            "42": 36447.0,
+            "43": 32600.0,
+            "44": 32178.0,
+            "45": 34034.0,
+            "46": 34910.0,
+            "47": 39009.0,
+            "48": 34943.0,
+            "49": 34977.0,
+            "50": 38519.0,
+            "51": 36877.0,
+            "52": 36443.0,
+            "53": 43145.0,
+            "54": 41676.0,
+            "55": 38684.0,
+            "56": 41454.0,
+            "57": 35771.0,
+            "58": 41538.0,
+            "59": 39697.0,
+            "60": 56137.0,
+            "61": 59394.0,
+            "62": 2137056.0,
+            "63": 36401.0,
+            "64": 50930.0,
+            "65": 43788.0,
+            "66": 2139459.0,
+            "67": 2137025.0,
+            "68": 2137005.0,
+            "69": 2139555.0,
+            "70": 2140268.0,
+            "71": 2138613.0,
+            "72": 2139093.0,
+            "73": 2141321.0,
+            "74": 2137048.0,
+            "75": 2136852.0,
+            "76": 2140757.0,
+            "77": 2140654.0,
+            "78": 2141929.0,
+            "79": 2142543.0,
+            "80": 2142157.0,
+            "81": 2145547.0,
+            "82": 2144670.0,
+            "83": 2140858.0,
+            "84": 2140984.0,
+            "85": 2145921.0,
+            "86": 149825.0,
+            "87": 2144700.0,
+            "88": 2142479.0,
+            "89": 2140988.0,
+            "90": 2144684.0,
+            "91": 2143848.0,
+            "92": 2142027.0,
+            "93": 2139531.0,
+            "94": 2145775.0,
+            "95": 2143141.0,
+            "96": 2146259.0,
+            "97": 2140268.0,
+            "98": 2143316.0,
+            "99": 2144369.0,
+            "100": 2143057.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 787591680.0,
+            "2": 787578880.0,
+            "3": 787593728.0,
+            "4": 787568128.0,
+            "5": 787563008.0,
+            "6": 787585536.0,
+            "7": 787578368.0,
+            "8": 787582976.0,
+            "9": 787581952.0,
+            "10": 787592192.0,
+            "11": 787569152.0,
+            "12": 787570688.0,
+            "13": 787579392.0,
+            "14": 787582976.0,
+            "15": 787565568.0,
+            "16": 787572224.0,
+            "17": 787566592.0,
+            "18": 787547648.0,
+            "19": 787566592.0,
+            "20": 787537408.0,
+            "21": 787540992.0,
+            "22": 787540480.0,
+            "23": 787548672.0,
+            "24": 787542016.0,
+            "25": 787534336.0,
+            "26": 787548672.0,
+            "27": 787509760.0,
+            "28": 787504640.0,
+            "29": 787499520.0,
+            "30": 787494912.0,
+            "31": 787510784.0,
+            "32": 787501056.0,
+            "33": 787482624.0,
+            "34": 787486208.0,
+            "35": 787483136.0,
+            "36": 787482624.0,
+            "37": 787460608.0,
+            "38": 787457536.0,
+            "39": 787461632.0,
+            "40": 787457536.0,
+            "41": 787466752.0,
+            "42": 787432448.0,
+            "43": 787450368.0,
+            "44": 787436032.0,
+            "45": 787411456.0,
+            "46": 787460608.0,
+            "47": 787412992.0,
+            "48": 787440128.0,
+            "49": 787409920.0,
+            "50": 787396096.0,
+            "51": 787388416.0,
+            "52": 787415040.0,
+            "53": 787377664.0,
+            "54": 787403264.0,
+            "55": 787375104.0,
+            "56": 787362304.0,
+            "57": 787405824.0,
+            "58": 787356160.0,
+            "59": 787378688.0,
+            "60": 787380224.0,
+            "61": 787337216.0,
+            "62": 787331584.0,
+            "63": 787368960.0,
+            "64": 787339264.0,
+            "65": 787403776.0,
+            "66": 787330048.0,
+            "67": 787337728.0,
+            "68": 787324416.0,
+            "69": 787335680.0,
+            "70": 787328512.0,
+            "71": 787331584.0,
+            "72": 787341312.0,
+            "73": 787353088.0,
+            "74": 787366400.0,
+            "75": 787342848.0,
+            "76": 787344384.0,
+            "77": 787345920.0,
+            "78": 787371520.0,
+            "79": 787366400.0,
+            "80": 787390464.0,
+            "81": 787385344.0,
+            "82": 787395584.0,
+            "83": 787403776.0,
+            "84": 787397632.0,
+            "85": 787398144.0,
+            "86": 787411968.0,
+            "87": 787389952.0,
+            "88": 787387904.0,
+            "89": 787400704.0,
+            "90": 787379712.0,
+            "91": 787401216.0,
+            "92": 787399168.0,
+            "93": 787391488.0,
+            "94": 787392000.0,
+            "95": 787398656.0,
+            "96": 787395584.0,
+            "97": 787403776.0,
+            "98": 787396608.0,
+            "99": 787406848.0,
+            "100": 787410432.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2662647296.0,
+            "2": 2662647296.0,
+            "3": 2665052672.0,
+            "4": 2665052672.0,
+            "5": 2665052672.0,
+            "6": 2665052672.0,
+            "7": 2665052672.0,
+            "8": 2665052672.0,
+            "9": 2665052672.0,
+            "10": 2665052672.0,
+            "11": 2665052672.0,
+            "12": 2665052672.0,
+            "13": 2665052672.0,
+            "14": 2665052672.0,
+            "15": 2665052672.0,
+            "16": 2665052672.0,
+            "17": 2665052672.0,
+            "18": 2665052672.0,
+            "19": 2665052672.0,
+            "20": 2665052672.0,
+            "21": 2665052672.0,
+            "22": 2665052672.0,
+            "23": 2665052672.0,
+            "24": 2665052672.0,
+            "25": 2665052672.0,
+            "26": 2665052672.0,
+            "27": 2665052672.0,
+            "28": 2665052672.0,
+            "29": 2665052672.0,
+            "30": 2665052672.0,
+            "31": 2665052672.0,
+            "32": 2665052672.0,
+            "33": 2665052672.0,
+            "34": 2665052672.0,
+            "35": 2665052672.0,
+            "36": 2665052672.0,
+            "37": 2665052672.0,
+            "38": 2665052672.0,
+            "39": 2665052672.0,
+            "40": 2665052672.0,
+            "41": 2665052672.0,
+            "42": 2665052672.0,
+            "43": 2665052672.0,
+            "44": 2665052672.0,
+            "45": 2665052672.0,
+            "46": 2665052672.0,
+            "47": 2665052672.0,
+            "48": 2665052672.0,
+            "49": 2665052672.0,
+            "50": 2665052672.0,
+            "51": 2665052672.0,
+            "52": 2665052672.0,
+            "53": 2665052672.0,
+            "54": 2665052672.0,
+            "55": 2665052672.0,
+            "56": 2665052672.0,
+            "57": 2665052672.0,
+            "58": 2665052672.0,
+            "59": 2665052672.0,
+            "60": 2665052672.0,
+            "61": 2665052672.0,
+            "62": 2665052672.0,
+            "63": 2665052672.0,
+            "64": 2665052672.0,
+            "65": 2665052672.0,
+            "66": 2665052672.0,
+            "67": 2665052672.0,
+            "68": 2665052672.0,
+            "69": 2665052672.0,
+            "70": 2665052672.0,
+            "71": 2665052672.0,
+            "72": 2665052672.0,
+            "73": 2665052672.0,
+            "74": 2665052672.0,
+            "75": 2665052672.0,
+            "76": 2665052672.0,
+            "77": 2665052672.0,
+            "78": 2665052672.0,
+            "79": 2665052672.0,
+            "80": 2665052672.0,
+            "81": 2665052672.0,
+            "82": 2665052672.0,
+            "83": 2665052672.0,
+            "84": 2665052672.0,
+            "85": 2665052672.0,
+            "86": 2665052672.0,
+            "87": 2665052672.0,
+            "88": 2665052672.0,
+            "89": 2665052672.0,
+            "90": 2665052672.0,
+            "91": 2665052672.0,
+            "92": 2665052672.0,
+            "93": 2665052672.0,
+            "94": 2665052672.0,
+            "95": 2665052672.0,
+            "96": 2665052672.0,
+            "97": 2665052672.0,
+            "98": 2665052672.0,
+            "99": 2665052672.0,
+            "100": 2665052672.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.78579,
+            "3": 0.53829,
+            "4": 0.5501,
+            "5": 0.52877,
+            "6": 0.53341,
+            "7": 0.53101,
+            "8": 0.52594,
+            "9": 0.52656,
+            "10": 0.52721,
+            "11": 0.51907,
+            "12": 0.52113,
+            "13": 0.52417,
+            "14": 0.52392,
+            "15": 0.53475,
+            "16": 0.52116,
+            "17": 0.52656,
+            "18": 0.52034,
+            "19": 0.52016,
+            "20": 0.52199,
+            "21": 0.53183,
+            "22": 0.53661,
+            "23": 0.54084,
+            "24": 0.52495,
+            "25": 0.53128,
+            "26": 0.52735,
+            "27": 0.54335,
+            "28": 0.52654,
+            "29": 0.53834,
+            "30": 0.53606,
+            "31": 0.53938,
+            "32": 0.53598,
+            "33": 0.53326,
+            "34": 0.54444,
+            "35": 0.53164,
+            "36": 0.5404,
+            "37": 0.54568,
+            "38": 0.54552,
+            "39": 0.5366,
+            "40": 0.54027,
+            "41": 0.53525,
+            "42": 0.55075,
+            "43": 0.53886,
+            "44": 0.53665,
+            "45": 0.55089,
+            "46": 0.5331,
+            "47": 0.54482,
+            "48": 0.53151,
+            "49": 0.53493,
+            "50": 0.53302,
+            "51": 0.52424,
+            "52": 0.52434,
+            "53": 0.51687,
+            "54": 0.52816,
+            "55": 0.53022,
+            "56": 0.53577,
+            "57": 0.53245,
+            "58": 0.53568,
+            "59": 0.54753,
+            "60": 0.53813,
+            "61": 0.53815,
+            "62": 0.5366,
+            "63": 0.54423,
+            "64": 0.5344,
+            "65": 0.53864,
+            "66": 0.54089,
+            "67": 0.53579,
+            "68": 0.54777,
+            "69": 0.54032,
+            "70": 0.54348,
+            "71": 0.5411,
+            "72": 0.54019,
+            "73": 0.53851,
+            "74": 0.54021,
+            "75": 0.53784,
+            "76": 0.53954,
+            "77": 0.54237,
+            "78": 0.53049,
+            "79": 0.57915,
+            "80": 0.57307,
+            "81": 0.56876,
+            "82": 0.56781,
+            "83": 0.56481,
+            "84": 0.55385,
+            "85": 0.56577,
+            "86": 0.569,
+            "87": 0.5621,
+            "88": 0.56698,
+            "89": 0.55835,
+            "90": 0.85395,
+            "91": 0.56888,
+            "92": 0.55621,
+            "93": 0.57143,
+            "94": 0.5584,
+            "95": 0.56204,
+            "96": 0.5656,
+            "97": 0.5491,
+            "98": 0.56348,
+            "99": 0.5607,
+            "100": 0.56258
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json
index 5272fa38474..bfbb1e850e1 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 787516416.0,
-            "2": 787540992.0,
-            "3": 787524096.0,
-            "4": 787512320.0,
-            "5": 787547136.0,
-            "6": 787537920.0,
-            "7": 787512832.0,
-            "8": 787524608.0,
-            "9": 787528192.0,
-            "10": 787505152.0,
-            "11": 787522048.0,
-            "12": 787520000.0,
-            "13": 787529728.0,
-            "14": 787529216.0,
-            "15": 787504128.0,
-            "16": 787513344.0,
-            "17": 787503104.0,
-            "18": 787489280.0,
-            "19": 787514880.0,
-            "20": 787505152.0,
-            "21": 787479552.0,
-            "22": 787486208.0,
-            "23": 787478528.0,
-            "24": 787486208.0,
-            "25": 787451392.0,
-            "26": 787482112.0,
-            "27": 787470848.0,
-            "28": 787450368.0,
-            "29": 787458048.0,
-            "30": 787435008.0,
-            "31": 787406848.0,
-            "32": 787424256.0,
-            "33": 787435520.0,
-            "34": 787426304.0,
-            "35": 787418624.0,
-            "36": 787436544.0,
-            "37": 787428352.0,
-            "38": 787436544.0,
-            "39": 787417600.0,
-            "40": 787415040.0,
-            "41": 787405824.0,
-            "42": 787415040.0,
-            "43": 787367936.0,
-            "44": 787392512.0,
-            "45": 787399680.0,
-            "46": 787355136.0,
-            "47": 787411456.0,
-            "48": 787354112.0,
-            "49": 787374080.0,
-            "50": 787389440.0,
-            "51": 787375616.0,
-            "52": 787383808.0,
-            "53": 787379712.0,
-            "54": 787384832.0,
-            "55": 787388928.0,
-            "56": 787388928.0,
-            "57": 787351040.0,
-            "58": 787382784.0,
-            "59": 787374080.0,
-            "60": 787395072.0,
-            "61": 787405312.0,
-            "62": 787405824.0,
-            "63": 787373056.0,
-            "64": 787388928.0,
-            "65": 787351552.0,
-            "66": 787386880.0,
-            "67": 787392000.0,
-            "68": 787399168.0,
-            "69": 787383296.0,
-            "70": 787393024.0,
-            "71": 787406848.0,
-            "72": 787400704.0,
-            "73": 787401216.0,
-            "74": 787403264.0,
-            "75": 787442688.0,
-            "76": 787444736.0,
-            "77": 787445760.0,
-            "78": 787395072.0,
-            "79": 787430400.0,
-            "80": 787410432.0,
-            "81": 787412992.0,
-            "82": 787427840.0,
-            "83": 787428864.0,
-            "84": 787412480.0,
-            "85": 787412480.0,
-            "86": 787394560.0,
-            "87": 787452928.0,
-            "88": 787414528.0,
-            "89": 787404800.0,
-            "90": 787446784.0,
-            "91": 787446272.0,
-            "92": 787446784.0,
-            "93": 787430400.0,
-            "94": 787440128.0,
-            "95": 787450368.0,
-            "96": 787454976.0,
-            "97": 787427328.0,
-            "98": 787475968.0,
-            "99": 787419136.0,
-            "100": 787438592.0
+            "1": 1668119552.0,
+            "2": 1668144128.0,
+            "3": 1668127232.0,
+            "4": 1668115456.0,
+            "5": 1668150272.0,
+            "6": 1668141056.0,
+            "7": 1668115968.0,
+            "8": 1668127744.0,
+            "9": 1668131328.0,
+            "10": 1668108288.0,
+            "11": 1668125184.0,
+            "12": 1668123136.0,
+            "13": 1668132864.0,
+            "14": 1668132352.0,
+            "15": 1668107264.0,
+            "16": 1668116480.0,
+            "17": 1668106240.0,
+            "18": 1668092416.0,
+            "19": 1668118016.0,
+            "20": 1668108288.0,
+            "21": 1668082688.0,
+            "22": 1668089344.0,
+            "23": 1668081664.0,
+            "24": 1668089344.0,
+            "25": 1668054528.0,
+            "26": 1668085248.0,
+            "27": 1668073984.0,
+            "28": 1668053504.0,
+            "29": 1668061184.0,
+            "30": 1668038144.0,
+            "31": 1668009984.0,
+            "32": 1668027392.0,
+            "33": 1668038656.0,
+            "34": 1668029440.0,
+            "35": 1668021760.0,
+            "36": 1668039680.0,
+            "37": 1668031488.0,
+            "38": 1668039680.0,
+            "39": 1668020736.0,
+            "40": 1668018176.0,
+            "41": 1668008960.0,
+            "42": 1668018176.0,
+            "43": 1667971072.0,
+            "44": 1667995648.0,
+            "45": 1668002816.0,
+            "46": 1667958272.0,
+            "47": 1668014592.0,
+            "48": 1667957248.0,
+            "49": 1667977216.0,
+            "50": 1667992576.0,
+            "51": 1667978752.0,
+            "52": 1667986944.0,
+            "53": 1667982848.0,
+            "54": 1667987968.0,
+            "55": 1667992064.0,
+            "56": 1667992064.0,
+            "57": 1667954176.0,
+            "58": 1667985920.0,
+            "59": 1667977216.0,
+            "60": 1667998208.0,
+            "61": 1668008448.0,
+            "62": 1668008960.0,
+            "63": 1667976192.0,
+            "64": 1667992064.0,
+            "65": 1667954688.0,
+            "66": 1667990016.0,
+            "67": 1667995136.0,
+            "68": 1668002304.0,
+            "69": 1667986432.0,
+            "70": 1667996160.0,
+            "71": 1668009984.0,
+            "72": 1668003840.0,
+            "73": 1668004352.0,
+            "74": 1668006400.0,
+            "75": 1668045824.0,
+            "76": 1668047872.0,
+            "77": 1668048896.0,
+            "78": 1667998208.0,
+            "79": 1668033536.0,
+            "80": 1668013568.0,
+            "81": 1668016128.0,
+            "82": 1668030976.0,
+            "83": 1668032000.0,
+            "84": 1668015616.0,
+            "85": 1668015616.0,
+            "86": 1667997696.0,
+            "87": 1668056064.0,
+            "88": 1668017664.0,
+            "89": 1668007936.0,
+            "90": 1668049920.0,
+            "91": 1668049408.0,
+            "92": 1668049920.0,
+            "93": 1668033536.0,
+            "94": 1668043264.0,
+            "95": 1668053504.0,
+            "96": 1668058112.0,
+            "97": 1668030464.0,
+            "98": 1668079104.0,
+            "99": 1668022272.0,
+            "100": 1668041728.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2579673088.0,
-            "2": 2590714880.0,
-            "3": 2590714880.0,
-            "4": 2590714880.0,
-            "5": 2596039680.0,
-            "6": 2596039680.0,
-            "7": 2596039680.0,
-            "8": 2596039680.0,
-            "9": 2596039680.0,
-            "10": 2596039680.0,
-            "11": 2596039680.0,
-            "12": 2596039680.0,
-            "13": 2596039680.0,
-            "14": 2596039680.0,
-            "15": 2596039680.0,
-            "16": 2596039680.0,
-            "17": 2596039680.0,
-            "18": 2596039680.0,
-            "19": 2596039680.0,
-            "20": 2596039680.0,
-            "21": 2596039680.0,
-            "22": 2596039680.0,
-            "23": 2596039680.0,
-            "24": 2596039680.0,
-            "25": 2596039680.0,
-            "26": 2596039680.0,
-            "27": 2596039680.0,
-            "28": 2596039680.0,
-            "29": 2596039680.0,
-            "30": 2596039680.0,
-            "31": 2596039680.0,
-            "32": 2596039680.0,
-            "33": 2596039680.0,
-            "34": 2596039680.0,
-            "35": 2596039680.0,
-            "36": 2596039680.0,
-            "37": 2596039680.0,
-            "38": 2596039680.0,
-            "39": 2596039680.0,
-            "40": 2596039680.0,
-            "41": 2596039680.0,
-            "42": 2596039680.0,
-            "43": 2596039680.0,
-            "44": 2596039680.0,
-            "45": 2596039680.0,
-            "46": 2596039680.0,
-            "47": 2596039680.0,
-            "48": 2596039680.0,
-            "49": 2596039680.0,
-            "50": 2596039680.0,
-            "51": 2596039680.0,
-            "52": 2596039680.0,
-            "53": 2596039680.0,
-            "54": 2596039680.0,
-            "55": 2596039680.0,
-            "56": 2596039680.0,
-            "57": 2596039680.0,
-            "58": 2596039680.0,
-            "59": 2596039680.0,
-            "60": 2596039680.0,
-            "61": 2596039680.0,
-            "62": 2596039680.0,
-            "63": 2596039680.0,
-            "64": 2596039680.0,
-            "65": 2596039680.0,
-            "66": 2596039680.0,
-            "67": 2596039680.0,
-            "68": 2596039680.0,
-            "69": 2596039680.0,
-            "70": 2596039680.0,
-            "71": 2596039680.0,
-            "72": 2596039680.0,
-            "73": 2596039680.0,
-            "74": 2596039680.0,
-            "75": 2596039680.0,
-            "76": 2596039680.0,
-            "77": 2596039680.0,
-            "78": 2596039680.0,
-            "79": 2596039680.0,
-            "80": 2596039680.0,
-            "81": 2596039680.0,
-            "82": 2596039680.0,
-            "83": 2596039680.0,
-            "84": 2596039680.0,
-            "85": 2596039680.0,
-            "86": 2596039680.0,
-            "87": 2596039680.0,
-            "88": 2596039680.0,
-            "89": 2596039680.0,
-            "90": 2596039680.0,
-            "91": 2596039680.0,
-            "92": 2596039680.0,
-            "93": 2596039680.0,
-            "94": 2596039680.0,
-            "95": 2596039680.0,
-            "96": 2596039680.0,
-            "97": 2596039680.0,
-            "98": 2596039680.0,
-            "99": 2596039680.0,
-            "100": 2596039680.0
+            "1": 3460789248.0,
+            "2": 3470375936.0,
+            "3": 3470375936.0,
+            "4": 3470375936.0,
+            "5": 3480799232.0,
+            "6": 3480799232.0,
+            "7": 3480799232.0,
+            "8": 3480799232.0,
+            "9": 3480799232.0,
+            "10": 3480799232.0,
+            "11": 3480799232.0,
+            "12": 3480799232.0,
+            "13": 3480799232.0,
+            "14": 3480799232.0,
+            "15": 3480799232.0,
+            "16": 3480799232.0,
+            "17": 3480799232.0,
+            "18": 3480799232.0,
+            "19": 3480799232.0,
+            "20": 3480799232.0,
+            "21": 3480799232.0,
+            "22": 3480799232.0,
+            "23": 3480799232.0,
+            "24": 3480799232.0,
+            "25": 3480799232.0,
+            "26": 3480799232.0,
+            "27": 3480799232.0,
+            "28": 3480799232.0,
+            "29": 3480799232.0,
+            "30": 3480799232.0,
+            "31": 3480799232.0,
+            "32": 3480799232.0,
+            "33": 3480799232.0,
+            "34": 3480799232.0,
+            "35": 3480799232.0,
+            "36": 3480799232.0,
+            "37": 3480799232.0,
+            "38": 3480799232.0,
+            "39": 3480799232.0,
+            "40": 3480799232.0,
+            "41": 3480799232.0,
+            "42": 3480799232.0,
+            "43": 3480799232.0,
+            "44": 3480799232.0,
+            "45": 3480799232.0,
+            "46": 3480799232.0,
+            "47": 3480799232.0,
+            "48": 3480799232.0,
+            "49": 3480799232.0,
+            "50": 3480799232.0,
+            "51": 3480799232.0,
+            "52": 3480799232.0,
+            "53": 3480799232.0,
+            "54": 3480799232.0,
+            "55": 3480799232.0,
+            "56": 3480799232.0,
+            "57": 3480799232.0,
+            "58": 3480799232.0,
+            "59": 3480799232.0,
+            "60": 3480799232.0,
+            "61": 3480799232.0,
+            "62": 3480799232.0,
+            "63": 3480799232.0,
+            "64": 3480799232.0,
+            "65": 3480799232.0,
+            "66": 3480799232.0,
+            "67": 3480799232.0,
+            "68": 3480799232.0,
+            "69": 3480799232.0,
+            "70": 3480799232.0,
+            "71": 3480799232.0,
+            "72": 3480799232.0,
+            "73": 3480799232.0,
+            "74": 3480799232.0,
+            "75": 3480799232.0,
+            "76": 3480799232.0,
+            "77": 3480799232.0,
+            "78": 3480799232.0,
+            "79": 3480799232.0,
+            "80": 3480799232.0,
+            "81": 3480799232.0,
+            "82": 3480799232.0,
+            "83": 3480799232.0,
+            "84": 3480799232.0,
+            "85": 3480799232.0,
+            "86": 3480799232.0,
+            "87": 3480799232.0,
+            "88": 3480799232.0,
+            "89": 3480799232.0,
+            "90": 3480799232.0,
+            "91": 3480799232.0,
+            "92": 3480799232.0,
+            "93": 3480799232.0,
+            "94": 3480799232.0,
+            "95": 3480799232.0,
+            "96": 3480799232.0,
+            "97": 3480799232.0,
+            "98": 3480799232.0,
+            "99": 3480799232.0,
+            "100": 3480799232.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 11.32216,
-            "2": 0.51152,
-            "3": 0.3991,
-            "4": 0.39384,
-            "5": 0.34861,
-            "6": 0.34066,
-            "7": 0.34617,
-            "8": 0.33486,
-            "9": 0.32675,
-            "10": 0.32667,
-            "11": 0.32484,
-            "12": 0.31668,
-            "13": 0.33715,
-            "14": 0.32412,
-            "15": 0.31875,
-            "16": 0.32114,
-            "17": 0.3229,
-            "18": 0.31808,
-            "19": 0.32136,
-            "20": 0.31859,
-            "21": 0.31745,
-            "22": 0.31017,
-            "23": 0.32808,
-            "24": 0.31401,
-            "25": 0.31375,
-            "26": 0.31997,
-            "27": 0.32499,
-            "28": 0.32994,
-            "29": 0.33622,
-            "30": 0.33243,
-            "31": 0.33178,
-            "32": 0.35562,
-            "33": 0.3162,
-            "34": 0.32935,
-            "35": 0.32942,
-            "36": 0.32747,
-            "37": 0.32399,
-            "38": 0.32853,
-            "39": 0.32725,
-            "40": 0.32666,
-            "41": 0.33444,
-            "42": 0.32666,
-            "43": 0.32009,
-            "44": 0.38316,
-            "45": 0.36982,
-            "46": 0.3282,
-            "47": 0.33228,
-            "48": 0.32173,
-            "49": 0.32336,
-            "50": 0.33092,
-            "51": 0.32405,
-            "52": 0.344,
-            "53": 0.31793,
-            "54": 0.31881,
-            "55": 0.32423,
-            "56": 0.3238,
-            "57": 0.32754,
-            "58": 0.33365,
-            "59": 0.3188,
-            "60": 0.32627,
-            "61": 0.32313,
-            "62": 0.3251,
-            "63": 0.32111,
-            "64": 0.32694,
-            "65": 0.32677,
-            "66": 0.32916,
-            "67": 0.32392,
-            "68": 0.326,
-            "69": 0.31823,
-            "70": 0.32846,
-            "71": 0.32194,
-            "72": 0.3191,
-            "73": 0.32552,
-            "74": 0.32352,
-            "75": 0.31973,
-            "76": 0.32666,
-            "77": 0.32946,
-            "78": 0.31928,
-            "79": 0.32534,
-            "80": 0.31953,
-            "81": 0.31781,
-            "82": 0.3276,
-            "83": 0.32328,
-            "84": 0.31773,
-            "85": 0.32013,
-            "86": 0.32232,
-            "87": 0.31793,
-            "88": 0.31909,
-            "89": 0.6397,
-            "90": 0.31785,
-            "91": 0.3271,
-            "92": 0.31825,
-            "93": 0.31968,
-            "94": 0.32804,
-            "95": 0.31746,
-            "96": 0.31519,
-            "97": 0.32525,
-            "98": 0.3209,
-            "99": 0.31591,
-            "100": 0.31898
+            "1": 11.49667,
+            "2": 0.45982,
+            "3": 0.39283,
+            "4": 0.37269,
+            "5": 0.33438,
+            "6": 0.33048,
+            "7": 0.33351,
+            "8": 0.32704,
+            "9": 0.31789,
+            "10": 0.30958,
+            "11": 0.30791,
+            "12": 0.30859,
+            "13": 0.32053,
+            "14": 0.30171,
+            "15": 0.30843,
+            "16": 0.30302,
+            "17": 0.30464,
+            "18": 0.30431,
+            "19": 0.30467,
+            "20": 0.29614,
+            "21": 0.3034,
+            "22": 0.30183,
+            "23": 0.29505,
+            "24": 0.29208,
+            "25": 0.29678,
+            "26": 0.29737,
+            "27": 0.30864,
+            "28": 0.31313,
+            "29": 0.30795,
+            "30": 0.31701,
+            "31": 0.31516,
+            "32": 0.32758,
+            "33": 0.31728,
+            "34": 0.32164,
+            "35": 0.32366,
+            "36": 0.3008,
+            "37": 0.30816,
+            "38": 0.30782,
+            "39": 0.3097,
+            "40": 0.31658,
+            "41": 0.30749,
+            "42": 0.30662,
+            "43": 0.30452,
+            "44": 0.32171,
+            "45": 0.30874,
+            "46": 0.31718,
+            "47": 0.30947,
+            "48": 0.30568,
+            "49": 0.30559,
+            "50": 0.30518,
+            "51": 0.32349,
+            "52": 0.30552,
+            "53": 0.2972,
+            "54": 0.29675,
+            "55": 0.6806,
+            "56": 0.30449,
+            "57": 0.30268,
+            "58": 0.29449,
+            "59": 0.29915,
+            "60": 0.30558,
+            "61": 0.29817,
+            "62": 0.29837,
+            "63": 0.29648,
+            "64": 0.30355,
+            "65": 0.30526,
+            "66": 0.29685,
+            "67": 0.29607,
+            "68": 0.30383,
+            "69": 0.29497,
+            "70": 0.29908,
+            "71": 0.298,
+            "72": 0.29482,
+            "73": 0.29392,
+            "74": 0.29933,
+            "75": 0.29938,
+            "76": 0.29472,
+            "77": 0.29225,
+            "78": 0.29345,
+            "79": 0.29571,
+            "80": 0.29379,
+            "81": 0.29694,
+            "82": 0.29442,
+            "83": 0.29839,
+            "84": 0.30064,
+            "85": 0.29571,
+            "86": 0.30107,
+            "87": 0.29723,
+            "88": 0.29324,
+            "89": 0.29688,
+            "90": 0.29142,
+            "91": 0.29759,
+            "92": 0.29347,
+            "93": 0.29617,
+            "94": 0.29996,
+            "95": 0.29791,
+            "96": 0.29236,
+            "97": 0.29637,
+            "98": 0.29446,
+            "99": 0.293,
+            "100": 0.2937
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..9e46de6c95a
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.79428,
+            "52": 9.69347,
+            "53": 10.02752,
+            "54": 9.90501,
+            "55": 9.82435,
+            "56": 9.54897,
+            "57": 9.39485,
+            "58": 9.7808,
+            "59": 9.50877,
+            "60": 9.42349,
+            "61": 9.63084,
+            "62": 9.93098,
+            "63": 9.30185,
+            "64": 9.70993,
+            "65": 8.86079,
+            "66": 9.6403,
+            "67": 9.30746,
+            "68": 9.739,
+            "69": 9.74443,
+            "70": 9.68785,
+            "71": 9.56432,
+            "72": 9.50788,
+            "73": 9.43507,
+            "74": 8.84742,
+            "75": 9.3602,
+            "76": 8.99973,
+            "77": 10.01014,
+            "78": 9.67223,
+            "79": 9.31512,
+            "80": 9.34539,
+            "81": 9.41771,
+            "82": 9.64173,
+            "83": 9.22906,
+            "84": 9.35261,
+            "85": 9.54121,
+            "86": 9.00835,
+            "87": 9.53227,
+            "88": 9.69231,
+            "89": 9.52663,
+            "90": 9.76997,
+            "91": 9.26595,
+            "92": 9.29755,
+            "93": 8.99851,
+            "94": 8.76338,
+            "95": 9.4712,
+            "96": 9.46514,
+            "97": 9.24403,
+            "98": 9.61142,
+            "99": 8.82341,
+            "100": 9.33414
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 38608.0,
+            "52": 29672.0,
+            "53": 145995.0,
+            "54": 43699.0,
+            "55": 33546.0,
+            "56": 40511.0,
+            "57": 36799.0,
+            "58": 44234.0,
+            "59": 40253.0,
+            "60": 36209.0,
+            "61": 38020.0,
+            "62": 129819.0,
+            "63": 154780.0,
+            "64": 39430.0,
+            "65": 39232.0,
+            "66": 154214.0,
+            "67": 161225.0,
+            "68": 2135842.0,
+            "69": 50464.0,
+            "70": 56439.0,
+            "71": 2137847.0,
+            "72": 147293.0,
+            "73": 2141880.0,
+            "74": 2137167.0,
+            "75": 2135335.0,
+            "76": 2139034.0,
+            "77": 159341.0,
+            "78": 2139830.0,
+            "79": 2141683.0,
+            "80": 139853.0,
+            "81": 2145240.0,
+            "82": 164983.0,
+            "83": 2140685.0,
+            "84": 2140869.0,
+            "85": 2146230.0,
+            "86": 2141768.0,
+            "87": 2146906.0,
+            "88": 153161.0,
+            "89": 127490.0,
+            "90": 158621.0,
+            "91": 125039.0,
+            "92": 56204.0,
+            "93": 147769.0,
+            "94": 157550.0,
+            "95": 166285.0,
+            "96": 151337.0,
+            "97": 142825.0,
+            "98": 2144852.0,
+            "99": 2142365.0,
+            "100": 2140440.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2789504000.0,
+            "52": 2789479936.0,
+            "53": 2789480960.0,
+            "54": 2789488640.0,
+            "55": 2789504000.0,
+            "56": 2789505536.0,
+            "57": 2789456896.0,
+            "58": 2789505536.0,
+            "59": 2789500416.0,
+            "60": 2789513728.0,
+            "61": 2789532160.0,
+            "62": 2789525504.0,
+            "63": 2789493248.0,
+            "64": 2789495296.0,
+            "65": 2789463552.0,
+            "66": 2789480448.0,
+            "67": 2789486080.0,
+            "68": 2789483008.0,
+            "69": 2789475328.0,
+            "70": 2789485568.0,
+            "71": 2789494784.0,
+            "72": 2789506560.0,
+            "73": 2789509120.0,
+            "74": 2789521920.0,
+            "75": 2789557760.0,
+            "76": 2789565440.0,
+            "77": 2789567488.0,
+            "78": 2789526528.0,
+            "79": 2789558272.0,
+            "80": 2789537792.0,
+            "81": 2789550592.0,
+            "82": 2789554176.0,
+            "83": 2789553152.0,
+            "84": 2789535744.0,
+            "85": 2789536768.0,
+            "86": 2789527040.0,
+            "87": 2789571072.0,
+            "88": 2789549568.0,
+            "89": 2789547008.0,
+            "90": 2789578752.0,
+            "91": 2789577216.0,
+            "92": 2789581824.0,
+            "93": 2789574656.0,
+            "94": 2789586944.0,
+            "95": 2789600256.0,
+            "96": 2789601792.0,
+            "97": 2789582848.0,
+            "98": 2789626880.0,
+            "99": 2789582336.0,
+            "100": 2789600768.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4455227392.0,
+            "52": 4460379136.0,
+            "53": 4460379136.0,
+            "54": 4460379136.0,
+            "55": 4465446400.0,
+            "56": 4465446400.0,
+            "57": 4465446400.0,
+            "58": 4473809408.0,
+            "59": 4473809408.0,
+            "60": 4473809408.0,
+            "61": 4479029760.0,
+            "62": 4479029760.0,
+            "63": 4479029760.0,
+            "64": 4479029760.0,
+            "65": 4479029760.0,
+            "66": 4479029760.0,
+            "67": 4479029760.0,
+            "68": 4479029760.0,
+            "69": 4479029760.0,
+            "70": 4479029760.0,
+            "71": 4479029760.0,
+            "72": 4479029760.0,
+            "73": 4479029760.0,
+            "74": 4479029760.0,
+            "75": 4502322688.0,
+            "76": 4506302464.0,
+            "77": 4512311296.0,
+            "78": 4512311296.0,
+            "79": 4512311296.0,
+            "80": 4512311296.0,
+            "81": 4512311296.0,
+            "82": 4512311296.0,
+            "83": 4512311296.0,
+            "84": 4512311296.0,
+            "85": 4512311296.0,
+            "86": 4512311296.0,
+            "87": 4512311296.0,
+            "88": 4521950208.0,
+            "89": 4521950208.0,
+            "90": 4521950208.0,
+            "91": 4522659328.0,
+            "92": 4522659328.0,
+            "93": 4522659328.0,
+            "94": 4526183424.0,
+            "95": 4541133824.0,
+            "96": 4541133824.0,
+            "97": 4544613888.0,
+            "98": 4559089664.0,
+            "99": 4559089664.0,
+            "100": 4559089664.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 11.78443,
+            "52": 0.53246,
+            "53": 0.38652,
+            "54": 0.36366,
+            "55": 0.35397,
+            "56": 0.3447,
+            "57": 0.32475,
+            "58": 0.34667,
+            "59": 0.32989,
+            "60": 0.34524,
+            "61": 0.32952,
+            "62": 0.31145,
+            "63": 0.30418,
+            "64": 0.31694,
+            "65": 0.30895,
+            "66": 0.30823,
+            "67": 0.31663,
+            "68": 0.30653,
+            "69": 0.30537,
+            "70": 0.30313,
+            "71": 0.30204,
+            "72": 0.30417,
+            "73": 0.29895,
+            "74": 0.29982,
+            "75": 0.30334,
+            "76": 0.29924,
+            "77": 0.29767,
+            "78": 0.30576,
+            "79": 0.30429,
+            "80": 0.30015,
+            "81": 0.30466,
+            "82": 0.3039,
+            "83": 0.30919,
+            "84": 0.30306,
+            "85": 0.30633,
+            "86": 0.30372,
+            "87": 0.30348,
+            "88": 0.30271,
+            "89": 0.30741,
+            "90": 0.30323,
+            "91": 0.30502,
+            "92": 0.72064,
+            "93": 0.29549,
+            "94": 0.29663,
+            "95": 0.2941,
+            "96": 0.29558,
+            "97": 0.30196,
+            "98": 0.30035,
+            "99": 0.30083,
+            "100": 0.29573
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
index 2eab394e23e..dffbbf25de6 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82196, "5": 10.84518, "10": 10.78921, "15": 10.8336, "20": 10.73505, "25": 10.58138, "30": 10.40958, "35": 10.31467, "40": 10.14618, "45": 9.91713, "50": 9.97428}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4960.0, "5": 6022.0, "10": 4813.0, "15": 5586.0, "20": 5068.0, "25": 4868.0, "30": 5528.0, "35": 5700.0, "40": 6137.0, "45": 6030.0, "50": 6652.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 598359040.0, "5": 598358016.0, "10": 598356992.0, "15": 598359040.0, "20": 598357504.0, "25": 598357504.0, "30": 598358528.0, "35": 598356480.0, "40": 598357504.0, "45": 598355968.0, "50": 598358016.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 842904576.0, "5": 1072649216.0, "10": 1072649216.0, "15": 1072709632.0, "20": 1073532416.0, "25": 1073532416.0, "30": 1073532416.0, "35": 1073532416.0, "40": 1073532416.0, "45": 1073532416.0, "50": 1073532416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.98481, "5": 0.66254, "10": 0.65398, "15": 0.65456, "20": 0.65608, "25": 0.65402, "30": 0.66555, "35": 0.66433, "40": 0.65947, "45": 0.64399, "50": 0.64234}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82207,
+            "2": 10.84178,
+            "3": 10.81126,
+            "4": 10.82219,
+            "5": 10.8455,
+            "6": 10.86291,
+            "7": 10.84399,
+            "8": 10.84652,
+            "9": 10.84916,
+            "10": 10.78879,
+            "11": 10.8581,
+            "12": 10.84415,
+            "13": 10.87153,
+            "14": 10.87463,
+            "15": 10.83396,
+            "16": 10.8091,
+            "17": 10.79098,
+            "18": 10.81032,
+            "19": 10.80535,
+            "20": 10.73557,
+            "21": 10.71472,
+            "22": 10.57762,
+            "23": 10.72594,
+            "24": 10.61811,
+            "25": 10.58114,
+            "26": 10.63747,
+            "27": 10.63794,
+            "28": 10.60614,
+            "29": 10.61062,
+            "30": 10.40965,
+            "31": 10.16941,
+            "32": 10.49897,
+            "33": 10.49702,
+            "34": 10.26142,
+            "35": 10.31452,
+            "36": 10.2851,
+            "37": 10.3895,
+            "38": 10.2473,
+            "39": 10.43792,
+            "40": 10.14599,
+            "41": 10.19691,
+            "42": 10.26122,
+            "43": 9.91082,
+            "44": 10.02318,
+            "45": 9.91674,
+            "46": 9.89463,
+            "47": 10.19281,
+            "48": 9.93104,
+            "49": 9.61208,
+            "50": 9.97427
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4986.0,
+            "2": 5272.0,
+            "3": 5309.0,
+            "4": 5162.0,
+            "5": 5824.0,
+            "6": 5990.0,
+            "7": 5433.0,
+            "8": 5101.0,
+            "9": 5654.0,
+            "10": 4736.0,
+            "11": 6213.0,
+            "12": 5723.0,
+            "13": 5952.0,
+            "14": 6073.0,
+            "15": 5503.0,
+            "16": 5808.0,
+            "17": 5545.0,
+            "18": 5647.0,
+            "19": 5555.0,
+            "20": 5120.0,
+            "21": 5578.0,
+            "22": 5097.0,
+            "23": 5992.0,
+            "24": 5204.0,
+            "25": 5016.0,
+            "26": 5487.0,
+            "27": 5618.0,
+            "28": 5994.0,
+            "29": 6202.0,
+            "30": 5538.0,
+            "31": 4762.0,
+            "32": 6010.0,
+            "33": 6302.0,
+            "34": 5312.0,
+            "35": 5783.0,
+            "36": 5716.0,
+            "37": 6562.0,
+            "38": 6183.0,
+            "39": 6964.0,
+            "40": 6220.0,
+            "41": 6139.0,
+            "42": 6368.0,
+            "43": 5900.0,
+            "44": 5754.0,
+            "45": 5814.0,
+            "46": 5882.0,
+            "47": 6818.0,
+            "48": 6495.0,
+            "49": 6047.0,
+            "50": 6623.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 598359040.0,
+            "2": 598359040.0,
+            "3": 598358528.0,
+            "4": 598360576.0,
+            "5": 598358016.0,
+            "6": 598358016.0,
+            "7": 598354432.0,
+            "8": 598359040.0,
+            "9": 598358016.0,
+            "10": 598356992.0,
+            "11": 598358016.0,
+            "12": 598358016.0,
+            "13": 598359040.0,
+            "14": 598359040.0,
+            "15": 598359040.0,
+            "16": 598358528.0,
+            "17": 598352384.0,
+            "18": 598358016.0,
+            "19": 598359040.0,
+            "20": 598357504.0,
+            "21": 598358016.0,
+            "22": 598354432.0,
+            "23": 598355968.0,
+            "24": 598356480.0,
+            "25": 598358528.0,
+            "26": 598357504.0,
+            "27": 598360064.0,
+            "28": 598358016.0,
+            "29": 598356480.0,
+            "30": 598359552.0,
+            "31": 598354944.0,
+            "32": 598356992.0,
+            "33": 598359552.0,
+            "34": 598358016.0,
+            "35": 598356480.0,
+            "36": 598356992.0,
+            "37": 598358016.0,
+            "38": 598358016.0,
+            "39": 598357504.0,
+            "40": 598357504.0,
+            "41": 598352384.0,
+            "42": 598357504.0,
+            "43": 598352384.0,
+            "44": 598355456.0,
+            "45": 598355968.0,
+            "46": 598351872.0,
+            "47": 598359040.0,
+            "48": 598354944.0,
+            "49": 598353408.0,
+            "50": 598356992.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 855598080.0,
+            "2": 1083234304.0,
+            "3": 1083234304.0,
+            "4": 1083234304.0,
+            "5": 1083234304.0,
+            "6": 1083493888.0,
+            "7": 1083493888.0,
+            "8": 1083493888.0,
+            "9": 1083493888.0,
+            "10": 1083493888.0,
+            "11": 1083493888.0,
+            "12": 1083493888.0,
+            "13": 1083493888.0,
+            "14": 1084195840.0,
+            "15": 1084195840.0,
+            "16": 1084195840.0,
+            "17": 1084195840.0,
+            "18": 1084195840.0,
+            "19": 1084195840.0,
+            "20": 1084195840.0,
+            "21": 1084195840.0,
+            "22": 1084195840.0,
+            "23": 1084195840.0,
+            "24": 1084195840.0,
+            "25": 1084195840.0,
+            "26": 1084195840.0,
+            "27": 1084195840.0,
+            "28": 1084195840.0,
+            "29": 1084195840.0,
+            "30": 1084195840.0,
+            "31": 1084195840.0,
+            "32": 1084195840.0,
+            "33": 1084195840.0,
+            "34": 1084195840.0,
+            "35": 1084195840.0,
+            "36": 1084195840.0,
+            "37": 1084195840.0,
+            "38": 1084195840.0,
+            "39": 1084195840.0,
+            "40": 1084195840.0,
+            "41": 1084195840.0,
+            "42": 1084195840.0,
+            "43": 1084195840.0,
+            "44": 1084195840.0,
+            "45": 1084195840.0,
+            "46": 1084195840.0,
+            "47": 1084195840.0,
+            "48": 1084195840.0,
+            "49": 1084195840.0,
+            "50": 1084195840.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 12.15002,
+            "2": 0.70236,
+            "3": 0.6774,
+            "4": 0.6698,
+            "5": 0.66613,
+            "6": 0.65685,
+            "7": 0.65852,
+            "8": 1.19123,
+            "9": 0.65621,
+            "10": 1.09603,
+            "11": 0.65688,
+            "12": 0.65983,
+            "13": 0.6521,
+            "14": 0.65135,
+            "15": 0.65551,
+            "16": 0.64995,
+            "17": 0.6532,
+            "18": 0.65306,
+            "19": 0.65221,
+            "20": 0.65239,
+            "21": 0.65356,
+            "22": 0.6536,
+            "23": 0.65416,
+            "24": 0.65298,
+            "25": 0.65469,
+            "26": 0.65391,
+            "27": 0.65289,
+            "28": 1.1109,
+            "29": 0.65365,
+            "30": 0.65326,
+            "31": 0.68599,
+            "32": 0.65366,
+            "33": 0.65416,
+            "34": 0.6538,
+            "35": 0.65304,
+            "36": 0.65351,
+            "37": 0.65423,
+            "38": 0.6542,
+            "39": 0.65254,
+            "40": 0.65386,
+            "41": 0.65384,
+            "42": 0.65434,
+            "43": 0.65537,
+            "44": 0.65573,
+            "45": 0.65342,
+            "46": 0.65451,
+            "47": 0.6535,
+            "48": 0.65377,
+            "49": 0.65522,
+            "50": 0.65221
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..bfea64b8438
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.79193,
+            "2": 10.81245,
+            "3": 10.79181,
+            "4": 10.78209,
+            "5": 10.82295,
+            "6": 10.83309,
+            "7": 10.81351,
+            "8": 10.81215,
+            "9": 10.81457,
+            "10": 10.76068,
+            "11": 10.84185,
+            "12": 10.82404,
+            "13": 10.83895,
+            "14": 10.84433,
+            "15": 10.79974,
+            "16": 10.78654,
+            "17": 10.76789,
+            "18": 10.77495,
+            "19": 10.77669,
+            "20": 10.71893,
+            "21": 10.69691,
+            "22": 10.5691,
+            "23": 10.7131,
+            "24": 10.59975,
+            "25": 10.56123,
+            "26": 10.60735,
+            "27": 10.63093,
+            "28": 10.6064,
+            "29": 10.61213,
+            "30": 10.39823,
+            "31": 10.16422,
+            "32": 10.49019,
+            "33": 10.48385,
+            "34": 10.26645,
+            "35": 10.31743,
+            "36": 10.28264,
+            "37": 10.39002,
+            "38": 10.25116,
+            "39": 10.43811,
+            "40": 10.1403,
+            "41": 10.19191,
+            "42": 10.25886,
+            "43": 9.91588,
+            "44": 10.02837,
+            "45": 9.91815,
+            "46": 9.89353,
+            "47": 10.20144,
+            "48": 9.92509,
+            "49": 9.62973,
+            "50": 9.97857
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5192.0,
+            "2": 5510.0,
+            "3": 5508.0,
+            "4": 5240.0,
+            "5": 6136.0,
+            "6": 6180.0,
+            "7": 5549.0,
+            "8": 5242.0,
+            "9": 5717.0,
+            "10": 4818.0,
+            "11": 6299.0,
+            "12": 5746.0,
+            "13": 6110.0,
+            "14": 6165.0,
+            "15": 5683.0,
+            "16": 5805.0,
+            "17": 5758.0,
+            "18": 5546.0,
+            "19": 5787.0,
+            "20": 5231.0,
+            "21": 5741.0,
+            "22": 5126.0,
+            "23": 6019.0,
+            "24": 5410.0,
+            "25": 5100.0,
+            "26": 5630.0,
+            "27": 5627.0,
+            "28": 6146.0,
+            "29": 6174.0,
+            "30": 5570.0,
+            "31": 4768.0,
+            "32": 5926.0,
+            "33": 6348.0,
+            "34": 5389.0,
+            "35": 5856.0,
+            "36": 5741.0,
+            "37": 6611.0,
+            "38": 6262.0,
+            "39": 6971.0,
+            "40": 6094.0,
+            "41": 6227.0,
+            "42": 6622.0,
+            "43": 5761.0,
+            "44": 5929.0,
+            "45": 5769.0,
+            "46": 6141.0,
+            "47": 6909.0,
+            "48": 6650.0,
+            "49": 6100.0,
+            "50": 6753.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 627716608.0,
+            "2": 627719168.0,
+            "3": 627717632.0,
+            "4": 627719680.0,
+            "5": 627717120.0,
+            "6": 627717120.0,
+            "7": 627719680.0,
+            "8": 627716608.0,
+            "9": 627718144.0,
+            "10": 627718144.0,
+            "11": 627717632.0,
+            "12": 627718144.0,
+            "13": 627719168.0,
+            "14": 627718144.0,
+            "15": 627722240.0,
+            "16": 627718144.0,
+            "17": 627720704.0,
+            "18": 627719680.0,
+            "19": 627719168.0,
+            "20": 627718144.0,
+            "21": 627718656.0,
+            "22": 627723264.0,
+            "23": 627720192.0,
+            "24": 627719680.0,
+            "25": 627718144.0,
+            "26": 627719168.0,
+            "27": 627719168.0,
+            "28": 627718144.0,
+            "29": 627718144.0,
+            "30": 627719168.0,
+            "31": 627719168.0,
+            "32": 627719168.0,
+            "33": 627717632.0,
+            "34": 627719680.0,
+            "35": 627721216.0,
+            "36": 627717120.0,
+            "37": 627719168.0,
+            "38": 627721216.0,
+            "39": 627719168.0,
+            "40": 627718656.0,
+            "41": 627718144.0,
+            "42": 627717632.0,
+            "43": 627717120.0,
+            "44": 627718656.0,
+            "45": 627717632.0,
+            "46": 627717120.0,
+            "47": 627719168.0,
+            "48": 627718144.0,
+            "49": 627716608.0,
+            "50": 627716096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 879803392.0,
+            "2": 1114769920.0,
+            "3": 1114769920.0,
+            "4": 1116293632.0,
+            "5": 1116293632.0,
+            "6": 1116293632.0,
+            "7": 1116293632.0,
+            "8": 1116293632.0,
+            "9": 1116293632.0,
+            "10": 1116293632.0,
+            "11": 1116293632.0,
+            "12": 1116293632.0,
+            "13": 1116293632.0,
+            "14": 1116293632.0,
+            "15": 1116293632.0,
+            "16": 1116293632.0,
+            "17": 1116293632.0,
+            "18": 1116293632.0,
+            "19": 1116293632.0,
+            "20": 1116293632.0,
+            "21": 1116293632.0,
+            "22": 1116293632.0,
+            "23": 1116293632.0,
+            "24": 1116293632.0,
+            "25": 1116293632.0,
+            "26": 1116293632.0,
+            "27": 1116293632.0,
+            "28": 1116293632.0,
+            "29": 1116293632.0,
+            "30": 1116293632.0,
+            "31": 1116293632.0,
+            "32": 1116293632.0,
+            "33": 1116293632.0,
+            "34": 1116293632.0,
+            "35": 1116293632.0,
+            "36": 1116293632.0,
+            "37": 1116293632.0,
+            "38": 1116293632.0,
+            "39": 1116293632.0,
+            "40": 1116293632.0,
+            "41": 1116293632.0,
+            "42": 1116293632.0,
+            "43": 1116293632.0,
+            "44": 1116293632.0,
+            "45": 1116293632.0,
+            "46": 1116293632.0,
+            "47": 1116293632.0,
+            "48": 1116293632.0,
+            "49": 1116293632.0,
+            "50": 1116293632.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 7.71846,
+            "3": 0.76188,
+            "4": 0.74577,
+            "5": 0.73403,
+            "6": 0.73193,
+            "7": 0.73107,
+            "8": 0.72199,
+            "9": 0.726,
+            "10": 0.71891,
+            "11": 0.72723,
+            "12": 0.71504,
+            "13": 0.71448,
+            "14": 0.71551,
+            "15": 0.71936,
+            "16": 0.71512,
+            "17": 0.73948,
+            "18": 0.83787,
+            "19": 0.94178,
+            "20": 0.98096,
+            "21": 0.71399,
+            "22": 0.87302,
+            "23": 0.71359,
+            "24": 0.7104,
+            "25": 0.70807,
+            "26": 0.71636,
+            "27": 0.70864,
+            "28": 0.72237,
+            "29": 0.7163,
+            "30": 0.7153,
+            "31": 0.71793,
+            "32": 0.70846,
+            "33": 0.7079,
+            "34": 0.71058,
+            "35": 0.71492,
+            "36": 0.72031,
+            "37": 0.71537,
+            "38": 0.70333,
+            "39": 0.70449,
+            "40": 0.71725,
+            "41": 0.72322,
+            "42": 0.7105,
+            "43": 0.70421,
+            "44": 0.70441,
+            "45": 0.70449,
+            "46": 0.7091,
+            "47": 0.70989,
+            "48": 0.70781,
+            "49": 0.71985,
+            "50": 0.70534
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
index c9eee5d9463..e9af2c920dd 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.81746,
-            "2": 10.82149,
-            "3": 10.82234,
-            "4": 10.79883,
-            "5": 10.84067,
-            "6": 10.85636,
-            "7": 10.81775,
-            "8": 10.81498,
-            "9": 10.83664,
-            "10": 10.7822,
-            "11": 10.85151,
-            "12": 10.84335,
-            "13": 10.85001,
-            "14": 10.87346,
-            "15": 10.80974,
-            "16": 10.80359,
-            "17": 10.75702,
-            "18": 10.80691,
-            "19": 10.78689,
-            "20": 10.73095,
-            "21": 10.70872,
-            "22": 10.57886,
-            "23": 10.71772,
-            "24": 10.63253,
-            "25": 10.57332,
-            "26": 10.62323,
-            "27": 10.63892,
+            "1": 10.81737,
+            "2": 10.82147,
+            "3": 10.82281,
+            "4": 10.79843,
+            "5": 10.84076,
+            "6": 10.85646,
+            "7": 10.81805,
+            "8": 10.81508,
+            "9": 10.83702,
+            "10": 10.78206,
+            "11": 10.85139,
+            "12": 10.84369,
+            "13": 10.84954,
+            "14": 10.87421,
+            "15": 10.81044,
+            "16": 10.80279,
+            "17": 10.75666,
+            "18": 10.80666,
+            "19": 10.78635,
+            "20": 10.7305,
+            "21": 10.7094,
+            "22": 10.57865,
+            "23": 10.71817,
+            "24": 10.63281,
+            "25": 10.57347,
+            "26": 10.62329,
+            "27": 10.63909,
             "28": 10.60509,
-            "29": 10.61796,
-            "30": 10.42067,
-            "31": 10.18074,
-            "32": 10.50619,
-            "33": 10.50937,
-            "34": 10.27626,
-            "35": 10.3249,
-            "36": 10.29423,
-            "37": 10.40006,
-            "38": 10.26099,
-            "39": 10.44197,
-            "40": 10.1644,
-            "41": 10.2004,
-            "42": 10.26981,
-            "43": 9.93054,
-            "44": 10.04184,
-            "45": 9.9288,
-            "46": 9.89638,
-            "47": 10.18471,
-            "48": 9.93119,
+            "29": 10.61783,
+            "30": 10.42028,
+            "31": 10.18079,
+            "32": 10.50616,
+            "33": 10.50906,
+            "34": 10.27697,
+            "35": 10.3245,
+            "36": 10.29406,
+            "37": 10.39966,
+            "38": 10.2616,
+            "39": 10.44227,
+            "40": 10.16376,
+            "41": 10.2005,
+            "42": 10.26994,
+            "43": 9.93005,
+            "44": 10.04225,
+            "45": 9.92868,
+            "46": 9.89675,
+            "47": 10.18499,
+            "48": 9.93166,
             "49": 9.62763,
-            "50": 9.98402
+            "50": 9.98403
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5082.0,
-            "2": 5274.0,
-            "3": 5447.0,
-            "4": 5269.0,
-            "5": 6020.0,
-            "6": 6160.0,
-            "7": 5592.0,
-            "8": 5309.0,
-            "9": 5743.0,
-            "10": 4800.0,
-            "11": 6186.0,
-            "12": 5648.0,
-            "13": 6106.0,
-            "14": 6126.0,
-            "15": 5600.0,
-            "16": 5819.0,
-            "17": 5669.0,
-            "18": 5547.0,
-            "19": 5711.0,
-            "20": 5380.0,
-            "21": 5677.0,
-            "22": 5023.0,
-            "23": 6080.0,
-            "24": 5403.0,
-            "25": 5120.0,
-            "26": 5431.0,
-            "27": 5866.0,
-            "28": 6035.0,
-            "29": 6154.0,
-            "30": 5456.0,
-            "31": 4832.0,
-            "32": 5956.0,
-            "33": 6301.0,
-            "34": 5366.0,
-            "35": 5900.0,
-            "36": 5703.0,
-            "37": 6744.0,
-            "38": 6098.0,
-            "39": 6737.0,
-            "40": 5994.0,
-            "41": 6144.0,
-            "42": 6542.0,
-            "43": 5751.0,
-            "44": 5876.0,
-            "45": 5795.0,
-            "46": 6162.0,
-            "47": 6736.0,
-            "48": 6331.0,
-            "49": 6235.0,
-            "50": 6668.0
+            "1": 5162.0,
+            "2": 5294.0,
+            "3": 5343.0,
+            "4": 5333.0,
+            "5": 5868.0,
+            "6": 6119.0,
+            "7": 5447.0,
+            "8": 5258.0,
+            "9": 5738.0,
+            "10": 4888.0,
+            "11": 6126.0,
+            "12": 5816.0,
+            "13": 6034.0,
+            "14": 6205.0,
+            "15": 5700.0,
+            "16": 5769.0,
+            "17": 5716.0,
+            "18": 5606.0,
+            "19": 5781.0,
+            "20": 5226.0,
+            "21": 5690.0,
+            "22": 5164.0,
+            "23": 6126.0,
+            "24": 5314.0,
+            "25": 5071.0,
+            "26": 5505.0,
+            "27": 5772.0,
+            "28": 6005.0,
+            "29": 6328.0,
+            "30": 5628.0,
+            "31": 4847.0,
+            "32": 5883.0,
+            "33": 6277.0,
+            "34": 5280.0,
+            "35": 5737.0,
+            "36": 5716.0,
+            "37": 6534.0,
+            "38": 6002.0,
+            "39": 6879.0,
+            "40": 5969.0,
+            "41": 6140.0,
+            "42": 6558.0,
+            "43": 5814.0,
+            "44": 5764.0,
+            "45": 5925.0,
+            "46": 5890.0,
+            "47": 6716.0,
+            "48": 6553.0,
+            "49": 6112.0,
+            "50": 6617.0
         }
     },
     "mem-allocated-bytes": {
@@ -121,53 +121,53 @@
             "1": 627718656.0,
             "2": 627719168.0,
             "3": 627719168.0,
-            "4": 627720704.0,
+            "4": 627720192.0,
             "5": 627718656.0,
             "6": 627718656.0,
             "7": 627718144.0,
             "8": 627718144.0,
             "9": 627718144.0,
             "10": 627719168.0,
-            "11": 627719680.0,
-            "12": 627719168.0,
-            "13": 627719680.0,
-            "14": 627717120.0,
+            "11": 627718656.0,
+            "12": 627718144.0,
+            "13": 627720192.0,
+            "14": 627717632.0,
             "15": 627720192.0,
             "16": 627717632.0,
             "17": 627718144.0,
-            "18": 627719680.0,
+            "18": 627718656.0,
             "19": 627719168.0,
             "20": 627717120.0,
             "21": 627718144.0,
             "22": 627720192.0,
             "23": 627720192.0,
-            "24": 627718144.0,
+            "24": 627717120.0,
             "25": 627718656.0,
-            "26": 627718144.0,
-            "27": 627717120.0,
-            "28": 627718656.0,
+            "26": 627717632.0,
+            "27": 627719680.0,
+            "28": 627717632.0,
             "29": 627717120.0,
             "30": 627720192.0,
-            "31": 627715072.0,
-            "32": 627720192.0,
+            "31": 627715584.0,
+            "32": 627720704.0,
             "33": 627717632.0,
-            "34": 627719168.0,
-            "35": 627716608.0,
-            "36": 627719168.0,
-            "37": 627718144.0,
+            "34": 627718144.0,
+            "35": 627715584.0,
+            "36": 627718656.0,
+            "37": 627717632.0,
             "38": 627718656.0,
             "39": 627715584.0,
-            "40": 627717632.0,
+            "40": 627718656.0,
             "41": 627714560.0,
             "42": 627718144.0,
             "43": 627713536.0,
-            "44": 627714048.0,
-            "45": 627719168.0,
+            "44": 627715072.0,
+            "45": 627718144.0,
             "46": 627716096.0,
-            "47": 627717120.0,
+            "47": 627718144.0,
             "48": 627716608.0,
-            "49": 627715072.0,
-            "50": 627718144.0
+            "49": 627716096.0,
+            "50": 627717632.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 870138880.0,
-            "2": 1099332096.0,
-            "3": 1099950080.0,
-            "4": 1102007296.0,
-            "5": 1102007296.0,
-            "6": 1102007296.0,
-            "7": 1102007296.0,
-            "8": 1102007296.0,
-            "9": 1102007296.0,
-            "10": 1102007296.0,
-            "11": 1102007296.0,
-            "12": 1102007296.0,
-            "13": 1103012352.0,
-            "14": 1103012352.0,
-            "15": 1103012352.0,
-            "16": 1103012352.0,
-            "17": 1103012352.0,
-            "18": 1103012352.0,
-            "19": 1103012352.0,
-            "20": 1103012352.0,
-            "21": 1103012352.0,
-            "22": 1103012352.0,
-            "23": 1103012352.0,
-            "24": 1103012352.0,
-            "25": 1103012352.0,
-            "26": 1103012352.0,
-            "27": 1103012352.0,
-            "28": 1103012352.0,
-            "29": 1103012352.0,
-            "30": 1103012352.0,
-            "31": 1103012352.0,
-            "32": 1103012352.0,
-            "33": 1103012352.0,
-            "34": 1103012352.0,
-            "35": 1103012352.0,
-            "36": 1103012352.0,
-            "37": 1103012352.0,
-            "38": 1103012352.0,
-            "39": 1103012352.0,
-            "40": 1103012352.0,
-            "41": 1103012352.0,
-            "42": 1103012352.0,
-            "43": 1103012352.0,
-            "44": 1103012352.0,
-            "45": 1103012352.0,
-            "46": 1103012352.0,
-            "47": 1103012352.0,
-            "48": 1103012352.0,
-            "49": 1103012352.0,
-            "50": 1103012352.0
+            "1": 879924224.0,
+            "2": 1111762432.0,
+            "3": 1111762432.0,
+            "4": 1113592832.0,
+            "5": 1113592832.0,
+            "6": 1113592832.0,
+            "7": 1113592832.0,
+            "8": 1113592832.0,
+            "9": 1113592832.0,
+            "10": 1113592832.0,
+            "11": 1113592832.0,
+            "12": 1113592832.0,
+            "13": 1113592832.0,
+            "14": 1113592832.0,
+            "15": 1113592832.0,
+            "16": 1113592832.0,
+            "17": 1113592832.0,
+            "18": 1113592832.0,
+            "19": 1113592832.0,
+            "20": 1113592832.0,
+            "21": 1113592832.0,
+            "22": 1113592832.0,
+            "23": 1113592832.0,
+            "24": 1113592832.0,
+            "25": 1113592832.0,
+            "26": 1113592832.0,
+            "27": 1113592832.0,
+            "28": 1113592832.0,
+            "29": 1113592832.0,
+            "30": 1113592832.0,
+            "31": 1113592832.0,
+            "32": 1113592832.0,
+            "33": 1113592832.0,
+            "34": 1113592832.0,
+            "35": 1113592832.0,
+            "36": 1113592832.0,
+            "37": 1113592832.0,
+            "38": 1113592832.0,
+            "39": 1113592832.0,
+            "40": 1113592832.0,
+            "41": 1113592832.0,
+            "42": 1113592832.0,
+            "43": 1113592832.0,
+            "44": 1113592832.0,
+            "45": 1113592832.0,
+            "46": 1113592832.0,
+            "47": 1113592832.0,
+            "48": 1113592832.0,
+            "49": 1113592832.0,
+            "50": 1113592832.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.75731,
-            "2": 0.59137,
-            "3": 0.52847,
-            "4": 0.55398,
-            "5": 0.51736,
-            "6": 0.51707,
-            "7": 0.52895,
-            "8": 0.51861,
-            "9": 0.5181,
-            "10": 0.51717,
-            "11": 0.51445,
-            "12": 0.51129,
-            "13": 0.51494,
-            "14": 0.51037,
-            "15": 0.51828,
-            "16": 0.50983,
-            "17": 0.51156,
-            "18": 0.51029,
-            "19": 0.51087,
-            "20": 0.51452,
-            "21": 0.5039,
-            "22": 0.51296,
-            "23": 0.50822,
-            "24": 0.51693,
-            "25": 0.51087,
-            "26": 0.51188,
-            "27": 0.51138,
-            "28": 0.51374,
-            "29": 0.50808,
-            "30": 0.50936,
-            "31": 0.51301,
-            "32": 0.5132,
-            "33": 0.51,
-            "34": 0.51133,
-            "35": 0.51556,
-            "36": 0.51397,
-            "37": 0.51183,
-            "38": 0.51721,
-            "39": 0.50468,
-            "40": 0.50915,
-            "41": 0.51802,
-            "42": 0.51064,
-            "43": 0.51335,
-            "44": 0.50717,
-            "45": 0.51189,
-            "46": 0.52735,
-            "47": 0.52015,
-            "48": 0.50421,
-            "49": 0.5285,
-            "50": 0.50368
+            "1": 19.37156,
+            "2": 0.57228,
+            "3": 0.50712,
+            "4": 0.49818,
+            "5": 0.46521,
+            "6": 0.46426,
+            "7": 0.48248,
+            "8": 0.46121,
+            "9": 0.46322,
+            "10": 0.943,
+            "11": 0.46349,
+            "12": 0.46108,
+            "13": 0.47225,
+            "14": 0.45499,
+            "15": 0.47496,
+            "16": 0.4611,
+            "17": 0.46441,
+            "18": 0.45776,
+            "19": 0.90663,
+            "20": 0.8319,
+            "21": 0.45677,
+            "22": 0.45736,
+            "23": 0.45985,
+            "24": 1.08757,
+            "25": 0.46245,
+            "26": 0.45592,
+            "27": 0.45988,
+            "28": 0.93317,
+            "29": 0.46123,
+            "30": 0.4584,
+            "31": 0.45997,
+            "32": 0.45818,
+            "33": 0.45532,
+            "34": 0.46013,
+            "35": 0.85461,
+            "36": 0.46712,
+            "37": 0.46955,
+            "38": 0.46952,
+            "39": 0.45914,
+            "40": 0.45553,
+            "41": 0.45756,
+            "42": 0.45149,
+            "43": 0.46141,
+            "44": 0.44921,
+            "45": 0.46166,
+            "46": 0.47347,
+            "47": 0.472,
+            "48": 0.45384,
+            "49": 0.47868,
+            "50": 0.45871
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
index 93abc66f3c0..d2a07cdf1dd 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82196, "5": 10.84518, "10": 10.78921, "15": 10.8336, "20": 10.73505, "25": 10.58138, "30": 10.40958, "35": 10.31467, "40": 10.14618, "45": 9.91713, "50": 9.97428}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4960.0, "5": 6022.0, "10": 4813.0, "15": 5586.0, "20": 5068.0, "25": 4868.0, "30": 5528.0, "35": 5700.0, "40": 6137.0, "45": 6030.0, "50": 6652.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 598359040.0, "5": 598358016.0, "10": 598356992.0, "15": 598359040.0, "20": 598357504.0, "25": 598357504.0, "30": 598358528.0, "35": 598356480.0, "40": 598357504.0, "45": 598355968.0, "50": 598358016.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 842904576.0, "5": 1072649216.0, "10": 1072649216.0, "15": 1072709632.0, "20": 1073532416.0, "25": 1073532416.0, "30": 1073532416.0, "35": 1073532416.0, "40": 1073532416.0, "45": 1073532416.0, "50": 1073532416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 15.53653, "5": 0.66223, "10": 0.66331, "15": 0.65892, "20": 0.66075, "25": 0.6607, "30": 0.68157, "35": 0.68189, "40": 0.68279, "45": 0.68065, "50": 0.65686}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82207,
+            "2": 10.84178,
+            "3": 10.81126,
+            "4": 10.82219,
+            "5": 10.8455,
+            "6": 10.86291,
+            "7": 10.84399,
+            "8": 10.84652,
+            "9": 10.84916,
+            "10": 10.78879,
+            "11": 10.8581,
+            "12": 10.84415,
+            "13": 10.87153,
+            "14": 10.87463,
+            "15": 10.83396,
+            "16": 10.8091,
+            "17": 10.79098,
+            "18": 10.81032,
+            "19": 10.80535,
+            "20": 10.73557,
+            "21": 10.71472,
+            "22": 10.57762,
+            "23": 10.72594,
+            "24": 10.61811,
+            "25": 10.58114,
+            "26": 10.63747,
+            "27": 10.63794,
+            "28": 10.60614,
+            "29": 10.61062,
+            "30": 10.40965,
+            "31": 10.16941,
+            "32": 10.49897,
+            "33": 10.49702,
+            "34": 10.26142,
+            "35": 10.31452,
+            "36": 10.2851,
+            "37": 10.3895,
+            "38": 10.2473,
+            "39": 10.43792,
+            "40": 10.14599,
+            "41": 10.19691,
+            "42": 10.26122,
+            "43": 9.91082,
+            "44": 10.02318,
+            "45": 9.91674,
+            "46": 9.89463,
+            "47": 10.19281,
+            "48": 9.93104,
+            "49": 9.61208,
+            "50": 9.97427
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4986.0,
+            "2": 5272.0,
+            "3": 5309.0,
+            "4": 5162.0,
+            "5": 5824.0,
+            "6": 5990.0,
+            "7": 5433.0,
+            "8": 5101.0,
+            "9": 5654.0,
+            "10": 4736.0,
+            "11": 6213.0,
+            "12": 5723.0,
+            "13": 5952.0,
+            "14": 6073.0,
+            "15": 5503.0,
+            "16": 5808.0,
+            "17": 5545.0,
+            "18": 5647.0,
+            "19": 5555.0,
+            "20": 5120.0,
+            "21": 5578.0,
+            "22": 5097.0,
+            "23": 5992.0,
+            "24": 5204.0,
+            "25": 5016.0,
+            "26": 5487.0,
+            "27": 5618.0,
+            "28": 5994.0,
+            "29": 6202.0,
+            "30": 5538.0,
+            "31": 4762.0,
+            "32": 6010.0,
+            "33": 6302.0,
+            "34": 5312.0,
+            "35": 5783.0,
+            "36": 5716.0,
+            "37": 6562.0,
+            "38": 6183.0,
+            "39": 6964.0,
+            "40": 6220.0,
+            "41": 6139.0,
+            "42": 6368.0,
+            "43": 5900.0,
+            "44": 5754.0,
+            "45": 5814.0,
+            "46": 5882.0,
+            "47": 6818.0,
+            "48": 6495.0,
+            "49": 6047.0,
+            "50": 6623.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 598359040.0,
+            "2": 598359040.0,
+            "3": 598358528.0,
+            "4": 598360576.0,
+            "5": 598358016.0,
+            "6": 598358016.0,
+            "7": 598354432.0,
+            "8": 598359040.0,
+            "9": 598358016.0,
+            "10": 598356992.0,
+            "11": 598358016.0,
+            "12": 598358016.0,
+            "13": 598359040.0,
+            "14": 598359040.0,
+            "15": 598359040.0,
+            "16": 598358528.0,
+            "17": 598352384.0,
+            "18": 598358016.0,
+            "19": 598359040.0,
+            "20": 598357504.0,
+            "21": 598358016.0,
+            "22": 598354432.0,
+            "23": 598355968.0,
+            "24": 598356480.0,
+            "25": 598358528.0,
+            "26": 598357504.0,
+            "27": 598360064.0,
+            "28": 598358016.0,
+            "29": 598356480.0,
+            "30": 598359552.0,
+            "31": 598354944.0,
+            "32": 598356992.0,
+            "33": 598359552.0,
+            "34": 598358016.0,
+            "35": 598356480.0,
+            "36": 598356992.0,
+            "37": 598358016.0,
+            "38": 598358016.0,
+            "39": 598357504.0,
+            "40": 598357504.0,
+            "41": 598352384.0,
+            "42": 598357504.0,
+            "43": 598352384.0,
+            "44": 598355456.0,
+            "45": 598355968.0,
+            "46": 598351872.0,
+            "47": 598359040.0,
+            "48": 598354944.0,
+            "49": 598353408.0,
+            "50": 598356992.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 855598080.0,
+            "2": 1083234304.0,
+            "3": 1083234304.0,
+            "4": 1083234304.0,
+            "5": 1083234304.0,
+            "6": 1083493888.0,
+            "7": 1083493888.0,
+            "8": 1083493888.0,
+            "9": 1083493888.0,
+            "10": 1083493888.0,
+            "11": 1083493888.0,
+            "12": 1083493888.0,
+            "13": 1083493888.0,
+            "14": 1084195840.0,
+            "15": 1084195840.0,
+            "16": 1084195840.0,
+            "17": 1084195840.0,
+            "18": 1084195840.0,
+            "19": 1084195840.0,
+            "20": 1084195840.0,
+            "21": 1084195840.0,
+            "22": 1084195840.0,
+            "23": 1084195840.0,
+            "24": 1084195840.0,
+            "25": 1084195840.0,
+            "26": 1084195840.0,
+            "27": 1084195840.0,
+            "28": 1084195840.0,
+            "29": 1084195840.0,
+            "30": 1084195840.0,
+            "31": 1084195840.0,
+            "32": 1084195840.0,
+            "33": 1084195840.0,
+            "34": 1084195840.0,
+            "35": 1084195840.0,
+            "36": 1084195840.0,
+            "37": 1084195840.0,
+            "38": 1084195840.0,
+            "39": 1084195840.0,
+            "40": 1084195840.0,
+            "41": 1084195840.0,
+            "42": 1084195840.0,
+            "43": 1084195840.0,
+            "44": 1084195840.0,
+            "45": 1084195840.0,
+            "46": 1084195840.0,
+            "47": 1084195840.0,
+            "48": 1084195840.0,
+            "49": 1084195840.0,
+            "50": 1084195840.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 12.18178,
+            "2": 0.71018,
+            "3": 0.6513,
+            "4": 0.63757,
+            "5": 0.63692,
+            "6": 1.25031,
+            "7": 0.63769,
+            "8": 0.6385,
+            "9": 1.00487,
+            "10": 0.63706,
+            "11": 0.63646,
+            "12": 0.63826,
+            "13": 0.63654,
+            "14": 0.63609,
+            "15": 0.64,
+            "16": 0.6373,
+            "17": 0.63737,
+            "18": 0.63625,
+            "19": 0.63624,
+            "20": 0.63844,
+            "21": 0.6361,
+            "22": 0.63788,
+            "23": 0.63738,
+            "24": 0.63546,
+            "25": 0.63758,
+            "26": 0.63704,
+            "27": 0.63992,
+            "28": 0.64468,
+            "29": 0.64456,
+            "30": 0.6501,
+            "31": 0.64571,
+            "32": 0.64554,
+            "33": 0.64543,
+            "34": 0.64396,
+            "35": 0.64389,
+            "36": 0.64513,
+            "37": 0.6451,
+            "38": 0.64723,
+            "39": 0.6454,
+            "40": 0.64512,
+            "41": 0.64629,
+            "42": 0.64576,
+            "43": 0.64737,
+            "44": 0.64709,
+            "45": 0.64517,
+            "46": 0.64605,
+            "47": 0.64625,
+            "48": 0.64627,
+            "49": 0.64638,
+            "50": 0.64367
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..2bcdb30bc50
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.79193,
+            "2": 10.81245,
+            "3": 10.79181,
+            "4": 10.78209,
+            "5": 10.82295,
+            "6": 10.83309,
+            "7": 10.81351,
+            "8": 10.81215,
+            "9": 10.81457,
+            "10": 10.76068,
+            "11": 10.84185,
+            "12": 10.82404,
+            "13": 10.83895,
+            "14": 10.84433,
+            "15": 10.79974,
+            "16": 10.78654,
+            "17": 10.76789,
+            "18": 10.77495,
+            "19": 10.77669,
+            "20": 10.71893,
+            "21": 10.69691,
+            "22": 10.5691,
+            "23": 10.7131,
+            "24": 10.59975,
+            "25": 10.56123,
+            "26": 10.60735,
+            "27": 10.63093,
+            "28": 10.6064,
+            "29": 10.61213,
+            "30": 10.39823,
+            "31": 10.16422,
+            "32": 10.49019,
+            "33": 10.48385,
+            "34": 10.26645,
+            "35": 10.31743,
+            "36": 10.28264,
+            "37": 10.39002,
+            "38": 10.25116,
+            "39": 10.43811,
+            "40": 10.1403,
+            "41": 10.19191,
+            "42": 10.25886,
+            "43": 9.91588,
+            "44": 10.02837,
+            "45": 9.91815,
+            "46": 9.89353,
+            "47": 10.20144,
+            "48": 9.92509,
+            "49": 9.62973,
+            "50": 9.97857
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5192.0,
+            "2": 5510.0,
+            "3": 5508.0,
+            "4": 5240.0,
+            "5": 6136.0,
+            "6": 6180.0,
+            "7": 5549.0,
+            "8": 5242.0,
+            "9": 5717.0,
+            "10": 4818.0,
+            "11": 6299.0,
+            "12": 5746.0,
+            "13": 6110.0,
+            "14": 6165.0,
+            "15": 5683.0,
+            "16": 5805.0,
+            "17": 5758.0,
+            "18": 5546.0,
+            "19": 5787.0,
+            "20": 5231.0,
+            "21": 5741.0,
+            "22": 5126.0,
+            "23": 6019.0,
+            "24": 5410.0,
+            "25": 5100.0,
+            "26": 5630.0,
+            "27": 5627.0,
+            "28": 6146.0,
+            "29": 6174.0,
+            "30": 5570.0,
+            "31": 4768.0,
+            "32": 5926.0,
+            "33": 6348.0,
+            "34": 5389.0,
+            "35": 5856.0,
+            "36": 5741.0,
+            "37": 6611.0,
+            "38": 6262.0,
+            "39": 6971.0,
+            "40": 6094.0,
+            "41": 6227.0,
+            "42": 6622.0,
+            "43": 5761.0,
+            "44": 5929.0,
+            "45": 5769.0,
+            "46": 6141.0,
+            "47": 6909.0,
+            "48": 6650.0,
+            "49": 6100.0,
+            "50": 6753.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 627716608.0,
+            "2": 627719168.0,
+            "3": 627717632.0,
+            "4": 627719680.0,
+            "5": 627717120.0,
+            "6": 627717120.0,
+            "7": 627719680.0,
+            "8": 627716608.0,
+            "9": 627718144.0,
+            "10": 627718144.0,
+            "11": 627717632.0,
+            "12": 627718144.0,
+            "13": 627719168.0,
+            "14": 627718144.0,
+            "15": 627722240.0,
+            "16": 627718144.0,
+            "17": 627720704.0,
+            "18": 627719680.0,
+            "19": 627719168.0,
+            "20": 627718144.0,
+            "21": 627718656.0,
+            "22": 627723264.0,
+            "23": 627720192.0,
+            "24": 627719680.0,
+            "25": 627718144.0,
+            "26": 627719168.0,
+            "27": 627719168.0,
+            "28": 627718144.0,
+            "29": 627718144.0,
+            "30": 627719168.0,
+            "31": 627719168.0,
+            "32": 627719168.0,
+            "33": 627717632.0,
+            "34": 627719680.0,
+            "35": 627721216.0,
+            "36": 627717120.0,
+            "37": 627719168.0,
+            "38": 627721216.0,
+            "39": 627719168.0,
+            "40": 627718656.0,
+            "41": 627718144.0,
+            "42": 627717632.0,
+            "43": 627717120.0,
+            "44": 627718656.0,
+            "45": 627717632.0,
+            "46": 627717120.0,
+            "47": 627719168.0,
+            "48": 627718144.0,
+            "49": 627716608.0,
+            "50": 627716096.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 879803392.0,
+            "2": 1114769920.0,
+            "3": 1114769920.0,
+            "4": 1116293632.0,
+            "5": 1116293632.0,
+            "6": 1116293632.0,
+            "7": 1116293632.0,
+            "8": 1116293632.0,
+            "9": 1116293632.0,
+            "10": 1116293632.0,
+            "11": 1116293632.0,
+            "12": 1116293632.0,
+            "13": 1116293632.0,
+            "14": 1116293632.0,
+            "15": 1116293632.0,
+            "16": 1116293632.0,
+            "17": 1116293632.0,
+            "18": 1116293632.0,
+            "19": 1116293632.0,
+            "20": 1116293632.0,
+            "21": 1116293632.0,
+            "22": 1116293632.0,
+            "23": 1116293632.0,
+            "24": 1116293632.0,
+            "25": 1116293632.0,
+            "26": 1116293632.0,
+            "27": 1116293632.0,
+            "28": 1116293632.0,
+            "29": 1116293632.0,
+            "30": 1116293632.0,
+            "31": 1116293632.0,
+            "32": 1116293632.0,
+            "33": 1116293632.0,
+            "34": 1116293632.0,
+            "35": 1116293632.0,
+            "36": 1116293632.0,
+            "37": 1116293632.0,
+            "38": 1116293632.0,
+            "39": 1116293632.0,
+            "40": 1116293632.0,
+            "41": 1116293632.0,
+            "42": 1116293632.0,
+            "43": 1116293632.0,
+            "44": 1116293632.0,
+            "45": 1116293632.0,
+            "46": 1116293632.0,
+            "47": 1116293632.0,
+            "48": 1116293632.0,
+            "49": 1116293632.0,
+            "50": 1116293632.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 7.52257,
+            "3": 0.74502,
+            "4": 0.74089,
+            "5": 0.73009,
+            "6": 0.73041,
+            "7": 0.73704,
+            "8": 0.71933,
+            "9": 0.72466,
+            "10": 1.0546,
+            "11": 0.71525,
+            "12": 0.71298,
+            "13": 0.71412,
+            "14": 0.71521,
+            "15": 0.71883,
+            "16": 0.71464,
+            "17": 0.72192,
+            "18": 1.32991,
+            "19": 0.92083,
+            "20": 0.72233,
+            "21": 0.71533,
+            "22": 0.7144,
+            "23": 0.71011,
+            "24": 0.71396,
+            "25": 0.70984,
+            "26": 0.7111,
+            "27": 0.71496,
+            "28": 0.71187,
+            "29": 0.71729,
+            "30": 0.72095,
+            "31": 0.71436,
+            "32": 0.70963,
+            "33": 0.71384,
+            "34": 0.71534,
+            "35": 0.7148,
+            "36": 0.71389,
+            "37": 0.71097,
+            "38": 0.71244,
+            "39": 0.7048,
+            "40": 0.715,
+            "41": 1.08196,
+            "42": 0.71129,
+            "43": 0.73716,
+            "44": 0.72639,
+            "45": 0.71182,
+            "46": 0.71576,
+            "47": 0.72917,
+            "48": 0.72017,
+            "49": 0.72166,
+            "50": 0.70656
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
index 25a8b5ae572..80df38f0478 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.81746,
-            "2": 10.82149,
-            "3": 10.82234,
-            "4": 10.79883,
-            "5": 10.84067,
-            "6": 10.85636,
-            "7": 10.81775,
-            "8": 10.81498,
-            "9": 10.83664,
-            "10": 10.7822,
-            "11": 10.85151,
-            "12": 10.84335,
-            "13": 10.85001,
-            "14": 10.87346,
-            "15": 10.80974,
-            "16": 10.80359,
-            "17": 10.75702,
-            "18": 10.80691,
-            "19": 10.78689,
-            "20": 10.73095,
-            "21": 10.70872,
-            "22": 10.57886,
-            "23": 10.71772,
-            "24": 10.63253,
-            "25": 10.57332,
-            "26": 10.62323,
-            "27": 10.63892,
+            "1": 10.81737,
+            "2": 10.82147,
+            "3": 10.82281,
+            "4": 10.79843,
+            "5": 10.84076,
+            "6": 10.85646,
+            "7": 10.81805,
+            "8": 10.81508,
+            "9": 10.83702,
+            "10": 10.78206,
+            "11": 10.85139,
+            "12": 10.84369,
+            "13": 10.84954,
+            "14": 10.87421,
+            "15": 10.81044,
+            "16": 10.80279,
+            "17": 10.75666,
+            "18": 10.80666,
+            "19": 10.78635,
+            "20": 10.7305,
+            "21": 10.7094,
+            "22": 10.57865,
+            "23": 10.71817,
+            "24": 10.63281,
+            "25": 10.57347,
+            "26": 10.62329,
+            "27": 10.63909,
             "28": 10.60509,
-            "29": 10.61796,
-            "30": 10.42067,
-            "31": 10.18074,
-            "32": 10.50619,
-            "33": 10.50937,
-            "34": 10.27626,
-            "35": 10.3249,
-            "36": 10.29423,
-            "37": 10.40006,
-            "38": 10.26099,
-            "39": 10.44197,
-            "40": 10.1644,
-            "41": 10.2004,
-            "42": 10.26981,
-            "43": 9.93054,
-            "44": 10.04184,
-            "45": 9.9288,
-            "46": 9.89638,
-            "47": 10.18471,
-            "48": 9.93119,
+            "29": 10.61783,
+            "30": 10.42028,
+            "31": 10.18079,
+            "32": 10.50616,
+            "33": 10.50906,
+            "34": 10.27697,
+            "35": 10.3245,
+            "36": 10.29406,
+            "37": 10.39966,
+            "38": 10.2616,
+            "39": 10.44227,
+            "40": 10.16376,
+            "41": 10.2005,
+            "42": 10.26994,
+            "43": 9.93005,
+            "44": 10.04225,
+            "45": 9.92868,
+            "46": 9.89675,
+            "47": 10.18499,
+            "48": 9.93166,
             "49": 9.62763,
-            "50": 9.98402
+            "50": 9.98403
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5082.0,
-            "2": 5274.0,
-            "3": 5447.0,
-            "4": 5269.0,
-            "5": 6020.0,
-            "6": 6160.0,
-            "7": 5592.0,
-            "8": 5309.0,
-            "9": 5743.0,
-            "10": 4800.0,
-            "11": 6186.0,
-            "12": 5648.0,
-            "13": 6106.0,
-            "14": 6126.0,
-            "15": 5600.0,
-            "16": 5819.0,
-            "17": 5669.0,
-            "18": 5547.0,
-            "19": 5711.0,
-            "20": 5380.0,
-            "21": 5677.0,
-            "22": 5023.0,
-            "23": 6080.0,
-            "24": 5403.0,
-            "25": 5120.0,
-            "26": 5431.0,
-            "27": 5866.0,
-            "28": 6035.0,
-            "29": 6154.0,
-            "30": 5456.0,
-            "31": 4832.0,
-            "32": 5956.0,
-            "33": 6301.0,
-            "34": 5366.0,
-            "35": 5900.0,
-            "36": 5703.0,
-            "37": 6744.0,
-            "38": 6098.0,
-            "39": 6737.0,
-            "40": 5994.0,
-            "41": 6144.0,
-            "42": 6542.0,
-            "43": 5751.0,
-            "44": 5876.0,
-            "45": 5795.0,
-            "46": 6162.0,
-            "47": 6736.0,
-            "48": 6331.0,
-            "49": 6235.0,
-            "50": 6668.0
+            "1": 5162.0,
+            "2": 5294.0,
+            "3": 5343.0,
+            "4": 5333.0,
+            "5": 5868.0,
+            "6": 6119.0,
+            "7": 5447.0,
+            "8": 5258.0,
+            "9": 5738.0,
+            "10": 4888.0,
+            "11": 6126.0,
+            "12": 5816.0,
+            "13": 6034.0,
+            "14": 6205.0,
+            "15": 5700.0,
+            "16": 5769.0,
+            "17": 5716.0,
+            "18": 5606.0,
+            "19": 5781.0,
+            "20": 5226.0,
+            "21": 5690.0,
+            "22": 5164.0,
+            "23": 6126.0,
+            "24": 5314.0,
+            "25": 5071.0,
+            "26": 5505.0,
+            "27": 5772.0,
+            "28": 6005.0,
+            "29": 6328.0,
+            "30": 5628.0,
+            "31": 4847.0,
+            "32": 5883.0,
+            "33": 6277.0,
+            "34": 5280.0,
+            "35": 5737.0,
+            "36": 5716.0,
+            "37": 6534.0,
+            "38": 6002.0,
+            "39": 6879.0,
+            "40": 5969.0,
+            "41": 6140.0,
+            "42": 6558.0,
+            "43": 5814.0,
+            "44": 5764.0,
+            "45": 5925.0,
+            "46": 5890.0,
+            "47": 6716.0,
+            "48": 6553.0,
+            "49": 6112.0,
+            "50": 6617.0
         }
     },
     "mem-allocated-bytes": {
@@ -121,53 +121,53 @@
             "1": 627718656.0,
             "2": 627719168.0,
             "3": 627719168.0,
-            "4": 627720704.0,
+            "4": 627720192.0,
             "5": 627718656.0,
             "6": 627718656.0,
             "7": 627718144.0,
             "8": 627718144.0,
             "9": 627718144.0,
             "10": 627719168.0,
-            "11": 627719680.0,
-            "12": 627719168.0,
-            "13": 627719680.0,
-            "14": 627717120.0,
+            "11": 627718656.0,
+            "12": 627718144.0,
+            "13": 627720192.0,
+            "14": 627717632.0,
             "15": 627720192.0,
             "16": 627717632.0,
             "17": 627718144.0,
-            "18": 627719680.0,
+            "18": 627718656.0,
             "19": 627719168.0,
             "20": 627717120.0,
             "21": 627718144.0,
             "22": 627720192.0,
             "23": 627720192.0,
-            "24": 627718144.0,
+            "24": 627717120.0,
             "25": 627718656.0,
-            "26": 627718144.0,
-            "27": 627717120.0,
-            "28": 627718656.0,
+            "26": 627717632.0,
+            "27": 627719680.0,
+            "28": 627717632.0,
             "29": 627717120.0,
             "30": 627720192.0,
-            "31": 627715072.0,
-            "32": 627720192.0,
+            "31": 627715584.0,
+            "32": 627720704.0,
             "33": 627717632.0,
-            "34": 627719168.0,
-            "35": 627716608.0,
-            "36": 627719168.0,
-            "37": 627718144.0,
+            "34": 627718144.0,
+            "35": 627715584.0,
+            "36": 627718656.0,
+            "37": 627717632.0,
             "38": 627718656.0,
             "39": 627715584.0,
-            "40": 627717632.0,
+            "40": 627718656.0,
             "41": 627714560.0,
             "42": 627718144.0,
             "43": 627713536.0,
-            "44": 627714048.0,
-            "45": 627719168.0,
+            "44": 627715072.0,
+            "45": 627718144.0,
             "46": 627716096.0,
-            "47": 627717120.0,
+            "47": 627718144.0,
             "48": 627716608.0,
-            "49": 627715072.0,
-            "50": 627718144.0
+            "49": 627716096.0,
+            "50": 627717632.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 870138880.0,
-            "2": 1099332096.0,
-            "3": 1099950080.0,
-            "4": 1102007296.0,
-            "5": 1102007296.0,
-            "6": 1102007296.0,
-            "7": 1102007296.0,
-            "8": 1102007296.0,
-            "9": 1102007296.0,
-            "10": 1102007296.0,
-            "11": 1102007296.0,
-            "12": 1102007296.0,
-            "13": 1103012352.0,
-            "14": 1103012352.0,
-            "15": 1103012352.0,
-            "16": 1103012352.0,
-            "17": 1103012352.0,
-            "18": 1103012352.0,
-            "19": 1103012352.0,
-            "20": 1103012352.0,
-            "21": 1103012352.0,
-            "22": 1103012352.0,
-            "23": 1103012352.0,
-            "24": 1103012352.0,
-            "25": 1103012352.0,
-            "26": 1103012352.0,
-            "27": 1103012352.0,
-            "28": 1103012352.0,
-            "29": 1103012352.0,
-            "30": 1103012352.0,
-            "31": 1103012352.0,
-            "32": 1103012352.0,
-            "33": 1103012352.0,
-            "34": 1103012352.0,
-            "35": 1103012352.0,
-            "36": 1103012352.0,
-            "37": 1103012352.0,
-            "38": 1103012352.0,
-            "39": 1103012352.0,
-            "40": 1103012352.0,
-            "41": 1103012352.0,
-            "42": 1103012352.0,
-            "43": 1103012352.0,
-            "44": 1103012352.0,
-            "45": 1103012352.0,
-            "46": 1103012352.0,
-            "47": 1103012352.0,
-            "48": 1103012352.0,
-            "49": 1103012352.0,
-            "50": 1103012352.0
+            "1": 879924224.0,
+            "2": 1111762432.0,
+            "3": 1111762432.0,
+            "4": 1113592832.0,
+            "5": 1113592832.0,
+            "6": 1113592832.0,
+            "7": 1113592832.0,
+            "8": 1113592832.0,
+            "9": 1113592832.0,
+            "10": 1113592832.0,
+            "11": 1113592832.0,
+            "12": 1113592832.0,
+            "13": 1113592832.0,
+            "14": 1113592832.0,
+            "15": 1113592832.0,
+            "16": 1113592832.0,
+            "17": 1113592832.0,
+            "18": 1113592832.0,
+            "19": 1113592832.0,
+            "20": 1113592832.0,
+            "21": 1113592832.0,
+            "22": 1113592832.0,
+            "23": 1113592832.0,
+            "24": 1113592832.0,
+            "25": 1113592832.0,
+            "26": 1113592832.0,
+            "27": 1113592832.0,
+            "28": 1113592832.0,
+            "29": 1113592832.0,
+            "30": 1113592832.0,
+            "31": 1113592832.0,
+            "32": 1113592832.0,
+            "33": 1113592832.0,
+            "34": 1113592832.0,
+            "35": 1113592832.0,
+            "36": 1113592832.0,
+            "37": 1113592832.0,
+            "38": 1113592832.0,
+            "39": 1113592832.0,
+            "40": 1113592832.0,
+            "41": 1113592832.0,
+            "42": 1113592832.0,
+            "43": 1113592832.0,
+            "44": 1113592832.0,
+            "45": 1113592832.0,
+            "46": 1113592832.0,
+            "47": 1113592832.0,
+            "48": 1113592832.0,
+            "49": 1113592832.0,
+            "50": 1113592832.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 17.91902,
-            "2": 0.59117,
-            "3": 0.52614,
-            "4": 0.54746,
-            "5": 0.5056,
-            "6": 0.50649,
-            "7": 0.52305,
-            "8": 0.50853,
-            "9": 0.50644,
-            "10": 0.50303,
-            "11": 0.50387,
-            "12": 0.50249,
-            "13": 0.51153,
-            "14": 0.49861,
-            "15": 0.51318,
-            "16": 0.50066,
-            "17": 0.50888,
-            "18": 0.50788,
-            "19": 0.51533,
-            "20": 0.51425,
-            "21": 0.51111,
-            "22": 0.5116,
-            "23": 0.50626,
-            "24": 0.5049,
-            "25": 0.51101,
-            "26": 0.50993,
-            "27": 0.5073,
-            "28": 0.50949,
-            "29": 0.50784,
-            "30": 0.50783,
-            "31": 0.51255,
-            "32": 0.51065,
-            "33": 0.50731,
-            "34": 0.50768,
-            "35": 0.51749,
-            "36": 0.50656,
-            "37": 0.51012,
-            "38": 0.51668,
-            "39": 0.50475,
-            "40": 0.50784,
-            "41": 0.51405,
-            "42": 0.51014,
-            "43": 0.51186,
-            "44": 0.50532,
-            "45": 0.51211,
-            "46": 0.52864,
-            "47": 0.52545,
-            "48": 0.50927,
-            "49": 0.52883,
-            "50": 0.50373
+            "1": 22.46796,
+            "2": 0.55121,
+            "3": 0.49073,
+            "4": 0.49513,
+            "5": 0.46581,
+            "6": 0.45704,
+            "7": 0.47585,
+            "8": 1.29882,
+            "9": 0.47574,
+            "10": 0.46585,
+            "11": 0.48809,
+            "12": 0.45979,
+            "13": 0.47153,
+            "14": 0.82188,
+            "15": 0.47696,
+            "16": 0.45474,
+            "17": 0.46236,
+            "18": 0.45323,
+            "19": 0.45728,
+            "20": 0.47493,
+            "21": 0.45187,
+            "22": 0.45466,
+            "23": 0.45322,
+            "24": 0.45177,
+            "25": 0.45722,
+            "26": 0.46293,
+            "27": 0.45714,
+            "28": 0.45943,
+            "29": 0.45163,
+            "30": 0.45687,
+            "31": 0.4545,
+            "32": 0.45288,
+            "33": 0.45164,
+            "34": 0.45777,
+            "35": 0.46272,
+            "36": 0.45524,
+            "37": 0.45441,
+            "38": 0.45752,
+            "39": 0.4509,
+            "40": 0.44879,
+            "41": 0.45622,
+            "42": 0.45367,
+            "43": 0.46325,
+            "44": 0.45127,
+            "45": 0.46393,
+            "46": 0.51509,
+            "47": 0.46791,
+            "48": 0.45502,
+            "49": 0.48346,
+            "50": 0.45945
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json
index 184675324be..e3b2e326fda 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79175, "5": 10.82288, "10": 10.7688, "15": 10.79157, "20": 10.71001, "25": 10.54662, "30": 10.39407, "35": 10.30461, "40": 10.13303, "45": 9.90015, "50": 9.97874}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5656.0, "5": 6476.0, "10": 5453.0, "15": 6233.0, "20": 5837.0, "25": 5811.0, "30": 6047.0, "35": 6712.0, "40": 7062.0, "45": 6681.0, "50": 7527.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458213888.0, "5": 458213376.0, "10": 458215936.0, "15": 458215424.0, "20": 458214400.0, "25": 458211840.0, "30": 458211840.0, "35": 458215936.0, "40": 458213376.0, "45": 458214400.0, "50": 458214912.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1016114688.0, "5": 1180862464.0, "10": 1181913600.0, "15": 1181913600.0, "20": 1181913600.0, "25": 1181913600.0, "30": 1181913600.0, "35": 1181913600.0, "40": 1181913600.0, "45": 1181913600.0, "50": 1181913600.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.08798, "5": 0.55184, "10": 0.55591, "15": 0.55638, "20": 0.55589, "25": 0.55697, "30": 0.55631, "35": 0.55801, "40": 0.55677, "45": 0.55857, "50": 0.57711}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.79175,
+            "2": 10.80907,
+            "3": 10.81011,
+            "4": 10.78146,
+            "5": 10.82288,
+            "6": 10.84057,
+            "7": 10.81192,
+            "8": 10.80005,
+            "9": 10.81667,
+            "10": 10.7688,
+            "11": 10.8618,
+            "12": 10.84042,
+            "13": 10.84452,
+            "14": 10.86421,
+            "15": 10.79157,
+            "16": 10.78199,
+            "17": 10.75122,
+            "18": 10.79446,
+            "19": 10.79523,
+            "20": 10.71001,
+            "21": 10.68811,
+            "22": 10.53736,
+            "23": 10.7066,
+            "24": 10.58865,
+            "25": 10.54662,
+            "26": 10.59492,
+            "27": 10.62142,
+            "28": 10.5969,
+            "29": 10.60036,
+            "30": 10.39407,
+            "31": 10.12951,
+            "32": 10.49684,
+            "33": 10.48779,
+            "34": 10.24347,
+            "35": 10.30461,
+            "36": 10.26056,
+            "37": 10.38859,
+            "38": 10.24848,
+            "39": 10.43799,
+            "40": 10.13303,
+            "41": 10.18651,
+            "42": 10.25823,
+            "43": 9.892,
+            "44": 10.02576,
+            "45": 9.90015,
+            "46": 9.88387,
+            "47": 10.19565,
+            "48": 9.91255,
+            "49": 9.60147,
+            "50": 9.97874
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5656.0,
+            "2": 6018.0,
+            "3": 5790.0,
+            "4": 5941.0,
+            "5": 6476.0,
+            "6": 6653.0,
+            "7": 6287.0,
+            "8": 5875.0,
+            "9": 6239.0,
+            "10": 5453.0,
+            "11": 6936.0,
+            "12": 6711.0,
+            "13": 6655.0,
+            "14": 6814.0,
+            "15": 6233.0,
+            "16": 6533.0,
+            "17": 6397.0,
+            "18": 6112.0,
+            "19": 6678.0,
+            "20": 5837.0,
+            "21": 6403.0,
+            "22": 5715.0,
+            "23": 6744.0,
+            "24": 6051.0,
+            "25": 5811.0,
+            "26": 6104.0,
+            "27": 6484.0,
+            "28": 6884.0,
+            "29": 7253.0,
+            "30": 6047.0,
+            "31": 5593.0,
+            "32": 6625.0,
+            "33": 7054.0,
+            "34": 6104.0,
+            "35": 6712.0,
+            "36": 6684.0,
+            "37": 7523.0,
+            "38": 7273.0,
+            "39": 7620.0,
+            "40": 7062.0,
+            "41": 6895.0,
+            "42": 7426.0,
+            "43": 6713.0,
+            "44": 6664.0,
+            "45": 6681.0,
+            "46": 6923.0,
+            "47": 7705.0,
+            "48": 7248.0,
+            "49": 7331.0,
+            "50": 7527.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 458213888.0,
+            "2": 458211840.0,
+            "3": 458215424.0,
+            "4": 458211840.0,
+            "5": 458213376.0,
+            "6": 458213888.0,
+            "7": 458216448.0,
+            "8": 458216448.0,
+            "9": 458212864.0,
+            "10": 458215936.0,
+            "11": 458213888.0,
+            "12": 458213888.0,
+            "13": 458214400.0,
+            "14": 458215424.0,
+            "15": 458215424.0,
+            "16": 458212864.0,
+            "17": 458214400.0,
+            "18": 458214400.0,
+            "19": 458214400.0,
+            "20": 458214400.0,
+            "21": 458211840.0,
+            "22": 458218496.0,
+            "23": 458214912.0,
+            "24": 458214400.0,
+            "25": 458211840.0,
+            "26": 458215936.0,
+            "27": 458210816.0,
+            "28": 458213888.0,
+            "29": 458212864.0,
+            "30": 458211840.0,
+            "31": 458219008.0,
+            "32": 458214400.0,
+            "33": 458214912.0,
+            "34": 458211840.0,
+            "35": 458215936.0,
+            "36": 458212864.0,
+            "37": 458215424.0,
+            "38": 458213888.0,
+            "39": 458213888.0,
+            "40": 458213376.0,
+            "41": 458216960.0,
+            "42": 458215424.0,
+            "43": 458216960.0,
+            "44": 458213376.0,
+            "45": 458214400.0,
+            "46": 458216448.0,
+            "47": 458213376.0,
+            "48": 458213888.0,
+            "49": 458215424.0,
+            "50": 458214912.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1029256704.0,
+            "2": 1193177088.0,
+            "3": 1193177088.0,
+            "4": 1193686016.0,
+            "5": 1193686016.0,
+            "6": 1193686016.0,
+            "7": 1193686016.0,
+            "8": 1193686016.0,
+            "9": 1193771520.0,
+            "10": 1193771520.0,
+            "11": 1193771520.0,
+            "12": 1193771520.0,
+            "13": 1193771520.0,
+            "14": 1193771520.0,
+            "15": 1193771520.0,
+            "16": 1193771520.0,
+            "17": 1193771520.0,
+            "18": 1193771520.0,
+            "19": 1193771520.0,
+            "20": 1193771520.0,
+            "21": 1193771520.0,
+            "22": 1193918464.0,
+            "23": 1193918464.0,
+            "24": 1193918464.0,
+            "25": 1193918464.0,
+            "26": 1193918464.0,
+            "27": 1193918464.0,
+            "28": 1193918464.0,
+            "29": 1193918464.0,
+            "30": 1193918464.0,
+            "31": 1193918464.0,
+            "32": 1193918464.0,
+            "33": 1193918464.0,
+            "34": 1193918464.0,
+            "35": 1193918464.0,
+            "36": 1193918464.0,
+            "37": 1193918464.0,
+            "38": 1193918464.0,
+            "39": 1193918464.0,
+            "40": 1194139136.0,
+            "41": 1194139136.0,
+            "42": 1194139136.0,
+            "43": 1194249728.0,
+            "44": 1194249728.0,
+            "45": 1194249728.0,
+            "46": 1194249728.0,
+            "47": 1194249728.0,
+            "48": 1194249728.0,
+            "49": 1194249728.0,
+            "50": 1194249728.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 6.67874,
+            "2": 0.59048,
+            "3": 0.55954,
+            "4": 0.55064,
+            "5": 0.54285,
+            "6": 0.54344,
+            "7": 0.54862,
+            "8": 0.542,
+            "9": 0.54738,
+            "10": 0.54947,
+            "11": 0.53996,
+            "12": 0.54615,
+            "13": 0.54407,
+            "14": 0.54098,
+            "15": 0.55148,
+            "16": 0.54024,
+            "17": 0.54784,
+            "18": 0.54329,
+            "19": 0.54213,
+            "20": 0.55192,
+            "21": 0.53901,
+            "22": 0.54612,
+            "23": 0.54495,
+            "24": 0.54254,
+            "25": 0.55242,
+            "26": 0.53958,
+            "27": 0.54346,
+            "28": 0.5466,
+            "29": 0.54048,
+            "30": 0.55385,
+            "31": 0.54112,
+            "32": 0.54404,
+            "33": 0.54779,
+            "34": 0.54049,
+            "35": 0.53889,
+            "36": 0.53823,
+            "37": 0.54013,
+            "38": 0.53918,
+            "39": 0.53801,
+            "40": 0.5394,
+            "41": 0.53905,
+            "42": 0.53797,
+            "43": 0.53957,
+            "44": 0.5384,
+            "45": 0.53795,
+            "46": 0.53859,
+            "47": 0.54222,
+            "48": 0.53881,
+            "49": 0.5401,
+            "50": 0.53746
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..9a3140994d3
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80801,
+            "2": 10.80912,
+            "3": 10.81804,
+            "4": 10.77924,
+            "5": 10.83867,
+            "6": 10.8473,
+            "7": 10.80366,
+            "8": 10.81171,
+            "9": 10.80948,
+            "10": 10.77389,
+            "11": 10.85853,
+            "12": 10.83206,
+            "13": 10.84778,
+            "14": 10.86351,
+            "15": 10.77822,
+            "16": 10.78665,
+            "17": 10.74969,
+            "18": 10.78174,
+            "19": 10.77893,
+            "20": 10.71133,
+            "21": 10.68188,
+            "22": 10.53221,
+            "23": 10.70751,
+            "24": 10.58301,
+            "25": 10.53686,
+            "26": 10.59662,
+            "27": 10.62332,
+            "28": 10.58807,
+            "29": 10.61089,
+            "30": 10.39372,
+            "31": 10.1118,
+            "32": 10.4835,
+            "33": 10.48693,
+            "34": 10.23859,
+            "35": 10.29466,
+            "36": 10.25749,
+            "37": 10.38723,
+            "38": 10.24326,
+            "39": 10.43603,
+            "40": 10.12881,
+            "41": 10.18559,
+            "42": 10.25677,
+            "43": 9.8808,
+            "44": 10.00863,
+            "45": 9.89409,
+            "46": 9.85423,
+            "47": 10.1998,
+            "48": 9.90437,
+            "49": 9.58703,
+            "50": 9.96891
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5550.0,
+            "2": 5910.0,
+            "3": 5966.0,
+            "4": 5860.0,
+            "5": 6749.0,
+            "6": 6869.0,
+            "7": 6129.0,
+            "8": 5905.0,
+            "9": 6154.0,
+            "10": 5266.0,
+            "11": 6792.0,
+            "12": 6324.0,
+            "13": 6845.0,
+            "14": 6862.0,
+            "15": 6306.0,
+            "16": 6524.0,
+            "17": 6567.0,
+            "18": 6194.0,
+            "19": 6515.0,
+            "20": 5979.0,
+            "21": 6327.0,
+            "22": 5748.0,
+            "23": 6749.0,
+            "24": 5978.0,
+            "25": 5661.0,
+            "26": 6206.0,
+            "27": 6307.0,
+            "28": 7003.0,
+            "29": 7124.0,
+            "30": 6390.0,
+            "31": 5578.0,
+            "32": 6783.0,
+            "33": 7031.0,
+            "34": 6306.0,
+            "35": 6516.0,
+            "36": 6614.0,
+            "37": 7690.0,
+            "38": 7193.0,
+            "39": 7850.0,
+            "40": 7170.0,
+            "41": 6880.0,
+            "42": 7329.0,
+            "43": 6669.0,
+            "44": 6616.0,
+            "45": 6700.0,
+            "46": 7080.0,
+            "47": 7661.0,
+            "48": 7259.0,
+            "49": 7083.0,
+            "50": 7418.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 458211840.0,
+            "2": 458214400.0,
+            "3": 458210304.0,
+            "4": 458211840.0,
+            "5": 458212864.0,
+            "6": 458211840.0,
+            "7": 458213376.0,
+            "8": 458212352.0,
+            "9": 458213888.0,
+            "10": 458214400.0,
+            "11": 458212864.0,
+            "12": 458211840.0,
+            "13": 458214400.0,
+            "14": 458213888.0,
+            "15": 458214912.0,
+            "16": 458210816.0,
+            "17": 458213888.0,
+            "18": 458212864.0,
+            "19": 458214400.0,
+            "20": 458212352.0,
+            "21": 458214912.0,
+            "22": 458217472.0,
+            "23": 458213888.0,
+            "24": 458215424.0,
+            "25": 458212864.0,
+            "26": 458211328.0,
+            "27": 458213888.0,
+            "28": 458212864.0,
+            "29": 458213376.0,
+            "30": 458211840.0,
+            "31": 458214400.0,
+            "32": 458213888.0,
+            "33": 458213376.0,
+            "34": 458214400.0,
+            "35": 458213888.0,
+            "36": 458213888.0,
+            "37": 458212352.0,
+            "38": 458211328.0,
+            "39": 458212352.0,
+            "40": 458214912.0,
+            "41": 458212864.0,
+            "42": 458214912.0,
+            "43": 458215936.0,
+            "44": 458213376.0,
+            "45": 458212352.0,
+            "46": 458214400.0,
+            "47": 458214400.0,
+            "48": 458214400.0,
+            "49": 458212864.0,
+            "50": 458212352.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1027753472.0,
+            "2": 1191298048.0,
+            "3": 1191298048.0,
+            "4": 1192127488.0,
+            "5": 1192127488.0,
+            "6": 1192127488.0,
+            "7": 1192127488.0,
+            "8": 1192127488.0,
+            "9": 1192127488.0,
+            "10": 1192127488.0,
+            "11": 1192546816.0,
+            "12": 1192546816.0,
+            "13": 1193283584.0,
+            "14": 1193283584.0,
+            "15": 1193283584.0,
+            "16": 1193283584.0,
+            "17": 1193283584.0,
+            "18": 1193283584.0,
+            "19": 1193283584.0,
+            "20": 1193283584.0,
+            "21": 1193283584.0,
+            "22": 1193556992.0,
+            "23": 1193556992.0,
+            "24": 1193556992.0,
+            "25": 1193556992.0,
+            "26": 1193556992.0,
+            "27": 1193556992.0,
+            "28": 1193556992.0,
+            "29": 1193556992.0,
+            "30": 1193556992.0,
+            "31": 1193556992.0,
+            "32": 1193556992.0,
+            "33": 1193556992.0,
+            "34": 1193556992.0,
+            "35": 1193556992.0,
+            "36": 1193556992.0,
+            "37": 1193556992.0,
+            "38": 1193556992.0,
+            "39": 1193556992.0,
+            "40": 1193556992.0,
+            "41": 1193556992.0,
+            "42": 1193556992.0,
+            "43": 1193556992.0,
+            "44": 1193556992.0,
+            "45": 1193556992.0,
+            "46": 1193556992.0,
+            "47": 1193556992.0,
+            "48": 1193556992.0,
+            "49": 1193556992.0,
+            "50": 1193556992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 5.55037,
+            "3": 0.71803,
+            "4": 1.02899,
+            "5": 0.69274,
+            "6": 0.69497,
+            "7": 0.70712,
+            "8": 0.70476,
+            "9": 0.71137,
+            "10": 0.69978,
+            "11": 0.69201,
+            "12": 0.68876,
+            "13": 0.68954,
+            "14": 0.69496,
+            "15": 0.70166,
+            "16": 0.6946,
+            "17": 0.69269,
+            "18": 0.69041,
+            "19": 0.69219,
+            "20": 0.69191,
+            "21": 0.68931,
+            "22": 0.69642,
+            "23": 0.7087,
+            "24": 0.71205,
+            "25": 0.75075,
+            "26": 0.71466,
+            "27": 0.79945,
+            "28": 0.68459,
+            "29": 0.69018,
+            "30": 0.68977,
+            "31": 0.69421,
+            "32": 0.68991,
+            "33": 0.70331,
+            "34": 0.70581,
+            "35": 0.69718,
+            "36": 0.69748,
+            "37": 0.69248,
+            "38": 0.69828,
+            "39": 0.68816,
+            "40": 0.69315,
+            "41": 0.69476,
+            "42": 0.69711,
+            "43": 0.70588,
+            "44": 0.69538,
+            "45": 0.69598,
+            "46": 0.70429,
+            "47": 0.69137,
+            "48": 0.69183,
+            "49": 0.70009,
+            "50": 0.69388
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json
index b250bf7ac21..6ec10f4f931 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 491766784.0,
-            "2": 491767296.0,
-            "3": 491765760.0,
-            "4": 491767296.0,
-            "5": 491766784.0,
-            "6": 491767808.0,
-            "7": 491767296.0,
-            "8": 491768320.0,
-            "9": 491767808.0,
-            "10": 491767296.0,
-            "11": 491765248.0,
-            "12": 491764736.0,
-            "13": 491766272.0,
-            "14": 491767808.0,
-            "15": 491768832.0,
-            "16": 491769856.0,
-            "17": 491767296.0,
-            "18": 491765248.0,
-            "19": 491766272.0,
-            "20": 491766784.0,
-            "21": 491768320.0,
-            "22": 491768320.0,
-            "23": 491765760.0,
-            "24": 491766272.0,
-            "25": 491766272.0,
-            "26": 491767296.0,
-            "27": 491766784.0,
-            "28": 491767296.0,
-            "29": 491766272.0,
-            "30": 491766272.0,
-            "31": 491767808.0,
-            "32": 491765760.0,
-            "33": 491764736.0,
-            "34": 491768320.0,
-            "35": 491769344.0,
-            "36": 491765760.0,
-            "37": 491765248.0,
-            "38": 491766272.0,
-            "39": 491767808.0,
-            "40": 491765760.0,
-            "41": 491768320.0,
-            "42": 491766272.0,
-            "43": 491768832.0,
-            "44": 491768320.0,
-            "45": 491765248.0,
-            "46": 491768320.0,
-            "47": 491765760.0,
-            "48": 491766784.0,
-            "49": 491766784.0,
-            "50": 491765248.0
+            "1": 458212352.0,
+            "2": 458212864.0,
+            "3": 458211328.0,
+            "4": 458212864.0,
+            "5": 458212352.0,
+            "6": 458213376.0,
+            "7": 458212864.0,
+            "8": 458213888.0,
+            "9": 458213376.0,
+            "10": 458212864.0,
+            "11": 458210816.0,
+            "12": 458210304.0,
+            "13": 458211840.0,
+            "14": 458213376.0,
+            "15": 458214400.0,
+            "16": 458215424.0,
+            "17": 458212864.0,
+            "18": 458210816.0,
+            "19": 458211840.0,
+            "20": 458212352.0,
+            "21": 458213888.0,
+            "22": 458213888.0,
+            "23": 458211328.0,
+            "24": 458211840.0,
+            "25": 458211840.0,
+            "26": 458212864.0,
+            "27": 458212352.0,
+            "28": 458212864.0,
+            "29": 458211840.0,
+            "30": 458211840.0,
+            "31": 458213376.0,
+            "32": 458211328.0,
+            "33": 458210304.0,
+            "34": 458213888.0,
+            "35": 458214912.0,
+            "36": 458211328.0,
+            "37": 458210816.0,
+            "38": 458211840.0,
+            "39": 458213376.0,
+            "40": 458211328.0,
+            "41": 458213888.0,
+            "42": 458211840.0,
+            "43": 458214400.0,
+            "44": 458213888.0,
+            "45": 458210816.0,
+            "46": 458213888.0,
+            "47": 458211328.0,
+            "48": 458212352.0,
+            "49": 458212352.0,
+            "50": 458210816.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1047229440.0,
-            "2": 1213900288.0,
-            "3": 1213900288.0,
-            "4": 1213900288.0,
-            "5": 1213900288.0,
-            "6": 1213900288.0,
-            "7": 1213900288.0,
-            "8": 1213900288.0,
-            "9": 1213900288.0,
-            "10": 1213900288.0,
-            "11": 1213900288.0,
-            "12": 1213900288.0,
-            "13": 1213900288.0,
-            "14": 1213900288.0,
-            "15": 1213900288.0,
-            "16": 1213900288.0,
-            "17": 1213900288.0,
-            "18": 1213900288.0,
-            "19": 1213900288.0,
-            "20": 1213900288.0,
-            "21": 1213900288.0,
-            "22": 1213900288.0,
-            "23": 1213900288.0,
-            "24": 1213900288.0,
-            "25": 1213900288.0,
-            "26": 1213900288.0,
-            "27": 1213900288.0,
-            "28": 1213900288.0,
-            "29": 1213900288.0,
-            "30": 1213900288.0,
-            "31": 1213900288.0,
-            "32": 1213900288.0,
-            "33": 1213900288.0,
-            "34": 1213900288.0,
-            "35": 1213900288.0,
-            "36": 1213900288.0,
-            "37": 1213900288.0,
-            "38": 1213900288.0,
-            "39": 1213900288.0,
-            "40": 1213900288.0,
-            "41": 1213900288.0,
-            "42": 1213900288.0,
-            "43": 1213900288.0,
-            "44": 1213900288.0,
-            "45": 1213900288.0,
-            "46": 1213900288.0,
-            "47": 1213900288.0,
-            "48": 1213900288.0,
-            "49": 1213900288.0,
-            "50": 1213900288.0
+            "1": 1026068480.0,
+            "2": 1192152064.0,
+            "3": 1192152064.0,
+            "4": 1192205312.0,
+            "5": 1192205312.0,
+            "6": 1192205312.0,
+            "7": 1192205312.0,
+            "8": 1192205312.0,
+            "9": 1192205312.0,
+            "10": 1192205312.0,
+            "11": 1192205312.0,
+            "12": 1192205312.0,
+            "13": 1192349184.0,
+            "14": 1192349184.0,
+            "15": 1192506368.0,
+            "16": 1192506368.0,
+            "17": 1192506368.0,
+            "18": 1192506368.0,
+            "19": 1192506368.0,
+            "20": 1192506368.0,
+            "21": 1192506368.0,
+            "22": 1192506368.0,
+            "23": 1192506368.0,
+            "24": 1192506368.0,
+            "25": 1192506368.0,
+            "26": 1192506368.0,
+            "27": 1192506368.0,
+            "28": 1192506368.0,
+            "29": 1192506368.0,
+            "30": 1192506368.0,
+            "31": 1192506368.0,
+            "32": 1192506368.0,
+            "33": 1192506368.0,
+            "34": 1192506368.0,
+            "35": 1192506368.0,
+            "36": 1192506368.0,
+            "37": 1192506368.0,
+            "38": 1192506368.0,
+            "39": 1192506368.0,
+            "40": 1192506368.0,
+            "41": 1192506368.0,
+            "42": 1192506368.0,
+            "43": 1192506368.0,
+            "44": 1192506368.0,
+            "45": 1192506368.0,
+            "46": 1192506368.0,
+            "47": 1192506368.0,
+            "48": 1192506368.0,
+            "49": 1192506368.0,
+            "50": 1192506368.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.26707,
-            "2": 0.52806,
-            "3": 0.46475,
-            "4": 0.47125,
-            "5": 0.42985,
-            "6": 0.42614,
-            "7": 0.43552,
-            "8": 0.42689,
-            "9": 0.42927,
-            "10": 0.42373,
-            "11": 0.42662,
-            "12": 0.42301,
-            "13": 0.42359,
-            "14": 0.4226,
-            "15": 0.42796,
-            "16": 0.42415,
-            "17": 0.4235,
-            "18": 0.41948,
-            "19": 0.42601,
-            "20": 0.42722,
-            "21": 0.4176,
-            "22": 0.41953,
-            "23": 0.42303,
-            "24": 0.4187,
-            "25": 0.42281,
-            "26": 0.42449,
-            "27": 0.41941,
-            "28": 0.42935,
-            "29": 0.417,
-            "30": 0.4261,
-            "31": 0.42904,
-            "32": 0.41844,
-            "33": 0.41687,
-            "34": 0.43419,
-            "35": 0.43727,
-            "36": 0.42315,
-            "37": 0.42179,
-            "38": 0.42403,
-            "39": 0.4179,
-            "40": 0.42443,
-            "41": 0.42169,
-            "42": 0.42155,
-            "43": 0.43942,
-            "44": 0.42209,
-            "45": 0.41972,
-            "46": 0.46515,
-            "47": 0.43911,
-            "48": 0.43693,
-            "49": 0.44745,
-            "50": 0.4198
+            "1": 13.43711,
+            "2": 0.5648,
+            "3": 0.46103,
+            "4": 0.42843,
+            "5": 0.39023,
+            "6": 0.40228,
+            "7": 0.39933,
+            "8": 0.40801,
+            "9": 0.41661,
+            "10": 0.41115,
+            "11": 0.40919,
+            "12": 0.38713,
+            "13": 0.3967,
+            "14": 0.39634,
+            "15": 0.3917,
+            "16": 0.38895,
+            "17": 0.39488,
+            "18": 0.38262,
+            "19": 0.38633,
+            "20": 0.38778,
+            "21": 0.37793,
+            "22": 0.38122,
+            "23": 0.3785,
+            "24": 0.38176,
+            "25": 0.37936,
+            "26": 0.38399,
+            "27": 0.37425,
+            "28": 0.38373,
+            "29": 0.37674,
+            "30": 0.38541,
+            "31": 0.38748,
+            "32": 0.37483,
+            "33": 0.37931,
+            "34": 0.38691,
+            "35": 0.39293,
+            "36": 0.38011,
+            "37": 0.37641,
+            "38": 0.37714,
+            "39": 0.37754,
+            "40": 0.3929,
+            "41": 0.37984,
+            "42": 0.37748,
+            "43": 0.39504,
+            "44": 0.38155,
+            "45": 0.39617,
+            "46": 0.42631,
+            "47": 0.39497,
+            "48": 0.39432,
+            "49": 0.40482,
+            "50": 0.37964
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json
index a186febffbe..d8e319ffb51 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 462408192.0,
-            "2": 462406144.0,
-            "3": 462409728.0,
-            "4": 462406144.0,
-            "5": 462407680.0,
-            "6": 462408192.0,
-            "7": 462410752.0,
-            "8": 462410752.0,
-            "9": 462407168.0,
-            "10": 462410240.0,
-            "11": 462408192.0,
-            "12": 462408192.0,
-            "13": 462408704.0,
-            "14": 462409728.0,
-            "15": 462409728.0,
-            "16": 462407168.0,
-            "17": 462408704.0,
-            "18": 462408704.0,
-            "19": 462408704.0,
-            "20": 462408704.0,
-            "21": 462406144.0,
-            "22": 462412800.0,
-            "23": 462409216.0,
-            "24": 462408704.0,
-            "25": 462406144.0,
-            "26": 462410240.0,
-            "27": 462405120.0,
-            "28": 462408192.0,
-            "29": 462407168.0,
-            "30": 462406144.0,
-            "31": 462413312.0,
-            "32": 462408704.0,
-            "33": 462409216.0,
-            "34": 462406144.0,
-            "35": 462410240.0,
-            "36": 462407168.0,
-            "37": 462409728.0,
-            "38": 462408192.0,
-            "39": 462408192.0,
-            "40": 462407680.0,
-            "41": 462411264.0,
-            "42": 462409728.0,
-            "43": 462411264.0,
-            "44": 462407680.0,
-            "45": 462408704.0,
-            "46": 462410752.0,
-            "47": 462407680.0,
-            "48": 462408192.0,
-            "49": 462409728.0,
-            "50": 462409216.0
+            "1": 458213888.0,
+            "2": 458211840.0,
+            "3": 458215424.0,
+            "4": 458211840.0,
+            "5": 458213376.0,
+            "6": 458213888.0,
+            "7": 458216448.0,
+            "8": 458216448.0,
+            "9": 458212864.0,
+            "10": 458215936.0,
+            "11": 458213888.0,
+            "12": 458213888.0,
+            "13": 458214400.0,
+            "14": 458215424.0,
+            "15": 458215424.0,
+            "16": 458212864.0,
+            "17": 458214400.0,
+            "18": 458214400.0,
+            "19": 458214400.0,
+            "20": 458214400.0,
+            "21": 458211840.0,
+            "22": 458218496.0,
+            "23": 458214912.0,
+            "24": 458214400.0,
+            "25": 458211840.0,
+            "26": 458215936.0,
+            "27": 458210816.0,
+            "28": 458213888.0,
+            "29": 458212864.0,
+            "30": 458211840.0,
+            "31": 458219008.0,
+            "32": 458214400.0,
+            "33": 458214912.0,
+            "34": 458211840.0,
+            "35": 458215936.0,
+            "36": 458212864.0,
+            "37": 458215424.0,
+            "38": 458213888.0,
+            "39": 458213888.0,
+            "40": 458213376.0,
+            "41": 458216960.0,
+            "42": 458215424.0,
+            "43": 458216960.0,
+            "44": 458213376.0,
+            "45": 458214400.0,
+            "46": 458216448.0,
+            "47": 458213376.0,
+            "48": 458213888.0,
+            "49": 458215424.0,
+            "50": 458214912.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1033451008.0,
-            "2": 1197371392.0,
-            "3": 1197371392.0,
-            "4": 1197880320.0,
-            "5": 1197880320.0,
-            "6": 1197880320.0,
-            "7": 1197880320.0,
-            "8": 1197880320.0,
-            "9": 1197965824.0,
-            "10": 1197965824.0,
-            "11": 1197965824.0,
-            "12": 1197965824.0,
-            "13": 1197965824.0,
-            "14": 1197965824.0,
-            "15": 1197965824.0,
-            "16": 1197965824.0,
-            "17": 1197965824.0,
-            "18": 1197965824.0,
-            "19": 1197965824.0,
-            "20": 1197965824.0,
-            "21": 1197965824.0,
-            "22": 1198112768.0,
-            "23": 1198112768.0,
-            "24": 1198112768.0,
-            "25": 1198112768.0,
-            "26": 1198112768.0,
-            "27": 1198112768.0,
-            "28": 1198112768.0,
-            "29": 1198112768.0,
-            "30": 1198112768.0,
-            "31": 1198112768.0,
-            "32": 1198112768.0,
-            "33": 1198112768.0,
-            "34": 1198112768.0,
-            "35": 1198112768.0,
-            "36": 1198112768.0,
-            "37": 1198112768.0,
-            "38": 1198112768.0,
-            "39": 1198112768.0,
-            "40": 1198333440.0,
-            "41": 1198333440.0,
-            "42": 1198333440.0,
-            "43": 1198444032.0,
-            "44": 1198444032.0,
-            "45": 1198444032.0,
-            "46": 1198444032.0,
-            "47": 1198444032.0,
-            "48": 1198444032.0,
-            "49": 1198444032.0,
-            "50": 1198444032.0
+            "1": 1029256704.0,
+            "2": 1193177088.0,
+            "3": 1193177088.0,
+            "4": 1193686016.0,
+            "5": 1193686016.0,
+            "6": 1193686016.0,
+            "7": 1193686016.0,
+            "8": 1193686016.0,
+            "9": 1193771520.0,
+            "10": 1193771520.0,
+            "11": 1193771520.0,
+            "12": 1193771520.0,
+            "13": 1193771520.0,
+            "14": 1193771520.0,
+            "15": 1193771520.0,
+            "16": 1193771520.0,
+            "17": 1193771520.0,
+            "18": 1193771520.0,
+            "19": 1193771520.0,
+            "20": 1193771520.0,
+            "21": 1193771520.0,
+            "22": 1193918464.0,
+            "23": 1193918464.0,
+            "24": 1193918464.0,
+            "25": 1193918464.0,
+            "26": 1193918464.0,
+            "27": 1193918464.0,
+            "28": 1193918464.0,
+            "29": 1193918464.0,
+            "30": 1193918464.0,
+            "31": 1193918464.0,
+            "32": 1193918464.0,
+            "33": 1193918464.0,
+            "34": 1193918464.0,
+            "35": 1193918464.0,
+            "36": 1193918464.0,
+            "37": 1193918464.0,
+            "38": 1193918464.0,
+            "39": 1193918464.0,
+            "40": 1194139136.0,
+            "41": 1194139136.0,
+            "42": 1194139136.0,
+            "43": 1194249728.0,
+            "44": 1194249728.0,
+            "45": 1194249728.0,
+            "46": 1194249728.0,
+            "47": 1194249728.0,
+            "48": 1194249728.0,
+            "49": 1194249728.0,
+            "50": 1194249728.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 12.49228,
-            "2": 0.63481,
-            "3": 0.56951,
-            "4": 0.57807,
-            "5": 0.581,
-            "6": 0.58159,
-            "7": 0.5705,
-            "8": 0.56929,
-            "9": 0.56794,
-            "10": 0.56314,
-            "11": 0.57935,
-            "12": 0.57294,
-            "13": 0.56865,
-            "14": 0.56698,
-            "15": 0.56505,
-            "16": 0.56266,
-            "17": 0.56337,
-            "18": 0.56237,
-            "19": 0.56197,
-            "20": 0.5626,
-            "21": 0.5642,
-            "22": 0.56373,
-            "23": 0.57291,
-            "24": 0.56432,
-            "25": 0.56287,
-            "26": 0.56295,
-            "27": 0.56146,
-            "28": 0.56459,
-            "29": 0.56415,
-            "30": 0.56587,
-            "31": 0.5671,
-            "32": 0.56896,
-            "33": 0.57526,
-            "34": 0.57281,
-            "35": 0.57407,
-            "36": 0.57321,
-            "37": 0.57403,
-            "38": 0.57296,
-            "39": 0.57248,
-            "40": 0.57089,
-            "41": 0.57201,
-            "42": 0.5661,
-            "43": 0.57044,
-            "44": 0.56777,
-            "45": 0.56877,
-            "46": 0.57143,
-            "47": 0.57031,
-            "48": 0.56952,
-            "49": 0.57353,
-            "50": 0.56636
+            "1": 6.42109,
+            "2": 0.63984,
+            "3": 0.57811,
+            "4": 0.56134,
+            "5": 0.56563,
+            "6": 0.56363,
+            "7": 0.56774,
+            "8": 0.56212,
+            "9": 0.56082,
+            "10": 0.55677,
+            "11": 0.55824,
+            "12": 0.55917,
+            "13": 0.55878,
+            "14": 0.55777,
+            "15": 0.5601,
+            "16": 0.5566,
+            "17": 0.55819,
+            "18": 0.55905,
+            "19": 0.55832,
+            "20": 0.55798,
+            "21": 0.56392,
+            "22": 0.55882,
+            "23": 0.55672,
+            "24": 0.55578,
+            "25": 0.559,
+            "26": 0.55625,
+            "27": 0.55438,
+            "28": 0.55769,
+            "29": 0.55694,
+            "30": 0.55738,
+            "31": 0.55917,
+            "32": 0.55757,
+            "33": 0.55756,
+            "34": 0.55564,
+            "35": 0.557,
+            "36": 0.55678,
+            "37": 0.55963,
+            "38": 0.55693,
+            "39": 0.55382,
+            "40": 0.55644,
+            "41": 0.55445,
+            "42": 0.55427,
+            "43": 0.55749,
+            "44": 0.55808,
+            "45": 0.56177,
+            "46": 0.57237,
+            "47": 0.55947,
+            "48": 0.55498,
+            "49": 0.55635,
+            "50": 0.55639
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
index d859d8da902..b4462fc931e 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79175, "5": 10.82288, "10": 10.7688, "15": 10.79157, "20": 10.71001, "25": 10.54662, "30": 10.39407, "35": 10.30461, "40": 10.13303, "45": 9.90015, "50": 9.97874}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5656.0, "5": 6476.0, "10": 5453.0, "15": 6233.0, "20": 5837.0, "25": 5811.0, "30": 6047.0, "35": 6712.0, "40": 7062.0, "45": 6681.0, "50": 7527.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458213888.0, "5": 458213376.0, "10": 458215936.0, "15": 458215424.0, "20": 458214400.0, "25": 458211840.0, "30": 458211840.0, "35": 458215936.0, "40": 458213376.0, "45": 458214400.0, "50": 458214912.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1016114688.0, "5": 1180862464.0, "10": 1181913600.0, "15": 1181913600.0, "20": 1181913600.0, "25": 1181913600.0, "30": 1181913600.0, "35": 1181913600.0, "40": 1181913600.0, "45": 1181913600.0, "50": 1181913600.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.22686, "5": 0.5556, "10": 0.55416, "15": 0.55551, "20": 0.55398, "25": 0.55449, "30": 0.59353, "35": 0.55443, "40": 0.55473, "45": 0.55192, "50": 0.55296}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.79175,
+            "2": 10.80907,
+            "3": 10.81011,
+            "4": 10.78146,
+            "5": 10.82288,
+            "6": 10.84057,
+            "7": 10.81192,
+            "8": 10.80005,
+            "9": 10.81667,
+            "10": 10.7688,
+            "11": 10.8618,
+            "12": 10.84042,
+            "13": 10.84452,
+            "14": 10.86421,
+            "15": 10.79157,
+            "16": 10.78199,
+            "17": 10.75122,
+            "18": 10.79446,
+            "19": 10.79523,
+            "20": 10.71001,
+            "21": 10.68811,
+            "22": 10.53736,
+            "23": 10.7066,
+            "24": 10.58865,
+            "25": 10.54662,
+            "26": 10.59492,
+            "27": 10.62142,
+            "28": 10.5969,
+            "29": 10.60036,
+            "30": 10.39407,
+            "31": 10.12951,
+            "32": 10.49684,
+            "33": 10.48779,
+            "34": 10.24347,
+            "35": 10.30461,
+            "36": 10.26056,
+            "37": 10.38859,
+            "38": 10.24848,
+            "39": 10.43799,
+            "40": 10.13303,
+            "41": 10.18651,
+            "42": 10.25823,
+            "43": 9.892,
+            "44": 10.02576,
+            "45": 9.90015,
+            "46": 9.88387,
+            "47": 10.19565,
+            "48": 9.91255,
+            "49": 9.60147,
+            "50": 9.97874
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5656.0,
+            "2": 6018.0,
+            "3": 5790.0,
+            "4": 5941.0,
+            "5": 6476.0,
+            "6": 6653.0,
+            "7": 6287.0,
+            "8": 5875.0,
+            "9": 6239.0,
+            "10": 5453.0,
+            "11": 6936.0,
+            "12": 6711.0,
+            "13": 6655.0,
+            "14": 6814.0,
+            "15": 6233.0,
+            "16": 6533.0,
+            "17": 6397.0,
+            "18": 6112.0,
+            "19": 6678.0,
+            "20": 5837.0,
+            "21": 6403.0,
+            "22": 5715.0,
+            "23": 6744.0,
+            "24": 6051.0,
+            "25": 5811.0,
+            "26": 6104.0,
+            "27": 6484.0,
+            "28": 6884.0,
+            "29": 7253.0,
+            "30": 6047.0,
+            "31": 5593.0,
+            "32": 6625.0,
+            "33": 7054.0,
+            "34": 6104.0,
+            "35": 6712.0,
+            "36": 6684.0,
+            "37": 7523.0,
+            "38": 7273.0,
+            "39": 7620.0,
+            "40": 7062.0,
+            "41": 6895.0,
+            "42": 7426.0,
+            "43": 6713.0,
+            "44": 6664.0,
+            "45": 6681.0,
+            "46": 6923.0,
+            "47": 7705.0,
+            "48": 7248.0,
+            "49": 7331.0,
+            "50": 7527.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 458213888.0,
+            "2": 458211840.0,
+            "3": 458215424.0,
+            "4": 458211840.0,
+            "5": 458213376.0,
+            "6": 458213888.0,
+            "7": 458216448.0,
+            "8": 458216448.0,
+            "9": 458212864.0,
+            "10": 458215936.0,
+            "11": 458213888.0,
+            "12": 458213888.0,
+            "13": 458214400.0,
+            "14": 458215424.0,
+            "15": 458215424.0,
+            "16": 458212864.0,
+            "17": 458214400.0,
+            "18": 458214400.0,
+            "19": 458214400.0,
+            "20": 458214400.0,
+            "21": 458211840.0,
+            "22": 458218496.0,
+            "23": 458214912.0,
+            "24": 458214400.0,
+            "25": 458211840.0,
+            "26": 458215936.0,
+            "27": 458210816.0,
+            "28": 458213888.0,
+            "29": 458212864.0,
+            "30": 458211840.0,
+            "31": 458219008.0,
+            "32": 458214400.0,
+            "33": 458214912.0,
+            "34": 458211840.0,
+            "35": 458215936.0,
+            "36": 458212864.0,
+            "37": 458215424.0,
+            "38": 458213888.0,
+            "39": 458213888.0,
+            "40": 458213376.0,
+            "41": 458216960.0,
+            "42": 458215424.0,
+            "43": 458216960.0,
+            "44": 458213376.0,
+            "45": 458214400.0,
+            "46": 458216448.0,
+            "47": 458213376.0,
+            "48": 458213888.0,
+            "49": 458215424.0,
+            "50": 458214912.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1029256704.0,
+            "2": 1193177088.0,
+            "3": 1193177088.0,
+            "4": 1193686016.0,
+            "5": 1193686016.0,
+            "6": 1193686016.0,
+            "7": 1193686016.0,
+            "8": 1193686016.0,
+            "9": 1193771520.0,
+            "10": 1193771520.0,
+            "11": 1193771520.0,
+            "12": 1193771520.0,
+            "13": 1193771520.0,
+            "14": 1193771520.0,
+            "15": 1193771520.0,
+            "16": 1193771520.0,
+            "17": 1193771520.0,
+            "18": 1193771520.0,
+            "19": 1193771520.0,
+            "20": 1193771520.0,
+            "21": 1193771520.0,
+            "22": 1193918464.0,
+            "23": 1193918464.0,
+            "24": 1193918464.0,
+            "25": 1193918464.0,
+            "26": 1193918464.0,
+            "27": 1193918464.0,
+            "28": 1193918464.0,
+            "29": 1193918464.0,
+            "30": 1193918464.0,
+            "31": 1193918464.0,
+            "32": 1193918464.0,
+            "33": 1193918464.0,
+            "34": 1193918464.0,
+            "35": 1193918464.0,
+            "36": 1193918464.0,
+            "37": 1193918464.0,
+            "38": 1193918464.0,
+            "39": 1193918464.0,
+            "40": 1194139136.0,
+            "41": 1194139136.0,
+            "42": 1194139136.0,
+            "43": 1194249728.0,
+            "44": 1194249728.0,
+            "45": 1194249728.0,
+            "46": 1194249728.0,
+            "47": 1194249728.0,
+            "48": 1194249728.0,
+            "49": 1194249728.0,
+            "50": 1194249728.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 6.42299,
+            "2": 0.59069,
+            "3": 0.56496,
+            "4": 0.54736,
+            "5": 0.54792,
+            "6": 0.57731,
+            "7": 0.54778,
+            "8": 0.54659,
+            "9": 0.54833,
+            "10": 0.54497,
+            "11": 0.55076,
+            "12": 0.55595,
+            "13": 0.54721,
+            "14": 0.54614,
+            "15": 0.5457,
+            "16": 0.54774,
+            "17": 0.54518,
+            "18": 0.54582,
+            "19": 0.5467,
+            "20": 0.54611,
+            "21": 0.54622,
+            "22": 0.54617,
+            "23": 0.54622,
+            "24": 0.54547,
+            "25": 0.54796,
+            "26": 0.54413,
+            "27": 0.5458,
+            "28": 0.54598,
+            "29": 0.54813,
+            "30": 0.54556,
+            "31": 0.54684,
+            "32": 0.54789,
+            "33": 0.57275,
+            "34": 0.54705,
+            "35": 0.54545,
+            "36": 0.54414,
+            "37": 0.54225,
+            "38": 0.54504,
+            "39": 0.54284,
+            "40": 0.54185,
+            "41": 0.54578,
+            "42": 0.54542,
+            "43": 0.54621,
+            "44": 0.54447,
+            "45": 0.54521,
+            "46": 0.5449,
+            "47": 0.54529,
+            "48": 0.54403,
+            "49": 0.56089,
+            "50": 0.54374
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..883b2c99518
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80801,
+            "2": 10.80912,
+            "3": 10.81804,
+            "4": 10.77924,
+            "5": 10.83867,
+            "6": 10.8473,
+            "7": 10.80366,
+            "8": 10.81171,
+            "9": 10.80948,
+            "10": 10.77389,
+            "11": 10.85853,
+            "12": 10.83206,
+            "13": 10.84778,
+            "14": 10.86351,
+            "15": 10.77822,
+            "16": 10.78665,
+            "17": 10.74969,
+            "18": 10.78174,
+            "19": 10.77893,
+            "20": 10.71133,
+            "21": 10.68188,
+            "22": 10.53221,
+            "23": 10.70751,
+            "24": 10.58301,
+            "25": 10.53686,
+            "26": 10.59662,
+            "27": 10.62332,
+            "28": 10.58807,
+            "29": 10.61089,
+            "30": 10.39372,
+            "31": 10.1118,
+            "32": 10.4835,
+            "33": 10.48693,
+            "34": 10.23859,
+            "35": 10.29466,
+            "36": 10.25749,
+            "37": 10.38723,
+            "38": 10.24326,
+            "39": 10.43603,
+            "40": 10.12881,
+            "41": 10.18559,
+            "42": 10.25677,
+            "43": 9.8808,
+            "44": 10.00863,
+            "45": 9.89409,
+            "46": 9.85423,
+            "47": 10.1998,
+            "48": 9.90437,
+            "49": 9.58703,
+            "50": 9.96891
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5550.0,
+            "2": 5910.0,
+            "3": 5966.0,
+            "4": 5860.0,
+            "5": 6749.0,
+            "6": 6869.0,
+            "7": 6129.0,
+            "8": 5905.0,
+            "9": 6154.0,
+            "10": 5266.0,
+            "11": 6792.0,
+            "12": 6324.0,
+            "13": 6845.0,
+            "14": 6862.0,
+            "15": 6306.0,
+            "16": 6524.0,
+            "17": 6567.0,
+            "18": 6194.0,
+            "19": 6515.0,
+            "20": 5979.0,
+            "21": 6327.0,
+            "22": 5748.0,
+            "23": 6749.0,
+            "24": 5978.0,
+            "25": 5661.0,
+            "26": 6206.0,
+            "27": 6307.0,
+            "28": 7003.0,
+            "29": 7124.0,
+            "30": 6390.0,
+            "31": 5578.0,
+            "32": 6783.0,
+            "33": 7031.0,
+            "34": 6306.0,
+            "35": 6516.0,
+            "36": 6614.0,
+            "37": 7690.0,
+            "38": 7193.0,
+            "39": 7850.0,
+            "40": 7170.0,
+            "41": 6880.0,
+            "42": 7329.0,
+            "43": 6669.0,
+            "44": 6616.0,
+            "45": 6700.0,
+            "46": 7080.0,
+            "47": 7661.0,
+            "48": 7259.0,
+            "49": 7083.0,
+            "50": 7418.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 458211840.0,
+            "2": 458214400.0,
+            "3": 458210304.0,
+            "4": 458211840.0,
+            "5": 458212864.0,
+            "6": 458211840.0,
+            "7": 458213376.0,
+            "8": 458212352.0,
+            "9": 458213888.0,
+            "10": 458214400.0,
+            "11": 458212864.0,
+            "12": 458211840.0,
+            "13": 458214400.0,
+            "14": 458213888.0,
+            "15": 458214912.0,
+            "16": 458210816.0,
+            "17": 458213888.0,
+            "18": 458212864.0,
+            "19": 458214400.0,
+            "20": 458212352.0,
+            "21": 458214912.0,
+            "22": 458217472.0,
+            "23": 458213888.0,
+            "24": 458215424.0,
+            "25": 458212864.0,
+            "26": 458211328.0,
+            "27": 458213888.0,
+            "28": 458212864.0,
+            "29": 458213376.0,
+            "30": 458211840.0,
+            "31": 458214400.0,
+            "32": 458213888.0,
+            "33": 458213376.0,
+            "34": 458214400.0,
+            "35": 458213888.0,
+            "36": 458213888.0,
+            "37": 458212352.0,
+            "38": 458211328.0,
+            "39": 458212352.0,
+            "40": 458214912.0,
+            "41": 458212864.0,
+            "42": 458214912.0,
+            "43": 458215936.0,
+            "44": 458213376.0,
+            "45": 458212352.0,
+            "46": 458214400.0,
+            "47": 458214400.0,
+            "48": 458214400.0,
+            "49": 458212864.0,
+            "50": 458212352.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1027753472.0,
+            "2": 1191298048.0,
+            "3": 1191298048.0,
+            "4": 1192127488.0,
+            "5": 1192127488.0,
+            "6": 1192127488.0,
+            "7": 1192127488.0,
+            "8": 1192127488.0,
+            "9": 1192127488.0,
+            "10": 1192127488.0,
+            "11": 1192546816.0,
+            "12": 1192546816.0,
+            "13": 1193283584.0,
+            "14": 1193283584.0,
+            "15": 1193283584.0,
+            "16": 1193283584.0,
+            "17": 1193283584.0,
+            "18": 1193283584.0,
+            "19": 1193283584.0,
+            "20": 1193283584.0,
+            "21": 1193283584.0,
+            "22": 1193556992.0,
+            "23": 1193556992.0,
+            "24": 1193556992.0,
+            "25": 1193556992.0,
+            "26": 1193556992.0,
+            "27": 1193556992.0,
+            "28": 1193556992.0,
+            "29": 1193556992.0,
+            "30": 1193556992.0,
+            "31": 1193556992.0,
+            "32": 1193556992.0,
+            "33": 1193556992.0,
+            "34": 1193556992.0,
+            "35": 1193556992.0,
+            "36": 1193556992.0,
+            "37": 1193556992.0,
+            "38": 1193556992.0,
+            "39": 1193556992.0,
+            "40": 1193556992.0,
+            "41": 1193556992.0,
+            "42": 1193556992.0,
+            "43": 1193556992.0,
+            "44": 1193556992.0,
+            "45": 1193556992.0,
+            "46": 1193556992.0,
+            "47": 1193556992.0,
+            "48": 1193556992.0,
+            "49": 1193556992.0,
+            "50": 1193556992.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 5.57426,
+            "3": 0.69219,
+            "4": 0.73624,
+            "5": 0.67766,
+            "6": 0.68248,
+            "7": 0.69364,
+            "8": 0.67336,
+            "9": 0.67768,
+            "10": 0.64366,
+            "11": 0.62986,
+            "12": 0.62576,
+            "13": 0.618,
+            "14": 0.61177,
+            "15": 0.61656,
+            "16": 0.61633,
+            "17": 0.61648,
+            "18": 0.62197,
+            "19": 0.61422,
+            "20": 0.61923,
+            "21": 0.61598,
+            "22": 0.62583,
+            "23": 0.62054,
+            "24": 0.61791,
+            "25": 0.62065,
+            "26": 0.61387,
+            "27": 0.61437,
+            "28": 0.61372,
+            "29": 0.65198,
+            "30": 0.66353,
+            "31": 0.68179,
+            "32": 0.67222,
+            "33": 0.67462,
+            "34": 0.68277,
+            "35": 0.66769,
+            "36": 0.66387,
+            "37": 0.66002,
+            "38": 0.63341,
+            "39": 0.62396,
+            "40": 0.62802,
+            "41": 0.62419,
+            "42": 0.61655,
+            "43": 0.62362,
+            "44": 0.61679,
+            "45": 0.61772,
+            "46": 0.62253,
+            "47": 0.61779,
+            "48": 0.61712,
+            "49": 0.63575,
+            "50": 0.67932
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
index eb4665ad7e2..64dc8751e92 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 491766784.0,
-            "2": 491767296.0,
-            "3": 491765760.0,
-            "4": 491767296.0,
-            "5": 491766784.0,
-            "6": 491767808.0,
-            "7": 491767296.0,
-            "8": 491768320.0,
-            "9": 491767808.0,
-            "10": 491767296.0,
-            "11": 491765248.0,
-            "12": 491764736.0,
-            "13": 491766272.0,
-            "14": 491767808.0,
-            "15": 491768832.0,
-            "16": 491769856.0,
-            "17": 491767296.0,
-            "18": 491765248.0,
-            "19": 491766272.0,
-            "20": 491766784.0,
-            "21": 491768320.0,
-            "22": 491768320.0,
-            "23": 491765760.0,
-            "24": 491766272.0,
-            "25": 491766272.0,
-            "26": 491767296.0,
-            "27": 491766784.0,
-            "28": 491767296.0,
-            "29": 491766272.0,
-            "30": 491766272.0,
-            "31": 491767808.0,
-            "32": 491765760.0,
-            "33": 491764736.0,
-            "34": 491768320.0,
-            "35": 491769344.0,
-            "36": 491765760.0,
-            "37": 491765248.0,
-            "38": 491766272.0,
-            "39": 491767808.0,
-            "40": 491765760.0,
-            "41": 491768320.0,
-            "42": 491766272.0,
-            "43": 491768832.0,
-            "44": 491768320.0,
-            "45": 491765248.0,
-            "46": 491768320.0,
-            "47": 491765760.0,
-            "48": 491766784.0,
-            "49": 491766784.0,
-            "50": 491765248.0
+            "1": 458212352.0,
+            "2": 458212864.0,
+            "3": 458211328.0,
+            "4": 458212864.0,
+            "5": 458212352.0,
+            "6": 458213376.0,
+            "7": 458212864.0,
+            "8": 458213888.0,
+            "9": 458213376.0,
+            "10": 458212864.0,
+            "11": 458210816.0,
+            "12": 458210304.0,
+            "13": 458211840.0,
+            "14": 458213376.0,
+            "15": 458214400.0,
+            "16": 458215424.0,
+            "17": 458212864.0,
+            "18": 458210816.0,
+            "19": 458211840.0,
+            "20": 458212352.0,
+            "21": 458213888.0,
+            "22": 458213888.0,
+            "23": 458211328.0,
+            "24": 458211840.0,
+            "25": 458211840.0,
+            "26": 458212864.0,
+            "27": 458212352.0,
+            "28": 458212864.0,
+            "29": 458211840.0,
+            "30": 458211840.0,
+            "31": 458213376.0,
+            "32": 458211328.0,
+            "33": 458210304.0,
+            "34": 458213888.0,
+            "35": 458214912.0,
+            "36": 458211328.0,
+            "37": 458210816.0,
+            "38": 458211840.0,
+            "39": 458213376.0,
+            "40": 458211328.0,
+            "41": 458213888.0,
+            "42": 458211840.0,
+            "43": 458214400.0,
+            "44": 458213888.0,
+            "45": 458210816.0,
+            "46": 458213888.0,
+            "47": 458211328.0,
+            "48": 458212352.0,
+            "49": 458212352.0,
+            "50": 458210816.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1047229440.0,
-            "2": 1213900288.0,
-            "3": 1213900288.0,
-            "4": 1213900288.0,
-            "5": 1213900288.0,
-            "6": 1213900288.0,
-            "7": 1213900288.0,
-            "8": 1213900288.0,
-            "9": 1213900288.0,
-            "10": 1213900288.0,
-            "11": 1213900288.0,
-            "12": 1213900288.0,
-            "13": 1213900288.0,
-            "14": 1213900288.0,
-            "15": 1213900288.0,
-            "16": 1213900288.0,
-            "17": 1213900288.0,
-            "18": 1213900288.0,
-            "19": 1213900288.0,
-            "20": 1213900288.0,
-            "21": 1213900288.0,
-            "22": 1213900288.0,
-            "23": 1213900288.0,
-            "24": 1213900288.0,
-            "25": 1213900288.0,
-            "26": 1213900288.0,
-            "27": 1213900288.0,
-            "28": 1213900288.0,
-            "29": 1213900288.0,
-            "30": 1213900288.0,
-            "31": 1213900288.0,
-            "32": 1213900288.0,
-            "33": 1213900288.0,
-            "34": 1213900288.0,
-            "35": 1213900288.0,
-            "36": 1213900288.0,
-            "37": 1213900288.0,
-            "38": 1213900288.0,
-            "39": 1213900288.0,
-            "40": 1213900288.0,
-            "41": 1213900288.0,
-            "42": 1213900288.0,
-            "43": 1213900288.0,
-            "44": 1213900288.0,
-            "45": 1213900288.0,
-            "46": 1213900288.0,
-            "47": 1213900288.0,
-            "48": 1213900288.0,
-            "49": 1213900288.0,
-            "50": 1213900288.0
+            "1": 1026068480.0,
+            "2": 1192152064.0,
+            "3": 1192152064.0,
+            "4": 1192205312.0,
+            "5": 1192205312.0,
+            "6": 1192205312.0,
+            "7": 1192205312.0,
+            "8": 1192205312.0,
+            "9": 1192205312.0,
+            "10": 1192205312.0,
+            "11": 1192205312.0,
+            "12": 1192205312.0,
+            "13": 1192349184.0,
+            "14": 1192349184.0,
+            "15": 1192506368.0,
+            "16": 1192506368.0,
+            "17": 1192506368.0,
+            "18": 1192506368.0,
+            "19": 1192506368.0,
+            "20": 1192506368.0,
+            "21": 1192506368.0,
+            "22": 1192506368.0,
+            "23": 1192506368.0,
+            "24": 1192506368.0,
+            "25": 1192506368.0,
+            "26": 1192506368.0,
+            "27": 1192506368.0,
+            "28": 1192506368.0,
+            "29": 1192506368.0,
+            "30": 1192506368.0,
+            "31": 1192506368.0,
+            "32": 1192506368.0,
+            "33": 1192506368.0,
+            "34": 1192506368.0,
+            "35": 1192506368.0,
+            "36": 1192506368.0,
+            "37": 1192506368.0,
+            "38": 1192506368.0,
+            "39": 1192506368.0,
+            "40": 1192506368.0,
+            "41": 1192506368.0,
+            "42": 1192506368.0,
+            "43": 1192506368.0,
+            "44": 1192506368.0,
+            "45": 1192506368.0,
+            "46": 1192506368.0,
+            "47": 1192506368.0,
+            "48": 1192506368.0,
+            "49": 1192506368.0,
+            "50": 1192506368.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.31352,
-            "2": 0.50754,
-            "3": 0.44486,
-            "4": 0.4668,
-            "5": 0.42238,
-            "6": 0.42115,
-            "7": 0.42604,
-            "8": 0.4217,
-            "9": 0.42265,
-            "10": 0.41522,
-            "11": 0.41976,
-            "12": 0.41287,
-            "13": 0.42113,
-            "14": 0.41948,
-            "15": 0.4211,
-            "16": 0.41519,
-            "17": 0.42043,
-            "18": 0.415,
-            "19": 0.42142,
-            "20": 0.42878,
-            "21": 0.4145,
-            "22": 0.42054,
-            "23": 0.41581,
-            "24": 0.42934,
-            "25": 0.43897,
-            "26": 0.42648,
-            "27": 0.42242,
-            "28": 0.42576,
-            "29": 0.42795,
-            "30": 0.42485,
-            "31": 0.43439,
-            "32": 0.42257,
-            "33": 0.41924,
-            "34": 0.43519,
-            "35": 0.43865,
-            "36": 0.42518,
-            "37": 0.42435,
-            "38": 0.42597,
-            "39": 0.42134,
-            "40": 0.42937,
-            "41": 0.42822,
-            "42": 0.42413,
-            "43": 0.44197,
-            "44": 0.42413,
-            "45": 0.42687,
-            "46": 0.46081,
-            "47": 0.45208,
-            "48": 0.43527,
-            "49": 0.44658,
-            "50": 0.41965
+            "1": 13.13083,
+            "2": 0.49339,
+            "3": 0.43067,
+            "4": 0.43124,
+            "5": 0.38622,
+            "6": 0.39174,
+            "7": 0.39833,
+            "8": 0.39421,
+            "9": 0.3937,
+            "10": 0.38682,
+            "11": 0.39333,
+            "12": 0.38647,
+            "13": 0.38364,
+            "14": 0.38374,
+            "15": 0.38593,
+            "16": 0.38263,
+            "17": 0.39915,
+            "18": 0.38564,
+            "19": 0.38954,
+            "20": 0.38955,
+            "21": 0.38216,
+            "22": 0.38466,
+            "23": 0.38551,
+            "24": 0.38195,
+            "25": 0.38416,
+            "26": 0.38554,
+            "27": 0.38123,
+            "28": 0.38882,
+            "29": 0.43011,
+            "30": 0.38995,
+            "31": 0.39202,
+            "32": 0.38203,
+            "33": 0.38777,
+            "34": 0.39058,
+            "35": 0.39634,
+            "36": 0.38496,
+            "37": 0.38112,
+            "38": 0.38052,
+            "39": 0.37771,
+            "40": 0.38438,
+            "41": 0.38696,
+            "42": 0.38029,
+            "43": 0.39638,
+            "44": 0.38187,
+            "45": 0.38285,
+            "46": 0.42266,
+            "47": 0.3977,
+            "48": 0.39566,
+            "49": 0.40884,
+            "50": 0.38389
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json
index 4ebfff8da76..3f86a0b644a 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 462408192.0,
-            "2": 462406144.0,
-            "3": 462409728.0,
-            "4": 462406144.0,
-            "5": 462407680.0,
-            "6": 462408192.0,
-            "7": 462410752.0,
-            "8": 462410752.0,
-            "9": 462407168.0,
-            "10": 462410240.0,
-            "11": 462408192.0,
-            "12": 462408192.0,
-            "13": 462408704.0,
-            "14": 462409728.0,
-            "15": 462409728.0,
-            "16": 462407168.0,
-            "17": 462408704.0,
-            "18": 462408704.0,
-            "19": 462408704.0,
-            "20": 462408704.0,
-            "21": 462406144.0,
-            "22": 462412800.0,
-            "23": 462409216.0,
-            "24": 462408704.0,
-            "25": 462406144.0,
-            "26": 462410240.0,
-            "27": 462405120.0,
-            "28": 462408192.0,
-            "29": 462407168.0,
-            "30": 462406144.0,
-            "31": 462413312.0,
-            "32": 462408704.0,
-            "33": 462409216.0,
-            "34": 462406144.0,
-            "35": 462410240.0,
-            "36": 462407168.0,
-            "37": 462409728.0,
-            "38": 462408192.0,
-            "39": 462408192.0,
-            "40": 462407680.0,
-            "41": 462411264.0,
-            "42": 462409728.0,
-            "43": 462411264.0,
-            "44": 462407680.0,
-            "45": 462408704.0,
-            "46": 462410752.0,
-            "47": 462407680.0,
-            "48": 462408192.0,
-            "49": 462409728.0,
-            "50": 462409216.0
+            "1": 458213888.0,
+            "2": 458211840.0,
+            "3": 458215424.0,
+            "4": 458211840.0,
+            "5": 458213376.0,
+            "6": 458213888.0,
+            "7": 458216448.0,
+            "8": 458216448.0,
+            "9": 458212864.0,
+            "10": 458215936.0,
+            "11": 458213888.0,
+            "12": 458213888.0,
+            "13": 458214400.0,
+            "14": 458215424.0,
+            "15": 458215424.0,
+            "16": 458212864.0,
+            "17": 458214400.0,
+            "18": 458214400.0,
+            "19": 458214400.0,
+            "20": 458214400.0,
+            "21": 458211840.0,
+            "22": 458218496.0,
+            "23": 458214912.0,
+            "24": 458214400.0,
+            "25": 458211840.0,
+            "26": 458215936.0,
+            "27": 458210816.0,
+            "28": 458213888.0,
+            "29": 458212864.0,
+            "30": 458211840.0,
+            "31": 458219008.0,
+            "32": 458214400.0,
+            "33": 458214912.0,
+            "34": 458211840.0,
+            "35": 458215936.0,
+            "36": 458212864.0,
+            "37": 458215424.0,
+            "38": 458213888.0,
+            "39": 458213888.0,
+            "40": 458213376.0,
+            "41": 458216960.0,
+            "42": 458215424.0,
+            "43": 458216960.0,
+            "44": 458213376.0,
+            "45": 458214400.0,
+            "46": 458216448.0,
+            "47": 458213376.0,
+            "48": 458213888.0,
+            "49": 458215424.0,
+            "50": 458214912.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1033451008.0,
-            "2": 1197371392.0,
-            "3": 1197371392.0,
-            "4": 1197880320.0,
-            "5": 1197880320.0,
-            "6": 1197880320.0,
-            "7": 1197880320.0,
-            "8": 1197880320.0,
-            "9": 1197965824.0,
-            "10": 1197965824.0,
-            "11": 1197965824.0,
-            "12": 1197965824.0,
-            "13": 1197965824.0,
-            "14": 1197965824.0,
-            "15": 1197965824.0,
-            "16": 1197965824.0,
-            "17": 1197965824.0,
-            "18": 1197965824.0,
-            "19": 1197965824.0,
-            "20": 1197965824.0,
-            "21": 1197965824.0,
-            "22": 1198112768.0,
-            "23": 1198112768.0,
-            "24": 1198112768.0,
-            "25": 1198112768.0,
-            "26": 1198112768.0,
-            "27": 1198112768.0,
-            "28": 1198112768.0,
-            "29": 1198112768.0,
-            "30": 1198112768.0,
-            "31": 1198112768.0,
-            "32": 1198112768.0,
-            "33": 1198112768.0,
-            "34": 1198112768.0,
-            "35": 1198112768.0,
-            "36": 1198112768.0,
-            "37": 1198112768.0,
-            "38": 1198112768.0,
-            "39": 1198112768.0,
-            "40": 1198333440.0,
-            "41": 1198333440.0,
-            "42": 1198333440.0,
-            "43": 1198444032.0,
-            "44": 1198444032.0,
-            "45": 1198444032.0,
-            "46": 1198444032.0,
-            "47": 1198444032.0,
-            "48": 1198444032.0,
-            "49": 1198444032.0,
-            "50": 1198444032.0
+            "1": 1029256704.0,
+            "2": 1193177088.0,
+            "3": 1193177088.0,
+            "4": 1193686016.0,
+            "5": 1193686016.0,
+            "6": 1193686016.0,
+            "7": 1193686016.0,
+            "8": 1193686016.0,
+            "9": 1193771520.0,
+            "10": 1193771520.0,
+            "11": 1193771520.0,
+            "12": 1193771520.0,
+            "13": 1193771520.0,
+            "14": 1193771520.0,
+            "15": 1193771520.0,
+            "16": 1193771520.0,
+            "17": 1193771520.0,
+            "18": 1193771520.0,
+            "19": 1193771520.0,
+            "20": 1193771520.0,
+            "21": 1193771520.0,
+            "22": 1193918464.0,
+            "23": 1193918464.0,
+            "24": 1193918464.0,
+            "25": 1193918464.0,
+            "26": 1193918464.0,
+            "27": 1193918464.0,
+            "28": 1193918464.0,
+            "29": 1193918464.0,
+            "30": 1193918464.0,
+            "31": 1193918464.0,
+            "32": 1193918464.0,
+            "33": 1193918464.0,
+            "34": 1193918464.0,
+            "35": 1193918464.0,
+            "36": 1193918464.0,
+            "37": 1193918464.0,
+            "38": 1193918464.0,
+            "39": 1193918464.0,
+            "40": 1194139136.0,
+            "41": 1194139136.0,
+            "42": 1194139136.0,
+            "43": 1194249728.0,
+            "44": 1194249728.0,
+            "45": 1194249728.0,
+            "46": 1194249728.0,
+            "47": 1194249728.0,
+            "48": 1194249728.0,
+            "49": 1194249728.0,
+            "50": 1194249728.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 8.27777,
-            "2": 0.60806,
-            "3": 0.55409,
-            "4": 0.55324,
-            "5": 0.54815,
-            "6": 0.54698,
-            "7": 0.54712,
-            "8": 0.55008,
-            "9": 0.55718,
-            "10": 0.55527,
-            "11": 0.55082,
-            "12": 0.56208,
-            "13": 0.55625,
-            "14": 0.55717,
-            "15": 0.56582,
-            "16": 0.55953,
-            "17": 0.57188,
-            "18": 0.55508,
-            "19": 0.55956,
-            "20": 0.55934,
-            "21": 0.55676,
-            "22": 0.55842,
-            "23": 0.55867,
-            "24": 0.55987,
-            "25": 0.55941,
-            "26": 0.55642,
-            "27": 0.55364,
-            "28": 0.55209,
-            "29": 0.55397,
-            "30": 0.55602,
-            "31": 0.55344,
-            "32": 0.55195,
-            "33": 0.56308,
-            "34": 0.55588,
-            "35": 0.55251,
-            "36": 0.55314,
-            "37": 0.55563,
-            "38": 0.56708,
-            "39": 0.5661,
-            "40": 0.56725,
-            "41": 0.5663,
-            "42": 0.56565,
-            "43": 0.5725,
-            "44": 0.56736,
-            "45": 0.5674,
-            "46": 0.56751,
-            "47": 0.56642,
-            "48": 0.56257,
-            "49": 0.56841,
-            "50": 0.56452
+            "1": 6.51772,
+            "2": 0.67032,
+            "3": 0.58012,
+            "4": 0.56416,
+            "5": 0.56277,
+            "6": 0.56185,
+            "7": 0.56613,
+            "8": 0.56306,
+            "9": 0.55846,
+            "10": 0.55676,
+            "11": 0.58727,
+            "12": 0.58309,
+            "13": 0.58685,
+            "14": 0.57988,
+            "15": 0.57248,
+            "16": 0.5838,
+            "17": 0.58349,
+            "18": 0.57587,
+            "19": 0.57576,
+            "20": 0.56068,
+            "21": 0.56288,
+            "22": 0.5656,
+            "23": 0.56764,
+            "24": 0.55796,
+            "25": 0.5651,
+            "26": 0.56407,
+            "27": 0.56035,
+            "28": 0.5648,
+            "29": 0.55018,
+            "30": 0.55186,
+            "31": 0.64216,
+            "32": 0.64815,
+            "33": 0.64922,
+            "34": 0.64899,
+            "35": 0.65107,
+            "36": 0.64829,
+            "37": 0.64814,
+            "38": 0.64822,
+            "39": 0.64955,
+            "40": 0.61641,
+            "41": 0.5534,
+            "42": 0.55493,
+            "43": 0.55548,
+            "44": 0.55538,
+            "45": 0.55475,
+            "46": 0.5581,
+            "47": 0.55771,
+            "48": 0.5557,
+            "49": 0.55591,
+            "50": 0.5552
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml
index c714e058651..4d674322a23 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml
@@ -49,7 +49,6 @@ MODEL_ARGS:
   --no-gradient-accumulation-fusion: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
index 52611762241..e752e7d8fe0 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json
@@ -1 +1,287 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82004, "5": 10.84794, "10": 10.79461, "15": 10.82911, "20": 10.73175, "25": 10.57964, "30": 10.40859, "35": 10.31503, "40": 10.14367, "45": 9.914, "50": 9.97565}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12826.0, "5": 15951.0, "10": 12611.0, "15": 14834.0, "20": 13675.0, "25": 13129.0, "30": 14652.0, "35": 15183.0, "40": 16971.0, "45": 16188.0, "50": 18998.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 624217088.0, "5": 624219648.0, "10": 624217600.0, "15": 624222208.0, "20": 624221184.0, "25": 624558080.0, "30": 624215552.0, "35": 624218624.0, "40": 624219136.0, "45": 624218112.0, "50": 624219648.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1818388480.0, "5": 2048155136.0, "10": 2049900032.0, "15": 2049900032.0, "20": 2049900032.0, "25": 2049900032.0, "30": 2049900032.0, "35": 2049900032.0, "40": 2049900032.0, "45": 2049900032.0, "50": 2049900032.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.11963, "5": 0.34973, "10": 0.34623, "15": 0.3403, "20": 0.34061, "25": 0.3401, "30": 0.34214, "35": 0.3402, "40": 0.37279, "45": 0.33997, "50": 0.33985}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.82004,
+            "2": 10.8392,
+            "3": 10.81124,
+            "4": 10.81983,
+            "5": 10.84794,
+            "6": 10.8608,
+            "7": 10.84085,
+            "8": 10.84432,
+            "9": 10.8504,
+            "10": 10.79461,
+            "11": 10.85658,
+            "12": 10.84848,
+            "13": 10.86929,
+            "14": 10.8667,
+            "15": 10.82911,
+            "16": 10.81111,
+            "17": 10.79027,
+            "18": 10.80981,
+            "19": 10.81143,
+            "20": 10.73175,
+            "21": 10.71285,
+            "22": 10.58199,
+            "23": 10.72,
+            "24": 10.61704,
+            "25": 10.57964,
+            "26": 10.63372,
+            "27": 10.6365,
+            "28": 10.60641,
+            "29": 10.61561,
+            "30": 10.40859,
+            "31": 10.17068,
+            "32": 10.49958,
+            "33": 10.4963,
+            "34": 10.25574,
+            "35": 10.31503,
+            "36": 10.28536,
+            "37": 10.38742,
+            "38": 10.24674,
+            "39": 10.44222,
+            "40": 10.14384,
+            "41": 10.19169,
+            "42": 10.25683,
+            "43": 9.90704,
+            "44": 10.02666,
+            "45": 9.91412,
+            "46": 9.89643,
+            "47": 10.18881,
+            "48": 9.93025,
+            "49": 9.61398,
+            "50": 9.97515
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 12826.0,
+            "2": 14613.0,
+            "3": 14549.0,
+            "4": 13422.0,
+            "5": 15951.0,
+            "6": 16055.0,
+            "7": 15208.0,
+            "8": 12944.0,
+            "9": 15110.0,
+            "10": 12611.0,
+            "11": 16586.0,
+            "12": 14954.0,
+            "13": 15925.0,
+            "14": 16182.0,
+            "15": 14834.0,
+            "16": 16023.0,
+            "17": 15486.0,
+            "18": 15116.0,
+            "19": 15584.0,
+            "20": 13675.0,
+            "21": 13873.0,
+            "22": 12917.0,
+            "23": 16766.0,
+            "24": 13924.0,
+            "25": 13129.0,
+            "26": 14794.0,
+            "27": 15169.0,
+            "28": 16393.0,
+            "29": 16719.0,
+            "30": 14652.0,
+            "31": 13126.0,
+            "32": 15987.0,
+            "33": 17372.0,
+            "34": 14206.0,
+            "35": 15183.0,
+            "36": 15837.0,
+            "37": 17507.0,
+            "38": 16382.0,
+            "39": 18071.0,
+            "40": 16755.0,
+            "41": 16757.0,
+            "42": 17222.0,
+            "43": 15308.0,
+            "44": 15173.0,
+            "45": 16243.0,
+            "46": 17454.0,
+            "47": 19165.0,
+            "48": 16552.0,
+            "49": 16282.0,
+            "50": 19162.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 625695744.0,
+            "2": 625700352.0,
+            "3": 625698304.0,
+            "4": 625694720.0,
+            "5": 625771008.0,
+            "6": 625698304.0,
+            "7": 625713664.0,
+            "8": 625698304.0,
+            "9": 625696768.0,
+            "10": 626427392.0,
+            "11": 626528256.0,
+            "12": 625700864.0,
+            "13": 625701376.0,
+            "14": 625740288.0,
+            "15": 625700864.0,
+            "16": 625891840.0,
+            "17": 625693184.0,
+            "18": 625699840.0,
+            "19": 625699840.0,
+            "20": 625699840.0,
+            "21": 625711616.0,
+            "22": 625694720.0,
+            "23": 626073088.0,
+            "24": 626040832.0,
+            "25": 626703360.0,
+            "26": 625732096.0,
+            "27": 625732096.0,
+            "28": 625745408.0,
+            "29": 625777664.0,
+            "30": 625699328.0,
+            "31": 625959936.0,
+            "32": 625695232.0,
+            "33": 625698304.0,
+            "34": 625747968.0,
+            "35": 625720832.0,
+            "36": 625694720.0,
+            "37": 625883136.0,
+            "38": 625796096.0,
+            "39": 625697280.0,
+            "40": 625727488.0,
+            "41": 625707520.0,
+            "42": 625724416.0,
+            "43": 625731584.0,
+            "44": 625759232.0,
+            "45": 625696256.0,
+            "46": 625780224.0,
+            "47": 625701888.0,
+            "48": 625842688.0,
+            "49": 626536960.0,
+            "50": 625698816.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1843612672.0,
+            "2": 2073786880.0,
+            "3": 2074433024.0,
+            "4": 2074433024.0,
+            "5": 2074806784.0,
+            "6": 2075118080.0,
+            "7": 2076633600.0,
+            "8": 2076633600.0,
+            "9": 2076633600.0,
+            "10": 2076633600.0,
+            "11": 2076633600.0,
+            "12": 2076633600.0,
+            "13": 2076673536.0,
+            "14": 2076673536.0,
+            "15": 2076673536.0,
+            "16": 2076673536.0,
+            "17": 2076673536.0,
+            "18": 2076673536.0,
+            "19": 2076673536.0,
+            "20": 2076673536.0,
+            "21": 2076673536.0,
+            "22": 2076673536.0,
+            "23": 2076673536.0,
+            "24": 2076673536.0,
+            "25": 2076673536.0,
+            "26": 2076673536.0,
+            "27": 2076673536.0,
+            "28": 2076673536.0,
+            "29": 2076673536.0,
+            "30": 2076673536.0,
+            "31": 2076673536.0,
+            "32": 2076673536.0,
+            "33": 2076673536.0,
+            "34": 2076673536.0,
+            "35": 2076673536.0,
+            "36": 2076673536.0,
+            "37": 2076673536.0,
+            "38": 2076673536.0,
+            "39": 2076673536.0,
+            "40": 2076673536.0,
+            "41": 2076673536.0,
+            "42": 2076673536.0,
+            "43": 2076673536.0,
+            "44": 2076673536.0,
+            "45": 2076673536.0,
+            "46": 2076673536.0,
+            "47": 2076673536.0,
+            "48": 2076673536.0,
+            "49": 2076673536.0,
+            "50": 2076673536.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 6.71779,
+            "2": 0.4021,
+            "3": 0.34522,
+            "4": 0.32028,
+            "5": 0.32155,
+            "6": 0.32036,
+            "7": 0.32793,
+            "8": 0.31946,
+            "9": 0.32227,
+            "10": 0.32133,
+            "11": 0.3212,
+            "12": 0.32189,
+            "13": 0.32578,
+            "14": 0.3194,
+            "15": 0.32101,
+            "16": 0.3216,
+            "17": 0.32118,
+            "18": 0.3199,
+            "19": 0.32019,
+            "20": 0.32361,
+            "21": 0.32862,
+            "22": 0.32239,
+            "23": 0.31961,
+            "24": 0.31968,
+            "25": 0.32024,
+            "26": 0.31969,
+            "27": 0.31928,
+            "28": 0.32117,
+            "29": 0.32074,
+            "30": 0.32265,
+            "31": 0.32078,
+            "32": 0.32625,
+            "33": 0.32431,
+            "34": 0.3229,
+            "35": 0.32227,
+            "36": 0.32535,
+            "37": 0.32428,
+            "38": 0.31953,
+            "39": 0.32251,
+            "40": 0.32338,
+            "41": 0.32439,
+            "42": 0.32389,
+            "43": 0.32348,
+            "44": 0.32363,
+            "45": 0.32303,
+            "46": 0.32406,
+            "47": 0.32367,
+            "48": 0.32364,
+            "49": 0.32375,
+            "50": 0.32234
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..3e910ef7869
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.79574,
+            "2": 10.81485,
+            "3": 10.78713,
+            "4": 10.78269,
+            "5": 10.82015,
+            "6": 10.83331,
+            "7": 10.81116,
+            "8": 10.81446,
+            "9": 10.81645,
+            "10": 10.75997,
+            "11": 10.8388,
+            "12": 10.81544,
+            "13": 10.84141,
+            "14": 10.8476,
+            "15": 10.79857,
+            "16": 10.78544,
+            "17": 10.77004,
+            "18": 10.77906,
+            "19": 10.7689,
+            "20": 10.71392,
+            "21": 10.69182,
+            "22": 10.56438,
+            "23": 10.70939,
+            "24": 10.60304,
+            "25": 10.55748,
+            "26": 10.60238,
+            "27": 10.62835,
+            "28": 10.59772,
+            "29": 10.61013,
+            "30": 10.40394,
+            "31": 10.17092,
+            "32": 10.49069,
+            "33": 10.48436,
+            "34": 10.26719,
+            "35": 10.31532,
+            "36": 10.27654,
+            "37": 10.39353,
+            "38": 10.24536,
+            "39": 10.43863,
+            "40": 10.13998,
+            "41": 10.19151,
+            "42": 10.25868,
+            "43": 9.9191,
+            "44": 10.03026,
+            "45": 9.92187,
+            "46": 9.89763,
+            "47": 10.1946,
+            "48": 9.93001,
+            "49": 9.62787,
+            "50": 9.97966
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 13025.0,
+            "2": 14911.0,
+            "3": 14651.0,
+            "4": 13760.0,
+            "5": 16297.0,
+            "6": 16032.0,
+            "7": 15521.0,
+            "8": 13170.0,
+            "9": 15403.0,
+            "10": 12605.0,
+            "11": 16803.0,
+            "12": 15289.0,
+            "13": 16415.0,
+            "14": 16182.0,
+            "15": 15127.0,
+            "16": 16135.0,
+            "17": 15282.0,
+            "18": 15280.0,
+            "19": 15379.0,
+            "20": 13642.0,
+            "21": 14281.0,
+            "22": 13476.0,
+            "23": 16892.0,
+            "24": 13920.0,
+            "25": 13236.0,
+            "26": 15256.0,
+            "27": 15454.0,
+            "28": 15973.0,
+            "29": 16892.0,
+            "30": 14103.0,
+            "31": 13113.0,
+            "32": 16067.0,
+            "33": 16788.0,
+            "34": 14559.0,
+            "35": 14974.0,
+            "36": 15798.0,
+            "37": 17569.0,
+            "38": 16172.0,
+            "39": 17774.0,
+            "40": 16088.0,
+            "41": 16616.0,
+            "42": 17149.0,
+            "43": 15487.0,
+            "44": 15110.0,
+            "45": 16499.0,
+            "46": 17407.0,
+            "47": 19502.0,
+            "48": 16568.0,
+            "49": 16613.0,
+            "50": 18892.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 625796096.0,
+            "2": 625850368.0,
+            "3": 625987072.0,
+            "4": 625831424.0,
+            "5": 625794048.0,
+            "6": 625789952.0,
+            "7": 625830912.0,
+            "8": 625794048.0,
+            "9": 625861120.0,
+            "10": 625806848.0,
+            "11": 625795584.0,
+            "12": 626022912.0,
+            "13": 625802240.0,
+            "14": 625853952.0,
+            "15": 625796608.0,
+            "16": 625793024.0,
+            "17": 625798144.0,
+            "18": 625802240.0,
+            "19": 625792000.0,
+            "20": 625793536.0,
+            "21": 626690048.0,
+            "22": 626176000.0,
+            "23": 626092032.0,
+            "24": 625794560.0,
+            "25": 626540544.0,
+            "26": 625934848.0,
+            "27": 625799168.0,
+            "28": 625801728.0,
+            "29": 625793536.0,
+            "30": 626191360.0,
+            "31": 626149376.0,
+            "32": 626774016.0,
+            "33": 625792512.0,
+            "34": 625793024.0,
+            "35": 625851904.0,
+            "36": 625809408.0,
+            "37": 625794048.0,
+            "38": 625827328.0,
+            "39": 625865216.0,
+            "40": 625831936.0,
+            "41": 626081280.0,
+            "42": 626046464.0,
+            "43": 625792000.0,
+            "44": 625792000.0,
+            "45": 626266112.0,
+            "46": 626042880.0,
+            "47": 625789440.0,
+            "48": 625905152.0,
+            "49": 625883648.0,
+            "50": 626099712.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1844879360.0,
+            "2": 2076399104.0,
+            "3": 2076399104.0,
+            "4": 2079827456.0,
+            "5": 2079827456.0,
+            "6": 2079827456.0,
+            "7": 2079827456.0,
+            "8": 2079827456.0,
+            "9": 2079827456.0,
+            "10": 2079827456.0,
+            "11": 2079827456.0,
+            "12": 2079827456.0,
+            "13": 2079827456.0,
+            "14": 2079827456.0,
+            "15": 2079827456.0,
+            "16": 2079827456.0,
+            "17": 2079827456.0,
+            "18": 2079827456.0,
+            "19": 2079827456.0,
+            "20": 2079827456.0,
+            "21": 2079827456.0,
+            "22": 2079827456.0,
+            "23": 2079827456.0,
+            "24": 2079827456.0,
+            "25": 2079827456.0,
+            "26": 2079827456.0,
+            "27": 2079827456.0,
+            "28": 2079827456.0,
+            "29": 2079827456.0,
+            "30": 2079827456.0,
+            "31": 2079827456.0,
+            "32": 2079827456.0,
+            "33": 2079827456.0,
+            "34": 2079827456.0,
+            "35": 2079827456.0,
+            "36": 2079827456.0,
+            "37": 2079827456.0,
+            "38": 2079827456.0,
+            "39": 2079827456.0,
+            "40": 2079827456.0,
+            "41": 2079827456.0,
+            "42": 2079827456.0,
+            "43": 2079827456.0,
+            "44": 2079827456.0,
+            "45": 2079827456.0,
+            "46": 2079827456.0,
+            "47": 2079827456.0,
+            "48": 2079827456.0,
+            "49": 2079827456.0,
+            "50": 2079827456.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 5.62487,
+            "3": 0.39428,
+            "4": 0.3711,
+            "5": 0.36777,
+            "6": 0.36423,
+            "7": 0.35408,
+            "8": 0.35462,
+            "9": 0.35588,
+            "10": 0.35204,
+            "11": 0.35155,
+            "12": 0.35049,
+            "13": 0.35818,
+            "14": 0.35461,
+            "15": 0.36874,
+            "16": 0.367,
+            "17": 0.37423,
+            "18": 0.36926,
+            "19": 0.37139,
+            "20": 0.37109,
+            "21": 0.37066,
+            "22": 0.37237,
+            "23": 0.37636,
+            "24": 0.37618,
+            "25": 0.37461,
+            "26": 0.37622,
+            "27": 0.37576,
+            "28": 0.37551,
+            "29": 0.3765,
+            "30": 0.3787,
+            "31": 0.38695,
+            "32": 0.37235,
+            "33": 0.37931,
+            "34": 0.37817,
+            "35": 0.3749,
+            "36": 0.37829,
+            "37": 0.37774,
+            "38": 0.3755,
+            "39": 0.37889,
+            "40": 0.37688,
+            "41": 0.38007,
+            "42": 0.37324,
+            "43": 0.36948,
+            "44": 0.37523,
+            "45": 0.37464,
+            "46": 0.38496,
+            "47": 0.3737,
+            "48": 0.37892,
+            "49": 0.39066,
+            "50": 0.37612
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
index eb013c007ca..8928145fcbb 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 659394560.0,
-            "2": 659346944.0,
-            "3": 659401728.0,
-            "4": 659351040.0,
-            "5": 659623424.0,
-            "6": 659348480.0,
-            "7": 659508736.0,
-            "8": 659353088.0,
-            "9": 659383296.0,
-            "10": 659347456.0,
-            "11": 659350016.0,
-            "12": 659437056.0,
-            "13": 659356160.0,
-            "14": 659702272.0,
-            "15": 659658240.0,
-            "16": 659450880.0,
-            "17": 659438080.0,
-            "18": 659384320.0,
-            "19": 659492352.0,
-            "20": 659372544.0,
-            "21": 659350016.0,
-            "22": 659347456.0,
-            "23": 659348992.0,
-            "24": 659430400.0,
-            "25": 659347968.0,
-            "26": 659378176.0,
-            "27": 659353088.0,
-            "28": 659346944.0,
-            "29": 659440640.0,
-            "30": 659732480.0,
-            "31": 659361792.0,
-            "32": 659345920.0,
-            "33": 659473920.0,
-            "34": 660008448.0,
-            "35": 659819520.0,
-            "36": 659363840.0,
-            "37": 659418624.0,
-            "38": 659351040.0,
-            "39": 659449344.0,
-            "40": 659586560.0,
-            "41": 659387392.0,
-            "42": 659476480.0,
-            "43": 659567104.0,
-            "44": 659344384.0,
-            "45": 659346944.0,
-            "46": 659466752.0,
-            "47": 659345408.0,
-            "48": 659835392.0,
-            "49": 659494400.0,
-            "50": 659346432.0
+            "1": 625530880.0,
+            "2": 625483264.0,
+            "3": 625484800.0,
+            "4": 625516032.0,
+            "5": 625759744.0,
+            "6": 625774592.0,
+            "7": 625485312.0,
+            "8": 625568256.0,
+            "9": 625519616.0,
+            "10": 625655808.0,
+            "11": 625630720.0,
+            "12": 625482240.0,
+            "13": 625488384.0,
+            "14": 625819136.0,
+            "15": 625982976.0,
+            "16": 625500160.0,
+            "17": 625613312.0,
+            "18": 625494016.0,
+            "19": 625484288.0,
+            "20": 625508864.0,
+            "21": 625486336.0,
+            "22": 625486848.0,
+            "23": 625632768.0,
+            "24": 625487872.0,
+            "25": 625484288.0,
+            "26": 625753088.0,
+            "27": 625513984.0,
+            "28": 625483264.0,
+            "29": 625698816.0,
+            "30": 625967104.0,
+            "31": 625477632.0,
+            "32": 625523200.0,
+            "33": 625484288.0,
+            "34": 625481216.0,
+            "35": 625479680.0,
+            "36": 625554432.0,
+            "37": 625554944.0,
+            "38": 625487360.0,
+            "39": 625504768.0,
+            "40": 625481216.0,
+            "41": 625481728.0,
+            "42": 625481728.0,
+            "43": 626760192.0,
+            "44": 625598464.0,
+            "45": 625534464.0,
+            "46": 625603072.0,
+            "47": 625509376.0,
+            "48": 626520576.0,
+            "49": 625630720.0,
+            "50": 625565696.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1853294080.0,
-            "2": 2083995136.0,
-            "3": 2084402688.0,
-            "4": 2084433408.0,
-            "5": 2084433408.0,
-            "6": 2084433408.0,
-            "7": 2085503488.0,
-            "8": 2085503488.0,
-            "9": 2085503488.0,
-            "10": 2085503488.0,
-            "11": 2085503488.0,
-            "12": 2085503488.0,
-            "13": 2085503488.0,
-            "14": 2085503488.0,
-            "15": 2085503488.0,
-            "16": 2085503488.0,
-            "17": 2085503488.0,
-            "18": 2085503488.0,
-            "19": 2085503488.0,
-            "20": 2085503488.0,
-            "21": 2085503488.0,
-            "22": 2085503488.0,
-            "23": 2085503488.0,
-            "24": 2085503488.0,
-            "25": 2085503488.0,
-            "26": 2085503488.0,
-            "27": 2085503488.0,
-            "28": 2085503488.0,
-            "29": 2085503488.0,
-            "30": 2085503488.0,
-            "31": 2085503488.0,
-            "32": 2085503488.0,
-            "33": 2085503488.0,
-            "34": 2085503488.0,
-            "35": 2085503488.0,
-            "36": 2085503488.0,
-            "37": 2085503488.0,
-            "38": 2085503488.0,
-            "39": 2085503488.0,
-            "40": 2085503488.0,
-            "41": 2085503488.0,
-            "42": 2085503488.0,
-            "43": 2085503488.0,
-            "44": 2085503488.0,
-            "45": 2085503488.0,
-            "46": 2085503488.0,
-            "47": 2085503488.0,
-            "48": 2085503488.0,
-            "49": 2085503488.0,
-            "50": 2085503488.0
+            "1": 1845331456.0,
+            "2": 2075684352.0,
+            "3": 2075684352.0,
+            "4": 2078547456.0,
+            "5": 2078547456.0,
+            "6": 2078547456.0,
+            "7": 2078547456.0,
+            "8": 2078547456.0,
+            "9": 2078547456.0,
+            "10": 2078547456.0,
+            "11": 2078547456.0,
+            "12": 2078547456.0,
+            "13": 2078547456.0,
+            "14": 2078547456.0,
+            "15": 2078547456.0,
+            "16": 2078547456.0,
+            "17": 2078547456.0,
+            "18": 2078547456.0,
+            "19": 2078547456.0,
+            "20": 2078547456.0,
+            "21": 2078547456.0,
+            "22": 2078547456.0,
+            "23": 2078547456.0,
+            "24": 2078547456.0,
+            "25": 2078547456.0,
+            "26": 2078547456.0,
+            "27": 2078547456.0,
+            "28": 2078547456.0,
+            "29": 2078547456.0,
+            "30": 2078547456.0,
+            "31": 2078547456.0,
+            "32": 2078547456.0,
+            "33": 2078547456.0,
+            "34": 2078547456.0,
+            "35": 2078547456.0,
+            "36": 2078547456.0,
+            "37": 2078547456.0,
+            "38": 2078547456.0,
+            "39": 2078547456.0,
+            "40": 2078547456.0,
+            "41": 2078547456.0,
+            "42": 2078547456.0,
+            "43": 2078547456.0,
+            "44": 2078547456.0,
+            "45": 2078547456.0,
+            "46": 2078547456.0,
+            "47": 2078547456.0,
+            "48": 2078547456.0,
+            "49": 2078547456.0,
+            "50": 2078547456.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 13.92506,
-            "2": 0.34079,
-            "3": 0.28891,
-            "4": 0.30652,
-            "5": 0.27326,
-            "6": 0.26908,
-            "7": 0.28337,
-            "8": 0.26429,
-            "9": 0.27048,
-            "10": 0.26866,
-            "11": 0.28689,
-            "12": 0.25961,
-            "13": 0.26511,
-            "14": 0.26065,
-            "15": 0.27834,
-            "16": 0.26398,
-            "17": 0.26064,
-            "18": 0.26661,
-            "19": 0.26487,
-            "20": 0.27686,
-            "21": 0.26249,
-            "22": 0.2677,
-            "23": 0.26859,
-            "24": 0.26049,
-            "25": 0.26086,
-            "26": 0.26279,
-            "27": 0.25983,
-            "28": 0.26561,
-            "29": 0.26345,
-            "30": 0.26142,
-            "31": 0.30613,
-            "32": 0.26049,
-            "33": 0.26142,
-            "34": 0.27278,
-            "35": 0.25691,
-            "36": 0.26151,
-            "37": 0.25654,
-            "38": 0.25753,
-            "39": 0.2576,
-            "40": 0.25839,
-            "41": 0.27219,
-            "42": 0.25851,
-            "43": 0.2668,
-            "44": 0.26229,
-            "45": 0.27182,
-            "46": 0.27691,
-            "47": 0.26299,
-            "48": 0.27152,
-            "49": 0.31513,
-            "50": 0.25813
+            "1": 14.22688,
+            "2": 0.36404,
+            "3": 0.28777,
+            "4": 0.27054,
+            "5": 0.24844,
+            "6": 0.23753,
+            "7": 0.2541,
+            "8": 0.2395,
+            "9": 0.23675,
+            "10": 0.23301,
+            "11": 0.25454,
+            "12": 0.22665,
+            "13": 0.23214,
+            "14": 0.22521,
+            "15": 0.24748,
+            "16": 0.2636,
+            "17": 0.2605,
+            "18": 0.24164,
+            "19": 0.24627,
+            "20": 0.25668,
+            "21": 0.24329,
+            "22": 0.24722,
+            "23": 0.25378,
+            "24": 0.22642,
+            "25": 0.22497,
+            "26": 0.22495,
+            "27": 0.2239,
+            "28": 0.22848,
+            "29": 0.22515,
+            "30": 0.22501,
+            "31": 0.27252,
+            "32": 0.22744,
+            "33": 0.22453,
+            "34": 0.23411,
+            "35": 0.22556,
+            "36": 0.2278,
+            "37": 0.22109,
+            "38": 0.22459,
+            "39": 0.22077,
+            "40": 0.22097,
+            "41": 0.23428,
+            "42": 0.22009,
+            "43": 0.23227,
+            "44": 0.22717,
+            "45": 0.23445,
+            "46": 0.23886,
+            "47": 0.22667,
+            "48": 0.23204,
+            "49": 0.27864,
+            "50": 0.22287
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json
index 478bae6fdec..126f22e3d75 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json
@@ -41,19 +41,19 @@
             "35": 10.31503,
             "36": 10.28536,
             "37": 10.38742,
-            "38": 10.24676,
-            "39": 10.44249,
-            "40": 10.14367,
-            "41": 10.19116,
-            "42": 10.25654,
-            "43": 9.90671,
-            "44": 10.02653,
-            "45": 9.914,
-            "46": 9.89613,
-            "47": 10.18885,
-            "48": 9.92993,
-            "49": 9.61419,
-            "50": 9.97565
+            "38": 10.24674,
+            "39": 10.44222,
+            "40": 10.14384,
+            "41": 10.19169,
+            "42": 10.25683,
+            "43": 9.90704,
+            "44": 10.02666,
+            "45": 9.91412,
+            "46": 9.89643,
+            "47": 10.18881,
+            "48": 9.93025,
+            "49": 9.61398,
+            "50": 9.97515
         }
     },
     "num-zeros": {
@@ -98,19 +98,19 @@
             "35": 15183.0,
             "36": 15837.0,
             "37": 17507.0,
-            "38": 16617.0,
-            "39": 17712.0,
-            "40": 16971.0,
-            "41": 16795.0,
-            "42": 17304.0,
-            "43": 15578.0,
-            "44": 15564.0,
-            "45": 16188.0,
-            "46": 17443.0,
-            "47": 19238.0,
-            "48": 16575.0,
-            "49": 16273.0,
-            "50": 18998.0
+            "38": 16382.0,
+            "39": 18071.0,
+            "40": 16755.0,
+            "41": 16757.0,
+            "42": 17222.0,
+            "43": 15308.0,
+            "44": 15173.0,
+            "45": 16243.0,
+            "46": 17454.0,
+            "47": 19165.0,
+            "48": 16552.0,
+            "49": 16282.0,
+            "50": 19162.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 629738496.0,
-            "2": 629705216.0,
-            "3": 629710336.0,
-            "4": 629699584.0,
-            "5": 629732864.0,
-            "6": 629703168.0,
-            "7": 629718528.0,
-            "8": 629722624.0,
-            "9": 629763072.0,
-            "10": 629733888.0,
-            "11": 629810176.0,
-            "12": 629705728.0,
-            "13": 629848576.0,
-            "14": 629702144.0,
-            "15": 629870592.0,
-            "16": 629805568.0,
-            "17": 629698048.0,
-            "18": 629731328.0,
-            "19": 629798912.0,
-            "20": 629752832.0,
-            "21": 629716480.0,
-            "22": 629699584.0,
-            "23": 629705216.0,
-            "24": 629736448.0,
-            "25": 629699584.0,
-            "26": 629736960.0,
-            "27": 629704192.0,
-            "28": 629750272.0,
-            "29": 629728256.0,
-            "30": 629933568.0,
-            "31": 629847040.0,
-            "32": 629700096.0,
-            "33": 629703168.0,
-            "34": 629752832.0,
-            "35": 629725696.0,
-            "36": 629724160.0,
-            "37": 629702656.0,
-            "38": 629704192.0,
-            "39": 629733888.0,
-            "40": 629749760.0,
-            "41": 629700096.0,
-            "42": 629729280.0,
-            "43": 629699072.0,
-            "44": 629769728.0,
-            "45": 629713920.0,
-            "46": 629804544.0,
-            "47": 629719552.0,
-            "48": 629843456.0,
-            "49": 630007296.0,
-            "50": 629703168.0
+            "1": 625695744.0,
+            "2": 625700352.0,
+            "3": 625698304.0,
+            "4": 625694720.0,
+            "5": 625771008.0,
+            "6": 625698304.0,
+            "7": 625713664.0,
+            "8": 625698304.0,
+            "9": 625696768.0,
+            "10": 626427392.0,
+            "11": 626528256.0,
+            "12": 625700864.0,
+            "13": 625701376.0,
+            "14": 625740288.0,
+            "15": 625700864.0,
+            "16": 625891840.0,
+            "17": 625693184.0,
+            "18": 625699840.0,
+            "19": 625699840.0,
+            "20": 625699840.0,
+            "21": 625711616.0,
+            "22": 625694720.0,
+            "23": 626073088.0,
+            "24": 626040832.0,
+            "25": 626703360.0,
+            "26": 625732096.0,
+            "27": 625732096.0,
+            "28": 625745408.0,
+            "29": 625777664.0,
+            "30": 625699328.0,
+            "31": 625959936.0,
+            "32": 625695232.0,
+            "33": 625698304.0,
+            "34": 625747968.0,
+            "35": 625720832.0,
+            "36": 625694720.0,
+            "37": 625883136.0,
+            "38": 625796096.0,
+            "39": 625697280.0,
+            "40": 625727488.0,
+            "41": 625707520.0,
+            "42": 625724416.0,
+            "43": 625731584.0,
+            "44": 625759232.0,
+            "45": 625696256.0,
+            "46": 625780224.0,
+            "47": 625701888.0,
+            "48": 625842688.0,
+            "49": 626536960.0,
+            "50": 625698816.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1847654400.0,
-            "2": 2077632000.0,
-            "3": 2078750208.0,
-            "4": 2078750208.0,
-            "5": 2079077888.0,
-            "6": 2079077888.0,
-            "7": 2080535040.0,
-            "8": 2080535040.0,
-            "9": 2080535040.0,
-            "10": 2080535040.0,
-            "11": 2080535040.0,
-            "12": 2080535040.0,
-            "13": 2080535040.0,
-            "14": 2080535040.0,
-            "15": 2080535040.0,
-            "16": 2080535040.0,
-            "17": 2080535040.0,
-            "18": 2080535040.0,
-            "19": 2080535040.0,
-            "20": 2080535040.0,
-            "21": 2080535040.0,
-            "22": 2080535040.0,
-            "23": 2080535040.0,
-            "24": 2080535040.0,
-            "25": 2080535040.0,
-            "26": 2080535040.0,
-            "27": 2080535040.0,
-            "28": 2080535040.0,
-            "29": 2080535040.0,
-            "30": 2080535040.0,
-            "31": 2080535040.0,
-            "32": 2080535040.0,
-            "33": 2080535040.0,
-            "34": 2080535040.0,
-            "35": 2080535040.0,
-            "36": 2080535040.0,
-            "37": 2080535040.0,
-            "38": 2080535040.0,
-            "39": 2080535040.0,
-            "40": 2080535040.0,
-            "41": 2080535040.0,
-            "42": 2080535040.0,
-            "43": 2080535040.0,
-            "44": 2080535040.0,
-            "45": 2080535040.0,
-            "46": 2080535040.0,
-            "47": 2080535040.0,
-            "48": 2080535040.0,
-            "49": 2080535040.0,
-            "50": 2080535040.0
+            "1": 1843612672.0,
+            "2": 2073786880.0,
+            "3": 2074433024.0,
+            "4": 2074433024.0,
+            "5": 2074806784.0,
+            "6": 2075118080.0,
+            "7": 2076633600.0,
+            "8": 2076633600.0,
+            "9": 2076633600.0,
+            "10": 2076633600.0,
+            "11": 2076633600.0,
+            "12": 2076633600.0,
+            "13": 2076673536.0,
+            "14": 2076673536.0,
+            "15": 2076673536.0,
+            "16": 2076673536.0,
+            "17": 2076673536.0,
+            "18": 2076673536.0,
+            "19": 2076673536.0,
+            "20": 2076673536.0,
+            "21": 2076673536.0,
+            "22": 2076673536.0,
+            "23": 2076673536.0,
+            "24": 2076673536.0,
+            "25": 2076673536.0,
+            "26": 2076673536.0,
+            "27": 2076673536.0,
+            "28": 2076673536.0,
+            "29": 2076673536.0,
+            "30": 2076673536.0,
+            "31": 2076673536.0,
+            "32": 2076673536.0,
+            "33": 2076673536.0,
+            "34": 2076673536.0,
+            "35": 2076673536.0,
+            "36": 2076673536.0,
+            "37": 2076673536.0,
+            "38": 2076673536.0,
+            "39": 2076673536.0,
+            "40": 2076673536.0,
+            "41": 2076673536.0,
+            "42": 2076673536.0,
+            "43": 2076673536.0,
+            "44": 2076673536.0,
+            "45": 2076673536.0,
+            "46": 2076673536.0,
+            "47": 2076673536.0,
+            "48": 2076673536.0,
+            "49": 2076673536.0,
+            "50": 2076673536.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 10.46534,
-            "2": 0.4102,
-            "3": 0.34219,
-            "4": 0.34093,
-            "5": 0.34255,
-            "6": 0.33706,
-            "7": 0.33661,
-            "8": 0.33616,
-            "9": 0.33565,
-            "10": 0.33568,
-            "11": 0.33538,
-            "12": 0.33457,
-            "13": 0.33488,
-            "14": 0.33416,
-            "15": 0.33683,
-            "16": 0.33684,
-            "17": 0.33708,
-            "18": 0.33815,
-            "19": 0.33805,
-            "20": 0.33696,
-            "21": 0.33675,
-            "22": 0.33623,
-            "23": 0.33752,
-            "24": 0.33699,
-            "25": 0.3409,
-            "26": 0.33513,
-            "27": 0.33524,
-            "28": 0.33491,
-            "29": 0.33714,
-            "30": 0.33571,
-            "31": 0.33638,
-            "32": 0.33629,
-            "33": 0.3369,
-            "34": 0.33685,
-            "35": 0.33651,
-            "36": 0.33539,
-            "37": 0.33561,
-            "38": 0.33636,
-            "39": 0.33558,
-            "40": 0.3356,
-            "41": 0.33618,
-            "42": 0.33669,
-            "43": 0.33535,
-            "44": 0.3362,
-            "45": 0.3354,
-            "46": 0.33686,
-            "47": 0.33486,
-            "48": 0.33657,
-            "49": 0.33563,
-            "50": 0.33513
+            "1": 6.70836,
+            "2": 0.3903,
+            "3": 0.34658,
+            "4": 0.33174,
+            "5": 0.33024,
+            "6": 0.32826,
+            "7": 0.32764,
+            "8": 0.32869,
+            "9": 0.32788,
+            "10": 0.3286,
+            "11": 0.32808,
+            "12": 0.33088,
+            "13": 0.32722,
+            "14": 0.32709,
+            "15": 0.32599,
+            "16": 0.32627,
+            "17": 0.32568,
+            "18": 0.32553,
+            "19": 0.32587,
+            "20": 0.32614,
+            "21": 0.32643,
+            "22": 0.32599,
+            "23": 0.32625,
+            "24": 0.32672,
+            "25": 0.32482,
+            "26": 0.32493,
+            "27": 0.32669,
+            "28": 0.32628,
+            "29": 0.32713,
+            "30": 0.32658,
+            "31": 0.32584,
+            "32": 0.32655,
+            "33": 0.3257,
+            "34": 0.32557,
+            "35": 0.3265,
+            "36": 0.32561,
+            "37": 0.32526,
+            "38": 0.32485,
+            "39": 0.32759,
+            "40": 0.32685,
+            "41": 0.32691,
+            "42": 0.32612,
+            "43": 0.32555,
+            "44": 0.32643,
+            "45": 0.32699,
+            "46": 0.32711,
+            "47": 0.32611,
+            "48": 0.32765,
+            "49": 0.32669,
+            "50": 0.32485
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
index d763069b566..c93bd4367f3 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
@@ -48,7 +48,6 @@ MODEL_ARGS:
   --use-checkpoint-opt_param-scheduler: true
   --use-mcore-models: true
   --ckpt-format: torch_dist
-  --dist-ckpt-save-pre-mcore-014: true
   --dist-ckpt-strictness: log_all # backward compatibility for TE changes
   --data-cache-path: ${DATA_CACHE_PATH}
   --fp16: true
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..d9441fb83aa
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80397,
+            "2": 10.81064,
+            "3": 10.798,
+            "4": 10.78655,
+            "5": 10.8299,
+            "6": 10.85307,
+            "7": 10.80967,
+            "8": 10.8015,
+            "9": 10.82484,
+            "10": 10.78237,
+            "11": 10.83805,
+            "12": 10.84658,
+            "13": 10.86137,
+            "14": 10.86451,
+            "15": 10.83791,
+            "16": 10.83385,
+            "17": 10.81249,
+            "18": 10.84432,
+            "19": 10.83623,
+            "20": 10.8168,
+            "21": 10.83551,
+            "22": 10.76274,
+            "23": 10.85573,
+            "24": 10.8062,
+            "25": 10.80042,
+            "26": 10.8143,
+            "27": 10.82932,
+            "28": 10.8546,
+            "29": 10.86594,
+            "30": 10.79379,
+            "31": 10.74776,
+            "32": 10.84932,
+            "33": 10.83727,
+            "34": 10.80597,
+            "35": 10.80259,
+            "36": 10.79662,
+            "37": 10.82559,
+            "38": 10.79231,
+            "39": 10.84778,
+            "40": 10.77804,
+            "41": 10.79895,
+            "42": 10.81493,
+            "43": 10.74316,
+            "44": 10.76656,
+            "45": 10.76408,
+            "46": 10.7768,
+            "47": 10.79908,
+            "48": 10.77572,
+            "49": 10.72207,
+            "50": 10.78609,
+            "51": 10.78712,
+            "52": 10.7653,
+            "53": 10.81235,
+            "54": 10.79776,
+            "55": 10.8072,
+            "56": 10.7562,
+            "57": 10.71334,
+            "58": 10.78166,
+            "59": 10.75039,
+            "60": 10.72977,
+            "61": 10.76435,
+            "62": 10.81299,
+            "63": 10.69266,
+            "64": 10.76646,
+            "65": 10.62484,
+            "66": 10.75371,
+            "67": 10.69118,
+            "68": 10.77122,
+            "69": 10.76048,
+            "70": 10.76506,
+            "71": 10.73497,
+            "72": 10.72999,
+            "73": 10.71715,
+            "74": 10.57819,
+            "75": 10.68208,
+            "76": 10.6133,
+            "77": 10.80786,
+            "78": 10.73142,
+            "79": 10.66063,
+            "80": 10.68014,
+            "81": 10.69828,
+            "82": 10.72277,
+            "83": 10.64104,
+            "84": 10.66223,
+            "85": 10.70251,
+            "86": 10.57982,
+            "87": 10.69083,
+            "88": 10.73435,
+            "89": 10.67796,
+            "90": 10.74299,
+            "91": 10.62241,
+            "92": 10.64011,
+            "93": 10.56628,
+            "94": 10.49922,
+            "95": 10.65675,
+            "96": 10.65892,
+            "97": 10.57941,
+            "98": 10.67242,
+            "99": 10.47965,
+            "100": 10.59346
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1158.0,
+            "2": 1181.0,
+            "3": 1227.0,
+            "4": 1263.0,
+            "5": 1308.0,
+            "6": 1594.0,
+            "7": 1345.0,
+            "8": 1253.0,
+            "9": 1239.0,
+            "10": 1232.0,
+            "11": 1255.0,
+            "12": 1250.0,
+            "13": 1516.0,
+            "14": 1270.0,
+            "15": 1335.0,
+            "16": 1260.0,
+            "17": 1249.0,
+            "18": 1261.0,
+            "19": 1099.0,
+            "20": 1387.0,
+            "21": 1298.0,
+            "22": 1286.0,
+            "23": 1315.0,
+            "24": 1131.0,
+            "25": 1157.0,
+            "26": 1166.0,
+            "27": 1133.0,
+            "28": 1243.0,
+            "29": 1348.0,
+            "30": 1235.0,
+            "31": 1090.0,
+            "32": 1272.0,
+            "33": 1355.0,
+            "34": 1161.0,
+            "35": 1159.0,
+            "36": 1146.0,
+            "37": 1222.0,
+            "38": 1418.0,
+            "39": 1273.0,
+            "40": 1198.0,
+            "41": 1160.0,
+            "42": 1285.0,
+            "43": 1094.0,
+            "44": 1127.0,
+            "45": 1130.0,
+            "46": 1183.0,
+            "47": 1312.0,
+            "48": 1238.0,
+            "49": 1068.0,
+            "50": 1163.0,
+            "51": 1234.0,
+            "52": 1284.0,
+            "53": 1380.0,
+            "54": 1191.0,
+            "55": 1099.0,
+            "56": 1298.0,
+            "57": 1241.0,
+            "58": 1203.0,
+            "59": 1324.0,
+            "60": 1254.0,
+            "61": 1120.0,
+            "62": 1362.0,
+            "63": 1213.0,
+            "64": 1330.0,
+            "65": 1057.0,
+            "66": 1171.0,
+            "67": 1208.0,
+            "68": 1320.0,
+            "69": 1304.0,
+            "70": 1122.0,
+            "71": 1259.0,
+            "72": 1254.0,
+            "73": 1203.0,
+            "74": 1125.0,
+            "75": 1413.0,
+            "76": 1217.0,
+            "77": 1412.0,
+            "78": 1291.0,
+            "79": 1020.0,
+            "80": 1143.0,
+            "81": 1243.0,
+            "82": 1154.0,
+            "83": 1052.0,
+            "84": 1219.0,
+            "85": 1360.0,
+            "86": 1072.0,
+            "87": 1319.0,
+            "88": 1347.0,
+            "89": 1127.0,
+            "90": 1474.0,
+            "91": 1140.0,
+            "92": 1110.0,
+            "93": 924.0,
+            "94": 1062.0,
+            "95": 1147.0,
+            "96": 1128.0,
+            "97": 1099.0,
+            "98": 1191.0,
+            "99": 1071.0,
+            "100": 1214.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 993802240.0,
+            "2": 993814528.0,
+            "3": 993790464.0,
+            "4": 993820160.0,
+            "5": 993728512.0,
+            "6": 993758208.0,
+            "7": 993780224.0,
+            "8": 993795584.0,
+            "9": 993800704.0,
+            "10": 993771520.0,
+            "11": 993752064.0,
+            "12": 993737216.0,
+            "13": 993804800.0,
+            "14": 993734144.0,
+            "15": 993773056.0,
+            "16": 993763840.0,
+            "17": 993744384.0,
+            "18": 993743872.0,
+            "19": 993764864.0,
+            "20": 993719296.0,
+            "21": 993772544.0,
+            "22": 993796096.0,
+            "23": 993748992.0,
+            "24": 993819136.0,
+            "25": 993792512.0,
+            "26": 993776640.0,
+            "27": 993804800.0,
+            "28": 993701888.0,
+            "29": 993717248.0,
+            "30": 993766912.0,
+            "31": 993802240.0,
+            "32": 993705984.0,
+            "33": 993759744.0,
+            "34": 993780224.0,
+            "35": 993740288.0,
+            "36": 993748480.0,
+            "37": 993697280.0,
+            "38": 993763840.0,
+            "39": 993747456.0,
+            "40": 993781248.0,
+            "41": 993752064.0,
+            "42": 993757696.0,
+            "43": 993793024.0,
+            "44": 993833984.0,
+            "45": 993780736.0,
+            "46": 993798144.0,
+            "47": 993789440.0,
+            "48": 993793024.0,
+            "49": 993743360.0,
+            "50": 993754624.0,
+            "51": 993786368.0,
+            "52": 993749504.0,
+            "53": 993812992.0,
+            "54": 993750528.0,
+            "55": 993732608.0,
+            "56": 993777664.0,
+            "57": 993859584.0,
+            "58": 993849856.0,
+            "59": 993761792.0,
+            "60": 993774592.0,
+            "61": 993771520.0,
+            "62": 993786880.0,
+            "63": 993787904.0,
+            "64": 993761280.0,
+            "65": 993792000.0,
+            "66": 993746432.0,
+            "67": 993782784.0,
+            "68": 993783808.0,
+            "69": 993741824.0,
+            "70": 993747968.0,
+            "71": 993736192.0,
+            "72": 993762304.0,
+            "73": 993784832.0,
+            "74": 993809920.0,
+            "75": 993753088.0,
+            "76": 993797632.0,
+            "77": 993720832.0,
+            "78": 993729536.0,
+            "79": 993730560.0,
+            "80": 993763840.0,
+            "81": 993728000.0,
+            "82": 993711616.0,
+            "83": 993772544.0,
+            "84": 993782784.0,
+            "85": 993787392.0,
+            "86": 993804288.0,
+            "87": 993737728.0,
+            "88": 993731584.0,
+            "89": 993755136.0,
+            "90": 993742848.0,
+            "91": 993763840.0,
+            "92": 993774080.0,
+            "93": 993792000.0,
+            "94": 993779712.0,
+            "95": 993723904.0,
+            "96": 993714688.0,
+            "97": 993752064.0,
+            "98": 993708544.0,
+            "99": 993792000.0,
+            "100": 993812992.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3074847232.0,
+            "2": 3373968896.0,
+            "3": 3373968896.0,
+            "4": 3378071552.0,
+            "5": 3378071552.0,
+            "6": 3378071552.0,
+            "7": 3378071552.0,
+            "8": 3378071552.0,
+            "9": 3378071552.0,
+            "10": 3378071552.0,
+            "11": 3378071552.0,
+            "12": 3378071552.0,
+            "13": 3378071552.0,
+            "14": 3378071552.0,
+            "15": 3378071552.0,
+            "16": 3378071552.0,
+            "17": 3378071552.0,
+            "18": 3378071552.0,
+            "19": 3378071552.0,
+            "20": 3378071552.0,
+            "21": 3378071552.0,
+            "22": 3378071552.0,
+            "23": 3378071552.0,
+            "24": 3378894848.0,
+            "25": 3378894848.0,
+            "26": 3378894848.0,
+            "27": 3378894848.0,
+            "28": 3378894848.0,
+            "29": 3378894848.0,
+            "30": 3378894848.0,
+            "31": 3378894848.0,
+            "32": 3378894848.0,
+            "33": 3378894848.0,
+            "34": 3378894848.0,
+            "35": 3378894848.0,
+            "36": 3378894848.0,
+            "37": 3378894848.0,
+            "38": 3378894848.0,
+            "39": 3378894848.0,
+            "40": 3378894848.0,
+            "41": 3378894848.0,
+            "42": 3378894848.0,
+            "43": 3378894848.0,
+            "44": 3392753152.0,
+            "45": 3392753152.0,
+            "46": 3392753152.0,
+            "47": 3392753152.0,
+            "48": 3392753152.0,
+            "49": 3392753152.0,
+            "50": 3392753152.0,
+            "51": 3392753152.0,
+            "52": 3392753152.0,
+            "53": 3392753152.0,
+            "54": 3392753152.0,
+            "55": 3392753152.0,
+            "56": 3392753152.0,
+            "57": 3407671808.0,
+            "58": 3407671808.0,
+            "59": 3407671808.0,
+            "60": 3407671808.0,
+            "61": 3407671808.0,
+            "62": 3407671808.0,
+            "63": 3407671808.0,
+            "64": 3407671808.0,
+            "65": 3407671808.0,
+            "66": 3407671808.0,
+            "67": 3407671808.0,
+            "68": 3407671808.0,
+            "69": 3407671808.0,
+            "70": 3407671808.0,
+            "71": 3407671808.0,
+            "72": 3407671808.0,
+            "73": 3407671808.0,
+            "74": 3407671808.0,
+            "75": 3407671808.0,
+            "76": 3407671808.0,
+            "77": 3407671808.0,
+            "78": 3407671808.0,
+            "79": 3407671808.0,
+            "80": 3407671808.0,
+            "81": 3407671808.0,
+            "82": 3407671808.0,
+            "83": 3407671808.0,
+            "84": 3407671808.0,
+            "85": 3407671808.0,
+            "86": 3407671808.0,
+            "87": 3407671808.0,
+            "88": 3407671808.0,
+            "89": 3407671808.0,
+            "90": 3407671808.0,
+            "91": 3407671808.0,
+            "92": 3407671808.0,
+            "93": 3407671808.0,
+            "94": 3407671808.0,
+            "95": 3407671808.0,
+            "96": 3407671808.0,
+            "97": 3407671808.0,
+            "98": 3407671808.0,
+            "99": 3407671808.0,
+            "100": 3407671808.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 3.79579,
+            "3": 0.25267,
+            "4": 0.22623,
+            "5": 0.22831,
+            "6": 0.57471,
+            "7": 0.22109,
+            "8": 0.21732,
+            "9": 0.2127,
+            "10": 0.20981,
+            "11": 0.21771,
+            "12": 0.23313,
+            "13": 0.20775,
+            "14": 0.19946,
+            "15": 0.21125,
+            "16": 0.2099,
+            "17": 0.20543,
+            "18": 0.19972,
+            "19": 0.20265,
+            "20": 0.20005,
+            "21": 0.20188,
+            "22": 0.19675,
+            "23": 0.19822,
+            "24": 0.19828,
+            "25": 0.19827,
+            "26": 0.19789,
+            "27": 0.20238,
+            "28": 0.19366,
+            "29": 0.19297,
+            "30": 0.19521,
+            "31": 0.19886,
+            "32": 0.19176,
+            "33": 0.19628,
+            "34": 0.19156,
+            "35": 0.19683,
+            "36": 0.19061,
+            "37": 0.19031,
+            "38": 0.19383,
+            "39": 0.1966,
+            "40": 0.19152,
+            "41": 0.18691,
+            "42": 0.1917,
+            "43": 0.20258,
+            "44": 0.19552,
+            "45": 0.20203,
+            "46": 0.18769,
+            "47": 0.18872,
+            "48": 0.18493,
+            "49": 0.18884,
+            "50": 0.18824,
+            "51": 0.20579,
+            "52": 0.18818,
+            "53": 0.18827,
+            "54": 0.19298,
+            "55": 0.57299,
+            "56": 0.18813,
+            "57": 0.18557,
+            "58": 0.18597,
+            "59": 0.18577,
+            "60": 0.18756,
+            "61": 0.18972,
+            "62": 0.18872,
+            "63": 0.18937,
+            "64": 0.1888,
+            "65": 0.19262,
+            "66": 0.1879,
+            "67": 0.18498,
+            "68": 0.18535,
+            "69": 0.19492,
+            "70": 0.1923,
+            "71": 0.18822,
+            "72": 0.19191,
+            "73": 0.19457,
+            "74": 0.19765,
+            "75": 0.19091,
+            "76": 0.73064,
+            "77": 0.19543,
+            "78": 0.19034,
+            "79": 0.18715,
+            "80": 0.19339,
+            "81": 0.19135,
+            "82": 0.18703,
+            "83": 0.19082,
+            "84": 0.18783,
+            "85": 0.1926,
+            "86": 0.19556,
+            "87": 0.19127,
+            "88": 0.19028,
+            "89": 0.56083,
+            "90": 0.19223,
+            "91": 0.18622,
+            "92": 0.18536,
+            "93": 0.19063,
+            "94": 0.18804,
+            "95": 0.18711,
+            "96": 0.1883,
+            "97": 0.19006,
+            "98": 0.18897,
+            "99": 0.60361,
+            "100": 0.19278
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..ccbece04f60
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81103,
+            "2": 10.83065,
+            "3": 10.82048,
+            "4": 10.81293,
+            "5": 10.84375,
+            "6": 10.8473,
+            "7": 10.85341,
+            "8": 10.83652,
+            "9": 10.84691,
+            "10": 10.78166,
+            "11": 10.85213,
+            "12": 10.8629,
+            "13": 10.85433,
+            "14": 10.88455,
+            "15": 10.87782,
+            "16": 10.84637,
+            "17": 10.83054,
+            "18": 10.86645,
+            "19": 10.84951,
+            "20": 10.84547,
+            "21": 10.8476,
+            "22": 10.79618,
+            "23": 10.88285,
+            "24": 10.83247,
+            "25": 10.8246,
+            "26": 10.8432,
+            "27": 10.85345,
+            "28": 10.87635,
+            "29": 10.864,
+            "30": 10.81293,
+            "31": 10.78651,
+            "32": 10.85541,
+            "33": 10.85587,
+            "34": 10.8491,
+            "35": 10.83747,
+            "36": 10.80362,
+            "37": 10.83812,
+            "38": 10.80509,
+            "39": 10.84183,
+            "40": 10.80312,
+            "41": 10.84012,
+            "42": 10.84384,
+            "43": 10.80987,
+            "44": 10.80275,
+            "45": 10.78691,
+            "46": 10.80833,
+            "47": 10.81704,
+            "48": 10.80337,
+            "49": 10.78131,
+            "50": 10.80305,
+            "51": 10.82235,
+            "52": 10.80371,
+            "53": 10.83231,
+            "54": 10.8151,
+            "55": 10.82578,
+            "56": 10.77729,
+            "57": 10.75325,
+            "58": 10.80742,
+            "59": 10.79087,
+            "60": 10.73998,
+            "61": 10.79954,
+            "62": 10.81284,
+            "63": 10.72011,
+            "64": 10.78598,
+            "65": 10.68981,
+            "66": 10.76066,
+            "67": 10.73402,
+            "68": 10.8022,
+            "69": 10.78312,
+            "70": 10.77711,
+            "71": 10.76626,
+            "72": 10.73591,
+            "73": 10.72919,
+            "74": 10.62192,
+            "75": 10.69079,
+            "76": 10.65398,
+            "77": 10.82162,
+            "78": 10.76368,
+            "79": 10.70473,
+            "80": 10.69368,
+            "81": 10.72419,
+            "82": 10.74233,
+            "83": 10.66786,
+            "84": 10.6983,
+            "85": 10.714,
+            "86": 10.6383,
+            "87": 10.71809,
+            "88": 10.73508,
+            "89": 10.7139,
+            "90": 10.74649,
+            "91": 10.64861,
+            "92": 10.64636,
+            "93": 10.60234,
+            "94": 10.53327,
+            "95": 10.66155,
+            "96": 10.67215,
+            "97": 10.61446,
+            "98": 10.68506,
+            "99": 10.52056,
+            "100": 10.61544
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1248.0,
+            "2": 1334.0,
+            "3": 1297.0,
+            "4": 1215.0,
+            "5": 1398.0,
+            "6": 1528.0,
+            "7": 1225.0,
+            "8": 1301.0,
+            "9": 1348.0,
+            "10": 1359.0,
+            "11": 1296.0,
+            "12": 1248.0,
+            "13": 1286.0,
+            "14": 1373.0,
+            "15": 1195.0,
+            "16": 1177.0,
+            "17": 1266.0,
+            "18": 1393.0,
+            "19": 1219.0,
+            "20": 1257.0,
+            "21": 1244.0,
+            "22": 1155.0,
+            "23": 1385.0,
+            "24": 1323.0,
+            "25": 1226.0,
+            "26": 1184.0,
+            "27": 1394.0,
+            "28": 1476.0,
+            "29": 1300.0,
+            "30": 1245.0,
+            "31": 1138.0,
+            "32": 1283.0,
+            "33": 1247.0,
+            "34": 1186.0,
+            "35": 1158.0,
+            "36": 1178.0,
+            "37": 1232.0,
+            "38": 1357.0,
+            "39": 1541.0,
+            "40": 1170.0,
+            "41": 1369.0,
+            "42": 1153.0,
+            "43": 1180.0,
+            "44": 1239.0,
+            "45": 1189.0,
+            "46": 1141.0,
+            "47": 1203.0,
+            "48": 1126.0,
+            "49": 1194.0,
+            "50": 1214.0,
+            "51": 1274.0,
+            "52": 1209.0,
+            "53": 1360.0,
+            "54": 1257.0,
+            "55": 1170.0,
+            "56": 1282.0,
+            "57": 1296.0,
+            "58": 1271.0,
+            "59": 1180.0,
+            "60": 1182.0,
+            "61": 1202.0,
+            "62": 1192.0,
+            "63": 1253.0,
+            "64": 1248.0,
+            "65": 1180.0,
+            "66": 1179.0,
+            "67": 1188.0,
+            "68": 1229.0,
+            "69": 1232.0,
+            "70": 1280.0,
+            "71": 1246.0,
+            "72": 1261.0,
+            "73": 1148.0,
+            "74": 1114.0,
+            "75": 1281.0,
+            "76": 1376.0,
+            "77": 1373.0,
+            "78": 1285.0,
+            "79": 1087.0,
+            "80": 1127.0,
+            "81": 1135.0,
+            "82": 1169.0,
+            "83": 1300.0,
+            "84": 1206.0,
+            "85": 1269.0,
+            "86": 1187.0,
+            "87": 1236.0,
+            "88": 1262.0,
+            "89": 1197.0,
+            "90": 1425.0,
+            "91": 1197.0,
+            "92": 1244.0,
+            "93": 1142.0,
+            "94": 971.0,
+            "95": 1281.0,
+            "96": 1243.0,
+            "97": 1145.0,
+            "98": 1288.0,
+            "99": 1286.0,
+            "100": 1212.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1095885312.0,
+            "2": 1095855104.0,
+            "3": 1095902720.0,
+            "4": 1095882752.0,
+            "5": 1095905792.0,
+            "6": 1095846912.0,
+            "7": 1095869952.0,
+            "8": 1095877120.0,
+            "9": 1095892480.0,
+            "10": 1095903232.0,
+            "11": 1095879168.0,
+            "12": 1095851008.0,
+            "13": 1095903232.0,
+            "14": 1095813120.0,
+            "15": 1095857152.0,
+            "16": 1095791104.0,
+            "17": 1095911936.0,
+            "18": 1095883264.0,
+            "19": 1095893504.0,
+            "20": 1095812096.0,
+            "21": 1095832064.0,
+            "22": 1095908864.0,
+            "23": 1095883776.0,
+            "24": 1095824384.0,
+            "25": 1095956480.0,
+            "26": 1095863808.0,
+            "27": 1095919104.0,
+            "28": 1095844864.0,
+            "29": 1095879168.0,
+            "30": 1095843840.0,
+            "31": 1095908352.0,
+            "32": 1095840768.0,
+            "33": 1095850496.0,
+            "34": 1095818240.0,
+            "35": 1095864832.0,
+            "36": 1095879680.0,
+            "37": 1095839232.0,
+            "38": 1095923200.0,
+            "39": 1095930880.0,
+            "40": 1095819264.0,
+            "41": 1095848448.0,
+            "42": 1095866880.0,
+            "43": 1095878656.0,
+            "44": 1095980544.0,
+            "45": 1095855104.0,
+            "46": 1095869952.0,
+            "47": 1095895040.0,
+            "48": 1095877632.0,
+            "49": 1095844352.0,
+            "50": 1095864320.0,
+            "51": 1095936000.0,
+            "52": 1095879680.0,
+            "53": 1095939584.0,
+            "54": 1095890432.0,
+            "55": 1095879168.0,
+            "56": 1095869440.0,
+            "57": 1095916544.0,
+            "58": 1095913984.0,
+            "59": 1095899136.0,
+            "60": 1095863296.0,
+            "61": 1095864320.0,
+            "62": 1095858176.0,
+            "63": 1095874048.0,
+            "64": 1095861760.0,
+            "65": 1095869952.0,
+            "66": 1095875584.0,
+            "67": 1095864832.0,
+            "68": 1095874048.0,
+            "69": 1095860224.0,
+            "70": 1095905280.0,
+            "71": 1095880192.0,
+            "72": 1095805440.0,
+            "73": 1095907840.0,
+            "74": 1095919616.0,
+            "75": 1095884800.0,
+            "76": 1095905792.0,
+            "77": 1095855616.0,
+            "78": 1095916544.0,
+            "79": 1095888384.0,
+            "80": 1095842304.0,
+            "81": 1095875584.0,
+            "82": 1095823872.0,
+            "83": 1095923712.0,
+            "84": 1095906304.0,
+            "85": 1095871488.0,
+            "86": 1095872512.0,
+            "87": 1095895552.0,
+            "88": 1095880192.0,
+            "89": 1095869440.0,
+            "90": 1095863296.0,
+            "91": 1095917056.0,
+            "92": 1095900160.0,
+            "93": 1095879680.0,
+            "94": 1095888896.0,
+            "95": 1095886848.0,
+            "96": 1095888384.0,
+            "97": 1095897088.0,
+            "98": 1095875584.0,
+            "99": 1095889408.0,
+            "100": 1095928320.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3260420096.0,
+            "2": 3582874112.0,
+            "3": 3616017408.0,
+            "4": 3616017408.0,
+            "5": 3616065536.0,
+            "6": 3616065536.0,
+            "7": 3616065536.0,
+            "8": 3616065536.0,
+            "9": 3616065536.0,
+            "10": 3619626496.0,
+            "11": 3619626496.0,
+            "12": 3619626496.0,
+            "13": 3619626496.0,
+            "14": 3619626496.0,
+            "15": 3619626496.0,
+            "16": 3619626496.0,
+            "17": 3619626496.0,
+            "18": 3619626496.0,
+            "19": 3619626496.0,
+            "20": 3619626496.0,
+            "21": 3619626496.0,
+            "22": 3619626496.0,
+            "23": 3619626496.0,
+            "24": 3619626496.0,
+            "25": 3648242176.0,
+            "26": 3648242176.0,
+            "27": 3648242176.0,
+            "28": 3648242176.0,
+            "29": 3648242176.0,
+            "30": 3648242176.0,
+            "31": 3648242176.0,
+            "32": 3648242176.0,
+            "33": 3648242176.0,
+            "34": 3648242176.0,
+            "35": 3648242176.0,
+            "36": 3648242176.0,
+            "37": 3648242176.0,
+            "38": 3648242176.0,
+            "39": 3648242176.0,
+            "40": 3648242176.0,
+            "41": 3648242176.0,
+            "42": 3648242176.0,
+            "43": 3648242176.0,
+            "44": 3665209344.0,
+            "45": 3665209344.0,
+            "46": 3665209344.0,
+            "47": 3665209344.0,
+            "48": 3665209344.0,
+            "49": 3665209344.0,
+            "50": 3665209344.0,
+            "51": 3665209344.0,
+            "52": 3665209344.0,
+            "53": 3665209344.0,
+            "54": 3665209344.0,
+            "55": 3665209344.0,
+            "56": 3665209344.0,
+            "57": 3665209344.0,
+            "58": 3665209344.0,
+            "59": 3665209344.0,
+            "60": 3665209344.0,
+            "61": 3665209344.0,
+            "62": 3665209344.0,
+            "63": 3665209344.0,
+            "64": 3665209344.0,
+            "65": 3665209344.0,
+            "66": 3665209344.0,
+            "67": 3665209344.0,
+            "68": 3665209344.0,
+            "69": 3665209344.0,
+            "70": 3665209344.0,
+            "71": 3665209344.0,
+            "72": 3665209344.0,
+            "73": 3665209344.0,
+            "74": 3665209344.0,
+            "75": 3665209344.0,
+            "76": 3665209344.0,
+            "77": 3665209344.0,
+            "78": 3665209344.0,
+            "79": 3665209344.0,
+            "80": 3665209344.0,
+            "81": 3665209344.0,
+            "82": 3665209344.0,
+            "83": 3665209344.0,
+            "84": 3665209344.0,
+            "85": 3665209344.0,
+            "86": 3665209344.0,
+            "87": 3665209344.0,
+            "88": 3665209344.0,
+            "89": 3665209344.0,
+            "90": 3665209344.0,
+            "91": 3665209344.0,
+            "92": 3665209344.0,
+            "93": 3665209344.0,
+            "94": 3665209344.0,
+            "95": 3665209344.0,
+            "96": 3665209344.0,
+            "97": 3665209344.0,
+            "98": 3665209344.0,
+            "99": 3665209344.0,
+            "100": 3665209344.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 6.96692,
+            "3": 0.41239,
+            "4": 0.39161,
+            "5": 0.40475,
+            "6": 0.3904,
+            "7": 0.39424,
+            "8": 0.38721,
+            "9": 0.37766,
+            "10": 0.38826,
+            "11": 0.39241,
+            "12": 0.37744,
+            "13": 0.37937,
+            "14": 0.39891,
+            "15": 0.39154,
+            "16": 0.38546,
+            "17": 0.36906,
+            "18": 0.37961,
+            "19": 0.37168,
+            "20": 0.37856,
+            "21": 0.37322,
+            "22": 0.36901,
+            "23": 0.36962,
+            "24": 0.37071,
+            "25": 0.36454,
+            "26": 0.37164,
+            "27": 0.35661,
+            "28": 0.36072,
+            "29": 0.37992,
+            "30": 0.35418,
+            "31": 0.35828,
+            "32": 0.35863,
+            "33": 0.36304,
+            "34": 0.34938,
+            "35": 0.36044,
+            "36": 0.3661,
+            "37": 0.36694,
+            "38": 0.37046,
+            "39": 0.37481,
+            "40": 0.37606,
+            "41": 0.35942,
+            "42": 0.35928,
+            "43": 0.82934,
+            "44": 0.36187,
+            "45": 0.36124,
+            "46": 0.35574,
+            "47": 0.36316,
+            "48": 0.36376,
+            "49": 0.35682,
+            "50": 0.36509,
+            "51": 0.36781,
+            "52": 0.36533,
+            "53": 0.85049,
+            "54": 0.36057,
+            "55": 0.3565,
+            "56": 0.3743,
+            "57": 0.36606,
+            "58": 0.36355,
+            "59": 0.36215,
+            "60": 0.36264,
+            "61": 0.36287,
+            "62": 0.35671,
+            "63": 0.3661,
+            "64": 0.35095,
+            "65": 0.38153,
+            "66": 0.35893,
+            "67": 0.37021,
+            "68": 0.35656,
+            "69": 0.35749,
+            "70": 0.3687,
+            "71": 0.35581,
+            "72": 0.36693,
+            "73": 0.35596,
+            "74": 0.361,
+            "75": 0.35439,
+            "76": 0.35584,
+            "77": 0.36297,
+            "78": 0.35272,
+            "79": 0.35409,
+            "80": 0.35974,
+            "81": 0.355,
+            "82": 0.35692,
+            "83": 0.3617,
+            "84": 0.36038,
+            "85": 0.36694,
+            "86": 0.36667,
+            "87": 0.36782,
+            "88": 0.37457,
+            "89": 0.36585,
+            "90": 0.37116,
+            "91": 0.36385,
+            "92": 0.3564,
+            "93": 0.36251,
+            "94": 0.35477,
+            "95": 0.35372,
+            "96": 0.8695,
+            "97": 0.35034,
+            "98": 0.36289,
+            "99": 0.35766,
+            "100": 0.35116
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..59528111109
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.82235,
+            "52": 10.80371,
+            "53": 10.83231,
+            "54": 10.8151,
+            "55": 10.82578,
+            "56": 10.77729,
+            "57": 10.75325,
+            "58": 10.80742,
+            "59": 10.79087,
+            "60": 10.73998,
+            "61": 10.79954,
+            "62": 10.81284,
+            "63": 10.72011,
+            "64": 10.78598,
+            "65": 10.68981,
+            "66": 10.76066,
+            "67": 10.73402,
+            "68": 10.8022,
+            "69": 10.78312,
+            "70": 10.77711,
+            "71": 10.76626,
+            "72": 10.73591,
+            "73": 10.72919,
+            "74": 10.62192,
+            "75": 10.69079,
+            "76": 10.65398,
+            "77": 10.82162,
+            "78": 10.76368,
+            "79": 10.70473,
+            "80": 10.69368,
+            "81": 10.72419,
+            "82": 10.74233,
+            "83": 10.66786,
+            "84": 10.6983,
+            "85": 10.714,
+            "86": 10.6383,
+            "87": 10.71809,
+            "88": 10.73508,
+            "89": 10.7139,
+            "90": 10.74649,
+            "91": 10.64861,
+            "92": 10.64636,
+            "93": 10.60234,
+            "94": 10.53327,
+            "95": 10.66155,
+            "96": 10.67215,
+            "97": 10.61446,
+            "98": 10.68506,
+            "99": 10.52056,
+            "100": 10.61544
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1274.0,
+            "52": 1209.0,
+            "53": 1360.0,
+            "54": 1257.0,
+            "55": 1170.0,
+            "56": 1282.0,
+            "57": 1296.0,
+            "58": 1271.0,
+            "59": 1180.0,
+            "60": 1182.0,
+            "61": 1202.0,
+            "62": 1192.0,
+            "63": 1253.0,
+            "64": 1248.0,
+            "65": 1180.0,
+            "66": 1179.0,
+            "67": 1188.0,
+            "68": 1229.0,
+            "69": 1232.0,
+            "70": 1280.0,
+            "71": 1246.0,
+            "72": 1261.0,
+            "73": 1148.0,
+            "74": 1114.0,
+            "75": 1281.0,
+            "76": 1376.0,
+            "77": 1373.0,
+            "78": 1285.0,
+            "79": 1087.0,
+            "80": 1127.0,
+            "81": 1135.0,
+            "82": 1169.0,
+            "83": 1300.0,
+            "84": 1206.0,
+            "85": 1269.0,
+            "86": 1187.0,
+            "87": 1236.0,
+            "88": 1262.0,
+            "89": 1197.0,
+            "90": 1425.0,
+            "91": 1197.0,
+            "92": 1244.0,
+            "93": 1142.0,
+            "94": 971.0,
+            "95": 1281.0,
+            "96": 1243.0,
+            "97": 1145.0,
+            "98": 1288.0,
+            "99": 1286.0,
+            "100": 1212.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1095902208.0,
+            "52": 1095846912.0,
+            "53": 1095906816.0,
+            "54": 1095857664.0,
+            "55": 1095846400.0,
+            "56": 1095836672.0,
+            "57": 1095883776.0,
+            "58": 1095881216.0,
+            "59": 1095866368.0,
+            "60": 1095830528.0,
+            "61": 1095831552.0,
+            "62": 1095825408.0,
+            "63": 1095841280.0,
+            "64": 1095828992.0,
+            "65": 1095837184.0,
+            "66": 1095842816.0,
+            "67": 1095832064.0,
+            "68": 1095841280.0,
+            "69": 1095827456.0,
+            "70": 1095872512.0,
+            "71": 1095847424.0,
+            "72": 1095772672.0,
+            "73": 1095875072.0,
+            "74": 1095886848.0,
+            "75": 1095852032.0,
+            "76": 1095873024.0,
+            "77": 1095822848.0,
+            "78": 1095883776.0,
+            "79": 1095855616.0,
+            "80": 1095809536.0,
+            "81": 1095842816.0,
+            "82": 1095791104.0,
+            "83": 1095890944.0,
+            "84": 1095873536.0,
+            "85": 1095838720.0,
+            "86": 1095839744.0,
+            "87": 1095862784.0,
+            "88": 1095847424.0,
+            "89": 1095836672.0,
+            "90": 1095830528.0,
+            "91": 1095884288.0,
+            "92": 1095867392.0,
+            "93": 1095846912.0,
+            "94": 1095856128.0,
+            "95": 1095854080.0,
+            "96": 1095855616.0,
+            "97": 1095864320.0,
+            "98": 1095842816.0,
+            "99": 1095856640.0,
+            "100": 1095895552.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3605514752.0,
+            "52": 3605514752.0,
+            "53": 3638906880.0,
+            "54": 3638906880.0,
+            "55": 3638906880.0,
+            "56": 3638906880.0,
+            "57": 3638906880.0,
+            "58": 3638906880.0,
+            "59": 3638906880.0,
+            "60": 3638906880.0,
+            "61": 3638906880.0,
+            "62": 3638906880.0,
+            "63": 3638906880.0,
+            "64": 3638906880.0,
+            "65": 3638906880.0,
+            "66": 3638906880.0,
+            "67": 3638906880.0,
+            "68": 3638906880.0,
+            "69": 3638906880.0,
+            "70": 3638906880.0,
+            "71": 3638906880.0,
+            "72": 3638906880.0,
+            "73": 3638906880.0,
+            "74": 3638906880.0,
+            "75": 3638906880.0,
+            "76": 3638906880.0,
+            "77": 3638906880.0,
+            "78": 3638906880.0,
+            "79": 3638906880.0,
+            "80": 3638906880.0,
+            "81": 3638906880.0,
+            "82": 3638906880.0,
+            "83": 3638906880.0,
+            "84": 3638906880.0,
+            "85": 3638906880.0,
+            "86": 3638906880.0,
+            "87": 3638906880.0,
+            "88": 3638906880.0,
+            "89": 3638906880.0,
+            "90": 3638906880.0,
+            "91": 3638906880.0,
+            "92": 3638906880.0,
+            "93": 3638906880.0,
+            "94": 3638906880.0,
+            "95": 3638906880.0,
+            "96": 3638906880.0,
+            "97": 3638906880.0,
+            "98": 3638906880.0,
+            "99": 3638906880.0,
+            "100": 3638906880.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 5.33757,
+            "53": 0.39893,
+            "54": 0.38074,
+            "55": 0.38709,
+            "56": 0.37977,
+            "57": 0.37403,
+            "58": 0.3832,
+            "59": 0.37979,
+            "60": 0.3767,
+            "61": 0.37583,
+            "62": 0.38081,
+            "63": 0.38367,
+            "64": 0.38655,
+            "65": 0.37373,
+            "66": 0.37183,
+            "67": 0.37121,
+            "68": 0.38709,
+            "69": 0.38149,
+            "70": 0.38976,
+            "71": 0.38463,
+            "72": 0.38157,
+            "73": 0.36873,
+            "74": 0.3762,
+            "75": 0.36571,
+            "76": 0.36544,
+            "77": 0.37985,
+            "78": 0.37941,
+            "79": 0.36655,
+            "80": 0.37258,
+            "81": 0.36741,
+            "82": 0.36798,
+            "83": 0.3641,
+            "84": 0.36415,
+            "85": 0.37605,
+            "86": 0.37639,
+            "87": 0.38223,
+            "88": 0.37682,
+            "89": 0.3604,
+            "90": 0.37267,
+            "91": 0.36421,
+            "92": 0.36312,
+            "93": 0.36608,
+            "94": 0.35916,
+            "95": 0.37338,
+            "96": 0.3876,
+            "97": 0.37229,
+            "98": 0.3763,
+            "99": 0.37389,
+            "100": 0.3586
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/model_config.yaml
new file mode 100644
index 00000000000..1d0ef19232e
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon/model_config.yaml
@@ -0,0 +1,66 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --disable-bias-linear: true
+  --train-iters: 100
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --num-experts: 8
+  --moe-token-dispatcher-type: allgather
+  --moe-router-load-balancing-type: aux_loss
+  --moe-router-topk: 2
+  --moe-router-dtype: fp32
+  --moe-ffn-hidden-size: 1024
+  --moe-grouped-gemm: true
+  --ckpt-fully-parallel-load: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --no-bias-gelu-fusion: true
+  --log-memory-to-tensorboard: true
+  --optimizer: dist_muon
+  --muon-momentum: 0.9
+  --muon-extra-scale-factor: 0.2
+  --muon-scale-mode: spectral
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..35841a4eaa1
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80397,
+            "2": 10.81064,
+            "3": 10.79794,
+            "4": 10.78736,
+            "5": 10.82943,
+            "6": 10.85276,
+            "7": 10.80881,
+            "8": 10.79936,
+            "9": 10.82143,
+            "10": 10.77719,
+            "11": 10.8324,
+            "12": 10.8353,
+            "13": 10.84919,
+            "14": 10.85282,
+            "15": 10.79747,
+            "16": 10.78882,
+            "17": 10.75342,
+            "18": 10.78784,
+            "19": 10.77563,
+            "20": 10.70852,
+            "21": 10.6913,
+            "22": 10.53886,
+            "23": 10.69977,
+            "24": 10.58969,
+            "25": 10.54168,
+            "26": 10.60115,
+            "27": 10.61421,
+            "28": 10.59031,
+            "29": 10.60749,
+            "30": 10.38665,
+            "31": 10.13235,
+            "32": 10.49407,
+            "33": 10.47454,
+            "34": 10.23691,
+            "35": 10.28682,
+            "36": 10.26871,
+            "37": 10.38314,
+            "38": 10.23703,
+            "39": 10.43401,
+            "40": 10.1257,
+            "41": 10.17238,
+            "42": 10.24995,
+            "43": 9.85773,
+            "44": 9.98944,
+            "45": 9.87376,
+            "46": 9.84256,
+            "47": 10.1623,
+            "48": 9.89144,
+            "49": 9.57738,
+            "50": 9.96171,
+            "51": 9.88785,
+            "52": 9.76989,
+            "53": 10.10483,
+            "54": 9.99665,
+            "55": 9.92216,
+            "56": 9.67616,
+            "57": 9.51879,
+            "58": 9.89053,
+            "59": 9.63068,
+            "60": 9.55149,
+            "61": 9.72264,
+            "62": 10.0414,
+            "63": 9.43971,
+            "64": 9.8184,
+            "65": 8.98176,
+            "66": 9.75925,
+            "67": 9.39746,
+            "68": 9.83254,
+            "69": 9.81649,
+            "70": 9.75965,
+            "71": 9.66402,
+            "72": 9.63516,
+            "73": 9.54388,
+            "74": 9.00071,
+            "75": 9.465,
+            "76": 9.13889,
+            "77": 10.09535,
+            "78": 9.75814,
+            "79": 9.41614,
+            "80": 9.44749,
+            "81": 9.5168,
+            "82": 9.73156,
+            "83": 9.36737,
+            "84": 9.45017,
+            "85": 9.65534,
+            "86": 9.10891,
+            "87": 9.62042,
+            "88": 9.79408,
+            "89": 9.64391,
+            "90": 9.85314,
+            "91": 9.39297,
+            "92": 9.39817,
+            "93": 9.13664,
+            "94": 8.86865,
+            "95": 9.55719,
+            "96": 9.56146,
+            "97": 9.33062,
+            "98": 9.69677,
+            "99": 8.93672,
+            "100": 9.43355
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1158.0,
+            "2": 1181.0,
+            "3": 1239.0,
+            "4": 1193.0,
+            "5": 1380.0,
+            "6": 1501.0,
+            "7": 1391.0,
+            "8": 1239.0,
+            "9": 1272.0,
+            "10": 1225.0,
+            "11": 1337.0,
+            "12": 1167.0,
+            "13": 1452.0,
+            "14": 1254.0,
+            "15": 1269.0,
+            "16": 1347.0,
+            "17": 1234.0,
+            "18": 1202.0,
+            "19": 1183.0,
+            "20": 1141.0,
+            "21": 1236.0,
+            "22": 982.0,
+            "23": 1234.0,
+            "24": 1135.0,
+            "25": 1073.0,
+            "26": 1087.0,
+            "27": 1008.0,
+            "28": 1166.0,
+            "29": 1127.0,
+            "30": 1094.0,
+            "31": 929.0,
+            "32": 1176.0,
+            "33": 1123.0,
+            "34": 1084.0,
+            "35": 1110.0,
+            "36": 1053.0,
+            "37": 1208.0,
+            "38": 1233.0,
+            "39": 1210.0,
+            "40": 1229.0,
+            "41": 1217.0,
+            "42": 1317.0,
+            "43": 1212.0,
+            "44": 1144.0,
+            "45": 1374.0,
+            "46": 1249.0,
+            "47": 1163.0,
+            "48": 1176.0,
+            "49": 1335.0,
+            "50": 1172.0,
+            "51": 1345.0,
+            "52": 1334.0,
+            "53": 1388.0,
+            "54": 1316.0,
+            "55": 1263.0,
+            "56": 1320.0,
+            "57": 1143.0,
+            "58": 1050337.0,
+            "59": 1629.0,
+            "60": 58535.0,
+            "61": 1389.0,
+            "62": 1050420.0,
+            "63": 1302.0,
+            "64": 77146.0,
+            "65": 1058670.0,
+            "66": 1404.0,
+            "67": 59730.0,
+            "68": 50392.0,
+            "69": 2394.0,
+            "70": 108916.0,
+            "71": 46411.0,
+            "72": 2099307.0,
+            "73": 98574.0,
+            "74": 1106325.0,
+            "75": 4199.0,
+            "76": 1157870.0,
+            "77": 2145771.0,
+            "78": 1106899.0,
+            "79": 57707.0,
+            "80": 1105606.0,
+            "81": 1059706.0,
+            "82": 96668.0,
+            "83": 2099291.0,
+            "84": 1106935.0,
+            "85": 2099569.0,
+            "86": 1097473.0,
+            "87": 2099736.0,
+            "88": 2154950.0,
+            "89": 1154961.0,
+            "90": 3148620.0,
+            "91": 1157803.0,
+            "92": 3148576.0,
+            "93": 3148494.0,
+            "94": 1160976.0,
+            "95": 3148697.0,
+            "96": 3148663.0,
+            "97": 2147830.0,
+            "98": 3148776.0,
+            "99": 2099539.0,
+            "100": 3148590.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 788258816.0,
+            "2": 788271104.0,
+            "3": 788248064.0,
+            "4": 788275200.0,
+            "5": 788183552.0,
+            "6": 788212736.0,
+            "7": 788235264.0,
+            "8": 788251648.0,
+            "9": 788255232.0,
+            "10": 788226560.0,
+            "11": 788207104.0,
+            "12": 788192256.0,
+            "13": 788256256.0,
+            "14": 788187136.0,
+            "15": 788221952.0,
+            "16": 788214272.0,
+            "17": 788195328.0,
+            "18": 788191744.0,
+            "19": 788210176.0,
+            "20": 788163072.0,
+            "21": 788207616.0,
+            "22": 788235264.0,
+            "23": 788186112.0,
+            "24": 788249088.0,
+            "25": 788213248.0,
+            "26": 788204032.0,
+            "27": 788216832.0,
+            "28": 788117504.0,
+            "29": 788125696.0,
+            "30": 788169216.0,
+            "31": 788190720.0,
+            "32": 788118016.0,
+            "33": 788142592.0,
+            "34": 788156928.0,
+            "35": 788133376.0,
+            "36": 788130816.0,
+            "37": 788082688.0,
+            "38": 788156416.0,
+            "39": 788091904.0,
+            "40": 788120576.0,
+            "41": 788122112.0,
+            "42": 788104192.0,
+            "43": 788170752.0,
+            "44": 788189696.0,
+            "45": 788140032.0,
+            "46": 788148224.0,
+            "47": 788086272.0,
+            "48": 788128768.0,
+            "49": 788078080.0,
+            "50": 788078592.0,
+            "51": 788088832.0,
+            "52": 788065280.0,
+            "53": 788091392.0,
+            "54": 788059648.0,
+            "55": 788009472.0,
+            "56": 788060672.0,
+            "57": 788129792.0,
+            "58": 788124672.0,
+            "59": 788038656.0,
+            "60": 788026368.0,
+            "61": 788004352.0,
+            "62": 788007936.0,
+            "63": 788059136.0,
+            "64": 787972096.0,
+            "65": 788070400.0,
+            "66": 788031488.0,
+            "67": 788046336.0,
+            "68": 788037632.0,
+            "69": 787970560.0,
+            "70": 787992064.0,
+            "71": 788023808.0,
+            "72": 788022272.0,
+            "73": 788058624.0,
+            "74": 788075520.0,
+            "75": 788037632.0,
+            "76": 788094976.0,
+            "77": 787966464.0,
+            "78": 787980288.0,
+            "79": 788018176.0,
+            "80": 788026880.0,
+            "81": 787994624.0,
+            "82": 787986944.0,
+            "83": 788061696.0,
+            "84": 787999744.0,
+            "85": 787995648.0,
+            "86": 788012544.0,
+            "87": 787939328.0,
+            "88": 787957760.0,
+            "89": 787977728.0,
+            "90": 787927552.0,
+            "91": 787998720.0,
+            "92": 788026368.0,
+            "93": 788039680.0,
+            "94": 788032512.0,
+            "95": 788007424.0,
+            "96": 787978240.0,
+            "97": 788036608.0,
+            "98": 787984384.0,
+            "99": 788088320.0,
+            "100": 788081664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2992141824.0,
+            "2": 3170010624.0,
+            "3": 3170010624.0,
+            "4": 3172966400.0,
+            "5": 3172966400.0,
+            "6": 3172966400.0,
+            "7": 3172966400.0,
+            "8": 3172966400.0,
+            "9": 3172966400.0,
+            "10": 3172966400.0,
+            "11": 3172966400.0,
+            "12": 3172966400.0,
+            "13": 3172966400.0,
+            "14": 3172966400.0,
+            "15": 3172966400.0,
+            "16": 3172966400.0,
+            "17": 3172966400.0,
+            "18": 3172966400.0,
+            "19": 3172966400.0,
+            "20": 3172966400.0,
+            "21": 3172966400.0,
+            "22": 3172966400.0,
+            "23": 3172966400.0,
+            "24": 3172966400.0,
+            "25": 3172966400.0,
+            "26": 3172966400.0,
+            "27": 3172966400.0,
+            "28": 3172966400.0,
+            "29": 3172966400.0,
+            "30": 3172966400.0,
+            "31": 3172966400.0,
+            "32": 3172966400.0,
+            "33": 3172966400.0,
+            "34": 3172966400.0,
+            "35": 3172966400.0,
+            "36": 3172966400.0,
+            "37": 3172966400.0,
+            "38": 3172966400.0,
+            "39": 3172966400.0,
+            "40": 3172966400.0,
+            "41": 3172966400.0,
+            "42": 3172966400.0,
+            "43": 3172966400.0,
+            "44": 3172966400.0,
+            "45": 3172966400.0,
+            "46": 3172966400.0,
+            "47": 3172966400.0,
+            "48": 3172966400.0,
+            "49": 3172966400.0,
+            "50": 3172966400.0,
+            "51": 3172966400.0,
+            "52": 3172966400.0,
+            "53": 3172966400.0,
+            "54": 3172966400.0,
+            "55": 3172966400.0,
+            "56": 3172966400.0,
+            "57": 3172966400.0,
+            "58": 3172966400.0,
+            "59": 3172966400.0,
+            "60": 3172966400.0,
+            "61": 3172966400.0,
+            "62": 3172966400.0,
+            "63": 3172966400.0,
+            "64": 3172966400.0,
+            "65": 3172966400.0,
+            "66": 3172966400.0,
+            "67": 3172966400.0,
+            "68": 3172966400.0,
+            "69": 3172966400.0,
+            "70": 3172966400.0,
+            "71": 3172966400.0,
+            "72": 3172966400.0,
+            "73": 3172966400.0,
+            "74": 3172966400.0,
+            "75": 3172966400.0,
+            "76": 3172966400.0,
+            "77": 3172966400.0,
+            "78": 3172966400.0,
+            "79": 3172966400.0,
+            "80": 3172966400.0,
+            "81": 3172966400.0,
+            "82": 3172966400.0,
+            "83": 3172966400.0,
+            "84": 3172966400.0,
+            "85": 3172966400.0,
+            "86": 3172966400.0,
+            "87": 3172966400.0,
+            "88": 3172966400.0,
+            "89": 3172966400.0,
+            "90": 3172966400.0,
+            "91": 3172966400.0,
+            "92": 3172966400.0,
+            "93": 3172966400.0,
+            "94": 3172966400.0,
+            "95": 3172966400.0,
+            "96": 3172966400.0,
+            "97": 3172966400.0,
+            "98": 3172966400.0,
+            "99": 3172966400.0,
+            "100": 3172966400.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.24581,
+            "3": 0.22984,
+            "4": 0.19901,
+            "5": 0.20013,
+            "6": 0.19557,
+            "7": 0.19148,
+            "8": 0.18346,
+            "9": 0.18886,
+            "10": 0.18315,
+            "11": 0.18065,
+            "12": 0.18158,
+            "13": 0.17863,
+            "14": 0.17669,
+            "15": 0.16949,
+            "16": 0.1813,
+            "17": 0.17239,
+            "18": 0.16839,
+            "19": 0.17197,
+            "20": 0.18078,
+            "21": 0.17159,
+            "22": 0.17182,
+            "23": 0.17032,
+            "24": 0.16416,
+            "25": 0.16192,
+            "26": 0.17125,
+            "27": 0.15863,
+            "28": 0.16459,
+            "29": 0.16007,
+            "30": 0.16611,
+            "31": 0.16404,
+            "32": 0.15794,
+            "33": 0.16011,
+            "34": 0.15482,
+            "35": 0.15384,
+            "36": 0.16644,
+            "37": 0.15786,
+            "38": 0.15755,
+            "39": 0.15631,
+            "40": 0.15691,
+            "41": 0.15405,
+            "42": 0.14989,
+            "43": 0.1609,
+            "44": 0.15219,
+            "45": 0.15611,
+            "46": 0.1513,
+            "47": 0.15678,
+            "48": 0.14912,
+            "49": 0.14848,
+            "50": 0.15182,
+            "51": 0.16313,
+            "52": 0.14839,
+            "53": 0.14122,
+            "54": 0.14422,
+            "55": 0.14712,
+            "56": 0.14693,
+            "57": 0.14795,
+            "58": 0.14977,
+            "59": 0.15359,
+            "60": 0.14668,
+            "61": 0.15225,
+            "62": 0.14521,
+            "63": 0.14464,
+            "64": 0.14763,
+            "65": 0.14539,
+            "66": 0.14694,
+            "67": 0.1474,
+            "68": 0.14336,
+            "69": 0.14734,
+            "70": 0.14392,
+            "71": 0.14814,
+            "72": 0.15208,
+            "73": 0.15012,
+            "74": 0.14617,
+            "75": 0.14599,
+            "76": 0.14537,
+            "77": 0.15575,
+            "78": 0.15044,
+            "79": 0.68569,
+            "80": 0.15145,
+            "81": 0.1455,
+            "82": 0.14662,
+            "83": 0.14886,
+            "84": 0.14582,
+            "85": 0.14802,
+            "86": 0.1466,
+            "87": 0.14632,
+            "88": 0.14515,
+            "89": 0.14994,
+            "90": 0.1445,
+            "91": 0.14773,
+            "92": 0.14812,
+            "93": 0.14796,
+            "94": 0.51237,
+            "95": 0.15138,
+            "96": 0.15025,
+            "97": 0.14525,
+            "98": 0.1449,
+            "99": 0.1508,
+            "100": 0.14531
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
index ad7af2bddb0..b106daa13a1 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
@@ -4,105 +4,105 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.81131,
-            "2": 10.83052,
-            "3": 10.82093,
-            "4": 10.81347,
-            "5": 10.84338,
-            "6": 10.84743,
-            "7": 10.85254,
-            "8": 10.83482,
-            "9": 10.84276,
-            "10": 10.77693,
-            "11": 10.8459,
-            "12": 10.85115,
-            "13": 10.84165,
-            "14": 10.8714,
-            "15": 10.83613,
-            "16": 10.79815,
-            "17": 10.77288,
-            "18": 10.8075,
-            "19": 10.78773,
-            "20": 10.73433,
-            "21": 10.69461,
-            "22": 10.56597,
-            "23": 10.71611,
-            "24": 10.61321,
-            "25": 10.552,
-            "26": 10.61364,
-            "27": 10.62702,
-            "28": 10.59546,
-            "29": 10.59195,
-            "30": 10.3916,
-            "31": 10.14615,
-            "32": 10.47399,
-            "33": 10.47051,
-            "34": 10.23435,
-            "35": 10.29318,
-            "36": 10.26627,
-            "37": 10.37219,
-            "38": 10.2254,
-            "39": 10.42101,
-            "40": 10.13002,
-            "41": 10.16265,
-            "42": 10.24278,
-            "43": 9.88237,
-            "44": 9.99105,
-            "45": 9.87295,
+            "1": 10.81103,
+            "2": 10.83065,
+            "3": 10.82107,
+            "4": 10.81304,
+            "5": 10.84321,
+            "6": 10.84718,
+            "7": 10.85237,
+            "8": 10.83499,
+            "9": 10.84293,
+            "10": 10.77678,
+            "11": 10.84585,
+            "12": 10.85174,
+            "13": 10.84182,
+            "14": 10.87189,
+            "15": 10.83593,
+            "16": 10.79751,
+            "17": 10.77325,
+            "18": 10.8073,
+            "19": 10.78778,
+            "20": 10.73435,
+            "21": 10.69516,
+            "22": 10.56641,
+            "23": 10.71634,
+            "24": 10.61287,
+            "25": 10.55191,
+            "26": 10.61354,
+            "27": 10.62651,
+            "28": 10.59524,
+            "29": 10.5917,
+            "30": 10.39149,
+            "31": 10.1464,
+            "32": 10.47402,
+            "33": 10.47024,
+            "34": 10.23415,
+            "35": 10.2932,
+            "36": 10.26667,
+            "37": 10.37209,
+            "38": 10.22542,
+            "39": 10.42143,
+            "40": 10.13017,
+            "41": 10.16266,
+            "42": 10.24275,
+            "43": 9.88221,
+            "44": 9.99119,
+            "45": 9.87323,
             "46": 9.85181,
-            "47": 10.15633,
-            "48": 9.8915,
-            "49": 9.58889,
-            "50": 9.9543,
-            "51": 9.8849,
-            "52": 9.78004,
-            "53": 10.10188,
-            "54": 9.98715,
+            "47": 10.15626,
+            "48": 9.89157,
+            "49": 9.58903,
+            "50": 9.95443,
+            "51": 9.88487,
+            "52": 9.78018,
+            "53": 10.10226,
+            "54": 9.9873,
             "55": 9.9027,
-            "56": 9.66837,
-            "57": 9.53524,
+            "56": 9.66818,
+            "57": 9.53521,
             "58": 9.89495,
-            "59": 9.62892,
-            "60": 9.54308,
-            "61": 9.72727,
-            "62": 10.0332,
-            "63": 9.45215,
-            "64": 9.83179,
-            "65": 8.99109,
-            "66": 9.76394,
-            "67": 9.40349,
-            "68": 9.83129,
-            "69": 9.81856,
-            "70": 9.77262,
-            "71": 9.658,
-            "72": 9.64033,
-            "73": 9.55124,
-            "74": 9.02026,
-            "75": 9.47695,
-            "76": 9.13586,
-            "77": 10.09787,
-            "78": 9.75274,
-            "79": 9.41697,
-            "80": 9.45074,
-            "81": 9.52041,
-            "82": 9.73203,
-            "83": 9.36912,
-            "84": 9.45039,
-            "85": 9.65229,
-            "86": 9.1123,
-            "87": 9.61119,
-            "88": 9.78708,
-            "89": 9.64625,
-            "90": 9.83474,
-            "91": 9.39429,
-            "92": 9.39178,
+            "59": 9.6289,
+            "60": 9.54307,
+            "61": 9.72725,
+            "62": 10.03319,
+            "63": 9.45201,
+            "64": 9.83185,
+            "65": 8.99108,
+            "66": 9.76421,
+            "67": 9.40334,
+            "68": 9.83107,
+            "69": 9.81874,
+            "70": 9.77252,
+            "71": 9.65812,
+            "72": 9.64065,
+            "73": 9.5512,
+            "74": 9.02044,
+            "75": 9.47713,
+            "76": 9.13591,
+            "77": 10.09778,
+            "78": 9.75282,
+            "79": 9.41686,
+            "80": 9.45072,
+            "81": 9.52034,
+            "82": 9.73197,
+            "83": 9.36926,
+            "84": 9.4504,
+            "85": 9.65212,
+            "86": 9.11237,
+            "87": 9.61129,
+            "88": 9.78679,
+            "89": 9.64613,
+            "90": 9.83484,
+            "91": 9.39422,
+            "92": 9.39187,
             "93": 9.12787,
-            "94": 8.86637,
-            "95": 9.54352,
-            "96": 9.55716,
-            "97": 9.332,
-            "98": 9.69189,
-            "99": 8.92072,
+            "94": 8.86646,
+            "95": 9.54348,
+            "96": 9.55708,
+            "97": 9.33174,
+            "98": 9.6919,
+            "99": 8.92043,
             "100": 9.41916
         }
     },
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1216.0,
-            "2": 1361.0,
-            "3": 1221.0,
-            "4": 1222.0,
-            "5": 1385.0,
-            "6": 1467.0,
+            "1": 1248.0,
+            "2": 1334.0,
+            "3": 1294.0,
+            "4": 1227.0,
+            "5": 1403.0,
+            "6": 1427.0,
             "7": 1252.0,
-            "8": 1355.0,
-            "9": 1346.0,
-            "10": 1335.0,
-            "11": 1278.0,
-            "12": 1185.0,
-            "13": 1203.0,
-            "14": 1385.0,
-            "15": 1303.0,
-            "16": 1377.0,
-            "17": 1229.0,
-            "18": 1291.0,
-            "19": 1244.0,
-            "20": 1183.0,
-            "21": 1262.0,
-            "22": 1122.0,
-            "23": 1301.0,
-            "24": 1066.0,
-            "25": 1182.0,
-            "26": 1263.0,
-            "27": 1162.0,
-            "28": 1262.0,
-            "29": 1179.0,
-            "30": 1168.0,
-            "31": 991.0,
-            "32": 1092.0,
-            "33": 1183.0,
-            "34": 1081.0,
-            "35": 1146.0,
-            "36": 1076.0,
-            "37": 1252.0,
-            "38": 1176.0,
+            "8": 1427.0,
+            "9": 1305.0,
+            "10": 1282.0,
+            "11": 1297.0,
+            "12": 1278.0,
+            "13": 1202.0,
+            "14": 1425.0,
+            "15": 1290.0,
+            "16": 1353.0,
+            "17": 1248.0,
+            "18": 1308.0,
+            "19": 1305.0,
+            "20": 1244.0,
+            "21": 1166.0,
+            "22": 1145.0,
+            "23": 1320.0,
+            "24": 1102.0,
+            "25": 1254.0,
+            "26": 1241.0,
+            "27": 1137.0,
+            "28": 1332.0,
+            "29": 1297.0,
+            "30": 1138.0,
+            "31": 1027.0,
+            "32": 1093.0,
+            "33": 1262.0,
+            "34": 1095.0,
+            "35": 1120.0,
+            "36": 1048.0,
+            "37": 1161.0,
+            "38": 1211.0,
             "39": 1225.0,
-            "40": 1303.0,
-            "41": 1104.0,
-            "42": 1210.0,
-            "43": 1116.0,
-            "44": 1165.0,
-            "45": 1097.0,
-            "46": 1308.0,
-            "47": 1165.0,
-            "48": 1134.0,
-            "49": 1272.0,
-            "50": 1083.0,
-            "51": 1234.0,
-            "52": 1274.0,
-            "53": 1393.0,
-            "54": 1299.0,
-            "55": 1186.0,
-            "56": 1267.0,
-            "57": 1161.0,
-            "58": 1326.0,
-            "59": 1403.0,
-            "60": 1177.0,
-            "61": 1363.0,
-            "62": 1302.0,
-            "63": 1245.0,
-            "64": 1378.0,
-            "65": 1330.0,
-            "66": 1363.0,
-            "67": 1286.0,
-            "68": 1313.0,
-            "69": 1295.0,
-            "70": 1459.0,
-            "71": 1374.0,
-            "72": 1092.0,
-            "73": 1274.0,
-            "74": 943.0,
-            "75": 1059.0,
-            "76": 1323.0,
-            "77": 1475.0,
-            "78": 1487.0,
-            "79": 1496.0,
-            "80": 1382.0,
-            "81": 1470.0,
-            "82": 1417.0,
-            "83": 1177.0,
-            "84": 1506.0,
-            "85": 1420.0,
-            "86": 1281.0,
-            "87": 1540.0,
-            "88": 1467.0,
-            "89": 1452.0,
-            "90": 1350.0,
-            "91": 1010.0,
-            "92": 1324.0,
-            "93": 1349.0,
-            "94": 1197.0,
-            "95": 2503.0,
-            "96": 2373.0,
-            "97": 1490.0,
-            "98": 2541.0,
-            "99": 1367.0,
-            "100": 1122.0
+            "40": 1379.0,
+            "41": 1115.0,
+            "42": 1175.0,
+            "43": 1049.0,
+            "44": 1164.0,
+            "45": 1127.0,
+            "46": 1334.0,
+            "47": 1233.0,
+            "48": 1192.0,
+            "49": 1310.0,
+            "50": 1125.0,
+            "51": 1311.0,
+            "52": 1269.0,
+            "53": 1392.0,
+            "54": 1266.0,
+            "55": 1197.0,
+            "56": 1294.0,
+            "57": 1125.0,
+            "58": 1380.0,
+            "59": 1335.0,
+            "60": 1070.0,
+            "61": 1317.0,
+            "62": 1323.0,
+            "63": 1177.0,
+            "64": 1464.0,
+            "65": 1297.0,
+            "66": 1459.0,
+            "67": 1319.0,
+            "68": 1281.0,
+            "69": 1361.0,
+            "70": 1439.0,
+            "71": 1408.0,
+            "72": 1131.0,
+            "73": 1261.0,
+            "74": 918.0,
+            "75": 1051.0,
+            "76": 1288.0,
+            "77": 1472.0,
+            "78": 1433.0,
+            "79": 1433.0,
+            "80": 1350.0,
+            "81": 1576.0,
+            "82": 1414.0,
+            "83": 1205.0,
+            "84": 1485.0,
+            "85": 1339.0,
+            "86": 1265.0,
+            "87": 1538.0,
+            "88": 1462.0,
+            "89": 1499.0,
+            "90": 1289.0,
+            "91": 1052.0,
+            "92": 1303.0,
+            "93": 1235.0,
+            "94": 1301.0,
+            "95": 1386.0,
+            "96": 2364.0,
+            "97": 1408.0,
+            "98": 2551.0,
+            "99": 1263.0,
+            "100": 1227.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 788523008.0,
-            "2": 788493312.0,
-            "3": 788540416.0,
-            "4": 788518400.0,
-            "5": 788542464.0,
-            "6": 788484608.0,
-            "7": 788507648.0,
-            "8": 788515328.0,
-            "9": 788531200.0,
-            "10": 788543488.0,
-            "11": 788518400.0,
-            "12": 788489216.0,
-            "13": 788547584.0,
-            "14": 788456448.0,
-            "15": 788508160.0,
-            "16": 788445696.0,
-            "17": 788563456.0,
-            "18": 788540416.0,
-            "19": 788547584.0,
-            "20": 788475904.0,
-            "21": 788513792.0,
-            "22": 788599296.0,
-            "23": 788578816.0,
-            "24": 788518400.0,
-            "25": 788660736.0,
-            "26": 788571136.0,
-            "27": 788635648.0,
-            "28": 788573696.0,
-            "29": 788615680.0,
-            "30": 788592640.0,
-            "31": 788652544.0,
-            "32": 788608000.0,
-            "33": 788621824.0,
-            "34": 788582912.0,
-            "35": 788621824.0,
-            "36": 788647424.0,
-            "37": 788602880.0,
-            "38": 788655616.0,
-            "39": 788668416.0,
-            "40": 788555264.0,
-            "41": 788596736.0,
-            "42": 788580352.0,
-            "43": 788547072.0,
-            "44": 788628992.0,
-            "45": 788496384.0,
-            "46": 788508672.0,
-            "47": 788577280.0,
-            "48": 788493824.0,
-            "49": 788466688.0,
-            "50": 788492288.0,
-            "51": 788528128.0,
-            "52": 788488704.0,
-            "53": 788518912.0,
-            "54": 788508672.0,
-            "55": 788505088.0,
-            "56": 788464128.0,
-            "57": 788461568.0,
-            "58": 788505088.0,
-            "59": 788508672.0,
-            "60": 788496384.0,
-            "61": 788468736.0,
-            "62": 788502528.0,
-            "63": 788454912.0,
-            "64": 788470784.0,
-            "65": 788413440.0,
-            "66": 788450816.0,
-            "67": 788450816.0,
-            "68": 788461568.0,
-            "69": 788478976.0,
-            "70": 788502528.0,
-            "71": 788459008.0,
-            "72": 788419072.0,
-            "73": 788449280.0,
-            "74": 788424192.0,
-            "75": 788446720.0,
-            "76": 788418048.0,
-            "77": 788476416.0,
-            "78": 788467712.0,
-            "79": 788424192.0,
-            "80": 788416512.0,
-            "81": 788435968.0,
-            "82": 788444160.0,
-            "83": 788440576.0,
-            "84": 788476416.0,
-            "85": 788466176.0,
-            "86": 788400128.0,
-            "87": 788495872.0,
-            "88": 788498432.0,
-            "89": 788506624.0,
-            "90": 788536832.0,
-            "91": 788518912.0,
-            "92": 788521984.0,
-            "93": 788492288.0,
-            "94": 788511744.0,
-            "95": 788548608.0,
-            "96": 788568576.0,
-            "97": 788584960.0,
-            "98": 788595712.0,
-            "99": 788519936.0,
-            "100": 788575744.0
+            "1": 788555776.0,
+            "2": 788525568.0,
+            "3": 788572672.0,
+            "4": 788552704.0,
+            "5": 788574720.0,
+            "6": 788517888.0,
+            "7": 788541440.0,
+            "8": 788548096.0,
+            "9": 788562944.0,
+            "10": 788577280.0,
+            "11": 788553216.0,
+            "12": 788523008.0,
+            "13": 788579328.0,
+            "14": 788489216.0,
+            "15": 788539904.0,
+            "16": 788476928.0,
+            "17": 788598784.0,
+            "18": 788574208.0,
+            "19": 788580864.0,
+            "20": 788508160.0,
+            "21": 788545536.0,
+            "22": 788632064.0,
+            "23": 788610560.0,
+            "24": 788551168.0,
+            "25": 788694016.0,
+            "26": 788605440.0,
+            "27": 788667904.0,
+            "28": 788609024.0,
+            "29": 788647936.0,
+            "30": 788625408.0,
+            "31": 788685824.0,
+            "32": 788640768.0,
+            "33": 788655616.0,
+            "34": 788615680.0,
+            "35": 788654080.0,
+            "36": 788679680.0,
+            "37": 788634624.0,
+            "38": 788688896.0,
+            "39": 788698112.0,
+            "40": 788588032.0,
+            "41": 788628992.0,
+            "42": 788613632.0,
+            "43": 788577792.0,
+            "44": 788661248.0,
+            "45": 788528640.0,
+            "46": 788540928.0,
+            "47": 788609536.0,
+            "48": 788528640.0,
+            "49": 788498944.0,
+            "50": 788524544.0,
+            "51": 788559872.0,
+            "52": 788518400.0,
+            "53": 788552192.0,
+            "54": 788543488.0,
+            "55": 788538880.0,
+            "56": 788497408.0,
+            "57": 788493824.0,
+            "58": 788537344.0,
+            "59": 788539904.0,
+            "60": 788527104.0,
+            "61": 788499968.0,
+            "62": 788535296.0,
+            "63": 788487168.0,
+            "64": 788503552.0,
+            "65": 788446208.0,
+            "66": 788485632.0,
+            "67": 788485120.0,
+            "68": 788493312.0,
+            "69": 788508672.0,
+            "70": 788534784.0,
+            "71": 788491264.0,
+            "72": 788452864.0,
+            "73": 788477440.0,
+            "74": 788452864.0,
+            "75": 788480000.0,
+            "76": 788450304.0,
+            "77": 788506624.0,
+            "78": 788500992.0,
+            "79": 788451840.0,
+            "80": 788448256.0,
+            "81": 788466176.0,
+            "82": 788474880.0,
+            "83": 788470784.0,
+            "84": 788506624.0,
+            "85": 788496384.0,
+            "86": 788430848.0,
+            "87": 788528128.0,
+            "88": 788530176.0,
+            "89": 788537856.0,
+            "90": 788569600.0,
+            "91": 788549632.0,
+            "92": 788555264.0,
+            "93": 788525056.0,
+            "94": 788546560.0,
+            "95": 788583424.0,
+            "96": 788601856.0,
+            "97": 788617216.0,
+            "98": 788629504.0,
+            "99": 788551680.0,
+            "100": 788611072.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 3022964224.0,
-            "2": 3177559552.0,
-            "3": 3206005248.0,
-            "4": 3206005248.0,
-            "5": 3206005248.0,
-            "6": 3206005248.0,
-            "7": 3206005248.0,
-            "8": 3206005248.0,
-            "9": 3206005248.0,
-            "10": 3206005248.0,
-            "11": 3206005248.0,
-            "12": 3206005248.0,
-            "13": 3208181248.0,
-            "14": 3208181248.0,
-            "15": 3208181248.0,
-            "16": 3208181248.0,
-            "17": 3216008192.0,
-            "18": 3216008192.0,
-            "19": 3216008192.0,
-            "20": 3216008192.0,
-            "21": 3216008192.0,
-            "22": 3238043648.0,
-            "23": 3238043648.0,
-            "24": 3238043648.0,
-            "25": 3281027072.0,
-            "26": 3281027072.0,
-            "27": 3281027072.0,
-            "28": 3281027072.0,
-            "29": 3281027072.0,
-            "30": 3281027072.0,
-            "31": 3281027072.0,
-            "32": 3281027072.0,
-            "33": 3281027072.0,
-            "34": 3281027072.0,
-            "35": 3281027072.0,
-            "36": 3281027072.0,
-            "37": 3281027072.0,
-            "38": 3281027072.0,
-            "39": 3281027072.0,
-            "40": 3281027072.0,
-            "41": 3281027072.0,
-            "42": 3281027072.0,
-            "43": 3281027072.0,
-            "44": 3281027072.0,
-            "45": 3281027072.0,
-            "46": 3281027072.0,
-            "47": 3281027072.0,
-            "48": 3281027072.0,
-            "49": 3281027072.0,
-            "50": 3281027072.0,
-            "51": 3281027072.0,
-            "52": 3281027072.0,
-            "53": 3281027072.0,
-            "54": 3281027072.0,
-            "55": 3281027072.0,
-            "56": 3281027072.0,
-            "57": 3281027072.0,
-            "58": 3281027072.0,
-            "59": 3281027072.0,
-            "60": 3281027072.0,
-            "61": 3281027072.0,
-            "62": 3281027072.0,
-            "63": 3281027072.0,
-            "64": 3281027072.0,
-            "65": 3281027072.0,
-            "66": 3281027072.0,
-            "67": 3281027072.0,
-            "68": 3281027072.0,
-            "69": 3281027072.0,
-            "70": 3281027072.0,
-            "71": 3281027072.0,
-            "72": 3281027072.0,
-            "73": 3281027072.0,
-            "74": 3281027072.0,
-            "75": 3281027072.0,
-            "76": 3281027072.0,
-            "77": 3281027072.0,
-            "78": 3281027072.0,
-            "79": 3281027072.0,
-            "80": 3281027072.0,
-            "81": 3281027072.0,
-            "82": 3281027072.0,
-            "83": 3281027072.0,
-            "84": 3281027072.0,
-            "85": 3281027072.0,
-            "86": 3281027072.0,
-            "87": 3281027072.0,
-            "88": 3281027072.0,
-            "89": 3281027072.0,
-            "90": 3281027072.0,
-            "91": 3281027072.0,
-            "92": 3281027072.0,
-            "93": 3281027072.0,
-            "94": 3281027072.0,
-            "95": 3281027072.0,
-            "96": 3281027072.0,
-            "97": 3281027072.0,
-            "98": 3281027072.0,
-            "99": 3281027072.0,
-            "100": 3281027072.0
+            "1": 3121186304.0,
+            "2": 3272137728.0,
+            "3": 3305329664.0,
+            "4": 3305329664.0,
+            "5": 3309687808.0,
+            "6": 3309687808.0,
+            "7": 3309687808.0,
+            "8": 3309687808.0,
+            "9": 3309687808.0,
+            "10": 3309926912.0,
+            "11": 3309926912.0,
+            "12": 3309926912.0,
+            "13": 3309926912.0,
+            "14": 3309926912.0,
+            "15": 3309926912.0,
+            "16": 3309926912.0,
+            "17": 3318584832.0,
+            "18": 3318584832.0,
+            "19": 3318584832.0,
+            "20": 3318584832.0,
+            "21": 3318584832.0,
+            "22": 3346422784.0,
+            "23": 3346422784.0,
+            "24": 3346422784.0,
+            "25": 3392057856.0,
+            "26": 3392057856.0,
+            "27": 3392057856.0,
+            "28": 3392057856.0,
+            "29": 3392057856.0,
+            "30": 3392057856.0,
+            "31": 3392057856.0,
+            "32": 3392057856.0,
+            "33": 3392057856.0,
+            "34": 3392057856.0,
+            "35": 3392057856.0,
+            "36": 3392057856.0,
+            "37": 3392057856.0,
+            "38": 3392057856.0,
+            "39": 3392057856.0,
+            "40": 3392057856.0,
+            "41": 3392057856.0,
+            "42": 3392057856.0,
+            "43": 3392057856.0,
+            "44": 3392057856.0,
+            "45": 3392057856.0,
+            "46": 3392057856.0,
+            "47": 3392057856.0,
+            "48": 3392057856.0,
+            "49": 3392057856.0,
+            "50": 3392057856.0,
+            "51": 3392057856.0,
+            "52": 3392057856.0,
+            "53": 3392057856.0,
+            "54": 3392057856.0,
+            "55": 3392057856.0,
+            "56": 3392057856.0,
+            "57": 3392057856.0,
+            "58": 3392057856.0,
+            "59": 3392057856.0,
+            "60": 3392057856.0,
+            "61": 3392057856.0,
+            "62": 3392057856.0,
+            "63": 3392057856.0,
+            "64": 3392057856.0,
+            "65": 3392057856.0,
+            "66": 3392057856.0,
+            "67": 3392057856.0,
+            "68": 3392057856.0,
+            "69": 3392057856.0,
+            "70": 3392057856.0,
+            "71": 3392057856.0,
+            "72": 3392057856.0,
+            "73": 3392057856.0,
+            "74": 3392057856.0,
+            "75": 3392057856.0,
+            "76": 3392057856.0,
+            "77": 3392057856.0,
+            "78": 3392057856.0,
+            "79": 3392057856.0,
+            "80": 3392057856.0,
+            "81": 3392057856.0,
+            "82": 3392057856.0,
+            "83": 3392057856.0,
+            "84": 3392057856.0,
+            "85": 3392057856.0,
+            "86": 3392057856.0,
+            "87": 3392057856.0,
+            "88": 3392057856.0,
+            "89": 3392057856.0,
+            "90": 3392057856.0,
+            "91": 3392057856.0,
+            "92": 3392057856.0,
+            "93": 3392057856.0,
+            "94": 3392057856.0,
+            "95": 3392057856.0,
+            "96": 3392057856.0,
+            "97": 3392057856.0,
+            "98": 3392057856.0,
+            "99": 3392057856.0,
+            "100": 3392057856.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 12.96093,
-            "2": 0.20892,
-            "3": 0.18473,
-            "4": 0.18131,
-            "5": 0.18523,
-            "6": 0.15261,
-            "7": 0.15478,
-            "8": 0.15961,
-            "9": 0.14304,
-            "10": 0.14479,
-            "11": 0.14001,
-            "12": 0.14477,
-            "13": 0.13539,
-            "14": 0.14122,
-            "15": 0.12814,
-            "16": 0.1422,
-            "17": 0.14026,
-            "18": 0.1393,
-            "19": 0.13844,
-            "20": 0.14704,
-            "21": 0.13226,
-            "22": 0.12909,
-            "23": 0.13878,
-            "24": 0.13814,
-            "25": 0.13861,
-            "26": 0.14021,
-            "27": 0.15004,
-            "28": 0.14508,
-            "29": 0.15539,
-            "30": 0.14923,
-            "31": 0.15897,
-            "32": 0.14709,
-            "33": 0.15008,
-            "34": 0.14672,
-            "35": 0.15075,
-            "36": 0.15567,
-            "37": 0.14723,
-            "38": 0.15175,
-            "39": 0.14843,
-            "40": 0.15144,
-            "41": 0.14498,
-            "42": 0.15026,
-            "43": 0.15467,
-            "44": 0.14949,
-            "45": 0.14547,
-            "46": 0.16159,
-            "47": 0.14865,
-            "48": 0.13694,
-            "49": 0.1448,
-            "50": 0.14252,
-            "51": 0.1539,
-            "52": 0.14596,
-            "53": 0.14405,
-            "54": 0.13597,
-            "55": 0.13684,
-            "56": 0.1422,
-            "57": 0.14574,
-            "58": 0.15689,
-            "59": 0.14026,
-            "60": 0.15291,
-            "61": 0.14644,
-            "62": 0.14867,
-            "63": 0.14378,
-            "64": 0.14841,
-            "65": 0.13208,
-            "66": 0.13289,
-            "67": 0.13565,
-            "68": 0.13616,
-            "69": 0.1404,
-            "70": 0.15207,
-            "71": 0.12955,
-            "72": 0.13978,
-            "73": 0.13699,
-            "74": 0.13757,
-            "75": 0.13284,
-            "76": 0.12662,
-            "77": 0.13897,
-            "78": 0.13046,
-            "79": 0.13331,
-            "80": 0.13187,
-            "81": 0.13684,
-            "82": 0.12702,
-            "83": 0.13369,
-            "84": 0.14567,
-            "85": 0.13204,
-            "86": 0.12582,
-            "87": 0.12655,
-            "88": 0.13008,
-            "89": 0.12999,
-            "90": 0.13521,
-            "91": 0.12701,
-            "92": 0.13282,
-            "93": 0.12621,
-            "94": 0.12513,
-            "95": 0.12172,
-            "96": 0.12142,
-            "97": 0.13611,
-            "98": 0.12449,
-            "99": 0.12809,
-            "100": 0.12496
+            "1": 12.9672,
+            "2": 0.18032,
+            "3": 0.16621,
+            "4": 0.14138,
+            "5": 0.14697,
+            "6": 0.12745,
+            "7": 0.13018,
+            "8": 0.1308,
+            "9": 0.12325,
+            "10": 0.11929,
+            "11": 0.11868,
+            "12": 0.11662,
+            "13": 0.11935,
+            "14": 0.12579,
+            "15": 0.10685,
+            "16": 0.1235,
+            "17": 0.11712,
+            "18": 0.11351,
+            "19": 0.11956,
+            "20": 0.12036,
+            "21": 0.11206,
+            "22": 0.12061,
+            "23": 0.11918,
+            "24": 0.11718,
+            "25": 0.11286,
+            "26": 0.11553,
+            "27": 0.12325,
+            "28": 0.12425,
+            "29": 0.1373,
+            "30": 0.14042,
+            "31": 0.12588,
+            "32": 0.12886,
+            "33": 0.11871,
+            "34": 0.1268,
+            "35": 0.12631,
+            "36": 0.13682,
+            "37": 0.12561,
+            "38": 0.12806,
+            "39": 0.13203,
+            "40": 0.13218,
+            "41": 0.12224,
+            "42": 0.13858,
+            "43": 0.13174,
+            "44": 0.12012,
+            "45": 0.12567,
+            "46": 0.13565,
+            "47": 0.12427,
+            "48": 0.11574,
+            "49": 0.11974,
+            "50": 0.12631,
+            "51": 0.14169,
+            "52": 0.11509,
+            "53": 0.1256,
+            "54": 0.1169,
+            "55": 0.12608,
+            "56": 0.11705,
+            "57": 0.12085,
+            "58": 0.11877,
+            "59": 0.1187,
+            "60": 0.12978,
+            "61": 0.11339,
+            "62": 0.1117,
+            "63": 0.12276,
+            "64": 0.12623,
+            "65": 0.1311,
+            "66": 0.1174,
+            "67": 0.12925,
+            "68": 0.11502,
+            "69": 0.1185,
+            "70": 0.12525,
+            "71": 0.10756,
+            "72": 0.11771,
+            "73": 0.1132,
+            "74": 0.12549,
+            "75": 0.10854,
+            "76": 0.11252,
+            "77": 0.11354,
+            "78": 0.10942,
+            "79": 0.11618,
+            "80": 0.1066,
+            "81": 0.11024,
+            "82": 0.10189,
+            "83": 0.10909,
+            "84": 0.14864,
+            "85": 0.10374,
+            "86": 0.10395,
+            "87": 0.10291,
+            "88": 0.11323,
+            "89": 0.10749,
+            "90": 0.10777,
+            "91": 0.10528,
+            "92": 0.10628,
+            "93": 0.10398,
+            "94": 0.11116,
+            "95": 0.10621,
+            "96": 0.11081,
+            "97": 0.11111,
+            "98": 0.09872,
+            "99": 0.1051,
+            "100": 0.10136
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..7bfccdb49b6
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.88487,
+            "52": 9.78018,
+            "53": 10.10226,
+            "54": 9.9873,
+            "55": 9.9027,
+            "56": 9.66818,
+            "57": 9.53521,
+            "58": 9.89495,
+            "59": 9.6289,
+            "60": 9.54307,
+            "61": 9.72725,
+            "62": 10.03319,
+            "63": 9.45201,
+            "64": 9.83185,
+            "65": 8.99108,
+            "66": 9.76421,
+            "67": 9.40334,
+            "68": 9.83107,
+            "69": 9.81874,
+            "70": 9.77252,
+            "71": 9.65812,
+            "72": 9.64065,
+            "73": 9.5512,
+            "74": 9.02044,
+            "75": 9.47713,
+            "76": 9.13591,
+            "77": 10.09778,
+            "78": 9.75282,
+            "79": 9.41686,
+            "80": 9.45072,
+            "81": 9.52034,
+            "82": 9.73197,
+            "83": 9.36926,
+            "84": 9.4504,
+            "85": 9.65212,
+            "86": 9.11237,
+            "87": 9.61129,
+            "88": 9.78679,
+            "89": 9.64613,
+            "90": 9.83484,
+            "91": 9.39422,
+            "92": 9.39187,
+            "93": 9.12787,
+            "94": 8.86646,
+            "95": 9.54348,
+            "96": 9.55708,
+            "97": 9.33174,
+            "98": 9.6919,
+            "99": 8.92043,
+            "100": 9.41916
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1311.0,
+            "52": 1269.0,
+            "53": 1392.0,
+            "54": 1266.0,
+            "55": 1197.0,
+            "56": 1294.0,
+            "57": 1125.0,
+            "58": 1380.0,
+            "59": 1335.0,
+            "60": 1070.0,
+            "61": 1317.0,
+            "62": 1323.0,
+            "63": 1177.0,
+            "64": 1464.0,
+            "65": 1297.0,
+            "66": 1459.0,
+            "67": 1319.0,
+            "68": 1281.0,
+            "69": 1361.0,
+            "70": 1439.0,
+            "71": 1408.0,
+            "72": 1131.0,
+            "73": 1261.0,
+            "74": 918.0,
+            "75": 1051.0,
+            "76": 1288.0,
+            "77": 1472.0,
+            "78": 1433.0,
+            "79": 1433.0,
+            "80": 1350.0,
+            "81": 1576.0,
+            "82": 1414.0,
+            "83": 1205.0,
+            "84": 1485.0,
+            "85": 1339.0,
+            "86": 1265.0,
+            "87": 1538.0,
+            "88": 1462.0,
+            "89": 1499.0,
+            "90": 1289.0,
+            "91": 1052.0,
+            "92": 1303.0,
+            "93": 1235.0,
+            "94": 1301.0,
+            "95": 1386.0,
+            "96": 2364.0,
+            "97": 1408.0,
+            "98": 2551.0,
+            "99": 1263.0,
+            "100": 1227.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 788526080.0,
+            "52": 788485632.0,
+            "53": 788519424.0,
+            "54": 788510720.0,
+            "55": 788506112.0,
+            "56": 788464640.0,
+            "57": 788461056.0,
+            "58": 788504576.0,
+            "59": 788507136.0,
+            "60": 788494336.0,
+            "61": 788467200.0,
+            "62": 788502528.0,
+            "63": 788454400.0,
+            "64": 788470784.0,
+            "65": 788413440.0,
+            "66": 788452864.0,
+            "67": 788452352.0,
+            "68": 788460544.0,
+            "69": 788475904.0,
+            "70": 788502016.0,
+            "71": 788458496.0,
+            "72": 788420096.0,
+            "73": 788444672.0,
+            "74": 788420096.0,
+            "75": 788447232.0,
+            "76": 788417536.0,
+            "77": 788473856.0,
+            "78": 788468224.0,
+            "79": 788419072.0,
+            "80": 788415488.0,
+            "81": 788433408.0,
+            "82": 788442112.0,
+            "83": 788438016.0,
+            "84": 788473856.0,
+            "85": 788463616.0,
+            "86": 788398080.0,
+            "87": 788495360.0,
+            "88": 788497408.0,
+            "89": 788505088.0,
+            "90": 788536832.0,
+            "91": 788516864.0,
+            "92": 788522496.0,
+            "93": 788492288.0,
+            "94": 788513792.0,
+            "95": 788550656.0,
+            "96": 788569088.0,
+            "97": 788584448.0,
+            "98": 788596736.0,
+            "99": 788518912.0,
+            "100": 788578304.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3260852736.0,
+            "52": 3268460544.0,
+            "53": 3289748992.0,
+            "54": 3289748992.0,
+            "55": 3289748992.0,
+            "56": 3289748992.0,
+            "57": 3289748992.0,
+            "58": 3289748992.0,
+            "59": 3289748992.0,
+            "60": 3289748992.0,
+            "61": 3289748992.0,
+            "62": 3289748992.0,
+            "63": 3289748992.0,
+            "64": 3289748992.0,
+            "65": 3289748992.0,
+            "66": 3289748992.0,
+            "67": 3289748992.0,
+            "68": 3289748992.0,
+            "69": 3289748992.0,
+            "70": 3289748992.0,
+            "71": 3289748992.0,
+            "72": 3289748992.0,
+            "73": 3289748992.0,
+            "74": 3289748992.0,
+            "75": 3289748992.0,
+            "76": 3289748992.0,
+            "77": 3289748992.0,
+            "78": 3289748992.0,
+            "79": 3289748992.0,
+            "80": 3289748992.0,
+            "81": 3289748992.0,
+            "82": 3289748992.0,
+            "83": 3289748992.0,
+            "84": 3289748992.0,
+            "85": 3289748992.0,
+            "86": 3289748992.0,
+            "87": 3289748992.0,
+            "88": 3289748992.0,
+            "89": 3289748992.0,
+            "90": 3304260608.0,
+            "91": 3304260608.0,
+            "92": 3304260608.0,
+            "93": 3304260608.0,
+            "94": 3304260608.0,
+            "95": 3317049856.0,
+            "96": 3327264256.0,
+            "97": 3342199296.0,
+            "98": 3342199296.0,
+            "99": 3342199296.0,
+            "100": 3342199296.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.41223,
+            "52": 0.19638,
+            "53": 0.16863,
+            "54": 0.16115,
+            "55": 0.16098,
+            "56": 0.14835,
+            "57": 0.14976,
+            "58": 0.1434,
+            "59": 0.15979,
+            "60": 0.14662,
+            "61": 0.13636,
+            "62": 0.13903,
+            "63": 0.14463,
+            "64": 0.12921,
+            "65": 0.14012,
+            "66": 0.1288,
+            "67": 0.13615,
+            "68": 0.12598,
+            "69": 0.12709,
+            "70": 0.13652,
+            "71": 0.12173,
+            "72": 0.13319,
+            "73": 0.12379,
+            "74": 0.13482,
+            "75": 0.1344,
+            "76": 0.11894,
+            "77": 0.13537,
+            "78": 0.12153,
+            "79": 0.12133,
+            "80": 0.11937,
+            "81": 0.11569,
+            "82": 0.11902,
+            "83": 0.12127,
+            "84": 0.1134,
+            "85": 0.10983,
+            "86": 0.12467,
+            "87": 0.10796,
+            "88": 0.11354,
+            "89": 0.11117,
+            "90": 0.1179,
+            "91": 0.10903,
+            "92": 0.10919,
+            "93": 0.11161,
+            "94": 0.11589,
+            "95": 0.11757,
+            "96": 0.11512,
+            "97": 0.11492,
+            "98": 0.1084,
+            "99": 0.12117,
+            "100": 0.10905
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..fe8d3f78926
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.80397,
+            "2": 10.81064,
+            "3": 10.79812,
+            "4": 10.78677,
+            "5": 10.82981,
+            "6": 10.85273,
+            "7": 10.80976,
+            "8": 10.80152,
+            "9": 10.82476,
+            "10": 10.78235,
+            "11": 10.83837,
+            "12": 10.84645,
+            "13": 10.86121,
+            "14": 10.86494,
+            "15": 10.83809,
+            "16": 10.8346,
+            "17": 10.8121,
+            "18": 10.84437,
+            "19": 10.83592,
+            "20": 10.81732,
+            "21": 10.83519,
+            "22": 10.76256,
+            "23": 10.85511,
+            "24": 10.80666,
+            "25": 10.80025,
+            "26": 10.81426,
+            "27": 10.82995,
+            "28": 10.855,
+            "29": 10.86565,
+            "30": 10.79384,
+            "31": 10.74771,
+            "32": 10.84943,
+            "33": 10.83771,
+            "34": 10.80572,
+            "35": 10.80265,
+            "36": 10.79622,
+            "37": 10.82514,
+            "38": 10.79237,
+            "39": 10.84811,
+            "40": 10.77883,
+            "41": 10.79922,
+            "42": 10.81563,
+            "43": 10.74376,
+            "44": 10.76683,
+            "45": 10.76467,
+            "46": 10.77697,
+            "47": 10.79973,
+            "48": 10.77586,
+            "49": 10.72215,
+            "50": 10.78584,
+            "51": 10.78731,
+            "52": 10.7657,
+            "53": 10.81241,
+            "54": 10.79761,
+            "55": 10.80688,
+            "56": 10.75611,
+            "57": 10.71341,
+            "58": 10.78104,
+            "59": 10.7507,
+            "60": 10.72941,
+            "61": 10.76448,
+            "62": 10.8119,
+            "63": 10.69242,
+            "64": 10.76661,
+            "65": 10.62474,
+            "66": 10.75342,
+            "67": 10.69134,
+            "68": 10.77079,
+            "69": 10.76029,
+            "70": 10.76451,
+            "71": 10.73531,
+            "72": 10.72951,
+            "73": 10.7174,
+            "74": 10.57782,
+            "75": 10.68245,
+            "76": 10.61342,
+            "77": 10.80749,
+            "78": 10.7321,
+            "79": 10.66078,
+            "80": 10.68008,
+            "81": 10.69796,
+            "82": 10.72301,
+            "83": 10.6413,
+            "84": 10.6619,
+            "85": 10.70249,
+            "86": 10.58035,
+            "87": 10.69015,
+            "88": 10.73441,
+            "89": 10.67777,
+            "90": 10.74269,
+            "91": 10.62186,
+            "92": 10.63964,
+            "93": 10.56627,
+            "94": 10.49913,
+            "95": 10.65738,
+            "96": 10.65873,
+            "97": 10.57872,
+            "98": 10.6722,
+            "99": 10.4802,
+            "100": 10.59334
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1158.0,
+            "2": 1181.0,
+            "3": 1248.0,
+            "4": 1219.0,
+            "5": 1372.0,
+            "6": 1529.0,
+            "7": 1362.0,
+            "8": 1179.0,
+            "9": 1276.0,
+            "10": 1234.0,
+            "11": 1281.0,
+            "12": 1249.0,
+            "13": 1386.0,
+            "14": 1213.0,
+            "15": 1215.0,
+            "16": 1299.0,
+            "17": 1242.0,
+            "18": 1233.0,
+            "19": 1167.0,
+            "20": 1392.0,
+            "21": 1264.0,
+            "22": 1289.0,
+            "23": 1336.0,
+            "24": 1168.0,
+            "25": 1170.0,
+            "26": 1207.0,
+            "27": 1192.0,
+            "28": 1327.0,
+            "29": 1354.0,
+            "30": 1250.0,
+            "31": 1110.0,
+            "32": 1331.0,
+            "33": 1340.0,
+            "34": 1250.0,
+            "35": 1105.0,
+            "36": 1138.0,
+            "37": 1265.0,
+            "38": 1375.0,
+            "39": 1243.0,
+            "40": 1306.0,
+            "41": 1154.0,
+            "42": 1251.0,
+            "43": 1122.0,
+            "44": 1139.0,
+            "45": 1122.0,
+            "46": 1203.0,
+            "47": 1405.0,
+            "48": 1282.0,
+            "49": 1167.0,
+            "50": 1166.0,
+            "51": 1249.0,
+            "52": 1320.0,
+            "53": 1340.0,
+            "54": 1232.0,
+            "55": 1103.0,
+            "56": 1275.0,
+            "57": 1194.0,
+            "58": 1259.0,
+            "59": 1283.0,
+            "60": 1265.0,
+            "61": 1124.0,
+            "62": 1349.0,
+            "63": 1132.0,
+            "64": 1272.0,
+            "65": 1017.0,
+            "66": 1174.0,
+            "67": 1242.0,
+            "68": 1291.0,
+            "69": 1295.0,
+            "70": 1143.0,
+            "71": 1148.0,
+            "72": 1266.0,
+            "73": 1199.0,
+            "74": 1133.0,
+            "75": 1346.0,
+            "76": 1224.0,
+            "77": 1329.0,
+            "78": 1256.0,
+            "79": 997.0,
+            "80": 1093.0,
+            "81": 1204.0,
+            "82": 1213.0,
+            "83": 1128.0,
+            "84": 1228.0,
+            "85": 1316.0,
+            "86": 1101.0,
+            "87": 1278.0,
+            "88": 1286.0,
+            "89": 1163.0,
+            "90": 1415.0,
+            "91": 1248.0,
+            "92": 1137.0,
+            "93": 912.0,
+            "94": 985.0,
+            "95": 1097.0,
+            "96": 1087.0,
+            "97": 1098.0,
+            "98": 1170.0,
+            "99": 1047.0,
+            "100": 1205.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1095588352.0,
+            "2": 1095600640.0,
+            "3": 1095576576.0,
+            "4": 1095606272.0,
+            "5": 1095514624.0,
+            "6": 1095542272.0,
+            "7": 1095563776.0,
+            "8": 1095580160.0,
+            "9": 1095585792.0,
+            "10": 1095554048.0,
+            "11": 1095538176.0,
+            "12": 1095523328.0,
+            "13": 1095589888.0,
+            "14": 1095519744.0,
+            "15": 1095557120.0,
+            "16": 1095548928.0,
+            "17": 1095531008.0,
+            "18": 1095528448.0,
+            "19": 1095549440.0,
+            "20": 1095504384.0,
+            "21": 1095561728.0,
+            "22": 1095583232.0,
+            "23": 1095534592.0,
+            "24": 1095604736.0,
+            "25": 1095577088.0,
+            "26": 1095565824.0,
+            "27": 1095591424.0,
+            "28": 1095485952.0,
+            "29": 1095502848.0,
+            "30": 1095552512.0,
+            "31": 1095588352.0,
+            "32": 1095491072.0,
+            "33": 1095547392.0,
+            "34": 1095568384.0,
+            "35": 1095527424.0,
+            "36": 1095533568.0,
+            "37": 1095482880.0,
+            "38": 1095552000.0,
+            "39": 1095532544.0,
+            "40": 1095567360.0,
+            "41": 1095537152.0,
+            "42": 1095543296.0,
+            "43": 1095581184.0,
+            "44": 1095620096.0,
+            "45": 1095569408.0,
+            "46": 1095584768.0,
+            "47": 1095573504.0,
+            "48": 1095577088.0,
+            "49": 1095530496.0,
+            "50": 1095540736.0,
+            "51": 1095570944.0,
+            "52": 1095538176.0,
+            "53": 1095597568.0,
+            "54": 1095536640.0,
+            "55": 1095517184.0,
+            "56": 1095566848.0,
+            "57": 1095645696.0,
+            "58": 1095634944.0,
+            "59": 1095548928.0,
+            "60": 1095562752.0,
+            "61": 1095553536.0,
+            "62": 1095572480.0,
+            "63": 1095573504.0,
+            "64": 1095550464.0,
+            "65": 1095578112.0,
+            "66": 1095531008.0,
+            "67": 1095568896.0,
+            "68": 1095566848.0,
+            "69": 1095527424.0,
+            "70": 1095532032.0,
+            "71": 1095520768.0,
+            "72": 1095548928.0,
+            "73": 1095569920.0,
+            "74": 1095596032.0,
+            "75": 1095538688.0,
+            "76": 1095584768.0,
+            "77": 1095507968.0,
+            "78": 1095514624.0,
+            "79": 1095515648.0,
+            "80": 1095551488.0,
+            "81": 1095513600.0,
+            "82": 1095498240.0,
+            "83": 1095558656.0,
+            "84": 1095569408.0,
+            "85": 1095576064.0,
+            "86": 1095590400.0,
+            "87": 1095523840.0,
+            "88": 1095517696.0,
+            "89": 1095539712.0,
+            "90": 1095528960.0,
+            "91": 1095550976.0,
+            "92": 1095561216.0,
+            "93": 1095579136.0,
+            "94": 1095564288.0,
+            "95": 1095510528.0,
+            "96": 1095502336.0,
+            "97": 1095537152.0,
+            "98": 1095496192.0,
+            "99": 1095577600.0,
+            "100": 1095598592.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3125957632.0,
+            "2": 3477050368.0,
+            "3": 3477050368.0,
+            "4": 3481636352.0,
+            "5": 3481636352.0,
+            "6": 3481636352.0,
+            "7": 3481636352.0,
+            "8": 3481636352.0,
+            "9": 3481636352.0,
+            "10": 3481636352.0,
+            "11": 3481636352.0,
+            "12": 3481636352.0,
+            "13": 3481636352.0,
+            "14": 3481636352.0,
+            "15": 3481636352.0,
+            "16": 3481636352.0,
+            "17": 3481636352.0,
+            "18": 3481636352.0,
+            "19": 3481636352.0,
+            "20": 3481636352.0,
+            "21": 3481636352.0,
+            "22": 3481636352.0,
+            "23": 3481636352.0,
+            "24": 3482527744.0,
+            "25": 3482527744.0,
+            "26": 3482527744.0,
+            "27": 3482527744.0,
+            "28": 3482527744.0,
+            "29": 3482527744.0,
+            "30": 3482527744.0,
+            "31": 3482527744.0,
+            "32": 3482527744.0,
+            "33": 3482527744.0,
+            "34": 3482527744.0,
+            "35": 3482527744.0,
+            "36": 3482527744.0,
+            "37": 3482527744.0,
+            "38": 3482527744.0,
+            "39": 3482527744.0,
+            "40": 3482527744.0,
+            "41": 3482527744.0,
+            "42": 3482527744.0,
+            "43": 3482527744.0,
+            "44": 3495770112.0,
+            "45": 3495770112.0,
+            "46": 3495770112.0,
+            "47": 3495770112.0,
+            "48": 3495770112.0,
+            "49": 3495770112.0,
+            "50": 3495770112.0,
+            "51": 3495770112.0,
+            "52": 3495770112.0,
+            "53": 3495770112.0,
+            "54": 3495770112.0,
+            "55": 3495770112.0,
+            "56": 3495770112.0,
+            "57": 3505988608.0,
+            "58": 3505988608.0,
+            "59": 3505988608.0,
+            "60": 3505988608.0,
+            "61": 3505988608.0,
+            "62": 3505988608.0,
+            "63": 3505988608.0,
+            "64": 3505988608.0,
+            "65": 3505988608.0,
+            "66": 3505988608.0,
+            "67": 3505988608.0,
+            "68": 3505988608.0,
+            "69": 3505988608.0,
+            "70": 3505988608.0,
+            "71": 3505988608.0,
+            "72": 3505988608.0,
+            "73": 3505988608.0,
+            "74": 3505988608.0,
+            "75": 3505988608.0,
+            "76": 3505988608.0,
+            "77": 3505988608.0,
+            "78": 3505988608.0,
+            "79": 3505988608.0,
+            "80": 3505988608.0,
+            "81": 3505988608.0,
+            "82": 3505988608.0,
+            "83": 3505988608.0,
+            "84": 3505988608.0,
+            "85": 3505988608.0,
+            "86": 3505988608.0,
+            "87": 3505988608.0,
+            "88": 3505988608.0,
+            "89": 3505988608.0,
+            "90": 3505988608.0,
+            "91": 3505988608.0,
+            "92": 3505988608.0,
+            "93": 3505988608.0,
+            "94": 3505988608.0,
+            "95": 3505988608.0,
+            "96": 3505988608.0,
+            "97": 3505988608.0,
+            "98": 3505988608.0,
+            "99": 3505988608.0,
+            "100": 3505988608.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 4.71174,
+            "3": 0.47502,
+            "4": 0.44931,
+            "5": 0.44277,
+            "6": 0.44844,
+            "7": 0.45785,
+            "8": 0.44209,
+            "9": 0.43757,
+            "10": 0.42772,
+            "11": 0.44315,
+            "12": 0.42725,
+            "13": 0.42666,
+            "14": 0.41928,
+            "15": 0.42831,
+            "16": 0.42799,
+            "17": 0.42051,
+            "18": 0.41469,
+            "19": 0.41876,
+            "20": 0.41842,
+            "21": 0.43095,
+            "22": 0.41003,
+            "23": 0.41066,
+            "24": 0.41091,
+            "25": 0.40849,
+            "26": 0.4098,
+            "27": 0.41447,
+            "28": 0.4098,
+            "29": 0.40395,
+            "30": 0.41016,
+            "31": 0.41347,
+            "32": 0.40916,
+            "33": 0.41299,
+            "34": 0.40596,
+            "35": 0.40696,
+            "36": 0.40868,
+            "37": 0.40718,
+            "38": 0.40736,
+            "39": 0.40604,
+            "40": 0.40127,
+            "41": 0.4,
+            "42": 0.40197,
+            "43": 0.40902,
+            "44": 0.40712,
+            "45": 0.4098,
+            "46": 0.40168,
+            "47": 0.40487,
+            "48": 0.40622,
+            "49": 0.4089,
+            "50": 0.40406,
+            "51": 0.41118,
+            "52": 0.40412,
+            "53": 0.40027,
+            "54": 0.40192,
+            "55": 0.39782,
+            "56": 0.39731,
+            "57": 0.39836,
+            "58": 0.40128,
+            "59": 0.39958,
+            "60": 0.39863,
+            "61": 0.78712,
+            "62": 0.39887,
+            "63": 0.39967,
+            "64": 0.40024,
+            "65": 0.39891,
+            "66": 0.40058,
+            "67": 0.80982,
+            "68": 0.39889,
+            "69": 0.39895,
+            "70": 0.40201,
+            "71": 0.39871,
+            "72": 0.39819,
+            "73": 0.40638,
+            "74": 0.40241,
+            "75": 0.39867,
+            "76": 0.40192,
+            "77": 0.4032,
+            "78": 0.39871,
+            "79": 0.96252,
+            "80": 0.39811,
+            "81": 0.40176,
+            "82": 0.39856,
+            "83": 0.40217,
+            "84": 0.3966,
+            "85": 0.40212,
+            "86": 0.40144,
+            "87": 0.39779,
+            "88": 0.3989,
+            "89": 0.39982,
+            "90": 0.40291,
+            "91": 0.40052,
+            "92": 0.39772,
+            "93": 0.40147,
+            "94": 0.40072,
+            "95": 0.40007,
+            "96": 0.40232,
+            "97": 0.40777,
+            "98": 0.4002,
+            "99": 0.39995,
+            "100": 0.39879
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..7d62923f634
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81103,
+            "2": 10.83065,
+            "3": 10.82041,
+            "4": 10.81308,
+            "5": 10.84384,
+            "6": 10.84719,
+            "7": 10.85346,
+            "8": 10.83656,
+            "9": 10.84673,
+            "10": 10.78252,
+            "11": 10.85208,
+            "12": 10.86326,
+            "13": 10.85438,
+            "14": 10.88369,
+            "15": 10.87797,
+            "16": 10.84675,
+            "17": 10.83091,
+            "18": 10.86618,
+            "19": 10.84893,
+            "20": 10.84566,
+            "21": 10.8476,
+            "22": 10.79629,
+            "23": 10.88263,
+            "24": 10.83271,
+            "25": 10.82477,
+            "26": 10.84285,
+            "27": 10.85338,
+            "28": 10.87687,
+            "29": 10.86419,
+            "30": 10.81306,
+            "31": 10.78638,
+            "32": 10.85513,
+            "33": 10.85601,
+            "34": 10.8492,
+            "35": 10.83702,
+            "36": 10.80421,
+            "37": 10.83785,
+            "38": 10.80484,
+            "39": 10.84147,
+            "40": 10.80332,
+            "41": 10.83988,
+            "42": 10.84406,
+            "43": 10.81051,
+            "44": 10.8022,
+            "45": 10.78682,
+            "46": 10.80806,
+            "47": 10.81757,
+            "48": 10.80298,
+            "49": 10.78142,
+            "50": 10.8023,
+            "51": 10.82205,
+            "52": 10.80341,
+            "53": 10.83273,
+            "54": 10.81558,
+            "55": 10.82551,
+            "56": 10.77761,
+            "57": 10.7527,
+            "58": 10.80801,
+            "59": 10.79071,
+            "60": 10.73971,
+            "61": 10.80018,
+            "62": 10.81299,
+            "63": 10.72069,
+            "64": 10.78573,
+            "65": 10.69001,
+            "66": 10.76067,
+            "67": 10.73433,
+            "68": 10.80225,
+            "69": 10.7835,
+            "70": 10.77632,
+            "71": 10.76604,
+            "72": 10.736,
+            "73": 10.72965,
+            "74": 10.62244,
+            "75": 10.69059,
+            "76": 10.65429,
+            "77": 10.82179,
+            "78": 10.76341,
+            "79": 10.70461,
+            "80": 10.69433,
+            "81": 10.72473,
+            "82": 10.74232,
+            "83": 10.66784,
+            "84": 10.69896,
+            "85": 10.7144,
+            "86": 10.63886,
+            "87": 10.71783,
+            "88": 10.73541,
+            "89": 10.7139,
+            "90": 10.74667,
+            "91": 10.64906,
+            "92": 10.64667,
+            "93": 10.60204,
+            "94": 10.53296,
+            "95": 10.66128,
+            "96": 10.67208,
+            "97": 10.61439,
+            "98": 10.68466,
+            "99": 10.52017,
+            "100": 10.61535
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1248.0,
+            "2": 1334.0,
+            "3": 1279.0,
+            "4": 1251.0,
+            "5": 1382.0,
+            "6": 1478.0,
+            "7": 1205.0,
+            "8": 1414.0,
+            "9": 1410.0,
+            "10": 1302.0,
+            "11": 1305.0,
+            "12": 1282.0,
+            "13": 1254.0,
+            "14": 1432.0,
+            "15": 1176.0,
+            "16": 1222.0,
+            "17": 1256.0,
+            "18": 1355.0,
+            "19": 1265.0,
+            "20": 1285.0,
+            "21": 1258.0,
+            "22": 1186.0,
+            "23": 1312.0,
+            "24": 1329.0,
+            "25": 1270.0,
+            "26": 1206.0,
+            "27": 1432.0,
+            "28": 1433.0,
+            "29": 1297.0,
+            "30": 1191.0,
+            "31": 1166.0,
+            "32": 1273.0,
+            "33": 1273.0,
+            "34": 1165.0,
+            "35": 1200.0,
+            "36": 1216.0,
+            "37": 1241.0,
+            "38": 1343.0,
+            "39": 1544.0,
+            "40": 1200.0,
+            "41": 1350.0,
+            "42": 1218.0,
+            "43": 1213.0,
+            "44": 1223.0,
+            "45": 1179.0,
+            "46": 1211.0,
+            "47": 1353.0,
+            "48": 1180.0,
+            "49": 1180.0,
+            "50": 1182.0,
+            "51": 1221.0,
+            "52": 1192.0,
+            "53": 1460.0,
+            "54": 1267.0,
+            "55": 1209.0,
+            "56": 1312.0,
+            "57": 1287.0,
+            "58": 1291.0,
+            "59": 1292.0,
+            "60": 1229.0,
+            "61": 1153.0,
+            "62": 1228.0,
+            "63": 1200.0,
+            "64": 1307.0,
+            "65": 1183.0,
+            "66": 1202.0,
+            "67": 1163.0,
+            "68": 1246.0,
+            "69": 1316.0,
+            "70": 1336.0,
+            "71": 1209.0,
+            "72": 1196.0,
+            "73": 1115.0,
+            "74": 1121.0,
+            "75": 1276.0,
+            "76": 1299.0,
+            "77": 1349.0,
+            "78": 1322.0,
+            "79": 1092.0,
+            "80": 1223.0,
+            "81": 1098.0,
+            "82": 1237.0,
+            "83": 1317.0,
+            "84": 1179.0,
+            "85": 1286.0,
+            "86": 1152.0,
+            "87": 1188.0,
+            "88": 1294.0,
+            "89": 1227.0,
+            "90": 1392.0,
+            "91": 1150.0,
+            "92": 1268.0,
+            "93": 1105.0,
+            "94": 1010.0,
+            "95": 1265.0,
+            "96": 1276.0,
+            "97": 1181.0,
+            "98": 1194.0,
+            "99": 1221.0,
+            "100": 1285.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1095884800.0,
+            "2": 1095854592.0,
+            "3": 1095902720.0,
+            "4": 1095882752.0,
+            "5": 1095905792.0,
+            "6": 1095847424.0,
+            "7": 1095868928.0,
+            "8": 1095876608.0,
+            "9": 1095891968.0,
+            "10": 1095904768.0,
+            "11": 1095880704.0,
+            "12": 1095849472.0,
+            "13": 1095902720.0,
+            "14": 1095814144.0,
+            "15": 1095857152.0,
+            "16": 1095790592.0,
+            "17": 1095910400.0,
+            "18": 1095884288.0,
+            "19": 1095892480.0,
+            "20": 1095812096.0,
+            "21": 1095834112.0,
+            "22": 1095906816.0,
+            "23": 1095881728.0,
+            "24": 1095824896.0,
+            "25": 1095955968.0,
+            "26": 1095862272.0,
+            "27": 1095919616.0,
+            "28": 1095846400.0,
+            "29": 1095878656.0,
+            "30": 1095843840.0,
+            "31": 1095906816.0,
+            "32": 1095840768.0,
+            "33": 1095851008.0,
+            "34": 1095815680.0,
+            "35": 1095865856.0,
+            "36": 1095880192.0,
+            "37": 1095838720.0,
+            "38": 1095921664.0,
+            "39": 1095930368.0,
+            "40": 1095818240.0,
+            "41": 1095847424.0,
+            "42": 1095864320.0,
+            "43": 1095878144.0,
+            "44": 1095982080.0,
+            "45": 1095855104.0,
+            "46": 1095869952.0,
+            "47": 1095891968.0,
+            "48": 1095878144.0,
+            "49": 1095843840.0,
+            "50": 1095862272.0,
+            "51": 1095934464.0,
+            "52": 1095880192.0,
+            "53": 1095940608.0,
+            "54": 1095887872.0,
+            "55": 1095877632.0,
+            "56": 1095868416.0,
+            "57": 1095913472.0,
+            "58": 1095910912.0,
+            "59": 1095898112.0,
+            "60": 1095865344.0,
+            "61": 1095864320.0,
+            "62": 1095858176.0,
+            "63": 1095872000.0,
+            "64": 1095862272.0,
+            "65": 1095868928.0,
+            "66": 1095877120.0,
+            "67": 1095863808.0,
+            "68": 1095873024.0,
+            "69": 1095859712.0,
+            "70": 1095904768.0,
+            "71": 1095876608.0,
+            "72": 1095805952.0,
+            "73": 1095908352.0,
+            "74": 1095918592.0,
+            "75": 1095884288.0,
+            "76": 1095903744.0,
+            "77": 1095857664.0,
+            "78": 1095914496.0,
+            "79": 1095888896.0,
+            "80": 1095839232.0,
+            "81": 1095875584.0,
+            "82": 1095825408.0,
+            "83": 1095925248.0,
+            "84": 1095904256.0,
+            "85": 1095870976.0,
+            "86": 1095870976.0,
+            "87": 1095893504.0,
+            "88": 1095882240.0,
+            "89": 1095869952.0,
+            "90": 1095860224.0,
+            "91": 1095916032.0,
+            "92": 1095900672.0,
+            "93": 1095878144.0,
+            "94": 1095884800.0,
+            "95": 1095889920.0,
+            "96": 1095886848.0,
+            "97": 1095896576.0,
+            "98": 1095873024.0,
+            "99": 1095887872.0,
+            "100": 1095927808.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 3260419584.0,
+            "2": 3582873600.0,
+            "3": 3615975424.0,
+            "4": 3615975424.0,
+            "5": 3616532480.0,
+            "6": 3616532480.0,
+            "7": 3616532480.0,
+            "8": 3616532480.0,
+            "9": 3616532480.0,
+            "10": 3618800640.0,
+            "11": 3618800640.0,
+            "12": 3618800640.0,
+            "13": 3618800640.0,
+            "14": 3618800640.0,
+            "15": 3618800640.0,
+            "16": 3618800640.0,
+            "17": 3618800640.0,
+            "18": 3618800640.0,
+            "19": 3618800640.0,
+            "20": 3618800640.0,
+            "21": 3618800640.0,
+            "22": 3618800640.0,
+            "23": 3618800640.0,
+            "24": 3618800640.0,
+            "25": 3648277504.0,
+            "26": 3648277504.0,
+            "27": 3648277504.0,
+            "28": 3648277504.0,
+            "29": 3648277504.0,
+            "30": 3648277504.0,
+            "31": 3648277504.0,
+            "32": 3648277504.0,
+            "33": 3648277504.0,
+            "34": 3648277504.0,
+            "35": 3648277504.0,
+            "36": 3648277504.0,
+            "37": 3648277504.0,
+            "38": 3648277504.0,
+            "39": 3648277504.0,
+            "40": 3648277504.0,
+            "41": 3648277504.0,
+            "42": 3648277504.0,
+            "43": 3648277504.0,
+            "44": 3667798528.0,
+            "45": 3667798528.0,
+            "46": 3667798528.0,
+            "47": 3667798528.0,
+            "48": 3667798528.0,
+            "49": 3667798528.0,
+            "50": 3667798528.0,
+            "51": 3667798528.0,
+            "52": 3667798528.0,
+            "53": 3667798528.0,
+            "54": 3667798528.0,
+            "55": 3667798528.0,
+            "56": 3667798528.0,
+            "57": 3667798528.0,
+            "58": 3667798528.0,
+            "59": 3667798528.0,
+            "60": 3667798528.0,
+            "61": 3667798528.0,
+            "62": 3667798528.0,
+            "63": 3667798528.0,
+            "64": 3667798528.0,
+            "65": 3667798528.0,
+            "66": 3667798528.0,
+            "67": 3667798528.0,
+            "68": 3667798528.0,
+            "69": 3667798528.0,
+            "70": 3667798528.0,
+            "71": 3667798528.0,
+            "72": 3667798528.0,
+            "73": 3667798528.0,
+            "74": 3667798528.0,
+            "75": 3667798528.0,
+            "76": 3667798528.0,
+            "77": 3667798528.0,
+            "78": 3667798528.0,
+            "79": 3667798528.0,
+            "80": 3667798528.0,
+            "81": 3667798528.0,
+            "82": 3667798528.0,
+            "83": 3667798528.0,
+            "84": 3667798528.0,
+            "85": 3667798528.0,
+            "86": 3667798528.0,
+            "87": 3667798528.0,
+            "88": 3667798528.0,
+            "89": 3667798528.0,
+            "90": 3667798528.0,
+            "91": 3667798528.0,
+            "92": 3667798528.0,
+            "93": 3667798528.0,
+            "94": 3667798528.0,
+            "95": 3667798528.0,
+            "96": 3667798528.0,
+            "97": 3667798528.0,
+            "98": 3667798528.0,
+            "99": 3667798528.0,
+            "100": 3667798528.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 6.73766,
+            "3": 0.31869,
+            "4": 0.3125,
+            "5": 0.31279,
+            "6": 0.29974,
+            "7": 0.30628,
+            "8": 0.29637,
+            "9": 0.29751,
+            "10": 0.28123,
+            "11": 0.3055,
+            "12": 0.28861,
+            "13": 0.27778,
+            "14": 0.28796,
+            "15": 0.28678,
+            "16": 0.27533,
+            "17": 0.27979,
+            "18": 1.87923,
+            "19": 0.28574,
+            "20": 0.28215,
+            "21": 0.2771,
+            "22": 0.27101,
+            "23": 0.27311,
+            "24": 1.50235,
+            "25": 0.27537,
+            "26": 1.04897,
+            "27": 0.26783,
+            "28": 0.69868,
+            "29": 0.27953,
+            "30": 1.54699,
+            "31": 0.27899,
+            "32": 0.28165,
+            "33": 0.28792,
+            "34": 0.27678,
+            "35": 1.25378,
+            "36": 0.88573,
+            "37": 0.26832,
+            "38": 0.26501,
+            "39": 0.28399,
+            "40": 0.96629,
+            "41": 0.26938,
+            "42": 0.31209,
+            "43": 0.27968,
+            "44": 0.65987,
+            "45": 0.51088,
+            "46": 1.37707,
+            "47": 0.26575,
+            "48": 0.92193,
+            "49": 0.26081,
+            "50": 0.27031,
+            "51": 0.31353,
+            "52": 0.27257,
+            "53": 0.27323,
+            "54": 0.27148,
+            "55": 0.27248,
+            "56": 0.7475,
+            "57": 0.26706,
+            "58": 0.28367,
+            "59": 0.27716,
+            "60": 1.12441,
+            "61": 0.26587,
+            "62": 0.68635,
+            "63": 0.28123,
+            "64": 0.98333,
+            "65": 0.27408,
+            "66": 1.22087,
+            "67": 0.26407,
+            "68": 0.95198,
+            "69": 0.29272,
+            "70": 0.52799,
+            "71": 0.92323,
+            "72": 0.25931,
+            "73": 0.26616,
+            "74": 0.28128,
+            "75": 0.28947,
+            "76": 0.27481,
+            "77": 0.67217,
+            "78": 0.28612,
+            "79": 0.85039,
+            "80": 0.2721,
+            "81": 0.5328,
+            "82": 0.57505,
+            "83": 0.79918,
+            "84": 0.28096,
+            "85": 0.27744,
+            "86": 0.273,
+            "87": 0.33552,
+            "88": 0.48699,
+            "89": 0.28552,
+            "90": 0.50386,
+            "91": 0.27372,
+            "92": 0.64636,
+            "93": 0.26742,
+            "94": 0.2649,
+            "95": 0.49366,
+            "96": 0.36845,
+            "97": 0.29731,
+            "98": 0.53051,
+            "99": 0.26212,
+            "100": 0.75087
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..d8a426b39e0
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.82229,
+            "52": 10.80331,
+            "53": 10.83272,
+            "54": 10.81511,
+            "55": 10.82544,
+            "56": 10.77739,
+            "57": 10.75212,
+            "58": 10.80727,
+            "59": 10.79059,
+            "60": 10.74002,
+            "61": 10.79967,
+            "62": 10.81287,
+            "63": 10.72057,
+            "64": 10.78554,
+            "65": 10.68954,
+            "66": 10.76088,
+            "67": 10.73433,
+            "68": 10.80171,
+            "69": 10.78354,
+            "70": 10.77601,
+            "71": 10.767,
+            "72": 10.73617,
+            "73": 10.72977,
+            "74": 10.62268,
+            "75": 10.69072,
+            "76": 10.65444,
+            "77": 10.82173,
+            "78": 10.76342,
+            "79": 10.70428,
+            "80": 10.69419,
+            "81": 10.72444,
+            "82": 10.74209,
+            "83": 10.66776,
+            "84": 10.69841,
+            "85": 10.71466,
+            "86": 10.63794,
+            "87": 10.71867,
+            "88": 10.73504,
+            "89": 10.71428,
+            "90": 10.74679,
+            "91": 10.64894,
+            "92": 10.64647,
+            "93": 10.60196,
+            "94": 10.53294,
+            "95": 10.66112,
+            "96": 10.6724,
+            "97": 10.61431,
+            "98": 10.68496,
+            "99": 10.52028,
+            "100": 10.61542
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1192.0,
+            "52": 1269.0,
+            "53": 1394.0,
+            "54": 1248.0,
+            "55": 1159.0,
+            "56": 1286.0,
+            "57": 1320.0,
+            "58": 1277.0,
+            "59": 1258.0,
+            "60": 1208.0,
+            "61": 1163.0,
+            "62": 1153.0,
+            "63": 1291.0,
+            "64": 1246.0,
+            "65": 1270.0,
+            "66": 1214.0,
+            "67": 1160.0,
+            "68": 1234.0,
+            "69": 1298.0,
+            "70": 1371.0,
+            "71": 1159.0,
+            "72": 1221.0,
+            "73": 1193.0,
+            "74": 1133.0,
+            "75": 1314.0,
+            "76": 1279.0,
+            "77": 1351.0,
+            "78": 1304.0,
+            "79": 1100.0,
+            "80": 1124.0,
+            "81": 1146.0,
+            "82": 1247.0,
+            "83": 1291.0,
+            "84": 1104.0,
+            "85": 1226.0,
+            "86": 1171.0,
+            "87": 1212.0,
+            "88": 1322.0,
+            "89": 1215.0,
+            "90": 1303.0,
+            "91": 1142.0,
+            "92": 1267.0,
+            "93": 1099.0,
+            "94": 1022.0,
+            "95": 1297.0,
+            "96": 1255.0,
+            "97": 1195.0,
+            "98": 1250.0,
+            "99": 1256.0,
+            "100": 1214.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1095899648.0,
+            "52": 1095847424.0,
+            "53": 1095908352.0,
+            "54": 1095856640.0,
+            "55": 1095843328.0,
+            "56": 1095836160.0,
+            "57": 1095881216.0,
+            "58": 1095879680.0,
+            "59": 1095863296.0,
+            "60": 1095830016.0,
+            "61": 1095828992.0,
+            "62": 1095825920.0,
+            "63": 1095840256.0,
+            "64": 1095826944.0,
+            "65": 1095834112.0,
+            "66": 1095843840.0,
+            "67": 1095830528.0,
+            "68": 1095840256.0,
+            "69": 1095829504.0,
+            "70": 1095872000.0,
+            "71": 1095846912.0,
+            "72": 1095772160.0,
+            "73": 1095873024.0,
+            "74": 1095885824.0,
+            "75": 1095849984.0,
+            "76": 1095870976.0,
+            "77": 1095824896.0,
+            "78": 1095884288.0,
+            "79": 1095855616.0,
+            "80": 1095808000.0,
+            "81": 1095844864.0,
+            "82": 1095790080.0,
+            "83": 1095890944.0,
+            "84": 1095872000.0,
+            "85": 1095839744.0,
+            "86": 1095839232.0,
+            "87": 1095861760.0,
+            "88": 1095849472.0,
+            "89": 1095837696.0,
+            "90": 1095828480.0,
+            "91": 1095883776.0,
+            "92": 1095866880.0,
+            "93": 1095845376.0,
+            "94": 1095854592.0,
+            "95": 1095854080.0,
+            "96": 1095854592.0,
+            "97": 1095863296.0,
+            "98": 1095840256.0,
+            "99": 1095857152.0,
+            "100": 1095894528.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 3605154816.0,
+            "52": 3605154816.0,
+            "53": 3639019008.0,
+            "54": 3639019008.0,
+            "55": 3639019008.0,
+            "56": 3639019008.0,
+            "57": 3639019008.0,
+            "58": 3639019008.0,
+            "59": 3639019008.0,
+            "60": 3639019008.0,
+            "61": 3639019008.0,
+            "62": 3639019008.0,
+            "63": 3639019008.0,
+            "64": 3639019008.0,
+            "65": 3639019008.0,
+            "66": 3639019008.0,
+            "67": 3639019008.0,
+            "68": 3639019008.0,
+            "69": 3639019008.0,
+            "70": 3639019008.0,
+            "71": 3639019008.0,
+            "72": 3639019008.0,
+            "73": 3639019008.0,
+            "74": 3639019008.0,
+            "75": 3639019008.0,
+            "76": 3639019008.0,
+            "77": 3639019008.0,
+            "78": 3639019008.0,
+            "79": 3639019008.0,
+            "80": 3639019008.0,
+            "81": 3639019008.0,
+            "82": 3639019008.0,
+            "83": 3639019008.0,
+            "84": 3639019008.0,
+            "85": 3639019008.0,
+            "86": 3639019008.0,
+            "87": 3639019008.0,
+            "88": 3639019008.0,
+            "89": 3639019008.0,
+            "90": 3639019008.0,
+            "91": 3639019008.0,
+            "92": 3639019008.0,
+            "93": 3639019008.0,
+            "94": 3639019008.0,
+            "95": 3639019008.0,
+            "96": 3639019008.0,
+            "97": 3639019008.0,
+            "98": 3639019008.0,
+            "99": 3639019008.0,
+            "100": 3639019008.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 2.63883,
+            "53": 0.31395,
+            "54": 0.31367,
+            "55": 0.29374,
+            "56": 0.30814,
+            "57": 0.28518,
+            "58": 0.2893,
+            "59": 0.29547,
+            "60": 0.29213,
+            "61": 0.27939,
+            "62": 0.28509,
+            "63": 0.28109,
+            "64": 0.28285,
+            "65": 0.27653,
+            "66": 0.27923,
+            "67": 0.27493,
+            "68": 0.28188,
+            "69": 0.2851,
+            "70": 0.28475,
+            "71": 0.28187,
+            "72": 0.28341,
+            "73": 0.26848,
+            "74": 0.27702,
+            "75": 0.29514,
+            "76": 0.26459,
+            "77": 0.27617,
+            "78": 0.27231,
+            "79": 0.28621,
+            "80": 0.27218,
+            "81": 0.27522,
+            "82": 0.27114,
+            "83": 0.26001,
+            "84": 0.26222,
+            "85": 0.27374,
+            "86": 0.27145,
+            "87": 0.28673,
+            "88": 0.27394,
+            "89": 0.26336,
+            "90": 0.28319,
+            "91": 0.26195,
+            "92": 0.26716,
+            "93": 0.26523,
+            "94": 0.26477,
+            "95": 0.26706,
+            "96": 0.2815,
+            "97": 0.27054,
+            "98": 0.28122,
+            "99": 0.27335,
+            "100": 0.27113
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml
new file mode 100644
index 00000000000..5c395caed56
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml
@@ -0,0 +1,68 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --disable-bias-linear: true
+  --train-iters: 100
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --num-experts: 8
+  --moe-token-dispatcher-type: allgather
+  --moe-router-load-balancing-type: aux_loss
+  --moe-router-topk: 2
+  --moe-router-dtype: fp32
+  --moe-ffn-hidden-size: 1024
+  --moe-grouped-gemm: true
+  --ckpt-fully-parallel-load: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --ckpt-assume-constant-structure: true
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --no-bias-gelu-fusion: true
+  --log-memory-to-tensorboard: true
+  --optimizer: muon
+  --muon-momentum: 0.9
+  --muon-extra-scale-factor: 0.2
+  --muon-scale-mode: spectral
+  --check-weight-hash-across-dp-replicas-interval: 1
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..64e256f8b57
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,344 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.02499,
+            "2": 11.05412,
+            "3": 10.03918,
+            "4": 9.80215,
+            "5": 13.60005,
+            "6": 8.54454,
+            "7": 9.77444,
+            "8": 8.35233,
+            "9": 7.88788,
+            "10": 7.14039,
+            "11": 9.06955,
+            "12": 9.20099,
+            "13": 8.15607,
+            "14": 8.36221,
+            "15": 8.43013,
+            "16": 8.48001,
+            "17": 8.52462,
+            "18": 7.90076,
+            "19": 8.35376,
+            "20": 7.90482,
+            "21": 8.17608,
+            "22": 7.55176,
+            "23": 8.27889,
+            "24": 7.65732,
+            "25": 8.43063,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 47165208.0,
+            "2": 46898016.0,
+            "3": 134483200.0,
+            "4": 98911144.0,
+            "5": 405652352.0,
+            "6": 510869088.0,
+            "7": 779007104.0,
+            "8": 538338816.0,
+            "9": 365348032.0,
+            "10": 620817088.0,
+            "11": 502895552.0,
+            "12": 572091776.0,
+            "13": 714972800.0,
+            "14": 748130816.0,
+            "15": 709938432.0,
+            "16": 691583488.0,
+            "17": 963168256.0,
+            "18": 953453952.0,
+            "19": 713408000.0,
+            "20": 919014656.0,
+            "21": 899637952.0,
+            "22": 688944512.0,
+            "23": 856034560.0,
+            "24": 858768064.0,
+            "25": 818025472.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5283603968.0,
+            "2": 5283808768.0,
+            "3": 5283504640.0,
+            "4": 5283707392.0,
+            "5": 5283910144.0,
+            "6": 5284112896.0,
+            "7": 5284315648.0,
+            "8": 5284518400.0,
+            "9": 5284721152.0,
+            "10": 5284923904.0,
+            "11": 5285126656.0,
+            "12": 5285329408.0,
+            "13": 5285532160.0,
+            "14": 5285734912.0,
+            "15": 5285937664.0,
+            "16": 5286140416.0,
+            "17": 5286343168.0,
+            "18": 5286545920.0,
+            "19": 5286748672.0,
+            "20": 5286951424.0,
+            "21": 5287154176.0,
+            "22": 5287356928.0,
+            "23": 5287559680.0,
+            "24": 5287762432.0,
+            "25": 5287965184.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5283606528.0,
+            "2": 8265524736.0,
+            "3": 8265524736.0,
+            "4": 8265524736.0,
+            "5": 8265524736.0,
+            "6": 8265524736.0,
+            "7": 8265524736.0,
+            "8": 8265524736.0,
+            "9": 8265524736.0,
+            "10": 8276976128.0,
+            "11": 8276976128.0,
+            "12": 8276976128.0,
+            "13": 8276976128.0,
+            "14": 8276976128.0,
+            "15": 8276976128.0,
+            "16": 8276976128.0,
+            "17": 8276976128.0,
+            "18": 8276976128.0,
+            "19": 8276976128.0,
+            "20": 8276976128.0,
+            "21": 8276976128.0,
+            "22": 8285769216.0,
+            "23": 8285769216.0,
+            "24": 8285769216.0,
+            "25": 8285769216.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "mtp_1 loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.03308,
+            "2": 11.06518,
+            "3": 10.81759,
+            "4": 10.53818,
+            "5": 10.71366,
+            "6": 8.61304,
+            "7": 10.1377,
+            "8": 8.29237,
+            "9": 7.71086,
+            "10": 6.91516,
+            "11": 9.19783,
+            "12": 9.26769,
+            "13": 8.06484,
+            "14": 8.2784,
+            "15": 8.36908,
+            "16": 8.41495,
+            "17": 8.38655,
+            "18": 7.69044,
+            "19": 8.28621,
+            "20": 7.79896,
+            "21": 8.09324,
+            "22": 7.49223,
+            "23": 8.14261,
+            "24": 7.5863,
+            "25": 8.37107,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 25.92832,
+            "3": 2.8263,
+            "4": 1.73655,
+            "5": 2.05123,
+            "6": 2.39425,
+            "7": 2.15639,
+            "8": 0.92138,
+            "9": 0.94247,
+            "10": 1.64642,
+            "11": 0.96645,
+            "12": 0.91866,
+            "13": 0.94198,
+            "14": 0.9255,
+            "15": 0.95577,
+            "16": 0.92425,
+            "17": 0.94137,
+            "18": 0.93111,
+            "19": 0.89952,
+            "20": 0.90021,
+            "21": 0.91289,
+            "22": 0.93437,
+            "23": 0.96363,
+            "24": 1.00045,
+            "25": 0.96782,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..d5ced620365
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json
@@ -0,0 +1,344 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.06693,
+            "2": 11.0602,
+            "3": 10.16141,
+            "4": 10.11145,
+            "5": 10.47957,
+            "6": 10.21751,
+            "7": 10.56153,
+            "8": 12.79501,
+            "9": 12.96949,
+            "10": 13.32223,
+            "11": 11.63359,
+            "12": 11.4938,
+            "13": 12.46292,
+            "14": 12.13415,
+            "15": 11.90295,
+            "16": 12.01307,
+            "17": 12.17443,
+            "18": 12.64978,
+            "19": 11.81295,
+            "20": 12.18673,
+            "21": 11.24306,
+            "22": 11.54156,
+            "23": 10.98412,
+            "24": 11.01925,
+            "25": 10.73001,
+            "26": 10.72806,
+            "27": 10.79039,
+            "28": 10.714,
+            "29": 10.73974,
+            "30": 10.75246,
+            "31": 10.68874,
+            "32": 10.65791,
+            "33": 10.81137,
+            "34": 10.79058,
+            "35": 10.75368,
+            "36": 10.64393,
+            "37": 10.87492,
+            "38": 10.90591,
+            "39": 10.78825,
+            "40": 10.75548,
+            "41": 10.8955,
+            "42": 10.70411,
+            "43": 10.66907,
+            "44": 10.72512,
+            "45": 10.54927,
+            "46": 10.46973,
+            "47": 10.66311,
+            "48": 10.62453,
+            "49": 10.61656,
+            "50": 10.21176
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 47165216.0,
+            "2": 46897552.0,
+            "3": 52682736.0,
+            "4": 70585808.0,
+            "5": 1850183680.0,
+            "6": 171098656.0,
+            "7": 436105120.0,
+            "8": 1850183680.0,
+            "9": 1850183680.0,
+            "10": 1850183680.0,
+            "11": 1850183680.0,
+            "12": 1850183680.0,
+            "13": 1850183680.0,
+            "14": 1850183680.0,
+            "15": 555857088.0,
+            "16": 1850183680.0,
+            "17": 1850183680.0,
+            "18": 1850183680.0,
+            "19": 886404992.0,
+            "20": 654826944.0,
+            "21": 603993664.0,
+            "22": 726709632.0,
+            "23": 566656896.0,
+            "24": 1850183680.0,
+            "25": 799245696.0,
+            "26": 978252032.0,
+            "27": 1850183680.0,
+            "28": 906183104.0,
+            "29": 1850183680.0,
+            "30": 1850183680.0,
+            "31": 810874112.0,
+            "32": 1850183680.0,
+            "33": 1850183680.0,
+            "34": 553779584.0,
+            "35": 565382400.0,
+            "36": 585787712.0,
+            "37": 627284160.0,
+            "38": 331368192.0,
+            "39": 638619264.0,
+            "40": 1850183680.0,
+            "41": 1850183680.0,
+            "42": 1850183680.0,
+            "43": 1850183680.0,
+            "44": 1850183680.0,
+            "45": 1850183680.0,
+            "46": 1850183680.0,
+            "47": 434842944.0,
+            "48": 1850183680.0,
+            "49": 575219328.0,
+            "50": 1850183680.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5283616256.0,
+            "2": 5288015360.0,
+            "3": 5288218112.0,
+            "4": 5288420864.0,
+            "5": 5288623616.0,
+            "6": 5287812608.0,
+            "7": 5288015360.0,
+            "8": 5288218112.0,
+            "9": 5287711232.0,
+            "10": 5287913984.0,
+            "11": 5288116736.0,
+            "12": 5288319488.0,
+            "13": 5288522240.0,
+            "14": 5288724992.0,
+            "15": 5288927744.0,
+            "16": 5289130496.0,
+            "17": 5289333248.0,
+            "18": 5289536000.0,
+            "19": 5289738752.0,
+            "20": 5289941504.0,
+            "21": 5290144256.0,
+            "22": 5290347008.0,
+            "23": 5290549760.0,
+            "24": 5290752512.0,
+            "25": 5290955264.0,
+            "26": 5291158016.0,
+            "27": 5291360768.0,
+            "28": 5291563520.0,
+            "29": 5291766272.0,
+            "30": 5291969024.0,
+            "31": 5292171776.0,
+            "32": 5292374528.0,
+            "33": 5292577280.0,
+            "34": 5292780032.0,
+            "35": 5292982784.0,
+            "36": 5293185536.0,
+            "37": 5293388288.0,
+            "38": 5293591040.0,
+            "39": 5293793792.0,
+            "40": 5293996544.0,
+            "41": 5294199296.0,
+            "42": 5294402048.0,
+            "43": 5294604800.0,
+            "44": 5294807552.0,
+            "45": 5295010304.0,
+            "46": 5295213056.0,
+            "47": 5295415808.0,
+            "48": 5295618560.0,
+            "49": 5295821312.0,
+            "50": 5296024064.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 5283618816.0,
+            "2": 8185453056.0,
+            "3": 8185453056.0,
+            "4": 8185453056.0,
+            "5": 8195318272.0,
+            "6": 8195318272.0,
+            "7": 8195318272.0,
+            "8": 8195318272.0,
+            "9": 8195318272.0,
+            "10": 8195318272.0,
+            "11": 8195318272.0,
+            "12": 8195318272.0,
+            "13": 8195318272.0,
+            "14": 8195318272.0,
+            "15": 8195318272.0,
+            "16": 8199233024.0,
+            "17": 8199233024.0,
+            "18": 8199233024.0,
+            "19": 8199233024.0,
+            "20": 8199233024.0,
+            "21": 8238446080.0,
+            "22": 8238446080.0,
+            "23": 8238446080.0,
+            "24": 8238446080.0,
+            "25": 8247293440.0,
+            "26": 8247293440.0,
+            "27": 8247293440.0,
+            "28": 8250185216.0,
+            "29": 8255527424.0,
+            "30": 8255527424.0,
+            "31": 8255527424.0,
+            "32": 8255527424.0,
+            "33": 8255527424.0,
+            "34": 8255527424.0,
+            "35": 8255527424.0,
+            "36": 8255527424.0,
+            "37": 8255527424.0,
+            "38": 8255527424.0,
+            "39": 8255527424.0,
+            "40": 8255527424.0,
+            "41": 8255527424.0,
+            "42": 8255527424.0,
+            "43": 8255527424.0,
+            "44": 8255527424.0,
+            "45": 8255527424.0,
+            "46": 8255527424.0,
+            "47": 8255527424.0,
+            "48": 8255527424.0,
+            "49": 8255527424.0,
+            "50": 8255527424.0
+        }
+    },
+    "mtp_1 loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.07401,
+            "2": 11.0927,
+            "3": 10.83159,
+            "4": 10.61397,
+            "5": 10.85768,
+            "6": 9.79263,
+            "7": 10.90607,
+            "8": 10.19798,
+            "9": 9.82717,
+            "10": 9.23805,
+            "11": 11.0712,
+            "12": 11.11709,
+            "13": 10.03407,
+            "14": 10.27606,
+            "15": 10.73067,
+            "16": 10.91485,
+            "17": 10.76886,
+            "18": 10.49659,
+            "19": 10.96955,
+            "20": 10.45905,
+            "21": 10.91629,
+            "22": 10.05081,
+            "23": 10.44411,
+            "24": 9.74826,
+            "25": 10.81497,
+            "26": 10.38519,
+            "27": 10.31999,
+            "28": 10.27887,
+            "29": 10.40945,
+            "30": 10.20684,
+            "31": 10.54594,
+            "32": 8.85942,
+            "33": 9.75619,
+            "34": 10.56214,
+            "35": 10.59167,
+            "36": 9.30537,
+            "37": 10.59407,
+            "38": 10.2994,
+            "39": 10.69954,
+            "40": 10.37003,
+            "41": 10.248,
+            "42": 8.56376,
+            "43": 10.49224,
+            "44": 10.57211,
+            "45": 9.36238,
+            "46": 10.2179,
+            "47": 10.63449,
+            "48": 10.56697,
+            "49": 10.44093,
+            "50": 9.49252
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 71.30157,
+            "2": 2.34464,
+            "3": 2.38747,
+            "4": 2.10322,
+            "5": 2.12945,
+            "6": 2.0424,
+            "7": 2.12036,
+            "8": 2.0147,
+            "9": 2.04925,
+            "10": 2.02797,
+            "11": 1.95087,
+            "12": 2.04985,
+            "13": 1.94106,
+            "14": 1.90425,
+            "15": 1.89051,
+            "16": 1.89398,
+            "17": 1.94082,
+            "18": 1.93176,
+            "19": 1.94027,
+            "20": 1.90271,
+            "21": 1.91097,
+            "22": 1.90382,
+            "23": 1.93889,
+            "24": 1.90551,
+            "25": 1.90947,
+            "26": 1.92126,
+            "27": 1.89917,
+            "28": 1.89866,
+            "29": 1.93981,
+            "30": 1.90782,
+            "31": 1.91244,
+            "32": 1.93864,
+            "33": 1.93947,
+            "34": 1.96882,
+            "35": 1.89751,
+            "36": 1.94038,
+            "37": 1.90603,
+            "38": 1.94988,
+            "39": 1.89874,
+            "40": 1.90233,
+            "41": 1.92861,
+            "42": 1.93931,
+            "43": 1.91212,
+            "44": 1.92615,
+            "45": 1.89555,
+            "46": 1.94522,
+            "47": 1.9103,
+            "48": 1.94689,
+            "49": 1.9355,
+            "50": 1.89832
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml
new file mode 100644
index 00000000000..a37dd0dc658
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml
@@ -0,0 +1,140 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 32
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+  NVTE_CPU_OFFLOAD_V1: 1
+  NVTE_FUSED_ATTN: 0
+  NCCL_ALGO: ^NVLS
+  CUBLAS_WORKSPACE_CONFIG: ':4096:8'
+MODEL_ARGS:
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 2
+  --expert-model-parallel-size: 4
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  # NOTE: uncomment if TE >= 2.9.0
+  # --overlap-grad-reduce: true
+  # --overlap-param-gather: true
+  # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN
+  --attention-backend: unfused # TODO: switch back to fused attention after fix
+  # Training args
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --disable-bias-linear: true
+  --micro-batch-size: 1
+  --global-batch-size: 8
+  --train-iters: 50
+  --exit-duration-in-mins: 230
+  --no-check-for-nan-in-loss-and-grad: true
+  --no-rope-fusion: true
+  --manual-gc: true
+  --manual-gc-interval: 100
+  --recompute-granularity: selective
+  --recompute-modules: "[layernorm mla_up_proj mlp moe_act]"
+  --fine-grained-activation-offloading: true
+  --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm qkv_linear core_attn attn_proj]"
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  # Add network size args
+  --num-layers: 15
+  --moe-layer-freq: ([0]*3+[1]*12)
+  --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6mL # Et*3|(tt|)*6mL
+  --hidden-size: 1024
+  --ffn-hidden-size: 4096
+  --num-attention-heads: 32
+  --kv-channels: 128
+  --max-position-embeddings: 4096
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --make-vocab-size-divisible-by: 3232
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --swiglu: true
+  --untie-embeddings-and-output-weights: true
+  --multi-latent-attention: true
+  # Comment out the following MTP args to disable MTP
+  --mtp-num-layers: 1
+  --mtp-loss-scaling-factor: 0.1
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  --qk-layernorm: true
+  # Add learning rate args
+  --lr-warmup-fraction: .01
+  --lr: 0.00015
+  --min-lr: 1.0e-5
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+  # Add MoE args
+  --num-experts: 32
+  --moe-ffn-hidden-size: 1024
+  --moe-shared-expert-intermediate-size: 1024
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-router-topk: 4
+  --moe-token-dispatcher-type: alltoall
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 2
+  --moe-router-num-groups: 4
+  --moe-router-topk-scaling-factor: 2.0
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  --moe-permute-fusion: true
+  # Add MLA args
+  --q-lora-rank: 1536
+  --kv-lora-rank: 512
+  --qk-head-dim: 128
+  --qk-pos-emb-head-dim: 64
+  --v-head-dim: 128
+  --rotary-scaling-factor: 40
+  --mscale: 1.0
+  --mscale-all-dim: 1.0
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+  # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --save-interval: 25
+  # Add initialization args
+  --init-method-std: 0.02
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  # Add mixed precision args
+  --bf16: true
+  --exit-interval: 50
+  --overlap-moe-expert-parallel-comm: true
+TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular
+METRICS:
+  - "iteration-time"
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "mtp_1 loss"
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..dc46db36c72
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.0146,
+            "2": 11.04991,
+            "3": 10.14357,
+            "4": 9.67761,
+            "5": 9.94473,
+            "6": 9.95632,
+            "7": 9.92367,
+            "8": 8.83165,
+            "9": 8.42103,
+            "10": 7.83364,
+            "11": 10.81778,
+            "12": 10.35014,
+            "13": 8.66833,
+            "14": 9.13058,
+            "15": 9.2484,
+            "16": 9.32149,
+            "17": 9.20232,
+            "18": 8.73719,
+            "19": 9.32726,
+            "20": 8.88552,
+            "21": 9.10111,
+            "22": 8.53259,
+            "23": 8.96918,
+            "24": 8.67428,
+            "25": 9.1617,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 47167824.0,
+            "2": 46900664.0,
+            "3": 96750344.0,
+            "4": 246765024.0,
+            "5": 817718912.0,
+            "6": 592653248.0,
+            "7": 1018015232.0,
+            "8": 657925824.0,
+            "9": 711406848.0,
+            "10": 576785472.0,
+            "11": 704207488.0,
+            "12": 619264576.0,
+            "13": 718118144.0,
+            "14": 656897024.0,
+            "15": 621830912.0,
+            "16": 729345984.0,
+            "17": 831063744.0,
+            "18": 1025804096.0,
+            "19": 832938368.0,
+            "20": 1003945088.0,
+            "21": 830430208.0,
+            "22": 846188736.0,
+            "23": 1035339456.0,
+            "24": 1003472384.0,
+            "25": 1019352320.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4313446912.0,
+            "2": 4313448448.0,
+            "3": 4313448448.0,
+            "4": 4313448448.0,
+            "5": 4313448448.0,
+            "6": 4313448448.0,
+            "7": 4313448448.0,
+            "8": 4313448448.0,
+            "9": 4313448448.0,
+            "10": 4313448448.0,
+            "11": 4313448448.0,
+            "12": 4313448448.0,
+            "13": 4313448448.0,
+            "14": 4313448448.0,
+            "15": 4313448448.0,
+            "16": 4313448448.0,
+            "17": 4313448448.0,
+            "18": 4313448448.0,
+            "19": 4313448448.0,
+            "20": 4313448448.0,
+            "21": 4313448448.0,
+            "22": 4313448448.0,
+            "23": 4313448448.0,
+            "24": 4313448448.0,
+            "25": 4313448448.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4313449472.0,
+            "2": 7058323968.0,
+            "3": 7093507072.0,
+            "4": 7103545856.0,
+            "5": 7103545856.0,
+            "6": 7103545856.0,
+            "7": 7103545856.0,
+            "8": 7103545856.0,
+            "9": 7103545856.0,
+            "10": 7103545856.0,
+            "11": 7105847296.0,
+            "12": 7107386368.0,
+            "13": 7107386368.0,
+            "14": 7107386368.0,
+            "15": 7107386368.0,
+            "16": 7107386368.0,
+            "17": 7107386368.0,
+            "18": 7107386368.0,
+            "19": 7107386368.0,
+            "20": 7107386368.0,
+            "21": 7107386368.0,
+            "22": 7108604416.0,
+            "23": 7108922368.0,
+            "24": 7109204992.0,
+            "25": 7109204992.0,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 26.62047,
+            "3": 1.74298,
+            "4": 2.19744,
+            "5": 2.54081,
+            "6": 0.94207,
+            "7": 2.41564,
+            "8": 0.89613,
+            "9": 0.88788,
+            "10": 0.90363,
+            "11": 2.30063,
+            "12": 0.89466,
+            "13": 0.87273,
+            "14": 2.31557,
+            "15": 0.91663,
+            "16": 0.87731,
+            "17": 0.89596,
+            "18": 0.87486,
+            "19": 0.87795,
+            "20": 0.87855,
+            "21": 0.88064,
+            "22": 0.88881,
+            "23": 0.88358,
+            "24": 0.88347,
+            "25": 0.88411,
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..57848f8130e
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 11.01693,
+            "2": 11.06263,
+            "3": 10.08845,
+            "4": 9.73223,
+            "5": 10.41008,
+            "6": 10.46377,
+            "7": 11.62265,
+            "8": 12.30479,
+            "9": 12.258,
+            "10": 12.11321,
+            "11": 11.67717,
+            "12": 11.60724,
+            "13": 11.46408,
+            "14": 11.41026,
+            "15": 11.44828,
+            "16": 11.31999,
+            "17": 11.28503,
+            "18": 11.35547,
+            "19": 11.35205,
+            "20": 11.50757,
+            "21": 11.41181,
+            "22": 11.56383,
+            "23": 11.41906,
+            "24": 11.39788,
+            "25": 11.26438,
+            "26": 11.36733,
+            "27": 11.37099,
+            "28": 11.40035,
+            "29": 11.42808,
+            "30": 11.53613,
+            "31": 11.3981,
+            "32": 12.00058,
+            "33": 11.68213,
+            "34": 11.38046,
+            "35": 11.36734,
+            "36": 11.77291,
+            "37": 11.34584,
+            "38": 11.4654,
+            "39": 11.33231,
+            "40": 11.43538,
+            "41": 11.47405,
+            "42": 12.09241,
+            "43": 11.39968,
+            "44": 11.38762,
+            "45": 11.79356,
+            "46": 11.4469,
+            "47": 11.3507,
+            "48": 11.30787,
+            "49": 11.39251,
+            "50": 11.7264
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 47167880.0,
+            "2": 46899772.0,
+            "3": 1722086400.0,
+            "4": 1722086400.0,
+            "5": 188597600.0,
+            "6": 120779000.0,
+            "7": 527310080.0,
+            "8": 1722086400.0,
+            "9": 1722086400.0,
+            "10": 321966144.0,
+            "11": 493484608.0,
+            "12": 1722086400.0,
+            "13": 529395136.0,
+            "14": 1722086400.0,
+            "15": 1722086400.0,
+            "16": 723018944.0,
+            "17": 233377744.0,
+            "18": 642084544.0,
+            "19": 1722086400.0,
+            "20": 1722086400.0,
+            "21": 578776704.0,
+            "22": 396416192.0,
+            "23": 506872960.0,
+            "24": 670044160.0,
+            "25": 884090624.0,
+            "26": 912192512.0,
+            "27": 764026112.0,
+            "28": 972234112.0,
+            "29": 915345600.0,
+            "30": 937728768.0,
+            "31": 1722086400.0,
+            "32": 976440512.0,
+            "33": 984833664.0,
+            "34": 802321088.0,
+            "35": 1722086400.0,
+            "36": 931810816.0,
+            "37": 897772032.0,
+            "38": 982505792.0,
+            "39": 704699008.0,
+            "40": 688513344.0,
+            "41": 946725760.0,
+            "42": 1722086400.0,
+            "43": 1722086400.0,
+            "44": 875336384.0,
+            "45": 1722086400.0,
+            "46": 909066432.0,
+            "47": 900409280.0,
+            "48": 890279744.0,
+            "49": 597272192.0,
+            "50": 921883712.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4313446912.0,
+            "2": 4313448448.0,
+            "3": 4313448448.0,
+            "4": 4313448448.0,
+            "5": 4313448448.0,
+            "6": 4313448448.0,
+            "7": 4313448448.0,
+            "8": 4313448448.0,
+            "9": 4313448448.0,
+            "10": 4313448448.0,
+            "11": 4313448448.0,
+            "12": 4313448448.0,
+            "13": 4313448448.0,
+            "14": 4313448448.0,
+            "15": 4313448448.0,
+            "16": 4313448448.0,
+            "17": 4313448448.0,
+            "18": 4313448448.0,
+            "19": 4313448448.0,
+            "20": 4313448448.0,
+            "21": 4313448448.0,
+            "22": 4313448448.0,
+            "23": 4313448448.0,
+            "24": 4313448448.0,
+            "25": 4313448448.0,
+            "26": 4313448448.0,
+            "27": 4313448448.0,
+            "28": 4313448448.0,
+            "29": 4313448448.0,
+            "30": 4313448448.0,
+            "31": 4313448448.0,
+            "32": 4313448448.0,
+            "33": 4313448448.0,
+            "34": 4313448448.0,
+            "35": 4313448448.0,
+            "36": 4313448448.0,
+            "37": 4313448448.0,
+            "38": 4313448448.0,
+            "39": 4313448448.0,
+            "40": 4313448448.0,
+            "41": 4313448448.0,
+            "42": 4313448448.0,
+            "43": 4313448448.0,
+            "44": 4313448448.0,
+            "45": 4313448448.0,
+            "46": 4313448448.0,
+            "47": 4313448448.0,
+            "48": 4313448448.0,
+            "49": 4313448448.0,
+            "50": 4313448448.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 4313449472.0,
+            "2": 7108272640.0,
+            "3": 7108272640.0,
+            "4": 7108272640.0,
+            "5": 7119571456.0,
+            "6": 7119571456.0,
+            "7": 7129409024.0,
+            "8": 7158368768.0,
+            "9": 7158368768.0,
+            "10": 7158838784.0,
+            "11": 7202046464.0,
+            "12": 7202046464.0,
+            "13": 7202046464.0,
+            "14": 7202046464.0,
+            "15": 7202046464.0,
+            "16": 7202046464.0,
+            "17": 7202046464.0,
+            "18": 7202046464.0,
+            "19": 7202046464.0,
+            "20": 7202046464.0,
+            "21": 7202046464.0,
+            "22": 7202046464.0,
+            "23": 7202046464.0,
+            "24": 7202046464.0,
+            "25": 7202046464.0,
+            "26": 7202046464.0,
+            "27": 7202046464.0,
+            "28": 7202046464.0,
+            "29": 7202046464.0,
+            "30": 7202046464.0,
+            "31": 7202046464.0,
+            "32": 7202046464.0,
+            "33": 7202046464.0,
+            "34": 7202046464.0,
+            "35": 7202046464.0,
+            "36": 7202046464.0,
+            "37": 7202046464.0,
+            "38": 7202046464.0,
+            "39": 7202046464.0,
+            "40": 7202046464.0,
+            "41": 7202046464.0,
+            "42": 7202046464.0,
+            "43": 7202046464.0,
+            "44": 7202046464.0,
+            "45": 7202046464.0,
+            "46": 7202046464.0,
+            "47": 7202046464.0,
+            "48": 7202046464.0,
+            "49": 7202046464.0,
+            "50": 7202046464.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 90.31742,
+            "2": 2.522,
+            "3": 2.42029,
+            "4": 2.06158,
+            "5": 2.28893,
+            "6": 3.01447,
+            "7": 3.96389,
+            "8": 3.20878,
+            "9": 2.43815,
+            "10": 1.94158,
+            "11": 1.95031,
+            "12": 1.98877,
+            "13": 1.92978,
+            "14": 1.93494,
+            "15": 1.92559,
+            "16": 1.95925,
+            "17": 2.59672,
+            "18": 1.94175,
+            "19": 1.92388,
+            "20": 1.92283,
+            "21": 1.92623,
+            "22": 1.92561,
+            "23": 1.92611,
+            "24": 1.94339,
+            "25": 2.02939,
+            "26": 1.93181,
+            "27": 1.92433,
+            "28": 1.96842,
+            "29": 1.92479,
+            "30": 1.93949,
+            "31": 1.96151,
+            "32": 1.93071,
+            "33": 1.92266,
+            "34": 1.92587,
+            "35": 1.92251,
+            "36": 1.92324,
+            "37": 1.93141,
+            "38": 1.92431,
+            "39": 1.93685,
+            "40": 1.92592,
+            "41": 1.92962,
+            "42": 1.92986,
+            "43": 1.92956,
+            "44": 1.93019,
+            "45": 1.93251,
+            "46": 1.92915,
+            "47": 1.93714,
+            "48": 1.93564,
+            "49": 1.94035,
+            "50": 1.93018
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml
new file mode 100644
index 00000000000..da78378ddae
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml
@@ -0,0 +1,134 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+  NVTE_CPU_OFFLOAD_V1: 1
+  NVTE_FUSED_ATTN: 0
+  NCCL_ALGO: ^NVLS
+  CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+MODEL_ARGS:
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 2
+  --expert-model-parallel-size: 4
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  # NOTE: uncomment if TE >= 2.9.0
+  # --overlap-grad-reduce: true
+  # --overlap-param-gather: true
+  # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN
+  --attention-backend: unfused # TODO: switch back to fused attention after fix
+  # Training args
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --disable-bias-linear: true
+  --micro-batch-size: 1
+  --global-batch-size: 8
+  --train-iters: 50
+  --exit-duration-in-mins: 230
+  --no-check-for-nan-in-loss-and-grad: true
+  --no-rope-fusion: true
+  --manual-gc: true
+  --manual-gc-interval: 100
+  --recompute-granularity: selective
+  --recompute-modules: "[layernorm mla_up_proj mlp moe_act]"
+  --fine-grained-activation-offloading: true
+  --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm qkv_linear core_attn attn_proj]"
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  # Add network size args
+  --num-layers: 15
+  --moe-layer-freq: ([0]*3+[1]*12)
+  --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6L # Et*3|(tt|)*6L
+  --hidden-size: 1024
+  --ffn-hidden-size: 4096
+  --num-attention-heads: 32
+  --kv-channels: 128
+  --max-position-embeddings: 4096
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --make-vocab-size-divisible-by: 3232
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --swiglu: true
+  --untie-embeddings-and-output-weights: true
+  --multi-latent-attention: true
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  --qk-layernorm: true
+  # Add learning rate args
+  --lr-warmup-fraction: .01
+  --lr: 0.00015
+  --min-lr: 1.0e-5
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+  # Add MoE args
+  --num-experts: 32
+  --moe-ffn-hidden-size: 1024
+  --moe-shared-expert-intermediate-size: 1024
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-router-topk: 4
+  --moe-token-dispatcher-type: alltoall
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 2
+  --moe-router-num-groups: 4
+  --moe-router-topk-scaling-factor: 2.0
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  --moe-permute-fusion: true
+  # Add MLA args
+  --q-lora-rank: 1536
+  --kv-lora-rank: 512
+  --qk-head-dim: 128
+  --qk-pos-emb-head-dim: 64
+  --v-head-dim: 128
+  --rotary-scaling-factor: 40
+  --mscale: 1.0
+  --mscale-all-dim: 1.0
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+  # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --save-interval: 25
+  # Add initialization args
+  --init-method-std: 0.02
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  # Add mixed precision args
+  --bf16: true
+  --exit-interval: 50
+TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular
+METRICS:
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..a35a7574e59
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.81233,
+            "2": 10.82416,
+            "3": 10.81841,
+            "4": 10.81357,
+            "5": 10.85116,
+            "6": 10.85502,
+            "7": 10.84363,
+            "8": 10.83621,
+            "9": 10.84178,
+            "10": 10.77391,
+            "11": 10.86217,
+            "12": 10.84672,
+            "13": 10.85692,
+            "14": 10.8614,
+            "15": 10.80709,
+            "16": 10.78544,
+            "17": 10.7701,
+            "18": 10.79072,
+            "19": 10.78529,
+            "20": 10.71496,
+            "21": 10.67362,
+            "22": 10.5386,
+            "23": 10.69608,
+            "24": 10.58118,
+            "25": 10.52212,
+            "26": 10.58665,
+            "27": 10.60344,
+            "28": 10.5676,
+            "29": 10.5868,
+            "30": 10.36177,
+            "31": 10.09661,
+            "32": 10.45911,
+            "33": 10.45926,
+            "34": 10.21524,
+            "35": 10.2617,
+            "36": 10.22327,
+            "37": 10.35631,
+            "38": 10.20637,
+            "39": 10.40825,
+            "40": 10.08881,
+            "41": 10.13871,
+            "42": 10.22236,
+            "43": 9.82978,
+            "44": 9.96931,
+            "45": 9.83925,
+            "46": 9.81008,
+            "47": 10.16408,
+            "48": 9.84608,
+            "49": 9.53674,
+            "50": 9.91754,
+            "51": 9.86341,
+            "52": 9.74862,
+            "53": 10.08034,
+            "54": 9.96286,
+            "55": 9.89221,
+            "56": 9.64295,
+            "57": 9.48196,
+            "58": 9.85327,
+            "59": 9.58985,
+            "60": 9.5157,
+            "61": 9.70142,
+            "62": 10.01153,
+            "63": 9.40557,
+            "64": 9.78559,
+            "65": 8.96047,
+            "66": 9.72678,
+            "67": 9.38244,
+            "68": 9.79903,
+            "69": 9.81114,
+            "70": 9.74788,
+            "71": 9.6452,
+            "72": 9.6027,
+            "73": 9.51692,
+            "74": 8.95583,
+            "75": 9.43449,
+            "76": 9.10005,
+            "77": 10.07816,
+            "78": 9.72912,
+            "79": 9.39357,
+            "80": 9.41584,
+            "81": 9.49174,
+            "82": 9.71087,
+            "83": 9.32591,
+            "84": 9.42272,
+            "85": 9.62054,
+            "86": 9.08096,
+            "87": 9.59797,
+            "88": 9.7551,
+            "89": 9.6096,
+            "90": 9.83264,
+            "91": 9.34163,
+            "92": 9.3578,
+            "93": 9.09025,
+            "94": 8.83205,
+            "95": 9.52868,
+            "96": 9.5278,
+            "97": 9.30277,
+            "98": 9.66393,
+            "99": 8.89773,
+            "100": 9.404
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 5532.0,
+            "2": 5934.0,
+            "3": 5812.0,
+            "4": 5817.0,
+            "5": 6435.0,
+            "6": 6641.0,
+            "7": 5880.0,
+            "8": 5900.0,
+            "9": 6317.0,
+            "10": 5314.0,
+            "11": 6659.0,
+            "12": 6393.0,
+            "13": 6585.0,
+            "14": 6649.0,
+            "15": 6237.0,
+            "16": 6606.0,
+            "17": 6232.0,
+            "18": 6059.0,
+            "19": 6380.0,
+            "20": 5723.0,
+            "21": 6197.0,
+            "22": 5714.0,
+            "23": 6527.0,
+            "24": 5948.0,
+            "25": 5822.0,
+            "26": 6271.0,
+            "27": 6493.0,
+            "28": 6789.0,
+            "29": 6971.0,
+            "30": 6252.0,
+            "31": 5836.0,
+            "32": 6830.0,
+            "33": 7155.0,
+            "34": 6428.0,
+            "35": 6909.0,
+            "36": 6559.0,
+            "37": 7582.0,
+            "38": 7325.0,
+            "39": 8189.0,
+            "40": 7156.0,
+            "41": 7113.0,
+            "42": 7783.0,
+            "43": 7236.0,
+            "44": 6958.0,
+            "45": 7093.0,
+            "46": 7385.0,
+            "47": 7634.0,
+            "48": 7916.0,
+            "49": 7565.0,
+            "50": 7795.0,
+            "51": 7967.0,
+            "52": 7869.0,
+            "53": 9001.0,
+            "54": 8408.0,
+            "55": 7734.0,
+            "56": 8108.0,
+            "57": 7339.0,
+            "58": 8677.0,
+            "59": 8299.0,
+            "60": 7790.0,
+            "61": 8347.0,
+            "62": 8345.0,
+            "63": 7835.0,
+            "64": 8861.0,
+            "65": 8293.0,
+            "66": 9180.0,
+            "67": 8276.0,
+            "68": 8251.0,
+            "69": 8666.0,
+            "70": 9836.0,
+            "71": 9020.0,
+            "72": 8503.0,
+            "73": 8996.0,
+            "74": 6967.0,
+            "75": 7749.0,
+            "76": 8534.0,
+            "77": 10688.0,
+            "78": 48163.0,
+            "79": 9603.0,
+            "80": 9991.0,
+            "81": 55995.0,
+            "82": 9533.0,
+            "83": 65535.0,
+            "84": 9876.0,
+            "85": 15848.0,
+            "86": 8732.0,
+            "87": 10574.0,
+            "88": 12165.0,
+            "89": 9808.0,
+            "90": 9644.0,
+            "91": 8584.0,
+            "92": 9300.0,
+            "93": 8081.0,
+            "94": 9101.0,
+            "95": 9919.0,
+            "96": 9755.0,
+            "97": 11113.0,
+            "98": 10522.0,
+            "99": 8739.0,
+            "100": 9616.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 628064256.0,
+            "2": 628065280.0,
+            "3": 628065280.0,
+            "4": 628065280.0,
+            "5": 628065280.0,
+            "6": 628065280.0,
+            "7": 628065280.0,
+            "8": 628065280.0,
+            "9": 628065280.0,
+            "10": 628065280.0,
+            "11": 628065280.0,
+            "12": 628065280.0,
+            "13": 628065280.0,
+            "14": 628065280.0,
+            "15": 628065280.0,
+            "16": 628065280.0,
+            "17": 628065280.0,
+            "18": 628065280.0,
+            "19": 628065280.0,
+            "20": 628065280.0,
+            "21": 628065280.0,
+            "22": 628065280.0,
+            "23": 628065280.0,
+            "24": 628065280.0,
+            "25": 628065280.0,
+            "26": 628065280.0,
+            "27": 628065280.0,
+            "28": 628065280.0,
+            "29": 628065280.0,
+            "30": 628065280.0,
+            "31": 628065280.0,
+            "32": 628065280.0,
+            "33": 628065280.0,
+            "34": 628065280.0,
+            "35": 628065280.0,
+            "36": 628065280.0,
+            "37": 628065280.0,
+            "38": 628065280.0,
+            "39": 628065280.0,
+            "40": 628065280.0,
+            "41": 628065280.0,
+            "42": 628065280.0,
+            "43": 628065280.0,
+            "44": 628065280.0,
+            "45": 628065280.0,
+            "46": 628065280.0,
+            "47": 628065280.0,
+            "48": 628065280.0,
+            "49": 628065280.0,
+            "50": 628065280.0,
+            "51": 628065280.0,
+            "52": 628065280.0,
+            "53": 628065280.0,
+            "54": 628065280.0,
+            "55": 628065280.0,
+            "56": 628065280.0,
+            "57": 628065280.0,
+            "58": 628065280.0,
+            "59": 628065280.0,
+            "60": 628065280.0,
+            "61": 628065280.0,
+            "62": 628065280.0,
+            "63": 628065280.0,
+            "64": 628065280.0,
+            "65": 628065280.0,
+            "66": 628065280.0,
+            "67": 628065280.0,
+            "68": 628065280.0,
+            "69": 628065280.0,
+            "70": 628065280.0,
+            "71": 628065280.0,
+            "72": 628065280.0,
+            "73": 628065280.0,
+            "74": 628065280.0,
+            "75": 628065280.0,
+            "76": 628065280.0,
+            "77": 628065280.0,
+            "78": 628065280.0,
+            "79": 628065280.0,
+            "80": 628065280.0,
+            "81": 628065280.0,
+            "82": 628065280.0,
+            "83": 628065280.0,
+            "84": 628065280.0,
+            "85": 628065280.0,
+            "86": 628065280.0,
+            "87": 628065280.0,
+            "88": 628065280.0,
+            "89": 628065280.0,
+            "90": 628065280.0,
+            "91": 628065280.0,
+            "92": 628065280.0,
+            "93": 628065280.0,
+            "94": 628065280.0,
+            "95": 628065280.0,
+            "96": 628065280.0,
+            "97": 628065280.0,
+            "98": 628065280.0,
+            "99": 628065280.0,
+            "100": 628065280.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 982504960.0,
+            "2": 1156255744.0,
+            "3": 1156255744.0,
+            "4": 1156255744.0,
+            "5": 1156255744.0,
+            "6": 1156255744.0,
+            "7": 1156255744.0,
+            "8": 1156255744.0,
+            "9": 1156255744.0,
+            "10": 1156255744.0,
+            "11": 1156255744.0,
+            "12": 1156255744.0,
+            "13": 1156255744.0,
+            "14": 1156255744.0,
+            "15": 1156255744.0,
+            "16": 1156255744.0,
+            "17": 1156255744.0,
+            "18": 1156255744.0,
+            "19": 1156255744.0,
+            "20": 1156255744.0,
+            "21": 1156255744.0,
+            "22": 1156255744.0,
+            "23": 1156255744.0,
+            "24": 1156255744.0,
+            "25": 1156255744.0,
+            "26": 1156255744.0,
+            "27": 1157233664.0,
+            "28": 1157233664.0,
+            "29": 1157233664.0,
+            "30": 1157233664.0,
+            "31": 1157233664.0,
+            "32": 1157233664.0,
+            "33": 1157233664.0,
+            "34": 1157233664.0,
+            "35": 1157233664.0,
+            "36": 1157233664.0,
+            "37": 1157233664.0,
+            "38": 1157233664.0,
+            "39": 1157233664.0,
+            "40": 1157233664.0,
+            "41": 1158865408.0,
+            "42": 1158865408.0,
+            "43": 1158865408.0,
+            "44": 1158865408.0,
+            "45": 1158865408.0,
+            "46": 1158865408.0,
+            "47": 1158865408.0,
+            "48": 1158865408.0,
+            "49": 1158865408.0,
+            "50": 1158865408.0,
+            "51": 1158865408.0,
+            "52": 1158865408.0,
+            "53": 1158865408.0,
+            "54": 1158865408.0,
+            "55": 1159034368.0,
+            "56": 1159063040.0,
+            "57": 1159542784.0,
+            "58": 1159542784.0,
+            "59": 1159542784.0,
+            "60": 1159542784.0,
+            "61": 1165075456.0,
+            "62": 1165075456.0,
+            "63": 1165075456.0,
+            "64": 1165075456.0,
+            "65": 1165075456.0,
+            "66": 1165075456.0,
+            "67": 1165075456.0,
+            "68": 1165075456.0,
+            "69": 1165075456.0,
+            "70": 1165075456.0,
+            "71": 1165075456.0,
+            "72": 1165075456.0,
+            "73": 1165075456.0,
+            "74": 1165075456.0,
+            "75": 1165075456.0,
+            "76": 1166216192.0,
+            "77": 1166216192.0,
+            "78": 1166216192.0,
+            "79": 1166216192.0,
+            "80": 1166216192.0,
+            "81": 1166216192.0,
+            "82": 1166216192.0,
+            "83": 1166639104.0,
+            "84": 1166639104.0,
+            "85": 1166639104.0,
+            "86": 1166639104.0,
+            "87": 1166639104.0,
+            "88": 1166639104.0,
+            "89": 1166639104.0,
+            "90": 1166639104.0,
+            "91": 1166639104.0,
+            "92": 1166639104.0,
+            "93": 1166639104.0,
+            "94": 1166639104.0,
+            "95": 1166639104.0,
+            "96": 1166639104.0,
+            "97": 1166639104.0,
+            "98": 1166639104.0,
+            "99": 1166639104.0,
+            "100": 1166639104.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 7.77547,
+            "3": 1.01252,
+            "4": 1.00639,
+            "5": 0.9897,
+            "6": 0.99553,
+            "7": 0.99796,
+            "8": 1.00873,
+            "9": 0.99009,
+            "10": 0.99264,
+            "11": 0.98765,
+            "12": 0.99024,
+            "13": 0.98319,
+            "14": 0.98552,
+            "15": 0.99368,
+            "16": 0.98342,
+            "17": 0.97729,
+            "18": 0.97272,
+            "19": 0.97308,
+            "20": 0.96906,
+            "21": 0.9751,
+            "22": 0.97375,
+            "23": 0.97447,
+            "24": 0.98494,
+            "25": 0.9779,
+            "26": 1.30939,
+            "27": 0.9766,
+            "28": 0.9856,
+            "29": 0.99223,
+            "30": 1.27178,
+            "31": 0.98025,
+            "32": 1.22425,
+            "33": 1.27653,
+            "34": 0.99358,
+            "35": 1.00171,
+            "36": 1.25408,
+            "37": 1.60005,
+            "38": 1.00572,
+            "39": 0.98676,
+            "40": 0.97218,
+            "41": 1.30266,
+            "42": 1.29066,
+            "43": 0.99057,
+            "44": 0.98517,
+            "45": 0.97968,
+            "46": 0.97289,
+            "47": 0.98145,
+            "48": 0.9804,
+            "49": 0.98022,
+            "50": 0.97431,
+            "51": 0.97593,
+            "52": 0.97255,
+            "53": 0.97424,
+            "54": 0.97043,
+            "55": 0.96887,
+            "56": 0.97492,
+            "57": 0.97623,
+            "58": 0.97423,
+            "59": 0.98879,
+            "60": 0.97992,
+            "61": 0.97895,
+            "62": 0.98829,
+            "63": 0.98719,
+            "64": 0.98651,
+            "65": 0.97852,
+            "66": 0.98045,
+            "67": 0.97825,
+            "68": 0.9795,
+            "69": 0.97812,
+            "70": 0.96297,
+            "71": 0.96718,
+            "72": 0.98343,
+            "73": 0.978,
+            "74": 0.99341,
+            "75": 0.97768,
+            "76": 0.97508,
+            "77": 0.97891,
+            "78": 0.9739,
+            "79": 0.96825,
+            "80": 0.96595,
+            "81": 0.95551,
+            "82": 0.97223,
+            "83": 0.9633,
+            "84": 0.96539,
+            "85": 0.97065,
+            "86": 0.97198,
+            "87": 0.97978,
+            "88": 0.98268,
+            "89": 0.99894,
+            "90": 1.00246,
+            "91": 0.98763,
+            "92": 0.98552,
+            "93": 0.99698,
+            "94": 0.99827,
+            "95": 0.99936,
+            "96": 0.99295,
+            "97": 0.99144,
+            "98": 0.99227,
+            "99": 0.98859,
+            "100": 0.99158
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
index 68b72267704..dc836c3d699 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.81442,
-            "2": 10.81882,
-            "3": 10.81551,
-            "4": 10.80292,
-            "5": 10.85144,
+            "1": 10.81455,
+            "2": 10.81846,
+            "3": 10.81528,
+            "4": 10.80297,
+            "5": 10.8513,
             "6": 10.85011,
-            "7": 10.83867,
-            "8": 10.83952,
-            "9": 10.82213,
-            "10": 10.77746,
-            "11": 10.86426,
-            "12": 10.83689,
-            "13": 10.85831,
-            "14": 10.86354,
-            "15": 10.79774,
-            "16": 10.79537,
-            "17": 10.77155,
-            "18": 10.78908,
-            "19": 10.78343,
-            "20": 10.71629,
-            "21": 10.6835,
-            "22": 10.53061,
-            "23": 10.69849,
-            "24": 10.58571,
-            "25": 10.52397,
-            "26": 10.58327,
-            "27": 10.60963,
-            "28": 10.57207,
-            "29": 10.59012,
-            "30": 10.35613,
-            "31": 10.09392,
-            "32": 10.45887,
-            "33": 10.45644,
-            "34": 10.20494,
-            "35": 10.26735,
-            "36": 10.22333,
-            "37": 10.35299,
-            "38": 10.19476,
-            "39": 10.41731,
-            "40": 10.08948,
-            "41": 10.12721,
-            "42": 10.21207,
-            "43": 9.8313,
-            "44": 9.96936,
-            "45": 9.83601,
-            "46": 9.81666,
-            "47": 10.1539,
-            "48": 9.85279,
-            "49": 9.53447,
-            "50": 9.91909,
-            "51": 9.85364,
-            "52": 9.74286,
-            "53": 10.07155,
-            "54": 9.96279,
-            "55": 9.88223,
-            "56": 9.63465,
-            "57": 9.48633,
-            "58": 9.84878,
-            "59": 9.58904,
-            "60": 9.51094,
-            "61": 9.7032,
-            "62": 9.99637,
-            "63": 9.40044,
-            "64": 9.78465,
-            "65": 8.95366,
-            "66": 9.71808,
-            "67": 9.36931,
-            "68": 9.79818,
-            "69": 9.79667,
-            "70": 9.74899,
-            "71": 9.63213,
-            "72": 9.59956,
-            "73": 9.50308,
-            "74": 8.95202,
-            "75": 9.43084,
-            "76": 9.09067,
-            "77": 10.08102,
-            "78": 9.73521,
-            "79": 9.38853,
+            "7": 10.83843,
+            "8": 10.83961,
+            "9": 10.82224,
+            "10": 10.77788,
+            "11": 10.86443,
+            "12": 10.83746,
+            "13": 10.85841,
+            "14": 10.86315,
+            "15": 10.79766,
+            "16": 10.79525,
+            "17": 10.77133,
+            "18": 10.78938,
+            "19": 10.78311,
+            "20": 10.71655,
+            "21": 10.68376,
+            "22": 10.53038,
+            "23": 10.69869,
+            "24": 10.5858,
+            "25": 10.52379,
+            "26": 10.58281,
+            "27": 10.6097,
+            "28": 10.57173,
+            "29": 10.59005,
+            "30": 10.35671,
+            "31": 10.09391,
+            "32": 10.45878,
+            "33": 10.45658,
+            "34": 10.20481,
+            "35": 10.26727,
+            "36": 10.22341,
+            "37": 10.35319,
+            "38": 10.19446,
+            "39": 10.41712,
+            "40": 10.08932,
+            "41": 10.12772,
+            "42": 10.21193,
+            "43": 9.83111,
+            "44": 9.96933,
+            "45": 9.83615,
+            "46": 9.81673,
+            "47": 10.15426,
+            "48": 9.85308,
+            "49": 9.53436,
+            "50": 9.91912,
+            "51": 9.85363,
+            "52": 9.74288,
+            "53": 10.07163,
+            "54": 9.96275,
+            "55": 9.88233,
+            "56": 9.63455,
+            "57": 9.48649,
+            "58": 9.84879,
+            "59": 9.589,
+            "60": 9.5109,
+            "61": 9.703,
+            "62": 9.99634,
+            "63": 9.40054,
+            "64": 9.78477,
+            "65": 8.95365,
+            "66": 9.71813,
+            "67": 9.36915,
+            "68": 9.79814,
+            "69": 9.79674,
+            "70": 9.74886,
+            "71": 9.63185,
+            "72": 9.59951,
+            "73": 9.50305,
+            "74": 8.95217,
+            "75": 9.43098,
+            "76": 9.09068,
+            "77": 10.08086,
+            "78": 9.7353,
+            "79": 9.38859,
             "80": 9.41418,
-            "81": 9.48403,
-            "82": 9.70907,
-            "83": 9.3152,
-            "84": 9.41838,
-            "85": 9.62222,
-            "86": 9.07945,
-            "87": 9.59202,
-            "88": 9.74953,
-            "89": 9.60441,
-            "90": 9.82577,
-            "91": 9.34232,
-            "92": 9.35837,
-            "93": 9.07969,
-            "94": 8.82793,
-            "95": 9.50864,
-            "96": 9.52117,
-            "97": 9.30605,
-            "98": 9.6658,
-            "99": 8.87716,
-            "100": 9.38997
+            "81": 9.48423,
+            "82": 9.70903,
+            "83": 9.3151,
+            "84": 9.41846,
+            "85": 9.62239,
+            "86": 9.07953,
+            "87": 9.59204,
+            "88": 9.74948,
+            "89": 9.60436,
+            "90": 9.82573,
+            "91": 9.34231,
+            "92": 9.35857,
+            "93": 9.07976,
+            "94": 8.82788,
+            "95": 9.50877,
+            "96": 9.52129,
+            "97": 9.30597,
+            "98": 9.66586,
+            "99": 8.87711,
+            "100": 9.38978
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 5488.0,
-            "2": 5704.0,
-            "3": 5788.0,
-            "4": 5853.0,
-            "5": 6401.0,
-            "6": 6686.0,
-            "7": 5949.0,
-            "8": 5811.0,
-            "9": 6280.0,
-            "10": 5192.0,
-            "11": 6645.0,
-            "12": 6193.0,
-            "13": 6525.0,
-            "14": 6487.0,
-            "15": 6258.0,
-            "16": 6261.0,
-            "17": 6080.0,
-            "18": 5901.0,
-            "19": 6228.0,
-            "20": 5713.0,
-            "21": 6265.0,
-            "22": 5788.0,
-            "23": 6618.0,
-            "24": 6159.0,
-            "25": 5674.0,
-            "26": 6218.0,
-            "27": 6180.0,
-            "28": 6802.0,
-            "29": 7006.0,
-            "30": 6195.0,
-            "31": 5847.0,
-            "32": 6680.0,
-            "33": 7327.0,
-            "34": 6433.0,
-            "35": 6593.0,
-            "36": 6717.0,
-            "37": 7545.0,
-            "38": 7130.0,
-            "39": 7928.0,
-            "40": 7233.0,
-            "41": 7093.0,
-            "42": 7653.0,
-            "43": 7136.0,
-            "44": 7113.0,
-            "45": 7167.0,
-            "46": 7435.0,
-            "47": 7501.0,
-            "48": 7648.0,
-            "49": 7520.0,
-            "50": 7701.0,
-            "51": 7847.0,
-            "52": 7828.0,
-            "53": 8765.0,
-            "54": 8799.0,
-            "55": 7683.0,
-            "56": 7972.0,
-            "57": 7642.0,
-            "58": 8419.0,
-            "59": 8276.0,
-            "60": 7917.0,
-            "61": 8598.0,
-            "62": 8394.0,
-            "63": 7896.0,
-            "64": 9047.0,
-            "65": 8280.0,
-            "66": 9315.0,
-            "67": 8277.0,
-            "68": 8341.0,
-            "69": 8737.0,
-            "70": 9764.0,
-            "71": 9050.0,
-            "72": 9036.0,
-            "73": 9076.0,
-            "74": 6969.0,
-            "75": 7833.0,
-            "76": 8450.0,
-            "77": 13505.0,
-            "78": 9634.0,
-            "79": 13982.0,
-            "80": 11548.0,
-            "81": 10035.0,
-            "82": 9732.0,
-            "83": 9037.0,
-            "84": 9522.0,
-            "85": 46479.0,
-            "86": 8626.0,
-            "87": 11964.0,
-            "88": 9637.0,
+            "1": 5566.0,
+            "2": 5749.0,
+            "3": 5881.0,
+            "4": 5840.0,
+            "5": 6476.0,
+            "6": 6425.0,
+            "7": 5900.0,
+            "8": 5783.0,
+            "9": 6426.0,
+            "10": 5252.0,
+            "11": 6722.0,
+            "12": 6169.0,
+            "13": 6556.0,
+            "14": 6524.0,
+            "15": 6116.0,
+            "16": 6245.0,
+            "17": 6139.0,
+            "18": 5888.0,
+            "19": 6375.0,
+            "20": 5773.0,
+            "21": 6188.0,
+            "22": 5742.0,
+            "23": 6768.0,
+            "24": 6000.0,
+            "25": 5852.0,
+            "26": 6285.0,
+            "27": 6357.0,
+            "28": 6586.0,
+            "29": 6742.0,
+            "30": 6214.0,
+            "31": 5775.0,
+            "32": 6746.0,
+            "33": 7205.0,
+            "34": 6344.0,
+            "35": 6686.0,
+            "36": 6743.0,
+            "37": 7281.0,
+            "38": 7228.0,
+            "39": 7810.0,
+            "40": 7116.0,
+            "41": 6902.0,
+            "42": 7809.0,
+            "43": 7110.0,
+            "44": 7040.0,
+            "45": 7058.0,
+            "46": 7292.0,
+            "47": 7813.0,
+            "48": 7672.0,
+            "49": 7601.0,
+            "50": 7605.0,
+            "51": 8105.0,
+            "52": 7792.0,
+            "53": 8870.0,
+            "54": 8700.0,
+            "55": 7685.0,
+            "56": 7975.0,
+            "57": 7544.0,
+            "58": 8539.0,
+            "59": 8275.0,
+            "60": 7822.0,
+            "61": 8316.0,
+            "62": 8493.0,
+            "63": 7748.0,
+            "64": 8801.0,
+            "65": 8269.0,
+            "66": 9209.0,
+            "67": 8382.0,
+            "68": 8362.0,
+            "69": 8644.0,
+            "70": 9785.0,
+            "71": 9060.0,
+            "72": 8909.0,
+            "73": 9217.0,
+            "74": 6949.0,
+            "75": 7960.0,
+            "76": 8489.0,
+            "77": 12484.0,
+            "78": 9598.0,
+            "79": 12984.0,
+            "80": 11398.0,
+            "81": 10221.0,
+            "82": 9615.0,
+            "83": 62741.0,
+            "84": 9936.0,
+            "85": 46541.0,
+            "86": 8528.0,
+            "87": 14916.0,
+            "88": 9710.0,
             "89": 10273.0,
-            "90": 11256.0,
-            "91": 8811.0,
-            "92": 9218.0,
-            "93": 8281.0,
-            "94": 9390.0,
-            "95": 9376.0,
-            "96": 13248.0,
-            "97": 8945.0,
-            "98": 10682.0,
-            "99": 15485.0,
-            "100": 9101.0
+            "90": 11178.0,
+            "91": 8856.0,
+            "92": 9337.0,
+            "93": 8404.0,
+            "94": 9649.0,
+            "95": 9657.0,
+            "96": 13226.0,
+            "97": 9093.0,
+            "98": 10575.0,
+            "99": 15320.0,
+            "100": 9363.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 628059136.0,
-            "2": 628060160.0,
-            "3": 628060160.0,
-            "4": 628060160.0,
-            "5": 628060160.0,
-            "6": 628060160.0,
-            "7": 628060160.0,
-            "8": 628060160.0,
-            "9": 628060160.0,
-            "10": 628060160.0,
-            "11": 628060160.0,
-            "12": 628060160.0,
-            "13": 628060160.0,
-            "14": 628060160.0,
-            "15": 628060160.0,
-            "16": 628060160.0,
-            "17": 628060160.0,
-            "18": 628060160.0,
-            "19": 628060160.0,
-            "20": 628060160.0,
-            "21": 628060160.0,
-            "22": 628060160.0,
-            "23": 628060160.0,
-            "24": 628060160.0,
-            "25": 628060160.0,
-            "26": 628060160.0,
-            "27": 628060160.0,
-            "28": 628060160.0,
-            "29": 628060160.0,
-            "30": 628060160.0,
-            "31": 628060160.0,
-            "32": 628060160.0,
-            "33": 628060160.0,
-            "34": 628060160.0,
-            "35": 628060160.0,
-            "36": 628060160.0,
-            "37": 628060160.0,
-            "38": 628060160.0,
-            "39": 628060160.0,
-            "40": 628060160.0,
-            "41": 628060160.0,
-            "42": 628060160.0,
-            "43": 628060160.0,
-            "44": 628060160.0,
-            "45": 628060160.0,
-            "46": 628060160.0,
-            "47": 628060160.0,
-            "48": 628060160.0,
-            "49": 628060160.0,
-            "50": 628060160.0,
-            "51": 628060160.0,
-            "52": 628060160.0,
-            "53": 628060160.0,
-            "54": 628060160.0,
-            "55": 628060160.0,
-            "56": 628060160.0,
-            "57": 628060160.0,
-            "58": 628060160.0,
-            "59": 628060160.0,
-            "60": 628060160.0,
-            "61": 628060160.0,
-            "62": 628060160.0,
-            "63": 628060160.0,
-            "64": 628060160.0,
-            "65": 628060160.0,
-            "66": 628060160.0,
-            "67": 628060160.0,
-            "68": 628060160.0,
-            "69": 628060160.0,
-            "70": 628060160.0,
-            "71": 628060160.0,
-            "72": 628060160.0,
-            "73": 628060160.0,
-            "74": 628060160.0,
-            "75": 628060160.0,
-            "76": 628060160.0,
-            "77": 628060160.0,
-            "78": 628060160.0,
-            "79": 628060160.0,
-            "80": 628060160.0,
-            "81": 628060160.0,
-            "82": 628060160.0,
-            "83": 628060160.0,
-            "84": 628060160.0,
-            "85": 628060160.0,
-            "86": 628060160.0,
-            "87": 628060160.0,
-            "88": 628060160.0,
-            "89": 628060160.0,
-            "90": 628060160.0,
-            "91": 628060160.0,
-            "92": 628060160.0,
-            "93": 628060160.0,
-            "94": 628060160.0,
-            "95": 628060160.0,
-            "96": 628060160.0,
-            "97": 628060160.0,
-            "98": 628060160.0,
-            "99": 628060160.0,
-            "100": 628060160.0
+            "1": 628645888.0,
+            "2": 628646912.0,
+            "3": 628646912.0,
+            "4": 628646912.0,
+            "5": 628646912.0,
+            "6": 628646912.0,
+            "7": 628646912.0,
+            "8": 628646912.0,
+            "9": 628646912.0,
+            "10": 628646912.0,
+            "11": 628646912.0,
+            "12": 628646912.0,
+            "13": 628646912.0,
+            "14": 628646912.0,
+            "15": 628646912.0,
+            "16": 628646912.0,
+            "17": 628646912.0,
+            "18": 628646912.0,
+            "19": 628646912.0,
+            "20": 628646912.0,
+            "21": 628646912.0,
+            "22": 628646912.0,
+            "23": 628646912.0,
+            "24": 628646912.0,
+            "25": 628646912.0,
+            "26": 628646912.0,
+            "27": 628646912.0,
+            "28": 628646912.0,
+            "29": 628646912.0,
+            "30": 628646912.0,
+            "31": 628646912.0,
+            "32": 628646912.0,
+            "33": 628646912.0,
+            "34": 628646912.0,
+            "35": 628646912.0,
+            "36": 628646912.0,
+            "37": 628646912.0,
+            "38": 628646912.0,
+            "39": 628646912.0,
+            "40": 628646912.0,
+            "41": 628646912.0,
+            "42": 628646912.0,
+            "43": 628646912.0,
+            "44": 628646912.0,
+            "45": 628646912.0,
+            "46": 628646912.0,
+            "47": 628646912.0,
+            "48": 628646912.0,
+            "49": 628646912.0,
+            "50": 628646912.0,
+            "51": 628646912.0,
+            "52": 628646912.0,
+            "53": 628646912.0,
+            "54": 628646912.0,
+            "55": 628646912.0,
+            "56": 628646912.0,
+            "57": 628646912.0,
+            "58": 628646912.0,
+            "59": 628646912.0,
+            "60": 628646912.0,
+            "61": 628646912.0,
+            "62": 628646912.0,
+            "63": 628646912.0,
+            "64": 628646912.0,
+            "65": 628646912.0,
+            "66": 628646912.0,
+            "67": 628646912.0,
+            "68": 628646912.0,
+            "69": 628646912.0,
+            "70": 628646912.0,
+            "71": 628646912.0,
+            "72": 628646912.0,
+            "73": 628646912.0,
+            "74": 628646912.0,
+            "75": 628646912.0,
+            "76": 628646912.0,
+            "77": 628646912.0,
+            "78": 628646912.0,
+            "79": 628646912.0,
+            "80": 628646912.0,
+            "81": 628646912.0,
+            "82": 628646912.0,
+            "83": 628646912.0,
+            "84": 628646912.0,
+            "85": 628646912.0,
+            "86": 628646912.0,
+            "87": 628646912.0,
+            "88": 628646912.0,
+            "89": 628646912.0,
+            "90": 628646912.0,
+            "91": 628646912.0,
+            "92": 628646912.0,
+            "93": 628646912.0,
+            "94": 628646912.0,
+            "95": 628646912.0,
+            "96": 628646912.0,
+            "97": 628646912.0,
+            "98": 628646912.0,
+            "99": 628646912.0,
+            "100": 628646912.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 966226944.0,
-            "2": 1135178752.0,
-            "3": 1135178752.0,
-            "4": 1142154752.0,
-            "5": 1142154752.0,
-            "6": 1142154752.0,
-            "7": 1142154752.0,
-            "8": 1142154752.0,
-            "9": 1142154752.0,
-            "10": 1142154752.0,
-            "11": 1142154752.0,
-            "12": 1142154752.0,
-            "13": 1142154752.0,
-            "14": 1142154752.0,
-            "15": 1142154752.0,
-            "16": 1142154752.0,
-            "17": 1142154752.0,
-            "18": 1142154752.0,
-            "19": 1142154752.0,
-            "20": 1142154752.0,
-            "21": 1142154752.0,
-            "22": 1142154752.0,
-            "23": 1142154752.0,
-            "24": 1142154752.0,
-            "25": 1142154752.0,
-            "26": 1142154752.0,
-            "27": 1142154752.0,
-            "28": 1142154752.0,
-            "29": 1142154752.0,
-            "30": 1142154752.0,
-            "31": 1142154752.0,
-            "32": 1142154752.0,
-            "33": 1142154752.0,
-            "34": 1142154752.0,
-            "35": 1142154752.0,
-            "36": 1142154752.0,
-            "37": 1142154752.0,
-            "38": 1142154752.0,
-            "39": 1142154752.0,
-            "40": 1142154752.0,
-            "41": 1142154752.0,
-            "42": 1142154752.0,
-            "43": 1142154752.0,
-            "44": 1142154752.0,
-            "45": 1142154752.0,
-            "46": 1142154752.0,
-            "47": 1142154752.0,
-            "48": 1142154752.0,
-            "49": 1142154752.0,
-            "50": 1142154752.0,
-            "51": 1142154752.0,
-            "52": 1142154752.0,
-            "53": 1142154752.0,
-            "54": 1142154752.0,
-            "55": 1142154752.0,
-            "56": 1142154752.0,
-            "57": 1142154752.0,
-            "58": 1142154752.0,
-            "59": 1142154752.0,
-            "60": 1142154752.0,
-            "61": 1145444352.0,
-            "62": 1145444352.0,
-            "63": 1145444352.0,
-            "64": 1145444352.0,
-            "65": 1145444352.0,
-            "66": 1145444352.0,
-            "67": 1145444352.0,
-            "68": 1145444352.0,
-            "69": 1145444352.0,
-            "70": 1145444352.0,
-            "71": 1145444352.0,
-            "72": 1145444352.0,
-            "73": 1145444352.0,
-            "74": 1145444352.0,
-            "75": 1145444352.0,
-            "76": 1149560320.0,
-            "77": 1149560320.0,
-            "78": 1149560320.0,
-            "79": 1149560320.0,
-            "80": 1149560320.0,
-            "81": 1149560320.0,
-            "82": 1149560320.0,
-            "83": 1149560320.0,
-            "84": 1149560320.0,
-            "85": 1149560320.0,
-            "86": 1149560320.0,
-            "87": 1149560320.0,
-            "88": 1149560320.0,
-            "89": 1149560320.0,
-            "90": 1149560320.0,
-            "91": 1149560320.0,
-            "92": 1149560320.0,
-            "93": 1149560320.0,
-            "94": 1149560320.0,
-            "95": 1149560320.0,
-            "96": 1149560320.0,
-            "97": 1149560320.0,
-            "98": 1149560320.0,
-            "99": 1149560320.0,
-            "100": 1149560320.0
+            "1": 982203392.0,
+            "2": 1149396992.0,
+            "3": 1149396992.0,
+            "4": 1155475456.0,
+            "5": 1155475456.0,
+            "6": 1155475456.0,
+            "7": 1155475456.0,
+            "8": 1155475456.0,
+            "9": 1155475456.0,
+            "10": 1155475456.0,
+            "11": 1155475456.0,
+            "12": 1155475456.0,
+            "13": 1155475456.0,
+            "14": 1155475456.0,
+            "15": 1155475456.0,
+            "16": 1155475456.0,
+            "17": 1155475456.0,
+            "18": 1155475456.0,
+            "19": 1155475456.0,
+            "20": 1155475456.0,
+            "21": 1155475456.0,
+            "22": 1155475456.0,
+            "23": 1155475456.0,
+            "24": 1155475456.0,
+            "25": 1155475456.0,
+            "26": 1155475456.0,
+            "27": 1155475456.0,
+            "28": 1155475456.0,
+            "29": 1155475456.0,
+            "30": 1155475456.0,
+            "31": 1155475456.0,
+            "32": 1155475456.0,
+            "33": 1155475456.0,
+            "34": 1155475456.0,
+            "35": 1155475456.0,
+            "36": 1155475456.0,
+            "37": 1155475456.0,
+            "38": 1155475456.0,
+            "39": 1155475456.0,
+            "40": 1155475456.0,
+            "41": 1155475456.0,
+            "42": 1155475456.0,
+            "43": 1155475456.0,
+            "44": 1155475456.0,
+            "45": 1155475456.0,
+            "46": 1155475456.0,
+            "47": 1155475456.0,
+            "48": 1155475456.0,
+            "49": 1155475456.0,
+            "50": 1155475456.0,
+            "51": 1155475456.0,
+            "52": 1155475456.0,
+            "53": 1155475456.0,
+            "54": 1155475456.0,
+            "55": 1155475456.0,
+            "56": 1155475456.0,
+            "57": 1155475456.0,
+            "58": 1155475456.0,
+            "59": 1155475456.0,
+            "60": 1155975680.0,
+            "61": 1159303168.0,
+            "62": 1159303168.0,
+            "63": 1159303168.0,
+            "64": 1159303168.0,
+            "65": 1159303168.0,
+            "66": 1159303168.0,
+            "67": 1159303168.0,
+            "68": 1159303168.0,
+            "69": 1159303168.0,
+            "70": 1159303168.0,
+            "71": 1159303168.0,
+            "72": 1159303168.0,
+            "73": 1159303168.0,
+            "74": 1159303168.0,
+            "75": 1159303168.0,
+            "76": 1164697088.0,
+            "77": 1164697088.0,
+            "78": 1164697088.0,
+            "79": 1164697088.0,
+            "80": 1164697088.0,
+            "81": 1164697088.0,
+            "82": 1164697088.0,
+            "83": 1164697088.0,
+            "84": 1164697088.0,
+            "85": 1164697088.0,
+            "86": 1164697088.0,
+            "87": 1164697088.0,
+            "88": 1164697088.0,
+            "89": 1164697088.0,
+            "90": 1164697088.0,
+            "91": 1164697088.0,
+            "92": 1164697088.0,
+            "93": 1164697088.0,
+            "94": 1164697088.0,
+            "95": 1164697088.0,
+            "96": 1164697088.0,
+            "97": 1164697088.0,
+            "98": 1164697088.0,
+            "99": 1164697088.0,
+            "100": 1164697088.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 20.38736,
-            "2": 0.68138,
-            "3": 0.62881,
-            "4": 0.61692,
-            "5": 0.61365,
-            "6": 0.60735,
-            "7": 0.60006,
-            "8": 0.59897,
-            "9": 0.59763,
-            "10": 0.6122,
-            "11": 0.59106,
-            "12": 0.59749,
-            "13": 0.60001,
-            "14": 0.58446,
-            "15": 0.57929,
-            "16": 0.58508,
-            "17": 0.5725,
-            "18": 0.57386,
-            "19": 0.57617,
-            "20": 0.57081,
-            "21": 0.57614,
-            "22": 0.57046,
-            "23": 0.57731,
-            "24": 0.56893,
-            "25": 0.58004,
-            "26": 0.56911,
-            "27": 0.60575,
-            "28": 0.61474,
-            "29": 0.58874,
-            "30": 0.57969,
-            "31": 0.57737,
-            "32": 0.58556,
-            "33": 0.5704,
-            "34": 0.57592,
-            "35": 0.58241,
-            "36": 0.57697,
-            "37": 0.57978,
-            "38": 0.57647,
-            "39": 0.56977,
-            "40": 0.58017,
-            "41": 0.57153,
-            "42": 0.57267,
-            "43": 0.5881,
-            "44": 0.57211,
-            "45": 0.59552,
-            "46": 0.56308,
-            "47": 0.5736,
-            "48": 0.58403,
-            "49": 0.57693,
-            "50": 0.57016,
-            "51": 0.57233,
-            "52": 0.55871,
-            "53": 0.5593,
-            "54": 0.55755,
-            "55": 0.56057,
-            "56": 0.56649,
-            "57": 0.56057,
-            "58": 0.56658,
-            "59": 0.55825,
-            "60": 0.57038,
-            "61": 0.5563,
-            "62": 0.56031,
-            "63": 0.56901,
-            "64": 0.56097,
-            "65": 0.56153,
-            "66": 0.56761,
-            "67": 0.5785,
-            "68": 0.57341,
-            "69": 0.57139,
-            "70": 0.56231,
-            "71": 0.55874,
-            "72": 0.55834,
-            "73": 0.55824,
-            "74": 0.5552,
-            "75": 0.5593,
-            "76": 0.56038,
-            "77": 0.56527,
-            "78": 0.56728,
-            "79": 0.56424,
-            "80": 0.55564,
-            "81": 0.55955,
-            "82": 0.55867,
-            "83": 0.56254,
-            "84": 0.55754,
-            "85": 0.55409,
-            "86": 0.55901,
-            "87": 0.55904,
-            "88": 0.57097,
-            "89": 0.5735,
-            "90": 0.55808,
-            "91": 0.55819,
-            "92": 0.58224,
-            "93": 0.55845,
-            "94": 0.56512,
-            "95": 0.5709,
-            "96": 0.56099,
-            "97": 0.56779,
-            "98": 0.55446,
-            "99": 0.56053,
-            "100": 0.56338
+            "1": 19.23269,
+            "2": 0.72886,
+            "3": 0.65505,
+            "4": 0.57926,
+            "5": 0.56473,
+            "6": 0.56262,
+            "7": 0.55541,
+            "8": 0.55169,
+            "9": 0.54588,
+            "10": 0.54513,
+            "11": 0.54209,
+            "12": 0.55074,
+            "13": 0.54861,
+            "14": 0.54825,
+            "15": 0.54517,
+            "16": 0.54378,
+            "17": 0.54038,
+            "18": 0.53418,
+            "19": 0.54272,
+            "20": 0.53786,
+            "21": 0.5453,
+            "22": 0.53544,
+            "23": 0.5385,
+            "24": 0.5306,
+            "25": 0.53752,
+            "26": 0.53028,
+            "27": 1.14331,
+            "28": 0.55476,
+            "29": 0.55192,
+            "30": 0.53922,
+            "31": 0.53776,
+            "32": 0.53422,
+            "33": 0.53153,
+            "34": 0.53781,
+            "35": 0.53428,
+            "36": 0.5321,
+            "37": 0.53103,
+            "38": 0.53328,
+            "39": 0.53189,
+            "40": 1.26265,
+            "41": 0.53531,
+            "42": 0.53252,
+            "43": 0.53665,
+            "44": 0.88396,
+            "45": 0.53586,
+            "46": 0.89593,
+            "47": 0.53907,
+            "48": 0.5309,
+            "49": 0.53767,
+            "50": 0.53491,
+            "51": 0.55263,
+            "52": 0.53343,
+            "53": 0.53673,
+            "54": 0.53859,
+            "55": 0.5329,
+            "56": 0.52954,
+            "57": 0.53085,
+            "58": 0.53458,
+            "59": 0.53132,
+            "60": 0.53967,
+            "61": 0.53205,
+            "62": 0.53559,
+            "63": 0.53393,
+            "64": 0.53143,
+            "65": 0.5339,
+            "66": 0.53358,
+            "67": 0.53117,
+            "68": 0.53709,
+            "69": 0.53768,
+            "70": 0.53628,
+            "71": 0.53275,
+            "72": 0.54058,
+            "73": 0.53091,
+            "74": 0.53069,
+            "75": 0.53307,
+            "76": 0.53389,
+            "77": 0.53403,
+            "78": 0.53188,
+            "79": 0.53173,
+            "80": 0.532,
+            "81": 0.53145,
+            "82": 0.5358,
+            "83": 0.53475,
+            "84": 0.5323,
+            "85": 0.54048,
+            "86": 0.53766,
+            "87": 0.53212,
+            "88": 0.53119,
+            "89": 0.53372,
+            "90": 0.53371,
+            "91": 0.53164,
+            "92": 0.53327,
+            "93": 0.54146,
+            "94": 0.53517,
+            "95": 0.53542,
+            "96": 0.5306,
+            "97": 0.53654,
+            "98": 0.53425,
+            "99": 0.53223,
+            "100": 0.53446
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..78918e95bae
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.85363,
+            "52": 9.74288,
+            "53": 10.07163,
+            "54": 9.96275,
+            "55": 9.88233,
+            "56": 9.63455,
+            "57": 9.48649,
+            "58": 9.84879,
+            "59": 9.589,
+            "60": 9.5109,
+            "61": 9.703,
+            "62": 9.99634,
+            "63": 9.40054,
+            "64": 9.78477,
+            "65": 8.95365,
+            "66": 9.71813,
+            "67": 9.36915,
+            "68": 9.79814,
+            "69": 9.79674,
+            "70": 9.74886,
+            "71": 9.63185,
+            "72": 9.59951,
+            "73": 9.50305,
+            "74": 8.95217,
+            "75": 9.43098,
+            "76": 9.09068,
+            "77": 10.08086,
+            "78": 9.7353,
+            "79": 9.38859,
+            "80": 9.41418,
+            "81": 9.48423,
+            "82": 9.70903,
+            "83": 9.3151,
+            "84": 9.41846,
+            "85": 9.62239,
+            "86": 9.07953,
+            "87": 9.59204,
+            "88": 9.74948,
+            "89": 9.60436,
+            "90": 9.82573,
+            "91": 9.34231,
+            "92": 9.35857,
+            "93": 9.07976,
+            "94": 8.82788,
+            "95": 9.50877,
+            "96": 9.52129,
+            "97": 9.30597,
+            "98": 9.66586,
+            "99": 8.87711,
+            "100": 9.38978
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 8105.0,
+            "52": 7792.0,
+            "53": 8870.0,
+            "54": 8700.0,
+            "55": 7685.0,
+            "56": 7975.0,
+            "57": 7544.0,
+            "58": 8539.0,
+            "59": 8275.0,
+            "60": 7822.0,
+            "61": 8316.0,
+            "62": 8493.0,
+            "63": 7748.0,
+            "64": 8801.0,
+            "65": 8269.0,
+            "66": 9209.0,
+            "67": 8382.0,
+            "68": 8362.0,
+            "69": 8644.0,
+            "70": 9785.0,
+            "71": 9060.0,
+            "72": 8909.0,
+            "73": 9217.0,
+            "74": 6949.0,
+            "75": 7960.0,
+            "76": 8489.0,
+            "77": 12484.0,
+            "78": 9598.0,
+            "79": 12984.0,
+            "80": 11398.0,
+            "81": 10221.0,
+            "82": 9615.0,
+            "83": 62741.0,
+            "84": 9936.0,
+            "85": 46541.0,
+            "86": 8528.0,
+            "87": 14916.0,
+            "88": 9710.0,
+            "89": 10273.0,
+            "90": 11178.0,
+            "91": 8856.0,
+            "92": 9337.0,
+            "93": 8404.0,
+            "94": 9649.0,
+            "95": 9657.0,
+            "96": 13226.0,
+            "97": 9093.0,
+            "98": 10575.0,
+            "99": 15320.0,
+            "100": 9363.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 628064256.0,
+            "52": 628065280.0,
+            "53": 628065280.0,
+            "54": 628065280.0,
+            "55": 628065280.0,
+            "56": 628065280.0,
+            "57": 628065280.0,
+            "58": 628065280.0,
+            "59": 628065280.0,
+            "60": 628065280.0,
+            "61": 628065280.0,
+            "62": 628065280.0,
+            "63": 628065280.0,
+            "64": 628065280.0,
+            "65": 628065280.0,
+            "66": 628065280.0,
+            "67": 628065280.0,
+            "68": 628065280.0,
+            "69": 628065280.0,
+            "70": 628065280.0,
+            "71": 628065280.0,
+            "72": 628065280.0,
+            "73": 628065280.0,
+            "74": 628065280.0,
+            "75": 628065280.0,
+            "76": 628065280.0,
+            "77": 628065280.0,
+            "78": 628065280.0,
+            "79": 628065280.0,
+            "80": 628065280.0,
+            "81": 628065280.0,
+            "82": 628065280.0,
+            "83": 628065280.0,
+            "84": 628065280.0,
+            "85": 628065280.0,
+            "86": 628065280.0,
+            "87": 628065280.0,
+            "88": 628065280.0,
+            "89": 628065280.0,
+            "90": 628065280.0,
+            "91": 628065280.0,
+            "92": 628065280.0,
+            "93": 628065280.0,
+            "94": 628065280.0,
+            "95": 628065280.0,
+            "96": 628065280.0,
+            "97": 628065280.0,
+            "98": 628065280.0,
+            "99": 628065280.0,
+            "100": 628065280.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1148061696.0,
+            "52": 1150909952.0,
+            "53": 1154032640.0,
+            "54": 1154032640.0,
+            "55": 1154032640.0,
+            "56": 1154885120.0,
+            "57": 1154885120.0,
+            "58": 1154885120.0,
+            "59": 1154885120.0,
+            "60": 1158400512.0,
+            "61": 1161243648.0,
+            "62": 1161243648.0,
+            "63": 1161243648.0,
+            "64": 1161243648.0,
+            "65": 1161243648.0,
+            "66": 1161243648.0,
+            "67": 1161243648.0,
+            "68": 1161243648.0,
+            "69": 1161243648.0,
+            "70": 1161243648.0,
+            "71": 1161243648.0,
+            "72": 1161243648.0,
+            "73": 1161243648.0,
+            "74": 1161243648.0,
+            "75": 1161243648.0,
+            "76": 1164402176.0,
+            "77": 1164402176.0,
+            "78": 1164402176.0,
+            "79": 1164402176.0,
+            "80": 1164402176.0,
+            "81": 1164402176.0,
+            "82": 1164402176.0,
+            "83": 1164402176.0,
+            "84": 1164402176.0,
+            "85": 1164402176.0,
+            "86": 1164402176.0,
+            "87": 1164402176.0,
+            "88": 1164402176.0,
+            "89": 1164402176.0,
+            "90": 1164402176.0,
+            "91": 1164402176.0,
+            "92": 1164402176.0,
+            "93": 1164402176.0,
+            "94": 1164402176.0,
+            "95": 1164402176.0,
+            "96": 1164402176.0,
+            "97": 1164402176.0,
+            "98": 1164402176.0,
+            "99": 1164402176.0,
+            "100": 1164402176.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 19.75998,
+            "52": 0.75632,
+            "53": 0.61311,
+            "54": 0.58323,
+            "55": 0.58626,
+            "56": 0.55076,
+            "57": 0.55884,
+            "58": 0.55879,
+            "59": 0.55701,
+            "60": 0.55258,
+            "61": 0.54558,
+            "62": 0.54571,
+            "63": 0.52564,
+            "64": 0.52057,
+            "65": 0.52606,
+            "66": 0.52186,
+            "67": 0.51907,
+            "68": 0.52677,
+            "69": 0.52114,
+            "70": 0.51963,
+            "71": 0.51192,
+            "72": 0.51671,
+            "73": 0.53544,
+            "74": 0.53543,
+            "75": 0.53296,
+            "76": 0.53665,
+            "77": 0.53249,
+            "78": 0.53515,
+            "79": 0.53542,
+            "80": 0.53567,
+            "81": 0.53848,
+            "82": 0.55706,
+            "83": 0.52186,
+            "84": 0.51342,
+            "85": 0.53509,
+            "86": 0.53067,
+            "87": 0.51458,
+            "88": 0.53017,
+            "89": 0.52642,
+            "90": 0.52796,
+            "91": 0.5213,
+            "92": 0.52233,
+            "93": 0.52409,
+            "94": 0.52466,
+            "95": 0.52364,
+            "96": 0.52347,
+            "97": 0.52512,
+            "98": 0.52375,
+            "99": 0.52859,
+            "100": 0.52625
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json
new file mode 100644
index 00000000000..8710366a4a2
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json
@@ -0,0 +1,644 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.94839,
+            "2": 10.94024,
+            "3": 10.95902,
+            "4": 10.9592,
+            "5": 10.93942,
+            "6": 10.95284,
+            "7": 10.95227,
+            "8": 10.93987,
+            "9": 10.94518,
+            "10": 10.94146,
+            "11": 10.94366,
+            "12": 10.93351,
+            "13": 10.92937,
+            "14": 10.93117,
+            "15": 10.87714,
+            "16": 10.88218,
+            "17": 10.87388,
+            "18": 10.86829,
+            "19": 10.86292,
+            "20": 10.78627,
+            "21": 10.73278,
+            "22": 10.62202,
+            "23": 10.72355,
+            "24": 10.61784,
+            "25": 10.54739,
+            "26": 10.64163,
+            "27": 10.63354,
+            "28": 10.59007,
+            "29": 10.59937,
+            "30": 10.36921,
+            "31": 10.1175,
+            "32": 10.457,
+            "33": 10.45238,
+            "34": 10.18943,
+            "35": 10.24409,
+            "36": 10.20779,
+            "37": 10.32099,
+            "38": 10.17141,
+            "39": 10.39579,
+            "40": 10.03318,
+            "41": 10.08573,
+            "42": 10.17487,
+            "43": 9.7274,
+            "44": 9.88257,
+            "45": 9.73978,
+            "46": 9.72104,
+            "47": 10.08354,
+            "48": 9.75251,
+            "49": 9.39373,
+            "50": 9.83765,
+            "51": 9.76236,
+            "52": 9.65444,
+            "53": 10.01594,
+            "54": 9.86969,
+            "55": 9.79645,
+            "56": 9.53492,
+            "57": 9.365,
+            "58": 9.75243,
+            "59": 9.4751,
+            "60": 9.40362,
+            "61": 9.59124,
+            "62": 9.91012,
+            "63": 9.24082,
+            "64": 9.67728,
+            "65": 8.79731,
+            "66": 9.60544,
+            "67": 9.24212,
+            "68": 9.70475,
+            "69": 9.70741,
+            "70": 9.65988,
+            "71": 9.50626,
+            "72": 9.45834,
+            "73": 9.38692,
+            "74": 8.79461,
+            "75": 9.32175,
+            "76": 8.92857,
+            "77": 9.99456,
+            "78": 9.63104,
+            "79": 9.26692,
+            "80": 9.29144,
+            "81": 9.37768,
+            "82": 9.60984,
+            "83": 9.21108,
+            "84": 9.33667,
+            "85": 9.52726,
+            "86": 8.94539,
+            "87": 9.49937,
+            "88": 9.67766,
+            "89": 9.49525,
+            "90": 9.7509,
+            "91": 9.22918,
+            "92": 9.25394,
+            "93": 8.96194,
+            "94": 8.69021,
+            "95": 9.43531,
+            "96": 9.39886,
+            "97": 9.19199,
+            "98": 9.57248,
+            "99": 8.75688,
+            "100": 9.2924
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 22750392.0,
+            "2": 22953048.0,
+            "3": 22604332.0,
+            "4": 23266194.0,
+            "5": 22735592.0,
+            "6": 23061740.0,
+            "7": 22793278.0,
+            "8": 22960884.0,
+            "9": 22865532.0,
+            "10": 22950250.0,
+            "11": 22499592.0,
+            "12": 22455936.0,
+            "13": 22948014.0,
+            "14": 22384528.0,
+            "15": 22846172.0,
+            "16": 22856720.0,
+            "17": 22836352.0,
+            "18": 22590198.0,
+            "19": 22627006.0,
+            "20": 22712338.0,
+            "21": 22762590.0,
+            "22": 22816896.0,
+            "23": 22545280.0,
+            "24": 22794372.0,
+            "25": 22841964.0,
+            "26": 22549700.0,
+            "27": 22464724.0,
+            "28": 22453634.0,
+            "29": 22534620.0,
+            "30": 22636106.0,
+            "31": 22989484.0,
+            "32": 22593994.0,
+            "33": 22565948.0,
+            "34": 22855396.0,
+            "35": 22813606.0,
+            "36": 22595412.0,
+            "37": 22499234.0,
+            "38": 22926180.0,
+            "39": 22825052.0,
+            "40": 22675880.0,
+            "41": 22671624.0,
+            "42": 22682188.0,
+            "43": 23015228.0,
+            "44": 22766040.0,
+            "45": 22679588.0,
+            "46": 22915144.0,
+            "47": 22642744.0,
+            "48": 24003236.0,
+            "49": 23786618.0,
+            "50": 22931756.0,
+            "51": 23866290.0,
+            "52": 23807188.0,
+            "53": 24007482.0,
+            "54": 23916892.0,
+            "55": 23571308.0,
+            "56": 23954192.0,
+            "57": 24211600.0,
+            "58": 23914524.0,
+            "59": 23771900.0,
+            "60": 23813638.0,
+            "61": 23795512.0,
+            "62": 23739412.0,
+            "63": 23917700.0,
+            "64": 23895780.0,
+            "65": 24147262.0,
+            "66": 23794750.0,
+            "67": 23983810.0,
+            "68": 23674060.0,
+            "69": 23647770.0,
+            "70": 23907338.0,
+            "71": 23818256.0,
+            "72": 23723392.0,
+            "73": 22754048.0,
+            "74": 25181258.0,
+            "75": 24144968.0,
+            "76": 23976372.0,
+            "77": 22260516.0,
+            "78": 23862138.0,
+            "79": 23806872.0,
+            "80": 23773052.0,
+            "81": 25020468.0,
+            "82": 22812998.0,
+            "83": 23911992.0,
+            "84": 25144028.0,
+            "85": 22725432.0,
+            "86": 24205484.0,
+            "87": 24851672.0,
+            "88": 23700260.0,
+            "89": 22505492.0,
+            "90": 24062928.0,
+            "91": 22790310.0,
+            "92": 24923596.0,
+            "93": 23722104.0,
+            "94": 23993086.0,
+            "95": 24140048.0,
+            "96": 23909352.0,
+            "97": 23668280.0,
+            "98": 23832272.0,
+            "99": 23985032.0,
+            "100": 24101560.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 810140160.0,
+            "2": 804531200.0,
+            "3": 804531200.0,
+            "4": 934860800.0,
+            "5": 934860800.0,
+            "6": 934860800.0,
+            "7": 934860800.0,
+            "8": 934860800.0,
+            "9": 938611712.0,
+            "10": 938104832.0,
+            "11": 938379264.0,
+            "12": 934860800.0,
+            "13": 934860800.0,
+            "14": 934860800.0,
+            "15": 934860800.0,
+            "16": 942249984.0,
+            "17": 941443072.0,
+            "18": 937990144.0,
+            "19": 937548800.0,
+            "20": 937498624.0,
+            "21": 934860800.0,
+            "22": 934860800.0,
+            "23": 941533184.0,
+            "24": 942114816.0,
+            "25": 942398464.0,
+            "26": 934860800.0,
+            "27": 934860800.0,
+            "28": 934860800.0,
+            "29": 934860800.0,
+            "30": 934860800.0,
+            "31": 934860800.0,
+            "32": 934860800.0,
+            "33": 934860800.0,
+            "34": 941477888.0,
+            "35": 934860800.0,
+            "36": 934860800.0,
+            "37": 934860800.0,
+            "38": 934860800.0,
+            "39": 934860800.0,
+            "40": 934860800.0,
+            "41": 940742656.0,
+            "42": 940742656.0,
+            "43": 940742656.0,
+            "44": 940968960.0,
+            "45": 941581312.0,
+            "46": 934860800.0,
+            "47": 934860800.0,
+            "48": 940742656.0,
+            "49": 934860800.0,
+            "50": 934860800.0,
+            "51": 934860800.0,
+            "52": 940742656.0,
+            "53": 937498624.0,
+            "54": 937498624.0,
+            "55": 937498624.0,
+            "56": 937498624.0,
+            "57": 938199040.0,
+            "58": 937498624.0,
+            "59": 937498624.0,
+            "60": 940742656.0,
+            "61": 934860800.0,
+            "62": 934860800.0,
+            "63": 934860800.0,
+            "64": 934860800.0,
+            "65": 934860800.0,
+            "66": 934860800.0,
+            "67": 934860800.0,
+            "68": 934860800.0,
+            "69": 934860800.0,
+            "70": 934860800.0,
+            "71": 934860800.0,
+            "72": 934860800.0,
+            "73": 934860800.0,
+            "74": 934860800.0,
+            "75": 934860800.0,
+            "76": 934860800.0,
+            "77": 934860800.0,
+            "78": 934860800.0,
+            "79": 938199040.0,
+            "80": 938199040.0,
+            "81": 937498624.0,
+            "82": 938061824.0,
+            "83": 938412032.0,
+            "84": 937498624.0,
+            "85": 938199040.0,
+            "86": 938445824.0,
+            "87": 937498624.0,
+            "88": 937498624.0,
+            "89": 934860800.0,
+            "90": 934860800.0,
+            "91": 934860800.0,
+            "92": 940742656.0,
+            "93": 940742656.0,
+            "94": 938104832.0,
+            "95": 941451264.0,
+            "96": 940742656.0,
+            "97": 941542400.0,
+            "98": 938104832.0,
+            "99": 940742656.0,
+            "100": 938104832.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 1017976320.0,
+            "2": 1226964480.0,
+            "3": 1228012032.0,
+            "4": 1300063744.0,
+            "5": 1300063744.0,
+            "6": 1300223488.0,
+            "7": 1300891648.0,
+            "8": 1300891648.0,
+            "9": 1300891648.0,
+            "10": 1303292416.0,
+            "11": 1303292416.0,
+            "12": 1303292416.0,
+            "13": 1303292416.0,
+            "14": 1303292416.0,
+            "15": 1303292416.0,
+            "16": 1303292416.0,
+            "17": 1303292416.0,
+            "18": 1303292416.0,
+            "19": 1303292416.0,
+            "20": 1303292416.0,
+            "21": 1303292416.0,
+            "22": 1303292416.0,
+            "23": 1303292416.0,
+            "24": 1303292416.0,
+            "25": 1303292416.0,
+            "26": 1303292416.0,
+            "27": 1303292416.0,
+            "28": 1303292416.0,
+            "29": 1303292416.0,
+            "30": 1303292416.0,
+            "31": 1303292416.0,
+            "32": 1303292416.0,
+            "33": 1303292416.0,
+            "34": 1303292416.0,
+            "35": 1303292416.0,
+            "36": 1303292416.0,
+            "37": 1303292416.0,
+            "38": 1303292416.0,
+            "39": 1303292416.0,
+            "40": 1303292416.0,
+            "41": 1303292416.0,
+            "42": 1303292416.0,
+            "43": 1303292416.0,
+            "44": 1303292416.0,
+            "45": 1303292416.0,
+            "46": 1303292416.0,
+            "47": 1303292416.0,
+            "48": 1303292416.0,
+            "49": 1303292416.0,
+            "50": 1303292416.0,
+            "51": 1303292416.0,
+            "52": 1303292416.0,
+            "53": 1303292416.0,
+            "54": 1303292416.0,
+            "55": 1303292416.0,
+            "56": 1303292416.0,
+            "57": 1303292416.0,
+            "58": 1303292416.0,
+            "59": 1303292416.0,
+            "60": 1303292416.0,
+            "61": 1303292416.0,
+            "62": 1303292416.0,
+            "63": 1303292416.0,
+            "64": 1303292416.0,
+            "65": 1303292416.0,
+            "66": 1303292416.0,
+            "67": 1303292416.0,
+            "68": 1303292416.0,
+            "69": 1303292416.0,
+            "70": 1303292416.0,
+            "71": 1303292416.0,
+            "72": 1303292416.0,
+            "73": 1303292416.0,
+            "74": 1303292416.0,
+            "75": 1303292416.0,
+            "76": 1303292416.0,
+            "77": 1303292416.0,
+            "78": 1303292416.0,
+            "79": 1303292416.0,
+            "80": 1303292416.0,
+            "81": 1303292416.0,
+            "82": 1303292416.0,
+            "83": 1303292416.0,
+            "84": 1303292416.0,
+            "85": 1303292416.0,
+            "86": 1303292416.0,
+            "87": 1303292416.0,
+            "88": 1303292416.0,
+            "89": 1303292416.0,
+            "90": 1303292416.0,
+            "91": 1303292416.0,
+            "92": 1303292416.0,
+            "93": 1303292416.0,
+            "94": 1303292416.0,
+            "95": 1303292416.0,
+            "96": 1303292416.0,
+            "97": 1303292416.0,
+            "98": 1303292416.0,
+            "99": 1303292416.0,
+            "100": 1303292416.0
+        }
+    },
+    "mtp_1 loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.89507,
+            "2": 10.90521,
+            "3": 10.90879,
+            "4": 10.86406,
+            "5": 10.91765,
+            "6": 10.92332,
+            "7": 10.90072,
+            "8": 10.8906,
+            "9": 10.90544,
+            "10": 10.88636,
+            "11": 10.93328,
+            "12": 10.91582,
+            "13": 10.90917,
+            "14": 10.92294,
+            "15": 10.89802,
+            "16": 10.90337,
+            "17": 10.88446,
+            "18": 10.90526,
+            "19": 10.90011,
+            "20": 10.88775,
+            "21": 10.88103,
+            "22": 10.85514,
+            "23": 10.89267,
+            "24": 10.87352,
+            "25": 10.86182,
+            "26": 10.87152,
+            "27": 10.88847,
+            "28": 10.87872,
+            "29": 10.88744,
+            "30": 10.87297,
+            "31": 10.80177,
+            "32": 10.8732,
+            "33": 10.88219,
+            "34": 10.83823,
+            "35": 10.85291,
+            "36": 10.84901,
+            "37": 10.85873,
+            "38": 10.83148,
+            "39": 10.86289,
+            "40": 10.82147,
+            "41": 10.82913,
+            "42": 10.84798,
+            "43": 10.7908,
+            "44": 10.81431,
+            "45": 10.7879,
+            "46": 10.78018,
+            "47": 10.83142,
+            "48": 10.78854,
+            "49": 10.71024,
+            "50": 10.76861,
+            "51": 10.76331,
+            "52": 10.73945,
+            "53": 10.80126,
+            "54": 10.77704,
+            "55": 10.765,
+            "56": 10.71649,
+            "57": 10.67368,
+            "58": 10.75107,
+            "59": 10.69607,
+            "60": 10.66236,
+            "61": 10.69617,
+            "62": 10.77167,
+            "63": 10.6134,
+            "64": 10.70881,
+            "65": 10.49259,
+            "66": 10.66843,
+            "67": 10.58084,
+            "68": 10.68215,
+            "69": 10.68669,
+            "70": 10.67296,
+            "71": 10.64397,
+            "72": 10.60997,
+            "73": 10.56734,
+            "74": 10.38624,
+            "75": 10.53623,
+            "76": 10.40297,
+            "77": 10.75436,
+            "78": 10.62548,
+            "79": 10.47858,
+            "80": 10.47388,
+            "81": 10.5143,
+            "82": 10.58579,
+            "83": 10.43913,
+            "84": 10.45418,
+            "85": 10.55042,
+            "86": 10.27831,
+            "87": 10.51067,
+            "88": 10.60469,
+            "89": 10.5084,
+            "90": 10.60243,
+            "91": 10.38487,
+            "92": 10.38165,
+            "93": 10.23549,
+            "94": 10.07844,
+            "95": 10.42709,
+            "96": 10.44697,
+            "97": 10.31686,
+            "98": 10.4968,
+            "99": 10.04966,
+            "100": 10.32944
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 19.93451,
+            "3": 2.31445,
+            "4": 5.28856,
+            "5": 1.09994,
+            "6": 1.09399,
+            "7": 1.09697,
+            "8": 1.09872,
+            "9": 1.17005,
+            "10": 1.10071,
+            "11": 1.0994,
+            "12": 1.08313,
+            "13": 1.09364,
+            "14": 1.09082,
+            "15": 1.09269,
+            "16": 1.08133,
+            "17": 1.08872,
+            "18": 1.09032,
+            "19": 1.10458,
+            "20": 1.10126,
+            "21": 1.09029,
+            "22": 1.19723,
+            "23": 1.36303,
+            "24": 1.39758,
+            "25": 1.40863,
+            "26": 1.40985,
+            "27": 1.40231,
+            "28": 1.42816,
+            "29": 1.37678,
+            "30": 1.40545,
+            "31": 1.40841,
+            "32": 1.40385,
+            "33": 1.39528,
+            "34": 1.4028,
+            "35": 1.41768,
+            "36": 1.40649,
+            "37": 1.41813,
+            "38": 1.40674,
+            "39": 1.38881,
+            "40": 1.40998,
+            "41": 1.37775,
+            "42": 1.39701,
+            "43": 1.3967,
+            "44": 1.40408,
+            "45": 1.40972,
+            "46": 1.41116,
+            "47": 1.40427,
+            "48": 1.38905,
+            "49": 1.42541,
+            "50": 1.40474,
+            "51": 1.40708,
+            "52": 1.37484,
+            "53": 1.38539,
+            "54": 1.39988,
+            "55": 1.39635,
+            "56": 1.41326,
+            "57": 1.40012,
+            "58": 1.40599,
+            "59": 1.41023,
+            "60": 1.4209,
+            "61": 1.41702,
+            "62": 1.40134,
+            "63": 1.40282,
+            "64": 1.40573,
+            "65": 1.41933,
+            "66": 1.40057,
+            "67": 1.41526,
+            "68": 1.40285,
+            "69": 1.41947,
+            "70": 1.37747,
+            "71": 1.41206,
+            "72": 1.39123,
+            "73": 1.42381,
+            "74": 1.40806,
+            "75": 1.40032,
+            "76": 1.41783,
+            "77": 1.39133,
+            "78": 1.41146,
+            "79": 1.42648,
+            "80": 1.40774,
+            "81": 1.40046,
+            "82": 1.39158,
+            "83": 1.4079,
+            "84": 1.40469,
+            "85": 1.39689,
+            "86": 1.41401,
+            "87": 1.40637,
+            "88": 1.40569,
+            "89": 1.45225,
+            "90": 1.39469,
+            "91": 1.39677,
+            "92": 1.39569,
+            "93": 1.38882,
+            "94": 1.40133,
+            "95": 1.41493,
+            "96": 1.40659,
+            "97": 1.39059,
+            "98": 1.40044,
+            "99": 1.41118,
+            "100": 1.39159
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..0954418053d
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json
@@ -0,0 +1,644 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.93691,
+            "2": 10.93262,
+            "3": 10.94243,
+            "4": 10.95011,
+            "5": 10.9502,
+            "6": 10.94175,
+            "7": 10.94469,
+            "8": 10.93675,
+            "9": 10.94939,
+            "10": 10.9367,
+            "11": 10.94082,
+            "12": 10.93794,
+            "13": 10.92338,
+            "14": 10.93415,
+            "15": 10.88723,
+            "16": 10.87495,
+            "17": 10.86864,
+            "18": 10.86127,
+            "19": 10.86341,
+            "20": 10.78125,
+            "21": 10.73131,
+            "22": 10.60371,
+            "23": 10.73309,
+            "24": 10.61865,
+            "25": 10.55175,
+            "26": 10.62651,
+            "27": 10.63921,
+            "28": 10.59104,
+            "29": 10.5981,
+            "30": 10.37817,
+            "31": 10.12235,
+            "32": 10.46117,
+            "33": 10.45537,
+            "34": 10.20087,
+            "35": 10.25661,
+            "36": 10.20876,
+            "37": 10.33662,
+            "38": 10.16683,
+            "39": 10.40916,
+            "40": 10.05209,
+            "41": 10.09427,
+            "42": 10.17821,
+            "43": 9.74204,
+            "44": 9.89005,
+            "45": 9.74011,
+            "46": 9.72669,
+            "47": 10.09152,
+            "48": 9.75295,
+            "49": 9.40186,
+            "50": 9.83645,
+            "51": 9.77036,
+            "52": 9.65641,
+            "53": 10.03067,
+            "54": 9.87916,
+            "55": 9.79619,
+            "56": 9.52858,
+            "57": 9.36596,
+            "58": 9.75327,
+            "59": 9.48259,
+            "60": 9.40835,
+            "61": 9.60202,
+            "62": 9.90742,
+            "63": 9.25777,
+            "64": 9.68411,
+            "65": 8.79911,
+            "66": 9.60796,
+            "67": 9.25427,
+            "68": 9.71419,
+            "69": 9.71666,
+            "70": 9.6613,
+            "71": 9.52439,
+            "72": 9.4709,
+            "73": 9.38862,
+            "74": 8.80286,
+            "75": 9.34004,
+            "76": 8.93543,
+            "77": 9.99337,
+            "78": 9.64723,
+            "79": 9.28126,
+            "80": 9.29633,
+            "81": 9.39609,
+            "82": 9.60877,
+            "83": 9.21694,
+            "84": 9.34008,
+            "85": 9.53009,
+            "86": 8.95652,
+            "87": 9.51691,
+            "88": 9.68221,
+            "89": 9.50553,
+            "90": 9.753,
+            "91": 9.2347,
+            "92": 9.26019,
+            "93": 8.94568,
+            "94": 8.69194,
+            "95": 9.44616,
+            "96": 9.41008,
+            "97": 9.20125,
+            "98": 9.58169,
+            "99": 8.75946,
+            "100": 9.29483
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 22750372.0,
+            "2": 22953180.0,
+            "3": 22604424.0,
+            "4": 23266362.0,
+            "5": 22735560.0,
+            "6": 23061884.0,
+            "7": 22793368.0,
+            "8": 22960792.0,
+            "9": 22865612.0,
+            "10": 22950328.0,
+            "11": 22499656.0,
+            "12": 22456052.0,
+            "13": 22948014.0,
+            "14": 22384498.0,
+            "15": 22846334.0,
+            "16": 22856854.0,
+            "17": 22836340.0,
+            "18": 22590220.0,
+            "19": 22627128.0,
+            "20": 22712376.0,
+            "21": 22762744.0,
+            "22": 22816900.0,
+            "23": 22545168.0,
+            "24": 22794340.0,
+            "25": 22841898.0,
+            "26": 22549680.0,
+            "27": 22464852.0,
+            "28": 22453780.0,
+            "29": 22534588.0,
+            "30": 22636160.0,
+            "31": 22989382.0,
+            "32": 22594002.0,
+            "33": 22566000.0,
+            "34": 22855476.0,
+            "35": 22813640.0,
+            "36": 22595484.0,
+            "37": 22499348.0,
+            "38": 22926172.0,
+            "39": 22825344.0,
+            "40": 22675752.0,
+            "41": 22671542.0,
+            "42": 22682408.0,
+            "43": 23014140.0,
+            "44": 22768504.0,
+            "45": 22679044.0,
+            "46": 22912572.0,
+            "47": 23691904.0,
+            "48": 24003148.0,
+            "49": 23786764.0,
+            "50": 22931654.0,
+            "51": 23866164.0,
+            "52": 23807242.0,
+            "53": 24007504.0,
+            "54": 22867916.0,
+            "55": 23571280.0,
+            "56": 23954212.0,
+            "57": 24211680.0,
+            "58": 23914512.0,
+            "59": 22722820.0,
+            "60": 23813508.0,
+            "61": 23796364.0,
+            "62": 23739896.0,
+            "63": 24965914.0,
+            "64": 23898698.0,
+            "65": 24150860.0,
+            "66": 23796512.0,
+            "67": 25032960.0,
+            "68": 23673048.0,
+            "69": 23644684.0,
+            "70": 23903614.0,
+            "71": 24864656.0,
+            "72": 24766928.0,
+            "73": 24850636.0,
+            "74": 24133166.0,
+            "75": 24143912.0,
+            "76": 25025406.0,
+            "77": 24358344.0,
+            "78": 24910132.0,
+            "79": 23808164.0,
+            "80": 23772256.0,
+            "81": 25020440.0,
+            "82": 23851242.0,
+            "83": 23911824.0,
+            "84": 25143864.0,
+            "85": 24823592.0,
+            "86": 23153228.0,
+            "87": 24850332.0,
+            "88": 24749368.0,
+            "89": 22505174.0,
+            "90": 25108752.0,
+            "91": 23838548.0,
+            "92": 24923816.0,
+            "93": 24769484.0,
+            "94": 25041572.0,
+            "95": 25189350.0,
+            "96": 23909318.0,
+            "97": 23664104.0,
+            "98": 23832392.0,
+            "99": 23981812.0,
+            "100": 24101144.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 773784064.0,
+            "2": 775203840.0,
+            "3": 766700544.0,
+            "4": 937129984.0,
+            "5": 934836224.0,
+            "6": 934836224.0,
+            "7": 935983104.0,
+            "8": 937129984.0,
+            "9": 937129984.0,
+            "10": 937129984.0,
+            "11": 937129984.0,
+            "12": 937129984.0,
+            "13": 937129984.0,
+            "14": 934836224.0,
+            "15": 934836224.0,
+            "16": 935376896.0,
+            "17": 935983104.0,
+            "18": 937129984.0,
+            "19": 937129984.0,
+            "20": 937129984.0,
+            "21": 937129984.0,
+            "22": 934836224.0,
+            "23": 934836224.0,
+            "24": 935376896.0,
+            "25": 937129984.0,
+            "26": 937129984.0,
+            "27": 937129984.0,
+            "28": 934836224.0,
+            "29": 935376896.0,
+            "30": 936523776.0,
+            "31": 936523776.0,
+            "32": 937129984.0,
+            "33": 937129984.0,
+            "34": 937129984.0,
+            "35": 937129984.0,
+            "36": 937129984.0,
+            "37": 937129984.0,
+            "38": 934836224.0,
+            "39": 935376896.0,
+            "40": 936523776.0,
+            "41": 937129984.0,
+            "42": 937129984.0,
+            "43": 937129984.0,
+            "44": 934836224.0,
+            "45": 934836224.0,
+            "46": 937129984.0,
+            "47": 935376896.0,
+            "48": 937129984.0,
+            "49": 937129984.0,
+            "50": 935376896.0,
+            "51": 935376896.0,
+            "52": 937129984.0,
+            "53": 937129984.0,
+            "54": 934836224.0,
+            "55": 934836224.0,
+            "56": 934836224.0,
+            "57": 934836224.0,
+            "58": 934836224.0,
+            "59": 934836224.0,
+            "60": 934836224.0,
+            "61": 935376896.0,
+            "62": 935376896.0,
+            "63": 935983104.0,
+            "64": 936523776.0,
+            "65": 936523776.0,
+            "66": 936523776.0,
+            "67": 937129984.0,
+            "68": 937129984.0,
+            "69": 937129984.0,
+            "70": 937129984.0,
+            "71": 937129984.0,
+            "72": 937129984.0,
+            "73": 937129984.0,
+            "74": 934836224.0,
+            "75": 934836224.0,
+            "76": 935376896.0,
+            "77": 935376896.0,
+            "78": 936523776.0,
+            "79": 937129984.0,
+            "80": 937129984.0,
+            "81": 937129984.0,
+            "82": 937129984.0,
+            "83": 934836224.0,
+            "84": 934836224.0,
+            "85": 934836224.0,
+            "86": 936523776.0,
+            "87": 936523776.0,
+            "88": 937129984.0,
+            "89": 937129984.0,
+            "90": 937129984.0,
+            "91": 937129984.0,
+            "92": 934836224.0,
+            "93": 935376896.0,
+            "94": 936523776.0,
+            "95": 936523776.0,
+            "96": 936523776.0,
+            "97": 936523776.0,
+            "98": 936523776.0,
+            "99": 937129984.0,
+            "100": 937129984.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 990381056.0,
+            "2": 1211127808.0,
+            "3": 1211127808.0,
+            "4": 1296840704.0,
+            "5": 1297885184.0,
+            "6": 1297885184.0,
+            "7": 1298358784.0,
+            "8": 1299077120.0,
+            "9": 1299077120.0,
+            "10": 1300477952.0,
+            "11": 1300477952.0,
+            "12": 1300477952.0,
+            "13": 1300477952.0,
+            "14": 1300477952.0,
+            "15": 1300477952.0,
+            "16": 1300477952.0,
+            "17": 1300477952.0,
+            "18": 1300477952.0,
+            "19": 1300779008.0,
+            "20": 1300779008.0,
+            "21": 1300779008.0,
+            "22": 1300779008.0,
+            "23": 1301612544.0,
+            "24": 1301612544.0,
+            "25": 1301612544.0,
+            "26": 1301612544.0,
+            "27": 1301612544.0,
+            "28": 1301612544.0,
+            "29": 1301612544.0,
+            "30": 1301612544.0,
+            "31": 1301612544.0,
+            "32": 1301612544.0,
+            "33": 1301612544.0,
+            "34": 1301612544.0,
+            "35": 1301612544.0,
+            "36": 1301612544.0,
+            "37": 1301612544.0,
+            "38": 1301612544.0,
+            "39": 1301612544.0,
+            "40": 1301612544.0,
+            "41": 1301612544.0,
+            "42": 1301612544.0,
+            "43": 1301612544.0,
+            "44": 1301612544.0,
+            "45": 1301612544.0,
+            "46": 1301612544.0,
+            "47": 1301612544.0,
+            "48": 1301612544.0,
+            "49": 1301612544.0,
+            "50": 1301612544.0,
+            "51": 1301612544.0,
+            "52": 1301612544.0,
+            "53": 1301612544.0,
+            "54": 1301612544.0,
+            "55": 1301612544.0,
+            "56": 1301612544.0,
+            "57": 1301612544.0,
+            "58": 1301612544.0,
+            "59": 1301612544.0,
+            "60": 1301612544.0,
+            "61": 1301612544.0,
+            "62": 1301612544.0,
+            "63": 1301612544.0,
+            "64": 1301612544.0,
+            "65": 1301612544.0,
+            "66": 1301612544.0,
+            "67": 1301612544.0,
+            "68": 1301612544.0,
+            "69": 1301612544.0,
+            "70": 1301612544.0,
+            "71": 1301612544.0,
+            "72": 1301612544.0,
+            "73": 1301612544.0,
+            "74": 1301612544.0,
+            "75": 1301612544.0,
+            "76": 1301612544.0,
+            "77": 1301612544.0,
+            "78": 1301612544.0,
+            "79": 1301612544.0,
+            "80": 1301612544.0,
+            "81": 1301612544.0,
+            "82": 1301612544.0,
+            "83": 1301612544.0,
+            "84": 1301612544.0,
+            "85": 1301612544.0,
+            "86": 1301612544.0,
+            "87": 1301612544.0,
+            "88": 1301612544.0,
+            "89": 1301612544.0,
+            "90": 1301612544.0,
+            "91": 1301612544.0,
+            "92": 1301612544.0,
+            "93": 1301612544.0,
+            "94": 1301612544.0,
+            "95": 1301612544.0,
+            "96": 1301612544.0,
+            "97": 1301612544.0,
+            "98": 1301612544.0,
+            "99": 1301612544.0,
+            "100": 1301612544.0
+        }
+    },
+    "mtp_1 loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.88691,
+            "2": 10.90544,
+            "3": 10.90868,
+            "4": 10.86912,
+            "5": 10.91636,
+            "6": 10.90651,
+            "7": 10.90278,
+            "8": 10.88975,
+            "9": 10.90453,
+            "10": 10.89162,
+            "11": 10.93392,
+            "12": 10.91634,
+            "13": 10.91136,
+            "14": 10.91999,
+            "15": 10.88538,
+            "16": 10.90717,
+            "17": 10.87525,
+            "18": 10.91409,
+            "19": 10.90936,
+            "20": 10.87835,
+            "21": 10.8786,
+            "22": 10.85481,
+            "23": 10.87937,
+            "24": 10.87208,
+            "25": 10.85798,
+            "26": 10.86991,
+            "27": 10.87718,
+            "28": 10.88667,
+            "29": 10.88859,
+            "30": 10.85479,
+            "31": 10.79701,
+            "32": 10.86609,
+            "33": 10.87789,
+            "34": 10.8397,
+            "35": 10.84184,
+            "36": 10.85,
+            "37": 10.85585,
+            "38": 10.83714,
+            "39": 10.86361,
+            "40": 10.82866,
+            "41": 10.83386,
+            "42": 10.84447,
+            "43": 10.78747,
+            "44": 10.82127,
+            "45": 10.78826,
+            "46": 10.78323,
+            "47": 10.82894,
+            "48": 10.7901,
+            "49": 10.71201,
+            "50": 10.77359,
+            "51": 10.76681,
+            "52": 10.74029,
+            "53": 10.8027,
+            "54": 10.77345,
+            "55": 10.76133,
+            "56": 10.71153,
+            "57": 10.66673,
+            "58": 10.74318,
+            "59": 10.69182,
+            "60": 10.66418,
+            "61": 10.70712,
+            "62": 10.77164,
+            "63": 10.61759,
+            "64": 10.71667,
+            "65": 10.4936,
+            "66": 10.67118,
+            "67": 10.57515,
+            "68": 10.68716,
+            "69": 10.68277,
+            "70": 10.66908,
+            "71": 10.64566,
+            "72": 10.60905,
+            "73": 10.56507,
+            "74": 10.37106,
+            "75": 10.5114,
+            "76": 10.39856,
+            "77": 10.75192,
+            "78": 10.62708,
+            "79": 10.4675,
+            "80": 10.47474,
+            "81": 10.51003,
+            "82": 10.58819,
+            "83": 10.43946,
+            "84": 10.45015,
+            "85": 10.55142,
+            "86": 10.2831,
+            "87": 10.51182,
+            "88": 10.60318,
+            "89": 10.50948,
+            "90": 10.60407,
+            "91": 10.38208,
+            "92": 10.38708,
+            "93": 10.23019,
+            "94": 10.08381,
+            "95": 10.4259,
+            "96": 10.4489,
+            "97": 10.32133,
+            "98": 10.49668,
+            "99": 10.04795,
+            "100": 10.33446
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 56.96201,
+            "2": 1.45193,
+            "3": 1.37387,
+            "4": 3.96627,
+            "5": 0.7423,
+            "6": 0.71394,
+            "7": 0.74369,
+            "8": 0.72342,
+            "9": 0.70545,
+            "10": 0.70125,
+            "11": 0.70256,
+            "12": 0.69915,
+            "13": 0.70499,
+            "14": 0.72329,
+            "15": 0.71852,
+            "16": 0.71011,
+            "17": 0.70885,
+            "18": 0.73035,
+            "19": 0.71099,
+            "20": 0.70225,
+            "21": 0.70459,
+            "22": 0.71823,
+            "23": 0.7143,
+            "24": 0.72574,
+            "25": 0.72055,
+            "26": 0.71722,
+            "27": 0.71209,
+            "28": 0.72407,
+            "29": 0.72809,
+            "30": 0.71187,
+            "31": 0.70668,
+            "32": 0.70676,
+            "33": 0.70474,
+            "34": 0.70406,
+            "35": 0.70401,
+            "36": 0.70968,
+            "37": 0.71106,
+            "38": 0.72458,
+            "39": 0.736,
+            "40": 0.71238,
+            "41": 0.71868,
+            "42": 0.71459,
+            "43": 0.71031,
+            "44": 0.70945,
+            "45": 0.72444,
+            "46": 0.76158,
+            "47": 0.75856,
+            "48": 0.7282,
+            "49": 0.72448,
+            "50": 0.7471,
+            "51": 0.80801,
+            "52": 0.73438,
+            "53": 0.71695,
+            "54": 0.71541,
+            "55": 0.70768,
+            "56": 0.70462,
+            "57": 0.70705,
+            "58": 0.70511,
+            "59": 0.70702,
+            "60": 0.70636,
+            "61": 0.70372,
+            "62": 0.71024,
+            "63": 0.70358,
+            "64": 0.70559,
+            "65": 0.70617,
+            "66": 0.70048,
+            "67": 0.71248,
+            "68": 0.7119,
+            "69": 0.71093,
+            "70": 0.7051,
+            "71": 0.70391,
+            "72": 0.70275,
+            "73": 0.70876,
+            "74": 0.7119,
+            "75": 0.71307,
+            "76": 0.718,
+            "77": 0.71166,
+            "78": 0.71308,
+            "79": 0.70995,
+            "80": 0.71153,
+            "81": 0.71464,
+            "82": 0.71596,
+            "83": 0.71997,
+            "84": 0.71197,
+            "85": 0.70577,
+            "86": 0.71956,
+            "87": 0.70383,
+            "88": 0.71047,
+            "89": 0.71711,
+            "90": 0.70818,
+            "91": 0.71353,
+            "92": 0.71401,
+            "93": 0.73616,
+            "94": 0.71104,
+            "95": 0.70295,
+            "96": 0.69995,
+            "97": 0.7015,
+            "98": 0.70705,
+            "99": 0.70765,
+            "100": 0.72052
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..f6ac4db56ee
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,644 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 9.77036,
+            "52": 9.65641,
+            "53": 10.03067,
+            "54": 9.87916,
+            "55": 9.79619,
+            "56": 9.52858,
+            "57": 9.36596,
+            "58": 9.75327,
+            "59": 9.48259,
+            "60": 9.40835,
+            "61": 9.60202,
+            "62": 9.90742,
+            "63": 9.25777,
+            "64": 9.68411,
+            "65": 8.79911,
+            "66": 9.60796,
+            "67": 9.25427,
+            "68": 9.71419,
+            "69": 9.71666,
+            "70": 9.6613,
+            "71": 9.52439,
+            "72": 9.4709,
+            "73": 9.38862,
+            "74": 8.80286,
+            "75": 9.34004,
+            "76": 8.93543,
+            "77": 9.99337,
+            "78": 9.64723,
+            "79": 9.28126,
+            "80": 9.29633,
+            "81": 9.39609,
+            "82": 9.60877,
+            "83": 9.21694,
+            "84": 9.34008,
+            "85": 9.53009,
+            "86": 8.95652,
+            "87": 9.51691,
+            "88": 9.68221,
+            "89": 9.50553,
+            "90": 9.753,
+            "91": 9.2347,
+            "92": 9.26019,
+            "93": 8.94568,
+            "94": 8.69194,
+            "95": 9.44616,
+            "96": 9.41008,
+            "97": 9.20125,
+            "98": 9.58169,
+            "99": 8.75946,
+            "100": 9.29483
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 23866164.0,
+            "52": 23807242.0,
+            "53": 24007504.0,
+            "54": 22867916.0,
+            "55": 23571280.0,
+            "56": 23954212.0,
+            "57": 24211680.0,
+            "58": 23914512.0,
+            "59": 22722820.0,
+            "60": 23813508.0,
+            "61": 23796364.0,
+            "62": 23739896.0,
+            "63": 24965914.0,
+            "64": 23898698.0,
+            "65": 24150860.0,
+            "66": 23796512.0,
+            "67": 25032960.0,
+            "68": 23673048.0,
+            "69": 23644684.0,
+            "70": 23903614.0,
+            "71": 24864656.0,
+            "72": 24766928.0,
+            "73": 24850636.0,
+            "74": 24133166.0,
+            "75": 24143912.0,
+            "76": 25025406.0,
+            "77": 24358344.0,
+            "78": 24910132.0,
+            "79": 23808164.0,
+            "80": 23772256.0,
+            "81": 25020440.0,
+            "82": 23851242.0,
+            "83": 23911824.0,
+            "84": 25143864.0,
+            "85": 24823592.0,
+            "86": 23153228.0,
+            "87": 24850332.0,
+            "88": 24749368.0,
+            "89": 22505174.0,
+            "90": 25108752.0,
+            "91": 23838548.0,
+            "92": 24923816.0,
+            "93": 24769484.0,
+            "94": 25041572.0,
+            "95": 25189350.0,
+            "96": 23909318.0,
+            "97": 23664104.0,
+            "98": 23832392.0,
+            "99": 23981812.0,
+            "100": 24101144.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 773784064.0,
+            "52": 782961664.0,
+            "53": 762989568.0,
+            "54": 937131008.0,
+            "55": 937131008.0,
+            "56": 936524800.0,
+            "57": 935377920.0,
+            "58": 934837248.0,
+            "59": 937131008.0,
+            "60": 937131008.0,
+            "61": 937131008.0,
+            "62": 935984128.0,
+            "63": 934837248.0,
+            "64": 937131008.0,
+            "65": 937131008.0,
+            "66": 936524800.0,
+            "67": 934837248.0,
+            "68": 937131008.0,
+            "69": 937131008.0,
+            "70": 935377920.0,
+            "71": 934837248.0,
+            "72": 937131008.0,
+            "73": 936524800.0,
+            "74": 934837248.0,
+            "75": 937131008.0,
+            "76": 936524800.0,
+            "77": 934837248.0,
+            "78": 937131008.0,
+            "79": 937131008.0,
+            "80": 935377920.0,
+            "81": 934837248.0,
+            "82": 937131008.0,
+            "83": 936524800.0,
+            "84": 934837248.0,
+            "85": 937131008.0,
+            "86": 937131008.0,
+            "87": 934837248.0,
+            "88": 937131008.0,
+            "89": 937131008.0,
+            "90": 935377920.0,
+            "91": 937131008.0,
+            "92": 937131008.0,
+            "93": 935377920.0,
+            "94": 934837248.0,
+            "95": 937131008.0,
+            "96": 935984128.0,
+            "97": 934837248.0,
+            "98": 937131008.0,
+            "99": 937131008.0,
+            "100": 934837248.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1191340032.0,
+            "52": 1191340032.0,
+            "53": 1191340032.0,
+            "54": 1286565888.0,
+            "55": 1287746048.0,
+            "56": 1287746048.0,
+            "57": 1288803328.0,
+            "58": 1288803328.0,
+            "59": 1288803328.0,
+            "60": 1288803328.0,
+            "61": 1288803328.0,
+            "62": 1288803328.0,
+            "63": 1288803328.0,
+            "64": 1288803328.0,
+            "65": 1288803328.0,
+            "66": 1288803328.0,
+            "67": 1288803328.0,
+            "68": 1288803328.0,
+            "69": 1288803328.0,
+            "70": 1288803328.0,
+            "71": 1288803328.0,
+            "72": 1288803328.0,
+            "73": 1288803328.0,
+            "74": 1288803328.0,
+            "75": 1288803328.0,
+            "76": 1288803328.0,
+            "77": 1288803328.0,
+            "78": 1288803328.0,
+            "79": 1288803328.0,
+            "80": 1288803328.0,
+            "81": 1288803328.0,
+            "82": 1288803328.0,
+            "83": 1288803328.0,
+            "84": 1288803328.0,
+            "85": 1288803328.0,
+            "86": 1288803328.0,
+            "87": 1288803328.0,
+            "88": 1288803328.0,
+            "89": 1288803328.0,
+            "90": 1288803328.0,
+            "91": 1288803328.0,
+            "92": 1288803328.0,
+            "93": 1288803328.0,
+            "94": 1288803328.0,
+            "95": 1288803328.0,
+            "96": 1288803328.0,
+            "97": 1288803328.0,
+            "98": 1288803328.0,
+            "99": 1288803328.0,
+            "100": 1288803328.0
+        }
+    },
+    "mtp_1 loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.76681,
+            "52": 10.74029,
+            "53": 10.8027,
+            "54": 10.77345,
+            "55": 10.76133,
+            "56": 10.71153,
+            "57": 10.66673,
+            "58": 10.74318,
+            "59": 10.69182,
+            "60": 10.66418,
+            "61": 10.70712,
+            "62": 10.77164,
+            "63": 10.61759,
+            "64": 10.71667,
+            "65": 10.4936,
+            "66": 10.67118,
+            "67": 10.57515,
+            "68": 10.68716,
+            "69": 10.68277,
+            "70": 10.66908,
+            "71": 10.64566,
+            "72": 10.60905,
+            "73": 10.56507,
+            "74": 10.37106,
+            "75": 10.5114,
+            "76": 10.39856,
+            "77": 10.75192,
+            "78": 10.62708,
+            "79": 10.4675,
+            "80": 10.47474,
+            "81": 10.51003,
+            "82": 10.58819,
+            "83": 10.43946,
+            "84": 10.45015,
+            "85": 10.55142,
+            "86": 10.2831,
+            "87": 10.51182,
+            "88": 10.60318,
+            "89": 10.50948,
+            "90": 10.60407,
+            "91": 10.38208,
+            "92": 10.38708,
+            "93": 10.23019,
+            "94": 10.08381,
+            "95": 10.4259,
+            "96": 10.4489,
+            "97": 10.32133,
+            "98": 10.49668,
+            "99": 10.04795,
+            "100": 10.33446
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 57.04071,
+            "52": 1.40134,
+            "53": 1.32404,
+            "54": 3.89868,
+            "55": 0.67679,
+            "56": 0.684,
+            "57": 0.68825,
+            "58": 0.68465,
+            "59": 0.68607,
+            "60": 0.68633,
+            "61": 0.6798,
+            "62": 0.68281,
+            "63": 0.68253,
+            "64": 0.68011,
+            "65": 0.6766,
+            "66": 0.67533,
+            "67": 0.67885,
+            "68": 0.67126,
+            "69": 0.6756,
+            "70": 0.67255,
+            "71": 0.67556,
+            "72": 0.67135,
+            "73": 0.66897,
+            "74": 0.66783,
+            "75": 0.66944,
+            "76": 0.66908,
+            "77": 0.66904,
+            "78": 0.67839,
+            "79": 0.6752,
+            "80": 0.67644,
+            "81": 0.6727,
+            "82": 0.67278,
+            "83": 0.66999,
+            "84": 0.67287,
+            "85": 0.67248,
+            "86": 0.6678,
+            "87": 0.67191,
+            "88": 0.66961,
+            "89": 0.67168,
+            "90": 0.67021,
+            "91": 0.66676,
+            "92": 0.66871,
+            "93": 0.67204,
+            "94": 0.67233,
+            "95": 0.66905,
+            "96": 0.6735,
+            "97": 0.67671,
+            "98": 0.67137,
+            "99": 0.67053,
+            "100": 0.67168
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/model_config.yaml
new file mode 100644
index 00000000000..f0d1cc0afd3
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/model_config.yaml
@@ -0,0 +1,96 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 13
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --mtp-num-layers: 1
+  --micro-batch-size: 2
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --untie-embeddings-and-output-weights: true
+  --disable-bias-linear: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --train-iters: 100
+  --lr-decay-iters: 320000
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 4
+  --pipeline-model-parallel-size: 2
+  --expert-model-parallel-size: 2
+  --expert-tensor-parallel-size: 2
+  --pipeline-model-parallel-layout: Et\\|\\(tt\\|\\)*6mL # Et|(tt|)*6mL
+  --sequence-parallel: true
+  --num-experts: 8
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+  --moe-token-dispatcher-type: alltoall
+  --moe-router-load-balancing-type: global_aux_loss
+  --moe-router-topk: 2
+  --moe-router-dtype: fp32
+  --moe-router-fusion: true
+  --moe-router-enable-expert-bias: true
+  --moe-router-score-function: sigmoid
+  --moe-router-pre-softmax: true
+  --moe-ffn-hidden-size: 1024
+  --moe-shared-expert-intermediate-size: 512
+  --moe-grouped-gemm: true
+  --moe-layer-freq: ([0]*4+[1]*9)
+  --moe-permute-fusion: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --bf16: true
+  --fp8-format: hybrid
+  --fp8-recipe: blockwise
+  --first-last-layers-bf16: true
+  --no-bias-gelu-fusion: true
+  --recompute-granularity: selective
+  --recompute-modules: "[moe_act]"
+  --cuda-graph-impl: transformer_engine
+  --cuda-graph-scope: "[attn mlp moe_router moe_preprocess]"
+  --log-memory-to-tensorboard: true
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --log-interval: 1
+  --timing-log-level: 0
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --ckpt-fully-parallel-load: true
+  --ckpt-format: torch_dist
+  --ckpt-assume-constant-structure: true
+TEST_TYPE: ckpt-resume
+METRICS:
+  # - "iteration-time"
+  - "lm loss"
+  - "num-zeros"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "mtp_1 loss"
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json
index d9a60d1ae11..447e404810b 100644
--- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json
@@ -1,178 +1,295 @@
 {
-  "0": {
-    "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
-    "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you",
-    "generated_tokens": [
-      32844,
-      1394,
-      1278,
-      4735,
-      2200,
-      1278,
-      7146,
-      30774,
-      1044,
-      1321,
-      1278,
-      26466,
-      3930,
-      2015,
-      1044,
-      1321,
-      1278,
-      30245,
-      8223,
-      1044,
-      1429,
-      1073,
-      4525,
-      4670,
-      1317,
-      3354,
-      1261,
-      6947,
-      1394,
-      1636
-    ],
-    "latency": 2.0423662662506104,
-    "cuda_graph_request_count_map": {
-      "6584": 0,
-      "6240": 0,
-      "5824": 0,
-      "5408": 0,
-      "4992": 0,
-      "4576": 0,
-      "4160": 0,
-      "3744": 0,
-      "3328": 0,
-      "2912": 0,
-      "2496": 0,
-      "2080": 0,
-      "1664": 0,
-      "1248": 0,
-      "832": 0,
-      "416": 29
-    },
-    "step_count": 30,
-    "logprobs": [
-      -10.737512588500977,
-      -3.724862575531006,
-      -2.833397388458252,
-      -1.2464861869812012,
-      -0.2549239993095398,
-      -1.7607988119125366,
-      -2.419379711151123,
-      -1.9533929824829102,
-      -2.1014301776885986,
-      -6.169030666351318,
-      -0.8734959363937378,
-      -2.4733574390411377,
-      -3.4822516441345215,
-      -4.180896759033203,
-      -1.9767613410949707,
-      -1.8347630500793457,
-      -2.2581257820129395,
-      -7.180149078369141,
-      -0.0453881211578846,
-      -1.9841610193252563,
-      -5.015386581420898,
-      -8.827117919921875,
-      -9.885746002197266,
-      -0.8498678207397461,
-      -4.770059585571289,
-      -0.855280339717865,
-      -2.2494924068450928,
-      -0.017164958640933037,
-      -0.03715415671467781,
-      -3.4830124378204346,
-      -8.635110855102539,
-      -1.2520610094070435,
-      -6.62324857711792,
-      -3.639960765838623,
-      -3.664339542388916,
-      -4.182392597198486,
-      -2.1796066761016846,
-      -1.0725229978561401,
-      -0.26311880350112915,
-      -0.8036076426506042,
-      -4.6958818435668945,
-      -9.042495727539062,
-      -0.013647346757352352,
-      -3.1747794151306152,
-      -1.322129487991333,
-      -3.949110746383667,
-      -0.7829495072364807,
-      -0.002083513652905822,
-      -2.970266580581665,
-      -10.56244945526123,
-      -3.2369167804718018,
-      -1.1530492305755615,
-      -4.917466163635254,
-      -0.21241025626659393,
-      -0.06490474194288254,
-      -1.372581124305725,
-      -2.224682092666626,
-      -4.3847503662109375,
-      -0.36867555975914,
-      -4.035493850708008,
-      -0.39869019389152527,
-      -0.14373983442783356,
-      -2.716118812561035,
-      -10.687016487121582,
-      -0.04773370549082756,
-      -3.398231267929077,
-      -0.8646175265312195,
-      -4.74052619934082,
-      -0.23649944365024567,
-      -2.6610701084136963,
-      -0.8428961634635925,
-      -1.614527940750122,
-      -5.793307781219482,
-      -16.929147720336914,
-      -2.6586406230926514,
-      -0.1385982781648636,
-      -7.435610771179199,
-      -1.0483647584915161,
-      -2.1261863708496094,
-      -1.5261307954788208,
-      -0.27082547545433044,
-      -5.859070777893066,
-      -0.00648513063788414,
-      -7.732051849365234,
-      -2.712515354156494,
-      -2.9137418270111084,
-      -3.041210651397705,
-      -2.3559694290161133,
-      -0.3973437249660492,
-      -1.4338903427124023,
-      -2.2967660427093506,
-      -0.6096595525741577,
-      -1.3119444847106934,
-      -1.93257474899292,
-      -1.726539134979248,
-      -0.8397530317306519,
-      -0.5014236569404602,
-      -1.2989763021469116,
-      -1.5857150554656982,
-      -1.096572995185852,
-      -0.4009067416191101,
-      -0.43302634358406067,
-      -0.041601795703172684,
-      -1.285712718963623,
-      -2.214778184890747,
-      -2.6971933841705322,
-      -0.8101387619972229,
-      -0.43101266026496887,
-      -2.808060884475708,
-      -1.5226430892944336,
-      -1.6209226846694946,
-      -0.048716772347688675,
-      -1.3497682809829712,
-      -1.343377947807312,
-      -1.2755295038223267,
-      -1.2342015504837036,
-      -0.5394397377967834
-    ]
+ "0": {
+  "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
+  "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you",
+  "generated_tokens": [
+   32844,
+   1394,
+   1278,
+   4735,
+   2200,
+   1278,
+   7146,
+   30774,
+   1044,
+   1321,
+   1278,
+   26466,
+   3930,
+   2015,
+   1044,
+   1321,
+   1278,
+   30245,
+   8223,
+   1044,
+   1429,
+   1073,
+   4525,
+   4670,
+   1317,
+   3354,
+   1261,
+   6947,
+   1394,
+   1636
+  ],
+  "latency": 0.6956486701965332,
+  "cuda_graph_request_count_map": {
+   "8": 29
   },
-  "throughput": [34.95064017365726, 34.95064017365726]
-}
+  "step_count": 30,
+  "top_n_logprobs": null,
+  "prompt_top_n_logprobs": null,
+  "prompt_logprobs": [
+   -10.737512588500977,
+   -3.724862575531006,
+   -2.833397388458252,
+   -1.2464861869812012,
+   -0.2549239993095398,
+   -1.7607988119125366,
+   -2.419379711151123,
+   -1.9533929824829102,
+   -2.1014301776885986,
+   -6.169030666351318,
+   -0.8734959363937378,
+   -2.4733574390411377,
+   -3.4822516441345215,
+   -4.180896759033203,
+   -1.9767613410949707,
+   -1.8347630500793457,
+   -2.2581257820129395,
+   -7.180149078369141,
+   -0.0453881211578846,
+   -1.9841610193252563,
+   -5.015386581420898,
+   -8.827117919921875,
+   -9.885746002197266,
+   -0.8498678207397461,
+   -4.770059585571289,
+   -0.855280339717865,
+   -2.2494924068450928,
+   -0.017164958640933037,
+   -0.03715415671467781,
+   -3.4830124378204346,
+   -8.635110855102539,
+   -1.2520610094070435,
+   -6.62324857711792,
+   -3.639960765838623,
+   -3.664339542388916,
+   -4.182392597198486,
+   -2.1796066761016846,
+   -1.0725229978561401,
+   -0.26311880350112915,
+   -0.8036076426506042,
+   -4.6958818435668945,
+   -9.042495727539062,
+   -0.013647346757352352,
+   -3.1747794151306152,
+   -1.322129487991333,
+   -3.949110746383667,
+   -0.7829495072364807,
+   -0.002083513652905822,
+   -2.970266580581665,
+   -10.56244945526123,
+   -3.2369167804718018,
+   -1.1530492305755615,
+   -4.917466163635254,
+   -0.21241025626659393,
+   -0.06490474194288254,
+   -1.372581124305725,
+   -2.224682092666626,
+   -4.3847503662109375,
+   -0.36867555975914,
+   -4.035493850708008,
+   -0.39869019389152527,
+   -0.14373983442783356,
+   -2.716118812561035,
+   -10.687016487121582,
+   -0.04773370549082756,
+   -3.398231267929077,
+   -0.8646175265312195,
+   -4.74052619934082,
+   -0.23649944365024567,
+   -2.6610701084136963,
+   -0.8428961634635925,
+   -1.614527940750122,
+   -5.793307781219482,
+   -16.929147720336914,
+   -2.6586406230926514,
+   -0.1385982781648636,
+   -7.435610771179199,
+   -1.0483647584915161,
+   -2.1261863708496094,
+   -1.5261307954788208,
+   -0.27082547545433044,
+   -5.859070777893066,
+   -0.00648513063788414,
+   -7.732051849365234,
+   -2.712515354156494,
+   -2.9137418270111084,
+   -3.041210651397705
+  ],
+  "generated_logprobs": [
+   -2.3559694290161133,
+   -0.3973437249660492,
+   -1.4338903427124023,
+   -2.2967660427093506,
+   -0.6096595525741577,
+   -1.310807704925537,
+   -1.9799187183380127,
+   -1.710689663887024,
+   -0.8325198888778687,
+   -0.4943186938762665,
+   -1.2886956930160522,
+   -1.585263729095459,
+   -1.101692795753479,
+   -0.44188663363456726,
+   -0.4740143418312073,
+   -0.042198192328214645,
+   -1.2899682521820068,
+   -2.1242613792419434,
+   -2.7151529788970947,
+   -0.8274281024932861,
+   -0.39647114276885986,
+   -2.7846553325653076,
+   -1.5348155498504639,
+   -1.626529335975647,
+   -0.047930192202329636,
+   -1.3622726202011108,
+   -1.3274445533752441,
+   -1.2834383249282837,
+   -1.3211638927459717,
+   -0.48530423641204834
+  ],
+  "logprobs": [
+   -10.737512588500977,
+   -3.724862575531006,
+   -2.833397388458252,
+   -1.2464861869812012,
+   -0.2549239993095398,
+   -1.7607988119125366,
+   -2.419379711151123,
+   -1.9533929824829102,
+   -2.1014301776885986,
+   -6.169030666351318,
+   -0.8734959363937378,
+   -2.4733574390411377,
+   -3.4822516441345215,
+   -4.180896759033203,
+   -1.9767613410949707,
+   -1.8347630500793457,
+   -2.2581257820129395,
+   -7.180149078369141,
+   -0.0453881211578846,
+   -1.9841610193252563,
+   -5.015386581420898,
+   -8.827117919921875,
+   -9.885746002197266,
+   -0.8498678207397461,
+   -4.770059585571289,
+   -0.855280339717865,
+   -2.2494924068450928,
+   -0.017164958640933037,
+   -0.03715415671467781,
+   -3.4830124378204346,
+   -8.635110855102539,
+   -1.2520610094070435,
+   -6.62324857711792,
+   -3.639960765838623,
+   -3.664339542388916,
+   -4.182392597198486,
+   -2.1796066761016846,
+   -1.0725229978561401,
+   -0.26311880350112915,
+   -0.8036076426506042,
+   -4.6958818435668945,
+   -9.042495727539062,
+   -0.013647346757352352,
+   -3.1747794151306152,
+   -1.322129487991333,
+   -3.949110746383667,
+   -0.7829495072364807,
+   -0.002083513652905822,
+   -2.970266580581665,
+   -10.56244945526123,
+   -3.2369167804718018,
+   -1.1530492305755615,
+   -4.917466163635254,
+   -0.21241025626659393,
+   -0.06490474194288254,
+   -1.372581124305725,
+   -2.224682092666626,
+   -4.3847503662109375,
+   -0.36867555975914,
+   -4.035493850708008,
+   -0.39869019389152527,
+   -0.14373983442783356,
+   -2.716118812561035,
+   -10.687016487121582,
+   -0.04773370549082756,
+   -3.398231267929077,
+   -0.8646175265312195,
+   -4.74052619934082,
+   -0.23649944365024567,
+   -2.6610701084136963,
+   -0.8428961634635925,
+   -1.614527940750122,
+   -5.793307781219482,
+   -16.929147720336914,
+   -2.6586406230926514,
+   -0.1385982781648636,
+   -7.435610771179199,
+   -1.0483647584915161,
+   -2.1261863708496094,
+   -1.5261307954788208,
+   -0.27082547545433044,
+   -5.859070777893066,
+   -0.00648513063788414,
+   -7.732051849365234,
+   -2.712515354156494,
+   -2.9137418270111084,
+   -3.041210651397705,
+   -2.3559694290161133,
+   -0.3973437249660492,
+   -1.4338903427124023,
+   -2.2967660427093506,
+   -0.6096595525741577,
+   -1.310807704925537,
+   -1.9799187183380127,
+   -1.710689663887024,
+   -0.8325198888778687,
+   -0.4943186938762665,
+   -1.2886956930160522,
+   -1.585263729095459,
+   -1.101692795753479,
+   -0.44188663363456726,
+   -0.4740143418312073,
+   -0.042198192328214645,
+   -1.2899682521820068,
+   -2.1242613792419434,
+   -2.7151529788970947,
+   -0.8274281024932861,
+   -0.39647114276885986,
+   -2.7846553325653076,
+   -1.5348155498504639,
+   -1.626529335975647,
+   -0.047930192202329636,
+   -1.3622726202011108,
+   -1.3274445533752441,
+   -1.2834383249282837,
+   -1.3211638927459717,
+   -0.48530423641204834
+  ]
+ },
+ "throughput": [
+  1.9407774475980641,
+  40.8327035151158,
+  42.82685786577602,
+  42.814063580843204,
+  42.88001112304976,
+  42.90077111461981,
+  42.81151438072744,
+  42.89731946981911
+ ]
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
index e97dc0b56a4..afc75144dc8 100644
--- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
@@ -76,11 +76,13 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1
   --inference-repeat-n: 8
   --inference-dynamic-batching-buffer-size-gb: 20
+  --inference-dynamic-batching-num-cuda-graphs: 1 
+  --inference-dynamic-batching-max-requests: 8
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..b239ac96c3d
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/golden_values_dev_dgx_h100.json
@@ -0,0 +1,158 @@
+{
+    "0": {
+        "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
+        "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you",
+        "generated_tokens": [
+            32844,
+            1394,
+            1278,
+            4735,
+            2200,
+            1278,
+            7146,
+            30774,
+            1044,
+            1321,
+            1278,
+            26466,
+            3930,
+            2015,
+            1044,
+            1321,
+            1278,
+            30245,
+            8223,
+            1044,
+            1429,
+            1073,
+            4525,
+            4670,
+            1317,
+            3354,
+            1261,
+            6947,
+            1394,
+            1636
+        ],
+        "latency": 1.974948332994245,
+        "logprobs": [
+            -10.737512588500977,
+            -3.724862575531006,
+            -2.833397388458252,
+            -1.2464861869812012,
+            -0.2549239993095398,
+            -1.7607988119125366,
+            -2.419379711151123,
+            -1.9533929824829102,
+            -2.1014301776885986,
+            -6.169030666351318,
+            -0.8734959363937378,
+            -2.4733574390411377,
+            -3.4822516441345215,
+            -4.180896759033203,
+            -1.9767613410949707,
+            -1.8347630500793457,
+            -2.2581257820129395,
+            -7.180149078369141,
+            -0.0453881211578846,
+            -1.9841610193252563,
+            -5.015386581420898,
+            -8.827117919921875,
+            -9.885746002197266,
+            -0.8498678207397461,
+            -4.770059585571289,
+            -0.855280339717865,
+            -2.2494924068450928,
+            -0.017164958640933037,
+            -0.03715415671467781,
+            -3.4830124378204346,
+            -8.635110855102539,
+            -1.2520610094070435,
+            -6.604944705963135,
+            -3.873375177383423,
+            -3.6618807315826416,
+            -4.184391975402832,
+            -2.2140231132507324,
+            -1.0711486339569092,
+            -0.23651468753814697,
+            -0.8028834462165833,
+            -4.772289276123047,
+            -9.09329605102539,
+            -0.013350379653275013,
+            -3.204096555709839,
+            -1.286576271057129,
+            -3.966451644897461,
+            -0.784888744354248,
+            -0.002081372309476137,
+            -2.9676947593688965,
+            -10.666797637939453,
+            -3.026693105697632,
+            -1.1712640523910522,
+            -4.93634033203125,
+            -0.2349349856376648,
+            -0.06406460702419281,
+            -1.3560465574264526,
+            -2.1993725299835205,
+            -4.378607749938965,
+            -0.36503157019615173,
+            -4.032698631286621,
+            -0.3783165216445923,
+            -0.14568912982940674,
+            -2.7145652770996094,
+            -10.591975212097168,
+            -0.043545372784137726,
+            -3.484504461288452,
+            -0.9063424468040466,
+            -4.716145992279053,
+            -0.263183057308197,
+            -2.654480457305908,
+            -0.8381667137145996,
+            -1.677478551864624,
+            -5.816836357116699,
+            -17.05772590637207,
+            -2.634756088256836,
+            -0.1400006264448166,
+            -7.400341987609863,
+            -1.0960910320281982,
+            -2.1047825813293457,
+            -1.5999900102615356,
+            -0.26269301772117615,
+            -5.727797508239746,
+            -0.006663957145065069,
+            -7.748298168182373,
+            -2.6529595851898193,
+            -3.0231595039367676,
+            -2.9622015953063965,
+            -2.3360800743103027,
+            -0.3991503119468689,
+            -1.5159229040145874,
+            -2.289414882659912,
+            -0.6100144386291504,
+            -1.3164187669754028,
+            -1.9431946277618408,
+            -1.7792527675628662,
+            -0.8328706622123718,
+            -0.501052737236023,
+            -1.278053879737854,
+            -1.5683506727218628,
+            -0.9720054864883423,
+            -0.40760406851768494,
+            -0.43419456481933594,
+            -0.04328203946352005,
+            -1.2999448776245117,
+            -2.1266980171203613,
+            -2.6690115928649902,
+            -0.7812177538871765,
+            -0.41717368364334106,
+            -2.8806936740875244,
+            -1.5312169790267944,
+            -1.62917160987854,
+            -0.05274559557437897,
+            -1.362119436264038,
+            -1.337896704673767,
+            -1.2551532983779907,
+            -1.256169080734253,
+            -0.49199968576431274
+        ]
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/model_config.yaml
new file mode 100644
index 00000000000..edc5fc2eb32
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq/model_config.yaml
@@ -0,0 +1,90 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+TEST_TYPE: frozen-start
+MODE: inference
+MODEL_ARGS:
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --timing-log-level: 0
+  --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
+  --tokenizer-type: TikTokenizer
+  --tiktoken-pattern: v2
+  --distributed-backend: nccl
+  --log-interval: 1
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 4
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --expert-tensor-parallel-size: 1
+  --sequence-parallel: true
+  --use-mcore-models: true
+  --moe-token-dispatcher-type: alltoall
+  --moe-grouped-gemm: true
+  --num-experts: 64
+  --moe-router-topk: 6
+  --moe-z-loss-coeff: 0
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-aux-loss-coeff: 1e-3
+  --moe-router-score-function: sigmoid
+  --untie-embeddings-and-output-weights: true
+  --disable-bias-linear: true
+  --init-method-std: 0.014
+  --position-embedding-type: rope
+  --rotary-base: 1000000
+  --rotary-percent: 1.0
+  --num-layers: 27
+  --hidden-size: 2048
+  --moe-ffn-hidden-size: 1408
+  --moe-shared-expert-intermediate-size: 2816
+  --ffn-hidden-size: 10944
+  --num-attention-heads: 16
+  --kv-channels: 128
+  --normalization: RMSNorm
+  --swiglu: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --seq-length: 4096
+  --max-position-embeddings: 4096
+  --micro-batch-size: 1
+  --ckpt-format: torch_dist
+  --ckpt-fully-parallel-save: true
+  --ckpt-fully-parallel-load: true
+  --ckpt-assume-constant-structure: true
+  --dist-ckpt-strictness: log_unexpected
+  --bf16: true
+  --attention-backend: flash
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 8
+  --use-checkpoint-args: true
+  --no-use-tokenizer-model-from-checkpoint-args: true
+  --no-load-optim: true
+  --deterministic-mode: true # moe will use different ops for determinism for inference
+  --save-interval: 2000
+  --temperature: 1.0
+  --top_k: 1
+  --return-log-probs: true
+  --num-tokens-to-generate: 30
+  --max-tokens-to-oom: 3600000
+  --inference-max-seq-length: 4096
+  --output-path: ${INFERENCE_OUTPUT_PATH}
+  --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
+  --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-repeat-n: 8
+  --inference-dynamic-batching-buffer-size-gb: 20
+  --cuda-graph-impl: local 
+  --moe-pad-experts-for-cuda-graph-inference: true
+  --inference-dynamic-batching-buffer-size-gb: 20
+  --inference-dynamic-batching-num-cuda-graphs: 1 
+  --inference-dynamic-batching-max-requests: 512
+  --inference-logging-step-interval: 1
+  --sequence-parallel: true
+
+METRICS:
+  - "generated_tokens"
+  - "logprobs"
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..66c9e3e4121
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/golden_values_dev_dgx_h100.json
@@ -0,0 +1,158 @@
+{
+    "0": {
+        "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.",
+        "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you",
+        "generated_tokens": [
+            32844,
+            1394,
+            1278,
+            4735,
+            2200,
+            1278,
+            7146,
+            30774,
+            1044,
+            1321,
+            1278,
+            26466,
+            3930,
+            2015,
+            1044,
+            1321,
+            1278,
+            30245,
+            8223,
+            1044,
+            1429,
+            1073,
+            4525,
+            4670,
+            1317,
+            3354,
+            1261,
+            6947,
+            1394,
+            1636
+        ],
+        "latency": 28.185462809633464,
+        "logprobs": [
+            -10.737512588500977,
+            -3.724862575531006,
+            -2.833397388458252,
+            -1.2464861869812012,
+            -0.2549239993095398,
+            -1.7607988119125366,
+            -2.419379711151123,
+            -1.9533929824829102,
+            -2.1014301776885986,
+            -6.169030666351318,
+            -0.8734959363937378,
+            -2.4733574390411377,
+            -3.4822516441345215,
+            -4.180896759033203,
+            -1.9767613410949707,
+            -1.8347630500793457,
+            -2.2581257820129395,
+            -7.180149078369141,
+            -0.0453881211578846,
+            -1.9841610193252563,
+            -5.015386581420898,
+            -8.827117919921875,
+            -9.885746002197266,
+            -0.8498678207397461,
+            -4.770059585571289,
+            -0.855280339717865,
+            -2.2494924068450928,
+            -0.017164958640933037,
+            -0.03715415671467781,
+            -3.4830124378204346,
+            -8.635110855102539,
+            -1.2520610094070435,
+            -6.62324857711792,
+            -3.639960765838623,
+            -3.664339542388916,
+            -4.182392597198486,
+            -2.1796066761016846,
+            -1.0725229978561401,
+            -0.26311880350112915,
+            -0.8036076426506042,
+            -4.6958818435668945,
+            -9.042495727539062,
+            -0.013647346757352352,
+            -3.1747794151306152,
+            -1.322129487991333,
+            -3.949110746383667,
+            -0.7829495072364807,
+            -0.002083513652905822,
+            -2.970266580581665,
+            -10.56244945526123,
+            -3.2369167804718018,
+            -1.1530492305755615,
+            -4.917466163635254,
+            -0.21241025626659393,
+            -0.06490474194288254,
+            -1.372581124305725,
+            -2.224682092666626,
+            -4.3847503662109375,
+            -0.36867555975914,
+            -4.035493850708008,
+            -0.39869019389152527,
+            -0.14373983442783356,
+            -2.716118812561035,
+            -10.687016487121582,
+            -0.04773370549082756,
+            -3.398231267929077,
+            -0.8646175265312195,
+            -4.74052619934082,
+            -0.23649944365024567,
+            -2.6610701084136963,
+            -0.8428961634635925,
+            -1.614527940750122,
+            -5.793307781219482,
+            -16.929147720336914,
+            -2.6586406230926514,
+            -0.1385982781648636,
+            -7.435610771179199,
+            -1.0483647584915161,
+            -2.1261863708496094,
+            -1.5261307954788208,
+            -0.27082547545433044,
+            -5.859070777893066,
+            -0.00648513063788414,
+            -7.732051849365234,
+            -2.712515354156494,
+            -2.9137418270111084,
+            -3.041210651397705,
+            -2.3559694290161133,
+            -0.3973437249660492,
+            -1.4338903427124023,
+            -2.2967660427093506,
+            -0.6096595525741577,
+            -1.3119444847106934,
+            -1.93257474899292,
+            -1.726539134979248,
+            -0.8397530317306519,
+            -0.5014236569404602,
+            -1.2989763021469116,
+            -1.5857150554656982,
+            -1.096572995185852,
+            -0.4009067416191101,
+            -0.43302634358406067,
+            -0.041601795703172684,
+            -1.285712718963623,
+            -2.214778184890747,
+            -2.6971933841705322,
+            -0.8101387619972229,
+            -0.43101266026496887,
+            -2.808060884475708,
+            -1.5226430892944336,
+            -1.6209226846694946,
+            -0.048716772347688675,
+            -1.3497682809829712,
+            -1.343377947807312,
+            -1.2755295038223267,
+            -1.2342015504837036,
+            -0.5394397377967834
+        ]
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/model_config.yaml
new file mode 100644
index 00000000000..d62d10db7c1
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq/model_config.yaml
@@ -0,0 +1,84 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+TEST_TYPE: frozen-start
+MODE: inference
+MODEL_ARGS:
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --timing-log-level: 0
+  --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
+  --tokenizer-type: TikTokenizer
+  --tiktoken-pattern: v2
+  --distributed-backend: nccl
+  --log-interval: 1
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 4
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --expert-tensor-parallel-size: 1
+  --sequence-parallel: true
+  --use-mcore-models: true
+  --moe-token-dispatcher-type: alltoall
+  --moe-grouped-gemm: true
+  --num-experts: 64
+  --moe-router-topk: 6
+  --moe-z-loss-coeff: 0
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-aux-loss-coeff: 1e-3
+  --moe-router-score-function: sigmoid
+  --untie-embeddings-and-output-weights: true
+  --disable-bias-linear: true
+  --init-method-std: 0.014
+  --position-embedding-type: rope
+  --rotary-base: 1000000
+  --rotary-percent: 1.0
+  --num-layers: 27
+  --hidden-size: 2048
+  --moe-ffn-hidden-size: 1408
+  --moe-shared-expert-intermediate-size: 2816
+  --ffn-hidden-size: 10944
+  --num-attention-heads: 16
+  --kv-channels: 128
+  --normalization: RMSNorm
+  --swiglu: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --seq-length: 4096
+  --max-position-embeddings: 4096
+  --micro-batch-size: 1
+  --ckpt-format: torch_dist
+  --ckpt-fully-parallel-save: true
+  --ckpt-fully-parallel-load: true
+  --ckpt-assume-constant-structure: true
+  --dist-ckpt-strictness: log_unexpected
+  --bf16: true
+  --attention-backend: flash
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 8
+  --use-checkpoint-args: true
+  --no-use-tokenizer-model-from-checkpoint-args: true
+  --no-load-optim: true
+  --deterministic-mode: true # moe will use different ops for determinism for inference
+  --save-interval: 2000
+  --temperature: 1.0
+  --top_k: 1
+  --return-log-probs: true
+  --num-tokens-to-generate: 30
+  --max-tokens-to-oom: 3600000
+  --inference-max-seq-length: 4096
+  --output-path: ${INFERENCE_OUTPUT_PATH}
+  --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
+  --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-repeat-n: 8
+  --inference-logging-step-interval: 1 
+  --sequence-parallel: true
+  
+METRICS:
+  - "generated_tokens"
+  - "logprobs"
diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
index 6c119cc548b..5ed1f1205f6 100644
--- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
@@ -72,7 +72,7 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
   --inference-repeat-n: 8
diff --git a/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/env_config.yaml b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/env_config.yaml
new file mode 100644
index 00000000000..329246987bf
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/env_config.yaml
@@ -0,0 +1,5 @@
+- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent
+  agent_args:
+    dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4"
+    split: "train"
+  weight: 1.0
diff --git a/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..1c13e432979
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/golden_values_dev_dgx_h100.json
@@ -0,0 +1,62 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 5,
+        "step_interval": 1,
+        "values": {
+            "1": 0.0,
+            "2": 0.0,
+            "3": 0.0,
+            "4": 0.0,
+            "5": 0.0
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 5,
+        "step_interval": 1,
+        "values": {
+            "1": 1116221440.0,
+            "2": 1107565568.0,
+            "3": 1116188160.0,
+            "4": 1107525248.0,
+            "5": 1116234624.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 5,
+        "step_interval": 1,
+        "values": {
+            "1": 76683395072.0,
+            "2": 76694667264.0,
+            "3": 76694667264.0,
+            "4": 76694667264.0,
+            "5": 76694667264.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 5,
+        "step_interval": 1,
+        "values": {
+            "1": 76683403264.0,
+            "2": 77029359616.0,
+            "3": 77029900288.0,
+            "4": 77030817792.0,
+            "5": 77030817792.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 5,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 152.30721,
+            "3": 105.10506,
+            "4": 104.09995,
+            "5": 102.75745
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml
new file mode 100644
index 00000000000..ed5d123892e
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/model_config.yaml
@@ -0,0 +1,141 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+  N_REPEAT: 1
+TEST_TYPE: frozen-start
+MODE: rl
+MODEL_ARGS:
+  # Logging and debug
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-interval: 1
+  --log-throughput: true
+  --log-timers-to-tensorboard: true
+  --timing-log-level: 1
+  --timing-log-option: minmax
+  --tensorboard-log-interval: 1
+
+  # Model loading
+  --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints
+  --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
+  --tokenizer-type: TikTokenizer
+  --tiktoken-pattern: v2
+  --use-checkpoint-args: true
+  --no-use-tokenizer-model-from-checkpoint-args: true
+  --no-load-optim: true
+  --ckpt-format: torch_dist
+  --ckpt-fully-parallel-save: true
+  --ckpt-fully-parallel-load: true
+  --ckpt-assume-constant-structure: true
+  --dist-ckpt-strictness: log_unexpected
+
+  # Parallelism - Training: TP=1, EP=4 (4 GPUs model, DP=2 on 8 GPUs)
+  --sequence-parallel: true
+  --tensor-model-parallel-size: 8
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --expert-tensor-parallel-size: 1
+  # Parallelism - Inference (refit): TP=1, EP=2 (tests EP refit)
+  --rl-inference-tensor-model-parallel-size: 4
+  --rl-inference-expert-model-parallel-size: 2
+  --rl-inference-expert-tensor-model-parallel-size: 1
+
+  # MoE configuration
+  --use-mcore-models: true
+  --moe-token-dispatcher-type: alltoall
+  --moe-grouped-gemm: true
+  --num-experts: 64
+  --moe-router-topk: 6
+  --moe-z-loss-coeff: 0
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-aux-loss-coeff: 1e-3
+  --moe-router-score-function: sigmoid
+
+  # Model architecture
+  --untie-embeddings-and-output-weights: true
+  --disable-bias-linear: true
+  --init-method-std: 0.014
+  --position-embedding-type: rope
+  --rotary-base: 1000000
+  --rotary-percent: 1.0
+  --num-layers: 27
+  --hidden-size: 2048
+  --moe-ffn-hidden-size: 1408
+  --moe-shared-expert-intermediate-size: 2816
+  --ffn-hidden-size: 10944
+  --num-attention-heads: 16
+  --kv-channels: 128
+  --normalization: RMSNorm
+  --swiglu: true
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --seq-length: 256
+  --max-position-embeddings: 256
+
+  # Training settings
+  --distributed-backend: nccl
+  --transformer-impl: transformer_engine
+  --bf16: true
+  --attention-backend: flash
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 8
+  --deterministic-mode: true
+  --seed: 42
+
+  # RL / GRPO settings
+  --mock-data: true
+  --max-tokens-to-oom: 3600000
+  --inference-max-seq-length: 256
+  --langrl-inference-server-type: inplace_megatron
+  --calculate-per-token-loss: true
+  --rl-use-sequence-packing: true
+  --rl-sequence-packing-algo: fifo
+  --rl-offload-optimizer-during-inference: true
+  --rl-parallel-generation-tasks: 1
+  --cuda-graph-impl: local
+  --micro-batch-size: 1
+  --global-batch-size: 4
+  --grpo-group-size: 2
+  --grpo-prompts-per-step: 2
+  --grpo-iterations: 1
+  --grpo-clamp-eps-lower: 0.2
+  --grpo-clamp-eps-upper: 0.2
+  --grpo-kl-beta: 0.0
+  --grpo-entropy-term-weight: 0.0
+  --langrl-env-config: tests/functional_tests/test_cases/moe/gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest/env_config.yaml
+  --rl-partial-rollouts: true
+  --perform-rl-step: true
+  --moe-pad-experts-for-cuda-graph-inference: true
+  --rl-inference-model-unified-memory-level: 1
+  --rl-offload-inference-model-weights-when-idle: true
+  --inference-dynamic-batching-buffer-size-gb: 20
+  --inference-dynamic-batching-num-cuda-graphs: 4
+
+  # Optimizer
+  --lr: 0.000001
+  --lr-warmup-samples: 0
+  --clip-grad: 1.0
+
+  # Run control
+  --train-samples: 48828125
+  --exit-interval: 5
+  --save-interval: 1000000
+  --eval-interval: 1000000
+  --finetune: true
+  --inference-logging-step-interval: 1
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --straggler-minmax-count: 16
+  --empty-unused-memory-level: 2
+
+METRICS:
+  - "iteration-time"
+  - "lm loss"
+  - "num-zeros"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+
+THROUGHPUT_TEST_PARAMS:
+  --start_step: 1
diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
index 3ba12056190..549821afc8b 100644
--- a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
@@ -78,7 +78,7 @@ MODEL_ARGS:
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
   --inference-max-requests: 1
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1
 METRICS:
diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml
index 569eb969d72..4934fe6c913 100644
--- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml
@@ -71,9 +71,10 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
+  --inference-dynamic-batching-buffer-size-gb: 20
 METRICS:
   - "generated_tokens"
   - "logprobs"
diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
index 366d2f23575..69c0db980b0 100644
--- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
+++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml
@@ -73,7 +73,7 @@ MODEL_ARGS:
   --num-tokens-to-generate: 30
   --max-tokens-to-oom: 3600000
   --inference-max-seq-length: 4096
-  --output-path: ${TENSORBOARD_PATH}
+  --output-path: ${INFERENCE_OUTPUT_PATH}
   --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
   --incoming-requests-per-sec: -1 # all requests arrive up front.
 METRICS:
diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json
index 7dbf0c3c806..f4357530aed 100644
--- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2431335424.0,
-            "2": 2431335424.0,
-            "3": 2431335424.0,
-            "4": 2431335424.0,
-            "5": 2431335424.0,
-            "6": 2431335424.0,
-            "7": 2431335424.0,
-            "8": 2431335424.0,
-            "9": 2431335424.0,
-            "10": 2431335424.0,
-            "11": 2431335424.0,
-            "12": 2431335424.0,
-            "13": 2431335424.0,
-            "14": 2431335424.0,
-            "15": 2431335424.0,
-            "16": 2431335424.0,
-            "17": 2431335424.0,
-            "18": 2431335424.0,
-            "19": 2431335424.0,
-            "20": 2431335424.0,
-            "21": 2431335424.0,
-            "22": 2431335424.0,
-            "23": 2431335424.0,
-            "24": 2431335424.0,
-            "25": 2431335424.0,
-            "26": 2431335424.0,
-            "27": 2431335424.0,
-            "28": 2431335424.0,
-            "29": 2431335424.0,
-            "30": 2431335424.0,
-            "31": 2431335424.0,
-            "32": 2431335424.0,
-            "33": 2431335424.0,
-            "34": 2431335424.0,
-            "35": 2431335424.0,
-            "36": 2431335424.0,
-            "37": 2431335424.0,
-            "38": 2431335424.0,
-            "39": 2431335424.0,
-            "40": 2431335424.0,
-            "41": 2431335424.0,
-            "42": 2431335424.0,
-            "43": 2431335424.0,
-            "44": 2431335424.0,
-            "45": 2431335424.0,
-            "46": 2431335424.0,
-            "47": 2431335424.0,
-            "48": 2431335424.0,
-            "49": 2431335424.0,
-            "50": 2431335424.0
+            "1": 2431875072.0,
+            "2": 2431875072.0,
+            "3": 2431875072.0,
+            "4": 2431875072.0,
+            "5": 2431875072.0,
+            "6": 2431875072.0,
+            "7": 2431875072.0,
+            "8": 2431875072.0,
+            "9": 2431875072.0,
+            "10": 2431875072.0,
+            "11": 2431875072.0,
+            "12": 2431875072.0,
+            "13": 2431875072.0,
+            "14": 2431875072.0,
+            "15": 2431875072.0,
+            "16": 2431875072.0,
+            "17": 2431875072.0,
+            "18": 2431875072.0,
+            "19": 2431875072.0,
+            "20": 2431875072.0,
+            "21": 2431875072.0,
+            "22": 2431875072.0,
+            "23": 2431875072.0,
+            "24": 2431875072.0,
+            "25": 2431875072.0,
+            "26": 2431875072.0,
+            "27": 2431875072.0,
+            "28": 2431875072.0,
+            "29": 2431875072.0,
+            "30": 2431875072.0,
+            "31": 2431875072.0,
+            "32": 2431875072.0,
+            "33": 2431875072.0,
+            "34": 2431875072.0,
+            "35": 2431875072.0,
+            "36": 2431875072.0,
+            "37": 2431875072.0,
+            "38": 2431875072.0,
+            "39": 2431875072.0,
+            "40": 2431875072.0,
+            "41": 2431875072.0,
+            "42": 2431875072.0,
+            "43": 2431875072.0,
+            "44": 2431875072.0,
+            "45": 2431875072.0,
+            "46": 2431875072.0,
+            "47": 2431875072.0,
+            "48": 2431875072.0,
+            "49": 2431875072.0,
+            "50": 2431875072.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 14740086784.0,
-            "2": 15773663232.0,
-            "3": 15773663232.0,
-            "4": 15773663232.0,
-            "5": 15773663232.0,
-            "6": 15773663232.0,
-            "7": 15773663232.0,
-            "8": 15773663232.0,
-            "9": 15773663232.0,
-            "10": 15773663232.0,
-            "11": 15773663232.0,
-            "12": 15773663232.0,
-            "13": 15773663232.0,
-            "14": 15773663232.0,
-            "15": 15773663232.0,
-            "16": 15773663232.0,
-            "17": 15773663232.0,
-            "18": 15773663232.0,
-            "19": 15773663232.0,
-            "20": 15773663232.0,
-            "21": 15773663232.0,
-            "22": 15773663232.0,
-            "23": 15773663232.0,
-            "24": 15773663232.0,
-            "25": 15773663232.0,
-            "26": 15773663232.0,
-            "27": 15773663232.0,
-            "28": 15773663232.0,
-            "29": 15773663232.0,
-            "30": 15773663232.0,
-            "31": 15773663232.0,
-            "32": 15773663232.0,
-            "33": 15773663232.0,
-            "34": 15773663232.0,
-            "35": 15773663232.0,
-            "36": 15773663232.0,
-            "37": 15773663232.0,
-            "38": 15773663232.0,
-            "39": 15773663232.0,
-            "40": 15773663232.0,
-            "41": 15773663232.0,
-            "42": 15773663232.0,
-            "43": 15773663232.0,
-            "44": 15773663232.0,
-            "45": 15773663232.0,
-            "46": 15773663232.0,
-            "47": 15773663232.0,
-            "48": 15773663232.0,
-            "49": 15773663232.0,
-            "50": 15773663232.0
+            "1": 14740087808.0,
+            "2": 15774200832.0,
+            "3": 15774200832.0,
+            "4": 15774200832.0,
+            "5": 15774200832.0,
+            "6": 15774200832.0,
+            "7": 15774200832.0,
+            "8": 15774200832.0,
+            "9": 15774200832.0,
+            "10": 15774200832.0,
+            "11": 15774200832.0,
+            "12": 15774200832.0,
+            "13": 15774200832.0,
+            "14": 15774200832.0,
+            "15": 15774200832.0,
+            "16": 15774200832.0,
+            "17": 15774200832.0,
+            "18": 15774200832.0,
+            "19": 15774200832.0,
+            "20": 15774200832.0,
+            "21": 15774200832.0,
+            "22": 15774200832.0,
+            "23": 15774200832.0,
+            "24": 15774200832.0,
+            "25": 15774200832.0,
+            "26": 15774200832.0,
+            "27": 15774200832.0,
+            "28": 15774200832.0,
+            "29": 15774200832.0,
+            "30": 15774200832.0,
+            "31": 15774200832.0,
+            "32": 15774200832.0,
+            "33": 15774200832.0,
+            "34": 15774200832.0,
+            "35": 15774200832.0,
+            "36": 15774200832.0,
+            "37": 15774200832.0,
+            "38": 15774200832.0,
+            "39": 15774200832.0,
+            "40": 15774200832.0,
+            "41": 15774200832.0,
+            "42": 15774200832.0,
+            "43": 15774200832.0,
+            "44": 15774200832.0,
+            "45": 15774200832.0,
+            "46": 15774200832.0,
+            "47": 15774200832.0,
+            "48": 15774200832.0,
+            "49": 15774200832.0,
+            "50": 15774200832.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5.97454,
-            "2": 0.19297,
-            "3": 0.18331,
-            "4": 0.18419,
-            "5": 0.18099,
-            "6": 0.18354,
-            "7": 0.18332,
-            "8": 0.18477,
-            "9": 0.18391,
-            "10": 0.18412,
-            "11": 0.18154,
-            "12": 0.18441,
-            "13": 0.18338,
-            "14": 0.1859,
-            "15": 0.18316,
-            "16": 0.18298,
-            "17": 0.18167,
-            "18": 0.18385,
-            "19": 0.18358,
-            "20": 0.18325,
-            "21": 0.18392,
-            "22": 0.1826,
-            "23": 0.18266,
-            "24": 0.18333,
-            "25": 0.18413,
-            "26": 0.185,
-            "27": 0.18218,
-            "28": 0.18361,
-            "29": 0.18161,
-            "30": 0.18366,
-            "31": 0.18238,
-            "32": 0.18355,
-            "33": 0.18274,
-            "34": 0.18399,
-            "35": 0.18232,
-            "36": 0.18405,
-            "37": 0.18325,
-            "38": 0.18367,
-            "39": 0.18313,
-            "40": 0.18319,
-            "41": 0.18244,
-            "42": 0.18305,
-            "43": 0.18287,
-            "44": 0.18263,
-            "45": 0.18326,
-            "46": 0.18213,
-            "47": 0.18261,
-            "48": 0.18333,
-            "49": 0.18287,
-            "50": 0.18284
+            "1": 21.47107,
+            "2": 0.21426,
+            "3": 0.18485,
+            "4": 0.1655,
+            "5": 0.16764,
+            "6": 0.16482,
+            "7": 0.16761,
+            "8": 0.16451,
+            "9": 0.16762,
+            "10": 0.16536,
+            "11": 0.17999,
+            "12": 0.18657,
+            "13": 0.16983,
+            "14": 0.16676,
+            "15": 0.16908,
+            "16": 0.16963,
+            "17": 0.17346,
+            "18": 0.17019,
+            "19": 0.17052,
+            "20": 0.17018,
+            "21": 0.16541,
+            "22": 0.16566,
+            "23": 0.16521,
+            "24": 0.16662,
+            "25": 0.16493,
+            "26": 0.16377,
+            "27": 0.16515,
+            "28": 0.16469,
+            "29": 0.16683,
+            "30": 0.16435,
+            "31": 0.1697,
+            "32": 0.16472,
+            "33": 0.1693,
+            "34": 0.16637,
+            "35": 0.16593,
+            "36": 0.16439,
+            "37": 0.16693,
+            "38": 0.16653,
+            "39": 0.16645,
+            "40": 0.16669,
+            "41": 0.16547,
+            "42": 0.16438,
+            "43": 0.16787,
+            "44": 0.16848,
+            "45": 0.16631,
+            "46": 0.16902,
+            "47": 0.16588,
+            "48": 0.16644,
+            "49": 0.16691,
+            "50": 0.1671
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json
index bf52c8e8fd4..b0c23087659 100644
--- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json
@@ -4,56 +4,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 9.28651,
-            "2": 9.28395,
+            "1": 9.28644,
+            "2": 9.28396,
             "3": 9.28076,
-            "4": 9.28861,
-            "5": 9.27695,
+            "4": 9.28856,
+            "5": 9.27699,
             "6": 9.28726,
-            "7": 9.27836,
-            "8": 9.28267,
-            "9": 9.28528,
-            "10": 9.28293,
-            "11": 9.28342,
-            "12": 9.27384,
-            "13": 9.27126,
+            "7": 9.27831,
+            "8": 9.28266,
+            "9": 9.28518,
+            "10": 9.28294,
+            "11": 9.28326,
+            "12": 9.27377,
+            "13": 9.27113,
             "14": 9.27209,
-            "15": 9.25309,
-            "16": 9.24492,
+            "15": 9.25297,
+            "16": 9.24499,
             "17": 9.24857,
-            "18": 9.22951,
+            "18": 9.2295,
             "19": 9.23151,
-            "20": 9.20817,
-            "21": 9.17046,
-            "22": 9.15049,
-            "23": 9.16842,
-            "24": 9.15079,
-            "25": 9.1444,
-            "26": 9.14727,
-            "27": 9.12295,
-            "28": 9.09719,
-            "29": 9.09388,
-            "30": 9.0783,
-            "31": 8.97175,
-            "32": 9.03158,
-            "33": 9.02021,
-            "34": 8.98662,
-            "35": 8.95924,
-            "36": 8.97139,
-            "37": 8.91443,
-            "38": 8.88795,
-            "39": 8.88883,
-            "40": 8.90642,
-            "41": 8.81811,
+            "20": 9.20818,
+            "21": 9.1704,
+            "22": 9.15059,
+            "23": 9.16837,
+            "24": 9.15073,
+            "25": 9.14424,
+            "26": 9.14738,
+            "27": 9.12308,
+            "28": 9.09717,
+            "29": 9.09386,
+            "30": 9.07826,
+            "31": 8.97181,
+            "32": 9.0315,
+            "33": 9.02023,
+            "34": 8.98663,
+            "35": 8.95928,
+            "36": 8.97134,
+            "37": 8.91442,
+            "38": 8.88791,
+            "39": 8.88879,
+            "40": 8.90639,
+            "41": 8.81803,
             "42": 8.87405,
-            "43": 8.85666,
-            "44": 8.81697,
-            "45": 8.81379,
-            "46": 8.84457,
-            "47": 8.73721,
-            "48": 8.66931,
-            "49": 8.70107,
-            "50": 8.73494
+            "43": 8.85655,
+            "44": 8.81693,
+            "45": 8.81356,
+            "46": 8.84453,
+            "47": 8.73701,
+            "48": 8.66923,
+            "49": 8.70104,
+            "50": 8.73489
         }
     },
     "num-zeros": {
@@ -61,56 +61,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 5959400.0,
-            "2": 6553837.0,
-            "3": 7313493.0,
-            "4": 6377142.0,
-            "5": 6498093.0,
-            "6": 7151947.0,
-            "7": 6210401.0,
-            "8": 6334645.0,
-            "9": 6624584.0,
-            "10": 6529058.0,
-            "11": 7466715.0,
-            "12": 6471579.0,
-            "13": 6003497.0,
-            "14": 8071952.0,
-            "15": 6530023.0,
-            "16": 7526922.0,
-            "17": 6034909.0,
-            "18": 6289605.0,
-            "19": 6162573.0,
-            "20": 6527801.0,
-            "21": 6981914.0,
-            "22": 7132792.0,
-            "23": 5928465.0,
-            "24": 6210239.0,
-            "25": 6993035.0,
-            "26": 6471579.0,
-            "27": 6355357.0,
-            "28": 6877112.0,
-            "29": 6380110.0,
-            "30": 6468659.0,
-            "31": 8165130.0,
-            "32": 6765448.0,
+            "1": 5959428.0,
+            "2": 6553739.0,
+            "3": 7313558.0,
+            "4": 6377212.0,
+            "5": 6498220.0,
+            "6": 7152015.0,
+            "7": 6210260.0,
+            "8": 6334672.0,
+            "9": 6624655.0,
+            "10": 6529106.0,
+            "11": 7466660.0,
+            "12": 6471717.0,
+            "13": 6003465.0,
+            "14": 8072041.0,
+            "15": 6529968.0,
+            "16": 7526852.0,
+            "17": 6035134.0,
+            "18": 6289690.0,
+            "19": 6162498.0,
+            "20": 6527712.0,
+            "21": 6981897.0,
+            "22": 7132920.0,
+            "23": 5928645.0,
+            "24": 6210340.0,
+            "25": 6993116.0,
+            "26": 6471329.0,
+            "27": 6355333.0,
+            "28": 6876968.0,
+            "29": 6380137.0,
+            "30": 6468615.0,
+            "31": 8165212.0,
+            "32": 6765571.0,
             "33": 6355561.0,
-            "34": 6662237.0,
-            "35": 7065192.0,
-            "36": 6076915.0,
-            "37": 7785518.0,
-            "38": 6727009.0,
-            "39": 7315902.0,
-            "40": 6555154.0,
-            "41": 7314617.0,
-            "42": 6591869.0,
-            "43": 6928017.0,
-            "44": 7274417.0,
-            "45": 6680008.0,
-            "46": 6232372.0,
-            "47": 6496696.0,
-            "48": 6809696.0,
-            "49": 6753491.0,
-            "50": 6238169.0
+            "34": 6662287.0,
+            "35": 7065313.0,
+            "36": 6076925.0,
+            "37": 7785462.0,
+            "38": 6727049.0,
+            "39": 7315988.0,
+            "40": 6555018.0,
+            "41": 7314645.0,
+            "42": 6591992.0,
+            "43": 6928020.0,
+            "44": 7274444.0,
+            "45": 6680179.0,
+            "46": 6232560.0,
+            "47": 6496796.0,
+            "48": 6809653.0,
+            "49": 6753531.0,
+            "50": 6238141.0
         }
     },
     "mem-allocated-bytes": {
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1653820416.0,
-            "2": 1653820416.0,
-            "3": 1653820416.0,
-            "4": 1653820416.0,
-            "5": 1653820416.0,
-            "6": 1653820416.0,
-            "7": 1653820416.0,
-            "8": 1653820416.0,
-            "9": 1653820416.0,
-            "10": 1653820416.0,
-            "11": 1653820416.0,
-            "12": 1653820416.0,
-            "13": 1653820416.0,
-            "14": 1653820416.0,
-            "15": 1653820416.0,
-            "16": 1653820416.0,
-            "17": 1653820416.0,
-            "18": 1653820416.0,
-            "19": 1653820416.0,
-            "20": 1653820416.0,
-            "21": 1653820416.0,
-            "22": 1653820416.0,
-            "23": 1653820416.0,
-            "24": 1653820416.0,
-            "25": 1653820416.0,
-            "26": 1653820416.0,
-            "27": 1653820416.0,
-            "28": 1653820416.0,
-            "29": 1653820416.0,
-            "30": 1653820416.0,
-            "31": 1653820416.0,
-            "32": 1653820416.0,
-            "33": 1653820416.0,
-            "34": 1653820416.0,
-            "35": 1653820416.0,
-            "36": 1653820416.0,
-            "37": 1653820416.0,
-            "38": 1653820416.0,
-            "39": 1653820416.0,
-            "40": 1653820416.0,
-            "41": 1653820416.0,
-            "42": 1653820416.0,
-            "43": 1653820416.0,
-            "44": 1653820416.0,
-            "45": 1653820416.0,
-            "46": 1653820416.0,
-            "47": 1653820416.0,
-            "48": 1653820416.0,
-            "49": 1653820416.0,
-            "50": 1653820416.0
+            "1": 1653821440.0,
+            "2": 1653821440.0,
+            "3": 1653821440.0,
+            "4": 1653821440.0,
+            "5": 1653821440.0,
+            "6": 1653821440.0,
+            "7": 1653821440.0,
+            "8": 1653821440.0,
+            "9": 1653821440.0,
+            "10": 1653821440.0,
+            "11": 1653821440.0,
+            "12": 1653821440.0,
+            "13": 1653821440.0,
+            "14": 1653821440.0,
+            "15": 1653821440.0,
+            "16": 1653821440.0,
+            "17": 1653821440.0,
+            "18": 1653821440.0,
+            "19": 1653821440.0,
+            "20": 1653821440.0,
+            "21": 1653821440.0,
+            "22": 1653821440.0,
+            "23": 1653821440.0,
+            "24": 1653821440.0,
+            "25": 1653821440.0,
+            "26": 1653821440.0,
+            "27": 1653821440.0,
+            "28": 1653821440.0,
+            "29": 1653821440.0,
+            "30": 1653821440.0,
+            "31": 1653821440.0,
+            "32": 1653821440.0,
+            "33": 1653821440.0,
+            "34": 1653821440.0,
+            "35": 1653821440.0,
+            "36": 1653821440.0,
+            "37": 1653821440.0,
+            "38": 1653821440.0,
+            "39": 1653821440.0,
+            "40": 1653821440.0,
+            "41": 1653821440.0,
+            "42": 1653821440.0,
+            "43": 1653821440.0,
+            "44": 1653821440.0,
+            "45": 1653821440.0,
+            "46": 1653821440.0,
+            "47": 1653821440.0,
+            "48": 1653821440.0,
+            "49": 1653821440.0,
+            "50": 1653821440.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -175,56 +175,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 1653824512.0,
-            "2": 2142515200.0,
-            "3": 2142515200.0,
-            "4": 2142515200.0,
-            "5": 2142515200.0,
-            "6": 2142515200.0,
-            "7": 2142515200.0,
-            "8": 2142515200.0,
-            "9": 2142515200.0,
-            "10": 2142515200.0,
-            "11": 2142515200.0,
-            "12": 2142515200.0,
-            "13": 2142515200.0,
-            "14": 2142515200.0,
-            "15": 2142515200.0,
-            "16": 2142515200.0,
-            "17": 2142515200.0,
-            "18": 2142515200.0,
-            "19": 2142515200.0,
-            "20": 2142515200.0,
-            "21": 2142515200.0,
-            "22": 2142515200.0,
-            "23": 2142515200.0,
-            "24": 2142515200.0,
-            "25": 2142515200.0,
-            "26": 2142515200.0,
-            "27": 2142515200.0,
-            "28": 2142515200.0,
-            "29": 2142515200.0,
-            "30": 2142515200.0,
-            "31": 2142515200.0,
-            "32": 2142515200.0,
-            "33": 2142515200.0,
-            "34": 2142515200.0,
-            "35": 2142515200.0,
-            "36": 2142515200.0,
-            "37": 2142515200.0,
-            "38": 2142515200.0,
-            "39": 2142515200.0,
-            "40": 2142515200.0,
-            "41": 2142515200.0,
-            "42": 2142515200.0,
-            "43": 2142515200.0,
-            "44": 2142515200.0,
-            "45": 2142515200.0,
-            "46": 2142515200.0,
-            "47": 2142515200.0,
-            "48": 2142515200.0,
-            "49": 2142515200.0,
-            "50": 2142515200.0
+            "1": 1653825536.0,
+            "2": 2142998016.0,
+            "3": 2142998016.0,
+            "4": 2142998016.0,
+            "5": 2142998016.0,
+            "6": 2142998016.0,
+            "7": 2142998016.0,
+            "8": 2142998016.0,
+            "9": 2142998016.0,
+            "10": 2142998016.0,
+            "11": 2142998016.0,
+            "12": 2142998016.0,
+            "13": 2142998016.0,
+            "14": 2142998016.0,
+            "15": 2142998016.0,
+            "16": 2142998016.0,
+            "17": 2142998016.0,
+            "18": 2142998016.0,
+            "19": 2142998016.0,
+            "20": 2142998016.0,
+            "21": 2142998016.0,
+            "22": 2142998016.0,
+            "23": 2142998016.0,
+            "24": 2142998016.0,
+            "25": 2142998016.0,
+            "26": 2142998016.0,
+            "27": 2142998016.0,
+            "28": 2142998016.0,
+            "29": 2142998016.0,
+            "30": 2142998016.0,
+            "31": 2142998016.0,
+            "32": 2142998016.0,
+            "33": 2142998016.0,
+            "34": 2142998016.0,
+            "35": 2142998016.0,
+            "36": 2142998016.0,
+            "37": 2142998016.0,
+            "38": 2142998016.0,
+            "39": 2142998016.0,
+            "40": 2142998016.0,
+            "41": 2142998016.0,
+            "42": 2142998016.0,
+            "43": 2142998016.0,
+            "44": 2142998016.0,
+            "45": 2142998016.0,
+            "46": 2142998016.0,
+            "47": 2142998016.0,
+            "48": 2142998016.0,
+            "49": 2142998016.0,
+            "50": 2142998016.0
         }
     },
     "iteration-time": {
@@ -232,56 +232,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 14.64684,
-            "2": 0.98193,
-            "3": 0.95861,
-            "4": 0.96167,
-            "5": 0.96222,
-            "6": 0.96444,
-            "7": 0.95334,
-            "8": 0.95675,
-            "9": 0.95004,
-            "10": 0.9526,
-            "11": 0.94782,
-            "12": 0.95256,
-            "13": 0.95466,
-            "14": 0.95046,
-            "15": 0.96366,
-            "16": 0.95156,
-            "17": 0.95425,
-            "18": 0.9544,
-            "19": 1.2298,
-            "20": 0.95303,
-            "21": 0.95634,
-            "22": 0.95632,
-            "23": 0.95424,
-            "24": 0.95464,
-            "25": 0.96269,
-            "26": 0.96616,
-            "27": 0.94874,
-            "28": 0.94988,
-            "29": 1.26385,
-            "30": 0.95465,
-            "31": 1.2033,
-            "32": 0.9571,
-            "33": 0.956,
-            "34": 0.95832,
-            "35": 1.32667,
-            "36": 0.95679,
-            "37": 0.95623,
-            "38": 0.96193,
-            "39": 0.96003,
-            "40": 1.25799,
-            "41": 0.95599,
-            "42": 0.95891,
-            "43": 1.55786,
-            "44": 0.96371,
-            "45": 0.96764,
-            "46": 0.95894,
-            "47": 0.96017,
-            "48": 0.95646,
-            "49": 0.961,
-            "50": 0.96278
+            "1": 28.88794,
+            "2": 1.3875,
+            "3": 1.3655,
+            "4": 0.91436,
+            "5": 0.92323,
+            "6": 0.90862,
+            "7": 0.90351,
+            "8": 0.90087,
+            "9": 0.90804,
+            "10": 0.90099,
+            "11": 1.44829,
+            "12": 1.27198,
+            "13": 1.47603,
+            "14": 0.90715,
+            "15": 0.90169,
+            "16": 0.8955,
+            "17": 0.91977,
+            "18": 0.91161,
+            "19": 0.90173,
+            "20": 0.89581,
+            "21": 0.89026,
+            "22": 0.88949,
+            "23": 0.91159,
+            "24": 0.90975,
+            "25": 0.90708,
+            "26": 0.89948,
+            "27": 0.89544,
+            "28": 0.89745,
+            "29": 0.90068,
+            "30": 0.89534,
+            "31": 0.90066,
+            "32": 0.91859,
+            "33": 0.91419,
+            "34": 0.89878,
+            "35": 0.89846,
+            "36": 0.8945,
+            "37": 0.89356,
+            "38": 0.89475,
+            "39": 0.89372,
+            "40": 0.90674,
+            "41": 0.90461,
+            "42": 0.93092,
+            "43": 0.90002,
+            "44": 0.89721,
+            "45": 0.89453,
+            "46": 0.89499,
+            "47": 0.90828,
+            "48": 0.89629,
+            "49": 0.90644,
+            "50": 0.90588
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
index 45c06ac2f7e..f4a701a2e4d 100644
--- a/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
@@ -100,7 +100,7 @@
         "end_step": 25,
         "step_interval": 1,
         "values": {
-            "1": 40735711232.0,
+            "1": 40735715328.0,
             "2": 44991991808.0,
             "3": 44993564672.0,
             "4": 44993564672.0,
@@ -132,31 +132,31 @@
         "end_step": 25,
         "step_interval": 1,
         "values": {
-            "1": 12.25468,
-            "2": 0.47853,
-            "3": 0.41459,
-            "4": 0.41066,
-            "5": 0.4125,
-            "6": 0.42243,
-            "7": 0.40926,
-            "8": 0.41832,
-            "9": 0.4068,
-            "10": 0.41071,
-            "11": 0.41068,
-            "12": 0.41187,
-            "13": 0.42064,
-            "14": 0.4228,
-            "15": 0.41026,
-            "16": 0.81409,
-            "17": 0.41651,
-            "18": 0.41416,
-            "19": 0.41418,
-            "20": 0.41217,
-            "21": 0.42084,
-            "22": 0.4131,
-            "23": 0.41106,
-            "24": 0.41518,
-            "25": 0.41106
+            "1": 25.74522,
+            "2": 0.73559,
+            "3": 0.40581,
+            "4": 0.38308,
+            "5": 0.37606,
+            "6": 0.37631,
+            "7": 0.39269,
+            "8": 0.37902,
+            "9": 0.37764,
+            "10": 0.8554,
+            "11": 0.95952,
+            "12": 0.37861,
+            "13": 0.38954,
+            "14": 0.42497,
+            "15": 0.37698,
+            "16": 0.37629,
+            "17": 0.37835,
+            "18": 0.3766,
+            "19": 0.37494,
+            "20": 0.42005,
+            "21": 0.38011,
+            "22": 0.37713,
+            "23": 0.37617,
+            "24": 0.37515,
+            "25": 0.37401
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
index 8915a1493e9..377aa000112 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.34492, "5": 9.3892, "10": 9.01571, "15": 8.64615, "20": 8.26963, "25": 7.99467, "30": 7.87463, "35": 7.65847, "40": 7.50295, "45": 7.36112, "50": 7.19186, "55": 7.16789, "60": 7.16511, "65": 7.00051, "70": 7.07139, "75": 7.07586, "80": 6.95246, "85": 6.86372, "90": 7.25405, "95": 6.85964, "100": 6.99698}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43313.0, "5": 45406.0, "10": 45370.0, "15": 43950.0, "20": 44806.0, "25": 42740.0, "30": 44052.0, "35": 43279.0, "40": 43242.0, "45": 43344.0, "50": 43411.0, "55": 43968.0, "60": 41346.0, "65": 44726.0, "70": 45545.0, "75": 44680.0, "80": 41138.0, "85": 44039.0, "90": 44735.0, "95": 44094.0, "100": 42475.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4138985984.0, "5": 4138985984.0, "10": 4138985984.0, "15": 4138985984.0, "20": 4138985984.0, "25": 4138985984.0, "30": 4138985984.0, "35": 4138985984.0, "40": 4138985984.0, "45": 4138985984.0, "50": 4138985984.0, "55": 4138985984.0, "60": 4138985984.0, "65": 4138985984.0, "70": 4138985984.0, "75": 4138985984.0, "80": 4138985984.0, "85": 4138985984.0, "90": 4138985984.0, "95": 4138985984.0, "100": 4138985984.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4345973248.0, "5": 6177401856.0, "10": 6177401856.0, "15": 6177401856.0, "20": 6177401856.0, "25": 6177401856.0, "30": 6177401856.0, "35": 6177401856.0, "40": 6177401856.0, "45": 6177401856.0, "50": 6177401856.0, "55": 6177401856.0, "60": 6177401856.0, "65": 6177401856.0, "70": 6177401856.0, "75": 6177401856.0, "80": 6177401856.0, "85": 6177401856.0, "90": 6177401856.0, "95": 6177401856.0, "100": 6177401856.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.23885, "5": 0.26424, "10": 0.26473, "15": 0.25653, "20": 0.25905, "25": 0.26025, "30": 0.25576, "35": 0.26028, "40": 0.26409, "45": 0.27254, "50": 0.25589, "55": 0.25786, "60": 0.25294, "65": 0.25565, "70": 0.25965, "75": 0.25357, "80": 0.25553, "85": 0.25588, "90": 0.25409, "95": 0.2567, "100": 0.25733}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.34492,
+            "2": 10.36433,
+            "3": 9.73145,
+            "4": 9.57923,
+            "5": 9.3892,
+            "6": 9.41078,
+            "7": 9.30545,
+            "8": 9.24872,
+            "9": 9.09363,
+            "10": 9.01571,
+            "11": 8.86227,
+            "12": 8.79088,
+            "13": 8.80884,
+            "14": 8.67658,
+            "15": 8.64615,
+            "16": 8.53973,
+            "17": 8.47875,
+            "18": 8.38919,
+            "19": 8.36145,
+            "20": 8.26963,
+            "21": 8.26321,
+            "22": 8.15047,
+            "23": 8.08861,
+            "24": 8.12416,
+            "25": 7.99467,
+            "26": 8.08474,
+            "27": 7.87741,
+            "28": 7.95852,
+            "29": 7.79567,
+            "30": 7.87463,
+            "31": 7.83211,
+            "32": 7.69448,
+            "33": 7.78447,
+            "34": 7.55753,
+            "35": 7.65847,
+            "36": 7.52861,
+            "37": 7.44889,
+            "38": 7.50364,
+            "39": 7.48064,
+            "40": 7.50295,
+            "41": 7.3974,
+            "42": 7.37184,
+            "43": 7.44291,
+            "44": 7.38083,
+            "45": 7.36112,
+            "46": 7.29391,
+            "47": 7.475,
+            "48": 7.29535,
+            "49": 7.3607,
+            "50": 7.19186,
+            "51": 7.38728,
+            "52": 7.13728,
+            "53": 7.12477,
+            "54": 7.23618,
+            "55": 7.16789,
+            "56": 7.22866,
+            "57": 7.34625,
+            "58": 7.03082,
+            "59": 7.12273,
+            "60": 7.16511,
+            "61": 7.11656,
+            "62": 7.26779,
+            "63": 7.16695,
+            "64": 7.08275,
+            "65": 7.00051,
+            "66": 7.07139,
+            "67": 7.05884,
+            "68": 7.14563,
+            "69": 7.03993,
+            "70": 7.07139,
+            "71": 6.91636,
+            "72": 7.02022,
+            "73": 6.99002,
+            "74": 6.91408,
+            "75": 7.07586,
+            "76": 6.97032,
+            "77": 7.08431,
+            "78": 7.03516,
+            "79": 6.88312,
+            "80": 6.95246,
+            "81": 6.98441,
+            "82": 7.06806,
+            "83": 7.00882,
+            "84": 7.01789,
+            "85": 6.86372,
+            "86": 7.04924,
+            "87": 6.99288,
+            "88": 6.92333,
+            "89": 6.82337,
+            "90": 7.25405,
+            "91": 6.72212,
+            "92": 7.05344,
+            "93": 6.91633,
+            "94": 7.0654,
+            "95": 6.85964,
+            "96": 6.98723,
+            "97": 6.96749,
+            "98": 6.89904,
+            "99": 7.02746,
+            "100": 6.99698
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43313.0,
+            "2": 44075.0,
+            "3": 44779.0,
+            "4": 42461.0,
+            "5": 45406.0,
+            "6": 40995.0,
+            "7": 43185.0,
+            "8": 45480.0,
+            "9": 42555.0,
+            "10": 45370.0,
+            "11": 44017.0,
+            "12": 44619.0,
+            "13": 43939.0,
+            "14": 46223.0,
+            "15": 43950.0,
+            "16": 41732.0,
+            "17": 43869.0,
+            "18": 44696.0,
+            "19": 42631.0,
+            "20": 44806.0,
+            "21": 44813.0,
+            "22": 41897.0,
+            "23": 45483.0,
+            "24": 43099.0,
+            "25": 42740.0,
+            "26": 43950.0,
+            "27": 46249.0,
+            "28": 46424.0,
+            "29": 46206.0,
+            "30": 44052.0,
+            "31": 41268.0,
+            "32": 43408.0,
+            "33": 45487.0,
+            "34": 43390.0,
+            "35": 43279.0,
+            "36": 42533.0,
+            "37": 40700.0,
+            "38": 42585.0,
+            "39": 44772.0,
+            "40": 43242.0,
+            "41": 44698.0,
+            "42": 43271.0,
+            "43": 45502.0,
+            "44": 44648.0,
+            "45": 43344.0,
+            "46": 43923.0,
+            "47": 42519.0,
+            "48": 44691.0,
+            "49": 43190.0,
+            "50": 43411.0,
+            "51": 41175.0,
+            "52": 43901.0,
+            "53": 43967.0,
+            "54": 41964.0,
+            "55": 43968.0,
+            "56": 43280.0,
+            "57": 42566.0,
+            "58": 43903.0,
+            "59": 44657.0,
+            "60": 41346.0,
+            "61": 39760.0,
+            "62": 44779.0,
+            "63": 44680.0,
+            "64": 45395.0,
+            "65": 44726.0,
+            "66": 45386.0,
+            "67": 43197.0,
+            "68": 42570.0,
+            "69": 43834.0,
+            "70": 45545.0,
+            "71": 43402.0,
+            "72": 44828.0,
+            "73": 45410.0,
+            "74": 42508.0,
+            "75": 44680.0,
+            "76": 43936.0,
+            "77": 42111.0,
+            "78": 40541.0,
+            "79": 38950.0,
+            "80": 41138.0,
+            "81": 45397.0,
+            "82": 43256.0,
+            "83": 38500.0,
+            "84": 42533.0,
+            "85": 44039.0,
+            "86": 45756.0,
+            "87": 41125.0,
+            "88": 41799.0,
+            "89": 41088.0,
+            "90": 44735.0,
+            "91": 46292.0,
+            "92": 41852.0,
+            "93": 43234.0,
+            "94": 39581.0,
+            "95": 44094.0,
+            "96": 44736.0,
+            "97": 45487.0,
+            "98": 41852.0,
+            "99": 45522.0,
+            "100": 42475.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4138985984.0,
+            "2": 4138985984.0,
+            "3": 4138985984.0,
+            "4": 4138985984.0,
+            "5": 4138985984.0,
+            "6": 4138985984.0,
+            "7": 4138985984.0,
+            "8": 4138985984.0,
+            "9": 4138985984.0,
+            "10": 4138985984.0,
+            "11": 4138985984.0,
+            "12": 4138985984.0,
+            "13": 4138985984.0,
+            "14": 4138985984.0,
+            "15": 4138985984.0,
+            "16": 4138985984.0,
+            "17": 4138985984.0,
+            "18": 4138985984.0,
+            "19": 4138985984.0,
+            "20": 4138985984.0,
+            "21": 4138985984.0,
+            "22": 4138985984.0,
+            "23": 4138985984.0,
+            "24": 4138985984.0,
+            "25": 4138985984.0,
+            "26": 4138985984.0,
+            "27": 4138985984.0,
+            "28": 4138985984.0,
+            "29": 4138985984.0,
+            "30": 4138985984.0,
+            "31": 4138985984.0,
+            "32": 4138985984.0,
+            "33": 4138985984.0,
+            "34": 4138985984.0,
+            "35": 4138985984.0,
+            "36": 4138985984.0,
+            "37": 4138985984.0,
+            "38": 4138985984.0,
+            "39": 4138985984.0,
+            "40": 4138985984.0,
+            "41": 4138985984.0,
+            "42": 4138985984.0,
+            "43": 4138985984.0,
+            "44": 4138985984.0,
+            "45": 4138985984.0,
+            "46": 4138985984.0,
+            "47": 4138985984.0,
+            "48": 4138985984.0,
+            "49": 4138985984.0,
+            "50": 4138985984.0,
+            "51": 4138985984.0,
+            "52": 4138985984.0,
+            "53": 4138985984.0,
+            "54": 4138985984.0,
+            "55": 4138985984.0,
+            "56": 4138985984.0,
+            "57": 4138985984.0,
+            "58": 4138985984.0,
+            "59": 4138985984.0,
+            "60": 4138985984.0,
+            "61": 4138985984.0,
+            "62": 4138985984.0,
+            "63": 4138985984.0,
+            "64": 4138985984.0,
+            "65": 4138985984.0,
+            "66": 4138985984.0,
+            "67": 4138985984.0,
+            "68": 4138985984.0,
+            "69": 4138985984.0,
+            "70": 4138985984.0,
+            "71": 4138985984.0,
+            "72": 4138985984.0,
+            "73": 4138985984.0,
+            "74": 4138985984.0,
+            "75": 4138985984.0,
+            "76": 4138985984.0,
+            "77": 4138985984.0,
+            "78": 4138985984.0,
+            "79": 4138985984.0,
+            "80": 4138985984.0,
+            "81": 4138985984.0,
+            "82": 4138985984.0,
+            "83": 4138985984.0,
+            "84": 4138985984.0,
+            "85": 4138985984.0,
+            "86": 4138985984.0,
+            "87": 4138985984.0,
+            "88": 4138985984.0,
+            "89": 4138985984.0,
+            "90": 4138985984.0,
+            "91": 4138985984.0,
+            "92": 4138985984.0,
+            "93": 4138985984.0,
+            "94": 4138985984.0,
+            "95": 4138985984.0,
+            "96": 4138985984.0,
+            "97": 4138985984.0,
+            "98": 4138985984.0,
+            "99": 4138985984.0,
+            "100": 4138985984.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4345973248.0,
+            "2": 6174256128.0,
+            "3": 6177401856.0,
+            "4": 6177401856.0,
+            "5": 6177401856.0,
+            "6": 6177401856.0,
+            "7": 6177401856.0,
+            "8": 6177401856.0,
+            "9": 6177401856.0,
+            "10": 6177401856.0,
+            "11": 6177401856.0,
+            "12": 6177401856.0,
+            "13": 6177401856.0,
+            "14": 6177401856.0,
+            "15": 6177401856.0,
+            "16": 6177401856.0,
+            "17": 6177401856.0,
+            "18": 6177401856.0,
+            "19": 6177401856.0,
+            "20": 6177401856.0,
+            "21": 6177401856.0,
+            "22": 6177401856.0,
+            "23": 6177401856.0,
+            "24": 6177401856.0,
+            "25": 6177401856.0,
+            "26": 6177401856.0,
+            "27": 6177401856.0,
+            "28": 6177401856.0,
+            "29": 6177401856.0,
+            "30": 6177401856.0,
+            "31": 6177401856.0,
+            "32": 6177401856.0,
+            "33": 6177401856.0,
+            "34": 6177401856.0,
+            "35": 6177401856.0,
+            "36": 6177401856.0,
+            "37": 6177401856.0,
+            "38": 6177401856.0,
+            "39": 6177401856.0,
+            "40": 6177401856.0,
+            "41": 6177401856.0,
+            "42": 6177401856.0,
+            "43": 6177401856.0,
+            "44": 6177401856.0,
+            "45": 6177401856.0,
+            "46": 6177401856.0,
+            "47": 6177401856.0,
+            "48": 6177401856.0,
+            "49": 6177401856.0,
+            "50": 6177401856.0,
+            "51": 6177401856.0,
+            "52": 6177401856.0,
+            "53": 6177401856.0,
+            "54": 6177401856.0,
+            "55": 6177401856.0,
+            "56": 6177401856.0,
+            "57": 6177401856.0,
+            "58": 6177401856.0,
+            "59": 6177401856.0,
+            "60": 6177401856.0,
+            "61": 6177401856.0,
+            "62": 6177401856.0,
+            "63": 6177401856.0,
+            "64": 6177401856.0,
+            "65": 6177401856.0,
+            "66": 6177401856.0,
+            "67": 6177401856.0,
+            "68": 6177401856.0,
+            "69": 6177401856.0,
+            "70": 6177401856.0,
+            "71": 6177401856.0,
+            "72": 6177401856.0,
+            "73": 6177401856.0,
+            "74": 6177401856.0,
+            "75": 6177401856.0,
+            "76": 6177401856.0,
+            "77": 6177401856.0,
+            "78": 6177401856.0,
+            "79": 6177401856.0,
+            "80": 6177401856.0,
+            "81": 6177401856.0,
+            "82": 6177401856.0,
+            "83": 6177401856.0,
+            "84": 6177401856.0,
+            "85": 6177401856.0,
+            "86": 6177401856.0,
+            "87": 6177401856.0,
+            "88": 6177401856.0,
+            "89": 6177401856.0,
+            "90": 6177401856.0,
+            "91": 6177401856.0,
+            "92": 6177401856.0,
+            "93": 6177401856.0,
+            "94": 6177401856.0,
+            "95": 6177401856.0,
+            "96": 6177401856.0,
+            "97": 6177401856.0,
+            "98": 6177401856.0,
+            "99": 6177401856.0,
+            "100": 6177401856.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 15.90256,
+            "2": 0.38776,
+            "3": 0.2538,
+            "4": 0.23765,
+            "5": 0.24163,
+            "6": 0.23676,
+            "7": 0.24025,
+            "8": 0.23655,
+            "9": 0.23987,
+            "10": 0.23768,
+            "11": 0.23998,
+            "12": 0.23715,
+            "13": 0.24393,
+            "14": 0.24443,
+            "15": 0.239,
+            "16": 0.23703,
+            "17": 0.23983,
+            "18": 0.68895,
+            "19": 0.24165,
+            "20": 0.23942,
+            "21": 0.2407,
+            "22": 0.24031,
+            "23": 0.24024,
+            "24": 0.23652,
+            "25": 0.24086,
+            "26": 0.2366,
+            "27": 0.23948,
+            "28": 0.23647,
+            "29": 0.23853,
+            "30": 0.23618,
+            "31": 0.24073,
+            "32": 0.24306,
+            "33": 0.24364,
+            "34": 0.24271,
+            "35": 0.25558,
+            "36": 0.24636,
+            "37": 0.24909,
+            "38": 0.24557,
+            "39": 0.23889,
+            "40": 0.23902,
+            "41": 0.24642,
+            "42": 0.25339,
+            "43": 0.24074,
+            "44": 0.24571,
+            "45": 0.24717,
+            "46": 0.24699,
+            "47": 0.24736,
+            "48": 0.24603,
+            "49": 0.24517,
+            "50": 0.24539,
+            "51": 0.24811,
+            "52": 0.24582,
+            "53": 0.24593,
+            "54": 0.24504,
+            "55": 0.246,
+            "56": 0.24529,
+            "57": 0.24504,
+            "58": 0.2456,
+            "59": 0.24486,
+            "60": 0.24469,
+            "61": 0.24492,
+            "62": 0.24541,
+            "63": 0.24477,
+            "64": 0.24513,
+            "65": 0.24517,
+            "66": 0.24604,
+            "67": 0.24545,
+            "68": 0.24484,
+            "69": 0.24544,
+            "70": 0.2465,
+            "71": 0.24485,
+            "72": 0.24533,
+            "73": 0.24696,
+            "74": 0.24713,
+            "75": 0.24439,
+            "76": 0.24545,
+            "77": 0.24597,
+            "78": 0.24609,
+            "79": 0.24565,
+            "80": 0.24461,
+            "81": 0.2449,
+            "82": 0.24557,
+            "83": 0.24452,
+            "84": 0.67347,
+            "85": 0.24571,
+            "86": 0.24569,
+            "87": 0.62538,
+            "88": 0.24689,
+            "89": 0.24525,
+            "90": 0.67646,
+            "91": 0.24552,
+            "92": 0.67563,
+            "93": 0.24534,
+            "94": 0.24466,
+            "95": 0.24425,
+            "96": 0.24474,
+            "97": 0.24581,
+            "98": 0.24507,
+            "99": 0.24475,
+            "100": 0.24541
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json
new file mode 100644
index 00000000000..ecfeaf1c209
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.38728,
+            "52": 7.13728,
+            "53": 7.12477,
+            "54": 7.23618,
+            "55": 7.16789,
+            "56": 7.22866,
+            "57": 7.34625,
+            "58": 7.03082,
+            "59": 7.12273,
+            "60": 7.16511,
+            "61": 7.11656,
+            "62": 7.26779,
+            "63": 7.16695,
+            "64": 7.08275,
+            "65": 7.00051,
+            "66": 7.07139,
+            "67": 7.05884,
+            "68": 7.14563,
+            "69": 7.03993,
+            "70": 7.07139,
+            "71": 6.91636,
+            "72": 7.02022,
+            "73": 6.99002,
+            "74": 6.91408,
+            "75": 7.07586,
+            "76": 6.97032,
+            "77": 7.08431,
+            "78": 7.03516,
+            "79": 6.88312,
+            "80": 6.95246,
+            "81": 6.98441,
+            "82": 7.06806,
+            "83": 7.00882,
+            "84": 7.01789,
+            "85": 6.86372,
+            "86": 7.04924,
+            "87": 6.99288,
+            "88": 6.92333,
+            "89": 6.82337,
+            "90": 7.25405,
+            "91": 6.72212,
+            "92": 7.05344,
+            "93": 6.91633,
+            "94": 7.0654,
+            "95": 6.85964,
+            "96": 6.98723,
+            "97": 6.96749,
+            "98": 6.89904,
+            "99": 7.02746,
+            "100": 6.99698
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41175.0,
+            "52": 43901.0,
+            "53": 43967.0,
+            "54": 41964.0,
+            "55": 43968.0,
+            "56": 43280.0,
+            "57": 42566.0,
+            "58": 43903.0,
+            "59": 44657.0,
+            "60": 41346.0,
+            "61": 39760.0,
+            "62": 44779.0,
+            "63": 44680.0,
+            "64": 45395.0,
+            "65": 44726.0,
+            "66": 45386.0,
+            "67": 43197.0,
+            "68": 42570.0,
+            "69": 43834.0,
+            "70": 45545.0,
+            "71": 43402.0,
+            "72": 44828.0,
+            "73": 45410.0,
+            "74": 42508.0,
+            "75": 44680.0,
+            "76": 43936.0,
+            "77": 42111.0,
+            "78": 40541.0,
+            "79": 38950.0,
+            "80": 41138.0,
+            "81": 45397.0,
+            "82": 43256.0,
+            "83": 38500.0,
+            "84": 42533.0,
+            "85": 44039.0,
+            "86": 45756.0,
+            "87": 41125.0,
+            "88": 41799.0,
+            "89": 41088.0,
+            "90": 44735.0,
+            "91": 46292.0,
+            "92": 41852.0,
+            "93": 43234.0,
+            "94": 39581.0,
+            "95": 44094.0,
+            "96": 44736.0,
+            "97": 45487.0,
+            "98": 41852.0,
+            "99": 45522.0,
+            "100": 42475.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4137282048.0,
+            "52": 4137282048.0,
+            "53": 4137282048.0,
+            "54": 4137282048.0,
+            "55": 4137282048.0,
+            "56": 4137282048.0,
+            "57": 4137282048.0,
+            "58": 4137282048.0,
+            "59": 4137282048.0,
+            "60": 4137282048.0,
+            "61": 4137282048.0,
+            "62": 4137282048.0,
+            "63": 4137282048.0,
+            "64": 4137282048.0,
+            "65": 4137282048.0,
+            "66": 4137282048.0,
+            "67": 4137282048.0,
+            "68": 4137282048.0,
+            "69": 4137282048.0,
+            "70": 4137282048.0,
+            "71": 4137282048.0,
+            "72": 4137282048.0,
+            "73": 4137282048.0,
+            "74": 4137282048.0,
+            "75": 4137282048.0,
+            "76": 4137282048.0,
+            "77": 4137282048.0,
+            "78": 4137282048.0,
+            "79": 4137282048.0,
+            "80": 4137282048.0,
+            "81": 4137282048.0,
+            "82": 4137282048.0,
+            "83": 4137282048.0,
+            "84": 4137282048.0,
+            "85": 4137282048.0,
+            "86": 4137282048.0,
+            "87": 4137282048.0,
+            "88": 4137282048.0,
+            "89": 4137282048.0,
+            "90": 4137282048.0,
+            "91": 4137282048.0,
+            "92": 4137282048.0,
+            "93": 4137282048.0,
+            "94": 4137282048.0,
+            "95": 4137282048.0,
+            "96": 4137282048.0,
+            "97": 4137282048.0,
+            "98": 4137282048.0,
+            "99": 4137282048.0,
+            "100": 4137282048.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6119897600.0,
+            "52": 6198635520.0,
+            "53": 6198635520.0,
+            "54": 6198635520.0,
+            "55": 6198635520.0,
+            "56": 6198635520.0,
+            "57": 6198635520.0,
+            "58": 6198635520.0,
+            "59": 6198635520.0,
+            "60": 6198635520.0,
+            "61": 6198635520.0,
+            "62": 6198635520.0,
+            "63": 6198635520.0,
+            "64": 6198635520.0,
+            "65": 6198635520.0,
+            "66": 6198635520.0,
+            "67": 6198635520.0,
+            "68": 6198635520.0,
+            "69": 6198635520.0,
+            "70": 6198635520.0,
+            "71": 6198635520.0,
+            "72": 6198635520.0,
+            "73": 6198635520.0,
+            "74": 6198635520.0,
+            "75": 6198635520.0,
+            "76": 6198635520.0,
+            "77": 6198635520.0,
+            "78": 6198635520.0,
+            "79": 6198635520.0,
+            "80": 6198635520.0,
+            "81": 6198635520.0,
+            "82": 6198635520.0,
+            "83": 6198635520.0,
+            "84": 6198635520.0,
+            "85": 6198635520.0,
+            "86": 6198635520.0,
+            "87": 6198635520.0,
+            "88": 6198635520.0,
+            "89": 6198635520.0,
+            "90": 6198635520.0,
+            "91": 6198635520.0,
+            "92": 6198635520.0,
+            "93": 6198635520.0,
+            "94": 6198635520.0,
+            "95": 6198635520.0,
+            "96": 6198635520.0,
+            "97": 6198635520.0,
+            "98": 6198635520.0,
+            "99": 6198635520.0,
+            "100": 6198635520.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 17.50157,
+            "52": 0.25854,
+            "53": 0.23866,
+            "54": 0.23772,
+            "55": 0.23735,
+            "56": 0.25491,
+            "57": 0.23917,
+            "58": 0.23806,
+            "59": 0.24067,
+            "60": 0.25384,
+            "61": 0.64867,
+            "62": 0.23907,
+            "63": 0.23697,
+            "64": 0.23809,
+            "65": 0.23776,
+            "66": 0.23806,
+            "67": 0.23688,
+            "68": 0.2374,
+            "69": 0.23748,
+            "70": 0.23755,
+            "71": 0.23825,
+            "72": 0.23729,
+            "73": 0.23714,
+            "74": 0.23744,
+            "75": 0.24319,
+            "76": 0.24832,
+            "77": 0.24157,
+            "78": 0.24391,
+            "79": 0.24576,
+            "80": 0.245,
+            "81": 0.24875,
+            "82": 0.24081,
+            "83": 0.24491,
+            "84": 0.24628,
+            "85": 0.23944,
+            "86": 0.23819,
+            "87": 0.23895,
+            "88": 0.24078,
+            "89": 0.24348,
+            "90": 0.23902,
+            "91": 0.23911,
+            "92": 0.23727,
+            "93": 0.23776,
+            "94": 0.23873,
+            "95": 0.23736,
+            "96": 0.23765,
+            "97": 0.23709,
+            "98": 0.2376,
+            "99": 0.23731,
+            "100": 0.23775
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
index 8809a47cd54..2f16e1424cf 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.22025,
-            "2": 0.31576,
-            "3": 0.19278,
-            "4": 0.19432,
-            "5": 0.18909,
-            "6": 0.19307,
-            "7": 0.18922,
-            "8": 0.19506,
-            "9": 0.18834,
-            "10": 0.19233,
-            "11": 0.18825,
-            "12": 0.19571,
-            "13": 0.19081,
-            "14": 0.19613,
-            "15": 0.18954,
-            "16": 0.18825,
-            "17": 0.18583,
-            "18": 0.18933,
-            "19": 0.1896,
-            "20": 0.19136,
-            "21": 0.18842,
-            "22": 0.19581,
-            "23": 0.18752,
-            "24": 0.19277,
-            "25": 0.18759,
-            "26": 0.19405,
-            "27": 0.18784,
-            "28": 0.18762,
-            "29": 0.19232,
-            "30": 0.18798,
-            "31": 0.18713,
-            "32": 0.18948,
-            "33": 0.18968,
-            "34": 0.19011,
-            "35": 0.18907,
-            "36": 0.18983,
-            "37": 0.18857,
-            "38": 0.18728,
-            "39": 0.18835,
-            "40": 0.18777,
-            "41": 0.188,
-            "42": 0.18818,
-            "43": 0.18602,
-            "44": 0.18972,
-            "45": 0.19276,
-            "46": 0.18816,
-            "47": 0.18794,
-            "48": 0.19299,
-            "49": 0.19241,
-            "50": 0.18805,
-            "51": 0.18895,
-            "52": 0.19459,
-            "53": 0.18821,
-            "54": 0.18597,
-            "55": 0.189,
-            "56": 0.18748,
-            "57": 0.18709,
-            "58": 0.19127,
-            "59": 0.19097,
-            "60": 0.18702,
-            "61": 0.18725,
-            "62": 0.18762,
-            "63": 0.19407,
-            "64": 0.19411,
-            "65": 0.20071,
-            "66": 0.19555,
-            "67": 0.22543,
-            "68": 0.21724,
-            "69": 0.22635,
-            "70": 0.52922,
-            "71": 0.19086,
-            "72": 0.19899,
-            "73": 0.51667,
-            "74": 0.20138,
-            "75": 0.19507,
-            "76": 0.24987,
-            "77": 0.22838,
-            "78": 0.51523,
-            "79": 0.19126,
-            "80": 0.18911,
-            "81": 0.19269,
-            "82": 0.18816,
-            "83": 0.18902,
-            "84": 0.18942,
-            "85": 0.19004,
-            "86": 0.50868,
-            "87": 0.19274,
-            "88": 0.18813,
-            "89": 0.19169,
-            "90": 0.50854,
-            "91": 0.1924,
-            "92": 0.18906,
-            "93": 0.19016,
-            "94": 0.1902,
-            "95": 0.19338,
-            "96": 0.51468,
-            "97": 0.19597,
-            "98": 0.19147,
-            "99": 0.19626,
-            "100": 0.18852
+            "1": 21.8125,
+            "2": 0.28714,
+            "3": 0.18248,
+            "4": 0.16775,
+            "5": 0.16676,
+            "6": 0.16648,
+            "7": 0.16754,
+            "8": 0.1665,
+            "9": 0.16691,
+            "10": 0.16693,
+            "11": 0.16662,
+            "12": 0.16643,
+            "13": 0.16866,
+            "14": 0.18027,
+            "15": 0.18602,
+            "16": 0.17217,
+            "17": 0.1728,
+            "18": 0.80687,
+            "19": 0.17209,
+            "20": 0.16817,
+            "21": 0.16774,
+            "22": 0.16767,
+            "23": 0.16997,
+            "24": 0.17545,
+            "25": 0.16618,
+            "26": 0.16606,
+            "27": 0.16686,
+            "28": 0.16671,
+            "29": 0.16978,
+            "30": 0.16859,
+            "31": 0.16653,
+            "32": 0.16895,
+            "33": 0.1718,
+            "34": 0.16983,
+            "35": 0.17083,
+            "36": 0.16981,
+            "37": 0.21328,
+            "38": 0.20684,
+            "39": 0.17073,
+            "40": 0.17292,
+            "41": 0.17014,
+            "42": 0.16958,
+            "43": 0.17123,
+            "44": 0.23117,
+            "45": 0.17089,
+            "46": 0.16839,
+            "47": 0.16741,
+            "48": 0.16733,
+            "49": 0.16907,
+            "50": 0.166,
+            "51": 0.18917,
+            "52": 0.16625,
+            "53": 0.1648,
+            "54": 0.16453,
+            "55": 0.19111,
+            "56": 0.16472,
+            "57": 0.1648,
+            "58": 0.16849,
+            "59": 0.16461,
+            "60": 0.16483,
+            "61": 0.16545,
+            "62": 0.1653,
+            "63": 0.16489,
+            "64": 0.16447,
+            "65": 0.16466,
+            "66": 0.16483,
+            "67": 0.1656,
+            "68": 0.16424,
+            "69": 0.16509,
+            "70": 0.16891,
+            "71": 0.16577,
+            "72": 0.1654,
+            "73": 0.16726,
+            "74": 0.16512,
+            "75": 0.16474,
+            "76": 0.16524,
+            "77": 0.1647,
+            "78": 0.16627,
+            "79": 0.16568,
+            "80": 0.16511,
+            "81": 0.16637,
+            "82": 0.16694,
+            "83": 0.16527,
+            "84": 0.56724,
+            "85": 0.17088,
+            "86": 0.16835,
+            "87": 0.59121,
+            "88": 0.16681,
+            "89": 0.16548,
+            "90": 0.58424,
+            "91": 0.1663,
+            "92": 0.57005,
+            "93": 0.16681,
+            "94": 0.165,
+            "95": 0.16566,
+            "96": 0.16609,
+            "97": 0.16553,
+            "98": 0.16396,
+            "99": 0.16454,
+            "100": 0.16365
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..990df178a9a
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.37351,
+            "52": 7.13362,
+            "53": 7.11248,
+            "54": 7.23395,
+            "55": 7.14784,
+            "56": 7.2278,
+            "57": 7.33273,
+            "58": 6.99464,
+            "59": 7.11597,
+            "60": 7.13216,
+            "61": 7.10561,
+            "62": 7.26519,
+            "63": 7.14764,
+            "64": 7.08702,
+            "65": 6.98658,
+            "66": 7.04733,
+            "67": 7.04745,
+            "68": 7.14076,
+            "69": 7.24347,
+            "70": 7.05974,
+            "71": 6.89358,
+            "72": 6.99793,
+            "73": 6.97928,
+            "74": 6.91973,
+            "75": 7.05295,
+            "76": 6.96054,
+            "77": 7.07939,
+            "78": 7.0137,
+            "79": 6.88344,
+            "80": 6.93032,
+            "81": 6.96568,
+            "82": 7.05273,
+            "83": 6.98785,
+            "84": 7.00434,
+            "85": 6.84596,
+            "86": 7.03651,
+            "87": 6.96347,
+            "88": 6.91343,
+            "89": 6.80657,
+            "90": 7.23629,
+            "91": 6.70068,
+            "92": 7.05694,
+            "93": 6.89292,
+            "94": 7.05848,
+            "95": 6.84802,
+            "96": 6.9679,
+            "97": 6.9429,
+            "98": 6.87432,
+            "99": 7.01828,
+            "100": 6.98491
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41200.0,
+            "52": 43884.0,
+            "53": 43946.0,
+            "54": 41916.0,
+            "55": 43925.0,
+            "56": 43252.0,
+            "57": 42636.0,
+            "58": 43941.0,
+            "59": 44619.0,
+            "60": 41400.0,
+            "61": 39750.0,
+            "62": 44764.0,
+            "63": 44671.0,
+            "64": 45375.0,
+            "65": 44753.0,
+            "66": 45404.0,
+            "67": 43154.0,
+            "68": 42551.0,
+            "69": 43844.0,
+            "70": 45537.0,
+            "71": 43335.0,
+            "72": 44839.0,
+            "73": 45372.0,
+            "74": 42511.0,
+            "75": 44712.0,
+            "76": 43930.0,
+            "77": 42073.0,
+            "78": 40535.0,
+            "79": 38992.0,
+            "80": 41092.0,
+            "81": 45382.0,
+            "82": 43275.0,
+            "83": 38475.0,
+            "84": 42418.0,
+            "85": 43979.0,
+            "86": 45691.0,
+            "87": 41145.0,
+            "88": 41782.0,
+            "89": 41042.0,
+            "90": 44713.0,
+            "91": 46270.0,
+            "92": 41845.0,
+            "93": 43272.0,
+            "94": 39536.0,
+            "95": 44085.0,
+            "96": 44689.0,
+            "97": 45411.0,
+            "98": 41858.0,
+            "99": 45575.0,
+            "100": 42501.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4166512128.0,
+            "52": 4166512128.0,
+            "53": 4166512128.0,
+            "54": 4166512128.0,
+            "55": 4166512128.0,
+            "56": 4166512128.0,
+            "57": 4166512128.0,
+            "58": 4166512128.0,
+            "59": 4166512128.0,
+            "60": 4166512128.0,
+            "61": 4166512128.0,
+            "62": 4166512128.0,
+            "63": 4166512128.0,
+            "64": 4166512128.0,
+            "65": 4166512128.0,
+            "66": 4166512128.0,
+            "67": 4166512128.0,
+            "68": 4166512128.0,
+            "69": 4166512128.0,
+            "70": 4166512128.0,
+            "71": 4166512128.0,
+            "72": 4166512128.0,
+            "73": 4166512128.0,
+            "74": 4166512128.0,
+            "75": 4166512128.0,
+            "76": 4166512128.0,
+            "77": 4166512128.0,
+            "78": 4166512128.0,
+            "79": 4166512128.0,
+            "80": 4166512128.0,
+            "81": 4166512128.0,
+            "82": 4166512128.0,
+            "83": 4166512128.0,
+            "84": 4166512128.0,
+            "85": 4166512128.0,
+            "86": 4166512128.0,
+            "87": 4166512128.0,
+            "88": 4166512128.0,
+            "89": 4166512128.0,
+            "90": 4166512128.0,
+            "91": 4166512128.0,
+            "92": 4166512128.0,
+            "93": 4166512128.0,
+            "94": 4166512128.0,
+            "95": 4166512128.0,
+            "96": 4166512128.0,
+            "97": 4166512128.0,
+            "98": 4166512128.0,
+            "99": 4166512128.0,
+            "100": 4166512128.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6147947008.0,
+            "52": 6229044224.0,
+            "53": 6229044224.0,
+            "54": 6229044224.0,
+            "55": 6229044224.0,
+            "56": 6229044224.0,
+            "57": 6229044224.0,
+            "58": 6229044224.0,
+            "59": 6229044224.0,
+            "60": 6229044224.0,
+            "61": 6229044224.0,
+            "62": 6229044224.0,
+            "63": 6229044224.0,
+            "64": 6229044224.0,
+            "65": 6229044224.0,
+            "66": 6229044224.0,
+            "67": 6229044224.0,
+            "68": 6229044224.0,
+            "69": 6229044224.0,
+            "70": 6229044224.0,
+            "71": 6229044224.0,
+            "72": 6229044224.0,
+            "73": 6229044224.0,
+            "74": 6229044224.0,
+            "75": 6229044224.0,
+            "76": 6229044224.0,
+            "77": 6229044224.0,
+            "78": 6229044224.0,
+            "79": 6229044224.0,
+            "80": 6229044224.0,
+            "81": 6229044224.0,
+            "82": 6229044224.0,
+            "83": 6229044224.0,
+            "84": 6229044224.0,
+            "85": 6229044224.0,
+            "86": 6229044224.0,
+            "87": 6229044224.0,
+            "88": 6229044224.0,
+            "89": 6229044224.0,
+            "90": 6229044224.0,
+            "91": 6229044224.0,
+            "92": 6229044224.0,
+            "93": 6229044224.0,
+            "94": 6229044224.0,
+            "95": 6229044224.0,
+            "96": 6229044224.0,
+            "97": 6229044224.0,
+            "98": 6229044224.0,
+            "99": 6229044224.0,
+            "100": 6229044224.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 21.52581,
+            "52": 0.20557,
+            "53": 0.16728,
+            "54": 0.16541,
+            "55": 0.16459,
+            "56": 0.1635,
+            "57": 0.16634,
+            "58": 0.16486,
+            "59": 0.18518,
+            "60": 0.18385,
+            "61": 0.18349,
+            "62": 0.16716,
+            "63": 0.85301,
+            "64": 0.16878,
+            "65": 0.16296,
+            "66": 0.16285,
+            "67": 0.16213,
+            "68": 0.1653,
+            "69": 0.16402,
+            "70": 0.16087,
+            "71": 0.16009,
+            "72": 0.16411,
+            "73": 0.16271,
+            "74": 0.16402,
+            "75": 0.19388,
+            "76": 0.19834,
+            "77": 0.18848,
+            "78": 0.17552,
+            "79": 0.16404,
+            "80": 0.21371,
+            "81": 0.16791,
+            "82": 0.16882,
+            "83": 0.16426,
+            "84": 0.16282,
+            "85": 0.16565,
+            "86": 0.16341,
+            "87": 0.16331,
+            "88": 0.16306,
+            "89": 0.16564,
+            "90": 0.20919,
+            "91": 0.16623,
+            "92": 0.16207,
+            "93": 0.16589,
+            "94": 0.16268,
+            "95": 0.16134,
+            "96": 0.16581,
+            "97": 0.1593,
+            "98": 0.16011,
+            "99": 0.16089,
+            "100": 0.16056
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
index 6e6c2f4365a..25b93ce0f66 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.38869, "5": 9.38084, "10": 9.05709, "15": 8.65595, "20": 8.26189, "25": 7.98194, "30": 7.86925, "35": 7.66275, "40": 7.5007, "45": 7.34875, "50": 7.18139, "55": 7.15407, "60": 7.14724, "65": 6.99707, "70": 7.06003, "75": 7.0608, "80": 6.94288, "85": 6.85973, "90": 7.24972, "95": 6.84835, "100": 6.9828}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43318.0, "5": 45385.0, "10": 45371.0, "15": 43897.0, "20": 44769.0, "25": 42476.0, "30": 43985.0, "35": 43264.0, "40": 43230.0, "45": 43278.0, "50": 43381.0, "55": 43857.0, "60": 41225.0, "65": 44683.0, "70": 45534.0, "75": 44679.0, "80": 41115.0, "85": 44010.0, "90": 44673.0, "95": 44064.0, "100": 42520.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2164472832.0, "5": 2164472832.0, "10": 2164472832.0, "15": 2164472832.0, "20": 2164472832.0, "25": 2164472832.0, "30": 2164472832.0, "35": 2164472832.0, "40": 2164472832.0, "45": 2164472832.0, "50": 2164472832.0, "55": 2164472832.0, "60": 2164472832.0, "65": 2164472832.0, "70": 2164472832.0, "75": 2164472832.0, "80": 2164472832.0, "85": 2164472832.0, "90": 2164472832.0, "95": 2164472832.0, "100": 2164472832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2413216256.0, "5": 3345833472.0, "10": 3345833472.0, "15": 3345833472.0, "20": 3345833472.0, "25": 3345833472.0, "30": 3345833472.0, "35": 3345833472.0, "40": 3345833472.0, "45": 3345833472.0, "50": 3345833472.0, "55": 3345833472.0, "60": 3345833472.0, "65": 3345833472.0, "70": 3345833472.0, "75": 3345833472.0, "80": 3345833472.0, "85": 3345833472.0, "90": 3345833472.0, "95": 3345833472.0, "100": 3345833472.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.74162, "5": 0.47846, "10": 0.4772, "15": 0.47601, "20": 0.47317, "25": 0.47899, "30": 0.79953, "35": 0.47489, "40": 0.47181, "45": 0.772, "50": 0.4704, "55": 0.47309, "60": 0.47139, "65": 0.4766, "70": 0.47286, "75": 0.47576, "80": 0.4722, "85": 0.47279, "90": 0.46958, "95": 0.46793, "100": 0.47059}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.38869,
+            "2": 10.39385,
+            "3": 9.78084,
+            "4": 9.59727,
+            "5": 9.38084,
+            "6": 9.40579,
+            "7": 9.30788,
+            "8": 9.24106,
+            "9": 9.12192,
+            "10": 9.05709,
+            "11": 8.87331,
+            "12": 8.7937,
+            "13": 8.84028,
+            "14": 8.68508,
+            "15": 8.65595,
+            "16": 8.54356,
+            "17": 8.50088,
+            "18": 8.39002,
+            "19": 8.36442,
+            "20": 8.26189,
+            "21": 8.27089,
+            "22": 8.14388,
+            "23": 8.07456,
+            "24": 8.11903,
+            "25": 7.98194,
+            "26": 8.08775,
+            "27": 7.87135,
+            "28": 7.96498,
+            "29": 7.80253,
+            "30": 7.86925,
+            "31": 7.81724,
+            "32": 7.68778,
+            "33": 7.78042,
+            "34": 7.55486,
+            "35": 7.66275,
+            "36": 7.52238,
+            "37": 7.44446,
+            "38": 7.50242,
+            "39": 7.45039,
+            "40": 7.5007,
+            "41": 7.39051,
+            "42": 7.36065,
+            "43": 7.43329,
+            "44": 7.3762,
+            "45": 7.34875,
+            "46": 7.28162,
+            "47": 7.46112,
+            "48": 7.28762,
+            "49": 7.35376,
+            "50": 7.18139,
+            "51": 7.36575,
+            "52": 7.1333,
+            "53": 7.11549,
+            "54": 7.22921,
+            "55": 7.15407,
+            "56": 7.22241,
+            "57": 7.32951,
+            "58": 7.02329,
+            "59": 7.11369,
+            "60": 7.14724,
+            "61": 7.11415,
+            "62": 7.24749,
+            "63": 7.15673,
+            "64": 7.08408,
+            "65": 6.99707,
+            "66": 7.06064,
+            "67": 7.04874,
+            "68": 7.14167,
+            "69": 7.0346,
+            "70": 7.06003,
+            "71": 6.92549,
+            "72": 7.00408,
+            "73": 6.97962,
+            "74": 6.92272,
+            "75": 7.0608,
+            "76": 6.97256,
+            "77": 7.08183,
+            "78": 7.01864,
+            "79": 6.8552,
+            "80": 6.94288,
+            "81": 6.97634,
+            "82": 7.06647,
+            "83": 6.99975,
+            "84": 7.00894,
+            "85": 6.85973,
+            "86": 7.03631,
+            "87": 6.98045,
+            "88": 6.91491,
+            "89": 6.81048,
+            "90": 7.24972,
+            "91": 6.71004,
+            "92": 7.04898,
+            "93": 6.90555,
+            "94": 7.06456,
+            "95": 6.84835,
+            "96": 6.97647,
+            "97": 6.9631,
+            "98": 6.88688,
+            "99": 7.01307,
+            "100": 6.9828
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43318.0,
+            "2": 44050.0,
+            "3": 44756.0,
+            "4": 42391.0,
+            "5": 45385.0,
+            "6": 40966.0,
+            "7": 43182.0,
+            "8": 45459.0,
+            "9": 42453.0,
+            "10": 45371.0,
+            "11": 43978.0,
+            "12": 44598.0,
+            "13": 43892.0,
+            "14": 46190.0,
+            "15": 43897.0,
+            "16": 41608.0,
+            "17": 43825.0,
+            "18": 44703.0,
+            "19": 42550.0,
+            "20": 44769.0,
+            "21": 44793.0,
+            "22": 41844.0,
+            "23": 45444.0,
+            "24": 43071.0,
+            "25": 42476.0,
+            "26": 43926.0,
+            "27": 46218.0,
+            "28": 46430.0,
+            "29": 46178.0,
+            "30": 43985.0,
+            "31": 41281.0,
+            "32": 43347.0,
+            "33": 45448.0,
+            "34": 43305.0,
+            "35": 43264.0,
+            "36": 42485.0,
+            "37": 40077.0,
+            "38": 42514.0,
+            "39": 44723.0,
+            "40": 43230.0,
+            "41": 44653.0,
+            "42": 43269.0,
+            "43": 45446.0,
+            "44": 44588.0,
+            "45": 43278.0,
+            "46": 43896.0,
+            "47": 42369.0,
+            "48": 44704.0,
+            "49": 43172.0,
+            "50": 43381.0,
+            "51": 41175.0,
+            "52": 43812.0,
+            "53": 43934.0,
+            "54": 41932.0,
+            "55": 43857.0,
+            "56": 43277.0,
+            "57": 42576.0,
+            "58": 43835.0,
+            "59": 44629.0,
+            "60": 41225.0,
+            "61": 39716.0,
+            "62": 44773.0,
+            "63": 44717.0,
+            "64": 45367.0,
+            "65": 44683.0,
+            "66": 45367.0,
+            "67": 43136.0,
+            "68": 42523.0,
+            "69": 43828.0,
+            "70": 45534.0,
+            "71": 43316.0,
+            "72": 44750.0,
+            "73": 45364.0,
+            "74": 42445.0,
+            "75": 44679.0,
+            "76": 43875.0,
+            "77": 42100.0,
+            "78": 40289.0,
+            "79": 38949.0,
+            "80": 41115.0,
+            "81": 45362.0,
+            "82": 43205.0,
+            "83": 38475.0,
+            "84": 42459.0,
+            "85": 44010.0,
+            "86": 45731.0,
+            "87": 40860.0,
+            "88": 41793.0,
+            "89": 41068.0,
+            "90": 44673.0,
+            "91": 46149.0,
+            "92": 41798.0,
+            "93": 43246.0,
+            "94": 39583.0,
+            "95": 44064.0,
+            "96": 44715.0,
+            "97": 45390.0,
+            "98": 41808.0,
+            "99": 45436.0,
+            "100": 42520.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2164472832.0,
+            "2": 2164472832.0,
+            "3": 2164472832.0,
+            "4": 2164472832.0,
+            "5": 2164472832.0,
+            "6": 2164472832.0,
+            "7": 2164472832.0,
+            "8": 2164472832.0,
+            "9": 2164472832.0,
+            "10": 2164472832.0,
+            "11": 2164472832.0,
+            "12": 2164472832.0,
+            "13": 2164472832.0,
+            "14": 2164472832.0,
+            "15": 2164472832.0,
+            "16": 2164472832.0,
+            "17": 2164472832.0,
+            "18": 2164472832.0,
+            "19": 2164472832.0,
+            "20": 2164472832.0,
+            "21": 2164472832.0,
+            "22": 2164472832.0,
+            "23": 2164472832.0,
+            "24": 2164472832.0,
+            "25": 2164472832.0,
+            "26": 2164472832.0,
+            "27": 2164472832.0,
+            "28": 2164472832.0,
+            "29": 2164472832.0,
+            "30": 2164472832.0,
+            "31": 2164472832.0,
+            "32": 2164472832.0,
+            "33": 2164472832.0,
+            "34": 2164472832.0,
+            "35": 2164472832.0,
+            "36": 2164472832.0,
+            "37": 2164472832.0,
+            "38": 2164472832.0,
+            "39": 2164472832.0,
+            "40": 2164472832.0,
+            "41": 2164472832.0,
+            "42": 2164472832.0,
+            "43": 2164472832.0,
+            "44": 2164472832.0,
+            "45": 2164472832.0,
+            "46": 2164472832.0,
+            "47": 2164472832.0,
+            "48": 2164472832.0,
+            "49": 2164472832.0,
+            "50": 2164472832.0,
+            "51": 2164472832.0,
+            "52": 2164472832.0,
+            "53": 2164472832.0,
+            "54": 2164472832.0,
+            "55": 2164472832.0,
+            "56": 2164472832.0,
+            "57": 2164472832.0,
+            "58": 2164472832.0,
+            "59": 2164472832.0,
+            "60": 2164472832.0,
+            "61": 2164472832.0,
+            "62": 2164472832.0,
+            "63": 2164472832.0,
+            "64": 2164472832.0,
+            "65": 2164472832.0,
+            "66": 2164472832.0,
+            "67": 2164472832.0,
+            "68": 2164472832.0,
+            "69": 2164472832.0,
+            "70": 2164472832.0,
+            "71": 2164472832.0,
+            "72": 2164472832.0,
+            "73": 2164472832.0,
+            "74": 2164472832.0,
+            "75": 2164472832.0,
+            "76": 2164472832.0,
+            "77": 2164472832.0,
+            "78": 2164472832.0,
+            "79": 2164472832.0,
+            "80": 2164472832.0,
+            "81": 2164472832.0,
+            "82": 2164472832.0,
+            "83": 2164472832.0,
+            "84": 2164472832.0,
+            "85": 2164472832.0,
+            "86": 2164472832.0,
+            "87": 2164472832.0,
+            "88": 2164472832.0,
+            "89": 2164472832.0,
+            "90": 2164472832.0,
+            "91": 2164472832.0,
+            "92": 2164472832.0,
+            "93": 2164472832.0,
+            "94": 2164472832.0,
+            "95": 2164472832.0,
+            "96": 2164472832.0,
+            "97": 2164472832.0,
+            "98": 2164472832.0,
+            "99": 2164472832.0,
+            "100": 2164472832.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2413216256.0,
+            "2": 3345833472.0,
+            "3": 3345833472.0,
+            "4": 3345833472.0,
+            "5": 3345833472.0,
+            "6": 3345833472.0,
+            "7": 3345833472.0,
+            "8": 3345833472.0,
+            "9": 3345833472.0,
+            "10": 3345833472.0,
+            "11": 3345833472.0,
+            "12": 3345833472.0,
+            "13": 3345833472.0,
+            "14": 3345833472.0,
+            "15": 3345833472.0,
+            "16": 3345833472.0,
+            "17": 3345833472.0,
+            "18": 3345833472.0,
+            "19": 3345833472.0,
+            "20": 3345833472.0,
+            "21": 3345833472.0,
+            "22": 3345833472.0,
+            "23": 3345833472.0,
+            "24": 3345833472.0,
+            "25": 3345833472.0,
+            "26": 3345833472.0,
+            "27": 3345833472.0,
+            "28": 3345833472.0,
+            "29": 3345833472.0,
+            "30": 3345833472.0,
+            "31": 3345833472.0,
+            "32": 3345833472.0,
+            "33": 3345833472.0,
+            "34": 3345833472.0,
+            "35": 3345833472.0,
+            "36": 3345833472.0,
+            "37": 3345833472.0,
+            "38": 3345833472.0,
+            "39": 3345833472.0,
+            "40": 3345833472.0,
+            "41": 3345833472.0,
+            "42": 3345833472.0,
+            "43": 3345833472.0,
+            "44": 3345833472.0,
+            "45": 3345833472.0,
+            "46": 3345833472.0,
+            "47": 3345833472.0,
+            "48": 3345833472.0,
+            "49": 3345833472.0,
+            "50": 3345833472.0,
+            "51": 3345833472.0,
+            "52": 3345833472.0,
+            "53": 3345833472.0,
+            "54": 3345833472.0,
+            "55": 3345833472.0,
+            "56": 3345833472.0,
+            "57": 3345833472.0,
+            "58": 3345833472.0,
+            "59": 3345833472.0,
+            "60": 3345833472.0,
+            "61": 3345833472.0,
+            "62": 3345833472.0,
+            "63": 3345833472.0,
+            "64": 3345833472.0,
+            "65": 3345833472.0,
+            "66": 3345833472.0,
+            "67": 3345833472.0,
+            "68": 3345833472.0,
+            "69": 3345833472.0,
+            "70": 3345833472.0,
+            "71": 3345833472.0,
+            "72": 3345833472.0,
+            "73": 3345833472.0,
+            "74": 3345833472.0,
+            "75": 3345833472.0,
+            "76": 3345833472.0,
+            "77": 3345833472.0,
+            "78": 3345833472.0,
+            "79": 3345833472.0,
+            "80": 3345833472.0,
+            "81": 3345833472.0,
+            "82": 3345833472.0,
+            "83": 3345833472.0,
+            "84": 3345833472.0,
+            "85": 3345833472.0,
+            "86": 3345833472.0,
+            "87": 3345833472.0,
+            "88": 3345833472.0,
+            "89": 3345833472.0,
+            "90": 3345833472.0,
+            "91": 3345833472.0,
+            "92": 3345833472.0,
+            "93": 3345833472.0,
+            "94": 3345833472.0,
+            "95": 3345833472.0,
+            "96": 3345833472.0,
+            "97": 3345833472.0,
+            "98": 3345833472.0,
+            "99": 3345833472.0,
+            "100": 3345833472.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 15.92465,
+            "2": 0.73672,
+            "3": 0.44476,
+            "4": 0.43267,
+            "5": 0.43229,
+            "6": 0.43162,
+            "7": 0.43222,
+            "8": 0.4329,
+            "9": 0.43176,
+            "10": 0.43233,
+            "11": 0.43227,
+            "12": 0.43124,
+            "13": 0.43277,
+            "14": 0.44061,
+            "15": 0.4485,
+            "16": 0.45121,
+            "17": 0.80848,
+            "18": 0.43555,
+            "19": 0.43298,
+            "20": 0.44302,
+            "21": 0.44708,
+            "22": 0.43142,
+            "23": 0.43189,
+            "24": 0.44055,
+            "25": 0.4339,
+            "26": 0.43161,
+            "27": 0.43237,
+            "28": 0.43157,
+            "29": 0.43161,
+            "30": 0.43227,
+            "31": 0.43156,
+            "32": 0.43921,
+            "33": 0.43687,
+            "34": 0.43188,
+            "35": 0.43194,
+            "36": 0.43194,
+            "37": 0.43209,
+            "38": 0.43171,
+            "39": 0.4409,
+            "40": 0.45052,
+            "41": 0.43131,
+            "42": 0.43172,
+            "43": 0.43147,
+            "44": 0.84045,
+            "45": 0.43076,
+            "46": 0.43068,
+            "47": 0.87305,
+            "48": 0.43164,
+            "49": 1.00548,
+            "50": 0.8703,
+            "51": 0.43255,
+            "52": 0.43229,
+            "53": 0.43202,
+            "54": 0.432,
+            "55": 0.43189,
+            "56": 0.43154,
+            "57": 0.43166,
+            "58": 0.4319,
+            "59": 0.43132,
+            "60": 0.43234,
+            "61": 0.43225,
+            "62": 0.43193,
+            "63": 0.43153,
+            "64": 0.43325,
+            "65": 0.4339,
+            "66": 0.43652,
+            "67": 0.43828,
+            "68": 0.43797,
+            "69": 0.44101,
+            "70": 0.43951,
+            "71": 0.43787,
+            "72": 0.43391,
+            "73": 0.4315,
+            "74": 0.43378,
+            "75": 0.43568,
+            "76": 0.43331,
+            "77": 0.43334,
+            "78": 0.43227,
+            "79": 0.43399,
+            "80": 0.44924,
+            "81": 0.4326,
+            "82": 0.43301,
+            "83": 0.43228,
+            "84": 0.43254,
+            "85": 0.43238,
+            "86": 0.43838,
+            "87": 0.44364,
+            "88": 0.43194,
+            "89": 0.43286,
+            "90": 0.43292,
+            "91": 0.43386,
+            "92": 0.43602,
+            "93": 0.43208,
+            "94": 0.43192,
+            "95": 0.43262,
+            "96": 0.43158,
+            "97": 0.43293,
+            "98": 0.43715,
+            "99": 0.43258,
+            "100": 0.43232
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
index 89582b25851..8e29e2a4993 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2194357248.0,
-            "2": 2194357248.0,
-            "3": 2194357248.0,
-            "4": 2194357248.0,
-            "5": 2194357248.0,
-            "6": 2194357248.0,
-            "7": 2194357248.0,
-            "8": 2194357248.0,
-            "9": 2194357248.0,
-            "10": 2194357248.0,
-            "11": 2194357248.0,
-            "12": 2194357248.0,
-            "13": 2194357248.0,
-            "14": 2194357248.0,
-            "15": 2194357248.0,
-            "16": 2194357248.0,
-            "17": 2194357248.0,
-            "18": 2194357248.0,
-            "19": 2194357248.0,
-            "20": 2194357248.0,
-            "21": 2194357248.0,
-            "22": 2194357248.0,
-            "23": 2194357248.0,
-            "24": 2194357248.0,
-            "25": 2194357248.0,
-            "26": 2194357248.0,
-            "27": 2194357248.0,
-            "28": 2194357248.0,
-            "29": 2194357248.0,
-            "30": 2194357248.0,
-            "31": 2194357248.0,
-            "32": 2194357248.0,
-            "33": 2194357248.0,
-            "34": 2194357248.0,
-            "35": 2194357248.0,
-            "36": 2194357248.0,
-            "37": 2194357248.0,
-            "38": 2194357248.0,
-            "39": 2194357248.0,
-            "40": 2194357248.0,
-            "41": 2194357248.0,
-            "42": 2194357248.0,
-            "43": 2194357248.0,
-            "44": 2194357248.0,
-            "45": 2194357248.0,
-            "46": 2194357248.0,
-            "47": 2194357248.0,
-            "48": 2194357248.0,
-            "49": 2194357248.0,
-            "50": 2194357248.0,
-            "51": 2194357248.0,
-            "52": 2194357248.0,
-            "53": 2194357248.0,
-            "54": 2194357248.0,
-            "55": 2194357248.0,
-            "56": 2194357248.0,
-            "57": 2194357248.0,
-            "58": 2194357248.0,
-            "59": 2194357248.0,
-            "60": 2194357248.0,
-            "61": 2194357248.0,
-            "62": 2194357248.0,
-            "63": 2194357248.0,
-            "64": 2194357248.0,
-            "65": 2194357248.0,
-            "66": 2194357248.0,
-            "67": 2194357248.0,
-            "68": 2194357248.0,
-            "69": 2194357248.0,
-            "70": 2194357248.0,
-            "71": 2194357248.0,
-            "72": 2194357248.0,
-            "73": 2194357248.0,
-            "74": 2194357248.0,
-            "75": 2194357248.0,
-            "76": 2194357248.0,
-            "77": 2194357248.0,
-            "78": 2194357248.0,
-            "79": 2194357248.0,
-            "80": 2194357248.0,
-            "81": 2194357248.0,
-            "82": 2194357248.0,
-            "83": 2194357248.0,
-            "84": 2194357248.0,
-            "85": 2194357248.0,
-            "86": 2194357248.0,
-            "87": 2194357248.0,
-            "88": 2194357248.0,
-            "89": 2194357248.0,
-            "90": 2194357248.0,
-            "91": 2194357248.0,
-            "92": 2194357248.0,
-            "93": 2194357248.0,
-            "94": 2194357248.0,
-            "95": 2194357248.0,
-            "96": 2194357248.0,
-            "97": 2194357248.0,
-            "98": 2194357248.0,
-            "99": 2194357248.0,
-            "100": 2194357248.0
+            "1": 2196192256.0,
+            "2": 2196192256.0,
+            "3": 2196192256.0,
+            "4": 2196192256.0,
+            "5": 2196192256.0,
+            "6": 2196192256.0,
+            "7": 2196192256.0,
+            "8": 2196192256.0,
+            "9": 2196192256.0,
+            "10": 2196192256.0,
+            "11": 2196192256.0,
+            "12": 2196192256.0,
+            "13": 2196192256.0,
+            "14": 2196192256.0,
+            "15": 2196192256.0,
+            "16": 2196192256.0,
+            "17": 2196192256.0,
+            "18": 2196192256.0,
+            "19": 2196192256.0,
+            "20": 2196192256.0,
+            "21": 2196192256.0,
+            "22": 2196192256.0,
+            "23": 2196192256.0,
+            "24": 2196192256.0,
+            "25": 2196192256.0,
+            "26": 2196192256.0,
+            "27": 2196192256.0,
+            "28": 2196192256.0,
+            "29": 2196192256.0,
+            "30": 2196192256.0,
+            "31": 2196192256.0,
+            "32": 2196192256.0,
+            "33": 2196192256.0,
+            "34": 2196192256.0,
+            "35": 2196192256.0,
+            "36": 2196192256.0,
+            "37": 2196192256.0,
+            "38": 2196192256.0,
+            "39": 2196192256.0,
+            "40": 2196192256.0,
+            "41": 2196192256.0,
+            "42": 2196192256.0,
+            "43": 2196192256.0,
+            "44": 2196192256.0,
+            "45": 2196192256.0,
+            "46": 2196192256.0,
+            "47": 2196192256.0,
+            "48": 2196192256.0,
+            "49": 2196192256.0,
+            "50": 2196192256.0,
+            "51": 2196192256.0,
+            "52": 2196192256.0,
+            "53": 2196192256.0,
+            "54": 2196192256.0,
+            "55": 2196192256.0,
+            "56": 2196192256.0,
+            "57": 2196192256.0,
+            "58": 2196192256.0,
+            "59": 2196192256.0,
+            "60": 2196192256.0,
+            "61": 2196192256.0,
+            "62": 2196192256.0,
+            "63": 2196192256.0,
+            "64": 2196192256.0,
+            "65": 2196192256.0,
+            "66": 2196192256.0,
+            "67": 2196192256.0,
+            "68": 2196192256.0,
+            "69": 2196192256.0,
+            "70": 2196192256.0,
+            "71": 2196192256.0,
+            "72": 2196192256.0,
+            "73": 2196192256.0,
+            "74": 2196192256.0,
+            "75": 2196192256.0,
+            "76": 2196192256.0,
+            "77": 2196192256.0,
+            "78": 2196192256.0,
+            "79": 2196192256.0,
+            "80": 2196192256.0,
+            "81": 2196192256.0,
+            "82": 2196192256.0,
+            "83": 2196192256.0,
+            "84": 2196192256.0,
+            "85": 2196192256.0,
+            "86": 2196192256.0,
+            "87": 2196192256.0,
+            "88": 2196192256.0,
+            "89": 2196192256.0,
+            "90": 2196192256.0,
+            "91": 2196192256.0,
+            "92": 2196192256.0,
+            "93": 2196192256.0,
+            "94": 2196192256.0,
+            "95": 2196192256.0,
+            "96": 2196192256.0,
+            "97": 2196192256.0,
+            "98": 2196192256.0,
+            "99": 2196192256.0,
+            "100": 2196192256.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2443624960.0,
-            "2": 3375193600.0,
-            "3": 3375193600.0,
-            "4": 3375193600.0,
-            "5": 3375193600.0,
-            "6": 3375193600.0,
-            "7": 3375193600.0,
-            "8": 3375193600.0,
-            "9": 3375193600.0,
-            "10": 3375193600.0,
-            "11": 3375193600.0,
-            "12": 3375193600.0,
-            "13": 3375193600.0,
-            "14": 3375193600.0,
-            "15": 3375193600.0,
-            "16": 3375193600.0,
-            "17": 3375193600.0,
-            "18": 3375193600.0,
-            "19": 3375193600.0,
-            "20": 3375193600.0,
-            "21": 3375193600.0,
-            "22": 3375193600.0,
-            "23": 3375193600.0,
-            "24": 3375193600.0,
-            "25": 3375193600.0,
-            "26": 3375193600.0,
-            "27": 3375193600.0,
-            "28": 3375193600.0,
-            "29": 3375193600.0,
-            "30": 3375193600.0,
-            "31": 3375193600.0,
-            "32": 3375193600.0,
-            "33": 3375193600.0,
-            "34": 3375193600.0,
-            "35": 3375193600.0,
-            "36": 3375193600.0,
-            "37": 3375193600.0,
-            "38": 3375193600.0,
-            "39": 3375193600.0,
-            "40": 3375193600.0,
-            "41": 3375193600.0,
-            "42": 3375193600.0,
-            "43": 3375193600.0,
-            "44": 3375193600.0,
-            "45": 3375193600.0,
-            "46": 3375193600.0,
-            "47": 3375193600.0,
-            "48": 3375193600.0,
-            "49": 3375193600.0,
-            "50": 3375193600.0,
-            "51": 3375193600.0,
-            "52": 3375193600.0,
-            "53": 3375193600.0,
-            "54": 3375193600.0,
-            "55": 3375193600.0,
-            "56": 3375193600.0,
-            "57": 3375193600.0,
-            "58": 3375193600.0,
-            "59": 3375193600.0,
-            "60": 3375193600.0,
-            "61": 3375193600.0,
-            "62": 3375193600.0,
-            "63": 3375193600.0,
-            "64": 3375193600.0,
-            "65": 3375193600.0,
-            "66": 3375193600.0,
-            "67": 3375193600.0,
-            "68": 3375193600.0,
-            "69": 3375193600.0,
-            "70": 3375193600.0,
-            "71": 3375193600.0,
-            "72": 3375193600.0,
-            "73": 3375193600.0,
-            "74": 3375193600.0,
-            "75": 3375193600.0,
-            "76": 3375193600.0,
-            "77": 3375193600.0,
-            "78": 3375193600.0,
-            "79": 3375193600.0,
-            "80": 3375193600.0,
-            "81": 3375193600.0,
-            "82": 3375193600.0,
-            "83": 3375193600.0,
-            "84": 3375193600.0,
-            "85": 3375193600.0,
-            "86": 3375193600.0,
-            "87": 3375193600.0,
-            "88": 3375193600.0,
-            "89": 3375193600.0,
-            "90": 3375193600.0,
-            "91": 3375193600.0,
-            "92": 3375193600.0,
-            "93": 3375193600.0,
-            "94": 3375193600.0,
-            "95": 3375193600.0,
-            "96": 3375193600.0,
-            "97": 3375193600.0,
-            "98": 3375193600.0,
-            "99": 3375193600.0,
-            "100": 3375193600.0
+            "1": 2444149248.0,
+            "2": 3377290752.0,
+            "3": 3377290752.0,
+            "4": 3377290752.0,
+            "5": 3377290752.0,
+            "6": 3377290752.0,
+            "7": 3377290752.0,
+            "8": 3377290752.0,
+            "9": 3377290752.0,
+            "10": 3377290752.0,
+            "11": 3377290752.0,
+            "12": 3377290752.0,
+            "13": 3377290752.0,
+            "14": 3377290752.0,
+            "15": 3377290752.0,
+            "16": 3377290752.0,
+            "17": 3377290752.0,
+            "18": 3377290752.0,
+            "19": 3377290752.0,
+            "20": 3377290752.0,
+            "21": 3377290752.0,
+            "22": 3377290752.0,
+            "23": 3377290752.0,
+            "24": 3377290752.0,
+            "25": 3377290752.0,
+            "26": 3377290752.0,
+            "27": 3377290752.0,
+            "28": 3377290752.0,
+            "29": 3377290752.0,
+            "30": 3377290752.0,
+            "31": 3377290752.0,
+            "32": 3377290752.0,
+            "33": 3377290752.0,
+            "34": 3377290752.0,
+            "35": 3377290752.0,
+            "36": 3377290752.0,
+            "37": 3377290752.0,
+            "38": 3377290752.0,
+            "39": 3377290752.0,
+            "40": 3377290752.0,
+            "41": 3377290752.0,
+            "42": 3377290752.0,
+            "43": 3377290752.0,
+            "44": 3377290752.0,
+            "45": 3377290752.0,
+            "46": 3377290752.0,
+            "47": 3377290752.0,
+            "48": 3377290752.0,
+            "49": 3377290752.0,
+            "50": 3377290752.0,
+            "51": 3377290752.0,
+            "52": 3377290752.0,
+            "53": 3377290752.0,
+            "54": 3377290752.0,
+            "55": 3377290752.0,
+            "56": 3377290752.0,
+            "57": 3377290752.0,
+            "58": 3377290752.0,
+            "59": 3377290752.0,
+            "60": 3377290752.0,
+            "61": 3377290752.0,
+            "62": 3377290752.0,
+            "63": 3377290752.0,
+            "64": 3377290752.0,
+            "65": 3377290752.0,
+            "66": 3377290752.0,
+            "67": 3377290752.0,
+            "68": 3377290752.0,
+            "69": 3377290752.0,
+            "70": 3377290752.0,
+            "71": 3377290752.0,
+            "72": 3377290752.0,
+            "73": 3377290752.0,
+            "74": 3377290752.0,
+            "75": 3377290752.0,
+            "76": 3377290752.0,
+            "77": 3377290752.0,
+            "78": 3377290752.0,
+            "79": 3377290752.0,
+            "80": 3377290752.0,
+            "81": 3377290752.0,
+            "82": 3377290752.0,
+            "83": 3377290752.0,
+            "84": 3377290752.0,
+            "85": 3377290752.0,
+            "86": 3377290752.0,
+            "87": 3377290752.0,
+            "88": 3377290752.0,
+            "89": 3377290752.0,
+            "90": 3377290752.0,
+            "91": 3377290752.0,
+            "92": 3377290752.0,
+            "93": 3377290752.0,
+            "94": 3377290752.0,
+            "95": 3377290752.0,
+            "96": 3377290752.0,
+            "97": 3377290752.0,
+            "98": 3377290752.0,
+            "99": 3377290752.0,
+            "100": 3377290752.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.37156,
-            "2": 0.38887,
-            "3": 0.36602,
-            "4": 0.35866,
-            "5": 0.36165,
-            "6": 0.37465,
-            "7": 0.35731,
-            "8": 0.3641,
-            "9": 0.35988,
-            "10": 0.35622,
-            "11": 0.36397,
-            "12": 0.36059,
-            "13": 0.35322,
-            "14": 0.36378,
-            "15": 0.35044,
-            "16": 0.351,
-            "17": 0.3614,
-            "18": 0.3499,
-            "19": 0.3502,
-            "20": 0.35899,
-            "21": 0.34832,
-            "22": 0.35463,
-            "23": 0.36264,
-            "24": 0.3582,
-            "25": 0.68028,
-            "26": 0.35807,
-            "27": 0.36086,
-            "28": 0.3546,
-            "29": 0.35008,
-            "30": 0.36639,
-            "31": 0.35917,
-            "32": 0.35093,
-            "33": 0.42545,
-            "34": 0.36458,
-            "35": 0.36139,
-            "36": 0.66018,
-            "37": 0.36179,
-            "38": 0.35264,
-            "39": 0.35347,
-            "40": 0.35947,
-            "41": 0.65933,
-            "42": 0.36488,
-            "43": 0.35596,
-            "44": 0.35639,
-            "45": 0.35817,
-            "46": 0.35914,
-            "47": 0.65482,
-            "48": 0.35543,
-            "49": 0.3548,
-            "50": 0.36559,
-            "51": 0.3585,
-            "52": 0.35668,
-            "53": 0.3592,
-            "54": 0.35503,
-            "55": 0.36108,
-            "56": 0.74128,
-            "57": 0.36657,
-            "58": 0.36018,
-            "59": 0.35608,
-            "60": 0.36593,
-            "61": 0.35388,
-            "62": 0.35617,
-            "63": 0.63145,
-            "64": 0.35737,
-            "65": 0.36509,
-            "66": 0.35793,
-            "67": 0.36215,
-            "68": 0.35502,
-            "69": 0.35608,
-            "70": 0.36406,
-            "71": 0.35939,
-            "72": 0.36012,
-            "73": 0.36102,
-            "74": 0.35997,
-            "75": 0.35821,
-            "76": 0.36372,
-            "77": 0.36015,
-            "78": 0.36089,
-            "79": 0.3626,
-            "80": 0.36632,
-            "81": 0.36481,
-            "82": 0.38444,
-            "83": 0.36154,
-            "84": 0.37204,
-            "85": 0.35784,
-            "86": 0.35591,
-            "87": 0.36678,
-            "88": 0.73353,
-            "89": 0.36867,
-            "90": 0.36231,
-            "91": 0.36826,
-            "92": 0.35945,
-            "93": 0.36394,
-            "94": 0.43835,
-            "95": 0.36152,
-            "96": 0.36154,
-            "97": 0.35778,
-            "98": 0.35857,
-            "99": 0.36061,
-            "100": 0.35857
+            "1": 25.09235,
+            "2": 0.40134,
+            "3": 0.33175,
+            "4": 0.31603,
+            "5": 0.31264,
+            "6": 0.3171,
+            "7": 0.31353,
+            "8": 0.31164,
+            "9": 0.31158,
+            "10": 0.31146,
+            "11": 0.3125,
+            "12": 0.31264,
+            "13": 0.31346,
+            "14": 0.317,
+            "15": 0.32556,
+            "16": 0.31934,
+            "17": 0.69799,
+            "18": 0.32677,
+            "19": 0.31967,
+            "20": 0.3173,
+            "21": 0.31556,
+            "22": 0.31356,
+            "23": 0.31832,
+            "24": 0.31564,
+            "25": 0.31197,
+            "26": 0.31173,
+            "27": 0.31328,
+            "28": 0.31264,
+            "29": 0.31324,
+            "30": 0.31156,
+            "31": 0.31097,
+            "32": 0.31333,
+            "33": 0.31645,
+            "34": 0.31419,
+            "35": 0.31325,
+            "36": 0.30809,
+            "37": 0.30923,
+            "38": 0.30875,
+            "39": 0.30819,
+            "40": 0.31109,
+            "41": 0.30849,
+            "42": 0.30871,
+            "43": 0.72163,
+            "44": 0.70555,
+            "45": 0.31196,
+            "46": 0.30971,
+            "47": 0.90209,
+            "48": 0.30901,
+            "49": 0.30899,
+            "50": 0.31177,
+            "51": 0.31251,
+            "52": 0.30763,
+            "53": 0.31005,
+            "54": 0.30977,
+            "55": 0.30883,
+            "56": 0.30955,
+            "57": 0.30687,
+            "58": 0.30701,
+            "59": 0.30937,
+            "60": 0.3093,
+            "61": 0.30827,
+            "62": 0.30923,
+            "63": 0.30942,
+            "64": 0.30862,
+            "65": 0.31004,
+            "66": 0.30958,
+            "67": 0.3081,
+            "68": 0.30948,
+            "69": 0.30866,
+            "70": 0.30848,
+            "71": 0.32952,
+            "72": 0.32928,
+            "73": 0.32761,
+            "74": 0.32983,
+            "75": 0.32798,
+            "76": 0.40614,
+            "77": 0.33024,
+            "78": 0.33019,
+            "79": 0.31035,
+            "80": 0.30849,
+            "81": 0.31139,
+            "82": 0.3106,
+            "83": 0.30861,
+            "84": 0.3083,
+            "85": 0.30817,
+            "86": 0.31324,
+            "87": 0.31432,
+            "88": 0.31032,
+            "89": 0.30979,
+            "90": 0.30748,
+            "91": 0.30871,
+            "92": 0.31423,
+            "93": 0.31134,
+            "94": 0.31265,
+            "95": 0.30865,
+            "96": 0.30849,
+            "97": 0.31368,
+            "98": 0.30792,
+            "99": 0.31014,
+            "100": 0.30734
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json
index db68b291113..df17a69a638 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.38736, "5": 9.38281, "10": 9.06783, "15": 8.65706, "20": 8.26603, "25": 7.98158, "30": 7.87182, "35": 7.66308, "40": 7.50499, "45": 7.3523, "50": 7.17986, "55": 7.15383, "60": 7.14998, "65": 6.99542, "70": 7.0643, "75": 7.06414, "80": 6.94493, "85": 6.8595, "90": 7.25918, "95": 6.84927, "100": 6.99082}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43296.0, "5": 45373.0, "10": 45357.0, "15": 43909.0, "20": 44765.0, "25": 42457.0, "30": 43999.0, "35": 43276.0, "40": 43214.0, "45": 43265.0, "50": 43383.0, "55": 43861.0, "60": 41267.0, "65": 44696.0, "70": 45504.0, "75": 44661.0, "80": 41077.0, "85": 43970.0, "90": 44657.0, "95": 44047.0, "100": 42429.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2166438912.0, "5": 2166438912.0, "10": 2166438912.0, "15": 2166438912.0, "20": 2166438912.0, "25": 2166438912.0, "30": 2166438912.0, "35": 2166438912.0, "40": 2166438912.0, "45": 2166438912.0, "50": 2166438912.0, "55": 2166438912.0, "60": 2166438912.0, "65": 2166438912.0, "70": 2166438912.0, "75": 2166438912.0, "80": 2166438912.0, "85": 2166438912.0, "90": 2166438912.0, "95": 2166438912.0, "100": 2166438912.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2274851328.0, "5": 3206419968.0, "10": 3206419968.0, "15": 3206419968.0, "20": 3206419968.0, "25": 3206419968.0, "30": 3206419968.0, "35": 3206419968.0, "40": 3206419968.0, "45": 3206419968.0, "50": 3206419968.0, "55": 3206419968.0, "60": 3206419968.0, "65": 3206419968.0, "70": 3206419968.0, "75": 3206419968.0, "80": 3206419968.0, "85": 3206419968.0, "90": 3206419968.0, "95": 3206419968.0, "100": 3206419968.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.73958, "5": 0.5162, "10": 0.51641, "15": 0.51693, "20": 0.93549, "25": 0.52094, "30": 1.03416, "35": 0.51, "40": 0.85483, "45": 0.50998, "50": 0.51431, "55": 0.51184, "60": 0.51243, "65": 0.51243, "70": 0.52038, "75": 0.51387, "80": 0.51875, "85": 0.51808, "90": 0.52661, "95": 0.51088, "100": 0.51108}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.38736,
+            "2": 10.37971,
+            "3": 9.79428,
+            "4": 9.59941,
+            "5": 9.38281,
+            "6": 9.40765,
+            "7": 9.31116,
+            "8": 9.25004,
+            "9": 9.1304,
+            "10": 9.06783,
+            "11": 8.89519,
+            "12": 8.8149,
+            "13": 8.82749,
+            "14": 8.69768,
+            "15": 8.65706,
+            "16": 8.54479,
+            "17": 8.50168,
+            "18": 8.39069,
+            "19": 8.36692,
+            "20": 8.26603,
+            "21": 8.27533,
+            "22": 8.14757,
+            "23": 8.0735,
+            "24": 8.12127,
+            "25": 7.98158,
+            "26": 8.09181,
+            "27": 7.87361,
+            "28": 7.96832,
+            "29": 7.80579,
+            "30": 7.87182,
+            "31": 7.818,
+            "32": 7.69078,
+            "33": 7.7864,
+            "34": 7.55667,
+            "35": 7.66308,
+            "36": 7.52559,
+            "37": 7.44779,
+            "38": 7.50335,
+            "39": 7.45281,
+            "40": 7.50499,
+            "41": 7.38901,
+            "42": 7.36263,
+            "43": 7.43543,
+            "44": 7.37578,
+            "45": 7.3523,
+            "46": 7.2817,
+            "47": 7.46121,
+            "48": 7.29037,
+            "49": 7.35179,
+            "50": 7.17986,
+            "51": 7.36821,
+            "52": 7.13332,
+            "53": 7.11532,
+            "54": 7.23214,
+            "55": 7.15383,
+            "56": 7.22184,
+            "57": 7.33328,
+            "58": 7.02116,
+            "59": 7.11467,
+            "60": 7.14998,
+            "61": 7.1117,
+            "62": 7.25117,
+            "63": 7.15586,
+            "64": 7.08539,
+            "65": 6.99542,
+            "66": 7.05924,
+            "67": 7.04804,
+            "68": 7.13906,
+            "69": 7.03428,
+            "70": 7.0643,
+            "71": 6.9218,
+            "72": 7.00511,
+            "73": 6.97917,
+            "74": 6.92066,
+            "75": 7.06414,
+            "76": 6.97532,
+            "77": 7.0837,
+            "78": 7.01986,
+            "79": 6.86115,
+            "80": 6.94493,
+            "81": 6.97847,
+            "82": 7.06834,
+            "83": 6.99434,
+            "84": 7.01114,
+            "85": 6.8595,
+            "86": 7.04211,
+            "87": 6.98111,
+            "88": 6.91353,
+            "89": 6.81096,
+            "90": 7.25918,
+            "91": 6.71195,
+            "92": 7.05431,
+            "93": 6.91084,
+            "94": 7.06872,
+            "95": 6.84927,
+            "96": 6.98126,
+            "97": 6.96743,
+            "98": 6.89421,
+            "99": 7.0152,
+            "100": 6.99082
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43296.0,
+            "2": 44067.0,
+            "3": 44759.0,
+            "4": 42367.0,
+            "5": 45373.0,
+            "6": 40966.0,
+            "7": 43147.0,
+            "8": 45448.0,
+            "9": 42470.0,
+            "10": 45357.0,
+            "11": 43969.0,
+            "12": 44583.0,
+            "13": 43897.0,
+            "14": 46189.0,
+            "15": 43909.0,
+            "16": 41613.0,
+            "17": 43823.0,
+            "18": 44678.0,
+            "19": 42556.0,
+            "20": 44765.0,
+            "21": 44723.0,
+            "22": 41820.0,
+            "23": 45463.0,
+            "24": 43077.0,
+            "25": 42457.0,
+            "26": 43913.0,
+            "27": 46221.0,
+            "28": 46390.0,
+            "29": 46160.0,
+            "30": 43999.0,
+            "31": 41276.0,
+            "32": 43316.0,
+            "33": 45432.0,
+            "34": 43303.0,
+            "35": 43276.0,
+            "36": 42461.0,
+            "37": 40045.0,
+            "38": 42557.0,
+            "39": 44701.0,
+            "40": 43214.0,
+            "41": 44667.0,
+            "42": 43241.0,
+            "43": 45448.0,
+            "44": 44605.0,
+            "45": 43265.0,
+            "46": 43892.0,
+            "47": 42375.0,
+            "48": 44656.0,
+            "49": 43182.0,
+            "50": 43383.0,
+            "51": 41130.0,
+            "52": 43841.0,
+            "53": 43918.0,
+            "54": 41894.0,
+            "55": 43861.0,
+            "56": 43229.0,
+            "57": 42488.0,
+            "58": 43831.0,
+            "59": 44616.0,
+            "60": 41267.0,
+            "61": 39701.0,
+            "62": 44746.0,
+            "63": 44704.0,
+            "64": 45346.0,
+            "65": 44696.0,
+            "66": 45356.0,
+            "67": 43133.0,
+            "68": 42535.0,
+            "69": 43803.0,
+            "70": 45504.0,
+            "71": 43309.0,
+            "72": 44800.0,
+            "73": 45401.0,
+            "74": 42467.0,
+            "75": 44661.0,
+            "76": 43882.0,
+            "77": 42110.0,
+            "78": 40337.0,
+            "79": 38924.0,
+            "80": 41077.0,
+            "81": 45349.0,
+            "82": 43228.0,
+            "83": 38446.0,
+            "84": 42443.0,
+            "85": 43970.0,
+            "86": 45668.0,
+            "87": 40846.0,
+            "88": 41780.0,
+            "89": 41056.0,
+            "90": 44657.0,
+            "91": 46133.0,
+            "92": 41748.0,
+            "93": 43205.0,
+            "94": 39556.0,
+            "95": 44047.0,
+            "96": 44668.0,
+            "97": 45383.0,
+            "98": 41817.0,
+            "99": 45425.0,
+            "100": 42429.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2166438912.0,
+            "2": 2166438912.0,
+            "3": 2166438912.0,
+            "4": 2166438912.0,
+            "5": 2166438912.0,
+            "6": 2166438912.0,
+            "7": 2166438912.0,
+            "8": 2166438912.0,
+            "9": 2166438912.0,
+            "10": 2166438912.0,
+            "11": 2166438912.0,
+            "12": 2166438912.0,
+            "13": 2166438912.0,
+            "14": 2166438912.0,
+            "15": 2166438912.0,
+            "16": 2166438912.0,
+            "17": 2166438912.0,
+            "18": 2166438912.0,
+            "19": 2166438912.0,
+            "20": 2166438912.0,
+            "21": 2166438912.0,
+            "22": 2166438912.0,
+            "23": 2166438912.0,
+            "24": 2166438912.0,
+            "25": 2166438912.0,
+            "26": 2166438912.0,
+            "27": 2166438912.0,
+            "28": 2166438912.0,
+            "29": 2166438912.0,
+            "30": 2166438912.0,
+            "31": 2166438912.0,
+            "32": 2166438912.0,
+            "33": 2166438912.0,
+            "34": 2166438912.0,
+            "35": 2166438912.0,
+            "36": 2166438912.0,
+            "37": 2166438912.0,
+            "38": 2166438912.0,
+            "39": 2166438912.0,
+            "40": 2166438912.0,
+            "41": 2166438912.0,
+            "42": 2166438912.0,
+            "43": 2166438912.0,
+            "44": 2166438912.0,
+            "45": 2166438912.0,
+            "46": 2166438912.0,
+            "47": 2166438912.0,
+            "48": 2166438912.0,
+            "49": 2166438912.0,
+            "50": 2166438912.0,
+            "51": 2166438912.0,
+            "52": 2166438912.0,
+            "53": 2166438912.0,
+            "54": 2166438912.0,
+            "55": 2166438912.0,
+            "56": 2166438912.0,
+            "57": 2166438912.0,
+            "58": 2166438912.0,
+            "59": 2166438912.0,
+            "60": 2166438912.0,
+            "61": 2166438912.0,
+            "62": 2166438912.0,
+            "63": 2166438912.0,
+            "64": 2166438912.0,
+            "65": 2166438912.0,
+            "66": 2166438912.0,
+            "67": 2166438912.0,
+            "68": 2166438912.0,
+            "69": 2166438912.0,
+            "70": 2166438912.0,
+            "71": 2166438912.0,
+            "72": 2166438912.0,
+            "73": 2166438912.0,
+            "74": 2166438912.0,
+            "75": 2166438912.0,
+            "76": 2166438912.0,
+            "77": 2166438912.0,
+            "78": 2166438912.0,
+            "79": 2166438912.0,
+            "80": 2166438912.0,
+            "81": 2166438912.0,
+            "82": 2166438912.0,
+            "83": 2166438912.0,
+            "84": 2166438912.0,
+            "85": 2166438912.0,
+            "86": 2166438912.0,
+            "87": 2166438912.0,
+            "88": 2166438912.0,
+            "89": 2166438912.0,
+            "90": 2166438912.0,
+            "91": 2166438912.0,
+            "92": 2166438912.0,
+            "93": 2166438912.0,
+            "94": 2166438912.0,
+            "95": 2166438912.0,
+            "96": 2166438912.0,
+            "97": 2166438912.0,
+            "98": 2166438912.0,
+            "99": 2166438912.0,
+            "100": 2166438912.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2274851328.0,
+            "2": 3206419968.0,
+            "3": 3206419968.0,
+            "4": 3206419968.0,
+            "5": 3206419968.0,
+            "6": 3206419968.0,
+            "7": 3206419968.0,
+            "8": 3206419968.0,
+            "9": 3206419968.0,
+            "10": 3206419968.0,
+            "11": 3206419968.0,
+            "12": 3206419968.0,
+            "13": 3206419968.0,
+            "14": 3206419968.0,
+            "15": 3206419968.0,
+            "16": 3206419968.0,
+            "17": 3206419968.0,
+            "18": 3206419968.0,
+            "19": 3206419968.0,
+            "20": 3206419968.0,
+            "21": 3206419968.0,
+            "22": 3206419968.0,
+            "23": 3206419968.0,
+            "24": 3206419968.0,
+            "25": 3206419968.0,
+            "26": 3206419968.0,
+            "27": 3206419968.0,
+            "28": 3206419968.0,
+            "29": 3206419968.0,
+            "30": 3206419968.0,
+            "31": 3206419968.0,
+            "32": 3206419968.0,
+            "33": 3206419968.0,
+            "34": 3206419968.0,
+            "35": 3206419968.0,
+            "36": 3206419968.0,
+            "37": 3206419968.0,
+            "38": 3206419968.0,
+            "39": 3206419968.0,
+            "40": 3206419968.0,
+            "41": 3206419968.0,
+            "42": 3206419968.0,
+            "43": 3206419968.0,
+            "44": 3206419968.0,
+            "45": 3206419968.0,
+            "46": 3206419968.0,
+            "47": 3206419968.0,
+            "48": 3206419968.0,
+            "49": 3206419968.0,
+            "50": 3206419968.0,
+            "51": 3206419968.0,
+            "52": 3206419968.0,
+            "53": 3206419968.0,
+            "54": 3206419968.0,
+            "55": 3206419968.0,
+            "56": 3206419968.0,
+            "57": 3206419968.0,
+            "58": 3206419968.0,
+            "59": 3206419968.0,
+            "60": 3206419968.0,
+            "61": 3206419968.0,
+            "62": 3206419968.0,
+            "63": 3206419968.0,
+            "64": 3206419968.0,
+            "65": 3206419968.0,
+            "66": 3206419968.0,
+            "67": 3206419968.0,
+            "68": 3206419968.0,
+            "69": 3206419968.0,
+            "70": 3206419968.0,
+            "71": 3206419968.0,
+            "72": 3206419968.0,
+            "73": 3206419968.0,
+            "74": 3206419968.0,
+            "75": 3206419968.0,
+            "76": 3206419968.0,
+            "77": 3206419968.0,
+            "78": 3206419968.0,
+            "79": 3206419968.0,
+            "80": 3206419968.0,
+            "81": 3206419968.0,
+            "82": 3206419968.0,
+            "83": 3206419968.0,
+            "84": 3206419968.0,
+            "85": 3206419968.0,
+            "86": 3206419968.0,
+            "87": 3206419968.0,
+            "88": 3206419968.0,
+            "89": 3206419968.0,
+            "90": 3206419968.0,
+            "91": 3206419968.0,
+            "92": 3206419968.0,
+            "93": 3206419968.0,
+            "94": 3206419968.0,
+            "95": 3206419968.0,
+            "96": 3206419968.0,
+            "97": 3206419968.0,
+            "98": 3206419968.0,
+            "99": 3206419968.0,
+            "100": 3206419968.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 15.84492,
+            "2": 0.56374,
+            "3": 0.48979,
+            "4": 0.47999,
+            "5": 0.47943,
+            "6": 0.4785,
+            "7": 0.48067,
+            "8": 0.98328,
+            "9": 0.47936,
+            "10": 0.47967,
+            "11": 0.48109,
+            "12": 0.49359,
+            "13": 0.50052,
+            "14": 0.4915,
+            "15": 0.49405,
+            "16": 0.50085,
+            "17": 0.49211,
+            "18": 0.51598,
+            "19": 0.50449,
+            "20": 0.4857,
+            "21": 0.48578,
+            "22": 0.48623,
+            "23": 0.48781,
+            "24": 0.87325,
+            "25": 0.48523,
+            "26": 0.92864,
+            "27": 0.4864,
+            "28": 0.48651,
+            "29": 0.48435,
+            "30": 0.49416,
+            "31": 1.05489,
+            "32": 1.10052,
+            "33": 0.49491,
+            "34": 0.49294,
+            "35": 0.48798,
+            "36": 0.48781,
+            "37": 0.48704,
+            "38": 0.49022,
+            "39": 0.48933,
+            "40": 0.48881,
+            "41": 0.48549,
+            "42": 0.48579,
+            "43": 0.48689,
+            "44": 0.48684,
+            "45": 0.48751,
+            "46": 0.48731,
+            "47": 0.48706,
+            "48": 0.48816,
+            "49": 0.48587,
+            "50": 0.48676,
+            "51": 0.4868,
+            "52": 0.48709,
+            "53": 0.4868,
+            "54": 0.48647,
+            "55": 0.48914,
+            "56": 0.48748,
+            "57": 0.487,
+            "58": 0.48636,
+            "59": 0.48608,
+            "60": 0.48583,
+            "61": 0.48634,
+            "62": 0.48753,
+            "63": 0.48694,
+            "64": 0.48525,
+            "65": 0.4853,
+            "66": 0.48545,
+            "67": 0.48738,
+            "68": 0.48709,
+            "69": 0.48727,
+            "70": 0.48494,
+            "71": 0.48573,
+            "72": 0.48622,
+            "73": 0.48642,
+            "74": 0.48627,
+            "75": 0.48837,
+            "76": 0.48773,
+            "77": 0.48748,
+            "78": 0.49724,
+            "79": 0.49868,
+            "80": 0.48848,
+            "81": 0.48729,
+            "82": 0.48827,
+            "83": 0.48649,
+            "84": 0.48563,
+            "85": 0.4887,
+            "86": 0.49085,
+            "87": 0.50008,
+            "88": 0.48807,
+            "89": 0.48771,
+            "90": 0.49194,
+            "91": 0.48913,
+            "92": 0.48833,
+            "93": 0.48713,
+            "94": 0.48704,
+            "95": 0.48785,
+            "96": 0.489,
+            "97": 0.48763,
+            "98": 0.49533,
+            "99": 0.49947,
+            "100": 0.48805
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json
index 30c495148f4..6b1bd4f8405 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json
@@ -32,78 +32,78 @@
             "26": 8.10636,
             "27": 7.88853,
             "28": 7.97024,
-            "29": 7.8121,
-            "30": 7.87698,
-            "31": 7.82339,
-            "32": 7.70086,
-            "33": 7.80317,
-            "34": 7.56843,
-            "35": 7.67276,
-            "36": 7.54942,
-            "37": 7.475,
-            "38": 7.51068,
-            "39": 7.49979,
-            "40": 7.51131,
-            "41": 7.41252,
-            "42": 7.38333,
-            "43": 7.4414,
-            "44": 7.39857,
-            "45": 7.37352,
-            "46": 7.28824,
-            "47": 7.4683,
-            "48": 7.29457,
-            "49": 7.35181,
-            "50": 7.17223,
-            "51": 7.37216,
-            "52": 7.14588,
-            "53": 7.12384,
-            "54": 7.23984,
-            "55": 7.15454,
-            "56": 7.23308,
-            "57": 7.33501,
-            "58": 7.01226,
-            "59": 7.12063,
-            "60": 7.15043,
-            "61": 7.11076,
-            "62": 7.26458,
-            "63": 7.1544,
-            "64": 7.08651,
-            "65": 6.99077,
-            "66": 7.05503,
+            "29": 7.81206,
+            "30": 7.87695,
+            "31": 7.82331,
+            "32": 7.70095,
+            "33": 7.80328,
+            "34": 7.56837,
+            "35": 7.67277,
+            "36": 7.54939,
+            "37": 7.47502,
+            "38": 7.51064,
+            "39": 7.49974,
+            "40": 7.51136,
+            "41": 7.41248,
+            "42": 7.38332,
+            "43": 7.44137,
+            "44": 7.39868,
+            "45": 7.37355,
+            "46": 7.2884,
+            "47": 7.46831,
+            "48": 7.29467,
+            "49": 7.3518,
+            "50": 7.17242,
+            "51": 7.37224,
+            "52": 7.14591,
+            "53": 7.12383,
+            "54": 7.23985,
+            "55": 7.15463,
+            "56": 7.23305,
+            "57": 7.33504,
+            "58": 7.01209,
+            "59": 7.12052,
+            "60": 7.15042,
+            "61": 7.11083,
+            "62": 7.26448,
+            "63": 7.15439,
+            "64": 7.08647,
+            "65": 6.99081,
+            "66": 7.05501,
             "67": 7.04463,
-            "68": 7.136,
-            "69": 7.03404,
-            "70": 7.05994,
-            "71": 6.90146,
-            "72": 6.99845,
-            "73": 6.97783,
-            "74": 6.92205,
-            "75": 7.06268,
-            "76": 6.95612,
-            "77": 7.08838,
-            "78": 7.02608,
-            "79": 6.85354,
-            "80": 6.93543,
-            "81": 6.97396,
-            "82": 7.05854,
-            "83": 6.98003,
-            "84": 7.00602,
-            "85": 6.84771,
-            "86": 7.04197,
-            "87": 6.97366,
-            "88": 6.90817,
-            "89": 6.80902,
-            "90": 7.23999,
-            "91": 6.70221,
-            "92": 7.0543,
-            "93": 6.89332,
-            "94": 7.05002,
-            "95": 6.84547,
-            "96": 6.96202,
-            "97": 6.95355,
-            "98": 6.8731,
-            "99": 6.99831,
-            "100": 6.98508
+            "68": 7.13589,
+            "69": 7.03403,
+            "70": 7.05993,
+            "71": 6.90134,
+            "72": 6.99846,
+            "73": 6.97799,
+            "74": 6.92221,
+            "75": 7.06246,
+            "76": 6.95628,
+            "77": 7.08818,
+            "78": 7.02594,
+            "79": 6.85356,
+            "80": 6.93552,
+            "81": 6.97408,
+            "82": 7.05838,
+            "83": 6.98013,
+            "84": 7.00615,
+            "85": 6.84767,
+            "86": 7.04208,
+            "87": 6.97372,
+            "88": 6.90816,
+            "89": 6.80892,
+            "90": 7.23979,
+            "91": 6.70218,
+            "92": 7.05429,
+            "93": 6.89324,
+            "94": 7.05007,
+            "95": 6.84548,
+            "96": 6.96184,
+            "97": 6.95372,
+            "98": 6.87307,
+            "99": 6.99837,
+            "100": 6.98518
         }
     },
     "num-zeros": {
@@ -139,78 +139,78 @@
             "26": 43923.0,
             "27": 46212.0,
             "28": 46362.0,
-            "29": 46133.0,
-            "30": 43978.0,
-            "31": 41220.0,
-            "32": 43307.0,
-            "33": 45440.0,
-            "34": 43284.0,
-            "35": 43248.0,
-            "36": 42437.0,
-            "37": 40066.0,
-            "38": 42483.0,
-            "39": 44702.0,
-            "40": 43230.0,
-            "41": 44672.0,
-            "42": 43202.0,
-            "43": 45459.0,
-            "44": 44609.0,
-            "45": 43265.0,
-            "46": 43915.0,
-            "47": 42366.0,
-            "48": 44650.0,
-            "49": 43139.0,
-            "50": 43399.0,
-            "51": 41159.0,
-            "52": 43818.0,
-            "53": 43924.0,
-            "54": 41952.0,
-            "55": 43866.0,
-            "56": 43239.0,
-            "57": 42540.0,
-            "58": 43856.0,
-            "59": 44589.0,
-            "60": 41152.0,
-            "61": 39709.0,
-            "62": 44822.0,
-            "63": 44663.0,
-            "64": 45372.0,
+            "29": 46135.0,
+            "30": 43975.0,
+            "31": 41226.0,
+            "32": 43299.0,
+            "33": 45425.0,
+            "34": 43296.0,
+            "35": 43243.0,
+            "36": 42441.0,
+            "37": 40060.0,
+            "38": 42489.0,
+            "39": 44704.0,
+            "40": 43237.0,
+            "41": 44663.0,
+            "42": 43215.0,
+            "43": 45451.0,
+            "44": 44614.0,
+            "45": 43281.0,
+            "46": 43913.0,
+            "47": 42359.0,
+            "48": 44654.0,
+            "49": 43144.0,
+            "50": 43398.0,
+            "51": 41144.0,
+            "52": 43830.0,
+            "53": 43934.0,
+            "54": 41941.0,
+            "55": 43886.0,
+            "56": 43231.0,
+            "57": 42542.0,
+            "58": 43846.0,
+            "59": 44585.0,
+            "60": 41140.0,
+            "61": 39720.0,
+            "62": 44819.0,
+            "63": 44670.0,
+            "64": 45354.0,
             "65": 44676.0,
             "66": 45345.0,
-            "67": 43130.0,
-            "68": 42567.0,
-            "69": 43812.0,
-            "70": 45538.0,
-            "71": 43282.0,
-            "72": 44765.0,
-            "73": 45354.0,
-            "74": 42517.0,
-            "75": 44666.0,
+            "67": 43146.0,
+            "68": 42561.0,
+            "69": 43826.0,
+            "70": 45535.0,
+            "71": 43294.0,
+            "72": 44777.0,
+            "73": 45349.0,
+            "74": 42497.0,
+            "75": 44676.0,
             "76": 43904.0,
-            "77": 42041.0,
-            "78": 40320.0,
-            "79": 38914.0,
-            "80": 41081.0,
-            "81": 45333.0,
-            "82": 43195.0,
+            "77": 42038.0,
+            "78": 40306.0,
+            "79": 38925.0,
+            "80": 41075.0,
+            "81": 45335.0,
+            "82": 43207.0,
             "83": 38489.0,
-            "84": 42436.0,
-            "85": 43978.0,
-            "86": 45680.0,
-            "87": 40832.0,
-            "88": 41797.0,
-            "89": 41083.0,
-            "90": 44676.0,
-            "91": 46190.0,
-            "92": 41837.0,
-            "93": 43234.0,
+            "84": 42428.0,
+            "85": 43976.0,
+            "86": 45688.0,
+            "87": 40838.0,
+            "88": 41786.0,
+            "89": 41088.0,
+            "90": 44682.0,
+            "91": 46204.0,
+            "92": 41815.0,
+            "93": 43233.0,
             "94": 39504.0,
-            "95": 44067.0,
-            "96": 44684.0,
-            "97": 45419.0,
-            "98": 41854.0,
-            "99": 45431.0,
-            "100": 42479.0
+            "95": 44070.0,
+            "96": 44687.0,
+            "97": 45432.0,
+            "98": 41849.0,
+            "99": 45441.0,
+            "100": 42488.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2195405824.0,
-            "2": 2195405824.0,
-            "3": 2195405824.0,
-            "4": 2195405824.0,
-            "5": 2195405824.0,
-            "6": 2195405824.0,
-            "7": 2195405824.0,
-            "8": 2195405824.0,
-            "9": 2195405824.0,
-            "10": 2195405824.0,
-            "11": 2195405824.0,
-            "12": 2195405824.0,
-            "13": 2195405824.0,
-            "14": 2195405824.0,
-            "15": 2195405824.0,
-            "16": 2195405824.0,
-            "17": 2195405824.0,
-            "18": 2195405824.0,
-            "19": 2195405824.0,
-            "20": 2195405824.0,
-            "21": 2195405824.0,
-            "22": 2195405824.0,
-            "23": 2195405824.0,
-            "24": 2195405824.0,
-            "25": 2195405824.0,
-            "26": 2195405824.0,
-            "27": 2195405824.0,
-            "28": 2195405824.0,
-            "29": 2195405824.0,
-            "30": 2195405824.0,
-            "31": 2195405824.0,
-            "32": 2195405824.0,
-            "33": 2195405824.0,
-            "34": 2195405824.0,
-            "35": 2195405824.0,
-            "36": 2195405824.0,
-            "37": 2195405824.0,
-            "38": 2195405824.0,
-            "39": 2195405824.0,
-            "40": 2195405824.0,
-            "41": 2195405824.0,
-            "42": 2195405824.0,
-            "43": 2195405824.0,
-            "44": 2195405824.0,
-            "45": 2195405824.0,
-            "46": 2195405824.0,
-            "47": 2195405824.0,
-            "48": 2195405824.0,
-            "49": 2195405824.0,
-            "50": 2195405824.0,
-            "51": 2195405824.0,
-            "52": 2195405824.0,
-            "53": 2195405824.0,
-            "54": 2195405824.0,
-            "55": 2195405824.0,
-            "56": 2195405824.0,
-            "57": 2195405824.0,
-            "58": 2195405824.0,
-            "59": 2195405824.0,
-            "60": 2195405824.0,
-            "61": 2195405824.0,
-            "62": 2195405824.0,
-            "63": 2195405824.0,
-            "64": 2195405824.0,
-            "65": 2195405824.0,
-            "66": 2195405824.0,
-            "67": 2195405824.0,
-            "68": 2195405824.0,
-            "69": 2195405824.0,
-            "70": 2195405824.0,
-            "71": 2195405824.0,
-            "72": 2195405824.0,
-            "73": 2195405824.0,
-            "74": 2195405824.0,
-            "75": 2195405824.0,
-            "76": 2195405824.0,
-            "77": 2195405824.0,
-            "78": 2195405824.0,
-            "79": 2195405824.0,
-            "80": 2195405824.0,
-            "81": 2195405824.0,
-            "82": 2195405824.0,
-            "83": 2195405824.0,
-            "84": 2195405824.0,
-            "85": 2195405824.0,
-            "86": 2195405824.0,
-            "87": 2195405824.0,
-            "88": 2195405824.0,
-            "89": 2195405824.0,
-            "90": 2195405824.0,
-            "91": 2195405824.0,
-            "92": 2195405824.0,
-            "93": 2195405824.0,
-            "94": 2195405824.0,
-            "95": 2195405824.0,
-            "96": 2195405824.0,
-            "97": 2195405824.0,
-            "98": 2195405824.0,
-            "99": 2195405824.0,
-            "100": 2195405824.0
+            "1": 2197502976.0,
+            "2": 2197502976.0,
+            "3": 2197502976.0,
+            "4": 2197502976.0,
+            "5": 2197502976.0,
+            "6": 2197502976.0,
+            "7": 2197502976.0,
+            "8": 2197502976.0,
+            "9": 2197502976.0,
+            "10": 2197502976.0,
+            "11": 2197502976.0,
+            "12": 2197502976.0,
+            "13": 2197502976.0,
+            "14": 2197502976.0,
+            "15": 2197502976.0,
+            "16": 2197502976.0,
+            "17": 2197502976.0,
+            "18": 2197502976.0,
+            "19": 2197502976.0,
+            "20": 2197502976.0,
+            "21": 2197502976.0,
+            "22": 2197502976.0,
+            "23": 2197502976.0,
+            "24": 2197502976.0,
+            "25": 2197502976.0,
+            "26": 2197502976.0,
+            "27": 2197502976.0,
+            "28": 2197502976.0,
+            "29": 2197502976.0,
+            "30": 2197502976.0,
+            "31": 2197502976.0,
+            "32": 2197502976.0,
+            "33": 2197502976.0,
+            "34": 2197502976.0,
+            "35": 2197502976.0,
+            "36": 2197502976.0,
+            "37": 2197502976.0,
+            "38": 2197502976.0,
+            "39": 2197502976.0,
+            "40": 2197502976.0,
+            "41": 2197502976.0,
+            "42": 2197502976.0,
+            "43": 2197502976.0,
+            "44": 2197502976.0,
+            "45": 2197502976.0,
+            "46": 2197502976.0,
+            "47": 2197502976.0,
+            "48": 2197502976.0,
+            "49": 2197502976.0,
+            "50": 2197502976.0,
+            "51": 2197502976.0,
+            "52": 2197502976.0,
+            "53": 2197502976.0,
+            "54": 2197502976.0,
+            "55": 2197502976.0,
+            "56": 2197502976.0,
+            "57": 2197502976.0,
+            "58": 2197502976.0,
+            "59": 2197502976.0,
+            "60": 2197502976.0,
+            "61": 2197502976.0,
+            "62": 2197502976.0,
+            "63": 2197502976.0,
+            "64": 2197502976.0,
+            "65": 2197502976.0,
+            "66": 2197502976.0,
+            "67": 2197502976.0,
+            "68": 2197502976.0,
+            "69": 2197502976.0,
+            "70": 2197502976.0,
+            "71": 2197502976.0,
+            "72": 2197502976.0,
+            "73": 2197502976.0,
+            "74": 2197502976.0,
+            "75": 2197502976.0,
+            "76": 2197502976.0,
+            "77": 2197502976.0,
+            "78": 2197502976.0,
+            "79": 2197502976.0,
+            "80": 2197502976.0,
+            "81": 2197502976.0,
+            "82": 2197502976.0,
+            "83": 2197502976.0,
+            "84": 2197502976.0,
+            "85": 2197502976.0,
+            "86": 2197502976.0,
+            "87": 2197502976.0,
+            "88": 2197502976.0,
+            "89": 2197502976.0,
+            "90": 2197502976.0,
+            "91": 2197502976.0,
+            "92": 2197502976.0,
+            "93": 2197502976.0,
+            "94": 2197502976.0,
+            "95": 2197502976.0,
+            "96": 2197502976.0,
+            "97": 2197502976.0,
+            "98": 2197502976.0,
+            "99": 2197502976.0,
+            "100": 2197502976.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2302114304.0,
-            "2": 3236697600.0,
-            "3": 3236697600.0,
-            "4": 3236697600.0,
-            "5": 3236697600.0,
-            "6": 3236697600.0,
-            "7": 3236697600.0,
-            "8": 3236697600.0,
-            "9": 3236697600.0,
-            "10": 3236697600.0,
-            "11": 3236697600.0,
-            "12": 3236697600.0,
-            "13": 3236697600.0,
-            "14": 3236697600.0,
-            "15": 3236697600.0,
-            "16": 3236697600.0,
-            "17": 3236697600.0,
-            "18": 3236697600.0,
-            "19": 3236697600.0,
-            "20": 3236697600.0,
-            "21": 3236697600.0,
-            "22": 3236697600.0,
-            "23": 3236697600.0,
-            "24": 3236697600.0,
-            "25": 3236697600.0,
-            "26": 3236697600.0,
-            "27": 3236697600.0,
-            "28": 3236697600.0,
-            "29": 3236697600.0,
-            "30": 3236697600.0,
-            "31": 3236697600.0,
-            "32": 3236697600.0,
-            "33": 3236697600.0,
-            "34": 3236697600.0,
-            "35": 3236697600.0,
-            "36": 3236697600.0,
-            "37": 3236697600.0,
-            "38": 3236697600.0,
-            "39": 3236697600.0,
-            "40": 3236697600.0,
-            "41": 3236697600.0,
-            "42": 3236697600.0,
-            "43": 3236697600.0,
-            "44": 3236697600.0,
-            "45": 3236697600.0,
-            "46": 3236697600.0,
-            "47": 3236697600.0,
-            "48": 3236697600.0,
-            "49": 3236697600.0,
-            "50": 3236697600.0,
-            "51": 3236697600.0,
-            "52": 3236697600.0,
-            "53": 3236697600.0,
-            "54": 3236697600.0,
-            "55": 3236697600.0,
-            "56": 3236697600.0,
-            "57": 3236697600.0,
-            "58": 3236697600.0,
-            "59": 3236697600.0,
-            "60": 3236697600.0,
-            "61": 3236697600.0,
-            "62": 3236697600.0,
-            "63": 3236697600.0,
-            "64": 3236697600.0,
-            "65": 3236697600.0,
-            "66": 3236697600.0,
-            "67": 3236697600.0,
-            "68": 3236697600.0,
-            "69": 3236697600.0,
-            "70": 3236697600.0,
-            "71": 3236697600.0,
-            "72": 3236697600.0,
-            "73": 3236697600.0,
-            "74": 3236697600.0,
-            "75": 3236697600.0,
-            "76": 3236697600.0,
-            "77": 3236697600.0,
-            "78": 3236697600.0,
-            "79": 3236697600.0,
-            "80": 3236697600.0,
-            "81": 3236697600.0,
-            "82": 3236697600.0,
-            "83": 3236697600.0,
-            "84": 3236697600.0,
-            "85": 3236697600.0,
-            "86": 3236697600.0,
-            "87": 3236697600.0,
-            "88": 3236697600.0,
-            "89": 3236697600.0,
-            "90": 3236697600.0,
-            "91": 3236697600.0,
-            "92": 3236697600.0,
-            "93": 3236697600.0,
-            "94": 3236697600.0,
-            "95": 3236697600.0,
-            "96": 3236697600.0,
-            "97": 3236697600.0,
-            "98": 3236697600.0,
-            "99": 3236697600.0,
-            "100": 3236697600.0
+            "1": 2302638592.0,
+            "2": 3238794752.0,
+            "3": 3238794752.0,
+            "4": 3238794752.0,
+            "5": 3238794752.0,
+            "6": 3238794752.0,
+            "7": 3238794752.0,
+            "8": 3238794752.0,
+            "9": 3238794752.0,
+            "10": 3238794752.0,
+            "11": 3238794752.0,
+            "12": 3238794752.0,
+            "13": 3238794752.0,
+            "14": 3238794752.0,
+            "15": 3238794752.0,
+            "16": 3238794752.0,
+            "17": 3238794752.0,
+            "18": 3238794752.0,
+            "19": 3238794752.0,
+            "20": 3238794752.0,
+            "21": 3238794752.0,
+            "22": 3238794752.0,
+            "23": 3238794752.0,
+            "24": 3238794752.0,
+            "25": 3238794752.0,
+            "26": 3238794752.0,
+            "27": 3238794752.0,
+            "28": 3238794752.0,
+            "29": 3238794752.0,
+            "30": 3238794752.0,
+            "31": 3238794752.0,
+            "32": 3238794752.0,
+            "33": 3238794752.0,
+            "34": 3238794752.0,
+            "35": 3238794752.0,
+            "36": 3238794752.0,
+            "37": 3238794752.0,
+            "38": 3238794752.0,
+            "39": 3238794752.0,
+            "40": 3238794752.0,
+            "41": 3238794752.0,
+            "42": 3238794752.0,
+            "43": 3238794752.0,
+            "44": 3238794752.0,
+            "45": 3238794752.0,
+            "46": 3238794752.0,
+            "47": 3238794752.0,
+            "48": 3238794752.0,
+            "49": 3238794752.0,
+            "50": 3238794752.0,
+            "51": 3238794752.0,
+            "52": 3238794752.0,
+            "53": 3238794752.0,
+            "54": 3238794752.0,
+            "55": 3238794752.0,
+            "56": 3238794752.0,
+            "57": 3238794752.0,
+            "58": 3238794752.0,
+            "59": 3238794752.0,
+            "60": 3238794752.0,
+            "61": 3238794752.0,
+            "62": 3238794752.0,
+            "63": 3238794752.0,
+            "64": 3238794752.0,
+            "65": 3238794752.0,
+            "66": 3238794752.0,
+            "67": 3238794752.0,
+            "68": 3238794752.0,
+            "69": 3238794752.0,
+            "70": 3238794752.0,
+            "71": 3238794752.0,
+            "72": 3238794752.0,
+            "73": 3238794752.0,
+            "74": 3238794752.0,
+            "75": 3238794752.0,
+            "76": 3238794752.0,
+            "77": 3238794752.0,
+            "78": 3238794752.0,
+            "79": 3238794752.0,
+            "80": 3238794752.0,
+            "81": 3238794752.0,
+            "82": 3238794752.0,
+            "83": 3238794752.0,
+            "84": 3238794752.0,
+            "85": 3238794752.0,
+            "86": 3238794752.0,
+            "87": 3238794752.0,
+            "88": 3238794752.0,
+            "89": 3238794752.0,
+            "90": 3238794752.0,
+            "91": 3238794752.0,
+            "92": 3238794752.0,
+            "93": 3238794752.0,
+            "94": 3238794752.0,
+            "95": 3238794752.0,
+            "96": 3238794752.0,
+            "97": 3238794752.0,
+            "98": 3238794752.0,
+            "99": 3238794752.0,
+            "100": 3238794752.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.46115,
-            "2": 0.46835,
-            "3": 0.38416,
-            "4": 0.37391,
-            "5": 0.37703,
-            "6": 0.38173,
-            "7": 0.37456,
-            "8": 0.37696,
-            "9": 0.37338,
-            "10": 0.37687,
-            "11": 0.38251,
-            "12": 0.38037,
-            "13": 0.37996,
-            "14": 0.38264,
-            "15": 0.37959,
-            "16": 0.38232,
-            "17": 0.37852,
-            "18": 0.37735,
-            "19": 0.3812,
-            "20": 0.37493,
-            "21": 0.38227,
-            "22": 0.38196,
-            "23": 0.37745,
-            "24": 0.3782,
-            "25": 0.37181,
-            "26": 0.37935,
-            "27": 0.38539,
-            "28": 0.38393,
-            "29": 0.3826,
-            "30": 0.37839,
-            "31": 0.38438,
-            "32": 0.64523,
-            "33": 0.37971,
-            "34": 0.38082,
-            "35": 0.74313,
-            "36": 0.3848,
-            "37": 0.38169,
-            "38": 0.38154,
-            "39": 0.40495,
-            "40": 0.40243,
-            "41": 0.37972,
-            "42": 0.37792,
-            "43": 0.38261,
-            "44": 0.37607,
-            "45": 0.37463,
-            "46": 0.37881,
-            "47": 0.37293,
-            "48": 0.37592,
-            "49": 0.659,
-            "50": 0.37783,
-            "51": 0.38158,
-            "52": 0.73901,
-            "53": 0.37684,
-            "54": 0.37707,
-            "55": 0.42405,
-            "56": 0.38184,
-            "57": 0.37936,
-            "58": 0.37539,
-            "59": 0.37591,
-            "60": 0.72267,
-            "61": 0.37815,
-            "62": 0.77277,
-            "63": 0.38815,
-            "64": 0.3807,
-            "65": 0.37848,
-            "66": 0.38143,
-            "67": 0.37999,
-            "68": 0.38158,
-            "69": 0.38427,
-            "70": 0.37479,
-            "71": 0.38252,
-            "72": 0.38036,
-            "73": 0.38116,
-            "74": 0.38336,
-            "75": 0.3771,
-            "76": 0.37876,
-            "77": 0.38102,
-            "78": 0.37864,
-            "79": 0.38095,
-            "80": 0.37954,
-            "81": 0.37575,
-            "82": 0.38084,
-            "83": 0.38192,
-            "84": 0.38267,
-            "85": 0.38765,
-            "86": 0.38467,
-            "87": 0.3817,
-            "88": 0.37395,
-            "89": 0.37751,
-            "90": 0.38076,
-            "91": 0.37565,
-            "92": 0.38237,
-            "93": 0.37738,
-            "94": 0.37726,
-            "95": 0.38237,
-            "96": 0.38018,
-            "97": 0.38525,
-            "98": 0.40815,
-            "99": 0.38117,
-            "100": 0.38201
+            "1": 25.05607,
+            "2": 0.4771,
+            "3": 0.78234,
+            "4": 0.35523,
+            "5": 0.34787,
+            "6": 0.35038,
+            "7": 0.35972,
+            "8": 0.35589,
+            "9": 0.38294,
+            "10": 0.35953,
+            "11": 0.35001,
+            "12": 0.35158,
+            "13": 0.3501,
+            "14": 0.3486,
+            "15": 0.34967,
+            "16": 0.347,
+            "17": 0.34513,
+            "18": 0.36694,
+            "19": 0.36383,
+            "20": 0.3472,
+            "21": 0.3418,
+            "22": 0.34601,
+            "23": 0.76245,
+            "24": 0.73697,
+            "25": 0.7256,
+            "26": 0.34524,
+            "27": 0.34628,
+            "28": 0.34443,
+            "29": 0.35468,
+            "30": 0.73189,
+            "31": 0.96909,
+            "32": 0.34399,
+            "33": 0.34907,
+            "34": 0.35028,
+            "35": 0.34486,
+            "36": 0.34787,
+            "37": 0.345,
+            "38": 0.34797,
+            "39": 0.34864,
+            "40": 0.34596,
+            "41": 0.34855,
+            "42": 0.34707,
+            "43": 0.34709,
+            "44": 0.34717,
+            "45": 0.34917,
+            "46": 0.34955,
+            "47": 0.34487,
+            "48": 0.35114,
+            "49": 0.34985,
+            "50": 0.35151,
+            "51": 0.3515,
+            "52": 0.34854,
+            "53": 0.34699,
+            "54": 0.35058,
+            "55": 0.34683,
+            "56": 0.34606,
+            "57": 0.34877,
+            "58": 0.34509,
+            "59": 0.34822,
+            "60": 0.34532,
+            "61": 0.34516,
+            "62": 0.34479,
+            "63": 0.36001,
+            "64": 0.3983,
+            "65": 0.34758,
+            "66": 0.34684,
+            "67": 0.34571,
+            "68": 0.3481,
+            "69": 0.34685,
+            "70": 0.34473,
+            "71": 0.34557,
+            "72": 0.34856,
+            "73": 0.34506,
+            "74": 0.34674,
+            "75": 0.34706,
+            "76": 0.34879,
+            "77": 0.35195,
+            "78": 0.34663,
+            "79": 0.35252,
+            "80": 0.34719,
+            "81": 0.3448,
+            "82": 0.34727,
+            "83": 0.34972,
+            "84": 0.34547,
+            "85": 0.35367,
+            "86": 0.34453,
+            "87": 0.3406,
+            "88": 0.34389,
+            "89": 0.3438,
+            "90": 0.34535,
+            "91": 0.34386,
+            "92": 0.34313,
+            "93": 0.34017,
+            "94": 0.34115,
+            "95": 0.34187,
+            "96": 0.34159,
+            "97": 0.34076,
+            "98": 0.34202,
+            "99": 0.34323,
+            "100": 0.34206
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json
index 2400879202c..da925a09fb1 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1132053504.0,
-            "2": 1132053504.0,
-            "3": 1132053504.0,
-            "4": 1132053504.0,
-            "5": 1132053504.0,
-            "6": 1132053504.0,
-            "7": 1132053504.0,
-            "8": 1132053504.0,
-            "9": 1132053504.0,
-            "10": 1132053504.0,
-            "11": 1132053504.0,
-            "12": 1132053504.0,
-            "13": 1132053504.0,
-            "14": 1132053504.0,
-            "15": 1132053504.0,
-            "16": 1132053504.0,
-            "17": 1132053504.0,
-            "18": 1132053504.0,
-            "19": 1132053504.0,
-            "20": 1132053504.0,
-            "21": 1132053504.0,
-            "22": 1132053504.0,
-            "23": 1132053504.0,
-            "24": 1132053504.0,
-            "25": 1132053504.0,
-            "26": 1132053504.0,
-            "27": 1132053504.0,
-            "28": 1132053504.0,
-            "29": 1132053504.0,
-            "30": 1132053504.0,
-            "31": 1132053504.0,
-            "32": 1132053504.0,
-            "33": 1132053504.0,
-            "34": 1132053504.0,
-            "35": 1132053504.0,
-            "36": 1132053504.0,
-            "37": 1132053504.0,
-            "38": 1132053504.0,
-            "39": 1132053504.0,
-            "40": 1132053504.0,
-            "41": 1132053504.0,
-            "42": 1132053504.0,
-            "43": 1132053504.0,
-            "44": 1132053504.0,
-            "45": 1132053504.0,
-            "46": 1132053504.0,
-            "47": 1132053504.0,
-            "48": 1132053504.0,
-            "49": 1132053504.0,
-            "50": 1132053504.0,
-            "51": 1132053504.0,
-            "52": 1132053504.0,
-            "53": 1132053504.0,
-            "54": 1132053504.0,
-            "55": 1132053504.0,
-            "56": 1132053504.0,
-            "57": 1132053504.0,
-            "58": 1132053504.0,
-            "59": 1132053504.0,
-            "60": 1132053504.0,
-            "61": 1132053504.0,
-            "62": 1132053504.0,
-            "63": 1132053504.0,
-            "64": 1132053504.0,
-            "65": 1132053504.0,
-            "66": 1132053504.0,
-            "67": 1132053504.0,
-            "68": 1132053504.0,
-            "69": 1132053504.0,
-            "70": 1132053504.0,
-            "71": 1132053504.0,
-            "72": 1132053504.0,
-            "73": 1132053504.0,
-            "74": 1132053504.0,
-            "75": 1132053504.0,
-            "76": 1132053504.0,
-            "77": 1132053504.0,
-            "78": 1132053504.0,
-            "79": 1132053504.0,
-            "80": 1132053504.0,
-            "81": 1132053504.0,
-            "82": 1132053504.0,
-            "83": 1132053504.0,
-            "84": 1132053504.0,
-            "85": 1132053504.0,
-            "86": 1132053504.0,
-            "87": 1132053504.0,
-            "88": 1132053504.0,
-            "89": 1132053504.0,
-            "90": 1132053504.0,
-            "91": 1132053504.0,
-            "92": 1132053504.0,
-            "93": 1132053504.0,
-            "94": 1132053504.0,
-            "95": 1132053504.0,
-            "96": 1132053504.0,
-            "97": 1132053504.0,
-            "98": 1132053504.0,
-            "99": 1132053504.0,
-            "100": 1132053504.0
+            "1": 1131791360.0,
+            "2": 1131791360.0,
+            "3": 1131791360.0,
+            "4": 1131791360.0,
+            "5": 1131791360.0,
+            "6": 1131791360.0,
+            "7": 1131791360.0,
+            "8": 1131791360.0,
+            "9": 1131791360.0,
+            "10": 1131791360.0,
+            "11": 1131791360.0,
+            "12": 1131791360.0,
+            "13": 1131791360.0,
+            "14": 1131791360.0,
+            "15": 1131791360.0,
+            "16": 1131791360.0,
+            "17": 1131791360.0,
+            "18": 1131791360.0,
+            "19": 1131791360.0,
+            "20": 1131791360.0,
+            "21": 1131791360.0,
+            "22": 1131791360.0,
+            "23": 1131791360.0,
+            "24": 1131791360.0,
+            "25": 1131791360.0,
+            "26": 1131791360.0,
+            "27": 1131791360.0,
+            "28": 1131791360.0,
+            "29": 1131791360.0,
+            "30": 1131791360.0,
+            "31": 1131791360.0,
+            "32": 1131791360.0,
+            "33": 1131791360.0,
+            "34": 1131791360.0,
+            "35": 1131791360.0,
+            "36": 1131791360.0,
+            "37": 1131791360.0,
+            "38": 1131791360.0,
+            "39": 1131791360.0,
+            "40": 1131791360.0,
+            "41": 1131791360.0,
+            "42": 1131791360.0,
+            "43": 1131791360.0,
+            "44": 1131791360.0,
+            "45": 1131791360.0,
+            "46": 1131791360.0,
+            "47": 1131791360.0,
+            "48": 1131791360.0,
+            "49": 1131791360.0,
+            "50": 1131791360.0,
+            "51": 1131791360.0,
+            "52": 1131791360.0,
+            "53": 1131791360.0,
+            "54": 1131791360.0,
+            "55": 1131791360.0,
+            "56": 1131791360.0,
+            "57": 1131791360.0,
+            "58": 1131791360.0,
+            "59": 1131791360.0,
+            "60": 1131791360.0,
+            "61": 1131791360.0,
+            "62": 1131791360.0,
+            "63": 1131791360.0,
+            "64": 1131791360.0,
+            "65": 1131791360.0,
+            "66": 1131791360.0,
+            "67": 1131791360.0,
+            "68": 1131791360.0,
+            "69": 1131791360.0,
+            "70": 1131791360.0,
+            "71": 1131791360.0,
+            "72": 1131791360.0,
+            "73": 1131791360.0,
+            "74": 1131791360.0,
+            "75": 1131791360.0,
+            "76": 1131791360.0,
+            "77": 1131791360.0,
+            "78": 1131791360.0,
+            "79": 1131791360.0,
+            "80": 1131791360.0,
+            "81": 1131791360.0,
+            "82": 1131791360.0,
+            "83": 1131791360.0,
+            "84": 1131791360.0,
+            "85": 1131791360.0,
+            "86": 1131791360.0,
+            "87": 1131791360.0,
+            "88": 1131791360.0,
+            "89": 1131791360.0,
+            "90": 1131791360.0,
+            "91": 1131791360.0,
+            "92": 1131791360.0,
+            "93": 1131791360.0,
+            "94": 1131791360.0,
+            "95": 1131791360.0,
+            "96": 1131791360.0,
+            "97": 1131791360.0,
+            "98": 1131791360.0,
+            "99": 1131791360.0,
+            "100": 1131791360.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1409266176.0,
-            "2": 1864166912.0,
-            "3": 1864166912.0,
-            "4": 1864166912.0,
-            "5": 1864166912.0,
-            "6": 1864166912.0,
-            "7": 1864166912.0,
-            "8": 1864166912.0,
-            "9": 1864166912.0,
-            "10": 1864166912.0,
-            "11": 1864166912.0,
-            "12": 1864166912.0,
-            "13": 1864166912.0,
-            "14": 1864166912.0,
-            "15": 1864166912.0,
-            "16": 1864166912.0,
-            "17": 1864166912.0,
-            "18": 1864166912.0,
-            "19": 1864166912.0,
-            "20": 1864166912.0,
-            "21": 1864166912.0,
-            "22": 1864166912.0,
-            "23": 1864166912.0,
-            "24": 1864166912.0,
-            "25": 1864166912.0,
-            "26": 1864166912.0,
-            "27": 1864166912.0,
-            "28": 1864166912.0,
-            "29": 1864166912.0,
-            "30": 1864166912.0,
-            "31": 1864166912.0,
-            "32": 1864166912.0,
-            "33": 1864166912.0,
-            "34": 1864166912.0,
-            "35": 1864166912.0,
-            "36": 1864166912.0,
-            "37": 1864166912.0,
-            "38": 1864166912.0,
-            "39": 1864166912.0,
-            "40": 1864166912.0,
-            "41": 1864166912.0,
-            "42": 1864166912.0,
-            "43": 1864166912.0,
-            "44": 1864166912.0,
-            "45": 1864166912.0,
-            "46": 1864166912.0,
-            "47": 1864166912.0,
-            "48": 1864166912.0,
-            "49": 1864166912.0,
-            "50": 1864166912.0,
-            "51": 1864166912.0,
-            "52": 1864166912.0,
-            "53": 1864166912.0,
-            "54": 1864166912.0,
-            "55": 1864166912.0,
-            "56": 1864166912.0,
-            "57": 1864166912.0,
-            "58": 1864166912.0,
-            "59": 1864166912.0,
-            "60": 1864166912.0,
-            "61": 1864166912.0,
-            "62": 1864166912.0,
-            "63": 1864166912.0,
-            "64": 1864166912.0,
-            "65": 1864166912.0,
-            "66": 1864166912.0,
-            "67": 1864166912.0,
-            "68": 1864166912.0,
-            "69": 1864166912.0,
-            "70": 1864166912.0,
-            "71": 1864166912.0,
-            "72": 1864166912.0,
-            "73": 1864166912.0,
-            "74": 1864166912.0,
-            "75": 1864166912.0,
-            "76": 1864166912.0,
-            "77": 1864166912.0,
-            "78": 1864166912.0,
-            "79": 1864166912.0,
-            "80": 1864166912.0,
-            "81": 1864166912.0,
-            "82": 1864166912.0,
-            "83": 1864166912.0,
-            "84": 1864166912.0,
-            "85": 1864166912.0,
-            "86": 1864166912.0,
-            "87": 1864166912.0,
-            "88": 1864166912.0,
-            "89": 1864166912.0,
-            "90": 1864166912.0,
-            "91": 1864166912.0,
-            "92": 1864166912.0,
-            "93": 1864166912.0,
-            "94": 1864166912.0,
-            "95": 1864166912.0,
-            "96": 1864166912.0,
-            "97": 1864166912.0,
-            "98": 1864166912.0,
-            "99": 1864166912.0,
-            "100": 1864166912.0
+            "1": 1410773504.0,
+            "2": 1862789632.0,
+            "3": 1862789632.0,
+            "4": 1862789632.0,
+            "5": 1862789632.0,
+            "6": 1862789632.0,
+            "7": 1862789632.0,
+            "8": 1862789632.0,
+            "9": 1862789632.0,
+            "10": 1862789632.0,
+            "11": 1862789632.0,
+            "12": 1862789632.0,
+            "13": 1862789632.0,
+            "14": 1862789632.0,
+            "15": 1862789632.0,
+            "16": 1862789632.0,
+            "17": 1862789632.0,
+            "18": 1862789632.0,
+            "19": 1862789632.0,
+            "20": 1862789632.0,
+            "21": 1862789632.0,
+            "22": 1862789632.0,
+            "23": 1862789632.0,
+            "24": 1862789632.0,
+            "25": 1862789632.0,
+            "26": 1862789632.0,
+            "27": 1862789632.0,
+            "28": 1862789632.0,
+            "29": 1862789632.0,
+            "30": 1862789632.0,
+            "31": 1862789632.0,
+            "32": 1862789632.0,
+            "33": 1862789632.0,
+            "34": 1862789632.0,
+            "35": 1862789632.0,
+            "36": 1862789632.0,
+            "37": 1862789632.0,
+            "38": 1862789632.0,
+            "39": 1862789632.0,
+            "40": 1862789632.0,
+            "41": 1862789632.0,
+            "42": 1862789632.0,
+            "43": 1862789632.0,
+            "44": 1862789632.0,
+            "45": 1862789632.0,
+            "46": 1862789632.0,
+            "47": 1862789632.0,
+            "48": 1862789632.0,
+            "49": 1862789632.0,
+            "50": 1862789632.0,
+            "51": 1862789632.0,
+            "52": 1862789632.0,
+            "53": 1862789632.0,
+            "54": 1862789632.0,
+            "55": 1862789632.0,
+            "56": 1862789632.0,
+            "57": 1862789632.0,
+            "58": 1862789632.0,
+            "59": 1862789632.0,
+            "60": 1862789632.0,
+            "61": 1862789632.0,
+            "62": 1862789632.0,
+            "63": 1862789632.0,
+            "64": 1862789632.0,
+            "65": 1862789632.0,
+            "66": 1862789632.0,
+            "67": 1862789632.0,
+            "68": 1862789632.0,
+            "69": 1862789632.0,
+            "70": 1862789632.0,
+            "71": 1862789632.0,
+            "72": 1862789632.0,
+            "73": 1862789632.0,
+            "74": 1862789632.0,
+            "75": 1862789632.0,
+            "76": 1862789632.0,
+            "77": 1862789632.0,
+            "78": 1862789632.0,
+            "79": 1862789632.0,
+            "80": 1862789632.0,
+            "81": 1862789632.0,
+            "82": 1862789632.0,
+            "83": 1862789632.0,
+            "84": 1862789632.0,
+            "85": 1862789632.0,
+            "86": 1862789632.0,
+            "87": 1862789632.0,
+            "88": 1862789632.0,
+            "89": 1862789632.0,
+            "90": 1862789632.0,
+            "91": 1862789632.0,
+            "92": 1862789632.0,
+            "93": 1862789632.0,
+            "94": 1862789632.0,
+            "95": 1862789632.0,
+            "96": 1862789632.0,
+            "97": 1862789632.0,
+            "98": 1862789632.0,
+            "99": 1862789632.0,
+            "100": 1862789632.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.54009,
-            "2": 0.66845,
-            "3": 0.64084,
-            "4": 0.64526,
-            "5": 0.64331,
-            "6": 0.65463,
-            "7": 0.63991,
-            "8": 0.63854,
-            "9": 0.64034,
-            "10": 0.63886,
-            "11": 0.63968,
-            "12": 0.64441,
-            "13": 0.63828,
-            "14": 0.64647,
-            "15": 0.64199,
-            "16": 0.63783,
-            "17": 0.64359,
-            "18": 0.66439,
-            "19": 0.64718,
-            "20": 0.63999,
-            "21": 0.65677,
-            "22": 0.95191,
-            "23": 0.64765,
-            "24": 0.98317,
-            "25": 1.63221,
-            "26": 0.64915,
-            "27": 0.64318,
-            "28": 0.99238,
-            "29": 0.64655,
-            "30": 0.64693,
-            "31": 0.64241,
-            "32": 0.98967,
-            "33": 0.64928,
-            "34": 0.64294,
-            "35": 0.65629,
-            "36": 0.64358,
-            "37": 0.64814,
-            "38": 0.64325,
-            "39": 0.64509,
-            "40": 0.64733,
-            "41": 0.64693,
-            "42": 0.65392,
-            "43": 0.64721,
-            "44": 0.64487,
-            "45": 0.64766,
-            "46": 0.65872,
-            "47": 0.65402,
-            "48": 0.65486,
-            "49": 0.64433,
-            "50": 0.64917,
-            "51": 0.64197,
-            "52": 0.64647,
-            "53": 0.64656,
-            "54": 0.64815,
-            "55": 0.64573,
-            "56": 0.6539,
-            "57": 0.64582,
-            "58": 0.64668,
-            "59": 0.64431,
-            "60": 0.64957,
-            "61": 0.64703,
-            "62": 0.64671,
-            "63": 0.65979,
-            "64": 0.64599,
-            "65": 0.6466,
-            "66": 0.64754,
-            "67": 0.6471,
-            "68": 0.64756,
-            "69": 0.64621,
-            "70": 0.65906,
-            "71": 0.64587,
-            "72": 0.65969,
-            "73": 0.64476,
-            "74": 0.65304,
-            "75": 0.64786,
-            "76": 0.65077,
-            "77": 0.66405,
-            "78": 0.6472,
-            "79": 0.64431,
-            "80": 0.64472,
-            "81": 0.64407,
-            "82": 0.64326,
-            "83": 0.93161,
-            "84": 0.65573,
-            "85": 0.63999,
-            "86": 0.64393,
-            "87": 0.92064,
-            "88": 0.64399,
-            "89": 0.64306,
-            "90": 0.64439,
-            "91": 0.6414,
-            "92": 0.64504,
-            "93": 0.64858,
-            "94": 0.64041,
-            "95": 0.64497,
-            "96": 0.64493,
-            "97": 0.64508,
-            "98": 0.6444,
-            "99": 0.64587,
-            "100": 0.64886
+            "1": 25.99742,
+            "2": 0.74354,
+            "3": 0.5991,
+            "4": 0.58509,
+            "5": 0.57829,
+            "6": 0.59904,
+            "7": 0.60788,
+            "8": 0.59588,
+            "9": 0.59262,
+            "10": 0.59201,
+            "11": 0.6011,
+            "12": 0.58294,
+            "13": 1.00971,
+            "14": 1.2235,
+            "15": 0.59824,
+            "16": 0.59871,
+            "17": 0.59553,
+            "18": 0.60447,
+            "19": 0.59305,
+            "20": 0.59516,
+            "21": 0.59434,
+            "22": 0.59253,
+            "23": 0.59245,
+            "24": 0.59395,
+            "25": 0.59087,
+            "26": 0.59548,
+            "27": 0.59981,
+            "28": 0.59298,
+            "29": 0.60365,
+            "30": 0.59179,
+            "31": 0.59532,
+            "32": 0.59589,
+            "33": 0.58615,
+            "34": 0.5832,
+            "35": 0.58623,
+            "36": 0.58286,
+            "37": 0.58446,
+            "38": 0.59392,
+            "39": 0.60039,
+            "40": 0.59556,
+            "41": 0.59642,
+            "42": 0.60532,
+            "43": 0.6013,
+            "44": 0.60295,
+            "45": 0.60146,
+            "46": 0.58736,
+            "47": 0.58628,
+            "48": 0.58704,
+            "49": 0.5858,
+            "50": 0.59709,
+            "51": 0.61827,
+            "52": 0.58553,
+            "53": 0.58061,
+            "54": 0.57839,
+            "55": 0.58578,
+            "56": 0.59768,
+            "57": 0.59453,
+            "58": 0.61716,
+            "59": 0.57953,
+            "60": 0.57769,
+            "61": 0.57901,
+            "62": 0.58074,
+            "63": 0.58369,
+            "64": 0.57997,
+            "65": 0.58275,
+            "66": 0.58343,
+            "67": 0.57961,
+            "68": 0.57755,
+            "69": 0.58701,
+            "70": 0.57588,
+            "71": 0.5775,
+            "72": 0.57925,
+            "73": 0.57648,
+            "74": 0.57923,
+            "75": 0.58354,
+            "76": 0.58196,
+            "77": 0.57857,
+            "78": 0.58636,
+            "79": 0.58475,
+            "80": 0.58428,
+            "81": 0.58017,
+            "82": 0.58459,
+            "83": 0.58698,
+            "84": 0.57714,
+            "85": 0.57756,
+            "86": 0.58774,
+            "87": 0.57843,
+            "88": 0.57647,
+            "89": 0.57865,
+            "90": 0.5784,
+            "91": 0.57912,
+            "92": 0.57658,
+            "93": 0.58094,
+            "94": 0.57865,
+            "95": 0.58251,
+            "96": 0.62025,
+            "97": 0.58429,
+            "98": 0.59488,
+            "99": 0.58183,
+            "100": 0.583
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
index 11ef3fbd8c5..448fe2595ce 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1132053504.0,
-            "2": 1132053504.0,
-            "3": 1132053504.0,
-            "4": 1132053504.0,
-            "5": 1132053504.0,
-            "6": 1132053504.0,
-            "7": 1132053504.0,
-            "8": 1132053504.0,
-            "9": 1132053504.0,
-            "10": 1132053504.0,
-            "11": 1132053504.0,
-            "12": 1132053504.0,
-            "13": 1132053504.0,
-            "14": 1132053504.0,
-            "15": 1132053504.0,
-            "16": 1132053504.0,
-            "17": 1132053504.0,
-            "18": 1132053504.0,
-            "19": 1132053504.0,
-            "20": 1132053504.0,
-            "21": 1132053504.0,
-            "22": 1132053504.0,
-            "23": 1132053504.0,
-            "24": 1132053504.0,
-            "25": 1132053504.0,
-            "26": 1132053504.0,
-            "27": 1132053504.0,
-            "28": 1132053504.0,
-            "29": 1132053504.0,
-            "30": 1132053504.0,
-            "31": 1132053504.0,
-            "32": 1132053504.0,
-            "33": 1132053504.0,
-            "34": 1132053504.0,
-            "35": 1132053504.0,
-            "36": 1132053504.0,
-            "37": 1132053504.0,
-            "38": 1132053504.0,
-            "39": 1132053504.0,
-            "40": 1132053504.0,
-            "41": 1132053504.0,
-            "42": 1132053504.0,
-            "43": 1132053504.0,
-            "44": 1132053504.0,
-            "45": 1132053504.0,
-            "46": 1132053504.0,
-            "47": 1132053504.0,
-            "48": 1132053504.0,
-            "49": 1132053504.0,
-            "50": 1132053504.0,
-            "51": 1132053504.0,
-            "52": 1132053504.0,
-            "53": 1132053504.0,
-            "54": 1132053504.0,
-            "55": 1132053504.0,
-            "56": 1132053504.0,
-            "57": 1132053504.0,
-            "58": 1132053504.0,
-            "59": 1132053504.0,
-            "60": 1132053504.0,
-            "61": 1132053504.0,
-            "62": 1132053504.0,
-            "63": 1132053504.0,
-            "64": 1132053504.0,
-            "65": 1132053504.0,
-            "66": 1132053504.0,
-            "67": 1132053504.0,
-            "68": 1132053504.0,
-            "69": 1132053504.0,
-            "70": 1132053504.0,
-            "71": 1132053504.0,
-            "72": 1132053504.0,
-            "73": 1132053504.0,
-            "74": 1132053504.0,
-            "75": 1132053504.0,
-            "76": 1132053504.0,
-            "77": 1132053504.0,
-            "78": 1132053504.0,
-            "79": 1132053504.0,
-            "80": 1132053504.0,
-            "81": 1132053504.0,
-            "82": 1132053504.0,
-            "83": 1132053504.0,
-            "84": 1132053504.0,
-            "85": 1132053504.0,
-            "86": 1132053504.0,
-            "87": 1132053504.0,
-            "88": 1132053504.0,
-            "89": 1132053504.0,
-            "90": 1132053504.0,
-            "91": 1132053504.0,
-            "92": 1132053504.0,
-            "93": 1132053504.0,
-            "94": 1132053504.0,
-            "95": 1132053504.0,
-            "96": 1132053504.0,
-            "97": 1132053504.0,
-            "98": 1132053504.0,
-            "99": 1132053504.0,
-            "100": 1132053504.0
+            "1": 1131791360.0,
+            "2": 1131791360.0,
+            "3": 1131791360.0,
+            "4": 1131791360.0,
+            "5": 1131791360.0,
+            "6": 1131791360.0,
+            "7": 1131791360.0,
+            "8": 1131791360.0,
+            "9": 1131791360.0,
+            "10": 1131791360.0,
+            "11": 1131791360.0,
+            "12": 1131791360.0,
+            "13": 1131791360.0,
+            "14": 1131791360.0,
+            "15": 1131791360.0,
+            "16": 1131791360.0,
+            "17": 1131791360.0,
+            "18": 1131791360.0,
+            "19": 1131791360.0,
+            "20": 1131791360.0,
+            "21": 1131791360.0,
+            "22": 1131791360.0,
+            "23": 1131791360.0,
+            "24": 1131791360.0,
+            "25": 1131791360.0,
+            "26": 1131791360.0,
+            "27": 1131791360.0,
+            "28": 1131791360.0,
+            "29": 1131791360.0,
+            "30": 1131791360.0,
+            "31": 1131791360.0,
+            "32": 1131791360.0,
+            "33": 1131791360.0,
+            "34": 1131791360.0,
+            "35": 1131791360.0,
+            "36": 1131791360.0,
+            "37": 1131791360.0,
+            "38": 1131791360.0,
+            "39": 1131791360.0,
+            "40": 1131791360.0,
+            "41": 1131791360.0,
+            "42": 1131791360.0,
+            "43": 1131791360.0,
+            "44": 1131791360.0,
+            "45": 1131791360.0,
+            "46": 1131791360.0,
+            "47": 1131791360.0,
+            "48": 1131791360.0,
+            "49": 1131791360.0,
+            "50": 1131791360.0,
+            "51": 1131791360.0,
+            "52": 1131791360.0,
+            "53": 1131791360.0,
+            "54": 1131791360.0,
+            "55": 1131791360.0,
+            "56": 1131791360.0,
+            "57": 1131791360.0,
+            "58": 1131791360.0,
+            "59": 1131791360.0,
+            "60": 1131791360.0,
+            "61": 1131791360.0,
+            "62": 1131791360.0,
+            "63": 1131791360.0,
+            "64": 1131791360.0,
+            "65": 1131791360.0,
+            "66": 1131791360.0,
+            "67": 1131791360.0,
+            "68": 1131791360.0,
+            "69": 1131791360.0,
+            "70": 1131791360.0,
+            "71": 1131791360.0,
+            "72": 1131791360.0,
+            "73": 1131791360.0,
+            "74": 1131791360.0,
+            "75": 1131791360.0,
+            "76": 1131791360.0,
+            "77": 1131791360.0,
+            "78": 1131791360.0,
+            "79": 1131791360.0,
+            "80": 1131791360.0,
+            "81": 1131791360.0,
+            "82": 1131791360.0,
+            "83": 1131791360.0,
+            "84": 1131791360.0,
+            "85": 1131791360.0,
+            "86": 1131791360.0,
+            "87": 1131791360.0,
+            "88": 1131791360.0,
+            "89": 1131791360.0,
+            "90": 1131791360.0,
+            "91": 1131791360.0,
+            "92": 1131791360.0,
+            "93": 1131791360.0,
+            "94": 1131791360.0,
+            "95": 1131791360.0,
+            "96": 1131791360.0,
+            "97": 1131791360.0,
+            "98": 1131791360.0,
+            "99": 1131791360.0,
+            "100": 1131791360.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 1409266176.0,
-            "2": 1864166912.0,
-            "3": 1864166912.0,
-            "4": 1864166912.0,
-            "5": 1864166912.0,
-            "6": 1864166912.0,
-            "7": 1864166912.0,
-            "8": 1864166912.0,
-            "9": 1864166912.0,
-            "10": 1864166912.0,
-            "11": 1864166912.0,
-            "12": 1864166912.0,
-            "13": 1864166912.0,
-            "14": 1864166912.0,
-            "15": 1864166912.0,
-            "16": 1864166912.0,
-            "17": 1864166912.0,
-            "18": 1864166912.0,
-            "19": 1864166912.0,
-            "20": 1864166912.0,
-            "21": 1864166912.0,
-            "22": 1864166912.0,
-            "23": 1864166912.0,
-            "24": 1864166912.0,
-            "25": 1864166912.0,
-            "26": 1864166912.0,
-            "27": 1864166912.0,
-            "28": 1864166912.0,
-            "29": 1864166912.0,
-            "30": 1864166912.0,
-            "31": 1864166912.0,
-            "32": 1864166912.0,
-            "33": 1864166912.0,
-            "34": 1864166912.0,
-            "35": 1864166912.0,
-            "36": 1864166912.0,
-            "37": 1864166912.0,
-            "38": 1864166912.0,
-            "39": 1864166912.0,
-            "40": 1864166912.0,
-            "41": 1864166912.0,
-            "42": 1864166912.0,
-            "43": 1864166912.0,
-            "44": 1864166912.0,
-            "45": 1864166912.0,
-            "46": 1864166912.0,
-            "47": 1864166912.0,
-            "48": 1864166912.0,
-            "49": 1864166912.0,
-            "50": 1864166912.0,
-            "51": 1864166912.0,
-            "52": 1864166912.0,
-            "53": 1864166912.0,
-            "54": 1864166912.0,
-            "55": 1864166912.0,
-            "56": 1864166912.0,
-            "57": 1864166912.0,
-            "58": 1864166912.0,
-            "59": 1864166912.0,
-            "60": 1864166912.0,
-            "61": 1864166912.0,
-            "62": 1864166912.0,
-            "63": 1864166912.0,
-            "64": 1864166912.0,
-            "65": 1864166912.0,
-            "66": 1864166912.0,
-            "67": 1864166912.0,
-            "68": 1864166912.0,
-            "69": 1864166912.0,
-            "70": 1864166912.0,
-            "71": 1864166912.0,
-            "72": 1864166912.0,
-            "73": 1864166912.0,
-            "74": 1864166912.0,
-            "75": 1864166912.0,
-            "76": 1864166912.0,
-            "77": 1864166912.0,
-            "78": 1864166912.0,
-            "79": 1864166912.0,
-            "80": 1864166912.0,
-            "81": 1864166912.0,
-            "82": 1864166912.0,
-            "83": 1864166912.0,
-            "84": 1864166912.0,
-            "85": 1864166912.0,
-            "86": 1864166912.0,
-            "87": 1864166912.0,
-            "88": 1864166912.0,
-            "89": 1864166912.0,
-            "90": 1864166912.0,
-            "91": 1864166912.0,
-            "92": 1864166912.0,
-            "93": 1864166912.0,
-            "94": 1864166912.0,
-            "95": 1864166912.0,
-            "96": 1864166912.0,
-            "97": 1864166912.0,
-            "98": 1864166912.0,
-            "99": 1864166912.0,
-            "100": 1864166912.0
+            "1": 1410773504.0,
+            "2": 1862789632.0,
+            "3": 1862789632.0,
+            "4": 1862789632.0,
+            "5": 1862789632.0,
+            "6": 1862789632.0,
+            "7": 1862789632.0,
+            "8": 1862789632.0,
+            "9": 1862789632.0,
+            "10": 1862789632.0,
+            "11": 1862789632.0,
+            "12": 1862789632.0,
+            "13": 1862789632.0,
+            "14": 1862789632.0,
+            "15": 1862789632.0,
+            "16": 1862789632.0,
+            "17": 1862789632.0,
+            "18": 1862789632.0,
+            "19": 1862789632.0,
+            "20": 1862789632.0,
+            "21": 1862789632.0,
+            "22": 1862789632.0,
+            "23": 1862789632.0,
+            "24": 1862789632.0,
+            "25": 1862789632.0,
+            "26": 1862789632.0,
+            "27": 1862789632.0,
+            "28": 1862789632.0,
+            "29": 1862789632.0,
+            "30": 1862789632.0,
+            "31": 1862789632.0,
+            "32": 1862789632.0,
+            "33": 1862789632.0,
+            "34": 1862789632.0,
+            "35": 1862789632.0,
+            "36": 1862789632.0,
+            "37": 1862789632.0,
+            "38": 1862789632.0,
+            "39": 1862789632.0,
+            "40": 1862789632.0,
+            "41": 1862789632.0,
+            "42": 1862789632.0,
+            "43": 1862789632.0,
+            "44": 1862789632.0,
+            "45": 1862789632.0,
+            "46": 1862789632.0,
+            "47": 1862789632.0,
+            "48": 1862789632.0,
+            "49": 1862789632.0,
+            "50": 1862789632.0,
+            "51": 1862789632.0,
+            "52": 1862789632.0,
+            "53": 1862789632.0,
+            "54": 1862789632.0,
+            "55": 1862789632.0,
+            "56": 1862789632.0,
+            "57": 1862789632.0,
+            "58": 1862789632.0,
+            "59": 1862789632.0,
+            "60": 1862789632.0,
+            "61": 1862789632.0,
+            "62": 1862789632.0,
+            "63": 1862789632.0,
+            "64": 1862789632.0,
+            "65": 1862789632.0,
+            "66": 1862789632.0,
+            "67": 1862789632.0,
+            "68": 1862789632.0,
+            "69": 1862789632.0,
+            "70": 1862789632.0,
+            "71": 1862789632.0,
+            "72": 1862789632.0,
+            "73": 1862789632.0,
+            "74": 1862789632.0,
+            "75": 1862789632.0,
+            "76": 1862789632.0,
+            "77": 1862789632.0,
+            "78": 1862789632.0,
+            "79": 1862789632.0,
+            "80": 1862789632.0,
+            "81": 1862789632.0,
+            "82": 1862789632.0,
+            "83": 1862789632.0,
+            "84": 1862789632.0,
+            "85": 1862789632.0,
+            "86": 1862789632.0,
+            "87": 1862789632.0,
+            "88": 1862789632.0,
+            "89": 1862789632.0,
+            "90": 1862789632.0,
+            "91": 1862789632.0,
+            "92": 1862789632.0,
+            "93": 1862789632.0,
+            "94": 1862789632.0,
+            "95": 1862789632.0,
+            "96": 1862789632.0,
+            "97": 1862789632.0,
+            "98": 1862789632.0,
+            "99": 1862789632.0,
+            "100": 1862789632.0
         }
     },
     "iteration-time": {
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.74091,
-            "2": 0.66943,
-            "3": 0.64954,
-            "4": 0.64695,
-            "5": 0.65419,
-            "6": 0.6513,
-            "7": 0.64556,
-            "8": 0.6385,
-            "9": 0.64307,
-            "10": 0.63679,
-            "11": 0.64386,
-            "12": 0.64012,
-            "13": 0.63889,
-            "14": 0.63958,
-            "15": 0.64024,
-            "16": 0.63721,
-            "17": 0.6492,
-            "18": 0.65247,
-            "19": 0.64523,
-            "20": 1.0041,
-            "21": 0.64739,
-            "22": 1.02158,
-            "23": 0.96313,
-            "24": 0.64631,
-            "25": 0.64337,
-            "26": 0.64702,
-            "27": 0.64516,
-            "28": 0.64748,
-            "29": 0.64657,
-            "30": 0.95958,
-            "31": 1.05772,
-            "32": 0.64319,
-            "33": 0.64455,
-            "34": 0.64044,
-            "35": 0.6445,
-            "36": 0.64649,
-            "37": 0.64593,
-            "38": 0.64912,
-            "39": 0.64665,
-            "40": 0.64585,
-            "41": 0.64603,
-            "42": 0.64765,
-            "43": 0.64548,
-            "44": 0.64732,
-            "45": 0.64996,
-            "46": 0.65909,
-            "47": 0.66335,
-            "48": 0.64625,
-            "49": 0.64641,
-            "50": 0.64822,
-            "51": 0.65982,
-            "52": 0.64882,
-            "53": 0.64892,
-            "54": 0.64636,
-            "55": 0.64591,
-            "56": 0.65232,
-            "57": 0.64591,
-            "58": 0.64572,
-            "59": 0.64949,
-            "60": 0.64277,
-            "61": 0.64766,
-            "62": 0.64726,
-            "63": 0.64637,
-            "64": 0.64901,
-            "65": 0.6476,
-            "66": 0.64458,
-            "67": 0.64951,
-            "68": 0.64438,
-            "69": 0.64854,
-            "70": 0.65268,
-            "71": 0.64762,
-            "72": 1.02587,
-            "73": 0.65274,
-            "74": 0.65942,
-            "75": 0.65091,
-            "76": 0.65181,
-            "77": 0.65582,
-            "78": 0.64434,
-            "79": 0.65116,
-            "80": 0.65073,
-            "81": 0.64645,
-            "82": 0.65405,
-            "83": 0.65107,
-            "84": 0.64883,
-            "85": 0.94272,
-            "86": 0.65641,
-            "87": 0.99204,
-            "88": 0.96199,
-            "89": 0.64856,
-            "90": 0.65165,
-            "91": 0.65163,
-            "92": 0.6506,
-            "93": 0.64828,
-            "94": 0.64682,
-            "95": 1.01586,
-            "96": 1.04151,
-            "97": 0.65481,
-            "98": 0.64703,
-            "99": 0.64964,
-            "100": 0.65343
+            "1": 25.75145,
+            "2": 0.68955,
+            "3": 0.62891,
+            "4": 0.62371,
+            "5": 0.64907,
+            "6": 0.63218,
+            "7": 0.66755,
+            "8": 0.61813,
+            "9": 0.59993,
+            "10": 0.59659,
+            "11": 0.60388,
+            "12": 0.60369,
+            "13": 1.0243,
+            "14": 1.00512,
+            "15": 0.61333,
+            "16": 0.61377,
+            "17": 0.6103,
+            "18": 0.60779,
+            "19": 0.6087,
+            "20": 0.60685,
+            "21": 0.61179,
+            "22": 0.61036,
+            "23": 0.60843,
+            "24": 0.61334,
+            "25": 0.61104,
+            "26": 0.60721,
+            "27": 0.60906,
+            "28": 0.61093,
+            "29": 0.60885,
+            "30": 0.60331,
+            "31": 0.60347,
+            "32": 0.61091,
+            "33": 0.60942,
+            "34": 0.59484,
+            "35": 0.59387,
+            "36": 0.59382,
+            "37": 0.60178,
+            "38": 0.59578,
+            "39": 0.59527,
+            "40": 0.59259,
+            "41": 0.65592,
+            "42": 0.60449,
+            "43": 0.59683,
+            "44": 0.59604,
+            "45": 0.59257,
+            "46": 0.59555,
+            "47": 0.59173,
+            "48": 0.58982,
+            "49": 0.59611,
+            "50": 0.59259,
+            "51": 0.6131,
+            "52": 0.61177,
+            "53": 0.59702,
+            "54": 0.59373,
+            "55": 0.59877,
+            "56": 0.59405,
+            "57": 0.59369,
+            "58": 0.59622,
+            "59": 0.59453,
+            "60": 0.59018,
+            "61": 0.59521,
+            "62": 0.59435,
+            "63": 0.59412,
+            "64": 0.5937,
+            "65": 0.5926,
+            "66": 0.61412,
+            "67": 0.60902,
+            "68": 0.59153,
+            "69": 0.59219,
+            "70": 0.59689,
+            "71": 0.59441,
+            "72": 0.59498,
+            "73": 0.59486,
+            "74": 0.5906,
+            "75": 0.59758,
+            "76": 0.59428,
+            "77": 0.60149,
+            "78": 0.59424,
+            "79": 0.59801,
+            "80": 0.59552,
+            "81": 0.60182,
+            "82": 0.58057,
+            "83": 0.58573,
+            "84": 0.58157,
+            "85": 0.93106,
+            "86": 0.58378,
+            "87": 1.02253,
+            "88": 0.60509,
+            "89": 1.03608,
+            "90": 0.59228,
+            "91": 0.59375,
+            "92": 0.59564,
+            "93": 0.59607,
+            "94": 0.59269,
+            "95": 0.59143,
+            "96": 0.59188,
+            "97": 0.59202,
+            "98": 0.60085,
+            "99": 0.60637,
+            "100": 0.60502
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..54505a38bfd
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.36541,
+            "52": 7.12192,
+            "53": 7.09189,
+            "54": 7.22759,
+            "55": 7.13584,
+            "56": 7.20822,
+            "57": 7.31316,
+            "58": 6.99088,
+            "59": 7.09934,
+            "60": 7.12683,
+            "61": 7.1014,
+            "62": 7.23954,
+            "63": 7.14417,
+            "64": 7.06836,
+            "65": 6.98412,
+            "66": 7.03768,
+            "67": 7.02847,
+            "68": 7.1299,
+            "69": 7.01456,
+            "70": 7.04997,
+            "71": 6.89408,
+            "72": 6.98553,
+            "73": 6.96694,
+            "74": 6.90297,
+            "75": 7.0574,
+            "76": 6.9581,
+            "77": 7.06903,
+            "78": 7.02133,
+            "79": 6.8504,
+            "80": 6.91935,
+            "81": 6.95874,
+            "82": 7.04745,
+            "83": 6.98522,
+            "84": 6.99712,
+            "85": 6.83565,
+            "86": 7.04156,
+            "87": 6.96476,
+            "88": 6.89883,
+            "89": 6.80051,
+            "90": 7.22593,
+            "91": 6.70562,
+            "92": 7.0381,
+            "93": 6.88685,
+            "94": 7.03908,
+            "95": 6.84815,
+            "96": 6.95281,
+            "97": 6.94344,
+            "98": 6.86987,
+            "99": 6.99502,
+            "100": 6.96683
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41112.0,
+            "52": 43837.0,
+            "53": 43913.0,
+            "54": 41704.0,
+            "55": 43870.0,
+            "56": 43209.0,
+            "57": 42636.0,
+            "58": 43841.0,
+            "59": 44630.0,
+            "60": 41219.0,
+            "61": 39702.0,
+            "62": 44739.0,
+            "63": 44651.0,
+            "64": 45372.0,
+            "65": 44682.0,
+            "66": 45351.0,
+            "67": 43174.0,
+            "68": 42502.0,
+            "69": 43834.0,
+            "70": 45514.0,
+            "71": 43291.0,
+            "72": 44767.0,
+            "73": 45384.0,
+            "74": 42457.0,
+            "75": 44673.0,
+            "76": 43876.0,
+            "77": 42026.0,
+            "78": 40350.0,
+            "79": 38918.0,
+            "80": 41092.0,
+            "81": 45364.0,
+            "82": 43198.0,
+            "83": 38467.0,
+            "84": 42477.0,
+            "85": 43981.0,
+            "86": 45667.0,
+            "87": 40863.0,
+            "88": 41772.0,
+            "89": 41104.0,
+            "90": 44669.0,
+            "91": 46134.0,
+            "92": 41634.0,
+            "93": 43241.0,
+            "94": 39538.0,
+            "95": 43915.0,
+            "96": 44683.0,
+            "97": 45405.0,
+            "98": 41791.0,
+            "99": 45414.0,
+            "100": 42458.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1133037568.0,
+            "52": 1133037568.0,
+            "53": 1133037568.0,
+            "54": 1133037568.0,
+            "55": 1133037568.0,
+            "56": 1133037568.0,
+            "57": 1133037568.0,
+            "58": 1133037568.0,
+            "59": 1133037568.0,
+            "60": 1133037568.0,
+            "61": 1133037568.0,
+            "62": 1133037568.0,
+            "63": 1133037568.0,
+            "64": 1133037568.0,
+            "65": 1133037568.0,
+            "66": 1133037568.0,
+            "67": 1133037568.0,
+            "68": 1133037568.0,
+            "69": 1133037568.0,
+            "70": 1133037568.0,
+            "71": 1133037568.0,
+            "72": 1133037568.0,
+            "73": 1133037568.0,
+            "74": 1133037568.0,
+            "75": 1133037568.0,
+            "76": 1133037568.0,
+            "77": 1133037568.0,
+            "78": 1133037568.0,
+            "79": 1133037568.0,
+            "80": 1133037568.0,
+            "81": 1133037568.0,
+            "82": 1133037568.0,
+            "83": 1133037568.0,
+            "84": 1133037568.0,
+            "85": 1133037568.0,
+            "86": 1133037568.0,
+            "87": 1133037568.0,
+            "88": 1133037568.0,
+            "89": 1133037568.0,
+            "90": 1133037568.0,
+            "91": 1133037568.0,
+            "92": 1133037568.0,
+            "93": 1133037568.0,
+            "94": 1133037568.0,
+            "95": 1133037568.0,
+            "96": 1133037568.0,
+            "97": 1133037568.0,
+            "98": 1133037568.0,
+            "99": 1133037568.0,
+            "100": 1133037568.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1863903744.0,
+            "52": 1863904768.0,
+            "53": 1863904768.0,
+            "54": 1863904768.0,
+            "55": 1863904768.0,
+            "56": 1863904768.0,
+            "57": 1863904768.0,
+            "58": 1863904768.0,
+            "59": 1863904768.0,
+            "60": 1863904768.0,
+            "61": 1863904768.0,
+            "62": 1863904768.0,
+            "63": 1863904768.0,
+            "64": 1863904768.0,
+            "65": 1863904768.0,
+            "66": 1863904768.0,
+            "67": 1863904768.0,
+            "68": 1863904768.0,
+            "69": 1863904768.0,
+            "70": 1863904768.0,
+            "71": 1863904768.0,
+            "72": 1863904768.0,
+            "73": 1863904768.0,
+            "74": 1863904768.0,
+            "75": 1863904768.0,
+            "76": 1863904768.0,
+            "77": 1863904768.0,
+            "78": 1863904768.0,
+            "79": 1863904768.0,
+            "80": 1863904768.0,
+            "81": 1863904768.0,
+            "82": 1863904768.0,
+            "83": 1863904768.0,
+            "84": 1863904768.0,
+            "85": 1863904768.0,
+            "86": 1863904768.0,
+            "87": 1863904768.0,
+            "88": 1863904768.0,
+            "89": 1863904768.0,
+            "90": 1863904768.0,
+            "91": 1863904768.0,
+            "92": 1863904768.0,
+            "93": 1863904768.0,
+            "94": 1863904768.0,
+            "95": 1863904768.0,
+            "96": 1863904768.0,
+            "97": 1863904768.0,
+            "98": 1863904768.0,
+            "99": 1863904768.0,
+            "100": 1863904768.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 23.83009,
+            "52": 0.76142,
+            "53": 0.67196,
+            "54": 0.6081,
+            "55": 0.60646,
+            "56": 0.60713,
+            "57": 0.6272,
+            "58": 0.62763,
+            "59": 0.62688,
+            "60": 0.62193,
+            "61": 0.62167,
+            "62": 0.61817,
+            "63": 0.61775,
+            "64": 0.5974,
+            "65": 0.60155,
+            "66": 0.60696,
+            "67": 0.59768,
+            "68": 0.59371,
+            "69": 0.59479,
+            "70": 0.59367,
+            "71": 0.60012,
+            "72": 0.5983,
+            "73": 0.60139,
+            "74": 0.60001,
+            "75": 0.59852,
+            "76": 0.59622,
+            "77": 0.59604,
+            "78": 0.59666,
+            "79": 0.6022,
+            "80": 0.62234,
+            "81": 0.62179,
+            "82": 0.62692,
+            "83": 0.62266,
+            "84": 0.6182,
+            "85": 0.62589,
+            "86": 0.62575,
+            "87": 0.59517,
+            "88": 0.60178,
+            "89": 0.60479,
+            "90": 0.61692,
+            "91": 0.60273,
+            "92": 0.61308,
+            "93": 0.6039,
+            "94": 0.62096,
+            "95": 0.62166,
+            "96": 0.61878,
+            "97": 0.6187,
+            "98": 0.6215,
+            "99": 0.62325,
+            "100": 0.61948
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json
index 5aebe0d3c7a..8476c973a1a 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.34494, "5": 9.38931, "10": 9.01569, "15": 8.64631, "20": 8.26966, "25": 7.99493, "30": 7.87492, "35": 7.65834, "40": 7.50302, "45": 7.36143, "50": 7.19205, "55": 7.16852, "60": 7.16587, "65": 7.00099, "70": 7.07162, "75": 7.07611, "80": 6.95251, "85": 6.8641, "90": 7.25457, "95": 6.8601, "100": 6.99745}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43305.0, "5": 45413.0, "10": 45379.0, "15": 43956.0, "20": 44818.0, "25": 42745.0, "30": 44042.0, "35": 43297.0, "40": 43251.0, "45": 43345.0, "50": 43415.0, "55": 43960.0, "60": 41326.0, "65": 44730.0, "70": 45543.0, "75": 44684.0, "80": 41118.0, "85": 44024.0, "90": 44744.0, "95": 44092.0, "100": 42500.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4158515200.0, "5": 4158515200.0, "10": 4158515200.0, "15": 4158515200.0, "20": 4158515200.0, "25": 4158515200.0, "30": 4158515200.0, "35": 4158515200.0, "40": 4158515200.0, "45": 4158515200.0, "50": 4158515200.0, "55": 4158515200.0, "60": 4158515200.0, "65": 4158515200.0, "70": 4158515200.0, "75": 4158515200.0, "80": 4158515200.0, "85": 4158515200.0, "90": 4158515200.0, "95": 4158515200.0, "100": 4158515200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4349380608.0, "5": 6187556864.0, "10": 6187556864.0, "15": 6187556864.0, "20": 6187556864.0, "25": 6187556864.0, "30": 6187556864.0, "35": 6187556864.0, "40": 6187556864.0, "45": 6187556864.0, "50": 6187556864.0, "55": 6187556864.0, "60": 6187556864.0, "65": 6187556864.0, "70": 6187556864.0, "75": 6187556864.0, "80": 6187556864.0, "85": 6187556864.0, "90": 6187556864.0, "95": 6187556864.0, "100": 6187556864.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.48832, "5": 0.2262, "10": 0.22726, "15": 0.22564, "20": 0.22623, "25": 0.22711, "30": 0.22781, "35": 0.2271, "40": 0.22647, "45": 0.2358, "50": 0.22658, "55": 0.22646, "60": 0.22506, "65": 0.2281, "70": 0.22663, "75": 0.2252, "80": 0.22659, "85": 0.22661, "90": 0.23186, "95": 0.24827, "100": 0.23899}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.34494,
+            "2": 10.36431,
+            "3": 9.73158,
+            "4": 9.57928,
+            "5": 9.38931,
+            "6": 9.41074,
+            "7": 9.30545,
+            "8": 9.24868,
+            "9": 9.09349,
+            "10": 9.01569,
+            "11": 8.86286,
+            "12": 8.79096,
+            "13": 8.80892,
+            "14": 8.67669,
+            "15": 8.64631,
+            "16": 8.5398,
+            "17": 8.47895,
+            "18": 8.38945,
+            "19": 8.36156,
+            "20": 8.26966,
+            "21": 8.26333,
+            "22": 8.15066,
+            "23": 8.08893,
+            "24": 8.12421,
+            "25": 7.99493,
+            "26": 8.08494,
+            "27": 7.87755,
+            "28": 7.95863,
+            "29": 7.79585,
+            "30": 7.87492,
+            "31": 7.83245,
+            "32": 7.69489,
+            "33": 7.78469,
+            "34": 7.55767,
+            "35": 7.65834,
+            "36": 7.52881,
+            "37": 7.44912,
+            "38": 7.50398,
+            "39": 7.48056,
+            "40": 7.50302,
+            "41": 7.39767,
+            "42": 7.37206,
+            "43": 7.44301,
+            "44": 7.3811,
+            "45": 7.36143,
+            "46": 7.29415,
+            "47": 7.47498,
+            "48": 7.29564,
+            "49": 7.36092,
+            "50": 7.19205,
+            "51": 7.38769,
+            "52": 7.13773,
+            "53": 7.125,
+            "54": 7.23668,
+            "55": 7.16852,
+            "56": 7.22884,
+            "57": 7.34699,
+            "58": 7.03128,
+            "59": 7.1229,
+            "60": 7.16587,
+            "61": 7.1174,
+            "62": 7.26837,
+            "63": 7.16759,
+            "64": 7.08376,
+            "65": 7.00099,
+            "66": 7.07203,
+            "67": 7.05971,
+            "68": 7.14618,
+            "69": 7.03944,
+            "70": 7.07162,
+            "71": 6.91653,
+            "72": 7.02025,
+            "73": 6.9904,
+            "74": 6.9146,
+            "75": 7.07611,
+            "76": 6.97098,
+            "77": 7.08446,
+            "78": 7.03608,
+            "79": 6.88325,
+            "80": 6.95251,
+            "81": 6.985,
+            "82": 7.06843,
+            "83": 7.00882,
+            "84": 7.0181,
+            "85": 6.8641,
+            "86": 7.04979,
+            "87": 6.99342,
+            "88": 6.9238,
+            "89": 6.82406,
+            "90": 7.25457,
+            "91": 6.7226,
+            "92": 7.05372,
+            "93": 6.91688,
+            "94": 7.066,
+            "95": 6.8601,
+            "96": 6.98742,
+            "97": 6.96796,
+            "98": 6.89964,
+            "99": 7.02766,
+            "100": 6.99745
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43305.0,
+            "2": 44091.0,
+            "3": 44794.0,
+            "4": 42436.0,
+            "5": 45413.0,
+            "6": 40989.0,
+            "7": 43195.0,
+            "8": 45462.0,
+            "9": 42551.0,
+            "10": 45379.0,
+            "11": 44016.0,
+            "12": 44629.0,
+            "13": 43937.0,
+            "14": 46250.0,
+            "15": 43956.0,
+            "16": 41728.0,
+            "17": 43873.0,
+            "18": 44716.0,
+            "19": 42648.0,
+            "20": 44818.0,
+            "21": 44812.0,
+            "22": 41883.0,
+            "23": 45468.0,
+            "24": 43112.0,
+            "25": 42745.0,
+            "26": 43949.0,
+            "27": 46268.0,
+            "28": 46429.0,
+            "29": 46199.0,
+            "30": 44042.0,
+            "31": 41264.0,
+            "32": 43413.0,
+            "33": 45478.0,
+            "34": 43375.0,
+            "35": 43297.0,
+            "36": 42545.0,
+            "37": 40689.0,
+            "38": 42575.0,
+            "39": 44772.0,
+            "40": 43251.0,
+            "41": 44707.0,
+            "42": 43261.0,
+            "43": 45506.0,
+            "44": 44652.0,
+            "45": 43345.0,
+            "46": 43935.0,
+            "47": 42506.0,
+            "48": 44693.0,
+            "49": 43200.0,
+            "50": 43415.0,
+            "51": 41174.0,
+            "52": 43885.0,
+            "53": 43959.0,
+            "54": 41961.0,
+            "55": 43960.0,
+            "56": 43269.0,
+            "57": 42561.0,
+            "58": 43898.0,
+            "59": 44654.0,
+            "60": 41326.0,
+            "61": 39744.0,
+            "62": 44774.0,
+            "63": 44682.0,
+            "64": 45396.0,
+            "65": 44730.0,
+            "66": 45388.0,
+            "67": 43196.0,
+            "68": 42556.0,
+            "69": 43825.0,
+            "70": 45543.0,
+            "71": 43407.0,
+            "72": 44832.0,
+            "73": 45412.0,
+            "74": 42502.0,
+            "75": 44684.0,
+            "76": 43926.0,
+            "77": 42100.0,
+            "78": 40525.0,
+            "79": 38954.0,
+            "80": 41118.0,
+            "81": 45412.0,
+            "82": 43238.0,
+            "83": 38495.0,
+            "84": 42524.0,
+            "85": 44024.0,
+            "86": 45749.0,
+            "87": 41116.0,
+            "88": 41798.0,
+            "89": 41078.0,
+            "90": 44744.0,
+            "91": 46266.0,
+            "92": 41865.0,
+            "93": 43254.0,
+            "94": 39588.0,
+            "95": 44092.0,
+            "96": 44732.0,
+            "97": 45474.0,
+            "98": 41859.0,
+            "99": 45537.0,
+            "100": 42500.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4158515200.0,
+            "2": 4158515200.0,
+            "3": 4158515200.0,
+            "4": 4158515200.0,
+            "5": 4158515200.0,
+            "6": 4158515200.0,
+            "7": 4158515200.0,
+            "8": 4158515200.0,
+            "9": 4158515200.0,
+            "10": 4158515200.0,
+            "11": 4158515200.0,
+            "12": 4158515200.0,
+            "13": 4158515200.0,
+            "14": 4158515200.0,
+            "15": 4158515200.0,
+            "16": 4158515200.0,
+            "17": 4158515200.0,
+            "18": 4158515200.0,
+            "19": 4158515200.0,
+            "20": 4158515200.0,
+            "21": 4158515200.0,
+            "22": 4158515200.0,
+            "23": 4158515200.0,
+            "24": 4158515200.0,
+            "25": 4158515200.0,
+            "26": 4158515200.0,
+            "27": 4158515200.0,
+            "28": 4158515200.0,
+            "29": 4158515200.0,
+            "30": 4158515200.0,
+            "31": 4158515200.0,
+            "32": 4158515200.0,
+            "33": 4158515200.0,
+            "34": 4158515200.0,
+            "35": 4158515200.0,
+            "36": 4158515200.0,
+            "37": 4158515200.0,
+            "38": 4158515200.0,
+            "39": 4158515200.0,
+            "40": 4158515200.0,
+            "41": 4158515200.0,
+            "42": 4158515200.0,
+            "43": 4158515200.0,
+            "44": 4158515200.0,
+            "45": 4158515200.0,
+            "46": 4158515200.0,
+            "47": 4158515200.0,
+            "48": 4158515200.0,
+            "49": 4158515200.0,
+            "50": 4158515200.0,
+            "51": 4158515200.0,
+            "52": 4158515200.0,
+            "53": 4158515200.0,
+            "54": 4158515200.0,
+            "55": 4158515200.0,
+            "56": 4158515200.0,
+            "57": 4158515200.0,
+            "58": 4158515200.0,
+            "59": 4158515200.0,
+            "60": 4158515200.0,
+            "61": 4158515200.0,
+            "62": 4158515200.0,
+            "63": 4158515200.0,
+            "64": 4158515200.0,
+            "65": 4158515200.0,
+            "66": 4158515200.0,
+            "67": 4158515200.0,
+            "68": 4158515200.0,
+            "69": 4158515200.0,
+            "70": 4158515200.0,
+            "71": 4158515200.0,
+            "72": 4158515200.0,
+            "73": 4158515200.0,
+            "74": 4158515200.0,
+            "75": 4158515200.0,
+            "76": 4158515200.0,
+            "77": 4158515200.0,
+            "78": 4158515200.0,
+            "79": 4158515200.0,
+            "80": 4158515200.0,
+            "81": 4158515200.0,
+            "82": 4158515200.0,
+            "83": 4158515200.0,
+            "84": 4158515200.0,
+            "85": 4158515200.0,
+            "86": 4158515200.0,
+            "87": 4158515200.0,
+            "88": 4158515200.0,
+            "89": 4158515200.0,
+            "90": 4158515200.0,
+            "91": 4158515200.0,
+            "92": 4158515200.0,
+            "93": 4158515200.0,
+            "94": 4158515200.0,
+            "95": 4158515200.0,
+            "96": 4158515200.0,
+            "97": 4158515200.0,
+            "98": 4158515200.0,
+            "99": 4158515200.0,
+            "100": 4158515200.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4349380608.0,
+            "2": 6185459712.0,
+            "3": 6187556864.0,
+            "4": 6187556864.0,
+            "5": 6187556864.0,
+            "6": 6187556864.0,
+            "7": 6187556864.0,
+            "8": 6187556864.0,
+            "9": 6187556864.0,
+            "10": 6187556864.0,
+            "11": 6187556864.0,
+            "12": 6187556864.0,
+            "13": 6187556864.0,
+            "14": 6187556864.0,
+            "15": 6187556864.0,
+            "16": 6187556864.0,
+            "17": 6187556864.0,
+            "18": 6187556864.0,
+            "19": 6187556864.0,
+            "20": 6187556864.0,
+            "21": 6187556864.0,
+            "22": 6187556864.0,
+            "23": 6187556864.0,
+            "24": 6187556864.0,
+            "25": 6187556864.0,
+            "26": 6187556864.0,
+            "27": 6187556864.0,
+            "28": 6187556864.0,
+            "29": 6187556864.0,
+            "30": 6187556864.0,
+            "31": 6187556864.0,
+            "32": 6187556864.0,
+            "33": 6187556864.0,
+            "34": 6187556864.0,
+            "35": 6187556864.0,
+            "36": 6187556864.0,
+            "37": 6187556864.0,
+            "38": 6187556864.0,
+            "39": 6187556864.0,
+            "40": 6187556864.0,
+            "41": 6187556864.0,
+            "42": 6187556864.0,
+            "43": 6187556864.0,
+            "44": 6187556864.0,
+            "45": 6187556864.0,
+            "46": 6187556864.0,
+            "47": 6187556864.0,
+            "48": 6187556864.0,
+            "49": 6187556864.0,
+            "50": 6187556864.0,
+            "51": 6187556864.0,
+            "52": 6187556864.0,
+            "53": 6187556864.0,
+            "54": 6187556864.0,
+            "55": 6187556864.0,
+            "56": 6187556864.0,
+            "57": 6187556864.0,
+            "58": 6187556864.0,
+            "59": 6187556864.0,
+            "60": 6187556864.0,
+            "61": 6187556864.0,
+            "62": 6187556864.0,
+            "63": 6187556864.0,
+            "64": 6187556864.0,
+            "65": 6187556864.0,
+            "66": 6187556864.0,
+            "67": 6187556864.0,
+            "68": 6187556864.0,
+            "69": 6187556864.0,
+            "70": 6187556864.0,
+            "71": 6187556864.0,
+            "72": 6187556864.0,
+            "73": 6187556864.0,
+            "74": 6187556864.0,
+            "75": 6187556864.0,
+            "76": 6187556864.0,
+            "77": 6187556864.0,
+            "78": 6187556864.0,
+            "79": 6187556864.0,
+            "80": 6187556864.0,
+            "81": 6187556864.0,
+            "82": 6187556864.0,
+            "83": 6187556864.0,
+            "84": 6187556864.0,
+            "85": 6187556864.0,
+            "86": 6187556864.0,
+            "87": 6187556864.0,
+            "88": 6187556864.0,
+            "89": 6187556864.0,
+            "90": 6187556864.0,
+            "91": 6187556864.0,
+            "92": 6187556864.0,
+            "93": 6187556864.0,
+            "94": 6187556864.0,
+            "95": 6187556864.0,
+            "96": 6187556864.0,
+            "97": 6187556864.0,
+            "98": 6187556864.0,
+            "99": 6187556864.0,
+            "100": 6187556864.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 15.70772,
+            "2": 0.54719,
+            "3": 0.22124,
+            "4": 0.2113,
+            "5": 0.21574,
+            "6": 0.20899,
+            "7": 0.21163,
+            "8": 0.20932,
+            "9": 0.20931,
+            "10": 0.20843,
+            "11": 0.20865,
+            "12": 0.20976,
+            "13": 0.21153,
+            "14": 0.21141,
+            "15": 0.22881,
+            "16": 0.2095,
+            "17": 0.22252,
+            "18": 0.21238,
+            "19": 0.21011,
+            "20": 0.21012,
+            "21": 0.20824,
+            "22": 0.21048,
+            "23": 0.21174,
+            "24": 0.21129,
+            "25": 0.21316,
+            "26": 0.2111,
+            "27": 0.20884,
+            "28": 0.20897,
+            "29": 0.2111,
+            "30": 0.20827,
+            "31": 0.20796,
+            "32": 0.20813,
+            "33": 0.21328,
+            "34": 0.21312,
+            "35": 0.20816,
+            "36": 0.2194,
+            "37": 0.21822,
+            "38": 0.21033,
+            "39": 0.20794,
+            "40": 0.2076,
+            "41": 0.21268,
+            "42": 0.23004,
+            "43": 0.21754,
+            "44": 0.21505,
+            "45": 0.21734,
+            "46": 0.21516,
+            "47": 0.21219,
+            "48": 0.21234,
+            "49": 0.21349,
+            "50": 0.21178,
+            "51": 0.20738,
+            "52": 0.2076,
+            "53": 0.20803,
+            "54": 0.20714,
+            "55": 0.20879,
+            "56": 0.66578,
+            "57": 0.21121,
+            "58": 0.20847,
+            "59": 0.20864,
+            "60": 0.20774,
+            "61": 0.2096,
+            "62": 0.20814,
+            "63": 0.20821,
+            "64": 0.20754,
+            "65": 0.20865,
+            "66": 0.20774,
+            "67": 0.20742,
+            "68": 0.20782,
+            "69": 0.20843,
+            "70": 0.20816,
+            "71": 0.20717,
+            "72": 0.20871,
+            "73": 0.20889,
+            "74": 0.20819,
+            "75": 0.20754,
+            "76": 0.20875,
+            "77": 0.20921,
+            "78": 0.2087,
+            "79": 0.20863,
+            "80": 0.20792,
+            "81": 0.20726,
+            "82": 0.20882,
+            "83": 0.20819,
+            "84": 0.20781,
+            "85": 0.20789,
+            "86": 0.20766,
+            "87": 0.20795,
+            "88": 0.20781,
+            "89": 0.20815,
+            "90": 0.20721,
+            "91": 0.20799,
+            "92": 0.20836,
+            "93": 0.20739,
+            "94": 0.20893,
+            "95": 0.20842,
+            "96": 0.20769,
+            "97": 0.2107,
+            "98": 0.20784,
+            "99": 0.20696,
+            "100": 0.20698
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json
index e788215b20a..8c2893286fd 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 7.09171,
-            "2": 0.19937,
-            "3": 0.15739,
-            "4": 0.15626,
-            "5": 0.15726,
-            "6": 0.16596,
-            "7": 0.15866,
-            "8": 0.16018,
-            "9": 0.16342,
-            "10": 0.15848,
-            "11": 0.1563,
-            "12": 0.15949,
-            "13": 0.16471,
-            "14": 0.1653,
-            "15": 0.15904,
-            "16": 0.15673,
-            "17": 0.15845,
-            "18": 0.15591,
-            "19": 0.15809,
-            "20": 0.1593,
-            "21": 0.15934,
-            "22": 0.1588,
-            "23": 0.15615,
-            "24": 0.15816,
-            "25": 0.15513,
-            "26": 0.16623,
-            "27": 0.1635,
-            "28": 0.15796,
-            "29": 0.15745,
-            "30": 0.15659,
-            "31": 0.15757,
-            "32": 0.15805,
-            "33": 0.16121,
-            "34": 0.15918,
-            "35": 0.15628,
-            "36": 0.16015,
-            "37": 0.15954,
-            "38": 0.15711,
-            "39": 0.16207,
-            "40": 0.16543,
-            "41": 0.16329,
-            "42": 0.15895,
-            "43": 0.15771,
-            "44": 0.16372,
-            "45": 0.15827,
-            "46": 0.16205,
-            "47": 0.16175,
-            "48": 0.15754,
-            "49": 0.15916,
-            "50": 0.15618,
-            "51": 0.15693,
-            "52": 0.16151,
-            "53": 0.16143,
-            "54": 0.16281,
-            "55": 0.15891,
-            "56": 0.16235,
-            "57": 0.16248,
-            "58": 0.16949,
-            "59": 0.16264,
-            "60": 0.15666,
-            "61": 0.19456,
-            "62": 0.19414,
-            "63": 0.16346,
-            "64": 0.16675,
-            "65": 0.16803,
-            "66": 0.1748,
-            "67": 0.16431,
-            "68": 0.1587,
-            "69": 0.16219,
-            "70": 0.16457,
-            "71": 0.1716,
-            "72": 0.16546,
-            "73": 0.16711,
-            "74": 0.16142,
-            "75": 0.17042,
-            "76": 0.17092,
-            "77": 0.16596,
-            "78": 0.16577,
-            "79": 0.15743,
-            "80": 0.15851,
-            "81": 0.15791,
-            "82": 0.16001,
-            "83": 0.15783,
-            "84": 0.15788,
-            "85": 0.15665,
-            "86": 0.16107,
-            "87": 0.15608,
-            "88": 0.15928,
-            "89": 0.16138,
-            "90": 0.15621,
-            "91": 0.15886,
-            "92": 0.15808,
-            "93": 0.15911,
-            "94": 0.16777,
-            "95": 0.16017,
-            "96": 0.15821,
-            "97": 0.15642,
-            "98": 0.16061,
-            "99": 0.157,
-            "100": 0.15975
+            "1": 21.7472,
+            "2": 0.26947,
+            "3": 0.15906,
+            "4": 0.14381,
+            "5": 0.13718,
+            "6": 0.13541,
+            "7": 0.13627,
+            "8": 0.13552,
+            "9": 0.15313,
+            "10": 0.15332,
+            "11": 0.15293,
+            "12": 0.14699,
+            "13": 0.13522,
+            "14": 0.13752,
+            "15": 0.14123,
+            "16": 0.14245,
+            "17": 0.14135,
+            "18": 0.13773,
+            "19": 0.13696,
+            "20": 0.13686,
+            "21": 0.13916,
+            "22": 0.13592,
+            "23": 0.13723,
+            "24": 0.13489,
+            "25": 0.13734,
+            "26": 0.14011,
+            "27": 0.13977,
+            "28": 0.13653,
+            "29": 0.13981,
+            "30": 0.13581,
+            "31": 0.13818,
+            "32": 0.13543,
+            "33": 0.13872,
+            "34": 0.13879,
+            "35": 0.14257,
+            "36": 0.13909,
+            "37": 0.259,
+            "38": 0.15725,
+            "39": 0.16376,
+            "40": 0.13972,
+            "41": 0.13871,
+            "42": 0.13723,
+            "43": 0.24968,
+            "44": 0.13741,
+            "45": 0.17732,
+            "46": 0.13888,
+            "47": 0.13561,
+            "48": 0.17199,
+            "49": 0.14457,
+            "50": 0.14057,
+            "51": 0.13853,
+            "52": 0.53484,
+            "53": 0.13659,
+            "54": 0.13534,
+            "55": 0.13612,
+            "56": 0.13281,
+            "57": 0.1356,
+            "58": 0.13222,
+            "59": 0.13569,
+            "60": 0.13553,
+            "61": 0.13464,
+            "62": 0.13388,
+            "63": 0.13695,
+            "64": 0.13201,
+            "65": 0.13601,
+            "66": 0.13229,
+            "67": 0.13532,
+            "68": 0.13224,
+            "69": 0.13444,
+            "70": 0.13376,
+            "71": 0.13581,
+            "72": 0.13302,
+            "73": 0.13502,
+            "74": 0.13267,
+            "75": 0.13531,
+            "76": 0.13332,
+            "77": 0.13635,
+            "78": 0.13294,
+            "79": 0.13456,
+            "80": 0.13311,
+            "81": 0.13594,
+            "82": 0.13241,
+            "83": 0.13659,
+            "84": 0.13211,
+            "85": 0.1359,
+            "86": 0.13243,
+            "87": 0.13479,
+            "88": 0.13306,
+            "89": 0.13564,
+            "90": 0.13326,
+            "91": 0.13434,
+            "92": 0.13257,
+            "93": 0.13697,
+            "94": 0.13578,
+            "95": 0.13676,
+            "96": 0.13248,
+            "97": 0.13516,
+            "98": 0.13424,
+            "99": 0.13587,
+            "100": 0.13365
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
index b9771639ebd..d0e9e9b3b5a 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.34494, "5": 9.38931, "10": 9.01569, "15": 8.64631, "20": 8.26966, "25": 7.99493, "30": 7.87492, "35": 7.65834, "40": 7.50302, "45": 7.36143, "50": 7.19205, "55": 7.16852, "60": 7.16587, "65": 7.00099, "70": 7.07162, "75": 7.07611, "80": 6.95251, "85": 6.8641, "90": 7.25457, "95": 6.8601, "100": 6.99745}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43305.0, "5": 45413.0, "10": 45379.0, "15": 43956.0, "20": 44818.0, "25": 42745.0, "30": 44042.0, "35": 43297.0, "40": 43251.0, "45": 43345.0, "50": 43415.0, "55": 43960.0, "60": 41326.0, "65": 44730.0, "70": 45543.0, "75": 44684.0, "80": 41118.0, "85": 44024.0, "90": 44744.0, "95": 44092.0, "100": 42500.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4158515200.0, "5": 4158515200.0, "10": 4158515200.0, "15": 4158515200.0, "20": 4158515200.0, "25": 4158515200.0, "30": 4158515200.0, "35": 4158515200.0, "40": 4158515200.0, "45": 4158515200.0, "50": 4158515200.0, "55": 4158515200.0, "60": 4158515200.0, "65": 4158515200.0, "70": 4158515200.0, "75": 4158515200.0, "80": 4158515200.0, "85": 4158515200.0, "90": 4158515200.0, "95": 4158515200.0, "100": 4158515200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4349380608.0, "5": 6186508288.0, "10": 6186508288.0, "15": 6186508288.0, "20": 6186508288.0, "25": 6186508288.0, "30": 6186508288.0, "35": 6186508288.0, "40": 6186508288.0, "45": 6186508288.0, "50": 6186508288.0, "55": 6186508288.0, "60": 6186508288.0, "65": 6186508288.0, "70": 6186508288.0, "75": 6186508288.0, "80": 6186508288.0, "85": 6186508288.0, "90": 6186508288.0, "95": 6186508288.0, "100": 6186508288.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.34153, "5": 0.23162, "10": 0.22893, "15": 0.23688, "20": 0.2316, "25": 0.22871, "30": 0.23008, "35": 0.22669, "40": 0.24999, "45": 0.22865, "50": 0.23226, "55": 0.22758, "60": 0.23004, "65": 0.22585, "70": 0.23272, "75": 0.22388, "80": 0.22441, "85": 0.22606, "90": 0.6846, "95": 0.22521, "100": 0.22591}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.34494,
+            "2": 10.36431,
+            "3": 9.73158,
+            "4": 9.57928,
+            "5": 9.38931,
+            "6": 9.41074,
+            "7": 9.30545,
+            "8": 9.24868,
+            "9": 9.09349,
+            "10": 9.01569,
+            "11": 8.86286,
+            "12": 8.79096,
+            "13": 8.80892,
+            "14": 8.67669,
+            "15": 8.64631,
+            "16": 8.5398,
+            "17": 8.47895,
+            "18": 8.38945,
+            "19": 8.36156,
+            "20": 8.26966,
+            "21": 8.26333,
+            "22": 8.15066,
+            "23": 8.08893,
+            "24": 8.12421,
+            "25": 7.99493,
+            "26": 8.08494,
+            "27": 7.87755,
+            "28": 7.95863,
+            "29": 7.79585,
+            "30": 7.87492,
+            "31": 7.83245,
+            "32": 7.69489,
+            "33": 7.78469,
+            "34": 7.55767,
+            "35": 7.65834,
+            "36": 7.52881,
+            "37": 7.44912,
+            "38": 7.50398,
+            "39": 7.48056,
+            "40": 7.50302,
+            "41": 7.39767,
+            "42": 7.37206,
+            "43": 7.44301,
+            "44": 7.3811,
+            "45": 7.36143,
+            "46": 7.29415,
+            "47": 7.47498,
+            "48": 7.29564,
+            "49": 7.36092,
+            "50": 7.19205,
+            "51": 7.38769,
+            "52": 7.13773,
+            "53": 7.125,
+            "54": 7.23668,
+            "55": 7.16852,
+            "56": 7.22884,
+            "57": 7.34699,
+            "58": 7.03128,
+            "59": 7.1229,
+            "60": 7.16587,
+            "61": 7.1174,
+            "62": 7.26837,
+            "63": 7.16759,
+            "64": 7.08376,
+            "65": 7.00099,
+            "66": 7.07203,
+            "67": 7.05971,
+            "68": 7.14618,
+            "69": 7.03944,
+            "70": 7.07162,
+            "71": 6.91653,
+            "72": 7.02025,
+            "73": 6.9904,
+            "74": 6.9146,
+            "75": 7.07611,
+            "76": 6.97098,
+            "77": 7.08446,
+            "78": 7.03608,
+            "79": 6.88325,
+            "80": 6.95251,
+            "81": 6.985,
+            "82": 7.06843,
+            "83": 7.00882,
+            "84": 7.0181,
+            "85": 6.8641,
+            "86": 7.04979,
+            "87": 6.99342,
+            "88": 6.9238,
+            "89": 6.82406,
+            "90": 7.25457,
+            "91": 6.7226,
+            "92": 7.05372,
+            "93": 6.91688,
+            "94": 7.066,
+            "95": 6.8601,
+            "96": 6.98742,
+            "97": 6.96796,
+            "98": 6.89964,
+            "99": 7.02766,
+            "100": 6.99745
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43305.0,
+            "2": 44091.0,
+            "3": 44794.0,
+            "4": 42436.0,
+            "5": 45413.0,
+            "6": 40989.0,
+            "7": 43195.0,
+            "8": 45462.0,
+            "9": 42551.0,
+            "10": 45379.0,
+            "11": 44016.0,
+            "12": 44629.0,
+            "13": 43937.0,
+            "14": 46250.0,
+            "15": 43956.0,
+            "16": 41728.0,
+            "17": 43873.0,
+            "18": 44716.0,
+            "19": 42648.0,
+            "20": 44818.0,
+            "21": 44812.0,
+            "22": 41883.0,
+            "23": 45468.0,
+            "24": 43112.0,
+            "25": 42745.0,
+            "26": 43949.0,
+            "27": 46268.0,
+            "28": 46429.0,
+            "29": 46199.0,
+            "30": 44042.0,
+            "31": 41264.0,
+            "32": 43413.0,
+            "33": 45478.0,
+            "34": 43375.0,
+            "35": 43297.0,
+            "36": 42545.0,
+            "37": 40689.0,
+            "38": 42575.0,
+            "39": 44772.0,
+            "40": 43251.0,
+            "41": 44707.0,
+            "42": 43261.0,
+            "43": 45506.0,
+            "44": 44652.0,
+            "45": 43345.0,
+            "46": 43935.0,
+            "47": 42506.0,
+            "48": 44693.0,
+            "49": 43200.0,
+            "50": 43415.0,
+            "51": 41174.0,
+            "52": 43885.0,
+            "53": 43959.0,
+            "54": 41961.0,
+            "55": 43960.0,
+            "56": 43269.0,
+            "57": 42561.0,
+            "58": 43898.0,
+            "59": 44654.0,
+            "60": 41326.0,
+            "61": 39744.0,
+            "62": 44774.0,
+            "63": 44682.0,
+            "64": 45396.0,
+            "65": 44730.0,
+            "66": 45388.0,
+            "67": 43196.0,
+            "68": 42556.0,
+            "69": 43825.0,
+            "70": 45543.0,
+            "71": 43407.0,
+            "72": 44832.0,
+            "73": 45412.0,
+            "74": 42502.0,
+            "75": 44684.0,
+            "76": 43926.0,
+            "77": 42100.0,
+            "78": 40525.0,
+            "79": 38954.0,
+            "80": 41118.0,
+            "81": 45412.0,
+            "82": 43238.0,
+            "83": 38495.0,
+            "84": 42524.0,
+            "85": 44024.0,
+            "86": 45749.0,
+            "87": 41116.0,
+            "88": 41798.0,
+            "89": 41078.0,
+            "90": 44744.0,
+            "91": 46266.0,
+            "92": 41865.0,
+            "93": 43254.0,
+            "94": 39588.0,
+            "95": 44092.0,
+            "96": 44732.0,
+            "97": 45474.0,
+            "98": 41859.0,
+            "99": 45537.0,
+            "100": 42500.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4158515200.0,
+            "2": 4158515200.0,
+            "3": 4158515200.0,
+            "4": 4158515200.0,
+            "5": 4158515200.0,
+            "6": 4158515200.0,
+            "7": 4158515200.0,
+            "8": 4158515200.0,
+            "9": 4158515200.0,
+            "10": 4158515200.0,
+            "11": 4158515200.0,
+            "12": 4158515200.0,
+            "13": 4158515200.0,
+            "14": 4158515200.0,
+            "15": 4158515200.0,
+            "16": 4158515200.0,
+            "17": 4158515200.0,
+            "18": 4158515200.0,
+            "19": 4158515200.0,
+            "20": 4158515200.0,
+            "21": 4158515200.0,
+            "22": 4158515200.0,
+            "23": 4158515200.0,
+            "24": 4158515200.0,
+            "25": 4158515200.0,
+            "26": 4158515200.0,
+            "27": 4158515200.0,
+            "28": 4158515200.0,
+            "29": 4158515200.0,
+            "30": 4158515200.0,
+            "31": 4158515200.0,
+            "32": 4158515200.0,
+            "33": 4158515200.0,
+            "34": 4158515200.0,
+            "35": 4158515200.0,
+            "36": 4158515200.0,
+            "37": 4158515200.0,
+            "38": 4158515200.0,
+            "39": 4158515200.0,
+            "40": 4158515200.0,
+            "41": 4158515200.0,
+            "42": 4158515200.0,
+            "43": 4158515200.0,
+            "44": 4158515200.0,
+            "45": 4158515200.0,
+            "46": 4158515200.0,
+            "47": 4158515200.0,
+            "48": 4158515200.0,
+            "49": 4158515200.0,
+            "50": 4158515200.0,
+            "51": 4158515200.0,
+            "52": 4158515200.0,
+            "53": 4158515200.0,
+            "54": 4158515200.0,
+            "55": 4158515200.0,
+            "56": 4158515200.0,
+            "57": 4158515200.0,
+            "58": 4158515200.0,
+            "59": 4158515200.0,
+            "60": 4158515200.0,
+            "61": 4158515200.0,
+            "62": 4158515200.0,
+            "63": 4158515200.0,
+            "64": 4158515200.0,
+            "65": 4158515200.0,
+            "66": 4158515200.0,
+            "67": 4158515200.0,
+            "68": 4158515200.0,
+            "69": 4158515200.0,
+            "70": 4158515200.0,
+            "71": 4158515200.0,
+            "72": 4158515200.0,
+            "73": 4158515200.0,
+            "74": 4158515200.0,
+            "75": 4158515200.0,
+            "76": 4158515200.0,
+            "77": 4158515200.0,
+            "78": 4158515200.0,
+            "79": 4158515200.0,
+            "80": 4158515200.0,
+            "81": 4158515200.0,
+            "82": 4158515200.0,
+            "83": 4158515200.0,
+            "84": 4158515200.0,
+            "85": 4158515200.0,
+            "86": 4158515200.0,
+            "87": 4158515200.0,
+            "88": 4158515200.0,
+            "89": 4158515200.0,
+            "90": 4158515200.0,
+            "91": 4158515200.0,
+            "92": 4158515200.0,
+            "93": 4158515200.0,
+            "94": 4158515200.0,
+            "95": 4158515200.0,
+            "96": 4158515200.0,
+            "97": 4158515200.0,
+            "98": 4158515200.0,
+            "99": 4158515200.0,
+            "100": 4158515200.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 4349380608.0,
+            "2": 6185459712.0,
+            "3": 6187556864.0,
+            "4": 6187556864.0,
+            "5": 6187556864.0,
+            "6": 6187556864.0,
+            "7": 6187556864.0,
+            "8": 6187556864.0,
+            "9": 6187556864.0,
+            "10": 6187556864.0,
+            "11": 6187556864.0,
+            "12": 6187556864.0,
+            "13": 6187556864.0,
+            "14": 6187556864.0,
+            "15": 6187556864.0,
+            "16": 6187556864.0,
+            "17": 6187556864.0,
+            "18": 6187556864.0,
+            "19": 6187556864.0,
+            "20": 6187556864.0,
+            "21": 6187556864.0,
+            "22": 6187556864.0,
+            "23": 6187556864.0,
+            "24": 6187556864.0,
+            "25": 6187556864.0,
+            "26": 6187556864.0,
+            "27": 6187556864.0,
+            "28": 6187556864.0,
+            "29": 6187556864.0,
+            "30": 6187556864.0,
+            "31": 6187556864.0,
+            "32": 6187556864.0,
+            "33": 6187556864.0,
+            "34": 6187556864.0,
+            "35": 6187556864.0,
+            "36": 6187556864.0,
+            "37": 6187556864.0,
+            "38": 6187556864.0,
+            "39": 6187556864.0,
+            "40": 6187556864.0,
+            "41": 6187556864.0,
+            "42": 6187556864.0,
+            "43": 6187556864.0,
+            "44": 6187556864.0,
+            "45": 6187556864.0,
+            "46": 6187556864.0,
+            "47": 6187556864.0,
+            "48": 6187556864.0,
+            "49": 6187556864.0,
+            "50": 6187556864.0,
+            "51": 6187556864.0,
+            "52": 6187556864.0,
+            "53": 6187556864.0,
+            "54": 6187556864.0,
+            "55": 6187556864.0,
+            "56": 6187556864.0,
+            "57": 6187556864.0,
+            "58": 6187556864.0,
+            "59": 6187556864.0,
+            "60": 6187556864.0,
+            "61": 6187556864.0,
+            "62": 6187556864.0,
+            "63": 6187556864.0,
+            "64": 6187556864.0,
+            "65": 6187556864.0,
+            "66": 6187556864.0,
+            "67": 6187556864.0,
+            "68": 6187556864.0,
+            "69": 6187556864.0,
+            "70": 6187556864.0,
+            "71": 6187556864.0,
+            "72": 6187556864.0,
+            "73": 6187556864.0,
+            "74": 6187556864.0,
+            "75": 6187556864.0,
+            "76": 6187556864.0,
+            "77": 6187556864.0,
+            "78": 6187556864.0,
+            "79": 6187556864.0,
+            "80": 6187556864.0,
+            "81": 6187556864.0,
+            "82": 6187556864.0,
+            "83": 6187556864.0,
+            "84": 6187556864.0,
+            "85": 6187556864.0,
+            "86": 6187556864.0,
+            "87": 6187556864.0,
+            "88": 6187556864.0,
+            "89": 6187556864.0,
+            "90": 6187556864.0,
+            "91": 6187556864.0,
+            "92": 6187556864.0,
+            "93": 6187556864.0,
+            "94": 6187556864.0,
+            "95": 6187556864.0,
+            "96": 6187556864.0,
+            "97": 6187556864.0,
+            "98": 6187556864.0,
+            "99": 6187556864.0,
+            "100": 6187556864.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 15.06497,
+            "2": 0.33648,
+            "3": 0.22277,
+            "4": 0.20886,
+            "5": 0.21308,
+            "6": 0.20892,
+            "7": 0.21092,
+            "8": 0.20752,
+            "9": 0.21199,
+            "10": 0.20742,
+            "11": 0.21069,
+            "12": 0.20826,
+            "13": 0.21142,
+            "14": 0.21455,
+            "15": 0.21627,
+            "16": 0.21296,
+            "17": 0.21402,
+            "18": 0.20889,
+            "19": 0.21078,
+            "20": 0.20954,
+            "21": 0.20887,
+            "22": 0.20979,
+            "23": 0.21439,
+            "24": 0.2099,
+            "25": 0.21242,
+            "26": 0.21605,
+            "27": 0.21297,
+            "28": 0.20982,
+            "29": 0.21136,
+            "30": 0.20907,
+            "31": 0.20972,
+            "32": 0.21139,
+            "33": 0.21469,
+            "34": 0.21652,
+            "35": 0.21743,
+            "36": 0.2149,
+            "37": 0.22692,
+            "38": 0.21471,
+            "39": 0.21755,
+            "40": 0.21624,
+            "41": 0.21941,
+            "42": 0.21428,
+            "43": 0.21749,
+            "44": 0.21544,
+            "45": 0.22837,
+            "46": 0.21663,
+            "47": 0.21319,
+            "48": 0.21421,
+            "49": 0.21543,
+            "50": 0.21524,
+            "51": 0.61922,
+            "52": 0.21119,
+            "53": 0.21075,
+            "54": 0.20936,
+            "55": 0.20973,
+            "56": 0.20946,
+            "57": 0.2092,
+            "58": 0.20996,
+            "59": 0.20928,
+            "60": 0.20927,
+            "61": 0.21061,
+            "62": 0.20871,
+            "63": 0.20949,
+            "64": 0.20862,
+            "65": 0.21028,
+            "66": 0.20932,
+            "67": 0.20996,
+            "68": 0.20879,
+            "69": 0.21044,
+            "70": 0.20912,
+            "71": 0.20946,
+            "72": 0.2097,
+            "73": 0.21061,
+            "74": 0.20946,
+            "75": 0.20911,
+            "76": 0.20928,
+            "77": 0.20987,
+            "78": 0.21013,
+            "79": 0.2094,
+            "80": 0.20969,
+            "81": 0.20909,
+            "82": 0.20968,
+            "83": 0.21037,
+            "84": 0.20978,
+            "85": 0.21017,
+            "86": 0.20951,
+            "87": 0.21004,
+            "88": 0.20955,
+            "89": 0.20979,
+            "90": 0.20905,
+            "91": 0.21055,
+            "92": 0.20916,
+            "93": 0.21026,
+            "94": 0.20948,
+            "95": 0.20954,
+            "96": 0.20902,
+            "97": 0.20988,
+            "98": 0.20896,
+            "99": 0.20908,
+            "100": 0.20889
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json
new file mode 100644
index 00000000000..3e69a67d2bd
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.38769,
+            "52": 7.13773,
+            "53": 7.125,
+            "54": 7.23668,
+            "55": 7.16852,
+            "56": 7.22884,
+            "57": 7.34699,
+            "58": 7.03128,
+            "59": 7.1229,
+            "60": 7.16587,
+            "61": 7.1174,
+            "62": 7.26837,
+            "63": 7.16759,
+            "64": 7.08376,
+            "65": 7.00099,
+            "66": 7.07203,
+            "67": 7.05971,
+            "68": 7.14618,
+            "69": 7.03944,
+            "70": 7.07162,
+            "71": 6.91653,
+            "72": 7.02025,
+            "73": 6.9904,
+            "74": 6.9146,
+            "75": 7.07611,
+            "76": 6.97098,
+            "77": 7.08446,
+            "78": 7.03608,
+            "79": 6.88325,
+            "80": 6.95251,
+            "81": 6.985,
+            "82": 7.06843,
+            "83": 7.00882,
+            "84": 7.0181,
+            "85": 6.8641,
+            "86": 7.04979,
+            "87": 6.99342,
+            "88": 6.9238,
+            "89": 6.82406,
+            "90": 7.25457,
+            "91": 6.7226,
+            "92": 7.05372,
+            "93": 6.91688,
+            "94": 7.066,
+            "95": 6.8601,
+            "96": 6.98742,
+            "97": 6.96796,
+            "98": 6.89964,
+            "99": 7.02766,
+            "100": 6.99745
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41174.0,
+            "52": 43885.0,
+            "53": 43959.0,
+            "54": 41961.0,
+            "55": 43960.0,
+            "56": 43269.0,
+            "57": 42561.0,
+            "58": 43898.0,
+            "59": 44654.0,
+            "60": 41326.0,
+            "61": 39744.0,
+            "62": 44774.0,
+            "63": 44682.0,
+            "64": 45396.0,
+            "65": 44730.0,
+            "66": 45388.0,
+            "67": 43196.0,
+            "68": 42556.0,
+            "69": 43825.0,
+            "70": 45543.0,
+            "71": 43407.0,
+            "72": 44832.0,
+            "73": 45412.0,
+            "74": 42502.0,
+            "75": 44684.0,
+            "76": 43926.0,
+            "77": 42100.0,
+            "78": 40525.0,
+            "79": 38954.0,
+            "80": 41118.0,
+            "81": 45412.0,
+            "82": 43238.0,
+            "83": 38495.0,
+            "84": 42524.0,
+            "85": 44024.0,
+            "86": 45749.0,
+            "87": 41116.0,
+            "88": 41798.0,
+            "89": 41078.0,
+            "90": 44744.0,
+            "91": 46266.0,
+            "92": 41865.0,
+            "93": 43254.0,
+            "94": 39588.0,
+            "95": 44092.0,
+            "96": 44732.0,
+            "97": 45474.0,
+            "98": 41859.0,
+            "99": 45537.0,
+            "100": 42500.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4157204480.0,
+            "52": 4157204480.0,
+            "53": 4157204480.0,
+            "54": 4157204480.0,
+            "55": 4157204480.0,
+            "56": 4157204480.0,
+            "57": 4157204480.0,
+            "58": 4157204480.0,
+            "59": 4157204480.0,
+            "60": 4157204480.0,
+            "61": 4157204480.0,
+            "62": 4157204480.0,
+            "63": 4157204480.0,
+            "64": 4157204480.0,
+            "65": 4157204480.0,
+            "66": 4157204480.0,
+            "67": 4157204480.0,
+            "68": 4157204480.0,
+            "69": 4157204480.0,
+            "70": 4157204480.0,
+            "71": 4157204480.0,
+            "72": 4157204480.0,
+            "73": 4157204480.0,
+            "74": 4157204480.0,
+            "75": 4157204480.0,
+            "76": 4157204480.0,
+            "77": 4157204480.0,
+            "78": 4157204480.0,
+            "79": 4157204480.0,
+            "80": 4157204480.0,
+            "81": 4157204480.0,
+            "82": 4157204480.0,
+            "83": 4157204480.0,
+            "84": 4157204480.0,
+            "85": 4157204480.0,
+            "86": 4157204480.0,
+            "87": 4157204480.0,
+            "88": 4157204480.0,
+            "89": 4157204480.0,
+            "90": 4157204480.0,
+            "91": 4157204480.0,
+            "92": 4157204480.0,
+            "93": 4157204480.0,
+            "94": 4157204480.0,
+            "95": 4157204480.0,
+            "96": 4157204480.0,
+            "97": 4157204480.0,
+            "98": 4157204480.0,
+            "99": 4157204480.0,
+            "100": 4157204480.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6123567104.0,
+            "52": 6204596224.0,
+            "53": 6204596224.0,
+            "54": 6204596224.0,
+            "55": 6204596224.0,
+            "56": 6204596224.0,
+            "57": 6204596224.0,
+            "58": 6204596224.0,
+            "59": 6204596224.0,
+            "60": 6204596224.0,
+            "61": 6204596224.0,
+            "62": 6204596224.0,
+            "63": 6204596224.0,
+            "64": 6204596224.0,
+            "65": 6204596224.0,
+            "66": 6204596224.0,
+            "67": 6204596224.0,
+            "68": 6204596224.0,
+            "69": 6204596224.0,
+            "70": 6204596224.0,
+            "71": 6204596224.0,
+            "72": 6204596224.0,
+            "73": 6204596224.0,
+            "74": 6204596224.0,
+            "75": 6204596224.0,
+            "76": 6204596224.0,
+            "77": 6204596224.0,
+            "78": 6204596224.0,
+            "79": 6204596224.0,
+            "80": 6204596224.0,
+            "81": 6204596224.0,
+            "82": 6204596224.0,
+            "83": 6204596224.0,
+            "84": 6204596224.0,
+            "85": 6204596224.0,
+            "86": 6204596224.0,
+            "87": 6204596224.0,
+            "88": 6204596224.0,
+            "89": 6204596224.0,
+            "90": 6204596224.0,
+            "91": 6204596224.0,
+            "92": 6204596224.0,
+            "93": 6204596224.0,
+            "94": 6204596224.0,
+            "95": 6204596224.0,
+            "96": 6204596224.0,
+            "97": 6204596224.0,
+            "98": 6204596224.0,
+            "99": 6204596224.0,
+            "100": 6204596224.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 15.16949,
+            "52": 0.23002,
+            "53": 0.21058,
+            "54": 0.20946,
+            "55": 0.20972,
+            "56": 0.20922,
+            "57": 0.20983,
+            "58": 0.20987,
+            "59": 0.20922,
+            "60": 0.20914,
+            "61": 0.2094,
+            "62": 0.20895,
+            "63": 0.2095,
+            "64": 0.21548,
+            "65": 0.21352,
+            "66": 0.21226,
+            "67": 0.21515,
+            "68": 0.20948,
+            "69": 0.21616,
+            "70": 0.21445,
+            "71": 0.21232,
+            "72": 0.21093,
+            "73": 0.21045,
+            "74": 0.21041,
+            "75": 0.21224,
+            "76": 0.21145,
+            "77": 0.21077,
+            "78": 0.21093,
+            "79": 0.2106,
+            "80": 0.20977,
+            "81": 0.21008,
+            "82": 0.2107,
+            "83": 0.21493,
+            "84": 0.22072,
+            "85": 0.24247,
+            "86": 0.23417,
+            "87": 0.68465,
+            "88": 0.21379,
+            "89": 0.21223,
+            "90": 0.20997,
+            "91": 0.21086,
+            "92": 0.2272,
+            "93": 0.21574,
+            "94": 0.21262,
+            "95": 0.21076,
+            "96": 0.21013,
+            "97": 0.2109,
+            "98": 0.21138,
+            "99": 0.21072,
+            "100": 0.21732
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
index e0a55371afb..87d5de19688 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 6.98463,
-            "2": 0.19558,
-            "3": 0.15734,
-            "4": 0.15695,
-            "5": 0.15774,
-            "6": 0.15468,
-            "7": 0.15373,
-            "8": 0.15721,
-            "9": 0.15375,
-            "10": 0.15555,
-            "11": 0.15762,
-            "12": 0.15358,
-            "13": 0.15446,
-            "14": 0.15343,
-            "15": 0.15567,
-            "16": 0.15597,
-            "17": 0.19986,
-            "18": 0.19685,
-            "19": 0.15757,
-            "20": 0.16418,
-            "21": 0.1662,
-            "22": 0.1633,
-            "23": 0.15542,
-            "24": 0.16131,
-            "25": 0.15713,
-            "26": 0.16116,
-            "27": 0.15731,
-            "28": 0.16645,
-            "29": 0.1581,
-            "30": 0.16334,
-            "31": 0.15469,
-            "32": 0.1607,
-            "33": 0.15565,
-            "34": 0.16369,
-            "35": 0.15592,
-            "36": 0.16404,
-            "37": 0.15034,
-            "38": 0.15864,
-            "39": 0.15017,
-            "40": 0.1607,
-            "41": 0.15387,
-            "42": 0.17077,
-            "43": 0.15397,
-            "44": 0.1563,
-            "45": 0.15512,
-            "46": 0.16115,
-            "47": 0.15635,
-            "48": 0.16292,
-            "49": 0.15581,
-            "50": 0.16402,
-            "51": 0.15457,
-            "52": 0.16232,
-            "53": 0.156,
-            "54": 0.16433,
-            "55": 0.15283,
-            "56": 0.19434,
-            "57": 0.19273,
-            "58": 0.15955,
-            "59": 0.15405,
-            "60": 0.15503,
-            "61": 0.15418,
-            "62": 0.15446,
-            "63": 0.15778,
-            "64": 0.1578,
-            "65": 0.16024,
-            "66": 0.15656,
-            "67": 0.15524,
-            "68": 0.15394,
-            "69": 0.16041,
-            "70": 0.16082,
-            "71": 0.16503,
-            "72": 0.16142,
-            "73": 0.16242,
-            "74": 0.15995,
-            "75": 0.15816,
-            "76": 0.16199,
-            "77": 0.16827,
-            "78": 0.15987,
-            "79": 0.15797,
-            "80": 0.15617,
-            "81": 0.15308,
-            "82": 0.15484,
-            "83": 0.15382,
-            "84": 0.16856,
-            "85": 0.15976,
-            "86": 0.15794,
-            "87": 0.15409,
-            "88": 0.15333,
-            "89": 0.15511,
-            "90": 0.15333,
-            "91": 0.17162,
-            "92": 0.15418,
-            "93": 0.15421,
-            "94": 0.15169,
-            "95": 0.15479,
-            "96": 0.15268,
-            "97": 0.1552,
-            "98": 0.1575,
-            "99": 0.15403,
-            "100": 0.15379
+            "1": 21.61124,
+            "2": 0.25375,
+            "3": 0.15381,
+            "4": 0.13668,
+            "5": 0.14061,
+            "6": 0.13695,
+            "7": 0.13991,
+            "8": 0.13647,
+            "9": 0.13948,
+            "10": 0.13599,
+            "11": 0.13996,
+            "12": 0.13684,
+            "13": 0.13803,
+            "14": 0.13775,
+            "15": 0.14405,
+            "16": 0.14329,
+            "17": 0.14214,
+            "18": 0.13792,
+            "19": 0.14542,
+            "20": 0.13933,
+            "21": 0.14385,
+            "22": 0.14038,
+            "23": 0.1392,
+            "24": 0.14184,
+            "25": 0.14024,
+            "26": 0.13811,
+            "27": 0.14146,
+            "28": 0.1387,
+            "29": 0.16852,
+            "30": 0.17758,
+            "31": 0.17327,
+            "32": 0.139,
+            "33": 0.14013,
+            "34": 0.14167,
+            "35": 0.56403,
+            "36": 0.16981,
+            "37": 0.16552,
+            "38": 0.16667,
+            "39": 0.14682,
+            "40": 0.14282,
+            "41": 0.14246,
+            "42": 0.13999,
+            "43": 0.14095,
+            "44": 0.13857,
+            "45": 0.13996,
+            "46": 0.13897,
+            "47": 0.13758,
+            "48": 0.13993,
+            "49": 0.13748,
+            "50": 0.13821,
+            "51": 0.15888,
+            "52": 0.13795,
+            "53": 0.13793,
+            "54": 0.13589,
+            "55": 0.13601,
+            "56": 0.13569,
+            "57": 0.13516,
+            "58": 0.13634,
+            "59": 0.13738,
+            "60": 0.13603,
+            "61": 0.15318,
+            "62": 0.13568,
+            "63": 0.13667,
+            "64": 0.1406,
+            "65": 0.1369,
+            "66": 0.13909,
+            "67": 0.13571,
+            "68": 0.13523,
+            "69": 0.13642,
+            "70": 0.13547,
+            "71": 0.1377,
+            "72": 0.13793,
+            "73": 0.13582,
+            "74": 0.13579,
+            "75": 0.13481,
+            "76": 0.13578,
+            "77": 0.13685,
+            "78": 0.13529,
+            "79": 0.13534,
+            "80": 0.13583,
+            "81": 0.13619,
+            "82": 0.13843,
+            "83": 0.13827,
+            "84": 0.13815,
+            "85": 0.13776,
+            "86": 0.13726,
+            "87": 0.13781,
+            "88": 0.13804,
+            "89": 0.13806,
+            "90": 0.13816,
+            "91": 0.13897,
+            "92": 0.13721,
+            "93": 0.13893,
+            "94": 0.14047,
+            "95": 0.13678,
+            "96": 0.13685,
+            "97": 0.13729,
+            "98": 0.13723,
+            "99": 0.13754,
+            "100": 0.50769
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..573e46b0bdd
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.37361,
+            "52": 7.13381,
+            "53": 7.11244,
+            "54": 7.23402,
+            "55": 7.14785,
+            "56": 7.22775,
+            "57": 7.33273,
+            "58": 6.99461,
+            "59": 7.11599,
+            "60": 7.13222,
+            "61": 7.1056,
+            "62": 7.26513,
+            "63": 7.14772,
+            "64": 7.08696,
+            "65": 6.98643,
+            "66": 7.04728,
+            "67": 7.04697,
+            "68": 7.14062,
+            "69": 7.2435,
+            "70": 7.05957,
+            "71": 6.89356,
+            "72": 6.99769,
+            "73": 6.97897,
+            "74": 6.91983,
+            "75": 7.05297,
+            "76": 6.96036,
+            "77": 7.0791,
+            "78": 7.01392,
+            "79": 6.88358,
+            "80": 6.93014,
+            "81": 6.96553,
+            "82": 7.05265,
+            "83": 6.98788,
+            "84": 7.00427,
+            "85": 6.84577,
+            "86": 7.03621,
+            "87": 6.96327,
+            "88": 6.9137,
+            "89": 6.80631,
+            "90": 7.23619,
+            "91": 6.70015,
+            "92": 7.05679,
+            "93": 6.89287,
+            "94": 7.05835,
+            "95": 6.84786,
+            "96": 6.96771,
+            "97": 6.94258,
+            "98": 6.87388,
+            "99": 7.01816,
+            "100": 6.98466
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41198.0,
+            "52": 43900.0,
+            "53": 43938.0,
+            "54": 41922.0,
+            "55": 43916.0,
+            "56": 43237.0,
+            "57": 42634.0,
+            "58": 43916.0,
+            "59": 44616.0,
+            "60": 41414.0,
+            "61": 39759.0,
+            "62": 44750.0,
+            "63": 44673.0,
+            "64": 45378.0,
+            "65": 44765.0,
+            "66": 45401.0,
+            "67": 43155.0,
+            "68": 42552.0,
+            "69": 43831.0,
+            "70": 45546.0,
+            "71": 43332.0,
+            "72": 44847.0,
+            "73": 45376.0,
+            "74": 42503.0,
+            "75": 44704.0,
+            "76": 43916.0,
+            "77": 42101.0,
+            "78": 40543.0,
+            "79": 38997.0,
+            "80": 41079.0,
+            "81": 45377.0,
+            "82": 43254.0,
+            "83": 38473.0,
+            "84": 42420.0,
+            "85": 43989.0,
+            "86": 45694.0,
+            "87": 41164.0,
+            "88": 41773.0,
+            "89": 41047.0,
+            "90": 44710.0,
+            "91": 46274.0,
+            "92": 41823.0,
+            "93": 43286.0,
+            "94": 39530.0,
+            "95": 44074.0,
+            "96": 44686.0,
+            "97": 45424.0,
+            "98": 41849.0,
+            "99": 45567.0,
+            "100": 42485.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 4157204480.0,
+            "52": 4157204480.0,
+            "53": 4157204480.0,
+            "54": 4157204480.0,
+            "55": 4157204480.0,
+            "56": 4157204480.0,
+            "57": 4157204480.0,
+            "58": 4157204480.0,
+            "59": 4157204480.0,
+            "60": 4157204480.0,
+            "61": 4157204480.0,
+            "62": 4157204480.0,
+            "63": 4157204480.0,
+            "64": 4157204480.0,
+            "65": 4157204480.0,
+            "66": 4157204480.0,
+            "67": 4157204480.0,
+            "68": 4157204480.0,
+            "69": 4157204480.0,
+            "70": 4157204480.0,
+            "71": 4157204480.0,
+            "72": 4157204480.0,
+            "73": 4157204480.0,
+            "74": 4157204480.0,
+            "75": 4157204480.0,
+            "76": 4157204480.0,
+            "77": 4157204480.0,
+            "78": 4157204480.0,
+            "79": 4157204480.0,
+            "80": 4157204480.0,
+            "81": 4157204480.0,
+            "82": 4157204480.0,
+            "83": 4157204480.0,
+            "84": 4157204480.0,
+            "85": 4157204480.0,
+            "86": 4157204480.0,
+            "87": 4157204480.0,
+            "88": 4157204480.0,
+            "89": 4157204480.0,
+            "90": 4157204480.0,
+            "91": 4157204480.0,
+            "92": 4157204480.0,
+            "93": 4157204480.0,
+            "94": 4157204480.0,
+            "95": 4157204480.0,
+            "96": 4157204480.0,
+            "97": 4157204480.0,
+            "98": 4157204480.0,
+            "99": 4157204480.0,
+            "100": 4157204480.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 6123567104.0,
+            "52": 6204596224.0,
+            "53": 6204596224.0,
+            "54": 6204596224.0,
+            "55": 6204596224.0,
+            "56": 6204596224.0,
+            "57": 6204596224.0,
+            "58": 6204596224.0,
+            "59": 6204596224.0,
+            "60": 6204596224.0,
+            "61": 6204596224.0,
+            "62": 6204596224.0,
+            "63": 6204596224.0,
+            "64": 6204596224.0,
+            "65": 6204596224.0,
+            "66": 6204596224.0,
+            "67": 6204596224.0,
+            "68": 6204596224.0,
+            "69": 6204596224.0,
+            "70": 6204596224.0,
+            "71": 6204596224.0,
+            "72": 6204596224.0,
+            "73": 6204596224.0,
+            "74": 6204596224.0,
+            "75": 6204596224.0,
+            "76": 6204596224.0,
+            "77": 6204596224.0,
+            "78": 6204596224.0,
+            "79": 6204596224.0,
+            "80": 6204596224.0,
+            "81": 6204596224.0,
+            "82": 6204596224.0,
+            "83": 6204596224.0,
+            "84": 6204596224.0,
+            "85": 6204596224.0,
+            "86": 6204596224.0,
+            "87": 6204596224.0,
+            "88": 6204596224.0,
+            "89": 6204596224.0,
+            "90": 6204596224.0,
+            "91": 6204596224.0,
+            "92": 6204596224.0,
+            "93": 6204596224.0,
+            "94": 6204596224.0,
+            "95": 6204596224.0,
+            "96": 6204596224.0,
+            "97": 6204596224.0,
+            "98": 6204596224.0,
+            "99": 6204596224.0,
+            "100": 6204596224.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 21.53373,
+            "52": 0.17108,
+            "53": 0.14343,
+            "54": 0.1382,
+            "55": 0.13776,
+            "56": 0.13812,
+            "57": 0.13818,
+            "58": 0.60134,
+            "59": 0.14006,
+            "60": 0.13721,
+            "61": 0.13776,
+            "62": 0.1388,
+            "63": 0.1416,
+            "64": 0.14634,
+            "65": 0.14469,
+            "66": 0.14853,
+            "67": 0.14401,
+            "68": 0.14036,
+            "69": 0.13971,
+            "70": 0.14452,
+            "71": 0.13933,
+            "72": 0.14544,
+            "73": 0.14099,
+            "74": 0.14162,
+            "75": 0.13904,
+            "76": 0.14131,
+            "77": 0.1772,
+            "78": 0.17391,
+            "79": 0.15422,
+            "80": 0.14246,
+            "81": 0.14329,
+            "82": 0.14005,
+            "83": 0.14166,
+            "84": 0.14169,
+            "85": 0.14284,
+            "86": 0.13961,
+            "87": 0.14163,
+            "88": 0.1407,
+            "89": 0.14357,
+            "90": 0.13852,
+            "91": 0.13984,
+            "92": 0.14186,
+            "93": 0.13873,
+            "94": 0.13893,
+            "95": 0.13848,
+            "96": 0.14366,
+            "97": 0.14476,
+            "98": 0.14352,
+            "99": 0.14347,
+            "100": 0.14605
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
index 415d8919883..ff144e3d252 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json
@@ -1 +1,537 @@
-{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.38854, "5": 9.38095, "10": 9.05714, "15": 8.65603, "20": 8.26193, "25": 7.98192, "30": 7.86937, "35": 7.66279, "40": 7.50083, "45": 7.34894, "50": 7.18147, "55": 7.1542, "60": 7.14734, "65": 6.9972, "70": 7.06009, "75": 7.06086, "80": 6.94306, "85": 6.85989, "90": 7.24967, "95": 6.84836, "100": 6.98289}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43331.0, "5": 45376.0, "10": 45361.0, "15": 43888.0, "20": 44777.0, "25": 42465.0, "30": 43995.0, "35": 43276.0, "40": 43245.0, "45": 43285.0, "50": 43365.0, "55": 43853.0, "60": 41218.0, "65": 44684.0, "70": 45522.0, "75": 44695.0, "80": 41096.0, "85": 43990.0, "90": 44676.0, "95": 44077.0, "100": 42530.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2171550208.0, "5": 2171550208.0, "10": 2171550208.0, "15": 2171550208.0, "20": 2171550208.0, "25": 2171550208.0, "30": 2171550208.0, "35": 2171550208.0, "40": 2171550208.0, "45": 2171550208.0, "50": 2171550208.0, "55": 2171550208.0, "60": 2171550208.0, "65": 2171550208.0, "70": 2171550208.0, "75": 2171550208.0, "80": 2171550208.0, "85": 2171550208.0, "90": 2171550208.0, "95": 2171550208.0, "100": 2171550208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2407642624.0, "5": 3336458752.0, "10": 3336458752.0, "15": 3336458752.0, "20": 3336458752.0, "25": 3336458752.0, "30": 3336458752.0, "35": 3336458752.0, "40": 3336458752.0, "45": 3336458752.0, "50": 3336458752.0, "55": 3336458752.0, "60": 3336458752.0, "65": 3336458752.0, "70": 3336458752.0, "75": 3336458752.0, "80": 3336458752.0, "85": 3336458752.0, "90": 3336458752.0, "95": 3336458752.0, "100": 3336458752.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 7.05678, "5": 0.40847, "10": 0.40944, "15": 0.41103, "20": 0.40541, "25": 0.40521, "30": 0.41404, "35": 0.40757, "40": 0.40461, "45": 0.40953, "50": 0.41332, "55": 0.41397, "60": 0.41379, "65": 0.41333, "70": 0.4099, "75": 0.41406, "80": 0.40498, "85": 0.40583, "90": 0.40273, "95": 0.40387, "100": 0.88919}}}
\ No newline at end of file
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 10.38854,
+            "2": 10.3937,
+            "3": 9.78105,
+            "4": 9.59731,
+            "5": 9.38095,
+            "6": 9.4057,
+            "7": 9.30785,
+            "8": 9.24107,
+            "9": 9.12192,
+            "10": 9.05714,
+            "11": 8.87325,
+            "12": 8.79368,
+            "13": 8.84026,
+            "14": 8.68518,
+            "15": 8.65603,
+            "16": 8.54372,
+            "17": 8.50113,
+            "18": 8.39001,
+            "19": 8.36443,
+            "20": 8.26193,
+            "21": 8.27097,
+            "22": 8.14406,
+            "23": 8.07467,
+            "24": 8.11915,
+            "25": 7.98192,
+            "26": 8.08777,
+            "27": 7.87148,
+            "28": 7.96511,
+            "29": 7.80258,
+            "30": 7.86937,
+            "31": 7.81742,
+            "32": 7.68788,
+            "33": 7.7805,
+            "34": 7.55497,
+            "35": 7.66279,
+            "36": 7.52257,
+            "37": 7.44455,
+            "38": 7.5026,
+            "39": 7.4504,
+            "40": 7.50083,
+            "41": 7.39053,
+            "42": 7.36073,
+            "43": 7.4333,
+            "44": 7.37641,
+            "45": 7.34894,
+            "46": 7.28171,
+            "47": 7.46122,
+            "48": 7.2877,
+            "49": 7.35375,
+            "50": 7.18147,
+            "51": 7.36608,
+            "52": 7.13343,
+            "53": 7.11575,
+            "54": 7.22932,
+            "55": 7.1542,
+            "56": 7.22261,
+            "57": 7.32969,
+            "58": 7.02356,
+            "59": 7.11377,
+            "60": 7.14734,
+            "61": 7.11404,
+            "62": 7.24755,
+            "63": 7.1568,
+            "64": 7.08414,
+            "65": 6.9972,
+            "66": 7.06074,
+            "67": 7.04881,
+            "68": 7.14167,
+            "69": 7.03482,
+            "70": 7.06009,
+            "71": 6.92578,
+            "72": 7.0043,
+            "73": 6.97965,
+            "74": 6.92276,
+            "75": 7.06086,
+            "76": 6.97271,
+            "77": 7.08186,
+            "78": 7.01883,
+            "79": 6.85524,
+            "80": 6.94306,
+            "81": 6.97637,
+            "82": 7.06676,
+            "83": 6.99984,
+            "84": 7.0089,
+            "85": 6.85989,
+            "86": 7.03607,
+            "87": 6.98072,
+            "88": 6.91508,
+            "89": 6.81068,
+            "90": 7.24967,
+            "91": 6.71006,
+            "92": 7.04916,
+            "93": 6.9057,
+            "94": 7.06458,
+            "95": 6.84836,
+            "96": 6.97667,
+            "97": 6.96312,
+            "98": 6.88704,
+            "99": 7.013,
+            "100": 6.98289
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 43331.0,
+            "2": 44051.0,
+            "3": 44760.0,
+            "4": 42395.0,
+            "5": 45376.0,
+            "6": 40957.0,
+            "7": 43160.0,
+            "8": 45463.0,
+            "9": 42446.0,
+            "10": 45361.0,
+            "11": 43965.0,
+            "12": 44605.0,
+            "13": 43884.0,
+            "14": 46187.0,
+            "15": 43888.0,
+            "16": 41604.0,
+            "17": 43828.0,
+            "18": 44690.0,
+            "19": 42562.0,
+            "20": 44777.0,
+            "21": 44792.0,
+            "22": 41854.0,
+            "23": 45465.0,
+            "24": 43071.0,
+            "25": 42465.0,
+            "26": 43917.0,
+            "27": 46228.0,
+            "28": 46431.0,
+            "29": 46169.0,
+            "30": 43995.0,
+            "31": 41278.0,
+            "32": 43346.0,
+            "33": 45463.0,
+            "34": 43298.0,
+            "35": 43276.0,
+            "36": 42490.0,
+            "37": 40069.0,
+            "38": 42527.0,
+            "39": 44730.0,
+            "40": 43245.0,
+            "41": 44653.0,
+            "42": 43269.0,
+            "43": 45462.0,
+            "44": 44594.0,
+            "45": 43285.0,
+            "46": 43915.0,
+            "47": 42370.0,
+            "48": 44704.0,
+            "49": 43164.0,
+            "50": 43365.0,
+            "51": 41167.0,
+            "52": 43825.0,
+            "53": 43945.0,
+            "54": 41947.0,
+            "55": 43853.0,
+            "56": 43268.0,
+            "57": 42591.0,
+            "58": 43843.0,
+            "59": 44625.0,
+            "60": 41218.0,
+            "61": 39714.0,
+            "62": 44779.0,
+            "63": 44716.0,
+            "64": 45359.0,
+            "65": 44684.0,
+            "66": 45355.0,
+            "67": 43146.0,
+            "68": 42519.0,
+            "69": 43835.0,
+            "70": 45522.0,
+            "71": 43316.0,
+            "72": 44767.0,
+            "73": 45365.0,
+            "74": 42449.0,
+            "75": 44695.0,
+            "76": 43885.0,
+            "77": 42092.0,
+            "78": 40278.0,
+            "79": 38915.0,
+            "80": 41096.0,
+            "81": 45372.0,
+            "82": 43206.0,
+            "83": 38481.0,
+            "84": 42474.0,
+            "85": 43990.0,
+            "86": 45729.0,
+            "87": 40884.0,
+            "88": 41772.0,
+            "89": 41076.0,
+            "90": 44676.0,
+            "91": 46159.0,
+            "92": 41790.0,
+            "93": 43242.0,
+            "94": 39566.0,
+            "95": 44077.0,
+            "96": 44741.0,
+            "97": 45379.0,
+            "98": 41802.0,
+            "99": 45441.0,
+            "100": 42530.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2171550208.0,
+            "2": 2171550208.0,
+            "3": 2171550208.0,
+            "4": 2171550208.0,
+            "5": 2171550208.0,
+            "6": 2171550208.0,
+            "7": 2171550208.0,
+            "8": 2171550208.0,
+            "9": 2171550208.0,
+            "10": 2171550208.0,
+            "11": 2171550208.0,
+            "12": 2171550208.0,
+            "13": 2171550208.0,
+            "14": 2171550208.0,
+            "15": 2171550208.0,
+            "16": 2171550208.0,
+            "17": 2171550208.0,
+            "18": 2171550208.0,
+            "19": 2171550208.0,
+            "20": 2171550208.0,
+            "21": 2171550208.0,
+            "22": 2171550208.0,
+            "23": 2171550208.0,
+            "24": 2171550208.0,
+            "25": 2171550208.0,
+            "26": 2171550208.0,
+            "27": 2171550208.0,
+            "28": 2171550208.0,
+            "29": 2171550208.0,
+            "30": 2171550208.0,
+            "31": 2171550208.0,
+            "32": 2171550208.0,
+            "33": 2171550208.0,
+            "34": 2171550208.0,
+            "35": 2171550208.0,
+            "36": 2171550208.0,
+            "37": 2171550208.0,
+            "38": 2171550208.0,
+            "39": 2171550208.0,
+            "40": 2171550208.0,
+            "41": 2171550208.0,
+            "42": 2171550208.0,
+            "43": 2171550208.0,
+            "44": 2171550208.0,
+            "45": 2171550208.0,
+            "46": 2171550208.0,
+            "47": 2171550208.0,
+            "48": 2171550208.0,
+            "49": 2171550208.0,
+            "50": 2171550208.0,
+            "51": 2171550208.0,
+            "52": 2171550208.0,
+            "53": 2171550208.0,
+            "54": 2171550208.0,
+            "55": 2171550208.0,
+            "56": 2171550208.0,
+            "57": 2171550208.0,
+            "58": 2171550208.0,
+            "59": 2171550208.0,
+            "60": 2171550208.0,
+            "61": 2171550208.0,
+            "62": 2171550208.0,
+            "63": 2171550208.0,
+            "64": 2171550208.0,
+            "65": 2171550208.0,
+            "66": 2171550208.0,
+            "67": 2171550208.0,
+            "68": 2171550208.0,
+            "69": 2171550208.0,
+            "70": 2171550208.0,
+            "71": 2171550208.0,
+            "72": 2171550208.0,
+            "73": 2171550208.0,
+            "74": 2171550208.0,
+            "75": 2171550208.0,
+            "76": 2171550208.0,
+            "77": 2171550208.0,
+            "78": 2171550208.0,
+            "79": 2171550208.0,
+            "80": 2171550208.0,
+            "81": 2171550208.0,
+            "82": 2171550208.0,
+            "83": 2171550208.0,
+            "84": 2171550208.0,
+            "85": 2171550208.0,
+            "86": 2171550208.0,
+            "87": 2171550208.0,
+            "88": 2171550208.0,
+            "89": 2171550208.0,
+            "90": 2171550208.0,
+            "91": 2171550208.0,
+            "92": 2171550208.0,
+            "93": 2171550208.0,
+            "94": 2171550208.0,
+            "95": 2171550208.0,
+            "96": 2171550208.0,
+            "97": 2171550208.0,
+            "98": 2171550208.0,
+            "99": 2171550208.0,
+            "100": 2171550208.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 2407642624.0,
+            "2": 3336458752.0,
+            "3": 3336458752.0,
+            "4": 3336458752.0,
+            "5": 3336458752.0,
+            "6": 3336458752.0,
+            "7": 3336458752.0,
+            "8": 3336458752.0,
+            "9": 3336458752.0,
+            "10": 3336458752.0,
+            "11": 3336458752.0,
+            "12": 3336458752.0,
+            "13": 3336458752.0,
+            "14": 3336458752.0,
+            "15": 3336458752.0,
+            "16": 3336458752.0,
+            "17": 3336458752.0,
+            "18": 3336458752.0,
+            "19": 3336458752.0,
+            "20": 3336458752.0,
+            "21": 3336458752.0,
+            "22": 3336458752.0,
+            "23": 3336458752.0,
+            "24": 3336458752.0,
+            "25": 3336458752.0,
+            "26": 3336458752.0,
+            "27": 3336458752.0,
+            "28": 3336458752.0,
+            "29": 3336458752.0,
+            "30": 3336458752.0,
+            "31": 3336458752.0,
+            "32": 3336458752.0,
+            "33": 3336458752.0,
+            "34": 3336458752.0,
+            "35": 3336458752.0,
+            "36": 3336458752.0,
+            "37": 3336458752.0,
+            "38": 3336458752.0,
+            "39": 3336458752.0,
+            "40": 3336458752.0,
+            "41": 3336458752.0,
+            "42": 3336458752.0,
+            "43": 3336458752.0,
+            "44": 3336458752.0,
+            "45": 3336458752.0,
+            "46": 3336458752.0,
+            "47": 3336458752.0,
+            "48": 3336458752.0,
+            "49": 3336458752.0,
+            "50": 3336458752.0,
+            "51": 3336458752.0,
+            "52": 3336458752.0,
+            "53": 3336458752.0,
+            "54": 3336458752.0,
+            "55": 3336458752.0,
+            "56": 3336458752.0,
+            "57": 3336458752.0,
+            "58": 3336458752.0,
+            "59": 3336458752.0,
+            "60": 3336458752.0,
+            "61": 3336458752.0,
+            "62": 3336458752.0,
+            "63": 3336458752.0,
+            "64": 3336458752.0,
+            "65": 3336458752.0,
+            "66": 3336458752.0,
+            "67": 3336458752.0,
+            "68": 3336458752.0,
+            "69": 3336458752.0,
+            "70": 3336458752.0,
+            "71": 3336458752.0,
+            "72": 3336458752.0,
+            "73": 3336458752.0,
+            "74": 3336458752.0,
+            "75": 3336458752.0,
+            "76": 3336458752.0,
+            "77": 3336458752.0,
+            "78": 3336458752.0,
+            "79": 3336458752.0,
+            "80": 3336458752.0,
+            "81": 3336458752.0,
+            "82": 3336458752.0,
+            "83": 3336458752.0,
+            "84": 3336458752.0,
+            "85": 3336458752.0,
+            "86": 3336458752.0,
+            "87": 3336458752.0,
+            "88": 3336458752.0,
+            "89": 3336458752.0,
+            "90": 3336458752.0,
+            "91": 3336458752.0,
+            "92": 3336458752.0,
+            "93": 3336458752.0,
+            "94": 3336458752.0,
+            "95": 3336458752.0,
+            "96": 3336458752.0,
+            "97": 3336458752.0,
+            "98": 3336458752.0,
+            "99": 3336458752.0,
+            "100": 3336458752.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": 18.62611,
+            "2": 0.61916,
+            "3": 0.39111,
+            "4": 0.37734,
+            "5": 0.37747,
+            "6": 0.37685,
+            "7": 0.37677,
+            "8": 0.37728,
+            "9": 0.37655,
+            "10": 0.37691,
+            "11": 0.37785,
+            "12": 0.37904,
+            "13": 0.37764,
+            "14": 0.37699,
+            "15": 0.37715,
+            "16": 0.38824,
+            "17": 0.38806,
+            "18": 0.38018,
+            "19": 0.38683,
+            "20": 0.9055,
+            "21": 0.38303,
+            "22": 0.3989,
+            "23": 0.38148,
+            "24": 0.37842,
+            "25": 0.3783,
+            "26": 0.37826,
+            "27": 0.37811,
+            "28": 0.38399,
+            "29": 0.38106,
+            "30": 0.38545,
+            "31": 0.38376,
+            "32": 0.37822,
+            "33": 0.37908,
+            "34": 0.37752,
+            "35": 0.37707,
+            "36": 0.37805,
+            "37": 0.37768,
+            "38": 0.37787,
+            "39": 0.37768,
+            "40": 0.37772,
+            "41": 0.37854,
+            "42": 0.37822,
+            "43": 0.3784,
+            "44": 0.37704,
+            "45": 0.37698,
+            "46": 0.37731,
+            "47": 0.37806,
+            "48": 0.37732,
+            "49": 0.37787,
+            "50": 0.96201,
+            "51": 0.37939,
+            "52": 0.3783,
+            "53": 0.37741,
+            "54": 0.37713,
+            "55": 0.37693,
+            "56": 0.37705,
+            "57": 0.37763,
+            "58": 0.37733,
+            "59": 0.37723,
+            "60": 0.37677,
+            "61": 0.37741,
+            "62": 0.37846,
+            "63": 0.37789,
+            "64": 0.37762,
+            "65": 0.37726,
+            "66": 0.82486,
+            "67": 0.37916,
+            "68": 0.81188,
+            "69": 0.37737,
+            "70": 0.37671,
+            "71": 0.37812,
+            "72": 0.3783,
+            "73": 0.37834,
+            "74": 0.37781,
+            "75": 0.37676,
+            "76": 0.37767,
+            "77": 0.37767,
+            "78": 0.37779,
+            "79": 0.37804,
+            "80": 0.38597,
+            "81": 0.37771,
+            "82": 0.37768,
+            "83": 0.37796,
+            "84": 0.3771,
+            "85": 0.38399,
+            "86": 0.38623,
+            "87": 0.37928,
+            "88": 0.3908,
+            "89": 0.38126,
+            "90": 0.38257,
+            "91": 0.37842,
+            "92": 0.37962,
+            "93": 0.38289,
+            "94": 0.37797,
+            "95": 0.37837,
+            "96": 0.37748,
+            "97": 0.37811,
+            "98": 0.38381,
+            "99": 0.37833,
+            "100": 0.37842
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
index 81670d237ce..642719d609f 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.16897,
-            "2": 0.35143,
-            "3": 0.28496,
-            "4": 0.28172,
-            "5": 0.28308,
-            "6": 0.2855,
-            "7": 0.28287,
-            "8": 0.28079,
-            "9": 0.2809,
-            "10": 0.28329,
-            "11": 0.28038,
-            "12": 0.28371,
-            "13": 0.28032,
-            "14": 0.28362,
-            "15": 0.28125,
-            "16": 0.28046,
-            "17": 0.28421,
-            "18": 0.28132,
-            "19": 0.2808,
-            "20": 0.28432,
-            "21": 0.28578,
-            "22": 0.28205,
-            "23": 0.28411,
-            "24": 0.28378,
-            "25": 0.28227,
-            "26": 0.28231,
-            "27": 0.28353,
-            "28": 0.28497,
-            "29": 0.29981,
-            "30": 0.28557,
-            "31": 0.28777,
-            "32": 0.28808,
-            "33": 0.28609,
-            "34": 0.32585,
-            "35": 0.341,
-            "36": 0.2886,
-            "37": 0.28157,
-            "38": 0.2916,
-            "39": 0.28501,
-            "40": 0.27952,
-            "41": 0.27767,
-            "42": 0.28062,
-            "43": 0.28781,
-            "44": 0.2839,
-            "45": 0.282,
-            "46": 0.27837,
-            "47": 0.27883,
-            "48": 0.27865,
-            "49": 0.28179,
-            "50": 0.27881,
-            "51": 0.27669,
-            "52": 0.28063,
-            "53": 0.27909,
-            "54": 0.27716,
-            "55": 0.27807,
-            "56": 0.2785,
-            "57": 0.27679,
-            "58": 0.28004,
-            "59": 0.27659,
-            "60": 0.27984,
-            "61": 0.2771,
-            "62": 0.27714,
-            "63": 0.2802,
-            "64": 0.2918,
-            "65": 0.27948,
-            "66": 0.27839,
-            "67": 0.28573,
-            "68": 0.27933,
-            "69": 0.27893,
-            "70": 0.27964,
-            "71": 0.2767,
-            "72": 0.27816,
-            "73": 0.28004,
-            "74": 0.27997,
-            "75": 0.28095,
-            "76": 0.27752,
-            "77": 0.27912,
-            "78": 0.28068,
-            "79": 0.27992,
-            "80": 0.28771,
-            "81": 0.28046,
-            "82": 0.28352,
-            "83": 0.28376,
-            "84": 0.28337,
-            "85": 0.28197,
-            "86": 0.27949,
-            "87": 0.27909,
-            "88": 0.28479,
-            "89": 0.28248,
-            "90": 0.27742,
-            "91": 0.27819,
-            "92": 0.2809,
-            "93": 0.28123,
-            "94": 0.27933,
-            "95": 0.28364,
-            "96": 0.28523,
-            "97": 0.28365,
-            "98": 0.27822,
-            "99": 0.28382,
-            "100": 0.28917
+            "1": 25.71894,
+            "2": 0.34844,
+            "3": 0.27498,
+            "4": 0.26037,
+            "5": 0.26158,
+            "6": 0.26112,
+            "7": 0.25983,
+            "8": 0.26046,
+            "9": 0.26084,
+            "10": 0.2682,
+            "11": 0.26401,
+            "12": 0.26721,
+            "13": 0.26076,
+            "14": 0.26222,
+            "15": 0.2543,
+            "16": 0.26175,
+            "17": 0.31454,
+            "18": 0.47931,
+            "19": 0.26259,
+            "20": 0.69917,
+            "21": 0.26316,
+            "22": 0.26474,
+            "23": 0.26088,
+            "24": 0.25816,
+            "25": 0.25832,
+            "26": 0.25678,
+            "27": 0.25785,
+            "28": 0.25895,
+            "29": 0.25888,
+            "30": 0.25913,
+            "31": 0.26035,
+            "32": 0.26324,
+            "33": 0.26028,
+            "34": 0.25857,
+            "35": 0.25864,
+            "36": 0.26043,
+            "37": 0.25816,
+            "38": 0.25979,
+            "39": 0.25847,
+            "40": 0.25813,
+            "41": 0.25846,
+            "42": 0.25664,
+            "43": 0.25705,
+            "44": 0.26337,
+            "45": 0.26143,
+            "46": 0.26024,
+            "47": 0.2583,
+            "48": 0.2592,
+            "49": 0.26051,
+            "50": 0.79372,
+            "51": 0.26784,
+            "52": 0.25688,
+            "53": 0.25931,
+            "54": 0.25883,
+            "55": 0.25833,
+            "56": 0.25645,
+            "57": 0.25691,
+            "58": 0.26093,
+            "59": 0.26089,
+            "60": 0.25935,
+            "61": 0.25786,
+            "62": 0.25771,
+            "63": 0.26223,
+            "64": 0.26036,
+            "65": 0.25957,
+            "66": 0.74086,
+            "67": 0.25826,
+            "68": 0.25657,
+            "69": 0.25496,
+            "70": 0.25447,
+            "71": 0.2713,
+            "72": 0.25135,
+            "73": 0.25078,
+            "74": 0.26569,
+            "75": 0.26382,
+            "76": 0.2633,
+            "77": 0.26309,
+            "78": 0.26574,
+            "79": 0.26362,
+            "80": 0.3128,
+            "81": 0.26022,
+            "82": 0.26605,
+            "83": 0.26244,
+            "84": 0.26413,
+            "85": 0.2656,
+            "86": 0.26904,
+            "87": 0.26661,
+            "88": 0.26377,
+            "89": 0.2667,
+            "90": 0.26433,
+            "91": 0.26317,
+            "92": 0.26411,
+            "93": 0.26798,
+            "94": 0.25821,
+            "95": 0.26018,
+            "96": 0.29437,
+            "97": 0.26414,
+            "98": 0.26347,
+            "99": 0.26108,
+            "100": 0.25931
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
index 2e0ee7ee230..0b23b1bfecd 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.3446,
-            "2": 0.55186,
-            "3": 0.52074,
-            "4": 0.52226,
-            "5": 0.51961,
-            "6": 0.52672,
-            "7": 0.52451,
-            "8": 0.52369,
-            "9": 0.54507,
-            "10": 0.53931,
-            "11": 0.55505,
-            "12": 0.52851,
-            "13": 0.51692,
-            "14": 0.52026,
-            "15": 0.51979,
-            "16": 0.53317,
-            "17": 0.52489,
-            "18": 0.59625,
-            "19": 0.52238,
-            "20": 0.53197,
-            "21": 0.52211,
-            "22": 0.51979,
-            "23": 0.52551,
-            "24": 0.52413,
-            "25": 0.52676,
-            "26": 0.5192,
-            "27": 0.52336,
-            "28": 0.53671,
-            "29": 0.53561,
-            "30": 0.51609,
-            "31": 0.55983,
-            "32": 0.5166,
-            "33": 0.53721,
-            "34": 0.52158,
-            "35": 0.53727,
-            "36": 0.5279,
-            "37": 0.51655,
-            "38": 0.51986,
-            "39": 0.5223,
-            "40": 0.52388,
-            "41": 0.52083,
-            "42": 0.52801,
-            "43": 0.52136,
-            "44": 0.52414,
-            "45": 0.52048,
-            "46": 0.53415,
-            "47": 0.54831,
-            "48": 0.58827,
-            "49": 0.55044,
-            "50": 0.52682,
-            "51": 0.52339,
-            "52": 0.51726,
-            "53": 0.518,
-            "54": 0.51935,
-            "55": 0.52073,
-            "56": 0.52732,
-            "57": 0.51867,
-            "58": 0.51876,
-            "59": 0.5213,
-            "60": 0.51779,
-            "61": 0.52225,
-            "62": 0.52041,
-            "63": 0.51793,
-            "64": 0.5135,
-            "65": 0.51913,
-            "66": 0.86034,
-            "67": 0.51468,
-            "68": 0.90156,
-            "69": 0.51931,
-            "70": 0.53602,
-            "71": 0.51818,
-            "72": 0.51744,
-            "73": 0.54454,
-            "74": 0.51831,
-            "75": 0.521,
-            "76": 0.52894,
-            "77": 0.53227,
-            "78": 0.51806,
-            "79": 0.51818,
-            "80": 0.51632,
-            "81": 0.51704,
-            "82": 0.51542,
-            "83": 0.51861,
-            "84": 0.53204,
-            "85": 0.52011,
-            "86": 0.53043,
-            "87": 0.94359,
-            "88": 0.51776,
-            "89": 0.51799,
-            "90": 0.51773,
-            "91": 0.51828,
-            "92": 0.52318,
-            "93": 0.51688,
-            "94": 0.51939,
-            "95": 0.51554,
-            "96": 0.9,
-            "97": 0.96079,
-            "98": 0.52856,
-            "99": 0.51996,
-            "100": 0.52921
+            "1": 25.3049,
+            "2": 0.96867,
+            "3": 0.50973,
+            "4": 0.4916,
+            "5": 0.48837,
+            "6": 0.48697,
+            "7": 0.48553,
+            "8": 0.48392,
+            "9": 0.50312,
+            "10": 0.50926,
+            "11": 0.49703,
+            "12": 0.50337,
+            "13": 0.4965,
+            "14": 0.49332,
+            "15": 0.49456,
+            "16": 0.49141,
+            "17": 0.49486,
+            "18": 0.49094,
+            "19": 0.49816,
+            "20": 0.49526,
+            "21": 0.4944,
+            "22": 0.49451,
+            "23": 0.89375,
+            "24": 1.14231,
+            "25": 0.49653,
+            "26": 0.49556,
+            "27": 0.49346,
+            "28": 0.49649,
+            "29": 0.49046,
+            "30": 0.49275,
+            "31": 0.49217,
+            "32": 0.492,
+            "33": 0.49189,
+            "34": 0.49161,
+            "35": 0.48929,
+            "36": 0.50013,
+            "37": 0.49187,
+            "38": 0.49624,
+            "39": 0.49444,
+            "40": 0.4924,
+            "41": 0.49691,
+            "42": 0.49262,
+            "43": 0.4991,
+            "44": 0.48077,
+            "45": 0.47788,
+            "46": 0.48199,
+            "47": 0.49826,
+            "48": 0.49278,
+            "49": 0.48988,
+            "50": 0.48958,
+            "51": 0.49301,
+            "52": 0.48885,
+            "53": 0.48896,
+            "54": 0.49306,
+            "55": 0.49203,
+            "56": 0.49425,
+            "57": 0.49088,
+            "58": 0.48671,
+            "59": 0.48576,
+            "60": 0.49276,
+            "61": 0.4913,
+            "62": 0.48886,
+            "63": 0.49215,
+            "64": 0.49049,
+            "65": 0.4937,
+            "66": 0.49731,
+            "67": 0.48964,
+            "68": 0.49368,
+            "69": 0.47854,
+            "70": 0.47863,
+            "71": 0.48038,
+            "72": 0.47911,
+            "73": 0.48181,
+            "74": 0.49298,
+            "75": 0.49322,
+            "76": 0.48959,
+            "77": 0.48669,
+            "78": 0.47649,
+            "79": 0.48313,
+            "80": 0.47614,
+            "81": 0.47749,
+            "82": 0.47372,
+            "83": 0.48543,
+            "84": 0.47903,
+            "85": 0.47638,
+            "86": 0.47539,
+            "87": 0.47854,
+            "88": 0.47715,
+            "89": 0.47616,
+            "90": 0.47457,
+            "91": 0.4771,
+            "92": 0.4792,
+            "93": 0.47493,
+            "94": 0.47522,
+            "95": 0.47459,
+            "96": 0.474,
+            "97": 0.48537,
+            "98": 0.47982,
+            "99": 0.47495,
+            "100": 0.47321
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
index b9a799c779f..e4524b5427a 100644
--- a/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json
@@ -432,106 +432,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 9.38956,
-            "2": 0.54892,
-            "3": 0.53756,
-            "4": 0.52845,
-            "5": 0.52687,
-            "6": 0.51818,
-            "7": 0.52819,
-            "8": 0.52051,
-            "9": 0.52526,
-            "10": 0.52865,
-            "11": 0.52834,
-            "12": 0.52573,
-            "13": 0.52783,
-            "14": 0.52938,
-            "15": 0.51899,
-            "16": 0.53517,
-            "17": 0.52289,
-            "18": 0.5363,
-            "19": 0.5954,
-            "20": 0.55838,
-            "21": 0.52166,
-            "22": 0.54146,
-            "23": 0.53649,
-            "24": 0.52785,
-            "25": 0.52349,
-            "26": 0.52481,
-            "27": 0.52376,
-            "28": 0.52226,
-            "29": 0.5291,
-            "30": 0.52613,
-            "31": 0.52719,
-            "32": 0.52341,
-            "33": 0.52646,
-            "34": 0.52272,
-            "35": 0.53016,
-            "36": 0.51941,
-            "37": 0.52643,
-            "38": 0.51914,
-            "39": 0.53109,
-            "40": 0.52353,
-            "41": 0.55102,
-            "42": 0.52656,
-            "43": 0.53223,
-            "44": 0.53438,
-            "45": 0.53126,
-            "46": 0.53776,
-            "47": 0.52511,
-            "48": 0.53521,
-            "49": 0.52743,
-            "50": 0.52883,
-            "51": 0.54078,
-            "52": 0.52088,
-            "53": 0.53221,
-            "54": 0.52473,
-            "55": 0.54396,
-            "56": 0.52771,
-            "57": 0.52699,
-            "58": 0.53079,
-            "59": 0.52445,
-            "60": 0.53037,
-            "61": 0.52164,
-            "62": 0.532,
-            "63": 0.52392,
-            "64": 0.53062,
-            "65": 0.52269,
-            "66": 0.53306,
-            "67": 0.5173,
-            "68": 0.54063,
-            "69": 0.52464,
-            "70": 0.92233,
-            "71": 0.53301,
-            "72": 0.52584,
-            "73": 0.55029,
-            "74": 0.54931,
-            "75": 0.54907,
-            "76": 0.53191,
-            "77": 0.53522,
-            "78": 0.53487,
-            "79": 0.52543,
-            "80": 0.53474,
-            "81": 0.52635,
-            "82": 0.54801,
-            "83": 0.52605,
-            "84": 0.53393,
-            "85": 0.52523,
-            "86": 0.53947,
-            "87": 0.52933,
-            "88": 0.53447,
-            "89": 0.53,
-            "90": 0.5287,
-            "91": 0.53326,
-            "92": 0.54604,
-            "93": 0.53649,
-            "94": 0.5297,
-            "95": 0.54163,
-            "96": 0.52549,
-            "97": 0.53256,
-            "98": 0.53104,
-            "99": 0.54062,
-            "100": 0.52332
+            "1": 25.29495,
+            "2": 0.59083,
+            "3": 0.51228,
+            "4": 0.86881,
+            "5": 0.4917,
+            "6": 0.49302,
+            "7": 0.49226,
+            "8": 0.49005,
+            "9": 0.56319,
+            "10": 0.66651,
+            "11": 0.48986,
+            "12": 0.48642,
+            "13": 0.48195,
+            "14": 0.48561,
+            "15": 0.48592,
+            "16": 0.49064,
+            "17": 0.48536,
+            "18": 0.483,
+            "19": 0.48082,
+            "20": 0.48238,
+            "21": 0.50394,
+            "22": 0.8666,
+            "23": 1.49846,
+            "24": 0.48279,
+            "25": 0.48011,
+            "26": 0.48147,
+            "27": 0.4828,
+            "28": 0.47915,
+            "29": 0.49097,
+            "30": 0.48131,
+            "31": 0.48075,
+            "32": 0.47908,
+            "33": 0.47968,
+            "34": 0.48222,
+            "35": 0.48057,
+            "36": 0.47723,
+            "37": 0.48,
+            "38": 0.48269,
+            "39": 0.47837,
+            "40": 0.48188,
+            "41": 0.47999,
+            "42": 0.4825,
+            "43": 0.49017,
+            "44": 0.48176,
+            "45": 0.48251,
+            "46": 0.47977,
+            "47": 0.48156,
+            "48": 0.48108,
+            "49": 0.48014,
+            "50": 0.47676,
+            "51": 0.49017,
+            "52": 0.481,
+            "53": 0.47836,
+            "54": 0.47545,
+            "55": 0.47796,
+            "56": 0.47606,
+            "57": 0.47601,
+            "58": 0.47957,
+            "59": 0.47812,
+            "60": 0.47515,
+            "61": 0.47947,
+            "62": 0.47591,
+            "63": 0.47577,
+            "64": 0.47566,
+            "65": 0.4769,
+            "66": 0.47889,
+            "67": 0.47584,
+            "68": 0.47578,
+            "69": 0.47401,
+            "70": 0.4759,
+            "71": 0.47514,
+            "72": 0.4742,
+            "73": 0.47824,
+            "74": 0.47726,
+            "75": 0.48289,
+            "76": 0.48194,
+            "77": 0.48719,
+            "78": 0.49039,
+            "79": 0.4775,
+            "80": 0.48402,
+            "81": 0.48084,
+            "82": 0.47553,
+            "83": 0.48122,
+            "84": 0.47896,
+            "85": 0.4766,
+            "86": 0.47712,
+            "87": 0.47753,
+            "88": 0.47535,
+            "89": 0.4749,
+            "90": 0.4776,
+            "91": 0.47619,
+            "92": 0.47613,
+            "93": 0.47698,
+            "94": 0.47658,
+            "95": 0.47543,
+            "96": 0.47852,
+            "97": 0.47566,
+            "98": 0.47444,
+            "99": 0.47759,
+            "100": 0.47631
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
new file mode 100644
index 00000000000..a890b5a0f5d
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 7.36469,
+            "52": 7.12044,
+            "53": 7.09167,
+            "54": 7.22712,
+            "55": 7.13495,
+            "56": 7.20751,
+            "57": 7.31287,
+            "58": 6.99063,
+            "59": 7.09849,
+            "60": 7.12665,
+            "61": 7.10047,
+            "62": 7.23974,
+            "63": 7.14358,
+            "64": 7.06717,
+            "65": 6.98408,
+            "66": 7.03692,
+            "67": 7.02875,
+            "68": 7.12914,
+            "69": 7.01425,
+            "70": 7.04954,
+            "71": 6.89312,
+            "72": 6.98513,
+            "73": 6.96734,
+            "74": 6.90236,
+            "75": 7.05611,
+            "76": 6.95986,
+            "77": 7.06862,
+            "78": 7.0204,
+            "79": 6.8505,
+            "80": 6.92019,
+            "81": 6.95982,
+            "82": 7.04575,
+            "83": 6.98617,
+            "84": 6.99991,
+            "85": 6.83511,
+            "86": 7.04087,
+            "87": 6.96604,
+            "88": 6.90125,
+            "89": 6.80345,
+            "90": 7.22384,
+            "91": 6.70505,
+            "92": 7.03979,
+            "93": 6.8857,
+            "94": 7.04044,
+            "95": 6.84746,
+            "96": 6.9546,
+            "97": 6.94425,
+            "98": 6.86865,
+            "99": 6.9948,
+            "100": 6.96761
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 41133.0,
+            "52": 43849.0,
+            "53": 43899.0,
+            "54": 41704.0,
+            "55": 43863.0,
+            "56": 43205.0,
+            "57": 42636.0,
+            "58": 43835.0,
+            "59": 44623.0,
+            "60": 41226.0,
+            "61": 39705.0,
+            "62": 44732.0,
+            "63": 44659.0,
+            "64": 45371.0,
+            "65": 44682.0,
+            "66": 45341.0,
+            "67": 43169.0,
+            "68": 42486.0,
+            "69": 43829.0,
+            "70": 45529.0,
+            "71": 43294.0,
+            "72": 44745.0,
+            "73": 45364.0,
+            "74": 42463.0,
+            "75": 44679.0,
+            "76": 43882.0,
+            "77": 42042.0,
+            "78": 40356.0,
+            "79": 38928.0,
+            "80": 41079.0,
+            "81": 45349.0,
+            "82": 43226.0,
+            "83": 38474.0,
+            "84": 42415.0,
+            "85": 43989.0,
+            "86": 45673.0,
+            "87": 40850.0,
+            "88": 41756.0,
+            "89": 41065.0,
+            "90": 44686.0,
+            "91": 46135.0,
+            "92": 41609.0,
+            "93": 43267.0,
+            "94": 39525.0,
+            "95": 43921.0,
+            "96": 44683.0,
+            "97": 45412.0,
+            "98": 41832.0,
+            "99": 45416.0,
+            "100": 42457.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1105773056.0,
+            "52": 1105773056.0,
+            "53": 1105773056.0,
+            "54": 1105773056.0,
+            "55": 1105773056.0,
+            "56": 1105773056.0,
+            "57": 1105773056.0,
+            "58": 1105773056.0,
+            "59": 1105773056.0,
+            "60": 1105773056.0,
+            "61": 1105773056.0,
+            "62": 1105773056.0,
+            "63": 1105773056.0,
+            "64": 1105773056.0,
+            "65": 1105773056.0,
+            "66": 1105773056.0,
+            "67": 1105773056.0,
+            "68": 1105773056.0,
+            "69": 1105773056.0,
+            "70": 1105773056.0,
+            "71": 1105773056.0,
+            "72": 1105773056.0,
+            "73": 1105773056.0,
+            "74": 1105773056.0,
+            "75": 1105773056.0,
+            "76": 1105773056.0,
+            "77": 1105773056.0,
+            "78": 1105773056.0,
+            "79": 1105773056.0,
+            "80": 1105773056.0,
+            "81": 1105773056.0,
+            "82": 1105773056.0,
+            "83": 1105773056.0,
+            "84": 1105773056.0,
+            "85": 1105773056.0,
+            "86": 1105773056.0,
+            "87": 1105773056.0,
+            "88": 1105773056.0,
+            "89": 1105773056.0,
+            "90": 1105773056.0,
+            "91": 1105773056.0,
+            "92": 1105773056.0,
+            "93": 1105773056.0,
+            "94": 1105773056.0,
+            "95": 1105773056.0,
+            "96": 1105773056.0,
+            "97": 1105773056.0,
+            "98": 1105773056.0,
+            "99": 1105773056.0,
+            "100": 1105773056.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 1823922688.0,
+            "52": 1823923712.0,
+            "53": 1823923712.0,
+            "54": 1823923712.0,
+            "55": 1823923712.0,
+            "56": 1823923712.0,
+            "57": 1823923712.0,
+            "58": 1823923712.0,
+            "59": 1823923712.0,
+            "60": 1823923712.0,
+            "61": 1823923712.0,
+            "62": 1823923712.0,
+            "63": 1823923712.0,
+            "64": 1823923712.0,
+            "65": 1823923712.0,
+            "66": 1823923712.0,
+            "67": 1823923712.0,
+            "68": 1823923712.0,
+            "69": 1823923712.0,
+            "70": 1823923712.0,
+            "71": 1823923712.0,
+            "72": 1823923712.0,
+            "73": 1823923712.0,
+            "74": 1823923712.0,
+            "75": 1823923712.0,
+            "76": 1823923712.0,
+            "77": 1823923712.0,
+            "78": 1823923712.0,
+            "79": 1823923712.0,
+            "80": 1823923712.0,
+            "81": 1823923712.0,
+            "82": 1823923712.0,
+            "83": 1823923712.0,
+            "84": 1823923712.0,
+            "85": 1823923712.0,
+            "86": 1823923712.0,
+            "87": 1823923712.0,
+            "88": 1823923712.0,
+            "89": 1823923712.0,
+            "90": 1823923712.0,
+            "91": 1823923712.0,
+            "92": 1823923712.0,
+            "93": 1823923712.0,
+            "94": 1823923712.0,
+            "95": 1823923712.0,
+            "96": 1823923712.0,
+            "97": 1823923712.0,
+            "98": 1823923712.0,
+            "99": 1823923712.0,
+            "100": 1823923712.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 23.79487,
+            "52": 0.54117,
+            "53": 0.50294,
+            "54": 0.49381,
+            "55": 0.49765,
+            "56": 0.49437,
+            "57": 0.48794,
+            "58": 0.4927,
+            "59": 0.492,
+            "60": 0.50378,
+            "61": 0.49484,
+            "62": 0.49441,
+            "63": 0.49721,
+            "64": 0.49973,
+            "65": 0.49641,
+            "66": 0.49959,
+            "67": 0.49735,
+            "68": 0.49554,
+            "69": 0.4954,
+            "70": 0.49556,
+            "71": 0.49515,
+            "72": 0.49547,
+            "73": 0.49564,
+            "74": 0.50072,
+            "75": 0.50384,
+            "76": 0.50256,
+            "77": 0.49599,
+            "78": 0.49854,
+            "79": 0.49618,
+            "80": 0.5065,
+            "81": 0.50877,
+            "82": 0.49521,
+            "83": 0.51145,
+            "84": 0.49943,
+            "85": 0.49798,
+            "86": 0.49691,
+            "87": 0.49859,
+            "88": 0.50159,
+            "89": 0.49713,
+            "90": 0.49297,
+            "91": 0.49503,
+            "92": 0.49824,
+            "93": 0.49313,
+            "94": 0.4893,
+            "95": 0.48841,
+            "96": 0.49,
+            "97": 0.48974,
+            "98": 0.4896,
+            "99": 0.49265,
+            "100": 0.49225
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml
index 852fbf9819d..a7abdc1bdd4 100644
--- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml
@@ -58,6 +58,7 @@ MODEL_ARGS:
   --log-validation-ppl-to-tensorboard: true
   --timing-log-level: 0
   --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   --wandb-save-dir: ${WANDB_SAVE_PATH}
 METRICS:
diff --git a/tests/test_utils/python_scripts/auto_reminder_github.py b/tests/test_utils/python_scripts/auto_reminder_github.py
index 7484244b717..94e0de1ddaa 100644
--- a/tests/test_utils/python_scripts/auto_reminder_github.py
+++ b/tests/test_utils/python_scripts/auto_reminder_github.py
@@ -6,6 +6,7 @@
 Usage: GH_TOKEN=ghp_... SLACK_TOKEN=xoxb-... SLACK_WEBHOOK_URL=https://... REPO=NVIDIA/Megatron-LM python github_pr_reminder.py
 """
 
+import html
 import logging
 import os
 import sys
@@ -231,10 +232,11 @@ def create_reminder(self, pr):
         stage_days = self.days_since(self.get_label_date(pr, stage))
         author_email = self.get_user_email(pr.user.login)
         reviewer_emails, action_message = self.get_reviewers(pr)
+        escaped_title = html.escape(pr.title, quote=False)
 
         return Reminder(
             id=pr.number,
-            pr=f"<{pr.html_url}|#{pr.number} - {pr.title}>",
+            pr=f"<{pr.html_url}|#{pr.number} - {escaped_title}>",
             milestone=pr.milestone.title if pr.milestone else "No Milestone",
             author=self.get_slack_user_id(author_email),
             priority="P0" if stage_days > 3 else "P1" if stage_days >= 1 else "P2",
diff --git a/tests/test_utils/python_scripts/download_golden_values.py b/tests/test_utils/python_scripts/download_golden_values.py
index e2294b32fbb..423cd0ec254 100644
--- a/tests/test_utils/python_scripts/download_golden_values.py
+++ b/tests/test_utils/python_scripts/download_golden_values.py
@@ -84,37 +84,36 @@ def main(pipeline_id: int, only_failing: bool):
                 ).glob("g*.json")
             )
 
-            if len(golden_values_sources) == 1:
-                golden_values_source = golden_values_sources[0]
-            else:
+            if len(golden_values_sources) < 1:
                 logger.info(
                     "Golden values for %s does not exist. Skip.", str(golden_values_sources)
                 )
                 continue
 
-            golden_values_source_name = golden_values_source.name
-            golden_values_source_name = golden_values_source_name.replace(
-                "generations", "golden_values"
-            )
-
-            golden_values_target = (
-                pathlib.Path("tests")
-                / "functional_tests"
-                / 'test_cases'
-                / job.stage
-                / job.name
-                / golden_values_source_name
-            )
+            for golden_values_source in golden_values_sources:
+                golden_values_source_name = golden_values_source.name
+                golden_values_source_name = golden_values_source_name.replace(
+                    "generations", "golden_values"
+                )
 
-            if golden_values_source.exists():
-                pathlib.Path(golden_values_target.parent).mkdir(parents=True, exist_ok=True)
-                logger.info(
-                    "Move artifacts from %s to %s", golden_values_source, golden_values_target
+                golden_values_target = (
+                    pathlib.Path("tests")
+                    / "functional_tests"
+                    / 'test_cases'
+                    / job.stage
+                    / job.name
+                    / golden_values_source_name
                 )
 
-                shutil.move(golden_values_source, golden_values_target)
-            else:
-                logger.info("Golden values for %s does not exist. Skip.", str(golden_values_source))
+                if golden_values_source.exists():
+                    pathlib.Path(golden_values_target.parent).mkdir(parents=True, exist_ok=True)
+                    logger.info(
+                        "Move artifacts from %s to %s", golden_values_source, golden_values_target
+                    )
+
+                    shutil.move(golden_values_source, golden_values_target)
+                else:
+                    logger.info("Golden values for %s does not exist. Skip.", str(golden_values_source))
 
             shutil.rmtree("tmp")
 
diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py
index 6ecd98a06c1..7f60ceb12d6 100644
--- a/tests/test_utils/python_scripts/launch_jet_workload.py
+++ b/tests/test_utils/python_scripts/launch_jet_workload.py
@@ -8,6 +8,7 @@
 import signal
 import sys
 import time
+import uuid
 import zipfile
 from typing import Dict, List, Optional
 
@@ -111,15 +112,12 @@ def launch_and_wait_for_completion(
                                         "HF_HUB_CACHE": "/lustre/fsw/coreai_dlalgo_mcore/hf_hub",
                                         "TRANSFORMERS_OFFLINE": "1",
                                         "CLUSTER": cluster,
+                                        "RUN_ID": str(uuid.uuid4()),
                                     }
                                 }
                             }
                         }
                     },
-                    "outputs": {
-                        "enabled": True,
-                        "artifacts_storages": [recipe_parser.resolve_artifact_config(cluster)],
-                    },
                 },
                 wait_for_validation=True,
                 max_wait_time=(60 * 60),
diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py
index 6e2b73e430f..8d006f70d19 100644
--- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py
+++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py
@@ -50,6 +50,9 @@ def is_flaky_failure(concat_allranks_logs: str) -> bool:
 @click.option("--environment", required=True, type=str, help="Environment of the workload")
 @click.option("--platform", required=True, type=str, help="Platform of the workload")
 @click.option("--container-image", required=True, type=str, help="Container image of the workload")
+@click.option(
+    "--n-repeat", required=False, type=int, help="Number of times to repeat the workload", default=1
+)
 @click.option("--data-dir", required=False, type=str, help="Data directory of the workload")
 @click.option("--tag", required=False, type=str, help="Tag of the workload")
 @click.option(
@@ -68,6 +71,7 @@ def main(
     environment,
     platform,
     container_image,
+    n_repeat: int = 1,
     data_dir: Optional[str] = None,
     tag: Optional[str] = None,
     enable_lightweight_mode: Optional[bool] = False,
@@ -92,6 +96,7 @@ def main(
     magic_values["assets_dir"] = "/opt/megatron-lm/assets_dir"
     magic_values["artifacts_dir"] = "/opt/megatron-lm/artifacts_dir"
     magic_values["environment"] = environment
+    magic_values["n_repeat"] = n_repeat
     magic_values["test_case"] = workload.spec["test_case"]
     magic_values["name"] = workload.spec["name"].format(**magic_values)
     workload.spec["script"] = workload.spec["script"].format(**magic_values)
@@ -113,8 +118,10 @@ def main(
             "PYTHONUNBUFFERED": "1",
             "OUTPUT_PATH": os.getcwd(),
             "ENABLE_LIGHTWEIGHT_MODE": str(enable_lightweight_mode).lower(),
-            "N_REPEAT": "1",
+            "N_REPEAT": str(n_repeat),
             "CLUSTER": "dgxh100_dgxc",
+            "NCCL_DEBUG": "INFO",
+            "NCCL_DEBUG_FILE": "/opt/megatron-lm/assets_dir/logs/nccl_debug.log",
         },
         packager=run.Packager(),
         volumes=artifacts,
diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py
index e26d04d6f20..394bda30a01 100644
--- a/tests/test_utils/python_scripts/recipe_parser.py
+++ b/tests/test_utils/python_scripts/recipe_parser.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import copy
 import itertools
 import logging
@@ -23,6 +24,8 @@ class dotdict(dict):
 def resolve_cluster_config(cluster: str) -> str:
     if cluster == "dgxh100_eos":
         return "eos"
+    if cluster == "dgxgb200_oci-hsg":
+        return "oci-hsg"
     if cluster == "dgxa100_dracooci":
         return "draco-oci-iad"
     if cluster == "dgxa100_dracooci-ord":
@@ -34,28 +37,27 @@ def resolve_cluster_config(cluster: str) -> str:
     raise ValueError(f"Unknown cluster {cluster} provided.")
 
 
-def resolve_artifact_config(cluster: str) -> str:
-    if cluster == "dgxh100_eos":
-        return "eos_lustre"
-    if cluster == "dgxa100_dracooci":
-        return "draco-oci_lustre"
-    if cluster == "dgxa100_dracooci-ord":
-        return "draco-oci-ord_lustre"
-    if cluster == "dgxh100_coreweave":
-        return "coreweave_lustre"
-    raise ValueError(f"Unknown cluster {cluster} provided.")
-
-
 def flatten_products(workload_manifest: dotdict) -> dotdict:
     """Flattens a nested dict of products"""
-    workload_manifest.products = [
-        dict(**dict(zip(inp.keys(), values)), **{"test_case": product["test_case"][0]})
-        for product in (workload_manifest.products or [])
-        if "products" in product
-        for inp in product["products"]
-        for values in itertools.product(*inp.values())
-    ]
+    flattened_products = []
+    products = workload_manifest.products or []
+
+    for product in products:
+        if "products" not in product:
+            continue
 
+        test_case = product["test_case"][0]
+        for param_dict in product["products"]:
+            # Generate all combinations of parameter values
+            param_combinations = itertools.product(*param_dict.values())
+
+            for value_combination in param_combinations:
+                # Map parameter names to their values
+                flattened = dict(zip(param_dict.keys(), value_combination))
+                flattened["test_case"] = test_case
+                flattened_products.append(flattened)
+
+    workload_manifest.products = flattened_products
     return workload_manifest
 
 
@@ -232,14 +234,13 @@ def load_workloads(
 
     workloads: List[dotdict] = []
     build_workloads: List = []
-    for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")):
+    for file in list(recipes_dir.glob("**/*.yaml")) + list(local_dir.glob("**/*.yaml")):
         workloads += load_and_flatten(config_path=str(file))
         if file.stem.startswith("_build"):
             build_workloads.append(load_config(config_path=str(file)))
 
     if scope:
         workloads = filter_by_scope(workload_manifests=workloads, scope=scope)
-
     if workloads and environment:
         workloads = filter_by_environment(workload_manifests=workloads, environment=environment)
 
diff --git a/tests/test_utils/recipes/_build-mcore-dev.yaml b/tests/test_utils/recipes/_build-mcore-dev.yaml
index 123250d7469..d82417ea5e3 100644
--- a/tests/test_utils/recipes/_build-mcore-dev.yaml
+++ b/tests/test_utils/recipes/_build-mcore-dev.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [maanug]
 spec:
   name: mcore-pyt-dev
-  platforms: [linux/amd64]
+  platforms: [linux/amd64,linux/arm64]
   source:
     # The image tag will be added via `jet-tests.yaml`
     # Tags are one of {buildcache, $CI_PIPELINE_ID}
diff --git a/tests/test_utils/recipes/_build-mcore-lts.yaml b/tests/test_utils/recipes/_build-mcore-lts.yaml
index d017b71c101..8efa6faa1e5 100644
--- a/tests/test_utils/recipes/_build-mcore-lts.yaml
+++ b/tests/test_utils/recipes/_build-mcore-lts.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [maanug]
 spec:
   name: mcore-pyt-lts
-  platforms: [linux/amd64]
+  platforms: [linux/amd64,linux/arm64]
   source:
     # The image tag will be added via `jet-tests.yaml`
     # Tags are one of {buildcache, $CI_PIPELINE_ID}
diff --git a/tests/test_utils/recipes/gb200/gpt.yaml b/tests/test_utils/recipes/gb200/gpt.yaml
new file mode 100644
index 00000000000..f387fbb9a13
--- /dev/null
+++ b/tests/test_utils/recipes/gb200/gpt.yaml
@@ -0,0 +1,423 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: "{test_case}_{environment}_{platforms}"
+  model: gpt
+  build: mcore-pyt-{environment}
+  nodes: 2
+  gpus: 4
+  n_repeat: 5
+  platforms: dgx_gb200
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+
+    # Checkout backwards-ref
+    cd /opt
+    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin $MCORE_BACKWARDS_COMMIT
+    git checkout $MCORE_BACKWARDS_COMMIT
+    git rev-parse HEAD
+    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+  script: |-
+    ls
+    cd /opt/megatron-lm
+
+    NAME=$(echo {test_case}_{environment} | sed 's/dgx_gb200/dgx_a100/g')
+    export GPUS_PER_NODE={gpus}
+
+    ARGUMENTS=(
+        "DATA_PATH=/mnt/artifacts"
+        "DATA_CACHE_PATH=/lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID/cache/"
+        "OUTPUT_PATH={assets_dir}"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
+        "CHECKPOINT_LOAD_PATH=/mnt/artifacts/"
+        "TRAINING_SCRIPT_PATH=pretrain_gpt.py"
+        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
+        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
+        "N_REPEAT={n_repeat}"
+        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
+        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
+    )
+
+    set +x 
+    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} 
+    exit_code=$?
+    echo "Exit code: $exit_code"
+    rm -rf /lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID || true
+    set -x
+    exit $exit_code
+
+products:
+  #######################################################################
+  # Nightly tests: Run DEV unless something is flaky       #
+  #######################################################################
+  - test_case: [gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp1_pp2]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp1_pp2_resume_torch_dist]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+  - test_case: [gpt3_mcore_tp1_pp4]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp1_pp4_resume_torch_dist]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+  - test_case: [gpt3_mcore_tp4_pp1_resume_torch]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp4_pp1_resume_torch_dist]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  #######################################################################
+  # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for  #
+  #             some very important tests.                              #
+  #######################################################################
+  - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github-broken]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+      # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist]
+      #   products:
+      #     - environment: [dev]
+      #       scope: [mr]
+      #       platforms: [dgx_gb200] # Hangs: #513
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_gdn]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_mla]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp2_pp2_uninstall_te]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_7b_tp1_pp4_memory_speed]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_7b_tp4_pp1_memory_speed]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  #######################################################################
+  # Super important mr, mr-github tests that run for DEV per mr, mr-github       #
+  #######################################################################
+  - test_case: [gpt3_mcore_reruns_persistent_1]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github-broken]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_a100, dgx_gb200]
+  # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_b200]
+  # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_b200]
+  # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_b200]
+  # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_b200]
+  # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_b200]
+  - test_case: [gpt3_weekly_dgx_gb200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap]
+    products:
+      - environment: [dev]
+        scope: [weekly]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [weekly]
+  #       platforms: [dgx_gb200]
+  - test_case: [gpt3_weekly_dgx_gb200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap]
+    products:
+      - environment: [dev]
+        scope: [weekly]
+        platforms: [dgx_gb200]
diff --git a/tests/test_utils/recipes/gb200/moe.yaml b/tests/test_utils/recipes/gb200/moe.yaml
new file mode 100644
index 00000000000..28ae2415aac
--- /dev/null
+++ b/tests/test_utils/recipes/gb200/moe.yaml
@@ -0,0 +1,220 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: "{test_case}_{environment}_{platforms}"
+  model: moe
+  build: mcore-pyt-{environment}
+  nodes: 2
+  gpus: 4
+  n_repeat: 5
+  platforms: dgx_gb200
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+
+    # Checkout backwards-ref
+    cd /opt
+    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin $MCORE_BACKWARDS_COMMIT
+    git checkout $MCORE_BACKWARDS_COMMIT
+    git rev-parse HEAD
+    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+  script: |-
+    ls
+    cd /opt/megatron-lm
+
+    NAME=$(echo {test_case}_{environment} | sed 's/dgx_gb200/dgx_a100/g')
+    export GPUS_PER_NODE={gpus}
+
+    ARGUMENTS=(
+        "DATA_PATH=/mnt/artifacts"
+        "DATA_CACHE_PATH=/lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID/cache/"
+        "OUTPUT_PATH={assets_dir}"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
+        "CHECKPOINT_LOAD_PATH=/mnt/artifacts"
+        "TRAINING_SCRIPT_PATH=pretrain_gpt.py"
+        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
+        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
+        "N_REPEAT={n_repeat}"
+        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
+        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
+    )
+
+    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
+
+products:
+  #######################################################################
+  # Nightly tests: Run both DEV and LTS unless something is flaky       #
+  #######################################################################
+  - test_case: [gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last]
+    products:
+      - environment: [dev]
+        scope: [nightly]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts]
+  #   products: # non-determinism: #478
+  #     - environment: [dev, lts]
+  #       scope: [nightly]
+  #######################################################################
+  # Weekly tests: Run both DEV and LTS unless something is flaky        #
+  #######################################################################
+  #######################################################################
+  # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for  #
+  #             some very important tests.                              #
+  #######################################################################
+  - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200] # hang: #513
+  - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_gb200] # hang: #513
+  # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200] # hang: #513
+  - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_gb200]
+  #######################################################################
+  # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github       #
+  #######################################################################
+  # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_gb200]
+  ###########################
+  # Merge train tests       #
+  ###########################
+  - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_gb200]
+  - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_gb200]
diff --git a/tests/test_utils/recipes/gb200/unit-tests.yaml b/tests/test_utils/recipes/gb200/unit-tests.yaml
new file mode 100644
index 00000000000..0e8cb72916b
--- /dev/null
+++ b/tests/test_utils/recipes/gb200/unit-tests.yaml
@@ -0,0 +1,153 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: "{test_case}_{environment}_{platforms}_{tag}"
+  model: unit-tests
+  nodes: 2
+  build: mcore-pyt-{environment}
+  gpus: 4
+  platforms: dgx_gb200
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+
+    # Checkout backwards-ref
+    cd /opt
+    bash /opt/megatron-lm/.gitlab/scripts/fetch-legacy-suite.sh \
+      --backwards-commit $MCORE_BACKWARDS_COMMIT \
+      --repo $MCORE_REPO
+
+  script: |-
+    ls
+
+    TAG={tag}
+    ENVIRONMENT={environment}
+    BUCKET="{test_case}"
+    UNIT_TEST_REPEAT={n_repeat}
+    export GPUS_PER_NODE={gpus}
+
+    if [[ "$TAG" == "latest" ]]; then
+        TEST_PATH="/opt/megatron-lm"
+    else
+        TEST_PATH="/opt/megatron-lm-legacy/"
+    fi
+
+
+    bash $TEST_PATH/tests/unit_tests/run_ci_test.sh \
+      --tag $TAG \
+      --environment $ENVIRONMENT \
+      --bucket $BUCKET \
+      --unit-test-repeat $UNIT_TEST_REPEAT \
+      --log-dir {assets_dir}/logs/1/
+
+    ls -al 
+
+    cd $TEST_PATH
+    /opt/venv/bin/coverage xml 
+    cp .coverage {assets_dir}/coverage_report
+    cp coverage.xml {assets_dir}
+
+products:
+  - test_case: [tests/unit_tests/test_model_configs.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/test_fp8_param.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/pipeline_parallel/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/models/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/data/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/dist_checkpointing/test_optimizer.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/dist_checkpointing/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/dist_checkpointing/models/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/dist_checkpointing/models/test_moe_experts.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/transformer/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/transformer/moe/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/distributed/megatron_fsdp/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
+  - test_case: [tests/unit_tests/**/*.py]
+    products:
+      - environment: [dev]
+        tag: [latest, legacy]
+        scope: [unit-tests]
+        n_repeat: [1]
+        time_limit: [1800]
diff --git a/tests/test_utils/recipes/bert.yaml b/tests/test_utils/recipes/h100/bert.yaml
similarity index 96%
rename from tests/test_utils/recipes/bert.yaml
rename to tests/test_utils/recipes/h100/bert.yaml
index 49fed7f5542..89499f93c5e 100644
--- a/tests/test_utils/recipes/bert.yaml
+++ b/tests/test_utils/recipes/h100/bert.yaml
@@ -59,22 +59,22 @@ products:
   - test_case: [bert_mcore_tp2_pp2]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp2_pp2_local_spec]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp2_pp2_resume_torch_dist]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp2_pp2_resume_torch_dist_local_spec]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp1_pp2]
     products:
diff --git a/tests/test_utils/recipes/ckpt_converter.yaml b/tests/test_utils/recipes/h100/ckpt_converter.yaml
similarity index 100%
rename from tests/test_utils/recipes/ckpt_converter.yaml
rename to tests/test_utils/recipes/h100/ckpt_converter.yaml
diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference-cuda-graphs.yaml
similarity index 100%
rename from tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml
rename to tests/test_utils/recipes/h100/gpt-dynamic-inference-cuda-graphs.yaml
diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml
similarity index 87%
rename from tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml
rename to tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml
index e882d721860..19d523eea8d 100644
--- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml
+++ b/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: '{test_case}_{environment}_{platforms}'
+  name: "{test_case}_{environment}_{platforms}"
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
@@ -69,11 +70,11 @@ products:
       - environment: [dev]
         scope: [mr]
         platforms: [dgx_h100]
-  - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq]
-    products:
-      - environment: [dev]
-        scope: [mr]
-        platforms: [dgx_h100]
+  # - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_h100]
   - test_case: [gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq]
     products:
       - environment: [dev]
diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml
similarity index 94%
rename from tests/test_utils/recipes/gpt-dynamic-inference.yaml
rename to tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml
index a3853c3d9e1..2915263c0e7 100644
--- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml
+++ b/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/h100/gpt-grads.yaml
similarity index 97%
rename from tests/test_utils/recipes/gpt-grads.yaml
rename to tests/test_utils/recipes/h100/gpt-grads.yaml
index 28fee0b02e9..98d0ade3060 100644
--- a/tests/test_utils/recipes/gpt-grads.yaml
+++ b/tests/test_utils/recipes/h100/gpt-grads.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
diff --git a/tests/test_utils/recipes/gpt-grpo.yaml b/tests/test_utils/recipes/h100/gpt-grpo.yaml
similarity index 75%
rename from tests/test_utils/recipes/gpt-grpo.yaml
rename to tests/test_utils/recipes/h100/gpt-grpo.yaml
index 76f1ea2d3a9..e707c1c2431 100644
--- a/tests/test_utils/recipes/gpt-grpo.yaml
+++ b/tests/test_utils/recipes/h100/gpt-grpo.yaml
@@ -45,7 +45,7 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
@@ -54,13 +54,28 @@ spec:
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
 
 products:
-  - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest]
+  - test_case: [gpt_grpo_tp4_pp1_dp2_8b_throughput]
     products:
       - environment: [dev]
         scope: [mr]
         platforms: [dgx_h100]
-  - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github]
+  - test_case: [gpt_grpo_tp4_pp1_dp2_8b_throughput_github]
     products:
       - environment: [dev]
         scope: [mr-github]
         platforms: [dgx_h100]
+  - test_case: [gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_h100]
+  - test_case: [gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github]
+    products:
+      - environment: [dev]
+        scope: [mr-github-broken]
+        platforms: [dgx_h100]
+  - test_case: [gpt_grpo_tp2tp1_pp4pp2_dp8_583m_throughputtest]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/h100/gpt-nemo.yaml
similarity index 100%
rename from tests/test_utils/recipes/gpt-nemo.yaml
rename to tests/test_utils/recipes/h100/gpt-nemo.yaml
diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/h100/gpt-static-inference.yaml
similarity index 93%
rename from tests/test_utils/recipes/gpt-static-inference.yaml
rename to tests/test_utils/recipes/h100/gpt-static-inference.yaml
index 39c2c3c934e..806762531fd 100644
--- a/tests/test_utils/recipes/gpt-static-inference.yaml
+++ b/tests/test_utils/recipes/h100/gpt-static-inference.yaml
@@ -44,8 +44,9 @@ spec:
         "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
@@ -57,7 +58,7 @@ products:
   - test_case: [gpt_static_inference_tp1_pp1_583m_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr, mr-github]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_tp1_pp1_583m_cudagraphs]
     products:
diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/h100/gpt.yaml
similarity index 98%
rename from tests/test_utils/recipes/gpt.yaml
rename to tests/test_utils/recipes/h100/gpt.yaml
index 1b1ab41231d..90eddc55c27 100644
--- a/tests/test_utils/recipes/gpt.yaml
+++ b/tests/test_utils/recipes/h100/gpt.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: '{test_case}_{environment}_{platforms}'
+  name: "{test_case}_{environment}_{platforms}"
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
@@ -342,6 +342,11 @@ products:
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
+  - test_case: [gpt3_mcore_te_tp2_pp1_gdn]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_mla]
     products:
       - environment: [dev]
@@ -457,7 +462,7 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer]
     products:
       - environment: [lts]
-        scope: [mr]
+        scope: [nightly]
       - environment: [dev]
         scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
@@ -467,11 +472,11 @@ products:
         scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
-        scope: [mr]
+        scope: [nightly]
   - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
     products:
       - environment: [lts]
-        scope: [mr]
+        scope: [nightly]
       - environment: [dev]
         scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
@@ -481,7 +486,7 @@ products:
         scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
-        scope: [mr]
+        scope: [nightly]
   # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone]
   #   products:
   #     - environment: [dev]
diff --git a/tests/test_utils/recipes/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml
similarity index 85%
rename from tests/test_utils/recipes/mamba-dynamic-inference.yaml
rename to tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml
index 0d02ce29a54..c4c675746b9 100644
--- a/tests/test_utils/recipes/mamba-dynamic-inference.yaml
+++ b/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
@@ -57,5 +58,10 @@ products:
   - test_case: [hybrid_dynamic_inference_tp1_pp1_dp8_583m]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr-github]
+        platforms: [dgx_h100]
+  - test_case: [hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill]
+    products:
+      - environment: [dev]
+        scope: [mr-github]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/h100/mamba-static-inference.yaml
similarity index 89%
rename from tests/test_utils/recipes/mamba-static-inference.yaml
rename to tests/test_utils/recipes/h100/mamba-static-inference.yaml
index 4c60d45f889..b36c4a8f765 100644
--- a/tests/test_utils/recipes/mamba-static-inference.yaml
+++ b/tests/test_utils/recipes/h100/mamba-static-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: '{test_case}_{environment}_{platforms}'
+  name: "{test_case}_{environment}_{platforms}"
   model: hybrid
   build: mcore-pyt-{environment}
   nodes: 1
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
@@ -57,7 +58,7 @@ products:
   - test_case: [hybrid_static_inference_tp1_pp1_2B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr-broken, mr-github-broken]
         platforms: [dgx_h100]
   - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
     products:
diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/h100/mamba.yaml
similarity index 98%
rename from tests/test_utils/recipes/mamba.yaml
rename to tests/test_utils/recipes/h100/mamba.yaml
index 92b799d3d1c..47b731f7e00 100644
--- a/tests/test_utils/recipes/mamba.yaml
+++ b/tests/test_utils/recipes/h100/mamba.yaml
@@ -82,7 +82,7 @@ products:
   - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr, mr-github]
         platforms: [dgx_h100]
       # - environment: [lts] # disabled until triton is bumped
       #   scope: [nightly]
diff --git a/tests/test_utils/recipes/mimo.yaml b/tests/test_utils/recipes/h100/mimo.yaml
similarity index 100%
rename from tests/test_utils/recipes/mimo.yaml
rename to tests/test_utils/recipes/h100/mimo.yaml
diff --git a/tests/test_utils/recipes/h100/module_performance.yaml b/tests/test_utils/recipes/h100/module_performance.yaml
new file mode 100644
index 00000000000..a6cf19bdd5c
--- /dev/null
+++ b/tests/test_utils/recipes/h100/module_performance.yaml
@@ -0,0 +1,52 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: "{test_case}_{environment}_{platforms}"
+  model: common
+  build: mcore-pyt-{environment}
+  nodes: 1
+  gpus: 8
+  platforms: dgx_a100
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+
+    # Checkout backwards-ref
+    cd /opt
+    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin $MCORE_BACKWARDS_COMMIT
+    git checkout $MCORE_BACKWARDS_COMMIT
+    git rev-parse HEAD
+    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+  script: |-
+    ls
+
+    cd /opt/megatron-lm
+
+    uv run --no-sync python -m torch.distributed.run \
+      --log-dir {assets_dir}/logs/1 \
+      --tee "0:3,7:3" \
+      --redirects "3" \
+      --nproc_per_node 8 \
+      -m tests.functional_tests.test_cases.common.{test_case}
+
+products:
+  - test_case: [moe_perf]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github]
+        platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml
new file mode 100644
index 00000000000..513aa92834b
--- /dev/null
+++ b/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml
@@ -0,0 +1,68 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: '{test_case}_{environment}_{platforms}'
+  model: moe
+  build: mcore-pyt-{environment}
+  nodes: 1
+  gpus: 8
+  n_repeat: 1
+  platforms: dgx_a100
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+    # Checkout backwards-ref
+    cd /opt
+    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin $MCORE_BACKWARDS_COMMIT
+    git checkout $MCORE_BACKWARDS_COMMIT
+    git rev-parse HEAD
+    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+  script: |-
+    ls
+    cd /opt/megatron-lm
+
+    ARGUMENTS=(
+        "CHECKPOINT_LOAD_PATH=/mnt/artifacts"
+        "CHECKPOINT_SAVE_PATH=/tmp/checkpoints"
+        "DATA_PATH=null"
+        "DATA_CACHE_PATH=/workspace/data/cache"
+        "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py"
+        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
+        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
+        "OUTPUT_PATH={assets_dir}"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
+        "N_REPEAT={n_repeat}"
+        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
+        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
+    )
+
+    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
+
+products:
+  - test_case: [gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_zmq]
+    products:
+      - environment: [dev]
+        scope: [flaky]
+        platforms: [dgx_h100]
+  - test_case: [gpt_dynamic_inference_tp4_etp1_pp1_ep8_16B_logitsmatch_cudagraph_zmq]
+    products:
+      - environment: [dev]
+        scope: [flaky]
+        platforms: [dgx_h100]
+  
diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml
similarity index 93%
rename from tests/test_utils/recipes/moe-dynamic-inference.yaml
rename to tests/test_utils/recipes/h100/moe-dynamic-inference.yaml
index 6d8fdc533e1..fc1c07231c3 100644
--- a/tests/test_utils/recipes/moe-dynamic-inference.yaml
+++ b/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
diff --git a/tests/test_utils/recipes/h100/moe-grpo.yaml b/tests/test_utils/recipes/h100/moe-grpo.yaml
new file mode 100644
index 00000000000..de430b64fe0
--- /dev/null
+++ b/tests/test_utils/recipes/h100/moe-grpo.yaml
@@ -0,0 +1,61 @@
+type: basic
+format_version: 1
+maintainers: [mcore]
+loggers: [stdout]
+spec:
+  name: "{test_case}_{environment}_{platforms}"
+  model: moe
+  build: mcore-pyt-{environment}
+  nodes: 1
+  gpus: 1
+  n_repeat: 1
+  platforms: dgx_a100
+  script_setup: |
+    unset https_proxy
+    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
+
+    # Checkout latest
+    cd /opt
+    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+    git fetch origin $MCORE_MR_COMMIT
+    git checkout $MCORE_MR_COMMIT
+    git rev-parse HEAD
+    # Checkout backwards-ref
+    cd /opt
+    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+    git init
+    git remote add origin $MCORE_REPO
+    git fetch origin $MCORE_BACKWARDS_COMMIT
+    git checkout $MCORE_BACKWARDS_COMMIT
+    git rev-parse HEAD
+    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+  script: |-
+    ls
+    cd /opt/megatron-lm
+
+    ARGUMENTS=(
+        "CHECKPOINT_LOAD_PATH=/mnt/artifacts"
+        "CHECKPOINT_SAVE_PATH=/tmp/checkpoints"
+        "DATA_PATH=/mnt/artifacts/"
+        "DATA_CACHE_PATH=/workspace/data/cache"
+        "TRAINING_SCRIPT_PATH=train_rl.py"
+        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
+        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
+        "OUTPUT_PATH={assets_dir}"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "N_REPEAT={n_repeat}"
+        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
+        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
+    )
+
+    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
+
+products:
+  - test_case: [gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/h100/moe-static-inference.yaml
similarity index 94%
rename from tests/test_utils/recipes/moe-static-inference.yaml
rename to tests/test_utils/recipes/h100/moe-static-inference.yaml
index c23a772aa28..f10d293e953 100644
--- a/tests/test_utils/recipes/moe-static-inference.yaml
+++ b/tests/test_utils/recipes/h100/moe-static-inference.yaml
@@ -45,7 +45,8 @@ spec:
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json"
+        "TENSORBOARD_PATH={assets_dir}/tensorboard"
+        "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
         "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
         "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/h100/moe.yaml
similarity index 85%
rename from tests/test_utils/recipes/moe.yaml
rename to tests/test_utils/recipes/h100/moe.yaml
index 86d2b248e39..06039d77440 100644
--- a/tests/test_utils/recipes/moe.yaml
+++ b/tests/test_utils/recipes/h100/moe.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: '{test_case}_{environment}_{platforms}'
+  name: "{test_case}_{environment}_{platforms}"
   model: moe
   build: mcore-pyt-{environment}
   nodes: 1
@@ -129,7 +129,7 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr-broken]
         platforms: [dgx_h100] # hang: #513
   # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental]
   #   products:
@@ -168,6 +168,16 @@ products:
       - environment: [dev]
         scope: [mr]
         platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective]
     products:
       - environment: [dev]
@@ -180,6 +190,31 @@ products:
       - environment: [dev]
         scope: [mr]
         platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph]
+    products:
+      - environment: [dev]
+        scope: [mr]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading]
+    products:
+      - environment: [dev]
+        scope: [mr-broken]
+        platforms: [dgx_h100]
   #######################################################################
   # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github       #
   #######################################################################
@@ -204,5 +239,5 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr-broken]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/h100/multimodal-llava.yaml
similarity index 100%
rename from tests/test_utils/recipes/multimodal-llava.yaml
rename to tests/test_utils/recipes/h100/multimodal-llava.yaml
diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/h100/t5.yaml
similarity index 96%
rename from tests/test_utils/recipes/t5.yaml
rename to tests/test_utils/recipes/h100/t5.yaml
index 96b560c6427..1761cd3f1e6 100644
--- a/tests/test_utils/recipes/t5.yaml
+++ b/tests/test_utils/recipes/h100/t5.yaml
@@ -59,27 +59,27 @@ products:
   - test_case: [t5_11b_mcore_tp4_pp1]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [t5_mcore_te_tp4_pp1]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [t5_mcore_te_tp4_pp1_resume_torch_dist]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [t5_mcore_tp4_pp1]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [t5_mcore_tp4_pp1_resume_torch_dist]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [nightly]
         platforms: [dgx_h100]
   - test_case: [t5_mcore_te_tp1_pp1_vp1_resume_torch]
     products:
diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/h100/unit-tests.yaml
similarity index 98%
rename from tests/test_utils/recipes/unit-tests.yaml
rename to tests/test_utils/recipes/h100/unit-tests.yaml
index 8e9421ac02e..c3527272782 100644
--- a/tests/test_utils/recipes/unit-tests.yaml
+++ b/tests/test_utils/recipes/h100/unit-tests.yaml
@@ -136,7 +136,7 @@ products:
         scope: [unit-tests]
         n_repeat: [1]
         time_limit: [1800]
-  - test_case: [tests/unit_tests/distributed/fsdp/**/*.py]
+  - test_case: [tests/unit_tests/distributed/megatron_fsdp/**/*.py]
     products:
       - environment: [lts, dev]
         tag: [latest]
diff --git a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py
new file mode 100644
index 00000000000..85586095bd7
--- /dev/null
+++ b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py
@@ -0,0 +1,379 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+import gc
+import os
+import sys
+
+import pytest
+import torch
+
+from megatron.core.enums import ModelType
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_decoder_block_spec,
+    get_gpt_mtp_block_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.pipeline_parallel.utils import set_streams
+from megatron.core.tensor_parallel.random import HAVE_TE, model_parallel_cuda_manual_seed
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.module import float16_to_fp32
+from megatron.core.utils import is_te_min_version, unwrap_model
+from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args
+from megatron.training.global_vars import (
+    destroy_global_vars,
+    get_args,
+    set_args,
+    set_global_variables,
+)
+from megatron.training.training import setup_model_and_optimizer
+from tests.unit_tests.test_utilities import Utils
+
+
+def is_deep_ep_available():
+    from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP
+
+    return HAVE_DEEP_EP
+
+
+def is_hybrid_ep_available():
+    from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP
+
+    return HAVE_HYBRIDEP
+
+
+def save(fn, message):
+    with open(fn, 'w') as f:
+        f.write(message)
+
+
+class TestPartialCudaGraphedA2AOverlap:
+    """Test that CUDA graph outputs match ep-overlapped CUDA graph outputs for various scopes."""
+
+    def setup_method(self, method):
+        self.seq_length = 512
+        self.micro_batch_size = 2
+        # Store original environment variable values
+        self.original_env = {
+            'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'),
+            'NVTE_ALLOW_NONDETERMINISTIC_ALGO': os.environ.get('NVTE_ALLOW_NONDETERMINISTIC_ALGO'),
+        }
+        self.cuda_graph_helper = None
+        os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+        os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0'
+
+    def teardown_method(self, method):
+        # Restore original environment variable values
+        for key, value in self.original_env.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
+        Utils.destroy_model_parallel()
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+        self.delete_cuda_graphs()
+
+        gc.collect()
+
+    def delete_cuda_graphs(self):
+        if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
+            self.cuda_graph_helper.delete_cuda_graphs()
+            self.cuda_graph_helper = None
+
+    def model_provider(
+        self,
+        pre_process=True,
+        post_process=True,
+        layer_spec_fn=get_gpt_decoder_block_spec,
+        **config_kwargs,
+    ):
+        model_parallel_cuda_manual_seed(123)
+        args = get_args()
+        config = core_transformer_config_from_args(args)
+        transformer_layer_spec = layer_spec_fn(
+            config,
+            use_transformer_engine=True,
+            normalization=args.normalization,
+            qk_l2_norm=args.qk_l2_norm,
+        )
+        if args.mtp_num_layers:
+            mtp_block_spec = get_gpt_mtp_block_spec(
+                config, transformer_layer_spec, use_transformer_engine=True
+            )
+        else:
+            mtp_block_spec = None
+        return GPTModel(
+            config=config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=args.vocab_size,
+            max_sequence_length=args.max_position_embeddings,
+            pre_process=pre_process,
+            post_process=post_process,
+            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+            parallel_output=True,
+            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+            position_embedding_type=args.position_embedding_type,
+            rotary_percent=args.rotary_percent,
+            mtp_block_spec=mtp_block_spec,
+        )
+
+    def create_test_args(
+        self, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs
+    ):
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+
+        sys.argv = ['test_cuda_graphs.py']
+        args = parse_args()
+        args.num_layers = 1
+        args.mtp_num_layers = None
+        args.vocab_size = 1024
+        args.hidden_size = 128
+        args.num_attention_heads = 8
+        args.max_position_embeddings = 512
+        args.global_batch_size = self.micro_batch_size * 8
+        args.micro_batch_size = self.micro_batch_size
+        args.create_attention_mask_in_dataloader = True
+        args.seq_length = self.seq_length
+        args.tensor_model_parallel_size = 2
+        args.sequence_parallel = True
+        args.pipeline_model_parallel_size = 1
+        args.context_parallel_size = 1
+        args.expert_model_parallel_size = ep_size
+        args.train_iters = 10
+        args.lr = 3e-5
+        args.bf16 = True
+        args.add_bias_linear = False
+        args.swiglu = True
+        args.use_distributed_optimizer = True
+        args.position_embedding_type = "rope"
+        args.rotary_percent = 1.0
+        args.hidden_dropout = 0.0
+        args.attention_dropout = 0.0
+        args.untie_embeddings_and_output_weights = True
+
+        # MoE settings
+        args.num_experts = 16
+        args.expert_model_parallel_size = ep_size
+        args.moe_shared_expert_intermediate_size = 1024
+        args.moe_layer_freq = kwargs.get("moe_layer_freq", "[0,0,1,1]")
+        args.moe_permute_fusion = True
+        args.moe_router_fusion = True
+        args.moe_router_topk = 2
+
+        # CUDA graph settings
+        args.cuda_graph_impl = cuda_graph_impl
+        args.cuda_graph_scope = cuda_graph_scope
+        args.cuda_graph_warmup_steps = cuda_graph_warmup_steps
+        args.use_te_rng_tracker = cuda_graph_impl != "none"
+
+        for key, value in kwargs.items():
+            assert hasattr(args, key)
+            setattr(args, key, value)
+
+        validate_args(args)
+        set_global_variables(args, False)
+        return args
+
+    def get_batch(self, seq_length, micro_batch_size):
+        data = list(range(seq_length))
+        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, seq_length, seq_length), dtype=bool
+        ).cuda()
+        loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda()
+        return input_ids, labels, position_ids, attention_mask, loss_mask
+
+    def _run_1f1b_helper(self, gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps):
+        from megatron.core.models.common.model_chunk_schedule_plan import (
+            TransformerModelChunkSchedulePlan,
+        )
+        from megatron.core.pipeline_parallel.schedules import set_current_microbatch
+
+        schedule_plans = []
+        losses = []
+        set_current_microbatch(gpt_model[0], 1)
+
+        gpt_model[0].zero_grad_buffer()
+        optimizer.zero_grad()
+        assert cuda_graph_warmup_steps > 0, "cuda_graph_warmup_steps must be greater than 0"
+        for fwd_mb_idx in range(num_iters + 1):
+            # Capture CUDA graphs after warmup if helper is provided
+            if self.cuda_graph_helper is not None and fwd_mb_idx == cuda_graph_warmup_steps:
+                self.cuda_graph_helper.create_cudagraphs()
+
+            if fwd_mb_idx < cuda_graph_warmup_steps:
+                gpt_model[0].zero_grad_buffer()
+                optimizer.zero_grad()
+                output = gpt_model[0].forward(**data)
+                schedule_plans.append(None)
+            else:
+                if fwd_mb_idx == cuda_graph_warmup_steps:
+                    extra_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data)
+                    TransformerModelChunkSchedulePlan.run(extra_schedule_plan, None)
+                    schedule_plans[-1] = extra_schedule_plan
+                f_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data)
+                b_schedule_plan = schedule_plans[-1]
+                schedule_plans.append(f_schedule_plan)
+                if b_schedule_plan is not None:
+                    gpt_model[0].zero_grad_buffer()
+                    optimizer.zero_grad()
+                output = TransformerModelChunkSchedulePlan.run(
+                    f_schedule_plan,
+                    b_schedule_plan,
+                    b_grad=torch.ones_like(output) if fwd_mb_idx > 0 else None,
+                )
+            # Check output shapes
+            if fwd_mb_idx < num_iters:
+                assert output is not None
+                assert output.shape[0] == self.micro_batch_size
+                assert output.shape[1] == self.seq_length
+                losses.append(output)
+
+            if fwd_mb_idx < cuda_graph_warmup_steps:
+                output.backward(torch.ones_like(output))
+
+            for param in gpt_model[0].parameters():
+                assert param.main_grad is not None
+
+            update_successful, _, _ = optimizer.step()
+            assert update_successful
+
+        return losses
+
+    def _run_test_helper(
+        self,
+        ep_size,
+        cuda_graph_impl,
+        cuda_graph_scope,
+        cuda_graph_warmup_steps,
+        ep_overlap=False,
+        **kwargs,
+    ):
+        """Test fp8_param with gpt_model."""
+        args = self.create_test_args(
+            cuda_graph_impl,
+            cuda_graph_scope,
+            cuda_graph_warmup_steps,
+            ep_size,
+            overlap_moe_expert_parallel_comm=ep_overlap,
+            **kwargs,
+        )
+        if ep_overlap:
+            set_streams()
+        set_args(args)
+        torch.manual_seed(123)
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=2, expert_model_parallel_size=ep_size
+        )
+
+        input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch(
+            self.seq_length, self.micro_batch_size
+        )
+
+        gpt_model, optimizer, _ = setup_model_and_optimizer(
+            self.model_provider, ModelType.encoder_or_decoder
+        )
+        assert len(gpt_model) == 1  # Assume only one model in the model provider.
+
+        loss_list = []
+
+        if cuda_graph_impl == "transformer_engine":
+            from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
+
+            self.cuda_graph_helper = TECudaGraphHelper(
+                model=gpt_model,
+                config=gpt_model[0].config,
+                seq_length=self.seq_length,
+                micro_batch_size=self.micro_batch_size,
+                optimizers=[optimizer],
+            )
+
+        num_iters = cuda_graph_warmup_steps + 2
+        data = {
+            "input_ids": input_ids,
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+            "loss_mask": loss_mask,
+        }
+        if not ep_overlap:
+            for i in range(num_iters):
+                gpt_model[0].zero_grad_buffer()
+                optimizer.zero_grad()
+
+                # Capture CUDA graphs after warmup if helper is provided
+                if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps:
+                    self.cuda_graph_helper.create_cudagraphs()
+
+                output = unwrap_model(gpt_model[0]).forward(**data)
+                output = float16_to_fp32(output)
+
+                # Check output shapes
+                assert output.shape[0] == self.micro_batch_size
+                assert output.shape[1] == self.seq_length
+
+                # Verify gradients
+                output.backward(torch.ones_like(output))
+                for param in gpt_model[0].parameters():
+                    assert param.main_grad is not None
+
+                update_successful, _, _ = optimizer.step()
+                assert update_successful
+
+                loss_list.append(output)
+        else:
+            loss_list = self._run_1f1b_helper(
+                gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps
+            )
+
+        self.delete_cuda_graphs()
+
+        return loss_list
+
+    @pytest.mark.flaky
+    @pytest.mark.flaky_in_dev
+    @pytest.mark.skipif(
+        not (HAVE_TE and is_te_min_version("2.10.0")),
+        reason="Partial CUDA graph support requires TransformerEngine version >= 2.10.0",
+    )
+    @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep"])
+    def test_moe_partial_cudagraph_with_ep_overlap(self, moe_dispatcher_type):
+        extra_kwargs = {"moe_layer_freq": 1}
+        if moe_dispatcher_type == "deepep":
+            if not is_deep_ep_available():
+                pytest.skip("Deep EP is not available")
+            extra_kwargs["moe_token_dispatcher_type"] = "flex"
+            extra_kwargs["moe_flex_dispatcher_backend"] = "deepep"
+            extra_kwargs["moe_router_dtype"] = "fp32"
+        elif moe_dispatcher_type == "hybridep":
+            if not is_hybrid_ep_available():
+                pytest.skip("Hybrid EP is not available")
+            extra_kwargs["moe_token_dispatcher_type"] = "flex"
+            extra_kwargs["moe_flex_dispatcher_backend"] = "hybridep"
+        else:
+            extra_kwargs["moe_token_dispatcher_type"] = moe_dispatcher_type
+
+        loss_list_ref = self._run_test_helper(4, "none", None, 3, **extra_kwargs)
+        for cuda_graph_scope in [
+            [CudaGraphScope.attn],
+            [CudaGraphScope.attn, CudaGraphScope.moe_router],
+            [CudaGraphScope.attn, CudaGraphScope.moe_router, CudaGraphScope.moe_preprocess],
+        ]:
+            cuda_graph_warmup_steps = 3
+            loss_list = self._run_test_helper(
+                4,
+                "transformer_engine",
+                cuda_graph_scope,
+                cuda_graph_warmup_steps,
+                ep_overlap=True,
+                **extra_kwargs,
+            )
+            assert len(loss_list) == len(loss_list_ref)
+            for i in range(len(loss_list)):
+                assert torch.equal(
+                    loss_list[i].mean(), loss_list_ref[i].mean()
+                ), f"scope={cuda_graph_scope}, i={i},loss_list={loss_list[i]}, loss_list_ref={loss_list_ref[i]}"
+            print(f"[DEBUG] Pass {cuda_graph_scope}")
diff --git a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py
index 81e61a3404a..6c59dd3f9e3 100644
--- a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py
+++ b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py
@@ -23,7 +23,7 @@
 from tests.unit_tests.test_utilities import Utils
 
 
-def build_model(config):
+def build_model(config, use_padding_mask=False):
     seq_len = 32
     max_seq_len = 300
     # ids = random.sample([i for i in range(max_seq_len)], seq_len)
@@ -39,6 +39,12 @@ def build_model(config):
         "attention_mask": torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda(),
     }
 
+    # Optionally add padding_mask with same shape as input_ids
+    if use_padding_mask:
+        padding_mask = torch.zeros((1, seq_len), dtype=torch.bool).cuda()
+        padding_mask[0, -8:] = True
+        data["padding_mask"] = padding_mask
+
     # build layer spec
     transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True)
     mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True)
@@ -48,7 +54,7 @@ def build_model(config):
         config=config,
         transformer_layer_spec=transformer_layer_spec,
         mtp_block_spec=mtp_block_spec,
-        vocab_size=100,
+        vocab_size=128,
         pre_process=True,
         post_process=True,
         max_sequence_length=max_seq_len,
@@ -174,3 +180,109 @@ def test_1f1b_schedule_model_chunk(self, mtp_layers, dispatcher_type, fp8_flag,
                 gpt_models[i] = None
             gc.collect()
             torch.cuda.empty_cache()
+
+    @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0")
+    @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types())
+    @pytest.mark.parametrize("layers", [[2, 1], [1, 1]])
+    @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
+    def test_1f1b_schedule_model_chunk_with_padding_mask(self, dispatcher_type, layers, tp_size):
+        """
+        Verifies all-to-all overlap optimization with padding_mask produces
+        the same results as the reference implementation with various TP/EP/CP combinations.
+        """
+        # Re-initialize model parallel with the specified configuration
+        Utils.destroy_model_parallel()
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size,
+            pipeline_model_parallel_size=1,
+            expert_model_parallel_size=4,
+            expert_tensor_parallel_size=1,
+        )
+        set_streams()
+
+        microbatches = 1
+
+        gpt_models = []
+        schedule_plans = []
+        ref_captures = []
+        datas = []
+
+        # create TransformerConfig
+        extra_kwargs = {
+            "moe_token_dispatcher_type": dispatcher_type,
+            "tensor_model_parallel_size": tp_size,
+            "sequence_parallel": tp_size > 1,
+        }
+        if dispatcher_type == "flex":
+            extra_kwargs["moe_flex_dispatcher_backend"] = "deepep"
+            extra_kwargs["moe_router_dtype"] = "fp32"
+        with deterministic_mode():
+            for layer_num in layers:
+                output_tensors = []
+                # build config
+                config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs)
+                # build model with padding_mask
+                gpt_model, schedule_plan, data = build_model(config, use_padding_mask=True)
+                gpt_model.cuda()
+                gpt_models.append(gpt_model)
+                datas.append(data)
+                schedule_plans.append(schedule_plan)
+
+                # run reference
+                for _ in range(microbatches):
+                    loss = gpt_model.forward(**data)
+                    loss = float16_to_fp32(loss)
+                    loss.backward(torch.ones_like(loss))
+                    output_tensors.append(loss)
+
+                capture = {"outputs": output_tensors}
+                for name, param in gpt_model.named_parameters():
+                    capture[name] = param.grad
+                ref_captures.append(capture)
+                gpt_model.zero_grad()
+            assert gpt_models[0].embedding is not None
+            assert gpt_models[1].embedding is not None
+            # run a2a overlap
+            capture_0 = {"outputs": []}
+            capture_1 = {"outputs": []}
+            a2a_captures = [capture_0, capture_1]
+            for i in range(microbatches):
+                # 1st forward
+                if i > 0:
+                    assert (
+                        schedule_plans[0].pre_process is None
+                    ), "pre_process should be released after backward"
+                    schedule_plans[0] = gpt_models[0].build_schedule_plan(**datas[0])
+                    schedule_plans[1] = gpt_models[1].build_schedule_plan(**datas[1])
+                f_input_0 = TransformerModelChunkSchedulePlan.run(schedule_plans[0], None)
+                capture_0["outputs"].append(f_input_0)
+                # overlap
+                f_input_1 = TransformerModelChunkSchedulePlan.run(
+                    schedule_plans[1], schedule_plans[0], b_grad=torch.ones_like(f_input_0)
+                )
+                capture_1["outputs"].append(f_input_1)
+                # last backward
+                TransformerModelChunkSchedulePlan.run(
+                    None, schedule_plans[1], b_grad=torch.ones_like(f_input_1)
+                )
+            for i in range(len(gpt_models)):
+                for name, param in gpt_models[i].named_parameters():
+                    a2a_captures[i][name] = param.grad
+
+            # compare results
+            for i in range(len(ref_captures)):
+                comp_res = compare_captures(ref_captures[i], a2a_captures[i], True, True)
+                assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}"
+
+            # release resources is necessary, otherwise later testcases will oom
+            for i in range(len(schedule_plans)):
+                schedule_plans[i] = None
+                ref_captures[i] = None
+                a2a_captures[i] = None
+                for k in datas[i]:
+                    datas[i][k] = None
+                datas[i] = None
+                gpt_models[i].zero_grad()
+                gpt_models[i] = None
+            gc.collect()
+            torch.cuda.empty_cache()
diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py
index 3ebffb810e5..c6c4a75af99 100644
--- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py
+++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py
@@ -306,7 +306,59 @@ def test_transformer_layer_overlap_shared_expert(self):
             "moe_shared_expert_intermediate_size": 512,
         }
         overlap_config = get_test_config(extra_kwargs=extra_kwargs)
-        extra_kwargs["moe_shared_expert_overlap"] = True
+        extra_kwargs["moe_shared_expert_overlap"] = False
+        ref_config = get_test_config(extra_kwargs=extra_kwargs)
+        microbatches = 4
+        with deterministic_mode():
+            transformer_layer_spec = get_gpt_decoder_block_spec(
+                config=ref_config, use_transformer_engine=True
+            )
+            gpt_model = GPTModel(
+                config=ref_config,
+                transformer_layer_spec=transformer_layer_spec,
+                vocab_size=100,
+                pre_process=True,
+                post_process=True,
+                max_sequence_length=300,
+            )
+
+            params = reset_model(gpt_model)
+            input_tensors = [build_data() for _ in range(microbatches)]
+
+            fp8_context = get_fp8_context(ref_config, 0) if ref_config.fp8 else nullcontext()
+            with fp8_context:
+                capture_ref = run_transformer_layer_ref_with_capture(
+                    gpt_model, input_tensors, microbatches
+                )
+            del gpt_model
+
+            gpt_model = GPTModel(
+                config=overlap_config,
+                transformer_layer_spec=transformer_layer_spec,
+                vocab_size=100,
+                pre_process=True,
+                post_process=True,
+                max_sequence_length=300,
+            )
+            reset_model(gpt_model, params)
+            capture_a2a_overlap = run_transformer_layer_a2a_overlap_with_capture(
+                gpt_model, input_tensors, microbatches
+            )
+            comp_res = compare_captures(capture_ref, capture_a2a_overlap, True)
+            assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}"
+
+    @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0")
+    def test_transformer_layer_overlap_early_attn_memory_release(self):
+        """
+        Verifies all-to-all overlap optimization in transformer layer with early attn memory release
+        produces the same results as the reference implementation.
+        """
+        extra_kwargs = {
+            "moe_token_dispatcher_type": "alltoall",
+            "ep_overlap_early_attn_memory_release": True,
+            "overlap_moe_expert_parallel_comm": True,
+        }
+        overlap_config = get_test_config(extra_kwargs=extra_kwargs)
         ref_config = get_test_config(extra_kwargs=extra_kwargs)
         microbatches = 4
         with deterministic_mode():
@@ -450,8 +502,8 @@ def test_mtp_layer_overlap(self, dispatcher_type, fp8_flag):
             position_ids = torch.tensor(data, dtype=torch.int64).repeat((1, 1)).cuda()
             attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda()
             # get rotary pos emb
-            _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _ = gpt_model._preprocess(
-                input_ids, position_ids
+            _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _, _padding_mask = (
+                gpt_model._preprocess(input_ids, position_ids)
             )
             # reset model
             params = reset_model(gpt_model)
diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py
index 7db4256a849..a52843956df 100644
--- a/tests/unit_tests/a2a_overlap/utils.py
+++ b/tests/unit_tests/a2a_overlap/utils.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
diff --git a/tests/unit_tests/data/test_builder.py b/tests/unit_tests/data/test_builder.py
index 939677268bb..d0e86c87fb8 100644
--- a/tests/unit_tests/data/test_builder.py
+++ b/tests/unit_tests/data/test_builder.py
@@ -5,7 +5,9 @@
 ##
 
 import os
+import random
 import tempfile
+from argparse import Namespace
 from collections import defaultdict
 from typing import Dict, Optional
 
@@ -13,11 +15,18 @@
 import pytest
 import torch
 
+from megatron.core.datasets.blended_dataset import BlendedDataset
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
+from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig
+from megatron.core.datasets.indexed_dataset import DType, IndexedDatasetBuilder
 from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
 from megatron.core.datasets.utils import Split, compile_helpers, get_blend_from_list
+from megatron.training.tokenizer import build_tokenizer
+from megatron.training.utils import get_blend_and_blend_per_split
+from tests.unit_tests.dist_checkpointing import TempNamedDir
 from tests.unit_tests.test_utilities import Utils
+from tools.build_sequences_per_dataset import build_sequences_per_dataset
 
 _NUM_DATASETS = 10
 
@@ -32,6 +41,30 @@
 _MARGIN = 0.005
 
 
+def create_file_prefixes(tokenizer, number_of_files, maximum_number_of_documents, dataset_dir):
+    # Create dataset directory
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    # Create file prefixes
+    file_prefixes = []
+    for i in range(number_of_files):
+        file_prefix_path = os.path.join(dataset_dir, f"file_{i}")
+        builder = IndexedDatasetBuilder(
+            file_prefix_path + ".bin", dtype=DType.optimal_dtype(tokenizer.vocab_size)
+        )
+        number_of_documents = random.randint(10, maximum_number_of_documents)
+        for j in range(number_of_documents):
+            number_of_tokens = random.randint(50, 100)
+            tokenized_doc = [
+                str(random.randint(0, tokenizer.vocab_size - 1)) for _ in range(number_of_tokens)
+            ]
+            builder.add_document(tokenized_doc, [len(tokenized_doc)])
+        builder.finalize(file_prefix_path + ".idx")
+        file_prefixes.append(file_prefix_path)
+
+    return file_prefixes
+
+
 def do_setup(odir):
     paths = defaultdict(list)
 
@@ -297,5 +330,206 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
         ).build()
 
 
+@pytest.mark.parametrize("use_split", [True, False])
+@pytest.mark.parametrize("add_weights", [True, False])
+@pytest.mark.parametrize("fast_cache_load", [True, False])
+@pytest.mark.parametrize("sequences_per_dataset", [True, False])
+@pytest.mark.parametrize("defer_npy_index_mmap", [True, False])
+@pytest.mark.parametrize("vocab_size", [131072, 20000])
+@pytest.mark.parametrize("mid_level_dataset_surplus", [0.005, 0.01, 0])
+def test_fast_builder(
+    use_split,
+    add_weights,
+    fast_cache_load,
+    sequences_per_dataset,
+    defer_npy_index_mmap,
+    vocab_size,
+    mid_level_dataset_surplus,
+    tmp_path_dist_ckpt,
+    sequence_length: int = 5,
+    number_of_files: int = 10,
+    number_of_documents: int = 10,
+):
+    if use_split and fast_cache_load:
+        pytest.skip("Skipping test case when both use_split and fast_cache_load are True")
+
+    if torch.distributed.is_available():
+        Utils.initialize_distributed()
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
+
+    tokenizer = build_tokenizer(
+        Namespace(
+            vocab_size=vocab_size,
+            tokenizer_type="NullTokenizer",
+            rank=0,
+            make_vocab_size_divisible_by=128,
+            tensor_model_parallel_size=1,
+        )
+    )
+
+    with TempNamedDir(tmp_path_dist_ckpt / "test_fast_builder", sync=True) as temp_dir:
+        # Created file_prefixes (tokenizer, Number of files, number of documents, path) --> returns file prefixes (list of strings)
+        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+            file_prefixes = create_file_prefixes(
+                tokenizer, number_of_files, number_of_documents, os.path.join(temp_dir, "dataset")
+            )
+        else:
+            file_prefixes = []
+            for i in range(number_of_files):
+                file_prefix_path = os.path.join(temp_dir, "dataset", f"file_{i}")
+                file_prefixes.append(file_prefix_path)
+
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        random.seed(1234)  # NOTE(asolergi-nv): re-sync random state across all ranks
+
+        data_cache_path = os.path.join(temp_dir, "cache")
+
+        args = Namespace(
+            seed=1234,
+            seq_length=sequence_length,
+            data_cache_path=data_cache_path,
+            split=None,
+            data_path=None,
+            train_data_path=None,
+            valid_data_path=None,
+            test_data_path=None,
+            per_split_data_args_path=None,
+            data_args_path=None,
+        )
+
+        # set up data mixture
+        if use_split:
+            args.data_path = file_prefixes
+            args.split = "70,20,10"
+        else:
+            train_file_prefixes = file_prefixes[0:6]
+            valid_file_prefixes = file_prefixes[6:9]
+            test_file_prefixes = file_prefixes[9:10]
+
+            if add_weights:
+                # Save original lists before modifying
+                train_file_prefixes_original = train_file_prefixes[:]
+                valid_file_prefixes_original = valid_file_prefixes[:]
+                test_file_prefixes_original = test_file_prefixes[:]
+
+                # For train_file_prefixes, alternately append a random int (10-100) and the file prefix.
+                train_file_prefixes = []
+                for fp in train_file_prefixes_original:
+                    train_file_prefixes.extend([random.randint(10, 100), fp])
+                # For valid/test, also add random weights (10-100).
+                valid_file_prefixes = []
+                for fp in valid_file_prefixes_original:
+                    valid_file_prefixes.extend([random.randint(10, 100), fp])
+                test_file_prefixes = []
+                for fp in test_file_prefixes_original:
+                    test_file_prefixes.extend([random.randint(10, 100), fp])
+
+            args.train_data_path = train_file_prefixes
+            args.valid_data_path = valid_file_prefixes
+            args.test_data_path = test_file_prefixes
+
+        if sequences_per_dataset:
+            args.path_to_sequences_per_dataset_json = os.path.join(
+                temp_dir, "sequences_per_dataset.json"
+            )
+            sequences_per_dataset = build_sequences_per_dataset(args)
+
+        blend, blend_per_split = get_blend_and_blend_per_split(args)
+
+        data_args = {
+            "random_seed": args.seed,
+            "sequence_length": args.seq_length,
+            "blend": blend,
+            "blend_per_split": blend_per_split,
+            "split": args.split,
+            "path_to_cache": args.data_cache_path,
+            "tokenizer": tokenizer,
+            "reset_position_ids": False,
+            "reset_attention_mask": False,
+            "eod_mask_loss": False,
+            "create_attention_mask": False,
+            "mid_level_dataset_surplus": mid_level_dataset_surplus,
+        }
+        config = GPTDatasetConfig(**data_args)
+
+        train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
+            GPTDataset, [100, 10, 10], lambda: True, config
+        ).build()
+
+        fast_config = GPTDatasetConfig(
+            **data_args,
+            fast_cache_load=fast_cache_load,
+            defer_npy_index_mmap=defer_npy_index_mmap,
+            sequences_per_dataset=sequences_per_dataset,
+        )
+
+        train_ds_fast, valid_ds_fast, test_ds_fast = BlendedMegatronDatasetBuilder(
+            GPTDataset, [100, 10, 10], lambda: True, fast_config
+        ).build()
+
+        for ds_slow, ds_fast, split_name in zip(
+            [train_ds, valid_ds, test_ds],
+            [train_ds_fast, valid_ds_fast, test_ds_fast],
+            ["train", "valid", "test"],
+        ):
+            if not ds_slow:
+                continue
+            assert len(ds_slow) == len(
+                ds_fast
+            ), f"ds_slow: {len(ds_slow)}, ds_fast: {len(ds_fast)}, split_name: {split_name}"
+            if isinstance(ds_slow, GPTDataset):
+                assert torch.all(ds_slow[0]["tokens"] == ds_fast[0]["tokens"])
+                assert torch.all(ds_slow[-1]["tokens"] == ds_fast[-1]["tokens"])
+                numpy.testing.assert_array_equal(ds_slow.document_index, ds_fast.document_index)
+                numpy.testing.assert_array_equal(ds_slow.sample_index, ds_fast.sample_index)
+                numpy.testing.assert_array_equal(ds_slow.shuffle_index, ds_fast.shuffle_index)
+                numpy.testing.assert_array_equal(
+                    ds_slow.dataset.index.sequence_lengths, ds_fast.dataset.index.sequence_lengths
+                )
+                numpy.testing.assert_array_equal(
+                    ds_slow.dataset.index.document_indices, ds_fast.dataset.index.document_indices
+                )
+                numpy.testing.assert_array_equal(
+                    ds_slow.dataset.index.sequence_pointers, ds_fast.dataset.index.sequence_pointers
+                )
+            elif isinstance(ds_slow, BlendedDataset):
+                assert torch.all(ds_slow[0]["tokens"] == ds_fast[0]["tokens"])
+                assert torch.all(ds_slow[-1]["tokens"] == ds_fast[-1]["tokens"])
+                numpy.testing.assert_array_equal(ds_slow.dataset_index, ds_fast.dataset_index)
+                numpy.testing.assert_array_equal(
+                    ds_slow.dataset_sample_index, ds_fast.dataset_sample_index
+                )
+                for ds_slow_i, ds_fast_i in zip(ds_slow.datasets, ds_fast.datasets):
+                    assert torch.all(ds_slow_i[0]["tokens"] == ds_fast_i[0]["tokens"])
+                    assert torch.all(ds_slow_i[-1]["tokens"] == ds_fast_i[-1]["tokens"])
+                    numpy.testing.assert_array_equal(
+                        ds_slow_i.document_index, ds_fast_i.document_index
+                    )
+                    numpy.testing.assert_array_equal(ds_slow_i.sample_index, ds_fast_i.sample_index)
+                    numpy.testing.assert_array_equal(
+                        ds_slow_i.shuffle_index, ds_fast_i.shuffle_index
+                    )
+                    numpy.testing.assert_array_equal(
+                        ds_slow_i.dataset.index.sequence_lengths,
+                        ds_fast_i.dataset.index.sequence_lengths,
+                    )
+                    numpy.testing.assert_array_equal(
+                        ds_slow_i.dataset.index.document_indices,
+                        ds_fast_i.dataset.index.document_indices,
+                    )
+                    numpy.testing.assert_array_equal(
+                        ds_slow_i.dataset.index.sequence_pointers,
+                        ds_fast_i.dataset.index.sequence_pointers,
+                    )
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+
 if __name__ == "__main__":
     test_builder()
diff --git a/tests/unit_tests/dist_checkpointing/models/common.py b/tests/unit_tests/dist_checkpointing/models/common.py
index 31b5d9db3c9..8cb1dc4df65 100644
--- a/tests/unit_tests/dist_checkpointing/models/common.py
+++ b/tests/unit_tests/dist_checkpointing/models/common.py
@@ -91,7 +91,8 @@ def common_test_parallel_reconfiguration_e2e(
         save(gpt_model_A.sharded_state_dict(metadata=metadata), ckpt_dir_A, save_strategy)
         regular_state_dict_A = gpt_model_A.state_dict()
         Utils.destroy_model_parallel()
-
+        if metadata is not None:
+            metadata.pop("dp_cp_group")
         # Load checkpoint A with different TP/PP and save as checkpoint B
         # No FPS this time, only FPL
         Utils.initialize_model_parallel(*dest_tp_pp, **(dst_tp_pp_kwargs or {}), order=store_order)
diff --git a/tests/unit_tests/dist_checkpointing/models/test_bert_model.py b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py
index 27f01447851..81b01c8f886 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_bert_model.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py
@@ -24,13 +24,11 @@
 
 
 def initialize_bert_model(
-    seed, layer_spec_fn=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs
+    seed, layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs
 ):
     torch.manual_seed(seed)
     model_parallel_cuda_manual_seed(seed)
 
-    layer_spec = layer_spec_fn() if callable(layer_spec_fn) else layer_spec_fn
-
     default_config_kwargs = dict(
         num_layers=8,
         hidden_size=16,
diff --git a/tests/unit_tests/dist_checkpointing/models/test_mamba.py b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
index ff2c6309977..85fbe5dd045 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_mamba.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
@@ -130,6 +130,8 @@ def test_parallel_reconfiguration_e2e(
                 )
             save(sharded_state_dict, ckpt_dir_A, save_strategy)
             Utils.destroy_model_parallel()
+            if metadata is not None:
+                metadata.pop("dp_cp_group")
 
             # Load checkpoint A with different TP/PP/expert/CP and save as checkpoint B
             # No FPS this time, only FPL
diff --git a/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py
index 18cfbf67cee..0970e2adc8a 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py
@@ -71,6 +71,9 @@ def test_parallel_reconfiguration_e2e(
             save(mlp_A.sharded_state_dict(prefix=layer_prefix, metadata=metadata), ckpt_dir_A)
             Utils.destroy_model_parallel()
 
+            if "dp_cp_group" in metadata.keys():
+                del metadata["dp_cp_group"]
+
             # Load checkpoint A with different TP/PP and save as checkpoint B
             Utils.initialize_model_parallel(*dest_tp_pp)
             mlp_B = initialize_mlp()
diff --git a/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py
index b116d2cb603..ca546d746af 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py
@@ -190,6 +190,9 @@ def test_parallel_reconfiguration_e2e(
             save(sharded_state_dict, ckpt_dir_A, save_strategy)
             Utils.destroy_model_parallel()
 
+            if "dp_cp_group" in metadata.keys():
+                del metadata["dp_cp_group"]
+
             # Load checkpoint A with different TP/PP/EP and save as checkpoint B
             # No FPS this time, only FPL
             Utils.initialize_model_parallel(
@@ -276,6 +279,9 @@ def test_sequential_grouped_mlp_interchangeable(
             save(sharded_state_dict, ckpt_dir_A, save_strategy)
             Utils.destroy_model_parallel()
 
+            if "dp_cp_group" in metadata.keys():
+                del metadata["dp_cp_group"]
+
             Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp)
             model_B = initialize_expert_layer(1, use_glu, expert_type=dest_module)
             load_strategy = None
@@ -351,6 +357,9 @@ def test_sequential_grouped_mlp_extra_state(
             save(sharded_state_dict, ckpt_dir_A, save_strategy)
             Utils.destroy_model_parallel()
 
+            if "dp_cp_group" in metadata.keys():
+                del metadata["dp_cp_group"]
+
             Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp)
             load_strategy = None
 
diff --git a/tests/unit_tests/dist_checkpointing/models/test_t5_model.py b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py
index 1e44ee527ae..e393c806a94 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_t5_model.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py
@@ -6,14 +6,6 @@
 from megatron.core import parallel_state as ps
 from megatron.core.dist_checkpointing import load, save
 from megatron.core.dist_checkpointing.validation import StrictHandling
-from megatron.core.models.retro.decoder_spec import (
-    get_retro_decoder_layer_local_spec,
-    get_retro_decoder_layer_te_spec,
-)
-from megatron.core.models.retro.encoder_spec import (
-    get_retro_encoder_layer_local_spec,
-    get_retro_encoder_layer_te_spec,
-)
 from megatron.core.models.T5 import T5Model
 from megatron.core.models.T5.t5_spec import decoder_model_with_local_spec as t5_decoder_local_spec
 from megatron.core.models.T5.t5_spec import (
@@ -94,14 +86,8 @@ def test_sharded_state_dict_save_load(
         self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type
     ):
         enc_dec_spec_fn = {
-            'te': {
-                't5': (t5_encoder_te_spec, t5_decoder_te_spec),
-                'retro': (get_retro_encoder_layer_te_spec, get_retro_decoder_layer_te_spec),
-            },
-            'local': {
-                't5': (t5_encoder_local_spec, t5_decoder_local_spec),
-                'retro': (get_retro_encoder_layer_local_spec, get_retro_decoder_layer_local_spec),
-            },
+            'te': {'t5': (t5_encoder_te_spec, t5_decoder_te_spec)},
+            'local': {'t5': (t5_encoder_local_spec, t5_decoder_local_spec)},
         }
         src_encoder_decoder_spec_fn = enc_dec_spec_fn[src_spec_type][model_type]
         dst_encoder_decoder_spec_fn = enc_dec_spec_fn[dst_spec_type][model_type]
@@ -155,14 +141,8 @@ def test_parallel_reconfiguration_e2e(
         *dest_tp_pp, dst_encpp = dest_tp_pp_encpp
 
         enc_dec_spec_fn = {
-            'te': {
-                't5': (t5_encoder_te_spec, t5_decoder_te_spec),
-                'retro': (get_retro_encoder_layer_te_spec, get_retro_decoder_layer_te_spec),
-            },
-            'local': {
-                't5': (t5_encoder_local_spec, t5_decoder_local_spec),
-                'retro': (get_retro_encoder_layer_local_spec, get_retro_decoder_layer_local_spec),
-            },
+            'te': {'t5': (t5_encoder_te_spec, t5_decoder_te_spec)},
+            'local': {'t5': (t5_encoder_local_spec, t5_decoder_local_spec)},
         }
 
         common_test_parallel_reconfiguration_e2e(
diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py
deleted file mode 100644
index 3ca4613f59e..00000000000
--- a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import io
-from contextlib import nullcontext
-
-import numpy as np
-import pytest
-import torch
-from torch.distributed.checkpoint import CheckpointException
-
-from megatron.core import parallel_state
-from megatron.core.dist_checkpointing import ShardedTensor, load, save
-from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config
-from megatron.core.dist_checkpointing.dict_utils import diff
-from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensorFactory
-from megatron.core.dist_checkpointing.serialization import load_tensors_metadata
-from megatron.core.dist_checkpointing.strategies.resharding import (
-    apply_nd_flattened_tensors_reformulation,
-    restore_nd_flattened_tensors_formulation,
-)
-from megatron.core.dist_checkpointing.strategies.torch import get_reformulation_metadata
-from megatron.core.dist_checkpointing.validation import (
-    determine_global_metadata,
-    validate_sharding_integrity,
-)
-from tests.unit_tests.dist_checkpointing import TempNamedDir
-from tests.unit_tests.test_utilities import Utils
-
-
-class TestFlattenedResharding:
-    def setup_method(self, method):
-        pass
-
-    def teardown_method(self, method):
-        Utils.destroy_model_parallel()
-
-    @pytest.mark.flaky
-    @pytest.mark.flaky_in_dev  # Issue #2854
-    @pytest.mark.parametrize(
-        ('src_tp_pp', 'dest_tp_pp'),
-        [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
-    )
-    def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
-        Utils.initialize_model_parallel(*src_tp_pp)
-        with TempNamedDir(
-            tmp_path_dist_ckpt / 'test_flattened_partition_change_save_load'
-        ) as ckpt_dir:
-
-            state_dict = self._build_state_dict()
-
-            save(state_dict, ckpt_dir)
-
-            # change TPxPP
-            Utils.destroy_model_parallel()
-            Utils.initialize_model_parallel(*dest_tp_pp)
-            loaded_state_dict = load(self._build_state_dict(random=True), ckpt_dir)
-            expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()}
-
-            diffs = diff(expected_state_dict, loaded_state_dict)
-            assert not any(diffs), diffs
-
-        Utils.destroy_model_parallel()
-
-    @pytest.mark.flaky
-    @pytest.mark.flaky_in_dev  # Issue #2854
-    @pytest.mark.parametrize(
-        ('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'),
-        [
-            (
-                (2, 4),
-                (2, 2),
-                {
-                    0: [(0, 0, 0), (0, 0, 10)],  # TP 0, DP 0, PP 0
-                    1: [(4, 0, 0), (4, 0, 10)],  # TP 1, DP 0, PP 0
-                    2: [(0, 0, 0), (0, 0, 10)],  # TP 0, DP 1, PP 0
-                    3: [(4, 0, 0), (4, 0, 10)],  # TP 1, DP 1, PP 0
-                    4: [(0, 0, 20), (0, 0, 30)],  # TP 0, DP 0, PP 1
-                    5: [(4, 0, 20), (4, 0, 30)],  # TP 1, DP 0, PP 1
-                    6: [(0, 0, 20), (0, 0, 30)],  # TP 0, DP 1, PP 1
-                    7: [(4, 0, 20), (4, 0, 30)],  # TP 1, DP 1, PP 1
-                },
-            ),
-            ((8, 1), (1, 2), {rank: [(tp, 0, 0) for tp in range(8)] for rank in range(8)}),
-        ],
-    )
-    def test_reformulate_nd_flattened_tensors(
-        self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, expected_ckpt_offsets_by_rank
-    ):
-        Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp')
-        with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir:
-
-            state_dict = self._build_state_dict()
-
-            ckpt_local_shape = state_dict['sd_key_flat'].local_shape
-
-            save(state_dict, ckpt_dir)
-
-            # change TPxPP
-            Utils.destroy_model_parallel()
-            Utils.initialize_model_parallel(*dest_tp_pp, order='tp-dp-pp')
-            load_state_dict = self._build_state_dict(random=True)
-
-            reformulation_metadata = get_reformulation_metadata(load_state_dict, ckpt_dir)
-            reformulated_state_dict, formulation_restore_data = (
-                apply_nd_flattened_tensors_reformulation(load_state_dict, reformulation_metadata)
-            )
-            assert isinstance(reformulated_state_dict['sd_key_unflat'], ShardedTensor)
-            assert isinstance(reformulated_state_dict['sd_key_flat'], dict)
-
-            assert reformulated_state_dict['sd_key_flat'].keys() == set(
-                (offset, ckpt_local_shape) for offset in expected_ckpt_offsets_by_rank[Utils.rank]
-            ), (
-                reformulated_state_dict['sd_key_flat'].keys(),
-                ckpt_local_shape,
-                expected_ckpt_offsets_by_rank[Utils.rank],
-            )
-
-            # We can even load the reformulated state dict with a high-level API
-            loaded_state_dict = load(
-                reformulated_state_dict, ckpt_dir, validate_access_integrity=False
-            )
-            loaded_state_dict = restore_nd_flattened_tensors_formulation(
-                loaded_state_dict, formulation_restore_data
-            )
-            expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()}
-            diffs = diff(expected_state_dict, loaded_state_dict)
-            assert not any(diffs), diffs
-
-        Utils.destroy_model_parallel()
-
-    @pytest.mark.flaky
-    @pytest.mark.flaky_in_dev  # Issue #2854
-    @pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)])
-    def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp):
-        Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp')
-        with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir:
-
-            state_dict = self._build_state_dict()
-
-            save(state_dict, ckpt_dir)
-
-            # change TPxPP
-            Utils.destroy_model_parallel()
-            Utils.initialize_model_parallel(1, 1)
-
-            sharded_metadata = load_tensors_metadata(ckpt_dir)
-
-            for attr_name in ('local_shape', 'global_shape'):
-                flat_val = getattr(sharded_metadata['flat'], attr_name)
-                unflat_val = getattr(sharded_metadata['unflat'], attr_name)
-                assert flat_val == unflat_val, (attr_name, flat_val, unflat_val)
-
-            for sh_ten in sharded_metadata.values():
-                sh_ten.replica_id = Utils.rank
-            loaded_state_dict = load(sharded_metadata, ckpt_dir)
-            assert torch.all(
-                loaded_state_dict['unflat'] == torch.arange(8 * 5 * 40).reshape(8, 5, 40)
-            )
-            assert torch.all(loaded_state_dict['flat'] == torch.arange(8 * 5 * 40))
-
-        Utils.destroy_model_parallel()
-
-    def _build_state_dict(self, random=False):
-        tp_rank = parallel_state.get_tensor_model_parallel_rank()
-        tp_size = parallel_state.get_tensor_model_parallel_world_size()
-        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
-        pp_size = parallel_state.get_pipeline_model_parallel_world_size()
-        dp_rank = parallel_state.get_data_parallel_rank()
-        dp_size = parallel_state.get_data_parallel_world_size()
-
-        init_fn = torch.rand if random else torch.arange
-        global_ten = init_fn(8 * 5 * 40).reshape(8, 5, 40)
-        local_ten = global_ten
-        local_ten = local_ten.chunk(tp_size, dim=0)[tp_rank]
-        local_ten = local_ten.chunk(pp_size, dim=2)[pp_rank]
-        assert local_ten.shape == (8 // tp_size, 5, 40 // pp_size)
-
-        local_ten_size_by_dp = local_ten.numel()
-        assert local_ten_size_by_dp % dp_size == 0, (local_ten_size_by_dp, dp_size)
-        local_ten_size_by_dp = local_ten_size_by_dp // dp_size
-        # make a bit shifted DP slices so that they are not equal
-        start_jitter = dp_rank
-        end_jitter = dp_rank + 1 if dp_rank + 1 < dp_size else 0
-        local_dp_slice = slice(
-            local_ten_size_by_dp * dp_rank + start_jitter,
-            local_ten_size_by_dp * (dp_rank + 1) + end_jitter,
-        )
-        local_flat_ten = local_ten.flatten()[local_dp_slice]
-        if dp_rank == dp_size - 1:
-            assert local_flat_ten.numel() == local_ten_size_by_dp - dp_rank
-        else:
-            assert local_flat_ten.numel() == local_ten_size_by_dp + 1
-
-        state_dict = {
-            'sd_key_unflat': ShardedTensor.from_rank_offsets(
-                'unflat',
-                local_ten,
-                (0, tp_rank, tp_size),
-                (2, pp_rank, pp_size),
-                replica_id=dp_rank,
-            ),
-            'sd_key_flat': ShardedTensor.from_rank_offsets_flat(
-                'flat',
-                local_flat_ten,
-                local_ten.shape,
-                (0, tp_rank, tp_size),
-                (2, pp_rank, pp_size),
-                flattened_range=local_dp_slice,
-            ),
-        }
-        return state_dict
-
-    def test_flattened_tensors_are_properly_validated(self, tmp_path_dist_ckpt):
-        Utils.initialize_model_parallel()
-        # Global tensor of shape (6, 6) is built from:
-        # ranks 0, 1, 2 tensors of length 1, 2, 3
-        # and then ranks 3, ..., 7 tensors of length 6
-        local_flat_ten = torch.ones(Utils.rank + 1 if Utils.rank <= 2 else 6) * Utils.rank
-
-        global_flattened_len = 6 + (Utils.world_size - 3) * 6
-        if Utils.world_size == 8:
-            assert global_flattened_len == 1 + 2 + 3 + 5 * 6
-            local_ten_shape = (1, 6)
-        else:
-            local_ten_shape = (global_flattened_len,)
-
-        if Utils.rank == 0:
-            local_dp_slice_start = 0
-        elif Utils.rank == 1:
-            local_dp_slice_start = 1
-        elif Utils.rank == 2:
-            local_dp_slice_start = 3
-        else:
-            local_dp_slice_start = 0
-        local_dp_slice = slice(local_dp_slice_start, local_dp_slice_start + len(local_flat_ten))
-
-        state_dict = {
-            'sd_key_flat': ShardedTensor.from_rank_offsets_flat(
-                'flat',
-                local_flat_ten,
-                local_ten_shape,
-                *((0, max(0, Utils.rank - 2), 6),) if Utils.world_size == 8 else (),
-                flattened_range=local_dp_slice,
-                replica_id=0,
-            )
-        }
-        validate_sharding_integrity(determine_global_metadata(state_dict)[1])
-        if Utils.rank == 1:
-            old_state_dict = state_dict
-            state_dict = {}
-
-        with (
-            pytest.raises(CheckpointingException) if Utils.rank == 0 else nullcontext()
-        ) as exc_info:
-            validate_sharding_integrity(determine_global_metadata(state_dict)[1])
-        if Utils.rank == 0:
-            assert 'Flattened ranges dont cover the whole shard ShardedTensor' in str(
-                exc_info.value
-            )
-
-        if Utils.rank == 1:
-            state_dict = old_state_dict
-
-        if Utils.rank == 4:
-            state_dict = {}
-
-        with (
-            pytest.raises(CheckpointingException) if Utils.rank == 0 else nullcontext()
-        ) as exc_info:
-            validate_sharding_integrity(determine_global_metadata(state_dict)[1])
-        if Utils.rank == 0:
-            assert 'Invalid access pattern' in str(exc_info.value)
-
-        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py b/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py
new file mode 100644
index 00000000000..3f60658a005
--- /dev/null
+++ b/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py
@@ -0,0 +1,606 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from copy import deepcopy
+from functools import partial
+from unittest import mock
+
+import pytest
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.dist_checkpointing import load, save
+from megatron.core.dist_checkpointing.dict_utils import nested_values
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_decoder_block_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.optimizer import ChainedOptimizer
+from megatron.core.optimizer.layer_wise_optimizer import LayerWiseDistributedOptimizer
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed
+from megatron.core.transformer import MLATransformerConfig, TransformerConfig
+from megatron.core.utils import get_pg_size
+from megatron.training.arguments import parse_args
+from megatron.training.checkpointing import load_checkpoint, save_checkpoint
+from tests.unit_tests.dist_checkpointing import (
+    TempNamedDir,
+    init_basic_mock_args,
+    init_checkpointing_mock_args,
+    initialize_gpt_model,
+    setup_model_and_optimizer,
+    setup_moe_model_and_optimizer,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+def check_equal(input_1, input_2):
+    """Check if two inputs are equal, used for checking checkpointing."""
+    if isinstance(input_1, dict) and isinstance(input_2, dict):
+        assert input_1.keys() == input_2.keys()
+        for key in input_1.keys():
+            check_equal(input_1[key], input_2[key])
+    elif isinstance(input_1, list) and isinstance(input_2, list):
+        assert len(input_1) == len(input_2)
+        for i in range(len(input_1)):
+            check_equal(input_1[i], input_2[i])
+    elif isinstance(input_1, torch.Tensor) and isinstance(input_2, torch.Tensor):
+        assert torch.all(input_1 == input_2), f"Input 1: {input_1} != Input 2: {input_2}"
+    elif type(input_1) != type(input_2):
+        assert False, f"Input 1 type: {type(input_1)} != Input 2 type: {type(input_2)}"
+    else:
+        assert input_1 == input_2, f"Input 1: {input_1} != Input 2: {input_2}"
+
+
+def initialize_real_model(
+    seed,
+    pre_process,
+    post_process,
+    vp_stage=None,
+    is_moe=False,
+    is_mla=False,
+    virtual_pipeline_model_parallel_size=None,
+    **config_kwargs,
+):
+    torch.manual_seed(seed)
+    model_parallel_cuda_manual_seed(seed)
+
+    default_config_kwargs = dict(
+        num_layers=6,
+        hidden_size=16,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        pipeline_dtype=torch.bfloat16,
+        bf16=True,
+        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
+    )
+    if is_moe:
+        default_config_kwargs["moe_ffn_hidden_size"] = 128
+        default_config_kwargs["num_moe_experts"] = 4
+        default_config_kwargs["add_bias_linear"] = False
+        # Pop unused fields
+        config_kwargs.pop("use_sp")
+        config_kwargs.pop("use_te")
+        config_kwargs.pop("use_grouped_mlp")
+        config_kwargs.pop("use_glu")
+    if is_mla:
+        default_config_kwargs["multi_latent_attention"] = True
+        default_config_kwargs["q_lora_rank"] = 96
+        default_config_kwargs["kv_lora_rank"] = 512
+        default_config_kwargs["qk_head_dim"] = 64
+        default_config_kwargs["qk_pos_emb_head_dim"] = 32
+        default_config_kwargs["v_head_dim"] = 64
+    config_kwargs.pop("pg_collection", None)
+    config_kwargs.pop("config", None)
+    default_config_kwargs.update(**config_kwargs)
+    config_cls = MLATransformerConfig if is_mla else TransformerConfig
+    transformer_config = config_cls(**default_config_kwargs)
+
+    if is_moe:
+        layer_spec = get_gpt_decoder_block_spec(
+            transformer_config, use_transformer_engine=True, vp_stage=vp_stage
+        )
+    else:
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(multi_latent_attention=is_mla)
+    this_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=layer_spec,
+        vocab_size=128,
+        max_sequence_length=4,
+        pre_process=pre_process,
+        post_process=post_process,
+        vp_stage=vp_stage,
+    )
+
+    return this_model
+
+
+def load_checkpoint_no_arg_checks(*args, **kwargs):
+    with mock.patch('megatron.training.checkpointing.check_checkpoint_args'):
+        with mock.patch('megatron.training.checkpointing.update_num_microbatches'):
+            return load_checkpoint(*args, **kwargs)
+
+
+class TestLayerWiseOptimizer:
+    """Tests for LayerWiseDistributedOptimizer functionality."""
+
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_parameter_sharding(self):
+        """Test that parameters are correctly sharded across DP ranks."""
+        Utils.initialize_model_parallel(1, 1)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=1,
+            pp=1,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        # Check if optimizer is ChainedOptimizer (expected for standard setup)
+        if isinstance(optimizer, ChainedOptimizer):
+            total_params = sum(
+                len(group['params'])
+                for opt in optimizer.chained_optimizers
+                for group in opt.param_groups
+            )
+            assert total_params > 0, "No parameters found in optimizer"
+
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2, 4])
+    def test_broadcast_params(self, tp, pp):
+        """Test that parameter broadcasting works correctly across DP ranks."""
+        if tp * pp > 8:
+            pytest.skip(f"TP*PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(tp, pp)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=tp,
+            pp=pp,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        # If this is a LayerWiseDistributedOptimizer, test broadcast
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            # Store original param values
+            original_params = {}
+            for name, param in model[0].named_parameters():
+                original_params[name] = param.data.clone()
+
+            # Call broadcast (should be idempotent if no updates)
+            optimizer.broadcast_params()
+
+            # Check params are unchanged after broadcast without step
+            for name, param in model[0].named_parameters():
+                assert torch.allclose(param.data, original_params[name])
+
+    # TODO(deyuf): check bf16 False case
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2, 4])
+    @pytest.mark.parametrize('bf16', [True])
+    def test_layer_wise_optimizer_save_load(self, tmp_path_dist_ckpt, tp, pp, bf16):
+        """Test save/load of LayerWiseDistributedOptimizer checkpoints."""
+        if tp * pp > 8:
+            pytest.skip(f"TP*PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(tp, pp)
+
+        with TempNamedDir(
+            tmp_path_dist_ckpt / 'test_layer_wise_optimizer_A', sync=True
+        ) as ckpt_dir_A:
+            with TempNamedDir(
+                tmp_path_dist_ckpt / 'test_layer_wise_optimizer_B', sync=True
+            ) as ckpt_dir_B:
+                # Create model and optimizer A
+                model_A, optimizer_A = setup_model_and_optimizer(
+                    seed=2,
+                    tp=tp,
+                    pp=pp,
+                    bf16=bf16,
+                    dist_opt=False,
+                    initialize_fn=initialize_gpt_model,
+                    optimizer='dist_muon',
+                )
+
+                # Save checkpoint A
+                model_sharded_sd_A = model_A[0].sharded_state_dict()
+                optim_sd_A = optimizer_A.sharded_state_dict(model_sharded_sd_A)
+                save(optim_sd_A, ckpt_dir_A)
+
+                # Create model and optimizer B with different seed
+                model_B, optimizer_B = setup_model_and_optimizer(
+                    seed=3,
+                    tp=tp,
+                    pp=pp,
+                    bf16=bf16,
+                    dist_opt=False,
+                    initialize_fn=initialize_gpt_model,
+                    optimizer='dist_muon',
+                )
+
+                # Load checkpoint A into optimizer B
+                model_sharded_sd_B = model_B[0].sharded_state_dict()
+                load_sharded_sd = optimizer_B.sharded_state_dict(
+                    model_sharded_sd_B, is_loading=True
+                )
+                state_dict = load(load_sharded_sd, ckpt_dir_A)
+                optimizer_B.load_state_dict(state_dict)
+
+                # Save as checkpoint B
+                optim_sd_B = optimizer_B.sharded_state_dict(model_sharded_sd_B)
+                save(optim_sd_B, ckpt_dir_B)
+
+                Utils.destroy_model_parallel()
+
+                # Compare checkpoints
+                Utils.initialize_model_parallel(1, 1)
+                from megatron.core.dist_checkpointing import load_plain_tensors
+
+                plain_sd_A = load_plain_tensors(ckpt_dir_A)
+                plain_sd_B = load_plain_tensors(ckpt_dir_B)
+
+                check_equal(plain_sd_A, plain_sd_B)
+
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2, 4])
+    def test_layer_wise_optimizer_grad_norm(self, tp, pp):
+        """Test that gradient norm calculation works correctly."""
+        if tp * pp > 8:
+            pytest.skip(f"TP*PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(tp, pp)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=tp,
+            pp=pp,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        # Create dummy gradients
+        for param in model[0].parameters():
+            if param.requires_grad:
+                param.grad = torch.randn_like(param.data)
+
+        # Test grad norm calculation
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            grad_norm = optimizer.get_grad_norm()
+            assert grad_norm is not None
+            assert grad_norm >= 0
+
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2, 4])
+    def test_layer_wise_optimizer_count_zeros(self, tp, pp):
+        """Test that zero counting in gradients works correctly."""
+        if tp * pp > 8:
+            pytest.skip(f"TP*PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(tp, pp)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=tp,
+            pp=pp,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        # Create dummy gradients with some zeros
+        for param in model[0].parameters():
+            if param.requires_grad:
+                grad = torch.randn_like(param.data)
+                # Set some values to zero
+                grad[grad < 0] = 0
+                param.grad = grad
+
+        # Test zero counting
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            num_zeros = optimizer.count_zeros()
+            assert num_zeros >= 0
+
+    @pytest.mark.parametrize('src_tp', [1, 2, 4])
+    @pytest.mark.parametrize('src_pp', [1, 2, 4])
+    @pytest.mark.parametrize('dest_tp', [1, 2, 4])
+    @pytest.mark.parametrize('dest_pp', [1, 2, 4])
+    def test_layer_wise_optimizer_resharding(
+        self, tmp_path_dist_ckpt, src_tp, src_pp, dest_tp, dest_pp
+    ):
+        """Test resharding of LayerWiseDistributedOptimizer across different TP/PP."""
+        if src_tp * src_pp > 8:
+            pytest.skip(f"SRC_TP*SRC_PP > 8 is larger than world size")
+
+        if dest_tp * dest_pp > 8:
+            pytest.skip(f"DEST_TP*DEST_PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(src_tp, src_pp)
+
+        with TempNamedDir(
+            tmp_path_dist_ckpt / 'test_layer_wise_resharding_A', sync=True
+        ) as ckpt_dir:
+            # Create and save with source configuration
+            model_A, optimizer_A = setup_model_and_optimizer(
+                seed=2,
+                tp=src_tp,
+                pp=src_pp,
+                bf16=True,
+                dist_opt=False,
+                initialize_fn=initialize_gpt_model,
+                optimizer='dist_muon',
+            )
+
+            model_sharded_sd = model_A[0].sharded_state_dict()
+            optim_sd = optimizer_A.sharded_state_dict(model_sharded_sd)
+            save(optim_sd, ckpt_dir)
+
+            Utils.destroy_model_parallel()
+
+            # Load with destination configuration
+            Utils.initialize_model_parallel(dest_tp, dest_pp)
+            model_B, optimizer_B = setup_model_and_optimizer(
+                seed=3,
+                tp=dest_tp,
+                pp=dest_pp,
+                bf16=True,
+                dist_opt=False,
+                initialize_fn=initialize_gpt_model,
+                optimizer='dist_muon',
+            )
+
+            model_sharded_sd = model_B[0].sharded_state_dict()
+            load_sharded_sd = optimizer_B.sharded_state_dict(model_sharded_sd, is_loading=True)
+
+            state_dict = load(load_sharded_sd, ckpt_dir)
+            optimizer_B.load_state_dict(state_dict)
+
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2, 4])
+    @pytest.mark.parametrize('ep', [1, 2, 4])
+    def test_layer_wise_optimizer_with_moe(self, tmp_path_dist_ckpt, tp, pp, ep):
+        """Test LayerWiseDistributedOptimizer with MoE models."""
+        if tp * pp * ep > 8:
+            pytest.skip(f"TP*PP > 8 is larger than world size")
+
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
+            expert_model_parallel_size=ep,
+        )
+
+        with TempNamedDir(tmp_path_dist_ckpt / 'test_layer_wise_moe', sync=True) as ckpt_dir:
+            # Create MoE model with optimizer
+            model, optimizer = setup_moe_model_and_optimizer(
+                seed=2, tp=tp, pp=pp, ep=ep, bf16=True, dist_opt=False, optimizer='dist_muon'
+            )
+
+            # Test that optimizer handles expert parallel parameters
+            if isinstance(optimizer, LayerWiseDistributedOptimizer):
+                # Check that expt_dp_params_list exists if EP > 1
+                if ep > 1:
+                    assert hasattr(optimizer, 'expt_dp_params_list')
+
+            # Test save/load
+            model_sharded_sd = model[0].sharded_state_dict()
+            optim_sd = optimizer.sharded_state_dict(model_sharded_sd)
+            save(optim_sd, ckpt_dir)
+
+            # Create new optimizer and load
+            model_new, optimizer_new = setup_moe_model_and_optimizer(
+                seed=3, tp=tp, pp=pp, ep=ep, bf16=True, dist_opt=False, optimizer='dist_muon'
+            )
+
+            model_sharded_sd = model_new[0].sharded_state_dict()
+            load_sharded_sd = optimizer_new.sharded_state_dict(model_sharded_sd, is_loading=True)
+            state_dict = load(load_sharded_sd, ckpt_dir)
+            optimizer_new.load_state_dict(state_dict)
+
+    def test_layer_wise_optimizer_replica_id(self):
+        """Test that LayerWiseDistributedOptimizer sets replica_id correctly."""
+        Utils.initialize_model_parallel(2, 2)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=2,
+            pp=2,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            model_sharded_sd = model[0].sharded_state_dict()
+            optim_sd = optimizer.sharded_state_dict(model_sharded_sd)
+
+            # Extract ShardedTensors and check replica_id
+            from megatron.core.dist_checkpointing import ShardedTensor
+
+            for sh_base in nested_values(optim_sd):
+                if isinstance(sh_base, ShardedTensor):
+                    # Check that replica_id has been modified
+                    assert len(sh_base.replica_id) == 3
+                    # DP component should be 0 for layer-wise optimizer
+                    assert sh_base.replica_id[2] == 0
+
+    @pytest.mark.parametrize('dp_size', [1, 2, 4])
+    def test_layer_wise_optimizer_dp_sizes(self, dp_size):
+        """Test LayerWiseDistributedOptimizer with different DP sizes."""
+        # Use TP to vary DP size while keeping world size constant
+        world_size = 8
+        if world_size % dp_size != 0:
+            pytest.skip(f"World size {world_size} not divisible by DP size {dp_size}")
+
+        pp = 1
+        tp = world_size // dp_size
+
+        if tp == 0:
+            pytest.skip(f"Invalid TP configuration")
+
+        Utils.initialize_model_parallel(tp, pp)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=tp,
+            pp=1,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            # Check parameter sharding based on DP size
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+            pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True)
+
+            actual_dp_size = get_pg_size(pg_collection.dp_cp)
+
+            if actual_dp_size > 1:
+                assert optimizer.dp_cp_params_list is not None
+                assert len(optimizer.dp_cp_params_list) == actual_dp_size
+            else:
+                assert optimizer.dp_cp_params_list is None
+
+    def test_layer_wise_optimizer_step(self):
+        """Test that step function works and returns expected values."""
+        Utils.initialize_model_parallel(2, 2)
+
+        model, optimizer = setup_model_and_optimizer(
+            seed=2,
+            tp=2,
+            pp=2,
+            bf16=True,
+            dist_opt=False,
+            initialize_fn=initialize_gpt_model,
+            optimizer='dist_muon',
+        )
+
+        # Create dummy gradients
+        for param in model[0].parameters():
+            if param.requires_grad:
+                param.grad = torch.randn_like(param.data)
+
+        if isinstance(optimizer, LayerWiseDistributedOptimizer):
+            # Perform step
+            update_successful, grad_norm, num_zeros = optimizer.step()
+
+            # Check return values
+            assert isinstance(update_successful, bool)
+            assert grad_norm is None or grad_norm >= 0
+            assert num_zeros is None or num_zeros >= 0
+
+    # TODO(@boxiangw): Add test for loading with different TP/PP sizes
+    @pytest.mark.parametrize("fully_parallel", [True, False])
+    @pytest.mark.parametrize('optimizer_type', ['dist_muon', 'muon'])
+    @pytest.mark.parametrize('tp', [1, 2, 4])
+    @pytest.mark.parametrize('pp', [1, 2])
+    @pytest.mark.parametrize('ep', [1, 2, 4])
+    @pytest.mark.parametrize('is_moe', [True, False])
+    @pytest.mark.parametrize('is_mla', [True, False])
+    def test_optimizer_common_state_dict(
+        self, tmp_path_dist_ckpt, fully_parallel, tp, pp, ep, is_moe, is_mla, optimizer_type
+    ):
+        if tp * pp * ep > 8:
+            pytest.skip(f"TP*PP*EP > 8 is larger than world size")
+
+        if ep > 1 and not is_moe:
+            pytest.skip(f"EP > 1 needs to be used with MoE")
+
+        initialize_fn = partial(initialize_real_model, is_moe=is_moe, is_mla=is_mla)
+
+        # Initialize parallel
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
+            expert_model_parallel_size=ep,
+        )
+        rank = torch.distributed.get_rank()
+
+        with TempNamedDir(tmp_path_dist_ckpt / 'test_dp_sharding', sync=True) as ckpt_dir:
+            mock_args = parse_args(ignore_unknown_args=True)
+            mock_args.use_distributed_optimizer = False
+            with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args):
+                # Initialize model and optimizer A
+                if is_moe:
+                    model, optimizer_A = setup_moe_model_and_optimizer(
+                        seed=2,
+                        tp=tp,
+                        pp=pp,
+                        ep=ep,
+                        initialize_fn=initialize_fn,
+                        dist_opt=False,
+                        optimizer=optimizer_type,
+                    )
+                else:
+                    model, optimizer_A = setup_model_and_optimizer(
+                        seed=2,
+                        tp=tp,
+                        pp=pp,
+                        initialize_fn=initialize_fn,
+                        dist_opt=False,
+                        optimizer=optimizer_type,
+                    )
+
+                # Save checkpoint
+                init_checkpointing_mock_args(mock_args, ckpt_dir, fully_parallel=fully_parallel)
+                from megatron.training.training import preprocess_common_state_dict
+
+                save_checkpoint(
+                    10,
+                    model,
+                    optimizer_A,
+                    None,
+                    0,
+                    preprocess_common_state_dict_fn=preprocess_common_state_dict,
+                )
+
+                # Get optimizer A param state
+                optim_param_state_A = optimizer_A.state_dict()
+
+                # Initialize model and optimizer B
+                if is_moe:
+                    model, optimizer_B = setup_moe_model_and_optimizer(
+                        seed=3,
+                        tp=tp,
+                        pp=pp,
+                        ep=ep,
+                        initialize_fn=initialize_fn,
+                        dist_opt=False,
+                        optimizer=optimizer_type,
+                    )
+                else:
+                    model, optimizer_B = setup_model_and_optimizer(
+                        seed=3,
+                        tp=tp,
+                        pp=pp,
+                        initialize_fn=initialize_fn,
+                        dist_opt=False,
+                        optimizer=optimizer_type,
+                    )
+
+                # Load optimizer B from checkpoint
+                load_checkpoint_no_arg_checks(model, optimizer_B, None)
+
+                # Get optimizer B param state
+                optim_param_state_B = optimizer_B.state_dict()
+
+                # Test both param state dicts are equal
+                check_equal(optim_param_state_A, optim_param_state_B)
+
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/dist_checkpointing/test_local.py b/tests/unit_tests/dist_checkpointing/test_local.py
index c768e518aba..62468eb1ac8 100644
--- a/tests/unit_tests/dist_checkpointing/test_local.py
+++ b/tests/unit_tests/dist_checkpointing/test_local.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import filecmp
 import logging
 import shutil
@@ -26,6 +27,7 @@
     LocalCheckpointManager,
 )
 
+from megatron.core import parallel_state
 from megatron.core.dist_checkpointing import ShardedTensor
 from megatron.core.dist_checkpointing.dict_utils import diff
 from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedTensorFactory
@@ -80,7 +82,10 @@ def test_sharded_tensors(self, tp, pp, use_torch_fsdp2):
         opt_param_scheduler = None
         rng_state = None
         iteration = None
-        optim_sd_kwargs = dict(sharding_type='fully_sharded_model_space')
+        dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+        metadata = {'distrib_optim_sharding_type': 'fully_reshardable', 'dp_cp_group': dp_cp_group}
+        model_sd_kwargs = dict(metadata={'dp_cp_group': dp_cp_group})
+        optim_sd_kwargs = dict(metadata=metadata)
         mock_args = parse_args(ignore_unknown_args=True)
         mock_args.no_save_optim = False
         mock_args.no_save_rng = True
@@ -93,6 +98,7 @@ def test_sharded_tensors(self, tp, pp, use_torch_fsdp2):
             opt_param_scheduler,
             rng_state,
             iteration=iteration,
+            model_sd_kwargs=model_sd_kwargs,
             optim_sd_kwargs=optim_sd_kwargs,
         )
         sharded_tensor_factories = find_matching_values(
diff --git a/tests/unit_tests/dist_checkpointing/test_mapping.py b/tests/unit_tests/dist_checkpointing/test_mapping.py
index 38582d75240..77f088d62b7 100644
--- a/tests/unit_tests/dist_checkpointing/test_mapping.py
+++ b/tests/unit_tests/dist_checkpointing/test_mapping.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import pytest
 import torch
@@ -39,26 +39,6 @@ def test_from_rank_offsets_constructor(self, dtype=torch.float, device='cuda'):
         assert sh_ten.global_offset == (0, 0, shape[2] * 3, 0)
         assert sh_ten.axis_fragmentations == (10, 1, 6, 1)
 
-    def test_from_rank_offsets_flat_constructor(self, dtype=torch.float, device='cuda'):
-        data = torch.arange(28, dtype=dtype, device=device).reshape((1, 4, 7))
-        shape = data.shape
-        rank_offsets = [(1, 0, 2), (2, 3, 5)]
-        flattened_range = slice(4, 9)
-        flat_data = data.flatten()[flattened_range]
-        sh_ten = ShardedTensor.from_rank_offsets_flat(
-            'keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range
-        )
-
-        # The main attributes properties are unchanged
-        assert isinstance(sh_ten, ShardedTensor)
-        assert sh_ten.dtype is dtype
-        assert sh_ten.local_shape == shape
-        assert sh_ten.global_shape == (shape[0], shape[1] * 2, shape[2] * 5)
-        assert sh_ten.global_offset == (0, 0, shape[2] * 3)
-        assert sh_ten.axis_fragmentations == (1, 2, 5)
-
-        assert torch.all(sh_ten.data == torch.arange(4, 9, device=device))
-
     def test_metadata_integrity_violation(self):
         data = torch.ones((1, 3, 7, 9), device='meta')
         rank_offsets = [(0, 0, 10), (2, 3, 6)]
@@ -73,19 +53,6 @@ def test_metadata_integrity_violation(self):
             sh_ten.global_offset = (0, 1, 0)
             sh_ten.validate_metadata_integrity()
 
-        with pytest.raises(CheckpointingException):
-            sh_ten = ShardedTensor.from_rank_offsets_flat(
-                'keyA', data, data.shape, *rank_offsets, flattened_range=slice(4, 9)
-            )
-
-        sh_ten = ShardedTensor.from_rank_offsets_flat(
-            'keyA', data.flatten()[4:9], data.shape, *rank_offsets, flattened_range=slice(4, 9)
-        )
-        assert sh_ten.local_shape == (1, 3, 7, 9)
-        with pytest.raises(CheckpointingException):
-            sh_ten.local_shape = (5,)
-            sh_ten.validate_metadata_integrity()
-
     def test_narrowing(self):
         data = torch.ones((1, 3, 7, 9))
         rank_offsets = [(0, 0, 10), (2, 3, 6)]
@@ -100,38 +67,6 @@ def test_narrowing(self):
         assert narr_sh_ten.global_shape == (10, 3, 12, 9)
         assert narr_sh_ten.global_offset == (0, 0, 6, 0)
 
-    def test_flat_narrow(self):
-        data = torch.arange(28).reshape((4, 7))
-        rank_offsets = [(0, 1, 2), (1, 3, 5)]
-        flattened_range = slice(4, 9)
-        flat_data = data.flatten()[flattened_range]
-        sh_ten = ShardedTensor.from_rank_offsets_flat(
-            'keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range
-        )
-
-        # The main attributes properties are unchanged
-        assert isinstance(sh_ten, ShardedTensor)
-        assert torch.all(sh_ten.data == torch.arange(4, 9))
-
-        (narrow_sh_ten,) = sh_ten.narrow(
-            0, 0, 1
-        )  # First seven elements of unflat, intersection has 3 elements
-        assert torch.all(narrow_sh_ten.data == torch.arange(4, 7))
-        assert narrow_sh_ten.local_shape == (1, 7)
-        assert narrow_sh_ten.global_shape == (2, 35)
-        assert narrow_sh_ten.global_offset == (1, 21)
-
-        (narrow_sh_ten,) = sh_ten.narrow(
-            0, 0, 3
-        )  # First 21 elements of unflat, intersection has all 5 elements
-        assert torch.all(narrow_sh_ten.data == torch.arange(4, 9))
-        assert narrow_sh_ten.local_shape == (3, 7)
-        assert narrow_sh_ten.global_shape == (6, 35)
-        assert narrow_sh_ten.global_offset == (3, 21)
-
-        narrow_sh_ten = sh_ten.narrow(0, 2, 1)  # empty intersection
-        assert not narrow_sh_ten, narrow_sh_ten
-
 
 class TestShardedTensorFactory:
     def test_build_and_merge(self):
diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py
index 4c5bfd9b32e..d85ac62a498 100644
--- a/tests/unit_tests/dist_checkpointing/test_optimizer.py
+++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import re
 from copy import deepcopy
 from functools import partial
@@ -706,12 +707,10 @@ def test_bucket_space_optimizer_save_load(
             # Note: PP must be > 1 if TP <= 2 because of empty buckets otherwise
             ((2, 4), (2, 4), 'fully_reshardable', False),
             ((4, 2), (4, 2), 'dp_reshardable', None),
-            ((8, 1), (8, 1), 'fully_sharded_model_space', None),
             # DP resharding:
             ((4, 2), (4, 1), 'dp_reshardable', None),
             ((2, 4), (2, 2), 'fully_reshardable', False),
             ((2, 4), (2, 2), 'fully_reshardable', True),
-            ((1, 8), (1, 2), 'fully_sharded_model_space', None),
         ],
     )
     @pytest.mark.parametrize("initalize_fn", [initialize_pp_agnostic_model])
@@ -917,8 +916,12 @@ def preprocess_fn(optim_common_dict):
                     bf16=False,
                 )
 
+                metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
+
                 save(
-                    optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()),
+                    optimizer_A.sharded_state_dict(
+                        model_A[0].sharded_state_dict(), metadata=metadata
+                    ),
                     ckpt_dir_A,
                     preprocess_common_before_consistancy_check=preprocess_fn,
                 )
@@ -934,12 +937,17 @@ def preprocess_fn(optim_common_dict):
                     bf16=False,
                 )
                 load_sharded_state_dict = optimizer_B.sharded_state_dict(
-                    model_B[0].sharded_state_dict(), is_loading=True
+                    model_B[0].sharded_state_dict(), is_loading=True, metadata=metadata
                 )
                 state_dict = load(load_sharded_state_dict, ckpt_dir_A)
 
                 optimizer_B.load_state_dict(state_dict)
-                save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B)
+                save(
+                    optimizer_B.sharded_state_dict(
+                        model_B[0].sharded_state_dict(), metadata=metadata
+                    ),
+                    ckpt_dir_B,
+                )
                 Utils.destroy_model_parallel()
 
                 # Test both checkpoints are equal
@@ -999,10 +1007,7 @@ def test_optimizer_resharding(
                     initialize_fn=initialize_fn,
                 )
 
-                if fully_parallel:
-                    metadata = {'distrib_optim_sharding_type': 'fully_sharded_model_space'}
-                else:
-                    metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
+                metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
 
                 save(
                     optimizer_A.sharded_state_dict(
@@ -1086,10 +1091,7 @@ def test_chained_optimizer_resharding(
                     use_glu=use_glu,
                 )
 
-                if fully_parallel:
-                    metadata = {'distrib_optim_sharding_type': 'fully_sharded_model_space'}
-                else:
-                    metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
+                metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
 
                 save(
                     optimizer_A.sharded_state_dict(
diff --git a/tests/unit_tests/dist_checkpointing/test_pipeline_parallel_layout.py b/tests/unit_tests/dist_checkpointing/test_pipeline_parallel_layout.py
index 7a5f7d2dd91..19658986d6e 100644
--- a/tests/unit_tests/dist_checkpointing/test_pipeline_parallel_layout.py
+++ b/tests/unit_tests/dist_checkpointing/test_pipeline_parallel_layout.py
@@ -131,7 +131,6 @@ def create_args():
     args.ckpt_fully_parallel_save = False
     args.ckpt_fully_parallel_load = False
     args.auto_detect_ckpt_format = False
-    args.retro_add_retriever = False
     args.ckpt_convert_update_legacy_dist_opt_format = False
     args.ckpt_step = None
     args.use_dist_ckpt = True
@@ -150,9 +149,9 @@ def create_args():
     args.no_load_rng = True
     args.use_distributed_optimizer = True
     args.use_megatron_fsdp = False
-    args.dist_ckpt_save_pre_mcore_014 = False
     args.dist_ckpt_optim_fully_reshardable = False
     args.distrib_optim_fully_reshardable_mem_efficient = False
+    args.phase_transition_iterations = None
 
     yield args
 
diff --git a/tests/unit_tests/dist_checkpointing/test_serialization.py b/tests/unit_tests/dist_checkpointing/test_serialization.py
index d2bebc93101..0815633f9b5 100644
--- a/tests/unit_tests/dist_checkpointing/test_serialization.py
+++ b/tests/unit_tests/dist_checkpointing/test_serialization.py
@@ -77,16 +77,6 @@ def test_single_process_save_load(self, tmp_path_dist_ckpt):
             save(sharded_state_dict, ckpt_dir)
             torch.distributed.barrier()
 
-            saved_config = maybe_load_config(ckpt_dir)
-            if saved_config.sharded_backend == 'zarr':
-                assert (ckpt_dir / 'keyA').is_dir()
-                assert (ckpt_dir / 'keyB').is_dir()
-                assert not (ckpt_dir / 'keyC').exists()
-                assert not (ckpt_dir / 'sd_keyA').is_dir()
-
-                if HAVE_DTENSOR:
-                    assert (ckpt_dir / 'keyD').is_dir()
-
             load_ssd = {
                 'load_sd_keyA': ShardedTensor.from_rank_offsets(
                     'keyA', torch.ones(2, 4), replica_id=Utils.rank
@@ -127,13 +117,6 @@ def preprocess_fn(x):
                 preprocess_common_before_consistancy_check=preprocess_fn,
             )
 
-            saved_config = maybe_load_config(ckpt_dir)
-            if saved_config.sharded_backend == 'zarr':
-                assert (ckpt_dir / 'keyA').is_dir()
-                assert (ckpt_dir / 'keyB').is_dir()
-                assert not (ckpt_dir / 'keyC').exists()
-                assert not (ckpt_dir / 'sd_keyA').is_dir()
-
         Utils.destroy_model_parallel()
 
     def test_multi_process_save_log_difference(self, tmp_path_dist_ckpt, caplog):
@@ -426,7 +409,6 @@ def test_load_error_msg(self, tmp_path_dist_ckpt):
                 load(state_dict, ckpt_dir)
             assert f'is not a distributed checkpoint' in str(exc_info.value)
 
-            # Missing Zarr arrays
             torch.distributed.barrier()
             save(state_dict, ckpt_dir)
             sh_ten.key = 'different_key'
diff --git a/tests/unit_tests/dist_checkpointing/utils.py b/tests/unit_tests/dist_checkpointing/utils.py
index 62215815e67..dd12ecd7684 100644
--- a/tests/unit_tests/dist_checkpointing/utils.py
+++ b/tests/unit_tests/dist_checkpointing/utils.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 from functools import partial
 from typing import Any, Callable, Tuple, Union
 from unittest import mock
@@ -11,6 +12,7 @@
     get_gpt_layer_with_transformer_engine_spec,
 )
 from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
+from megatron.core.optimizer.muon import get_megatron_muon_optimizer
 from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed
 from megatron.core.transformer import TransformerConfig
 from megatron.training.arguments import parse_args
@@ -152,7 +154,6 @@ def init_checkpointing_mock_args(args, ckpt_dir, fully_parallel=False):
     args.consumed_train_samples = 0
     args.skipped_train_samples = 0
     args.consumed_valid_samples = 0
-    args.retro_add_retriever = False
     args.no_load_optim = False
     args.no_load_rng = False
     args.dist_ckpt_strictness = 'assume_ok_unexpected'
@@ -163,20 +164,19 @@ def init_checkpointing_mock_args(args, ckpt_dir, fully_parallel=False):
     args.num_attention_heads = NUM_ATTENTION_HEADS
     args.ckpt_step = None
     args.use_megatron_fsdp = False
-    args.dist_ckpt_save_pre_mcore_014 = False
     args.dist_ckpt_optim_fully_reshardable = False
     args.distrib_optim_fully_reshardable_mem_efficient = False
+    args.phase_transition_iterations = None
 
 
 def setup_model_and_optimizer(
-    seed,
-    tp,
-    pp,
-    initialize_fn=initialize_gpt_model,
-    bf16=True,
-    dist_opt=True,
-    data_parallel_sharding_strategy="optim_grads_params",
+    seed, tp, pp, initialize_fn=initialize_gpt_model, bf16=True, dist_opt=True, optimizer='adam'
 ):
+    if 'muon' in optimizer and dist_opt:
+        raise ValueError(
+            "Layer-wise distributed optimizer with Muon is not supported with distributed optimizer."
+        )
+
     mock_args = parse_args(ignore_unknown_args=True)
     with mock.patch('megatron.training.training.get_args', new=lambda: mock_args):
         init_basic_mock_args(mock_args, tp, pp, bf16=bf16)
@@ -195,17 +195,33 @@ def setup_model_and_optimizer(
         bf16=bf16,
         params_dtype=torch.bfloat16 if bf16 else torch.float,
         use_distributed_optimizer=dist_opt,
+        optimizer=optimizer,
     )
-    optimizer = get_megatron_optimizer(config, model)
+
+    if 'muon' in optimizer:
+        # Use layer-wise distributed optimizer with Muon
+        optimizer_type = optimizer
+        # default lr None feels wrong. only change muon lr to avoid breaking old tests
+        config.lr = 0.0
+        optimizer = get_megatron_muon_optimizer(
+            config, model, layer_wise_distributed_optimizer='dist' in optimizer_type
+        )
+    else:
+        optimizer_type = optimizer
+        optimizer = get_megatron_optimizer(config, model)
 
     torch.manual_seed(seed + 1)
     model_parallel_cuda_manual_seed(seed + 1)
 
-    for group in optimizer.optimizer.param_groups:
-        for p in group['params']:
-            if len(optimizer.optimizer.state[p]) == 0:
-                optimizer.optimizer.state[p]['exp_avg'] = torch.rand_like(p.data)
-                optimizer.optimizer.state[p]['exp_avg_sq'] = torch.rand_like(p.data)
+    if not 'muon' in optimizer_type:
+        for group in optimizer.optimizer.param_groups:
+            for p in group['params']:
+                if len(optimizer.optimizer.state[p]) == 0:
+                    optimizer.optimizer.state[p]['exp_avg'] = torch.rand_like(p.data)
+                    optimizer.optimizer.state[p]['exp_avg_sq'] = torch.rand_like(p.data)
+    else:
+        for opt in optimizer.chained_optimizers:
+            opt.init_state_fn(opt)
 
     optimizer.reload_model_params()
 
@@ -248,7 +264,12 @@ def setup_moe_model_and_optimizer(
     use_te=False,
     use_grouped_mlp=False,
     use_glu=False,
+    optimizer='adam',
 ):
+    if 'muon' in optimizer and dist_opt:
+        raise ValueError(
+            "Layer-wise distributed optimizer with Muon is not supported with distributed optimizer."
+        )
     mock_args = parse_args(ignore_unknown_args=True)
     with mock.patch('megatron.training.training.get_args', new=lambda: mock_args):
         init_basic_mock_args(mock_args, tp, pp, bf16=bf16)
@@ -272,18 +293,33 @@ def setup_moe_model_and_optimizer(
         bf16=bf16,
         params_dtype=torch.bfloat16 if bf16 else torch.float,
         use_distributed_optimizer=dist_opt,
+        optimizer=optimizer,
     )
-    optimizer = get_megatron_optimizer(config, model)
+
+    if 'muon' in optimizer:
+        optimizer_type = optimizer
+        # default lr None feels wrong. only change muon lr to avoid breaking old tests
+        config.lr = 0.0
+        optimizer = get_megatron_muon_optimizer(
+            config, model, layer_wise_distributed_optimizer='dist' in optimizer_type
+        )
+    else:
+        optimizer_type = optimizer
+        optimizer = get_megatron_optimizer(config, model)
 
     torch.manual_seed(seed + 1)
     model_parallel_cuda_manual_seed(seed + 1)
 
-    for opt in optimizer.chained_optimizers:
-        for group in opt.param_groups:
-            for p in group['params']:
-                if len(opt.state[p]) == 0:
-                    opt.state[p]['exp_avg'] = torch.rand_like(p.data)
-                    opt.state[p]['exp_avg_sq'] = torch.rand_like(p.data)
+    if not 'muon' in optimizer_type:
+        for opt in optimizer.chained_optimizers:
+            for group in opt.param_groups:
+                for p in group['params']:
+                    if len(opt.state[p]) == 0:
+                        opt.state[p]['exp_avg'] = torch.rand_like(p.data)
+                        opt.state[p]['exp_avg_sq'] = torch.rand_like(p.data)
+    else:
+        for opt in optimizer.chained_optimizers:
+            opt.init_state_fn(opt)
 
     optimizer.reload_model_params()
 
diff --git a/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py b/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
similarity index 63%
rename from tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py
rename to tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
index 3b41daf58ef..d4c664cda9c 100644
--- a/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py
+++ b/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
@@ -6,6 +6,7 @@
 import torch
 from packaging import version
 from torch import testing
+from torch.testing import assert_close
 
 import megatron.core.parallel_state as mpu
 from megatron.core.distributed import DistributedDataParallelConfig
@@ -16,6 +17,12 @@
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import is_torch_min_version
+from tests.unit_tests.distributed.megatron_fsdp.utils import (
+    make_gpt_mock_data_iterator,
+    make_moe_args_model_and_optimizer,
+    pretrain_forward_backward,
+    set_manual_seed,
+)
 from tests.unit_tests.test_utilities import Utils
 
 
@@ -220,13 +227,16 @@ def train_step(model, optimizer, inputs):
 
     # Testing fsdp_double_buffer with and without nccl_ub
     @pytest.mark.parametrize(
-        ("dp_size", "nccl_ub", "fsdp_double_buffer"), [(8, False, True), (8, True, True)]
+        ("dp_size", "nccl_ub", "fsdp_double_buffer", "fsdp_manual_registration"),
+        [(8, False, True, False), (8, True, True, False), (8, True, True, True)],
     )
-    def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffer):
+    def test_fsdp_user_buffer_registration(
+        self, dp_size, nccl_ub, fsdp_double_buffer, fsdp_manual_registration
+    ):
         """Test that FSDP works correctly with user buffer registration.
         This test compares the training results of the baseline fsdp with the target fsdp config.
-        Baseline fsdp: nccl_ub=False, fsdp_double_buffer=False
-        Target fsdp: nccl_ub=[True, False], fsdp_double_buffer=[True, False]
+        Baseline fsdp: nccl_ub=False, fsdp_double_buffer=False, fsdp_manual_registration=False
+        Target fsdp: nccl_ub=[True, False], fsdp_double_buffer=[True, False], fsdp_manual_registration=[True, False]
         """
         if not is_torch_min_version("2.4.0"):
             pytest.skip("Megatron FSDP requires torch >= 2.4.0")
@@ -264,6 +274,7 @@ def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffe
             use_megatron_fsdp=True,
             nccl_ub=False,
             fsdp_double_buffer=False,
+            fsdp_manual_registration=False,
         )
 
         # Setup FSDP config - target fsdp config
@@ -275,6 +286,7 @@ def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffe
             use_megatron_fsdp=True,
             nccl_ub=nccl_ub,
             fsdp_double_buffer=fsdp_double_buffer,
+            fsdp_manual_registration=fsdp_manual_registration,
         )
 
         # Create two identical models
@@ -354,6 +366,13 @@ def train_step(model, optimizer, inputs):
         out1, loss1 = train_step(baseline_fsdp_model, optimizer1, input_data)
         out2, loss2 = train_step(target_fsdp_model, optimizer2, input_data)
 
+        # In case of manual registration, we need to manually register the buffer
+        # And proceed one more step to check the results
+        if fsdp_manual_registration:
+            out1, loss1 = train_step(baseline_fsdp_model, optimizer1, input_data)
+            target_fsdp_model.param_and_grad_buffer.manual_buffer_registration()
+            out2, loss2 = train_step(target_fsdp_model, optimizer2, input_data)
+
         testing.assert_close(out1, out2, rtol=0, atol=0)
         testing.assert_close(loss1, loss2, rtol=0, atol=0)
 
@@ -481,3 +500,228 @@ def test_fsdp_with_hybrid_sharding(self, num_fsdp_group):
                 atol=0,
                 msg=f"Parameter gradients for {name1} and {name2} don't match",
             )
+
+
+@pytest.fixture(scope="class")
+def ref_cache():
+    """
+    Shared read/write cache for an class.
+    Keys: arbitrary strings, values: anything (tensors, dicts, etc.).
+    """
+    return {}
+
+
+class TestMegatronFSDPE2E:
+
+    @staticmethod
+    def _training_loop(seed=42, **kwargs):
+        """
+        Run a small deterministic (optional) training loop using a mocked MoE/GPT model and optimizer.
+        This helper initializes model-parallel state, creates a model and optimizer via
+        make_moe_args_model_and_optimizer, constructs a mock GPT data iterator, and runs
+        NUM_TRAINING_STEPS iterations of forward/backward/optimization. Losses from each
+        training step are collected and returned.
+        Args:
+            seed (int, optional): RNG seed for reproducibility. Default: 42.
+            **kwargs: Configuration overrides (all optional). Recognized keys:
+                - vocab_size (int): Vocabulary size for the mock model. Default: 100.
+                - seq_length (int): Sequence length used for the mock data. Default: 128.
+                - micro_batch_size (int): Per-microbatch size. Default: 2.
+                - global_batch_size (int): Global batch size across data-parallel ranks. Default: 32.
+                - train_iters (int): Number of training iterations to run. Default: 20.
+                - tensor_model_parallel_size (int): Tensor model parallel world size. Default: 1.
+                - pipeline_model_parallel_size (int): Pipeline model parallel world size. Default: 1.
+                - num_layers_per_virtual_pipeline_stage (int or None): Virtual pipeline configuration.
+                - expert_model_parallel_size (int): Expert model parallel size for MoE. Default: 1.
+                - expert_tensor_parallel_size (int): Expert tensor parallel size for MoE. Default: 1.
+                - num_distributed_optimizer_instances (int): Number of distributed optimizer instances. Default: 1.
+        Returns:
+            list: A list of length train_iters containing the per-step language-model loss values
+            (the value appended from output[-1] each iteration). Loss objects are returned as produced
+            by the training utilities (typically tensors or scalars).
+        Side effects:
+            - Calls Utils.initialize_model_parallel(...) and Utils.destroy_model_parallel().
+            - Sets global RNG state via set_manual_seed(seed).
+            - Constructs models/optimizers via make_moe_args_model_and_optimizer and a data iterator
+              via make_gpt_mock_data_iterator.
+            - Runs optimizer.zero_grad(), pretrain_forward_backward(...), and optim.step() repeatedly.
+            - Calculates the number of micro-batches per step as:
+                global_batch_size // micro_batch_size // data_parallel_world_size.
+              This requires that global_batch_size be divisible by micro_batch_size * data_parallel_world_size.
+        Raises:
+            ValueError: If batch-size arithmetic or other setup assumptions (e.g., divisibility) are violated.
+        """
+        # Configuration parameters with defaults
+        VOCAB_SIZE = kwargs.get("vocab_size", 100)
+        MAX_SEQ_LEN = kwargs.get("seq_length", 128)
+        MICRO_BATCH_SIZE = kwargs.get("micro_batch_size", 2)
+        GLOBAL_BATCH_SIZE = kwargs.get("global_batch_size", 32)
+        NUM_TRAINING_STEPS = kwargs.get("train_iters", 20)
+        TP = kwargs.get("tensor_model_parallel_size", 1)
+        PP = kwargs.get("pipeline_model_parallel_size", 1)
+        VPP = kwargs.get("num_layers_per_virtual_pipeline_stage", None)
+        EP = kwargs.get("expert_model_parallel_size", 1)
+        ETP = kwargs.get("expert_tensor_parallel_size", 1)
+        OUTER_DP = kwargs.get("num_distributed_optimizer_instances", 1)
+
+        # Initialize model parallel groups
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=TP,
+            pipeline_model_parallel_size=PP,
+            expert_model_parallel_size=EP,
+            expert_tensor_parallel_size=ETP,
+            num_distributed_optimizer_instances=OUTER_DP,
+        )
+        DP_GROUP = mpu.get_data_parallel_group()
+
+        # Set manual seed for reproducibility
+        set_manual_seed(seed)
+
+        # Create model and optimizer
+        model_chunks, optim = make_moe_args_model_and_optimizer(
+            ut_filename="test_mcore_fully_sharded_data_parallel.py",
+            micro_batch_size=MICRO_BATCH_SIZE,
+            global_batch_size=GLOBAL_BATCH_SIZE,
+            vocab_size=VOCAB_SIZE,
+            padded_vocab_size=VOCAB_SIZE,
+            seq_length=MAX_SEQ_LEN,
+            sequence_parallel=TP > 1,
+            tensor_model_parallel_size=TP,
+            pipeline_model_parallel_size=PP,
+            num_layers_per_virtual_pipeline_stage=VPP,
+            train_iters=NUM_TRAINING_STEPS,
+            **kwargs,
+        )
+
+        # Prepare data iterator
+        data_iterator = make_gpt_mock_data_iterator(
+            dp_group=DP_GROUP,
+            vocab_size=VOCAB_SIZE,
+            sequence_length=MAX_SEQ_LEN,
+            batch_size=MICRO_BATCH_SIZE,
+            num_samples=GLOBAL_BATCH_SIZE * NUM_TRAINING_STEPS,
+        )
+
+        outputs = []
+
+        # Training loop
+        for _ in range(NUM_TRAINING_STEPS):
+            optim.zero_grad()
+            output = pretrain_forward_backward(
+                model=model_chunks,
+                data_iterator=data_iterator,
+                sequence_length=MAX_SEQ_LEN,
+                micro_batch_size=MICRO_BATCH_SIZE,
+                num_micro_batches=GLOBAL_BATCH_SIZE // MICRO_BATCH_SIZE // DP_GROUP.size(),
+            )
+            optim.step()
+
+            # Collect loss
+            outputs.append(output[-1])
+
+        Utils.destroy_model_parallel()
+
+        return outputs
+
+    @pytest.mark.flaky_in_dev
+    @pytest.mark.skipif(
+        not is_torch_min_version("2.4.0"), reason="Test needs to be updated for torch >= 2.4.0"
+    )
+    @pytest.mark.parametrize(
+        "nd_topology",
+        [
+            pytest.param({"TP": 2}, id="TP2"),
+            pytest.param({"EP": 2, "ETP": 2}, id="EP2_ETP2"),
+            pytest.param({"OUTER_DP": 2, "EP": 2}, id="OUTER_DP2_EP2"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        ("fsdp_sharding_strategy", "use_double_buffer"),
+        [
+            ("optim_grads_params", False),
+            ("optim_grads_params", True),
+            ("optim_grads", False),
+            ("optim", True),
+        ],
+    )
+    def test_compatible_with_nd_parallel(
+        self, ref_cache, nd_topology, fsdp_sharding_strategy, use_double_buffer
+    ):
+        nd_topology_str = "_".join([f"{k}{v}" for k, v in nd_topology.items()])
+        if nd_topology_str not in ref_cache:
+            ref_cache[nd_topology_str] = TestMegatronFSDPE2E._training_loop(
+                use_distributed_optimizer=True
+            )
+
+        outputs = TestMegatronFSDPE2E._training_loop(
+            use_megatron_fsdp=True,
+            data_parallel_sharding_strategy=fsdp_sharding_strategy,
+            init_model_with_meta_device=True,
+            ckpt_format="fsdp_dtensor",
+            gradient_accumulation_fusion=False,
+            fsdp_double_buffer=use_double_buffer,
+        )
+        reference_outputs = ref_cache[nd_topology_str]
+
+        if torch.distributed.get_rank() == 0:
+            for step, (output, ref_output) in enumerate(zip(outputs, reference_outputs)):
+                loss = output["lm loss"]
+                ref_loss = ref_output["lm loss"]
+                assert_close(
+                    loss,
+                    ref_loss,
+                    atol=0,
+                    rtol=0.05,
+                    msg=(
+                        f"Loss mismatch at step {step}, FSDP Loss = {loss.item()}, "
+                        f"Reference Loss = {ref_loss.item()}"
+                        f", Compare = {compare_losses(loss.item(), ref_loss.item())}"
+                    ),
+                )
+
+
+def compare_losses(loss_a: float, loss_b: float, reference: str = "b"):
+    """
+    Compare two loss values with absolute and relative differences.
+
+    Parameters
+    ----------
+    loss_a : float
+        First loss value (e.g., baseline model).
+    loss_b : float
+        Second loss value (e.g., new model).
+    reference : {"a", "b"}, default "b"
+        Which loss to treat as the reference when computing the
+        relative difference. If "b", relative diff is vs loss_b;
+        if "a", vs loss_a.
+
+    Returns
+    -------
+    dict with keys:
+        "abs_diff" : float
+            |loss_a - loss_b|
+        "rel_diff" : float
+            |loss_a - loss_b| / reference_loss
+        "better" : str
+            "a" if loss_a < loss_b, "b" if loss_b < loss_a, "equal" otherwise.
+    """
+    abs_diff = abs(loss_a - loss_b)
+
+    if reference == "a":
+        ref = loss_a
+    else:
+        ref = loss_b
+
+    if ref == 0:
+        rel_diff = float("inf")  # or None, depending on your preference
+    else:
+        rel_diff = abs_diff / ref
+
+    if loss_a < loss_b:
+        better = "a"
+    elif loss_b < loss_a:
+        better = "b"
+    else:
+        better = "equal"
+
+    return {"abs_diff": abs_diff, "rel_diff": rel_diff, "better": better}
diff --git a/tests/unit_tests/distributed/fsdp/test_mfsdp_fully_shard.py b/tests/unit_tests/distributed/megatron_fsdp/test_mfsdp_fully_shard.py
similarity index 71%
rename from tests/unit_tests/distributed/fsdp/test_mfsdp_fully_shard.py
rename to tests/unit_tests/distributed/megatron_fsdp/test_mfsdp_fully_shard.py
index ed62abcc94a..9124bccee0e 100644
--- a/tests/unit_tests/distributed/fsdp/test_mfsdp_fully_shard.py
+++ b/tests/unit_tests/distributed/megatron_fsdp/test_mfsdp_fully_shard.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import logging
 import shutil
+from contextlib import nullcontext
 from copy import deepcopy
 from pathlib import Path
 
@@ -13,6 +15,8 @@
 
 from tests.unit_tests.test_utilities import Utils
 
+logger = logging.getLogger(__name__)
+
 HSDP = "hsdp"
 DP = "dp"
 DP_SHARD = "dp_shard"
@@ -30,6 +34,10 @@
 DIM_SIZE = 2
 NUM_LAYERS = 2
 NUM_STEPS = 2
+DELAYED_FP8_RECIPE = "fp8_delayed_scaling"
+CURRENT_FP8_RECIPE = "fp8_current_scaling"
+BLOCKWISE_FP8_RECIPE = "fp8_blockwise_scaling"
+MXFP8_BLOCKWISE_RECIPE = "mxfp8_blockwise"
 
 # Needed for `torch.distributed.checkpoint.{save,load}` because
 # multiple processes need to write to the same directory.
@@ -37,15 +45,22 @@
 
 
 def destroy_device_mesh(device_mesh):
-    from torch.distributed.device_mesh import _mesh_resources
 
     # Teardown device mesh.
     del device_mesh
-    _mesh_resources.mesh_stack.clear()
-    _mesh_resources.child_to_root_mapping.clear()
-    _mesh_resources.root_to_flatten_mapping.clear()
-    _mesh_resources.flatten_name_to_root_dims.clear()
-    _mesh_resources.mesh_dim_group_options.clear()
+    try:
+        from torch.distributed.device_mesh import _mesh_resources
+
+        _mesh_resources.child_to_root_mapping.clear()
+        _mesh_resources.root_to_flatten_mapping.clear()
+        _mesh_resources.mesh_stack.clear()
+        _mesh_resources.mesh_dim_group_options.clear()
+        _mesh_resources.flatten_name_to_root_dims.clear()
+    except Exception as e:
+        # Global _MeshEnv is on a convoluted deprecation path.
+        # Attempt to clean the global state, otherwise skip.
+        logger.warning(f"Did not clean the deprecated DeviceMesh global state. Skipping...\n{e}")
+        pass
 
 
 class ToyCNN(torch.nn.Module):
@@ -109,17 +124,33 @@ def forward(self, x, y):
 class ToyTETransformer(torch.nn.Module):
     """Toy Transformer model for testing Megatron-FSDP with Transformer Engine."""
 
-    def __init__(self, model_dim, num_heads, num_layers, output_dim):
+    def __init__(
+        self,
+        model_dim,
+        num_heads,
+        num_layers,
+        output_dim,
+        fuse_qkv_params=False,
+        params_dtype=torch.float32,
+        device="cuda",
+    ):
         super().__init__()
         self.layers = torch.nn.ModuleList(
             [
                 te.pytorch.TransformerLayer(
-                    hidden_size=model_dim, ffn_hidden_size=model_dim, num_attention_heads=num_heads
+                    hidden_size=model_dim,
+                    ffn_hidden_size=model_dim,
+                    num_attention_heads=num_heads,
+                    fuse_qkv_params=fuse_qkv_params,
+                    params_dtype=params_dtype,
+                    device=device,
                 )
                 for _ in range(num_layers)
             ]
         )
-        self.fc_out = te.pytorch.Linear(model_dim, output_dim)
+        self.fc_out = te.pytorch.Linear(
+            model_dim, output_dim, params_dtype=params_dtype, device=device
+        )
 
     def forward(self, x):
         for layer in self.layers:
@@ -128,9 +159,9 @@ def forward(self, x):
         return x
 
 
-def build_toy_model_and_optimizer(model_type: str, init_model_with_meta_device: bool, seed=None):
+def build_toy_model(model_type: str, init_model_with_meta_device: bool, seed=None):
     """
-    Helper function to build a toy model and optimizer for testing Megatron-FSDP.
+    Helper function to build a toy model for testing Megatron-FSDP.
     """
     # Set the seed to make sure the same model is initialized on all ranks.
     if seed is not None:
@@ -156,13 +187,16 @@ def build_toy_model_and_optimizer(model_type: str, init_model_with_meta_device:
             fsdp_unit_modules = [torch.nn.Transformer]
         elif model_type == TE_TRANSFORMER:
             toy_model = ToyTETransformer(
-                model_dim=DIM_SIZE, num_heads=2, num_layers=NUM_LAYERS, output_dim=DIM_SIZE
+                model_dim=DIM_SIZE,
+                num_heads=2,
+                num_layers=NUM_LAYERS,
+                output_dim=DIM_SIZE,
+                device="meta" if init_model_with_meta_device else "cuda",
             )
             fsdp_unit_modules = [te.pytorch.TransformerLayer]
-        toy_adam = Adam(params=toy_model.parameters(), lr=0.01)
 
     # Return the toy model, optimizer, and FSDP unit modules.
-    return toy_model, toy_adam, fsdp_unit_modules
+    return toy_model, fsdp_unit_modules
 
 
 def build_distributed_environment(mesh_dim_config: tuple):
@@ -223,17 +257,24 @@ def teardown_class(cls):
             (2, 2, 1, 2),
         ],
     )
-    @pytest.mark.parametrize("preserve_fp32_weights", [True, False])
-    @pytest.mark.parametrize("init_model_with_meta_device", [True, False])
+    @pytest.mark.parametrize(
+        "common_args",
+        [
+            {
+                "preserve_fp32_weights": True,
+                "init_model_with_meta_device": True,
+                "torch_compile": True,
+            },
+            {
+                "preserve_fp32_weights": False,
+                "init_model_with_meta_device": False,
+                "torch_compile": False,
+            },
+        ],
+    )
     @pytest.mark.skip(reason="Skipping this test as it is not working.")
     def test_fully_shard(
-        self,
-        model_type,
-        dp_shard_strategy,
-        dp_outer_strategy,
-        mesh_dim_config,
-        preserve_fp32_weights,
-        init_model_with_meta_device,
+        self, model_type, dp_shard_strategy, dp_outer_strategy, mesh_dim_config, common_args
     ):
         """
         Test the fully_shard API with different configurations.
@@ -245,6 +286,10 @@ def test_fully_shard(
         """
         from megatron.core.distributed.fsdp.src.megatron_fsdp.fully_shard import fully_shard
 
+        preserve_fp32_weights = common_args["preserve_fp32_weights"]
+        init_model_with_meta_device = common_args["init_model_with_meta_device"]
+        torch_compile = common_args["torch_compile"]
+
         # Skip due to lack of functionality.
         if init_model_with_meta_device and dp_shard_strategy == NO_SHARD:
             pytest.skip(
@@ -253,7 +298,7 @@ def test_fully_shard(
             )
         elif dp_outer_strategy == OPTIM:
             if dp_shard_strategy != OPTIM_GRADS_PARAMS:
-                # FIXME(@shjwudp, @cspades): This is an unexpected lack of support.
+                # TODO(@shjwudp, @cspades): Requires various modifications to support.
                 # [default0]:FAILED tests/unit_tests/distributed/test_mfsdp_fully_shard.py
                 # [False-True-True-True-mesh_dim_config0-optim-optim-cnn]
                 # [False-True-True-True-mesh_dim_config0-optim-optim_grads-cnn]
@@ -266,9 +311,8 @@ def test_fully_shard(
         device_mesh = build_distributed_environment(mesh_dim_config)
 
         # Construct toy model.
-        toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-            model_type, init_model_with_meta_device
-        )
+        toy_model, fsdp_unit_modules = build_toy_model(model_type, init_model_with_meta_device)
+        toy_adam = Adam(params=toy_model.parameters(), lr=0.01)
 
         # Wrap in fully_shard.
         model, optimizer = fully_shard(
@@ -290,6 +334,7 @@ def test_fully_shard(
             grad_reduce_in_fp32=False,
             init_model_with_meta_device=init_model_with_meta_device,
         )
+        model = torch.compile(model) if torch_compile else model
 
         # Mock input and target.
         toy_input = torch.randn(1, DIM_SIZE, DIM_SIZE).to("cuda")
@@ -317,7 +362,7 @@ def test_fully_shard(
             # Validate gradients exist in the Torch Module, i.e. non-None and non-zero.
             grads_exist = any(
                 isinstance(p.grad, torch.Tensor) and p.grad.to_local().count_nonzero().item() > 0
-                for p in model.module.parameters()
+                for p in model.parameters()
             )
             sharding_group = (
                 device_mesh[HSDP].get_group()
@@ -397,9 +442,8 @@ def test_dcp_checkpoint_save_and_load(
         accuracy tests are non-trivial, i.e. don't just use the initialized weights.
         """
         # Test model.
-        toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-            model_type, False, seed=0
-        )
+        toy_model, fsdp_unit_modules = build_toy_model(model_type, False, seed=0)
+        toy_adam = Adam(params=toy_model.parameters(), lr=0.01)
 
         # Wrap in fully_shard.
         model, optimizer = fully_shard(
@@ -478,9 +522,8 @@ def test_dcp_checkpoint_save_and_load(
         """
         # Initialize a new model for checkpoint loading. Set a different seed to force a different model init,
         # to ensure the checkpoint loading is accurate and non-trivial.
-        toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-            model_type, False, seed=1
-        )
+        toy_model, fsdp_unit_modules = build_toy_model(model_type, False, seed=1)
+        toy_adam = Adam(params=toy_model.parameters(), lr=0.01)
 
         # Wrap in fully_shard.
         model, optimizer = fully_shard(
@@ -592,3 +635,143 @@ def test_dcp_checkpoint_save_and_load(
 
         # Destroy device mesh.
         destroy_device_mesh(device_mesh)
+
+    @pytest.mark.parametrize("shard_strategy", [OPTIM_GRADS_PARAMS, OPTIM_GRADS, OPTIM, NO_SHARD])
+    def test_fully_shard_ez(self, shard_strategy):
+        """
+        Test fully_shard(device_mesh=None). Represents the easiest entrypoint to Megatron-FSDP.
+        """
+        from megatron.core.distributed.fsdp.src.megatron_fsdp.fully_shard import (
+            fully_shard_model,
+            fully_shard_optimizer,
+        )
+
+        # Construct toy model.
+        toy_model, fsdp_unit_modules = build_toy_model(TRANSFORMER, False)
+
+        # Fully-shard the model.
+        mfsdp_model = fully_shard_model(
+            module=toy_model, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=shard_strategy
+        )
+
+        # Initialize the distributed optimizer on the MegatronFSDP model.
+        toy_adam = Adam(params=mfsdp_model.parameters(), lr=0.01)
+        optimizer = fully_shard_optimizer(optimizer=toy_adam)
+
+        # Mock input and target.
+        toy_input = torch.randn(1, DIM_SIZE, DIM_SIZE).to("cuda")
+        toy_target = torch.randn(1, DIM_SIZE, DIM_SIZE).to("cuda")
+
+        for step in range(NUM_STEPS):
+
+            # Forward pass.
+            output = mfsdp_model(toy_input, toy_input)
+
+            # Loss.
+            loss = mse_loss(output, toy_target)
+
+            # Backward pass.
+            loss.backward()
+
+            # Optimizer step.
+            optimizer.step()
+            optimizer.zero_grad()
+
+    @pytest.mark.parametrize("init_model_with_meta_device", [True, False])
+    @pytest.mark.parametrize(
+        "te_recipe",
+        [DELAYED_FP8_RECIPE, CURRENT_FP8_RECIPE, BLOCKWISE_FP8_RECIPE, MXFP8_BLOCKWISE_RECIPE],
+    )
+    def test_fully_shard_te_quantized(self, init_model_with_meta_device, te_recipe):
+        """
+        Test Megatron-FSDP with FP8 activations and parameters via TransformerEngine.
+        """
+        if te_recipe == MXFP8_BLOCKWISE_RECIPE:
+            # TODO(@cspades, @ko3n1g): Add this test case in.
+            pytest.skip(f"[Megatron CI/CD] MXFP8 requires Blackwell nodes to test.")
+
+        from megatron.core.distributed.fsdp.src.megatron_fsdp.fully_shard import (
+            fully_shard_model,
+            fully_shard_optimizer,
+        )
+
+        # Build FP8 recipe.
+        te_quant_recipe = None
+        if te_recipe == MXFP8_BLOCKWISE_RECIPE:
+            te_quant_recipe = te.common.recipe.MXFP8BlockScaling(
+                fp8_format=te.common.recipe.Format.HYBRID
+            )
+        elif te_recipe == DELAYED_FP8_RECIPE:
+            te_quant_recipe = te.common.recipe.DelayedScaling()
+        elif te_recipe == CURRENT_FP8_RECIPE:
+            te_quant_recipe = te.common.recipe.Float8CurrentScaling()
+        elif te_recipe == BLOCKWISE_FP8_RECIPE:
+            te_quant_recipe = te.common.recipe.Float8BlockScaling()
+
+        # Construct toy model compatible with FP8.
+        with (
+            te.pytorch.quantized_model_init(
+                recipe=te_quant_recipe,
+                # Needed for FP8 parameters with Megatron-FSDP.
+                preserve_high_precision_init_val=True,
+            )
+            if te_quant_recipe is not None
+            else nullcontext()
+        ):
+            # Fused QKV, BF16 precision for high-precision weights,
+            # and hidden dimension divisibility by 32 is required
+            # for some FP8 recipes such as MXFP8.
+            toy_model = ToyTETransformer(
+                model_dim=64,
+                num_heads=2,
+                num_layers=2,
+                output_dim=64,
+                fuse_qkv_params=True,
+                params_dtype=torch.bfloat16,
+                device="meta" if init_model_with_meta_device else "cuda",
+            )
+
+        # Fully-shard the model.
+        mfsdp_model = fully_shard_model(
+            module=toy_model,
+            fsdp_unit_modules=[te.pytorch.TransformerLayer, te.pytorch.Linear],
+            # Only ZeRO-3 / FSDP supports FP8 parameters.
+            zero_dp_strategy=3,
+            init_model_with_meta_device=init_model_with_meta_device,
+            # Required for FP8 parameter support, except for MXFP8 which has
+            # its own row-wise and col-wise (transpose) buffer management
+            # schedule that is natively managed by Megatron-FSDP.
+            keep_fp8_transpose_cache=True,
+            # Required for FP8 parameters. The optimizer state (and gradients)
+            # are never quantized, as TE produces high-precision wgrad and
+            # dgrad from FP8 weights and activations. Already defaults to True.
+            preserve_fp32_weights=True,
+        )
+
+        # Initialize the distributed optimizer on the MegatronFSDP model.
+        toy_adam = Adam(params=mfsdp_model.parameters(), lr=0.01)
+        optimizer = fully_shard_optimizer(optimizer=toy_adam)
+
+        # Mock input and target. Requires 2^N batch size for (MX)FP8 kernels.
+        toy_input = torch.randn(16, 64, 64, dtype=torch.bfloat16).to("cuda")
+        toy_target = torch.randn(16, 64, 64, dtype=torch.bfloat16).to("cuda")
+
+        for step in range(NUM_STEPS):
+
+            # Forward pass.
+            with (
+                te.pytorch.autocast(recipe=te_quant_recipe)
+                if te_quant_recipe is not None
+                else nullcontext()
+            ):
+                output = mfsdp_model(toy_input)
+
+            # Loss.
+            loss = mse_loss(output, toy_target)
+
+            # Backward pass.
+            loss.backward()
+
+            # Optimizer step.
+            optimizer.step()
+            optimizer.zero_grad()
diff --git a/tests/unit_tests/distributed/megatron_fsdp/utils.py b/tests/unit_tests/distributed/megatron_fsdp/utils.py
new file mode 100644
index 00000000000..18a2da63786
--- /dev/null
+++ b/tests/unit_tests/distributed/megatron_fsdp/utils.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+import sys
+from functools import partial
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.distributed import DistributedSampler
+
+from gpt_builders import gpt_builder
+from megatron.core.distributed import finalize_model_grads
+from megatron.core.enums import ModelType
+from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.utils import get_attr_wrapped_model
+from megatron.training.arguments import parse_args, validate_args
+from megatron.training.global_vars import destroy_global_vars, set_global_variables
+from megatron.training.training import setup_model_and_optimizer
+from megatron.training.utils import is_first_or_last_pipeline_stage
+from model_provider import model_provider
+
+
+def pretrain_forward_backward(
+    *, model, data_iterator, sequence_length=128, micro_batch_size=2, num_micro_batches=1
+):
+    forward_backward_func = get_forward_backward_func()
+    output = forward_backward_func(
+        forward_step_func=_forward_step_func,
+        data_iterator=data_iterator,
+        model=model,
+        num_microbatches=num_micro_batches,
+        seq_length=sequence_length,
+        micro_batch_size=micro_batch_size,
+        forward_only=False,
+    )
+    return output
+
+
+def make_gpt_mock_data_iterator(
+    dp_group, num_samples=1000, vocab_size=50257, sequence_length=128, batch_size=8, seed=42
+):
+    dataset = GPTMockDataset(
+        num_samples=num_samples, sequence_length=sequence_length, vocab_size=vocab_size, seed=seed
+    )
+    sampler = DistributedSampler(dataset, num_replicas=dp_group.size(), rank=dp_group.rank())
+    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
+    for batch in dataloader:
+        batch["position_ids"] = torch.arange(sequence_length, dtype=torch.int64)
+        yield batch
+
+
+def make_moe_args_model_and_optimizer(ut_filename, **overrides):
+    sys.argv = [ut_filename]
+    base_args = dict(
+        num_layers=4,
+        mtp_num_layers=1,
+        hidden_size=128,
+        num_attention_heads=2,
+        max_position_embeddings=128,
+        bf16=False,
+        add_bias_linear=False,
+        swiglu=True,
+        position_embedding_type="rope",
+        rotary_percent=1.0,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        num_experts=4,
+        moe_shared_expert_intermediate_size=256,
+        moe_layer_freq=[0, 0, 1, 1],
+        moe_permute_fusion=True,
+        moe_router_fusion=True,
+        moe_router_topk=2,
+        moe_router_dtype="fp32",
+        create_attention_mask_in_dataloader=True,
+        lr=3e-5,
+        min_lr=3e-5,
+        use_distributed_optimizer=True,
+        finalize_model_grads_func=finalize_model_grads,
+    )
+
+    base_args.update(overrides)
+    args = parse_args()
+    for key, value in base_args.items():
+        setattr(args, key, value)
+
+    validate_args(args)
+
+    destroy_global_vars()
+    destroy_num_microbatches_calculator()
+    set_global_variables(args, build_tokenizer=False)
+
+    model, optimizer, _ = setup_model_and_optimizer(
+        model_provider_func=partial(model_provider, gpt_builder),
+        model_type=ModelType.encoder_or_decoder,
+    )
+    return model, optimizer
+
+
+def set_manual_seed(seed=42):
+    torch.manual_seed(seed)
+    model_parallel_cuda_manual_seed(seed)
+
+
+class GPTMockDataset(Dataset):
+    """
+    Mock dataset for torchtitan GPT training tests
+    Generates synthetic tokenized sequences on-the-fly
+    """
+
+    def __init__(
+        self,
+        num_samples=10000,
+        micro_batch_size=1,
+        sequence_length=2048,
+        vocab_size=128256,
+        seed=42,
+    ):
+        """
+        Initialize mock dataset
+
+        Args:
+            num_samples: Total number of samples
+            sequence_length: Length of each sequence
+            vocab_size: Size of vocabulary
+            seed: Random seed for reproducibility
+        """
+        self.num_samples = num_samples
+        self.micro_batch_size = micro_batch_size
+        self.sequence_length = sequence_length
+        self.vocab_size = vocab_size
+        self.seed = seed
+
+        # Set numpy seed for deterministic generation
+        np.random.seed(seed)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """
+        Generate a single training sample
+
+        Returns:
+            dict with 'tokens' and 'labels'
+        """
+        # Use idx as seed for reproducible but varied samples
+        rng = np.random.RandomState(self.seed + idx)
+
+        # Generate random token sequence
+        tokens = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64)
+
+        # Labels are tokens shifted by 1 (next token prediction)
+        labels = 1 + tokens
+
+        return {
+            'tokens': torch.from_numpy(tokens.copy()),
+            'labels': torch.from_numpy(labels.copy()),
+            "attention_mask": torch.ones(
+                (1, self.sequence_length, self.sequence_length), dtype=bool
+            ),
+            "loss_mask": torch.ones(self.sequence_length),
+        }
+
+
+def _forward_step_func(data_iterator, model, device="cuda"):
+
+    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
+
+        losses = output_tensor.float()
+        loss_mask = loss_mask.view(-1).float()
+        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
+        # If you have data parallel reduce loss across data parallel groups.
+        # If pipeline parallel, loss computation is done only in last stage.
+
+        return loss, {'lm loss': loss}
+
+    vp_stage = get_attr_wrapped_model(model, "vp_stage")
+
+    if not is_first_or_last_pipeline_stage(vp_stage):
+        tokens, labels, loss_mask, attention_mask, position_ids = None, None, None, None, None
+    else:
+        data = next(data_iterator)
+        tokens = data["tokens"].to(device, non_blocking=True)
+        labels = data["labels"].to(device, non_blocking=True)
+        loss_mask = data["loss_mask"].to(device, non_blocking=True)
+        attention_mask = (
+            None
+            if "attention_mask" not in data
+            else data["attention_mask"].to(device, non_blocking=True)
+        )
+        position_ids = data["position_ids"].to(device, non_blocking=True)
+
+    output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
+
+    return output_tensor, partial(loss_func, loss_mask)
diff --git a/tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py b/tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py
index 71e45f9d92e..e83f7142284 100644
--- a/tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py
+++ b/tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
 import contextlib
 from typing import Optional
 
@@ -169,15 +171,20 @@ def test_grad_sync(
         )
         != 0
     ):
-        # With above conditions, the data in param_and_grad_buffer.grad_data[0] equals to 1/data_parallel_word_size
-        # When average_in_collective=False, the grad data is always first scaled by 1/data_parallel_word_size and then summed by AR/RS
-        # when use_distributed_optimizer=True, only for rank=0 param_and_grad_buffer.grad_data[0] is updated, for other ranks
-        # another shard of grad_data is updated while param_and_grad_buffer.grad_data[0] is unchanged (=1/data_parallel_word_size)
+        # With above conditions, the data in param_and_grad_buffer.grad_data[0] equals
+        # 1/data_parallel_word_size.
+        # When average_in_collective=False, the grad data is always first scaled by
+        # 1/data_parallel_word_size and then summed by AR/RS.
+        # When use_distributed_optimizer=True, only for rank=0,
+        # param_and_grad_buffer.grad_data[0] is updated. For other ranks another shard of
+        # grad_data is updated while param_and_grad_buffer.grad_data[0] is unchanged
+        # (=1/data_parallel_word_size).
         non_ep_expected_grad_data_value_after_collective /= (
             parallel_state.get_data_parallel_world_size()
         )
     if ep_size > 1:
-        # For MoE models with exper parallelism, each expert will receive tokens from EPxETP times batches, such that the expert gradient will be EPxETP times after backward,
+        # For MoE models with exper parallelism, each expert will receive tokens from EPxETP
+        # times batches, such that the expert gradient will be EPxETP times after backward,
         # and the expected gradient after collective should be 1.0 as same as dense params.
         ep_param_and_grad_buffer.grad_data.data.fill_(float(ep_size * etp_size))
         ep_expected_grad_data_value_after_collective = 1
@@ -186,14 +193,30 @@ def test_grad_sync(
             and (not average_in_collective)
             and parallel_state.get_expert_data_parallel_rank(partial_expert_data_parallel=True) != 0
         ):
-            # With above conditions, the data in param_and_grad_buffer.grad_data[0] equals to 1/EDP
-            # When average_in_collective=False, the grad data is always first scaled by expert_data_parallel_size and then summed by AR/RS
-            # after SUM collective in expert_data_group, the scale will be 1.0.
+            # With above conditions, the data in param_and_grad_buffer.grad_data[0] equals 1/EDP.
+            # When average_in_collective=False, the grad data is always first scaled by
+            # expert_data_parallel_size and then summed by AR/RS.
+            # After SUM collective in expert_data_group, the scale will be 1.0.
             ep_expected_grad_data_value_after_collective /= (
                 parallel_state.get_expert_data_parallel_world_size()
             )
 
+    register_grad_sync_context = (
+        contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
+    )
+
+    # Call register_grad_ready for all params before starting test to seed tracking
+    # data structures.
     params = list(model.parameters())
+    for param in params:
+        with register_grad_sync_context:
+            bucket_group = param_to_bucket_group[param]
+            bucket_group.register_grad_ready(param)
+    # Call reset to set .is_first_batch to False.
+    for param in params:
+        bucket_group = param_to_bucket_group[param]
+        bucket_group.reset()
+
     map_bucket_to_last_param_idx = {}
     for i, param in enumerate(params):
         if not (param in param_to_bucket_group):
@@ -206,9 +229,6 @@ def test_grad_sync(
             param_idx = 0
         map_bucket_to_last_param_idx[bucket_group] = param_idx
 
-        register_grad_sync_context = (
-            contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
-        )
         finish_grad_sync_context = contextlib.nullcontext()
         if (
             param_idx < (len(bucket_group.params) - 1)
@@ -220,6 +240,7 @@ def test_grad_sync(
 
         with register_grad_sync_context:
             bucket_group.register_grad_ready(param)
+
         with finish_grad_sync_context:
             # When overlap_grad_reduce is True, this should throw an assertion error until all
             # params in the model have registered their grad above.
diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
index c09e2313d8d..609b2cc5a71 100644
--- a/tests/unit_tests/distributed/test_param_and_grad_buffer.py
+++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
@@ -1,6 +1,9 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
 import contextlib
 import math
 from typing import Optional
+from unittest import mock
 
 import pytest
 import torch
@@ -164,7 +167,6 @@ def _pad_param_if_needed(numel_unpadded):
 @pytest.mark.parametrize("overlap_grad_reduce", [False, True])
 @pytest.mark.parametrize("average_in_collective", [False, True])
 @pytest.mark.parametrize("num_distributed_optimizer_instances", [1, 2])
-# @pytest.mark.flaky
 def test_grad_sync(
     use_distributed_optimizer: bool,
     overlap_grad_reduce: bool,
@@ -201,10 +203,12 @@ def test_grad_sync(
 
     param_and_grad_buffer.grad_data.data.fill_(1.0)
     expected_grad_data_value_after_collective = 1
-    # under the following conditions, the data in param_and_grad_buffer.grad_data[0] equals to 1/DP
-    # this is because when average_in_collective=False, the grad data is always first scaled by 1/DP and then summed by AR/RS
-    # and when use_distributed_optimizer=True, only for rank=0 param_and_grad_buffer.grad_data[0] is updated, for other ranks
-    # another shard of grad_data is updated while param_and_grad_buffer.grad_data[0] is unchanged (=1/DP)
+    # Data in param_and_grad_buffer.grad_data[0] is 1/DP.
+    # When average_in_collective=False, the grad data is always first scaled by 1/DP and then
+    # summed by AR/RS.
+    # When use_distributed_optimizer=True, only rank0's param_and_grad_buffer.grad_data[0] is
+    # updated; other ranks update another shard of grad_data while keeping
+    # param_and_grad_buffer.grad_data[0] unchanged (=1/DP).
     if (
         use_distributed_optimizer
         and (not average_in_collective)
@@ -215,13 +219,25 @@ def test_grad_sync(
     ):
         expected_grad_data_value_after_collective /= parallel_state.get_data_parallel_world_size()
 
+    register_grad_sync_context = (
+        contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
+    )
+
+    # Call register_grad_ready for all params before starting test to seed tracking
+    # data structures.
     params = list(model.parameters())
+    for param in params:
+        with register_grad_sync_context:
+            bucket_group = param_to_bucket_group[param]
+            bucket_group.register_grad_ready(param)
+    # Call reset to set .is_first_batch to False.
+    for param in params:
+        bucket_group = param_to_bucket_group[param]
+        bucket_group.reset()
+
     for i, param in enumerate(params):
         assert param in param_to_bucket_group
         bucket_group = param_to_bucket_group[param]
-        register_grad_sync_context = (
-            contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
-        )
         finish_grad_sync_context = contextlib.nullcontext()
         if (
             i < (len(params) - 1)
@@ -233,6 +249,7 @@ def test_grad_sync(
 
         with register_grad_sync_context:
             bucket_group.register_grad_ready(param)
+
         with finish_grad_sync_context:
             # When overlap_grad_reduce is True, this should throw an assertion error until all
             # params in the model have registered their grad above.
@@ -249,3 +266,57 @@ def test_grad_sync(
             param_and_grad_buffer.grad_data.data.fill_(1.0)
 
     Utils.destroy_model_parallel()
+
+
+@pytest.mark.parametrize("force_all_reduce", [False, True])
+def test_force_all_reduce_uses_correct_collective(force_all_reduce: bool):
+    """Test that force_all_reduce=True causes all-reduce to be used instead of reduce-scatter."""
+    Utils.initialize_model_parallel()
+
+    input_dim = 100
+    output_dim = 100
+    num_layers = 2
+    model, param_and_grad_buffer, _ = get_model_and_buffers(
+        input_dim=input_dim,
+        output_dim=output_dim,
+        num_layers=num_layers,
+        bias=True,
+        shared_embedding=False,
+        bucket_size=None,
+        use_distributed_optimizer=True,  # This normally uses reduce-scatter.
+        overlap_grad_reduce=False,
+        average_in_collective=False,
+    )
+
+    # Mock the collective operations to track which one is called.
+    with (
+        mock.patch('torch.distributed.all_reduce') as mock_all_reduce,
+        mock.patch(
+            'megatron.core.distributed.param_and_grad_buffer.dist_reduce_scatter_func'
+        ) as mock_reduce_scatter,
+    ):
+        # Set up the mocks to be no-ops.
+        mock_all_reduce.return_value = None
+        mock_reduce_scatter.return_value = None
+
+        # Trigger the grad sync via the DDP model's finish_grad_sync method.
+        model.finish_grad_sync(force_all_reduce=force_all_reduce)
+
+        if force_all_reduce:
+            # When force_all_reduce=True, all_reduce should be called.
+            assert (
+                mock_all_reduce.called
+            ), "Expected all_reduce to be called when force_all_reduce=True"
+            assert (
+                not mock_reduce_scatter.called
+            ), "Expected reduce_scatter NOT to be called when force_all_reduce=True"
+        else:
+            # When force_all_reduce=False with distributed optimizer, reduce_scatter should be called.
+            assert (
+                mock_reduce_scatter.called
+            ), "Expected reduce_scatter to be called when force_all_reduce=False"
+            assert (
+                not mock_all_reduce.called
+            ), "Expected all_reduce NOT to be called when force_all_reduce=False"
+
+    Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/extension/test_kitchen_sdpa.py b/tests/unit_tests/extension/test_kitchen_sdpa.py
new file mode 100644
index 00000000000..6875b005c0b
--- /dev/null
+++ b/tests/unit_tests/extension/test_kitchen_sdpa.py
@@ -0,0 +1,486 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import os
+import queue
+from typing import Literal, Tuple
+
+import pytest
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.extensions.transformer_engine import TEDotProductAttention
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.quantization.quant_config import RecipeConfig
+from megatron.core.quantization.utils import get_quant_config_or_none
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.dot_product_attention import DotProductAttention
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.transformer_config import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from megatron.core.extensions.kitchen import (
+        HAVE_KITCHEN,
+        KitchenDotProductAttention,
+        KitchenFlashAttention,
+    )
+
+except ImportError:
+    from unittest.mock import MagicMock
+
+    HAVE_KITCHEN = False
+    KitchenDotProductAttention = MagicMock()
+    KitchenFlashAttention = MagicMock()
+
+try:
+    import transformer_engine  # type: ignore[import-untyped]
+    from transformer_engine.pytorch.attention import (  # type: ignore[import-untyped]
+        dot_product_attention,
+    )
+
+    HAVE_TE = True
+except ImportError:
+    from unittest.mock import MagicMock
+
+    HAVE_TE = False
+    transformer_engine = MagicMock()
+    dot_product_attention = MagicMock()
+
+
+# Create custom process groups
+Utils.initialize_model_parallel(tensor_model_parallel_size=1, context_parallel_size=1)
+model_parallel_cuda_manual_seed(123)
+
+# Get TP and CP process groups from device mesh
+tp_group = parallel_state.get_tensor_model_parallel_group()
+cp_group = parallel_state.get_context_parallel_group()
+
+pg_collection = ProcessGroupCollection(tp=tp_group, cp=cp_group)
+
+
+def get_attention_implementation(
+    impl: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+    config: TransformerConfig,
+    layer_number: int,
+    attn_mask_type: AttnMaskType,
+    attention_type: str,
+    attention_dropout: float,
+    softmax_scale: float,
+    cp_comm_type: str = "a2a",
+) -> MegatronModule:
+    if impl == "megatron":
+        return DotProductAttention(
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+            cp_comm_type,
+            pg_collection,
+        )
+    elif impl == "te-fa" or impl == "te-unfused":
+        if attention_type == "self_attention":
+            attention_type = "self"
+        return TEDotProductAttention(
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+            cp_comm_type=cp_comm_type,
+            pg_collection=pg_collection,
+        )
+    elif impl == "kitchen":
+        attn = KitchenDotProductAttention(
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+            cp_comm_type,
+            pg_collection,
+        )
+        attn.finish_init(
+            get_quant_config_or_none("self_attention.core_attention", config.quant_recipe)
+        )
+        return attn
+    elif impl == "kitchen-fa":
+        if attention_type == "self_attention":
+            attention_type = "self"
+        attn = KitchenFlashAttention(
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+            cp_comm_type,
+            pg_collection,
+        )
+        attn.finish_init(
+            get_quant_config_or_none("self_attention.core_attention", config.quant_recipe)
+        )
+        return attn
+    else:
+        raise ValueError(f"Invalid implementation: {impl}")
+
+
+class DotProductAttentionModel(torch.nn.Module):
+    def __init__(
+        self,
+        impl: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: float,
+        softmax_scale: float,
+    ):
+        super().__init__()
+        self.impl = impl
+        self.attention_module = get_attention_implementation(
+            impl,
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+        )
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: torch.Tensor,
+        attn_mask_type: AttnMaskType,
+    ):
+        return self.attention_module(query, key, value, attention_mask, attn_mask_type)
+
+    @property
+    def last_attention_probs(self):
+        return self.attention_module._last_attention_probs
+
+
+class CompareImplementations:
+
+    def _prepare_data(
+        self, config: TransformerConfig, seed: int, dtype: torch.dtype = torch.bfloat16
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+
+        b = 4
+        # np = number of attention heads per partition
+        np = config.num_attention_heads // config.tensor_model_parallel_size
+        # hn = hidden size per attention head (same as kv_channels)
+        hn = config.hidden_size // config.num_attention_heads
+        # sk = number of key tokens
+        sk = 256
+        # sq = number of query tokens
+        sq = 256
+
+        # bshd layout
+
+        shape = (sq, b, np, hn)
+
+        q = torch.randn(shape, dtype=dtype, device="cuda", requires_grad=True)
+        k = torch.randn(shape, dtype=dtype, device="cuda", requires_grad=True)
+        v = torch.randn(shape, dtype=dtype, device="cuda", requires_grad=True)
+
+        grad = torch.randn((sq, b, np * hn), dtype=dtype, device="cuda")
+        return q, k, v, grad
+
+    def run_attention_one_step(
+        self,
+        layer: DotProductAttentionModel,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        gradient: torch.Tensor,
+        attn_mask_type: AttnMaskType,
+    ):
+        layer.zero_grad()
+        query.grad = None
+        key.grad = None
+        value.grad = None
+
+        attention_mask = None
+
+        out = layer(query, key, value, attention_mask, attn_mask_type)
+
+        out.backward(gradient)
+
+        qgrad = query.grad
+        kgrad = key.grad
+        vgrad = value.grad
+
+        return out, qgrad, kgrad, vgrad  # , layer.last_attention_probs
+
+    def compare_implementations(
+        self,
+        impl1: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+        impl2: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: float,
+        softmax_scale: float,
+        out_error: float,
+        q_grad_error: float,
+        k_grad_error: float,
+        v_grad_error: float,
+        seed: int = 0,
+    ) -> None:
+        os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0"
+        if impl1 == "te-fa" or impl2 == "te-fa":
+            dot_product_attention._attention_backends = {
+                "attention_params": None,
+                "use_flash_attention": None,
+                "flash_attention_backend": None,
+                "use_fused_attention": None,
+                "fused_attention_backend": None,
+                "use_unfused_attention": None,
+                "backend_selection_requires_update": False,
+            }
+            os.environ["NVTE_FLASH_ATTN"] = "1"
+        elif impl1 == "te-unfused" or impl2 == "te-unfused":
+            dot_product_attention._attention_backends = {
+                "attention_params": None,
+                "use_flash_attention": None,
+                "flash_attention_backend": None,
+                "use_fused_attention": None,
+                "fused_attention_backend": None,
+                "use_unfused_attention": None,
+                "backend_selection_requires_update": False,
+            }
+            os.environ["NVTE_FUSED_ATTN"] = "0"
+            os.environ["NVTE_FLASH_ATTN"] = "0"
+
+        # qkv are (sq, b, np, hn)
+        # grad is (sq, b, np * hn)
+        q, k, v, grad = self._prepare_data(config, seed)
+        layer1 = DotProductAttentionModel(
+            impl1,
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+        )
+        layer2 = DotProductAttentionModel(
+            impl2,
+            config,
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            softmax_scale,
+        )
+
+        query_layer, key_layer, value_layer = q, k, v
+
+        out1, q_grad1, k_grad1, v_grad1 = self.run_attention_one_step(
+            layer1,
+            query_layer.clone().detach().requires_grad_(True),
+            key_layer.clone().detach().requires_grad_(True),
+            value_layer.clone().detach().requires_grad_(True),
+            grad.clone().detach().requires_grad_(True),
+            attn_mask_type,
+        )
+        out2, q_grad2, k_grad2, v_grad2 = self.run_attention_one_step(
+            layer2,
+            query_layer.clone().detach().requires_grad_(True),
+            key_layer.clone().detach().requires_grad_(True),
+            value_layer.clone().detach().requires_grad_(True),
+            grad.clone().detach().requires_grad_(True),
+            attn_mask_type,
+        )
+
+        torch.testing.assert_close(out1, out2, atol=out_error, rtol=0.0)
+        torch.testing.assert_close(q_grad1, q_grad2, atol=q_grad_error, rtol=0.0)
+        torch.testing.assert_close(k_grad1, k_grad2, atol=k_grad_error, rtol=0.0)
+        torch.testing.assert_close(v_grad1, v_grad2, atol=v_grad_error, rtol=0.0)
+
+
+@pytest.mark.skipif(
+    not HAVE_KITCHEN or not HAVE_TE,
+    reason="Kitchen and Transformer Engine required for using kitchen backend.",
+)
+@pytest.mark.parametrize(
+    "impl1, impl2, errors",
+    [
+        ("megatron", "kitchen", (0.0, 0.0625, 0.125, 0.0625)),
+        ("kitchen", "te-fa", (0.0625, 0.1875, 0.125, 0.05)),
+        ("kitchen", "te-unfused", (0.05, 0.1875, 0.09375, 0.05)),
+    ],
+)
+def test_attention_implementations(
+    impl1: Literal["megatron", "te-fa", "te-unfused", "kitchen"],
+    impl2: Literal["megatron", "te-fa", "te-unfused", "kitchen"],
+    errors: Tuple[float, float, float, float],
+) -> None:
+    out_error, q_grad_error, k_grad_error, v_grad_error = errors
+
+    config = TransformerConfig(
+        num_layers=2,
+        hidden_size=1024,
+        num_attention_heads=8,
+        use_cpu_initialization=False,
+        gated_linear_unit=True,
+        bias_activation_fusion=True,
+        add_bias_linear=False,
+        use_kitchen=True,
+        use_kitchen_attention=True,
+        tensor_model_parallel_size=1,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        deterministic_mode=True,
+        quant_recipe=RecipeConfig.from_config_dict(
+            {
+                "matchers": {
+                    "attention": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*self_attention.core_attention",
+                        "config": "bf16_attn",
+                    },
+                    "attention_fa": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*self_attention.core_attention",
+                        "config": "bf16_fa",
+                    },
+                    "keep_in_hp": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*fc2",
+                        "config": "bf16",
+                    },
+                    "use_fp8_cs": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*",
+                        "config": "fp8_cs",
+                    },
+                },
+                "configs": {
+                    "bf16": {"kitchen_config_type": "QLinearParams", "recipe_idx": 1},
+                    "fp8_cs": {"kitchen_config_type": "QLinearParams", "recipe_idx": 2},
+                    "bf16_attn": {"kitchen_config_type": "QAttentionParams", "recipe_idx": 1},
+                    "bf16_fa": {
+                        "kitchen_config_type": "QFlashAttentionParams",
+                        "recipe_name": "triton_fa_bf16_for_all_natural",
+                    },
+                },
+            }
+        ),
+    )
+
+    CompareImplementations().compare_implementations(
+        impl1=impl1,
+        impl2=impl2,
+        config=config,
+        layer_number=1,
+        attn_mask_type=AttnMaskType.causal,
+        attention_type="self_attention",
+        attention_dropout=0.0,
+        softmax_scale=0.23,
+        seed=0,
+        out_error=out_error,
+        q_grad_error=q_grad_error,
+        k_grad_error=k_grad_error,
+        v_grad_error=v_grad_error,
+    )
+
+
+@pytest.mark.skipif(
+    not HAVE_KITCHEN or not HAVE_TE,
+    reason="Kitchen and Transformer Engine required for using kitchen backend.",
+)
+@pytest.mark.parametrize(
+    "impl1, impl2, errors",
+    [
+        ("kitchen-fa", "te-fa", (0.016, 0.07, 0.04, 0.01)),
+        ("kitchen-fa", "kitchen", (0.125, 0.25, 0.25, 0.125)),
+    ],
+)
+def test_kitchen_flash_attention_implementations(
+    impl1: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+    impl2: Literal["megatron", "te-fa", "te-unfused", "kitchen", "kitchen-fa"],
+    errors: Tuple[float, float, float, float],
+) -> None:
+    """Test KitchenFlashAttention against other implementations."""
+    out_error, q_grad_error, k_grad_error, v_grad_error = errors
+
+    config = TransformerConfig(
+        num_layers=2,
+        hidden_size=1024,
+        num_attention_heads=8,
+        use_cpu_initialization=False,
+        gated_linear_unit=True,
+        bias_activation_fusion=True,
+        add_bias_linear=False,
+        use_kitchen=True,
+        use_kitchen_attention=True,
+        kitchen_attention_backend="fa",
+        tensor_model_parallel_size=1,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        deterministic_mode=True,
+        quant_recipe=RecipeConfig.from_config_dict(
+            {
+                "matchers": {
+                    "attention": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*self_attention.core_attention",
+                        "config": "bf16_fa",
+                    },
+                    "keep_in_hp": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*fc2",
+                        "config": "bf16",
+                    },
+                    "use_fp8_cs": {
+                        "type": "glob",
+                        "enabled": True,
+                        "pattern": "*",
+                        "config": "fp8_cs",
+                    },
+                },
+                "configs": {
+                    "bf16": {"kitchen_config_type": "QLinearParams", "recipe_idx": 1},
+                    "fp8_cs": {"kitchen_config_type": "QLinearParams", "recipe_idx": 2},
+                    "bf16_fa": {
+                        "kitchen_config_type": "QFlashAttentionParams",
+                        "recipe_name": "triton_fa_bf16_for_all_natural",
+                    },
+                },
+            }
+        ),
+    )
+
+    CompareImplementations().compare_implementations(
+        impl1=impl1,
+        impl2=impl2,
+        config=config,
+        layer_number=1,
+        attn_mask_type=AttnMaskType.causal,
+        attention_type="self_attention",
+        attention_dropout=0.0,
+        softmax_scale=0.23,
+        seed=0,
+        out_error=out_error,
+        q_grad_error=q_grad_error,
+        k_grad_error=k_grad_error,
+        v_grad_error=v_grad_error,
+    )
diff --git a/tests/unit_tests/find_test_cases.py b/tests/unit_tests/find_test_cases.py
index 2e9f5515b7d..1445206cab5 100644
--- a/tests/unit_tests/find_test_cases.py
+++ b/tests/unit_tests/find_test_cases.py
@@ -50,7 +50,8 @@ def expand_pattern(pattern):
 
 def main():
     BUCKET = sys.argv[1]
-    YAML_FILE = 'tests/test_utils/recipes/unit-tests.yaml'
+    GPU_TYPE = sys.argv[2]
+    YAML_FILE = f'tests/test_utils/recipes/{GPU_TYPE}/unit-tests.yaml'
 
     all_test_cases = get_test_cases(YAML_FILE)
     bucket_files = set(expand_pattern(BUCKET))
diff --git a/tests/unit_tests/inference/contexts/attention_metadata/test_tensor_ops.py b/tests/unit_tests/inference/contexts/attention_metadata/test_tensor_ops.py
new file mode 100644
index 00000000000..a44f0c0d155
--- /dev/null
+++ b/tests/unit_tests/inference/contexts/attention_metadata/test_tensor_ops.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import pytest
+import torch
+
+from megatron.core.inference.contexts.attention_context.triton.tensor_ops import (
+    tensor_get_slice_after,
+    tensor_masked_update,
+    tensor_merge,
+)
+
+
+def tensor_get_slice_after_pytorch(
+    input_tensor: torch.Tensor, output_tensor: torch.Tensor, pos_on_device: torch.Tensor
+) -> None:
+    """Reference PyTorch implementation of tensor_get_slice_after."""
+
+    assert input_tensor.ndim == output_tensor.ndim, "Rank mismatch"
+    for i in range(1, input_tensor.ndim):
+        assert input_tensor.shape[i] == output_tensor.shape[i], f"Dimension {i} must match"
+
+    pos = pos_on_device[0].item()
+    assert 0 <= pos <= input_tensor.shape[0]
+
+    copy_size = min(input_tensor.shape[0] - pos, output_tensor.shape[0])
+    if copy_size > 0:
+        output_tensor[:copy_size].copy_(input_tensor[pos : pos + copy_size])
+
+
+def tensor_merge_pytorch(
+    tensor_a: torch.Tensor,
+    tensor_b: torch.Tensor,
+    output_tensor: torch.Tensor,
+    pos_on_device: torch.Tensor,
+) -> None:
+    """Reference PyTorch implementation of tensor_merge."""
+
+    assert tensor_a.ndim == tensor_b.ndim == output_tensor.ndim, "Rank mismatch across tensors"
+    for i in range(1, tensor_a.ndim):
+        assert (
+            tensor_a.shape[i] == tensor_b.shape[i] == output_tensor.shape[i]
+        ), f"Dimension {i} must match"
+
+    pos = pos_on_device[0].item()
+    assert 0 <= pos <= tensor_a.shape[0]
+    assert output_tensor.shape[0] >= tensor_a.shape[0]
+
+    if pos > 0:
+        output_tensor[:pos].copy_(tensor_a[:pos])
+
+    copy_size = min(tensor_b.shape[0], output_tensor.shape[0] - pos)
+    if copy_size > 0:
+        output_tensor[pos : pos + copy_size].copy_(tensor_b[:copy_size])
+
+
+@pytest.fixture
+def device():
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+    return torch.device("cuda")
+
+
+@pytest.fixture
+def slice_params():
+    return {"input_batch": 16, "output_batch": 20, "feature_dim": 256}
+
+
+def test_get_slice_after_basic(device, slice_params):
+    params = slice_params
+    input_tensor = torch.randn(params["input_batch"], params["feature_dim"], device=device)
+    pos_on_device = torch.tensor([5], device=device)
+
+    output_ref = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+    output_triton = torch.zeros_like(output_ref)
+    output_ref[15:] = 123.0
+    output_triton[15:] = 123.0
+
+    tensor_get_slice_after_pytorch(input_tensor, output_ref, pos_on_device)
+    tensor_get_slice_after(input_tensor, output_triton, pos_on_device, check_bounds=True)
+
+    assert torch.equal(output_ref, output_triton)
+    assert torch.equal(
+        output_triton[: params["input_batch"] - pos_on_device[0].item()],
+        input_tensor[pos_on_device[0].item() :],
+    )
+
+
+def test_get_slice_after_pos_zero(device, slice_params):
+    params = slice_params
+    input_tensor = torch.randn(params["input_batch"], params["feature_dim"], device=device)
+    output_tensor = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+
+    tensor_get_slice_after(
+        input_tensor, output_tensor, torch.tensor([0], device=device), check_bounds=True
+    )
+
+    copy_size = min(params["input_batch"], params["output_batch"])
+    assert torch.equal(output_tensor[:copy_size], input_tensor[:copy_size])
+
+
+def test_get_slice_after_pos_full(device, slice_params):
+    params = slice_params
+    input_tensor = torch.randn(params["input_batch"], params["feature_dim"], device=device)
+    output_tensor = torch.ones(params["output_batch"], params["feature_dim"], device=device)
+    original = output_tensor.clone()
+
+    tensor_get_slice_after(
+        input_tensor,
+        output_tensor,
+        torch.tensor([params["input_batch"]], device=device),
+        check_bounds=True,
+    )
+
+    assert torch.equal(output_tensor, original)
+
+
+def test_get_slice_after_exact_fit(device):
+    input_tensor = torch.randn(8, 256, device=device)
+    output_tensor = torch.zeros(5, 256, device=device)
+
+    tensor_get_slice_after(input_tensor, output_tensor, torch.tensor([3], device=device))
+
+    assert torch.equal(output_tensor, input_tensor[3:8])
+
+
+def test_get_slice_after_nd(device):
+    input_tensor = torch.randn(6, 4, 8, device=device)
+    output_tensor = torch.zeros(10, 4, 8, device=device)
+
+    tensor_get_slice_after(
+        input_tensor, output_tensor, torch.tensor([1], device=device), check_bounds=True
+    )
+
+    assert torch.equal(output_tensor[:5], input_tensor[1:6])
+
+
+def test_get_slice_after_bounds(device, slice_params):
+    params = slice_params
+    input_tensor = torch.randn(params["input_batch"], params["feature_dim"], device=device)
+    output_tensor = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+
+    with pytest.raises(AssertionError):
+        tensor_get_slice_after(
+            input_tensor,
+            output_tensor,
+            torch.tensor([params["input_batch"] + 1], device=device),
+            check_bounds=True,
+        )
+
+
+def test_get_slice_after_consistency(device):
+    input_tensor = torch.randn(32, 128, device=device)
+    output_ref = torch.zeros(16, 128, device=device)
+    output_triton = torch.zeros_like(output_ref)
+    pos_on_device = torch.tensor([8], device=device)
+
+    tensor_get_slice_after_pytorch(input_tensor, output_ref, pos_on_device)
+    tensor_get_slice_after(input_tensor, output_triton, pos_on_device)
+
+    assert torch.equal(output_ref, output_triton)
+
+
+@pytest.fixture
+def merge_params():
+    return {"tensor_a_batch": 8, "tensor_b_batch": 12, "output_batch": 32, "feature_dim": 256}
+
+
+@pytest.mark.parametrize("in_place", [False, True])
+def test_tensor_merge_basic(device, merge_params, in_place):
+    params = merge_params
+    pos_val = 5
+    pos_on_device = torch.tensor([pos_val], device=device)
+
+    tensor_b = torch.randn(params["tensor_b_batch"], params["feature_dim"], device=device)
+
+    if in_place:
+        tensor_a = torch.randn(params["output_batch"], params["feature_dim"], device=device)
+        output_triton = tensor_a.clone()
+
+        output_ref = tensor_a.clone()
+        tensor_merge_pytorch(tensor_a, tensor_b, output_ref, pos_on_device)
+        tensor_merge(output_triton, tensor_b, pos_on_device, output_tensor=None, check_bounds=True)
+    else:
+        tensor_a = torch.randn(params["tensor_a_batch"], params["feature_dim"], device=device)
+        output_ref = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+        output_triton = torch.zeros_like(output_ref)
+
+        tensor_merge_pytorch(tensor_a, tensor_b, output_ref, pos_on_device)
+        tensor_merge(
+            tensor_a, tensor_b, pos_on_device, output_tensor=output_triton, check_bounds=True
+        )
+
+    assert torch.equal(output_ref, output_triton)
+    assert torch.equal(output_triton[:pos_val], tensor_a[:pos_val])
+    assert torch.equal(output_triton[pos_val : pos_val + params["tensor_b_batch"]], tensor_b)
+
+
+def test_tensor_merge_pos_zero(device, merge_params):
+    params = merge_params
+    tensor_a = torch.randn(params["tensor_a_batch"], params["feature_dim"], device=device)
+    tensor_b = torch.randn(params["tensor_b_batch"], params["feature_dim"], device=device)
+    output_tensor = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+
+    tensor_merge(
+        tensor_a,
+        tensor_b,
+        torch.tensor([0], device=device),
+        output_tensor=output_tensor,
+        check_bounds=True,
+    )
+
+    assert torch.equal(output_tensor[: params["tensor_b_batch"]], tensor_b)
+
+
+def test_tensor_merge_pos_full(device, merge_params):
+    params = merge_params
+    tensor_a = torch.randn(params["tensor_a_batch"], params["feature_dim"], device=device)
+    tensor_b = torch.randn(params["tensor_b_batch"], params["feature_dim"], device=device)
+    output_tensor = torch.zeros(params["output_batch"], params["feature_dim"], device=device)
+
+    tensor_merge(
+        tensor_a,
+        tensor_b,
+        torch.tensor([params["tensor_a_batch"]], device=device),
+        output_tensor=output_tensor,
+        check_bounds=True,
+    )
+
+    assert torch.equal(output_tensor[: params["tensor_a_batch"]], tensor_a)
+    assert torch.equal(
+        output_tensor[
+            params["tensor_a_batch"] : params["tensor_a_batch"] + params["tensor_b_batch"]
+        ],
+        tensor_b,
+    )
+
+
+def test_tensor_merge_small(device):
+    tensor_a = torch.randn(3, 256, device=device)
+    tensor_b = torch.randn(5, 256, device=device)
+    output_tensor = torch.zeros(10, 256, device=device)
+
+    tensor_merge(tensor_a, tensor_b, torch.tensor([2], device=device), output_tensor=output_tensor)
+
+    assert torch.equal(output_tensor[:2], tensor_a[:2])
+    assert torch.equal(output_tensor[2:7], tensor_b)
+
+
+@pytest.mark.parametrize("ndim", [2, 3, 4])
+def test_tensor_masked_update(device, ndim):
+    """
+    Tests tensor_masked_update for 2D, 3D, and 4D tensors.
+    Covering 3 scenarios:
+    1. idx has only valid values (arbitrary order).
+    2. idx has mixed valid values and -1s (all -1s at the end).
+    3. idx has all -1s.
+    """
+
+    num_states = 32
+    batch_size = 8
+
+    # Define shapes based on dimensionality
+    if ndim == 2:
+        shape_states = (num_states, 64)
+        shape_new = (batch_size, 64)
+    elif ndim == 3:
+        shape_states = (num_states, 8, 8)
+        shape_new = (batch_size, 8, 8)
+    elif ndim == 4:
+        shape_states = (num_states, 4, 4, 4)
+        shape_new = (batch_size, 4, 4, 4)
+
+    def allocate_tensors():
+        states = torch.randn(shape_states, device=device)
+        new_states = torch.randn(shape_new, device=device)
+        return states, new_states
+
+    # Scenario 1: no -1s
+    states, new_states = allocate_tensors()
+    idx = torch.randperm(num_states, device=device)[:batch_size]
+    expected_states = states.clone()
+    expected_states[idx] = new_states
+    tensor_masked_update(states, idx, new_states)
+    assert torch.equal(states, expected_states), f"Failed {ndim}D: all valid idx values"
+
+    # Scenario 2: mix of regular values and -1s
+    states, new_states = allocate_tensors()
+    num_valid = batch_size // 2
+    valid_indices = torch.randperm(num_states, device=device)[:num_valid]
+    idx = torch.full((batch_size,), -1, dtype=torch.long, device=device)
+    idx[:num_valid] = valid_indices
+    expected_states = states.clone()
+    expected_states[valid_indices] = new_states[:num_valid]
+    tensor_masked_update(states, idx, new_states)
+    assert torch.equal(states, expected_states), f"Failed {ndim}D: mix of valid and mask values"
+
+    # Scenario 3: all -1s
+    states, new_states = allocate_tensors()
+    idx = torch.full((batch_size,), -1, dtype=torch.long, device=device)
+    expected_states = states.clone()
+    tensor_masked_update(states, idx, new_states)
+    assert torch.equal(states, expected_states), f"Failed {ndim}D: all mask values"
diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py
index 456154147f8..05e0306bfd8 100644
--- a/tests/unit_tests/inference/contexts/test_dynamic_context.py
+++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from megatron.core import parallel_state
 from megatron.core.inference.contexts.attention_context.mamba_metadata import (
     MambaInferenceStateConfig,
 )
@@ -52,6 +53,7 @@ def _get_dynamic_context(
         is_hybrid_model=False,
         layer_type_list=None,
         rounder=64,
+        paused_buffer_size_gb=None,
     ):
         set_rounder(rounder)
 
@@ -73,8 +75,11 @@ def _get_dynamic_context(
             num_attention_heads=num_attention_heads,
             max_sequence_length=max_sequence_length,
             num_cuda_graphs=None,
-            use_cuda_graphs_for_non_decode_steps=not is_hybrid_model,
+            use_cuda_graphs_for_non_decode_steps=True,
             buffer_size_gb=buffer_size_gb,
+            paused_buffer_size_gb=(
+                0.2 * buffer_size_gb if paused_buffer_size_gb is None else paused_buffer_size_gb
+            ),
             block_size_tokens=block_size_tokens,
             max_tokens=max_tokens,
             mamba_inference_state_config=mamba_inference_state_config,
@@ -107,18 +112,16 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool):
 
         if not is_hybrid_model:
             assert dynamic_context.block_allocator.total_count == 491
-            assert dynamic_context.block_allocator.active_count == 245
-            assert dynamic_context.max_total_requests == 490
-            # We make max_active_requests divisible by the REQUEST_ROUNDER.
-            assert dynamic_context.max_active_requests == 192
+            assert dynamic_context.block_allocator.active_count == 392
+            # We make max_requests divisible by the REQUEST_ROUNDER.
+            assert dynamic_context.max_requests == 448
             assert dynamic_context.max_tokens == 16384
             assert dynamic_context.num_mamba_layers == 0
             assert dynamic_context.mamba_metadata is None
         else:
-            assert dynamic_context.block_allocator.total_count == 555
-            assert dynamic_context.block_allocator.active_count == 277
-            assert dynamic_context.max_total_requests == 554
-            assert dynamic_context.max_active_requests == 256
+            assert dynamic_context.block_allocator.total_count == 556
+            assert dynamic_context.block_allocator.active_count == 444
+            assert dynamic_context.max_requests == 512
             assert dynamic_context.max_tokens == 16384
             assert dynamic_context.num_mamba_layers == 1
             assert dynamic_context.mamba_metadata is not None
@@ -156,12 +159,12 @@ def test_is_memory_available(self, is_hybrid_model):
             max_tokens=None,
             is_hybrid_model=is_hybrid_model,
         )
-        dynamic_context.block_allocator.active_count = 10
+        dynamic_context.block_allocator.total_avail = 10
         assert dynamic_context.block_allocator.is_memory_available(10)
         assert not dynamic_context.block_allocator.is_memory_available(11)
 
         assert dynamic_context.block_allocator.is_memory_available(1)
-        dynamic_context.block_allocator.active_count = 0
+        dynamic_context.block_allocator.total_avail = 0
         assert not dynamic_context.block_allocator.is_memory_available(1)
 
     @pytest.mark.internal
@@ -181,9 +184,9 @@ def test_request_overflow(self, is_hybrid_model: bool):
             rounder=1,
             is_hybrid_model=is_hybrid_model,
         )
-        dynamic_context.max_active_requests //= 2
+        dynamic_context.max_requests //= 2
         with pytest.raises(RequestOverflowError):
-            for i in range(dynamic_context.max_active_requests + 1):
+            for i in range(dynamic_context.max_requests + 1):
                 dynamic_context.add_request(
                     DynamicInferenceRequest(
                         request_id=i,
@@ -207,7 +210,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool):
             max_sequence_length=512,
             buffer_size_gb=0.1,
             block_size_tokens=128,
-            max_tokens=200,  # setting low, but >= context.max_active_requests.
+            max_tokens=200,  # setting low, but >= context.max_requests.
             rounder=1,
             is_hybrid_model=is_hybrid_model,
         )
@@ -287,10 +290,12 @@ def test_reset(self, is_hybrid_model: bool):
         assert torch.all(dynamic_context.token_to_position_in_request == 0)
         assert torch.all(dynamic_context.token_to_block_idx == -1)
         assert torch.all(dynamic_context.token_to_local_position_within_kv_block == 0)
-        assert (
-            dynamic_context.block_allocator.active_count
-            == dynamic_context.block_allocator.total_count // 2
-        )
+        if not is_hybrid_model:
+            assert dynamic_context.block_allocator.active_count == 819
+            assert dynamic_context.block_allocator.total_count == 1024
+        else:
+            assert dynamic_context.block_allocator.active_count == 1517
+            assert dynamic_context.block_allocator.total_count == 1897
         assert torch.all(dynamic_context.request_to_kv_block_ids == -1)
         if is_hybrid_model:
             assert torch.all(dynamic_context.mamba_metadata.request_to_mamba_state_idx == -1)
@@ -312,7 +317,7 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model):
         )
 
         if is_hybrid_model:
-            expected_memory_blocks = [550, 551, 552, 553]
+            expected_memory_blocks = [551, 552, 553, 554]
         else:
             expected_memory_blocks = [486, 487, 488, 489]
         expected_block_count_avail = expected_memory_blocks[0]
@@ -378,7 +383,7 @@ def test_add_request(self, is_hybrid_model: bool):
         assert dynamic_context.request_kv_length_offsets[0] == 0
         assert dynamic_context.request_kv_block_counts[0] == 2
         assert dynamic_context.request_last_kv_block_id[0].item() == (
-            553 if is_hybrid_model else 489
+            554 if is_hybrid_model else 489
         )
         assert dynamic_context.request_last_kv_block_offset[0].item() == 15
         assert torch.all(
@@ -509,9 +514,8 @@ def test_add_dummy_requests_parallel_populates_state(self):
             torch.tensor([2, 1], device='cuda', dtype=torch.int32),
         )
 
-        termination_idx = DynamicInferenceRequest.get_metadata_labels()["termination_id"]
         assert torch.equal(
-            dynamic_context.request_metadata[:2, termination_idx],
+            dynamic_context.request_metadata["termination_id"][:2],
             torch.tensor([7.0, 8.0], device='cuda'),
         )
 
@@ -737,13 +741,13 @@ def test_update_request(self, is_hybrid_model: bool):
                 dynamic_context.request_to_kv_block_ids[0:10].cpu()
                 == torch.tensor(
                     [
-                        [543, 546, -1, -1],
-                        [544, 543, -1, -1],
-                        [548, 550, -1, -1],
+                        [544, 547, -1, -1],
+                        [545, 544, -1, -1],
                         [549, 551, -1, -1],
-                        [547, -1, -1, -1],
-                        [545, -1, -1, -1],
-                        [552, -1, -1, -1],
+                        [550, 552, -1, -1],
+                        [548, -1, -1, -1],
+                        [546, -1, -1, -1],
+                        [553, -1, -1, -1],
                         [-1, -1, -1, -1],
                         [-1, -1, -1, -1],
                         [-1, -1, -1, -1],
@@ -1199,3 +1203,51 @@ def test_calculate_and_store_log_probs(self):
                 )
 
                 current_global_token_offset += expected_len
+
+    @pytest.mark.internal
+    def test_pipeline_parallel_uneven_layers(self):
+        """
+        Test that DynamicInferenceContext synchronizes the total block count across
+        pipeline stages when they have unequal layer counts.
+        """
+        pp_size = 2
+        self._setup_model_parallel_group(tensor_parallel_size=1, pipeline_parallel_size=pp_size)
+
+        rank = parallel_state.get_pipeline_model_parallel_rank()
+
+        if rank == 0:
+            local_num_layers = 12
+        else:
+            local_num_layers = 4
+
+        context = DynamicInferenceContext(
+            params_dtype=torch.float32,
+            num_layers=local_num_layers,
+            kv_channels=64,
+            num_attention_heads=8,
+            max_sequence_length=128,
+            buffer_size_gb=0.1,
+            block_size_tokens=16,
+            max_tokens=1024,
+            pipeline_model_parallel_size=pp_size,
+            tensor_model_parallel_size=1,
+            unified_memory_level=0,
+        )
+
+        # Collect the total block counts on each rank
+        local_total_blocks = torch.tensor(
+            [context.block_allocator.total_count], device='cuda', dtype=torch.long
+        )
+        gathered_block_counts = [torch.zeros_like(local_total_blocks) for _ in range(pp_size)]
+        torch.distributed.all_gather(
+            gathered_block_counts,
+            local_total_blocks,
+            group=parallel_state.get_pipeline_model_parallel_group(),
+        )
+        all_counts = [t.item() for t in gathered_block_counts]
+
+        # Verify that there is only 1 unique value across all ranks
+        unique_counts = set(all_counts)
+        assert (
+            len(unique_counts) == 1
+        ), f"Block counts were not synchronized across ranks. Gathered: {all_counts}"
diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index 0cd810d9ed7..d5803b3638e 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -43,11 +43,12 @@
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
 from megatron.core.models.mamba.mamba_model import MambaModel
+from megatron.core.ssm.mamba_mixer import _check_mamba_sequence_packing_support
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import (
-    check_mamba_sequence_packing_support,
     get_mamba_inference_state_config_from_model,
     is_fa_min_version,
     is_te_min_version,
@@ -58,7 +59,7 @@
 def skip_if_mamba_sequence_packing_not_available(model_provider: str):
     if model_provider == "mamba":
         sequence_packing_available, reason_for_no_sequence_packing = (
-            check_mamba_sequence_packing_support()
+            _check_mamba_sequence_packing_support()
         )
         if not sequence_packing_available:
             pytest.skip(reason_for_no_sequence_packing)
@@ -89,7 +90,9 @@ class DynamicEngineTestConfig:
     num_gap_steps: int = 2
 
     context_buffer_size_gb: float = 0.1  # enough room for all tokens.
+    context_paused_buffer_size_gb: float | None = None
     context_block_size_tokens: int = 256
+    context_max_requests: Optional[int] = None
     context_max_tokens: Optional[int] = None
     tensor_model_parallel_size: int = 1
     pipeline_model_parallel_size: int = 1
@@ -104,7 +107,10 @@ class DynamicEngineTestConfig:
     return_log_probs: bool = False
     materialize_only_last_token_logits: bool = True
     skip_prompt_log_probs: bool = False
-    cuda_graph_scope: str = "full_iteration"
+    enable_chunked_prefill: bool = False
+    cuda_graph_scope: List[CudaGraphScope] = field(
+        default_factory=lambda: [CudaGraphScope.full_iteration]
+    )
     force_build_cuda_graphs: bool = False
     transformer_impl: str = "local"
     # If False, do not build cuda graphs in the tests, even if
@@ -128,6 +134,10 @@ def __post_init__(self):
             assert self.num_tokens_total is not None
             self.max_sequence_length = self.num_tokens_total
 
+        # Default paused buffer size.
+        if self.context_paused_buffer_size_gb is None:
+            self.context_paused_buffer_size_gb = 0.2 * self.context_buffer_size_gb
+
 
 @dataclass
 class DynamicEngineTestEnv:
@@ -220,11 +230,14 @@ def _build_inference_context(
             num_attention_heads=transformer_config.num_query_groups,
             max_sequence_length=test_config.max_sequence_length,
             num_cuda_graphs=test_config.num_cuda_graphs,
-            use_cuda_graphs_for_non_decode_steps=not test_config.model_provider == "mamba",
+            use_cuda_graphs_for_non_decode_steps=True,
             buffer_size_gb=test_config.context_buffer_size_gb,
+            paused_buffer_size_gb=test_config.context_paused_buffer_size_gb,
             block_size_tokens=test_config.context_block_size_tokens,
+            max_requests=test_config.context_max_requests,
             max_tokens=test_config.context_max_tokens,
             tensor_model_parallel_size=transformer_config.tensor_model_parallel_size,
+            pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size,
             mamba_inference_state_config=mamba_inference_state_config,
             materialize_only_last_token_logits=test_config.materialize_only_last_token_logits,
             use_flashinfer_fused_rope=None,  # default to using flash-infer if available
@@ -416,6 +429,7 @@ def _build_test_env(cls, test_config):
             inference_context,
             random_seed=test_config.random_seed,
             enable_cuda_graph=transformer_config.cuda_graph_impl == "local",
+            enable_chunked_prefill=test_config.enable_chunked_prefill,
         )
 
         # Test env.
@@ -432,7 +446,7 @@ def _run_step(cls, env):
         # the only thing that differs between requests is num_tokens_to_generate,
         # and engine.async_step() doesn't use this sampling param's
         # num_tokens_to_generate.
-        result = env.engine.step_modern(verbose=False)
+        result = env.engine.step_modern()
 
         # Suspend + resume.
         if (
@@ -526,7 +540,7 @@ def teardown_method(self, method):
     )
     @pytest.mark.parametrize("model_provider", ["gpt", "mamba"])
     @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4])
-    @pytest.mark.parametrize("cuda_graph_scope", ["full", "full_iteration"])
+    @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration]])
     def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None:
         """Simple test that runs without errors, and validates output."""
         skip_if_mamba_sequence_packing_not_available(model_provider)
@@ -674,12 +688,13 @@ def test_cuda_graph_token_counts(self) -> None:
 
         # Test num_cuda_graphs.
         for num_cuda_graphs, expected_cuda_graph_token_counts in [
-            (0, [40]),
-            (1, [40]),
-            (2, [40, 24]),
-            (4, [40, 32, 16]),
-            (8, [40, 32, 24, 16, 8]),
-            (16, [40, 32, 24, 16, 8]),
+            (0, [80]),
+            (1, [80]),
+            (2, [80, 40]),
+            (4, [80, 72, 48, 24]),
+            (8, [80, 64, 48, 32, 16]),
+            (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
+            (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
         ]:
 
             # Build cuda graphs (inside dynamic engine).
@@ -765,7 +780,7 @@ async def test_run_engine(self):
             test_config = DynamicEngineTestConfig(num_requests=8, use_fixed_output_lengths=True)
             env = self._build_test_env(test_config)
 
-            engine_task = asyncio.create_task(env.engine.run_engine(verbose=False))
+            engine_task = asyncio.create_task(env.engine.run_engine())
 
             request_completion_futures: Dict[int, asyncio.Future[DynamicInferenceRequest]] = {}
 
@@ -1141,7 +1156,7 @@ def test_chunked_prefill(self, model_provider: str):
         num_tokens_to_generate = 16
         max_sequence_length = prompt_length + num_tokens_to_generate
 
-        # Configure context to force chunking (chunked prefill is enabled by default)
+        # Configure context to force chunking
         env = self._run_test(
             num_requests=1,
             min_prompt_length=prompt_length,
@@ -1151,6 +1166,7 @@ def test_chunked_prefill(self, model_provider: str):
             model_provider=model_provider,
             context_block_size_tokens=256,
             context_max_tokens=1000,
+            enable_chunked_prefill=True,
         )
 
     @pytest.mark.internal
@@ -1180,6 +1196,7 @@ def test_chunked_prefill_with_log_probs(self):
             model_provider="gpt",
             context_block_size_tokens=256,
             context_max_tokens=1000,
+            enable_chunked_prefill=True,
         )
 
         # Validate results
@@ -1258,7 +1275,7 @@ def test_top_n_logprobs_dynamic(self, skip_prompt_log_probs: bool):
 
         # Step engine until all requests are finished
         while env.engine.has_unfinished_requests():
-            result = env.engine.step_modern(verbose=False)
+            result = env.engine.step_modern()
 
         # Validate results
         for request in requests_to_add:
@@ -1345,3 +1362,28 @@ def test_top_n_logprobs_dynamic(self, skip_prompt_log_probs: bool):
                     assert (
                         abs(log_prob - top_n_dict[token_str]) < 0.1
                     ), f"Request {request.request_id}, token {i}: log_prob mismatch {log_prob} vs {top_n_dict[token_str]}"
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(
+        not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
+    )
+    @pytest.mark.parametrize("max_requests", [None, 4])
+    @torch.inference_mode()
+    def test_max_requests(self, max_requests: int | None):
+        """Test max requests."""
+        env = self._run_test(
+            context_max_requests=max_requests, num_tokens_to_generate=16, num_gap_steps=1
+        )
+        step_count = env.engine.step_count
+        context = env.engine.context
+        if max_requests is None:
+            assert context.max_requests == 816
+            assert step_count == 22
+        else:
+            assert max_requests < len(env.requests), (
+                f"Test is only useful if max_requests ({max_requests}) < "
+                f"num_requests ({len(env.requests)})."
+            )
+            assert context.max_requests == 4
+            assert step_count == 34
+        assert context.block_allocator.active_count == 655
diff --git a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py
index 7b4fb4b4250..57326291a73 100644
--- a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py
+++ b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py
@@ -1,20 +1,18 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import asyncio
-import random
+import itertools
 import time
 from collections import deque
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, Optional
 
+import msgpack
 import pytest
-import torch.distributed as dist
+import torch
 from tqdm import tqdm
 
-from megatron.core.inference.data_parallel_inference_coordinator import (
-    DataParallelInferenceCoordinator,
-)
 from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine, RequestEntry
+from megatron.core.inference.headers import Headers
 from megatron.core.inference.inference_client import InferenceClient
 from megatron.core.inference.inference_request import (
     DynamicInferenceRequest,
@@ -22,6 +20,7 @@
     Status,
 )
 from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.utils import get_asyncio_loop
 from tests.unit_tests.test_utilities import Utils
 
@@ -29,10 +28,35 @@
     import zmq
 
     HAVE_ZMQ = True
-except Exception:
+except ImportError:
     HAVE_ZMQ = False
 
-IS_ZMQ_FLAKY = True
+NUM_REQUESTS = 10
+NUM_TOKENS = 2
+DEFAULT_PORT = 46581
+ZMQ_FLAKY_SHUTDOWN = True
+
+
+class DummyTokenizer:
+    """Dummy tokenizer."""
+
+    def __init__(self, vocab_size: int = 10, bos: int | None = None, eod: int = 0, pad: int = 0):
+        self.vocab_size = vocab_size
+        self.bos = bos
+        self.eod = eod
+        self.pad = pad
+
+    def tokenize(self, prompt):
+        if isinstance(prompt, str):
+            return [int(tok) % self.vocab_size for tok in prompt.strip().split()]
+        return list(prompt)
+
+    def detokenize(self, tokens, skip_special_tokens: bool = False):
+        if isinstance(tokens, torch.Tensor):
+            tokens = tokens.tolist()
+        if skip_special_tokens and self.eod in tokens:
+            tokens = [tok for tok in tokens if tok != self.eod]
+        return " ".join(str(tok) for tok in tokens)
 
 
 class DummyContext:
@@ -45,6 +69,16 @@ def get_active_request_count(self) -> int:
         return self.active_cnt
 
 
+class DummyController:
+    """Dummy inference controller."""
+
+    def __init__(self):
+        self.tokenizer = DummyTokenizer()
+
+    def dummy_forward(self):
+        pass
+
+
 class DummyEngine(DynamicInferenceEngine):
     """Dummy inference engine that only implements coordinator-related methods."""
 
@@ -56,12 +90,15 @@ def __init__(self):
         self.is_suspended = False
         self._loop = get_asyncio_loop()
         self.context = DummyContext()
+        self.controller = DummyController()
         self.running = asyncio.Event()
         self.paused = asyncio.Event()
         self.stopped = asyncio.Event()
         self.pending_microbatch = deque()
         self.received_pause: bool = False
         self.received_stop: bool = False
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        self.rank = torch.distributed.get_rank()
 
     def add_request(
         self, request_id: int, prompt: str, sampling_params: Optional[SamplingParams] = None
@@ -99,6 +136,13 @@ async def async_step(self, *, verbose: Optional[bool] = False) -> Dict:
                 finished_request_records.append(entry.record)
                 entry.future.set_result(entry.record)
                 to_remove.append(request_id)
+                # Send signal to coordinator.
+                if self.is_mp_coordinator:
+                    payload = msgpack.packb(
+                        [Headers.ENGINE_REPLY.value, [entry.record.serialize()]], use_bin_type=True
+                    )
+                    self.socket_for_receiving_requests.send(payload)
+
         for request_id in to_remove:
             del self.requests[request_id]
 
@@ -119,301 +163,289 @@ async def async_step(self, *, verbose: Optional[bool] = False) -> Dict:
         }
 
 
-@dataclass
-class CoordinatorTestConfig:
-    """Test configuration args."""
-
-    port: int = 46581
-    mp_port: int = 49581
-    launch_inference_coordinator: bool = True
-    stop_engines: bool = True
-    verify_results: bool = True
-
-    num_requests: int = 10**1
-    min_time_offset: float = 10 ** (-4)
-    max_time_offset: float = 10 ** (-3)
-    num_steps_to_finish: int = 1
-    num_iterations: int = 1
-
-    tensor_model_parallel_size: int = 1
-    pipeline_model_parallel_size: int = 1
-
-
-@dataclass
-class CoordinatorTestEnv:
-    """Test environment, including requests."""
-
-    config: CoordinatorTestConfig
-    requests: List[Tuple]
-    engine: DummyEngine
-    responses: List[List[DynamicInferenceRequest]] = field(default_factory=list)
-    timing_data: Dict[str, Optional[float]] = field(
-        default_factory=lambda: {
-            "start_time": None,
-            "init_time": None,
-            "done_time": None,
-            "stop_time": None,
-        }
-    )
+@pytest.fixture
+def initialize_model_parallel(request, monkeypatch):
+    """Fixture to initialize and destroy model parallel.
 
+    Parameters are passed via request.param as a tuple: (tp, pp, ep).
+    Defaults to (1, 1, 1) if not parametrized.
+    """
+    monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
 
-class TestCoordinator:
-
-    @classmethod
-    def _build_requests(cls, test_config: CoordinatorTestConfig) -> List[Tuple]:
-        ret = []
+    tp, pp, ep = getattr(request, "param", (1, 1, 1))
+    world_size = Utils.world_size
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=tp,
+        pipeline_model_parallel_size=pp,
+        expert_model_parallel_size=ep,
+    )
+    dp = world_size // (tp * pp * ep)
+    yield world_size, dp, tp, pp, ep
+    Utils.destroy_model_parallel()
 
-        for _ in range(test_config.num_requests):
-            arrival_delta = random.uniform(test_config.min_time_offset, test_config.max_time_offset)
-            num_tokens = test_config.num_steps_to_finish
-            ret.append(
-                ("Hello world!", SamplingParams(num_tokens_to_generate=num_tokens), arrival_delta)
-            )
-        return ret
 
-    @classmethod
-    def _build_test_env(cls, test_config):
-        Utils.initialize_model_parallel(
-            tensor_model_parallel_size=test_config.tensor_model_parallel_size,
-            pipeline_model_parallel_size=test_config.pipeline_model_parallel_size,
-        )
-        requests = cls._build_requests(test_config)
+@pytest.mark.skipif(ZMQ_FLAKY_SHUTDOWN, reason="ZMQ shutdown is flaky")
+class TestCoordinator:
+    """Test class for Data Parallel Inference Coordinator."""
+
+    def build_requests(self, num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS):
+        """Build a list of test requests."""
+        return [
+            ("Hello world!", SamplingParams(num_tokens_to_generate=num_tokens))
+            for _ in range(num_requests)
+        ]
+
+    async def run_coordinator_test(
+        self,
+        *,
+        launch_coordinator=True,
+        stop_engines=True,
+        num_requests=NUM_REQUESTS,
+        num_tokens=NUM_TOKENS,
+    ):
+        """Run a coordinator test. Model parallel must already be initialized."""
         engine = DummyEngine()
-        engine.num_steps_to_finish = test_config.num_steps_to_finish
-        return CoordinatorTestEnv(config=test_config, requests=requests, engine=engine)
-
-    @classmethod
-    async def _run_test(cls, **test_config_kwargs):
-        # Test environment.
-        test_config = CoordinatorTestConfig(**test_config_kwargs)
-        env = cls._build_test_env(test_config)
-
-        # Connect each engine to their respective processes.
-        env.timing_data["start_time"] = time.time()
-        await env.engine.start_listening_to_data_parallel_coordinator(
-            inference_coordinator_port=test_config.port,
-            launch_inference_coordinator=test_config.launch_inference_coordinator,
+        requests = self.build_requests(num_requests, num_tokens)
+
+        dp_addr = await engine.start_listening_to_data_parallel_coordinator(
+            inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=launch_coordinator
         )
 
-        results_success = False
-        shutdown_success = False
         try:
-            if dist.get_rank() == 0:
-                client = InferenceClient(test_config.port)
+            if torch.distributed.get_rank() == 0:
+                client = InferenceClient(dp_addr)
                 await client.start()
-                env.timing_data["init_time"] = time.time()
 
-                all_results = []
-                for _ in range(test_config.num_iterations):
-                    futures = []
-                    for request in tqdm(env.requests, "add_requests"):
-                        prompt, sampling_params, arrival_delta = request
-                        await asyncio.sleep(arrival_delta)
-                        fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
-                        futures.append(fut)
-                    results = await asyncio.wait_for(asyncio.gather(*futures), timeout=10.0)
-                    all_results.append(results)
-                env.timing_data["done_time"] = time.time()
-            results_success = True
-        finally:
-            try:
-                if dist.get_rank() == 0:
-                    if test_config.stop_engines:
-                        await asyncio.wait_for(client.stop_engines(), timeout=10.0)
-                    client.stop()
-                if test_config.stop_engines:
-                    await asyncio.wait_for(env.engine.engine_loop_task, timeout=10.0)
-                shutdown_success = True
-            except:
-                env.engine.engine_loop_task.cancel()
-
-        env.timing_data["stop_time"] = time.time()
-
-        assert results_success, "Did not receive all results successfully."
-        assert shutdown_success, "Did not shutdown successfully."
-        if dist.get_rank() == 0:
-            env.responses = all_results
-            if test_config.verify_results:
-                for batch in all_results:
-                    for record in batch:
-                        request = record[-1]
-                        assert request.status == Status.COMPLETED
+                futures = [
+                    client.add_request(prompt=prompt, sampling_params=params)
+                    for prompt, params in requests
+                ]
+                results = await asyncio.wait_for(asyncio.gather(*futures), timeout=10.0)
 
-        return env
+                for record in results:
+                    assert record[-1].status == Status.COMPLETED
+        finally:
+            if torch.distributed.get_rank() == 0:
+                if stop_engines:
+                    await asyncio.wait_for(client.stop_engines(), timeout=10.0)
+                client.stop()
+            if stop_engines:
+                try:
+                    await asyncio.wait_for(engine.engine_loop_task, timeout=30.0)
+                except asyncio.TimeoutError:
+                    engine.engine_loop_task.cancel()
 
-    def teardown_method(self, method):
-        Utils.destroy_model_parallel()
+        return dp_addr
 
     @pytest.mark.internal
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
     @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
     @pytest.mark.asyncio
-    async def test_simple(self):
-        """Simple test with no TP or PP."""
-        env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+    @pytest.mark.parametrize(
+        "initialize_model_parallel",
+        [
+            pytest.param((tp, pp, ep), id=f"tp{tp}-pp{pp}-ep{ep}")
+            for tp, pp, ep in itertools.product([1, 2], [1, 2], [1, 2])
+            if tp * pp * ep <= Utils.world_size
+        ],
+        indirect=["initialize_model_parallel"],
+    )
+    async def test_parallel_configs(self, initialize_model_parallel):
+        """Test coordinator with various TP, PP, and EP configurations."""
+        await self.run_coordinator_test()
 
     @pytest.mark.internal
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
     @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
     @pytest.mark.asyncio
-    async def test_tp(self):
-        """Simple test with TP, but no PP."""
-        env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    async def test_coordinator_lifecycle(self, initialize_model_parallel):
+        """Test coordinator connection and port conflict behavior."""
+        engine1 = DummyEngine()
+        engine2 = None
+        engine3 = None
+        third_addr = None
+
+        # Launch first coordinator - binds to DEFAULT_PORT
+        first_addr = await engine1.start_listening_to_data_parallel_coordinator(
+            inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=True
+        )
 
-    @pytest.mark.internal
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
-    @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.asyncio
-    async def test_pp(self):
-        """Simple test with no TP, but PP."""
-        env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2)
+        try:
+            # Cancel engine1 loop without sending stop to coordinator
+            # This keeps coordinator process alive and holding the port
+            engine1.engine_loop_task.cancel()
+            try:
+                await engine1.engine_loop_task
+            except asyncio.CancelledError:
+                pass
+
+            # Connect engine2 to existing coordinator (don't launch new one)
+            engine2 = DummyEngine()
+            second_addr = await engine2.start_listening_to_data_parallel_coordinator(
+                inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=False
+            )
 
-    @pytest.mark.internal
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
-    @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.asyncio
-    async def test_tp_pp(self):
-        """Simple test with both TP and PP."""
-        env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2)
+            # Should connect to same port, but will not always in CI due to port conflicts.
+            first_port = int(first_addr.rsplit(":", 1)[-1])
+            second_port = int(second_addr.rsplit(":", 1)[-1])
+            # assert second_port == first_port
 
-    @pytest.mark.internal
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
-    @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.asyncio
-    async def test_pp(self):
-        """Simple test with no TP, but PP."""
-        env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2)
+            # Cancel engine2
+            engine2.engine_loop_task.cancel()
+            try:
+                await engine2.engine_loop_task
+            except asyncio.CancelledError:
+                pass
+
+            # Launch new coordinator - should get different port since first is holding it
+            engine3 = DummyEngine()
+            third_addr = await engine3.start_listening_to_data_parallel_coordinator(
+                inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=True
+            )
 
-    @pytest.mark.internal
-    @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
-    @pytest.mark.asyncio
-    async def test_tp_pp(self):
-        """Simple test with both TP and PP."""
-        env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2)
+            # Verify we got a different port due to conflict
+            third_port = int(third_addr.rsplit(":", 1)[-1])
+            assert (
+                third_port != first_port
+            ), f"Expected different port due to conflict, but got same: {third_port}"
+
+        finally:
+            # Clean up engine3's coordinator
+            if engine3 is not None and third_addr is not None:
+                client3 = InferenceClient(third_addr)
+                await client3.start()
+                await asyncio.wait_for(client3.stop_engines(), timeout=10.0)
+                client3.stop()
+                try:
+                    await asyncio.wait_for(engine3.engine_loop_task, timeout=30.0)
+                except asyncio.TimeoutError:
+                    engine3.engine_loop_task.cancel()
+
+            # Rebuild engine and reconnect to engine1's coordinator
+            first_port = int(first_addr.rsplit(":", 1)[-1])
+            engine1 = DummyEngine()
+            await engine1.start_listening_to_data_parallel_coordinator(
+                inference_coordinator_port=first_port, launch_inference_coordinator=False
+            )
+            client1 = InferenceClient(first_addr)
+            await client1.start()
+            await asyncio.wait_for(client1.stop_engines(), timeout=10.0)
+            client1.stop()
+            try:
+                await asyncio.wait_for(engine1.engine_loop_task, timeout=30.0)
+            except asyncio.TimeoutError:
+                engine1.engine_loop_task.cancel()
 
     @pytest.mark.internal
     @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
     @pytest.mark.asyncio
-    async def test_pause(self):
-        """Pause/resume test."""
-        test_config = CoordinatorTestConfig(
-            tensor_model_parallel_size=2, pipeline_model_parallel_size=1, num_requests=32
-        )
-        env = self._build_test_env(test_config)
+    async def test_pause(self, initialize_model_parallel):
+        """Test pause and resume functionality."""
+        engine = DummyEngine()
+        requests = self.build_requests(num_requests=32)
 
-        await env.engine.start_listening_to_data_parallel_coordinator(
-            inference_coordinator_port=test_config.port, launch_inference_coordinator=True
+        dp_addr = await engine.start_listening_to_data_parallel_coordinator(
+            inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=True
         )
 
-        success = False
+        success = True
         try:
-            if dist.get_rank() == 0:
-                # Start client as usual.
-                client = InferenceClient(test_config.port)
+            if torch.distributed.get_rank() == 0:
+                client = InferenceClient(dp_addr)
                 await client.start()
 
-                ### TEST 1: Pause after all requests have finished.
-                futures = []
-                for i, request in enumerate(env.requests[:2]):
-                    prompt, sampling_params, _ = request
-                    fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
-                    futures.append(fut)
-                # Wait a sufficient time for the requests to complete.
+                # Submit requests and pause after completion.
+                futures = [client.add_request(prompt=p, sampling_params=s) for p, s in requests[:2]]
                 await asyncio.sleep(0.1)
-                # Get a pause awaitable.
-                to_pause = client.pause_engines()
-                awaitables = futures + [to_pause]
-                # Gather all awaitables; assert that the requests actually complete.
+                awaitables = futures + [client.pause_engines()]
                 try:
-                    await asyncio.wait_for(asyncio.gather(*awaitables), timeout=0.1)
+                    await asyncio.wait_for(asyncio.gather(*awaitables), timeout=0.5)
                 except asyncio.TimeoutError:
-                    pytest.fail("Simple pause did not succeed.")
+                    pytest.fail("Pause operation timed out.")
 
-                ### TEST 2: Ensure that requests can be added while paused.
-                prompt, sampling_params, _ = env.requests[2]
-                paused_fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
+                # Ensure that requests can be added while paused.
+                prompt, params = requests[2]
+                future = client.add_request(prompt=prompt, sampling_params=params)
                 with pytest.raises(asyncio.TimeoutError):
-                    await asyncio.wait_for(paused_fut, timeout=0.1)
+                    await asyncio.wait_for(future, timeout=0.1)
 
-                ### TEST 3: Resume after pause and drain the queued requests.
+                # Resume and verify new requests complete.
                 client.unpause_engines()
                 # TODO: The system should not be incorrectly raising a cancelled error here.
                 with pytest.raises(asyncio.CancelledError):
-                    await paused_fut
-
-                ### TEST 4: Add new requests after resume.
-                futures = []
-                for i, request in enumerate(env.requests[3:4]):
-                    prompt, sampling_params, _ = request
-                    fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
-                    futures.append(fut)
-                # Wait a sufficient time for the requests to complete.
+                    await future
+
+                futures = [
+                    client.add_request(prompt=p, sampling_params=s) for p, s in requests[3:4]
+                ]
                 await asyncio.sleep(0.1)
-                # Gather all awaitables; assert that the requests actually complete.
                 try:
-                    await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1)
+                    await asyncio.wait_for(asyncio.gather(*futures), timeout=0.5)
                 except asyncio.TimeoutError:
-                    pytest.fail("Simple resume did not succeed.")
-
-                ### TEST 5: Pause while requests are being processed.
-                ### Note: this situation cannot occur in a synchronous system.
-                if False:
-                    for request in env.engine.requests[4:6]:
-                        request.sampling_params.num_tokens_to_generate = 100
-                    futures = []
-                    for i, request in enumerate(env.requests[4:6]):
-                        prompt, sampling_params, _ = request
-                        fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
-                        futures.append(fut)
-                    # Do not wait for the requests to complete.
-                    await client.pause_engines()
-                    # Gather all awaitables; assert that the requests do not complete.
-                    with pytest.raises(asyncio.TimeoutError):
-                        await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1)
-            success = True
+                    pytest.fail("Resumed requests did not complete in time.")
+        except:
+            success = False
         finally:
             try:
-                if dist.get_rank() == 0:
+                if torch.distributed.get_rank() == 0:
                     await asyncio.wait_for(client.stop_engines(), timeout=5.0)
                     client.stop()
-                await asyncio.wait_for(env.engine.engine_loop_task, timeout=5.0)
+                await asyncio.wait_for(engine.engine_loop_task, timeout=30.0)
             except asyncio.TimeoutError:
-                env.engine.engine_loop_task.cancel()
-        assert success, "Pause/resume test did not complete successfully."
+                engine.engine_loop_task.cancel()
+        assert success, "Pause/resume test failed."
 
     @pytest.mark.internal
     @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test")
-    @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI")
     @pytest.mark.asyncio
-    async def test_throughput(self):
+    async def test_throughput(self, initialize_model_parallel):
         """Throughput test with no TP or PP."""
-        import torch
-        import torch.distributed as dist
-
-        env = await self._run_test(
-            tensor_model_parallel_size=1,
-            pipeline_model_parallel_size=1,
-            num_requests=10**4,
-            num_iterations=10,
-            min_time_offset=0.0,
-            max_time_offset=0.0,
+        num_requests = 10**4
+        num_iterations = 10
+
+        engine = DummyEngine()
+        requests = self.build_requests(num_requests=num_requests)
+
+        start_time = time.time()
+        dp_addr = await engine.start_listening_to_data_parallel_coordinator(
+            inference_coordinator_port=DEFAULT_PORT, launch_inference_coordinator=True
         )
 
+        try:
+            if torch.distributed.get_rank() == 0:
+                client = InferenceClient(dp_addr)
+                await client.start()
+                init_time = time.time()
+
+                for _ in range(num_iterations):
+                    futures = []
+                    for prompt, sampling_params in tqdm(requests, "add_requests"):
+                        fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
+                        futures.append(fut)
+                    await asyncio.wait_for(asyncio.gather(*futures), timeout=10.0)
+                done_time = time.time()
+        finally:
+            if torch.distributed.get_rank() == 0:
+                await asyncio.wait_for(client.stop_engines(), timeout=10.0)
+                client.stop()
+            try:
+                await asyncio.wait_for(engine.engine_loop_task, timeout=30.0)
+            except asyncio.TimeoutError:
+                engine.engine_loop_task.cancel()
+
+        stop_time = time.time()
+
         flags = torch.tensor([1, 1, 1], dtype=torch.int, device=torch.cuda.current_device())
 
         init_duration = golden_init_duration = None
         run_duration = golden_run_duration = None
         stop_duration = golden_stop_duration = None
 
-        if dist.get_rank() == 0:
-            init_duration = (env.timing_data["init_time"] - env.timing_data["start_time"]) * 10**3
-            golden_init_duration = 4445.64  # ms
-            run_duration = (env.timing_data["done_time"] - env.timing_data["init_time"]) * 10**3
-            golden_run_duration = 2906.29  # ms
-            stop_duration = (env.timing_data["stop_time"] - env.timing_data["done_time"]) * 10**3
-            golden_stop_duration = 33.17  # ms
+        if torch.distributed.get_rank() == 0:
+            init_duration = (init_time - start_time) * 10**3
+            golden_init_duration = 6974.43  # ms
+            run_duration = (done_time - init_time) * 10**3
+            golden_run_duration = 4392.63  # ms
+            stop_duration = (stop_time - done_time) * 10**3
+            golden_stop_duration = 931.49  # ms
 
             def clamp_to_golden_value(value, golden_value, delta=0.1):
                 return value > golden_value * (1 - delta) and value < golden_value * (1 + delta)
@@ -426,10 +458,9 @@ def clamp_to_golden_value(value, golden_value, delta=0.1):
                 flags[2] = 0
 
         # Synchronize results
-        dist.broadcast(flags, src=0)
+        torch.distributed.broadcast(flags, src=0)
 
-        if dist.get_rank() == 0:
-            # Print current results.
+        if torch.distributed.get_rank() == 0:
             print(f"Initialization time: {init_duration:.2f} ms")
             print(f"Run time: {run_duration:.2f} ms")
             print(f"Stop time: {stop_duration:.2f} ms")
@@ -449,23 +480,10 @@ def clamp_to_golden_value(value, golden_value, delta=0.1):
 
             print(
                 f"ZMQ throughput is approximately "
-                f"{env.config.num_requests * env.config.num_iterations / (run_duration):.2f} "
+                f"{num_requests * num_iterations / run_duration:.2f} "
                 f"requests/ms"
             )
         else:
             assert flags[0].item() == 1
             assert flags[1].item() == 1
             assert flags[2].item() == 1
-
-
-if __name__ == "__main__":
-    test = TestCoordinator()
-    asyncio.run(test.test_simple())
-    asyncio.run(test.test_tp())
-    asyncio.run(test.test_pp())
-    asyncio.run(test.test_tp_pp())
-    asyncio.run(test.test_pause())
-    asyncio.run(test.test_throughput())
-    test.teardown_method(None)
-    print("~~~")
-    print("success.")
diff --git a/tests/unit_tests/inference/test_stop_words.py b/tests/unit_tests/inference/test_stop_words.py
new file mode 100644
index 00000000000..31665c0bb81
--- /dev/null
+++ b/tests/unit_tests/inference/test_stop_words.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""Unit tests for stop word functionality in dynamic inference."""
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from megatron.core.inference.sampling_params import SamplingParams
+
+
+class MockDynamicInferenceRequest:
+    """Mock class for DynamicInferenceRequest to test stop word detection."""
+
+    def __init__(
+        self,
+        request_id: int,
+        generated_tokens: Optional[List[int]] = None,
+        stop_word_ids: Optional[List[List[int]]] = None,
+        sampling_params: Optional[SamplingParams] = None,
+    ):
+        self.request_id = request_id
+        self.generated_tokens = generated_tokens if generated_tokens is not None else []
+        self.stop_word_ids = stop_word_ids
+        self.sampling_params = sampling_params or SamplingParams()
+
+
+class TestStopWordDetection:
+    """Test stop word detection logic."""
+
+    def _check_stop_words_for_request_post_append(
+        self, request: MockDynamicInferenceRequest
+    ) -> bool:
+        """
+        Check if a request should stop due to stop words (after token is appended).
+
+        This mirrors the logic in DynamicInferenceEngine._check_stop_words_for_request_post_append
+        """
+        # Check if request has stop words configured
+        if request.stop_word_ids is None or len(request.stop_word_ids) == 0:
+            return False
+
+        generated_tokens = request.generated_tokens
+
+        # Check if the sequence ends with any stop word
+        for stop_word_ids in request.stop_word_ids:
+            stop_len = len(stop_word_ids)
+            if len(generated_tokens) >= stop_len:
+                # Check if the last stop_len tokens match the stop word
+                if list(generated_tokens[-stop_len:]) == stop_word_ids:
+                    return True
+
+        return False
+
+    def test_no_stop_words_configured(self):
+        """Test that requests without stop words configured don't trigger stop."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=None
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_empty_stop_words_list(self):
+        """Test that empty stop words list doesn't trigger stop."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_single_token_stop_word_match(self):
+        """Test detection of single-token stop word."""
+        # Stop word is token 300
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is True
+
+    def test_single_token_stop_word_no_match(self):
+        """Test no detection when single-token stop word doesn't match."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[400]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_multi_token_stop_word_match(self):
+        """Test detection of multi-token stop word."""
+        # Stop word is tokens [200, 300]
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[200, 300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is True
+
+    def test_multi_token_stop_word_no_match_partial(self):
+        """Test no detection when only partial stop word matches."""
+        # Stop word is [200, 300], but generated ends with [100, 200]
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200], stop_word_ids=[[200, 300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_multi_token_stop_word_no_match_wrong_order(self):
+        """Test no detection when tokens are present but in wrong order."""
+        # Stop word is [200, 300], but generated ends with [300, 200]
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 300, 200], stop_word_ids=[[200, 300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_multiple_stop_words_first_matches(self):
+        """Test with multiple stop words where first one matches."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[300], [400], [500]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is True
+
+    def test_multiple_stop_words_second_matches(self):
+        """Test with multiple stop words where second one matches."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 400], stop_word_ids=[[300], [400], [500]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is True
+
+    def test_multiple_stop_words_none_match(self):
+        """Test with multiple stop words where none match."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 600], stop_word_ids=[[300], [400], [500]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_stop_word_longer_than_generated(self):
+        """Test that stop word longer than generated tokens doesn't crash."""
+        # Stop word is 5 tokens, but only 3 tokens generated
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[1, 2, 3, 4, 5]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_stop_word_exact_length_match(self):
+        """Test stop word that matches entire generated sequence."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[100, 200, 300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is True
+
+    def test_empty_generated_tokens(self):
+        """Test with no generated tokens."""
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[], stop_word_ids=[[300]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+    def test_stop_word_in_middle_not_end(self):
+        """Test that stop word in middle of sequence doesn't trigger (only end matters)."""
+        # Stop word is [200], which is in middle but not at end
+        request = MockDynamicInferenceRequest(
+            request_id=1, generated_tokens=[100, 200, 300], stop_word_ids=[[200]]
+        )
+        assert self._check_stop_words_for_request_post_append(request) is False
+
+
+class TestStopWordTrackingFlow:
+    """Test the stop word tracking flow between steps."""
+
+    def test_stop_word_finished_ids_tracking(self):
+        """Test that stop_word_finished_request_ids correctly tracks requests."""
+        stop_word_finished_request_ids = set()
+        stop_word_being_finished_ids = set()
+
+        # Simulate detecting stop word in post_process_requests
+        request_id = 42
+        stop_word_finished_request_ids.add(request_id)
+
+        assert request_id in stop_word_finished_request_ids
+        assert len(stop_word_finished_request_ids) == 1
+
+        # Simulate callback being called
+        active_request_ids = [42, 43, 44]
+        result = stop_word_finished_request_ids & set(active_request_ids)
+        stop_word_being_finished_ids = result
+        stop_word_finished_request_ids -= result
+
+        assert request_id in stop_word_being_finished_ids
+        assert request_id not in stop_word_finished_request_ids
+
+    def test_skip_extra_token_for_stop_word_requests(self):
+        """Test that extra token is skipped for stop word finished requests."""
+        stop_word_being_finished_ids = {42}
+        generated_tokens = {
+            42: [100, 200, 300],  # Already has tokens from previous step
+            43: [100, 200],
+        }
+
+        new_tokens = {42: 999, 43: 301}  # New tokens to potentially append
+
+        for request_id, token in new_tokens.items():
+            if request_id not in stop_word_being_finished_ids:
+                generated_tokens[request_id].append(token)
+
+        # Request 42 should NOT have the extra token
+        assert generated_tokens[42] == [100, 200, 300]
+        # Request 43 should have the new token
+        assert generated_tokens[43] == [100, 200, 301]
+
+
+class TestSamplingParamsStopWords:
+    """Test SamplingParams stop words field."""
+
+    def test_stop_words_default_none(self):
+        """Test that stop_words defaults to None."""
+        params = SamplingParams()
+        assert params.stop_words is None
+
+    def test_stop_words_can_be_set(self):
+        """Test that stop_words can be set."""
+        params = SamplingParams(stop_words=["STOP", "END"])
+        assert params.stop_words == ["STOP", "END"]
+
+    def test_stop_words_empty_list(self):
+        """Test that stop_words can be empty list."""
+        params = SamplingParams(stop_words=[])
+        assert params.stop_words == []
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py
index 1d5d054b80e..cab464af503 100644
--- a/tests/unit_tests/inference/test_wandb_logging.py
+++ b/tests/unit_tests/inference/test_wandb_logging.py
@@ -85,8 +85,7 @@ def test_get_kvcache_utilization_stats_with_requests(self):
         assert 'block_count_avail' in stats
         assert 'active_token_count' in stats
         assert 'total_request_count' in stats
-        assert 'max_total_requests' in stats
-        assert 'max_active_requests' in stats
+        assert 'max_requests' in stats
 
         # Verify values for empty context
         assert stats['allocated_blocks'] == 0
@@ -133,10 +132,8 @@ def test_get_kvcache_utilization_stats_with_requests(self):
         assert stats_after['total_blocks'] > 0
 
         # Verify that max_requests remains constant
-        assert stats_after['max_total_requests'] == stats['max_total_requests']
-        assert stats_after['max_total_requests'] > 0
-        assert stats_after['max_active_requests'] == stats['max_active_requests']
-        assert stats_after['max_active_requests'] > 0
+        assert stats_after['max_requests'] == stats['max_requests']
+        assert stats_after['max_requests'] > 0
 
         # Verify block availability decreased after allocation
         assert stats_after['block_count_avail'] < stats['block_count_avail']
@@ -180,8 +177,7 @@ def test_kvcache_utilization_stats_types(self):
             'block_count_avail',
             'active_token_count',
             'total_request_count',
-            'max_total_requests',
-            'max_active_requests',
+            'max_requests',
         ]
 
         for field in int_fields:
diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py
index 8835b07be07..0885401e7a0 100644
--- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py
+++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py
@@ -118,6 +118,8 @@ def setup_model(
                 num_layers=transformer_config.num_layers // pipeline_model_parallel_size,
                 kv_channels=transformer_config.kv_channels,
                 num_attention_heads=transformer_config.num_attention_heads,
+                tensor_model_parallel_size=transformer_config.tensor_model_parallel_size,
+                pipeline_model_parallel_size=transformer_config.pipeline_model_parallel_size,
                 max_sequence_length=2048,
                 buffer_size_gb=0.2,
                 materialize_only_last_token_logits=False,
@@ -232,13 +234,16 @@ def detokenize(self, inp, skip_special_tokens=False):
         ), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}"
 
     @pytest.mark.parametrize("backend", ["torch"])
-    def test_sample_from_dynamic_logits(self, backend):
+    @pytest.mark.parametrize("materialize_only_last_token_logits", [True, False])
+    def test_sample_from_dynamic_logits(
+        self, backend: str, materialize_only_last_token_logits: bool
+    ):
         batch_size = 12
         self.setup_model(torch.float32, batch_size=batch_size, static=False)
         self.mock_tokenizer.eod = self.vocab_size
 
         context = self.text_generation_controller.inference_wrapped_model.inference_context
-        context.materialize_only_last_token_logits = True
+        context.materialize_only_last_token_logits = materialize_only_last_token_logits
 
         # Prepare sampling params in human-readable format, to aid with test maintenance.
         sampling_test_cases: List[Tuple[SamplingParams, List[int]]] = [
@@ -258,29 +263,37 @@ def test_sample_from_dynamic_logits(self, backend):
                 rev_sampling_dict[idx] = sampling_params
 
         # Prepare metadata for sample bookkeeping.
-        request_metadata_labels = DynamicInferenceRequest.get_metadata_labels()
-        request_metadata = torch.empty(
-            (batch_size, len(request_metadata_labels)), dtype=torch.float32
-        ).cuda()
-        top_k_values = torch.Tensor([s.top_k for s in rev_sampling_dict]).cuda()
-        request_metadata[:, request_metadata_labels["top_k"]] = top_k_values
-        top_p_values = torch.Tensor([s.top_p for s in rev_sampling_dict]).cuda()
-        request_metadata[:, request_metadata_labels["top_p"]] = top_p_values
-        temp_values = torch.Tensor([s.temperature for s in rev_sampling_dict]).cuda()
-        request_metadata[:, request_metadata_labels["temperature"]] = temp_values
+        temp_values = torch.Tensor([s.temperature for s in rev_sampling_dict])
+        top_k_values = torch.Tensor([s.top_k for s in rev_sampling_dict]).to(torch.int32)
+        top_p_values = torch.Tensor([s.top_p for s in rev_sampling_dict])
+        request_metadata = {
+            "temperature": temp_values,
+            "top_k": top_k_values,
+            "top_p": top_p_values,
+        }
+        self.text_generation_controller._request_metadata = request_metadata
+        self.text_generation_controller._sampling_backend = backend
+
+        context.padded_active_token_count = batch_size
+        context.request_query_lengths = torch.ones(batch_size, dtype=torch.int32)
+        context.paused_request_count = 0
+        context.total_request_count = batch_size
 
         # Bookkeeping.
-        self.text_generation_controller._dynamic_step_sample_bookkeeping(
-            request_metadata=request_metadata
-        )
+        self.text_generation_controller._dynamic_step_sample_bookkeeping()
 
         # Sampling.
         logits = torch.arange(0, self.vocab_size).repeat(batch_size, 1).unsqueeze(0).float().cuda()
-        sampled_logits = self.text_generation_controller._dynamic_step_sample_logits(
-            logits, backend=backend
-        )
+        self.text_generation_controller._dynamic_step_sample_logits(logits)
+        sampled_logits = self.text_generation_controller._sampled_tokens_cuda[:batch_size]
         vocab_indices = torch.arange(self.vocab_size).cuda()
 
+        # Move tensors to GPU for assertion checks.
+        temp_values = temp_values.cuda()
+        top_k_values = top_k_values.cuda()
+        top_p_values = top_p_values.cuda()
+
+        # Assert correct sampled values.
         top_k_values[top_k_values == 0] = self.vocab_size
         assert torch.all(
             sampled_logits >= self.vocab_size - top_k_values
@@ -753,21 +766,15 @@ def test_dynamic_top_n_logprobs_calculation(
 
         # Prepare sampling params
         top_n = 5
-        request_metadata_labels = DynamicInferenceRequest.get_metadata_labels()
-        request_metadata = torch.empty(
-            (batch_size, len(request_metadata_labels)), dtype=torch.float32
-        ).cuda()
-
-        # Set top_n_logprobs for all requests
-        request_metadata[:, request_metadata_labels["top_n_logprobs"]] = top_n
-        request_metadata[:, request_metadata_labels["skip_prompt_log_probs"]] = float(
-            skip_prompt_log_probs
-        )
-
-        # Bookkeeping
-        self.text_generation_controller._dynamic_step_sample_bookkeeping(
-            request_metadata=request_metadata
-        )
+        request_metadata = {
+            "top_n_logprobs": torch.full((batch_size,), top_n, dtype=torch.int32).cuda(),
+            "skip_prompt_log_probs": torch.full(
+                (batch_size,), float(skip_prompt_log_probs), dtype=torch.float32
+            ).cuda(),
+        }
+        self.text_generation_controller._request_metadata = request_metadata
+        self.text_generation_controller._active_request_count = batch_size
+        self.text_generation_controller._active_request_slice = slice(0, batch_size)
 
         if materialize_only_last_token_logits:
             # Decode mode: logits for last tokens only
diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py
index 6936cfbe60a..cf3bd40ee4b 100644
--- a/tests/unit_tests/models/test_gpt_model.py
+++ b/tests/unit_tests/models/test_gpt_model.py
@@ -8,9 +8,13 @@
 import torch
 from packaging import version
 from pytest import approx
+from transformer_engine.pytorch.fp8 import check_fp8_support
 
 from megatron.core import parallel_state
 from megatron.core.hyper_comm_grid import HyperCommGrid
+from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext
+from megatron.core.inference.inference_request import DynamicInferenceRequest
+from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.models.gpt.gpt_layer_specs import (
     get_gpt_layer_with_transformer_engine_spec,
     get_mlp_module_spec,
@@ -18,8 +22,9 @@
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.module import Float16Module
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import is_te_min_version
+from megatron.core.utils import is_fa_min_version, is_te_min_version
 from tests.unit_tests.test_utilities import Utils
 
 
@@ -333,3 +338,108 @@ def test_gpt_model_with_custom_pg(self, tp_size, dp_size, cp_size):
         assert logits.shape[0] == sequence_length
         assert logits.shape[1] == micro_batch_size
         assert logits.shape[2] == self.gpt_model.config.hidden_size
+
+
+class TestGPTWithDynamicInference:
+    """Tests GPTModel with dynamic inference."""
+
+    @torch.inference_mode()
+    def setup_method(self, method):
+        fp8_available, reason_for_no_fp8 = check_fp8_support()
+        if not fp8_available:
+            pytest.skip(reason_for_no_fp8)
+
+        os.environ.pop('NVTE_FUSED_ATTN', None)
+        os.environ.pop('NVTE_FLASH_ATTN', None)
+        os.environ.pop('NVTE_UNFUSED_ATTN', None)
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+        transformer_config = TransformerConfig(
+            num_layers=8,
+            hidden_size=256,
+            num_attention_heads=8,
+            use_cpu_initialization=True,
+            params_dtype=torch.bfloat16,
+            bf16=True,
+            fp8="hybrid",
+            fp8_recipe="tensorwise",
+        )
+
+        self.gpt_model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+            vocab_size=128,
+            max_sequence_length=DynamicInferenceContext.TOKEN_ROUNDER,
+            parallel_output=True,
+        )
+        self.gpt_model = Float16Module(self.gpt_model.config, self.gpt_model)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(
+        not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
+    )
+    @torch.inference_mode()
+    def test_dynamic_inference_padding_with_fp8(self):
+        """
+        Tests that logits for padded tokens are zeroed out for fp8 inference.
+        """
+        self.gpt_model.cuda()
+        self.gpt_model.eval()
+        config = self.gpt_model.config
+
+        inference_context = DynamicInferenceContext(
+            params_dtype=config.params_dtype,
+            num_layers=config.num_layers,
+            kv_channels=config.hidden_size // config.num_attention_heads,
+            num_attention_heads=config.num_attention_heads,
+            max_sequence_length=self.gpt_model.module.max_sequence_length,
+            buffer_size_gb=1.0,
+            block_size_tokens=256,
+            materialize_only_last_token_logits=False,
+        )
+
+        # Add a request with 10 tokens. Since 10 is not a multiple of 64,
+        # this will create padding up to the padded length of 64.
+        active_token_count = 10
+        request = DynamicInferenceRequest(
+            request_id=0,
+            prompt_tokens=torch.arange(0, active_token_count, dtype=torch.long, device='cuda'),
+            sampling_params=SamplingParams(num_tokens_to_generate=1),
+        )
+        inference_context.add_request(request)
+
+        # Prepares the context, including calculating the padded token count.
+        inference_context.initialize_attention_state()
+
+        assert inference_context.active_token_count == active_token_count
+        assert inference_context.padded_active_token_count == DynamicInferenceContext.TOKEN_ROUNDER
+
+        # Prepare inputs for the forward pass.
+        padded_token_count = inference_context.padded_active_token_count
+        input_ids, position_ids = inference_context.current_input_and_position_ids()
+
+        # Run the forward pass with inference parameters.
+        logits = self.gpt_model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=None,
+            inference_context=inference_context,
+            runtime_gather_output=True,
+        )
+
+        # Verify the output shape.
+        assert logits.shape[0] == 1
+        assert logits.shape[1] == padded_token_count
+        assert logits.shape[2] == self.gpt_model.module.vocab_size
+
+        # Extract the logits corresponding to the padding tokens (from index 10 to 63).
+        padding_start_idx = inference_context.active_token_count
+        padding_end_idx = inference_context.padded_active_token_count
+        padding_logits = logits[0, padding_start_idx:padding_end_idx, :]
+
+        # Assert that all padding logits are zero.
+        assert torch.all(padding_logits == 0.0), "Logits for padding tokens are not all zero."
diff --git a/tests/unit_tests/models/test_gpt_model_batch_invariant.py b/tests/unit_tests/models/test_gpt_model_batch_invariant.py
new file mode 100644
index 00000000000..ead9125e5ec
--- /dev/null
+++ b/tests/unit_tests/models/test_gpt_model_batch_invariant.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import os
+
+import pytest
+import torch
+import torch.distributed as dist
+
+from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext
+from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
+    GPTInferenceWrapper,
+)
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
+    SimpleTextGenerationController,
+)
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import set_batch_invariant_mode
+from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.module import Float16Module
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import is_te_min_version
+from megatron.rl.rl_utils import selective_log_softmax
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from flash_attn_3.flash_attn_interface import _flash_attn_forward
+    from flash_attn_3.flash_attn_interface import (
+        flash_attn_with_kvcache as flash_attn3_with_kvcache,
+    )
+
+    HAVE_FA3 = True
+except ImportError:
+    HAVE_FA3 = False
+
+
+class DummyTokenizer:
+    def __init__(self, vocab_size: int, bos: int | None = None, eod: int = 0, pad: int = 0):
+        self.vocab_size = vocab_size
+        self.bos = bos
+        self.eod = eod
+        self.pad = pad
+
+    def tokenize(self, prompt):
+        if isinstance(prompt, str):
+            tokens = [int(tok) % self.vocab_size for tok in prompt.strip().split()]
+        else:
+            tokens = list(prompt)
+        return tokens
+
+    def detokenize(self, tokens, skip_special_tokens: bool = False):
+        if isinstance(tokens, torch.Tensor):
+            tokens = tokens.tolist()
+        if skip_special_tokens and self.eod in tokens:
+            tokens = [tok for tok in tokens if tok != self.eod]
+        return " ".join(str(tok) for tok in tokens)
+
+    def offsets(self, tokens, text):
+        if isinstance(tokens, torch.Tensor):
+            tokens = tokens.tolist()
+        offsets = []
+        cursor = 0
+        for tok in tokens:
+            offsets.append(cursor)
+            cursor += len(str(tok)) + 1
+        return offsets
+
+
+def _configure_flash_attention_env():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    os.environ['NVTE_FUSED_ATTN'] = '0'
+    os.environ['NVTE_FLASH_ATTN'] = '1'
+    os.environ['NVTE_UNFUSED_ATTN'] = '0'
+
+
+def _build_flash_attn_bik_model(seq_len: int, vocab_size: int, hidden_size: int = 128) -> GPTModel:
+    cfg = TransformerConfig(
+        num_layers=2,
+        hidden_size=hidden_size,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        normalization="RMSNorm",
+        params_dtype=torch.bfloat16,
+        attention_backend=AttnBackend.flash,
+    )
+    cfg.fp16 = False
+    cfg.bf16 = True
+    model = GPTModel(
+        config=cfg,
+        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+        vocab_size=vocab_size,
+        max_sequence_length=seq_len,
+    )
+    return model.cuda().eval()
+
+
+def _train_forward_logprobs(model: torch.nn.Module, tokens: torch.Tensor) -> torch.Tensor:
+    batch_size, seq_len = tokens.shape
+    position_ids = (
+        torch.arange(seq_len, device=tokens.device).unsqueeze(0).expand(batch_size, seq_len)
+    )
+    attention_mask = torch.ones(
+        batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=tokens.device
+    )
+    with torch.no_grad():
+        logits = model(input_ids=tokens, position_ids=position_ids, attention_mask=attention_mask)
+    logprobs = selective_log_softmax(logits[:, :-1, :], tokens[:, 1:])
+    return logprobs
+
+
+@pytest.mark.skipif(
+    not (is_te_min_version("2.10.0") and HAVE_FA3),
+    reason="TestGPTModelBatchInvariant requires TE >= 2.10.0 and FlashAttention-3",
+)
+class TestGPTModelBatchInvariant:
+    """End-to-end batch-invariance tests for GPT."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        _configure_flash_attention_env()
+        model_parallel_cuda_manual_seed(321)
+        self.sequence_length = 32
+        self.vocab_size = 96
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_forward_batch_invariant(self):
+        _configure_flash_attention_env()
+        model = _build_flash_attn_bik_model(self.sequence_length, self.vocab_size)
+        model = Float16Module(model.config, model).eval()
+        batch_size = 6
+        splits = [2, 1, 3]
+        input_ids = torch.randint(
+            low=0, high=self.vocab_size, size=(batch_size, self.sequence_length), device="cuda"
+        )
+        position_ids = (
+            torch.arange(self.sequence_length, device="cuda").unsqueeze(0).repeat(batch_size, 1)
+        )
+        attention_mask = torch.ones(
+            batch_size,
+            1,
+            self.sequence_length,
+            self.sequence_length,
+            dtype=torch.bool,
+            device="cuda",
+        )
+
+        with set_batch_invariant_mode(True):
+            with torch.no_grad():
+                logits_full = model(
+                    input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask
+                ).to(torch.float32)
+                chunked_logits = []
+                start = 0
+                for split in splits:
+                    end = start + split
+                    chunked_logits.append(
+                        model(
+                            input_ids=input_ids[start:end],
+                            position_ids=position_ids[start:end],
+                            attention_mask=attention_mask[start:end],
+                        ).to(torch.float32)
+                    )
+                    start = end
+                logits_chunked = torch.cat(chunked_logits, dim=0)
+
+        assert torch.equal(logits_full, logits_chunked)
+
+    def test_dynamic_engine_matches_batched_forward_rl(self):
+        _configure_flash_attention_env()
+        seq_len = 48
+        vocab_size = 96
+        base_model = _build_flash_attn_bik_model(seq_len, vocab_size)
+        inference_model = Float16Module(base_model.config, base_model).cuda().eval()
+
+        ctx = DynamicInferenceContext(
+            params_dtype=torch.bfloat16,
+            num_layers=base_model.config.num_layers,
+            kv_channels=base_model.config.kv_channels,
+            num_attention_heads=base_model.config.num_attention_heads,
+            max_sequence_length=seq_len,
+            buffer_size_gb=0.125,
+            block_size_tokens=16,
+            num_cuda_graphs=None,
+            materialize_only_last_token_logits=False,
+            use_cuda_graphs_for_non_decode_steps=False,
+            unified_memory_level=0,
+        )
+
+        wrapper_cfg = InferenceWrapperConfig(
+            hidden_size=base_model.config.hidden_size,
+            inference_batch_times_seqlen_threshold=-1,
+            fp32_residual_connection=False,
+            params_dtype=torch.bfloat16,
+            padded_vocab_size=vocab_size,
+            inference_max_seq_length=seq_len,
+            inference_max_requests=8,
+            nccl_all_reduce_for_prefill=False,
+        )
+        wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx)
+        tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0)
+        controller = SimpleTextGenerationController(wrapper, tokenizer)
+        engine = DynamicInferenceEngine(
+            controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123
+        )
+
+        base_vals = [3, 15, 27, 39]
+        lengths = [18, 11, 23, 13]
+        prompts = []
+        for base, length in zip(base_vals, lengths):
+            seq = [(base + i) % (vocab_size - 1) for i in range(length - 1)]
+            seq.append(tokenizer.eod)
+            prompts.append(seq)
+
+        sampling_params = SamplingParams(
+            num_tokens_to_generate=6,
+            temperature=1.0,
+            top_k=1,
+            top_p=0.0,
+            add_BOS=False,
+            return_log_probs=True,
+            termination_id=tokenizer.eod,
+        )
+
+        finished_requests = []
+        with set_batch_invariant_mode(True):
+            for request_id, prompt in enumerate(prompts, start=1):
+                engine.add_request(request_id, prompt, sampling_params)
+            while engine.has_unfinished_requests():
+                result = engine.step_modern()
+                finished_requests.extend(
+                    r.merge(engine.controller.tokenizer) for r in result["finished_request_records"]
+                )
+
+            assert finished_requests, "Dynamic engine did not produce any completed requests."
+
+            for req in finished_requests:
+                prompt_tokens = req.prompt_tokens.tolist()
+                generated_tokens = req.generated_tokens
+                full_sequence = torch.tensor(
+                    prompt_tokens + generated_tokens, dtype=torch.long, device="cuda"
+                ).unsqueeze(0)
+                baseline_log_probs = _train_forward_logprobs(
+                    inference_model, full_sequence
+                ).squeeze(0)
+                inference_log_probs = torch.tensor(
+                    req.prompt_log_probs + req.generated_log_probs,
+                    dtype=baseline_log_probs.dtype,
+                    device="cuda",
+                )
+                assert torch.equal(
+                    inference_log_probs, baseline_log_probs
+                ), "Log probabilities from dynamic engine did not match batched forward."
+
+    def test_dynamic_engine_is_batch_invariant(self):
+        """Check that the dynamic engine itself is batch invariant: changing the
+        order in which requests are added does not change per-request outputs."""
+        _configure_flash_attention_env()
+        seq_len = 48
+        vocab_size = 96
+        base_model = _build_flash_attn_bik_model(seq_len, vocab_size)
+        inference_model = Float16Module(base_model.config, base_model).cuda().eval()
+
+        def _run_engine_with_order(order):
+            ctx = DynamicInferenceContext(
+                params_dtype=torch.bfloat16,
+                num_layers=base_model.config.num_layers,
+                kv_channels=base_model.config.kv_channels,
+                num_attention_heads=base_model.config.num_attention_heads,
+                max_sequence_length=seq_len,
+                buffer_size_gb=0.125,
+                block_size_tokens=16,
+                num_cuda_graphs=None,
+                materialize_only_last_token_logits=False,
+                use_cuda_graphs_for_non_decode_steps=False,
+                unified_memory_level=0,
+            )
+
+            wrapper_cfg = InferenceWrapperConfig(
+                hidden_size=base_model.config.hidden_size,
+                inference_batch_times_seqlen_threshold=-1,
+                fp32_residual_connection=False,
+                params_dtype=torch.bfloat16,
+                padded_vocab_size=vocab_size,
+                inference_max_seq_length=seq_len,
+                inference_max_requests=8,
+                nccl_all_reduce_for_prefill=False,
+            )
+            wrapper = GPTInferenceWrapper(inference_model, wrapper_cfg, ctx)
+            tokenizer = DummyTokenizer(vocab_size=vocab_size, bos=None, eod=vocab_size - 1, pad=0)
+            controller = SimpleTextGenerationController(wrapper, tokenizer)
+            engine = DynamicInferenceEngine(
+                controller=controller, context=ctx, enable_cuda_graph=False, random_seed=123
+            )
+
+            base_vals = [3, 15, 27, 39]
+            lengths = [18, 11, 23, 13]
+            prompts = []
+            for base, length in zip(base_vals, lengths):
+                seq = [(base + i) % (vocab_size - 1) for i in range(length - 1)]
+                seq.append(tokenizer.eod)
+                prompts.append(seq)
+
+            sampling_params = SamplingParams(
+                num_tokens_to_generate=6,
+                temperature=1.0,
+                top_k=1,
+                top_p=0.0,
+                add_BOS=False,
+                return_log_probs=True,
+                termination_id=tokenizer.eod,
+            )
+
+            finished_by_id = {}
+            with set_batch_invariant_mode(True):
+                for request_id in order:
+                    engine.add_request(request_id, prompts[request_id - 1], sampling_params)
+                while engine.has_unfinished_requests():
+                    result = engine.step_modern()
+                    for r in result["finished_request_records"]:
+                        req = r.merge(engine.controller.tokenizer)
+                        finished_by_id[req.request_id] = req
+
+            return finished_by_id
+
+        # Run once with requests added in order 1,2,3,4...
+        num_requests = 4
+        order1 = list(range(1, num_requests + 1))
+        results1 = _run_engine_with_order(order1)
+
+        # Run again with the same requests but added in reverse order.
+        order2 = list(reversed(order1))
+        results2 = _run_engine_with_order(order2)
+
+        assert set(results1.keys()) == set(results2.keys())
+        for rid in results1.keys():
+            r1 = results1[rid]
+            r2 = results2[rid]
+            assert r1.prompt_tokens.tolist() == r2.prompt_tokens.tolist()
+            assert r1.generated_tokens == r2.generated_tokens
+            assert r1.prompt_log_probs == r2.prompt_log_probs
+            assert r1.generated_log_probs == r2.generated_log_probs
diff --git a/tests/unit_tests/models/test_gpt_model_quantization.py b/tests/unit_tests/models/test_gpt_model_quantization.py
index 2b7c5cc6ff8..e993c9be8d2 100644
--- a/tests/unit_tests/models/test_gpt_model_quantization.py
+++ b/tests/unit_tests/models/test_gpt_model_quantization.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+from megatron.core.enums import Fp8Recipe
 from megatron.core.models.gpt import GPTModel
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec
 from megatron.core.quantization.quant_config import MatchContext, RecipeConfig
@@ -10,15 +11,21 @@
 from tests.unit_tests.test_utilities import Utils
 
 try:
-    HAVE_TE = True
-    import transformer_engine as te
+    from megatron.core.extensions.transformer_engine import HAVE_TE
 except ImportError:
     HAVE_TE = False
 
 try:
-    import nvidia_kitchen
-
-    HAVE_KITCHEN = True
+    from megatron.core.extensions.kitchen import (
+        HAVE_KITCHEN,
+        KitchenColumnParallelGroupedLinear,
+        KitchenColumnParallelLinear,
+        KitchenDotProductAttention,
+        KitchenFlashAttention,
+        KitchenLayerNormColumnParallelLinear,
+        KitchenRowParallelGroupedLinear,
+        KitchenRowParallelLinear,
+    )
 except ImportError:
     HAVE_KITCHEN = False
 
@@ -138,6 +145,121 @@ def test_kitchen_config_resolution_dense(self) -> None:
                 assert module.kitchen_quant_params.match_input == expected_match[name][0]
         assert visited_keys == set(expected_types.keys())
 
+    def test_kitchen_config_resolution_dense_compound_params(self) -> None:
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=False,
+            gated_linear_unit=True,
+            bias_activation_fusion=True,
+            add_bias_linear=False,
+            use_kitchen=True,
+            use_kitchen_attention=True,
+            quant_recipe=RecipeConfig.from_config_dict(
+                {
+                    "matchers": {
+                        "keep_in_fp8": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*fc2",
+                            "config": "fp8_cs",
+                        },
+                        "all": {"type": "glob", "enabled": True, "pattern": "*", "config": "bf16"},
+                    },
+                    "configs": {
+                        "bf16": {
+                            "kitchen_config_type": "CompoundParams",
+                            "configs": [
+                                {"kitchen_config_type": "QLinearParams", "recipe_idx": 1},
+                                {"kitchen_config_type": "QAttentionParams", "recipe_idx": 1},
+                            ],
+                        },
+                        "fp8_cs": {"kitchen_config_type": "QLinearParams", "recipe_idx": 2},
+                    },
+                }
+            ),
+        )
+        transformer_layer_spec = get_gpt_decoder_block_spec(
+            config=transformer_config, use_transformer_engine=True
+        )
+        padded_vocab_size = 512
+        max_position_embeddings = 4096
+        model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=padded_vocab_size,
+            max_sequence_length=max_position_embeddings,
+        )
+
+        expected_types = {
+            "decoder.layers.0.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.1.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.1.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.core_attention": KitchenDotProductAttention,
+            "decoder.layers.1.self_attention.core_attention": KitchenDotProductAttention,
+        }
+
+        expected_match = {
+            "decoder.layers.0.self_attention.linear_proj": (
+                MatchContext("decoder.layers.0.self_attention.linear_proj", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.self_attention.linear_proj": (
+                MatchContext("decoder.layers.1.self_attention.linear_proj", layer_number=1),
+                "bf16",
+            ),
+            "decoder.layers.0.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.0.self_attention.linear_qkv", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.1.self_attention.linear_qkv", layer_number=1),
+                "bf16",
+            ),
+            "decoder.layers.0.mlp.linear_fc1": (
+                MatchContext("decoder.layers.0.mlp.linear_fc1", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.mlp.linear_fc1": (
+                MatchContext("decoder.layers.1.mlp.linear_fc1", layer_number=1),
+                "bf16",
+            ),
+            "decoder.layers.0.mlp.linear_fc2": (
+                MatchContext("decoder.layers.0.mlp.linear_fc2", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.mlp.linear_fc2": (
+                MatchContext("decoder.layers.1.mlp.linear_fc2", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.self_attention.core_attention": (
+                MatchContext("decoder.layers.0.self_attention.core_attention", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.self_attention.core_attention": (
+                MatchContext("decoder.layers.1.self_attention.core_attention", layer_number=1),
+                "bf16",
+            ),
+        }
+
+        visited_keys = set()
+        for name, module in model.named_modules():
+            if name in expected_types:
+                assert (
+                    type(module) == expected_types[name]
+                ), f"Expected {name} to be {expected_types[name]}, but it is {type(module)}"
+                visited_keys.add(name)
+                assert hasattr(module, "kitchen_quant_params")
+                assert module.kitchen_quant_params.params_config_key == expected_match[name][1]
+                assert module.kitchen_quant_params.match_input == expected_match[name][0]
+        assert visited_keys == set(expected_types.keys())
+
     def test_kitchen_config_resolution_moe(self) -> None:
         transformer_config = TransformerConfig(
             moe_layer_freq=1,
@@ -246,3 +368,339 @@ def test_kitchen_config_resolution_moe(self) -> None:
                 assert module.kitchen_quant_params.params_config_key == expected_match[name][1]
                 assert module.kitchen_quant_params.match_input == expected_match[name][0]
         assert visited_keys == set(expected_types.keys())
+
+    def test_kitchen_flash_attention_config_resolution(self) -> None:
+        """Test GPT model with KitchenFlashAttention configuration."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=False,
+            gated_linear_unit=True,
+            bias_activation_fusion=True,
+            add_bias_linear=False,
+            use_kitchen=True,
+            use_kitchen_attention=True,
+            kitchen_attention_backend="fa",
+            attention_dropout=0.0,
+            quant_recipe=RecipeConfig.from_config_dict(
+                {
+                    "matchers": {
+                        "attention": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*self_attention.core_attention",
+                            "config": "fa_bf16",
+                        },
+                        "keep_in_hp": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*fc2",
+                            "config": "bf16",
+                        },
+                        "use_fp8_cs": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*",
+                            "config": "fp8_cs",
+                        },
+                    },
+                    "configs": {
+                        "bf16": {"kitchen_config_type": "QLinearParams", "recipe_idx": 1},
+                        "fp8_cs": {"kitchen_config_type": "QLinearParams", "recipe_idx": 2},
+                        "fa_bf16": {
+                            "kitchen_config_type": "QFlashAttentionParams",
+                            "recipe_name": "triton_fa_bf16_for_all_base_2",
+                        },
+                    },
+                }
+            ),
+        )
+        transformer_layer_spec = get_gpt_decoder_block_spec(
+            config=transformer_config, use_transformer_engine=True
+        )
+        padded_vocab_size = 512
+        max_position_embeddings = 4096
+        model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=padded_vocab_size,
+            max_sequence_length=max_position_embeddings,
+        )
+
+        expected_types = {
+            "decoder.layers.0.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.1.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.1.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.core_attention": KitchenFlashAttention,
+            "decoder.layers.1.self_attention.core_attention": KitchenFlashAttention,
+        }
+
+        expected_match = {
+            "decoder.layers.0.self_attention.linear_proj": (
+                MatchContext("decoder.layers.0.self_attention.linear_proj", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.self_attention.linear_proj": (
+                MatchContext("decoder.layers.1.self_attention.linear_proj", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.0.self_attention.linear_qkv", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.1.self_attention.linear_qkv", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.mlp.linear_fc1": (
+                MatchContext("decoder.layers.0.mlp.linear_fc1", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.mlp.linear_fc1": (
+                MatchContext("decoder.layers.1.mlp.linear_fc1", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.mlp.linear_fc2": (
+                MatchContext("decoder.layers.0.mlp.linear_fc2", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.mlp.linear_fc2": (
+                MatchContext("decoder.layers.1.mlp.linear_fc2", layer_number=1),
+                "bf16",
+            ),
+            "decoder.layers.0.self_attention.core_attention": (
+                MatchContext("decoder.layers.0.self_attention.core_attention", layer_number=0),
+                "fa_bf16",
+            ),
+            "decoder.layers.1.self_attention.core_attention": (
+                MatchContext("decoder.layers.1.self_attention.core_attention", layer_number=1),
+                "fa_bf16",
+            ),
+        }
+
+        visited_keys = set()
+        for name, module in model.named_modules():
+            if name in expected_types:
+                assert (
+                    type(module) == expected_types[name]
+                ), f"Expected {name} to be {expected_types[name]}, but it is {type(module)}"
+                visited_keys.add(name)
+                assert hasattr(module, "kitchen_quant_params")
+                assert module.kitchen_quant_params.params_config_key == expected_match[name][1]
+                assert module.kitchen_quant_params.match_input == expected_match[name][0]
+        assert visited_keys == set(expected_types.keys())
+
+    def test_kitchen_flash_attention_with_compound_params(self) -> None:
+        """Test GPT model with KitchenFlashAttention using CompoundParams configuration."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=False,
+            gated_linear_unit=True,
+            bias_activation_fusion=True,
+            add_bias_linear=False,
+            use_kitchen=True,
+            use_kitchen_attention=True,
+            kitchen_attention_backend="fa",
+            attention_dropout=0.0,
+            quant_recipe=RecipeConfig.from_config_dict(
+                {
+                    "matchers": {
+                        "all": {"type": "glob", "enabled": True, "pattern": "*", "config": "mixed"}
+                    },
+                    "configs": {
+                        "mixed": {
+                            "kitchen_config_type": "CompoundParams",
+                            "configs": [
+                                {"kitchen_config_type": "QLinearParams", "recipe_idx": 2},
+                                {
+                                    "kitchen_config_type": "QFlashAttentionParams",
+                                    "recipe_name": "triton_fa_bf16_for_all_natural",
+                                },
+                            ],
+                        }
+                    },
+                }
+            ),
+        )
+        transformer_layer_spec = get_gpt_decoder_block_spec(
+            config=transformer_config, use_transformer_engine=True
+        )
+        padded_vocab_size = 512
+        max_position_embeddings = 4096
+        model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=padded_vocab_size,
+            max_sequence_length=max_position_embeddings,
+        )
+
+        expected_types = {
+            "decoder.layers.0.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.1.self_attention.linear_proj": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.self_attention.linear_qkv": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.1.mlp.linear_fc1": KitchenLayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.1.mlp.linear_fc2": KitchenRowParallelLinear,
+            "decoder.layers.0.self_attention.core_attention": KitchenFlashAttention,
+            "decoder.layers.1.self_attention.core_attention": KitchenFlashAttention,
+        }
+
+        expected_config_key = "mixed"
+
+        visited_keys = set()
+        for name, module in model.named_modules():
+            if name in expected_types:
+                assert (
+                    type(module) == expected_types[name]
+                ), f"Expected {name} to be {expected_types[name]}, but it is {type(module)}"
+                visited_keys.add(name)
+                assert hasattr(module, "kitchen_quant_params")
+                assert module.kitchen_quant_params.params_config_key == expected_config_key
+        assert visited_keys == set(expected_types.keys())
+
+
+@pytest.mark.skipif(
+    not HAVE_TE, reason="Transformer Engine required for using TE backend with per-module quant."
+)
+class TestGPTModelTEQuantizationConfig:
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_te_config_resolution_dense(self) -> None:
+        from megatron.core.extensions.transformer_engine import (
+            TELayerNormColumnParallelLinear,
+            TERowParallelLinear,
+        )
+
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=False,
+            gated_linear_unit=True,
+            bias_activation_fusion=True,
+            add_bias_linear=False,
+            quant_recipe=RecipeConfig.from_config_dict(
+                {
+                    "matchers": {
+                        "force_in_hp": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*fc2",
+                            "config": "bf16",
+                        },
+                        "use_fp8_cs": {
+                            "type": "glob",
+                            "enabled": True,
+                            "pattern": "*",
+                            "config": "fp8_cs",
+                        },
+                    },
+                    "configs": {
+                        "bf16": {
+                            "transformer_engine_config_type": "TEQuantizationParams",
+                            "training_recipe": {},
+                        },
+                        "fp8_cs": {
+                            "transformer_engine_config_type": "TEQuantizationParams",
+                            "training_recipe": {"fp8_quantization_recipe": "tensorwise"},
+                        },
+                    },
+                }
+            ),
+        )
+        transformer_layer_spec = get_gpt_decoder_block_spec(
+            config=transformer_config, use_transformer_engine=True
+        )
+        padded_vocab_size = 512
+        max_position_embeddings = 4096
+        model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=padded_vocab_size,
+            max_sequence_length=max_position_embeddings,
+        )
+
+        expected_types = {
+            "decoder.layers.0.self_attention.linear_proj": TERowParallelLinear,
+            "decoder.layers.1.self_attention.linear_proj": TERowParallelLinear,
+            "decoder.layers.0.self_attention.linear_qkv": TELayerNormColumnParallelLinear,
+            "decoder.layers.1.self_attention.linear_qkv": TELayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc1": TELayerNormColumnParallelLinear,
+            "decoder.layers.1.mlp.linear_fc1": TELayerNormColumnParallelLinear,
+            "decoder.layers.0.mlp.linear_fc2": TERowParallelLinear,
+            "decoder.layers.1.mlp.linear_fc2": TERowParallelLinear,
+        }
+
+        expected_match = {
+            "decoder.layers.0.self_attention.linear_proj": (
+                MatchContext("decoder.layers.0.self_attention.linear_proj", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.self_attention.linear_proj": (
+                MatchContext("decoder.layers.1.self_attention.linear_proj", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.0.self_attention.linear_qkv", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.self_attention.linear_qkv": (
+                MatchContext("decoder.layers.1.self_attention.linear_qkv", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.mlp.linear_fc1": (
+                MatchContext("decoder.layers.0.mlp.linear_fc1", layer_number=0),
+                "fp8_cs",
+            ),
+            "decoder.layers.1.mlp.linear_fc1": (
+                MatchContext("decoder.layers.1.mlp.linear_fc1", layer_number=1),
+                "fp8_cs",
+            ),
+            "decoder.layers.0.mlp.linear_fc2": (
+                MatchContext("decoder.layers.0.mlp.linear_fc2", layer_number=0),
+                "bf16",
+            ),
+            "decoder.layers.1.mlp.linear_fc2": (
+                MatchContext("decoder.layers.1.mlp.linear_fc2", layer_number=1),
+                "bf16",
+            ),
+        }
+
+        visited_keys = set()
+        for name, module in model.named_modules():
+            if name in expected_types:
+                assert (
+                    type(module) == expected_types[name]
+                ), f"Expected {name} to be {expected_types[name]}, but it is {type(module)}"
+                visited_keys.add(name)
+                assert hasattr(module, "te_quant_params")
+                config_expected = expected_match[name][1]
+                if config_expected == "bf16":
+                    assert module.te_quant_params.training_recipe.fp8_quantization_recipe is None
+                    assert module.te_quant_params.training_recipe.fp4_quantization_recipe is None
+                    assert not module.te_quant_params.training_recipe.override_nonquantized_autocast
+                    assert module.te_quant_params.training_recipe.override_quantized_autocast
+                    assert module.te_quant_params.evaluation_recipe is None
+                else:  # fp8_cs
+                    assert (
+                        module.te_quant_params.training_recipe.fp8_quantization_recipe
+                        == Fp8Recipe.tensorwise
+                    )
+                    assert module.te_quant_params.training_recipe.fp4_quantization_recipe is None
+                    assert module.te_quant_params.evaluation_recipe is None
+        assert visited_keys == set(expected_types.keys())
diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py
index ca42ae496be..9eb7b2dea9a 100644
--- a/tests/unit_tests/models/test_mamba_model.py
+++ b/tests/unit_tests/models/test_mamba_model.py
@@ -1,18 +1,32 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
+import os
 from datetime import timedelta
+from itertools import accumulate
 
 import pytest
 import torch
+from transformer_engine.pytorch.fp8 import check_fp8_support
 
 from megatron.core import parallel_state
 from megatron.core.hyper_comm_grid import HyperCommGrid
 from megatron.core.inference.contexts import BaseInferenceContext, StaticInferenceContext
+from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext
+from megatron.core.inference.inference_request import DynamicInferenceRequest
+from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
 from megatron.core.models.mamba.mamba_model import MambaModel
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer import TransformerConfig
-from megatron.core.utils import divide, is_torch_min_version
+from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.module import Float16Module
+from megatron.core.utils import (
+    divide,
+    get_mamba_inference_state_config_from_model,
+    is_fa_min_version,
+    is_torch_min_version,
+)
 from tests.unit_tests.test_utilities import Utils
 
 
@@ -62,7 +76,6 @@ def test_set_input_tensor(self):
         assert self.model.decoder.input_tensor.shape[2] == config.hidden_size
 
     def test_forward(self):
-        config: TransformerConfig = self.model.config
         sequence_length = self.model.max_sequence_length
         micro_batch_size = 2
 
@@ -83,8 +96,70 @@ def test_forward(self):
         assert logits.shape[1] == sequence_length
         assert logits.shape[2] == self.model.vocab_size
 
+    def test_forward_packed_sequence(self):
+        os.environ.pop('NVTE_FUSED_ATTN', None)
+        os.environ.pop('NVTE_FLASH_ATTN', None)
+        os.environ.pop('NVTE_UNFUSED_ATTN', None)
+        model_config = TransformerConfig(
+            num_layers=3,  # 1 Mamba layer, 1 attention layer, 1 MLP layer
+            hidden_size=256,  # The Mamba layer places several constraints on this
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            bf16=True,  # Needed for backend=flash
+            params_dtype=torch.bfloat16,  # Needed for backend=flash
+            attention_backend=AttnBackend.flash,  # Needed for packed sequence
+        )
+        vocab_size = 100
+        model = MambaModel(
+            config=model_config,
+            mamba_stack_spec=mamba_stack_spec,
+            vocab_size=vocab_size,
+            max_sequence_length=12,
+            hybrid_attention_ratio=0.3,
+            hybrid_mlp_ratio=0.3,
+        )
+
+        sequence_length = model.max_sequence_length
+        micro_batch_size = 1  # must be 1 for packed sequence
+
+        model.cuda()
+
+        data = [i % vocab_size for i in range(sequence_length)]
+        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        lengths = [4, 3, 5]
+        assert sum(lengths) == sequence_length
+        positions = [i for n in lengths for i in range(n)]
+        position_ids = (
+            torch.tensor(positions, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        )
+        attention_mask = None
+
+        cumsum = [0] + list(accumulate(lengths))
+        cu_seqlens = torch.tensor(cumsum, dtype=torch.int32).cuda()
+        max_seqlen = max(lengths)
+
+        packed_seq_params = PackedSeqParams(
+            qkv_format="thd",
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_kv=cu_seqlens,
+            cu_seqlens_q_padded=None,
+            cu_seqlens_kv_padded=None,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_kv=max_seqlen,
+        )
+
+        logits = model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            packed_seq_params=packed_seq_params,
+        )
+
+        assert logits.shape[0] == micro_batch_size
+        assert logits.shape[1] == sequence_length
+        assert logits.shape[2] == model.vocab_size
+
     def test_inference(self):
-        config: TransformerConfig = self.model.config
         micro_batch_size = 2
         inference_context: BaseInferenceContext = StaticInferenceContext(
             max_batch_size=micro_batch_size, max_sequence_length=self.model.max_sequence_length
@@ -218,3 +293,111 @@ def test_with_custom_process_groups(self, tmp_path, tp_size, cp_size, pp_size):
         assert logits.shape[0] == micro_batch_size
         assert logits.shape[1] == sequence_length
         assert logits.shape[2] == divide(model.vocab_size, tp_size)
+
+
+class TestMambaWithDynamicInference:
+    """Tests MambaModel with dynamic inference."""
+
+    @torch.inference_mode()
+    def setup_method(self, method):
+        fp8_available, reason_for_no_fp8 = check_fp8_support()
+        if not fp8_available:
+            pytest.skip(reason_for_no_fp8)
+
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+        model_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=512,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            params_dtype=torch.bfloat16,
+            bf16=True,
+            fp8="hybrid",
+            fp8_recipe="tensorwise",
+        )
+
+        self.model = MambaModel(
+            config=model_config,
+            mamba_stack_spec=mamba_stack_spec,
+            vocab_size=128,
+            max_sequence_length=DynamicInferenceContext.TOKEN_ROUNDER,
+            hybrid_attention_ratio=0.5,
+            hybrid_mlp_ratio=0.0,
+        )
+        self.model = Float16Module(self.model.config, self.model)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(
+        not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
+    )
+    @torch.inference_mode()
+    def test_dynamic_inference_padding_with_fp8(self):
+        """
+        Tests that logits for padded tokens are zeroed out for fp8 inference.
+        """
+        self.model.cuda()
+        self.model.eval()
+        config = self.model.config
+
+        mamba_inference_state_config = get_mamba_inference_state_config_from_model(
+            self.model.module
+        )
+
+        inference_context = DynamicInferenceContext(
+            params_dtype=config.params_dtype,
+            num_layers=config.num_layers,
+            kv_channels=config.hidden_size // config.num_attention_heads,
+            num_attention_heads=config.num_attention_heads,
+            max_sequence_length=self.model.module.max_sequence_length,
+            buffer_size_gb=1.0,
+            block_size_tokens=256,
+            materialize_only_last_token_logits=False,
+            mamba_inference_state_config=mamba_inference_state_config,
+        )
+
+        # Add a request with 10 tokens. Since 10 is not a multiple of 64 (TOKEN_ROUNDER),
+        # this will create padding up to the padded length of 64.
+        active_token_count = 10
+        request = DynamicInferenceRequest(
+            request_id=0,
+            prompt_tokens=torch.arange(0, active_token_count, dtype=torch.long, device='cuda'),
+            sampling_params=SamplingParams(num_tokens_to_generate=1),
+        )
+        inference_context.add_request(request)
+
+        # Prepares the context, including calculating the padded token count.
+        inference_context.initialize_attention_state()
+
+        assert inference_context.active_token_count == active_token_count
+        assert inference_context.padded_active_token_count == DynamicInferenceContext.TOKEN_ROUNDER
+
+        # Prepare inputs for the forward pass.
+        padded_token_count = inference_context.padded_active_token_count
+        input_ids, position_ids = inference_context.current_input_and_position_ids()
+
+        # Run the forward pass with inference parameters.
+        logits = self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=None,
+            inference_context=inference_context,
+            runtime_gather_output=True,
+        )
+
+        # Verify the output shape.
+        assert logits.shape[0] == 1
+        assert logits.shape[1] == padded_token_count
+        assert logits.shape[2] == self.model.module.vocab_size
+
+        # Extract the logits corresponding to the padding tokens (from index 10 to 63).
+        padding_start_idx = inference_context.active_token_count
+        padding_end_idx = inference_context.padded_active_token_count
+        padding_logits = logits[0, padding_start_idx:padding_end_idx, :]
+
+        # Assert that all padding logits are zero.
+        assert torch.all(padding_logits == 0.0), "Logits for padding tokens are not all zero."
diff --git a/tests/unit_tests/models/test_mamba_moe_model.py b/tests/unit_tests/models/test_mamba_moe_model.py
new file mode 100644
index 00000000000..e316b3f45e5
--- /dev/null
+++ b/tests/unit_tests/models/test_mamba_moe_model.py
@@ -0,0 +1,569 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import hashlib
+import inspect
+import json
+import os
+import sys
+from typing import Any, Dict, Mapping, Tuple
+
+import pytest  # type: ignore[import]
+import torch
+
+from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
+from megatron.core.models.mamba.mamba_model import MambaModel
+from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.enums import AttnBackend
+from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args
+from megatron.training.global_vars import (
+    destroy_global_vars,
+    get_args,
+    set_args,
+    set_global_variables,
+)
+from tests.unit_tests.test_utilities import Utils
+
+GOLDEN_CONFIG: Dict[str, Any] = {
+    "_cpu_offloading_context": None,
+    "account_for_embedding_in_pipeline_split": False,
+    "account_for_loss_in_pipeline_split": False,
+    "activation_func": "megatron.core.activations.squared_relu",
+    "activation_func_clamp_value": None,
+    "activation_func_fp8_input_store": False,
+    "add_bias_linear": False,
+    "add_qkv_bias": False,
+    "apply_query_key_layer_scaling": False,
+    "apply_residual_connection_post_layernorm": False,
+    "apply_rope_fusion": False,
+    "async_tensor_model_parallel_allreduce": True,
+    "attention_backend": {
+        "__objclass__": "megatron.core.transformer.enums.AttnBackend",
+        "_name_": "flash",
+        "_sort_order_": 0,
+        "_value_": 1,
+    },
+    "attention_dropout": 0.0,
+    "attention_output_gate": False,
+    "attention_softmax_in_fp32": False,
+    "autocast_dtype": "torch.bfloat16",
+    "barrier_with_L1_time": True,
+    "batch_invariant_mode": False,
+    "batch_p2p_comm": True,
+    "batch_p2p_sync": True,
+    "bf16": True,
+    "bias_activation_fusion": False,
+    "bias_dropout_fusion": True,
+    "calculate_per_token_loss": False,
+    "clone_scatter_output_in_embedding": True,
+    "config_logger_dir": "",
+    "context_parallel_size": 1,
+    "cp_comm_type": "p2p",
+    "cpu_offloading": False,
+    "cpu_offloading_activations": True,
+    "cpu_offloading_double_buffering": False,
+    "cpu_offloading_num_layers": 0,
+    "cpu_offloading_weights": False,
+    "cross_entropy_fusion_impl": "native",
+    "cross_entropy_loss_fusion": True,
+    "cuda_graph_impl": "none",
+    "cuda_graph_retain_backward_graph": False,
+    "cuda_graph_scope": [],
+    "cuda_graph_use_single_mempool": False,
+    "cuda_graph_warmup_steps": 3,
+    "deallocate_pipeline_outputs": True,
+    "defer_embedding_wgrad_compute": False,
+    "delay_wgrad_compute": False,
+    "deterministic_mode": False,
+    "disable_bf16_reduced_precision_matmul": False,
+    "disable_parameter_transpose_cache": False,
+    "distribute_saved_activations": None,
+    "dsa_indexer_head_dim": None,
+    "dsa_indexer_loss_coeff": None,
+    "dsa_indexer_n_heads": None,
+    "dsa_indexer_topk": None,
+    "dsa_indexer_use_sparse_loss": False,
+    "embedding_init_method": {},
+    "embedding_init_method_std": 0.014,
+    "enable_autocast": False,
+    "enable_cuda_graph": False,
+    "ep_overlap_early_attn_memory_release": False,
+    "experimental_attention_variant": None,
+    "expert_model_parallel_size": 4,
+    "expert_tensor_parallel_size": 1,
+    "external_cuda_graph": False,
+    "ffn_hidden_size": 1856,
+    "finalize_model_grads_func": None,
+    "first_last_layers_bf16": False,
+    "flash_decode": False,
+    "fp16": False,
+    "fp32_residual_connection": False,
+    "fp4": None,
+    "fp4_param": False,
+    "fp4_quantizer_factory": None,
+    "fp4_recipe": "nvfp4",
+    "fp8": None,
+    "fp8_amax_compute_algo": "most_recent",
+    "fp8_amax_history_len": 1,
+    "fp8_dot_product_attention": False,
+    "fp8_interval": 1,
+    "fp8_margin": 0,
+    "fp8_multi_head_attention": False,
+    "fp8_param": False,
+    "fp8_quantizer_factory": None,
+    "fp8_recipe": "delayed",
+    "fp8_wgrad": True,
+    "fused_single_qkv_rope": False,
+    "gated_linear_unit": False,
+    "glu_linear_offset": 0.0,
+    "grad_scale_func": None,
+    "grad_sync_func": None,
+    "gradient_accumulation_fusion": True,
+    "hetereogenous_dist_checkpoint": False,
+    "heterogeneous_block_specs": False,
+    "hidden_dropout": 0.0,
+    "hidden_size": 2688,
+    "hierarchical_context_parallel_sizes": None,
+    "inference_fuse_tp_communication": False,
+    "inference_rng_tracker": False,
+    "inference_sampling_seed": 42,
+    "init_method": {},
+    "init_method_std": 0.014,
+    "init_model_with_meta_device": False,
+    "is_hybrid_model": True,
+    "kitchen_attention_backend": "sdpa",
+    "kv_channels": 128,
+    "layernorm_epsilon": 1e-05,
+    "layernorm_zero_centered_gamma": False,
+    "linear_attention_freq": None,
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 32,
+    "linear_value_head_dim": 128,
+    "log_max_attention_logit": False,
+    "mamba_head_dim": 64,
+    "mamba_num_groups": 8,
+    "mamba_num_heads": 64,
+    "mamba_state_dim": 128,
+    "masked_softmax_fusion": True,
+    "memory_efficient_layer_norm": False,
+    "microbatch_group_size_per_vp_stage": 1,
+    "mlp_chunks_for_prefill": 1,
+    "moe_apply_probs_on_input": False,
+    "moe_aux_loss_coeff": 0.0,
+    "moe_deepep_num_sms": 20,
+    "moe_enable_deepep": False,
+    "moe_expert_capacity_factor": None,
+    "moe_extended_tp": False,
+    "moe_ffn_hidden_size": 1856,
+    "moe_flex_dispatcher_backend": "deepep",
+    "moe_grouped_gemm": True,
+    "moe_hybridep_num_sms": 16,
+    "moe_input_jitter_eps": None,
+    "moe_latent_size": None,
+    "moe_layer_freq": 1,
+    "moe_layer_recompute": False,
+    "moe_pad_expert_input_to_capacity": False,
+    "moe_per_layer_logging": False,
+    "moe_permute_fusion": False,
+    "moe_router_bias_update_rate": 0.001,
+    "moe_router_dtype": "fp64",
+    "moe_router_enable_expert_bias": True,
+    "moe_router_force_load_balancing": False,
+    "moe_router_fusion": False,
+    "moe_router_group_topk": None,
+    "moe_router_load_balancing_type": "aux_loss",
+    "moe_router_num_groups": None,
+    "moe_router_padding_for_fp8": False,
+    "moe_router_padding_for_quantization": False,
+    "moe_router_pre_softmax": False,
+    "moe_router_score_function": "sigmoid",
+    "moe_router_topk": 6,
+    "moe_router_topk_limited_devices": None,
+    "moe_router_topk_scaling_factor": 2.5,
+    "moe_shared_expert_gate": False,
+    "moe_shared_expert_intermediate_size": 3712,
+    "moe_shared_expert_overlap": False,
+    "moe_token_dispatcher_type": "alltoall",
+    "moe_token_drop_policy": "probs",
+    "moe_token_dropping": False,
+    "moe_use_legacy_grouped_gemm": False,
+    "moe_z_loss_coeff": None,
+    "moe_enable_routing_replay": False,
+    "mrope_section": None,
+    "mtp_loss_scaling_factor": 0.1,
+    "mtp_num_layers": None,
+    "mtp_standalone": False,
+    "multi_latent_attention": False,
+    "no_rope_freq": None,
+    "no_sync_func": None,
+    "normalization": "RMSNorm",
+    "num_attention_heads": 32,
+    "num_layers": 52,
+    "num_layers_at_end_in_bf16": 1,
+    "num_layers_at_start_in_bf16": 1,
+    "num_layers_in_first_pipeline_stage": None,
+    "num_layers_in_last_pipeline_stage": None,
+    "num_microbatches_with_partial_activation_checkpoints": None,
+    "num_moe_experts": 128,
+    "num_query_groups": 2,
+    "output_layer_init_method": {},
+    "overlap_moe_expert_parallel_comm": False,
+    "overlap_p2p_comm": False,
+    "overlap_p2p_comm_warmup_flush": False,
+    "param_sync_func": None,
+    "params_dtype": "torch.bfloat16",
+    "perform_initialization": True,
+    "persist_layer_norm": True,
+    "pipeline_dtype": "torch.bfloat16",
+    "pipeline_model_parallel_comm_backend": None,
+    "pipeline_model_parallel_layout": None,
+    "pipeline_model_parallel_size": 1,
+    "qk_clip": False,
+    "qk_clip_alpha": 0.5,
+    "qk_clip_threshold": 100,
+    "qk_l2_norm": False,
+    "qk_layernorm": False,
+    "quant_recipe": None,
+    "recompute_granularity": None,
+    "recompute_method": None,
+    "recompute_modules": ["core_attn"],
+    "recompute_num_layers": None,
+    "rotary_interleaved": False,
+    "sequence_parallel": True,
+    "softmax_scale": None,
+    "softmax_type": "vanilla",
+    "symmetric_ar_type": None,
+    "tensor_model_parallel_size": 2,
+    "test_mode": False,
+    "timers": None,
+    "tp_comm_atomic_ag": False,
+    "tp_comm_atomic_rs": False,
+    "tp_comm_bootstrap_backend": "nccl",
+    "tp_comm_bulk_dgrad": True,
+    "tp_comm_bulk_wgrad": True,
+    "tp_comm_overlap": False,
+    "tp_comm_overlap_ag": True,
+    "tp_comm_overlap_disable_fc1": False,
+    "tp_comm_overlap_disable_qkv": False,
+    "tp_comm_overlap_rs": True,
+    "tp_comm_overlap_rs_dgrad": False,
+    "tp_comm_split_ag": True,
+    "tp_comm_split_rs": True,
+    "tp_only_amax_red": False,
+    "transformer_impl": "transformer_engine",
+    "use_cpu_initialization": None,
+    "use_fused_weighted_squared_relu": False,
+    "use_inference_optimized_layers": False,
+    "use_kitchen": False,
+    "use_kitchen_attention": False,
+    "use_mamba_mem_eff_path": True,
+    "use_ring_exchange_p2p": False,
+    "use_te_activation_func": False,
+    "use_te_rng_tracker": False,
+    "variable_seq_lengths": False,
+    "virtual_pipeline_model_parallel_size": None,
+    "wgrad_deferral_limit": 0,
+    "window_attn_skip_freq": None,
+    "window_size": None,
+    "fine_grained_activation_offloading": False,
+    "min_offloaded_tensor_size": 1024 * 1024,
+    "offload_modules": [],
+    "hybrid_context_parallel": False,
+    "max_seqlen_per_dp_cp_rank": None,
+}
+# Fields to ignore entirely (ephemeral, environment-specific, very large).
+SKIP_FIELDS = set()
+# Fields that are allowed to appear in the live config even if not yet in the golden.
+ALLOW_ADDED_FIELDS = set()
+
+
+def serialize_config(cfg: Any) -> Dict[str, Any]:
+    """Normalize a config object into a JSON-serializable dict."""
+    data = {k: v for k, v in vars(cfg).items() if k not in SKIP_FIELDS}
+    return _ser(data)
+
+
+def assert_config_matches_golden(cfg: Any) -> None:
+    """Compare live config to golden snapshot with readable diffs."""
+    current = serialize_config(cfg)
+    golden = GOLDEN_CONFIG
+
+    added, removed, changed = _diff_configs(golden, current)
+
+    # Ignore added fields that are explicitly allowed.
+    added = [k for k in added if k not in ALLOW_ADDED_FIELDS]
+
+    if added or removed or changed:
+        # Build actionable guidance for each type of drift
+        guidance_parts = []
+
+        if added:
+            guidance_parts.append(
+                f"\n\n[ADDED ARGS]: {sorted(added)}\n"
+                "  → Update GOLDEN_CONFIG in this test file to include the new arg(s) with "
+                "their default value(s).\n"
+                "  ⚠️  CAUTION: Review any logic associated with new args to ensure it doesn't "
+                "silently affect downstream model configs or behavior.\n"
+            )
+
+        if changed:
+            guidance_parts.append(
+                f"\n\n[CHANGED DEFAULTS]: {sorted(changed)}\n"
+                "  → Please don't change the default values of existing args unless "
+                "it is absolutely necessary for a bug fix.\n"
+                "  → If you must change the default value, please update the GOLDEN_CONFIG "
+                "in this test file to reflect the new default value.\n"
+            )
+
+        if removed:
+            guidance_parts.append(
+                f"\n\n[REMOVED ARGS]: {sorted(removed)}\n"
+                "  → Do NOT remove args directly. Instead, deprecate them with a warning message "
+                "to maintain backwards compatibility.\n"
+            )
+
+        guidance_parts.append(
+            "Please contact NV-username @jbarker if you are unsure how to proceed.\n"
+        )
+
+        header = "Mamba MoE config drift detected!\n" "═" * 60 + "".join(guidance_parts)
+        parts = [header]
+        if changed:
+            formatted = {k: {"expected": golden[k], "actual": current[k]} for k in sorted(changed)}
+            parts.append(
+                f"Changed field details:\n{json.dumps(formatted, indent=2, sort_keys=True)}"
+            )
+        pytest.fail("\n".join(parts))
+
+
+def regenerate_mamba_moe_golden(cfg: Any) -> Dict[str, Any]:
+    """Helper to regenerate the golden config; copy/paste into GOLDEN_CONFIG."""
+    serialized = serialize_config(cfg)
+    return serialized
+
+
+def _ser(obj: Any) -> Any:
+    """Recursively convert objects to JSON-friendly structures."""
+    if obj is None or isinstance(obj, (bool, int, float, str)):
+        return obj
+    if isinstance(obj, dict):
+        return {k: _ser(v) for k, v in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [_ser(v) for v in obj]
+    if inspect.isfunction(obj) or inspect.ismethod(obj):
+        return f"{obj.__module__}.{obj.__name__}"
+    if inspect.isclass(obj):
+        return f"{obj.__module__}.{obj.__name__}"
+    if hasattr(obj, "__dict__"):
+        return {k: _ser(v) for k, v in vars(obj).items()}
+    try:
+        return str(obj)
+    except Exception:
+        return f"<unserializable:{type(obj).__name__}>"
+
+
+def _diff_configs(expected: Mapping[str, Any], actual: Mapping[str, Any]) -> Tuple[set, set, set]:
+    """Return added, removed, and changed top-level keys between dicts."""
+    expected_keys = set(expected)
+    actual_keys = set(actual)
+    added = actual_keys - expected_keys
+    removed = expected_keys - actual_keys
+    changed = {k for k in expected_keys & actual_keys if expected[k] != actual[k]}
+    return added, removed, changed
+
+
+class TestMambaMoEModel:
+    """Test the initialization and use of an MoE Mamba model."""
+
+    def create_test_args(self):
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+
+        sys.argv = ['test_mamba_moe_model.py']
+        args = parse_args()
+
+        # The following args would be set from the nano v3 checkpoint.
+        args.num_layers = 52
+        args.hidden_size = 2688
+        args.ffn_hidden_size = 1856
+        args.num_attention_heads = 32
+        args.num_query_groups = 2
+        args.group_query_attention = True
+        args.kv_channels = 128
+        args.position_embedding_type = 'none'
+        args.add_position_embedding = True
+        args.use_rotary_position_embeddings = False
+        args.rotary_base = 10000
+        args.rotary_percent = 1.0
+        args.rotary_interleaved = False
+        args.add_bias_linear = False
+        args.add_qkv_bias = False
+        args.squared_relu = True
+        args.swiglu = False
+        args.untie_embeddings_and_output_weights = True
+        args.apply_layernorm_1p = False
+        args.normalization = "RMSNorm"
+        args.apply_query_key_layer_scaling = False
+        args.attention_dropout = 0.0
+        args.hidden_dropout = 0.0
+        args.hybrid_override_pattern = "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME"
+        args.spec = ["megatron.core.models.mamba.mamba_layer_specs", "mamba_stack_spec"]
+        args.hybrid_attention_ratio = 0.0
+        args.hybrid_mlp_ratio = 0.0
+        args.num_experts = 128
+        args.moe_layer_freq = 1
+        args.moe_ffn_hidden_size = 1856
+        args.moe_router_topk = 6
+        args.moe_router_pre_softmax = False
+        args.moe_grouped_gemm = True
+        args.moe_shared_expert_intermediate_size = 3712
+        args.moe_router_score_function = "sigmoid"
+        args.moe_router_enable_expert_bias = True
+        args.moe_router_topk_scaling_factor = 2.5
+        args.mamba_state_dim = 128
+        args.mamba_head_dim = 64
+        args.mamba_num_groups = 8
+        args.mamba_num_heads = 64
+        args.is_hybrid_model = True
+        args.tokenizer_type = "TikTokenizer"
+        args.tiktoken_pattern = "v2"
+        args.tokenizer_model = "/mnt/artifacts/model/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json"
+        args.padded_vocab_size = 131072
+
+        # The following args would be set in the user's nano v3 config.
+        args.async_tensor_model_parallel_allreduce = True
+        args.attention_backend = AttnBackend.flash
+        args.bf16 = True
+        args.ckpt_format = 'torch_dist'
+        args.cross_entropy_loss_fusion = True
+        args.cuda_graph_impl = "none"
+        args.embedding_init_method_std = 0.014
+        args.expert_model_parallel_size = 4
+        args.expert_tensor_parallel_size = 1
+        args.init_method_std = 0.014
+        args.lr = 3e-5
+        args.max_position_embeddings = 1024
+        args.micro_batch_size = 2
+        args.moe_aux_loss_coeff = 0.0
+        args.moe_grouped_gemm = True
+        args.moe_route_load_balancing_type = "aux_loss"
+        args.moe_router_dtype = "fp64"
+        args.moe_router_pre_softmax = False
+        args.moe_token_dispatcher_type = "alltoall"
+        args.no_load_optim = True
+        args.no_load_rng = True
+        args.no_save_optim = True
+        args.pipeline_model_parallel_size = 1
+        args.position_embedding_type = None
+        args.recompute_granularity = None
+        args.seed = 42
+        args.seq_length = 1024
+        args.sequence_parallel = True
+        args.te_rng_tracker = True
+        args.tensor_model_parallel_size = 2
+        args.vocab_size = 131072
+
+        validate_args(args)
+        set_global_variables(args, False)
+        return args
+
+    def setup_method(self, method):
+
+        os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+        args = self.create_test_args()
+        set_args(args)
+
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=args.tensor_model_parallel_size,
+            pipeline_model_parallel_size=args.pipeline_model_parallel_size,
+            expert_model_parallel_size=args.expert_model_parallel_size,
+            expert_tensor_parallel_size=args.expert_tensor_parallel_size,
+        )
+        model_parallel_cuda_manual_seed(123)
+
+        model_config = core_transformer_config_from_args(args, TransformerConfig)
+
+        self.model = MambaModel(
+            config=model_config,
+            mamba_stack_spec=mamba_stack_spec,
+            vocab_size=args.vocab_size,
+            max_sequence_length=args.seq_length,
+            hybrid_attention_ratio=args.hybrid_attention_ratio,
+            hybrid_mlp_ratio=args.hybrid_mlp_ratio,
+            hybrid_override_pattern=args.hybrid_override_pattern,
+            position_embedding_type=args.position_embedding_type,
+            rotary_base=args.rotary_base,
+            rotary_percent=args.rotary_percent,
+        )
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_constructor(self):
+        """Sanity check for the constructor of the Mamba MoE model."""
+
+        args = get_args()
+
+        assert_config_matches_golden(self.model.config)
+
+        assert self.model.pre_process is True, "pre_process should be True"
+        assert self.model.post_process is True, "post_process should be True"
+        assert self.model.hybrid_attention_ratio == 0.0, "hybrid_attention_ratio should be 0.0"
+        assert self.model.hybrid_mlp_ratio == 0.0, "hybrid_mlp_ratio should be 0.0"
+        assert (
+            self.model.hybrid_override_pattern == args.hybrid_override_pattern
+        ), f"hybrid_override_pattern should be {args.hybrid_override_pattern}"
+        num_weights = sum([p.numel() for p in self.model.parameters()])
+        assert num_weights == 8449294624, f"Expected 8449294624 parameters, got {num_weights}"
+
+    def test_set_input_tensor(self):
+
+        args = get_args()
+
+        config: TransformerConfig = self.model.config
+        sequence_length = self.model.max_sequence_length
+        micro_batch_size = args.micro_batch_size
+
+        # [sequence length, batch size, hidden size]
+        input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
+
+        self.model.set_input_tensor(input_tensor)
+
+        assert self.model.decoder.input_tensor.shape[0] == sequence_length
+        assert self.model.decoder.input_tensor.shape[1] == micro_batch_size
+        assert self.model.decoder.input_tensor.shape[2] == config.hidden_size
+
+    def test_forward(self):
+        """Basic smoke test for the forward pass of the Mamba MoE model."""
+
+        args = get_args()
+
+        # we must override this to avoid the need to initialize the optimizer
+        for param in self.model.parameters():
+            param.requires_grad = False
+
+        sequence_length = self.model.max_sequence_length
+        micro_batch_size = args.micro_batch_size
+
+        self.model.cuda()
+
+        data = list(range(sequence_length))
+        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, sequence_length, sequence_length), dtype=bool
+        ).cuda()
+
+        logits = self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            runtime_gather_output=True,
+        )
+
+        assert logits.shape[0] == micro_batch_size
+        assert logits.shape[1] == sequence_length
+        assert logits.shape[2] == self.model.vocab_size
diff --git a/tests/unit_tests/optimizer/__init__.py b/tests/unit_tests/optimizer/__init__.py
new file mode 100644
index 00000000000..b5dff7b5663
--- /dev/null
+++ b/tests/unit_tests/optimizer/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/tests/unit_tests/optimizer/test_optimizer_config.py b/tests/unit_tests/optimizer/test_optimizer_config.py
new file mode 100644
index 00000000000..0ecb877ed27
--- /dev/null
+++ b/tests/unit_tests/optimizer/test_optimizer_config.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import torch
+
+from megatron.core.optimizer.optimizer_config import ParamKey, ParamPredicate
+
+
+def test_paramkey_matches():
+    len_1_predicate = ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1)
+    endswith_bias = ParamKey(name="*.bias")
+    has_dotbias = ParamKey(name="*.bias*")
+    len_1_param = ParamKey(predicate=len_1_predicate)
+    has_bias_or_len1_param = ParamKey(name="*.bias", predicate=len_1_predicate)
+    has_attr = ParamKey(attr="is_embedding_or_output_parameter")
+
+    assert endswith_bias.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias")
+    assert not endswith_bias.matches(
+        torch.nn.Parameter(torch.empty(10, 10)), "something.bias.other"
+    )
+    assert has_dotbias.matches(torch.nn.Parameter(torch.empty(10)), "random.biasstuff")
+    assert not has_dotbias.matches(torch.nn.Parameter(torch.empty(10, 10)), "random_bias_name")
+    assert len_1_param.matches(torch.nn.Parameter(torch.empty(10)), "interesting.bias")
+    assert not len_1_param.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting_bias")
+    assert has_bias_or_len1_param.matches(
+        torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias"
+    )
+    assert has_bias_or_len1_param.matches(torch.nn.Parameter(torch.empty(10)), "interesting_bias")
+    assert not has_bias_or_len1_param.matches(
+        torch.nn.Parameter(torch.empty(10, 10)), "random_bias_name"
+    )
+    p_with_attr = torch.nn.Parameter(torch.empty(10, 10))
+    setattr(p_with_attr, "is_embedding_or_output_parameter", True)
+    assert has_attr.matches(p_with_attr, "interesting.bias")
+    assert not has_attr.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias")
+
+    # We expect that if the return of the attribute is False, it should not match even if
+    #  it has the attribute.
+    setattr(p_with_attr, "is_embedding_or_output_parameter", False)
+    assert not has_attr.matches(p_with_attr, "interesting.bias")
diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
new file mode 100644
index 00000000000..558c6934a0c
--- /dev/null
+++ b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
@@ -0,0 +1,573 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import gc
+import os
+from contextlib import nullcontext
+from typing import Dict, List, Optional, Tuple
+
+import pytest
+import torch
+
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.transformer_config import MLATransformerConfig, TransformerConfig
+from megatron.core.utils import is_te_min_version
+from tests.unit_tests.test_utilities import Utils
+
+# Tolerance for memory expectation check (GPU allocator jitter etc).
+EPSILON = 0.30
+EPSILON_A2A = 0.30
+DELTA = 20  # MiB
+
+
+def _reset_cuda_memory() -> None:
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+def _build_gpt_model(
+    *,
+    seed: int,
+    num_layers: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    vocab_size: int,
+    seq_length: int,
+    num_experts: Optional[int],
+    fine_grained_activation_offloading: bool,
+    offload_modules: Optional[List[str]],
+    min_offloaded_tensor_size: int,
+    is_mla: bool,
+) -> GPTModel:
+    """Build a GPTModel that uses TE-based transformer layer spec."""
+    model_parallel_cuda_manual_seed(seed)
+    torch.manual_seed(seed)
+    ConfigClass = MLATransformerConfig if is_mla else TransformerConfig
+    transformer_config = ConfigClass(
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        use_cpu_initialization=True,
+        attention_backend=AttnBackend.unfused,
+        bf16=True,
+        # Recompute
+        recompute_modules=["layernorm", "moe_act"] if num_experts is not None else ["layernorm"],
+        recompute_granularity="selective",
+        # MoE
+        num_moe_experts=num_experts,
+        moe_grouped_gemm=(num_experts is not None),
+        # Fine-grained activation offloading
+        fine_grained_activation_offloading=fine_grained_activation_offloading,
+        offload_modules=offload_modules,
+        min_offloaded_tensor_size=min_offloaded_tensor_size,
+    )
+    gpt_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(
+            num_experts=num_experts,
+            moe_grouped_gemm=num_experts is not None,
+            moe_use_legacy_grouped_gemm=False,
+            multi_latent_attention=is_mla,
+        ),
+        vocab_size=vocab_size,
+        max_sequence_length=seq_length,
+    ).bfloat16()
+    return gpt_model
+
+
+def _make_gpt_inputs(
+    *, seq_length: int, micro_batch_size: int, device: torch.device
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    data = list(range(seq_length))
+    input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device)
+    position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device)
+    attention_mask = torch.ones((micro_batch_size, 1, seq_length, seq_length), dtype=bool).to(
+        device
+    )
+    return input_ids, position_ids, attention_mask
+
+
+def _run_one_iter_and_capture(
+    model: GPTModel,
+    *,
+    input_ids: torch.Tensor,
+    position_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    enable_offload_reset: bool,
+) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], int]:
+    """
+    Run a single forward+backward iteration.
+
+    Returns:
+      - logits (CPU float32)
+      - selected grads (CPU float32)
+      - peak_memory_allocated (bytes) during the iteration
+    """
+
+    if enable_offload_reset:
+        off_interface.reset()
+
+    # for p in model.parameters():
+    #     if p.grad is not None:
+    #         p.grad = None
+
+    torch.cuda.reset_peak_memory_stats()
+    logits = model(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask)
+    loss = logits.float().sum()
+    loss.backward()
+    torch.cuda.synchronize()
+    peak_bytes = int(torch.cuda.max_memory_allocated())
+
+    # capture all gradients for correctness
+    grads: Dict[str, torch.Tensor] = {}
+    for name, p in model.named_parameters():
+        grads[name] = p.grad.detach().float().cpu() if p.grad is not None else None
+
+    return logits.detach().float().cpu(), grads, peak_bytes
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.")
+@pytest.mark.parametrize(
+    "is_moe, is_mla, offload_modules",
+    [
+        # Dense GPT modules
+        (False, True, ["attn_norm"]),
+        (True, False, ["qkv_linear"]),
+        (True, False, ["core_attn"]),
+        # # attn_proj depends on core_attn (validated in TransformerConfig.__post_init__)
+        (True, True, ["core_attn", "attn_proj"]),
+        (True, False, ["mlp_norm"]),
+        (True, False, ["expert_fc1"]),
+        (True, False, ["moe_act"]),
+    ],
+)
+def test_gpt_fine_grained_activation_offloading_correctness_and_memory(
+    is_moe: bool, is_mla: bool, offload_modules: List[str]
+):
+    """
+    Initialize a GPTModel and verify:
+    - forward output correctness under each offload_modules setting
+    - backward gradient correctness (subset)
+    - peak GPU memory is reduced roughly as expected (based on recorded offload bytes)
+    """
+    # setup distributed/model-parallel (same pattern as other UTs)
+    os.environ.pop("NVTE_FUSED_ATTN", None)
+    os.environ.pop("NVTE_FLASH_ATTN", None)
+    os.environ.pop("NVTE_UNFUSED_ATTN", None)
+    # os.environ["NVTE_FLASH_ATTN"] = "1"
+    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+
+    seed = 123
+    # Choose shapes large enough to make memory deltas stable but still fast.
+    num_experts = 4 if is_moe else None
+    num_layers = 8
+    hidden_size = 2048 if num_experts is None else 1024
+    num_attention_heads = 16 if hidden_size >= 2048 else 8
+    vocab_size = 1024
+    seq_length = 1024
+    micro_batch_size = 2
+    device = torch.device("cuda")
+
+    input_ids, position_ids, attention_mask = _make_gpt_inputs(
+        seq_length=seq_length, micro_batch_size=micro_batch_size, device=device
+    )
+
+    from megatron.core.pipeline_parallel import fine_grained_activation_offload as off
+
+    off_interface.reset_instance()
+
+    try:
+        # 1) Baseline run (no offloading)
+        _reset_cuda_memory()
+        base_model = _build_gpt_model(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=False,
+            offload_modules=None,
+            min_offloaded_tensor_size=1024 * 1024,
+            is_mla=is_mla,
+        ).cuda()
+        base_model.train()
+
+        # Warmup baseline once for allocator stability
+        _run_one_iter_and_capture(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=False,
+        )
+        _reset_cuda_memory()
+        base_logits, base_grads, base_peak = _run_one_iter_and_capture(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=False,
+        )
+        # Free baseline model GPU memory before offload path
+        del base_model
+        _reset_cuda_memory()
+
+        # 2) Offload run (warmup to record bytes + steady-state measurement)
+        off_model = _build_gpt_model(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=True,
+            offload_modules=offload_modules,
+            min_offloaded_tensor_size=1024,  # force offloading for UT determinism
+            is_mla=is_mla,
+        ).cuda()
+        off_model.train()
+
+        # Warmup 1 iter to populate cached chunks, then reset to finish warmup bookkeeping.
+        _run_one_iter_and_capture(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=True,
+        )
+        # Reset once more to trigger post_warmup_callback and apply steady-state offload decisions.
+        off_interface.reset()
+
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            PipelineOffloadManager,
+        )
+
+        mgr = PipelineOffloadManager.get_instance()
+        expected_offload_bytes = int(
+            sum(mgr.offload_summary_bytes.get(k, 0) for k in offload_modules)
+        )
+        expected_offload_mib = expected_offload_bytes / (1024**2)
+
+        _reset_cuda_memory()
+        off_logits, off_grads, off_peak = _run_one_iter_and_capture(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=True,
+        )
+        del off_model
+        _reset_cuda_memory()
+
+        # 3) Correctness checks (forward + selected grads)
+        assert torch.allclose(off_logits, base_logits, rtol=1e-3, atol=1e-3)
+        assert set(off_grads.keys()) == set(base_grads.keys())
+        for name, gb in base_grads.items():
+            go = off_grads[name]
+            if gb is None or go is None:
+                assert gb is None and go is None, f"Grad None mismatch for {name}"
+                continue
+            assert torch.allclose(go, gb, rtol=1e-3, atol=1e-3), f"Grad mismatch for {name}"
+
+        # 4) Memory checks (peak allocated over forward+backward)
+        saved_mib = (base_peak - off_peak) / (1024**2)
+        assert saved_mib > 0.0, (
+            f"Expected GPU peak memory reduction for offload_modules={offload_modules}, "
+            f"but got saved={saved_mib:.2f}MiB (base={base_peak/(1024**2):.2f}MiB, "
+            f"off={off_peak/(1024**2):.2f}MiB)"
+        )
+
+        # If expectation is large enough, enforce approximate match.
+        # For tiny expectations, allocator noise may dominate; we only require a positive reduction.
+        if expected_offload_mib >= 2.0:
+            rel_err = abs(saved_mib - expected_offload_mib) / max(expected_offload_mib, 1e-6)
+            abs_err = abs(saved_mib - expected_offload_mib)
+            assert rel_err <= EPSILON and abs_err <= DELTA, (
+                f"Memory saving mismatch for offload_modules={offload_modules}: "
+                f"saved={saved_mib:.2f}MiB expected~={expected_offload_mib:.2f}MiB "
+                f"(rel_err={rel_err:.2f}, abs_err={abs_err:.2f})"
+            )
+            print(
+                f"Rank {torch.distributed.get_rank()}: Saved {saved_mib:.2f}MiB, expected {expected_offload_mib:.2f}MiB"
+            )
+    finally:
+        Utils.destroy_model_parallel()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.")
+@pytest.mark.skipif(
+    not is_te_min_version("1.9.0.dev0"),
+    reason="EP A2A overlap requires TE 1.9.0.dev0+ in this repo's tests.",
+)
+@pytest.mark.parametrize(
+    "dispatcher_backend, is_mla, offload_modules",
+    [
+        ("alltoall", True, ["attn_norm"]),
+        ("alltoall", True, ["core_attn"]),
+        ("alltoall", True, ["attn_norm", "core_attn", "attn_proj"]),
+        ("alltoall", True, ["mlp_norm"]),
+        ("alltoall", False, ["expert_fc1"]),
+        ("alltoall", False, ["moe_act"]),
+        ("alltoall", False, ["mlp_norm", "expert_fc1", "moe_act"]),
+        (
+            "alltoall",
+            True,
+            ["attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"],
+        ),
+        (
+            "alltoall",
+            False,
+            ["attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"],
+        ),
+    ],
+)
+def test_fine_grained_activation_offload_with_ep_a2a_overlap_compatibility(
+    dispatcher_backend: str, is_mla: bool, offload_modules: List[str]
+):
+    """
+    Compatibility test for:
+      - fine-grained activation offloading
+      - EP all-to-all overlap (overlap_moe_expert_parallel_comm)
+      - memory saving roughly matches expected offload bytes (when expectation is large enough)
+
+    The EP A2A overlap initialization pattern is aligned with
+    `tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py`.
+    """
+    from megatron.core.models.common.model_chunk_schedule_plan import (
+        TransformerModelChunkSchedulePlan,
+    )
+    from megatron.core.pipeline_parallel.utils import set_streams
+    from tests.unit_tests.a2a_overlap.utils import deterministic_mode
+
+    # EP overlap requires distributed initialization with EP groups.
+    ep_size = 4
+    if Utils.world_size % ep_size != 0:
+        pytest.skip(
+            f"Skipping: WORLD_SIZE={Utils.world_size} must be divisible by ep_size={ep_size}."
+        )
+
+    seed = 123
+    num_experts = 8  # must be divisible by ep_size
+    if num_experts % ep_size != 0:
+        pytest.skip(
+            f"Skipping: num_moe_experts={num_experts} must be divisible by ep_size={ep_size}."
+        )
+
+    # Small shapes to keep this compatibility test fast.
+    num_layers = 8
+    hidden_size = 1024
+    num_attention_heads = 16
+    vocab_size = 1024
+    seq_length = 1024
+    micro_batch_size = 2
+    device = torch.device("cuda")
+
+    from megatron.core.pipeline_parallel import fine_grained_activation_offload as off
+
+    def _make_schedule_inputs() -> Dict[str, torch.Tensor]:
+        data = list(range(seq_length))
+        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device)
+        position_ids = (
+            torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device)
+        )
+        attention_mask = torch.ones((micro_batch_size, 1, seq_length, seq_length), dtype=bool).to(
+            device
+        )
+        labels = input_ids.clone()
+        return {
+            "input_ids": input_ids,
+            "labels": labels,
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+        }
+
+    def _capture_params(model: torch.nn.Module) -> Dict[str, torch.Tensor]:
+        params: Dict[str, torch.Tensor] = {}
+        for name, p in model.named_parameters():
+            params[name] = p.detach().clone()
+        return params
+
+    def _restore_params(model: torch.nn.Module, params: Dict[str, torch.Tensor]) -> None:
+        for name, p in model.named_parameters():
+            p.data.copy_(params[name])
+
+    def _build_overlap_moe_gpt(
+        *, enable_offload: bool, is_mla: bool, dispatcher_backend: str
+    ) -> GPTModel:
+        model_parallel_cuda_manual_seed(seed)
+        torch.manual_seed(seed)
+        ConfigClass = MLATransformerConfig if is_mla else TransformerConfig
+        transformer_config = ConfigClass(
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            use_cpu_initialization=True,
+            attention_backend=AttnBackend.unfused,
+            # Recompute
+            recompute_modules=["layernorm", "moe_act"],
+            recompute_granularity="selective",
+            bf16=True,
+            # MoE + EP overlap
+            num_moe_experts=num_experts,
+            moe_grouped_gemm=True,
+            expert_model_parallel_size=ep_size,
+            moe_token_dispatcher_type="alltoall" if dispatcher_backend == "alltoall" else "flex",
+            moe_flex_dispatcher_backend=dispatcher_backend,
+            moe_router_dtype="fp32" if dispatcher_backend == "hybridep" else "fp64",
+            overlap_moe_expert_parallel_comm=True,
+            delay_wgrad_compute=True,
+            # Fine-grained activation offloading
+            fine_grained_activation_offloading=enable_offload,
+            offload_modules=offload_modules if enable_offload else None,
+            min_offloaded_tensor_size=1024,  # force offloading to exercise the code path
+        )
+        return (
+            GPTModel(
+                config=transformer_config,
+                transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(
+                    num_experts=num_experts,
+                    moe_grouped_gemm=True,
+                    moe_use_legacy_grouped_gemm=False,
+                    multi_latent_attention=is_mla,
+                ),
+                vocab_size=vocab_size,
+                max_sequence_length=seq_length,
+            )
+            .bfloat16()
+            .cuda()
+        )
+
+    def _run_schedule_1f1b_two_microbatches(
+        model: GPTModel, *, enable_offload_reset: bool
+    ) -> Tuple[List[torch.Tensor], Dict[str, torch.Tensor], int]:
+        """
+        Run a minimal 1F1B schedule (2 microbatches) using ModelChunkSchedulePlan.run().
+        This is the execution path that exercises EP A2A overlap scheduling.
+        """
+        if enable_offload_reset:
+            off_interface.reset()
+
+        data0 = _make_schedule_inputs()
+        data1 = _make_schedule_inputs()
+        plan0 = model.build_schedule_plan(**data0)
+
+        torch.cuda.reset_peak_memory_stats()
+        out0 = TransformerModelChunkSchedulePlan.run(plan0, None)
+        plan1 = model.build_schedule_plan(**data1)
+        out1 = TransformerModelChunkSchedulePlan.run(plan1, plan0, b_grad=torch.ones_like(out0))
+        TransformerModelChunkSchedulePlan.run(None, plan1, b_grad=torch.ones_like(out1))
+        torch.cuda.synchronize()
+        peak_bytes = int(torch.cuda.max_memory_allocated())
+
+        # capture outputs and grads
+        outputs = [out0.detach().float().cpu(), out1.detach().float().cpu()]
+        grads: Dict[str, torch.Tensor] = {}
+        for name, p in model.named_parameters():
+            grads[name] = p.grad.detach().float().cpu() if p.grad is not None else None
+        return outputs, grads, peak_bytes
+
+    # setup distributed/model-parallel
+    os.environ.pop("NVTE_FUSED_ATTN", None)
+    os.environ.pop("NVTE_FLASH_ATTN", None)
+    os.environ.pop("NVTE_UNFUSED_ATTN", None)
+
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        expert_model_parallel_size=ep_size,
+    )
+    set_streams()
+
+    off_interface.reset_instance()
+
+    try:
+        with deterministic_mode():
+            # Baseline: EP overlap on, offload off.
+            _reset_cuda_memory()
+            base_model = _build_overlap_moe_gpt(
+                enable_offload=False, is_mla=is_mla, dispatcher_backend=dispatcher_backend
+            )
+            base_model.train()
+            base_params = _capture_params(base_model)
+            # Warmup once for allocator stability / graph caching
+            _run_schedule_1f1b_two_microbatches(base_model, enable_offload_reset=False)
+            _reset_cuda_memory()
+            base_outs, base_grads, base_peak = _run_schedule_1f1b_two_microbatches(
+                base_model, enable_offload_reset=False
+            )
+            del base_model
+            _reset_cuda_memory()
+
+            # Offload: EP overlap on, fine-grained offload on.
+            off_model = _build_overlap_moe_gpt(
+                enable_offload=True, is_mla=is_mla, dispatcher_backend=dispatcher_backend
+            )
+            _restore_params(off_model, base_params)
+            off_model.train()
+            # Warmup once to populate cached chunks, then reset to apply steady-state offload decisions.
+            off_interface.reset()
+            _run_schedule_1f1b_two_microbatches(off_model, enable_offload_reset=False)
+            off_interface.reset()
+            from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+                PipelineOffloadManager,
+            )
+
+            mgr = PipelineOffloadManager.get_instance()
+            expected_offload_bytes = int(
+                sum(mgr.offload_summary_bytes.get(k, 0) for k in offload_modules)
+            )
+            expected_offload_mib = expected_offload_bytes / (1024**2)
+
+            _reset_cuda_memory()
+            off_outs, off_grads, off_peak = _run_schedule_1f1b_two_microbatches(
+                off_model, enable_offload_reset=True
+            )
+            del off_model
+            _reset_cuda_memory()
+
+            # Correctness (forward outputs + all grads)
+            assert len(off_outs) == len(base_outs) == 2
+            for i in range(2):
+                assert torch.allclose(off_outs[i], base_outs[i], rtol=1e-3, atol=1e-3)
+            assert set(off_grads.keys()) == set(base_grads.keys())
+            for name, gb in base_grads.items():
+                go = off_grads[name]
+                if gb is None or go is None:
+                    assert gb is None and go is None, f"Grad None mismatch for {name}"
+                    continue
+                assert torch.allclose(
+                    go, gb, rtol=1e-3, atol=1e-3
+                ), f"Rank {torch.distributed.get_rank()}: Grad mismatch for {name}"
+
+            # Memory checks (peak allocated during the scheduled 1F1B run)
+            saved_mib = (base_peak - off_peak) / (1024**2)
+            assert saved_mib > 0.0, (
+                f"Expected GPU peak memory reduction for offload_modules={offload_modules}, "
+                f"but got saved={saved_mib:.2f}MiB (base={base_peak/(1024**2):.2f}MiB, "
+                f"off={off_peak/(1024**2):.2f}MiB)"
+            )
+            # If expectation is large enough, enforce approximate match.
+            if expected_offload_mib >= 2.0:
+                rel_err = abs(saved_mib - expected_offload_mib) / max(expected_offload_mib, 1e-6)
+                abs_err = abs(saved_mib - expected_offload_mib)
+                print(
+                    f"Rank {torch.distributed.get_rank()}: Saved {saved_mib:.2f}MiB, expected {expected_offload_mib:.2f}MiB"
+                )
+                if abs_err > DELTA:
+                    assert rel_err <= EPSILON_A2A, (
+                        f"Memory saving mismatch for offload_modules={offload_modules}: "
+                        f"saved={saved_mib:.2f}MiB expected~={expected_offload_mib:.2f}MiB "
+                        f"(rel_err={rel_err:.2f}, abs_err={abs_err:.2f})"
+                    )
+    finally:
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py b/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py
new file mode 100644
index 00000000000..22f790cc0a9
--- /dev/null
+++ b/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py
@@ -0,0 +1,782 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import logging
+import os
+import sys
+
+import pytest
+import torch
+import torch.distributed as dist
+from packaging import version
+
+from megatron.core import parallel_state
+from megatron.core.hyper_comm_grid import HyperCommGrid
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.pipeline_parallel.multimodule_communicator import MultiModulePipelineCommunicator
+from tests.unit_tests.pipeline_parallel.test_bridge_communicator import (
+    _avg_params,
+    _create_transformer_block,
+    _get_pg_collection_from_grid,
+    create_hypercomm_grid,
+    get_transformer_block_and_grid,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+class TestMultiModulePipelineCommunicator:
+
+    @classmethod
+    def setup_class(cls):
+        """Set up distributed environment for the entire test class."""
+        if not dist.is_initialized():
+            dist.init_process_group(backend="nccl")
+        if torch.cuda.is_available():
+            torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
+        world_size = dist.get_world_size()
+        if world_size != 8:
+            pytest.skip(
+                f"These tests require 8 GPUs, but only {world_size} are available.",
+                allow_module_level=True,
+            )
+
+    def teardown_class(cls):
+        Utils.destroy_model_parallel()
+
+    def test_multimodule_communicator_init(self):
+        """Test MultiModulePipelineCommunicator initialization."""
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1)
+        generator_grid = create_hypercomm_grid(offset=6, tp=2, cp=1, pp=1, dp=1)
+
+        # Define module-grid mapping
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        # Define module computation topology
+        topology = {
+            'image_encoder': ['llm'],
+            'audio_encoder': ['llm'],
+            'llm': ['generator'],
+            'generator': [],
+        }
+        config = ModelParallelConfig(bf16=True)
+        # Initialize communicator
+        mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config)
+        # Test attributes match expectations
+        assert mllm_comm.module_to_grid_map == module_to_grid_map
+        assert mllm_comm.topology == topology
+        assert mllm_comm.config == config
+        assert mllm_comm.current_rank == dist.get_rank()
+
+    def test_compute_total_pipeline_stages(self):
+        """Test compute_total_pipeline_stages for overall chain and until specific ranks."""
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1)
+        generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2)
+
+        # Define module-grid mapping and topology
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        topology = {
+            'image_encoder': ['llm'],
+            'audio_encoder': ['llm'],
+            'llm': ['generator'],
+            'generator': [],
+        }
+
+        # Overall total pipeline stages: max(1,1) + 2 + 1 = 4
+        total = MultiModulePipelineCommunicator.compute_total_pipeline_stages(
+            topology, module_to_grid_map
+        )
+        assert total == 4
+
+        llm_pp_rank = MultiModulePipelineCommunicator.compute_total_pipeline_stages(
+            topology, module_to_grid_map, rank=2, module_name='llm'
+        )
+        assert llm_pp_rank == 2
+
+    def test_send_forward_recv_forward(self):
+        """Test send_forward and recv_forward operations."""
+        if not dist.is_initialized():
+            pytest.skip("Distributed not initialized")
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1)
+        generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2)
+
+        # Set up module-grid mapping and topology
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        topology = {
+            'image_encoder': ['llm'],
+            'audio_encoder': ['llm'],
+            'llm': ['generator'],
+            'generator': [],
+        }
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config)
+
+        # Simulate forward communication for each module
+        if mllm_comm.is_current_rank_in_grid(image_encoder_grid):
+            # Image encoder sends output forward
+            output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()}
+            mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(audio_encoder_grid):
+            # Audio encoder sends output forward
+            output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()}
+            mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            output_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+            if dist.get_rank() == 2 or dist.get_rank() == 3:
+                # LLM stage receives both image and audio outputs
+                input_dict = mllm_comm.recv_forward()
+                assert input_dict['image_encoder'].shape == (2, 8, 128)
+                assert input_dict['audio_encoder'].shape == (2, 16, 128)
+                mllm_comm.send_forward(output_dict)
+            else:
+                # LLM stage receives concatenated LLM outputs
+                input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128))
+                assert input_dict['llm'].shape == (2, 32, 128)
+                mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(generator_grid):
+            # Generator module receives final LLM output
+            input_dict = mllm_comm.recv_forward()
+            assert input_dict['llm'].shape == (1, 32, 128)
+
+    def test_send_forward_recv_forward_with_different_pp_size(self):
+        """Test for the case when pp(image_encoder) != pp(audio_encoder)."""
+        if not dist.is_initialized():
+            pytest.skip("Distributed not initialized")
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=2, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=4, tp=1, cp=1, pp=4, dp=1)
+
+        # Set up module-grid mapping and topology
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+        }
+        topology = {'image_encoder': ['llm'], 'audio_encoder': ['llm'], 'llm': []}
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config)
+
+        # Simulate forward communication for each module
+        if mllm_comm.is_current_rank_in_grid(image_encoder_grid):
+            output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()}
+            if dist.get_rank() == 0:
+                # Image encoder sends output forward
+                mllm_comm.send_forward(output_dict)
+            else:
+                # Image stage receives image outputs
+                input_dict = mllm_comm.recv_forward(tensor_shape=(2, 8, 128))
+                assert input_dict['image_encoder'].shape == (2, 8, 128)
+                mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(audio_encoder_grid):
+            # Audio encoder sends output forward
+            output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()}
+            mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            output_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+            if dist.get_rank() == 4:
+                # LLM stage receives both image and audio outputs
+                input_dict = mllm_comm.recv_forward()
+                assert input_dict['image_encoder'].shape == (2, 8, 128)
+                assert input_dict['audio_encoder'].shape == (2, 16, 128)
+                mllm_comm.send_forward(output_dict)
+            elif dist.get_rank() == 5 or dist.get_rank() == 6:
+                # LLM stage receives concatenated LLM outputs
+                input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128))
+                assert input_dict['llm'].shape == (2, 32, 128)
+                mllm_comm.send_forward(output_dict)
+            elif dist.get_rank() == 7:
+                # LLM stage receives concatenated LLM outputs
+                input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128))
+                assert input_dict['llm'].shape == (2, 32, 128)
+
+    def test_send_backward_recv_backward(self):
+        """Test send_backward and recv_backward operations."""
+        if not dist.is_initialized():
+            pytest.skip("Distributed not initialized")
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1)
+        generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2)
+
+        # Set up module-grid mapping and topology
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        topology = {
+            'image_encoder': ['llm'],
+            'audio_encoder': ['llm'],
+            'llm': ['generator'],
+            'generator': [],
+        }
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config)
+
+        # Simulate backward communication for each module
+        if mllm_comm.is_current_rank_in_grid(generator_grid):
+            # Generator sends gradient backward
+            grad_dict = {'llm': torch.randn(1, 32, 128).cuda()}
+            mllm_comm.send_backward(grad_dict)
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            if dist.get_rank() == 4 or dist.get_rank() == 5:
+                # LLM receives expanded gradient and sends backward
+                received_grad = mllm_comm.recv_backward()
+                assert received_grad['llm'].shape == (2, 32, 128)
+                grad_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+                mllm_comm.send_backward(grad_dict)
+            else:
+                # LLM receives gradient and sends backward to both image/audio encoders
+                received_grad = mllm_comm.recv_backward(tensor_shape=(2, 32, 128))
+                assert received_grad['llm'].shape == (2, 32, 128)
+                grad_dict = {
+                    'image_encoder': torch.randn(2, 8, 128).cuda(),
+                    'audio_encoder': torch.randn(2, 16, 128).cuda(),
+                }
+                mllm_comm.send_backward(grad_dict)
+        if mllm_comm.is_current_rank_in_grid(image_encoder_grid):
+            # Image encoder receives its gradient
+            received_grad = mllm_comm.recv_backward()
+            assert received_grad['image_encoder'].shape == (2, 8, 128)
+        if mllm_comm.is_current_rank_in_grid(audio_encoder_grid):
+            # Audio encoder receives its gradient
+            received_grad = mllm_comm.recv_backward()
+            assert received_grad['audio_encoder'].shape == (2, 16, 128)
+
+    @pytest.mark.skipif(
+        version.parse(torch.__version__) < version.parse('2.3.0'),
+        reason="Feature requires PyTorch 2.3 or later",
+    )
+    def test_send_forward_recv_backward_send_backward_recv_forward(self):
+        """Test send_forward_recv_backward and send_backward_recv_forward operations."""
+        if not dist.is_initialized():
+            pytest.skip("Distributed not initialized")
+
+        # Create process group grids for each module
+        image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1)
+        audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1)
+        llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1)
+        generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2)
+
+        # Set up module-grid mapping and topology
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        topology = {
+            'image_encoder': ['llm'],
+            'audio_encoder': ['llm'],
+            'llm': ['generator'],
+            'generator': [],
+        }
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config)
+
+        # Simulate bidirectional send/recv for forward and backward in pipeline
+
+        # Encoder stages send forward to the first stage of LLM, and receive backward from the first stage of LLM
+        if mllm_comm.is_current_rank_in_grid(image_encoder_grid):
+            output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()}
+            received_grad = mllm_comm.send_forward_recv_backward(output_dict)
+            assert received_grad['image_encoder'].shape == (2, 8, 128)
+        if mllm_comm.is_current_rank_in_grid(audio_encoder_grid):
+            output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()}
+            received_grad = mllm_comm.send_forward_recv_backward(output_dict)
+            assert received_grad['audio_encoder'].shape == (2, 16, 128)
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            if dist.get_rank() == 2 or dist.get_rank() == 3:
+                grad_dict = {
+                    'image_encoder': torch.randn(2, 8, 128).cuda(),
+                    'audio_encoder': torch.randn(2, 16, 128).cuda(),
+                }
+                input_dict = mllm_comm.send_backward_recv_forward(grad_dict)
+                assert input_dict['image_encoder'].shape == (2, 8, 128)
+                assert input_dict['audio_encoder'].shape == (2, 16, 128)
+
+        # First stage of LLM sends forward to the second stage of LLM, and receive backward from the second stage of LLM
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            if dist.get_rank() == 2 or dist.get_rank() == 3:
+                output_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+                received_grad = mllm_comm.send_forward_recv_backward(
+                    output_dict, tensor_shape=(2, 32, 128)
+                )
+                assert received_grad['llm'].shape == (2, 32, 128)
+            if dist.get_rank() == 4 or dist.get_rank() == 5:
+                grad_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+                input_dict = mllm_comm.send_backward_recv_forward(
+                    grad_dict, tensor_shape=(2, 32, 128)
+                )
+                assert input_dict['llm'].shape == (2, 32, 128)
+
+        # Second stage of LLM sends forward to generator, and receive backward from generator
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            if dist.get_rank() == 4 or dist.get_rank() == 5:
+                output_dict = {'llm': torch.randn(2, 32, 128).cuda()}
+                received_grad = mllm_comm.send_forward_recv_backward(output_dict)
+                assert received_grad['llm'].shape == (2, 32, 128)
+        if mllm_comm.is_current_rank_in_grid(generator_grid):
+            grad_dict = {'llm': torch.randn(1, 32, 128).cuda()}
+            input_dict = mllm_comm.send_backward_recv_forward(grad_dict)
+            assert input_dict['llm'].shape == (1, 32, 128)
+
+    @pytest.mark.skipif(
+        version.parse(torch.__version__) < version.parse('2.3.0'),
+        reason="Feature requires PyTorch 2.3 or later",
+    )
+    def test_send_forward_recv_forward_with_transformer_blocks(self):
+        """Test send_forward and recv_forward operations."""
+
+        # Set model/test dimensions for easier debugging and output comparison
+        hidden_size = 16
+        sequence_length = 2
+        micro_batch_size = 2
+
+        # For reproducibility, set a fixed seed
+        torch.manual_seed(12345)
+        dtype = torch.float32
+
+        # Create random input hidden states tensor
+        hidden_states = torch.randn(
+            (sequence_length, micro_batch_size, hidden_size), device="cuda"
+        ).to(dtype)
+        current_rank = dist.get_rank()
+
+        # ========== Initialize tensor model-parallel environment ==========
+        parallel_state_tp = 2
+        Utils.initialize_model_parallel(tensor_model_parallel_size=2)
+
+        # ========== Build reference 1D grid and transformer block for weight sharing ==========
+        ref_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=8)
+        ref_pg_collection = _get_pg_collection_from_grid(ref_grid)
+        ref_block = _create_transformer_block(
+            dtype=dtype, hidden_size=hidden_size, pg_collection=ref_pg_collection
+        )
+        _avg_params(
+            ref_block, ref_grid.get_pg("dp")
+        )  # Ensure parameters are averaged across data parallel (DP)
+
+        # ========== Create different transformer blocks for each model stage ==========
+        # Image encoder
+        image_encoder_block, image_encoder_grid = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=1,
+            cp_size=1,
+            pp_size=1,
+            dp_size=1,
+            grid_offset=0,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        # Audio encoder
+        audio_encoder_block, audio_encoder_grid = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=1,
+            cp_size=1,
+            pp_size=1,
+            dp_size=1,
+            grid_offset=1,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        # LLM (Large Language Model) block with tensor & pipeline parallelism
+        llm_block, llm_grid = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=2,
+            cp_size=1,
+            pp_size=2,
+            dp_size=1,
+            grid_offset=2,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        # Generator block (final stage) with DP=2
+        generator_block, generator_grid = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=1,
+            cp_size=1,
+            pp_size=1,
+            dp_size=2,
+            grid_offset=6,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+
+        # ========== Define module-to-grid correspondence and pipeline topology ==========
+        module_to_grid_map = {
+            'image_encoder': image_encoder_grid,
+            'audio_encoder': audio_encoder_grid,
+            'llm': llm_grid,
+            'generator': generator_grid,
+        }
+        topology = {
+            'image_encoder': ['llm'],  # image_encoder sends output to llm
+            'audio_encoder': ['llm'],  # audio_encoder sends output to llm
+            'llm': ['generator'],  # llm sends output to generator
+            'generator': [],  # generator is the final module
+        }
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        # Define dimension mapping for sequence, batch, hidden
+        dim_mapping = {'s': 0, 'h': 2, 'b': 1}
+        seq_dim = dim_mapping['s']
+
+        # Communication handler for multi-module pipeline (send/recv abstraction)
+        mllm_comm = MultiModulePipelineCommunicator(
+            module_to_grid_map, topology, config, dim_mapping=dim_mapping
+        )
+
+        # ========== Run actual distributed pipeline blocks (per process, depending on role) ==========
+        if mllm_comm.is_current_rank_in_grid(image_encoder_grid):
+            # Image encoder rank: run forward and send output
+            image_encoder_output = image_encoder_block(
+                hidden_states=hidden_states, attention_mask=None
+            )
+            output_dict = {'image_encoder': image_encoder_output}
+            mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(audio_encoder_grid):
+            # Audio encoder rank: run forward and send output
+            audio_encoder_output = audio_encoder_block(
+                hidden_states=hidden_states, attention_mask=None
+            )
+            output_dict = {'audio_encoder': audio_encoder_output}
+            mllm_comm.send_forward(output_dict)
+        if mllm_comm.is_current_rank_in_grid(llm_grid):
+            if dist.get_rank() == 2 or dist.get_rank() == 3:
+                # LLM stage 0 (receives both image and audio, concatenates along seq_dim)
+                input_dict = mllm_comm.recv_forward()
+                llm_output = llm_block(
+                    hidden_states=torch.cat(
+                        [input_dict['image_encoder'], input_dict['audio_encoder']], dim=seq_dim
+                    ),
+                    attention_mask=None,
+                )
+                output_dict = {'llm': llm_output}
+                mllm_comm.send_forward(output_dict)
+            else:
+                # LLM stage 1 (receives output of previous LLM stage)
+                input_dict = mllm_comm.recv_forward(
+                    tensor_shape=(sequence_length * 2, micro_batch_size, hidden_size)
+                )
+                llm_output = llm_block(hidden_states=input_dict['llm'], attention_mask=None)
+                output_dict = {'llm': llm_output}
+                mllm_comm.send_forward(output_dict)
+
+        if mllm_comm.is_current_rank_in_grid(generator_grid):
+            # Generator block: only receives from llm and runs forward
+            input_dict = mllm_comm.recv_forward()
+            generator_output = generator_block(hidden_states=input_dict['llm'], attention_mask=None)
+
+        # ========== Build a reference (serial/global) pipeline for correctness checking ==========
+        global_image_encoder_block, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        global_audio_encoder_block, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        global_llm_block_pp_rank_0, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        global_llm_block_pp_rank_1, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        global_generator_block, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+
+        # Run each stage sequentially as a global pipeline (for truth)
+        global_image_encoder_output = global_image_encoder_block(
+            hidden_states=hidden_states, attention_mask=None
+        )
+        global_audio_encoder_output = global_audio_encoder_block(
+            hidden_states=hidden_states, attention_mask=None
+        )
+        # Compare output between global and distributed blocks for image/audio stage
+        if current_rank == 0:
+            torch.testing.assert_close(
+                global_image_encoder_output, image_encoder_output, rtol=1e-3, atol=1e-3
+            )
+        if current_rank == 1:
+            torch.testing.assert_close(
+                global_audio_encoder_output, audio_encoder_output, rtol=1e-3, atol=1e-3
+            )
+
+        # Feed outputs to LLM stages (emulate pipeline cut with concatenation)
+        global_llm_input = torch.cat(
+            [global_image_encoder_output, global_audio_encoder_output], dim=seq_dim
+        )
+        global_llm_pp_rank_0_output = global_llm_block_pp_rank_0(
+            hidden_states=global_llm_input, attention_mask=None
+        )
+        if current_rank == 2 or current_rank == 3:
+            torch.testing.assert_close(
+                global_llm_pp_rank_0_output, llm_output, rtol=1e-3, atol=1e-3
+            )
+        global_llm_pp_rank_1_output = global_llm_block_pp_rank_1(
+            hidden_states=global_llm_pp_rank_0_output, attention_mask=None
+        )
+        if current_rank == 4 or current_rank == 5:
+            torch.testing.assert_close(
+                global_llm_pp_rank_1_output, llm_output, rtol=1e-3, atol=1e-3
+            )
+
+        # Generator output and comparison to distributed output (for each DP chunk)
+        global_generator_block_output = global_generator_block(
+            hidden_states=global_llm_pp_rank_1_output, attention_mask=None
+        )
+        global_generator_block_chunks = torch.split(
+            global_generator_block_output, global_generator_block_output.shape[1] // 2, dim=1
+        )
+        if current_rank == 6:
+            torch.testing.assert_close(
+                global_generator_block_chunks[0], generator_output, rtol=1e-3, atol=1e-3
+            )
+        if current_rank == 7:
+            torch.testing.assert_close(
+                global_generator_block_chunks[1], generator_output, rtol=1e-3, atol=1e-3
+            )
+
+    @pytest.mark.skipif(
+        version.parse(torch.__version__) < version.parse('2.3.0'),
+        reason="Feature requires PyTorch 2.3 or later",
+    )
+    @pytest.mark.parametrize(
+        "grid1_tp, grid1_pp, grid1_dp, grid2_tp, grid2_pp, grid2_dp, parallel_state_tp",
+        [
+            (2, 1, 1, 2, 1, 1, 2),  # TP2PP1DP1 to TP2PP1DP1
+            (2, 1, 1, 2, 2, 1, 2),  # TP2PP1DP1 to TP2PP2DP1
+            (2, 2, 1, 2, 2, 1, 2),  # TP2PP2DP1 to TP2PP2DP1
+            (4, 1, 1, 4, 1, 1, 4),  # TP4DP1 to TP4DP1
+            (2, 1, 2, 4, 1, 1, 2),  # TP2DP2 to TP4DP1
+            (4, 1, 1, 2, 1, 2, 2),  # TP4DP1 to TP2DP2
+            (2, 1, 2, 1, 1, 4, 2),  # TP2DP2 to TP1DP4
+        ],
+    )
+    def test_send_forward_recv_forward_with_transformer_blocks_and_different_parallelisms(
+        self, grid1_tp, grid1_pp, grid1_dp, grid2_tp, grid2_pp, grid2_dp, parallel_state_tp
+    ):
+        """Test bridge communicator with two transformer blocks having different process group configurations."""
+        # Model and input configuration
+        hidden_size = 16
+        sequence_length = 2
+        micro_batch_size = 8
+        torch.manual_seed(12345)
+        dtype = torch.float32
+
+        # Create random input tensor on CUDA
+        hidden_states = torch.randn(
+            (sequence_length, micro_batch_size, hidden_size), device="cuda"
+        ).to(dtype)
+        hidden_states_ref = hidden_states.clone()
+        current_rank = dist.get_rank()
+
+        # Initialize model parallel with desired TP
+        Utils.initialize_model_parallel(tensor_model_parallel_size=parallel_state_tp)
+
+        # Build a reference grid and block for parameter sharing & DP averaging
+        ref_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=8)
+        ref_pg_collection = _get_pg_collection_from_grid(ref_grid)
+        ref_block = _create_transformer_block(
+            dtype=dtype, hidden_size=hidden_size, pg_collection=ref_pg_collection
+        )
+        _avg_params(
+            ref_block, ref_grid.get_pg("dp")
+        )  # Synchronize parameters across DP for reproducibility
+
+        # ====== Create two transformer block+grid pairs with different TP/DP settings ======
+        block_grid_1, grid_1 = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=grid1_tp,
+            pp_size=grid1_pp,
+            dp_size=grid1_dp,
+            grid_offset=0,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+
+        block_grid_2, grid_2 = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=grid2_tp,
+            pp_size=grid2_pp,
+            dp_size=grid2_dp,
+            grid_offset=grid_1.size,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+
+        dist.barrier()  # Synchronize ranks before communication
+
+        # Module-grid map and pipeline communication topology
+        module_to_grid_map = {'image_encoder': grid_1, 'llm': grid_2}
+        topology = {
+            'image_encoder': ['llm'],  # image_encoder sends forward results to llm
+            'llm': [],  # llm is the last stage here
+        }
+        config = ModelParallelConfig(pipeline_dtype=torch.float)
+        mllm_comm = MultiModulePipelineCommunicator(
+            module_to_grid_map, topology, config, dim_mapping={'s': 0, 'h': 2, 'b': 1}
+        )
+
+        output_grid_2 = None
+        # If current rank is in the first grid, run first block and send output
+        if grid_1 is not None and mllm_comm.is_current_rank_in_grid(grid_1):
+            rank_module_info = mllm_comm.rank_module_map['image_encoder']
+            if rank_module_info.pp_rank == 0:
+                hidden_states = block_grid_1(hidden_states=hidden_states, attention_mask=None)
+                mllm_comm.send_forward({'image_encoder': hidden_states})
+            else:
+                input_dict = mllm_comm.recv_forward(
+                    tensor_shape=(sequence_length, micro_batch_size, hidden_size)
+                )
+                hidden_states = input_dict['image_encoder']
+                hidden_states = block_grid_1(hidden_states=hidden_states, attention_mask=None)
+                mllm_comm.send_forward({'image_encoder': hidden_states})
+
+        # If current rank is in second grid, receive and run the second block
+        if grid_2 is not None and mllm_comm.is_current_rank_in_grid(grid_2):
+            rank_module_info = mllm_comm.rank_module_map['llm']
+            if rank_module_info.pp_rank == 0:
+                input_dict = mllm_comm.recv_forward()
+                hidden_states = input_dict['image_encoder']
+                hidden_states = block_grid_2(hidden_states=hidden_states, attention_mask=None)
+                if rank_module_info.pp_rank == rank_module_info.pp_size - 1:
+                    output_grid_2 = hidden_states
+                else:
+                    mllm_comm.send_forward({'llm': hidden_states})
+            elif rank_module_info.pp_rank < rank_module_info.pp_size - 1:
+                input_dict = mllm_comm.recv_forward(
+                    tensor_shape=(
+                        sequence_length,
+                        (grid1_dp * micro_batch_size) // grid2_dp,
+                        hidden_size,
+                    )
+                )
+                hidden_states = input_dict['llm']
+                hidden_states = block_grid_2(hidden_states=hidden_states, attention_mask=None)
+                mllm_comm.send_forward({'llm': hidden_states})
+            else:
+                input_dict = mllm_comm.recv_forward(
+                    tensor_shape=(
+                        sequence_length,
+                        (grid1_dp * micro_batch_size) // grid2_dp,
+                        hidden_size,
+                    )
+                )
+                hidden_states = input_dict['llm']
+                output_grid_2 = block_grid_2(hidden_states=hidden_states, attention_mask=None)
+
+                # Compute expected output shape based on change in DP size (chunk/expand batch dimension appropriately)
+                factor = max(grid1_dp, grid2_dp) // min(grid1_dp, grid2_dp)
+                expected_output_shape = (
+                    sequence_length,
+                    (
+                        micro_batch_size * factor
+                        if grid1_dp > grid2_dp
+                        else micro_batch_size // factor
+                    ),
+                    hidden_size,
+                )
+                assert (
+                    output_grid_2.shape == expected_output_shape
+                ), f"Output2 shape mismatch: {output_grid_2.shape}"
+
+        # ====== Reference: global (replicated) pipeline forward for correctness checking ======
+        global_block_1, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+        global_block_2, _ = get_transformer_block_and_grid(
+            ref_block,
+            tp_size=parallel_state_tp,
+            use_global_parallel_state=True,
+            hidden_size=hidden_size,
+            dtype=dtype,
+        )
+
+        for i in range(grid1_pp):
+            hidden_states_ref = global_block_1(hidden_states=hidden_states_ref, attention_mask=None)
+
+        for i in range(grid2_pp):
+            hidden_states_ref = global_block_2(hidden_states=hidden_states_ref, attention_mask=None)
+
+        # Output comparison under different DP compositions between grids
+        if (
+            grid_2 is not None
+            and mllm_comm.is_current_rank_in_grid(grid_2)
+            and rank_module_info.pp_rank == rank_module_info.pp_size - 1
+        ):
+            if grid1_dp == grid2_dp:
+                # DP size matches: all outputs directly compared
+                torch.testing.assert_close(hidden_states_ref, output_grid_2, rtol=1e-3, atol=1e-3)
+            elif grid1_dp < grid2_dp:
+                # If grid2 expands DP: each output_grid_2 chunk corresponds to a split of the reference output
+                grid2_dp_ranks = grid_2._gen_rank_enum([x for x in grid_2.dim_names if x != "dp"])
+                global_block_2_chunks = torch.split(
+                    hidden_states_ref, hidden_states_ref.shape[1] // (grid2_dp // grid1_dp), dim=1
+                )
+                relevant_chunk = None
+                for i, dp_ranks in enumerate(grid2_dp_ranks):
+                    if current_rank in dp_ranks:
+                        relevant_chunk = global_block_2_chunks[i % len(global_block_2_chunks)]
+                torch.testing.assert_close(relevant_chunk, output_grid_2, rtol=1e-3, atol=1e-3)
+            else:
+                # If DP shrinks (grid1_dp > grid2_dp): just compare the relevant first chunk
+                output_grid_2_first_chunk = torch.chunk(output_grid_2, grid1_dp // grid2_dp, dim=1)[
+                    0
+                ]
+                torch.testing.assert_close(
+                    hidden_states_ref, output_grid_2_first_chunk, rtol=1e-3, atol=1e-3
+                )
diff --git a/tests/unit_tests/pipeline_parallel/test_pipeline_layout.py b/tests/unit_tests/pipeline_parallel/test_pipeline_layout.py
index 04880fb432c..a3990d25ecf 100644
--- a/tests/unit_tests/pipeline_parallel/test_pipeline_layout.py
+++ b/tests/unit_tests/pipeline_parallel/test_pipeline_layout.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import os
 from pathlib import Path
@@ -21,6 +21,7 @@
 )
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.enums import ModelType
+from megatron.core.transformer.multi_token_prediction import mtp_on_this_rank
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.training.checkpointing import load_checkpoint, save_checkpoint
 from megatron.training.global_vars import set_args
@@ -53,6 +54,8 @@ def initialize_gpt_model(
         virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
         hidden_dropout=0.0,
         attention_dropout=0.0,
+        mtp_num_layers=1 if with_mtp else None,
+        mtp_loss_scaling_factor=1.0 if with_mtp else None,
     )
     default_config_kwargs.update(**config_kwargs)
     transformer_config = TransformerConfig(**default_config_kwargs)
@@ -61,9 +64,6 @@ def initialize_gpt_model(
         transformer_config.moe_ffn_hidden_size = 128
         transformer_config.num_moe_experts = 4
         transformer_config.add_bias_linear = False
-    if with_mtp:
-        transformer_config.mtp_num_layers = 1
-        transformer_config.mtp_loss_scaling_factor = 1.0
     model = []
     for i in range(virtual_pipeline_model_parallel_size or 1):
         if is_moe:
@@ -71,8 +71,11 @@ def initialize_gpt_model(
         else:
             layer_spec = layer_spec_fn()
 
-        if is_moe and with_mtp and mpu.is_pipeline_last_stage(ignore_virtual=False, vp_stage=i):
-            transformer_layer_spec_for_mtp = gpt_te_spec(transformer_config)
+        if with_mtp and mtp_on_this_rank(transformer_config, ignore_virtual=False, vp_stage=i):
+            if is_moe:
+                transformer_layer_spec_for_mtp = gpt_te_spec(transformer_config)
+            else:
+                transformer_layer_spec_for_mtp = layer_spec
             mtp_block_spec = get_gpt_mtp_block_spec(
                 transformer_config,
                 transformer_layer_spec_for_mtp,
@@ -81,6 +84,7 @@ def initialize_gpt_model(
             )
         else:
             mtp_block_spec = None
+
         pre_process = mpu.is_pipeline_first_stage(ignore_virtual=False, vp_stage=i)
         post_process = mpu.is_pipeline_last_stage(ignore_virtual=False, vp_stage=i)
         this_model = (
@@ -122,7 +126,6 @@ def create_args():
     args.ckpt_fully_parallel_save = False
     args.ckpt_fully_parallel_load = False
     args.auto_detect_ckpt_format = False
-    args.retro_add_retriever = False
     args.ckpt_convert_update_legacy_dist_opt_format = False
     args.ckpt_step = None
     args.use_dist_ckpt = True
@@ -141,9 +144,9 @@ def create_args():
     args.no_load_rng = True
     args.use_distributed_optimizer = True
     args.use_megatron_fsdp = False
-    args.dist_ckpt_save_pre_mcore_014 = False
     args.dist_ckpt_optim_fully_reshardable = False
     args.distrib_optim_fully_reshardable_mem_efficient = False
+    args.phase_transition_iterations = None
 
     yield args
 
@@ -163,7 +166,7 @@ def create_args():
                 [],
                 ["decoder"],
                 ["decoder"],
-                ["decoder"] * 2 + ["loss"],
+                ["decoder"] * 2 + ["mtp"] + ["loss"],
             ],
             False,
             True,
@@ -185,7 +188,19 @@ def create_args():
             False,
         ),
         ((1, 2, None), [["embedding"] + ["decoder"] * 4, ["decoder"] * 4 + ["loss"]], True, False),
-        ((1, 4, 2), "E|t*3|(t|)*5L", True, True),
+        ((1, 4, 2), "E|t*3|(t|)*5mL", True, True),  # mtp in the last stage
+        (
+            (1, 4, 2),
+            "E|t*3|(t|)*4tm|L",
+            True,
+            True,
+        ),  # mtp in the second last stage with a decoder layer
+        (
+            (1, 4, 2),
+            "E|t*3|(t|)*3tt|m|L",
+            True,
+            True,
+        ),  # mtp in the second last stage with no other layers
     ],
 )
 def test_forward_vpp(create_args, tmp_path_dist_ckpt, tp_pp_vpp, pp_layout, is_moe, with_mtp):
diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py
index b861aa2df49..7dbd9fb15b1 100644
--- a/tests/unit_tests/pipeline_parallel/test_schedules.py
+++ b/tests/unit_tests/pipeline_parallel/test_schedules.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 import os
 
 import pytest
@@ -13,6 +15,10 @@
 from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator
 from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.cuda_graphs import (
+    convert_schedule_table_to_order,
+    get_overlap_moe_expert_parallel_comm_order,
+)
 from tests.unit_tests.test_utilities import Utils
 
 rank = Utils.rank
@@ -106,7 +112,7 @@ def test_get_pipeline_parallel_order(
     schedule_table = schedule.get_schedule_table(
         num_microbatches, num_model_chunks, microbatch_group_size_per_vp_stage
     )
-    order = schedule.convert_schedule_table_to_order(
+    order = convert_schedule_table_to_order(
         num_warmup_microbatches, num_model_chunks, schedule_table
     )
 
@@ -127,6 +133,52 @@ def test_get_pipeline_parallel_order(
     for k, v in order_cnt.items():
         assert -k in order_cnt and order_cnt[-k] == v
 
+    layers_per_chunk = 2
+    num_layers_per_chunk = [layers_per_chunk] * num_model_chunks
+    # disable wgrad compute
+    overlapped_order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order(
+        order, num_layers_per_chunk, False
+    )
+    assert max(overlapped_order) == num_model_chunks * layers_per_chunk
+    assert len(overlapped_order) == len(order) * layers_per_chunk
+    assert len(chunk_id_list) == len(overlapped_order)
+    order_cnt = {}
+    accumulated_order = 0
+    for o in overlapped_order:
+        order_cnt[o] = order_cnt.get(o, 0) + 1
+        if o < 0:
+            assert -o in order_cnt and order_cnt[-o] >= order_cnt[o]
+        elif -o in order_cnt:
+            assert order_cnt[-o] < order_cnt[o]
+        accumulated_order += o
+        assert accumulated_order >= 0
+    assert accumulated_order == 0
+
+    # enable wgrad compute
+    overlapped_order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order(
+        order, num_layers_per_chunk, True
+    )
+    assert max(overlapped_order) == num_model_chunks * layers_per_chunk
+    assert len(overlapped_order) == len(order) * layers_per_chunk * 3 // 2
+    assert len(chunk_id_list) == len(overlapped_order)
+    from math import ceil
+
+    order_cnt = {}
+    accumulated_order = 0
+    prev_o = 0
+    for o in overlapped_order:
+        if ceil(o) != o:
+            assert prev_o - 0.5 == o
+        else:
+            order_cnt[o] = order_cnt.get(o, 0) + 1
+            if o < 0:
+                assert -o in order_cnt and order_cnt[-o] >= order_cnt[o]
+            elif -o in order_cnt:
+                assert order_cnt[-o] < order_cnt[o]
+        accumulated_order += o
+        prev_o = o
+    assert accumulated_order < 0
+
     Utils.destroy_model_parallel()
 
 
diff --git a/tests/unit_tests/post_training/test_modelopt_model_builder.py b/tests/unit_tests/post_training/test_modelopt_model_builder.py
new file mode 100644
index 00000000000..b489d659ec4
--- /dev/null
+++ b/tests/unit_tests/post_training/test_modelopt_model_builder.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Unit tests for model_provider integration with ModelOpt model_builder."""
+
+from argparse import Namespace
+
+import model_provider as mp
+
+
+def _sentinel_builder(return_value, calls):
+    """Create a builder stub that records invocation."""
+
+    def _builder(args, pre_process, post_process, vp_stage, config=None, pg_collection=None):
+        calls.append(
+            {
+                "args": args,
+                "pre_process": pre_process,
+                "post_process": post_process,
+                "vp_stage": vp_stage,
+                "config": config,
+                "pg_collection": pg_collection,
+            }
+        )
+        return return_value
+
+    return _builder
+
+
+def test_model_provider_switches_to_modelopt_builder(monkeypatch):
+    """Ensure model_provider delegates to ModelOpt builder when enabled."""
+    args = Namespace(record_memory_history=False, modelopt_enabled=True)
+    modelopt_calls = []
+    original_calls = []
+
+    modelopt_result = object()
+    original_result = object()
+
+    # Force ModelOpt availability and stub builders.
+    monkeypatch.setattr(mp, "has_nvidia_modelopt", True)
+    monkeypatch.setattr(mp, "get_args", lambda: args)
+    monkeypatch.setattr(
+        mp, "modelopt_gpt_mamba_builder", _sentinel_builder(modelopt_result, modelopt_calls)
+    )
+
+    # original_builder should be ignored when ModelOpt is enabled.
+    original_builder = _sentinel_builder(original_result, original_calls)
+
+    returned = mp.model_provider(
+        original_builder,
+        pre_process=False,
+        post_process=False,
+        vp_stage=1,
+        config="cfg",
+        pg_collection="pg",
+    )
+
+    assert returned is modelopt_result
+    assert modelopt_calls == [
+        {
+            "args": args,
+            "pre_process": False,
+            "post_process": False,
+            "vp_stage": 1,
+            "config": "cfg",
+            "pg_collection": "pg",
+        }
+    ]
+    assert len(original_calls) == 0
diff --git a/tests/unit_tests/post_training/test_modelopt_module_spec.py b/tests/unit_tests/post_training/test_modelopt_module_spec.py
index f27a22390f7..dac96785bc0 100644
--- a/tests/unit_tests/post_training/test_modelopt_module_spec.py
+++ b/tests/unit_tests/post_training/test_modelopt_module_spec.py
@@ -6,7 +6,7 @@
 import torch
 from packaging.version import Version
 
-from megatron.core import dist_checkpointing
+from megatron.core import dist_checkpointing, parallel_state
 from megatron.core.inference.contexts import StaticInferenceContext
 from megatron.core.models.gpt.gpt_layer_specs import (
     get_gpt_decoder_block_spec,
@@ -92,8 +92,11 @@ def setup_method(self, method):
     def test_sharded_state_dict_restore(self, tmp_path_dist_ckpt):
         """Save with the default TE spec and restore using the ModelOpt spec."""
         _dist_checkpoint_name = "default_model"
-        te_fused_sharded_state_dict = self.default_model.sharded_state_dict()
-        modelopt_sharded_state_dict = self.modelopt_model.sharded_state_dict()
+        metadata = {
+            "dp_cp_group": parallel_state.get_data_parallel_group(with_context_parallel=True)
+        }
+        te_fused_sharded_state_dict = self.default_model.sharded_state_dict(metadata=metadata)
+        modelopt_sharded_state_dict = self.modelopt_model.sharded_state_dict(metadata=metadata)
 
         with TempNamedDir(tmp_path_dist_ckpt / _dist_checkpoint_name, sync=True) as tmpdirname:
             dist_checkpointing.save(te_fused_sharded_state_dict, tmpdirname)
@@ -170,6 +173,7 @@ def setup_method(self, method):
             moe_ffn_hidden_size=128,
             moe_shared_expert_intermediate_size=128,
             qk_layernorm=True,
+            qk_l2_norm=True,
             use_cpu_initialization=True,
         )
         default_spec = get_gpt_decoder_block_spec(
diff --git a/tests/unit_tests/resharding/test_model_swap.py b/tests/unit_tests/resharding/test_model_swap.py
new file mode 100644
index 00000000000..73296a175ed
--- /dev/null
+++ b/tests/unit_tests/resharding/test_model_swap.py
@@ -0,0 +1,298 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import copy
+import os
+import types
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+import torch.distributed as dist
+
+from megatron.core import parallel_state as mpu
+from megatron.core.hyper_comm_grid import HyperCommGrid
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.resharding.refit import swap_model_weights
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord
+from megatron.core.transformer.transformer_config import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    import nvshmem.core
+
+    has_nvshmem = True
+except Exception:
+    has_nvshmem = False
+
+
+def _build_pg_collection(
+    tp_size: int, pp_size: int = None, ep_size: int = 1
+) -> ProcessGroupCollection:
+    cp_size = mpu.get_context_parallel_world_size()
+    if pp_size is None:
+        pp_size = mpu.get_pipeline_model_parallel_world_size()
+    world_size = dist.get_world_size()
+    dp_size = world_size // (tp_size * cp_size * ep_size * pp_size)
+    assert dp_size >= 1 and (tp_size * cp_size * ep_size * pp_size * dp_size) == world_size
+
+    grid = HyperCommGrid(
+        [tp_size, cp_size, ep_size, pp_size, dp_size], ["tp", "cp", "ep", "pp", "dp"]
+    )
+    tp_group = grid.create_pg("tp")
+    cp_group = grid.create_pg("cp")
+    pp_group = grid.create_pg("pp")
+    ep_group = grid.create_pg("ep")
+    dp_group = grid.create_pg("dp")
+    # Composite groups required by MoE/router and some utilities
+    tp_cp_group = grid.create_pg(["tp", "cp"])
+    mp_group = grid.create_pg(["tp", "cp", "ep", "pp"])
+    tp_ep_group = grid.create_pg(["tp", "ep"])
+    tp_ep_pp_group = grid.create_pg(["tp", "ep", "pp"])
+    dp_cp_group = grid.create_pg(["cp", "dp"])
+    tp_dp_cp_group = grid.create_pg(["tp", "cp", "dp"])
+    embd_group_ranks = mpu.default_embedding_ranks(dist.get_process_group_ranks(pp_group))
+    embd_group = dist.new_group(ranks=embd_group_ranks)
+    pos_embd_group_ranks = mpu.default_position_embedding_ranks(
+        dist.get_process_group_ranks(pp_group)
+    )
+    pos_embd_group = dist.new_group(ranks=pos_embd_group_ranks)
+    return ProcessGroupCollection(
+        tp=tp_group,
+        cp=cp_group,
+        pp=pp_group,
+        ep=ep_group,
+        embd=embd_group,
+        pos_embd=pos_embd_group,
+        dp=dp_group,
+        tp_cp=tp_cp_group,
+        mp=mp_group,
+        expt_tp=tp_group,
+        expt_dp=dp_group,
+        tp_ep=tp_ep_group,
+        tp_ep_pp=tp_ep_pp_group,
+        dp_cp=dp_cp_group,
+        tp_dp_cp=tp_dp_cp_group,
+    )
+
+
+def _build_gpt(
+    config: TransformerConfig,
+    vocab_size: int,
+    seq_len: int,
+    pg_collection,
+    parallel_output: bool = True,
+    num_moe_experts: Optional[int] = None,
+) -> GPTModel:
+    model = GPTModel(
+        config=config,
+        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(
+            num_experts=num_moe_experts, moe_grouped_gemm=(num_moe_experts is not None)
+        ),
+        vocab_size=vocab_size,
+        max_sequence_length=seq_len,
+        pre_process=True,
+        post_process=True,
+        fp16_lm_cross_entropy=False,
+        parallel_output=parallel_output,
+        share_embeddings_and_output_weights=True,
+        position_embedding_type="rope",
+        rotary_percent=1.0,
+        pg_collection=pg_collection,
+    )
+    return model
+
+
+def _mp_config() -> ModelParallelConfig:
+    return ModelParallelConfig(
+        params_dtype=torch.float32,
+        use_cpu_initialization=True,
+        sequence_parallel=False,
+        gradient_accumulation_fusion=False,
+    )
+
+
+def _set_pg_collection(module, tp_group, dp_group):
+    module.pg_collection = types.SimpleNamespace(tp=tp_group, dp=dp_group, ep=None, pp=None)
+    return module
+
+
+@pytest.mark.parametrize(
+    "refit_backend",
+    [
+        pytest.param(
+            "nvshmem",
+            marks=pytest.mark.skipif(
+                not has_nvshmem,
+                reason="nvshmem.core is not available (NVSHMEM Python bindings not installed)",
+            ),
+        ),
+        "nccl",
+        "gloo",
+    ],
+)
+@pytest.mark.parametrize(
+    "src_tp,src_pp,src_ep,dst_tp,dst_pp,dst_ep,num_experts",
+    [
+        # TP only changes
+        (2, 1, 1, 1, 1, 1, None),  # TP2 -> TP1
+        (1, 1, 1, 2, 1, 1, None),  # TP1 -> TP2
+        (2, 1, 1, 4, 1, 1, None),  # TP2 -> TP4
+        # # PP only changes
+        (1, 2, 1, 1, 1, 1, None),  # PP2 -> PP1
+        (1, 1, 1, 1, 2, 1, None),  # PP1 -> PP2
+        # # Both TP and PP change
+        (2, 2, 1, 1, 1, 1, None),  # TP2,PP2 -> TP1,PP1
+        (1, 1, 1, 2, 2, 1, None),  # TP1,PP1 -> TP2,PP2
+        (2, 1, 1, 1, 2, 1, None),  # TP2,PP1 -> TP1,PP2
+        (1, 2, 1, 2, 1, 1, None),  # TP1,PP2 -> TP2,PP1
+        (1, 2, 1, 2, 4, 1, None),  # TP1,PP2 -> TP2,PP4
+        (1, 1, 2, 1, 1, 4, 4),  # EP2 -> EP4
+        (1, 1, 2, 1, 1, 1, 4),  # EP2 -> EP1
+        (1, 1, 1, 1, 1, 2, 4),
+        (1, 1, 2, 1, 2, 2, 4),
+    ],
+)
+def test_swap_gpt_parametrized(
+    refit_backend: str,
+    src_tp: int,
+    src_pp: int,
+    src_ep: int,
+    dst_tp: int,
+    dst_pp: int,
+    dst_ep: int,
+    num_experts: Optional[int],
+):
+    # Initialize environment with source MP sizing
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=src_tp, pipeline_model_parallel_size=src_pp
+    )
+    # Validate divisibility post-init using the default PG safely
+    world = dist.get_world_size()
+    if (world % (src_tp * src_pp * src_ep) != 0) or (world % (dst_tp * dst_pp * dst_ep) != 0):
+        Utils.destroy_model_parallel()
+        pytest.skip(
+            "WORLD_SIZE must be divisible by both src_tp*src_pp*src_ep and dst_tp*dst_pp*dst_ep"
+        )
+    model_parallel_cuda_manual_seed(1234)
+
+    torch.manual_seed(1234)
+    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+
+    # Small GPT config
+    seq_len = 8
+    vocab_size = 128
+    # --group-query-attention   --num-query-groups 8
+    cfg = TransformerConfig(
+        num_layers=4 if (src_pp > 1 or dst_pp > 1) else 2,
+        hidden_size=32,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        pipeline_dtype=torch.float32,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        moe_router_dtype="fp64",
+        moe_token_dispatcher_type="alltoall",
+        num_query_groups=4,
+    )
+
+    # Build PGs and models (always use unified PG builder so we can set EP)
+    src_pgs = _build_pg_collection(tp_size=src_tp, pp_size=src_pp, ep_size=src_ep)
+    dst_pgs = _build_pg_collection(tp_size=dst_tp, pp_size=dst_pp, ep_size=dst_ep)
+    # Apply EP configuration to TransformerConfigs when MoE is requested
+    src_cfg = copy.deepcopy(cfg)
+    dst_cfg = copy.deepcopy(cfg)
+    if num_experts is not None:
+        src_cfg.num_moe_experts = num_experts
+        dst_cfg.num_moe_experts = num_experts
+        # Ensure MoE MLP has an intermediate size; __post_init__ won't rerun after manual mutation
+        src_cfg.moe_ffn_hidden_size = src_cfg.ffn_hidden_size
+        dst_cfg.moe_ffn_hidden_size = dst_cfg.ffn_hidden_size
+        src_cfg.expert_model_parallel_size = src_ep
+        dst_cfg.expert_model_parallel_size = dst_ep
+        # Force grouped MLP path under Transformer Engine and satisfy requirements
+        src_cfg.moe_grouped_gemm = True
+        dst_cfg.moe_grouped_gemm = True
+        src_cfg.add_bias_linear = False
+        dst_cfg.add_bias_linear = False
+        # Require Transformer Engine for TEGroupedMLP; skip if unavailable
+        try:
+            import transformer_engine
+        except Exception:
+            Utils.destroy_model_parallel()
+            pytest.skip("Transformer Engine not available; skipping TE-grouped MoE test")
+    # Use parallel_output=False to gather TP logits inside model and emit only on last PP stage
+    src_model = (
+        _build_gpt(
+            src_cfg,
+            vocab_size,
+            seq_len,
+            src_pgs,
+            parallel_output=False,
+            num_moe_experts=num_experts,
+        )
+        .to(device)
+        .eval()
+    )
+    dst_model = (
+        _build_gpt(
+            dst_cfg,
+            vocab_size,
+            seq_len,
+            dst_pgs,
+            parallel_output=False,
+            num_moe_experts=num_experts,
+        )
+        .to(device)
+        .eval()
+    )
+
+    # Inputs
+    batch = 2
+    tokens = torch.randint(
+        low=0, high=vocab_size, size=(batch, seq_len), device=device, dtype=torch.long
+    )
+    position_ids = (
+        torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0).expand(batch, -1)
+    )
+    attention_mask = torch.ones((batch, 1, seq_len, seq_len), device=device, dtype=torch.bool)
+
+    # Collect source reference logits (parallel_output=False ensures full vocab on last PP stage)
+    ref_logits = torch.empty(batch, seq_len, vocab_size, device=device, dtype=torch.float32)
+    src_pp_ranks = dist.get_process_group_ranks(src_pgs.pp)
+    src_last_pp_rank = src_pp_ranks[-1]
+    with torch.no_grad():
+        src_out = src_model(tokens, position_ids, attention_mask)
+        if dist.get_rank() == src_last_pp_rank:
+            ref = src_out  # [b, s, vocab]
+            ref_logits.copy_(ref)
+    dist.broadcast(ref_logits, src=src_last_pp_rank, group=src_pgs.pp)
+
+    # Swap weights
+    swap_model_weights([src_model], [dst_model], refit_method=refit_backend)
+
+    # Collect destination logits (parallel_output=False ensures full vocab on last PP stage)
+    dst_logits = torch.empty(batch, seq_len, vocab_size, device=device, dtype=torch.float32)
+    dst_pp_ranks = dist.get_process_group_ranks(dst_pgs.pp)
+    dst_last_pp_rank = dst_pp_ranks[-1]
+    with torch.no_grad():
+        dst_out = dst_model(
+            tokens, position_ids, attention_mask
+        )  # last stage returns tensor, others return None
+        if dist.get_rank() == dst_last_pp_rank:
+            dst_logits.copy_(dst_out)  # [b, s, vocab]
+    dist.broadcast(dst_logits, src=dst_last_pp_rank, group=dst_pgs.pp)
+
+    # Compare
+    assert ref_logits.shape == dst_logits.shape
+    assert torch.allclose(
+        dst_logits, ref_logits, atol=1e-4, rtol=1e-4
+    ), f"Refit src(TP={src_tp},PP={src_pp})->dst(TP={dst_tp},PP={dst_pp}) GPT outputs differ"
+    dist.barrier()
+    Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/rl/test_rl_batch_invariant.py b/tests/unit_tests/rl/test_rl_batch_invariant.py
new file mode 100644
index 00000000000..ab339755307
--- /dev/null
+++ b/tests/unit_tests/rl/test_rl_batch_invariant.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import os
+
+import pytest
+import torch
+
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import set_batch_invariant_mode
+from megatron.rl.rl_utils import selective_log_softmax
+
+
+def test_selective_log_softmax_batch_invariant():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+
+    B, S, V = 4, 7, 16
+    device = torch.device("cuda")
+    logits = torch.randn(B, S, V, dtype=torch.float32, device=device)
+    labels = torch.randint(low=0, high=V, size=(B, S), device=device)
+
+    # Randomly permute the batch dimension; a batch-invariant implementation should
+    # produce outputs that are identical up to the same permutation.
+    perm = torch.randperm(B, device=device)
+
+    with set_batch_invariant_mode(True):
+        bik_logps = selective_log_softmax(logits, labels)  # [B, S]
+        bik_logps_perm = selective_log_softmax(
+            logits[perm], labels[perm]
+        )  # [B, S] corresponding to permuted batch
+
+    # Undo the permutation on the permuted outputs and compare elementwise.
+    # If the kernel is batch invariant, each example's output should not depend
+    # on its position in the batch.
+    assert torch.equal(bik_logps, bik_logps_perm[perm.argsort()])
diff --git a/tests/unit_tests/rl/test_rl_utils.py b/tests/unit_tests/rl/test_rl_utils.py
new file mode 100644
index 00000000000..cff62d40f0e
--- /dev/null
+++ b/tests/unit_tests/rl/test_rl_utils.py
@@ -0,0 +1,656 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import itertools
+import os
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.enums import ModelType
+from megatron.core.models.common.language_module.language_module import LanguageModule
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
+from megatron.core.pipeline_parallel.utils import is_pp_last_stage
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.rl import rl_utils
+from megatron.rl.agent.api import TokenRollout
+from megatron.training.arguments import parse_args, validate_args
+from megatron.training.global_vars import destroy_global_vars, set_global_variables
+from tests.unit_tests.test_utilities import Utils
+
+BATCH = 2
+SEQ = 4
+VOCAB = 754
+
+
+class MockModel(LanguageModule):
+    def __init__(self, batch=BATCH, seq=SEQ, vocab=VOCAB):
+        self.batch = batch
+        self.seq = seq
+        self.vocab = vocab
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        self.config = TransformerConfig(
+            num_attention_heads=8, num_layers=8, pipeline_dtype=torch.bfloat16
+        )
+        self.model_type = ModelType.encoder_or_decoder
+
+    def __call__(self, x, position_ids, attention_mask, **kwargs):
+        del position_ids
+        del attention_mask
+        batch, seq = x.shape
+        mock_model_outputs = torch.ones((batch, seq, self.vocab), device=x.device)
+        return mock_model_outputs
+
+    def load_state_dict(self, params):
+        del params
+
+    def train(self, mode=True):
+        del mode
+
+    def state_dict(self):
+        return {}
+
+    def set_input_tensor(self, input_tensor):
+        pass
+
+
+class MockTokenizer:
+    def __init__(self):
+        self.pad = 42
+        self.eod = 43
+        self.vocab_size = VOCAB
+        self.bos = None
+
+    def detokenize(self, tokens):
+        return [str(tok) for tok in tokens]
+
+
+@pytest.fixture
+def initialize_model_parallel(request, monkeypatch):
+    """Fixture to initialize and destroy model parallel.
+
+    Parameters are passed via request.param as a tuple: (tp, pp)
+    Skips if world_size < tp * pp.
+    """
+    monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
+    monkeypatch.setenv("WANDB_MODE", "disabled")
+    monkeypatch.setenv("LOG_TO_WANDB", "false")
+
+    tp, pp = request.param
+    world_size = Utils.world_size
+    Utils.initialize_model_parallel(tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp)
+    dp = world_size // (tp * pp)
+    yield world_size, dp, tp, pp
+    Utils.destroy_model_parallel()
+    destroy_global_vars()
+    destroy_num_microbatches_calculator()
+
+
+@pytest.fixture(autouse=True)
+def cleanup_global_state():
+    """Ensure global state is correctly cleaned up after every test."""
+    yield
+    destroy_global_vars()
+    destroy_num_microbatches_calculator()
+
+
+class TestRLUtils:
+    """Test class for RL utilities."""
+
+    def create_test_args(self, **kwargs):
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+
+        args = parse_args(ignore_unknown_args=True)
+        args.num_layers = 8
+        args.num_attention_heads = 8
+        args.vocab_size = VOCAB
+        args.hidden_size = 128
+        args.max_position_embeddings = 256
+        args.seq_length = 256
+        args.wandb_project = None
+
+        args.micro_batch_size = 1
+
+        for key, value in kwargs.items():
+            setattr(args, key, value)
+
+        args = validate_args(args)
+        set_global_variables(args, False)
+        return args
+
+    @pytest.mark.parametrize(
+        "initialize_model_parallel",
+        [
+            pytest.param((tp, pp), id=f"tp{tp}-pp{pp}")
+            for tp, pp in itertools.product([1, 2, 4, 8], [1, 2, 4, 8])
+            if tp * pp <= Utils.world_size
+        ],
+        indirect=["initialize_model_parallel"],
+    )
+    @pytest.mark.parametrize("use_sequence_packing", [False])
+    def test_get_logprobs(self, initialize_model_parallel, use_sequence_packing):
+        """Test that getting logprobs at least does not crash."""
+        self.create_test_args(rl_use_sequence_packing=use_sequence_packing)
+
+        model = MockModel()
+        tokens = torch.ones((BATCH, SEQ), dtype=torch.long)
+        logprobs = rl_utils.get_logprobs(
+            model, tokens, position_ids=None, sequence_packing=use_sequence_packing
+        )
+        if is_pp_last_stage(model.pg_collection.pp):
+            # We chop off 1 element from the sequence dimension.
+            assert logprobs.shape == (BATCH, SEQ - 1)
+            # As we return ones as logits, all logprobs should be the same.
+            assert torch.all(logprobs == logprobs[0, 0]).item()
+        else:
+            assert logprobs.shape == (BATCH, SEQ, VOCAB)
+
+    def test_grpo_loss_calculation_all_pi_eq(self):
+        # All policies are equal: clamping is inactive, ratios are ones.
+        current_logprobs = torch.ones(BATCH, SEQ)
+        old_logprobs = torch.ones(BATCH, SEQ)
+        ref_logprobs = torch.ones(BATCH, SEQ)
+        advantages = torch.zeros(BATCH)
+        loss, kl_term, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss(
+            current_logprobs=current_logprobs,
+            old_logprobs=old_logprobs,
+            ref_logprobs=ref_logprobs,
+            advantages=advantages,
+            clamp_eps_lower=0.1,
+            clamp_eps_upper=0.1,
+            kl_beta=0.1,
+            entropy_weight=0.0,
+        )
+        torch.testing.assert_close(loss, torch.zeros_like(loss))
+        torch.testing.assert_close(kl_term, torch.zeros_like(kl_term))
+        torch.testing.assert_close(ratios, torch.ones_like(ratios))
+        torch.testing.assert_close(entropy_term, -torch.ones_like(ratios) * torch.e)
+
+    def test_grpo_loss_calculation_2x_ratios(self):
+        # All policies are equal: clamping is inactive, ratios are ones.
+        current_logprobs = torch.ones(BATCH, SEQ)
+        old_logprobs = torch.ones(BATCH, SEQ) - torch.log(torch.tensor([2.0]))
+        ref_logprobs = torch.ones(BATCH, SEQ)
+        advantages = torch.ones(BATCH)
+        loss, kl_term, ratios, _, _, _ = rl_utils.calculate_grpo_loss(
+            current_logprobs=current_logprobs,
+            old_logprobs=old_logprobs,
+            ref_logprobs=ref_logprobs,
+            advantages=advantages,
+            clamp_eps_lower=2.1,
+            clamp_eps_upper=2.1,
+            kl_beta=0.0,
+            entropy_weight=0.0,
+        )
+        # Clamping does not affect us, as 2.1 [eps] > 2 [ratio].
+        # kl_beta = 0 -> we only have the non-kl term of the loss active.
+        torch.testing.assert_close(loss, -torch.ones_like(loss) * 2)
+        # pi and pi_{ref} are the same here.
+        torch.testing.assert_close(kl_term, torch.zeros_like(kl_term))
+        # Current probs are 2x more probable than old pi.
+        torch.testing.assert_close(ratios, torch.ones_like(ratios) * 2)
+
+    def test_entropy_calculation(self):
+        # All policies are equal: clamping is inactive, ratios are ones.
+        current_logprobs = torch.ones(BATCH, SEQ)
+        old_logprobs = torch.ones(BATCH, SEQ)
+        ref_logprobs = torch.ones(BATCH, SEQ)
+        advantages = torch.zeros(BATCH)
+        loss, _, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss(
+            current_logprobs=current_logprobs,
+            old_logprobs=old_logprobs,
+            ref_logprobs=ref_logprobs,
+            advantages=advantages,
+            clamp_eps_lower=0.1,
+            clamp_eps_upper=0.1,
+            kl_beta=0.0,
+            entropy_weight=1.0,
+        )
+        torch.testing.assert_close(loss, torch.ones_like(ratios) * torch.e)
+        torch.testing.assert_close(entropy_term, -torch.ones_like(ratios) * torch.e)
+
+    def test_grpo_loss_truncation(self):
+        # All ratios are 2
+        _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
+            current_logprobs=torch.ones(BATCH, SEQ),
+            old_logprobs=0.5 * torch.ones(BATCH, SEQ),
+            ref_logprobs=torch.ones(BATCH, SEQ),
+            advantages=torch.zeros(BATCH),
+            clamp_eps_lower=0.1,
+            clamp_eps_upper=0.1,
+            kl_beta=0.1,
+            entropy_weight=0.0,
+        )
+        assert truncated_from_above.float().mean() == 1
+        assert truncated_from_below.float().sum() == 0
+
+        # All ratios are 0.01
+        _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
+            current_logprobs=0.01 * torch.ones(BATCH, SEQ),
+            old_logprobs=torch.ones(BATCH, SEQ),
+            ref_logprobs=torch.ones(BATCH, SEQ),
+            advantages=torch.zeros(BATCH),
+            clamp_eps_lower=0.1,
+            clamp_eps_upper=0.1,
+            kl_beta=0.1,
+            entropy_weight=0.0,
+        )
+        assert truncated_from_above.float().sum() == 0
+        assert truncated_from_below.float().mean() == 1
+
+        # Mixed ratios: [[2., 0.5], [20., 1.]]
+        current_logprobs = torch.tensor([[1.0, 1.0], [1.0, 1.0]])
+        old_logprobs = torch.tensor([[0.5, 2.0], [0.05, 1.0]])
+        _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
+            current_logprobs=current_logprobs,
+            old_logprobs=old_logprobs,
+            ref_logprobs=old_logprobs,
+            advantages=torch.zeros(BATCH),
+            clamp_eps_lower=0.1,
+            clamp_eps_upper=0.1,
+            kl_beta=0.1,
+            entropy_weight=0.0,
+        )
+        torch.testing.assert_close(
+            truncated_from_above, torch.tensor([[True, False], [True, False]])
+        )
+        torch.testing.assert_close(
+            truncated_from_below, torch.tensor([[False, True], [False, False]])
+        )
+
+    @pytest.mark.parametrize(
+        "initialize_model_parallel",
+        [
+            pytest.param((tp, pp), id=f"tp{tp}-pp{pp}")
+            for tp, pp in itertools.product([1, 2, 4, 8], [1, 2, 4, 8])
+            if tp * pp <= Utils.world_size
+        ],
+        indirect=["initialize_model_parallel"],
+    )
+    def test_prepare_data_for_update(self, initialize_model_parallel):
+        """Test that getting logprobs at least does not crash."""
+        world_size, dp, tp, pp = initialize_model_parallel
+        # Here I assume that we will be consuming all data in one step.
+        group_size = 2
+        self.create_test_args(
+            micro_batch_size=2,
+            seq_length=4,
+            curr_iteration=1,
+            tensor_model_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
+            global_batch_size=dp * 2,
+            grpo_prompts_per_step=dp,
+            grpo_group_size=group_size,
+        )
+
+        model = MockModel()
+        tokenizer = MockTokenizer()
+
+        r1 = TokenRollout(
+            trajectory=[[1, 2, 3]],
+            reward=3.14,
+            generation_mask=[[False, True, True]],
+            logprobs=[[0.1, 0.2, 0.3]],
+            env_id='MEGAENV',
+            problem_id="2",
+        )
+        r2 = TokenRollout(
+            trajectory=[[1, 2, 3, 4]],
+            reward=0.14,
+            generation_mask=[[False, True, True, True]],
+            logprobs=[[0.1, 0.2, 0.3, -1.2]],
+            env_id='MEGAENV',
+            problem_id="2",
+        )
+
+        rollouts = [[r1, r2] for _ in range(dp)]
+        try:
+            rl_utils.prepare_data_for_update(
+                [model], {}, rollouts, tokenizer, sequence_packing=False, is_correction=False
+            )
+        except AssertionError as e:
+            # We expect trajectories to come padded there.
+            assert str(e).startswith('Rollout is not the correct length')
+
+        r1 = TokenRollout(
+            trajectory=torch.tensor([[1, 2, 3, tokenizer.eod]], dtype=torch.float).cuda(),
+            reward=3.14,
+            generation_mask=torch.tensor([[False, True, True, True]], dtype=torch.float).cuda(),
+            logprobs=torch.tensor([[-0.2, -0.3, -3.2]]).cuda(),
+            env_id='MEGAENV',
+            problem_id="2",
+        )
+        r2 = TokenRollout(
+            trajectory=torch.tensor([[1, 2, 234, tokenizer.eod]], dtype=torch.float).cuda(),
+            reward=0.14,
+            generation_mask=torch.tensor([[False, True, True, True]], dtype=torch.float).cuda(),
+            logprobs=torch.tensor([[-0.2, -0.3, -1.2]]),
+            env_id='MEGAENV',
+            problem_id="2",
+        )
+        rollouts = [[r1, r2] for _ in range(dp)]
+        data_iter = rl_utils.prepare_data_for_update(
+            [model], {}, rollouts, tokenizer, sequence_packing=False, is_correction=False
+        )
+
+        _, _, old_logprobs, _, _, _, _ = next(data_iter)
+        # All logits are ones in the MockModel.
+        # All probabilities should be uniform.
+        torch.testing.assert_close(old_logprobs.exp(), torch.ones_like(old_logprobs) / VOCAB)
+
+    @pytest.mark.parametrize("use_sequence_packing", [True, False])
+    @pytest.mark.parametrize("num_turns", [1, 2])
+    def test_prepare_trajectories(self, use_sequence_packing, num_turns):
+        """Test that rollouts are properly prepared for training."""
+        seq_length = 8
+        self.create_test_args(
+            rl_use_sequence_packing=use_sequence_packing,
+            rl_sequence_packing_bin_size=20,
+            rl_skip_bos_token=False,
+            micro_batch_size=1,
+            seq_length=seq_length,
+        )
+        tokenizer = MockTokenizer()
+
+        # Create rollouts of varying lengths
+        r1 = TokenRollout(
+            trajectory=[[1, 2, 3, tokenizer.eod]] * num_turns,
+            reward=3.14,
+            generation_mask=[[False, True, True, True]] * num_turns,
+            logprobs=[[0.1, 0.2, 0.3, 0.35]] * num_turns,
+            env_id='MEGAENV',
+            problem_id="1",
+        )
+        r2 = TokenRollout(
+            trajectory=[[4, 5, 6, 7, tokenizer.eod]] * num_turns,
+            reward=0.14,
+            generation_mask=[[False, True, True, True, True]] * num_turns,
+            logprobs=[[0.4, 0.5, 0.6, 0.7, 0.75]] * num_turns,
+            env_id='MEGAENV',
+            problem_id="2",
+        )
+        r3 = TokenRollout(
+            trajectory=[[8, 9, tokenizer.eod]] * num_turns,
+            reward=2.71,
+            generation_mask=[[False, True, True]] * num_turns,
+            logprobs=[[0.8, 0.9, 0.95]] * num_turns,
+            env_id='MEGAENV',
+            problem_id="3",
+        )
+
+        rollouts = [r1, r2, r3]
+
+        trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(
+            rollouts,
+            tokenizer,
+            seq_length,
+            sequence_packing=use_sequence_packing,
+            skip_bos_token=False,
+        )
+
+        expected_trajs = torch.tensor(
+            [
+                [1, 2, 3, tokenizer.eod] + [tokenizer.pad] * 4,
+                [4, 5, 6, 7, tokenizer.eod] + [tokenizer.pad] * 3,
+                [8, 9, tokenizer.eod] + [tokenizer.pad] * 5,
+            ],
+            dtype=torch.long,
+            device=trajs.device,
+        ).repeat_interleave(num_turns, dim=0)
+        assert torch.equal(trajs, expected_trajs)
+
+        expected_genmask = torch.tensor(
+            [
+                [False, True, True, True] + [False] * 4,
+                [False, True, True, True, True] + [False] * 3,
+                [False, True, True] + [False] * 5,
+            ],
+            dtype=torch.bool,
+            device=genmask.device,
+        ).repeat_interleave(num_turns, dim=0)
+        assert torch.equal(genmask, expected_genmask)
+
+        if use_sequence_packing:
+            expected_logprobs = torch.tensor(
+                [
+                    [0.1, 0.2, 0.3, 0.35] + [0.0] * 4,
+                    [0.4, 0.5, 0.6, 0.7, 0.75] + [0.0] * 3,
+                    [0.8, 0.9, 0.95] + [0.0] * 5,
+                ],
+                dtype=torch.float32,
+                device=inference_logprobs.device,
+            ).repeat_interleave(num_turns, dim=0)
+            torch.testing.assert_close(inference_logprobs, expected_logprobs, rtol=0, atol=0)
+        else:
+            expected_logprobs = [
+                [0.1, 0.2, 0.3, 0.35],
+                [0.4, 0.5, 0.6, 0.7, 0.75],
+                [0.8, 0.9, 0.95],
+            ]
+            expected_logprobs = [el for el in expected_logprobs for _ in range(num_turns)]
+            assert len(inference_logprobs) == len(expected_logprobs)
+            for got, exp in zip(inference_logprobs, expected_logprobs):
+                got_t = got if torch.is_tensor(got) else torch.tensor(got, dtype=torch.float32)
+                exp_t = torch.tensor(exp, dtype=torch.float32, device=got_t.device)
+                torch.testing.assert_close(got_t, exp_t, rtol=0, atol=0)
+
+    def test_single_turn_advantage_calculation(self):
+        rewards = [[-1, 1], [4, 4]]
+        num_turns = [[1, 1], [1, 1]]
+        advs = rl_utils.calculate_grpo_advantages(rewards, num_turns)
+        torch.testing.assert_close(
+            torch.tensor(advs), torch.tensor([-1, 1.0, 0.0, 0.0]), atol=1e-4, rtol=1e-5
+        )
+
+    def test_multi_turn_advantage_calculation(self):
+        rewards = [[-1, 1], [4, 4]]
+        num_turns = [[2, 1], [1, 3]]
+        advs = rl_utils.calculate_grpo_advantages(rewards, num_turns)
+        torch.testing.assert_close(
+            torch.tensor(advs),
+            torch.tensor([-1, -1, 1.0, 0.0, 0.0, 0.0, 0.0]),
+            atol=1e-4,
+            rtol=1e-5,
+        )
+
+    def test_pad_list_of_nones(self):
+        with pytest.raises(ValueError) as e_info:
+            rl_utils._pad_nonnull_with_zeros([None] * 3, 42)
+        assert "At least one" in str(e_info)
+
+    def test_pad_with_wrong_params(self):
+        with pytest.raises(ValueError) as e_info:
+            rl_utils._pad_nonnull_with_zeros([torch.zeros(5)], 4)
+        assert "larger length" in str(e_info)
+
+    def test_pad_full_size(self):
+        padded = rl_utils._pad_nonnull_with_zeros([torch.zeros(5), torch.zeros(5)], 5)
+        assert padded.shape == (2, 5)
+
+    def test_pad_some_nones(self):
+        padded = rl_utils._pad_nonnull_with_zeros([None, torch.zeros(5)], 5)
+        assert padded.shape == (2, 5)
+        assert (padded[0] == 0).all()
+
+    def test_pad_normal(self):
+        padded = rl_utils._pad_nonnull_with_zeros(
+            [torch.zeros(2), torch.zeros(3), torch.zeros(4)], 5
+        )
+        assert padded.shape == (3, 5)
+
+    @pytest.mark.parametrize(
+        "initialize_model_parallel",
+        [
+            pytest.param((tp, pp), id=f"tp{tp}-pp{pp}")
+            for tp, pp in itertools.product([1, 2], [1, 2])
+            if tp * pp <= Utils.world_size
+        ],
+        indirect=["initialize_model_parallel"],
+    )
+    def test_grad_buffer_offload(self, initialize_model_parallel):
+        """Test that grad buffer offload/restore correctly frees and restores GPU memory."""
+        world_size, dp, tp, pp = initialize_model_parallel
+        self.create_test_args(tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp)
+
+        model_parallel_cuda_manual_seed(123)
+
+        # Create a realistic GPTModel as used in RL training
+        transformer_config = TransformerConfig(
+            num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
+        )
+        gpt_model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+            vocab_size=256,
+            max_sequence_length=32,
+        ).cuda()
+
+        ddp_config = DistributedDataParallelConfig(
+            grad_reduce_in_fp32=True,
+            use_distributed_optimizer=True,
+            overlap_grad_reduce=False,
+            bucket_size=None,  # Single bucket for simplicity
+        )
+
+        ddp_model = DistributedDataParallel(
+            transformer_config, ddp_config=ddp_config, module=gpt_model
+        )
+
+        all_buffers = ddp_model.buffers + ddp_model.expert_parallel_buffers
+
+        # Verify initial storage is allocated
+        initial_sizes = [buf.grad_data.storage().size() for buf in all_buffers]
+        assert all(size > 0 for size in initial_sizes), "Expected non-zero initial storage"
+
+        # Offload grad buffers to CPU
+        ddp_model.offload_grad_buffers()
+
+        # Verify storage is released
+        for buf in all_buffers:
+            assert buf.grad_data.storage().size() == 0, "Expected zero storage after offload"
+
+        # Restore grad buffers to GPU
+        ddp_model.restore_grad_buffers()
+
+        # Verify storage is restored
+        restored_sizes = [buf.grad_data.storage().size() for buf in all_buffers]
+        assert (
+            initial_sizes == restored_sizes
+        ), f"Expected restored sizes {restored_sizes} to match initial {initial_sizes}"
+
+    @pytest.mark.parametrize(
+        "initialize_model_parallel",
+        [
+            pytest.param((tp, pp), id=f"tp{tp}-pp{pp}")
+            for tp, pp in itertools.product([1, 2], [1, 2])
+            if tp * pp <= Utils.world_size
+        ],
+        indirect=["initialize_model_parallel"],
+    )
+    def test_optimizer_offload(self, initialize_model_parallel):
+        """Test that optimizer offload_to_cpu/restore_from_cpu correctly moves state to/from CPU."""
+        world_size, dp, tp, pp = initialize_model_parallel
+        self.create_test_args(tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp)
+        model_parallel_cuda_manual_seed(123)
+
+        # Create a realistic GPTModel as used in RL training
+        transformer_config = TransformerConfig(
+            num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True
+        )
+        gpt_model = GPTModel(
+            config=transformer_config,
+            transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+            vocab_size=256,
+            max_sequence_length=32,
+        ).cuda()
+
+        ddp_config = DistributedDataParallelConfig(
+            grad_reduce_in_fp32=True,
+            use_distributed_optimizer=True,
+            overlap_grad_reduce=False,
+            bucket_size=None,  # Single bucket for simplicity
+        )
+
+        ddp_model = DistributedDataParallel(
+            transformer_config, ddp_config=ddp_config, module=gpt_model
+        )
+
+        # Create optimizer
+        optimizer_config = OptimizerConfig(
+            optimizer='adam', bf16=True, use_distributed_optimizer=True
+        )
+        optimizer = get_megatron_optimizer(optimizer_config, [ddp_model])
+
+        # Manually initialize optimizer state (simulating what happens after first step)
+        # This avoids needing to run a full forward/backward/step cycle
+        for opt in optimizer.chained_optimizers:
+            if hasattr(opt, 'optimizer') and opt.optimizer is not None:
+                for group in opt.optimizer.param_groups:
+                    for p in group['params']:
+                        if len(opt.optimizer.state[p]) == 0:
+                            # Initialize Adam state (exp_avg and exp_avg_sq) on GPU
+                            opt.optimizer.state[p]['exp_avg'] = torch.rand_like(p.data)
+                            opt.optimizer.state[p]['exp_avg_sq'] = torch.rand_like(p.data)
+                            opt.optimizer.state[p]['step'] = torch.tensor(1)
+
+        # Helper to check if optimizer state tensors are on GPU or CPU
+        def get_optimizer_state_devices():
+            devices = set()
+            for opt in optimizer.chained_optimizers:
+                if hasattr(opt, 'optimizer') and opt.optimizer is not None:
+                    for state_dict in opt.optimizer.state.values():
+                        for v in state_dict.values():
+                            if isinstance(v, torch.Tensor):
+                                devices.add(str(v.device))
+            return devices
+
+        # Verify optimizer state is initially on GPU
+        initial_devices = get_optimizer_state_devices()
+        assert any(
+            'cuda' in d for d in initial_devices
+        ), f"Expected optimizer state on GPU initially, got devices: {initial_devices}"
+
+        # Record GPU memory before offload
+        torch.cuda.synchronize()
+        memory_before_offload = torch.cuda.memory_allocated()
+
+        # Offload optimizer state to CPU
+        optimizer.offload_to_cpu()
+
+        # Verify GPU memory decreased (optimizer state should be freed)
+        torch.cuda.synchronize()
+        memory_after_offload = torch.cuda.memory_allocated()
+        assert memory_after_offload < memory_before_offload, (
+            f"Expected GPU memory to decrease after offload. "
+            f"Before: {memory_before_offload}, After: {memory_after_offload}"
+        )
+
+        # Verify optimizer state is now on CPU
+        offloaded_devices = get_optimizer_state_devices()
+        assert all(
+            'cpu' in d for d in offloaded_devices
+        ), f"Expected all optimizer state on CPU after offload, got devices: {offloaded_devices}"
+
+        # Restore optimizer state to GPU
+        optimizer.restore_from_cpu()
+
+        # Verify optimizer state is back on GPU
+        restored_devices = get_optimizer_state_devices()
+        assert any(
+            'cuda' in d for d in restored_devices
+        ), f"Expected optimizer state on GPU after restore, got devices: {restored_devices}"
+
+        # Verify GPU memory increased after restore (optimizer state reallocated)
+        torch.cuda.synchronize()
+        memory_after_restore = torch.cuda.memory_allocated()
+        assert memory_after_restore > memory_after_offload, (
+            f"Expected GPU memory to increase after restore. "
+            f"After offload: {memory_after_offload}, After restore: {memory_after_restore}"
+        )
diff --git a/tests/unit_tests/rl/test_sequence_packing_utils.py b/tests/unit_tests/rl/test_sequence_packing_utils.py
new file mode 100644
index 00000000000..44a3de762f0
--- /dev/null
+++ b/tests/unit_tests/rl/test_sequence_packing_utils.py
@@ -0,0 +1,465 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from megatron.rl import rl_utils, sequence_packing_utils
+from megatron.training import arguments, global_vars
+
+
+class MockTokenizer:
+    def __init__(self):
+        self.pad = 42
+        self.eod = 43
+        self.vocab_size = 754
+        self.bos = None
+
+    def detokenize(self, tokens):
+        return [str(tok) for tok in tokens]
+
+
+def test_get_actual_sequence_lengths():
+    pad_token = 42
+
+    sequences = torch.tensor(
+        [
+            [1, 2, 3, pad_token, pad_token],
+            [4, 5, 6, 7, 8],
+            [9, pad_token, pad_token, pad_token, pad_token],
+            [pad_token, pad_token, pad_token, pad_token, pad_token],
+        ]
+    )
+
+    lengths = sequence_packing_utils.get_actual_sequence_lengths(sequences, pad_token)
+
+    assert lengths == [3, 5, 1, 0]
+
+
+def test_get_actual_sequence_lengths_with_interior_padding():
+    pad_token = 42
+
+    sequences = torch.tensor(
+        [[1, pad_token, 3, pad_token, pad_token], [pad_token, 2, 3, 4, pad_token]]
+    )
+
+    lengths = sequence_packing_utils.get_actual_sequence_lengths(sequences, pad_token)
+
+    assert lengths == [3, 4]
+
+
+def test_get_actual_sequence_lengths_invalid_shape():
+    pad_token = 42
+    sequences_1d = torch.tensor([1, 2, 3])
+
+    try:
+        sequence_packing_utils.get_actual_sequence_lengths(sequences_1d, pad_token)
+        assert False, "Should have raised ValueError"
+    except ValueError as e:
+        assert "Expected 2D tensor" in str(e)
+
+
+def test_sequence_packing_basic():
+    args = arguments.parse_args(ignore_unknown_args=True)
+    setattr(args, 'seq_length', 16)
+    global_vars.set_args(args)
+
+    tokenizer = MockTokenizer()
+    bin_size = 16
+    packer = sequence_packing_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
+
+    max_len = 5
+    sequences = [
+        torch.cat(
+            [
+                torch.tensor([1, 2, 3, tokenizer.eod]),
+                torch.full((1,), tokenizer.pad, dtype=torch.long),
+            ]
+        ),
+        torch.cat(
+            [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)]
+        ),
+        torch.tensor([6, 7, 8, 9, tokenizer.eod]),
+        torch.cat(
+            [torch.tensor([10, tokenizer.eod]), torch.full((3,), tokenizer.pad, dtype=torch.long)]
+        ),
+    ]
+
+    generation_masks = torch.tensor(
+        [
+            [False, True, True, True, False],
+            [False, True, True, False, False],
+            [False, True, True, True, True],
+            [False, True, False, False, False],
+        ]
+    )
+
+    rewards = torch.tensor([1.0, 2.0, 3.0, 4.0])
+
+    sequences_tensor = torch.stack(sequences)
+    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
+        packer.pack_sequences(sequences_tensor, generation_masks)
+    )
+
+    assert packed_trajs is not None
+    assert packed_position_ids is not None
+    assert packed_attention_mask is not None
+    assert packed_loss_mask is not None
+    assert packing_info is not None
+
+    assert packed_trajs.shape[0] >= 1
+    assert packed_trajs.shape[1] == bin_size
+
+    for bin_idx in range(packed_trajs.shape[0]):
+        for i in range(packed_trajs.shape[1]):
+            if i == 0 or packed_trajs[bin_idx, i - 1] == tokenizer.eod:
+                if packed_trajs[bin_idx, i] != tokenizer.pad:
+                    assert packed_position_ids[bin_idx, i] == 0
+
+
+def test_sequence_packing_with_generation_masks():
+    args = arguments.parse_args(ignore_unknown_args=True)
+    setattr(args, 'seq_length', 20)
+    global_vars.set_args(args)
+
+    tokenizer = MockTokenizer()
+    bin_size = 20
+    packer = sequence_packing_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
+
+    sequences = [torch.tensor([1, 2, 3, tokenizer.eod]), torch.tensor([4, 5, 6, 7, tokenizer.eod])]
+
+    max_len = max(len(s) for s in sequences)
+    padded_sequences = []
+    for seq in sequences:
+        padded = torch.cat([seq, torch.full((max_len - len(seq),), tokenizer.pad, dtype=seq.dtype)])
+        padded_sequences.append(padded)
+
+    generation_masks = torch.tensor(
+        [[False, True, True, True, False], [False, True, True, True, True]]
+    )
+
+    padded_sequences_tensor = torch.stack(padded_sequences)
+    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
+        packer.pack_sequences(padded_sequences_tensor, generation_masks)
+    )
+
+    assert packed_trajs.shape[0] == 1
+    assert packed_trajs.shape[1] == bin_size
+
+
+def test_sequence_packing_empty_bins():
+    args = arguments.parse_args(ignore_unknown_args=True)
+    setattr(args, 'seq_length', 8)
+    global_vars.set_args(args)
+
+    tokenizer = MockTokenizer()
+    bin_size = 8
+    num_empty_bins = 3
+
+    packed_trajs = torch.tensor(
+        [[1, 2, 3, tokenizer.eod, tokenizer.pad, tokenizer.pad, tokenizer.pad, tokenizer.pad]]
+    )
+    packed_position_ids = torch.tensor([[0, 1, 2, 3, 0, 0, 0, 0]])
+    packed_loss_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float)
+    packed_attention_mask = torch.ones(1, bin_size, bin_size)
+
+    empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info = (
+        sequence_packing_utils.create_empty_bins(
+            num_empty_bins=num_empty_bins,
+            bin_size=bin_size,
+            packed_trajs=packed_trajs,
+            packed_position_ids=packed_position_ids,
+            packed_loss_mask=packed_loss_mask,
+            packed_attention_mask=packed_attention_mask,
+            tokenizer=tokenizer,
+        )
+    )
+
+    assert empty_trajs.shape[0] == num_empty_bins
+    assert empty_trajs.shape[1] == bin_size
+
+    for i in range(num_empty_bins):
+        assert torch.all(empty_trajs[i] == tokenizer.pad)
+        assert torch.all(empty_position_ids[i] == 0)
+        assert torch.all(empty_loss_mask[i] == 0)
+
+    assert len(empty_packing_info) == num_empty_bins
+    for info in empty_packing_info:
+        assert len(info['bin_seq_indices']) == 0
+        assert len(info['seq_starts']) == 0
+
+
+def test_sequence_packing_integration():
+    args = arguments.parse_args(ignore_unknown_args=True)
+    setattr(args, 'seq_length', 16)
+    global_vars.set_args(args)
+
+    tokenizer = MockTokenizer()
+    bin_size = 16
+
+    packer = sequence_packing_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
+
+    max_len = 5
+    sequences = [
+        torch.cat(
+            [
+                torch.tensor([1, 2, 3, tokenizer.eod]),
+                torch.full((1,), tokenizer.pad, dtype=torch.long),
+            ]
+        ),
+        torch.cat(
+            [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)]
+        ),
+        torch.tensor([6, 7, 8, 9, tokenizer.eod]),
+    ]
+    generation_masks = [
+        torch.tensor([False, True, True, True, False]),
+        torch.tensor([False, True, True, False, False]),
+        torch.tensor([False, True, True, True, True]),
+    ]
+
+    sequences_tensor = torch.stack(sequences)
+    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
+        packer.pack_sequences(sequences_tensor, generation_masks)
+    )
+
+    assert packed_trajs is not None
+    assert packed_trajs.shape[1] == bin_size
+    assert packed_position_ids.shape == packed_trajs.shape
+    assert packed_loss_mask.shape == packed_trajs.shape
+
+    assert packed_trajs.shape[0] == 1
+
+    expected_start = torch.tensor(
+        [6, 7, 8, 9, tokenizer.eod, 1, 2, 3, tokenizer.eod, 4, 5, tokenizer.eod]
+    )
+    assert torch.all(packed_trajs[0, :12] == expected_start)
+
+    assert torch.all(packed_trajs[0, 12:] == tokenizer.pad)
+
+
+class MockGroupStats:
+    """Mock group stats object for testing."""
+
+    def __init__(self):
+        self.min_piold_to_inf_prob = None
+        self.max_piold_to_inf_prob = None
+        self.mean_piold_to_inf_prob = None
+        self.min_inf_train_prob_abs_diff = None
+        self.max_inf_train_prob_abs_diff = None
+        self.mean_inf_train_prob_abs_diff = None
+        self.min_inf_prob = None
+        self.max_inf_prob = None
+        self.mean_inf_prob = None
+
+
+def test_update_inference_logprobs_group_stats():
+    """Test the common statistics computation helper function."""
+    # Create matching logprobs (should give ratio ~1.0)
+    old_logprobs = torch.tensor([[-0.5, -0.3, -0.2, 0.0]])
+    inference_logprobs = torch.tensor([[-0.5, -0.3, -0.2, 0.0]])
+    mask = torch.tensor([[True, True, True, False]])
+
+    group_stats = MockGroupStats()
+
+    rl_utils.update_inference_logprobs_group_stats(
+        old_logprobs=old_logprobs,
+        inference_logprobs=inference_logprobs,
+        mask=mask,
+        group_stats=group_stats,
+    )
+
+    # When logprobs match exactly, ratio should be 1.0 and diff should be 0.0
+    assert abs(group_stats.mean_piold_to_inf_prob - 1.0) < 1e-6
+    assert abs(group_stats.mean_inf_train_prob_abs_diff) < 1e-6
+
+
+def test_update_inference_logprobs_group_stats_empty_mask():
+    """Test statistics computation with empty mask."""
+    old_logprobs = torch.tensor([[-0.5, -0.3]])
+    inference_logprobs = torch.tensor([[-0.5, -0.3]])
+    mask = torch.tensor([[False, False]])  # Empty mask
+
+    group_stats = MockGroupStats()
+
+    rl_utils.update_inference_logprobs_group_stats(
+        old_logprobs=old_logprobs,
+        inference_logprobs=inference_logprobs,
+        mask=mask,
+        group_stats=group_stats,
+    )
+
+    # With empty mask, stats should remain None
+    assert group_stats.mean_piold_to_inf_prob is None
+
+
+def test_update_inference_logprobs_group_stats_with_mismatch():
+    """Test statistics when inference and old logprobs differ."""
+    # Old logprobs
+    old_logprobs = torch.tensor([[-0.5, -0.5, -0.5]])
+    # Inference logprobs with different values
+    inference_logprobs = torch.tensor([[-1.0, -1.0, -1.0]])
+    mask = torch.tensor([[True, True, True]])
+
+    group_stats = MockGroupStats()
+
+    rl_utils.update_inference_logprobs_group_stats(
+        old_logprobs=old_logprobs,
+        inference_logprobs=inference_logprobs,
+        mask=mask,
+        group_stats=group_stats,
+    )
+
+    # With different logprobs, ratio should not be 1.0
+    # exp(-0.5) / exp(-1.0) = exp(0.5) ≈ 1.65
+    assert group_stats.mean_piold_to_inf_prob > 1.0
+
+    # Abs diff should be non-zero
+    assert group_stats.mean_inf_train_prob_abs_diff > 0.0
+
+
+def test_compute_packed_inference_logprobs_stats():
+    """Test compute_packed_inference_logprobs_stats with packed data."""
+    # Create packed data (simulating 2 bins)
+    # old_logprobs shape: [num_bins, seq_len-1]
+    old_logprobs = torch.tensor(
+        [
+            [-0.5, -0.3, -0.2, 0.0, 0.0, 0.0, 0.0],  # bin 0
+            [-0.4, -0.6, -0.1, 0.0, 0.0, 0.0, 0.0],  # bin 1
+        ]
+    )
+
+    # packed_inference_logprobs with same values (should give ratio ~1.0)
+    packed_inference_logprobs = torch.tensor(
+        [
+            [-0.5, -0.3, -0.2, 0.0, 0.0, 0.0, 0.0],  # bin 0
+            [-0.4, -0.6, -0.1, 0.0, 0.0, 0.0, 0.0],  # bin 1
+        ]
+    )
+
+    # packed_loss_mask: [num_bins, seq_len] - indicates valid positions
+    # Note: function shifts by 1, so packed_loss_mask[:, 1:] is used
+    packed_loss_mask = torch.tensor(
+        [
+            [0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],  # bin 0: 3 valid tokens
+            [0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],  # bin 1: 3 valid tokens
+        ]
+    )
+
+    group_stats = MockGroupStats()
+
+    sequence_packing_utils.compute_packed_inference_logprobs_stats(
+        old_logprobs=old_logprobs,
+        packed_inference_logprobs=packed_inference_logprobs,
+        packed_loss_mask=packed_loss_mask,
+        group_stats=group_stats,
+    )
+
+    # Verify statistics were computed
+    assert group_stats.min_piold_to_inf_prob is not None
+    assert group_stats.max_piold_to_inf_prob is not None
+    assert group_stats.mean_piold_to_inf_prob is not None
+
+    # When logprobs match exactly, ratio should be 1.0
+    assert abs(group_stats.mean_piold_to_inf_prob - 1.0) < 1e-6
+    assert abs(group_stats.mean_inf_train_prob_abs_diff) < 1e-6
+
+
+def test_compute_packed_inference_logprobs_stats_with_mismatch():
+    """Test compute_packed_inference_logprobs_stats when values differ."""
+    # old_logprobs
+    old_logprobs = torch.tensor([[-0.5, -0.5, -0.5, 0.0, 0.0, 0.0, 0.0]])
+
+    # Different inference logprobs
+    packed_inference_logprobs = torch.tensor([[-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0]])
+
+    # packed_loss_mask
+    packed_loss_mask = torch.tensor([[0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]])
+
+    group_stats = MockGroupStats()
+
+    sequence_packing_utils.compute_packed_inference_logprobs_stats(
+        old_logprobs=old_logprobs,
+        packed_inference_logprobs=packed_inference_logprobs,
+        packed_loss_mask=packed_loss_mask,
+        group_stats=group_stats,
+    )
+
+    # With different logprobs, ratio should not be 1.0
+    assert group_stats.mean_piold_to_inf_prob > 1.0
+    assert group_stats.mean_inf_train_prob_abs_diff > 0.0
+
+
+def test_compute_packed_inference_logprobs_stats_shape_mismatch():
+    """Test that function handles shape mismatch gracefully."""
+    # Mismatched shapes
+    old_logprobs = torch.tensor([[-0.5, -0.3, -0.2]])  # 3 elements
+    packed_inference_logprobs = torch.tensor([[-0.5, -0.3, -0.2]])
+    packed_loss_mask = torch.tensor([[0.0, 1.0, 1.0, 1.0, 1.0, 1.0]])  # 6 elements -> 5 after shift
+
+    group_stats = MockGroupStats()
+
+    # Should not raise, but stats should remain None due to shape mismatch
+    sequence_packing_utils.compute_packed_inference_logprobs_stats(
+        old_logprobs=old_logprobs,
+        packed_inference_logprobs=packed_inference_logprobs,
+        packed_loss_mask=packed_loss_mask,
+        group_stats=group_stats,
+    )
+
+    # Stats should remain None due to shape mismatch
+    assert group_stats.mean_piold_to_inf_prob is None
+
+
+@pytest.mark.parametrize(
+    "ratio,local_bins,world,expected_bs",
+    [
+        (1.0, 1, 8, 8),  # no stale data (ratio 1.), everything divides perfectly.
+        (1.0, 42, 8, 42 * 8),  # no stale data (ratio 1.), everything divides perfectly, more bins
+        (
+            0.5,
+            1,
+            8,
+            8,
+        ),  # 0.5 means we use half of all seqs per step, they all fit 1 bin -> we should reuse
+        (1 / 3, 4, 8, 16),  # third of the data per step, nonint division
+    ],
+)
+def test_get_bins_bs_and_steps(ratio, local_bins, world, expected_bs):
+    # Make a dummy struct to check only the required fields.
+    # Divide by ratio to make sure the samples are divisible by global_bs in the test.
+    n_seqs = int(world * 7 / ratio)
+    global_bs_in_seq = int(n_seqs * ratio)
+
+    def side_eff(
+        rank,
+        rampup_batch_size,
+        global_batch_size,
+        micro_batch_size,
+        data_parallel_size,
+        decrease_batch_size_if_needed,
+    ):
+        # Inside of the get_microbatch_dataloader, we compute the batch size in bins.
+        # We want to test this variable.
+        global actual_bs
+        actual_bs = global_batch_size
+
+    with patch('megatron.rl.sequence_packing_utils.get_num_microbatches', return_value=1):
+        with patch(
+            'megatron.rl.sequence_packing_utils.reconfigure_num_microbatches_calculator',
+            side_effect=side_eff,
+        ):
+            with patch('megatron.core.mpu.get_data_parallel_world_size', return_value=world):
+                sequence_packing_utils.update_microbatch_calculator(
+                    samples_ratio_per_step=ratio,
+                    num_bins_this_rank=local_bins,
+                    bin_seq_indices=[],
+                    global_batch_size=global_bs_in_seq,
+                    rampup_batch_size=1,
+                    micro_batch_size=1,
+                    decrease_batch_size_if_needed=False,
+                )
+
+    # Iterator is local, batch size is global
+    assert expected_bs == actual_bs
diff --git a/tests/unit_tests/run_ci_test.sh b/tests/unit_tests/run_ci_test.sh
index 81dd3ae2a14..c65f197d2db 100755
--- a/tests/unit_tests/run_ci_test.sh
+++ b/tests/unit_tests/run_ci_test.sh
@@ -117,7 +117,7 @@ export BUCKET
 IGNORE_ARGS=()
 while IFS= read -r line; do
     [[ -n "$line" ]] && IGNORE_ARGS+=("$line")
-done < <(python tests/unit_tests/find_test_cases.py "$BUCKET")
+done < <(python tests/unit_tests/find_test_cases.py "$BUCKET" "h100")
 
 echo "------ARGUMENTS for SLURM ---"
 MASTER_ADDR=${MASTER_ADDR:-localhost}
diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py
new file mode 100644
index 00000000000..1ccc70a2327
--- /dev/null
+++ b/tests/unit_tests/ssm/test_gated_delta_net.py
@@ -0,0 +1,329 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from unittest import mock
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from megatron.core import parallel_state
+from megatron.core.models.common.embeddings.rope_utils import (
+    get_pos_emb_on_this_cp_rank as get_tensor_on_this_cp_rank,
+)
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    get_experimental_attention_variant_module_spec,
+    get_transformer_block_with_experimental_attention_variant_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.ssm.gated_delta_net import GatedDeltaNet
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.training.arguments import parse_args
+from megatron.training.checkpointing import load_checkpoint, save_checkpoint
+from megatron.training.global_vars import set_args
+from megatron.training.training import get_model
+from megatron.training.utils import unwrap_model
+from tests.unit_tests.dist_checkpointing import (
+    TempNamedDir,
+    init_basic_mock_args,
+    init_checkpointing_mock_args,
+)
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    import fla
+
+    HAVE_FLA = True
+except ImportError:
+    HAVE_FLA = False
+
+
+@pytest.mark.parametrize(
+    ("tp_size", "sp", "cp_size"),
+    [
+        (1, False, 1),
+        (2, False, 1),
+        (2, True, 1),
+        # GDN does not support CP for now. Leave it for future work.
+    ],
+)
+@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.")
+@pytest.mark.internal
+class TestGatedDeltaNet:
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self, tp_size, sp, cp_size):
+        # Initialize parallel and random seed
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size,
+            pipeline_model_parallel_size=1,
+            context_parallel_size=cp_size,
+        )
+        model_parallel_cuda_manual_seed(123)
+        self.tp_size = tp_size
+        self.cp_size = cp_size
+        self.sp_size = tp_size if sp else 1
+
+        # Get TP and CP process groups from device mesh
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        cp_group = parallel_state.get_context_parallel_group()
+        pg_collection = ProcessGroupCollection(tp=tp_group, cp=cp_group)
+
+        # Initialize model
+        self.transformer_config = TransformerConfig(
+            hidden_size=256,
+            linear_conv_kernel_dim=2,
+            linear_key_head_dim=64,
+            linear_value_head_dim=64,
+            linear_num_key_heads=4,
+            linear_num_value_heads=8,
+            num_layers=1,
+            normalization="RMSNorm",
+            use_cpu_initialization=True,
+            layernorm_zero_centered_gamma=True,
+            num_attention_heads=8,
+            activation_func=F.silu,
+            bf16=True,
+            tensor_model_parallel_size=tp_size,
+            sequence_parallel=sp,
+            context_parallel_size=cp_size,
+            experimental_attention_variant="gated_delta_net",
+            linear_attention_freq=[1],
+            transformer_impl="transformer_engine",
+        )
+        gdn_submodules = get_experimental_attention_variant_module_spec(
+            config=self.transformer_config
+        ).submodules
+
+        self.gdn = GatedDeltaNet(
+            self.transformer_config,
+            submodules=gdn_submodules,
+            layer_number=1,
+            bias=False,
+            conv_bias=False,
+            conv_init=1.0,
+            use_qk_l2norm=True,
+            A_init_range=(1, 16),
+            pg_collection=pg_collection,
+        )
+        self.gdn = self.gdn.cuda().bfloat16()
+
+    def teardown_method(self):
+        Utils.destroy_model_parallel()
+
+    def test_gpu_forward(self):
+        gdn = self.gdn
+
+        micro_batch_size = 2
+        seq_length = 64
+        hidden_states = torch.ones(
+            (seq_length // self.sp_size // self.cp_size, micro_batch_size, gdn.config.hidden_size),
+            device=torch.cuda.current_device(),
+            dtype=torch.bfloat16,
+        )
+        attention_mask = None
+
+        output, bias = gdn(hidden_states, attention_mask)
+
+        assert output.dim() == 3, f"Output too many dimensions ({output.shape=})"
+        assert output.shape[0] == seq_length // self.sp_size // self.cp_size, (
+            f"Output shape {output.shape[0]=} mismatch with "
+            f" {seq_length=} // {self.sp_size=} // {self.cp_size=}."
+        )
+        assert (
+            output.shape[1] == micro_batch_size
+        ), f"Output shape {output.shape[1]=} mismatch with {micro_batch_size=}"
+        assert (
+            output.shape[2] == gdn.config.hidden_size
+        ), f"Output shape {output.shape[2]=} mismatch with {gdn.config.hidden_size=}"
+        assert (
+            output.dtype == hidden_states.dtype
+        ), f"Output dtype {output.dtype=} mismatch with {hidden_states.dtype=}"
+
+
+@pytest.mark.parametrize(
+    ("tp", "sp", "cp"),
+    [
+        (4, False, 1),  # TP w/o SP
+        (4, True, 1),  # TP w/ SP
+        # CP does not support GDN for now. Add it once it is supported.
+    ],
+)
+@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.")
+def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp):
+    # Constants
+    seed = 123
+    sequence_length = 256
+    micro_batch_size = 4
+    hidden_size = 128
+
+    # Model initialization function
+    def initialize_gpt_model(
+        config, pre_process=True, post_process=True, vp_stage=None, pg_collection=None
+    ):
+        layer_spec = get_transformer_block_with_experimental_attention_variant_spec(
+            config=config, vp_stage=None, pp_rank=None
+        )
+        gpt_model = GPTModel(
+            config=config,
+            transformer_layer_spec=layer_spec,
+            vocab_size=128,
+            max_sequence_length=sequence_length,
+            pre_process=pre_process,
+            post_process=post_process,
+            vp_stage=vp_stage,
+            pg_collection=pg_collection,
+        )
+        return gpt_model
+
+    # Initialize baseline parallel state
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1
+    )
+
+    # Initialize input hidden states
+    torch.manual_seed(seed)
+    model_parallel_cuda_manual_seed(seed)
+    input_hidden_states = (
+        torch.rand((sequence_length, micro_batch_size, hidden_size))
+        .cuda()
+        .bfloat16()
+        .requires_grad_(True)
+    )
+
+    # Initialize transformer config
+    transformer_config = TransformerConfig(
+        hidden_size=128,
+        linear_conv_kernel_dim=2,
+        linear_key_head_dim=32,
+        linear_value_head_dim=32,
+        linear_num_key_heads=4,
+        linear_num_value_heads=8,
+        num_layers=1,
+        normalization="RMSNorm",
+        use_cpu_initialization=True,
+        layernorm_zero_centered_gamma=True,
+        num_attention_heads=8,
+        activation_func=F.silu,
+        bf16=True,
+        experimental_attention_variant="gated_delta_net",
+        linear_attention_freq=[1],
+        transformer_impl="transformer_engine",
+    )
+
+    with TempNamedDir(tmp_path_dist_ckpt / 'test_parallel_gdn', sync=True) as ckpt_dir:
+        # Set argument
+        mock_args = parse_args(ignore_unknown_args=True)
+        set_args(mock_args)
+
+        # Initialize baseline model
+        init_basic_mock_args(mock_args, 1, 1, bf16=True)
+        mock_args.context_parallel_size = 1
+        mock_args.sequence_parallel = 1
+        gpt_model = unwrap_model(get_model(initialize_gpt_model, config=transformer_config))
+
+        # Initialize args and save checkpoint
+        init_checkpointing_mock_args(mock_args, ckpt_dir, False)
+        mock_args.no_save_optim = True
+        mock_args.no_save_rng = True
+        mock_args.no_load_optim = True
+        mock_args.no_load_rng = True
+        save_checkpoint(10, gpt_model, None, None, 0)
+
+        # Calculate baseline output
+        attention = gpt_model[0].decoder.layers[0].self_attention
+        output_hidden_states_baseline, bias_hidden_states_baseline = attention(
+            input_hidden_states, attention_mask=None
+        )
+        output_hidden_states_baseline.sum().backward()
+
+        # Save baseline output
+        input_grad_baseline = input_hidden_states.grad.detach()
+        output_hidden_states_baseline = output_hidden_states_baseline.detach()
+
+        # Initialize parallel model
+        Utils.destroy_model_parallel()
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp, pipeline_model_parallel_size=1, context_parallel_size=cp
+        )
+        torch.manual_seed(seed)
+        model_parallel_cuda_manual_seed(seed)
+        transformer_config.context_parallel_size = cp
+        transformer_config.tensor_model_parallel_size = tp
+        transformer_config.sequence_parallel = sp
+        init_basic_mock_args(mock_args, tp, 1, bf16=True)
+        mock_args.context_parallel_size = cp
+        mock_args.sequence_parallel = sp
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        pg_collection.embd = parallel_state.get_embedding_group()
+        gpt_model = unwrap_model(
+            get_model(initialize_gpt_model, config=transformer_config, pg_collection=pg_collection)
+        )
+        with mock.patch('megatron.training.checkpointing.check_checkpoint_args'):
+            with mock.patch('megatron.training.checkpointing.update_num_microbatches'):
+                load_checkpoint(gpt_model, None, None)
+
+        # Function to get tensor on this tp and cp rank
+        cp_group = parallel_state.get_context_parallel_group()
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+
+        def get_tensor_on_this_rank(tensor):
+            if cp > 1:
+                tensor = get_tensor_on_this_cp_rank(tensor, 0, cp_group)
+            if tp > 1 and sp:
+                sp_seg = sequence_length // tp // cp
+                tensor = tensor[tp_rank * sp_seg : (tp_rank + 1) * sp_seg]
+            return tensor
+
+        # Calculate parallel model output
+        input_hidden_states = get_tensor_on_this_rank(input_hidden_states)
+        input_hidden_states = input_hidden_states.detach().requires_grad_(True)
+        parallel_attention = gpt_model[0].decoder.layers[0].self_attention
+        output_hidden_states_parallel, bias_hidden_states_parallel = parallel_attention(
+            input_hidden_states, attention_mask=None
+        )
+        output_hidden_states_parallel.sum().backward()
+        input_grad_parallel = input_hidden_states.grad.detach()
+
+        # Check if the output is the same
+        if cp:
+            atol, rtol = 5e-3, 5e-3
+        else:
+            atol, rtol = 5e-4, 5e-4
+        output_hidden_states_baseline = get_tensor_on_this_rank(output_hidden_states_baseline)
+        input_grad_baseline = get_tensor_on_this_rank(input_grad_baseline)
+
+        assert torch.all(
+            ~torch.isnan(output_hidden_states_baseline)
+        ), "output_hidden_states_baseline contains nan"
+        assert torch.all(
+            ~torch.isinf(output_hidden_states_baseline)
+        ), "output_hidden_states_baseline contains inf"
+        assert torch.all(~torch.isnan(input_grad_baseline)), "input_grad_baseline contains nan"
+        assert torch.all(~torch.isinf(input_grad_baseline)), "input_grad_baseline contains inf"
+        assert torch.all(
+            ~torch.isnan(output_hidden_states_parallel)
+        ), "output_hidden_states_parallel contains nan"
+        assert torch.all(
+            ~torch.isinf(output_hidden_states_parallel)
+        ), "output_hidden_states_parallel contains inf"
+        assert torch.all(~torch.isnan(input_grad_parallel)), "input_grad_parallel contains nan"
+        assert torch.all(~torch.isinf(input_grad_parallel)), "input_grad_parallel contains inf"
+
+        torch.testing.assert_close(
+            output_hidden_states_baseline,
+            output_hidden_states_parallel,
+            atol=atol,
+            rtol=rtol,
+            msg=lambda msg: f"Mismatch in output_hidden_states: {msg}",
+        )
+        torch.testing.assert_close(
+            input_grad_baseline,
+            input_grad_parallel,
+            atol=atol,
+            rtol=rtol,
+            msg=lambda msg: f"Mismatch in input_grad: {msg}",
+        )
+
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/tensor_parallel/test_random.py b/tests/unit_tests/tensor_parallel/test_random.py
index 47b607b8795..a15ad83cb90 100644
--- a/tests/unit_tests/tensor_parallel/test_random.py
+++ b/tests/unit_tests/tensor_parallel/test_random.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import pytest
 import torch
 
@@ -5,6 +7,7 @@
     CheckpointWithoutOutput,
     CudaRNGStatesTracker,
     checkpoint,
+    convert_cuda_rng_state,
     get_cuda_rng_tracker,
     model_parallel_cuda_manual_seed,
 )
@@ -33,6 +36,148 @@ def test_cuda_rng_states_tracker():
     assert torch.equal(rng_tracker.get_states()['state2'], rng_state)
 
 
+@pytest.mark.parametrize("use_cudagraphable_rng", [True, False])
+def test_double_fork_cuda_rng_states_tracker(use_cudagraphable_rng):
+    rng_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=use_cudagraphable_rng)
+    rng_tracker.add("state1", 1234)
+    rng_tracker.add("state2", 5678)
+    randn_double_fork_1 = []
+    randn_double_fork_2 = []
+    with rng_tracker.fork("state1"):
+        randn_double_fork_1.append(torch.randn(10, device="cuda"))
+        with rng_tracker.fork("state2"):
+            randn_double_fork_2.append(torch.randn(10, device="cuda"))
+            with rng_tracker.fork("state1"):
+                randn_double_fork_1.append(torch.randn(10, device="cuda"))
+            randn_double_fork_2.append(torch.randn(10, device="cuda"))
+        randn_double_fork_1.append(torch.randn(10, device="cuda"))
+    if use_cudagraphable_rng:
+        double_fork_state1 = rng_tracker.get_states()["state1"].get_state()
+        double_fork_state2 = rng_tracker.get_states()["state2"].get_state()
+    else:
+        double_fork_state1 = rng_tracker.get_states()["state1"]
+        double_fork_state2 = rng_tracker.get_states()["state2"]
+
+    rng_tracker.reset()
+    rng_tracker.add("state1", 1234)
+    rng_tracker.add("state2", 5678)
+    randn_single_fork_1 = []
+    randn_single_fork_2 = []
+    with rng_tracker.fork("state1"):
+        randn_single_fork_1.append(torch.randn(10, device="cuda"))
+        randn_single_fork_1.append(torch.randn(10, device="cuda"))
+        randn_single_fork_1.append(torch.randn(10, device="cuda"))
+    with rng_tracker.fork("state2"):
+        randn_single_fork_2.append(torch.randn(10, device="cuda"))
+        randn_single_fork_2.append(torch.randn(10, device="cuda"))
+    if use_cudagraphable_rng:
+        single_fork_state1 = rng_tracker.get_states()["state1"].get_state()
+        single_fork_state2 = rng_tracker.get_states()["state2"].get_state()
+    else:
+        single_fork_state1 = rng_tracker.get_states()["state1"]
+        single_fork_state2 = rng_tracker.get_states()["state2"]
+
+    assert torch.equal(randn_double_fork_1[0], randn_single_fork_1[0])
+    assert torch.equal(randn_double_fork_1[1], randn_single_fork_1[1])
+    assert torch.equal(randn_double_fork_1[2], randn_single_fork_1[2])
+    assert torch.equal(randn_double_fork_2[0], randn_single_fork_2[0])
+    assert torch.equal(randn_double_fork_2[1], randn_single_fork_2[1])
+    assert torch.equal(double_fork_state1, single_fork_state1)
+    assert torch.equal(double_fork_state2, single_fork_state2)
+
+
+def test_convert_cuda_rng_state():
+    ## Get the default rng state
+    torch.cuda.manual_seed(999)
+    randn = torch.randn(10, device="cuda")
+    rng_state = torch.cuda.get_rng_state()
+
+    try:
+        from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker
+    except ImportError:
+        TECudaRNGStatesTracker = None
+
+    ## from non-graphable RNG to graphable RNG
+    # get state from non-graphable RNG
+    tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False)
+    tracker.add("state1", 123)
+    for i in range(3):
+        with tracker.fork("state1"):
+            randn = torch.randn(10, device="cuda")
+    state = convert_cuda_rng_state(tracker.states_["state1"], to_graphable=True)
+    rand_tensors = []
+    for i in range(3):
+        with tracker.fork("state1"):
+            randn = torch.randn(10, device="cuda")
+            rand_tensors.append(randn)
+
+    # set state to local graph RNG
+    cudagraphable_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=True)
+    cudagraphable_tracker.set_states({"state1": state.clone_state()})
+    for i in range(3):
+        with cudagraphable_tracker.fork("state1"):
+            randn = torch.randn(10, device="cuda")
+            assert torch.equal(randn, rand_tensors[i])
+
+    # set state to TE RNG
+    if TECudaRNGStatesTracker is not None:
+        te_tracker = TECudaRNGStatesTracker()
+        te_tracker.set_states({"state1": state})
+        for i in range(3):
+            with te_tracker.fork("state1"):
+                randn = torch.randn(10, device="cuda")
+                assert torch.equal(randn, rand_tensors[i])
+
+    ## from graphable RNG to non-graphable RNG
+    # get state from graphable RNG
+    cudagraphable_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=True)
+    cudagraphable_tracker.add("state2", 123)
+    for i in range(3):
+        with cudagraphable_tracker.fork("state2"):
+            randn = torch.randn(10, device="cuda")
+    state = convert_cuda_rng_state(cudagraphable_tracker.states_["state2"], to_graphable=False)
+    rand_tensors = []
+    for i in range(3):
+        with cudagraphable_tracker.fork("state2"):
+            randn = torch.randn(10, device="cuda")
+            rand_tensors.append(randn)
+
+    # set state to non-graphable RNG
+    tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False)
+    tracker.set_states({"state2": state})
+    for i in range(3):
+        with tracker.fork("state2"):
+            randn = torch.randn(10, device="cuda")
+            assert torch.equal(randn, rand_tensors[i])
+
+    ## from TE RNG to non-graphable RNG
+    if TECudaRNGStatesTracker is not None:
+        # get state from TE RNG
+        cudagraphable_tracker = TECudaRNGStatesTracker()
+        cudagraphable_tracker.add("state3", 123)
+        for i in range(3):
+            with cudagraphable_tracker.fork("state3"):
+                randn = torch.randn(10, device="cuda")
+        state = convert_cuda_rng_state(cudagraphable_tracker.states_["state3"], to_graphable=False)
+        rand_tensors = []
+        for i in range(3):
+            with cudagraphable_tracker.fork("state3"):
+                randn = torch.randn(10, device="cuda")
+                rand_tensors.append(randn)
+
+        # set state to non-graphable RNG
+        tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False)
+        tracker.set_states({"state3": state})
+        for i in range(3):
+            with tracker.fork("state3"):
+                randn = torch.randn(10, device="cuda")
+                assert torch.equal(randn, rand_tensors[i])
+
+    ## After all tests, check if the default rng state is still the same.
+    rng_state_final = torch.cuda.get_rng_state()
+    assert torch.equal(rng_state, rng_state_final)
+
+
 def test_model_parallel_cuda_manual_seed():
     Utils.initialize_model_parallel(4, 2)
     model_parallel_cuda_manual_seed(0, force_reset_rng=True)
diff --git a/tests/unit_tests/test_argument_utils.py b/tests/unit_tests/test_argument_utils.py
new file mode 100644
index 00000000000..e5744c3b074
--- /dev/null
+++ b/tests/unit_tests/test_argument_utils.py
@@ -0,0 +1,643 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import signal
+from argparse import ArgumentError, ArgumentParser
+from dataclasses import dataclass, field
+from typing import Callable, Literal, Optional, Union
+
+import pytest
+
+from megatron.training.argument_utils import ArgumentGroupFactory, TypeInferenceError
+
+
+@dataclass
+class DummyConfig:
+    """A dummy configuration for testing."""
+
+    name: str = "default_name"
+    """Name of the configuration"""
+
+    count: int = 42
+    """Number of items"""
+
+    learning_rate: float = 0.001
+    """Learning rate for training"""
+
+    enabled: bool = False
+    """Whether feature is enabled"""
+
+    disabled_feature: bool = True
+    """Feature that is disabled by default"""
+
+    enum_setting: signal.Signals = signal.SIGTERM
+    """Setting with enum type to test enum handling"""
+
+
+@dataclass
+class ConfigWithOptional:
+    """Config with optional fields."""
+
+    required_field: str = "required"
+    """A required field"""
+
+    optional_field: Optional[int] = None
+    """An optional integer field"""
+
+    optional_str: Optional[str] = "default"
+    """An optional string with default"""
+
+    int_new_form: int | None = None
+    """Optional using new syntax"""
+
+    str_new_form: str | None = "default"
+    """Optional string using new syntax"""
+
+
+@dataclass
+class ConfigWithList:
+    """Config with list fields."""
+
+    tags: list[str] = field(default_factory=list)
+    """List of tags"""
+
+    numbers: list[int] = field(default_factory=lambda: [1, 2, 3])
+    """List of numbers with default"""
+
+
+@dataclass
+class ConfigWithLiteral:
+    """Config with Literal types."""
+
+    mode: Literal["train", "eval", "test"] = "train"
+    """Operating mode"""
+
+    precision: Literal[16, 32] = 32
+    """Precision level"""
+
+
+class TestArgumentGroupFactoryBasic:
+    """Test basic functionality of ArgumentGroupFactory."""
+
+    def test_creates_argument_group(self):
+        """Test that build_group creates an argument group."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        arg_group = factory.build_group(parser, title="Test Group")
+
+        assert arg_group is not None
+        assert arg_group.title == "Test Group"
+        assert arg_group.description == DummyConfig.__doc__
+
+    def test_all_fields_added(self):
+        """Test that all dataclass fields are added as arguments."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Parse empty args to get all defaults
+        args = parser.parse_args([])
+
+        # Check all fields exist
+        assert hasattr(args, 'name')
+        assert hasattr(args, 'count')
+        assert hasattr(args, 'learning_rate')
+        assert hasattr(args, 'enabled')
+        assert hasattr(args, 'disabled_feature')
+
+    def test_default_values_preserved(self):
+        """Test that default values from dataclass are preserved."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+        args = parser.parse_args([])
+
+        assert args.name == "default_name"
+        assert args.count == 42
+        assert args.learning_rate == 0.001
+        assert args.enabled == False
+        assert args.disabled_feature == True
+
+    def test_argument_types(self):
+        """Test that argument types are correctly inferred."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Parse with actual values
+        args = parser.parse_args(
+            ['--name', 'test_name', '--count', '100', '--learning-rate', '0.01']
+        )
+
+        assert isinstance(args.name, str)
+        assert args.name == 'test_name'
+        assert isinstance(args.count, int)
+        assert args.count == 100
+        assert isinstance(args.learning_rate, float)
+        assert args.learning_rate == 0.01
+
+    def test_boolean_store_true(self):
+        """Test that boolean fields with default False use store_true."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Without flag, should be False
+        args = parser.parse_args([])
+        assert args.enabled == False
+
+        # With flag, should be True
+        args = parser.parse_args(['--enabled'])
+        assert args.enabled == True
+
+    def test_boolean_store_false(self):
+        """Test that boolean fields with default True use store_false with no- prefix."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Without flag, should be True
+        args = parser.parse_args([])
+        assert args.disabled_feature == True
+
+        # With --no- flag, should be False
+        args = parser.parse_args(['--no-disabled-feature'])
+        assert args.disabled_feature == False
+
+        # With --disable- flag, should also be False
+        args = parser.parse_args(['--disable-disabled-feature'])
+        assert args.disabled_feature == False
+
+    def test_field_docstrings_as_help(self):
+        """Test that field docstrings are extracted and used as help text."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        # Check that field_docstrings were extracted
+        assert 'name' in factory.field_docstrings
+        assert factory.field_docstrings['name'] == "Name of the configuration"
+        assert factory.field_docstrings['count'] == "Number of items"
+        assert factory.field_docstrings['learning_rate'] == "Learning rate for training"
+
+    def test_enum_handling(self):
+        """Test that enum types are handled correctly."""
+        parser = ArgumentParser(exit_on_error=False)
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        factory.build_group(parser, title="Test Group")
+
+        args = parser.parse_args([])
+        assert args.enum_setting == signal.SIGTERM
+
+        # test a different valid enum value
+        args = parser.parse_args(["--enum-setting", "SIGINT"])
+        assert args.enum_setting == signal.SIGINT
+
+        # test an invalid enum value
+        with pytest.raises(KeyError, match="sigbar"):
+            parser.parse_args(["--enum-setting", "sigbar"])
+
+
+class TestArgumentGroupFactoryExclusion:
+    """Test exclusion functionality."""
+
+    def test_exclude_single_field(self):
+        """Test excluding a single field."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig, exclude=['count'])
+
+        factory.build_group(parser, title="Test Group")
+        args = parser.parse_args([])
+
+        # Excluded field should not exist
+        assert hasattr(args, 'name')
+        assert not hasattr(args, 'count')
+        assert hasattr(args, 'learning_rate')
+
+    def test_exclude_multiple_fields(self):
+        """Test excluding multiple fields."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(DummyConfig, exclude=['count', 'learning_rate'])
+
+        factory.build_group(parser, title="Test Group")
+        args = parser.parse_args([])
+
+        assert hasattr(args, 'name')
+        assert not hasattr(args, 'count')
+        assert not hasattr(args, 'learning_rate')
+        assert hasattr(args, 'enabled')
+
+
+class TestArgumentGroupFactoryOptional:
+    """Test handling of Optional types."""
+
+    def test_optional_fields(self):
+        """Test that Optional fields are handled correctly."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithOptional)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Default values
+        args = parser.parse_args([])
+        assert args.required_field == "required"
+        assert args.optional_field is None
+        assert args.optional_str == "default"
+
+        # Provided values
+        args = parser.parse_args(
+            ['--required-field', 'new_value', '--optional-field', '123', '--optional-str', 'custom']
+        )
+        assert args.required_field == "new_value"
+        assert args.optional_field == 123
+        assert args.optional_str == "custom"
+
+
+class TestArgumentGroupFactoryList:
+    """Test handling of list types."""
+
+    def test_list_fields_with_default_factory(self):
+        """Test that list fields use nargs='+'."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithList)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Default values
+        args = parser.parse_args([])
+        assert args.tags == []
+        assert args.numbers == [1, 2, 3]
+
+        # Provided values
+        args = parser.parse_args(['--tags', 'tag1', 'tag2', 'tag3', '--numbers', '10', '20', '30'])
+        assert args.tags == ['tag1', 'tag2', 'tag3']
+        assert args.numbers == [10, 20, 30]
+
+
+class TestArgumentGroupFactoryLiteral:
+    """Test handling of Literal types."""
+
+    def test_literal_fields_have_choices(self):
+        """Test that Literal types create choice constraints."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithLiteral)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Default values
+        args = parser.parse_args([])
+        assert args.mode == "train"
+        assert args.precision == 32
+
+        # Valid choices
+        args = parser.parse_args(['--mode', 'eval', '--precision', '16'])
+        assert args.mode == "eval"
+        assert args.precision == 16
+
+    def test_literal_fields_reject_invalid_choices(self):
+        """Test that invalid Literal choices are rejected."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithLiteral)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Invalid choice should raise error
+        with pytest.raises(SystemExit):
+            parser.parse_args(['--mode', 'invalid'])
+
+        with pytest.raises(SystemExit):
+            parser.parse_args(['--precision', '64'])
+
+
+class TestArgumentGroupFactoryHelpers:
+    """Test helper methods."""
+
+    def test_format_arg_name_basic(self):
+        """Test basic argument name formatting."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        assert factory._format_arg_name("simple") == "--simple"
+        assert factory._format_arg_name("with_underscore") == "--with-underscore"
+        assert factory._format_arg_name("multiple_under_scores") == "--multiple-under-scores"
+
+    def test_format_arg_name_with_prefix(self):
+        """Test argument name formatting with prefix."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        assert factory._format_arg_name("feature", prefix="no") == "--no-feature"
+        assert factory._format_arg_name("feature", prefix="disable") == "--disable-feature"
+        assert factory._format_arg_name("multi_word", prefix="no") == "--no-multi-word"
+
+    def test_extract_type_primitive(self):
+        """Test type extraction for primitive types."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        assert factory._extract_type(int) == {"type": int}
+        assert factory._extract_type(str) == {"type": str}
+        assert factory._extract_type(float) == {"type": float}
+
+    def test_extract_type_optional(self):
+        """Test type extraction for Optional types."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        result = factory._extract_type(Optional[int])
+        assert result == {"type": int}
+
+        result = factory._extract_type(Optional[str])
+        assert result == {"type": str}
+
+    def test_extract_type_list(self):
+        """Test type extraction for list types."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        result = factory._extract_type(list[int])
+        assert result == {"type": int, "nargs": "+"}
+
+        result = factory._extract_type(list[str])
+        assert result == {"type": str, "nargs": "+"}
+
+    def test_extract_type_literal(self):
+        """Test type extraction for Literal types."""
+        factory = ArgumentGroupFactory(DummyConfig)
+
+        result = factory._extract_type(Literal["a", "b", "c"])
+        assert result == {"type": str, "choices": ("a", "b", "c")}
+
+        result = factory._extract_type(Literal[1, 2, 3])
+        assert result == {"type": int, "choices": (1, 2, 3)}
+
+
+@dataclass
+class ConfigWithArgparseMeta:
+    """Config with argparse_meta metadata for testing overrides."""
+
+    custom_help: str = field(
+        default="default_value",
+        metadata={"argparse_meta": {"help": "Custom help text from metadata"}},
+    )
+    """Original help text"""
+
+    custom_type: str = field(default="100", metadata={"argparse_meta": {"type": int}})
+    """Field with type override"""
+
+    custom_default: str = field(
+        default="original_default", metadata={"argparse_meta": {"default": "overridden_default"}}
+    )
+    """Field with default override"""
+
+    custom_choices: str = field(
+        default="option1",
+        metadata={"argparse_meta": {"choices": ["option1", "option2", "option3"]}},
+    )
+    """Field with choices override"""
+
+    custom_dest: str = field(
+        default="value", metadata={"argparse_meta": {"dest": "renamed_destination"}}
+    )
+    """Field with dest override"""
+
+    custom_action: bool = field(
+        default=False,
+        metadata={"argparse_meta": {"action": "store_const", "const": "special_value"}},
+    )
+    """Field with custom action override"""
+
+    multiple_overrides: int = field(
+        default=42,
+        metadata={
+            "argparse_meta": {
+                "type": str,
+                "help": "Multiple overrides applied",
+                "default": "999",
+                "dest": "multi_override_dest",
+            }
+        },
+    )
+    """Field with multiple metadata overrides"""
+
+    nargs_override: str = field(default="single", metadata={"argparse_meta": {"nargs": "?"}})
+    """Field with nargs override"""
+
+
+@dataclass
+class ConfigWithUnsupportedCallables:
+    """Config with argparse_meta metadata for testing overrides."""
+
+    unsupported_type: Optional[Callable] = None
+    """Cannot take a callable over CLI"""
+
+    unsupported_with_metadata: Optional[Callable] = field(
+        default=None, metadata={"argparse_meta": {"type": int, "choices": (0, 1, 2)}}
+    )
+    """This argument should be 0, 1, or 2. The appropriate
+    Callable will be set by some other logic.
+    """
+
+
+@dataclass
+class ConfigWithUnsupportedUnions:
+    """Config with argparse_meta metadata for testing overrides."""
+
+    unsupported_type: Union[int, str] = 0
+    """Cannot infer type of a Union"""
+
+    unsupported_with_metadata: Union[int, str] = field(
+        default=0, metadata={"argparse_meta": {"type": str, "choices": ("foo", "bar")}}
+    )
+    """Metadata should take precedence over the exception caused by Union"""
+
+
+class TestArgumentGroupFactoryArgparseMeta:
+    """Test argparse_meta metadata override functionality."""
+
+    def test_help_override(self):
+        """Test that argparse_meta can override help text."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Find the action for this argument
+        for action in parser._actions:
+            if hasattr(action, 'dest') and action.dest == 'custom_help':
+                assert action.help == "Custom help text from metadata"
+                return
+
+        pytest.fail("custom_help argument not found")
+
+    def test_type_override(self):
+        """Test that argparse_meta can override argument type."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Parse with integer value (metadata overrides type to int)
+        args = parser.parse_args(['--custom-type', '42'])
+
+        # Should be parsed as int, not str
+        assert isinstance(args.custom_type, int)
+        assert args.custom_type == 42
+
+    def test_default_override(self):
+        """Test that argparse_meta can override default value."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Parse with no arguments
+        args = parser.parse_args([])
+
+        # Should use metadata default, not field default
+        assert args.custom_default == "overridden_default"
+
+    def test_choices_override(self):
+        """Test that argparse_meta can override choices."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Valid choice from metadata
+        args = parser.parse_args(['--custom-choices', 'option2'])
+        assert args.custom_choices == "option2"
+
+        # Invalid choice should fail
+        with pytest.raises(SystemExit):
+            parser.parse_args(['--custom-choices', 'invalid_option'])
+
+    def test_dest_override(self):
+        """Test that argparse_meta can override destination name."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        args = parser.parse_args(['--custom-dest', 'test_value'])
+
+        # Should be stored in renamed destination
+        assert hasattr(args, 'renamed_destination')
+        assert args.renamed_destination == "test_value"
+
+    def test_action_override(self):
+        """Test that argparse_meta can override action."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # With custom action=store_const and const="special_value"
+        args = parser.parse_args(['--custom-action'])
+        assert args.custom_action == "special_value"
+
+        # Without flag, should use default
+        args = parser.parse_args([])
+        assert args.custom_action == False
+
+    def test_multiple_overrides(self):
+        """Test that multiple argparse_meta overrides work together."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # Parse with no arguments to check default override
+        args = parser.parse_args([])
+
+        # Check all overrides applied
+        assert hasattr(args, 'multi_override_dest')
+        assert args.multi_override_dest == "999"  # default override
+
+        # Parse with value to check type override
+        args = parser.parse_args(['--multiple-overrides', 'text_value'])
+        assert isinstance(args.multi_override_dest, str)  # type override
+        assert args.multi_override_dest == "text_value"
+
+        # Check help override was applied
+        for action in parser._actions:
+            if hasattr(action, 'dest') and action.dest == 'multi_override_dest':
+                assert action.help == "Multiple overrides applied"
+                break
+
+    def test_nargs_override(self):
+        """Test that argparse_meta can override nargs."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        factory.build_group(parser, title="Test Group")
+
+        # With nargs='?', argument is optional
+        args = parser.parse_args(['--nargs-override'])
+        assert args.nargs_override is None  # No value provided with '?'
+
+        # With value
+        args = parser.parse_args(['--nargs-override', 'provided_value'])
+        assert args.nargs_override == "provided_value"
+
+        # Without flag at all, should use default
+        args = parser.parse_args([])
+        assert args.nargs_override == "single"
+
+    def test_metadata_takes_precedence_over_inference(self):
+        """Test that metadata has highest precedence over type inference."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithArgparseMeta)
+
+        # Build kwargs for custom_type field which is str but metadata says int
+        from dataclasses import fields as dc_fields
+
+        for f in dc_fields(ConfigWithArgparseMeta):
+            if f.name == 'custom_type':
+                kwargs = factory._build_argparse_kwargs_from_field(f)
+                # Metadata type should override inferred type
+                assert kwargs['type'] == int
+                break
+
+    def test_unhandled_unsupported_callables(self):
+        """Test that an unsupported type produces a TypInferenceError."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(
+            ConfigWithUnsupportedCallables, exclude=["unsupported_with_metadata"]
+        )
+
+        with pytest.raises(TypeInferenceError, match="Unsupported type"):
+            factory.build_group(parser, title="Test Group")
+
+    def test_handled_unsupported_callables(self):
+        """Test an attribute with an unsupported type that has type info in the metadata."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(ConfigWithUnsupportedCallables, exclude=["unsupported_type"])
+
+        factory.build_group(parser, title="Test Group")
+
+        args = parser.parse_args(['--unsupported-with-metadata', '0'])
+        assert args.unsupported_with_metadata == 0
+
+    def test_unhandled_unsupported_unions(self):
+        """Test that an unsupported type produces a TypInferenceError."""
+        parser = ArgumentParser()
+        factory = ArgumentGroupFactory(
+            ConfigWithUnsupportedUnions, exclude=["unsupported_with_metadata"]
+        )
+
+        with pytest.raises(TypeInferenceError, match="Unions not supported by argparse"):
+            factory.build_group(parser, title="Test Group")
+
+    def test_handled_unsupported_unions(self):
+        """Test an attribute with an unsupported type that has type info in the metadata."""
+        parser = ArgumentParser(exit_on_error=False)
+        factory = ArgumentGroupFactory(ConfigWithUnsupportedUnions, exclude=["unsupported_type"])
+
+        factory.build_group(parser, title="Test Group")
+
+        args = parser.parse_args(['--unsupported-with-metadata', 'foo'])
+        assert args.unsupported_with_metadata == 'foo'
+
+        with pytest.raises(ArgumentError, match="invalid choice"):
+            args = parser.parse_args(['--unsupported-with-metadata', 'baz'])
diff --git a/tests/unit_tests/test_checkpointing.py b/tests/unit_tests/test_checkpointing.py
index 4bbf54301f5..9a7a44939a3 100644
--- a/tests/unit_tests/test_checkpointing.py
+++ b/tests/unit_tests/test_checkpointing.py
@@ -110,11 +110,9 @@ def create_args():
     args.no_load_rng = False
     args.log_progress = False
     args.ckpt_fully_parallel_save = False
-    args.dist_ckpt_save_pre_mcore_014 = False
     args.dist_ckpt_optim_fully_reshardable = False
     args.distrib_optim_fully_reshardable_mem_efficient = False
     args.auto_detect_ckpt_format = False
-    args.retro_add_retriever = False
     args.ckpt_convert_update_legacy_dist_opt_format = False
     args.ckpt_step = None
     args.swiglu = True
@@ -144,6 +142,7 @@ def create_ckpt_load_args(create_args):
     args.dist_ckpt_strictness = 'assume_ok_unexpected'
     args.use_megatron_fsdp = False
     args.strict_fsdp_dtensor_load = True
+    args.phase_transition_iterations = None
 
     yield args
 
diff --git a/tests/unit_tests/test_fp8_param.py b/tests/unit_tests/test_fp8_param.py
index 0b8d41769ec..361698f7127 100644
--- a/tests/unit_tests/test_fp8_param.py
+++ b/tests/unit_tests/test_fp8_param.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import contextlib
 import gc
@@ -36,7 +36,10 @@
 try:
     from transformer_engine.pytorch.tensor.utils import post_all_gather_processing
 
-    cuda_graph_supported = True
+    if is_te_min_version("2.10.0"):
+        cuda_graph_supported = True
+    else:
+        reason_for_no_cuda_graph = "Need newer TransformerEngine"
 except ImportError:
     reason_for_no_cuda_graph = "Need newer TransformerEngine"
 
@@ -65,12 +68,16 @@ class TestFP8Param:
     def setup_method(self, method):
         self.seq_length = 512
         self.micro_batch_size = 2
+        self.cuda_graph_helper = None
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
         destroy_global_vars()
         destroy_num_microbatches_calculator()
+        if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
+            self.cuda_graph_helper.delete_cuda_graphs()
+            self.cuda_graph_helper = None
         gc.collect()
 
     def model_provider(
@@ -209,13 +216,12 @@ def _run_test_helper(
             )
         assert len(gpt_model) == 1  # Assume only one model in the model provider.
 
-        cuda_graph_helper = None
         # Hard coded to use cuda_graph_impl="transformer_engine"
         cuda_graph_impl = "transformer_engine"
         if use_cuda_graph and cuda_graph_impl == "transformer_engine":
             from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
 
-            cuda_graph_helper = TECudaGraphHelper(
+            self.cuda_graph_helper = TECudaGraphHelper(
                 model=gpt_model,
                 config=gpt_model[0].config,
                 seq_length=self.seq_length,
@@ -250,13 +256,13 @@ def _run_test_helper(
             # Capture CUDA graphs after warmup if helper is provided.
             # Hard coded cuda_graph_warmup_steps = 0.
             cuda_graph_warmup_steps = 0
-            if cuda_graph_helper is not None and i == cuda_graph_warmup_steps:
+            if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps:
                 if should_disable_forward_pre_hook(args):
                     disable_forward_pre_hook(gpt_model, param_sync=False)
-                cuda_graph_helper.create_cudagraphs()
+                self.cuda_graph_helper.create_cudagraphs()
                 if should_disable_forward_pre_hook(args):
                     enable_forward_pre_hook(gpt_model)
-                    cuda_graph_helper.cuda_graph_set_manual_hooks()
+                    self.cuda_graph_helper.cuda_graph_set_manual_hooks()
 
             # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap,
             # we need to call the _copy_main_params_to_param_buffer() after the grad buffer
@@ -297,6 +303,10 @@ def _run_test_helper(
 
             loss_list.append(loss.item())
 
+        if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
+            self.cuda_graph_helper.delete_cuda_graphs()
+            self.cuda_graph_helper = None
+
         return torch.tensor(loss_list)
 
     def run_test(self, tp_size, recipe, inference: bool = False, **kwargs):
diff --git a/tests/unit_tests/test_layer_wise_optimizer.py b/tests/unit_tests/test_layer_wise_optimizer.py
new file mode 100644
index 00000000000..05ce26bcfa0
--- /dev/null
+++ b/tests/unit_tests/test_layer_wise_optimizer.py
@@ -0,0 +1,440 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import os
+
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging.version import Version
+
+from megatron.core import parallel_state
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
+from megatron.core.optimizer.layer_wise_optimizer import LayerWiseDistributedOptimizer
+from megatron.core.optimizer.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer import TransformerConfig
+from megatron.core.utils import get_pg_size
+from tests.unit_tests.test_utilities import Utils
+
+# Skip all tests in this file for LTS versions
+pytestmark = pytest.mark.skipif(
+    Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"),
+    reason="Skip layer-wise optimizer for LTS test",
+)
+
+
+class SimpleModel(nn.Module):
+    """Simple model for testing LayerWiseDistributedOptimizer.
+
+    Model with 5 layers to ensure more than 8 parameters (10 total: 5 weights + 5 biases).
+    """
+
+    def __init__(self, input_size=80, hidden_size=48, output_size=10):
+        super().__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, 32)
+        self.fc3 = nn.Linear(32, 24)
+        self.fc4 = nn.Linear(24, 16)
+        self.fc5 = nn.Linear(16, output_size)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        x = F.relu(self.fc4(x))
+        x = self.fc5(x)
+        return x
+
+
+class TinyModel(nn.Module):
+    """Tiny model with only 1 layer (2 parameters: weight and bias)."""
+
+    def __init__(self):
+        super().__init__()
+        self.fc1 = nn.Linear(10, 5)
+
+    def forward(self, x):
+        return self.fc1(x)
+
+
+@pytest.mark.skipif(
+    int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1"
+)
+class TestLayerWiseOptimizer:
+    """Test class for LayerWiseDistributedOptimizer with common setup code."""
+
+    @pytest.fixture(autouse=True)
+    def setup_and_teardown(self):
+        """Setup and teardown for each test."""
+        world = int(os.getenv('WORLD_SIZE', '1'))
+        rank = int(os.getenv('RANK', '0'))
+        Utils.initialize_model_parallel()
+        yield
+        Utils.destroy_model_parallel()
+
+    def create_model_and_optimizer(
+        self,
+        model_class=SimpleModel,
+        clip_grad=1.0,
+        model_kwargs=None,
+        use_layer_wise=True,
+        copy_from=None,
+    ):
+        """Create model, DDP wrapper, and optimizer.
+
+        Args:
+            model_class: Model class to instantiate
+            clip_grad: Optional gradient clipping value
+            model_kwargs: Optional kwargs for model initialization
+            use_layer_wise: If True, wrap optimizer in LayerWiseDistributedOptimizer;
+                          if False, use get_megatron_optimizer instead (for reference)
+
+        Returns:
+            tuple: (model, optimizer, pg_collection)
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+
+        model = model_class(**model_kwargs).bfloat16().cuda()
+        model.requires_grad_(True)
+
+        ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False)
+        model = DistributedDataParallel(
+            TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+        )
+        if copy_from:
+            model.module.load_state_dict(copy_from.module.state_dict())
+        else:
+            model.broadcast_params()
+
+        optimizer_config = OptimizerConfig(
+            optimizer='adam',
+            lr=0.01,
+            weight_decay=0.01,
+            bf16=not use_layer_wise,
+            use_distributed_optimizer=False,
+            clip_grad=clip_grad,
+        )
+
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True)
+        pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group()
+
+        optimizer = get_megatron_optimizer(optimizer_config, [model])
+        if use_layer_wise:
+            optimizer_config.bf16 = True
+            optimizer = LayerWiseDistributedOptimizer(
+                optimizer.chained_optimizers, optimizer_config, pg_collection
+            )
+        return model, optimizer, pg_collection
+
+    def create_reference_model(self, model):
+        """Create a reference model by cloning the current model."""
+        reference_model = type(model.module)().bfloat16().cuda()
+        reference_model.load_state_dict(model.module.state_dict())
+        return reference_model
+
+    def test_basic(self):
+        """Test basic LayerWiseDistributedOptimizer initialization and step with bf16."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer()
+
+        # Verify basic properties
+        assert optimizer is not None, "Optimizer should not be None"
+        assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer"
+
+        reference_model = self.create_reference_model(model)
+
+        input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda')
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        update_successful, grad_norm, num_zeros = optimizer.step()
+
+        assert update_successful, "Optimizer step should be successful"
+
+        # Verify parameters were updated
+        params_updated = 0
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            if not torch.equal(param.data, ref_param.data):
+                params_updated += 1
+
+        assert params_updated > 0, "At least some parameters should be updated"
+
+        # Verify all ranks have the same updated parameters (test allgather)
+        dp_size = get_pg_size(pg_collection.dp_cp)
+
+        if dp_size > 1:
+            for name, param in model.named_parameters():
+                # Gather parameters from all ranks
+                param_list = [torch.zeros_like(param.data) for _ in range(dp_size)]
+                torch.distributed.all_gather(param_list, param.data, group=pg_collection.dp_cp)
+
+                # Verify all ranks have the same parameter values
+                for i in range(1, dp_size):
+                    try:
+                        torch.testing.assert_close(param_list[0], param_list[i])
+                    except AssertionError as e:
+                        # Append additional context without overwriting the default message
+                        raise AssertionError(
+                            f"Parameter {name} differs between rank 0 and rank {i}. {str(e)}"
+                        ) from None
+
+    def test_get_grad_norm(self):
+        """Test LayerWiseDistributedOptimizer gradient norm computation."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer()
+        reference_model, reference_optimizer, _ = self.create_model_and_optimizer(
+            use_layer_wise=False
+        )
+
+        # Set same gradients on both models
+        # note that model is different at this point but we're only testing grad norm here
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            grad_value = torch.randn_like(param)
+            torch.distributed.broadcast(grad_value, src=0, group=pg_collection.dp_cp)
+            param.main_grad = grad_value.float().detach()
+            ref_param.main_grad = grad_value.float().detach()
+
+        # Test get_grad_norm on both optimizers
+        optimizer.prepare_grads()
+        grad_norm = optimizer.get_grad_norm()
+
+        reference_optimizer.prepare_grads()
+        reference_grad_norm = reference_optimizer.get_grad_norm()
+
+        assert grad_norm is not None, "Grad norm should not be None"
+        assert grad_norm >= 0, "Grad norm should be non-negative"
+
+        # Compare with reference optimizer grad norm
+        torch.testing.assert_close(grad_norm, reference_grad_norm, rtol=1e-5, atol=1e-5)
+
+    def test_state_dict(self):
+        """Test LayerWiseDistributedOptimizer state dict save and load."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer()
+
+        for param in model.parameters():
+            param.grad = torch.randn_like(param)
+        optimizer.step()
+
+        # Test state_dict
+        state_dict = optimizer.state_dict()
+
+        # Test load_state_dict
+        # TODO(deyuf): fix this. not going through get() will cause missing keys like wd_mult
+        # optimizer.load_state_dict(state_dict)
+
+    def test_sharded_state_dict(self):
+        """Test LayerWiseDistributedOptimizer sharded_state_dict method."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer()
+
+        for param in model.parameters():
+            param.grad = torch.randn_like(param)
+        optimizer.step()
+
+        # Get model sharded state dict
+        model_sharded_state_dict = model.sharded_state_dict()
+
+        # Test sharded_state_dict
+        sharded_state_dict = optimizer.sharded_state_dict(model_sharded_state_dict)
+
+        # Verify the sharded_state_dict is not None and has expected structure
+        assert sharded_state_dict is not None, "Sharded state dict should not be None"
+        assert (
+            'optimizer' in sharded_state_dict
+        ), "Sharded state dict should contain 'optimizer' key"
+
+        # Verify that replica_id is set correctly (should be 0 for DP dimension)
+        from megatron.core.dist_checkpointing import ShardedTensor
+        from megatron.core.dist_checkpointing.dict_utils import nested_values
+
+        for sh_base in nested_values(sharded_state_dict):
+            if isinstance(sh_base, ShardedTensor):
+                assert (
+                    len(sh_base.replica_id) == 3
+                ), f'Expected replica_id format (PP, TP, DP), got: {sh_base.replica_id}'
+                assert (
+                    sh_base.replica_id[2] == 0
+                ), f'Expected DP replica_id to be 0 for layer-wise optimizer, got: {sh_base.replica_id[2]}'
+
+    def test_multiple_optimizers(self):
+        """Test LayerWiseDistributedOptimizer with multiple chained optimizers.
+
+        This test properly tests allgather functionality with multiple ranks.
+        """
+        model = SimpleModel().bfloat16().cuda()
+        model.requires_grad_(True)
+
+        ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False)
+        model = DistributedDataParallel(
+            TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+        )
+
+        optimizer_config = OptimizerConfig(
+            optimizer='adam', lr=0.01, bf16=True, use_distributed_optimizer=False
+        )
+
+        # Split parameters into two groups for testing multiple optimizers
+        params = list(model.parameters())
+        mid_point = len(params) // 2
+        param_groups_1 = [{'params': params[:mid_point]}]
+        param_groups_2 = [{'params': params[mid_point:]}]
+
+        # Create two separate base optimizers
+        base_optimizer_1 = torch.optim.Adam(param_groups_1, lr=optimizer_config.lr)
+        base_optimizer_2 = torch.optim.Adam(param_groups_2, lr=optimizer_config.lr)
+
+        wrapped_optimizer_1 = FP32Optimizer(base_optimizer_1, optimizer_config, None)
+        wrapped_optimizer_2 = FP32Optimizer(base_optimizer_2, optimizer_config, None)
+
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True)
+        pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group()
+
+        optimizer = LayerWiseDistributedOptimizer(
+            [wrapped_optimizer_1, wrapped_optimizer_2], optimizer_config, pg_collection
+        )
+
+        assert len(optimizer.chained_optimizers) == 2, "Should have two chained optimizers"
+
+        # Set gradients and test optimizer step - this will trigger allgather
+        for param in model.parameters():
+            param.grad = torch.randn_like(param)
+
+        update_successful, grad_norm, num_zeros = optimizer.step()
+
+        assert update_successful, "Optimizer step should be successful"
+
+    def test_bf16_wrapping(self):
+        """Test LayerWiseDistributedOptimizer automatically wraps optimizer with bf16."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer()
+
+        # Verify bf16 wrapping happened
+        assert isinstance(
+            optimizer.chained_optimizers[0], Float16OptimizerWithFloat16Params
+        ), "Optimizer should be wrapped in Float16OptimizerWithFloat16Params"
+
+        for param in model.parameters():
+            param.grad = torch.randn_like(param)
+
+        update_successful, grad_norm, num_zeros = optimizer.step()
+
+        assert update_successful, "Optimizer step should be successful"
+
+    def test_bf16_error(self):
+        """Test LayerWiseDistributedOptimizer raises error when receiving pre-wrapped Float16 optimizer."""
+        model = SimpleModel().bfloat16().cuda()
+        model.requires_grad_(True)
+
+        ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False)
+        model = DistributedDataParallel(
+            TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+        )
+
+        optimizer_config = OptimizerConfig(
+            optimizer='adam', lr=0.01, bf16=True, use_distributed_optimizer=False
+        )
+
+        # Create base optimizer and manually wrap in Float16 optimizer
+        param_groups = [{'params': list(model.parameters())}]
+        base_optimizer = torch.optim.Adam(param_groups, lr=optimizer_config.lr)
+        wrapped_optimizer = Float16OptimizerWithFloat16Params(
+            base_optimizer, optimizer_config, None, None
+        )
+
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+        pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True)
+        pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group()
+
+        # Should raise TypeError when receiving already-wrapped Float16 optimizer
+        with pytest.raises(
+            TypeError, match='LayerWiseDistributedOptimizer received Float16 optimizer already'
+        ):
+            LayerWiseDistributedOptimizer([wrapped_optimizer], optimizer_config, pg_collection)
+
+    def _run_parameter_update_test(self, model_class=SimpleModel):
+        """Helper method to test parameter updates with a given model class.
+
+        Args:
+            model_class: Model class to use for testing
+        """
+        model, optimizer, pg_collection = self.create_model_and_optimizer(model_class=model_class)
+
+        # Create reference model and optimizer using the same function
+        reference_model, reference_optimizer, _ = self.create_model_and_optimizer(
+            model_class=model_class, use_layer_wise=False, copy_from=model
+        )
+
+        # Set same gradients on both models
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            assert torch.equal(param.data, ref_param.data)
+            torch.testing.assert_close(param.data, ref_param.data, rtol=1e-5, atol=1e-5)
+            grad_value = torch.randn_like(param)
+            torch.distributed.broadcast(grad_value, src=0, group=pg_collection.dp_cp)
+            param.main_grad = grad_value.clone().detach()
+            ref_param.main_grad = grad_value.clone().detach()
+
+        optimizer.step()
+
+        # Verify at least some parameters were updated
+        params_updated = 0
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            if not torch.equal(param.data, ref_param.data):
+                params_updated += 1
+
+        assert params_updated > 0, "At least some parameters should be updated"
+
+        reference_optimizer.step()
+
+        # Verify updated values match reference optimizer
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            torch.testing.assert_close(param.data, ref_param.data, rtol=1e-5, atol=1e-5)
+
+    def test_parameter_updates(self):
+        """Test LayerWiseDistributedOptimizer actually updates model parameters."""
+        self._run_parameter_update_test()
+
+    def test_parameter_updates_insufficient_parameters(self):
+        """Test LayerWiseDistributedOptimizer when there are insufficient parameters for all ranks.
+
+        Uses a tiny model with only 1 layer (2 parameters: weight and bias).
+        This will be insufficient when world size > 2.
+        """
+        self._run_parameter_update_test(model_class=TinyModel)
+
+    def test_broadcast_vs_allgather(self):
+        """Test LayerWiseDistributedOptimizer allgather code agains broadcast code."""
+        model, optimizer, pg_collection = self.create_model_and_optimizer(model_class=SimpleModel)
+
+        # Create reference model and optimizer using the same function
+        reference_model, reference_optimizer, _ = self.create_model_and_optimizer(
+            model_class=SimpleModel, copy_from=model
+        )
+
+        # Set same gradients on both models
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            assert torch.equal(param.data, ref_param.data)
+            torch.testing.assert_close(param.data, ref_param.data, rtol=0, atol=0)
+            grad_value = torch.randn_like(param)
+            torch.distributed.broadcast(grad_value, src=0, group=pg_collection.dp_cp)
+            param.main_grad = grad_value.clone().detach()
+            ref_param.main_grad = grad_value.clone().detach()
+
+        optimizer.step()
+
+        # Verify at least some parameters were updated
+        params_updated = 0
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            if not torch.equal(param.data, ref_param.data):
+                params_updated += 1
+
+        assert params_updated > 0, "At least some parameters should be updated"
+
+        # step() internal call allgather_params. replace reference object with bcast
+        reference_optimizer.allgather_params = reference_optimizer.broadcast_params
+        reference_optimizer.step()
+
+        # Verify updated values match reference optimizer
+        for param, ref_param in zip(model.parameters(), reference_model.parameters()):
+            torch.testing.assert_close(param.data, ref_param.data, rtol=0, atol=0)
diff --git a/tests/unit_tests/test_muon_optimizer.py b/tests/unit_tests/test_muon_optimizer.py
new file mode 100644
index 00000000000..cc99f7a16e6
--- /dev/null
+++ b/tests/unit_tests/test_muon_optimizer.py
@@ -0,0 +1,670 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import os
+
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging.version import Version
+
+from megatron.core import parallel_state
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.optimizer import OptimizerConfig
+from megatron.core.optimizer.muon import TensorParallelMuon, get_megatron_muon_optimizer
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+# Skip all tests in this file for LTS versions
+pytestmark = pytest.mark.skipif(
+    Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"),
+    reason="Skip muon optimizer for LTS test",
+)
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = nn.Linear(80, 48)
+        self.fc2 = nn.Linear(48, 32)
+        self.fc3 = nn.Linear(32, 24)
+        self.fc4 = nn.Linear(24, 16)
+        self.fc5 = nn.Linear(16, 10)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        x = F.relu(self.fc4(x))
+        x = self.fc5(x)
+        return x
+
+
+def test_muon_optimizer_smoke():
+    """Smoke test for TensorParallelMuon optimizer."""
+    # Create a simple linear model for testing
+    model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    # Create TensorParallelMuon optimizer
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        momentum_beta=0.95,
+        use_nesterov=True,
+        weight_decay=0.01,
+        use_decoupled_weight_decay=True,
+        split_qkv=False,
+        fp32_matmul_prec="medium",
+        num_ns_steps=5,
+        scale_mode="spectral",
+        extra_scale_factor=1.0,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    # Test basic properties
+    assert optimizer is not None, "Optimizer should not be None"
+    assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups"
+    assert len(optimizer.param_groups) > 0, "Optimizer should have at least one parameter group"
+
+    # Test forward and backward pass
+    input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    # Store original weight
+    original_weight = model.weight.data.clone()
+
+    # Test optimizer step
+    optimizer.step()
+
+    # Verify weight was updated
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), "Weight should be updated after optimizer step"
+
+    # Test zero_grad
+    optimizer.zero_grad()
+    assert model.weight.grad is None or torch.all(
+        model.weight.grad == 0
+    ), "Gradients should be zeroed"
+
+    # Test state_dict and load_state_dict
+    state_dict = optimizer.state_dict()
+    assert 'state' in state_dict, "State dict should contain state"
+    assert 'param_groups' in state_dict, "State dict should contain param_groups"
+
+    # Load state dict should not raise error
+    optimizer.load_state_dict(state_dict)
+
+
+@pytest.mark.skipif(
+    int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1"
+)
+class TestMuonOptimizerMultiRank:
+    """Test class for Muon optimizer with multi-rank setup."""
+
+    @pytest.fixture(autouse=True)
+    def setup_and_teardown(self):
+        """Setup and teardown for each test."""
+        Utils.initialize_model_parallel()
+        yield
+        Utils.destroy_model_parallel()
+
+    def create_ddp_model(self, model):
+        """Wrap model in DDP.
+
+        Args:
+            model: Model to wrap
+
+        Returns:
+            DDP-wrapped model
+        """
+        ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False)
+        return DistributedDataParallel(
+            TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+        )
+
+    def test_get_megatron_muon_optimizer_smoke(self):
+        """Smoke test for get_megatron_muon_optimizer function."""
+        model = Net().bfloat16().cuda()
+        model.requires_grad_(True)
+        model = self.create_ddp_model(model)
+
+        # Ensure all parameters require gradients
+        for param in model.parameters():
+            assert param.requires_grad, "All parameters should require gradients"
+
+        # Create optimizer config for Muon
+        optimizer_config = OptimizerConfig(
+            optimizer='muon',  # This will be changed internally to 'adam' for non-linear params
+            lr=0.01,
+            weight_decay=0.01,
+            bf16=True,
+            use_distributed_optimizer=False,  # Muon doesn't support distributed optimizer
+            muon_momentum=0.95,
+            muon_use_nesterov=True,
+            muon_fp32_matmul_prec="medium",
+            muon_num_ns_steps=5,
+            muon_scale_mode="spectral",
+            muon_tp_mode="duplicated",
+        )
+
+        # Test creating the optimizer
+        optimizer = get_megatron_muon_optimizer(
+            config=optimizer_config,
+            model_chunks=[model],
+            use_gloo_process_groups=True,
+            layer_wise_distributed_optimizer=False,
+        )
+
+        # Test basic properties
+        assert optimizer is not None, "Optimizer should not be None"
+        assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups"
+        assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer"
+        assert len(optimizer.chained_optimizers) >= 1, "Should have at least one chained optimizer"
+
+        # Test forward and backward pass
+        input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda')
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        # Store original parameters
+        original_params = {}
+        for name, param in model.named_parameters():
+            original_params[name] = param.data.clone()
+
+        # Test optimizer step
+        optimizer.step()
+
+        # Verify at least some parameters were updated
+        params_updated = 0
+        for name, param in model.named_parameters():
+            if not torch.equal(param.data, original_params[name]):
+                params_updated += 1
+
+        assert params_updated > 0, "At least some parameters should be updated after optimizer step"
+
+        # Test zero_grad
+        optimizer.zero_grad()
+        for param in model.parameters():
+            assert param.grad is None or torch.all(
+                param.grad == 0
+            ), f"Gradients should be zeroed for all parameters"
+
+        # Test state_dict and load_state_dict
+        state_dict = optimizer.state_dict()
+        assert isinstance(state_dict, list), "State dict should be a list"
+
+        # Load state dict should not raise error
+        optimizer.load_state_dict(state_dict)
+
+    def test_get_megatron_muon_optimizer_validation(self):
+        """Test validation logic for get_megatron_muon_optimizer."""
+        model = torch.nn.Linear(100, 50, bias=False, dtype=torch.bfloat16, device='cuda')
+        model.requires_grad_(True)
+        model = self.create_ddp_model(model)
+
+        # Test 1: Distributed optimizer should raise exception
+        optimizer_config_dist = OptimizerConfig(
+            optimizer='muon',
+            lr=0.01,
+            bf16=True,
+            use_distributed_optimizer=True,  # This should cause an exception
+        )
+
+        with pytest.raises(Exception, match='muon with dist optimizer is not supported'):
+            get_megatron_muon_optimizer(config=optimizer_config_dist, model_chunks=[model])
+
+        # Test 2: FP16 should raise exception
+        optimizer_config_fp16 = OptimizerConfig(
+            optimizer='muon',
+            lr=0.01,
+            fp16=True,  # This should cause an exception
+            use_distributed_optimizer=False,
+        )
+
+        with pytest.raises(Exception, match='muon with fp16 is not supported'):
+            get_megatron_muon_optimizer(config=optimizer_config_fp16, model_chunks=[model])
+
+        # Test 3: Invalid num_ns_steps should raise exception
+        optimizer_config_invalid_ns = OptimizerConfig(
+            optimizer='muon',
+            lr=0.01,
+            bf16=True,
+            use_distributed_optimizer=False,
+            muon_num_ns_steps=0,  # This should cause an exception
+        )
+
+        with pytest.raises(ValueError, match='num_ns_steps must be at least 1'):
+            get_megatron_muon_optimizer(config=optimizer_config_invalid_ns, model_chunks=[model])
+
+    def test_get_megatron_muon_optimizer_layer_wise(self):
+        """Test get_megatron_muon_optimizer with layer-wise distributed optimizer."""
+        model = Net().bfloat16().cuda()
+        model.requires_grad_(True)
+        model = self.create_ddp_model(model)
+
+        optimizer_config = OptimizerConfig(
+            optimizer='muon',
+            lr=0.01,
+            weight_decay=0.01,
+            bf16=True,
+            use_distributed_optimizer=False,
+            muon_momentum=0.95,
+            muon_use_nesterov=True,
+            muon_fp32_matmul_prec="medium",
+            muon_num_ns_steps=5,
+            muon_scale_mode="spectral",
+            muon_tp_mode="duplicated",
+        )
+
+        # Test with layer_wise_distributed_optimizer=True
+        optimizer = get_megatron_muon_optimizer(
+            config=optimizer_config,
+            model_chunks=[model],
+            use_gloo_process_groups=True,
+            layer_wise_distributed_optimizer=True,
+        )
+
+        # Verify it's a LayerWiseDistributedOptimizer
+        from megatron.core.optimizer.layer_wise_optimizer import LayerWiseDistributedOptimizer
+
+        assert isinstance(
+            optimizer, LayerWiseDistributedOptimizer
+        ), "Should return LayerWiseDistributedOptimizer"
+
+        # Test forward and backward pass
+        input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda')
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        # Test optimizer step
+        update_successful, grad_norm, num_zeros = optimizer.step()
+
+        assert update_successful, "Optimizer step should be successful"
+        assert grad_norm is not None or grad_norm is None, "Grad norm should be returned"
+
+
+@pytest.mark.parametrize("mode", ["duplicated", "blockwise", "distributed"])
+def test_muon_optimizer_different_modes_single_rank(mode):
+    """Test TensorParallelMuon optimizer with different modes on single rank.
+
+    When TP size is 1, all modes should produce the same result.
+    """
+    # Set random seed for reproducibility
+    torch.manual_seed(42)
+    torch.cuda.manual_seed(42)
+
+    model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.normal_(0, 0.02)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        momentum_beta=0.95,
+        weight_decay=0.0,  # Disable weight decay for deterministic comparison
+        num_ns_steps=5,
+        pg_collection=None,
+        mode=mode,
+    )
+
+    # Use fixed input for deterministic results
+    torch.manual_seed(42)
+    input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda')
+
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    # Verify weight was updated
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), f"Weight should be updated with mode={mode}"
+
+
+@pytest.mark.skipif(
+    int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1"
+)
+class TestMuonOptimizerMultiRankTP:
+    """Test class for Muon optimizer with multi-rank and tensor parallel setup."""
+
+    @pytest.fixture(autouse=True)
+    def setup_and_teardown(self):
+        """Setup and teardown for each test with tensor parallel."""
+        world = int(os.getenv('WORLD_SIZE', '1'))
+        Utils.initialize_model_parallel(tensor_model_parallel_size=min(world, 2))
+        yield
+        Utils.destroy_model_parallel()
+
+    def create_tp_model_and_optimizer(self, mode):
+        """Create model with TP and optimizer.
+
+        Args:
+            mode: Muon optimizer mode
+
+        Returns:
+            tuple: (model, optimizer, pg_collection)
+        """
+        rank = int(os.getenv('RANK', '0'))
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+
+        # Create model with partition_dim for TP
+        torch.manual_seed(42 + rank)
+        model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda')
+        model.requires_grad_(True)
+        model.weight.data.normal_(0, 0.02)
+        model.weight.partition_dim = 0  # Set partition dimension for TP
+
+        optimizer = TensorParallelMuon(
+            params=[model.weight],
+            lr=0.01,
+            momentum_beta=0.95,
+            weight_decay=0.0,
+            num_ns_steps=5,
+            pg_collection=pg_collection,
+            mode=mode,
+        )
+
+        return model, optimizer
+
+    @pytest.mark.parametrize("mode", ["duplicated", "distributed"])
+    def test_muon_optimizer_modes_multirank_same_result(self, mode):
+        """Test that duplicated and distributed modes produce same results with TP > 1."""
+        model, optimizer = self.create_tp_model_and_optimizer(mode)
+
+        # Use fixed input for deterministic results
+        torch.manual_seed(42)
+        input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda')
+
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        original_weight = model.weight.data.clone()
+        optimizer.step()
+
+        # Verify weight was updated
+        assert not torch.equal(
+            model.weight.data, original_weight
+        ), f"Weight should be updated with mode={mode}"
+
+    def test_muon_optimizer_blockwise_mode_different_result(self):
+        """Test that blockwise mode produces different results than duplicated/distributed with TP > 1."""
+        model, optimizer = self.create_tp_model_and_optimizer("blockwise")
+
+        # Use fixed input for deterministic results
+        torch.manual_seed(42)
+        input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda')
+
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        original_weight = model.weight.data.clone()
+        optimizer.step()
+
+        # Verify weight was updated
+        assert not torch.equal(
+            model.weight.data, original_weight
+        ), "Weight should be updated with mode=blockwise"
+
+
+@pytest.mark.parametrize(
+    "coefficient_type_and_steps", [("simple", 3), ("quintic", 5), ("polar_express", 8)]
+)
+def test_muon_optimizer_coefficient_types(coefficient_type_and_steps):
+    """Test TensorParallelMuon optimizer with different coefficient types."""
+    model = torch.nn.Linear(80, 40, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        coefficient_type=coefficient_type_and_steps[0],
+        num_ns_steps=coefficient_type_and_steps[1],
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, 80, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), f"Weight should be updated with coefficient_type={coefficient_type_and_steps[0]} and num_ns_steps={coefficient_type_and_steps[1]}"
+
+
+@pytest.mark.parametrize("scale_mode", ["spectral", "unit_rms_norm", "shape_scaling"])
+def test_muon_optimizer_scale_modes(scale_mode):
+    """Test TensorParallelMuon optimizer with different scale modes."""
+    model = torch.nn.Linear(60, 30, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        scale_mode=scale_mode,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, 60, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), f"Weight should be updated with scale_mode={scale_mode}"
+
+
+@pytest.mark.parametrize("use_nesterov", [True, False])
+def test_muon_optimizer_nesterov(use_nesterov):
+    """Test TensorParallelMuon optimizer with and without Nesterov momentum."""
+    model = torch.nn.Linear(50, 25, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        momentum_beta=0.9,
+        use_nesterov=use_nesterov,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, 50, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), f"Weight should be updated with use_nesterov={use_nesterov}"
+
+
+def test_muon_optimizer_multiple_steps():
+    """Test TensorParallelMuon optimizer across multiple optimization steps."""
+    model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        momentum_beta=0.95,
+        weight_decay=0.01,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    weights_history = [model.weight.data.clone()]
+
+    for i in range(3):
+        input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda')
+        output = model(input_tensor)
+        loss = output.sum()
+        loss.backward()
+
+        optimizer.step()
+        optimizer.zero_grad()
+        weights_history.append(model.weight.data.clone())
+
+    # Verify weights changed at each step
+    for i in range(len(weights_history) - 1):
+        assert not torch.equal(
+            weights_history[i], weights_history[i + 1]
+        ), f"Weight should change at step {i}"
+
+
+def test_muon_optimizer_qkv_split():
+    """Test TensorParallelMuon optimizer with QKV splitting."""
+    # Create a model with QKV-like parameter
+    qkv_size = 3 * 64 * 16  # Combined Q, K, V dimensions, 16 heads x 64 per head
+    hidden_size = 1024
+    model = torch.nn.Linear(hidden_size, qkv_size, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    # Mark parameter as QKV
+    model.weight.is_qkv = True
+
+    # QKV split shapes: [Q_size, K_size, V_size]
+    qkv_split_shapes = (64, 64, 64)
+
+    # Test with split_qkv=True
+    optimizer_split = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        split_qkv=True,
+        is_qkv_fn=lambda p: getattr(p, 'is_qkv', False),
+        qkv_split_shapes=qkv_split_shapes,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, hidden_size, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer_split.step()
+    weight_with_split = model.weight.data.clone()
+
+    assert not torch.equal(
+        weight_with_split, original_weight
+    ), "QKV weight should be updated with split_qkv=True"
+
+    # Reset model and test with split_qkv=False
+    model.weight.data.fill_(1.0)
+    optimizer_no_split = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        split_qkv=False,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    optimizer_no_split.step()
+    weight_without_split = model.weight.data.clone()
+
+    assert not torch.equal(
+        weight_without_split, original_weight
+    ), "QKV weight should be updated with split_qkv=False"
+
+    # Ensure the two results are different
+    assert not torch.equal(
+        weight_with_split, weight_without_split
+    ), "Weights should be different between split_qkv=True and split_qkv=False"
+
+
+def test_muon_optimizer_extra_scale_factor():
+    """Test TensorParallelMuon optimizer with different extra_scale_factor values."""
+    model = torch.nn.Linear(80, 40, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        extra_scale_factor=2.0,
+        num_ns_steps=5,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, 80, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), "Weight should be updated with extra_scale_factor"
+
+
+@pytest.mark.parametrize("num_ns_steps", [5, 15, 25])
+def test_muon_optimizer_num_ns_steps(num_ns_steps):
+    """Test TensorParallelMuon optimizer with different numbers of Newton-Schulz steps."""
+    model = torch.nn.Linear(60, 30, bias=False, dtype=torch.float32, device='cuda')
+    model.requires_grad_(True)
+    model.weight.data.fill_(1.0)
+
+    optimizer = TensorParallelMuon(
+        params=[model.weight],
+        lr=0.01,
+        coefficient_type="quintic",
+        num_ns_steps=num_ns_steps,
+        pg_collection=None,
+        mode="duplicated",
+    )
+
+    input_tensor = torch.randn(16, 60, dtype=torch.float32, device='cuda')
+    output = model(input_tensor)
+    loss = output.sum()
+    loss.backward()
+
+    original_weight = model.weight.data.clone()
+    optimizer.step()
+
+    assert not torch.equal(
+        model.weight.data, original_weight
+    ), f"Weight should be updated with num_ns_steps={num_ns_steps}"
diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py
index cec200b6a4c..0cd2988e545 100644
--- a/tests/unit_tests/test_optimizer.py
+++ b/tests/unit_tests/test_optimizer.py
@@ -1,4 +1,7 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import os
+from unittest.mock import patch
 
 import pytest
 import torch
@@ -10,7 +13,17 @@
 from transformer_engine.pytorch.fp8 import fp8_autocast
 
 from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
-from megatron.core.optimizer import ChainedOptimizer, OptimizerConfig, get_megatron_optimizer
+from megatron.core.optimizer import (
+    ChainedOptimizer,
+    OptimizerConfig,
+    ParamKey,
+    ParamPredicate,
+    _get_param_groups,
+    check_config_overrides_consistency,
+    get_megatron_optimizer,
+    get_standard_config_overrides,
+)
+from megatron.core.optimizer_param_scheduler import ParamGroupOverride
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import is_te_min_version, is_torch_min_version
@@ -22,7 +35,7 @@
     from transformer_engine.pytorch.fp8 import check_fp8_block_scaling_support
 
     fp8_block_scaling_available, reason_for_no_fp8_block_scaling = check_fp8_block_scaling_support()
-    from transformer_engine.common.recipe import Float8BlockScaling, Format
+    from transformer_engine.common.recipe import DelayedScaling, Float8BlockScaling, Format
 except:
     fp8_block_scaling_available = False
     reason_for_no_fp8_block_scaling = "FP8 block scaled GEMM requires Hopper and CUDA >= 12.9."
@@ -33,7 +46,7 @@
 
 
 class Net(nn.Module):
-    def __init__(self):
+    def __init__(self, add_layernorm=False):
         super().__init__()
         self.conv1 = nn.Conv2d(3, 6, 5)
         self.pool = nn.MaxPool2d(2, 2)
@@ -41,6 +54,10 @@ def __init__(self):
         self.fc1 = nn.Linear(16 * 5 * 5, 120)
         self.fc2 = nn.Linear(120, 84)
         self.fc3 = nn.Linear(84, 10)
+        if add_layernorm:
+            self.q_layernorm = nn.LayerNorm(10, bias=False)
+            self.k_layernorm = nn.LayerNorm(10, bias=False)
+            self.layernorm = nn.LayerNorm(10, bias=False)
 
     def forward(self, x):
         x = self.pool(F.relu(self.conv1(x)))
@@ -52,6 +69,224 @@ def forward(self, x):
         return x
 
 
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_no_overrides(mock_get_world_size):
+    net = Net()
+    # NOTE: to get no overrides, supply an empty dictionary rather than None.
+    param_groups = _get_param_groups([net], OptimizerConfig(optimizer='adam', lr=0.01), {})
+    assert len(param_groups) == 1
+    pg0 = param_groups[0]
+    assert pg0.keys() == {
+        'params',
+        'is_expert_parallel',
+        'default_config',
+        'wd_mult',
+        'lr_mult',
+        'is_decoupled_lr',
+        'max_lr',
+        'min_lr',
+        'is_vision_model_param',
+    }
+    assert pg0['params'] == list(net.parameters())
+    assert pg0['is_expert_parallel'] == False
+    assert pg0['default_config'] == True
+    assert pg0['wd_mult'] == 1.0
+    assert pg0['lr_mult'] == 1.0
+    assert pg0['is_decoupled_lr'] == False
+    assert pg0['max_lr'] == 0.01  # from the optimizer config default for lr
+    assert pg0['min_lr'] is None  # from the optimizer config default.
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_default_overrides(mock_get_world_size):
+    """Test that the default overrides are applied to the parameter groups."""
+    net = Net()
+    # NOTE: to get legacy default overrides, supply None.
+    opt_config = OptimizerConfig(optimizer='adam', lr=0.01)
+    check_config_overrides_consistency(opt_config, None)
+    param_groups = _get_param_groups([net], opt_config, None)
+    assert len(param_groups) == 2
+    pg0, pg1 = param_groups
+    wd_mults = {pg0['wd_mult'], pg1['wd_mult']}
+    assert wd_mults == {1.0, 0.0}
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_with_overrides(mock_get_world_size):
+    net = Net()
+    config_overrides = {
+        ParamKey(
+            name="*.bias",
+            predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1),
+        ): ParamGroupOverride(wd_mult=0.0)
+    }
+    opt_config = OptimizerConfig(optimizer='adam', lr=0.01)
+    check_config_overrides_consistency(opt_config, config_overrides)
+    param_groups = _get_param_groups([net], opt_config, config_overrides)
+    assert len(param_groups) == 2
+    p_set = set(net.parameters())
+
+    assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params'])
+    assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params'])
+    assert param_groups[0]['wd_mult'] == 0.0 or param_groups[1]['wd_mult'] == 0.0
+    assert param_groups[0]['wd_mult'] == 1.0 or param_groups[1]['wd_mult'] == 1.0
+    assert len(param_groups[0]['params']) > 0 and len(param_groups[1]['params']) > 0
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_multiple_matches(mock_get_world_size):
+    net = Net()
+
+    param_groups = _get_param_groups(
+        [net],
+        OptimizerConfig(optimizer='adam', lr=0.01),
+        {
+            ParamKey(name="*.bias"): ParamGroupOverride(min_lr=1e-4, wd_mult=0.0),
+            ParamKey(
+                predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1)
+            ): ParamGroupOverride(wd_mult=0.0, min_lr=1e-4),
+        },
+    )
+    config_overrides = {
+        ParamKey(
+            name="*.bias",
+            predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1),
+        ): ParamGroupOverride(min_lr=1e-4, wd_mult=0.0)
+    }
+    opt_config = OptimizerConfig(optimizer='adam', lr=0.01)
+    check_config_overrides_consistency(opt_config, config_overrides)
+    param_groups2 = _get_param_groups([net], opt_config, config_overrides)
+    assert len(param_groups) == 2
+    assert param_groups == param_groups2
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_overlapping_matches(mock_get_world_size):
+    """In this test, we see if we can have two matches that create three param groups."""
+    net = Net()
+    # We expect that all convolution parameters will have wd_mult=0.0
+    #  However the conv1 related parameters will additionally have a different LR schedule.
+    #  this should create three param groups (no match, conv1 (both wd_mult=0.0 and LR schedule), conv2 (only wd_mult=0.0))
+    config_overrides = {
+        ParamKey(name="*conv*"): ParamGroupOverride(wd_mult=0.0),
+        ParamKey(name="*conv1*"): ParamGroupOverride(min_lr=10, max_lr=20),
+    }
+    opt_config = OptimizerConfig(optimizer='adam', lr=0.01)
+    check_config_overrides_consistency(opt_config, config_overrides)
+    param_groups = _get_param_groups([net], opt_config, config_overrides)
+    assert len(param_groups) == 3
+    p_set = set(net.parameters())
+    assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params']) | set(
+        param_groups[2]['params']
+    )
+    assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params']) + len(
+        param_groups[2]['params']
+    )
+    assert (
+        param_groups[0]['wd_mult'] == 1.0
+    ), "We expect the first param group to be the None one, which should have wd_mult=1.0"
+    assert (
+        param_groups[1]['wd_mult'] == 0.0
+    ), "We expect the second param group to be the conv1 one, which should have wd_mult=0.0"
+    assert (
+        param_groups[2]['wd_mult'] == 0.0
+    ), "We expect the third param group to be the conv2 one, which should have wd_mult=0.0"
+    assert param_groups[1]['min_lr'] == 10
+    assert param_groups[1]['max_lr'] == 20
+    assert param_groups[2]['min_lr'] is None
+    assert param_groups[2]['max_lr'] == 0.01
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_with_standard_config_overrides(apply_wd_to_qk_layernorm: bool):
+    """In this test, we see if the standard config overrides are applied correctly."""
+
+    # Initialize the model with layernorm
+    net = Net()
+
+    config = OptimizerConfig(optimizer='adam', lr=0.01)
+    config_overrides = get_standard_config_overrides(config=config)
+    param_groups = _get_param_groups([net], config, config_overrides)
+
+    assert len(param_groups) == 2
+    p_set = set(net.parameters())
+
+    assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params'])
+    assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params'])
+    assert param_groups[0]['wd_mult'] == 0.0 or param_groups[1]['wd_mult'] == 0.0
+    assert param_groups[0]['wd_mult'] == 1.0 or param_groups[1]['wd_mult'] == 1.0
+    assert len(param_groups[0]['params']) > 0 and len(param_groups[1]['params']) > 0
+
+    # Both param groups should have 5 parameters.
+    # Param group A (wd_mult=1.0): conv1.weight, conv2.weight, fc1.weight, fc2.weight, fc3.weight
+    # Param group B (wd_mult=0.0): conv1.bias, conv2.bias, fc1.bias, fc2.bias, fc3.bias
+    assert len(param_groups[0]['params']) == 5, (
+        f"Expected 5 parameters in the first param group, "
+        f"but got {len(param_groups[0]['params'])}"
+    )
+    assert len(param_groups[1]['params']) == 5, (
+        f"Expected 5 parameters in the second param group, "
+        f"but got {len(param_groups[1]['params'])}"
+    )
+
+
+@patch('torch.distributed.get_world_size', return_value=1)
+@patch(
+    'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj)
+)
+def test_get_param_groups_appling_wd_to_qk_layernorm(apply_wd_to_qk_layernorm: bool):
+    """In this test, we see if the `apply_wd_to_qk_layernorm` config is applied correctly."""
+
+    # Initialize the model with layernorm
+    net = Net(add_layernorm=True)
+
+    config = OptimizerConfig(
+        optimizer='adam', lr=0.01, apply_wd_to_qk_layernorm=apply_wd_to_qk_layernorm
+    )
+    config_overrides = get_standard_config_overrides(config=config)
+    param_groups = _get_param_groups([net], config, config_overrides)
+
+    assert len(param_groups) == 2
+    p_set = set(net.parameters())
+
+    assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params'])
+    assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params'])
+    assert param_groups[0]['wd_mult'] == 1.0
+    assert param_groups[1]['wd_mult'] == 0.0
+
+    # There are two param groups, having 7, and 6 parameters respectively.
+    # Param group A (wd_mult=1.0): conv1.weight, conv2.weight, fc1.weight, fc2.weight, fc3.weight,
+    #    q_layernorm.weight, k_layernorm.weight
+    # Param group B (wd_mult=0.0): conv1.bias, conv2.bias, fc1.bias, fc2.bias, fc3.bias,
+    #    layernorm.weight
+    assert len(param_groups[0]['params']) == 7, (
+        f"Expected 5 parameters in the first param group, "
+        f"but got {len(param_groups[0]['params'])}"
+    )
+    assert len(param_groups[1]['params']) == 6, (
+        f"Expected 6 parameters in the second param group, "
+        f"but got {len(param_groups[1]['params'])}"
+    )
+
+
 def test_chained_optimizer():
     net = Net()
     optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01)
@@ -150,7 +385,7 @@ def test_precision_aware_fused_adam():
     "moment_dtype",
     [torch.float32, torch.float16, torch.bfloat16, torch.uint8],
 )
-@pytest.mark.skip(reason="This test is flaky and needs to be fixed")
+@pytest.mark.skip(reason="inconsistent ci test runs resulting in NCCL errors")
 def test_precision_aware_optimizer(
     precision: str,
     main_params_dtype: torch.dtype,
@@ -245,24 +480,13 @@ def run_model(model, input, optim, fp8_recipe, fp8_recipe_settings):
         test_model, input, test_optim, fp8_recipe, fp8_recipe_settings
     )
 
-    rtol = 1e-3  # relative tolerance
-    atol = 1e-5  # absolute tolerance
+    rtol, atol = 1.6e-2, 1e-5
 
     # Compare grad norms - allow small difference due to precision
-    rel_diff = abs(test_grad_norm - baseline_grad_norm) / (
-        abs(baseline_grad_norm) + 1e-7  # avoid div by 0
-    )
-    abs_diff = abs(test_grad_norm - baseline_grad_norm)
-    assert (
-        rel_diff <= rtol or abs_diff <= atol
-    ), f"Grad norm mismatch: baseline={baseline_grad_norm}, test={test_grad_norm}, rel_diff={rel_diff}, abs_diff={abs_diff}"
+    torch.testing.assert_close(test_grad_norm, baseline_grad_norm, atol=atol, rtol=rtol)
 
     # Compare losses - allow small difference due to precision
-    loss_rel_diff = abs(test_loss - baseline_loss) / (abs(baseline_loss) + 1e-7)
-    loss_abs_diff = abs(test_loss - baseline_loss)
-    assert (
-        loss_rel_diff <= rtol or loss_abs_diff <= atol
-    ), f"Loss mismatch: baseline={baseline_loss}, test={test_loss}, rel_diff={loss_rel_diff}, abs_diff={loss_abs_diff}"
+    torch.testing.assert_close(test_loss, baseline_loss, atol=atol, rtol=rtol)
 
     # Save and reload state dict for the test model
     state_dict = test_optim.state_dict()
@@ -302,7 +526,13 @@ def test_optim_sharded_state_dict(use_distributed_optimizer: bool, precision: st
     optim = get_megatron_optimizer(optimizer_config, [model])
 
     model_sharded_state_dict = model.sharded_state_dict()
-    sharded_state_dict = optim.sharded_state_dict(model_sharded_state_dict)
+    metadata = {'distrib_optim_sharding_type': 'fully_reshardable'}
+    if precision == 'bf16' or use_distributed_optimizer:
+        sharded_state_dict = optim.sharded_state_dict(
+            model_sharded_state_dict, metadata=metadata, is_loading=True
+        )
+    else:
+        sharded_state_dict = optim.sharded_state_dict(model_sharded_state_dict)
 
     if 'optimizer' in sharded_state_dict and 'state' in sharded_state_dict['optimizer']:
         assert (
diff --git a/tests/unit_tests/test_optimizer_cpu_offloading.py b/tests/unit_tests/test_optimizer_cpu_offloading.py
index 1c367100dab..33febbb3eb0 100644
--- a/tests/unit_tests/test_optimizer_cpu_offloading.py
+++ b/tests/unit_tests/test_optimizer_cpu_offloading.py
@@ -39,6 +39,28 @@ def forward(self, x):
         return x
 
 
+class BigNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 2048)
+        self.fc2 = nn.Linear(2048, 8192)
+        self.fc3 = nn.Linear(8192, 2048)
+        self.fc4 = nn.Linear(2048, 100)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = torch.flatten(x, 1)  # flatten all dimensions except batch
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        x = self.fc4(x)
+        return x
+
+
 def setup_seed(seed):
     random.seed(seed)  # Set Python's built-in random seed
     np.random.seed(seed)  # Set NumPy's random seed
@@ -139,3 +161,95 @@ def test_multi_device_hybrid_optimizer(
         assert torch.allclose(
             v, ref_params[k], atol=1e-03
         ), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}"
+
+
+@pytest.mark.skipif(
+    torch.__version__ < '2.3.0',
+    reason=(
+        "Requires PyTorch 2.3.0 or higher, lower versions of pytorch have "
+        "misaligned optimizer accuracy for CPU and GPU."
+    ),
+)
+@pytest.mark.parametrize('n_steps', [1, 10])
+@pytest.mark.parametrize('offload_fraction', [1, 0.5, 0])
+@pytest.mark.parametrize('optimizer', ['adam', 'sgd'])
+@pytest.mark.parametrize('with_param_groups', [False, True])
+def test_overlap_cpu_optimizer_d2h_h2d_sync_correctness(
+    with_param_groups, optimizer, offload_fraction, n_steps
+):
+    setup_seed(42)
+    net1 = BigNet().cuda()
+    net2 = BigNet().cuda()
+    net2.load_state_dict(net1.state_dict())
+    base_lr = 1e-3
+    params = list(net1.parameters())
+    ref_params = list(net2.parameters())
+    if with_param_groups:
+        param_groups = [
+            {"params": params[: len(params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
+            {"params": params[len(params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
+        ]
+        params = param_groups
+        ref_param_groups = [
+            {"params": ref_params[: len(ref_params) // 2], "wd_mult": 1.0, "lr_mult": 1e-4},
+            {"params": ref_params[len(ref_params) // 2 :], "wd_mult": 0.0, "lr_mult": 2e-4},
+        ]
+        ref_params = ref_param_groups
+
+    if optimizer == 'adam':
+        cls_kwargs = dict(cpu_optimizer_cls=Adam, gpu_optimizer_cls=GPUAdam)
+    else:
+        cls_kwargs = dict(cpu_optimizer_cls=SGD, gpu_optimizer_cls=GPUSGD)
+
+    hdo = HybridDeviceOptimizer(
+        params,
+        offload_fraction=offload_fraction,
+        lr=base_lr,
+        overlap_cpu_optimizer_d2h_h2d=True,
+        **cls_kwargs,
+    )
+
+    ref_optimizer = cls_kwargs['gpu_optimizer_cls'](ref_params, lr=base_lr)
+
+    # 1. run step on optimizer, make sure there is state generated
+    assert len(hdo.state_dict()["state"]) == 0  # state is empty
+    input = torch.randn(1, 3, 32, 32).cuda()
+    output = net1(input)
+    output.sum().backward()
+    hdo.step()
+    output = net2(input)
+    output.sum().backward()
+    ref_optimizer.step()
+    # PyTorch SGD will not generate state
+    if optimizer != 'sgd':
+        assert len(hdo.state_dict()["state"]) != 0
+
+    # 2. check the state is on right device
+    if optimizer == 'adam':
+        first_param_id = hdo.state_dict()["param_groups"][0]["params"][0]
+        last_param_id = hdo.state_dict()["param_groups"][-1]["params"][-1]
+        if offload_fraction > 0:
+            assert not hdo.state_dict()["state"][first_param_id]["exp_avg"].is_cuda
+        if offload_fraction < 1:
+            assert hdo.state_dict()["state"][last_param_id]["exp_avg"].is_cuda
+
+    inputs = [torch.randn(1, 3, 32, 32).cuda() for _ in range(1, n_steps)]
+    for i in range(1, n_steps):
+        output = net1(inputs[i - 1])
+        output.sum().backward()
+        hdo.step()
+
+    for i in range(1, n_steps):
+        output = net2(inputs[i - 1])
+        output.sum().backward()
+        ref_optimizer.step()
+
+    params = net1.state_dict()
+    ref_params = net2.state_dict()
+    for k, v in params.items():
+        assert (v.isnan() == ref_params[k].isnan()).all()
+        torch.nan_to_num_(v, 0)
+        torch.nan_to_num_(ref_params[k], 0)
+        assert torch.allclose(
+            v, ref_params[k], atol=1e-03
+        ), f"Weight {k} value mismatch, max error: {(v - ref_params[k]).abs().max()}"
diff --git a/tests/unit_tests/test_optimizer_param_scheduler.py b/tests/unit_tests/test_optimizer_param_scheduler.py
index 9b781694546..670ca92c992 100644
--- a/tests/unit_tests/test_optimizer_param_scheduler.py
+++ b/tests/unit_tests/test_optimizer_param_scheduler.py
@@ -1,10 +1,13 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import math
 from unittest.mock import MagicMock
 
 import pytest
 
-from megatron.core.optimizer_param_scheduler import (  # Adjust import according to your module path
+from megatron.core.optimizer_param_scheduler import (
     OptimizerParamScheduler,
+    get_canonical_lr_for_logging,
 )
 
 
@@ -182,6 +185,51 @@ def test_step_function(mock_optimizer):
     assert math.isclose(param_group['weight_decay'], 0.01, rel_tol=1e-5)
 
 
+def test_step_updates_empty_param_groups():
+    """Empty param groups (rank-alignment stubs) must still receive lr updates.
+
+    get_canonical_lr_for_logging reads lr from default_config groups regardless
+    of whether they hold parameters, so step() must not skip them.
+    """
+    optimizer = MagicMock()
+    # lr and weight_decay are set by the scheduler's step() method
+    optimizer.param_groups = [
+        # Non-default group with its own max_lr override (lr will differ from the canonical schedule)
+        {'params': [1, 2], "min_lr": 0.001, "max_lr": 0.2, "default_config": False},
+        # Model parallelism may leave default_config groups empty on some ranks
+        {'params': [], "wd_mult": 0.0, 'default_config': True},
+    ]
+    scheduler = OptimizerParamScheduler(
+        optimizer=optimizer,
+        init_lr=0.01,
+        max_lr=0.1,
+        min_lr=0.001,
+        lr_warmup_steps=100,
+        lr_decay_steps=1000,
+        lr_decay_style='linear',
+        start_wd=0.0,
+        end_wd=0.1,
+        wd_incr_steps=1000,
+        wd_incr_style='linear',
+    )
+
+    scheduler.step(100)
+    non_empty, empty = optimizer.param_groups
+
+    # Verify learning rates: at step 100 warmup is complete so lr == max_lr
+    assert "lr" in non_empty, "non-empty param group must have an lr"
+    assert "lr" in empty, "empty param group must have an lr"
+    assert non_empty['lr'] == pytest.approx(0.2)  # warmup complete → this group's max_lr override
+    assert empty['lr'] == pytest.approx(0.1)  # warmup complete → scheduler's default max_lr
+    assert get_canonical_lr_for_logging(optimizer.param_groups) == pytest.approx(0.1)
+
+    # Verify weight decay: linear from 0.0 to 0.1 over 1000 steps → base wd is 0.01 at step 100
+    assert "weight_decay" in non_empty, "non-empty param group must have a weight decay"
+    assert "weight_decay" in empty, "empty param group must have a weight decay"
+    assert non_empty['weight_decay'] == pytest.approx(0.01)  # base wd, no wd_mult override
+    assert empty['weight_decay'] == pytest.approx(0.0)  # base wd * wd_mult=0.0
+
+
 def test_state_dict(mock_optimizer):
     scheduler = OptimizerParamScheduler(
         optimizer=mock_optimizer,
@@ -249,3 +297,54 @@ def test_load_state_dict(mock_optimizer):
     assert scheduler.end_wd == 0.2
     assert scheduler.wd_incr_steps == 500
     assert scheduler.wd_incr_style == 'cosine'
+
+
+# ── get_canonical_lr_for_logging tests ──────────────────────────────────────
+#
+# Returns the lr of the first default_config=True param group.  In practice
+# the scheduler always sets a valid lr on every group (including empty
+# rank-alignment stubs), so a default_config=True group with a float lr is
+# always present.
+
+
+class TestGetCanonicalLrForLogging:
+    """Tests for get_canonical_lr_for_logging."""
+
+    def test_single_default_config_group(self):
+        """Typical case: one default_config group with a valid lr."""
+        param_groups = [{'lr': 0.05, 'default_config': True}]
+        assert get_canonical_lr_for_logging(param_groups) == 0.05
+
+    def test_default_config_with_non_default_groups(self):
+        """default_config group is returned even when non-default groups are present."""
+        param_groups = [{'lr': 0.001, 'default_config': True}, {'lr': 0.999}]
+        assert get_canonical_lr_for_logging(param_groups) == 0.001
+
+    def test_default_config_after_non_default(self):
+        """default_config group is found even when it is not first in the list."""
+        param_groups = [{'lr': 0.50}, {'lr': 0.01, 'default_config': True}]
+        assert get_canonical_lr_for_logging(param_groups) == 0.01
+
+    def test_no_default_config_groups(self):
+        """Returns None when no group has default_config=True."""
+        param_groups = [{'lr': 0.50}, {'lr': 0.01}]
+        assert get_canonical_lr_for_logging(param_groups) is None
+
+    def test_missing_lr_key(self):
+        """Returns None (not KeyError) when the default_config group lacks an 'lr' key."""
+        param_groups = [{'default_config': True}]
+        assert get_canonical_lr_for_logging(param_groups) is None
+
+    def test_empty_param_groups(self):
+        """Returns None when there are no param groups at all."""
+        assert get_canonical_lr_for_logging([]) is None
+
+    def test_no_default_config_no_lr(self):
+        """Returns None when groups exist but none are default_config."""
+        param_groups = [{'params': []}]
+        assert get_canonical_lr_for_logging(param_groups) is None
+
+    def test_lr_zero_is_valid(self):
+        """lr=0.0 is a legitimate value, not to be confused with None."""
+        param_groups = [{'lr': 0.0, 'default_config': True}]
+        assert get_canonical_lr_for_logging(param_groups) == 0.0
diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py
index 51d6f94a161..40dd4a0083c 100644
--- a/tests/unit_tests/test_parallel_state.py
+++ b/tests/unit_tests/test_parallel_state.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
+from math import log2
+
 import pytest
 import torch
 
@@ -500,3 +502,60 @@ def golden_rank_result_from_past_code(
     assert expert_dp_group == expert_rank_generator.get_ranks(
         "dp"
     ), f"{expert_dp_group} != {expert_rank_generator.get_ranks('dp')}."
+
+
+@pytest.mark.parametrize(
+    "world_size, tp_size, cp_size, dp_size",
+    [(8, 1, 2, 4), (8, 1, 1, 8)],  # 8 GPUs, 1 TP, 2 CP, 4 DP  # 8 GPUs, 1 TP, 1 CP, 8 DP
+)
+def test_hybrid_dp_cp_groups(world_size, tp_size, cp_size, dp_size):
+    """
+    Test that hybrid DPxCP groups are created correctly.
+    """
+    Utils.destroy_model_parallel()
+
+    # Skip if world size doesn't match
+    actual_world_size = torch.cuda.device_count()
+    if actual_world_size != world_size:
+        pytest.skip(f"Test requires world_size={world_size}, but got {actual_world_size}")
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=tp_size,
+        context_parallel_size=cp_size,
+        hybrid_context_parallel=True,
+    )
+
+    dp_cp_size = ps.get_data_parallel_world_size(with_context_parallel=True)
+    group_sizes = [2**i for i in range(int(log2(dp_cp_size)))][1:]
+    for group_size in group_sizes:
+        group = ps.get_hybrid_data_context_parallel_groups(group_size=group_size)
+        assert group.size() == group_size
+
+    Utils.destroy_model_parallel()
+
+
+def test_separate_all_gather_group():
+    """Test separate all-gather group for improved communication overlap."""
+    # Test without creating AG group (default)
+    Utils.initialize_model_parallel(context_parallel_size=world_size, create_all_gather_group=False)
+    assert not ps.has_separate_all_gather_group()
+    assert ps._DATA_PARALLEL_GROUP_WITH_CP_AG is None
+    Utils.destroy_model_parallel()
+
+    # Test with creating AG group
+    Utils.initialize_model_parallel(context_parallel_size=world_size, create_all_gather_group=True)
+    assert ps.has_separate_all_gather_group()
+    assert ps._DATA_PARALLEL_GROUP_WITH_CP_AG is not None
+
+    # Verify it returns the correct group
+    ag_group = ps.get_data_parallel_group(with_context_parallel=True, independent_all_gather=True)
+    regular_group = ps.get_data_parallel_group(
+        with_context_parallel=True, independent_all_gather=False
+    )
+    assert ag_group is not None
+    assert regular_group is not None
+    # They should have the same ranks but different communicators
+    ag_ranks = torch.distributed.get_process_group_ranks(ag_group)
+    regular_ranks = torch.distributed.get_process_group_ranks(regular_group)
+    assert ag_ranks == regular_ranks
+
+    Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/test_rl_utils.py b/tests/unit_tests/test_rl_utils.py
deleted file mode 100644
index 773ad0f3669..00000000000
--- a/tests/unit_tests/test_rl_utils.py
+++ /dev/null
@@ -1,656 +0,0 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-
-from unittest.mock import patch
-
-import torch
-
-from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
-from megatron.core.models.common.language_module.language_module import LanguageModule
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
-from megatron.core.models.gpt.gpt_model import GPTModel
-from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.transformer import TransformerConfig
-from megatron.core.transformer.module import Float16Module
-from megatron.rl import rl_utils
-from megatron.rl.agent.api import TokenRollout
-from megatron.training import arguments, global_vars
-from tests.unit_tests.test_utilities import Utils
-
-BATCH = 2
-SEQ = 4
-VOCAB = 754
-
-
-class MockModel(LanguageModule):
-    def __init__(self, batch=BATCH, seq=SEQ, vocab=VOCAB):
-        self.batch = batch
-        self.seq = seq
-        self.vocab = vocab
-        self.config = TransformerConfig(num_attention_heads=1, num_layers=1)
-
-    def __call__(self, x, position_ids, attention_mask, **kwargs):
-        del position_ids
-        del attention_mask
-        batch, seq = x.shape
-        mock_model_outputs = torch.ones((batch, seq, self.vocab), device=x.device)
-        return mock_model_outputs
-
-    def load_state_dict(self, params):
-        del params
-
-    def train(self, mode=True):
-        del mode
-
-    def state_dict(self):
-        return {}
-
-
-class MockTokenizer:
-    def __init__(self):
-        self.pad = 42
-        self.eod = 43
-        self.vocab_size = VOCAB
-        self.bos = None
-
-    def detokenize(self, tokens):
-        return [str(tok) for tok in tokens]
-
-
-def test_get_logprobs():
-    """Test that getting logprobs at least does not crash."""
-    # We use args inside of get_logprobs, we need to initialize them.
-    args = arguments.parse_args(ignore_unknown_args=True)
-    global_vars.set_args(args)
-
-    tokens = torch.ones((BATCH, SEQ), dtype=torch.long)
-    logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None)
-    # We chop off 1 element from the sequence dimension.
-    assert logprobs.shape == (BATCH, SEQ - 1)
-    # As we return ones as logits, all logprobs should be the same.
-    assert torch.all(logprobs == logprobs[0, 0]).item()
-
-
-def test_get_logprobs_with_sequence_packing():
-    """Test that getting logprobs at least does not crash."""
-    # We use args inside of get_logprobs, we need to initialize them.
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'rl_use_sequence_packing', True)
-    global_vars.set_args(args)
-
-    tokens = torch.ones((BATCH, SEQ), dtype=torch.long)
-    logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None)
-    # We chop off 1 element from the sequence dimension.
-    assert logprobs.shape == (BATCH, SEQ - 1)
-    # As we return ones as logits, all logprobs should be the same.
-    assert torch.all(logprobs == logprobs[0, 0]).item()
-
-
-def test_prepare_trajectories():
-    # Make sure sequence packing is disabled for this test
-    import megatron.training.global_vars as global_vars
-
-    old_args = global_vars.get_args() if global_vars.get_args() is not None else None
-
-    # Create minimal args without sequence packing
-    args = type('Args', (), {})()
-    args.rl_use_sequence_packing = False
-    args.rl_inference_logprobs_is_correction = True
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    r1 = TokenRollout(
-        trajectory=[1, 2, tokenizer.eod],
-        reward=3.14,
-        generation_mask=[False, True, True],
-        logprobs=[0.1, 0.2, 0.3],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    r2 = TokenRollout(
-        trajectory=[1, 2, tokenizer.eod],
-        reward=0.14,
-        generation_mask=[False, True, True],
-        logprobs=[0.1, 0.2, 0.3],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    rollouts = [[r1, r2]]
-    seq_len = 7
-
-    trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len)
-
-    # Check that inference logprobs are being returned.
-    torch.testing.assert_close(inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3]))
-    torch.testing.assert_close(inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3]))
-
-    expected_mask = torch.tensor(
-        [
-            [False, True, True, False, False, False, False],
-            [False, True, True, False, False, False, False],
-        ]
-    )
-    torch.testing.assert_close(genmask, expected_mask)
-
-    expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 43, 42, 42, 42, 42]])
-    torch.testing.assert_close(trajs, expected_trajs)
-
-
-def test_prepare_trajectories_with_packing():
-    """Test that rollouts data is properly prepared with sequence packing enabled."""
-    # Initialize args for sequence packing
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'micro_batch_size', 1)
-    setattr(args, 'global_batch_size', 1)
-    setattr(args, 'rl_use_sequence_packing', True)
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    r1 = TokenRollout(
-        trajectory=[1, 2, tokenizer.eod],
-        reward=3.14,
-        generation_mask=[False, True, True],
-        logprobs=[0.1, 0.2, 0.3],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    r2 = TokenRollout(
-        trajectory=[1, 2, 3, tokenizer.eod],
-        reward=0.14,
-        generation_mask=[False, True, True, True],
-        logprobs=[0.1, 0.2, 0.3, -1.2],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    rollouts = [[r1, r2]]
-    seq_len = 7
-
-    trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len)
-
-    # With sequence packing, inference logprobs should be padded to same length
-    assert isinstance(inference_logprobs, torch.Tensor)
-    assert inference_logprobs.shape == (2, 7)  # 2 sequences, each padded to seq_len
-
-    # Check values (padded with zeros)
-    torch.testing.assert_close(
-        inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3, 0.0, 0.0, 0.0, 0.0])
-    )
-    torch.testing.assert_close(
-        inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3, -1.2, 0.0, 0.0, 0.0])
-    )
-
-    expected_mask = torch.tensor(
-        [
-            [False, True, True, False, False, False, False],
-            [False, True, True, True, False, False, False],
-        ]
-    )
-    torch.testing.assert_close(genmask, expected_mask)
-
-    expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 3, 43, 42, 42, 42]])
-    torch.testing.assert_close(trajs, expected_trajs)
-
-
-def test_grpo_loss_calculation_all_pi_eq():
-    # All policies are equal: clamping is inactive, ratios are ones.
-    current_logprobs = torch.ones(BATCH, SEQ)
-    old_logprobs = torch.ones(BATCH, SEQ)
-    ref_logprobs = torch.ones(BATCH, SEQ)
-    advantages = torch.zeros(BATCH)
-    loss, kl_term, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss(
-        current_logprobs=current_logprobs,
-        old_logprobs=old_logprobs,
-        ref_logprobs=ref_logprobs,
-        advantages=advantages,
-        clamp_eps_lower=0.1,
-        clamp_eps_upper=0.1,
-        kl_beta=0.1,
-        entropy_weight=0.0,
-    )
-    torch.testing.assert_close(loss, torch.zeros_like(loss))
-    torch.testing.assert_close(kl_term, torch.zeros_like(kl_term))
-    torch.testing.assert_close(ratios, torch.ones_like(ratios))
-    torch.testing.assert_close(entropy_term, -torch.ones_like(ratios) * torch.e)
-
-
-def test_grpo_loss_calculation_2x_ratios():
-    # All policies are equal: clamping is inactive, ratios are ones.
-    current_logprobs = torch.ones(BATCH, SEQ)
-    old_logprobs = torch.ones(BATCH, SEQ) - torch.log(torch.Tensor([2]))
-    ref_logprobs = torch.ones(BATCH, SEQ)
-    advantages = torch.ones(BATCH)
-    loss, kl_term, ratios, _, _, _ = rl_utils.calculate_grpo_loss(
-        current_logprobs=current_logprobs,
-        old_logprobs=old_logprobs,
-        ref_logprobs=ref_logprobs,
-        advantages=advantages,
-        clamp_eps_lower=2.1,
-        clamp_eps_upper=2.1,
-        kl_beta=0.0,
-        entropy_weight=0.0,
-    )
-    # Clamping does not affect us, as 2.1 [eps] > 2 [ratio].
-    # kl_beta = 0 -> we only have the non-kl term of the loss active.
-    torch.testing.assert_close(loss, -torch.ones_like(loss) * 2)
-    # pi and pi_{ref} are the same here.
-    torch.testing.assert_close(kl_term, torch.zeros_like(kl_term))
-    # Current probs are 2x more probable than old pi.
-    torch.testing.assert_close(ratios, torch.ones_like(ratios) * 2)
-
-
-def test_entropy_calculation():
-    # All policies are equal: clamping is inactive, ratios are ones.
-    current_logprobs = torch.ones(BATCH, SEQ)
-    old_logprobs = torch.ones(BATCH, SEQ)
-    ref_logprobs = torch.ones(BATCH, SEQ)
-    advantages = torch.zeros(BATCH)
-    loss, _, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss(
-        current_logprobs=current_logprobs,
-        old_logprobs=old_logprobs,
-        ref_logprobs=ref_logprobs,
-        advantages=advantages,
-        clamp_eps_lower=0.1,
-        clamp_eps_upper=0.1,
-        kl_beta=0.0,
-        entropy_weight=1.0,
-    )
-    torch.testing.assert_close(loss, torch.ones_like(ratios) * torch.e)
-    torch.testing.assert_close(entropy_term, -torch.ones_like(ratios) * torch.e)
-
-
-def test_grpo_loss_truncation():
-
-    # All ratios are 2
-    _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
-        current_logprobs=torch.ones(BATCH, SEQ),
-        old_logprobs=0.5 * torch.ones(BATCH, SEQ),
-        ref_logprobs=torch.ones(BATCH, SEQ),
-        advantages=torch.zeros(BATCH),
-        clamp_eps_lower=0.1,
-        clamp_eps_upper=0.1,
-        kl_beta=0.1,
-        entropy_weight=0.0,
-    )
-    assert truncated_from_above.float().mean() == 1
-    assert truncated_from_below.float().sum() == 0
-
-    # All ratios are 0.01
-    _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
-        current_logprobs=0.01 * torch.ones(BATCH, SEQ),
-        old_logprobs=torch.ones(BATCH, SEQ),
-        ref_logprobs=torch.ones(BATCH, SEQ),
-        advantages=torch.zeros(BATCH),
-        clamp_eps_lower=0.1,
-        clamp_eps_upper=0.1,
-        kl_beta=0.1,
-        entropy_weight=0.0,
-    )
-    assert truncated_from_above.float().sum() == 0
-    assert truncated_from_below.float().mean() == 1
-
-    current_logprobs = torch.tensor([[1.0, 1.0], [1.0, 1.0]])
-    old_logprobs = torch.tensor([[0.5, 2.0], [0.05, 1.0]])
-    _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss(
-        current_logprobs=current_logprobs,
-        old_logprobs=old_logprobs,
-        ref_logprobs=old_logprobs,
-        advantages=torch.zeros(BATCH),
-        clamp_eps_lower=0.1,
-        clamp_eps_upper=0.1,
-        kl_beta=0.1,
-        entropy_weight=0.0,
-    )
-    # ratios: [[2., 0.5],[20., 1.]]
-    torch.testing.assert_close(truncated_from_above, torch.tensor([[True, False], [True, False]]))
-    torch.testing.assert_close(truncated_from_below, torch.tensor([[False, True], [False, False]]))
-
-
-@patch('megatron.rl.rl_utils.mpu')
-def test_prepare_data_for_update(mock_mpu):
-    """Test that getting logprobs at least does not crash."""
-    mock_mpu.get_expert_data_parallel_world_size.return_value = 0
-    # We use args inside of get_logprobs, we need to initialize them.
-
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'data_parallel_size', 1)
-    setattr(args, 'micro_batch_size', 2)
-    setattr(args, 'global_batch_size', 2)
-    setattr(args, 'seq_length', 4)
-    setattr(args, 'curr_iteration', 1)
-    global_vars.unset_global_variables()
-    global_vars.set_global_variables(args, build_tokenizer=False)
-
-    model = MockModel()
-    tokenizer = MockTokenizer()
-
-    r1 = TokenRollout(
-        trajectory=[1, 2, 3],
-        reward=3.14,
-        generation_mask=[False, True, True],
-        logprobs=[0.1, 0.2, 0.3],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    r2 = TokenRollout(
-        trajectory=[1, 2, 3, 4],
-        reward=0.14,
-        generation_mask=[False, True, True, True],
-        logprobs=[0.1, 0.2, 0.3, -1.2],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    rollouts = [[r1, r2]]
-    try:
-        data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer)
-    except AssertionError as e:
-        # We expect trajectories to come padded there.
-        assert str(e).startswith('Rollout is not the correct length')
-
-    r1 = TokenRollout(
-        trajectory=torch.Tensor([1, 2, 3, tokenizer.eod]).cuda(),
-        reward=3.14,
-        generation_mask=torch.Tensor([False, True, True, True]).cuda(),
-        logprobs=torch.Tensor([-0.2, -0.3, -3.2]).cuda(),
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    r2 = TokenRollout(
-        trajectory=torch.Tensor([1, 2, 234, tokenizer.eod]).cuda(),
-        reward=0.14,
-        generation_mask=torch.Tensor([False, True, True, True]).cuda(),
-        logprobs=torch.Tensor([-0.2, -0.3, -1.2]),
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    rollouts = [[r1, r2]]
-    data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer)
-
-    _, _, old_logprobs, _, _, _, _ = next(data_iter)
-    # All logits are ones in the MockModel.
-    # All probabilities should be uniform.
-    torch.testing.assert_close(old_logprobs.exp(), torch.ones_like(old_logprobs) / VOCAB)
-
-
-def test_sequence_packing_basic():
-    """Test basic sequence packing functionality."""
-    # Initialize args as required by SequencePacker
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'seq_length', 16)
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    bin_size = 16
-    packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
-
-    # Create test sequences of varying lengths, all padded to same length
-    max_len = 5
-    sequences = [
-        torch.cat(
-            [
-                torch.tensor([1, 2, 3, tokenizer.eod]),
-                torch.full((1,), tokenizer.pad, dtype=torch.long),
-            ]
-        ),  # length 4 -> 5
-        torch.cat(
-            [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)]
-        ),  # length 3 -> 5
-        torch.tensor([6, 7, 8, 9, tokenizer.eod]),  # length 5
-        torch.cat(
-            [torch.tensor([10, tokenizer.eod]), torch.full((3,), tokenizer.pad, dtype=torch.long)]
-        ),  # length 2 -> 5
-    ]
-
-    generation_masks = torch.tensor(
-        [
-            [False, True, True, True, False],  # Matches padded length
-            [False, True, True, False, False],
-            [False, True, True, True, True],
-            [False, True, False, False, False],
-        ]
-    )
-
-    rewards = torch.tensor([1.0, 2.0, 3.0, 4.0])
-
-    # Pack sequences
-    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
-        packer.pack_sequences(sequences, generation_masks)
-    )
-
-    # Verify packed data structure
-    assert packed_trajs is not None
-    assert packed_position_ids is not None
-    assert packed_attention_mask is not None
-    assert packed_loss_mask is not None
-    assert packing_info is not None
-
-    # Check that sequences fit in bins properly
-    # The packer trims sequences to their actual length (removing padding)
-    # Actual lengths: 4, 3, 5, 2 = 14 total tokens
-    # With bin_size=16, this should fit in 1 bin
-    assert packed_trajs.shape[0] >= 1  # At least one bin
-    assert packed_trajs.shape[1] == bin_size
-
-    # Verify position_ids are correct
-    for bin_idx in range(packed_trajs.shape[0]):
-        # Check that position_ids reset for each sequence in the bin
-        for i in range(packed_trajs.shape[1]):
-            if i == 0 or packed_trajs[bin_idx, i - 1] == tokenizer.eod:
-                # Start of a new sequence
-                if packed_trajs[bin_idx, i] != tokenizer.pad:
-                    assert packed_position_ids[bin_idx, i] == 0
-
-
-def test_sequence_packing_with_generation_masks():
-    """Test sequence packing with generation masks."""
-    # Initialize args as required by SequencePacker
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'seq_length', 20)
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    bin_size = 20
-    packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
-
-    # Create test data with generation masks
-    sequences = [torch.tensor([1, 2, 3, tokenizer.eod]), torch.tensor([4, 5, 6, 7, tokenizer.eod])]
-
-    # Pad sequences to same length for stacking
-    max_len = max(len(s) for s in sequences)
-    padded_sequences = []
-    for seq in sequences:
-        padded = torch.cat([seq, torch.full((max_len - len(seq),), tokenizer.pad, dtype=seq.dtype)])
-        padded_sequences.append(padded)
-
-    generation_masks = torch.tensor(
-        [
-            [False, True, True, True, False],  # Padded to match max_len
-            [False, True, True, True, True],
-        ]
-    )
-
-    # Pack sequences
-    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
-        packer.pack_sequences(padded_sequences, generation_masks)
-    )
-
-    # Verify packed tensors
-    assert packed_trajs.shape[0] == 1  # One bin
-    assert packed_trajs.shape[1] == bin_size
-
-    # Check that loss mask is set correctly for generation tokens
-    # The loss mask should be 1 for generation tokens and 0 for padding/prompt
-
-
-def test_sequence_packing_empty_bins():
-    """Test that empty bins are created correctly."""
-    # Initialize args if needed
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'seq_length', 8)
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    bin_size = 8
-    num_empty_bins = 3
-
-    # Create a simple packed data structure
-    packed_trajs = torch.tensor(
-        [[1, 2, 3, tokenizer.eod, tokenizer.pad, tokenizer.pad, tokenizer.pad, tokenizer.pad]]
-    )
-    packed_position_ids = torch.tensor([[0, 1, 2, 3, 0, 0, 0, 0]])
-    packed_loss_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float)
-    packed_attention_mask = torch.ones(1, bin_size, bin_size)  # Simple full attention mask
-
-    # Create empty bins
-    empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info = (
-        rl_utils.create_empty_bins(
-            num_empty_bins=num_empty_bins,
-            bin_size=bin_size,
-            packed_trajs=packed_trajs,
-            packed_position_ids=packed_position_ids,
-            packed_loss_mask=packed_loss_mask,
-            packed_attention_mask=packed_attention_mask,
-            tokenizer=tokenizer,
-        )
-    )
-
-    # Verify shapes
-    assert empty_trajs.shape[0] == num_empty_bins
-    assert empty_trajs.shape[1] == bin_size
-
-    # Check that empty bins are filled with padding
-    for i in range(num_empty_bins):
-        assert torch.all(empty_trajs[i] == tokenizer.pad)
-        assert torch.all(empty_position_ids[i] == 0)
-        assert torch.all(empty_loss_mask[i] == 0)
-
-    # Verify packing info for empty bins
-    assert len(empty_packing_info) == num_empty_bins
-    for info in empty_packing_info:
-        assert len(info['bin_seq_indices']) == 0  # No sequences in empty bins
-        assert len(info['seq_starts']) == 0  # No sequence starts
-
-
-def test_prepare_trajectories_with_sequence_packing():
-    """Test prepare_trajectories with sequence packing enabled."""
-    # Set up args with sequence packing
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'rl_use_sequence_packing', True)
-    setattr(args, 'rl_sequence_packing_bin_size', 16)
-    setattr(args, 'data_parallel_size', 1)
-    setattr(args, 'micro_batch_size', 2)
-    setattr(args, 'global_batch_size', 2)
-    setattr(args, 'seq_length', 16)
-    setattr(args, 'curr_iteration', 1)
-    global_vars.unset_global_variables()
-    global_vars.set_global_variables(args, build_tokenizer=False)
-
-    tokenizer = MockTokenizer()
-
-    # Create rollouts of varying lengths
-    r1 = TokenRollout(
-        trajectory=[1, 2, tokenizer.eod],
-        reward=3.14,
-        generation_mask=[False, True, True],
-        logprobs=[0.1, 0.2, 0.3],
-        env_id='MEGAENV',
-        problem_id="1",
-    )
-    r2 = TokenRollout(
-        trajectory=[4, 5, 6, 7, tokenizer.eod],
-        reward=0.14,
-        generation_mask=[False, True, True, True, True],
-        logprobs=[0.4, 0.5, 0.6, 0.7, 0.8],
-        env_id='MEGAENV',
-        problem_id="2",
-    )
-    r3 = TokenRollout(
-        trajectory=[8, 9, tokenizer.eod],
-        reward=2.71,
-        generation_mask=[False, True, True],
-        logprobs=[0.9, 1.0, 1.1],
-        env_id='MEGAENV',
-        problem_id="3",
-    )
-
-    rollouts = [[r1, r2, r3]]
-    seq_len = 16
-
-    # Call prepare_trajectories with sequence packing
-    trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len)
-
-    # With sequence packing enabled but called from prepare_trajectories,
-    # it might still return individual sequences (not packed into bins yet)
-    # because the actual packing happens later in prepare_data_for_update
-    assert trajs.shape[0] == 3  # Three sequences
-    assert trajs.shape[1] == seq_len
-
-    # Verify that each sequence is properly padded
-    # Sequence 1: [1, 2, eod, pad] + padding
-    assert trajs[0, 0] == 1
-    assert trajs[0, 1] == 2
-    assert trajs[0, 2] == tokenizer.eod
-    assert trajs[0, 3] == tokenizer.pad
-
-    # Sequence 2: [4, 5, 6, 7, eod, pad] + padding
-    assert trajs[1, 0] == 4
-    assert trajs[1, 1] == 5
-    assert trajs[1, 4] == tokenizer.eod
-    assert trajs[1, 5] == tokenizer.pad
-
-
-def test_sequence_packing_integration():
-    """Simple integration test for sequence packing - just verifies the packing works."""
-    # Initialize minimal args needed for SequencePacker
-    args = arguments.parse_args(ignore_unknown_args=True)
-    setattr(args, 'seq_length', 16)
-    global_vars.set_args(args)
-
-    tokenizer = MockTokenizer()
-    bin_size = 16
-
-    # Test that we can pack sequences and get expected outputs
-    packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad)
-
-    # Create test data - need to pad to same length for stacking
-    max_len = 5
-    sequences = [
-        torch.cat(
-            [
-                torch.tensor([1, 2, 3, tokenizer.eod]),
-                torch.full((1,), tokenizer.pad, dtype=torch.long),
-            ]
-        ),  # length 4 -> 5
-        torch.cat(
-            [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)]
-        ),  # length 3 -> 5
-        torch.tensor([6, 7, 8, 9, tokenizer.eod]),  # length 5
-    ]
-    generation_masks = [
-        torch.tensor([False, True, True, True, False]),
-        torch.tensor([False, True, True, False, False]),
-        torch.tensor([False, True, True, True, True]),
-    ]
-
-    # Pack sequences
-    packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = (
-        packer.pack_sequences(sequences, generation_masks)
-    )
-
-    # Basic assertions
-    assert packed_trajs is not None
-    assert packed_trajs.shape[1] == bin_size  # Each bin should be bin_size
-    assert packed_position_ids.shape == packed_trajs.shape
-    assert packed_loss_mask.shape == packed_trajs.shape
-
-    # Verify the sequences are packed correctly
-    # Total length: 4 + 3 + 5 = 12, should fit in 1 bin
-    assert packed_trajs.shape[0] == 1
-
-    # The packer sorts sequences by length (descending), so order is: seq3 (len 5), seq1 (len 4), seq2 (len 3)
-    expected_start = torch.tensor(
-        [6, 7, 8, 9, tokenizer.eod, 1, 2, 3, tokenizer.eod, 4, 5, tokenizer.eod]
-    )
-    assert torch.all(packed_trajs[0, :12] == expected_start)
-
-    # Rest should be padding
-    assert torch.all(packed_trajs[0, 12:] == tokenizer.pad)
diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py
index 3d8f5d9c33c..556cef81407 100644
--- a/tests/unit_tests/test_tokenizer.py
+++ b/tests/unit_tests/test_tokenizer.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import base64
 import json
 from argparse import Namespace
@@ -41,7 +43,7 @@ def local_test_specs():
             tokenizer_model=f"{TOKENIZER_DIR}/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json",
             tiktoken_pattern="v2",
             tiktoken_num_special_tokens=1000,
-            tiktoken_special_tokens=["<unk>", "<s>", "</s>"],
+            tokenizer_special_tokens=["<unk>", "<s>", "</s>"],
         ),
         Namespace(
             rank=0,
@@ -52,7 +54,7 @@ def local_test_specs():
             tokenizer_model=f"{TOKENIZER_DIR}/multiMixV5_fix_default_500000_128k.vocab.json",
             tiktoken_pattern="v1",
             tiktoken_num_special_tokens=1000,
-            tiktoken_special_tokens=["<unk>", "<s>", "</s>"],
+            tokenizer_special_tokens=["<unk>", "<s>", "</s>"],
         ),
         Namespace(
             rank=0,
@@ -117,7 +119,7 @@ def gpt2_tiktok_vocab(tmp_path_factory):
         tokenizer_model=str(file_name),
         tiktoken_pattern="v1",
         tiktoken_num_special_tokens=1000,
-        tiktoken_special_tokens=["<unk>", "<s>", "</s>"],
+        tokenizer_special_tokens=["<unk>", "<s>", "</s>"],
     )
 
 
diff --git a/tests/unit_tests/test_training.py b/tests/unit_tests/test_training.py
index 953a80e0945..2fd85724826 100644
--- a/tests/unit_tests/test_training.py
+++ b/tests/unit_tests/test_training.py
@@ -1,8 +1,16 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+from collections import defaultdict
+from pathlib import Path
 from types import SimpleNamespace
 
+import torch
+
+from megatron.training.checkpointing import save_grads
 from megatron.training.global_vars import set_args
 from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding
 from megatron.training.training import build_train_valid_test_data_iterators
+from tests.unit_tests.dist_checkpointing import TempNamedDir
 from tests.unit_tests.test_utilities import Utils
 
 
@@ -26,6 +34,7 @@ def create_test_args():
     args.full_validation = False
     args.multiple_validation_sets = False
     args.perform_rl_step = False
+    args.phase_transition_iterations = None
 
     return args
 
@@ -71,3 +80,56 @@ def old_round_impl(after, multiple):
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
+
+
+class TestSaveGrads:
+    """Tests for the save_grads function."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_save_grads(self, tmp_path_dist_ckpt):
+        """Test that save_grads creates the correct directory structure and saves
+        state_dict correctly.
+
+        With TP=1, PP=1 on 8 GPUs, we have 8 DP ranks. Only the rank with
+        expert_data_parallel_rank==0 should save. All ranks verify the result.
+        """
+        save_dir = str(tmp_path_dist_ckpt / "test_save_grads")
+
+        with TempNamedDir(save_dir, sync=True) as save_dir:
+            # Create a mock state_dict with gradients (use deterministic values for reproducibility).
+            state_dict = defaultdict(dict)
+            state_dict["model_chunk0"]["layer.weight"] = torch.arange(16).reshape(4, 4).float()
+            state_dict["model_chunk0"]["layer.bias"] = torch.arange(4).float()
+
+            iteration = 100
+            grad_label = "wgrads"
+
+            # All ranks call save_grads, but only expert_data_parallel_rank==0 actually saves.
+            save_grads(save_dir, dict(state_dict), iteration, grad_label)
+
+            # Synchronize before checking results since only rank 0 saves.
+            torch.distributed.barrier()
+
+            # All ranks verify the file was created by rank 0.
+            expected_dir = Path(save_dir) / grad_label / f"iter_{iteration:07d}"
+            assert expected_dir.exists(), f"Expected directory {expected_dir} to exist"
+
+            expected_file = expected_dir / "mp_rank_00.pth"
+            assert expected_file.exists(), f"Expected file {expected_file} to exist"
+
+            # Verify saved content.
+            loaded = torch.load(expected_file)
+            assert "model_chunk0" in loaded
+            assert "layer.weight" in loaded["model_chunk0"]
+            assert "layer.bias" in loaded["model_chunk0"]
+            assert torch.equal(
+                loaded["model_chunk0"]["layer.weight"], state_dict["model_chunk0"]["layer.weight"]
+            )
+            assert torch.equal(
+                loaded["model_chunk0"]["layer.bias"], state_dict["model_chunk0"]["layer.bias"]
+            )
diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py
index f16f88f7865..39c78efb2b9 100644
--- a/tests/unit_tests/test_utilities.py
+++ b/tests/unit_tests/test_utilities.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import os
 from datetime import timedelta
 
@@ -27,8 +28,8 @@ def __init__(
 
 class Utils:
 
-    world_size = int(os.environ['WORLD_SIZE'])
-    rank = int(os.environ['LOCAL_RANK'])
+    world_size = int(os.environ.get('WORLD_SIZE', '1'))
+    rank = int(os.environ.get('LOCAL_RANK', '0'))
     inited = False
     store = None
 
diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py
index 282c4b8b2a4..fddde8dd36e 100644
--- a/tests/unit_tests/test_utils.py
+++ b/tests/unit_tests/test_utils.py
@@ -15,8 +15,10 @@
 import megatron.training.utils as training_util
 from megatron.core import config
 from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
 from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
 from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.moe.moe_layer import MoELayer
 from tests.unit_tests.test_utilities import Utils
 
 success_string = "hello,world"
@@ -238,7 +240,7 @@ def test_cross_check_param_hashes_across_dp_replicas():
 @pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 @pytest.mark.internal
-def test_param_norm(use_distributed_optimizer: bool):
+def test_param_norm_linear(use_distributed_optimizer: bool):
     world = int(os.getenv('WORLD_SIZE', '1'))
     rank = int(os.getenv('RANK', '0'))
 
@@ -286,6 +288,73 @@ def test_param_norm(use_distributed_optimizer: bool):
     _deinit_distributed()
 
 
+@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
+@pytest.mark.flaky
+@pytest.mark.flaky_in_dev
+@pytest.mark.internal
+def test_param_norm_moe(use_distributed_optimizer: bool):
+    world = int(os.getenv('WORLD_SIZE', '1'))
+    rank = int(os.getenv('RANK', '0'))
+
+    # Setup: distributed, model, mock_args.
+    _init_distributed(world, rank)
+    Utils.initialize_model_parallel()
+    transformer_config = TransformerConfig(
+        num_layers=1,
+        hidden_size=12,
+        num_attention_heads=4,
+        num_moe_experts=2,
+        use_cpu_initialization=True,
+        moe_token_dispatcher_type="alltoall",
+        moe_router_topk=2,
+        moe_aux_loss_coeff=0.01,
+        moe_grouped_gemm=True,
+        moe_ffn_hidden_size=128,
+        add_bias_linear=False,
+        bf16=True,
+    )
+    transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+        num_experts=2, moe_grouped_gemm=True
+    )
+    model = MoELayer(transformer_config, transformer_layer_spec.submodules.mlp.submodules).to(
+        device='cuda'
+    )
+    model.requires_grad_(True)
+    # Initialize the model with all 1.0 for weights.
+    for param in model.parameters():
+        param.data.fill_(1.0)
+    ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=use_distributed_optimizer)
+    model = DistributedDataParallel(transformer_config, ddp_config, model)
+    for param in model.parameters():
+        assert param.requires_grad
+    mock_args = SimpleNamespace(bf16=True)
+
+    with mock.patch('megatron.training.utils.get_args', new=lambda: mock_args):
+        # Make sure norm is correct when `main_param` attribute is not available.
+        norm_no_fp32_copy = training_util.calc_params_l2_norm(model, force_create_fp32_copy=False)
+        norm_fp32_copy = training_util.calc_params_l2_norm(model, force_create_fp32_copy=True)
+        assert norm_no_fp32_copy == pytest.approx(norm_fp32_copy)
+
+        # Make sure norm is correct when `main_param` attribute is available.
+        optimizer_config = OptimizerConfig(
+            bf16=True, use_distributed_optimizer=use_distributed_optimizer
+        )
+        _ = get_megatron_optimizer(optimizer_config, [model])
+        for param in model.parameters():
+            # Only bf16/fp16 parameters get main_param attribute.
+            # Router weights are always fp32, so they won't have main_param.
+            if param.dtype in [torch.bfloat16, torch.float16]:
+                assert hasattr(param, 'main_param')
+                if use_distributed_optimizer:
+                    assert getattr(param, 'main_param_sharded', False)
+        norm_no_fp32_copy = training_util.calc_params_l2_norm(model, force_create_fp32_copy=False)
+        norm_fp32_copy = training_util.calc_params_l2_norm(model, force_create_fp32_copy=True)
+        assert norm_no_fp32_copy == pytest.approx(norm_fp32_copy)
+
+    # Teardown.
+    _deinit_distributed()
+
+
 @pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_straggler_detector():
diff --git a/tests/unit_tests/transformer/moe/conftest.py b/tests/unit_tests/transformer/moe/conftest.py
index dda2a6d2b92..6724b3bbf4a 100644
--- a/tests/unit_tests/transformer/moe/conftest.py
+++ b/tests/unit_tests/transformer/moe/conftest.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import os
 from pathlib import Path
 
diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
index ff31e41fb7c..25686c28b28 100644
--- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
@@ -96,6 +96,7 @@ def test_capacity_padding_forward_backward(self, tp_size, ep_size, permute_fusio
         )
         container.dispatcher_drop_and_pad_test()
 
+    @pytest.mark.flaky_in_dev
     @pytest.mark.skipif(
         not is_te_min_version("1.7.0"), reason="TE 1.7.0 is required for MoE with FP8."
     )
diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py
index 108c9a65a4b..ccd11bf29af 100644
--- a/tests/unit_tests/transformer/moe/test_aux_loss.py
+++ b/tests/unit_tests/transformer/moe/test_aux_loss.py
@@ -18,6 +18,7 @@
 )
 from megatron.core.transformer.moe.router import TopKRouter
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.typed_torch import apply_module
 from megatron.training.initialize import _set_random_seed
 from tests.unit_tests.test_utilities import Utils
 from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer
@@ -51,7 +52,7 @@ def partition_input(self, input):
     def aux_loss_test(self, input, baseline_grad, loss_name):
         partitioned_input = self.partition_input(input)
         moe_layer = self.moe_layer
-        probs, indices = moe_layer.router(partitioned_input)
+        probs, indices = apply_module(moe_layer.router)(partitioned_input)
         probs.sum().mul_(0).backward()
         aux_loss_grad = partitioned_input.grad
         torch.distributed.barrier()
@@ -62,7 +63,7 @@ def aux_loss_test(self, input, baseline_grad, loss_name):
         clear_aux_losses_tracker()
 
         with torch.no_grad():
-            probs, indices = moe_layer.router(partitioned_input)
+            probs, indices = apply_module(moe_layer.router)(partitioned_input)
             loss = get_moe_layer_wise_logging_tracker()[loss_name]['values']
             assert loss == 0, "Loss should be 0"
             clear_aux_losses_tracker()
@@ -84,7 +85,7 @@ def setup_method(self, method):
         moe_layer = baseline_container.moe_layer
         self.input = torch.randn((32, 8, moe_layer.config.hidden_size)).cuda()
         self.input.requires_grad = True
-        probs, indices = moe_layer.router(self.input)
+        probs, indices = apply_module(moe_layer.router)(self.input)
         probs.sum().mul_(0).backward()  # zero out the main gradients
         self.baseline_grad = self.input.grad
         self.input.grad = None
@@ -148,7 +149,7 @@ def setup_method(self, method):
         moe_layer = baseline_container.moe_layer
         self.input = torch.randn((32, 8, moe_layer.config.hidden_size)).cuda()
         self.input.requires_grad = True
-        probs, indices = moe_layer.router(self.input)
+        probs, indices = apply_module(moe_layer.router)(self.input)
         probs.sum().mul_(0).backward()  # zero out the main gradients
         self.baseline_grad = self.input.grad
         self.input.grad = None
@@ -331,7 +332,7 @@ def test_seq_aux_loss(self, tp_size, ep_size, cp_size):
         not torch.cuda.is_available() or not HAVE_ROUTER_FUSION,
         reason="CUDA or TE fused router ops not available",
     )
-    @pytest.mark.parametrize("aux_type", ["aux_loss", "seq_aux_loss"])
+    @pytest.mark.parametrize("aux_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"])
     def test_aux_loss_fusion_equivalence(self, aux_type):
         # Compare fused vs unfused aux loss path to ensure numerical equivalence
         router_ref = self.new_router(
@@ -350,6 +351,7 @@ def test_aux_loss_fusion_equivalence(self, aux_type):
         loss_name_map = {
             "aux_loss": "load_balancing_loss",
             "seq_aux_loss": "seq_load_balancing_loss",
+            "global_aux_loss": "global_load_balancing_loss",
         }
         loss_name = loss_name_map[aux_type]
 
@@ -575,3 +577,185 @@ def test_force_balanced_aux_loss(self, tp_size, ep_size, cp_size):
                 reduce_from_tensor_model_parallel_region(aux_loss, router.tp_cp_group)
             assert aux_loss.item() == 1, f"{aux_loss_type}: {aux_loss.item()}"
             clear_aux_losses_tracker()
+
+
+class TestPaddingMaskAuxLoss:
+    """Test padding mask support in various aux loss types."""
+
+    def setup_model_parallel(self, tp_size=1, ep_size=1, cp_size=1, sequence_parallel=False):
+        """Initialize model parallel with given configuration.
+
+        Args:
+            tp_size: Tensor parallel size.
+            ep_size: Expert parallel size.
+            cp_size: Context parallel size.
+        """
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size,
+            pipeline_model_parallel_size=1,
+            context_parallel_size=cp_size,
+            expert_model_parallel_size=ep_size,
+        )
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+
+        # Store parallel configuration
+        self.tp_size = tp_size
+        self.ep_size = ep_size
+        self.cp_size = cp_size
+
+        # Default configuration
+        self.default_transformer_config = TransformerConfig(
+            num_layers=1,
+            hidden_size=12,
+            num_attention_heads=8,
+            num_moe_experts=32,
+            use_cpu_initialization=True,
+            moe_router_load_balancing_type="aux_loss",
+            moe_router_topk=8,
+            moe_aux_loss_coeff=1.0,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            add_bias_linear=False,
+            tensor_model_parallel_size=tp_size,
+            expert_model_parallel_size=ep_size,
+            context_parallel_size=cp_size,
+            sequence_parallel=sequence_parallel and tp_size > 1,
+        )
+
+    def new_router(self, **kwargs):
+        """Create a new router with updated configuration."""
+        pg_collection = get_default_pg_collection()
+        new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs)
+        router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection)
+        router.set_layer_number(0)
+        return router
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.parametrize("aux_loss_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"])
+    @pytest.mark.parametrize(
+        "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
+    )
+    def test_padding_mask_removes_padding_tokens(self, aux_loss_type, tp_size, ep_size, cp_size):
+        """Test that padding tokens are correctly excluded from aux loss calculation."""
+        # Initialize model parallel with given configuration
+        self.setup_model_parallel(tp_size=tp_size, ep_size=ep_size, cp_size=cp_size)
+
+        try:
+            clear_aux_losses_tracker()
+
+            router = self.new_router(
+                moe_router_load_balancing_type=aux_loss_type,
+                moe_aux_loss_coeff=1.0,
+                moe_router_dtype="fp64",
+            ).cuda()
+
+            seq_len = 32
+            batch_size = 2
+            hidden_size = router.config.hidden_size
+
+            # Create input with padding
+            hidden_states_full = torch.randn(
+                (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda'
+            )
+
+            # Create padding mask: first half valid, second half padding
+            padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda')
+            padding_mask[seq_len // 2 :, :] = True
+
+            # Test with padding mask
+            router.weight.grad = None
+            scores_with_mask, routing_map_with_mask = router(
+                hidden_states_full, padding_mask=padding_mask
+            )
+            scores_with_mask.backward(torch.zeros_like(scores_with_mask))
+
+            loss_name = {
+                "aux_loss": "load_balancing_loss",
+                "seq_aux_loss": "seq_load_balancing_loss",
+                "global_aux_loss": "global_load_balancing_loss",
+            }[aux_loss_type]
+
+            tracker = get_moe_layer_wise_logging_tracker()
+            aux_loss_with_mask = tracker[loss_name]["values"][0].clone()
+            grad_with_mask = router.weight.grad.clone()
+
+            # Test without padding (with only half of the tokens)
+            clear_aux_losses_tracker()
+            router.weight.grad = None
+            hidden_states_valid = hidden_states_full[: seq_len // 2, :, :]
+            scores_without_mask, routing_map_without_mask = router(hidden_states_valid)
+            scores_without_mask.backward(torch.zeros_like(scores_without_mask))
+
+            aux_loss_without_mask = tracker[loss_name]["values"][0].clone()
+            grad_without_mask = router.weight.grad.clone()
+
+            # The aux loss with mask should be equal to the aux loss without mask
+            assert torch.equal(aux_loss_with_mask, aux_loss_without_mask)
+            assert torch.equal(grad_with_mask, grad_without_mask)
+
+            clear_aux_losses_tracker()
+        finally:
+            # Always cleanup model parallel
+            Utils.destroy_model_parallel()
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.parametrize(
+        "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
+    )
+    def test_padding_mask_with_z_loss(self, tp_size, ep_size, cp_size):
+        """Test that padding mask works correctly with z_loss."""
+        # Initialize model parallel with given configuration
+        self.setup_model_parallel(tp_size=tp_size, ep_size=ep_size, cp_size=cp_size)
+
+        try:
+            clear_aux_losses_tracker()
+
+            router = self.new_router(
+                moe_router_load_balancing_type="aux_loss",
+                moe_aux_loss_coeff=0.0,
+                moe_z_loss_coeff=1.0,
+                moe_router_dtype="fp32",
+            ).cuda()
+
+            seq_len = 32
+            batch_size = 2
+            hidden_size = router.config.hidden_size
+
+            # Create input
+            hidden_states_full = torch.randn(
+                (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda'
+            )
+
+            # Create padding mask: first half valid, second half padding
+            padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda')
+            padding_mask[seq_len // 2 :, :] = True
+
+            # Test with padding mask
+            router.weight.grad = None
+            scores_with_mask, _ = router(hidden_states_full, padding_mask=padding_mask)
+            scores_with_mask.sum().backward()
+
+            tracker = get_moe_layer_wise_logging_tracker()
+            z_loss_with_mask = tracker["z_loss"]["values"][0].clone()
+            grad_with_mask = router.weight.grad.clone()
+
+            # Test without padding (with only half of the tokens)
+            clear_aux_losses_tracker()
+            router.weight.grad = None
+            hidden_states_valid = hidden_states_full[: seq_len // 2, :, :]
+            scores_without_mask, _ = router(hidden_states_valid)
+            scores_without_mask.sum().backward()
+
+            z_loss_without_mask = tracker["z_loss"]["values"][0].clone()
+            grad_without_mask = router.weight.grad.clone()
+
+            # The z_loss with mask should be close to the z_loss without mask
+            assert torch.equal(z_loss_with_mask, z_loss_without_mask)
+            assert torch.equal(grad_with_mask, grad_without_mask)
+
+            clear_aux_losses_tracker()
+        finally:
+            # Always cleanup model parallel
+            Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/transformer/moe/test_moe_layer.py b/tests/unit_tests/transformer/moe/test_moe_layer.py
index 59385f757b3..11bd09f8449 100644
--- a/tests/unit_tests/transformer/moe/test_moe_layer.py
+++ b/tests/unit_tests/transformer/moe/test_moe_layer.py
@@ -192,3 +192,207 @@ def test_interleave_transformer_block(self, moe_layer_freq):
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
+
+
+class TestMoELayerFP16:
+    """Test MoE layer with FP16 precision."""
+
+    def setup_method(self, method):
+        pass
+
+    @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
+    @pytest.mark.parametrize("num_moe_experts", [2, 4])
+    @pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (2, 2), (4, 2)])
+    def test_moe_layer_fp16_forward_backward(
+        self, num_moe_experts, moe_token_dispatcher_type, tp_size, ep_size
+    ):
+        """Test MoE layer forward and backward pass with fp16 params and inputs."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size, expert_model_parallel_size=ep_size
+        )
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+
+        hidden_size = 64
+        sequence_length = 32
+        micro_batch_size = 2
+
+        transformer_config = TransformerConfig(
+            num_layers=1,
+            hidden_size=hidden_size,
+            num_attention_heads=4,
+            num_moe_experts=num_moe_experts,
+            use_cpu_initialization=False,
+            moe_token_dispatcher_type=moe_token_dispatcher_type,
+            moe_router_load_balancing_type="aux_loss",
+            moe_router_topk=2,
+            moe_aux_loss_coeff=0.01,
+            moe_grouped_gemm=False,  # Use SequentialMLP for fp16 test
+            moe_ffn_hidden_size=256,
+            add_bias_linear=False,
+            tensor_model_parallel_size=tp_size,
+            expert_model_parallel_size=ep_size,
+            sequence_parallel=tp_size > 1,
+            fp16=True,
+            params_dtype=torch.float16,
+        )
+
+        transformer_layer_spec = get_gpt_layer_local_spec(
+            num_experts=num_moe_experts, moe_grouped_gemm=False
+        )
+
+        moe_layer = MoELayer(
+            transformer_config, transformer_layer_spec.submodules.mlp.submodules
+        ).cuda()
+
+        hidden_states = torch.randn(
+            sequence_length,
+            micro_batch_size,
+            hidden_size,
+            device=torch.cuda.current_device(),
+            dtype=torch.float16,
+            requires_grad=True,
+        )
+
+        # Forward pass
+        output, _ = moe_layer(hidden_states)
+
+        assert output.dtype == torch.float16, f"Expected fp16 output, got {output.dtype}"
+        assert output.shape == hidden_states.shape, f"Output shape mismatch"
+
+        # Backward pass
+        loss = output.sum()
+        loss.backward()
+
+        assert hidden_states.grad is not None, "Input gradients should exist"
+        assert (
+            hidden_states.grad.dtype == torch.float16
+        ), f"Expected fp16 gradients, got {hidden_states.grad.dtype}"
+
+        for name, param in moe_layer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Gradient for {name} should exist"
+
+        Utils.destroy_model_parallel()
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+
+class TestMoELayerRecompute:
+    """Test MoE layer with recompute enabled (activation checkpointing).
+
+    Tests both code paths:
+    - fp8=False: uses tensor_parallel.checkpoint
+    - fp8=True: uses te_checkpoint (requires TE >= 1.7.0)
+    """
+
+    def setup_method(self, method):
+        pass
+
+    @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
+    @pytest.mark.parametrize("num_moe_experts", [2, 4])
+    @pytest.mark.parametrize("with_padding_mask", [True, False])
+    @pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (4, 2)])
+    @pytest.mark.parametrize("fp8", [False, True])
+    def test_moe_layer_recompute_forward_backward(
+        self, num_moe_experts, moe_token_dispatcher_type, with_padding_mask, tp_size, ep_size, fp8
+    ):
+        """Test MoE layer forward and backward pass with recompute enabled.
+
+        When fp8=False, uses tensor_parallel.checkpoint.
+        When fp8=True, uses te_checkpoint (requires TE >= 1.7.0).
+        """
+        # Skip fp8 tests if TE version is not sufficient
+        if fp8 and not is_te_min_version("1.7.0.dev0"):
+            pytest.skip("FP8 MoE recompute requires TE 1.7.0 and later.")
+
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size, expert_model_parallel_size=ep_size
+        )
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+
+        hidden_size = 64
+        sequence_length = 32
+        micro_batch_size = 2
+
+        transformer_config = TransformerConfig(
+            num_layers=1,
+            hidden_size=hidden_size,
+            num_attention_heads=4,
+            num_moe_experts=num_moe_experts,
+            use_cpu_initialization=False,
+            moe_token_dispatcher_type=moe_token_dispatcher_type,
+            moe_router_load_balancing_type="aux_loss",
+            moe_router_topk=2,
+            moe_aux_loss_coeff=0.01,
+            moe_grouped_gemm=False,
+            moe_ffn_hidden_size=256,
+            add_bias_linear=False,
+            # Enable recompute for MoE layer
+            recompute_granularity="selective",
+            recompute_modules=["moe"],
+            tensor_model_parallel_size=tp_size,
+            expert_model_parallel_size=ep_size,
+            sequence_parallel=tp_size > 1,
+            fp8=fp8,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+        )
+
+        # Use TE spec for fp8, local spec otherwise
+        if fp8:
+            transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+                num_experts=num_moe_experts, moe_grouped_gemm=False
+            )
+        else:
+            transformer_layer_spec = get_gpt_layer_local_spec(
+                num_experts=num_moe_experts, moe_grouped_gemm=False
+            )
+
+        moe_layer = MoELayer(
+            transformer_config, transformer_layer_spec.submodules.mlp.submodules
+        ).cuda()
+
+        hidden_states = torch.randn(
+            sequence_length,
+            micro_batch_size,
+            hidden_size,
+            device=torch.cuda.current_device(),
+            dtype=torch.bfloat16,
+            requires_grad=True,
+        )
+
+        # Create padding mask if needed: shape [batch_size, sequence_length]
+        padding_mask = None
+        if with_padding_mask:
+            padding_mask = torch.ones(
+                micro_batch_size,
+                sequence_length,
+                device=torch.cuda.current_device(),
+                dtype=torch.bool,
+            )
+            # Mark last 4 tokens as padding for each batch
+            padding_mask[:, -4:] = False
+
+        output, _ = moe_layer(hidden_states, padding_mask=padding_mask)
+
+        assert output.dtype == torch.bfloat16, f"Expected bf16 output, got {output.dtype}"
+        assert output.shape == hidden_states.shape, f"Output shape mismatch"
+
+        # Backward pass - this is where recompute/checkpoint is actually used
+        loss = output.sum()
+        loss.backward()
+
+        assert hidden_states.grad is not None, "Input gradients should exist"
+        assert (
+            hidden_states.grad.dtype == torch.bfloat16
+        ), f"Expected bf16 gradients, got {hidden_states.grad.dtype}"
+
+        for name, param in moe_layer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Gradient for {name} should exist"
+
+        Utils.destroy_model_parallel()
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/transformer/moe/test_router_replay.py b/tests/unit_tests/transformer/moe/test_router_replay.py
new file mode 100644
index 00000000000..840fc0fd269
--- /dev/null
+++ b/tests/unit_tests/transformer/moe/test_router_replay.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import pytest
+import torch
+
+from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function
+from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
+
+
+def setup_function():
+    RouterReplay.global_router_replay_instances.clear()
+
+
+def teardown_function():
+    RouterReplay.global_router_replay_instances.clear()
+
+
+def test_record_mode_with_topk_routing_softmax_post():
+    rr = RouterReplay()
+    rr.set_router_replay_action(RouterReplayAction.RECORD)
+    logits = torch.randn(4, 6)
+    probs, routing_map = topk_routing_with_score_function(
+        logits=logits, topk=2, use_pre_softmax=False, router_replay=rr, score_function="softmax"
+    )
+    recorded = rr.get_recorded_indices()
+    expected_idx = torch.topk(logits, k=2, dim=1).indices
+    assert recorded is not None
+    assert torch.equal(recorded, expected_idx)
+    assert probs.shape == (4, 6)
+    assert routing_map.shape == (4, 6)
+    assert routing_map.sum(dim=1).eq(2).all()
+
+
+def test_replay_forward_with_topk_routing_softmax_pre():
+    rr = RouterReplay()
+    rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+    logits = torch.randn(3, 5)
+    target = torch.tensor([[1, 2], [0, 3], [2, 4]], dtype=torch.long)
+    rr.set_target_indices(target)
+    probs, routing_map = topk_routing_with_score_function(
+        logits=logits, topk=2, use_pre_softmax=True, router_replay=rr, score_function="softmax"
+    )
+    assert routing_map.sum(dim=1).eq(2).all()
+    scores = torch.softmax(logits, dim=-1)
+    assert torch.equal(probs.gather(1, target), scores.gather(1, target))
+
+
+def test_replay_forward_with_topk_routing_softmax_post():
+    rr = RouterReplay()
+    rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+    logits = torch.randn(3, 6)
+    target = torch.tensor([[1, 2], [0, 5], [3, 4]], dtype=torch.long)
+    rr.set_target_indices(target)
+    probs, routing_map = topk_routing_with_score_function(
+        logits=logits, topk=2, use_pre_softmax=False, router_replay=rr, score_function="softmax"
+    )
+    selected = torch.softmax(logits.gather(1, target), dim=-1)
+    assert torch.equal(probs.gather(1, target), selected)
+    assert routing_map.sum(dim=1).eq(2).all()
+
+
+def test_global_set_get_clear_indices():
+    r1 = RouterReplay()
+    r2 = RouterReplay()
+    t1 = torch.tensor([[0, 1]], dtype=torch.long)
+    t2 = torch.tensor([[1, 0]], dtype=torch.long)
+    RouterReplay.set_replay_data([t1, t2])
+    assert torch.equal(r1.target_topk_idx, t1)
+    assert torch.equal(r2.target_topk_idx, t2)
+    r1.record_indices(t1)
+    r2.record_indices(t2)
+    rec = RouterReplay.get_recorded_data()
+    assert len(rec) == 2
+    assert torch.equal(rec[0], t1)
+    assert torch.equal(rec[1], t2)
+    RouterReplay.clear_global_indices()
+    assert r1.target_topk_idx is None and r2.target_topk_idx is None
+    assert r1.get_recorded_indices() is None and r2.get_recorded_indices() is None
+
+
+def test_global_action_set_and_clear():
+    r1 = RouterReplay()
+    r2 = RouterReplay()
+    RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+    assert r1.router_replay_action == RouterReplayAction.REPLAY_FORWARD
+    assert r2.router_replay_action == RouterReplayAction.REPLAY_FORWARD
+    RouterReplay.clear_global_router_replay_action()
+    assert r1.router_replay_action is None and r2.router_replay_action is None
+
+
+def test_set_replay_data_length_mismatch():
+    _ = RouterReplay()
+    with pytest.raises(ValueError):
+        RouterReplay.set_replay_data(
+            [torch.tensor([[0, 1]], dtype=torch.long), torch.tensor([[1, 0]], dtype=torch.long)]
+        )
diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py
index 677d938cdc7..4d6b5ee2c3e 100644
--- a/tests/unit_tests/transformer/moe/test_routers.py
+++ b/tests/unit_tests/transformer/moe/test_routers.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+from typing import cast
+
 import pytest
 import torch
 
@@ -47,7 +49,7 @@ def setup_method(self, method):
         self.sequential_mlp = MoELayer(
             self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
         )
-        self.router = self.sequential_mlp.router
+        self.router = cast(Router, self.sequential_mlp.router)
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
@@ -125,6 +127,53 @@ def test_aux_loss(self):
         out.sum().mul_(0).backward()
         assert self.sequential_mlp.router.weight.grad.abs().sum() > 0
 
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_router_with_padding_mask(self):
+        """Test that padding mask correctly excludes padding tokens from routing."""
+        self.router = self.router.cuda()
+        seq_len = 32
+        batch_size = 2
+        hidden_size = self.router.config.hidden_size
+
+        # Create input with shape [seq_len, batch_size, hidden_size]
+        hidden_states = torch.randn((seq_len, batch_size, hidden_size)).cuda().bfloat16()
+
+        # Create padding mask: first half valid, second half padding
+        # padding_mask shape: [seq_len, batch_size]
+        # Convention: True = padding (exclude), False = valid (include)
+        padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda')
+        padding_mask[seq_len // 2 :, :] = True  # Second half is padding
+
+        # Test forward pass with padding mask
+        with torch.no_grad():
+            probs_with_mask, routing_map_with_mask = self.router(
+                hidden_states, padding_mask=padding_mask
+            )
+
+            # Test forward pass without padding mask (only valid tokens)
+            hidden_states_valid = hidden_states[: seq_len // 2, :, :]
+            probs_without_mask, routing_map_without_mask = self.router(hidden_states_valid)
+
+            # The valid part of routing with mask should match routing without mask
+            probs_valid_part = probs_with_mask.reshape(seq_len, batch_size, -1)[
+                : seq_len // 2, :, :
+            ]
+            probs_valid_part = probs_valid_part.reshape(-1, probs_valid_part.shape[-1])
+
+            # Check that shapes are as expected
+            assert probs_with_mask.shape == (
+                seq_len * batch_size,
+                self.router.config.num_moe_experts,
+            )
+            assert routing_map_with_mask.shape == (
+                seq_len * batch_size,
+                self.router.config.num_moe_experts,
+            )
+
+            # Verify that probs for valid tokens are similar
+            assert torch.equal(probs_valid_part, probs_without_mask)
+
     @pytest.mark.internal
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_router_dtype(self):
@@ -271,7 +320,7 @@ def setup_method(self, method):
         self.moe_layer = MoELayer(
             self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
         ).cuda()
-        self.router = self.moe_layer.router
+        self.router = cast(Router, self.moe_layer.router)
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
@@ -378,7 +427,7 @@ def setup_method(self, method):
         self.moe_layer = MoELayer(
             self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
         )
-        self.router = self.moe_layer.router
+        self.router = cast(Router, self.moe_layer.router)
         assert self.router.expert_bias is not None
         assert self.router.local_tokens_per_expert is not None
 
diff --git a/tests/unit_tests/transformer/moe/test_shared_experts.py b/tests/unit_tests/transformer/moe/test_shared_experts.py
index f721c482937..6df4d2fd369 100644
--- a/tests/unit_tests/transformer/moe/test_shared_experts.py
+++ b/tests/unit_tests/transformer/moe/test_shared_experts.py
@@ -20,7 +20,8 @@ def teardown_method(self, method):
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     @pytest.mark.internal
-    def test_gpu_forward(self):
+    @pytest.mark.parametrize("shared_expert_gate", [False, True])
+    def test_gpu_forward(self, shared_expert_gate):
         Utils.initialize_model_parallel(1, 1)
         model_parallel_cuda_manual_seed(123)
         print("done intializing")
@@ -38,6 +39,7 @@ def test_gpu_forward(self):
             moe_router_load_balancing_type="sinkhorn",
             moe_router_topk=1,
             add_bias_linear=False,
+            moe_shared_expert_gate=shared_expert_gate,
         )
         transformer_layer_spec = get_gpt_layer_local_spec(
             num_experts=num_moe_experts, moe_grouped_gemm=False
@@ -49,7 +51,10 @@ def test_gpu_forward(self):
         assert isinstance(self.moe_layer, MoELayer)
 
         num_weights = sum([p.numel() for p in self.moe_layer.parameters()])
-        assert num_weights == 3480 + 1152
+        if shared_expert_gate:
+            assert num_weights == 3480 + 1152 + 12  # 12 is the weight of the gate
+        else:
+            assert num_weights == 3480 + 1152
         assert self.moe_layer.shared_experts is not None
         assert self.moe_layer.shared_experts.stream is None
         assert self.moe_layer.token_dispatcher.shared_experts is None
diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
index c2462ef73ad..24617952b94 100644
--- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
@@ -11,6 +11,7 @@
 from megatron.core.transformer.moe.moe_layer import MoELayer
 from megatron.core.transformer.moe.moe_utils import get_capacity
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.typed_torch import apply_module
 from megatron.core.utils import is_te_min_version
 from megatron.training.initialize import _set_random_seed
 from tests.unit_tests.test_utilities import Utils
@@ -129,7 +130,7 @@ def dispatcher_dropless_test(self):
         # Permute and then unpermute data are supposed to restore original data
         ans = hidden_states
         hidden_states.requires_grad = True
-        probs, indices = moe_layer.router(hidden_states)
+        probs, indices = apply_module(moe_layer.router)(hidden_states)
         probs = torch.ones_like(probs) / moe_layer.router.topk
 
         (permuted_local_hidden_states, tokens_per_expert, permuted_probs) = token_permutation(
@@ -166,7 +167,7 @@ def dispatcher_capacity_test(self):
         )
         hidden_states = hidden_states.cuda()
         hidden_states.requires_grad = True
-        probs, indices = moe_layer.router(hidden_states)
+        probs, indices = apply_module(moe_layer.router)(hidden_states)
 
         # Create the answer.
         prob_mask = probs != 0
@@ -225,7 +226,7 @@ def dispatcher_drop_and_pad_test(self):
         ).cuda()
         hidden_states.requires_grad = True
 
-        probs_1, indices_1 = moe_layer.router(hidden_states)
+        probs_1, indices_1 = apply_module(moe_layer.router)(hidden_states)
         (permuted_input_1, tokens_per_expert, permuted_probs_1) = token_permutation(
             moe_layer.token_dispatcher, hidden_states, probs_1, indices_1
         )
@@ -243,7 +244,7 @@ def dispatcher_drop_and_pad_test(self):
         moe_layer_2 = self.new_moe_layer(moe_pad_expert_input_to_capacity=True)
         moe_layer_2.load_state_dict(moe_layer.state_dict())
 
-        probs_2, indices_2 = moe_layer_2.router(hidden_states)
+        probs_2, indices_2 = apply_module(moe_layer_2.router)(hidden_states)
         (permuted_input_2, tokens_per_expert, permuted_probs_2) = token_permutation(
             moe_layer_2.token_dispatcher, hidden_states, probs_2, indices_2
         )
@@ -296,7 +297,7 @@ def dispatcher_router_padding_for_fp8_test(self):
         ).cuda()
         hidden_states.requires_grad = True
 
-        probs_1, indices_1 = moe_layer.router(hidden_states)
+        probs_1, indices_1 = apply_module(moe_layer.router)(hidden_states)
         (permuted_input_1, tokens_per_expert_1, permuted_probs_1) = token_permutation(
             moe_layer.token_dispatcher, hidden_states, probs_1, indices_1
         )
@@ -313,7 +314,7 @@ def dispatcher_router_padding_for_fp8_test(self):
         moe_layer_2 = self.new_moe_layer(moe_router_padding_for_quantization=True, fp8="hybrid")
         moe_layer_2.load_state_dict(moe_layer.state_dict())
 
-        probs_2, indices_2 = moe_layer_2.router(hidden_states)
+        probs_2, indices_2 = apply_module(moe_layer_2.router)(hidden_states)
         (permuted_input_2, tokens_per_expert_2, permuted_probs_2) = token_permutation(
             moe_layer_2.token_dispatcher, hidden_states, probs_2, indices_2
         )
@@ -363,6 +364,7 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky_in_dev
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     @pytest.mark.internal
     @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)])
@@ -381,6 +383,7 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion):
 
         container.dispatcher_dropless_test()
 
+    @pytest.mark.flaky_in_dev
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     @pytest.mark.internal
     @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py
index 7e0e8c55807..d7771d0920d 100644
--- a/tests/unit_tests/transformer/test_attention.py
+++ b/tests/unit_tests/transformer/test_attention.py
@@ -25,9 +25,11 @@
     HAVE_FUSED_QKV_ROPE = False
 
 
+@pytest.mark.parametrize("output_gate", [False, True])
 class TestParallelAttention:
 
-    def setup_method(self, method):
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self, output_gate):
         Utils.initialize_model_parallel(1, 1)
         model_parallel_cuda_manual_seed(123)
         self.transformer_config = TransformerConfig(
@@ -37,6 +39,7 @@ def setup_method(self, method):
             use_cpu_initialization=True,
             bf16=True,
             params_dtype=torch.bfloat16,
+            attention_output_gate=output_gate,
         )
         self.parallel_attention = SelfAttention(
             self.transformer_config,
@@ -44,7 +47,7 @@ def setup_method(self, method):
             layer_number=1,
         )
 
-    def teardown_method(self, method):
+    def teardown_method(self):
         Utils.destroy_model_parallel()
 
     def test_constructor(self):
@@ -52,7 +55,10 @@ def test_constructor(self):
         assert self.parallel_attention.layer_number == 1
 
         num_weights = sum([p.numel() for p in self.parallel_attention.parameters()])
-        assert num_weights == 66304
+        if self.transformer_config.attention_output_gate:
+            assert num_weights == 82816
+        else:
+            assert num_weights == 66304
 
     def test_cpu_forward(self):
         # we can't currently do this because the global memory buffer is on GPU
@@ -90,6 +96,8 @@ def test_fused_rope_gpu_forward(self, rotary_interleaved, fused_qkv_rope):
         self.parallel_attention.config.apply_rope_fusion = True
         if rotary_interleaved and not is_te_min_version("2.3.0"):
             pytest.skip("Only TE >= 2.3.0 supports interleaved fused RoPE.")
+        if fused_qkv_rope and self.parallel_attention.config.attention_output_gate:
+            pytest.skip("Fused QKV RoPE does not support gated attention for now.")
         if fused_qkv_rope and not HAVE_FUSED_QKV_ROPE:
             pytest.skip("Fused QKV RoPE not available.")
         self.parallel_attention.config.rotary_interleaved = rotary_interleaved
@@ -157,20 +165,210 @@ def test_checkpointed_gpu_forward(self):
         assert bias.shape[0] == config.hidden_size
 
 
-class TestSelfAttention:
+@pytest.mark.skipif(not is_te_min_version("2.9.0"), reason="QK clipping requires TE >= 2.9.0")
+class TestClipQK:
 
     def setup_method(self, method):
-        Utils.destroy_model_parallel()
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
 
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    def test_clip_qk_disabled_raises_error(self):
+        """Test that clip_qk raises ValueError when qk_clip is not enabled."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=False,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+
+        with pytest.raises(ValueError, match="qk_clip option needs to be enabled"):
+            attention.clip_qk()
+
+    def test_clip_qk_none_logits_raises_error(self):
+        """Test that clip_qk raises ValueError when current_max_attn_logits is None."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+
+        with pytest.raises(ValueError, match="current_max_attn_logits is None"):
+            attention.clip_qk()
+
+    def test_clip_qk_below_threshold_no_update(self):
+        """Test that weights are not updated when max logits are below threshold."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits below threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [50.0, 60.0, 70.0, 80.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should not be updated
+        assert torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_above_threshold_updates_weights(self):
+        """Test that weights are updated when max logits exceed threshold."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits above threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_gqa_configuration(self):
+        """Test clip_qk with GQA (Grouped Query Attention) configuration."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=8,
+            num_query_groups=4,  # GQA with 2 heads per group
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits for all heads (8 heads)
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0, 190.0, 200.0, 210.0, 220.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_mixed_logits(self):
+        """Test clip_qk with mixed logits (some above, some below threshold)."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set mixed current_max_attn_logits (some above, some below threshold)
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [80.0, 150.0, 90.0, 200.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated since at least one head exceeds threshold
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+
+@pytest.mark.parametrize("output_gate", [False, True])
+class TestSelfAttention:
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self, output_gate):
+        self.output_gate = output_gate
+        Utils.destroy_model_parallel()
+
+    def teardown_method(self):
+        Utils.destroy_model_parallel()
+
     def run_self_attention(self, pg_collection):
         tensor_model_parallel_size = torch.distributed.get_world_size(pg_collection.tp)
         self.transformer_config = TransformerConfig(
             num_layers=2,
             hidden_size=128,
             num_attention_heads=4,
+            attention_output_gate=self.output_gate,
             tensor_model_parallel_size=tensor_model_parallel_size,
             use_cpu_initialization=False,
         )
diff --git a/tests/unit_tests/transformer/test_attention_variant_dsa.py b/tests/unit_tests/transformer/test_attention_variant_dsa.py
new file mode 100644
index 00000000000..bd106aa6f0e
--- /dev/null
+++ b/tests/unit_tests/transformer/test_attention_variant_dsa.py
@@ -0,0 +1,1271 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+import megatron.core.parallel_state as parallel_state
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.experimental_attention_variant.dsa import (
+    DSAIndexer,
+    DSAIndexerLossAutoScaler,
+    DSAIndexerSubmodules,
+    DSAttention,
+    DSAttentionSubmodules,
+    compute_dsa_indexer_loss,
+    rotate_activation,
+)
+from megatron.core.transformer.transformer_config import MLATransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from fast_hadamard_transform import hadamard_transform as _hadamard_transform
+
+    HAVE_HADAMARD = True
+except ImportError:
+    HAVE_HADAMARD = False
+    _hadamard_transform = None
+
+
+def mock_hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+    """Mock implementation of hadamard_transform for testing without the library installed.
+
+    This is a simple identity-like transformation that preserves shape and applies scaling.
+    """
+    return x * scale
+
+
+@pytest.fixture(autouse=True)
+def patch_hadamard_if_needed():
+    """Automatically patch hadamard_transform in dsa module if not installed."""
+    if not HAVE_HADAMARD:
+        with patch(
+            'megatron.core.transformer.experimental_attention_variant.dsa.hadamard_transform',
+            mock_hadamard_transform,
+        ):
+            yield
+    else:
+        yield
+
+
+class TestRotateActivation:
+    """Test rotate_activation function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_rotate_activation_shape(self):
+        """Test that rotate_activation preserves shape."""
+        batch_size = 2
+        seq_len = 16
+        hidden_size = 128
+
+        x = torch.randn(seq_len, batch_size, hidden_size, dtype=torch.bfloat16).cuda()
+        output = rotate_activation(x)
+
+        assert output.shape == x.shape
+        assert output.dtype == torch.bfloat16
+
+    def test_rotate_activation_dtype_check(self):
+        """Test that rotate_activation only accepts bfloat16."""
+        x = torch.randn(16, 2, 128, dtype=torch.float32).cuda()
+
+        with pytest.raises(AssertionError, match="only support bf16"):
+            rotate_activation(x)
+
+
+@pytest.mark.parametrize("seqlen_and_topk", [[16, 32], [64, 32]])
+class TestComputeDSAIndexerLoss:
+    """Test compute_dsa_indexer_loss function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp'])
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_loss_shape(self, seqlen_and_topk):
+        """Test that indexer loss returns a scalar."""
+        batch_size = 2
+        seqlen = seqlen_and_topk[0]
+        num_heads = 4
+        head_dim = 128
+        index_topk = seqlen_and_topk[1]
+
+        # Create dummy index scores
+        index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda()
+
+        # Apply causal mask to index_scores before computing topk
+        causal_mask = torch.triu(
+            torch.full(
+                (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device
+            ),
+            diagonal=1,
+        )
+        # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen]
+        masked_index_scores = index_scores + causal_mask
+
+        # Get topk indices from masked index_scores
+        topk_k = min(index_topk, seqlen)
+        topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1]
+
+        query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        softmax_scale = head_dim**-0.5
+
+        loss = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=False,
+            pg_collection=self.pg_collection,
+        )
+
+        assert loss.shape == torch.Size([])
+        assert loss.dtype == torch.float32
+        assert loss >= 0  # KL divergence should be non-negative
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_loss_sparse(self, seqlen_and_topk):
+        """Test sparse indexer loss computation."""
+        batch_size = 2
+        seqlen = seqlen_and_topk[0]
+        num_heads = 4
+        head_dim = 128
+        index_topk = seqlen_and_topk[1]
+
+        # Create dummy index scores
+        index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda()
+
+        # Apply causal mask to index_scores before computing topk
+        causal_mask = torch.triu(
+            torch.full(
+                (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device
+            ),
+            diagonal=1,
+        )
+        # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen]
+        masked_index_scores = index_scores + causal_mask
+
+        # Get topk indices from masked index_scores
+        topk_k = min(index_topk, seqlen)
+        topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1]
+
+        query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        softmax_scale = head_dim**-0.5
+
+        loss_sparse = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=True,
+            pg_collection=self.pg_collection,
+        )
+
+        loss_dense = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=False,
+            pg_collection=self.pg_collection,
+        )
+
+        # Sparse loss should be different from dense loss
+        if seqlen > index_topk:
+            assert loss_sparse != loss_dense
+        else:
+            assert loss_sparse == loss_dense
+        assert loss_sparse >= 0
+        assert loss_dense >= 0
+
+
+class TestDSAIndexerLossAutoScaler:
+    """Test DSAIndexerLossAutoScaler autograd function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_forward_pass(self):
+        """Test that forward pass preserves output."""
+        output = torch.randn(16, 2, 128).cuda()
+        output.requires_grad_(True)
+        indexer_loss = torch.tensor(0.5).cuda()
+        indexer_loss.requires_grad_(True)
+
+        result = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        assert torch.allclose(result, output, atol=0, rtol=0)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_backward_pass(self):
+        """Test that backward pass triggers indexer loss backward and scales gradient correctly."""
+        output = torch.randn(16, 2, 128).cuda()
+        output.requires_grad_(True)
+
+        # Create indexer_loss with computation graph
+        # This simulates compute_dsa_indexer_loss which computes KL divergence
+        dummy_input = torch.randn(10).cuda()
+        dummy_input.requires_grad_(True)
+        indexer_loss = dummy_input.mean()
+
+        # Set loss scale
+        scale = torch.tensor(2.0).cuda()
+        DSAIndexerLossAutoScaler.set_loss_scale(scale)
+
+        # Apply the autograd function
+        result = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        # Trigger backward
+        main_loss = result.sum()
+        main_loss.backward()
+
+        # Check that gradients flow back to output
+        assert output.grad is not None, "Gradient should flow back to parameters"
+
+        # Check that indexer_loss backward was triggered
+        assert dummy_input.grad is not None, "Indexer loss backward should be triggered"
+
+        # Verify the gradient is scaled correctly
+        expected_grad_per_element = scale.item() / len(dummy_input)
+        assert torch.allclose(
+            dummy_input.grad,
+            torch.full_like(dummy_input, expected_grad_per_element),
+            rtol=0,
+            atol=0,
+        ), f"Gradient should be scaled by loss scale, expected {expected_grad_per_element}, got {dummy_input.grad[0].item()}"
+
+
+@pytest.mark.parametrize("seqlen", [16, 64])
+class TestDSAIndexer:
+    """Test DSA Indexer module basic functionality with TP=1."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        # Create MLA config with sparse attention parameters
+        self.index_topk = 32
+        self.config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=self.index_topk,
+        )
+
+        # Create indexer submodules spec
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(
+            required_pgs=['tp', 'cp']
+        )
+        self.indexer = DSAIndexer(self.config, indexer_submodules, self.pg_collection)
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_dsa_indexer_constructor(self, seqlen):
+        """Test indexer initialization."""
+        assert isinstance(self.indexer, DSAIndexer)
+        assert self.indexer.hidden_size == 256
+        assert self.indexer.index_n_heads == 8
+        assert self.indexer.index_head_dim == 64
+        assert self.indexer.index_topk == 32
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward(self, seqlen):
+        """Test indexer forward pass."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Forward pass
+        topk_indices = self.indexer(x, qr)
+
+        # Check output shape
+        assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen))
+        assert topk_indices.dtype == torch.long
+        assert torch.all((topk_indices >= 0) & (topk_indices < seqlen))
+        # Make sure no duplicate indices are selected
+        assert torch.all(
+            torch.sort(topk_indices, dim=-1).values[:, :, 1:]
+            != torch.sort(topk_indices, dim=-1).values[:, :, :-1]
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward_with_scores(self, seqlen):
+        """Test indexer forward pass with scores."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Forward pass with scores
+        index_scores, topk_indices = self.indexer.forward_with_scores(x, qr)
+
+        # Check output shapes
+        assert index_scores.shape == (batch_size, seqlen, seqlen)
+        assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen))
+        assert index_scores.dtype == torch.float32
+        assert topk_indices.dtype == torch.long
+        assert torch.all((topk_indices >= 0) & (topk_indices < seqlen))
+        # Make sure no duplicate indices are selected
+        assert torch.all(
+            torch.sort(topk_indices, dim=-1).values[:, :, 1:]
+            != torch.sort(topk_indices, dim=-1).values[:, :, :-1]
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_with_mask(self, seqlen):
+        """Test indexer with attention mask."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+        mask = torch.triu(
+            torch.full((batch_size, seqlen, seqlen), float('-inf'), dtype=torch.float32).cuda(),
+            diagonal=1,
+        )
+
+        # Forward pass with mask
+        index_scores, topk_indices = self.indexer.forward_with_scores(x, qr, mask=mask)
+
+        # Check that masked positions are not selected
+        # For causal mask, topk_indices[b, i, :] should all be <= i (except for the case that
+        # i < index_topk).
+        for b in range(batch_size):
+            for i in range(seqlen):
+                assert torch.all(topk_indices[b, i] <= max(self.index_topk, i))
+
+
+class TestDSAttention:
+    """Test DSAttention module basic functionality with TP=1."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        # Create MLA config with sparse attention parameters
+        self.config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+            dsa_indexer_loss_coeff=1.0,
+            dsa_indexer_use_sparse_loss=False,
+        )
+
+        # Create sparse attention submodules spec
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+        indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules)
+        sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec)
+
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(
+            required_pgs=['tp', 'cp']
+        )
+
+        self.sparse_attention = DSAttention(
+            config=self.config,
+            submodules=sparse_attention_submodules,
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_dsa_constructor(self):
+        """Test sparse attention initialization."""
+        assert isinstance(self.sparse_attention, DSAttention)
+        assert hasattr(self.sparse_attention, 'indexer')
+        assert isinstance(self.sparse_attention.indexer, DSAIndexer)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_forward(self):
+        """Test sparse attention forward pass."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.cuda()
+
+        # Create input tensors [seq_len, batch, num_heads, head_dim]
+        query = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass
+        output = self.sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Check output shape
+        assert output.shape == (seq_len, batch_size, self.config.hidden_size)
+        assert output.dtype == torch.bfloat16
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_backward(self):
+        """Test sparse attention backward pass with indexer loss."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.train()
+        self.sparse_attention.cuda()
+
+        # Create input tensors
+        query = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass
+        output = self.sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss = output.sum()
+        loss.backward()
+
+        # Check that gradients are computed for inputs
+        assert query.grad is not None
+        assert key.grad is not None
+        assert value.grad is not None
+
+        # Check that indexer parameters have gradients
+        for name, param in self.sparse_attention.indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Indexer parameter {name} has no gradient"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_topk_selection(self):
+        """Test that sparse attention correctly selects top-k indices."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.eval()
+        self.sparse_attention.cuda()
+
+        # Create input tensors
+        query = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        value = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        with torch.no_grad():
+            # Get topk indices from indexer
+            _, topk_indices = self.sparse_attention.indexer.forward_with_scores(x, qr)
+
+            # Forward pass
+            output = self.sparse_attention(
+                query=query,
+                key=key,
+                value=value,
+                x=x,
+                qr=qr,
+                attention_mask=attention_mask,
+                attn_mask_type=AttnMaskType.causal,
+            )
+
+        # Check that topk_indices are valid
+        assert torch.all(topk_indices >= 0)
+        assert torch.all(topk_indices < seq_len)
+        assert topk_indices.shape[2] == min(self.config.dsa_indexer_topk, seq_len)
+
+
+# ======================================================================================
+# Tensor Parallel Consistency Tests
+# ======================================================================================
+
+
+@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4, 8])
+@pytest.mark.parametrize("sequence_parallel", [False, True])
+class TestIndexerTensorParallel:
+    """Test DSA Indexer with different TP sizes and SP settings, compare with TP=1 baseline."""
+
+    def _create_config(self, sequence_parallel=False):
+        """Helper to create MLA config."""
+        # Get TP size from parallel_state
+        tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+        return MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            tensor_model_parallel_size=tensor_model_parallel_size,
+            sequence_parallel=sequence_parallel,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+        )
+
+    def _create_indexer(self, config, pg_collection):
+        """Helper to create indexer."""
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+
+        return DSAIndexer(config, indexer_submodules, pg_collection)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_weight_consistency(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that indexer weights are identical across ALL GPUs."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer = self._create_indexer(config, pg_collection).cuda()
+
+        # Check that all weights are identical across ALL ranks (not just TP group)
+        world_size = torch.distributed.get_world_size()
+        world_rank = torch.distributed.get_rank()
+
+        if world_size > 1:
+            for name, param in indexer.named_parameters():
+                # Gather weights from ALL ranks in WORLD group
+                param_list = [torch.zeros_like(param.data) for _ in range(world_size)]
+                torch.distributed.all_gather(param_list, param.data)
+
+                # All weights should be identical across all GPUs
+                for i in range(1, world_size):
+                    assert torch.allclose(
+                        param_list[0], param_list[i], rtol=0, atol=0
+                    ), f"Parameter {name} differs between rank 0 and rank {i} (world)"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward_consistency(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that indexer gives consistent results across different TP sizes and SP settings."""
+        # First run with TP=1 to get baseline
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tp1 = self._create_config(sequence_parallel=False)  # TP=1 doesn't use SP
+        pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer_tp1 = self._create_indexer(config_tp1, pg_collection_tp1).cuda()
+
+        seq_len = 64
+        batch_size = 2
+
+        # Create one common input (all ranks create same input with same seed)
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+
+        # Forward pass with gradients enabled
+        index_scores_tp1, topk_indices_tp1 = indexer_tp1.forward_with_scores(x_input, qr_input)
+
+        # Backward pass
+        loss_tp1 = index_scores_tp1.sum()
+        loss_tp1.backward()
+
+        # Save gradients from TP=1
+        indexer_tp1_grads = {
+            name: param.grad.clone().cpu()
+            for name, param in indexer_tp1.named_parameters()
+            if param.grad is not None
+        }
+
+        Utils.destroy_model_parallel()
+
+        # Now run with target TP size
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tpn = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer_tpn = self._create_indexer(config_tpn, pg_collection_tpn).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        if sequence_parallel:
+            tp_rank = parallel_state.get_tensor_model_parallel_rank()
+            seq_per_rank = seq_len // tensor_model_parallel_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x_tpn = x_input[start_idx:end_idx]
+            qr_tpn = qr_input[start_idx:end_idx]
+        else:
+            # No SP: all TP ranks see full input
+            x_tpn = x_input
+            qr_tpn = qr_input
+
+        # Forward pass with gradients enabled
+        index_scores_tpn, topk_indices_tpn = indexer_tpn.forward_with_scores(x_tpn, qr_tpn)
+
+        # Backward pass
+        loss_tpn = index_scores_tpn.sum()
+        loss_tpn.backward()
+
+        # Compare forward outputs
+        assert index_scores_tpn.shape == index_scores_tp1.shape
+        assert topk_indices_tpn.shape == topk_indices_tp1.shape
+
+        # Check that index scores are close (allow for floating point accumulation errors)
+        assert torch.allclose(
+            index_scores_tpn, index_scores_tp1, rtol=0, atol=0
+        ), f"Index scores mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        # Check that topk indices are exactly the same
+        assert torch.equal(
+            topk_indices_tpn, topk_indices_tp1
+        ), f"Top-k indices mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        # Compare gradients - indexer grads should be identical (duplicated weights)
+        for name, param in indexer_tpn.named_parameters():
+            if param.grad is not None and name in indexer_tp1_grads:
+                assert torch.allclose(
+                    param.grad.cpu(), indexer_tp1_grads[name], rtol=0, atol=0
+                ), f"Indexer gradient {name} mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_gradient_sync(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that gradients are properly synchronized within TP group."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer = self._create_indexer(config, pg_collection).cuda()
+
+        seq_len = 64
+        batch_size = 2
+
+        # Create one common input (all ranks create same input with same seed)
+        x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        if sequence_parallel:
+            tp_rank = parallel_state.get_tensor_model_parallel_rank()
+            tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            seq_per_rank = seq_len // tp_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x = x_input[start_idx:end_idx]
+            qr = qr_input[start_idx:end_idx]
+        else:
+            # No SP: all TP ranks see full input
+            x = x_input
+            qr = qr_input
+
+        # Forward and backward
+        index_scores, topk_indices = indexer.forward_with_scores(x, qr)
+        loss = index_scores.sum()
+        loss.backward()
+
+        # Check that all parameters have gradients
+        for name, param in indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Parameter {name} has no gradient"
+
+        # After TP sync, check that gradients are identical within TP group
+        # Note: We only check TP group because DDP sync happens separately
+        tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        if tp_size > 1:
+            for name, param in indexer.named_parameters():
+                if param.requires_grad and param.grad is not None:
+                    # Gather gradients from all ranks in TP group only
+                    grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)]
+                    torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp)
+
+                    # All gradients should be identical within TP group after sync
+                    for i in range(1, tp_size):
+                        assert torch.allclose(
+                            grad_list[0], grad_list[i], rtol=0, atol=0
+                        ), f"Gradient for {name} differs between TP rank 0 and rank {i} after TP sync"
+
+        Utils.destroy_model_parallel()
+
+
+@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4])
+@pytest.mark.parametrize("sequence_parallel", [False, True])
+@pytest.mark.parametrize("use_sparse_indexer_loss", [False, True])
+class TestDSAttentionTensorParallel:
+    """Test DSAttention with different TP sizes, SP settings, and sparse indexer loss."""
+
+    def _create_config(self, sequence_parallel=False, use_sparse_indexer_loss=False):
+        """Helper to create MLA config."""
+        # Get TP size from parallel_state
+        tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+        return MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            tensor_model_parallel_size=tensor_model_parallel_size,
+            sequence_parallel=sequence_parallel,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+            dsa_indexer_loss_coeff=1.0,
+            dsa_indexer_use_sparse_loss=use_sparse_indexer_loss,
+        )
+
+    def _create_sparse_attention(self, config, pg_collection):
+        """Helper to create sparse attention."""
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+        indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules)
+        sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec)
+
+        return DSAttention(
+            config=config,
+            submodules=sparse_attention_submodules,
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=pg_collection,
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_weight_consistency(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that sparse attention indexer weights are identical across ALL GPUs."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention = self._create_sparse_attention(config, pg_collection).cuda()
+
+        # Check that all indexer weights are identical across ALL ranks
+        world_size = torch.distributed.get_world_size()
+        world_rank = torch.distributed.get_rank()
+
+        if world_size > 1:
+            for name, param in sparse_attention.indexer.named_parameters():
+                # Gather weights from ALL ranks in WORLD group
+                param_list = [torch.zeros_like(param.data) for _ in range(world_size)]
+                torch.distributed.all_gather(param_list, param.data)
+
+                # All weights should be identical across all GPUs
+                for i in range(1, world_size):
+                    torch.testing.assert_close(param_list[0], param_list[i], rtol=0, atol=0)
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_forward_consistency(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that sparse attention gives consistent results across different TP, SP, and sparse loss settings."""
+        # First run with TP=1 to get baseline
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tp1 = self._create_config(
+            sequence_parallel=False, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )  # TP=1 doesn't use SP
+        pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention_tp1 = self._create_sparse_attention(config_tp1, pg_collection_tp1).cuda()
+
+        seq_len = 64
+        batch_size = 2
+        num_heads = config_tp1.num_attention_heads
+        head_dim = config_tp1.hidden_size // num_heads
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass with gradients enabled
+        sparse_attention_tp1.train()
+        output_tp1 = sparse_attention_tp1(
+            query=query_input,
+            key=key_input,
+            value=value_input,
+            x=x_input,
+            qr=qr_input,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss_tp1 = output_tp1.sum()
+        loss_tp1.backward()
+
+        # Save gradients from TP=1
+        indexer_tp1_grads = {
+            name: param.grad.clone()
+            for name, param in sparse_attention_tp1.indexer.named_parameters()
+            if param.grad is not None
+        }
+        query_tp1_grad = query_input.grad.clone().cpu()
+        key_tp1_grad = key_input.grad.clone().cpu()
+        value_tp1_grad = value_input.grad.clone().cpu()
+
+        Utils.destroy_model_parallel()
+
+        # Now run with target TP size
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tpn = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention_tpn = self._create_sparse_attention(config_tpn, pg_collection_tpn).cuda()
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        key_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        value_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Prepare input: split along seqlen if SP is enabled
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        if sequence_parallel:
+            seq_per_rank = seq_len // tensor_model_parallel_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x_tpn = x_input[start_idx:end_idx]
+            qr_tpn = qr_input[start_idx:end_idx]
+        else:
+            x_tpn = x_input
+            qr_tpn = qr_input
+
+        query_input = query_input.detach()
+        key_input = key_input.detach()
+        value_input = value_input.detach()
+        head_per_rank = num_heads // tensor_model_parallel_size
+        start_head = tp_rank * head_per_rank
+        end_head = (tp_rank + 1) * head_per_rank
+        query_tpn = query_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        key_tpn = key_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        value_tpn = value_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        attention_mask_tpn = attention_mask
+
+        # Forward pass with gradients enabled
+        sparse_attention_tpn.train()
+        output_tpn = sparse_attention_tpn(
+            query=query_tpn,
+            key=key_tpn,
+            value=value_tpn,
+            x=x_tpn,
+            qr=qr_tpn,
+            attention_mask=attention_mask_tpn,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss_tpn = output_tpn.sum()
+        loss_tpn.backward()
+
+        from megatron.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region
+
+        output_tpn_gathered = gather_from_tensor_model_parallel_region(
+            output_tpn, group=pg_collection_tpn.tp
+        )
+        assert output_tpn_gathered.shape == output_tp1.shape
+        assert torch.allclose(
+            output_tpn_gathered.detach(), output_tp1.detach(), rtol=0, atol=0
+        ), f"Sparse attention outputs mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}, sparse_loss={use_sparse_indexer_loss}"
+
+        # 1. Check indexer gradients.
+        for name, param in sparse_attention_tpn.indexer.named_parameters():
+            if param.grad is not None and name in indexer_tp1_grads:
+                torch.testing.assert_close(
+                    param.grad, indexer_tp1_grads[name], rtol=1e-5, atol=1e-5
+                )
+
+        # 2. Query/Key/Value gradients need to be gathered along num_heads dim (dim 2) if SP is enabled
+        # Flatten last two dims: [seq_len, batch, num_heads, head_dim] -> [seq_len, batch, num_heads * head_dim]
+        sq, b, nh, hd = query_tpn.grad.shape
+        query_grad_flat = query_tpn.grad.reshape(sq, b, nh * hd)
+        key_grad_flat = key_tpn.grad.reshape(sq, b, nh * hd)
+        value_grad_flat = value_tpn.grad.reshape(sq, b, nh * hd)
+
+        # Gather along last dim
+        query_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            query_grad_flat, group=pg_collection_tpn.tp
+        )
+        key_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            key_grad_flat, group=pg_collection_tpn.tp
+        )
+        value_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            value_grad_flat, group=pg_collection_tpn.tp
+        )
+
+        # Reshape back: [seq_len, batch, num_heads * head_dim] -> [seq_len, batch, num_heads, head_dim]
+        query_tpn_grad_gathered = query_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+        key_tpn_grad_gathered = key_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+        value_tpn_grad_gathered = value_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+
+        assert torch.allclose(
+            query_tpn_grad_gathered.cpu(), query_tp1_grad, rtol=0, atol=0
+        ), f"Query gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+        assert torch.allclose(
+            key_tpn_grad_gathered.cpu(), key_tp1_grad, rtol=0, atol=0
+        ), f"Key gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+        assert torch.allclose(
+            value_tpn_grad_gathered.cpu(), value_tp1_grad, rtol=0, atol=0
+        ), f"Value gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_gradient_sync(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that indexer gradients are properly synchronized within TP group."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention = self._create_sparse_attention(config, pg_collection).cuda()
+        sparse_attention.train()
+
+        seq_len = 64
+        batch_size = 2
+        num_heads = config.num_attention_heads
+        head_dim = config.hidden_size // num_heads
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        key_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        value_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        if sequence_parallel:
+            tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            seq_per_rank = seq_len // tp_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x = x_input[start_idx:end_idx]
+            qr = qr_input[start_idx:end_idx]
+        else:
+            x = x_input
+            qr = qr_input
+
+        # query, key, value should be split along num_heads dim
+        head_per_rank = num_heads // tensor_model_parallel_size
+        start_head = tp_rank * head_per_rank
+        end_head = (tp_rank + 1) * head_per_rank
+        query = query_input[:, :, start_head:end_head, :]
+        key = key_input[:, :, start_head:end_head, :]
+        value = value_input[:, :, start_head:end_head, :]
+
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        query.requires_grad_(True)
+        key.requires_grad_(True)
+        value.requires_grad_(True)
+
+        # Forward and backward
+        output = sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        loss = output.sum()
+        loss.backward()
+
+        # Check that gradients exist before sync
+        assert query.grad is not None
+        assert key.grad is not None
+        assert value.grad is not None
+
+        # Check that indexer parameters have gradients
+        for name, param in sparse_attention.indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Indexer parameter {name} has no gradient"
+
+        # Check that indexer gradients are identical within TP group
+        tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        if tp_size > 1:
+            for name, param in sparse_attention.indexer.named_parameters():
+                if param.requires_grad and param.grad is not None:
+                    # Gather gradients from all ranks in TP group only
+                    grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)]
+                    torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp)
+
+                    # All gradients should be identical within TP group after sync
+                    for i in range(1, tp_size):
+                        assert torch.allclose(
+                            grad_list[0], grad_list[i], rtol=0, atol=0
+                        ), f"Indexer gradient for {name} differs between TP rank 0 and rank {i} after TP sync"
+
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py
index b92ff383d82..4696a3ed439 100644
--- a/tests/unit_tests/transformer/test_cuda_graphs.py
+++ b/tests/unit_tests/transformer/test_cuda_graphs.py
@@ -1,11 +1,25 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import gc
+import os
+import sys
+
 import pytest
 import torch
+from transformer_engine.pytorch.fp8 import check_fp8_support
 
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.enums import ModelType
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_decoder_block_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+    get_gpt_mtp_block_spec,
+)
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
+from megatron.core.num_microbatches_calculator import (
+    destroy_num_microbatches_calculator,
+    init_num_microbatches_calculator,
+)
 from megatron.core.pipeline_parallel.schedules import set_current_microbatch
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.ssm.mamba_block import MambaStack
@@ -14,12 +28,28 @@
     initialize_rng_tracker,
     model_parallel_cuda_manual_seed,
 )
-from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord
+from megatron.core.transformer.cuda_graphs import (
+    CudaGraphManager,
+    TECudaGraphHelper,
+    _CudagraphGlobalRecord,
+)
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.moe.fused_a2a import reset_hybrid_ep_buffer
 from megatron.core.transformer.transformer_block import TransformerBlock
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import is_fa_min_version, is_te_min_version
+from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args
+from megatron.training.global_vars import (
+    destroy_global_vars,
+    get_args,
+    set_args,
+    set_global_variables,
+)
+from megatron.training.training import setup_model_and_optimizer
 from tests.unit_tests.test_utilities import Utils
 
+fp8_available, _ = check_fp8_support()
+
 
 class TestParallelTransformerBlockCudagraphs:
     def setup_method(self, method):
@@ -497,6 +527,569 @@ def test_gpu_cudagraph(self):
             del parallel_mamba_block.layers[_].cudagraph_manager.cudagraph_runners[0].fwd_graph
 
 
+# Global storage for comparing unique buffer counts across different num_microbatches,
+# keyed by (pp_size, vpp_size)
+_unique_buffer_counts = {}
+
+
+class TestTECudaGraphHelper:
+    def setup_method(self, method):
+        # Initialize parallel state
+        initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+        # Note: _unique_buffer_counts is intentionally NOT cleared here so we can
+        # compare values across parametrized test runs
+
+    @pytest.mark.parametrize("num_microbatches", [16, 64, 256])
+    @pytest.mark.parametrize("pp_size", [1, 2, 4])
+    @pytest.mark.parametrize("vpp_size", [None, 2])
+    def test_get_cuda_graph_input_data(self, num_microbatches, pp_size, vpp_size):
+        """Test _get_cuda_graph_input_data function in TECudaGraphHelper."""
+
+        if vpp_size and pp_size == 1:
+            pytest.skip("vpp_size must be None when pp_size is 1")
+
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=pp_size,
+            virtual_pipeline_model_parallel_size=vpp_size,
+        )
+
+        # Set up test configuration
+        seq_length = 128
+        micro_batch_size = 2
+        num_layers = 8
+        vocab_size = 1024
+        hidden_size = 64
+        num_attention_heads = 4
+
+        # Initialize num_microbatches calculator
+        init_num_microbatches_calculator(
+            rank=0,
+            rampup_batch_size=None,
+            global_batch_size=micro_batch_size * num_microbatches,
+            micro_batch_size=micro_batch_size,
+            data_parallel_size=1,
+            decrease_batch_size_if_needed=False,
+        )
+
+        # Create transformer config directly
+        transformer_config = TransformerConfig(
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            use_cpu_initialization=True,
+            cuda_graph_impl="transformer_engine",
+            use_te_rng_tracker=True,
+            bf16=True,
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=pp_size,
+            virtual_pipeline_model_parallel_size=vpp_size,
+            pipeline_dtype=torch.bfloat16,
+            context_parallel_size=1,
+        )
+
+        # Create model
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        model = []
+        for i in range(vpp_size or 1):
+            this_model = GPTModel(
+                config=transformer_config,
+                transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+                vocab_size=vocab_size,
+                max_sequence_length=seq_length,
+                parallel_output=True,
+                position_embedding_type="rope",
+                vp_stage=i if vpp_size else None,
+            ).cuda()
+            model.append(this_model)
+
+        # Initialize TECudaGraphHelper
+        cuda_graph_helper = TECudaGraphHelper(
+            model=model,
+            config=transformer_config,
+            seq_length=seq_length,
+            micro_batch_size=micro_batch_size,
+            optimizers=[],
+        )
+
+        # Call _get_cuda_graph_input_data (which internally calls _get_sample_arguments)
+        sample_args, make_graphed_callables_kwargs = cuda_graph_helper._get_cuda_graph_input_data()
+
+        # Extract sample_kwargs from the kwargs dict
+        # For TE >= 1.10.0, sample_kwargs should always be present
+        assert (
+            'sample_kwargs' in make_graphed_callables_kwargs
+        ), "sample_kwargs should be present in make_graphed_callables_kwargs for TE >= 1.10.0"
+        sample_kwargs = make_graphed_callables_kwargs['sample_kwargs']
+
+        # Basic checks
+        num_graphable_layers = len(cuda_graph_helper.flattened_callables)
+        if pp_size > 1:
+            expected_length = num_graphable_layers * num_microbatches
+        else:
+            expected_length = num_graphable_layers
+        assert len(sample_args) == expected_length, (
+            f"sample_args length mismatch: expected {expected_length}, " f"got {len(sample_args)}"
+        )
+        assert len(sample_kwargs) == expected_length, (
+            f"sample_kwargs length mismatch: expected {expected_length}, "
+            f"got {len(sample_kwargs)}"
+        )
+
+        # Check that all elements are not None
+        for i, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)):
+            assert args_item is not None, f"sample_args[{i}] is None"
+            assert kwargs_item is not None, f"sample_kwargs[{i}] is None"
+            assert isinstance(args_item, tuple), f"sample_args[{i}] should be a tuple"
+            assert isinstance(kwargs_item, dict), f"sample_kwargs[{i}] should be a dict"
+            assert len(args_item) > 0, f"sample_args[{i}] should not be empty"
+            # Check that hidden_states is present
+            assert "hidden_states" in kwargs_item or (
+                len(args_item) > 0 and torch.is_tensor(args_item[0])
+            ), f"sample_args[{i}] or sample_kwargs[{i}] should contain hidden_states"
+
+        # Check tensor properties
+        for i, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)):
+            # Get hidden_states from args or kwargs
+            if len(args_item) > 0 and torch.is_tensor(args_item[0]):
+                hidden_states = args_item[0]
+            elif "hidden_states" in kwargs_item:
+                hidden_states = kwargs_item["hidden_states"]
+            else:
+                continue
+
+            assert torch.is_tensor(hidden_states), f"hidden_states at index {i} should be a tensor"
+            # Check shape matches expected (accounting for TP/CP)
+            expected_seq_len = seq_length // transformer_config.context_parallel_size
+            if transformer_config.sequence_parallel:
+                expected_seq_len = expected_seq_len // transformer_config.tensor_model_parallel_size
+            assert hidden_states.shape[0] == expected_seq_len, (
+                f"hidden_states seq_len mismatch at index {i}: "
+                f"expected {expected_seq_len}, got {hidden_states.shape[0]}"
+            )
+            assert hidden_states.shape[1] == micro_batch_size, (
+                f"hidden_states batch_size mismatch at index {i}: "
+                f"expected {micro_batch_size}, got {hidden_states.shape[1]}"
+            )
+            assert hidden_states.shape[2] == transformer_config.hidden_size, (
+                f"hidden_states hidden_size mismatch at index {i}: "
+                f"expected {transformer_config.hidden_size}, got {hidden_states.shape[2]}"
+            )
+
+        # Memory optimization check: verify that buffers with same signature are reused
+        # Create a mapping of sample_keys to indices
+        sample_keys_to_indices = {}
+        for idx, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)):
+            # Create sample_keys similar to the function
+            args_keys = tuple((t.shape, t.dtype, t.layout) for t in args_item if torch.is_tensor(t))
+            kwargs_keys = tuple(
+                (k, v.shape, v.dtype, v.layout)
+                for k, v in sorted(kwargs_item.items())
+                if torch.is_tensor(v)
+            )
+            sample_keys = args_keys + kwargs_keys
+
+            if sample_keys not in sample_keys_to_indices:
+                sample_keys_to_indices[sample_keys] = []
+            sample_keys_to_indices[sample_keys].append(idx)
+
+        # Check that buffers with same signature share references (memory optimization)
+        # The optimization reuses buffers when:
+        # 1. They have the same signature (shape, dtype, layout)
+        # 2. The backward pass of the original buffer has completed
+        # 3. A new forward pass with matching signature needs a buffer
+        # Count how many times each tensor is reused
+        unique_tensors = set()
+        tensor_reuse_count = {}
+        for idx, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)):
+            # Get the first tensor from args (hidden_states)
+            if len(args_item) > 0 and torch.is_tensor(args_item[0]):
+                tensor_ptr = args_item[0].data_ptr()
+                unique_tensors.add(tensor_ptr)
+                tensor_reuse_count[tensor_ptr] = tensor_reuse_count.get(tensor_ptr, 0) + 1
+
+        # With memory optimization, we should see some buffers reused
+        # (i.e., some tensors should appear multiple times)
+        max_reuse = max(tensor_reuse_count.values()) if tensor_reuse_count else 0
+        total_entries = len(sample_args)
+        unique_buffer_count = len(unique_tensors)
+
+        # Verify that memory optimization is working:
+        # - The number of unique buffers should be <= total entries
+        # - With the 1F1B schedule and multiple microbatches, we should see some buffer reuse
+        # - The number of unique buffers should be bounded as num_microbatches grows.
+        assert unique_buffer_count <= total_entries, (
+            f"Memory optimization check: unique_buffer_count ({unique_buffer_count}) "
+            f"should be <= total_entries ({total_entries})"
+        )
+        global _unique_buffer_counts
+        # Use (pp_size, vpp_size) as key to track unique buffer counts per configuration
+        config_key = (pp_size, vpp_size)
+        if config_key not in _unique_buffer_counts:
+            _unique_buffer_counts[config_key] = unique_buffer_count
+        else:
+            assert unique_buffer_count == _unique_buffer_counts[config_key], (
+                f"Unique buffer count mismatch: expected {_unique_buffer_counts[config_key]}, "
+                f"got {unique_buffer_count}"
+            )
+
+        # Verify that buffers with the same signature can potentially be reused
+        # (the actual reuse depends on the schedule, but the mechanism should work)
+        if expected_length > 1:
+            # Check that we have multiple entries with the same signature
+            has_duplicate_signatures = any(
+                len(indices) > 1 for indices in sample_keys_to_indices.values()
+            )
+            assert has_duplicate_signatures, (
+                "Memory optimization: expected duplicate signatures for buffer reuse, "
+                "but all signatures are unique"
+            )
+
+            # We tested with a large number of microbatches, so we should see some buffer reuse.
+            if pp_size > 1:
+                assert max_reuse > 1, "Expected some buffer reuse"
+
+        # Verify that make_graphed_callables_kwargs contains expected keys
+        assert (
+            '_order' in make_graphed_callables_kwargs
+        ), "make_graphed_callables_kwargs should contain '_order'"
+        assert (
+            'num_warmup_iters' in make_graphed_callables_kwargs
+        ), "make_graphed_callables_kwargs should contain 'num_warmup_iters'"
+        assert (
+            'allow_unused_input' in make_graphed_callables_kwargs
+        ), "make_graphed_callables_kwargs should contain 'allow_unused_input'"
+
+        # Verify the order in kwargs matches expectations
+        order = make_graphed_callables_kwargs['_order']
+        num_model_chunks = cuda_graph_helper.num_model_chunks
+        forward_count = sum(1 for chunk_id in order if chunk_id > 0)
+        if pp_size > 1:
+            # Verify that all forward passes in order have corresponding entries in sample_args
+            assert forward_count == num_microbatches * num_model_chunks, (
+                f"Forward count mismatch: expected {num_microbatches * num_model_chunks}, "
+                f"got {forward_count}"
+            )
+            expected_order_length = num_microbatches * num_model_chunks * 2
+        else:
+            assert num_model_chunks == 1, "Expected only one model chunk for pp_size == 1"
+            assert forward_count == 1, "Expected only one forward pass for pp_size == 1"
+            expected_order_length = 2
+        assert (
+            len(order) == expected_order_length
+        ), f"Order length mismatch: expected {expected_order_length}, got {len(order)}"
+
+
+def is_deep_ep_available():
+    from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP
+
+    return HAVE_DEEP_EP
+
+
+def is_hybrid_ep_available():
+    from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP
+
+    return HAVE_HYBRIDEP
+
+
+class TestPartialCudaGraph:
+    """Test that CUDA graph outputs match non-CUDA graph outputs for various scopes."""
+
+    def setup_method(self, method):
+        self.seq_length = 512
+        self.micro_batch_size = 2
+        self.tp_size = 2
+        self.cp_size = 2
+        self.cuda_graph_helper = None
+        # Store original environment variable values
+        self.original_env = {
+            'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'),
+            'NVTE_ALLOW_NONDETERMINISTIC_ALGO': os.environ.get('NVTE_ALLOW_NONDETERMINISTIC_ALGO'),
+        }
+        os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+        os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0'
+
+    def teardown_method(self, method):
+        # Restore original environment variable values
+        for key, value in self.original_env.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+        if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
+            self.cuda_graph_helper.delete_cuda_graphs()
+            self.cuda_graph_helper = None
+        gc.collect()
+
+    def model_provider(
+        self,
+        pre_process=True,
+        post_process=True,
+        layer_spec_fn=get_gpt_decoder_block_spec,
+        **config_kwargs,
+    ):
+        args = get_args()
+        config = core_transformer_config_from_args(args)
+        transformer_layer_spec = layer_spec_fn(
+            config,
+            use_transformer_engine=True,
+            normalization=args.normalization,
+            qk_l2_norm=args.qk_l2_norm,
+        )
+        if args.mtp_num_layers:
+            mtp_block_spec = get_gpt_mtp_block_spec(
+                config, transformer_layer_spec, use_transformer_engine=True
+            )
+        else:
+            mtp_block_spec = None
+        return GPTModel(
+            config=config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=args.vocab_size,
+            max_sequence_length=args.max_position_embeddings,
+            pre_process=pre_process,
+            post_process=post_process,
+            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+            parallel_output=True,
+            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+            position_embedding_type=args.position_embedding_type,
+            rotary_percent=args.rotary_percent,
+            mtp_block_spec=mtp_block_spec,
+        )
+
+    def create_test_args(
+        self, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs
+    ):
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+
+        sys.argv = ['test_cuda_graphs.py']
+        args = parse_args()
+        args.num_layers = 4
+        args.mtp_num_layers = 1
+        args.vocab_size = 1024
+        args.hidden_size = 512
+        args.num_attention_heads = 8
+        args.max_position_embeddings = 512
+        args.global_batch_size = self.micro_batch_size * 8 // self.tp_size // self.cp_size
+        args.micro_batch_size = self.micro_batch_size
+        args.create_attention_mask_in_dataloader = True
+        args.seq_length = self.seq_length
+        args.tensor_model_parallel_size = self.tp_size
+        args.sequence_parallel = True if self.tp_size > 1 else False
+        args.pipeline_model_parallel_size = 1
+        args.context_parallel_size = self.cp_size
+        args.train_iters = 10
+        args.lr = 3e-5
+        args.bf16 = True
+        args.add_bias_linear = False
+        args.swiglu = True
+        args.use_distributed_optimizer = True
+        args.position_embedding_type = "rope"
+        args.rotary_percent = 1.0
+        args.hidden_dropout = 0.0
+        args.attention_dropout = 0.0
+
+        # MoE settings
+        args.num_experts = 4
+        args.expert_model_parallel_size = ep_size
+        args.expert_tensor_parallel_size = 1 if ep_size > 1 else self.tp_size
+        args.moe_shared_expert_intermediate_size = 1024
+        args.moe_layer_freq = [0, 0, 1, 1]
+        args.moe_permute_fusion = True
+        args.moe_router_fusion = True
+        args.moe_router_topk = 2
+        args.moe_router_dtype = "fp32"
+
+        # CUDA graph settings
+        args.cuda_graph_impl = cuda_graph_impl
+        args.cuda_graph_scope = cuda_graph_scope
+        args.cuda_graph_warmup_steps = cuda_graph_warmup_steps
+
+        # fp8 settings
+        if fp8_available:
+            args.fp8 = "e4m3"
+            args.fp8_recipe = "tensorwise"
+            args.first_last_layers_bf16 = True
+            args.num_layers_at_start_in_bf16 = 1
+            args.num_layers_at_end_in_bf16 = 1
+
+        for key, value in kwargs.items():
+            assert hasattr(args, key)
+            setattr(args, key, value)
+
+        validate_args(args)
+        set_global_variables(args, False)
+        return args
+
+    def get_batch(self, seq_length, micro_batch_size, cp_size):
+        data = list(range(seq_length // cp_size))
+        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, seq_length // cp_size, seq_length), dtype=bool
+        ).cuda()
+        loss_mask = torch.ones(seq_length // cp_size).repeat((micro_batch_size, 1)).cuda()
+        return input_ids, labels, position_ids, attention_mask, loss_mask
+
+    def _run_test_helper(
+        self, ep_size, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, **kwargs
+    ):
+        """Test fp8_param with gpt_model."""
+        args = self.create_test_args(
+            cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs
+        )
+
+        set_args(args)
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch(
+            self.seq_length, self.micro_batch_size, self.cp_size
+        )
+
+        gpt_model, optimizer, _ = setup_model_and_optimizer(
+            self.model_provider, ModelType.encoder_or_decoder
+        )
+        assert len(gpt_model) == 1  # Assume only one model in the model provider.
+
+        if cuda_graph_impl == "transformer_engine":
+            self.cuda_graph_helper = TECudaGraphHelper(
+                model=gpt_model,
+                config=gpt_model[0].config,
+                seq_length=self.seq_length,
+                micro_batch_size=self.micro_batch_size,
+                optimizers=[optimizer],
+            )
+
+        loss_list = []
+
+        for i in range(100):
+            gpt_model[0].zero_grad_buffer()
+            optimizer.zero_grad()
+
+            # Capture CUDA graphs after warmup if helper is provided
+            if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps:
+                self.cuda_graph_helper.create_cudagraphs()
+
+            gpt_model[0].set_is_first_microbatch()
+            output = gpt_model[0].forward(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                labels=labels,
+                loss_mask=loss_mask,
+            )
+
+            # Check output shapes
+            assert output.shape[0] == self.micro_batch_size
+            assert output.shape[1] == self.seq_length // self.cp_size
+
+            # Verify gradients
+            loss = output.mean()
+            loss.backward()
+
+            for param in gpt_model[0].parameters():
+                assert param.main_grad is not None
+
+            update_successful, _, _ = optimizer.step()
+            assert update_successful
+
+            loss_list.append(loss.item())
+
+        if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
+            self.cuda_graph_helper.delete_cuda_graphs()
+            self.cuda_graph_helper = None
+
+        return torch.tensor(loss_list)
+
+    @pytest.mark.flaky
+    @pytest.mark.flaky_in_dev
+    @pytest.mark.skipif(
+        not (HAVE_TE and is_te_min_version("2.10.0")),
+        reason="Partial CUDA graph UT support requires TransformerEngine version >= 2.10.0",
+    )
+    @pytest.mark.parametrize("ep_size", [1, 4])
+    @pytest.mark.parametrize("moe_dropless_dispatcher", [False, True])
+    @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep", "hybridep"])
+    def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispatcher_type):
+        initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True)
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=self.tp_size,
+            context_parallel_size=self.cp_size,
+            pipeline_model_parallel_size=1,
+            expert_tensor_parallel_size=1 if ep_size > 1 else self.tp_size,
+            expert_model_parallel_size=ep_size,
+        )
+
+        extra_kwargs = {}
+        if moe_dispatcher_type == "deepep":
+            if not is_deep_ep_available():
+                pytest.skip("Deep EP is not available")
+            extra_kwargs["moe_token_dispatcher_type"] = "flex"
+            extra_kwargs["moe_flex_dispatcher_backend"] = "deepep"
+        elif moe_dispatcher_type == "hybridep":
+            pytest.skip(
+                "Currently, the Hybrid EP is broken. "
+                "Temporarily skip the test and wait for the fix."
+            )
+            if not is_hybrid_ep_available():
+                pytest.skip("Hybrid EP is not available")
+            extra_kwargs["moe_token_dispatcher_type"] = "flex"
+            extra_kwargs["moe_flex_dispatcher_backend"] = "hybridep"
+        else:
+            extra_kwargs["moe_token_dispatcher_type"] = moe_dispatcher_type
+        if not moe_dropless_dispatcher:
+            if moe_dispatcher_type == "deepep":
+                pytest.skip("Deep EP doesn't support drop&pad MoE")
+            if moe_dispatcher_type == "hybridep" and ep_size == 1:
+                pytest.skip("Hybrid EP doesn't support drop&pad MoE with ep_size == 1")
+            extra_kwargs["moe_expert_capacity_factor"] = 1.0
+            extra_kwargs["moe_pad_expert_input_to_capacity"] = True
+
+        loss_list_ref = self._run_test_helper(ep_size, "none", None, 0, **extra_kwargs)
+        for cuda_graph_scope in [
+            None,
+            [CudaGraphScope.attn],
+            [CudaGraphScope.moe],
+            [CudaGraphScope.mlp, CudaGraphScope.moe_router],
+            [
+                CudaGraphScope.attn,
+                CudaGraphScope.mlp,
+                CudaGraphScope.moe_router,
+                CudaGraphScope.moe_preprocess,
+            ],
+        ]:
+            if (moe_dropless_dispatcher or moe_dispatcher_type == "hybridep") and (
+                cuda_graph_scope is None or CudaGraphScope.moe in cuda_graph_scope
+            ):
+                # Dropless MoE or Hybrid EP doesn't work with "moe" scope cudagraph. Skip.
+                continue
+            cuda_graph_warmup_steps = 3
+            loss_list = self._run_test_helper(
+                ep_size,
+                "transformer_engine",
+                cuda_graph_scope,
+                cuda_graph_warmup_steps,
+                **extra_kwargs,
+            )
+            assert torch.equal(loss_list, loss_list_ref)
+
+        if moe_dispatcher_type == "hybridep":
+            reset_hybrid_ep_buffer()
+        Utils.destroy_model_parallel()
+
+
 if __name__ == "__main__":
 
     test = TestParallelTransformerBlockCudagraphs()
@@ -508,3 +1101,8 @@ def test_gpu_cudagraph(self):
     llava_test.setup_method(method=None)
     llava_test.test_llava_cudagraph_is_last_layer_logic()
     llava_test.teardown_method(method=None)
+
+    test = TestPartialCudaGraph()
+    test.setup_method(method=None)
+    test.test_moe_partial_cudagraph(4, True, "alltoall")
+    test.teardown_method(method=None)
diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py b/tests/unit_tests/transformer/test_multi_latent_attention.py
index 1d00c704d26..bc8514ee561 100644
--- a/tests/unit_tests/transformer/test_multi_latent_attention.py
+++ b/tests/unit_tests/transformer/test_multi_latent_attention.py
@@ -293,13 +293,17 @@ def test_gpu_forward_thd_padded(self):
             assert bias.shape[0] == config.hidden_size
 
             # Test that the get_query_key_value_tensors function properly handles padded cu_seqlens
-            query, key, value = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states, None, None, packed_seq_params, None
+            query, key, value, q_compressed, kv_compressed = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states, None, None, packed_seq_params, None
+                )
             )
 
             assert query is not None
             assert key is not None
             assert value is not None
+            assert q_compressed is not None
+            assert kv_compressed is not None
             assert query.is_contiguous()
             assert key.is_contiguous()
             assert value.is_contiguous()
@@ -370,7 +374,9 @@ def test_up_proj_recomputed_gpu_forward(self):
             )
             hidden_states = hidden_states.cuda()
 
-            q, k, v = checkpointed_parallel_attention.get_query_key_value_tensors(hidden_states)
+            q, k, v, q_compressed, kv_compressed = (
+                checkpointed_parallel_attention.get_query_key_value_tensors(hidden_states)
+            )
             assert q.is_contiguous()
             assert k.is_contiguous()
             assert v.is_contiguous()
@@ -675,18 +681,30 @@ def test_gpu_forward_thd_precision(self):
             packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)
 
             # fine-grained check
-            query_sbhd, key_sbhd, value_sbhd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_sbhd, None, None, None, None
+            query_sbhd, key_sbhd, value_sbhd, q_compressed_sbhd, kv_compressed_sbhd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_sbhd, None, None, None, None
+                )
             )
-            query_thd, key_thd, value_thd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_thd, None, None, packed_seq_params, None
+            query_thd, key_thd, value_thd, q_compressed_thd, kv_compressed_thd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_thd, None, None, packed_seq_params, None
+                )
             )
             _query_sbhd = query_sbhd.transpose(0, 1).contiguous().view(*query_thd.shape)
             _key_sbhd = key_sbhd.transpose(0, 1).contiguous().view(*key_thd.shape)
             _value_sbhd = value_sbhd.transpose(0, 1).contiguous().view(*value_thd.shape)
+            _q_compressed_sbhd = (
+                q_compressed_sbhd.transpose(0, 1).contiguous().view(*q_compressed_thd.shape)
+            )
+            _kv_compressed_sbhd = (
+                kv_compressed_sbhd.transpose(0, 1).contiguous().view(*kv_compressed_thd.shape)
+            )
             assert torch.equal(_query_sbhd, query_thd)
             assert torch.equal(_key_sbhd, key_thd)
             assert torch.equal(_value_sbhd, value_thd)
+            assert torch.equal(_q_compressed_sbhd, q_compressed_thd)
+            assert torch.equal(_kv_compressed_sbhd, kv_compressed_thd)
 
             core_attn_out_sbhd = self.parallel_attention.core_attention(
                 query_sbhd,
@@ -828,18 +846,30 @@ def test_gpu_forward_thd_precision(self):
             packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)
 
             # fine-grained check
-            query_sbhd, key_sbhd, value_sbhd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_sbhd, None, None, None, None
+            query_sbhd, key_sbhd, value_sbhd, q_compressed_sbhd, kv_compressed_sbhd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_sbhd, None, None, None, None
+                )
             )
-            query_thd, key_thd, value_thd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_thd, None, None, packed_seq_params, None
+            query_thd, key_thd, value_thd, q_compressed_thd, kv_compressed_thd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_thd, None, None, packed_seq_params, None
+                )
             )
             _query_sbhd = query_sbhd.transpose(0, 1).contiguous().view(*query_thd.shape)
             _key_sbhd = key_sbhd.transpose(0, 1).contiguous().view(*key_thd.shape)
             _value_sbhd = value_sbhd.transpose(0, 1).contiguous().view(*value_thd.shape)
+            _q_compressed_sbhd = (
+                q_compressed_sbhd.transpose(0, 1).contiguous().view(*q_compressed_thd.shape)
+            )
+            _kv_compressed_sbhd = (
+                kv_compressed_sbhd.transpose(0, 1).contiguous().view(*kv_compressed_thd.shape)
+            )
             torch.testing.assert_close(_query_sbhd, query_thd, atol=1e-6, rtol=1e-6)
             torch.testing.assert_close(_key_sbhd, key_thd, atol=1e-6, rtol=1e-6)
             torch.testing.assert_close(_value_sbhd, value_thd, atol=1e-6, rtol=1e-6)
+            torch.testing.assert_close(_q_compressed_sbhd, q_compressed_thd, atol=1e-6, rtol=1e-6)
+            torch.testing.assert_close(_kv_compressed_sbhd, kv_compressed_thd, atol=1e-6, rtol=1e-6)
 
             core_attn_out_sbhd = self.parallel_attention.core_attention(
                 query_sbhd,
@@ -967,18 +997,30 @@ def test_gpu_forward_thd_precision(self):
             packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)
 
             # fine-grained check
-            query_sbhd, key_sbhd, value_sbhd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_sbhd, None, None, None, None
+            query_sbhd, key_sbhd, value_sbhd, q_compressed_sbhd, kv_compressed_sbhd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_sbhd, None, None, None, None
+                )
             )
-            query_thd, key_thd, value_thd = self.parallel_attention.get_query_key_value_tensors(
-                hidden_states_thd, None, None, packed_seq_params, None
+            query_thd, key_thd, value_thd, q_compressed_thd, kv_compressed_thd = (
+                self.parallel_attention.get_query_key_value_tensors(
+                    hidden_states_thd, None, None, packed_seq_params, None
+                )
             )
             _query_sbhd = query_sbhd.transpose(0, 1).contiguous().view(*query_thd.shape)
             _key_sbhd = key_sbhd.transpose(0, 1).contiguous().view(*key_thd.shape)
             _value_sbhd = value_sbhd.transpose(0, 1).contiguous().view(*value_thd.shape)
+            _q_compressed_sbhd = (
+                q_compressed_sbhd.transpose(0, 1).contiguous().view(*q_compressed_thd.shape)
+            )
+            _kv_compressed_sbhd = (
+                kv_compressed_sbhd.transpose(0, 1).contiguous().view(*kv_compressed_thd.shape)
+            )
             assert torch.equal(_query_sbhd, query_thd)
             assert torch.equal(_key_sbhd, key_thd)
             assert torch.equal(_value_sbhd, value_thd)
+            assert torch.equal(_q_compressed_sbhd, q_compressed_thd)
+            assert torch.equal(_kv_compressed_sbhd, kv_compressed_thd)
 
             core_attn_out_sbhd = self.parallel_attention.core_attention(
                 query_sbhd,
@@ -1034,6 +1076,234 @@ def test_gpu_forward_thd_precision(self):
             os.environ.update(_environ)
 
 
+@pytest.mark.skipif(not is_te_min_version("2.9.0"), reason="QK clipping requires TE >= 2.9.0")
+@pytest.mark.parametrize("rope_type", ('yarn', 'rope'))
+class TestMLAClipQK:
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_and_teardown(self, rope_type):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+        self.transformer_config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            q_lora_rank=32,
+            kv_lora_rank=32,
+            qk_head_dim=128,
+            v_head_dim=128,
+            qk_pos_emb_head_dim=64,
+            rope_type=rope_type,
+            rotary_base=10000,
+            original_max_position_embeddings=32,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_clip_qk_disabled_raises_error(self):
+        """Test that clip_qk raises ValueError when qk_clip is not enabled."""
+        if is_te_min_version("1.10.0"):
+            # Create config without qk_clip
+            config = MLATransformerConfig(
+                num_layers=2,
+                hidden_size=12,
+                num_attention_heads=4,
+                use_cpu_initialization=True,
+                q_lora_rank=32,
+                kv_lora_rank=32,
+                qk_head_dim=128,
+                v_head_dim=128,
+                qk_pos_emb_head_dim=64,
+                rotary_base=10000,
+                original_max_position_embeddings=32,
+                qk_clip=False,
+            )
+            attention = MLASelfAttention(
+                config,
+                get_mla_self_attn_submodules(),
+                layer_number=1,
+                attn_mask_type=AttnMaskType.causal,
+            )
+
+            with pytest.raises(ValueError, match="qk_clip option needs to be enabled"):
+                attention.clip_qk()
+
+    def test_clip_qk_none_logits_raises_error(self):
+        """Test that clip_qk raises ValueError when current_max_attn_logits is None."""
+        if is_te_min_version("1.10.0"):
+            attention = MLASelfAttention(
+                self.transformer_config,
+                get_mla_self_attn_submodules(),
+                layer_number=1,
+                attn_mask_type=AttnMaskType.causal,
+            )
+
+            with pytest.raises(ValueError, match="current_max_attn_logits is None"):
+                attention.clip_qk()
+
+    def test_clip_qk_below_threshold_no_update(self):
+        """Test that weights are not updated when max logits are below threshold."""
+        if not is_te_min_version("1.10.0"):
+            pytest.skip("MLA requires TransformerEngine >= 1.10.0")
+
+        attention = MLASelfAttention(
+            self.transformer_config,
+            get_mla_self_attn_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+        )
+        attention.cuda()
+
+        # Save original weights
+        if self.transformer_config.q_lora_rank is None:
+            original_q_weight = attention.linear_q_proj.weight.data.clone()
+        else:
+            original_q_weight = attention.linear_q_up_proj.weight.data.clone()
+        original_kv_weight = attention.linear_kv_up_proj.weight.data.clone()
+
+        # Set current_max_attn_logits below threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [50.0, 60.0, 70.0, 80.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should not be updated
+        if self.transformer_config.q_lora_rank is None:
+            assert torch.equal(attention.linear_q_proj.weight.data, original_q_weight)
+        else:
+            assert torch.equal(attention.linear_q_up_proj.weight.data, original_q_weight)
+        assert torch.equal(attention.linear_kv_up_proj.weight.data, original_kv_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_above_threshold_updates_weights(self):
+        """Test that weights are updated when max logits exceed threshold."""
+        if not is_te_min_version("1.10.0"):
+            pytest.skip("MLA requires TransformerEngine >= 1.10.0")
+
+        attention = MLASelfAttention(
+            self.transformer_config,
+            get_mla_self_attn_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+        )
+        attention.cuda()
+
+        # Save original weights
+        if self.transformer_config.q_lora_rank is None:
+            original_q_weight = attention.linear_q_proj.weight.data.clone()
+        else:
+            original_q_weight = attention.linear_q_up_proj.weight.data.clone()
+        original_kv_weight = attention.linear_kv_up_proj.weight.data.clone()
+
+        # Set current_max_attn_logits above threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated
+        if self.transformer_config.q_lora_rank is None:
+            assert not torch.equal(attention.linear_q_proj.weight.data, original_q_weight)
+        else:
+            assert not torch.equal(attention.linear_q_up_proj.weight.data, original_q_weight)
+        assert not torch.equal(attention.linear_kv_up_proj.weight.data, original_kv_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_mixed_logits(self):
+        """Test clip_qk with mixed logits (some above, some below threshold)."""
+        if not is_te_min_version("1.10.0"):
+            pytest.skip("MLA requires TransformerEngine >= 1.10.0")
+
+        attention = MLASelfAttention(
+            self.transformer_config,
+            get_mla_self_attn_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+        )
+        attention.cuda()
+
+        # Save original weights
+        if self.transformer_config.q_lora_rank is None:
+            original_q_weight = attention.linear_q_proj.weight.data.clone()
+        else:
+            original_q_weight = attention.linear_q_up_proj.weight.data.clone()
+        original_kv_weight = attention.linear_kv_up_proj.weight.data.clone()
+
+        # Set mixed current_max_attn_logits (some above, some below threshold)
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [80.0, 150.0, 90.0, 200.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated since at least one head exceeds threshold
+        if self.transformer_config.q_lora_rank is None:
+            assert not torch.equal(attention.linear_q_proj.weight.data, original_q_weight)
+        else:
+            assert not torch.equal(attention.linear_q_up_proj.weight.data, original_q_weight)
+        assert not torch.equal(attention.linear_kv_up_proj.weight.data, original_kv_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_with_absorption_raises_error(self):
+        """Test that clip_qk raises ValueError when in absorption mode."""
+        if not is_te_min_version("1.10.0"):
+            pytest.skip("MLA requires TransformerEngine >= 1.10.0")
+
+        # Create config with cache_mla_latents enabled
+        config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            q_lora_rank=32,
+            kv_lora_rank=32,
+            qk_head_dim=128,
+            v_head_dim=128,
+            qk_pos_emb_head_dim=64,
+            rotary_base=10000,
+            original_max_position_embeddings=32,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = MLASelfAttention(
+            config,
+            get_mla_self_attn_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+        )
+        attention.cuda()
+
+        # Simulate absorption mode by setting cache_mla_latents and deleting linear_kv_up_proj
+        attention.cache_mla_latents = True
+        if hasattr(attention, 'linear_kv_up_proj'):
+            delattr(attention, 'linear_kv_up_proj')
+
+        # Set current_max_attn_logits
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0], device='cuda'
+        )
+
+        with pytest.raises(
+            ValueError,
+            match="qk_clip is not supported when cache_mla_latents is enabled and absorption is active",
+        ):
+            attention.clip_qk()
+
+
 @pytest.mark.experimental
 @pytest.mark.parametrize(
     ("rope_type", "apply_rope_fusion"),
diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py
index 65e58eaede4..05fb2c4fe63 100644
--- a/tests/unit_tests/transformer/test_multi_token_prediction.py
+++ b/tests/unit_tests/transformer/test_multi_token_prediction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import os
 import sys
@@ -14,11 +14,14 @@
 )
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.parallel_state import get_context_parallel_group
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.multi_token_prediction import (
     MTPLossLoggingHelper,
     MultiTokenPredictionBlock,
+    roll_tensor,
 )
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import is_te_min_version
@@ -245,6 +248,66 @@ def get_batch(self, seq_length, micro_batch_size):
         }
         return batch
 
+    def get_packed_batch(self, seq_lengths, micro_batch_size):
+        """
+        Create a packed sequence batch with multiple sequences of varying lengths.
+
+        Args:
+            seq_lengths: List of sequence lengths (e.g., [10, 15, 8] for 3 sequences)
+            micro_batch_size: Batch size (typically 1 for packed sequences)
+
+        Returns:
+            batch: Dictionary containing packed sequences and PackedSeqParams
+        """
+        total_seq_length = sum(seq_lengths)
+
+        # Create packed input_ids, labels, and position_ids
+        input_ids_list = []
+        labels_list = []
+        position_ids_list = []
+
+        for seq_len in seq_lengths:
+            data = list(range(seq_len))
+            input_ids_list.extend(data)
+            labels_list.extend([x + 1 for x in data])
+            position_ids_list.extend(data)
+
+        # Convert to tensors with shape [batch, total_seq_length]
+        input_ids = torch.tensor(input_ids_list, dtype=torch.int64).unsqueeze(0).cuda()
+        labels = torch.tensor(labels_list, dtype=torch.int64).unsqueeze(0).cuda()
+        position_ids = torch.tensor(position_ids_list, dtype=torch.int64).unsqueeze(0).cuda()
+
+        # Create attention mask for packed sequences (all ones for simplicity)
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, total_seq_length, total_seq_length), dtype=bool
+        ).cuda()
+
+        # Create loss mask with shape [batch, total_seq_length]
+        loss_mask = torch.ones(micro_batch_size, total_seq_length).cuda()
+
+        # Create cumulative sequence lengths for PackedSeqParams
+        cu_seqlens = torch.tensor(
+            [0] + [sum(seq_lengths[: i + 1]) for i in range(len(seq_lengths))], dtype=torch.int32
+        ).cuda()
+
+        packed_seq_params = PackedSeqParams(
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_kv=cu_seqlens,
+            max_seqlen_q=max(seq_lengths),
+            max_seqlen_kv=max(seq_lengths),
+            qkv_format='thd',
+        )
+
+        batch = {
+            'tokens': input_ids,
+            'labels': labels,
+            'loss_mask': loss_mask,
+            'attention_mask': attention_mask,
+            'position_ids': position_ids,
+            'packed_seq_params': packed_seq_params,
+        }
+        return batch
+
     @pytest.mark.skipif(
         not HAVE_TE or not is_te_min_version("2.1.0"),
         reason="grouped_gemm requires TransformerEngine >= 2.1.0",
@@ -405,6 +468,149 @@ def test_fp8_support(self, full_recompute):
         loss = output.mean()
         loss.backward()
 
+    @pytest.mark.skipif(
+        not HAVE_TE or not is_te_min_version("2.1.0"),
+        reason="grouped_gemm requires TransformerEngine >= 2.1.0",
+    )
+    @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1), (2, 2)])
+    def test_packed_sequences(self, tp, cp):
+        """Test MTP with packed sequences."""
+        # Create args with packed sequences support
+        seq_lengths = [16, 24, 12]  # Three sequences of different lengths
+        total_seq_length = sum(seq_lengths)
+
+        args = self.create_test_args(tp, cp, total_seq_length, micro_batch_size=1)
+        set_args(args)
+
+        torch.manual_seed(_SEED)
+        Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp)
+
+        # Get packed batch
+        batch = self.get_packed_batch(seq_lengths, micro_batch_size=1)
+        tokens = batch['tokens']
+        labels = batch['labels']
+        loss_mask = batch['loss_mask']
+        attention_mask = batch['attention_mask']
+        position_ids = batch['position_ids']
+        packed_seq_params = batch['packed_seq_params']
+
+        # Create model
+        gpt_model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
+            self.model_provider, ModelType.encoder_or_decoder
+        )
+
+        # Forward pass with packed sequences
+        output = gpt_model[0].forward(
+            input_ids=tokens,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            loss_mask=loss_mask,
+            packed_seq_params=packed_seq_params,
+        )
+
+        # Verify output shape
+        assert output.shape[0] == 1  # batch size
+        assert output.shape[1] == total_seq_length
+
+        # Verify MTP loss was computed
+        tracker = MTPLossLoggingHelper.tracker
+        assert "values" in tracker
+        mtp_loss = tracker['values'].clone()
+        assert mtp_loss.shape[0] == args.mtp_num_layers
+        MTPLossLoggingHelper.clean_loss_in_tracker()
+
+        # Backward pass
+        loss = output.mean()
+        loss.backward()
+
+        # Verify gradients exist
+        for name, param in gpt_model[0].named_parameters():
+            assert param.main_grad is not None, f"Gradient missing for {name}"
+
+    @pytest.mark.parametrize("cp", [1, 2])
+    def test_roll_tensor_with_packed_sequences(self, cp):
+        """Test roll_tensor function with packed sequences, with and without CP.
+
+        For CP=1: Tests standard packed sequence rolling with verified expected values
+        For CP=2: Tests CP-enabled rolling executes without errors
+        """
+        Utils.initialize_model_parallel(tensor_model_parallel_size=1, context_parallel_size=cp)
+        cp_group = get_context_parallel_group() if cp > 1 else None
+        cp_rank = torch.distributed.get_rank(group=cp_group) if cp_group is not None else 0
+
+        if cp == 1:
+            # Test case: Simple packed sequences (CP disabled)
+            tensor = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32).cuda()
+            cu_seqlens = torch.tensor([0, 3, 5], dtype=torch.int32).cuda()
+
+            packed_seq_params = PackedSeqParams(
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_kv=cu_seqlens,
+                max_seqlen_q=3,
+                max_seqlen_kv=3,
+                qkv_format='thd',
+            )
+
+            # Roll by -1 (shift left)
+            rolled, sum_val = roll_tensor(
+                tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params
+            )
+
+            # Expected: [2, 3, 0, 5, 0] - boundaries at indices 2 and 4 are zeroed
+            expected = torch.tensor([2, 3, 0, 5, 0], dtype=torch.float32).cuda()
+            assert torch.equal(rolled, expected), f"Expected {expected}, got {rolled}"
+        else:
+            # Test case: Packed sequences with CP=2
+            # Two sequences:
+            #   seq1 = [1, 2, 3, 4, 5, 6, 7, 8]
+            #   seq2 = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
+
+            if cp_rank == 0:
+                # CP Rank 0: first half of each sequence
+                tensor = torch.tensor(
+                    [1, 2, 7, 8, 11, 12, 13, 20, 21, 22], dtype=torch.float32
+                ).cuda()
+                expected = torch.tensor(
+                    [2, 3, 8, 0, 12, 13, 14, 21, 22, 0], dtype=torch.float32
+                ).cuda()
+            else:
+                # CP Rank 1: second half of each sequence
+                tensor = torch.tensor(
+                    [3, 4, 5, 6, 14, 15, 16, 17, 18, 19], dtype=torch.float32
+                ).cuda()
+                expected = torch.tensor(
+                    [4, 5, 6, 7, 15, 16, 17, 18, 19, 20], dtype=torch.float32
+                ).cuda()
+
+            cu_seqlens = torch.tensor([0, 8, 20], dtype=torch.int32).cuda()
+
+            packed_seq_params = PackedSeqParams(
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_kv=cu_seqlens,
+                max_seqlen_q=6,  # max(4, 6) - max local seq length per sequence
+                max_seqlen_kv=6,
+                qkv_format='thd',
+            )
+
+            # Roll by -1 (shift left) with CP communication
+            rolled, sum_val = roll_tensor(
+                tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params
+            )
+
+            # Verify the rolled tensor matches expected values
+            assert (
+                rolled.shape == expected.shape
+            ), f"Shape mismatch: expected {expected.shape}, got {rolled.shape}"
+            assert torch.equal(
+                rolled, expected
+            ), f"CP Rank {cp_rank}: Expected\n{expected}\nbut got\n{rolled}\nDiff:\n{rolled - expected}"
+
+            # Verify sum is correct
+            assert sum_val.numel() == 1, "Sum should be a scalar"
+
+        Utils.destroy_model_parallel()
+
 
 class TestMTPLossLoggingHelper:
     def setup_method(self, method):
diff --git a/tests/unit_tests/transformer/test_quantization_config.py b/tests/unit_tests/transformer/test_quantization_config.py
index fe57934bdeb..6d92e59b9a9 100644
--- a/tests/unit_tests/transformer/test_quantization_config.py
+++ b/tests/unit_tests/transformer/test_quantization_config.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 from typing import Any, Dict
 
 import pytest
@@ -5,16 +7,19 @@
 from megatron.core.quantization.quant_config import GlobMatcher, MatchContext, RecipeConfig
 
 try:
-    import nvidia_kitchen
-    from nvidia_kitchen.config import (
+    from megatron.core.extensions.kitchen import (
+        HAVE_KITCHEN,
         AutogradFunctionImplementation,
+        QAttentionParamsConfigSchema,
+        QFlashAttentionParamsConfigSchema,
+        QLinearParamsConfigSchema,
         QuantizeRecipe,
+        QuantizeRecipeAttnBMM,
+        get_qattention_params_from_predefined,
+        get_qfa_params_from_recipe_name,
         get_qlinear_params_from_predefined,
     )
 
-    from megatron.core.extensions.kitchen import QLinearParamsConfigSchema
-
-    HAVE_KITCHEN = True
 except ImportError:
     HAVE_KITCHEN = False
 
@@ -61,6 +66,27 @@ def test_parse_qlinear_params_example() -> None:
         == AutogradFunctionImplementation.QUANTIZED
     )
 
+    qat_params = 6001
+    config = {"kitchen_config_type": "QAttentionParams", "recipe_idx": qat_params}
+    qattention_params_actual = QAttentionParamsConfigSchema.parse_config_dict(
+        config
+    ).to_kitchen_qattention()
+    qattention_params_expected = get_qattention_params_from_predefined(
+        QuantizeRecipeAttnBMM.MXFP8_EMULATION
+    )
+    assert type(qattention_params_actual.quantizer_bmm1) == type(
+        qattention_params_expected.quantizer_bmm1
+    )
+    assert type(qattention_params_actual.quantizer_bmm2) == type(
+        qattention_params_expected.quantizer_bmm2
+    )
+    assert type(qattention_params_actual.get_quantizer(True)) == type(
+        qattention_params_expected.get_quantizer(True)
+    )
+    assert type(qattention_params_actual.get_quantizer(False)) == type(
+        qattention_params_expected.get_quantizer(False)
+    )
+
 
 @pytest.mark.skipif(not HAVE_KITCHEN, reason="Kitchen required for using kitchen backend.")
 def test_error_from_malformed() -> None:
@@ -84,3 +110,71 @@ def test_error_from_malformed() -> None:
     }
     with pytest.raises(KeyError, match="Unexpected keys in config"):
         qlinear_params_actual = QLinearParamsConfigSchema.parse_config_dict(config)
+
+
+@pytest.mark.skipif(not HAVE_KITCHEN, reason="Kitchen required for using kitchen backend.")
+def test_parse_qflash_attention_params_example() -> None:
+    recipe_name = "triton_fa_bf16_for_all_base_2"
+    config = {"kitchen_config_type": "QFlashAttentionParams", "recipe_name": recipe_name}
+    qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config).to_kitchen_qfa()
+    qfa_params_expected = get_qfa_params_from_recipe_name(recipe_name)
+
+    # Verify they are the same object (since recipes are cached)
+    assert qfa_params_actual is qfa_params_expected
+    assert qfa_params_actual.backend == "triton"
+    assert qfa_params_actual.qk_dot_precisions == "bf16@bf16"
+    assert qfa_params_actual.pv_dot_precisions == "bf16@bf16"
+    assert qfa_params_actual.use_natural_transcendental_func is False
+
+    # Test with natural recipe
+    recipe_name = "triton_fa_bf16_for_all_natural"
+    config = {"kitchen_config_type": "QFlashAttentionParams", "recipe_name": recipe_name}
+    qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config).to_kitchen_qfa()
+    qfa_params_expected = get_qfa_params_from_recipe_name(recipe_name)
+
+    assert qfa_params_actual is qfa_params_expected
+    assert qfa_params_actual.backend == "triton"
+    assert qfa_params_actual.use_natural_transcendental_func is True
+
+
+@pytest.mark.skipif(not HAVE_KITCHEN, reason="Kitchen required for using kitchen backend.")
+def test_error_from_malformed_qflash_attention_params() -> None:
+    # Missing recipe_name
+    config: Dict[Any, Any] = {"kitchen_config_type": "QFlashAttentionParams"}
+    with pytest.raises(KeyError, match="Missing required keys"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
+
+    # Missing kitchen_config_type
+    config = {"recipe_name": "triton_fa_bf16_for_all_base_2"}
+    with pytest.raises(KeyError, match="Missing required keys"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
+
+    # Wrong config type
+    config = {
+        "kitchen_config_type": "QLinearParams",
+        "recipe_name": "triton_fa_bf16_for_all_base_2",
+    }
+    with pytest.raises(ValueError, match="Parsing config dict of incorrect type"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
+
+    # Unsupported config type
+    config = {
+        "kitchen_config_type": "QUnknownParams",
+        "recipe_name": "triton_fa_bf16_for_all_base_2",
+    }
+    with pytest.raises(ValueError, match="Unsupported config type"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
+
+    # Extra keys
+    config = {
+        "kitchen_config_type": "QFlashAttentionParams",
+        "recipe_name": "triton_fa_bf16_for_all_base_2",
+        "extra_key": "extra_value",
+    }
+    with pytest.raises(KeyError, match="Unexpected keys in config"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
+
+    # Invalid recipe_name (not a string)
+    config = {"kitchen_config_type": "QFlashAttentionParams", "recipe_name": 123}
+    with pytest.raises(ValueError, match="recipe_name must be a string"):
+        qfa_params_actual = QFlashAttentionParamsConfigSchema.parse_config_dict(config)
diff --git a/tests/unit_tests/transformer/test_retro_attention.py b/tests/unit_tests/transformer/test_retro_attention.py
deleted file mode 100644
index 85c5347c909..00000000000
--- a/tests/unit_tests/transformer/test_retro_attention.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import os
-import types
-
-import pytest
-import torch
-
-from megatron.core.models.retro import RetroConfig, get_retro_decoder_block_spec
-from megatron.core.models.retro.decoder_attention import (
-    RetroDecoderBiasDropoutAdd,
-    RetroDecoderCrossAttention,
-)
-from megatron.core.models.retro.encoder_attention import (
-    RetroEncoderBiasDropoutAdd,
-    RetroEncoderCrossAttention,
-    RetroEncoderLayerNorm,
-)
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.transformer.transformer_block import TransformerBlock
-from tests.unit_tests.test_utilities import Utils
-
-
-class TestRetroAttention:
-
-    @classmethod
-    def get_config(cls):
-        return RetroConfig(
-            num_layers=12,
-            hidden_size=16,
-            num_attention_heads=4,
-            use_cpu_initialization=True,
-            retro_num_neighbors=2,
-            retro_chunk_length=4,
-            retro_retrieved_length=8,
-            retro_split_preprocessing="98,2,0",
-        )
-
-    @classmethod
-    def get_modules(cls, config, use_transformer_engine, use_gpu):
-
-        # Retro decoder layer.
-        decoder_block_spec = get_retro_decoder_block_spec(
-            config, use_transformer_engine=use_transformer_engine
-        )
-        decoder_block = TransformerBlock(config=config, spec=decoder_block_spec)
-        decoder_layers = [
-            layer
-            for layer in decoder_block.layers
-            if isinstance(layer.cross_attention, RetroDecoderCrossAttention)
-        ]
-        decoder_layer = decoder_layers[0]
-
-        # Retro encoder layer.
-        encoder_block = decoder_layer.cross_attention.encoder
-        encoder_layers = [
-            layer
-            for layer in encoder_block.layers
-            if isinstance(layer.cross_attention, RetroEncoderCrossAttention)
-        ]
-        encoder_layer = encoder_layers[0]
-
-        # Modules.
-        modules = types.SimpleNamespace(
-            decoder_attn=decoder_layer.cross_attention,
-            decoder_bda=decoder_layer.cross_attn_bda,
-            encoder_attn=encoder_layer.cross_attention,
-            encoder_bda=encoder_layer.cross_attn_bda,
-            encoder_norm=encoder_layer.pre_mlp_layernorm,
-        )
-
-        # GPU.
-        if use_gpu:
-            [m.cuda() for m in vars(modules).values()]
-
-        return modules
-
-    def setup_method(self, method):
-        Utils.initialize_model_parallel(1, 1)
-        os.environ['NVTE_FLASH_ATTN'] = "0"
-        os.environ['NVTE_FUSED_ATTN'] = "0"
-
-        model_parallel_cuda_manual_seed(123)
-
-    def teardown_method(self, method):
-        Utils.destroy_model_parallel()
-
-    def test_constructor(self):
-
-        config = self.get_config()
-        modules = self.get_modules(config, use_transformer_engine=True, use_gpu=False)
-
-        assert isinstance(modules.decoder_attn, RetroDecoderCrossAttention)
-        assert isinstance(modules.decoder_bda, RetroDecoderBiasDropoutAdd)
-        assert isinstance(modules.encoder_attn, RetroEncoderCrossAttention)
-        assert isinstance(modules.encoder_bda, RetroEncoderBiasDropoutAdd)
-        assert isinstance(modules.encoder_norm, RetroEncoderLayerNorm)
-
-        assert modules.decoder_attn.attn.layer_number == 6
-        assert modules.encoder_attn.attn.layer_number == 1
-
-        get_nparams = lambda m: sum(p.numel() for p in m.parameters())
-        assert get_nparams(modules.decoder_attn) == 8768
-        assert get_nparams(modules.decoder_bda) == 0
-        assert get_nparams(modules.encoder_attn) == 1088
-        assert get_nparams(modules.encoder_bda) == 0
-        assert get_nparams(modules.encoder_norm) == 32
-
-    def test_cpu_forward(self):
-        # we can't currently do this because the global memory buffer is on GPU
-        pass
-
-    def run_gpu_forward(self, recompute_granularity, use_transformer_engine):
-
-        config = self.get_config()
-        config.recompute_granularity = recompute_granularity
-        modules = self.get_modules(config, use_transformer_engine, use_gpu=True)
-
-        seq_length = 32
-        micro_batch_size = 2
-        n_chunks_per_sample = seq_length // config.retro_chunk_length
-
-        # Init tensors.
-        hidden_states = torch.ones((seq_length, micro_batch_size, config.hidden_size)).cuda()
-        attention_mask = None
-        decoder_context = torch.ones(
-            (
-                config.retro_retrieved_length,
-                config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample,
-                config.hidden_size,
-            )
-        ).cuda()
-        encoder_context = torch.ones(
-            (config.retro_chunk_length, micro_batch_size * n_chunks_per_sample, config.hidden_size)
-        ).cuda()
-
-        # Forward decoder.
-        decoder_attn_output = modules.decoder_attn(hidden_states, attention_mask, decoder_context)
-        with torch.enable_grad():
-            decoder_bda_output = modules.decoder_bda(True, True)(
-                decoder_attn_output, hidden_states, config.hidden_dropout
-            )
-
-        # Forward encoder.
-        encoder_attn_output_tuples = modules.encoder_attn(decoder_context, None, encoder_context)
-        with torch.enable_grad():
-            encoder_bda_output = modules.encoder_bda(True, True)(
-                encoder_attn_output_tuples, decoder_context, config.retro_encoder_hidden_dropout
-            )
-        encoder_norm_output = modules.encoder_norm(encoder_bda_output)
-
-        # Verify decoder.
-        assert set(decoder_attn_output.keys()) == set(
-            ["ns", "bs", "d", "l", "pad", "attention_output", "attention_bias", "context"]
-        )
-        assert decoder_attn_output["ns"] == seq_length
-        assert decoder_attn_output["bs"] == micro_batch_size
-        assert decoder_attn_output["d"] == config.hidden_size
-        assert decoder_attn_output["l"] == n_chunks_per_sample
-        assert decoder_attn_output["pad"] == 3
-        assert tuple(decoder_attn_output["attention_output"].shape) == (
-            config.retro_chunk_length,
-            micro_batch_size * n_chunks_per_sample,
-            config.hidden_size,
-        )
-        assert tuple(decoder_attn_output["attention_bias"].shape) == (config.hidden_size,)
-        assert decoder_attn_output["context"].shape == (
-            config.retro_retrieved_length * config.retro_num_neighbors,
-            micro_batch_size * n_chunks_per_sample,
-            config.hidden_size,
-        )
-        assert decoder_bda_output.shape == hidden_states.shape
-
-        # Verify encoder.
-        assert len(encoder_attn_output_tuples) == config.retro_num_neighbors
-        for output, bias, residual in encoder_attn_output_tuples:
-            assert tuple(output.shape) == (
-                config.retro_retrieved_length,
-                micro_batch_size * n_chunks_per_sample,
-                config.hidden_size,
-            )
-            assert tuple(bias.shape) == (config.hidden_size,)
-            assert tuple(residual.shape) == (
-                config.retro_retrieved_length,
-                micro_batch_size * n_chunks_per_sample,
-                config.hidden_size,
-            )
-        assert encoder_bda_output.shape == (
-            config.retro_retrieved_length,
-            config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample,
-            config.hidden_size,
-        )
-        assert encoder_norm_output.shape == (
-            config.retro_retrieved_length,
-            config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample,
-            config.hidden_size,
-        )
-
-    @pytest.mark.flaky
-    @pytest.mark.flaky_in_dev
-    def test_gpu_forward(self):
-        for recompute_granularity in (None, 'selective'):
-            for use_transformer_engine in (True, False):
-                self.run_gpu_forward(recompute_granularity, use_transformer_engine)
diff --git a/tests/unit_tests/transformer/test_submodule_callables.py b/tests/unit_tests/transformer/test_submodule_callables.py
index 1ccb6fd5be8..73059495c06 100644
--- a/tests/unit_tests/transformer/test_submodule_callables.py
+++ b/tests/unit_tests/transformer/test_submodule_callables.py
@@ -64,7 +64,7 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches):
     output_tensors = []
     # get callables
     callables, dw = build_layer_callables(model)
-    attn, post_attn, dispatch, moe, combine, post_process = callables
+    attn, dispatch, moe, combine, post_process = callables
     assert post_process is None
     dummy_model = DummyState()
     dummy_model.decoder = DummyState()
@@ -76,24 +76,16 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches):
         node.chunk_state.model = dummy_model
 
         # attn fwd
-        hidden_states = attn(node, input_tensors[i])
-
-        # post attn fwd
-        local_tokens, probs = post_attn(node, hidden_states)
+        local_tokens, probs = attn(node, input_tensors[i])
 
         # dispatch fwd
         dispatched_tokens = dispatch(node, local_tokens, probs)
 
         # moe fwd
-        expert_outputs = moe(node, dispatched_tokens)
-        if model.mlp.use_shared_expert:
-            expert_output, shared_expert_output = expert_outputs
-        else:
-            expert_output = expert_outputs
-            shared_expert_output = None
+        expert_output = moe(node, dispatched_tokens)
 
         # combine fwd
-        hidden_states = combine(node, expert_output, shared_expert_output)
+        hidden_states = combine(node, expert_output)
 
         # loss
         output_tensors.append(hidden_states)
diff --git a/tests/unit_tests/transformer/test_te_layers_batch_invariant.py b/tests/unit_tests/transformer/test_te_layers_batch_invariant.py
new file mode 100644
index 00000000000..e2d52727925
--- /dev/null
+++ b/tests/unit_tests/transformer/test_te_layers_batch_invariant.py
@@ -0,0 +1,737 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import importlib
+import os
+
+import pytest
+import torch
+import torch.distributed as dist
+
+from megatron.core.extensions.transformer_engine import (
+    TEColumnParallelLinear,
+    TEDotProductAttention,
+    TELayerNormColumnParallelLinear,
+    TENorm,
+    TERowParallelLinear,
+    te_general_gemm,
+)
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import set_batch_invariant_mode
+from megatron.core.transformer.enums import AttnBackend, AttnMaskType
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import init_method_normal, is_te_min_version
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    import flash_attn_3
+
+    HAVE_FA3 = True
+except ImportError:
+    HAVE_FA3 = False
+
+
+# ============================================================================
+# Batch-Invariant test helpers
+# ============================================================================
+def _split_concat_equal(layer, x_full, dim=0, forward_kwargs=None, out_dim_concat=0):
+    forward_kwargs = forward_kwargs or {}
+    b = x_full.shape[dim]
+    b1 = max(1, b // 4)
+    b2 = b - b1
+    xs = [torch.narrow(x_full, dim, 0, b1), torch.narrow(x_full, dim, b1, b2)]
+    with torch.no_grad():
+        with set_batch_invariant_mode(True):
+            out_full = layer(x_full, **forward_kwargs)
+            out1 = layer(xs[0], **forward_kwargs)
+            out2 = layer(xs[1], **forward_kwargs)
+    # Handle (out, bias) tuples from linear wrappers
+    if isinstance(out_full, tuple):
+        out_full = out_full[0]
+        out1 = out1[0]
+        out2 = out2[0]
+    out_cat = torch.cat([out1, out2], dim=out_dim_concat)
+    assert out_full.shape == out_cat.shape
+    assert torch.equal(out_full, out_cat)
+
+
+def _split_many_concat_equal(layer, x_full, splits, dim=0, forward_kwargs=None, out_dim_concat=0):
+    forward_kwargs = forward_kwargs or {}
+    assert sum(splits) == x_full.shape[dim], "Splits must sum to batch size"
+    # Make contiguous chunks to avoid unexpected view behavior
+    starts = [0]
+    for s in splits[:-1]:
+        starts.append(starts[-1] + s)
+    xs = [torch.narrow(x_full, dim, st, ln).contiguous() for st, ln in zip(starts, splits)]
+    with torch.no_grad():
+        with set_batch_invariant_mode(True):
+            out_full = layer(x_full, **forward_kwargs)
+            outs = [layer(xi, **forward_kwargs) for xi in xs]
+    if isinstance(out_full, tuple):
+        out_full = out_full[0]
+        outs = [o[0] for o in outs]
+    out_cat = torch.cat(outs, dim=out_dim_concat)
+    assert out_full.shape == out_cat.shape
+    assert torch.equal(out_full, out_cat)
+
+
+def _random_splits(total, num_parts):
+    assert num_parts >= 2 and total >= num_parts
+    cuts = torch.randperm(total - 1, device="cpu")[: num_parts - 1].tolist()
+    cuts = [0] + sorted(cuts) + [total - 1]
+    lens = [cuts[i + 1] - cuts[i] + 1 for i in range(len(cuts) - 1)]
+    delta = sum(lens) - total
+    i = 0
+    while delta > 0:
+        if lens[i] > 1:
+            lens[i] -= 1
+            delta -= 1
+        i = (i + 1) % len(lens)
+    return lens
+
+
+# ============================================================================
+# Randomized Batch Invariant Tests
+# ============================================================================
+
+
+def test_te_column_parallel_linear_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    layer = (
+        TEColumnParallelLinear(
+            input_size=cfg.hidden_size,
+            output_size=512,
+            config=cfg,
+            init_method=init_method_normal(cfg.init_method_std),
+            gather_output=False,
+            bias=True,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+        .cuda()
+        .eval()
+    )
+
+    torch.manual_seed(123)
+    for _ in range(3):
+        B = int(torch.randint(48, 129, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        x = torch.randn(B, cfg.hidden_size, device="cuda", dtype=torch.bfloat16)
+        _split_many_concat_equal(layer, x, splits=splits, dim=0, out_dim_concat=0)
+
+    Utils.destroy_model_parallel()
+
+
+def test_te_row_parallel_linear_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    layer = (
+        TERowParallelLinear(
+            input_size=cfg.hidden_size,
+            output_size=384,
+            config=cfg,
+            init_method=init_method_normal(cfg.init_method_std),
+            bias=True,
+            input_is_parallel=True,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+        .cuda()
+        .eval()
+    )
+
+    torch.manual_seed(321)
+    for _ in range(3):
+        B = int(torch.randint(48, 129, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        x = torch.randn(B, cfg.hidden_size, device="cuda", dtype=torch.bfloat16)
+        _split_many_concat_equal(layer, x, splits=splits, dim=0, out_dim_concat=0)
+
+    Utils.destroy_model_parallel()
+
+
+def test_te_layernorm_column_parallel_linear_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    layer = (
+        TELayerNormColumnParallelLinear(
+            input_size=cfg.hidden_size,
+            output_size=512,
+            config=cfg,
+            init_method=init_method_normal(cfg.init_method_std),
+            gather_output=False,
+            bias=True,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+        .cuda()
+        .eval()
+    )
+
+    torch.manual_seed(456)
+    for _ in range(3):
+        B = int(torch.randint(48, 129, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        x = torch.randn(B, cfg.hidden_size, device="cuda", dtype=torch.bfloat16)
+        _split_many_concat_equal(layer, x, splits=splits, dim=0, out_dim_concat=0)
+
+    Utils.destroy_model_parallel()
+
+
+def test_te_norm_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    layer = TENorm(config=cfg, hidden_size=cfg.hidden_size, eps=cfg.layernorm_epsilon).cuda().eval()
+
+    torch.manual_seed(789)
+    for _ in range(3):
+        B = int(torch.randint(48, 129, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        x = torch.randn(B, cfg.hidden_size, device="cuda", dtype=torch.bfloat16)
+        _split_many_concat_equal(layer, x, splits=splits, dim=0, out_dim_concat=0)
+
+    Utils.destroy_model_parallel()
+
+
+def test_column_parallel_linear_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    layer = (
+        ColumnParallelLinear(
+            input_size=cfg.hidden_size,
+            output_size=320,
+            config=cfg,
+            init_method=init_method_normal(cfg.init_method_std),
+            gather_output=False,
+            bias=True,
+            skip_bias_add=False,
+            is_expert=False,
+        )
+        .cuda()
+        .eval()
+    )
+
+    torch.manual_seed(246)
+    for _ in range(3):
+        B = int(torch.randint(48, 129, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        x = torch.randn(B, cfg.hidden_size, device="cuda", dtype=torch.bfloat16)
+        _split_many_concat_equal(layer, x, splits=splits, dim=0, out_dim_concat=0)
+
+    Utils.destroy_model_parallel()
+
+
+@pytest.mark.skipif(
+    not (is_te_min_version("2.10.0") and HAVE_FA3),
+    reason="TE attention BIK tests require TE >= 2.10.0 and FlashAttention-3",
+)
+def test_te_attention_layer_batch_invariant_randomized():
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+    os.environ["NVTE_FUSED_ATTN"] = "0"
+    os.environ["NVTE_FLASH_ATTN"] = "1"
+    os.environ["NVTE_UNFUSED_ATTN"] = "0"
+
+    cfg = TransformerConfig(
+        num_layers=1,
+        hidden_size=256,
+        num_attention_heads=8,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    attn = TEDotProductAttention(
+        config=cfg, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type="self"
+    )
+    assert getattr(attn, "num_splits", None) == 1
+
+    torch.manual_seed(135)
+    for _ in range(3):
+        B = int(torch.randint(32, 97, (1,)).item())
+        parts = int(torch.randint(4, 9, (1,)).item())
+        splits = _random_splits(B, parts)
+        S = int(torch.randint(256, 513, (1,)).item())
+        H = cfg.num_attention_heads
+        D = cfg.hidden_size // H
+
+        q = torch.randn(S, B, H, D, device="cuda", dtype=torch.bfloat16)
+        k = torch.randn(S, B, H, D, device="cuda", dtype=torch.bfloat16)
+        v = torch.randn(S, B, H, D, device="cuda", dtype=torch.bfloat16)
+
+        # Random permutation of the batch dimension.
+        perm = torch.randperm(B, device="cuda")
+
+        # Also build contiguous chunks to continue testing split invariance.
+        starts = [0]
+        for s in splits[:-1]:
+            starts.append(starts[-1] + s)
+        q_chunks = [q[:, st : st + ln] for st, ln in zip(starts, splits)]
+        k_chunks = [k[:, st : st + ln] for st, ln in zip(starts, splits)]
+        v_chunks = [v[:, st : st + ln] for st, ln in zip(starts, splits)]
+
+        with torch.no_grad():
+            with set_batch_invariant_mode(True):
+                # Full batch
+                out_full = attn(q, k, v, attention_mask=None, attn_mask_type=AttnMaskType.causal)
+                # Chunked batches (batch-split invariance)
+                outs = [
+                    attn(qc, kc, vc, attention_mask=None, attn_mask_type=AttnMaskType.causal)
+                    for qc, kc, vc in zip(q_chunks, k_chunks, v_chunks)
+                ]
+                out_cat = torch.cat(outs, dim=1)
+
+                # Permuted batch (ordering invariance): permute B, run attention,
+                # then undo the permutation on the output batch dimension.
+                out_perm = attn(
+                    q[:, perm],
+                    k[:, perm],
+                    v[:, perm],
+                    attention_mask=None,
+                    attn_mask_type=AttnMaskType.causal,
+                )
+
+        assert out_full.shape == out_cat.shape == out_perm.shape
+
+        # Batch-split invariance: processing different contiguous chunks should
+        # produce exactly the same result as processing the full batch.
+        assert torch.equal(out_full, out_cat)
+
+        # Batch-order invariance: reordering the batch and then undoing the
+        # permutation on the output should give back the same tensor.
+        out_perm_unpermed = out_perm[:, perm.argsort()]
+        assert torch.equal(out_full, out_perm_unpermed)
+
+    Utils.destroy_model_parallel()
+
+
+# ============================================================================
+# Parity Tests: Batch-Invariant vs Regular TE Layers
+# ============================================================================
+
+
+def test_te_column_parallel_linear_parity():
+    """Test that batch-invariant and regular TE linear produce same forward/backward results."""
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg_bik = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    cfg_regular = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=False,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    # Create layers with same weights
+    torch.manual_seed(456)
+    layer_bik = TEColumnParallelLinear(
+        input_size=cfg_bik.hidden_size,
+        output_size=256,
+        config=cfg_bik,
+        init_method=init_method_normal(cfg_bik.init_method_std),
+        gather_output=False,
+        bias=True,
+        skip_bias_add=False,
+        is_expert=False,
+    ).cuda()
+
+    torch.manual_seed(456)  # Same seed for same initialization
+    layer_regular = TEColumnParallelLinear(
+        input_size=cfg_regular.hidden_size,
+        output_size=256,
+        config=cfg_regular,
+        init_method=init_method_normal(cfg_regular.init_method_std),
+        gather_output=False,
+        bias=True,
+        skip_bias_add=False,
+        is_expert=False,
+    ).cuda()
+
+    # Test forward pass
+    x = torch.randn(
+        64, cfg_bik.hidden_size, device="cuda", dtype=torch.bfloat16, requires_grad=True
+    )
+    x_clone = x.clone().detach().requires_grad_(True)
+
+    with set_batch_invariant_mode(True):
+        out_bik, _ = layer_bik(x)
+
+    with set_batch_invariant_mode(False):
+        out_regular, _ = layer_regular(x_clone)
+
+    # Check forward outputs are close
+    assert (
+        out_bik.shape == out_regular.shape
+    ), f"Shape mismatch: {out_bik.shape} vs {out_regular.shape}"
+    max_diff = (out_bik - out_regular).abs().max().item()
+    assert max_diff < 1e-3, f"Forward output difference too large: {max_diff}"
+
+    # Test backward pass
+    grad_output = torch.randn_like(out_bik)
+
+    out_bik.backward(grad_output)
+    out_regular.backward(grad_output.clone())
+
+    # Check gradients are close
+    grad_diff = (x.grad - x_clone.grad).abs().max().item()
+    assert grad_diff < 1e-3, f"Input gradient difference too large: {grad_diff}"
+
+    weight_grad_diff = (layer_bik.weight.grad - layer_regular.weight.grad).abs().max().item()
+    assert weight_grad_diff < 1e-3, f"Weight gradient difference too large: {weight_grad_diff}"
+
+    Utils.destroy_model_parallel()
+
+
+def test_te_rmsnorm_parity():
+    """Test that batch-invariant and regular TE RMSNorm produce same forward/backward results."""
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg_bik = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    cfg_regular = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=False,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    # Create layers with same weights
+    torch.manual_seed(789)
+    layer_bik = TENorm(
+        config=cfg_bik, hidden_size=cfg_bik.hidden_size, eps=cfg_bik.layernorm_epsilon
+    ).cuda()
+
+    torch.manual_seed(789)
+    layer_regular = TENorm(
+        config=cfg_regular, hidden_size=cfg_regular.hidden_size, eps=cfg_regular.layernorm_epsilon
+    ).cuda()
+
+    # Test forward pass
+    x = torch.randn(
+        48, cfg_bik.hidden_size, device="cuda", dtype=torch.bfloat16, requires_grad=True
+    )
+    x_clone = x.clone().detach().requires_grad_(True)
+    with set_batch_invariant_mode(False):
+        out_regular = layer_regular(x_clone)
+
+    with set_batch_invariant_mode(True):
+        out_bik = layer_bik(x)
+
+    # Check forward outputs are close
+    assert out_bik.shape == out_regular.shape
+    assert out_bik.dtype == out_regular.dtype
+    max_diff = (out_bik - out_regular).abs().max().item()
+    assert max_diff < 1e-3, f"Forward output difference too large: {max_diff}"
+
+    # Test backward pass
+    grad_output = torch.randn_like(out_bik)
+
+    out_bik.backward(grad_output)
+    out_regular.backward(grad_output.clone())
+
+    # Check gradients are close
+    grad_diff = (x.grad - x_clone.grad).abs().max().item()
+    assert grad_diff < 1e-3, f"Input gradient difference too large: {grad_diff}"
+
+    weight_grad_diff = (layer_bik.weight.grad - layer_regular.weight.grad).abs().max().item()
+    assert weight_grad_diff < 1e-3, f"Weight gradient difference too large: {weight_grad_diff}"
+
+    Utils.destroy_model_parallel()
+
+
+def test_te_layernorm_linear_parity():
+    """Test that batch-invariant and regular fused LayerNorm+Linear produce same results."""
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    Utils.initialize_model_parallel(1, 1)
+    model_parallel_cuda_manual_seed(123)
+
+    cfg_bik = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=True,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    cfg_regular = TransformerConfig(
+        num_layers=1,
+        hidden_size=128,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        batch_invariant_mode=False,
+        params_dtype=torch.bfloat16,
+        normalization="RMSNorm",
+        layernorm_epsilon=1e-5,
+        attention_backend=AttnBackend.flash,
+    )
+
+    torch.manual_seed(321)
+    layer_bik = TELayerNormColumnParallelLinear(
+        input_size=cfg_bik.hidden_size,
+        output_size=256,
+        config=cfg_bik,
+        init_method=init_method_normal(cfg_bik.init_method_std),
+        gather_output=False,
+        bias=True,
+        skip_bias_add=False,
+        is_expert=False,
+    ).cuda()
+
+    torch.manual_seed(321)
+    layer_regular = TELayerNormColumnParallelLinear(
+        input_size=cfg_regular.hidden_size,
+        output_size=256,
+        config=cfg_regular,
+        init_method=init_method_normal(cfg_regular.init_method_std),
+        gather_output=False,
+        bias=True,
+        skip_bias_add=False,
+        is_expert=False,
+    ).cuda()
+
+    x = torch.randn(
+        48, cfg_bik.hidden_size, device="cuda", dtype=torch.bfloat16, requires_grad=True
+    )
+    x_clone = x.clone().detach().requires_grad_(True)
+
+    with set_batch_invariant_mode(True):
+        out_bik, _ = layer_bik(x)
+
+    with set_batch_invariant_mode(False):
+        out_regular, _ = layer_regular(x_clone)
+
+    assert out_bik.shape == out_regular.shape
+    max_diff = (out_bik - out_regular).abs().max().item()
+    assert max_diff < 1e-3, f"Forward output difference too large: {max_diff}"
+
+    grad_output = torch.randn_like(out_bik)
+
+    out_bik.backward(grad_output)
+    out_regular.backward(grad_output.clone())
+
+    grad_diff = (x.grad - x_clone.grad).abs().max().item()
+    assert grad_diff < 1e-3, f"Input gradient difference too large: {grad_diff}"
+
+    weight_grad_diff = (layer_bik.weight.grad - layer_regular.weight.grad).abs().max().item()
+    assert weight_grad_diff < 1e-3, f"Weight gradient difference too large: {weight_grad_diff}"
+
+    Utils.destroy_model_parallel()
+
+
+# Some tolerance for numerical differences between cuBLASLt and Triton
+def _tols(dtype: torch.dtype):
+    if dtype == torch.float16:
+        return dict(rtol=1e-2, atol=1e-2)
+    if dtype == torch.bfloat16:
+        return dict(rtol=2e-2, atol=2e-2)
+    return dict(rtol=1e-2, atol=5e-2)
+
+
+def _device(dtype=torch.float16):
+    return dict(device="cuda", dtype=dtype)
+
+
+# Helper to call TE general_gemm via Megatron's wrapper that manages workspace, etc.
+def _te_general_gemm(*args, **kwargs):
+    if te_general_gemm is None:
+        pytest.skip("TransformerEngine general_gemm is not available in this environment.")
+    return te_general_gemm(*args, **kwargs)
+
+
+# ============================================================================
+# Numerical Tests for General GEMM
+# ============================================================================
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+def test_bik_te_general_gemm_chunking_deterministic(dtype):
+    torch.manual_seed(123)
+    M1, M2, K, N = 37, 23, 128, 128
+    A1 = torch.randn(M1, K, **_device(dtype))
+    A2 = torch.randn(M2, K, **_device(dtype))
+    A = torch.cat([A1, A2], dim=0)
+    B = torch.randn(K, N, **_device(dtype))
+
+    layout = "TN"
+    with set_batch_invariant_mode(True):
+        # Full batch
+        C_full = _te_general_gemm(A, B, out_dtype=dtype, layout=layout)[0]
+        # Chunked batches
+        C_part1 = _te_general_gemm(A1, B, out_dtype=dtype, layout=layout)[0]
+        C_part2 = _te_general_gemm(A2, B, out_dtype=dtype, layout=layout)[0]
+        # For TN, output is [N, M]; concatenation should be along dim=1
+        cat_dim = 1 if layout == "TN" else 0
+        C_cat = torch.cat([C_part1, C_part2], dim=cat_dim)
+
+    # For TN, shapes are [N, M]
+    assert C_full.shape == (N, M1 + M2)
+    assert C_cat.shape == (N, M1 + M2)
+    # Exact equality expected due to deterministic Triton kernel
+    assert torch.equal(C_full, C_cat)
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+def test_bik_te_general_gemm_numerical_parity(dtype):
+    torch.manual_seed(111)
+    M, K, N = 64, 96, 96
+    A = torch.randn(M, K, **_device(dtype))
+    B = torch.randn(K, N, **_device(dtype))
+
+    C_ref = _te_general_gemm(A, B, out_dtype=dtype, layout="TN")[0]
+
+    # Batch-invariant inside context
+    with set_batch_invariant_mode(True):
+        C_bik = _te_general_gemm(A, B, out_dtype=dtype, layout="TN")[0]
+
+    torch.testing.assert_close(C_bik, C_ref, **_tols(dtype))
diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py
index 2236182a751..effc6f6d91e 100644
--- a/tools/bert_embedding/embed.py
+++ b/tools/bert_embedding/embed.py
@@ -1,18 +1,20 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 
 from functools import partial
+from types import SimpleNamespace
+from typing import Callable, Dict, List, Optional, Tuple, TypedDict
 import numpy as np
 import os
 import time
 import torch
+from torch.distributed import ProcessGroup
 from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, Subset
 from torch.utils.data._utils.collate import default_collate
-from tqdm import tqdm
 
 from megatron.training import get_args, get_tokenizer, print_rank_0
 from megatron import core
 from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core.datasets.retro.utils import get_blocks_by_rank
+from megatron.core import parallel_state
 from megatron.core.enums import ModelType
 from megatron.core.pipeline_parallel import get_forward_backward_func
 from megatron.legacy.model import BertModel
@@ -23,6 +25,20 @@
 from .external_libs import h5py
 from .huggingface import HuggingfaceEmbedder
 
+try:
+    from tqdm import tqdm
+
+    HAVE_TQDM = True
+except ImportError:
+    HAVE_TQDM = False
+
+try:
+    import h5py
+
+    HAVE_H5PY = True
+except ImportError:
+    HAVE_H5PY = False
+
 
 def collate_batch(samples):
     """Collate samples of various lengths.
@@ -127,6 +143,211 @@ def embed_data_loader(models, data_loader, tag):
     return embeddings
 
 
+class Block(TypedDict):
+    """Specific block arg type to mute mypy."""
+
+    range: Tuple[int, int]
+    path: str
+
+
+def get_blocks(
+    dirname: str, n_samples: int, block_size: int, validate: Optional[Callable] = None
+) -> SimpleNamespace:
+    """Divide range [0, num_samples) to sequence of block ranges.
+
+    This is a core method within the concept of block processing. The idea
+    is to divide a range (size n_samples) into a sequence of blocks. Each
+    block corresponds to a file within 'dirname' with name
+    '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of
+    these files, and returns two lists, one for existing blocks and one for
+    missing blocks.
+
+    Args:
+        dirname (str): Path to directory containing block files.
+        n_samples (int): Ideal number of samples.
+            The total number of saved block data is <=n_samples.
+        block_size (int): Max number of samples per block file (e.g., 100000).
+        validate (Callable): Method for validating each block file during load.
+
+    Returns:
+        A namespace consisting of 2 lists: existing blocks, and missing blocks.
+        The total number of samples between the existing and missing blocks should
+        equal n_samples above.
+    """
+
+    if not HAVE_TQDM:
+        raise ImportError("tqdm is required to use the BertDataset. Please install tqdm.")
+
+    if not HAVE_H5PY:
+        raise ImportError("h5py is required to use the BertDataset. Please install h5py.")
+
+    assert os.path.isdir(dirname), "missing directory '%s.'" % dirname
+
+    # Block ranges.
+    block_start_idxs = list(range(0, n_samples, block_size))
+    block_end_idxs = [min(n_samples, i + block_size) for i in block_start_idxs]
+    block_ranges = list(zip(block_start_idxs, block_end_idxs))
+
+    # All block files (existing + missing).
+    n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1)
+
+    all_blocks: List[Block] = [
+        {
+            "range": r,
+            "path": os.path.join(
+                dirname, "%s-%s.hdf5" % tuple([str(i).zfill(n_digits) for i in r])
+            ),
+        }
+        for r in block_ranges
+    ]
+    all_block_path_set = set(block["path"] for block in all_blocks)
+
+    # Validate function.
+    validate = (lambda f: None) if validate is None else validate
+
+    # Delete corrupt files.
+    if torch.distributed.get_rank() == 0:
+        existing_block_paths = [
+            block["path"] for block in all_blocks if os.path.exists(block["path"])
+        ]
+        for index, path in enumerate(tqdm(existing_block_paths, "validating block.")):
+            assert path in all_block_path_set, "unexpected filename, '%s'." % path
+
+            try:
+                f = h5py.File(path, "r")
+            except Exception:
+                os.remove(path)
+                continue
+
+            try:
+                validate(f)
+            except Exception:
+                os.remove(path)
+            finally:
+                f.close()
+
+    # Wait for files to be deleted.
+    torch.distributed.barrier()
+
+    # Collect blocks.
+    blocks = SimpleNamespace(
+        existing=[b for b in all_blocks if os.path.exists(b["path"])],
+        missing=[b for b in all_blocks if not os.path.exists(b["path"])],
+    )
+
+    return blocks
+
+
+def get_blocks_by_rank(
+    dirname: str,
+    n_samples: int,
+    block_size: int,
+    validate: Optional[Callable] = None,
+    sample: Optional[float] = None,
+    process_group: Optional[ProcessGroup] = None,
+) -> SimpleNamespace:
+    """Divide existing and missing blocks evenly across all ranks.
+
+    See 'get_blocks()' above for description. The returned lists of existing and
+    missing blocks are split evenly across ranks via interleaving. This way,
+    each rank has a roughly equal number of blocks to process for a
+    downstream operation.
+
+    Args:
+        dirname (str): Path to directory containing block files.
+        n_samples (int): Ideal number of samples. The total number of saved block data
+            is <=n_samples.
+        block_size (int): Max number of samples per block file (e.g., 100000).
+        validate (Callable): Method for validating each block file during load.
+        sample (Optional[float]): If provided, sample a random subset of the blocks.
+            Used for validating preprocessing correctness.
+        process_group (Optional[ProcessGroup]): Process group for distributed operations.
+            If None, uses data parallel group.
+
+    Returns:
+        A namespace consisting of 2 lists: existing blocks, and missing blocks.
+        Each of these two lists is potentially a sub-sample of the total set of
+        existing and missing blocks, depending on whether sampling is used.
+        Additionally, the attributes n_existing_world and n_missing_world are the
+        total number of existing and missing blocks, independent of samples.
+        Therefore, (n_existing_world + n_missing_world) * block_size == n_samples.
+    """
+
+    if process_group is None:
+        process_group = parallel_state.get_data_parallel_group()
+
+    # Get world blocks.
+    blocks = get_blocks(dirname, n_samples, block_size, validate)
+
+    # This rank's existing and missing files.
+    rank_existing_blocks = blocks.existing[
+        process_group.rank() : len(blocks.existing) : process_group.size()
+    ]
+    rank_missing_blocks = blocks.missing[
+        process_group.rank() : len(blocks.missing) : process_group.size()
+    ]
+
+    # Extend rank's existing and missing blocks (with None) such that all ranks
+    # have equal length lists. This allows for easier tracking of global progress.
+    def get_world_max(n: int) -> int:
+        """Get max value across ranks.
+
+        Args:
+            n (int): Value on this rank.
+
+        Returns:
+            Max value across all ranks.
+        """
+        n_tensor = torch.cuda.LongTensor([n])
+        torch.distributed.all_reduce(n_tensor, op=torch.distributed.ReduceOp.MAX)
+        return n_tensor.item()
+
+    max_n_existing = get_world_max(len(rank_existing_blocks))
+    max_n_missing = get_world_max(len(rank_missing_blocks))
+
+    rank_existing_blocks += [None] * (max_n_existing - len(rank_existing_blocks))
+    rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks))
+
+    # Collect blocks.
+    blocks = SimpleNamespace(
+        n_existing_world=len(blocks.existing),
+        n_missing_world=len(blocks.missing),
+        existing=rank_existing_blocks,
+        missing=rank_missing_blocks,
+    )
+
+    if sample is not None:
+        # Sample existing and missing blocks evenly across all ranks. The
+        # returned lists of blocks are randomly sampled (without replacement)
+        # to yield `sample * len(blocks)` number of blocks.
+
+        # Randomly sample blocks.
+        def sample_blocks(_blocks: List[Optional[Dict]]) -> List[Optional[Dict]]:
+            """Sample a random subset of all blocks.
+
+            Args:
+                _blocks (List[Optional[Dict]]): List of all blocks.
+
+            Returns:
+                A random subset of the blocks.
+            """
+            n_blocks_sample = int(np.ceil(sample * len(_blocks)))
+            sampled_blocks: List[Optional[Dict]] = [b for b in _blocks if b is not None]
+
+            np.random.seed(None)
+            np.random.shuffle(sampled_blocks)
+
+            sampled_blocks = sampled_blocks[:n_blocks_sample]
+            sampled_blocks += [None] * (n_blocks_sample - len(sampled_blocks))
+
+            return sampled_blocks
+
+        blocks.existing = sample_blocks(blocks.existing)
+        blocks.missing = sample_blocks(blocks.missing)
+
+    return blocks
+
+
 class TextDataset(torch.utils.data.Dataset):
     '''Dataset that holds a list of strings.'''
 
diff --git a/tools/build_sequences_per_dataset.py b/tools/build_sequences_per_dataset.py
new file mode 100644
index 00000000000..e2787dd6434
--- /dev/null
+++ b/tools/build_sequences_per_dataset.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""
+Script to build a json file with the sequences per dataset to use with the --per-dataset-sequences-path. Accepts the same arguments as the training script.
+
+Usage:
+python3 tools/build_sequences_per_dataset.py --per-split-data-args-path my-training-dataset-blend.json --per-dataset-sequences-path my-training-dataset-blend-sequences-per-dataset.json
+
+"""
+
+import argparse
+import json
+from typing import Optional, Tuple, List
+
+
+from megatron.core.datasets.indexed_dataset import _IndexReader
+from megatron.training.utils import get_blend_and_blend_per_split
+
+def get_paths_from_blend(
+    blend: Optional[Tuple[List[str], Optional[List[float]]]],
+    blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]],
+) -> List[str]:
+    """Extract all dataset paths from blend and blend_per_split.
+
+    Args:
+        blend (Optional[Tuple[List[str], Optional[List[float]]]]): A blend tuple containing
+            a list of dataset paths and optionally a list of weights, e.g.,
+            (["path/to/dataset_1", "path/to/dataset_2"], [0.3, 0.7])
+        blend_per_split (Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]): 
+            A list of 3 blend tuples (for train, valid, test splits), where each element has 
+            the same structure as blend
+
+    Returns:
+        List[str]: A list of all unique dataset paths found in blend and blend_per_split
+    """
+    paths = []
+    
+    # Extract paths from blend
+    if blend is not None:
+        paths_list, _ = blend
+        paths.extend(paths_list)
+    
+    # Extract paths from blend_per_split
+    if blend_per_split is not None:
+        for split_blend in blend_per_split:
+            if split_blend is not None:
+                split_paths, _ = split_blend
+                paths.extend(split_paths)
+    
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_paths = []
+    for path in paths:
+        if path not in seen:
+            seen.add(path)
+            unique_paths.append(path)
+    
+    return unique_paths
+
+def build_sequences_per_dataset(args):
+    print("Building sequences per dataset...")
+
+    blend, blend_per_split = get_blend_and_blend_per_split(args)
+
+    file_prefixes = get_paths_from_blend(blend, blend_per_split)
+
+    print(f"Number of unique file prefixes: {len(file_prefixes)}")
+
+    sequence_count_dict = {}
+    for file_prefix in file_prefixes:
+        # NOTE(asolergi-nv): For every file prefix, read index file and get the number of sequences and documents
+        index_reader = _IndexReader(file_prefix + ".idx", False)
+        count = (index_reader.sequence_count, index_reader.document_count)
+        sequence_count_dict[file_prefix] = count
+
+    return sequence_count_dict
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data-path', nargs='*', default=None,
+                       help='The weight and prefix list for a set of train, validation, and test'
+                       'datasets which split according to --split. The accepted formats are: '
+                       '(1) a single prefix, '
+                       '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, '
+                       '(3) a list of prefixes e.g. prefix1 prefix2. '
+                       'For (3), weights are inferred from the lengths of the contributing datasets. '
+                       'This argument is exclusive to the other independent --*-data-path arguments.')
+    parser.add_argument('--train-data-path', nargs='*', default=None,
+                       help='The weight and prefix list for an independent train dataset. '
+                       'Follows the same pattern rules as --data-path.')
+    parser.add_argument('--valid-data-path', nargs='*', default=None,
+                       help='The weight and prefix list for an independent validation dataset. '
+                       'Follows the same pattern rules as --data-path.')
+    parser.add_argument('--test-data-path', nargs='*', default=None,
+                       help='The weight and prefix list for an independent test dataset. '
+                       'Follows the same pattern rules as --data-path.')
+    parser.add_argument('--data-args-path', type=str, default=None,
+                       help='Path to data-args. Instead of feeding `--data-path` '
+                       'with weighted dataset, we pass in a file path from which '
+                       'we read that argument. This is useful when the list of data is '
+                       'too big.')
+    parser.add_argument('--per-split-data-args-path', type=str, default=None,
+                       help='Path to per-split-data-args. Instead of feeding '
+                       '`--(train|valid|test)-data-path` with weighted dataset, '
+                       'we pass in a file path from which we read those arguments. '
+                       'This is useful when the list of data is too big. Format is a '
+                       'json file with `train`, `valid, `test` keys')
+    parser.add_argument('--per-dataset-sequences-path', type=str, required=True,
+                       help='Path to the output json file with the sequences per dataset.')
+    args = parser.parse_args()
+
+    sequence_count_dict = build_sequences_per_dataset(args)
+
+    with open(args.path_to_sequences_per_dataset_json, "w") as f:
+        json.dump(sequence_count_dict, f)
+
+    print(f"Done! Saving --path-to-sequences-per-dataset file to {args.path_to_sequences_per_dataset_json}")
\ No newline at end of file
diff --git a/tools/checkpoint/checkpoint_inspector.py b/tools/checkpoint/checkpoint_inspector.py
index c62f0ca7417..3d03f4db959 100644
--- a/tools/checkpoint/checkpoint_inspector.py
+++ b/tools/checkpoint/checkpoint_inspector.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 # python checkpoint_inspector.py inspect /path/to/checkpoint
 # torchrun --nproc_per_node=8 --nnodes=1 checkpoint_inspector.py convert-torch-dist-to-fsdp-dtensor /path/to/input_checkpoint /path/to/output_checkpoint --swiglu
 import gc
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 7d382a0d134..03e5329459c 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -54,7 +54,7 @@ def __init__(self, args):
     def initializer(self):
         # Use Encoder class as a container for global data
         if self.args.legacy_tokenizer:
-            tokenizer = build_tokenizer(self.args)
+            Encoder.tokenizer = build_tokenizer(self.args)
         else:
             Encoder.tokenizer = build_new_tokenizer(self.args)
         if self.args.split_sentences:
diff --git a/tools/retro/README.md b/tools/retro/README.md
deleted file mode 100644
index 395005e73bf..00000000000
--- a/tools/retro/README.md
+++ /dev/null
@@ -1,256 +0,0 @@
-# Retro and InstructRetro
-
-Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM)
-pretrained with retrieval-augmentation.
-Retro features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of
-tokens.
-Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing
-factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving
-lower perplexity than standard GPT.
-Retro also provides the flexibility to update the
-knowledge stored in LMs [(Wang et al., 2023a)](https://arxiv.org/abs/2304.06762)
-by updating the retrieval database without training LMs again.
-
-InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B,
-featuring the largest LLM pretrained with retrieval (as of December 2023).
-The obtained foundation model, Retro 48B, largely outperforms the GPT counterpart in terms of perplexity.
-With instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction tuned GPT on
-downstream tasks in the zero-shot setting. Specifically, the average improvement of InstructRetro is 7% over its GPT
-counterpart across 8 short-form QA tasks, 10% over GPT across 4 challenging long-form QA tasks, and 16% over GPT across
-3 summarization tasks. We also find that one can ablate the encoder from InstructRetro architecture and directly use the
-InstructRetro decoder backbone as GPT, while achieving comparable results.
-
-This README provides an end-to-end tutorial to reproduce Retro and InstructRetro.
-
-# Contents
-
-* [Checkpoints](#checkpoints)
-* [End-to-end Reproduction Guide](#end-to-end-reproduction-guide)
-    * [Step 0: Prepare the environment](#step-0-prepare-the-environment)
-        * [Docker image](#docker-image)
-        * [Install dependencies](#install-dependencies)
-    * [Step 1: Build retrieval database](#step-1-build-retrieval-database)
-    * [Step 2: Pretraining](#step-2-pretraining)
-    * [Step 3: Perplexity evaluation](#step-3-perplexity-evaluation)
-    * [Step 4: Instruction tuning](#step-4-instruction-tuning)
-    * [Step 5: Downstream task evaluation](#step-5-downstream-task-evaluation)
-* [Citations](#citations)
-
-# Checkpoints
-
-We provide the pretrained checkpoints of Retro and InstructRetro in the following table. The checkpoints are available
-to download through the following links:
-
-| Model                   | Size | Instruction Tuning | Download Link 1                                                    | Download Link 2                                                                | Download Link 3                                                                                      |
-|-------------------------|------|--------------------|--------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
-| `retro-8b-base-4k`      | 8b   |                    | [Huggingface](https://huggingface.co/nvidia/retro-8b-base-4k)      | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-base-4k)      | [Google Drive](https://drive.google.com/drive/folders/1uSQ5DAsuvx_8XcbtnVfs_MGvEOcx0uK_?usp=sharing) |
-| `retro-8b-instruct-4k`  | 8b   | ✅                  | [Huggingface](https://huggingface.co/nvidia/retro-8b-instruct-4k)  | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-instruct-4k)  | [Google Drive](https://drive.google.com/drive/folders/1v5dKaSN0cm2lwyAWpFaJtlTrLhtMZXsI?usp=sharing) |
-| `retro-48b-base-4k`     | 48b  |                    | [Huggingface](https://huggingface.co/nvidia/retro-48b-base-4k)     | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-base-4k)     | [Google Drive](https://drive.google.com/drive/folders/1rtNpf0CiLElSHQcr3aLI3zgfI3teGTP5?usp=sharing) |
-| `retro-48b-instruct-4k` | 48b  | ✅                  | [Huggingface](https://huggingface.co/nvidia/retro-48b-instruct-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-instruct-4k) | [Google Drive](https://drive.google.com/drive/folders/1qdb0AQjSsAPGlWaIu3wgHPjf_nwLeY5h?usp=sharing) |
-
-# End-to-end Reproduction Guide
-
-In this README, we provide an end-to-end reproduction guide for InstructRetro, covering from large-scale retrieval
-construction, pretraining, perplexity evaluation, instruction tuning, to downstream task evaluation.
-
-If you are interested in evaluation only, we also [open-sourced our checkpoints](#checkpoints) and you can directly go
-to [Step 5](#step-5-downstream-task-evaluation) to evaluate the checkpoints on downstream tasks.
-
-## Step 0: Prepare the environment
-
-We recommend using docker environment to run the code.
-
-### Docker image
-
-We provide a docker build file in [tools/retro/examples/Dockerfile](examples/Dockerfile) for the reproduction. The
-docker image is based on the [NGC docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) `nvcr.io/nvidia/pytorch:23.09-py3`.
-
-### Install dependencies
-
-Clone the Megatron repo:
-
-```bash
-git clone --branch InstructRetro https://github.com/NVIDIA/Megatron-LM.git
-```
-
-If docker is not available, we recommend starting from a clean conda environment with the following runtime
-dependencies:
-
-- Python 3.10
-- NVIDIA CUDA® 12.2.1
-- NVIDIA cuBLAS 12.2.5.6
-- NVIDIA cuDNN 8.9.5
-- NVIDIA NCCL 2.18.5
-- PyTorch 2.1.0a0+32f93b1
-
-Then install Retro-specific dependencies, including:
-
-```bash
-pip install -U faiss-gpu
-pip install -U transformers
-pip install -U sentencepiece
-pip install -U h5py
-pip install -U nltk
-pip install -U einops
-```
-
-## Step 1: Build retrieval database
-
-In this step, we build a large-scale retrieval database for InstructRetro
-through [Faiss](https://github.com/facebookresearch/faiss) to retrieve from trillions of tokens, and preprocess (and
-save) the retrieval neighbors for the pretraining step.
-
-Please refer to [tools/retro/build_db.md](build_db.md) for more details.
-
-## Step 2: Pretraining
-
-*Please strictly follow Step 1 to build the retrieval database before pretraining to make sure the preprocessed
-retrieval neighbors match the pretraining corpus.*
-
-In the pretraining step, we support both pretraining from scratch and continued pretraining from a pretrained GPT model.
-
-We provide a template pretraining script to pretrain 843M Retro from scratch. Prepare your own arguments and update our
-templates in [tools/retro/examples/pretrain_model.sh](examples/pretrain_model.sh). Please note that the data path should
-be exactly matching the one used in Step 1 to make sure the preprocessed retrieval neighbors match the pretraining
-corpus.
-
-[//]: # (Take the example of the Wikipedia corpus)
-
-```bash
-bash tools/retro/examples/pretrain_model.sh
-```
-
-After pretraining, the model checkpoints will be saved in the `--save` directory if you specified the arg
-in `pretrain_model.sh`.
-
-To continue pretraining with retrieval from a pretrained GPT model, please specify `--load` in `pretrain_model.sh` to
-load the pretrained GPT model checkpoint (the architecture of GPT, including hidden size, number of layers, and
-activation methods, should be exactly the same as the one used for Retro). You should also
-specify  `--no-load-optim --finetune` to make sure the optimizer state is not loaded from the pretrained GPT model and
-the continued pretraining with retrieval is from a clean start. After the first job / the first run, you will continue
-pretraining with retrieval from your last checkpoint. In the follow-up jobs, you should launch the pretraining without
-the flags `--no-load-optim --finetune` to make sure the optimizer state is correctly loaded from your last job.
-
-## Step 3: Perplexity evaluation
-
-During pretraining, we will automatically evaluate the model perplexity on the specified validation corpus
-every `--eval-interval` steps. The validation corpus should be exactly the same as the one used in Step 1 to make sure
-the preprocessed retrieval neighbors match the pretraining corpus.
-
-To evaluate the perplexity of a pretrained model, please add `--skip-train` in `pretrain_model.sh` to skip the
-pretraining step and only evaluate the perplexity of the model specified in `--load` on the validation corpus. Run the
-above command again to evaluate the perplexity of a pretrained model:
-
-```bash
-bash tools/retro/examples/pretrain_model.sh
-```
-
-## Step 4: Instruction tuning
-
-In this step, we fine-tune the pretrained model on the downstream task with instructions. We provide a template
-instruction tuning script to fine-tune 843M Retro.
-
-We also provide an open-source blend of instruction tuning datasets. The dataset is available to download
-through [here](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing). The blendable
-dataset consists of the following open-source instruction tuning datasets:
-
-### Instruction Tuning Dataset Breakdown
-
-| Dataset                                                    | Samples | Epochs | Sampling Prob |
-|------------------------------------------------------------|--------:|-------:|--------------:|
-| [soda](https://arxiv.org/abs/2212.10465)                   |    2560 |  0.005 |         0.020 |
-| [eli5](https://arxiv.org/abs/1907.09190)                   |    2561 |  0.055 |         0.020 |
-| [self_instruct_short](https://arxiv.org/abs/2212.10560)    |    1280 |  0.043 |         0.010 |
-| [self_instruct_long](https://arxiv.org/abs/2212.10560)     |    2560 |  0.333 |         0.020 |
-| [unnatural-instructions](https://arxiv.org/abs/2212.09689) |    2560 |  0.024 |         0.020 |
-| [flan_cot](https://arxiv.org/abs/2210.11416)               |    1280 |  0.093 |         0.010 |
-| [dolly](https://arxiv.org/abs/2305.13735)                  |    6400 |  0.938 |         0.050 |
-| [oasst-skip-noncode](https://open-assistant.io/)           |  104558 |  1.839 |         0.817 |
-| [oasst-skip-code](https://open-assistant.io/)              |    4243 |  1.839 |         0.033 |
-
-Refer to the paper links above for more details about each instruction tuning dataset.
-
-*We note that the provided instruction tuning dataset is all from open-source instruction tuning datasets. It is
-slightly different from what we use in [InstructRetro](https://arxiv.org/abs/2310.07713), which contains private and
-proprietary datasets. Thus a 1-2% accuracy difference in downstream tasks may be expected.*
-
-### Instruction tuning script
-
-Download
-the [blended instruction tuning dataset](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing)
-in your data home directory `$DATA_HOME` and update our templates
-in [tools/retro/sft/sft_retro_lm.sh](sft/sft_retro_lm.sh).
-
-An example command to run instruction tuning on 843M Retro is as follows:
-
-```bash
-                                      [blend-dataset-name] [model-size] [batch-size]  [lr]    [checkpoints]
-bash tools/retro/sft/sft_retro_lm.sh       open_inst               843m            128    5e-6  <path/to/pretrained/retro>
-```
-
-The `blend_dataset_name` argument will blend all the datasets within the `$DATA_HOME` following the weights and
-configurations specified in the `${blend_dataset_name}.sh` ([open_inst.sh](sft/open_inst.sh) in the example above).
-The checkpoints will be saved in the `--save` directory. For example, it will be saved to
-`<SFT_HOME>/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6`.
-
-## Step 5: Downstream task evaluation
-
-In this step, we demonstrate how to run InstructRetro for zero-shot evaluation on downstream question answering (QA)
-tasks. We provide the pre-processed open-source evaluation datasets with a unified format for different tasks. The
-evaluation datasets used in our paper are available to download
-through [here](https://drive.google.com/drive/folders/1xw-N0LJR_lIWnH6BKzHIb49quVCS_V72?usp=sharing). Please stick to
-the same retro workdir used in Step 0-4 to make sure the preprocessed retrieval neighbors match the pretraining corpus.
-If you directly come to Step 5, an example retro workdir with `args.json` for 800M Retro is
-provided [here](https://drive.google.com/file/d/121GqAdMvf8bJEBZRt-SD4uhW-SRWgI3s/view?usp=sharing). Note that the args
-in the json can be overwritten through the command line.
-
-We present an example command to run retro generation given the InstructRetro checkpoints and the Natural Question (NQ)
-task. The example command is for the 843m InstructRetro obtained in Step 4. Please specify the directory for the NQ
-dataset and update the command accordingly for other checkpoints.
-
-```bash
-bash tools/retro/text_generation/retro_generate.sh nq 843m greedy test  0 20000 1000 5 pp1 <SFT_HOME>/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6 2
-```
-
-The generated responses will be saved in the corresponding checkpoint directory. For example, for the 843m
-InstructRetro, it will be saved to
-`<SFT_HOME>/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6/retro-generate-nq_5_2_843m_test_greedy_0_20000_1000.txt`.
-
-To evaluate the F1 / Exact Match (EM) scores of the generated responses, we provide an example script to run the
-evaluation on the NQ dataset. Please specify the directory for the NQ dataset and update the command accordingly for
-other checkpoints and downstream tasks.
-
-```bash
-python3 tools/retro/text_generation/evaluate.py
-```
-
-# Citations
-
-See more details from our papers:
-
-[Shall we Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study.](https://arxiv.org/abs/2304.06762)
-
-_Boxin Wang, Wei Ping, Peng Xu, Lawrence McAfee, Zihan Liu, Mohammad Shoeybi, Yi Dong, Oleksii Kuchaiev, Bo Li, Chaowei
-Xiao, Anima Anandkumar, Bryan Catanzaro._ (EMNLP 2023)
-
-[InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining.](https://arxiv.org/abs/2310.07713)
-
-_Boxin Wang, Wei Ping, Lawrence McAfee, Peng Xu, Bo Li, Mohammad Shoeybi, Bryan Catanzaro._
-
-Please cite the papers as follows if you use the data or code from this repo:
-
-```bibtex
-@inproceedings{wang2023shall,
-    title   = {Shall We Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study},
-    author  = {Boxin Wang and Wei Ping and Peng Xu and Lawrence McAfee and Zihan Liu and Mohammad Shoeybi and Yi Dong and Oleksii Kuchaiev and Bo Li and Chaowei Xiao and Anima Anandkumar and Bryan Catanzaro},
-    journal = {The 2023 Conference on Empirical Methods in Natural Language Processing},
-    year    = {2023}
-}
-
-@article{wang2023instructretro,
-    title   = {InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining},
-    author  = {Boxin Wang and Wei Ping and Lawrence McAfee and Peng Xu and Bo Li and Mohammad Shoeybi and Bryan Catanzaro},
-    year    = {2023},
-    journal = {arXiv preprint arXiv: 2310.07713}
-}
-```
diff --git a/tools/retro/build_db.md b/tools/retro/build_db.md
deleted file mode 100644
index c99952485ab..00000000000
--- a/tools/retro/build_db.md
+++ /dev/null
@@ -1,421 +0,0 @@
-This directory contains a collection of tools for building the retrieval database and pretraining neighbors for Retro. This preprocessing pipeline is broken into 3 main stages:
-
-1. **Build retrieval chunk database** : Used for retrieving neighbors and continuation chunks, which are then passed through the retrieval encoder.
-2. **Build index for similarity search** : Train and build a search index for querying chunk neighbors.
-3. **Query pretraining neighbors** : For matching pretraining samples to database chunks. Neighbors are generated separately for training, validation, and test datasets.
-
-The following overview goes into more detail on the pipeline, code structure, usage, and pretraining.
-
-<!-- ################ contents ################ -->
-# Contents
-
-  * [Quick start](#quick-start)
-  * [Tutorial](#tutorial)
-  * [Code structure](#code-structure)
-  * [Arguments](#arguments)
-  <!-- * [Pretraining](#pretraining) -->
-
-<!-- ################ quick start ################ -->
-
-# Quick Start
-Key files:
-
-- `main.py` : Entry point for processing.
-- `examples/preprocess_data.sh` : Example preprocessing launch (calls `main.py`).
-- `examples/pretrain_data.sh` : Example pretraining launch (calls `pretrain_retro.py`).
-
-Use `--retro-tasks` to move through the preprocessing pipeline.
-
-- Simplest setup (builds everything): `--retro-tasks build`
-- Alternatively, for tuning compute resources, run stages independently:
-  - Build retrieval database: `--retro-tasks db-build`
-  - Build search index: `--retro-tasks index-build`
-  - Query neighbors: `--retro-tasks pretraining-query-neighbors`
-
-Sample code flow:
-
-- `main.py` : Entry point (e.g., using `--retro-tasks X`).
-- `db/build.py` : Build retrieval database.
-- `index/build.py` : Build search index. Calls the following two files:
-  - `index/train.py` : Train index on subset of database.
-  - `index/add.py` : Add database chunks to index.
-- `pretraining/query.py` : Query pretraining samples for database neighbors (saved to disk and used during pretraining).
-
-<!-- ################ tutorial ################ -->
-
-# Tutorial
-
-In this tutorial example, we use the Wikipedia corpus to demonstrate how we build a retrieval database and index for this corpus, and then query the pretraining datasets for their neighbors.
-
-## Step 1: Prepare your retrieval text corpus
-
-The format of text corpus follows the same format as in Megatron training. See [data precessing](../../README.md#data-preprocessing) for more details on how to convert your json dataset into the mmap format.
-
-Assume we have the Wikipedia corpus in the following format:
-
-```
-<retrieval/db/path>/Wikipedia_shuf_text_document.bin
-<retrieval/db/path>/Wikipedia_shuf_text_document.idx
-```
-
-We note that the retrieval database can also be a blend of multiple text corpus.
-
-## Step 2: Build retrieval chunk database
-
-This *database* (stored as a 2-D array, NOT a relational database) consists of a list of chunks (traditionally length 64) extracted from the original GPT token dataset. This is simply a consecutive, non-overlapping chunking of the token dataset. Chunking only takes place within a document, and therefore the final chunk of each document has length: 1 <= chunk_length <= max_chunk_length.
-
-We discard chunks that would convert to an empty Bert sequence (rare case, happens ~1/100,000 chunks in our case), since we use Bert embeddings for building our index. Thus, the total number of chunks in the database will be slightly less than a naive calculation.
-
-Take the Wikipedia corpus as an example to build the retrieval chunk database:
-
-Prepare the following arguments and update our templates in [tools/retro/examples/preprocess_data.sh](examples/preprocess_data.sh):
-- `--retro-workdir`: The directory in which the preprocessing pipeline saves its datasets and configuration files. 
-  **This argument should remain consistent for a full pass through the pipeline, and for pretraining.**
-- `--data-path`: text corpus path to build retrieval database. In the case of Wikipedia corpus, it could be
-```bash
-WIK="${DATA_HOME}/Wikipedia_shuf_text_document"
-
-DATA_BLEND=" \
-  1 ${WIK} \
-"
-```
-- `--load`: bert path to load bert embedder
-- `--vocab-file` and `--retro-bert-vocab-file`: bert vocab file
-- `--retro-gpt-tokenizer-model`: gpt tokenizer model file
-
-Then launch the script:
-```bash
-bash tools/retro/examples/preprocess_data.sh db-build
-```
-
-After the `db-build` is finished, the output includes:
-- The launching args will be saved in your `<retro-workdir>/args.json` for the following steps. 
-- The retrieval chunk database will be saved in your `<retro-workdir>/db/` with your dataset information in `<retro-workdir>/db/indexed_dataset_infos.json`.  
-
-## Step 3: Build index for similarity search
-
-To match pretraining chunks to database chunks, a search index must be built to perform this querying. We use Faiss (https://github.com/facebookresearch/faiss) for training and building this index. Generally, the index is trained on a subset of all chunks in the database (specified via `--retro-index-ntrain`). After training, all chunks are added into the index, to be available during querying.
-
-Indexes only accept 1-D floating point vectors for training and adding, so each chunk must first be embedded before passing to the index for either training or adding. We use Bert embeddings for this purpose, and the embeddings are generated automatically within the pipeline.
-
-Take the Wikipedia corpus as an example to build the retrieval chunk database:
-
-```bash
-bash tools/retro/examples/preprocess_data.sh index-train
-```
-The `index-train` step is expected to take less than 4-hour on a single DGX-A100 node given the template index configuration. 
-To scale up for larger retrieval database, please carefully tune the faiss hyper-parameters specified in `--retro-index-str`. Please refer to [Faiss](https://github.com/facebookresearch/faiss/wiki/The-index-factory) to learn more about the index configuration.  
-
-After the index is trained, the centroids, HNSW graph, and product quantizer is determined. However, the index is still empty, as there is no chunk added.
-
-Take the example of the Wikipedia corpus, with the default template, the output of `index-train` includes:
-- The embedded Bert embeddings of the sampled chunks for `index-train` is saved in `<retro-workdir>/index/train_emb/`.  
-- The empty index is saved in `<retro-workdir>/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/empty_0.970.faissindex`.
-
-Then we add all chunks in the retrieval database into the index so that we perform fast query over the whole retrieval database:
-```bash
-bash tools/retro/examples/preprocess_data.sh index-add
-```
-
-We note that this step can be time-consuming as it will go through the whole retrieval database, embed chunk tokens  to BERT embeddings, and add them into the index. Please make sure you successfully add the whole retrieval database before moving on to the next stage.
-
-*In case your job is interrupted in the middle, you can just run the script again, and it will automatically skip the chunks that have been added into the index and start from the chunk where it is interrupted.*
-
-
-Following the Wikipedia configuration, an example output of the step `index-add` includes:
-- The index with retrieval data chunks added is saved in `<retro-workdir>/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/added_0.970_0.950.faissindex`, which can be used to query the neighbors for pretraining.
-
-## Step 4: Query pretraining neighbors
-
-To ensure fast Retro pretraining, the database neighbors for pretraining samples are pre-computed and saved to disk, for efficient access within the Retro dataset. In this stage, the pretraining datasets (training, validation, and test) are iterated, each sample is broken into chunks, and the chunks are used for querying the index. Similar to when building the index, each chunk is embedded (via Bert) before querying the index.
-
-The saved neighbors are labeled with unique dataset properties (i.e., seed, sequence length, number of samples, etc.) to ensure the neighbors generated during preprocessing match the neighbors requested during pretraining. Please also make sure the pretraining configuration is the same as this step so that the neighbors are aligned.
-
-There are query-time hyper-parameters that can be tuned to improve the quality of the neighbors. These are specified in `RETRO_QUERY_EF_SEARCH` and `RETRO_QUERY_NPROBE`. The most important parameter is `RETRO_QUERY_NPROBE`, which controls the number of clusters to search during querying. This parameter can be tuned to improve the quality of the neighbors, but will also increase the query time. 
-We recommend following the tutorial of [faiss](https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning) to tune the hyper-parameters for your own retrieval database. 
-
-Take the Wikipedia corpus as an example to query the neighbors in the retrieval database:
-
-```bash
-bash tools/retro/examples/preprocess_data.sh query-pretraining-neighbors
-```
-
-The output of `query-pretraining-neighbors` on the Wikipedia corpus includes:
-- `<retro-workdir>/wiki/query/train_855ab50e05151610301e2a74c4030fbc`, which contains the pre-retrieved neighbors for the pretraining dataset. 
-- `<retro-workdir>/wiki/query/valid_40bc7330318d64accec28e1e63c59bad`, which contains the pre-retrieved neighbors for the validation set of the pretraining corpus.
-
-## Step 5: Visualization of retrieval neighbors
-
-We also provide cli tools to help visualize and inspect the quality of your retrieved neighbors. 
-
-To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following:
-
-```
-from tools.retro.cli import retro
-retro.init("/path/to/retro/workdir")
-```
-
-This initializes Megatron, and prepares the Retro data for inspection. We also print out some example commands to help you get familiar with the command lines.   
-
-An example output for the Wikipedia Corpus:
-
-```text
-setting number of micro-batches to constant 32
-> building BertWordPieceLowerCase tokenizer ...
-> initializing torch distributed ...
-> initialized tensor model parallel with size 1
-> initialized pipeline model parallel with size 1
-> compiling dataset index builder ...
-...
-...
- > sample ratios:
-   dataset 0, input: 1, achieved: 1
-> size of blendable dataset: 201000 samples
-> elapsed time for building blendable dataset indices: 0.00 (sec)
-> building indices for blendable datasets ...
- > sample ratios:
-   dataset 0, input: 1, achieved: 1
-> size of blendable dataset: 12864 samples
-> finished creating pretrained GPT datasets ...
-
-+++++++++++++++++++++++++++++++++++++++++++++++++++
-examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ]
-+++++++++++++++++++++++++++++++++++++++++++++++++++
-
-~~~~ indexed datasets ~~~~
-retro.get_db_num_indexed_datasets() : 1
-retro.get_db_indexed_dataset_infos() :
-  [(1.000000, Wikipedia_shuf_text_document)]
-
-~~~~ counts ~~~~
-retro.get_db_num_chunks : 68104992.
-
-retro.get_pt_num_samples('train') : 201000.
-retro.get_pt_num_samples('valid') : 12864.
-retro.get_pt_num_chunks('train') : 1608000.
-retro.get_pt_num_chunks('valid') : 102912.
-
-~~~~ tokens, text ~~~~
-retro.get_db_chunk_gpt(chunk_id) : [46809, 218340, 716, 647, ... , 251525, 872, 692, 4042]
-retro.get_db_chunk_bert(chunk_id) : [10680, 16216, 4313, 1745 ... , 8117, 1007, 1012, 1997]
-retro.get_db_chunk_text(chunk_id) : Jonas Geirnaert\n\nJonas  ... ort Flatlife (11 min). Of
-retro.get_db_chunk_and_continuation_text(chunk_id) :
-  ['Jonas Geirnaert  Jonas Ge ... ort Flatlife (11 min). Of',
-   'the copy he sent in for s ... abet, clearly has one. On']
-
-retro.get_pt_sample('train', sample_id) :
-  {
-    'dataset_idx' : 0
-    'text' : [   676     14  40656 184 ... 4\n    276  17361 251542]
-    'doc_ids' : [1246422 1596948 2403969]
-    'neighbor_chunks' : [[[  657380   657381]\n   ... \n  [34108760 34108761]]]
-    'neighbor_tokens' : [[[   276   9596 251511 . ... .    889    646   1723]]]
-  }
-
-(e.g., sample = retro.get_pt_sample(...))
-
-  sample['text'].shape : (513,)
-  sample['neighbor_tokens'].shape : (8, 20, 128)
-  sample['text'] : [   676     14  40656 184 ... 4\n    276  17361 251542]
-  sample['neighbor_tokens'][17][1] : [    14     14  30291   1 ... 682    328    379 251527]
-  retro.gpt_to_text(sample['text']) : also\nLatgalians (modern) ... ission criticised the AVN
-  retro.gpt_to_text(sample['neighbor_tokens']) : \n\nHis second marriage o ... Augusta Eardley-Wilmot (2
-+++++++++++++++++++++++++++++++++++++++++++++++++++
-```
-
-We can also directly call the function `retro.print_neighbor_texts(sample_id, chunk_id)` to inspect the retrieval neighbors for a specific sample and chunk within the pretraining corpus. For example,  
-
-```text
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PRETRAINING CHUNK:
-  - also\nLatgalians (modern)\n\nReferences\n\nCategory:Defunct political parti ... e.\n\nAbout \nThe company was established established in 1997. It is listed
-NEIGHBOR_CHUNKS:
-  - the sides.\n\nNotes\n\nReferences\n\nCategory:Obaku Zen\n*\nCategory:Japane ... 2, 2008. It was founded by Anand Jagannathan, CEO of parent company Kriyari
-  - 2007).\n\nSee also\n Satellite Communications\n Tonga\n\nReferences\n\nExte ... y Procter & Gamble (P&G) in 1985 in order for P&G to compete in the "beauty
-  - Japan\nCategory:Fish of Russia\nCategory:Fish described in 1845 Mareco Inde ... lic Opinion (WAPOR)\n European Society for Opinion and Marketing Research (
-  - The current director of the company is Albert Bosch.\n\nSee also\n Coupon\n ...  some articles in Basque. Deia is the main product of the Editorial Iparrag
-  - A.Ş have been traded on the Istanbul Stock Exchange since 2000.\n\nReferenc ... with stores in California, New York City, and London.\n\nHistory \nSnapette
-  - \nCategory:Hawaiian mythology\nCategory:Hawaiian religion\nCategory:Religio ... crative state contracts. In 2008 Prokom became a part of the Asseco capital
-  - , and the Baltic countries, as well as an online store.\n\nReferences\n\nEx ... nd are involved in intracellular trafficking. This protein does not contain
-  - juice producer\nFood industry of Russia\n\nReferences\n\nExternal links\nWi ... panies formerly listed on the New York Stock Exchange General Grant's March
-  - is in private ownership.\n\nReferences\n\nExternal links\n\nCategory:Online ... ten and directed by Brent Hodge. The film stars Aubrey Plaza, Molly Hawkey,
-  - company's display technology to manufacture and sell display-only engines.\ ... for a group of naval vessels (a division in naval usage).\n\nUsage\n Russia
-  - .\n\nCarrols also operated a chain of outlets in neighbouring Estonia from  ... rama film directed by Raajeev Walia. It is produced by Aman Mehta and Bijal
-  - \n\nExternal links\nHightail website\nThe Next Web on YouSendIt rebrand to  ... eptember 2014, sitting mainly in the criminal division of that court.\n\nBe
-  - American television seasons\nCategory:2014 American television seasons\nCat ...  Canada and larger European cities.\n\nIn 2010, advertising in New Zealand,
-  - .\n\nNotes\n\nCategory:Trade unions\nCategory:Industrial Workers of the Wor ... x people, some of whom may have been working on a part-time basis. Its head
-  - \n List of podcasting companies\n\nReferences\n\nExternal links\n \n\nCateg ... ct.\n\nCategory:Populated places in the Ashanti Region Nkeirouka Ezekh\n\nN
-  - \n\nReferences\n\nExternal links\n ADESE official website\n\nCategory:Compa ...  State Street, and UBS Warburg. Its first CEO was Ian M. Drachman. The firm
-  - Hotel\n Sulake Corporation\n Sulake Press Room\n Habbo Hotel - Blog\n\nCate ... l: 김진태; born December 19, 1980), better known by his stage name Verbal Jint
-  - hockey player\n Ruutu.fi, a Finnish television streaming service operated b ...  from the bottom, a BDSM term\n Topping cycle, a cycle used in power plants
-  - of Surakarta\nCategory:Indonesian names\nCategory:Indonesian families\nCate ... mber 13, 2013 in Izhevsk on Universitetskaya Street (later it was given the
-  - facilities are also in Ankara and the company HQ is in Istanbul.\n\nReferen ... is currently a World Wide Web Consortium Working Draft.\n\nSee also\n Voice
-```
-
-The code snippet for the above example is also equivalent to
-```python
-tokens = retro.get_pt_sample('train', 0)
-for token_ids in tokens["neighbor_tokens"][0]:
-    print("- %s" % (retro.gpt_to_text(token_ids)))
-    print("-" * 20)
-```
-<!-- ################ code structure ################ -->
-# Code structure
-
-### `tools/retro/main.py`
-
-This is the main entry point for Retro preprocessing. Call `main.py --help` to see arguments. Additionally, some Retro arguments are in Megatron's core arguments, so also see `add_retro_args()` section of `megatron/arguments.py` for additional arguments. Two of the most important arguments to customize are `--retro-workdir` and `--retro-tasks`.
-
-- **`--retro-workdir`** : Set the directory in which the preprocessing pipeline saves its datasets and configuration files. This argument should remain consistent for a full pass through the pipeline, and for pretraining.
-
-- **`--retro-tasks`** : Set the stages of preprocessing to perform. As mentioned previously, the three high-level stages are: 1) build retrieval database, 2) build search index, and 3) query pretraining neighbors. `--retro-tasks` can be used to either run the full pipeline, or run each of these stages in isolation. The latter case is useful for tuning compute resources for each stage. For example, index training utilizes GPUs and requires relatively less time, while querying neighbors uses the CPU and is a relatively slow process. Example tasks include:
-
-  - **`--retro-tasks build`** : Run entire preprocessing pipeline.
-  - **`--retro-tasks db-build`** : Build retrieval database.
-  - **`--retro-tasks index-build`** : Train and build search index.
-  - **`--retro-tasks pretraining-query-neighbors`** : Query pretraining neighbors.
-
-Multiple tasks can be specified by separating with commas (e.g., `--retro-tasks db-build,index-build`). Additionally, various 'miscellaneous' tasks are currently including, primarily for validating data for each stage; these task names can be seen in `main.py`.
-
-### `tools/retro/examples`
-
-Example scripts for setting arguments and launch Retro preprocessing. The key files here are:
-
-- **`preprocess_data.sh`** : Example launch script for preprocessing retro data.
-- **`pretrain_model.sh`** : Example launch script for pretraining a retro model.
-
-### `tools/retro/db`
-
-Build the retrieval chunk database. The key files here are:
-
-- **`build.py`** : Entry point for building the database. This code is responsible for iterating the input datasets (i.e., `--data-path`), parsing each dataset into consecutive chunks, checking for empty Bert (Wordpiece) conversions, and storing this information to disk. Two databases are created: 1) the retrieval database, and 2) a sampled database used for training the search index.
-- **`dataset.py`** : Defines database class, for iterating or accessing chunks in the database. Each chunk contains its tokens, Bert conversion length, and dataset index.
-
-Input data:
-
-<!-- - Token datasets, as generated by `tools/preprocess_data.py`. Each dataset should include a `.bin` and `.idx` file. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`). -->
-- Token datasets, as loaded by `gpt_dataset.py`. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`).
-
-Output data:
-
-- **`<RETRO_WORKDIR>/db/merged/train.hdf5`** : The main retrieval database. (*Database* here is used to denote a list of indexed chunks, rather than a *relational database*.) The chunks in this database are added to the search index, and are used for retrieval during pretraining. This file contains a single dataset `'chunks'`, which contains 5 columns:
-
-  - `dataset_idx` : Dataset index, from list of blended indexed datasets.
-  - `document_idx` : Document index within dataset.
-  - `chunk_start_idx` : Chunk's starting token index within document.
-  - `chunk_end_idx` : Chunk's ending token index (exclusive) within document.
-  - `bert_chunk_length` : Length of Bert token sequence, after converting from GPT.
-
-- **`<RETRO_WORKDIR>/db/merged/sampled.hdf5`** : Subset of training database that is used for training the search index. This file has the same structure as detailed above. In general, this database is significanly smaller than the `train.hdf5` database, since the search index only needs a relatively small number of samples to understand the data's structure. After training, all chunks in the main database (`train.hdf5`) are *added* to the search index.
-
-### `tools/retro/index`
-
-Build the search index. The key files here are:
-
-- `build.py` : Entry point for building the search index. First, the index is trained on the sampled chunk database (see above) by calling `train.py`, and then all chunks for the full database are added to the index by calling `add.py`. Note that training requires first embedding (using Bert) all chunks (a parallel operation), and then loading these embeddings and training the index (a sequential operation), so it's best to change one's compute setup after all chunks have been embedded and saved to disk.
-- `indexes/faiss_base.py` : Wrapper class for building a Faiss index, following the standard `train()` and `add()` operations.
-- `indexes/faiss_par_add.py` : Similar to above, except it uses an embarrassingly parallel (multi-node, multi-process) `add()` operation. Vectors are first added to separate index copies, and then merged together.
-
-Input data:
-
-- **`<RETRO_WORKDIR>/db/merged/sampled.hdf5`** : Chunks used for training the search index.
-- **`<RETRO_WORKDIR>/db/merged/train.hdf5`** : Chunks used for adding to the *trained* search index.
-
-Output data:
-
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/added.faissindex`** : The final index, which has been trained and has had all database chunks added to it. This index is ready for querying neighbors. Here, `RETRO_INDEX_TYPE` and `RETRO_INDEX_STR` correspond to the same-name arguments `--retro-index-type` (e.g., `faiss-par-add`) and `--retro-index-str` (e.g., `OPQ32_256,IVF4194304_HNSW32,PQ32`).
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/empty.faissindex`** : Generally can be discarded once `added.faissindex` has been built, but this file contains the *post-training*, *pre-adding* index. Useful for debugging or building other indexes.
-
-### `tools/retro/pretraining`
-
-Query the pretraining datasets (training, validation, test) for their neighbors within the database. Neighbors are queried during preprocessing -- rather than during pretraining -- because querying is a fairly slow operation, so it would be a bottleneck if performed during pretraining. Queried neighbors are tagged with their unique identifying information (e.g., `train_indexmap_27662746ns_2048sl_1234s`), so as to avoid incorrect references during pretraining. The key files here are:
-
-- **`query.py`** : Entry point for querying. The pretraining datasets are iterated, and each chunk within each sample is queried using the search index. These neighbors are filtered by discarding any database chunks that fall within the same document as any chunk within a pretraining sample.
-- **`chunk_dataset.py`** : This creates an iterable 'chunk' dataset form of a pretraining dataset. This is just a light wrapper, but makes it easier to deterministically iterate and assign IDs to each chunk in a sample dataset.
-- **`retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample.
-
-Input data:
-
-- Token datasets, as loaded by `gpt_dataset.py`.
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/added.faissindex`** : The trained index, with all database chunks added to it (see previous section for details).
-
-Output data:
-
-- **`<RETRO_WORKDIR>/{train,valid,test}_XXns_YYsl_ZZs/WW.hdf5`** : These directories/files contain the indexes of neighbors for each chunk within each sample of the pretraining datasets. Each directory (e.g., `train_indexmap_2047435ns_2048sl_1234s`) contains a list of HDF5 files (e.g., one file might be called `0075700000-0075800000.hdf5`). Each HDF5 file contains a consecutive subset of neighbor IDs for a given chunk, for indexing into the main retrieval database. All HDF5 files taken together within a given directory, represent the entire set of neighbors for a dataset. The size of these HDF5 files is determined by the argument `--retro-block-size`. The `XX`, `YY`, `ZZ`, `WW` notation above denotes the dataset properties that are used for uniquely tagging the neighbor files, to ensure compatibility during model pretraining. These neighbor files are ultimated used by `retro_dataset.py` during pretraining, for building Retro samples.
-
-### `tools/retro/cli`
-
-Inspect preprocessed data. To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following:
-
-```
-from tools.retro.cli import retro
-retro.init("/path/to/retro/workdir")
-```
-
-This initializes Megatron, and prepares the Retro data for inspection. See the printed usage for available functions. Several routines are included for viewing data in the retrieval database and viewing pretraining samples and neighbors. For example:
-
-```python
-retro.get_db_num_indexed_datasets() # 15
-retro.get_db_chunk_text(92874113) # 'research project at ...  and philosophy'
-retro.get_pt_sample('train', 62005) # '[16084, 26158, 25387 ..., 6898, 9568]'
-```
-
-Most methods within the CLI are prefixed to denote the data being inspected:
-
-- **'db'** : Retrieval database (i.e., chunk tokens, document IDs, and dataset IDs)
-- **'pt'** : Pretraining datasets (i.e., sample tokens and neighbor tokens)
-
-### `tools/retro/utils.py`
-
-A collection of utility methods. Most importantly, this contains:
-
-- **`def get_gpt_tokenizer()`** : Get the GPT tokenizer.
-- **`def get_bert_tokenizer()`** : Get the Bert tokenizer.
-- **`class GPTToTextDataset`** : Wrapper class that converts GPT (BPE) samples to raw text.
-
-### `tools/bert_embedding`
-
-Generate Bert embeddings. The main files here are:
-
-- **`embed.py`** : Entry point for generating embeddings, and contains the two main embedding classes, `BertEmbedder` and `DiskDataParallelBertEmbedder` (more below). This file contains code for generating Megatron embeddings, while the file below contains code for Huggingface embeddings.
-- **`huggingface.py`** : Used by `embed.py` when the embedder is configured (see below) to output Huggingface embeddings.
-- **`dataset.py`** : Wrapper class for converting a raw-text dataset to Bert (Wordpiece) tokens.
-
-The Bert embeddings can be configured along two axes. The first axis is the output type:
-
-- **`class BertEmbedder`** : This class takes a raw-text dataset as input, generates its embeddings, and returns a Numpy array. The main functions are `embed_text_dataset` (accepts a raw-text dataset) and `embed_text` (accepts a string).
-- **`class DiskDataParallelBertEmbedder`** : This class wraps `BertEmbedder`, and rather than returning a Numpy array, it saves the embeddings to disk. Additionally, this class automatically splits data across data parallel ranks (using interleaving), and also processes data in a specified `block_size` (e.g., 1,000,000).
-
-The second axis is the type of embedding model to use, controlled by the argument `--bert-embedder-type`:
-
-- **`--bert-embedder-type megatron`** : Use Megatron's Bert model. The specific model used is dependent on the loaded checkpoint, vocab file, and tokenizer.
-- **`--bert-embedder-type huggingface`** : Use Huggingface's `bert-large-cased`. (*Note*: Huggingface's inclusion is likely to be deprecated; and there is no ability to configure cased/uncased.)
-
-### Pretraining
-
-- **`pretrain_retro.py`** : Launch script for pretraining Retro. Similar to `pretrain_gpt.py`, except this script handles loading neighbor tokens and setting up the neighbor attention mask.
-<!-- - `megatron/data/gpt_dataset.py` : ? -->
-- **`megatron/model/retro_transformer.py`** : Implementation of Retro model, including the main transformer, the retrieval encoder, and chunked cross-attention layers. Note that currently, `retro_transformer.py` contains several classes that are nearly identical to `transformer.py`, except for 1 or 2 lines, due to code changes that are yet to be integrated.
-- **`tools/retro/pretraining/retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample.
-
-
-<!-- ################ arguments ################ -->
-# Arguments
-
-See `tools/retro/main.py`'s `add_retro_args()` and `megatron/arguments.py`'s `_add_retro_args()` for details and descriptions. Here we list some particularly important arguments:
-
-- `--retro-workdir` : Mentioned previously, this argument determines the directory in which a set of Retro data is stored (during preprocessing) and loaded (during pretraining). Any change in this directory during preprocessing may result in preprocessing starting over from scratch, and any change before pretraining will result in pretraining throwing an error.
-- Preprocessing
-  - `--retro-gpt-chunk-length` : Retro chunk length (e.g., 64 in original paper).
-  - `--retro-tasks` : Comma-separated list of preprocessing tasks. Generally, the `build` task is the simplest way to run the preprocessing pipeline. For finer control, individual stages can be run by using tasks (in order): `db-build`, `index-build`, and `pretraining-query-neighbors`.
-  - `--retro-index-str` : Faiss index string that defines the index configuration. This will vary based on data size, compute/disk setup, and user needs. For example, this string looks something like `IVF262144_HNSW32,Flat` or `OPQ32_256,IVF4194304_HNSW32,PQ32`.
-- Pretraining
-  - `--retro-add-retriever` : Must be used to select Retro model.
-  - `--retro-num-neighbors` : Number of neighbors to retrieve from the retrieval database (defaults to 2).
-  - `--retro-num-retrieved-chunks` : For each neighbor, the number consecutive chunks to retrieve, including the initial neighbor (defaults to 2).
-  - `--retro-attention-gate` : Gated mechanism to incorporate information of cross attention from retrieved neighbor  (defaults to 1 during pretraining).
-
-<!-- ################ pretraining ################ -->
-<!-- # Pretraining -->
-<!-- - New retro args in arguments.py (add_retro_args). -->
-<!-- - Most important arg is `--retro-add-retriever`. -->
diff --git a/tools/retro/cli/__init__.py b/tools/retro/cli/__init__.py
deleted file mode 100644
index 2531017a28b..00000000000
--- a/tools/retro/cli/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-from .cli import retro
diff --git a/tools/retro/cli/__main__.py b/tools/retro/cli/__main__.py
deleted file mode 100644
index 37d096a9538..00000000000
--- a/tools/retro/cli/__main__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-import os
-
-from . import retro
-
-
-if __name__ == "__main__":
-    retro.init(os.environ["RETRO_PROJECT_DIR"])
diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py
deleted file mode 100644
index a5d953d2f7a..00000000000
--- a/tools/retro/cli/cli.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-import json
-import numpy as np
-import os
-import typing as T
-from types import SimpleNamespace
-
-from megatron.training.arguments import load_retro_config, parse_args, validate_args
-from megatron.core.datasets.retro.db.dataset import DBDataset
-from megatron.core.datasets.retro.db.utils import (
-    get_indexed_dataset_infos as get_db_indexed_dataset_infos,
-    get_merged_train_dataset as get_db_dataset,
-)
-from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets, RetroDataset
-from megatron.training.global_vars import set_global_variables
-from megatron.training.training import build_train_valid_test_datasets, update_train_iters
-from pretrain_retro import train_valid_test_datasets_provider
-from tools.retro.preprocess_data import get_tokenizers
-
-
-def shorten_str(s: str, n: int) -> str:
-    s = "\\n".join(s.splitlines())
-    return s if len(s) <= n else "%s ... %s" % (s[: n // 2], s[-n // 2 :])
-
-
-class retro:
-
-    config = None
-
-    ##############################################
-    # initialize.
-    ##############################################
-
-    @classmethod
-    def init(cls, project_dir: str) -> None:
-        '''Initialize Megatron, tokenizers, and datasets.'''
-
-        # Megatron args.
-        args = parse_args(extra_args_provider=None, ignore_unknown_args=False)
-        args.retro_project_dir = project_dir
-        args.micro_batch_size = 1
-        args.num_layers = 1
-        args.hidden_size = 1
-        args.num_attention_heads = 1
-        args.async_tensor_model_parallel_allreduce = False
-        args.retro_add_retriever = True # for building RetroDataset
-        validate_args(args)
-        set_global_variables(args)
-        update_train_iters(args)
-
-        # Retro config.
-        cls.config = load_retro_config(project_dir)
-        cls.config.retro_project_dir = project_dir
-        cls.config.retro_tokenizers = get_tokenizers(cls.config)
-
-        # Chunk database dataset.
-        cls.db_indexed_dataset_infos = get_db_indexed_dataset_infos(project_dir)
-        cls.db_dataset = get_db_dataset(project_dir,
-                                        cls.config.retro_gpt_chunk_length,
-                                        cls.config.retro_tokenizers.gpt.eod)
-
-        # Pretraining datasets.
-        pt_train_ds, pt_valid_ds, pt_test_ds = build_train_valid_test_datasets(
-            train_valid_test_datasets_provider)
-        cls.pt_datasets = SimpleNamespace(
-            train=pt_train_ds,
-            valid=pt_valid_ds,
-            test=pt_test_ds,
-        )
-
-        # Print usage.
-        cls.print_usage()
-
-    ##############################################
-    # utils.
-    ##############################################
-
-    @classmethod
-    def gpt_to_text(cls, token_ids: np.ndarray) -> str:
-        '''GPT tokens to text.'''
-        return cls.config.retro_tokenizers.gpt.detokenize(
-            token_ids.tolist() if isinstance(token_ids, np.ndarray) else token_ids
-        )
-
-    @classmethod
-    def text_to_bert(cls, text: str) -> np.ndarray:
-        '''Text to Bert tokens.'''
-        return cls.config.retro_tokenizers.bert.tokenize(text)
-
-    ##############################################
-    # chunk db.
-    ##############################################
-
-    @classmethod
-    def get_db_num_indexed_datasets(cls) -> int:
-        '''Number of indexed datasets within blended dataset.'''
-        return len(cls.db_indexed_dataset_infos)
-
-    @classmethod
-    def get_db_indexed_dataset_infos(cls) -> T.List[T.Tuple[float, str]]:
-        '''Dataset infos, including number of training & sampled sets.'''
-        return [(info["ratio"], info["prefix"]) for info in cls.db_indexed_dataset_infos]
-
-    @classmethod
-    def get_db_dataset(cls) -> DBDataset:
-        return cls.db_dataset
-
-    @classmethod
-    def get_db_num_chunks(cls) -> int:
-        '''Number of DB chunks.'''
-        return len(cls.get_db_dataset())
-
-    @classmethod
-    def get_db_chunk_gpt(cls, idx: int) -> T.List[int]:
-        '''Get DB chunk as GPT token ids.'''
-        return cls.get_db_dataset()[idx]["text"].tolist()
-
-    @classmethod
-    def get_db_chunk_bert(cls, idx: int) -> T.List[int]:
-        '''Get DB chunk as Bert token ids.'''
-        return cls.text_to_bert(cls.get_db_chunk_text(idx))
-
-    @classmethod
-    def get_db_chunk_text(cls, idx: int) -> str:
-        '''Get DB chunk as text.'''
-        return cls.gpt_to_text(cls.get_db_chunk_gpt(idx))
-
-    @classmethod
-    def get_db_chunk_and_continuation_text(cls, idx: int) -> T.List[str]:
-        '''Get DB chunk along with continuation, as text.'''
-
-        # Modulus used here to match original implementation (i.e., last
-        # chunks continuation wraps around to first chunk).
-        return [
-            cls.get_db_chunk_text(idx),
-            cls.get_db_chunk_text((idx + 1) % len(cls.get_db_dataset())),
-        ]
-
-    ##############################################
-    # pretraining corpus.
-    ##############################################
-
-    @classmethod
-    def get_pt_num_samples_and_chunks(cls, data_key: str) -> T.Tuple[int, int]:
-        '''Number of samples & chunks (e.g., 32*n_samples) in corpus.'''
-        assert hasattr(cls.pt_datasets, data_key), (
-            "pretraining set '%s' not found (choices: %s)."
-            % (data_key, ", ".join(vars(cls.pt_datasets).keys()))
-        )
-        chunk_dataset = getattr(cls.pt_datasets, data_key).chunk_dataset
-        return (
-            len(chunk_dataset.sample_dataset),
-            len(chunk_dataset),
-        )
-
-    @classmethod
-    def get_pt_num_samples(cls, data_key: str) -> int:
-        '''Number of pretraining samples.'''
-        return cls.get_pt_num_samples_and_chunks(data_key)[0]
-
-    @classmethod
-    def get_pt_num_chunks(cls, data_key: str) -> int:
-        '''Number of pretraining chunks (e.g., 32*n_samples).'''
-        return cls.get_pt_num_samples_and_chunks(data_key)[1]
-
-    @classmethod
-    def get_pt_dataset(cls, data_key: str) -> RetroDataset:
-        return getattr(cls.pt_datasets, data_key)
-
-    @classmethod
-    def get_pt_sample(cls, data_key: str, idx: int) -> dict:
-        return getattr(cls.pt_datasets, data_key)[idx]
-
-    @classmethod
-    def get_neighbor_tokens(cls, sample_id: int, chunk_id: int, data_key: str="train") -> T.Optional[dict]:
-        try:
-            sample = cls.get_pt_sample(data_key, sample_id)
-            sample_token_ids = sample["text"]
-            chunk_length = cls.args.retro_gpt_chunk_length
-            chunk_start_idx = chunk_id * chunk_length
-            chunk_end_idx = min(sample_token_ids.shape[0], chunk_start_idx + chunk_length)
-            chunk_token_ids = sample_token_ids[chunk_start_idx:chunk_end_idx]
-            neighbor_token_ids = sample["neighbor_tokens"][chunk_id]
-            return {
-                "chunk_tokens": chunk_token_ids,
-                "neighbor_tokens": neighbor_token_ids,
-            }
-        except Exception:
-            return None
-
-    @classmethod
-    def print_neighbor_texts(cls, sample_id: int, chunk_id: int, data_key: str="train") -> None:
-        tokens: dict = cls.get_neighbor_tokens(sample_id, chunk_id, data_key)
-        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-        try:
-            print("PRETRAINING CHUNK:")
-            print("  - %s" % shorten_str(cls.gpt_to_text(tokens["chunk_tokens"]), 150))
-            print("NEIGHBOR_CHUNKS:")
-            for token_ids in tokens["neighbor_tokens"]:
-                print("  - %s" % shorten_str(cls.gpt_to_text(token_ids), 150))
-        except Exception:
-            print("<no neighbors for sample %d>" % sample_id)
-
-    ##############################################
-    # usage.
-    ##############################################
-
-    @classmethod
-    def print_usage(cls) -> None:
-        '''Print usage.'''
-
-        print()
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
-        print("examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ]")
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
-
-        print()
-        print("~~~~ indexed datasets ~~~~")
-        print("retro.get_db_num_indexed_datasets() : %s" % cls.get_db_num_indexed_datasets())
-        print("retro.get_db_indexed_dataset_infos() :")
-        for i, (ratio, prefix) in enumerate(cls.get_db_indexed_dataset_infos()):
-            print(
-                "  %s(%f, %s)%s"
-                % (
-                    "[" if i == 0 else " ",
-                    ratio,
-                    prefix,
-                    "]" if i == len(cls.db_indexed_dataset_infos) - 1 else ",",
-                )
-            )
-
-        print()
-        print("~~~~ counts ~~~~")
-        print("retro.get_db_num_chunks : %d." % cls.get_db_num_chunks())
-
-        print()
-        for sq_key in ("sample", "chunk"):
-            for data_key in ("train", "valid"):  # test?
-                print(
-                    "retro.get_pt_num_%ss('%s') : %d."
-                    % (sq_key, data_key, getattr(cls, f"get_pt_num_{sq_key}s")(data_key))
-                )
-
-        print()
-        print("~~~~ tokens, text ~~~~")
-        print(
-            "retro.get_db_chunk_gpt(chunk_id) : %s"
-            % shorten_str(str(retro.get_db_chunk_gpt(0)), 50)
-        )
-        print(
-            "retro.get_db_chunk_bert(chunk_id) : %s"
-            % shorten_str(str(retro.get_db_chunk_bert(0)), 50)
-        )
-        print(
-            "retro.get_db_chunk_text(chunk_id) : %s"
-            % shorten_str(retro.get_db_chunk_text(0).strip(), 50)
-        )
-        print("retro.get_db_chunk_and_continuation_text(chunk_id) :")
-        for i, t in enumerate(retro.get_db_chunk_and_continuation_text(0)):
-            print(
-                "  %s'%s'%s"
-                % (
-                    "[" if i == 0 else " ",
-                    shorten_str(t.strip().replace("\n", " "), 50),
-                    "]" if i == 1 else ",",
-                )
-            )
-
-        sample = cls.get_pt_sample("train", 0)
-        sample_chunk_id = sample["neighbor_tokens"].shape[0] // 2
-        sample_neighbor_id = 0
-        print()
-        print("retro.get_pt_sample('train', sample_id) :")
-        print("  {")
-        for k, v in sample.items():
-            print("    '%s' : %s" % (k, shorten_str(str(v), 50)))
-        print("  }")
-
-        print()
-        print("(e.g., sample = retro.get_pt_sample(...))")
-        print()
-        print("  sample['text'].shape : %s" % str(sample["text"].shape))
-        print("  sample['neighbor_tokens'].shape : %s" % str(sample["neighbor_tokens"].shape))
-        print("  sample['text'] : %s" % shorten_str(str(sample["text"]), 50))
-        print(
-            "  sample['neighbor_tokens'][17][1] : %s"
-            % shorten_str(str(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50)
-        )
-        print(
-            "  retro.gpt_to_text(sample['text']) : %s"
-            % shorten_str(cls.gpt_to_text(sample["text"]), 50)
-        )
-        print(
-            "  retro.gpt_to_text(sample['neighbor_tokens']) : %s"
-            % shorten_str(
-                cls.gpt_to_text(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50
-            )
-        )
-
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
diff --git a/tools/retro/config_utils.py b/tools/retro/config_utils.py
deleted file mode 100644
index 00676c66ffd..00000000000
--- a/tools/retro/config_utils.py
+++ /dev/null
@@ -1,632 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
-"""Config utils."""
-
-import argparse
-from collections import namedtuple, OrderedDict
-import dataclasses
-import enum
-import inspect
-import os
-import re
-import types
-import typing as T
-
-
-PARAM_KEYWORDS = {
-    "param",
-    "parameter",
-    "arg",
-    "argument",
-    "attribute",
-    "key",
-    "keyword",
-}
-RAISES_KEYWORDS = {"raises", "raise", "except", "exception"}
-DEPRECATION_KEYWORDS = {"deprecation", "deprecated"}
-RETURNS_KEYWORDS = {"return", "returns"}
-YIELDS_KEYWORDS = {"yield", "yields"}
-EXAMPLES_KEYWORDS = {"example", "examples"}
-
-
-class ParseError(RuntimeError):
-    """Base class for all parsing related errors."""
-
-
-class DocstringStyle(enum.Enum):
-    """Docstring style."""
-
-    REST = 1
-    GOOGLE = 2
-    NUMPYDOC = 3
-    EPYDOC = 4
-    AUTO = 255
-
-
-class RenderingStyle(enum.Enum):
-    """Rendering style when unparsing parsed docstrings."""
-
-    COMPACT = 1
-    CLEAN = 2
-    EXPANDED = 3
-
-
-class DocstringMeta:
-    """Docstring meta information.
-
-    Symbolizes lines in form of
-
-        :param arg: description
-        :raises ValueError: if something happens
-    """
-
-    def __init__(
-        self, args: T.List[str], description: T.Optional[str]
-    ) -> None:
-        """Initialize self.
-
-        :param args: list of arguments. The exact content of this variable is
-            dependent on the kind of docstring; it's used to distinguish
-            between custom docstring meta information items.
-        :param description: associated docstring description.
-        """
-        self.args = args
-        self.description = description
-
-
-class DocstringParam(DocstringMeta):
-    """DocstringMeta symbolizing :param metadata."""
-
-    def __init__(
-        self,
-        args: T.List[str],
-        description: T.Optional[str],
-        arg_name: str,
-        type_name: T.Optional[str],
-        is_optional: T.Optional[bool],
-        default: T.Optional[str],
-    ) -> None:
-        """Initialize self."""
-        super().__init__(args, description)
-        self.arg_name = arg_name
-        self.type_name = type_name
-        self.is_optional = is_optional
-        self.default = default
-
-
-class DocstringReturns(DocstringMeta):
-    """DocstringMeta symbolizing :returns or :yields metadata."""
-
-    def __init__(
-        self,
-        args: T.List[str],
-        description: T.Optional[str],
-        type_name: T.Optional[str],
-        is_generator: bool,
-        return_name: T.Optional[str] = None,
-    ) -> None:
-        """Initialize self."""
-        super().__init__(args, description)
-        self.type_name = type_name
-        self.is_generator = is_generator
-        self.return_name = return_name
-
-
-class DocstringRaises(DocstringMeta):
-    """DocstringMeta symbolizing :raises metadata."""
-
-    def __init__(
-        self,
-        args: T.List[str],
-        description: T.Optional[str],
-        type_name: T.Optional[str],
-    ) -> None:
-        """Initialize self."""
-        super().__init__(args, description)
-        self.type_name = type_name
-        self.description = description
-
-
-class DocstringDeprecated(DocstringMeta):
-    """DocstringMeta symbolizing deprecation metadata."""
-
-    def __init__(
-        self,
-        args: T.List[str],
-        description: T.Optional[str],
-        version: T.Optional[str],
-    ) -> None:
-        """Initialize self."""
-        super().__init__(args, description)
-        self.version = version
-        self.description = description
-
-
-class DocstringExample(DocstringMeta):
-    """DocstringMeta symbolizing example metadata."""
-
-    def __init__(
-        self,
-        args: T.List[str],
-        snippet: T.Optional[str],
-        description: T.Optional[str],
-    ) -> None:
-        """Initialize self."""
-        super().__init__(args, description)
-        self.snippet = snippet
-        self.description = description
-
-
-class Docstring:
-    """Docstring object representation."""
-
-    def __init__(
-        self,
-        style=None,  # type: T.Optional[DocstringStyle]
-    ) -> None:
-        """Initialize self."""
-        self.short_description = None  # type: T.Optional[str]
-        self.long_description = None  # type: T.Optional[str]
-        self.blank_after_short_description = False
-        self.blank_after_long_description = False
-        self.meta = []  # type: T.List[DocstringMeta]
-        self.style = style  # type: T.Optional[DocstringStyle]
-
-    @property
-    def params(self) -> T.List[DocstringParam]:
-        """Return a list of information on function params."""
-        return {m.arg_name:m for m in self.meta if isinstance(m, DocstringParam)}
-
-    @property
-    def raises(self) -> T.List[DocstringRaises]:
-        """Return a list of information on the exceptions that the function
-        may raise.
-        """
-        return [
-            item for item in self.meta if isinstance(item, DocstringRaises)
-        ]
-
-    @property
-    def returns(self) -> T.Optional[DocstringReturns]:
-        """Return a single information on function return.
-
-        Takes the first return information.
-        """
-        for item in self.meta:
-            if isinstance(item, DocstringReturns):
-                return item
-        return None
-
-    @property
-    def many_returns(self) -> T.List[DocstringReturns]:
-        """Return a list of information on function return."""
-        return [
-            item for item in self.meta if isinstance(item, DocstringReturns)
-        ]
-
-    @property
-    def deprecation(self) -> T.Optional[DocstringDeprecated]:
-        """Return a single information on function deprecation notes."""
-        for item in self.meta:
-            if isinstance(item, DocstringDeprecated):
-                return item
-        return None
-
-    @property
-    def examples(self) -> T.List[DocstringExample]:
-        """Return a list of information on function examples."""
-        return [
-            item for item in self.meta if isinstance(item, DocstringExample)
-        ]
-
-
-class SectionType(enum.IntEnum):
-    """Types of sections."""
-
-    SINGULAR = 0
-    """For sections like examples."""
-
-    MULTIPLE = 1
-    """For sections like params."""
-
-    SINGULAR_OR_MULTIPLE = 2
-    """For sections like returns or yields."""
-
-
-class Section(namedtuple("SectionBase", "title key type")):
-    """A docstring section."""
-
-
-GOOGLE_TYPED_ARG_REGEX = re.compile(r"\s*(.+?)\s*\(\s*(.*[^\s]+)\s*\)")
-GOOGLE_ARG_DESC_REGEX = re.compile(r".*\. Defaults to (.+)\.")
-MULTIPLE_PATTERN = re.compile(r"(\s*[^:\s]+:)|([^:]*\]:.*)")
-
-DEFAULT_SECTIONS = [
-    Section("Arguments", "param", SectionType.MULTIPLE),
-    Section("Args", "param", SectionType.MULTIPLE),
-    Section("Parameters", "param", SectionType.MULTIPLE),
-    Section("Params", "param", SectionType.MULTIPLE),
-    Section("Raises", "raises", SectionType.MULTIPLE),
-    Section("Exceptions", "raises", SectionType.MULTIPLE),
-    Section("Except", "raises", SectionType.MULTIPLE),
-    Section("Attributes", "attribute", SectionType.MULTIPLE),
-    Section("Example", "examples", SectionType.SINGULAR),
-    Section("Examples", "examples", SectionType.SINGULAR),
-    Section("Returns", "returns", SectionType.SINGULAR_OR_MULTIPLE),
-    Section("Yields", "yields", SectionType.SINGULAR_OR_MULTIPLE),
-]
-
-
-class GoogleDocstringParser:
-    """Parser for Google-style docstrings."""
-
-    def __init__(
-        self, sections: T.Optional[T.List[Section]] = None, title_colon=True
-    ):
-        """Setup sections.
-
-        :param sections: Recognized sections or None to defaults.
-        :param title_colon: require colon after section title.
-        """
-        if not sections:
-            sections = DEFAULT_SECTIONS
-        self.sections = {s.title: s for s in sections}
-        self.title_colon = title_colon
-        self._setup()
-
-    def _setup(self):
-        if self.title_colon:
-            colon = ":"
-        else:
-            colon = ""
-        self.titles_re = re.compile(
-            "^("
-            + "|".join(f"({t})" for t in self.sections)
-            + ")"
-            + colon
-            + "[ \t\r\f\v]*$",
-            flags=re.M,
-        )
-
-    def _build_meta(self, text: str, title: str) -> DocstringMeta:
-        """Build docstring element.
-
-        :param text: docstring element text
-        :param title: title of section containing element
-        :return:
-        """
-
-        section = self.sections[title]
-
-        if (
-            section.type == SectionType.SINGULAR_OR_MULTIPLE
-            and not MULTIPLE_PATTERN.match(text)
-        ) or section.type == SectionType.SINGULAR:
-            return self._build_single_meta(section, text)
-
-        if ":" not in text:
-            # raise ParseError(f"Expected a colon in {text!r}.")
-            return None
-
-        # Split spec and description
-        before, desc = text.split(":", 1)
-        if desc:
-            desc = desc[1:] if desc[0] == " " else desc
-            if "\n" in desc:
-                first_line, rest = desc.split("\n", 1)
-                desc = first_line + "\n" + inspect.cleandoc(rest)
-            desc = desc.strip("\n")
-
-        return self._build_multi_meta(section, before, desc)
-
-    @staticmethod
-    def _build_single_meta(section: Section, desc: str) -> DocstringMeta:
-        if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS:
-            return DocstringReturns(
-                args=[section.key],
-                description=desc,
-                type_name=None,
-                is_generator=section.key in YIELDS_KEYWORDS,
-            )
-        if section.key in RAISES_KEYWORDS:
-            return DocstringRaises(
-                args=[section.key], description=desc, type_name=None
-            )
-        if section.key in EXAMPLES_KEYWORDS:
-            return DocstringExample(
-                args=[section.key], snippet=None, description=desc
-            )
-        if section.key in PARAM_KEYWORDS:
-            raise ParseError("Expected paramenter name.")
-        return DocstringMeta(args=[section.key], description=desc)
-
-    @staticmethod
-    def _build_multi_meta(
-        section: Section, before: str, desc: str
-    ) -> DocstringMeta:
-        if section.key in PARAM_KEYWORDS:
-            match = GOOGLE_TYPED_ARG_REGEX.match(before)
-            if match:
-                arg_name, type_name = match.group(1, 2)
-                if type_name.endswith(", optional"):
-                    is_optional = True
-                    type_name = type_name[:-10]
-                elif type_name.endswith("?"):
-                    is_optional = True
-                    type_name = type_name[:-1]
-                else:
-                    is_optional = False
-            else:
-                arg_name, type_name = before, None
-                is_optional = None
-
-            match = GOOGLE_ARG_DESC_REGEX.match(desc)
-            default = match.group(1) if match else None
-
-            return DocstringParam(
-                args=[section.key, before],
-                description=desc,
-                arg_name=arg_name,
-                type_name=type_name,
-                is_optional=is_optional,
-                default=default,
-            )
-        if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS:
-            return DocstringReturns(
-                args=[section.key, before],
-                description=desc,
-                type_name=before,
-                is_generator=section.key in YIELDS_KEYWORDS,
-            )
-        if section.key in RAISES_KEYWORDS:
-            return DocstringRaises(
-                args=[section.key, before], description=desc, type_name=before
-            )
-        return DocstringMeta(args=[section.key, before], description=desc)
-
-    def add_section(self, section: Section):
-        """Add or replace a section.
-
-        :param section: The new section.
-        """
-
-        self.sections[section.title] = section
-        self._setup()
-
-    def parse(self, text: str) -> Docstring:
-        """Parse the Google-style docstring into its components.
-
-        :returns: parsed docstring
-        """
-        ret = Docstring(style=DocstringStyle.GOOGLE)
-        if not text:
-            return ret
-
-        # Clean according to PEP-0257
-        text = inspect.cleandoc(text)
-
-        # Find first title and split on its position
-        match = self.titles_re.search(text)
-        if match:
-            desc_chunk = text[: match.start()]
-            meta_chunk = text[match.start() :]
-        else:
-            desc_chunk = text
-            meta_chunk = ""
-
-        # Break description into short and long parts
-        parts = desc_chunk.split("\n", 1)
-        ret.short_description = parts[0] or None
-        if len(parts) > 1:
-            long_desc_chunk = parts[1] or ""
-            ret.blank_after_short_description = long_desc_chunk.startswith(
-                "\n"
-            )
-            ret.blank_after_long_description = long_desc_chunk.endswith("\n\n")
-            ret.long_description = long_desc_chunk.strip() or None
-
-        # Split by sections determined by titles
-        matches = list(self.titles_re.finditer(meta_chunk))
-        if not matches:
-            return ret
-        splits = []
-        for j in range(len(matches) - 1):
-            splits.append((matches[j].end(), matches[j + 1].start()))
-        splits.append((matches[-1].end(), len(meta_chunk)))
-
-        chunks = OrderedDict()  # type: T.Mapping[str,str]
-        for j, (start, end) in enumerate(splits):
-            title = matches[j].group(1)
-            if title not in self.sections:
-                continue
-
-            # Clear Any Unknown Meta
-            # Ref: https://github.com/rr-/docstring_parser/issues/29
-            meta_details = meta_chunk[start:end]
-            unknown_meta = re.search(r"\n\S", meta_details)
-            if unknown_meta is not None:
-                meta_details = meta_details[: unknown_meta.start()]
-
-            chunks[title] = meta_details.strip("\n")
-        if not chunks:
-            return ret
-
-        # Add elements from each chunk
-        for title, chunk in chunks.items():
-            # Determine indent
-            indent_match = re.search(r"^\s*", chunk)
-            if not indent_match:
-                raise ParseError(f'Can\'t infer indent from "{chunk}"')
-            indent = indent_match.group()
-
-            # Check for singular elements
-            if self.sections[title].type in [
-                SectionType.SINGULAR,
-                SectionType.SINGULAR_OR_MULTIPLE,
-            ]:
-                part = inspect.cleandoc(chunk)
-                ret.meta.append(self._build_meta(part, title))
-                continue
-
-            # Split based on lines which have exactly that indent
-            _re = "^" + indent + r"(?=\S)"
-            c_matches = list(re.finditer(_re, chunk, flags=re.M))
-            if not c_matches:
-                raise ParseError(f'No specification for "{title}": "{chunk}"')
-            c_splits = []
-            for j in range(len(c_matches) - 1):
-                c_splits.append((c_matches[j].end(), c_matches[j + 1].start()))
-            c_splits.append((c_matches[-1].end(), len(chunk)))
-            for j, (start, end) in enumerate(c_splits):
-                part = chunk[start:end].strip("\n")
-                ret.meta.append(self._build_meta(part, title))
-
-        return ret
-
-
-def verify_and_get_config_attr_descs(config_cls, strict_docstring_match=True):
-
-    assert dataclasses.is_dataclass(config_cls), f"uh oh <{config_cls.__name__}>."
-
-    # Parse docstring.
-    try:
-        docstring = GoogleDocstringParser().parse(config_cls.__doc__)
-    except Exception as e:
-        raise Exception(f"error parsing {config_cls.__name__} docstring.")
-    
-    # Get attributes and types.
-    config_attrs = docstring.params
-    config_types = config_cls.__annotations__
-
-    # Verify attribute names.
-    config_attr_keys = set(config_attrs.keys())
-    config_type_keys = set(config_types.keys())
-    missing_attr_keys = config_type_keys - config_attr_keys
-    extra_attr_keys = config_attr_keys - config_type_keys
-    if strict_docstring_match:
-        assert not missing_attr_keys and not extra_attr_keys, f"{config_cls.__name__} docstring is either missing attributes ({', '.join(missing_attr_keys) if missing_attr_keys else '--'}) or contains extra attributes ({', '.join(extra_attr_keys) if extra_attr_keys else '--'})."
-
-    # @todo
-    # Verify attribute type names.
-    # for key in config_attr_keys:
-    #     ... todo ...
-
-    # Verify base class attributes.
-    attrs = {k:v for base_cls in config_cls.__bases__ if dataclasses.is_dataclass(base_cls) for k,v in verify_and_get_config_attr_descs(base_cls, strict_docstring_match=strict_docstring_match).items()}
-    for key in config_attr_keys:
-        if key in config_types:
-            attrs[key] = {
-                "desc" : config_attrs[key].description,
-                "type" : config_types[key],
-            }
-
-    return attrs
-
-
-def add_config_args(parser, config_cls):
-    attrs = verify_and_get_config_attr_descs(config_cls, strict_docstring_match=False)
-    for key, attr in attrs.items():
-        _type = attr["type"]
-        if dataclasses.is_dataclass(_type):
-            group = parser.add_argument_group(title=attr["desc"])
-            add_config_args(group, _type)
-        else:
-
-            default_value = getattr(config_cls, key)
-            args = {
-                "help" : attr["desc"],
-                "default" : default_value,
-            }
-
-            if _type == bool:
-                assert isinstance(args["default"], (bool, type(None))), \
-                    f"boolean attribute '{key}' of {config_cls.__name__} " \
-                    "has non-boolean default value."
-
-                # When default=True, add 'no-{key}' arg.
-                if default_value:
-                    args["action"] = "store_false"
-                    args["dest"] = key
-                    key = "no-" + key
-                else:
-                    args["action"] = "store_true"
-
-            elif _type in (int, float):
-                args["type"] = _type
-
-            elif _type == list:
-                args["nargs"] = "*"
-
-            # else: ....... treat as string arg
-            #     raise Exception(f"specialize action for '{key}', type <{_type}>.")
-
-            try:
-                parser.add_argument(f"--{key.replace('_', '-')}", **args)
-            except argparse.ArgumentError as e:
-                pass
-
-
-def get_config_leaf_field_names(config_cls):
-    names = set()
-    for field in dataclasses.fields(config_cls):
-        if dataclasses.is_dataclass(field.type):
-            names.update(get_config_leaf_field_names(field.type))
-        else:
-            names.add(field.name)
-    return names
-
-
-def config_from_args(args, config_cls, add_custom_args=False):
-
-    # Collect config data in a dict.
-    data = {}
-    for field in dataclasses.fields(config_cls):
-        if dataclasses.is_dataclass(field.type):
-            data[field.name] = config_from_args(args, field.type)
-        else:
-            data[field.name] = getattr(args, field.name)
-
-    # Add custom args. (e.g., for tools, tasks)
-    if add_custom_args:
-
-        config_keys = get_config_leaf_field_names(config_cls)
-        arg_keys = set(vars(args).keys())
-        custom_keys = arg_keys - config_keys
-
-        custom_data = {k:v for k, v in vars(args).items() if k in custom_keys}
-        custom_config_cls = dataclasses.make_dataclass(
-            "CustomConfig",
-            [(k, type(v)) for k, v in custom_data.items()])
-        custom_config = custom_config_cls(**custom_data)
-        data["custom"] = custom_config
-
-    # Create config. [ todo: programmatically create dataclass that inherits
-    # TransformerConfig. ]
-    config = config_cls(**data)
-
-    return config
-
-
-def flatten_config(config, base_config_cls=None):
-
-    # Lift sub-config data.
-    flat_config = {}
-    for field in dataclasses.fields(config):
-        value = getattr(config, field.name)
-        if dataclasses.is_dataclass(value):
-            flat_config = { **flat_config, **flatten_config(value) }
-        else:
-            flat_config[field.name] = value
-
-    # Convert to dataclass.
-    if base_config_cls:
-        base_keys = set(field.name for field in dataclasses.fields(base_config_cls))
-        flat_config_cls = dataclasses.make_dataclass(
-            cls_name="FlatMegatronConfig",
-            fields=[(k, T.Any, dataclasses.field(default=None))
-                    for k, v in flat_config.items()
-                    if k not in base_keys],
-            bases=(base_config_cls,))
-        flat_config = flat_config_cls(**flat_config)
-
-    return flat_config
diff --git a/tools/retro/docker/Dockerfile b/tools/retro/docker/Dockerfile
deleted file mode 100644
index e8945b373a4..00000000000
--- a/tools/retro/docker/Dockerfile
+++ /dev/null
@@ -1,19 +0,0 @@
-FROM nvcr.io/nvidia/pytorch:23.09-py3
-
-RUN pip install -U faiss-gpu
-
-RUN apt update
-
-RUN apt install -qy htop
-
-RUN pip install -U transformers
-
-RUN pip install --upgrade google-api-python-client
-
-RUN pip install sentencepiece
-
-RUN pip install h5py
-
-RUN pip install nltk
-
-RUN pip install einops
diff --git a/tools/retro/preprocess_data.py b/tools/retro/preprocess_data.py
deleted file mode 100644
index 3a01d67cabb..00000000000
--- a/tools/retro/preprocess_data.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Preprocess data for Retro.
-
-Stages (see argument '--retro-tasks'):
-- Build chunk database (DB).
-- Build index (train, add).
-- Query pretraining neighbors.
-"""
-
-import json
-import os
-import sys
-import torch
-
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core.datasets.retro.db import build_db
-from megatron.core.datasets.retro.index import add_to_index, train_index
-from megatron.core.datasets.retro.config import (
-    RetroBertEmbedders,
-    RetroGPTChunkDatasets,
-    RetroPreprocessingConfig,
-    RetroTokenizers,
-)
-from megatron.core.datasets.retro.query.gpt_chunk_dataset import build_gpt_chunk_datasets_from_gpt_datasets
-from megatron.core.datasets.retro.query.multi_split_gpt_dataset import (
-    MultiSplitGPTDataset,
-    MultiSplitGPTDatasetConfig,
-)
-from megatron.core.datasets.retro.query.query import query_neighbors
-from megatron.core.datasets.retro.query.utils import get_query_dir
-from megatron.core.datasets.retro.utils import retro_makedir
-from megatron.core.models.retro.utils import (
-    get_config_path,
-    get_gpt_data_dir,
-)
-from megatron.training import get_args, initialize_megatron, print_rank_0
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.training.tokenizer.tokenizer import (
-    _BertWordPieceTokenizer,
-    _GPT2BPETokenizer,
-    _GPTSentencePieceTokenizer,
-)
-from megatron.training import get_train_valid_test_num_samples
-from pretrain_gpt import is_dataset_built_on_rank
-from tools.bert_embedding import BertEmbedder, DiskDataParallelBertEmbedder
-from tools.retro.config_utils import add_config_args
-
-
-def add_retro_args(parser):
-    group = parser.add_argument_group(title="Retro preprocessing")
-    add_config_args(group, RetroPreprocessingConfig)
-    return parser
-
-
-def initialize_megatron_retro():
-    '''Initialize megatron & save Retro config.'''
-
-    # Prevent arguments.py from overriding preprocessing args.
-    project_dir_idx = sys.argv.index("--retro-project-dir")
-    retro_project_dir = sys.argv[project_dir_idx + 1]
-    del sys.argv[project_dir_idx] # delete key
-    del sys.argv[project_dir_idx] # delete value
-
-    # Initialize.
-    initialize_megatron(extra_args_provider=add_retro_args)
-
-    args = get_args()
-    args.retro_project_dir = retro_project_dir
-
-    # Retro config.
-    config = get_retro_preprocessing_config()
-
-    # Save retro config.
-    if config.retro_task_validate is None:
-        retro_makedir(config, config.retro_project_dir)
-        save_config(config)
-
-    return config
-
-
-def get_bert_embedders(config):
-    mem_embedder = BertEmbedder(
-        batch_size = config.retro_bert_batch_size,
-        max_bert_seq_length = config.retro_bert_max_chunk_length,
-        embedder_type = "megatron",
-    )
-    return RetroBertEmbedders(
-        mem = mem_embedder,
-        disk = DiskDataParallelBertEmbedder(mem_embedder, config.retro_block_size),
-    )
-
-
-def get_gpt_chunk_datasets(config):
-
-    args = get_args()
-
-    # Dataset config.
-    data_dir = get_gpt_data_dir(config.retro_project_dir)
-    blend = list(config.retro_gpt_data_path)
-    for i in range(len(blend) - 1, -1, -2):
-        blend[i] = os.path.join(data_dir, blend[i])
-    data_config = MultiSplitGPTDatasetConfig(
-        random_seed=config.retro_gpt_seed,
-        sequence_length=config.retro_gpt_seq_length,
-        blend=get_blend_from_list(blend),
-        blend_per_split=[
-            get_blend_from_list(args.train_data_path),
-            get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
-        ],
-        split=config.retro_gpt_split,
-        split_preprocessing=config.retro_gpt_split,
-        path_to_cache=config.retro_gpt_data_cache_path,
-        return_document_ids=True,
-        tokenizer=config.retro_tokenizers.gpt,
-        reset_position_ids=args.reset_position_ids,
-        reset_attention_mask=args.reset_attention_mask,
-        eod_mask_loss=args.eod_mask_loss,
-        mid_level_dataset_surplus=args.mid_level_dataset_surplus,
-    )
-
-    # GPT datasets.
-    print_rank_0(" > multi-split gpt datasets.")
-    train_valid_test_num_samples = get_train_valid_test_num_samples()
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        MultiSplitGPTDataset,
-        train_valid_test_num_samples,
-        is_dataset_built_on_rank,
-        data_config,
-    ).build()
-
-    gpt_datasets = {
-        "train" : (train_ds, train_valid_test_num_samples[0]),
-        "valid" : (valid_ds, train_valid_test_num_samples[1]),
-        "test"  : (test_ds, train_valid_test_num_samples[2]),
-    }
-
-    # Chunk datasets.
-    chunk_datasets = build_gpt_chunk_datasets_from_gpt_datasets(
-        project_dir=config.retro_project_dir,
-        gpt_datasets=gpt_datasets,
-        sample_length=config.retro_gpt_seq_length,
-        chunk_length=config.retro_gpt_chunk_length,
-    )
-    chunk_datasets = RetroGPTChunkDatasets(**chunk_datasets)
-
-    return chunk_datasets
-
-
-def get_gpt_tokenizer(config):
-    '''GPT (BPE) tokenizer.'''
-    tokenizer_type = config.retro_gpt_tokenizer_type
-    if tokenizer_type == "GPT2BPETokenizer":
-        assert config.retro_gpt_vocab_file and config.retro_gpt_merge_file
-        return _GPT2BPETokenizer(
-            vocab_file=os.path.join(
-                config.retro_project_dir,
-                config.retro_gpt_vocab_file,
-            ),
-            merge_file=os.path.join(
-                config.retro_project_dir,
-                config.retro_gpt_merge_file,
-            ),
-        )
-    elif tokenizer_type == 'GPTSentencePieceTokenizer':
-        assert config.retro_gpt_tokenizer_model is not None
-        return _GPTSentencePieceTokenizer(os.path.join(
-            config.retro_project_dir,
-            config.retro_gpt_tokenizer_model,
-        ))
-    else:
-        raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type)
-
-
-def get_bert_tokenizer(config):
-    '''Bert (Wordpiece) tokenizer.'''
-    lower_case = {
-        "BertWordPieceLowerCase" : True,
-        "BertWordPieceCase" : False,
-    }[config.retro_bert_tokenizer_type]
-    return _BertWordPieceTokenizer(
-        vocab_file=os.path.join(
-            config.retro_project_dir,
-            config.retro_bert_vocab_file,
-        ),
-        lower_case=lower_case,
-    )
-
-
-def get_tokenizers(config):
-    return RetroTokenizers(
-        gpt = get_gpt_tokenizer(config),
-        bert = get_bert_tokenizer(config),
-    )
-
-
-def get_retro_preprocessing_config():
-
-    # Arguments.
-    args = get_args()
-
-    # Retro config.
-    config = core_transformer_config_from_args(
-        args, config_class=RetroPreprocessingConfig)
-
-    # Add tools.
-    config.retro_tokenizers = get_tokenizers(config)
-    config.retro_bert_embedders = get_bert_embedders(config)
-    config.retro_gpt_chunk_datasets = get_gpt_chunk_datasets(config)
-
-    return config
-
-
-def save_config(config):
-    '''Save copy of config within retro project dir.'''
-
-    if torch.distributed.get_rank() == 0:
-
-        # GPT config + block size.
-        config_subset = {
-            k:v for k,v in vars(config).items()
-            if k.startswith("retro_gpt") and k != "retro_gpt_chunk_datasets"
-        }
-        config_subset["retro_block_size"] = config.retro_block_size
-
-        # Bert config.
-        config_subset["retro_bert_tokenizer_type"] = config.retro_bert_tokenizer_type
-        config_subset["retro_bert_vocab_file"] = config.retro_bert_vocab_file
-
-        # Neighbor directories.
-        query_dir = get_query_dir(config.retro_project_dir)
-        config_subset["retro_neighbor_dirs"] = {
-            k : (os.path.relpath(v["neighbor_dir"], query_dir) if v is not None else None)
-            for k, v in vars(config.retro_gpt_chunk_datasets).items()
-        }
-
-        # Save.
-        config_path = get_config_path(config.retro_project_dir)
-        with open(config_path, "w") as f:
-            json.dump(config_subset, f, indent=4, sort_keys=True)
-
-    torch.distributed.barrier()
-
-
-if __name__ == "__main__":
-
-    # Initalize Megatron.
-    config = initialize_megatron_retro()
-
-    # Expand tasks.
-    task_remap = {
-        "build" : [ "db-build", "index-train", "index-add", "query-neighbors" ],
-        "index-build" : [ "index-train", "index-add" ],
-        "db-build" : [ "db-build" ],
-        "index-train" : [ "index-train" ],
-        "index-add" : [ "index-add" ],
-        "query-neighbors" : [ "query-neighbors" ],
-    }
-    tasks = []
-    for task in config.retro_tasks:
-        tasks.extend(task_remap[task])
-    config.retro_tasks = tasks
-
-    # Select task to run.
-    for task in tasks:
-
-        print_rank_0("start '%s%s'." % (
-            "" if config.retro_task_validate is None else "[validate] ",
-            task,
-        ))
-
-        # DB (i.e., chunk db).
-        if task == "db-build":
-            build_db(config)
-
-        # Index.
-        elif task == "index-train":
-            train_index(config)
-        elif task == "index-add":
-            add_to_index(config)
-
-        # Query.
-        elif task == "query-neighbors":
-            query_neighbors(config)
-
-        else:
-            raise Exception("specialize for task '%s'." % task)
-
-        torch.distributed.barrier()
-
-        print_rank_0("end '%s%s'." % (
-            "" if config.retro_task_validate is None else "[validate] ",
-            task,
-        ))
diff --git a/tools/retro/sft/README.md b/tools/retro/sft/README.md
deleted file mode 100644
index e5898790383..00000000000
--- a/tools/retro/sft/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## Note
-
-The content within this `sft` directory is still under active development and will be updated soon.
\ No newline at end of file
diff --git a/tools/retro/sft/dataset_conv.py b/tools/retro/sft/dataset_conv.py
deleted file mode 100644
index 3dd8fa9cd56..00000000000
--- a/tools/retro/sft/dataset_conv.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import re
-import json
-import os
-from typing import Any, Iterable, Dict, Optional
-
-from numpy import ndarray
-from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
-from megatron.core.datasets.utils import Split
-import torch
-import numpy
-import glob
-from collections import OrderedDict
-
-from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
-from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
-from megatron.core.datasets.utils import Split
-from dataclasses import dataclass
-
-
-_DATASET_NAME_PATTERNS = {
-    Split.train: r"(?P<name>[^\0]+)\/(?P=name)\_QA\_train.json",
-    Split.valid: r"(?P<name>[^\0]+)\/(?P=name)\_QA\_dev.json",
-}
-
-
-@dataclass
-class JsonQADatasetConfig(BlendedMegatronDatasetConfig):
-    """Configuration object for the QA finetuning pipeline
-    """
-    ft_neighbours: int = 1
-
-    bert_retriever_neighbours: bool = False
-
-    longform_answer: bool = False
-
-    inference_only: bool = False
-
-    retrieved_neighbours: bool = False
-
-    fix_newsqa: bool = True
-
-    def __post_init__(self) -> None:
-        super().__post_init__()
-        assert self.blend_per_split is not None
-
-
-@dataclass
-class RetroJsonQADatasetConfig(JsonQADatasetConfig):
-    """Configuration object for the Retro QA finetuning pipeline
-    """
-    retro_num_neighbors: int = None
-
-    retro_gpt_retrieved_length: int = None
-
-    def __post_init__(self) -> None:
-        super().__post_init__()
-        assert self.retro_num_neighbors is not None
-        assert self.retro_gpt_retrieved_length is not None
-
-
-class JsonQADataset(MegatronDataset):
-
-    def __init__(self, dataset: Any, dataset_path: str, indices: ndarray, num_samples: Optional[int], index_split: Split, config: BlendedMegatronDatasetConfig) -> None:
-        super().__init__(dataset, dataset_path, indices, num_samples, index_split, config)
-        matches = re.findall(_DATASET_NAME_PATTERNS[index_split], dataset_path)
-        assert len(matches) == 1
-        assert len(matches[0]) > 0
-        self.dataset_name = matches[0]
-
-    @staticmethod
-    def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int:
-        return len(low_level_dataset)
-
-    @staticmethod
-    def build_low_level_dataset(dataset_path: str, config: JsonQADatasetConfig) -> Iterable:
-        assert os.path.isfile(dataset_path), f"{dataset_path} does not exist on disk"
-        return preprocess(dataset_path, config)
-
-    def __len__(self) -> int:
-        return len(self.dataset)
-
-    def __getitem__(self, idx: int) -> Dict[str, ndarray]:
-        sample = self.dataset[idx % len(self.dataset)]
-
-        # unpack tokens
-        query, answer, neighbours = sample
-
-        # tokenization
-        output_tokens = self.config.tokenizer.tokenize(answer)
-
-        input_tokens = reformat_prompt(
-            query,
-            neighbours,
-            self.dataset_name,
-            self.config.ft_neighbours,
-            len(output_tokens),
-            self.config.tokenizer,
-            self.config.sequence_length
-        )
-
-        # padding
-        tokens, answer_mask = pad_and_convert_to_numpy(
-            input_tokens, output_tokens, self.config.tokenizer.pad, self.config.sequence_length, self.config.tokenizer.eos
-        )
-
-        train_sample = {
-            'text': tokens,
-            'answer_mask': answer_mask,
-        }
-
-        return train_sample
-
-
-class RetroJsonQADataset(JsonQADataset):
-
-    def __getitem__(self, idx: int) -> Dict[str, ndarray]:
-
-        sample = self.dataset[idx % len(self.dataset)]
-
-        # unpack tokens
-        query, answer, neighbours = sample
-
-        # tokenization
-        output_tokens = self.config.tokenizer.tokenize(answer)
-
-        input_tokens = reformat_prompt_retro(
-            query,
-            neighbours,
-            self.dataset_name,
-            self.config.ft_neighbours,
-            len(output_tokens),
-            self.config.tokenizer,
-            self.config.sequence_length
-        )
-
-        # padding
-        tokens, answer_mask = pad_and_convert_to_numpy(
-            input_tokens,
-            output_tokens,
-            self.config.tokenizer.pad,
-            self.config.sequence_length,
-            self.config.tokenizer.eos
-        )
-
-        # get retro neighbors
-        # context chunk and answer chunk
-        n_chunks_per_sample = 2
-        num_neighbors = self.config.retro_num_neighbors
-        # disable retro encoder
-        neighbor_tokens = numpy.zeros(
-            [n_chunks_per_sample, num_neighbors, self.config.retro_gpt_retrieved_length],
-            dtype=numpy.int64
-        )
-
-        train_sample = {
-            'text': tokens,
-            'answer_mask': answer_mask,
-            'neighbor_tokens': neighbor_tokens,
-            'context_len': len(input_tokens)
-        }
-
-        return train_sample
-
-
-def format_multichoice(multichoice_options):
-    options_text = ["({}) {}".format(chr(ord('A') + i), option) for i, option in
-                    zip(range(len(multichoice_options)), multichoice_options)]
-    return "Choose one based on the following options: {}".format(" ".join(options_text))
-
-
-def format_multichoice_question(question, multichoice_options):
-    return "{}\n{}".format(question, format_multichoice(multichoice_options))
-
-
-def format_answer(answer):
-    return " {}".format(answer)
-
-
-def preprocess(dataset_path: str, config: JsonQADatasetConfig):
-    assert config.ft_neighbours > 0
-    if config.longform_answer:
-        nq_examples = []
-        with open(dataset_path, "r") as f:
-            for fn in f:
-                nq_examples.append(json.loads(fn))
-    else:
-        nq_examples = []
-        for my_data_file in sorted(glob.glob(dataset_path)):
-            with open(my_data_file, "r", encoding='utf-8') as f:
-                nq_examples.extend(json.load(f))
-
-    data = []
-    for instance in nq_examples:
-        question = instance["question"]
-        if 'qa_type' in instance and instance['qa_type'] == "multi_choice_qa":
-            question = format_multichoice_question(question, instance["multichoice_options"])
-        if config.bert_retriever_neighbours:
-            contexts = instance["bert_pretrain_corpus_neighbours"]
-            neighbours = ["source: " + ctx for ctx in contexts]
-        else:
-            if config.retrieved_neighbours:
-                contexts = instance["ctxs"]
-                neighbours = ["title: " + ctx["title"] + ", source: " + ctx["text"] for ctx in contexts]
-            else:
-                if "sub-paragraphs" in instance:
-                    if type(instance["sub-paragraphs"]) == list:  # doc2dial:
-                        neighbours = [
-                            "title: " + instance["sub-paragraphs"][0] + ", source: " + instance["sub-paragraphs"][1]]
-                    else:
-                        neighbours = ["title: , source: " + instance["sub-paragraphs"]]
-                elif config.fix_newsqa and "sub_paragraph" in instance:
-                    neighbours = ["title: , source: " + instance["sub_paragraph"]]
-                else:
-                    neighbours = ["title: , source: "]
-
-        if config.inference_only:
-            data.append((question, None, neighbours))
-        else:
-            if config.longform_answer:
-                if "longform_answer" in instance:
-                    answers = [instance["longform_answer"]]
-                else:
-                    continue
-            else:
-                if "answers" in instance:
-                    answers = instance["answers"]
-                elif "answer" in instance:
-                    if type(instance["answer"]) is str:
-                        answers = [instance["answer"]]
-                    elif type(instance["answer"]) is list:
-                        answers = instance["answer"]
-                    else:
-                        answers = [str(instance["answer"])]
-                else:
-                    raise ValueError("need to have answer or answers")
-            if len(answers) < 1:
-                continue
-            else:
-                if type(answers[0]) is dict:
-                    answers = [answers[0]["text"].strip()]
-                elif type(answers[0]) is str:
-                    answers = [answers[0]]
-                else:
-                    raise ValueError("unsupported type for answer(s)")
-
-            for answer in answers:
-                answer = format_answer(answer)
-                data.append((question, answer, neighbours))
-
-    return data
-
-
-def count_stat(dataset, tokenizer, k):
-    nb_lens = []
-    for i, d in enumerate(dataset):
-        query, answer, neighbours = d
-        nb_lens.extend([len(tokenizer.tokenize(neighbour)) for neighbour in neighbours[:k]])
-
-    print("len of nb", len(nb_lens))
-    print("max of len nb", max(nb_lens))
-    print("num of cut ", sum([l > 128 for l in nb_lens]), sum([l > 128 for l in nb_lens]) // len(nb_lens))
-    print("last max", sorted(nb_lens)[-10:])
-
-
-def reformat_prompt_retro(query, neighbours, dataset_name, ft_neighbours, \
-                          max_output_len, tokenizer, max_seq_length):
-    system = ("System: This is a chat between a user and an artificial intelligence assistant. The assistant gives "
-              "helpful, detailed, and polite answers to the user's questions.\n\n")
-
-    if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]:
-        input_tokens = tokenizer.tokenize(system + query)
-        return input_tokens
-
-    short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq",
-                               "tqa", "quac"]
-    yes_no_without_context = ["BoolQ"]
-    multichoices = [""]
-    formatted_dataset_name = ["doc2dial", "quac", "qrecc", "sharc"]
-
-    if dataset_name in formatted_dataset_name:
-        dialogue_turn = query
-    else:
-        if dataset_name in short_span_with_context:
-            user = "{} Answer the above question with a short phrase.".format(query)
-        elif dataset_name in yes_no_without_context:
-            user = "{} Answer the above question with True or False.".format(query)
-        else:
-            user = "{} Answer the above question with a long complete answer.".format(query)
-
-        if dataset_name in short_span_with_context:
-            dialogue_format = "User: {}\n\nAssistant: The answer is"
-            dialogue_turn = dialogue_format.format(user)
-        else:
-            dialogue_format = "User: {}\n\nAssistant:"
-            dialogue_turn = dialogue_format.format(user)
-
-    if ft_neighbours > 0:
-        context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n"
-        context_tokens = tokenizer.tokenize(context)
-        dialogue_tokens = tokenizer.tokenize(dialogue_turn)
-        system_tokens = tokenizer.tokenize(system)
-        context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)]
-        context = tokenizer.detokenize(context_tokens)
-
-        all_input = system + context + dialogue_turn
-        print(all_input)
-        input_tokens = tokenizer.tokenize(all_input)
-    else:
-        all_input = system + dialogue_turn
-        input_tokens = tokenizer.tokenize(all_input)
-
-    return input_tokens
-
-
-def flan_format(system, context, dialogue_turn, template_id=0):
-    templates = [
-        "{}User: Answer based on context:\n\n{}{}",
-        "{}User: {}Answer this question based on the article: {}",
-        "{}User: {}{}",
-        "{}User: {}Answer this question: {}",
-        "{}User: Read this article and answer this question {}{}",
-        "{}User: {}Based on the above article, answer a question. {}",
-        "{}User: Context: {}Question: {}"
-    ]
-    template = templates[template_id - 1].format(system, context, dialogue_turn)
-    return template
-
-
-def reformat_prompt(query, neighbours, dataset_name, ft_neighbours, \
-                    max_output_len, tokenizer, max_seq_length, template_id=0):
-    system = ("System: This is a chat between a user and an artificial intelligence assistant. The assistant gives "
-              "helpful, detailed, and polite answers to the user's questions based on the context. The assistant "
-              "should also indicate when the answer cannot be found in the context.\n\n")
-
-    if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]:
-        input_tokens = tokenizer.tokenize(system + query)
-        return input_tokens
-
-    short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq",
-                               "BioASQ", "DuoRC_ParaphraseRC", "TextbookQA", "tqa"]
-    yes_no_without_context = ["boolq", "multirc"]
-    multichoices = ["race"]
-    # multi-turn qa datasets
-    formatted_dataset_name = ["convqa", "chatgptgen", "doc2dial", "quac", "qrecc", "sharc"]
-
-    if dataset_name in formatted_dataset_name:
-        dialogue_turn = query
-    else:
-        if dataset_name in short_span_with_context:
-            if template_id == 0:
-                user = "Answer the following question with a short span. {}".format(query)
-            else:
-                user = query
-        elif dataset_name in yes_no_without_context:
-            user = "Answer the following question with True or False. {}".format(query)
-        elif dataset_name in multichoices:
-            user = "Answer the following question by selecting one of the provided options. {}".format(query)
-        else:
-            if template_id == 0:
-                user = "Please give a full and complete answer for the question. {}".format(query)
-            else:
-                user = query
-
-        if dataset_name in short_span_with_context:
-            if template_id == 0:
-                dialogue_format = "User: {}\n\nAssistant: The answer is"
-            else:
-                dialogue_format = "{}\n\nAssistant: The answer is"
-            dialogue_turn = dialogue_format.format(user)
-        else:
-            if template_id == 0:
-                dialogue_format = "User: {}\n\nAssistant:"
-            else:
-                dialogue_format = "{}\n\nAssistant:"
-            dialogue_turn = dialogue_format.format(user)
-
-    if ft_neighbours > 0:
-        context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n"
-        context_tokens = tokenizer.tokenize(context)
-        dialogue_tokens = tokenizer.tokenize(dialogue_turn)
-        system_tokens = tokenizer.tokenize(system)
-        context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)]
-        context = tokenizer.detokenize(context_tokens)
-
-        if template_id == 0:
-            all_input = system + context + dialogue_turn
-        else:
-            all_input = flan_format(system, context, dialogue_turn, template_id=template_id)
-        input_tokens = tokenizer.tokenize(all_input)
-    else:
-        all_input = system + dialogue_turn
-        input_tokens = tokenizer.tokenize(all_input)
-
-    return input_tokens
-
-
-def reformat_prompt_short(query, neighbours, dataset_name, ft_neighbours, \
-                          max_output_len, tokenizer, max_seq_length):
-    if not query.endswith("?"):
-        query = query + "?"
-    query = "Question: {} Answer: The answer is".format(query)
-
-    if ft_neighbours > 0:
-        context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n"
-        context_tokens = tokenizer.tokenize(context)
-        dialogue_tokens = tokenizer.tokenize(query)
-        context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens)]
-        context = tokenizer.detokenize(context_tokens)
-        all_input = context + query
-        input_tokens = tokenizer.tokenize(all_input)
-    else:
-        all_input = query
-        input_tokens = tokenizer.tokenize(all_input)
-
-    return input_tokens
-
-
-def pad_and_convert_to_numpy(input_ids, output_ids,
-                             pad_id, max_seq_length,
-                             eos_id):
-    """Pad sequences and convert them to numpy."""
-    if len(input_ids) > max_seq_length:
-        input_ids = input_ids[:max_seq_length - 1]
-
-    if len(input_ids + output_ids) > max_seq_length:
-        output_ids = output_ids[:max_seq_length - len(input_ids)]
-
-    tokens = input_ids + output_ids
-    answer_mask = [0] * len(input_ids) + [1] * len(output_ids)
-
-    # padding
-    num_tokens = len(tokens)
-    padding_length = max_seq_length - num_tokens
-    assert padding_length >= 0
-
-    # Tokens.
-    filler = [pad_id] * padding_length
-    tokens = numpy.array(tokens + [eos_id] + filler, dtype=numpy.int64)
-
-    # answer mask
-    answer_mask = answer_mask + [1] + [0] * padding_length
-    answer_mask = numpy.array(answer_mask, dtype=numpy.int64)
-
-    return tokens, answer_mask
diff --git a/tools/retro/sft/open_inst.sh b/tools/retro/sft/open_inst.sh
deleted file mode 100644
index 9ebe063b810..00000000000
--- a/tools/retro/sft/open_inst.sh
+++ /dev/null
@@ -1 +0,0 @@
-DATA_BLEND="1.0 open_inst"
diff --git a/tools/retro/sft/sft_retro.py b/tools/retro/sft/sft_retro.py
deleted file mode 100644
index e71d841f4df..00000000000
--- a/tools/retro/sft/sft_retro.py
+++ /dev/null
@@ -1,278 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Pretrain GPT"""
-
-import torch
-from functools import partial, reduce
-import sys, os
-
-sys.path.append(os.path.abspath(os.path.join(
-    os.path.join(os.path.dirname(__file__), "../../../"))))
-from megatron.training import get_args, get_retro_args
-from megatron.training import print_rank_0
-from megatron.training import get_timers
-from megatron.training import get_tokenizer
-from megatron.core import tensor_parallel
-from megatron.core.enums import ModelType
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.training import pretrain
-from megatron.training.utils import get_ltor_masks_and_position_ids
-from megatron.training.utils import average_losses_across_data_parallel_group
-from pretrain_gpt import is_dataset_built_on_rank
-from model_provider import model_provider
-from gpt_builders import gpt_builder
-from tools.retro.sft.dataset_conv import JsonQADataset, JsonQADatasetConfig, RetroJsonQADataset, RetroJsonQADatasetConfig
-
-
-def get_tasks_args(parser):
-    """Provide extra arguments required for tasks."""
-    group = parser.add_argument_group(title='tasks')
-
-    # parameters for the knowledgeable dialogue generation
-    group.add_argument('--task', type=str, default=None,
-                       help='Task name.')
-    group.add_argument('--epochs', type=int, default=None,
-                       help='Number of finetunning epochs. Zero results in '
-                            'evaluation only.')
-    group.add_argument('--keep-last', action='store_true',
-                       help='Keep the last batch (maybe incomplete) in'
-                            'the data loader')
-    group.add_argument('--pretrained-checkpoint', type=str, default=None,
-                       help='Pretrained checkpoint used for finetunning.')
-    group.add_argument('--data-folder', type=str, default=None,
-                       help='dataset folder')
-    group.add_argument('--answer-loss-only', action='store_true', default=False,
-                       help='take the loss from answer part, ignore the context')
-    group.add_argument('--weight', type=float, default=1)
-    group.add_argument('--adaptor', action='store_true', default=False)
-    group.add_argument('--project-size', type=int, default=256)
-    group.add_argument('--cyclic-train-iters', type=int, default=None)
-    group.add_argument('--stored_params', type=dict, default=dict())
-    group.add_argument('--eval_ppl', action='store_true', default=False)
-    group.add_argument('--debug', action='store_true', default=False)
-    group.add_argument('--add_retriever', action='store_true', default=False)
-    group.add_argument('--return_doc_ids', action='store_true', default=False)
-    group.add_argument('--return_neighbor_ids', action='store_true', default=False)
-    group.add_argument('--add_offset_doc_ids', action='store_true', default=False)
-    group.add_argument('--offset_dict_path', type=str, default='')
-    group.add_argument('--neighbors_path', type=str, default='')
-    group.add_argument('--valid_neighbors_path', type=str, default='')
-    group.add_argument('--database_path', type=str, default='')
-    group.add_argument('--valid_database_path', type=str, default='')
-    group.add_argument('--encoder-layers', type=int, default=12)
-    group.add_argument('--encoder-hidden-dropout', type=float, default=0.1)
-    group.add_argument('--encoder-attention-dropout', type=float, default=0.1)
-    group.add_argument('--k', type=int, default=2)
-    group.add_argument('--r', type=int, default=128)
-    group.add_argument('--m', type=int, default=64)
-    group.add_argument('--dpr-mode', type=str, default="multi")
-    group.add_argument('--faiss-ckpt', type=str, default='')
-    group.add_argument('--original-db-file', type=str, default="")
-    group.add_argument('--ft_neighbours', type=int, default=1)
-    group.add_argument('--reuse-top', action='store_true', default=False)
-    group.add_argument('--shuffle_topn', action='store_true', default=False)
-    group.add_argument('--chunk0', action='store_true', default=False)
-    group.add_argument('--disable-encoder', action='store_true', default=False)
-    group.add_argument('--qa-space-pad', action='store_true', default=False)
-    group.add_argument('--retro-mask-encoder', action='store_true', default=False)
-    group.add_argument('--without-title', action='store_true', default=False)
-    group.add_argument('--longform-answer', action='store_true', default=False)
-    group.add_argument('--bert-retriever-neighbours', action='store_true', default=False)
-    group.add_argument('--prefix', action='store_true', default=False)
-    group.add_argument('--question-in-encoder', action='store_true', default=False)
-    group.add_argument('--reset_eval', type=bool, default=True)  ## by default reset eval for each eval
-    return parser
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Items and their type.
-    keys = ['text', 'answer_mask']
-    datatype = torch.int64
-
-    if args.retro_add_retriever:
-        keys += 'neighbor_tokens', 'context_len'
-
-    # Broadcast data.
-    if data_iterator is not None:
-        try:
-            data = next(data_iterator)
-
-        except Exception:
-            data = data_iterator
-            raise ValueError("error with data_iterator")
-    else:
-        data = None
-
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-    chunk_size = torch.min(data_b['context_len'])
-    retro_args = get_retro_args()
-    # two chunk retro has at least seq_len / 2 of chunk size
-    retro_args.retro_gpt_chunk_length = max(args.seq_length // 2, args.seq_length - chunk_size.item())
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    answer_mask = data_b["answer_mask"].float()[:, 1:].contiguous()
-
-    if args.retro_add_retriever:
-        neighbor_tokens = data_b['neighbor_tokens'].view(-1,
-                                                         retro_args.retro_gpt_retrieved_length).long()  # [bs * l * k, r]
-
-    # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    if args.answer_loss_only:
-        loss_mask = loss_mask * answer_mask
-
-    if args.retro_add_retriever:
-        _, _, neighbor_position_ids = get_ltor_masks_and_position_ids(
-            neighbor_tokens,
-            tokenizer.eod,
-            args.reset_position_ids,
-            args.reset_attention_mask,
-            args.eod_mask_loss)
-        neighbor_attention_mask = None
-        return tokens, labels, loss_mask, attention_mask, position_ids, \
-            neighbor_tokens, neighbor_attention_mask, neighbor_position_ids
-    else:
-        return tokens, labels, loss_mask, attention_mask, position_ids
-
-
-def loss_func(loss_mask, output_tensor):
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    if args.retro_add_retriever:
-        timers('batch-generator', log_level=2).start()
-        tokens, labels, loss_mask, attention_mask, position_ids, \
-            neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = get_batch(
-            data_iterator)
-        timers('batch-generator').stop()
-        output_tensor = model(tokens, position_ids, attention_mask,
-                              retriever_input_ids=neighbor_tokens,
-                              retriever_position_ids=neighbor_position_ids,
-                              retriever_attn_mask=neighbor_attention_mask,
-                              labels=labels)
-    else:
-        timers('batch-generator', log_level=2).start()
-        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-            data_iterator)
-        timers('batch-generator').stop()
-        output_tensor = model(tokens, position_ids, attention_mask,
-                              labels=labels)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-    retro_args = get_retro_args()
-
-    tokenizer = get_tokenizer()
-
-    def fix_and_split_blend_pair(pair):
-        weight, name = pair
-        return [
-            [weight, os.path.join(args.data_folder, name, f"{name}_QA_train.json")],
-            [weight, os.path.join(args.data_folder, name, f"{name}_QA_dev.json")],
-            None,
-        ]
-
-    blend = [args.data_path[i:i+2] for i in range(0, len(args.data_path), 2)]
-
-    if len(blend) == 1:
-        blend_per_split =  [
-            os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_train.json"),
-            os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_dev.json"),
-            None,
-        ]
-    else:
-        blend_per_split = [
-            list(
-                reduce(
-                    lambda x, y: x + y,
-                    list(zip(*map(fix_and_split_blend_pair, blend)))[0]
-                )
-            ),
-            None,
-            None,
-        ]
-
-    blend_per_split = [get_blend_from_list(blend) for blend in blend_per_split]
-
-    extra_kwargs = {}
-
-    if args.retro_add_retriever:
-        dataset_cls = RetroJsonQADataset
-        config_cls = RetroJsonQADatasetConfig
-        extra_kwargs["retro_num_neighbors"] = args.retro_num_neighbors
-        extra_kwargs["retro_gpt_retrieved_length"] = retro_args.retro_gpt_retrieved_length
-    else:
-        dataset_cls = JsonQADataset
-        config_cls = JsonQADatasetConfig
-
-    config = config_cls(
-        random_seed=args.seed,
-        sequence_length=args.seq_length,
-        blend_per_split=blend_per_split,
-        split=args.split,
-        path_to_cache=args.data_cache_path,
-        tokenizer=tokenizer,
-        ft_neighbours=args.ft_neighbours,
-        bert_retriever_neighbours=args.bert_retriever_neighbours,
-        longform_answer=args.longform_answer,
-        inference_only=False,
-        retrieved_neighbours=False,
-        fix_newsqa=True,
-        mid_level_dataset_surplus=args.mid_level_dataset_surplus,
-        **extra_kwargs
-    )
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        dataset_cls,
-        train_val_test_num_samples,
-        is_dataset_built_on_rank,
-        config
-    ).build()
-    print_rank_0("> finished creating GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-if __name__ == "__main__":
-
-    # Temporary for transition to core datasets
-    train_valid_test_datasets_provider.is_distributed = True
-
-    pretrain(train_valid_test_datasets_provider, partial(model_provider, gpt_builder),
-        ModelType.retro_decoder,  # ModelType.encoder_or_decoder,
-        forward_step,
-        extra_args_provider=get_tasks_args
-    )
diff --git a/tools/retro/sft/sft_retro_lm.sh b/tools/retro/sft/sft_retro_lm.sh
deleted file mode 100644
index 8c13f1052c1..00000000000
--- a/tools/retro/sft/sft_retro_lm.sh
+++ /dev/null
@@ -1,150 +0,0 @@
-#!/bin/bash
-# bash examples/qa/finetune_normal_lm.sh landrover_tasb_retrieved 843m 1 3e-6 1
-
-blend_name=$1
-model_size=$2
-global_bsz=$3
-lr=$4
-ft_neighbours=1
-model_card=pp1
-ckpt=$5
-TASK=none
-
-train_iters=1000
-
-
-DATA_HOME="<path/to/instruction/tuning/data/directory>"
-data_folder="$DATA_HOME"
-
-SFT_HOME="<path/to/megatron/repo>"
-
-TOKENIZER_MODEL="<path/to/gpt/tokenizer/model>"
-
-RETRO_WORKDIR="<path/to/retro/workdir>"
-
-K=2
-
-PRETRAINED_CHECKPOINT=${ckpt}
-
-SAVENAME="retro-${blend_name}_${model_card}_same_format_ctx${ft_neighbours}_${model_size}_${global_bsz}_${lr}"
-CHECKPOINT_PATH="${SFT_HOME}/checkpoints/applications/${SAVENAME}"
-TENSORBOARD_DIR="${SFT_HOME}/tensorboard/${SAVENAME}"
-mkdir -p ${TENSORBOARD_DIR}
-
-. ./tools/retro/sft/"${blend_name}".sh
-
-
-if [[ $model_size == "843m" ]]; then
-    # model param
-    mod_par=1
-    layers=24
-    hid_dim=1024
-    heads=16
-    pip_par=1
-
-    # node param
-    num_nodes=1
-    lr=5e-6
-    min_lr=5e-6
-fi
-
-
-GPT_ARGS="--apply-layernorm-1p \
-        --untie-embeddings-and-output-weights \
-        --disable-bias-linear \
-        --no-position-embedding \
-        --use-rotary-position-embeddings \
-        --rotary-percent 0.5 \
-        --swiglu \
-        --attention-dropout 0.0 \
-        --hidden-dropout 0.0 \
-        --pipeline-model-parallel-size $pip_par \
-        --tensor-model-parallel-size $mod_par \
-        --num-layers $layers \
-        --hidden-size $hid_dim \
-        --num-attention-heads $heads \
-        --seq-length 4096 \
-        --max-position-embeddings 4096 \
-        --lr-decay-style cosine \
-        --tokenizer-type GPTSentencePieceTokenizer \
-        --tokenizer-model ${TOKENIZER_MODEL} \
-        --clip-grad 1.0 \
-        --weight-decay 0.01 \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.98 \
-        --log-params-norm \
-        --log-num-zeros-in-grad \
-        --bf16 \
-        --use-distributed-optimizer \
-"
-
-FT_ARGS="--eod-mask-loss \
-    --answer-loss-only \
-    --ft_neighbours ${ft_neighbours} \
-    --task $TASK"
-
-
-OUTPUT_ARGS="--log-interval 10 \
-             --save-interval 500 \
-             --eval-interval 200 \
-             --tensorboard-dir ${TENSORBOARD_DIR} \
-             --log-validation-ppl-to-tensorboard \
-             --eval-iters 100"
-
-options=" \
-    $GPT_ARGS \
-    --retro-workdir ${RETRO_WORKDIR} \
-    --retro-add-retriever \
-    --retro-num-neighbors ${K} \
-    --retro-attention-gate 0 \
-    --data-path ${DATA_BLEND} \
-    --data-folder ${data_folder} \
-    --recompute-activations \
-    --lr $lr \
-    --micro-batch-size 1 \
-    --global-batch-size ${global_bsz} \
-    --min-lr ${min_lr} \
-    --retro-cyclic-train-iters ${train_iters} \
-    --train-iters ${train_iters} \
-    --dataloader-type cyclic \
-    --save $CHECKPOINT_PATH \
-    $OUTPUT_ARGS \
-    $FT_ARGS"
-
-if [[ -d "$CHECKPOINT_PATH" ]]; then
-  options="$options \
-      --load $CHECKPOINT_PATH "
-else
-  echo $PRETRAINED_CHECKPOINT
-  options="$options \
-      --load $PRETRAINED_CHECKPOINT \
-      --finetune \
-      --no-load-rng \
-      --no-load-optim "
-fi
-
-######## Command. ########
-
-run_cmd="python -u ${SFT_HOME}/tools/retro/sft/sft_retro.py ${options}"
-
-export NCCL_DEBUG=INFO
-export NCCL_IB_TIMEOUT=19
-export NCCL_IB_SL=1
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-NPROCS=8
-CMD="\
-    pwd && cd ${SFT_HOME} && pwd && \
-    export PYTHONPATH=$PYTHONPATH:${SFT_HOME} && \
-    python -m torch.distributed.run \
-    --nproc_per_node ${NPROCS} \
-    --nnodes 1 \
-    --node_rank 0 \
-    --master_port 6000 \
-    ${run_cmd} \
-"
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "CMD = '$CMD'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-eval $CMD
-
diff --git a/tools/retro/text_generation/evaluate.py b/tools/retro/text_generation/evaluate.py
deleted file mode 100755
index 2031118cdc3..00000000000
--- a/tools/retro/text_generation/evaluate.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-import sys
-import os
-from tqdm import tqdm
-import string
-import json
-import regex
-import numpy as np
-
-sys.path.append(os.path.abspath(os.path.join(
-    os.path.join(os.path.dirname(__file__), "../../../"))))
-from tools.retro.text_generation.metrics import F1Metric
-
-
-def normalize_answer(s):
-    def remove_articles(text):
-        return regex.sub(r'\b(a|an|the)\b', ' ', text)
-
-    def white_space_fix(text):
-        return ' '.join(text.split())
-
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
-
-    def lower(text):
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def compute_f1_score(predicted_answers, groundtruth_answer, exp_name="default"):
-    """Evaluating F1 Score"""
-    print(len(predicted_answers), len(groundtruth_answer))
-    if len(predicted_answers) != len(groundtruth_answer):
-        groundtruth_answer = groundtruth_answer[:len(predicted_answers)]
-
-    guess_list = []
-    answer_list = []
-
-    assert len(guess_list) == len(answer_list), \
-        "lengths of guess and answer are different!"
-
-    for pred, ans in zip(predicted_answers, groundtruth_answer):
-        pred = pred.strip()
-        if type(ans) == str:
-            ans = ans.strip()
-        elif type(ans) == dict:
-            ans = ans['text'].strip()
-        elif ans == None:
-            continue
-        if "<|endoftext|>" in pred:
-            pred = pred.replace("<|endoftext|>", "")
-        if ans == "no_passages_used":
-            ans = ""
-        guess_list.append(pred)
-        answer_list.append(ans)
-
-    precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list)
-    print('Method: %s; Precision: %.4f; recall: %.4f; f1: %.4f' % ( \
-        exp_name, precision, recall, f1))
-
-
-def load_groundtruth_file(data_file):
-    with open(data_file, "r") as f:
-        nq_examples = json.load(f)
-
-    data = []
-    for instance in nq_examples:
-        if "answers" in instance:
-            answers = instance["answers"]
-            if len(answers) < 1:
-                answers = [None]
-        elif "answer" in instance:
-            if type(instance["answer"]) is str:
-                answers = [instance["answer"]]
-            elif type(instance["answer"]) is list:
-                answers = instance["answer"]
-            else:
-                answers = [str(instance["answer"])]
-        else:
-            raise ValueError("need to have answer or answers")
-        data.append(answers[0])
-
-    return data
-
-
-def read_prediction(prediction_file):
-    prediction_list = []
-    print('reading %s' % prediction_file)
-    with open(prediction_file, "r") as f:
-        for i, line in enumerate(tqdm(f)):
-            if prediction_file.endswith("jsonl"):
-                line = json.loads(line)["pred"]
-                # print(line)
-            line = line.replace("Answer:", "")
-            line = line.replace("Answer: ", "")
-            line = line.replace('????  ', "")
-            line = line.replace('A: ', "")
-            line = line.replace("A:", "")
-
-            line = line.strip()
-
-            if "<|endoftext|>" in line:
-                line = line.replace("<|endoftext|>", "")
-            line = normalize_answer(line)  # normalize the answer
-            prediction_list.append(line)
-
-    return prediction_list
-
-
-def exact_match_score(prediction, ground_truth):
-    return normalize_answer(prediction) == normalize_answer(ground_truth)
-
-
-def ems(prediction, ground_truths):
-    return max([exact_match_score(prediction, gt) for gt in ground_truths])
-
-
-def evaluate_ems(prediction_file, ground_truth_file, dev_num=3000):
-    prediction_list = read_prediction(prediction_file)
-    ground_truths_list = []
-
-    if ground_truth_file.endswith(('txt', 'lst')):
-        raw_data = open(ground_truth_file, 'r')
-    else:
-        with open(ground_truth_file, 'r') as f:
-            raw_data = json.load(f)
-    if "dev" in ground_truth_file:
-        raw_data = raw_data[:dev_num]
-        prediction_list = prediction_list[:dev_num]
-
-    for each in raw_data:
-        if ground_truth_file.endswith('txt'):
-            each = json.loads(each)
-
-        if 'answers' in each:
-            ground_truths_list.append(each['answers'])
-        elif 'answer' in each:
-            ground_truths_list.append(each['answer'])
-        else:
-            ground_truths_list.append([each])
-
-    exactmatch = []
-
-    good_example_list = []
-    for i, each in enumerate(prediction_list):
-        score = ems(each, ground_truths_list[i])
-        exactmatch.append(score)
-        if score:
-            good_example_list.append(i)
-
-    final_em_score = np.mean(exactmatch)
-
-    print('Exact Match: %.4f;' % final_em_score)
-
-    print('done :-)')
-
-    return final_em_score, exactmatch
-
-
-def load_prediction(data_file):
-    data = []
-    with open(data_file, "r") as f:
-        for line in f.readlines():
-            data.append(line.strip())
-
-    return data
-
-
-def evaluate_f1(ground_truth_file, prediction_file, reduced_test_only=False):
-    groundtruth_answer = load_groundtruth_file(ground_truth_file)
-    predicted_answers = load_prediction(prediction_file)
-    if not reduced_test_only:
-        compute_f1_score(predicted_answers, groundtruth_answer)
-
-
-if __name__ == "__main__":
-    model_names = []
-    model_names += "retro-open_inst_pp1_same_format_ctx1_843m_128_5e-6",
-
-    for model_name in model_names:
-        ckpt_path = "/path/to/checkpoints/{}/".format(model_name)
-
-        n_ctx = 5
-        n_enc = 2
-        iter = 1000
-        model_param = "843m"
-
-        prediction_file = ckpt_path + "/retro-generate-nq_{}_{}_{}_test_greedy_0_20000_{}.txt".format(
-            n_ctx, n_enc, model_param, iter)
-        ground_truth_file = "/path/to/NQ/test.json"
-        print(prediction_file)
-        print(ground_truth_file)
-        evaluate_f1(ground_truth_file, prediction_file)
-        evaluate_ems(prediction_file, ground_truth_file)
-
-        print("=====================================")
diff --git a/tools/retro/text_generation/metrics.py b/tools/retro/text_generation/metrics.py
deleted file mode 100755
index bd0b5fe6b32..00000000000
--- a/tools/retro/text_generation/metrics.py
+++ /dev/null
@@ -1,80 +0,0 @@
-
-# The following code is adapted from
-# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 
-# which is licensed under the MIT license. More details on the license can be 
-# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE.
-
-"""Provides standard metric evaluations for dialog."""
-
-from collections import Counter
-from typing import List
-import numpy as np
-import re
-from nltk import ngrams
-
-re_art = re.compile(r'\b(a|an|the)\b')
-re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
-
-
-def normalize_answer(s):
-    """
-    Lower text and remove punctuation, articles and extra whitespace.
-    """
-    s = s.lower()
-    s = re_punc.sub(' ', s)
-    s = re_art.sub(' ', s)
-    s = ' '.join(s.split())
-    return s
-
-
-class F1Metric:
-    """
-    Helper class which computes token-level F1.
-    """
-
-    @staticmethod
-    def _prec_recall_f1_score(pred_items, gold_items):
-        """
-        Compute precision, recall and f1 given a set of gold and prediction items.
-        :param pred_items: iterable of predicted values
-        :param gold_items: iterable of gold values
-        :return: tuple (p, r, f1) for precision, recall, f1
-        """
-        common = Counter(gold_items) & Counter(pred_items)
-        num_same = sum(common.values())
-        if num_same == 0:
-            return 0, 0, 0
-        precision = 1.0 * num_same / len(pred_items)
-        recall = 1.0 * num_same / len(gold_items)
-        f1 = (2 * precision * recall) / (precision + recall)
-        return precision, recall, f1
-
-    @staticmethod
-    def compute_each_pair(guess: str, answer: str, n=1):
-        if answer == "":
-            return None, None, None
-        if guess == "":
-            return 0, 0, 0
-        g_tokens = normalize_answer(guess).split()
-        a_tokens = normalize_answer(answer).split()
-        g_tokens = list(ngrams(g_tokens, n))
-        a_tokens = list(ngrams(a_tokens, n))
-        precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens)
-        return precision, recall, f1
-
-    @staticmethod
-    def compute_all_pairs(guesses: List[str], answers: List[str], n=1):
-        # additional augment:
-        print("guess:", len(guesses), ", answers:", len(answers))
-        assert len(guesses) == len(answers)
-
-        precision_list, recall_list, f1_list = [], [], []
-        for guess, answer in zip(guesses, answers):
-            precision, recall, f1 = F1Metric.compute_each_pair(guess, answer, n)
-            if precision is None or recall is None or f1 is None:
-                continue
-            precision_list.append(precision)
-            recall_list.append(recall)
-            f1_list.append(f1)
-
-        return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
diff --git a/tools/retro/text_generation/retro_api.py b/tools/retro/text_generation/retro_api.py
deleted file mode 100644
index b70677485d4..00000000000
--- a/tools/retro/text_generation/retro_api.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-"""Inference API."""
-import numpy as np
-import torch
-from megatron.core import mpu
-from megatron.training import print_rank_0, get_retro_args, get_args, get_tokenizer
-from megatron.inference.text_generation.communication import broadcast_float_list, broadcast_tensor, broadcast_int_list
-from megatron.inference.text_generation.generation import (
-    score_and_return_on_first_stage)
-from tools.retro.text_generation.retro_generation import (
-    retro_generate_tokens_probs_and_return_on_first_stage)
-from megatron.inference.text_generation.tokenization import (
-    detokenize_generations)
-
-
-def tokenize_prompts(prompts=None, tokens_to_generate=None,
-                     add_BOS=None, rank=0):
-    """Tokenize prompts and make them avaiable on all ranks."""
-
-    # On all ranks set to None so we can pass them to functions
-    sizes_list = None
-    prompts_tokens_cuda_long_tensor = None
-    prompts_length_cuda_long_tensor = None
-
-    # On the specified rank, build the above.
-    if torch.distributed.get_rank() == rank:
-        assert prompts is not None
-        assert tokens_to_generate is not None
-        # Tensor of tokens padded and their unpadded length.
-        prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
-            _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
-        # We need the sizes of these tensors for the boradcast
-        sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size
-                      prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght
-
-    # First, broadcast the sizes.
-    sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
-
-    # Now that we have the sizes, we can boradcast the tokens
-    # and length tensors.
-    sizes = sizes_tensor.tolist()
-    prompts_tokens_cuda_long_tensor = broadcast_tensor(
-        sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
-    prompts_length_cuda_long_tensor = broadcast_tensor(
-        sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
-        rank=rank)
-
-    return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
-
-
-def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
-    """Given a set of prompts and number of tokens to generate:
-        - tokenize prompts
-        - set the sequence length to be the max of length of prompts
-          plus the number of tokens we would like to generate
-        - pad all the sequences to this length so we can convert them
-          into a 2D tensor.
-    """
-
-    # Tokenize all the prompts.
-    tokenizer = get_tokenizer()
-    if add_BOS:
-        prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
-                          for prompt in prompts]
-    else:
-        prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
-
-    # Now we have a list of list of tokens which each list has a different
-    # size. We want to extend this list to:
-    #   - incorporate the tokens that need to be generated
-    #   - make all the sequences equal length.
-    # Get the prompts length.
-    prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
-    # Get the max prompts length.
-    max_prompt_len = max(prompts_length)
-    # Set the tokens to generate to the max prompts length for Retro
-    args = get_args()
-    if args.retro_add_retriever:
-        tokens_to_generate = max_prompt_len
-    # Number of tokens in the each sample of the batch.
-    samples_length = max_prompt_len + tokens_to_generate
-    # Now update the list of list to be of the same size: samples_length.
-    for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
-        padding_size = samples_length - prompt_length
-        prompt_tokens.extend([tokenizer.eod] * padding_size)
-
-    # Now we are in a structured format, we can convert to tensors.
-    prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens)
-    prompts_length_tensor = torch.cuda.LongTensor(prompts_length)
-
-    return prompts_tokens_tensor, prompts_length_tensor
-
-
-def retro_generate_and_post_process(model,
-                              prompts=None,
-                              neighbours_array=None,
-                              tokens_to_generate=0,
-                              return_output_log_probs=False,
-                              top_k_sampling=0,
-                              top_p_sampling=0.0,
-                              temperature=1.0,
-                              add_BOS=False,
-                              use_eod_token_for_early_termination=True,
-                              random_seed=-1,
-                              logits_mask=None):
-    """Run inference and post-process outputs, i.e., detokenize,
-    move to cpu and convert to list."""
-
-    # Main inference.
-    tokens, lengths, output_log_probs = retro_generate(
-        model,
-        prompts=prompts,
-        neighbours_array=neighbours_array,
-        tokens_to_generate=tokens_to_generate,
-        return_output_log_probs=return_output_log_probs,
-        top_k_sampling=top_k_sampling,
-        top_p_sampling=top_p_sampling,
-        temperature=temperature,
-        add_BOS=add_BOS,
-        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
-        random_seed=random_seed,
-        logits_mask=logits_mask)
-
-    # Only post-process on first stage.
-    if mpu.is_pipeline_first_stage():
-        tokens, prompts_plus_generations, prompts_plus_generations_segments = \
-            detokenize_generations(tokens, lengths, True)
-
-        if return_output_log_probs:
-            output_log_probs = output_log_probs.cpu().numpy().tolist()
-            for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)):
-                output_log_probs[i] = prob[:len(seg) - 1]
-
-        return prompts_plus_generations, prompts_plus_generations_segments, \
-               output_log_probs, tokens
-
-    return None
-
-
-def retro_generate(model,
-             prompts=None,
-             neighbours_array=None,
-             tokens_to_generate=0,
-             return_output_log_probs=False,
-             top_k_sampling=0,
-             top_p_sampling=0.0,
-             temperature=1.0,
-             add_BOS=False,
-             use_eod_token_for_early_termination=True,
-             stop_on_double_eol=False,
-             stop_on_eol=False,
-             random_seed=-1,
-             logits_mask=None):
-    """Given prompts and input parameters, run inference and return:
-       tokens: prompts plus the generated tokens.
-       lengths: length of the prompt + generations. Note that we can
-           discard tokens in the tokens tensor that are after the
-           corresponding length.
-       output_log_probs: log probs of the tokens.
-    """
-
-    # Make sure input params are avaialble to all ranks.
-    values = [tokens_to_generate,
-              return_output_log_probs,
-              top_k_sampling, top_p_sampling,
-              temperature, add_BOS, use_eod_token_for_early_termination,
-              stop_on_double_eol,
-              stop_on_eol,
-              random_seed]
-    values_float_tensor = broadcast_float_list(10, float_list=values)
-    tokens_to_generate = int(values_float_tensor[0].item())
-    return_output_log_probs = bool(values_float_tensor[1].item())
-    top_k_sampling = int(values_float_tensor[2].item())
-    top_p_sampling = values_float_tensor[3].item()
-    temperature = values_float_tensor[4].item()
-    add_BOS = bool(values_float_tensor[5].item())
-    use_eod_token_for_early_termination = bool(values_float_tensor[6].item())
-    stop_on_double_eol = bool(values_float_tensor[7].item())
-    stop_on_eol = bool(values_float_tensor[8].item())
-    random_seed = int(values_float_tensor[9].item())
-
-    if random_seed != -1:
-        torch.random.manual_seed(random_seed)
-
-    # Tokenize prompts and get the batch.
-    # Note that these tensors are broadcaseted to all ranks.
-    if torch.distributed.get_rank() == 0:
-        assert prompts is not None
-
-    context_tokens_tensor, context_length_tensor = tokenize_prompts(
-        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
-
-    retro_args = get_retro_args()
-    retro_args.retro_gpt_chunk_length = context_length_tensor.item()
-
-    retro_args = get_retro_args()
-    args = get_args()
-    r = retro_args.retro_gpt_retrieved_length
-    l = int(np.ceil(min(args.max_position_embeddings, context_tokens_tensor.size(1)) / retro_args.retro_gpt_chunk_length))
-    if torch.distributed.get_rank() == 0:
-        neighbours_array = neighbours_array.reshape(1, args.retro_num_neighbors, r).repeat(l, axis=0)  ## dim (l, k, r)
-
-    if tokens_to_generate == 0:
-        return score_and_return_on_first_stage(
-            model, context_tokens_tensor, context_length_tensor)
-
-    # Main inference function.
-    # Note that the outputs are available on the first stage.
-    return retro_generate_tokens_probs_and_return_on_first_stage(
-        model, context_tokens_tensor, context_length_tensor,
-        neighbours_array=neighbours_array,
-        return_output_log_probs=return_output_log_probs,
-        top_k=top_k_sampling,
-        top_p=top_p_sampling,
-        temperature=temperature,
-        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
-        stop_on_double_eol=stop_on_double_eol,
-        stop_on_eol=stop_on_eol,
-        logits_mask=logits_mask)
\ No newline at end of file
diff --git a/tools/retro/text_generation/retro_generate.sh b/tools/retro/text_generation/retro_generate.sh
deleted file mode 100755
index 53f7d76476f..00000000000
--- a/tools/retro/text_generation/retro_generate.sh
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/bin/bash
-
-TASK=$1
-model_size=$2
-sampling=$3
-split=$4
-gen_start=$5
-num_gen=$6
-ckpt_step=${7}
-ft_neighbours=${8}
-model_card=${9}
-ckpt=${10}
-K=${11}
-retrieve=${12}
-
-QA_HOME="<path/to/megatron/repo>"
-
-TOKENIZER_MODEL="<path/to/gpt/tokenizer/model>"
-
-RETRO_WORKDIR="<path/to/retro/workdir>"
-
-
-if [[ $model_size == "843m" ]]; then
-    mod_par=1
-    layers=24
-    hid_dim=1024
-    heads=16
-    pip_par=1
-fi
-
-GPT_ARGS="--apply-layernorm-1p \
-        --untie-embeddings-and-output-weights \
-        --disable-bias-linear \
-        --no-position-embedding \
-        --use-rotary-position-embeddings \
-        --rotary-percent 0.5 \
-        --swiglu \
-        --attention-dropout 0.0 \
-        --hidden-dropout 0.0 \
-        --pipeline-model-parallel-size $pip_par \
-        --tensor-model-parallel-size $mod_par \
-        --num-layers $layers \
-        --hidden-size $hid_dim \
-        --num-attention-heads $heads \
-        --seq-length 4096 \
-        --max-position-embeddings 4096 \
-        --lr-decay-style cosine \
-        --tokenizer-type GPTSentencePieceTokenizer \
-        --tokenizer-model ${TOKENIZER_MODEL} \
-        --clip-grad 1.0 \
-        --weight-decay 0.01 \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.98 \
-        --log-params-norm \
-        --log-num-zeros-in-grad \
-        --bf16 \
-"
-
-
-sample_input_file="/path/to/instruct_tuning/data/$TASK/${split}.json"
-
-top_k=1
-micro_bsz=1
-SAMPLE_ARGS="--top_k $top_k"
-
-CHECKPOINT_PATH=${ckpt}
-sample_output_file="${CHECKPOINT_PATH}/retro-generate-${TASK}_${ft_neighbours}_${K}_${model_size}_${split}_${sampling}_${gen_start}_${num_gen}_${ckpt_step}.txt"
-
-DIR=`pwd`
-
-echo $sample_input_file
-echo $sample_output_file
-
-
-GEN_ARGS="$SAMPLE_ARGS \
-          --gen-start-idx $gen_start \
-          --num-gen $num_gen \
-          --ckpt-step ${ckpt_step} \
-          --sample-input-file $sample_input_file \
-          --sample-output-file $sample_output_file \
-          --retro-workdir ${RETRO_WORKDIR} \
-          --retro-add-retriever \
-          --retro-num-neighbors ${K} \
-          --reuse-top \
-          --retro-attention-gate 0 \
-          "
-
-if [[ $retrieve == 1 ]]; then
-    GEN_ARGS="$GEN_ARGS \
-          --use-retrieved-neighbours \
-          "
-fi
-
-FT_ARGS="--eod-mask-loss \
-    --answer-loss-only \
-    --ft_neighbours ${ft_neighbours} \
-    --task $TASK"
-
-DISTRIBUTED_ARGS="--nproc_per_node ${mod_par} \
-                  --nnodes ${pip_par} \
-                  --node_rank 0 \
-                  --master_port 8889"
-
-######## Command. ########
-
-COMMAND="python -m torch.distributed.run $DISTRIBUTED_ARGS ${DIR}/tools/retro/text_generation/retro_text_generation.py"
-
-COMMAND="$COMMAND \
-       $GPT_ARGS \
-       $GEN_ARGS \
-       --load $CHECKPOINT_PATH \
-       --micro-batch-size $micro_bsz \
-       $FT_ARGS"
-
-export NCCL_DEBUG=INFO
-export NCCL_IB_TIMEOUT=19
-export NCCL_IB_SL=1
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "CMD = '$CMD'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-eval $COMMAND
-
diff --git a/tools/retro/text_generation/retro_generation.py b/tools/retro/text_generation/retro_generation.py
deleted file mode 100644
index f69103de772..00000000000
--- a/tools/retro/text_generation/retro_generation.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-"""Generation utilities."""
-import torch
-import torch.nn.functional as F
-from megatron.training import get_args, get_tokenizer
-from megatron.training import get_retro_args
-from megatron.core import mpu
-from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model
-from megatron.inference.text_generation.communication import (
-    copy_from_last_to_first_pipeline_stage,
-    broadcast_from_last_pipeline_stage,
-    broadcast_from_last_to_first_pipeline_stage, broadcast_int_list, broadcast_tensor)
-from megatron.inference.text_generation.generation import _build_attention_mask_and_position_ids
-from megatron.inference.text_generation.sampling import sample
-
-
-
-def retro_generate_tokens_probs_and_return_on_first_stage(
-        model, tokens, lengths, neighbours_array=None,
-        return_output_log_probs=False,
-        top_k=0, top_p=0.0,
-        temperature=1.0,
-        use_eod_token_for_early_termination=True,
-        stop_on_double_eol=False,
-        stop_on_eol=False,
-        logits_mask=None):
-    """Main token generation function.
-
-    Args:
-        model: no interleaving is supported.
-        tokens: prompt tokens extended to be of size [b, max-sequence-length]
-        lengths: original prompt length, size: [b]
-        neighbours_array: neighbours array of size [b, l, k, r]
-        return_output_log_probs: flag to calculate the log probability of
-            the generated tokens. Note that the log probability is the one
-            from the original logit.
-        top_k, top_p: top-k and top-p sampling parameters.
-            Note that top-k = 1 is gready. Also, these paramters are
-            exclusive meaning that:
-                if top-k > 0 then we expect top-p=0.
-                if top-p > 0 then we check for top-k=0.
-        temperature: sampling temperature.
-        use_eod_token_for_early_termination: if True, do early termination if
-            all the sequences have reached this token.
-    Note: Outside of model, other parameters only need to be available on
-          rank 0.
-
-    Returns: Note that is size is adjusted to a lower value than
-             max-sequence-length if generation is terminated early.
-        tokens: prompt and generated tokens. size: [b, :]
-        generated_sequence_lengths: total length (including prompt) of
-            the generated sequence. size: [b]
-        output_log_probs: log probability of the selected tokens. size: [b, s]
-    """
-
-    args = get_args()
-    retro_args = get_retro_args()
-
-    tokenizer = get_tokenizer()
-
-    batch_size = tokens.size(0)
-    min_prompt_length = lengths.min().item()
-    max_sequence_length = tokens.size(1)
-    print("max_sequence_length", max_sequence_length)
-    print("min_prompt_length", min_prompt_length)
-    max_sequence_length = min(max_sequence_length, args.max_position_embeddings)
-
-    # If the context is too big, this happens
-    if min_prompt_length >= max_sequence_length:
-        raise ValueError("context length + tokens_to_generate too large")
-
-    # forward step.
-    unwrapped_model = unwrap_model(
-        model)
-    unwrapped_model.language_model.seq_length = max_sequence_length
-
-    # Added termination_id to support the case that we want to terminate the
-    # generation once that id is generated.
-    if hasattr(args, 'eos_id'):
-        termination_id = args.eos_id
-    else:
-        termination_id = tokenizer.eod
-
-    # ===================
-    # Pre-allocate memory
-    # ===================
-
-    # Log probability of the sequence (prompt + generated tokens).
-    output_log_probs = None
-    output_log_probs_size = (batch_size, max_sequence_length - 1)
-    # Lengths of generated seuquence including including prompts.
-    generated_sequence_lengths = None
-    if mpu.is_pipeline_last_stage():
-        if return_output_log_probs:
-            output_log_probs = torch.empty(output_log_probs_size,
-                                           dtype=torch.float32,
-                                           device=torch.cuda.current_device())
-        generated_sequence_lengths = torch.ones(
-            batch_size, dtype=torch.int64,
-            device=torch.cuda.current_device()) * max_sequence_length
-
-    # Whether we have reached a termination id.
-    is_generation_done = torch.zeros(batch_size, dtype=torch.uint8,
-                                     device=torch.cuda.current_device())
-
-    # =============
-    # Run infernece
-    # =============
-
-    with torch.no_grad():
-        attention_mask, position_ids = _build_attention_mask_and_position_ids(
-            tokens)
-        for context_length in range(min_prompt_length, max_sequence_length):
-            prev_context_length = 0
-            sizes_list = None
-            neighbor_tokens_cuda_long_tensor = None
-
-            # get the chunks for retrieval
-            if torch.distributed.get_rank() == 0:
-                neighbor_tokens = neighbours_array
-                neighbor_tokens_cuda_long_tensor = torch.cuda.LongTensor(
-                    neighbor_tokens.reshape((-1, retro_args.retro_gpt_retrieved_length)))
-                sizes_list = [neighbor_tokens_cuda_long_tensor.size(0),  # Batch size
-                              neighbor_tokens_cuda_long_tensor.size(1)]  # Sequence lenght
-            sizes_tensor = broadcast_int_list(2, int_list=sizes_list)
-            sizes = sizes_tensor.tolist()
-            neighbor_tokens_cuda_long_tensor = broadcast_tensor(
-                sizes, torch.int64, tensor=neighbor_tokens_cuda_long_tensor)
-
-            _, _, neighbor_position_ids = get_ltor_masks_and_position_ids(
-                neighbor_tokens_cuda_long_tensor,
-                tokenizer.eod,
-                args.reset_position_ids,
-                args.reset_attention_mask,
-                args.eod_mask_loss)
-            neighbor_attention_mask = None
-
-            # Pick the slice that we need to pass through the network.
-            tokens2use = tokens[:, prev_context_length:4096]
-            positions2use = position_ids[:, prev_context_length:4096]
-            attention_mask2use = attention_mask[
-                                 ..., prev_context_length:4096, :4096]
-
-            logits = model(tokens2use, positions2use, attention_mask2use,
-                           retriever_input_ids=neighbor_tokens_cuda_long_tensor,
-                           retriever_position_ids=neighbor_position_ids, retriever_attn_mask=neighbor_attention_mask,
-                           )
-
-            if mpu.is_pipeline_last_stage():
-                # Always the last stage should have an output.
-                assert logits is not None
-
-                # Sample.
-                last_token_logits = logits[:, context_length - 1, :]
-                # last_token_logits = logits[:, -1, :]
-
-                # word banning
-                if logits_mask is not None:
-                    last_token_logits[:, logits_mask] = float('-Inf')
-
-                new_sample = sample(last_token_logits,
-                                    top_k=top_k,
-                                    top_p=top_p,
-                                    temperature=temperature,
-                                    vocab_size=tokenizer.vocab_size)
-
-                # If a prompt length is smaller or equal th current context
-                # length, it means we have started generating tokens
-                started = lengths <= context_length
-                # Update the tokens.
-                tokens[started, context_length] = new_sample[started]
-
-                # Calculate the log probabilities.
-                if return_output_log_probs:
-                    log_probs = F.log_softmax(logits, dim=2)
-                    if return_output_log_probs:
-                        # Pick the tokens that we need to get the log
-                        # probabilities for. Note that next input token is
-                        # the token which we selected in the current logits,
-                        # so shift by 1.
-                        indices = torch.unsqueeze(
-                            tokens[
-                            :,
-                            (prev_context_length + 1):(context_length + 1)],
-                            2)
-                        output_log_probs[:,
-                        prev_context_length:context_length] = \
-                            torch.gather(log_probs, 2, indices).squeeze(2)
-
-            # Update the tokens on the first stage so the next input to
-            # the network is correct.
-            copy_from_last_to_first_pipeline_stage(batch_size, torch.int64,
-                                                   tokens[:, context_length])
-
-            # Update the context length for the next token generation.
-            prev_context_length = context_length
-
-            # Check if all the sequences have hit the termination_id.
-            done = None
-            if mpu.is_pipeline_last_stage():
-                # TODO(rprenger) These stopping methods are tokenizer dependent
-                # instead tokenization should be in the inference loop so stop sequences can be used
-                if stop_on_double_eol:
-                    hit_double_eol = (new_sample == 628).byte() & started.byte()
-                    hit_two_eols = (new_sample == 198).byte() & (
-                            tokens[:, context_length - 1] == 198).byte() & started.byte()
-                    done_token = hit_double_eol | hit_two_eols
-                elif stop_on_eol:
-                    hit_double_eol = (new_sample == 628).byte() & started.byte()
-                    hit_eol = (new_sample == 198).byte() & started.byte()
-                    done_token = hit_double_eol | hit_eol
-                elif context_length > min_prompt_length + 64:  # previous retrov1 limitations
-                    done_token = 1
-                else:
-                    done_token = (new_sample == termination_id).byte() & \
-                                 started.byte()
-
-                just_finished = (done_token & ~is_generation_done).bool()
-                generated_sequence_lengths[just_finished.view(-1)] = \
-                    context_length + 1
-                is_generation_done = is_generation_done | done_token
-                done = torch.all(is_generation_done)
-            done = broadcast_from_last_pipeline_stage(1, torch.uint8,
-                                                      tensor=done)
-            if use_eod_token_for_early_termination and done:
-                break
-
-    # ===================================================
-    # Update the length of based on max generated length.
-    # ===================================================
-
-    tokens = tokens[:, :(context_length + 1)]
-    if mpu.is_pipeline_last_stage():
-        if return_output_log_probs:
-            output_log_probs = output_log_probs[:, :context_length]
-
-    # ======================================
-    # Broadcast to the first pipeline stage.
-    # ======================================
-
-    generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage(
-        batch_size, torch.int64, generated_sequence_lengths)
-    if return_output_log_probs:
-        output_log_probs_size = (batch_size, context_length)
-        output_log_probs = broadcast_from_last_to_first_pipeline_stage(
-            output_log_probs_size, torch.float32, output_log_probs)
-
-    return tokens, generated_sequence_lengths, output_log_probs
diff --git a/tools/retro/text_generation/retro_text_generation.py b/tools/retro/text_generation/retro_text_generation.py
deleted file mode 100755
index 27050090446..00000000000
--- a/tools/retro/text_generation/retro_text_generation.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Sample Generate GPT"""
-import torch
-import os
-import sys
-from typing import Union
-
-sys.path.append(os.path.abspath(os.path.join(
-    os.path.join(os.path.dirname(__file__), "../../../"))))
-from megatron.training import get_args, get_retro_args
-from megatron.training import print_rank_0
-from megatron.training import get_tokenizer
-from megatron.training.checkpointing import load_checkpoint
-from megatron.training.initialize import initialize_megatron
-from megatron.core.models.gpt import GPTModel
-from megatron.training import get_model
-from tools.retro.text_generation.retro_api import retro_generate_and_post_process
-from tools.retro.sft.sft_retro import get_tasks_args
-from tools.retro.sft.dataset_conv import reformat_prompt, preprocess, reformat_prompt_short
-import numpy as np
-import time
-import megatron.legacy.model
-from megatron.training.arguments import core_transformer_config_from_args
-
-
-
-def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
-    """Builds the model.
-
-    Args:
-        pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True.
-        post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True.
-
-
-    Returns:
-        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
-    """
-    print_rank_0('building GPT model ...')
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-
-    assert args.use_legacy_models, 'retro text generation only implemented for legacy models'
-    
-    # not support core model yet
-    model = megatron.legacy.model.GPTModel(
-        config,
-        num_tokentypes=0,
-        parallel_output=False,
-        pre_process=pre_process,
-        post_process=post_process
-    )
-
-    return model
-
-
-def pad_neighbours_for_query_only(args, nb_tokens, pad_id, ft_neighbours):
-    # take top k neighbours and padding
-    neighbours_tokens = []
-    retro_args = get_retro_args()
-    r = retro_args.retro_gpt_retrieved_length
-
-    if args.reuse_top:
-        valid_nb_tokens = nb_tokens[:args.retro_num_neighbors]
-    else:
-        valid_nb_tokens = nb_tokens[ft_neighbours:args.retro_num_neighbors + ft_neighbours]
-
-    for nb_token in valid_nb_tokens:
-        if len(nb_token) >= r:
-            nb_token = nb_token[:r]
-        else:
-            nb_token = nb_token + [pad_id] * (r - len(nb_token))
-        neighbours_tokens.append(nb_token)
-    print("len(nb_tokens)", len(nb_tokens))
-    print("len(neighbours_tokens)", len(neighbours_tokens))
-    print("args.retro_num_neighbors", args.retro_num_neighbors)
-
-    if len(neighbours_tokens) < args.retro_num_neighbors:
-        assert ValueError("neighbours are not enough, add empty ones and create mask for those empty ones")
-    neighbours_tokens = np.array(neighbours_tokens)
-    return neighbours_tokens
-
-
-def add_text_generate_args(parser):
-    """Text generation arguments."""
-
-    parser = get_tasks_args(parser)
-    group = parser.add_argument_group(title='text generation')
-
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--greedy", action='store_true', default=False,
-                       help='Use greedy sampling.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=256,
-                       help='Size of the output generated text.')
-    group.add_argument("--sample-input-file", type=str, default=None,
-                       help='Get input from file instead of interactive mode, '
-                            'each line is an input.')
-    group.add_argument("--sample-output-file", type=str, default=None,
-                       help='Output file got from --sample-input-file')
-    group.add_argument("--num-samples", type=int, default=0,
-                       help='Number of samples to generate unconditionally, '
-                            'defaults to 0 and interactive conditional sampling')
-    group.add_argument("--genfile", type=str,
-                       help='Output file when generating unconditionally')
-    group.add_argument("--recompute", action='store_true',
-                       help='During generation recompute all attention '
-                            'instead of using previously computed keys/values.')
-    group.add_argument("--epsilon", type=float, default=0.01,
-                       help="Minimum factor by which each probability is multiplied")
-    group.add_argument("--debug-gen", action='store_true',
-                       help="If set, additional debugging output is printed to stdout")
-    group.add_argument('--length-penalty', type=float, default=1.0,
-                       help='length penalty')
-    group.add_argument('--gen-start-idx', type=int, default=0,
-                       help='project size for adapters')
-    group.add_argument('--num-gen', type=int, default=-1,
-                       help='project size for adapters')
-    group.add_argument('--ckpt-step', type=int, default=None,
-                       help='setting ckpt step manually')
-    group.add_argument("--short-format", action='store_true',
-                       help='Use short format QA')
-    group.add_argument("--use-retrieved-neighbours", action='store_true', default=False,
-                       help='Use retrieved neighbours')
-    group.add_argument('--template-id', type=int, default=0,
-                       help='template id for generation,')
-    return parser
-
-
-def generate_samples_conditional(model):
-    args = get_args()
-    start = time.time()
-    avg_time = []
-    tokenizer = get_tokenizer()
-    model.eval()
-    if torch.distributed.get_rank() == 0:
-
-        data = preprocess(args.sample_input_file, inference_only=True,
-                          retrieved_neighbours=args.use_retrieved_neighbours)
-        print("total rows {}".format(len(data)))
-        all_data = data[args.gen_start_idx:]  # start from gen_start_idx
-        if args.num_gen > 0:
-            all_data = all_data[:args.num_gen]
-        input_count = len(all_data)
-        input_pos = 0
-
-    terminate_runs = 0
-    while True:
-        torch.distributed.barrier()
-        if torch.distributed.get_rank() == 0:
-            sentences = []
-            n_arrays = []
-            print("global batch size", args.global_batch_size)
-            for _ in range(args.global_batch_size):
-                print(input_pos)
-                if input_pos >= input_count:
-                    print("reach the last row")
-                    break
-                else:
-                    sample = all_data[input_pos]
-                input_pos += 1
-
-                if True:
-                    max_target_len = args.out_seq_length
-                    query, _, neighbours = sample
-
-                    neighbours_array = pad_neighbours_for_query_only(args,
-                                                                     [tokenizer.tokenize(neighbour) for neighbour in
-                                                                      neighbours], tokenizer.eod, args.ft_neighbours)
-                    print("neighbours_array.shape", neighbours_array.shape)
-                    tokenizer = get_tokenizer()
-
-                    if args.short_format:
-                        input_tokens = reformat_prompt_short(query, neighbours, args.task, args.ft_neighbours,
-                                                             max_target_len,
-                                                             tokenizer, args.seq_length)
-                    else:
-                        input_tokens = reformat_prompt(query, neighbours, args.task, args.ft_neighbours, max_target_len,
-                                                       tokenizer, args.seq_length, template_id=args.template_id)
-                    raw_text = tokenizer.detokenize(input_tokens)
-                    print(raw_text)
-                else:
-                    raise ValueError("invalid arg for task")
-                sentences.append(raw_text)
-            retro_args = get_retro_args()
-
-            resp_sentences, resp_sentences_seg, scores, \
-                tokens = retro_generate_and_post_process(model, prompts=sentences,
-                                                         neighbours_array=neighbours_array,
-                                                         tokens_to_generate=args.seq_length - retro_args.retro_gpt_chunk_length,
-                                                         return_output_log_probs=False,
-                                                         top_k_sampling=args.top_k,
-                                                         top_p_sampling=args.top_p,
-                                                         add_BOS=False,
-                                                         temperature=1.0)
-            print("len of resp_sentences", len(resp_sentences))
-            for prompt, generation in zip(sentences, resp_sentences):
-                datum = generation[len(prompt):]
-                print("prompt:", generation[:len(prompt)])
-                if "<|endoftext|>" in datum:
-                    datum = datum[:datum.find("<|endoftext|>")].strip()
-                datum = datum.replace("\n", " ")
-                print("cont:", datum)
-                yield datum
-            avg_time.append((time.time() - start) / args.global_batch_size)
-            print("avg time for each sample: ", sum(avg_time) / len(avg_time))
-            start = time.time()
-            if input_pos >= input_count:
-                print("finish all lines")
-                terminate_runs = 1
-        else:
-            retro_generate_and_post_process(model)
-
-        terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
-        torch.distributed.broadcast(terminate_runs_tensor, 0)
-        terminate_runs = terminate_runs_tensor[0].item()
-
-        if terminate_runs == 1:
-            return
-
-
-def generate_and_write_samples_conditional(model):
-    args = get_args()
-    if args.sample_output_file is None:
-        sample_output_file = args.sample_input_file + ".out"
-        print('`sample-output-file` not specified, setting '
-              'it to {}'.format(sample_output_file))
-    else:
-        sample_output_file = args.sample_output_file
-    with open(sample_output_file, 'w') as f:
-        for datum in generate_samples_conditional(model):
-            if torch.distributed.get_rank() == 0:
-                f.write(datum + '\n')
-
-
-def main():
-    """Main program."""
-
-    initialize_megatron(extra_args_provider=add_text_generate_args,
-                        args_defaults={'no_load_rng': True,
-                                       'no_load_optim': True})
-
-    # Set up model and load checkpoint
-    model = get_model(model_provider, wrap_with_ddp=False)
-    print(model)
-    args = get_args()
-
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-    model = model[0]
-
-    # Generate samples.
-    if args.sample_input_file is not None:
-        print(f"{args.sample_input_file}")
-        generate_and_write_samples_conditional(model)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py
new file mode 100644
index 00000000000..615073b8fd0
--- /dev/null
+++ b/tools/run_dynamic_text_generation_server.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import argparse
+import asyncio
+
+import torch
+
+from examples.inference.gpt.gpt_dynamic_inference import (
+    add_dynamic_inference_args,
+    get_inference_context,
+    get_inference_controller,
+    get_model,
+)
+from megatron.core.inference.engines import DynamicInferenceEngine
+from megatron.core.inference.text_generation_server.dynamic_text_gen_server import run_flask_server
+from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer
+from megatron.core.utils import get_mamba_inference_state_config_from_model, trace_async_exceptions
+from megatron.post_training.arguments import add_modelopt_args
+from megatron.training import get_args, get_tokenizer
+from megatron.training.initialize import initialize_megatron
+
+
+def add_text_generation_server_args(parser: argparse.ArgumentParser):
+    """Adds the required command line arguments for running the text generation server."""
+    parser = add_modelopt_args(parser)
+    parser = add_dynamic_inference_args(parser)
+    parser.add_argument("--port", type=int, default=5000, help="Port for Flask server to run on")
+    return parser
+
+
+@trace_async_exceptions
+async def run_text_generation_server(
+    engine: DynamicInferenceEngine, coordinator_port: int, flask_port: int
+):
+    """Runs the Flask server from rank 0 and initializes the DynamicInferenceEngine on all ranks.
+
+    Args:
+        engine (DynamicInferenceEngine): The dynamic inference engine.
+        coordinator_port (int): The network port for the dynamic inference DP coordinator.
+        flask_port (int): The network for port the frontend Flask server.
+    """
+
+    rank = torch.distributed.get_rank()
+
+    await engine.start_listening_to_data_parallel_coordinator(
+        inference_coordinator_port=coordinator_port, launch_inference_coordinator=True
+    )
+
+    server_task = None
+    if rank == 0:
+        server_task = asyncio.create_task(
+            run_flask_server(
+                coordinator_port=coordinator_port,
+                tokenizer=engine.controller.tokenizer,
+                rank=rank,
+                flask_port=flask_port,
+            )
+        )
+    engine_task = engine.engine_loop_task
+
+    tasks_to_run = [engine_task]
+    if server_task:
+        assert rank == 0
+
+        tasks_to_run.append(server_task)
+
+    await asyncio.gather(*tasks_to_run)
+
+
+if __name__ == "__main__":
+    with torch.inference_mode():
+        initialize_megatron(
+            extra_args_provider=add_text_generation_server_args,
+            args_defaults={'no_load_rng': True, 'no_load_optim': True},
+        )
+
+        args = get_args()
+        model = get_model()
+
+        if args.legacy_tokenizer:
+            tokenizer = get_tokenizer()
+        else:
+            tokenizer = build_tokenizer(args)
+
+        mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
+
+        # Enable return_log_probs to allow prompt logprobs computation for echo=True requests
+        # This sets materialize_only_last_token_logits=False in the inference context,
+        # which is required for lm-eval compatibility (loglikelihood evaluation tasks)
+        args.return_log_probs = True
+
+        context = get_inference_context(
+            None,
+            None,
+            calculate_max_sequence_length_from_requests=False,
+            mamba_inference_state_config=mamba_inference_state_config,
+        )
+
+        controller = get_inference_controller(model, context)
+
+        engine = DynamicInferenceEngine(
+            controller,
+            context,
+            enable_cuda_graph=args.cuda_graph_impl == "local",
+            random_seed=args.seed,
+            enable_chunked_prefill=not args.disable_chunked_prefill,
+        )
+
+        asyncio.run(run_text_generation_server(engine, args.inference_coordinator_port, args.port))
diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py
index dda2b8284b3..32d61444530 100644
--- a/tools/run_inference_performance_test.py
+++ b/tools/run_inference_performance_test.py
@@ -120,6 +120,7 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs
             max_tokens_override=args.inference_dynamic_batching_max_tokens_override,
             block_size_tokens=args.inference_dynamic_batching_block_size,
             tensor_model_parallel_size=args.tensor_model_parallel_size,
+            pipeline_model_parallel_size=args.pipeline_model_parallel_size,
             materialize_only_last_token_logits=not args.return_log_probs,
             mamba_inference_state_config=mamba_inference_state_config,
             cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents,
diff --git a/train_rl.py b/train_rl.py
index 7203ca26df5..cfc010b3c04 100644
--- a/train_rl.py
+++ b/train_rl.py
@@ -12,20 +12,27 @@
 from megatron.core import mpu
 from megatron.core.enums import ModelType
 from megatron.core.models.gpt import GPTModel
+from megatron.core.parallel_state import is_pipeline_last_stage
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.utils import StragglerDetector
 from megatron.rl.rl_utils import (
     calculate_grpo_loss,
-    create_packed_seq_params_for_bin,
     get_logprobs,
     get_rl_runtime_state,
+    load_packed_data_by_index,
 )
 from megatron.training import get_args, get_timers, pretrain, print_rank_0
 from megatron.training.arguments import core_transformer_config_from_args
 from model_provider import model_provider
 
+from megatron.rl.sequence_packing_utils import get_default_packed_seq_params
+
 stimer = StragglerDetector()
 
+import logging
+
+logging.basicConfig(level=logging.INFO, force=True)
+
 
 def _gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None):
     # TODO(Peter): This is a hack to get around the fact that we are activation recomputation for training but not
@@ -189,7 +196,7 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
         data_iterator : Input data iterator
         model (GPTModel): The GPT Model
     """
-
+    runtime_state = get_rl_runtime_state()
     args = get_args()
     timers = get_timers()
 
@@ -199,70 +206,24 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
         batch_data = next(data_iterator)
     timers('batch-generator').stop()
 
-    seq_starts = None
-    seq_lengths = None
-    attention_mask = None
-    packed_seq_params = None
-    packed_seq_len = 0
-
     if args.rl_use_sequence_packing:
         # Get bin index from data iterator
         bin_tensor = batch_data[0]
-        bin_idx = bin_tensor.item()
-
-        # Get packing context (should always be available in packed mode)
-        runtime_state = get_rl_runtime_state()
-        packing_context = runtime_state.packing_context
-
-        idx = slice(bin_idx, bin_idx + 1)
-        # Extract packed data for this bin (already on GPU)
-        tokens = packing_context['packed_trajs'][idx]
-        position_ids = packing_context['packed_position_ids'][idx]
-        attention_mask = (
-            packing_context['packed_attention_mask'][idx]
-            if packing_context['packed_attention_mask'] is not None
-            else None
-        )
-        old_logprobs = packing_context['old_logprobs'][idx]
-        ref_logprobs = packing_context['ref_logprobs'][idx]
-        loss_mask = packing_context['packed_loss_mask'][idx, 1:]
-
-        # Get sequence-level data for this bin
-        packing_info = packing_context['packing_info']
-        seq_starts = packing_info['seq_starts'][bin_idx]
-        seq_indices = packing_info['bin_seq_indices'][bin_idx]
-
-        # Handle empty bins (used for padding to ensure all ranks have same iterations)
-        if not seq_indices:
-            seq_lengths = []
-            advantages = torch.tensor([], device='cuda')
-        else:
-            seq_lengths = [packing_info['seq_lengths'][idx] for idx in seq_indices]
-            advantages = packing_context['bin_advantages'][bin_idx]
-
-        # Extract packed inference_logprobs if available
-        if (
-            'packed_inference_logprobs' in packing_context
-            and args.rl_inference_logprobs_is_correction
-        ):
-            inference_logprobs = packing_context['packed_inference_logprobs'][idx]
-        else:
-            inference_logprobs = None
 
-        # Get cached PackedSeqParams for proper attention masking in Transformer Engine
-        # These were pre-computed in prepare_data_for_update to avoid repeated tensor allocations
-        if 'cached_packed_seq_params' in packing_context:
-            packed_seq_params, packed_seq_len = packing_context['cached_packed_seq_params'][bin_idx]
-        else:
-            packed_seq_params = create_packed_seq_params_for_bin(
-                packing_info=packing_info,
-                bin_idx=bin_idx,
-                bin_size=args.rl_sequence_packing_bin_size,
-                device=tokens.device,
-            )
-            packed_seq_len = packed_seq_params.cu_seqlens_q[-1].item() if packed_seq_params is not None else 0
+        (
+            tokens,
+            advantages,
+            old_logprobs,
+            loss_mask,
+            position_ids,
+            ref_logprobs,
+            inference_logprobs,
+            seq_starts,
+            seq_lengths,
+            seq_indices,
+            packed_seq_params,
+        ) = load_packed_data_by_index(bin_tensor.item(), runtime_state.packing_context, args.rl_inference_logprobs_is_correction)
 
-        runtime_state = get_rl_runtime_state()
         runtime_state.increment_sequences(len(seq_indices))
     else:
         # Extract unpacked data
@@ -276,6 +237,10 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
             inference_logprobs,
         ) = batch_data
 
+        seq_starts = None
+        seq_lengths = None
+        packed_seq_params = None
+
         # Move to CUDA
         tokens = tokens.cuda()
         position_ids = position_ids.cuda()
@@ -287,12 +252,18 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
             inference_logprobs.cuda() if args.rl_inference_logprobs_is_correction else None
         )
 
-        runtime_state = get_rl_runtime_state()
         runtime_state.increment_sequences(tokens.shape[0])
 
     # Common logic for both paths
     model_to_use = model[0] if isinstance(model, list) else model
 
+    if packed_seq_params is None:
+        packed_seq_params = get_default_packed_seq_params(
+            seq_length=tokens.shape[1],
+            max_sequences_per_bin=args.rl_sequence_packing_max_sequences_per_bin,
+            device=tokens.device,
+        )
+
     # Clear RoPE cache to avoid inference tensor errors
     try:
         for module in model_to_use.modules():
@@ -305,32 +276,42 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
 
     # Get current logprobs and calculate loss with straggler detection
     with stimer:
-        current_logprobs = get_logprobs(
-            model_to_use, tokens, position_ids, attention_mask, no_grad=False,
-            packed_seq_params=packed_seq_params,
-            packed_seq_len=packed_seq_len
+        logprobs_or_hidden_states = get_logprobs(
+            model_to_use, tokens, position_ids, no_grad=False, packed_seq_params=packed_seq_params
         )
 
-        # Calculate loss using unified function
-        loss, kl_term, ratios, entropy_term, truncated_from_above, truncated_from_below = (
-            calculate_grpo_loss(
-                current_logprobs=current_logprobs,
-                old_logprobs=old_logprobs,
-                ref_logprobs=ref_logprobs,
-                advantages=advantages,
-                clamp_eps_lower=args.grpo_clamp_eps_lower,
-                clamp_eps_upper=args.grpo_clamp_eps_upper,
-                kl_beta=args.grpo_kl_beta,
-                entropy_weight=args.grpo_entropy_term_weight,
-                inference_logprobs=inference_logprobs,
-                is_truncation_coef=args.rl_importance_sampling_truncation_coef,
-                seq_starts=seq_starts,
-                seq_lengths=seq_lengths,
+        if not is_pipeline_last_stage():
+            output_tensor = logprobs_or_hidden_states
+            kl_term, ratios, entropy_term, truncated_from_above, truncated_from_below = (
+                None,
+                None,
+                None,
+                None,
+                None,
             )
-        )
+        else:
+            # Calculate loss using unified function
+            current_logprobs = logprobs_or_hidden_states
+            loss, kl_term, ratios, entropy_term, truncated_from_above, truncated_from_below = (
+                calculate_grpo_loss(
+                    current_logprobs=current_logprobs,
+                    old_logprobs=old_logprobs,
+                    ref_logprobs=ref_logprobs,
+                    advantages=advantages,
+                    clamp_eps_lower=args.grpo_clamp_eps_lower,
+                    clamp_eps_upper=args.grpo_clamp_eps_upper,
+                    kl_beta=args.grpo_kl_beta,
+                    entropy_weight=args.grpo_entropy_term_weight,
+                    inference_logprobs=inference_logprobs,
+                    is_truncation_coef=args.rl_importance_sampling_truncation_coef,
+                    seq_starts=seq_starts,
+                    seq_lengths=seq_lengths,
+                )
+            )
+            output_tensor = loss
 
     # loss_mask will not be applied to 0th token as we do not have a logprob for it.
-    return loss, partial(
+    return output_tensor, partial(
         loss_func,
         loss_mask,
         kl_term,
@@ -392,7 +373,9 @@ def __getitem__(self, idx):
     # Temporary for transition to core datasets
     train_valid_test_datasets_provider.is_distributed = True
 
-    def _model_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None):
+    def _model_builder(
+        args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None
+    ):
         if getattr(args, "is_hybrid_model", False):
             return mamba_builder(
                 args,
@@ -417,5 +400,5 @@ def _model_builder(args, pre_process, post_process, vp_stage=None, config=None,
         partial(model_provider, _model_builder),
         ModelType.encoder_or_decoder,
         forward_step,
-        args_defaults={}, 
+        args_defaults={},
     )
diff --git a/uv.lock b/uv.lock
index 8cb9fb52538..68a06f8982f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,15 +1,23 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version < '3.11' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
@@ -27,11 +35,11 @@ overrides = [
 
 [[package]]
 name = "absl-py"
-version = "2.3.1"
+version = "2.4.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/10/2a/c93173ffa1b39c1d0395b7e842bbdc62e556ca9d8d3b5572926f3e4ca752/absl_py-2.3.1.tar.gz", hash = "sha256:a97820526f7fbfd2ec1bce83f3f25e3a14840dac0d8e02a0b71cd75db3f77fc9", size = 116588, upload-time = "2025-07-03T09:31:44.05Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/64/c7/8de93764ad66968d19329a7e0c147a2bb3c7054c554d4a119111b8f9440f/absl_py-2.4.0.tar.gz", hash = "sha256:8c6af82722b35cf71e0f4d1d47dcaebfff286e27110a99fc359349b247dfb5d4", size = 116543, upload-time = "2026-01-28T10:17:05.322Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d", size = 135811, upload-time = "2025-07-03T09:31:42.253Z" },
+    { url = "https://files.pythonhosted.org/packages/18/a6/907a406bb7d359e6a63f99c313846d9eec4f7e6f7437809e03aa00fa3074/absl_py-2.4.0-py3-none-any.whl", hash = "sha256:88476fd881ca8aab94ffa78b7b6c632a782ab3ba1cd19c9bd423abc4fb4cd28d", size = 135750, upload-time = "2026-01-28T10:17:04.19Z" },
 ]
 
 [[package]]
@@ -50,20 +58,61 @@ wheels = [
 name = "aiobotocore"
 version = "2.26.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "aiohttp" },
-    { name = "aioitertools" },
-    { name = "botocore" },
-    { name = "jmespath" },
-    { name = "multidict" },
-    { name = "python-dateutil" },
-    { name = "wrapt" },
+    { name = "aiohttp", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "aioitertools", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "botocore", version = "1.41.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "jmespath", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "multidict", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "python-dateutil", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/4d/f8/99fa90d9c25b78292899fd4946fce97b6353838b5ecc139ad8ba1436e70c/aiobotocore-2.26.0.tar.gz", hash = "sha256:50567feaf8dfe2b653570b4491f5bc8c6e7fb9622479d66442462c021db4fadc", size = 122026, upload-time = "2025-11-28T07:54:59.956Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/58/3bf0b7d474607dc7fd67dd1365c4e0f392c8177eaf4054e5ddee3ebd53b5/aiobotocore-2.26.0-py3-none-any.whl", hash = "sha256:a793db51c07930513b74ea7a95bd79aaa42f545bdb0f011779646eafa216abec", size = 87333, upload-time = "2025-11-28T07:54:58.457Z" },
 ]
 
+[[package]]
+name = "aiobotocore"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "aiohttp", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "aioitertools", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "botocore", version = "1.42.49", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jmespath", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "multidict", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "python-dateutil", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "wrapt", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/18/94/332629387f4a9fc691cac9c0cb078af877bfaba415b1a16411377f6ea310/aiobotocore-3.1.3.tar.gz", hash = "sha256:b1b6a95aa4c17410090f4adf16fd45e45a898140c83d4e9d554602f9310408c0", size = 122675, upload-time = "2026-02-14T12:11:01.745Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/29/a3e75834009121ebb695dc24f9fe804566b1bcc9b7d46f6fbe56fe972c6a/aiobotocore-3.1.3-py3-none-any.whl", hash = "sha256:3afc93bf14de304dbd4a2c90f36fb3ce6348b06a5a1ec7f87261be628d7876d9", size = 87717, upload-time = "2026-02-14T12:10:59.898Z" },
+]
+
 [[package]]
 name = "aiohappyeyeballs"
 version = "2.6.1"
@@ -75,7 +124,7 @@ wheels = [
 
 [[package]]
 name = "aiohttp"
-version = "3.13.2"
+version = "3.13.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohappyeyeballs" },
@@ -87,110 +136,122 @@ dependencies = [
     { name = "propcache" },
     { name = "yarl" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/1c/ce/3b83ebba6b3207a7135e5fcaba49706f8a4b6008153b4e30540c982fae26/aiohttp-3.13.2.tar.gz", hash = "sha256:40176a52c186aefef6eb3cad2cdd30cd06e3afbe88fe8ab2af9c0b90f228daca", size = 7837994, upload-time = "2025-10-28T20:59:39.937Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6d/34/939730e66b716b76046dedfe0842995842fa906ccc4964bba414ff69e429/aiohttp-3.13.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2372b15a5f62ed37789a6b383ff7344fc5b9f243999b0cd9b629d8bc5f5b4155", size = 736471, upload-time = "2025-10-28T20:55:27.924Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/cf/dcbdf2df7f6ca72b0bb4c0b4509701f2d8942cf54e29ca197389c214c07f/aiohttp-3.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7f8659a48995edee7229522984bd1009c1213929c769c2daa80b40fe49a180c", size = 493985, upload-time = "2025-10-28T20:55:29.456Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/87/71c8867e0a1d0882dcbc94af767784c3cb381c1c4db0943ab4aae4fed65e/aiohttp-3.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:939ced4a7add92296b0ad38892ce62b98c619288a081170695c6babe4f50e636", size = 489274, upload-time = "2025-10-28T20:55:31.134Z" },
-    { url = "https://files.pythonhosted.org/packages/38/0f/46c24e8dae237295eaadd113edd56dee96ef6462adf19b88592d44891dc5/aiohttp-3.13.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6315fb6977f1d0dd41a107c527fee2ed5ab0550b7d885bc15fee20ccb17891da", size = 1668171, upload-time = "2025-10-28T20:55:36.065Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/c6/4cdfb4440d0e28483681a48f69841fa5e39366347d66ef808cbdadddb20e/aiohttp-3.13.2-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6e7352512f763f760baaed2637055c49134fd1d35b37c2dedfac35bfe5cf8725", size = 1636036, upload-time = "2025-10-28T20:55:37.576Z" },
-    { url = "https://files.pythonhosted.org/packages/84/37/8708cf678628216fb678ab327a4e1711c576d6673998f4f43e86e9ae90dd/aiohttp-3.13.2-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e09a0a06348a2dd73e7213353c90d709502d9786219f69b731f6caa0efeb46f5", size = 1727975, upload-time = "2025-10-28T20:55:39.457Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/2e/3ebfe12fdcb9b5f66e8a0a42dffcd7636844c8a018f261efb2419f68220b/aiohttp-3.13.2-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a09a6d073fb5789456545bdee2474d14395792faa0527887f2f4ec1a486a59d3", size = 1815823, upload-time = "2025-10-28T20:55:40.958Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/4f/ca2ef819488cbb41844c6cf92ca6dd15b9441e6207c58e5ae0e0fc8d70ad/aiohttp-3.13.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b59d13c443f8e049d9e94099c7e412e34610f1f49be0f230ec656a10692a5802", size = 1669374, upload-time = "2025-10-28T20:55:42.745Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/fe/1fe2e1179a0d91ce09c99069684aab619bf2ccde9b20bd6ca44f8837203e/aiohttp-3.13.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:20db2d67985d71ca033443a1ba2001c4b5693fe09b0e29f6d9358a99d4d62a8a", size = 1555315, upload-time = "2025-10-28T20:55:44.264Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/2b/f3781899b81c45d7cbc7140cddb8a3481c195e7cbff8e36374759d2ab5a5/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:960c2fc686ba27b535f9fd2b52d87ecd7e4fd1cf877f6a5cba8afb5b4a8bd204", size = 1639140, upload-time = "2025-10-28T20:55:46.626Z" },
-    { url = "https://files.pythonhosted.org/packages/72/27/c37e85cd3ece6f6c772e549bd5a253d0c122557b25855fb274224811e4f2/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6c00dbcf5f0d88796151e264a8eab23de2997c9303dd7c0bf622e23b24d3ce22", size = 1645496, upload-time = "2025-10-28T20:55:48.933Z" },
-    { url = "https://files.pythonhosted.org/packages/66/20/3af1ab663151bd3780b123e907761cdb86ec2c4e44b2d9b195ebc91fbe37/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fed38a5edb7945f4d1bcabe2fcd05db4f6ec7e0e82560088b754f7e08d93772d", size = 1697625, upload-time = "2025-10-28T20:55:50.377Z" },
-    { url = "https://files.pythonhosted.org/packages/95/eb/ae5cab15efa365e13d56b31b0d085a62600298bf398a7986f8388f73b598/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b395bbca716c38bef3c764f187860e88c724b342c26275bc03e906142fc5964f", size = 1542025, upload-time = "2025-10-28T20:55:51.861Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/2d/1683e8d67ec72d911397fe4e575688d2a9b8f6a6e03c8fdc9f3fd3d4c03f/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:204ffff2426c25dfda401ba08da85f9c59525cdc42bda26660463dd1cbcfec6f", size = 1714918, upload-time = "2025-10-28T20:55:53.515Z" },
-    { url = "https://files.pythonhosted.org/packages/99/a2/ffe8e0e1c57c5e542d47ffa1fcf95ef2b3ea573bf7c4d2ee877252431efc/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:05c4dd3c48fb5f15db31f57eb35374cb0c09afdde532e7fb70a75aede0ed30f6", size = 1656113, upload-time = "2025-10-28T20:55:55.438Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/42/d511aff5c3a2b06c09d7d214f508a4ad8ac7799817f7c3d23e7336b5e896/aiohttp-3.13.2-cp310-cp310-win32.whl", hash = "sha256:e574a7d61cf10351d734bcddabbe15ede0eaa8a02070d85446875dc11189a251", size = 432290, upload-time = "2025-10-28T20:55:56.96Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/ea/1c2eb7098b5bad4532994f2b7a8228d27674035c9b3234fe02c37469ef14/aiohttp-3.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:364f55663085d658b8462a1c3f17b2b84a5c2e1ba858e1b79bff7b2e24ad1514", size = 455075, upload-time = "2025-10-28T20:55:58.373Z" },
-    { url = "https://files.pythonhosted.org/packages/35/74/b321e7d7ca762638cdf8cdeceb39755d9c745aff7a64c8789be96ddf6e96/aiohttp-3.13.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4647d02df098f6434bafd7f32ad14942f05a9caa06c7016fdcc816f343997dd0", size = 743409, upload-time = "2025-10-28T20:56:00.354Z" },
-    { url = "https://files.pythonhosted.org/packages/99/3d/91524b905ec473beaf35158d17f82ef5a38033e5809fe8742e3657cdbb97/aiohttp-3.13.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e3403f24bcb9c3b29113611c3c16a2a447c3953ecf86b79775e7be06f7ae7ccb", size = 497006, upload-time = "2025-10-28T20:56:01.85Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/d3/7f68bc02a67716fe80f063e19adbd80a642e30682ce74071269e17d2dba1/aiohttp-3.13.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:43dff14e35aba17e3d6d5ba628858fb8cb51e30f44724a2d2f0c75be492c55e9", size = 493195, upload-time = "2025-10-28T20:56:03.314Z" },
-    { url = "https://files.pythonhosted.org/packages/98/31/913f774a4708775433b7375c4f867d58ba58ead833af96c8af3621a0d243/aiohttp-3.13.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2a9ea08e8c58bb17655630198833109227dea914cd20be660f52215f6de5613", size = 1747759, upload-time = "2025-10-28T20:56:04.904Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/63/04efe156f4326f31c7c4a97144f82132c3bb21859b7bb84748d452ccc17c/aiohttp-3.13.2-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53b07472f235eb80e826ad038c9d106c2f653584753f3ddab907c83f49eedead", size = 1704456, upload-time = "2025-10-28T20:56:06.986Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/02/4e16154d8e0a9cf4ae76f692941fd52543bbb148f02f098ca73cab9b1c1b/aiohttp-3.13.2-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e736c93e9c274fce6419af4aac199984d866e55f8a4cec9114671d0ea9688780", size = 1807572, upload-time = "2025-10-28T20:56:08.558Z" },
-    { url = "https://files.pythonhosted.org/packages/34/58/b0583defb38689e7f06798f0285b1ffb3a6fb371f38363ce5fd772112724/aiohttp-3.13.2-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ff5e771f5dcbc81c64898c597a434f7682f2259e0cd666932a913d53d1341d1a", size = 1895954, upload-time = "2025-10-28T20:56:10.545Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/f3/083907ee3437425b4e376aa58b2c915eb1a33703ec0dc30040f7ae3368c6/aiohttp-3.13.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3b6fb0c207cc661fa0bf8c66d8d9b657331ccc814f4719468af61034b478592", size = 1747092, upload-time = "2025-10-28T20:56:12.118Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/61/98a47319b4e425cc134e05e5f3fc512bf9a04bf65aafd9fdcda5d57ec693/aiohttp-3.13.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:97a0895a8e840ab3520e2288db7cace3a1981300d48babeb50e7425609e2e0ab", size = 1606815, upload-time = "2025-10-28T20:56:14.191Z" },
-    { url = "https://files.pythonhosted.org/packages/97/4b/e78b854d82f66bb974189135d31fce265dee0f5344f64dd0d345158a5973/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9e8f8afb552297aca127c90cb840e9a1d4bfd6a10d7d8f2d9176e1acc69bad30", size = 1723789, upload-time = "2025-10-28T20:56:16.101Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/fc/9d2ccc794fc9b9acd1379d625c3a8c64a45508b5091c546dea273a41929e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed2f9c7216e53c3df02264f25d824b079cc5914f9e2deba94155190ef648ee40", size = 1718104, upload-time = "2025-10-28T20:56:17.655Z" },
-    { url = "https://files.pythonhosted.org/packages/66/65/34564b8765ea5c7d79d23c9113135d1dd3609173da13084830f1507d56cf/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:99c5280a329d5fa18ef30fd10c793a190d996567667908bef8a7f81f8202b948", size = 1785584, upload-time = "2025-10-28T20:56:19.238Z" },
-    { url = "https://files.pythonhosted.org/packages/30/be/f6a7a426e02fc82781afd62016417b3948e2207426d90a0e478790d1c8a4/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ca6ffef405fc9c09a746cb5d019c1672cd7f402542e379afc66b370833170cf", size = 1595126, upload-time = "2025-10-28T20:56:20.836Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/c7/8e22d5d28f94f67d2af496f14a83b3c155d915d1fe53d94b66d425ec5b42/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:47f438b1a28e926c37632bff3c44df7d27c9b57aaf4e34b1def3c07111fdb782", size = 1800665, upload-time = "2025-10-28T20:56:22.922Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/11/91133c8b68b1da9fc16555706aa7276fdf781ae2bb0876c838dd86b8116e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9acda8604a57bb60544e4646a4615c1866ee6c04a8edef9b8ee6fd1d8fa2ddc8", size = 1739532, upload-time = "2025-10-28T20:56:25.924Z" },
-    { url = "https://files.pythonhosted.org/packages/17/6b/3747644d26a998774b21a616016620293ddefa4d63af6286f389aedac844/aiohttp-3.13.2-cp311-cp311-win32.whl", hash = "sha256:868e195e39b24aaa930b063c08bb0c17924899c16c672a28a65afded9c46c6ec", size = 431876, upload-time = "2025-10-28T20:56:27.524Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/63/688462108c1a00eb9f05765331c107f95ae86f6b197b865d29e930b7e462/aiohttp-3.13.2-cp311-cp311-win_amd64.whl", hash = "sha256:7fd19df530c292542636c2a9a85854fab93474396a52f1695e799186bbd7f24c", size = 456205, upload-time = "2025-10-28T20:56:29.062Z" },
-    { url = "https://files.pythonhosted.org/packages/29/9b/01f00e9856d0a73260e86dd8ed0c2234a466c5c1712ce1c281548df39777/aiohttp-3.13.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b1e56bab2e12b2b9ed300218c351ee2a3d8c8fdab5b1ec6193e11a817767e47b", size = 737623, upload-time = "2025-10-28T20:56:30.797Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/1b/4be39c445e2b2bd0aab4ba736deb649fabf14f6757f405f0c9685019b9e9/aiohttp-3.13.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:364e25edaabd3d37b1db1f0cbcee8c73c9a3727bfa262b83e5e4cf3489a2a9dc", size = 492664, upload-time = "2025-10-28T20:56:32.708Z" },
-    { url = "https://files.pythonhosted.org/packages/28/66/d35dcfea8050e131cdd731dff36434390479b4045a8d0b9d7111b0a968f1/aiohttp-3.13.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c5c94825f744694c4b8db20b71dba9a257cd2ba8e010a803042123f3a25d50d7", size = 491808, upload-time = "2025-10-28T20:56:34.57Z" },
-    { url = "https://files.pythonhosted.org/packages/00/29/8e4609b93e10a853b65f8291e64985de66d4f5848c5637cddc70e98f01f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba2715d842ffa787be87cbfce150d5e88c87a98e0b62e0f5aa489169a393dbbb", size = 1738863, upload-time = "2025-10-28T20:56:36.377Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/fa/4ebdf4adcc0def75ced1a0d2d227577cd7b1b85beb7edad85fcc87693c75/aiohttp-3.13.2-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:585542825c4bc662221fb257889e011a5aa00f1ae4d75d1d246a5225289183e3", size = 1700586, upload-time = "2025-10-28T20:56:38.034Z" },
-    { url = "https://files.pythonhosted.org/packages/da/04/73f5f02ff348a3558763ff6abe99c223381b0bace05cd4530a0258e52597/aiohttp-3.13.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:39d02cb6025fe1aabca329c5632f48c9532a3dabccd859e7e2f110668972331f", size = 1768625, upload-time = "2025-10-28T20:56:39.75Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/49/a825b79ffec124317265ca7d2344a86bcffeb960743487cb11988ffb3494/aiohttp-3.13.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e67446b19e014d37342f7195f592a2a948141d15a312fe0e700c2fd2f03124f6", size = 1867281, upload-time = "2025-10-28T20:56:41.471Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/48/adf56e05f81eac31edcfae45c90928f4ad50ef2e3ea72cb8376162a368f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4356474ad6333e41ccefd39eae869ba15a6c5299c9c01dfdcfdd5c107be4363e", size = 1752431, upload-time = "2025-10-28T20:56:43.162Z" },
-    { url = "https://files.pythonhosted.org/packages/30/ab/593855356eead019a74e862f21523db09c27f12fd24af72dbc3555b9bfd9/aiohttp-3.13.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eeacf451c99b4525f700f078becff32c32ec327b10dcf31306a8a52d78166de7", size = 1562846, upload-time = "2025-10-28T20:56:44.85Z" },
-    { url = "https://files.pythonhosted.org/packages/39/0f/9f3d32271aa8dc35036e9668e31870a9d3b9542dd6b3e2c8a30931cb27ae/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8a9b889aeabd7a4e9af0b7f4ab5ad94d42e7ff679aaec6d0db21e3b639ad58d", size = 1699606, upload-time = "2025-10-28T20:56:46.519Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/3c/52d2658c5699b6ef7692a3f7128b2d2d4d9775f2a68093f74bca06cf01e1/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fa89cb11bc71a63b69568d5b8a25c3ca25b6d54c15f907ca1c130d72f320b76b", size = 1720663, upload-time = "2025-10-28T20:56:48.528Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/d4/8f8f3ff1fb7fb9e3f04fcad4e89d8a1cd8fc7d05de67e3de5b15b33008ff/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8aa7c807df234f693fed0ecd507192fc97692e61fee5702cdc11155d2e5cadc8", size = 1737939, upload-time = "2025-10-28T20:56:50.77Z" },
-    { url = "https://files.pythonhosted.org/packages/03/d3/ddd348f8a27a634daae39a1b8e291ff19c77867af438af844bf8b7e3231b/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9eb3e33fdbe43f88c3c75fa608c25e7c47bbd80f48d012763cb67c47f39a7e16", size = 1555132, upload-time = "2025-10-28T20:56:52.568Z" },
-    { url = "https://files.pythonhosted.org/packages/39/b8/46790692dc46218406f94374903ba47552f2f9f90dad554eed61bfb7b64c/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9434bc0d80076138ea986833156c5a48c9c7a8abb0c96039ddbb4afc93184169", size = 1764802, upload-time = "2025-10-28T20:56:54.292Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/e4/19ce547b58ab2a385e5f0b8aa3db38674785085abcf79b6e0edd1632b12f/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ff15c147b2ad66da1f2cbb0622313f2242d8e6e8f9b79b5206c84523a4473248", size = 1719512, upload-time = "2025-10-28T20:56:56.428Z" },
-    { url = "https://files.pythonhosted.org/packages/70/30/6355a737fed29dcb6dfdd48682d5790cb5eab050f7b4e01f49b121d3acad/aiohttp-3.13.2-cp312-cp312-win32.whl", hash = "sha256:27e569eb9d9e95dbd55c0fc3ec3a9335defbf1d8bc1d20171a49f3c4c607b93e", size = 426690, upload-time = "2025-10-28T20:56:58.736Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/0d/b10ac09069973d112de6ef980c1f6bb31cb7dcd0bc363acbdad58f927873/aiohttp-3.13.2-cp312-cp312-win_amd64.whl", hash = "sha256:8709a0f05d59a71f33fd05c17fc11fcb8c30140506e13c2f5e8ee1b8964e1b45", size = 453465, upload-time = "2025-10-28T20:57:00.795Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/78/7e90ca79e5aa39f9694dcfd74f4720782d3c6828113bb1f3197f7e7c4a56/aiohttp-3.13.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7519bdc7dfc1940d201651b52bf5e03f5503bda45ad6eacf64dda98be5b2b6be", size = 732139, upload-time = "2025-10-28T20:57:02.455Z" },
-    { url = "https://files.pythonhosted.org/packages/db/ed/1f59215ab6853fbaa5c8495fa6cbc39edfc93553426152b75d82a5f32b76/aiohttp-3.13.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:088912a78b4d4f547a1f19c099d5a506df17eacec3c6f4375e2831ec1d995742", size = 490082, upload-time = "2025-10-28T20:57:04.784Z" },
-    { url = "https://files.pythonhosted.org/packages/68/7b/fe0fe0f5e05e13629d893c760465173a15ad0039c0a5b0d0040995c8075e/aiohttp-3.13.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5276807b9de9092af38ed23ce120539ab0ac955547b38563a9ba4f5b07b95293", size = 489035, upload-time = "2025-10-28T20:57:06.894Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/04/db5279e38471b7ac801d7d36a57d1230feeee130bbe2a74f72731b23c2b1/aiohttp-3.13.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1237c1375eaef0db4dcd7c2559f42e8af7b87ea7d295b118c60c36a6e61cb811", size = 1720387, upload-time = "2025-10-28T20:57:08.685Z" },
-    { url = "https://files.pythonhosted.org/packages/31/07/8ea4326bd7dae2bd59828f69d7fdc6e04523caa55e4a70f4a8725a7e4ed2/aiohttp-3.13.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:96581619c57419c3d7d78703d5b78c1e5e5fc0172d60f555bdebaced82ded19a", size = 1688314, upload-time = "2025-10-28T20:57:10.693Z" },
-    { url = "https://files.pythonhosted.org/packages/48/ab/3d98007b5b87ffd519d065225438cc3b668b2f245572a8cb53da5dd2b1bc/aiohttp-3.13.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2713a95b47374169409d18103366de1050fe0ea73db358fc7a7acb2880422d4", size = 1756317, upload-time = "2025-10-28T20:57:12.563Z" },
-    { url = "https://files.pythonhosted.org/packages/97/3d/801ca172b3d857fafb7b50c7c03f91b72b867a13abca982ed6b3081774ef/aiohttp-3.13.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:228a1cd556b3caca590e9511a89444925da87d35219a49ab5da0c36d2d943a6a", size = 1858539, upload-time = "2025-10-28T20:57:14.623Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/0d/4764669bdf47bd472899b3d3db91fffbe925c8e3038ec591a2fd2ad6a14d/aiohttp-3.13.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac6cde5fba8d7d8c6ac963dbb0256a9854e9fafff52fbcc58fdf819357892c3e", size = 1739597, upload-time = "2025-10-28T20:57:16.399Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/52/7bd3c6693da58ba16e657eb904a5b6decfc48ecd06e9ac098591653b1566/aiohttp-3.13.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2bef8237544f4e42878c61cef4e2839fee6346dc60f5739f876a9c50be7fcdb", size = 1555006, upload-time = "2025-10-28T20:57:18.288Z" },
-    { url = "https://files.pythonhosted.org/packages/48/30/9586667acec5993b6f41d2ebcf96e97a1255a85f62f3c653110a5de4d346/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:16f15a4eac3bc2d76c45f7ebdd48a65d41b242eb6c31c2245463b40b34584ded", size = 1683220, upload-time = "2025-10-28T20:57:20.241Z" },
-    { url = "https://files.pythonhosted.org/packages/71/01/3afe4c96854cfd7b30d78333852e8e851dceaec1c40fd00fec90c6402dd2/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:bb7fb776645af5cc58ab804c58d7eba545a97e047254a52ce89c157b5af6cd0b", size = 1712570, upload-time = "2025-10-28T20:57:22.253Z" },
-    { url = "https://files.pythonhosted.org/packages/11/2c/22799d8e720f4697a9e66fd9c02479e40a49de3de2f0bbe7f9f78a987808/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e1b4951125ec10c70802f2cb09736c895861cd39fd9dcb35107b4dc8ae6220b8", size = 1733407, upload-time = "2025-10-28T20:57:24.37Z" },
-    { url = "https://files.pythonhosted.org/packages/34/cb/90f15dd029f07cebbd91f8238a8b363978b530cd128488085b5703683594/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:550bf765101ae721ee1d37d8095f47b1f220650f85fe1af37a90ce75bab89d04", size = 1550093, upload-time = "2025-10-28T20:57:26.257Z" },
-    { url = "https://files.pythonhosted.org/packages/69/46/12dce9be9d3303ecbf4d30ad45a7683dc63d90733c2d9fe512be6716cd40/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fe91b87fc295973096251e2d25a811388e7d8adf3bd2b97ef6ae78bc4ac6c476", size = 1758084, upload-time = "2025-10-28T20:57:28.349Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/c8/0932b558da0c302ffd639fc6362a313b98fdf235dc417bc2493da8394df7/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e0c8e31cfcc4592cb200160344b2fb6ae0f9e4effe06c644b5a125d4ae5ebe23", size = 1716987, upload-time = "2025-10-28T20:57:30.233Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/8b/f5bd1a75003daed099baec373aed678f2e9b34f2ad40d85baa1368556396/aiohttp-3.13.2-cp313-cp313-win32.whl", hash = "sha256:0740f31a60848d6edb296a0df827473eede90c689b8f9f2a4cdde74889eb2254", size = 425859, upload-time = "2025-10-28T20:57:32.105Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/28/a8a9fc6957b2cee8902414e41816b5ab5536ecf43c3b1843c10e82c559b2/aiohttp-3.13.2-cp313-cp313-win_amd64.whl", hash = "sha256:a88d13e7ca367394908f8a276b89d04a3652044612b9a408a0bb22a5ed976a1a", size = 452192, upload-time = "2025-10-28T20:57:34.166Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/36/e2abae1bd815f01c957cbf7be817b3043304e1c87bad526292a0410fdcf9/aiohttp-3.13.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2475391c29230e063ef53a66669b7b691c9bfc3f1426a0f7bcdf1216bdbac38b", size = 735234, upload-time = "2025-10-28T20:57:36.415Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/e3/1ee62dde9b335e4ed41db6bba02613295a0d5b41f74a783c142745a12763/aiohttp-3.13.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:f33c8748abef4d8717bb20e8fb1b3e07c6adacb7fd6beaae971a764cf5f30d61", size = 490733, upload-time = "2025-10-28T20:57:38.205Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/aa/7a451b1d6a04e8d15a362af3e9b897de71d86feac3babf8894545d08d537/aiohttp-3.13.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ae32f24bbfb7dbb485a24b30b1149e2f200be94777232aeadba3eecece4d0aa4", size = 491303, upload-time = "2025-10-28T20:57:40.122Z" },
-    { url = "https://files.pythonhosted.org/packages/57/1e/209958dbb9b01174870f6a7538cd1f3f28274fdbc88a750c238e2c456295/aiohttp-3.13.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d7f02042c1f009ffb70067326ef183a047425bb2ff3bc434ead4dd4a4a66a2b", size = 1717965, upload-time = "2025-10-28T20:57:42.28Z" },
-    { url = "https://files.pythonhosted.org/packages/08/aa/6a01848d6432f241416bc4866cae8dc03f05a5a884d2311280f6a09c73d6/aiohttp-3.13.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:93655083005d71cd6c072cdab54c886e6570ad2c4592139c3fb967bfc19e4694", size = 1667221, upload-time = "2025-10-28T20:57:44.869Z" },
-    { url = "https://files.pythonhosted.org/packages/87/4f/36c1992432d31bbc789fa0b93c768d2e9047ec8c7177e5cd84ea85155f36/aiohttp-3.13.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0db1e24b852f5f664cd728db140cf11ea0e82450471232a394b3d1a540b0f906", size = 1757178, upload-time = "2025-10-28T20:57:47.216Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/b4/8e940dfb03b7e0f68a82b88fd182b9be0a65cb3f35612fe38c038c3112cf/aiohttp-3.13.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b009194665bcd128e23eaddef362e745601afa4641930848af4c8559e88f18f9", size = 1838001, upload-time = "2025-10-28T20:57:49.337Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/ef/39f3448795499c440ab66084a9db7d20ca7662e94305f175a80f5b7e0072/aiohttp-3.13.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c038a8fdc8103cd51dbd986ecdce141473ffd9775a7a8057a6ed9c3653478011", size = 1716325, upload-time = "2025-10-28T20:57:51.327Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/51/b311500ffc860b181c05d91c59a1313bdd05c82960fdd4035a15740d431e/aiohttp-3.13.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:66bac29b95a00db411cd758fea0e4b9bdba6d549dfe333f9a945430f5f2cc5a6", size = 1547978, upload-time = "2025-10-28T20:57:53.554Z" },
-    { url = "https://files.pythonhosted.org/packages/31/64/b9d733296ef79815226dab8c586ff9e3df41c6aff2e16c06697b2d2e6775/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4ebf9cfc9ba24a74cf0718f04aac2a3bbe745902cc7c5ebc55c0f3b5777ef213", size = 1682042, upload-time = "2025-10-28T20:57:55.617Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/30/43d3e0f9d6473a6db7d472104c4eff4417b1e9df01774cb930338806d36b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a4b88ebe35ce54205c7074f7302bd08a4cb83256a3e0870c72d6f68a3aaf8e49", size = 1680085, upload-time = "2025-10-28T20:57:57.59Z" },
-    { url = "https://files.pythonhosted.org/packages/16/51/c709f352c911b1864cfd1087577760ced64b3e5bee2aa88b8c0c8e2e4972/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:98c4fb90bb82b70a4ed79ca35f656f4281885be076f3f970ce315402b53099ae", size = 1728238, upload-time = "2025-10-28T20:57:59.525Z" },
-    { url = "https://files.pythonhosted.org/packages/19/e2/19bd4c547092b773caeb48ff5ae4b1ae86756a0ee76c16727fcfd281404b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:ec7534e63ae0f3759df3a1ed4fa6bc8f75082a924b590619c0dd2f76d7043caa", size = 1544395, upload-time = "2025-10-28T20:58:01.914Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/87/860f2803b27dfc5ed7be532832a3498e4919da61299b4a1f8eb89b8ff44d/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5b927cf9b935a13e33644cbed6c8c4b2d0f25b713d838743f8fe7191b33829c4", size = 1742965, upload-time = "2025-10-28T20:58:03.972Z" },
-    { url = "https://files.pythonhosted.org/packages/67/7f/db2fc7618925e8c7a601094d5cbe539f732df4fb570740be88ed9e40e99a/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:88d6c017966a78c5265d996c19cdb79235be5e6412268d7e2ce7dee339471b7a", size = 1697585, upload-time = "2025-10-28T20:58:06.189Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/07/9127916cb09bb38284db5036036042b7b2c514c8ebaeee79da550c43a6d6/aiohttp-3.13.2-cp314-cp314-win32.whl", hash = "sha256:f7c183e786e299b5d6c49fb43a769f8eb8e04a2726a2bd5887b98b5cc2d67940", size = 431621, upload-time = "2025-10-28T20:58:08.636Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/41/554a8a380df6d3a2bba8a7726429a23f4ac62aaf38de43bb6d6cde7b4d4d/aiohttp-3.13.2-cp314-cp314-win_amd64.whl", hash = "sha256:fe242cd381e0fb65758faf5ad96c2e460df6ee5b2de1072fe97e4127927e00b4", size = 457627, upload-time = "2025-10-28T20:58:11Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/8e/3824ef98c039d3951cb65b9205a96dd2b20f22241ee17d89c5701557c826/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:f10d9c0b0188fe85398c61147bbd2a657d616c876863bfeff43376e0e3134673", size = 767360, upload-time = "2025-10-28T20:58:13.358Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/0f/6a03e3fc7595421274fa34122c973bde2d89344f8a881b728fa8c774e4f1/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e7c952aefdf2460f4ae55c5e9c3e80aa72f706a6317e06020f80e96253b1accd", size = 504616, upload-time = "2025-10-28T20:58:15.339Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/aa/ed341b670f1bc8a6f2c6a718353d13b9546e2cef3544f573c6a1ff0da711/aiohttp-3.13.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c20423ce14771d98353d2e25e83591fa75dfa90a3c1848f3d7c68243b4fbded3", size = 509131, upload-time = "2025-10-28T20:58:17.693Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/f0/c68dac234189dae5c4bbccc0f96ce0cc16b76632cfc3a08fff180045cfa4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e96eb1a34396e9430c19d8338d2ec33015e4a87ef2b4449db94c22412e25ccdf", size = 1864168, upload-time = "2025-10-28T20:58:20.113Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/65/75a9a76db8364b5d0e52a0c20eabc5d52297385d9af9c35335b924fafdee/aiohttp-3.13.2-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:23fb0783bc1a33640036465019d3bba069942616a6a2353c6907d7fe1ccdaf4e", size = 1719200, upload-time = "2025-10-28T20:58:22.583Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/55/8df2ed78d7f41d232f6bd3ff866b6f617026551aa1d07e2f03458f964575/aiohttp-3.13.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1a9bea6244a1d05a4e57c295d69e159a5c50d8ef16aa390948ee873478d9a5", size = 1843497, upload-time = "2025-10-28T20:58:24.672Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/e0/94d7215e405c5a02ccb6a35c7a3a6cfff242f457a00196496935f700cde5/aiohttp-3.13.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0a3d54e822688b56e9f6b5816fb3de3a3a64660efac64e4c2dc435230ad23bad", size = 1935703, upload-time = "2025-10-28T20:58:26.758Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/78/1eeb63c3f9b2d1015a4c02788fb543141aad0a03ae3f7a7b669b2483f8d4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a653d872afe9f33497215745da7a943d1dc15b728a9c8da1c3ac423af35178e", size = 1792738, upload-time = "2025-10-28T20:58:29.787Z" },
-    { url = "https://files.pythonhosted.org/packages/41/75/aaf1eea4c188e51538c04cc568040e3082db263a57086ea74a7d38c39e42/aiohttp-3.13.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:56d36e80d2003fa3fc0207fac644216d8532e9504a785ef9a8fd013f84a42c61", size = 1624061, upload-time = "2025-10-28T20:58:32.529Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/c2/3b6034de81fbcc43de8aeb209073a2286dfb50b86e927b4efd81cf848197/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:78cd586d8331fb8e241c2dd6b2f4061778cc69e150514b39a9e28dd050475661", size = 1789201, upload-time = "2025-10-28T20:58:34.618Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/38/c15dcf6d4d890217dae79d7213988f4e5fe6183d43893a9cf2fe9e84ca8d/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:20b10bbfbff766294fe99987f7bb3b74fdd2f1a2905f2562132641ad434dcf98", size = 1776868, upload-time = "2025-10-28T20:58:38.835Z" },
-    { url = "https://files.pythonhosted.org/packages/04/75/f74fd178ac81adf4f283a74847807ade5150e48feda6aef024403716c30c/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9ec49dff7e2b3c85cdeaa412e9d438f0ecd71676fde61ec57027dd392f00c693", size = 1790660, upload-time = "2025-10-28T20:58:41.507Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/80/7368bd0d06b16b3aba358c16b919e9c46cf11587dc572091031b0e9e3ef0/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:94f05348c4406450f9d73d38efb41d669ad6cd90c7ee194810d0eefbfa875a7a", size = 1617548, upload-time = "2025-10-28T20:58:43.674Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/4b/a6212790c50483cb3212e507378fbe26b5086d73941e1ec4b56a30439688/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:fa4dcb605c6f82a80c7f95713c2b11c3b8e9893b3ebd2bc9bde93165ed6107be", size = 1817240, upload-time = "2025-10-28T20:58:45.787Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/f7/ba5f0ba4ea8d8f3c32850912944532b933acbf0f3a75546b89269b9b7dde/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cf00e5db968c3f67eccd2778574cf64d8b27d95b237770aa32400bd7a1ca4f6c", size = 1762334, upload-time = "2025-10-28T20:58:47.936Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/83/1a5a1856574588b1cad63609ea9ad75b32a8353ac995d830bf5da9357364/aiohttp-3.13.2-cp314-cp314t-win32.whl", hash = "sha256:d23b5fe492b0805a50d3371e8a728a9134d8de5447dce4c885f5587294750734", size = 464685, upload-time = "2025-10-28T20:58:50.642Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/4d/d22668674122c08f4d56972297c51a624e64b3ed1efaa40187607a7cb66e/aiohttp-3.13.2-cp314-cp314t-win_amd64.whl", hash = "sha256:ff0a7b0a82a7ab905cbda74006318d1b12e37c797eb1b0d4eb3e316cf47f658f", size = 498093, upload-time = "2025-10-28T20:58:52.782Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/36/d6/5aec9313ee6ea9c7cde8b891b69f4ff4001416867104580670a31daeba5b/aiohttp-3.13.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d5a372fd5afd301b3a89582817fdcdb6c34124787c70dbcc616f259013e7eef7", size = 738950, upload-time = "2026-01-03T17:29:13.002Z" },
+    { url = "https://files.pythonhosted.org/packages/68/03/8fa90a7e6d11ff20a18837a8e2b5dd23db01aabc475aa9271c8ad33299f5/aiohttp-3.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:147e422fd1223005c22b4fe080f5d93ced44460f5f9c105406b753612b587821", size = 496099, upload-time = "2026-01-03T17:29:15.268Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/23/b81f744d402510a8366b74eb420fc0cc1170d0c43daca12d10814df85f10/aiohttp-3.13.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:859bd3f2156e81dd01432f5849fc73e2243d4a487c4fd26609b1299534ee1845", size = 491072, upload-time = "2026-01-03T17:29:16.922Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/e1/56d1d1c0dd334cd203dd97706ce004c1aa24b34a813b0b8daf3383039706/aiohttp-3.13.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dca68018bf48c251ba17c72ed479f4dafe9dbd5a73707ad8d28a38d11f3d42af", size = 1671588, upload-time = "2026-01-03T17:29:18.539Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/34/8d7f962604f4bc2b4e39eb1220dac7d4e4cba91fb9ba0474b4ecd67db165/aiohttp-3.13.3-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fee0c6bc7db1de362252affec009707a17478a00ec69f797d23ca256e36d5940", size = 1640334, upload-time = "2026-01-03T17:29:21.028Z" },
+    { url = "https://files.pythonhosted.org/packages/94/1d/fcccf2c668d87337ddeef9881537baee13c58d8f01f12ba8a24215f2b804/aiohttp-3.13.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c048058117fd649334d81b4b526e94bde3ccaddb20463a815ced6ecbb7d11160", size = 1722656, upload-time = "2026-01-03T17:29:22.531Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/98/c6f3b081c4c606bc1e5f2ec102e87d6411c73a9ef3616fea6f2d5c98c062/aiohttp-3.13.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:215a685b6fbbfcf71dfe96e3eba7a6f58f10da1dfdf4889c7dd856abe430dca7", size = 1817625, upload-time = "2026-01-03T17:29:24.276Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/c0/cfcc3d2e11b477f86e1af2863f3858c8850d751ce8dc39c4058a072c9e54/aiohttp-3.13.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2c184bb1fe2cbd2cefba613e9db29a5ab559323f994b6737e370d3da0ac455", size = 1672604, upload-time = "2026-01-03T17:29:26.099Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/77/6b4ffcbcac4c6a5d041343a756f34a6dd26174ae07f977a64fe028dda5b0/aiohttp-3.13.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:75ca857eba4e20ce9f546cd59c7007b33906a4cd48f2ff6ccf1ccfc3b646f279", size = 1554370, upload-time = "2026-01-03T17:29:28.121Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/f0/e3ddfa93f17d689dbe014ba048f18e0c9f9b456033b70e94349a2e9048be/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81e97251d9298386c2b7dbeb490d3d1badbdc69107fb8c9299dd04eb39bddc0e", size = 1642023, upload-time = "2026-01-03T17:29:30.002Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/45/c14019c9ec60a8e243d06d601b33dcc4fd92379424bde3021725859d7f99/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c0e2d366af265797506f0283487223146af57815b388623f0357ef7eac9b209d", size = 1649680, upload-time = "2026-01-03T17:29:31.782Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/fd/09c9451dae5aa5c5ed756df95ff9ef549d45d4be663bafd1e4954fd836f0/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4e239d501f73d6db1522599e14b9b321a7e3b1de66ce33d53a765d975e9f4808", size = 1692407, upload-time = "2026-01-03T17:29:33.392Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/81/938bc2ec33c10efd6637ccb3d22f9f3160d08e8f3aa2587a2c2d5ab578eb/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0db318f7a6f065d84cb1e02662c526294450b314a02bd9e2a8e67f0d8564ce40", size = 1543047, upload-time = "2026-01-03T17:29:34.855Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/23/80488ee21c8d567c83045e412e1d9b7077d27171591a4eb7822586e8c06a/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:bfc1cc2fe31a6026a8a88e4ecfb98d7f6b1fec150cfd708adbfd1d2f42257c29", size = 1715264, upload-time = "2026-01-03T17:29:36.389Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/83/259a8da6683182768200b368120ab3deff5370bed93880fb9a3a86299f34/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af71fff7bac6bb7508956696dce8f6eec2bbb045eceb40343944b1ae62b5ef11", size = 1657275, upload-time = "2026-01-03T17:29:38.162Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/4f/2c41f800a0b560785c10fb316216ac058c105f9be50bdc6a285de88db625/aiohttp-3.13.3-cp310-cp310-win32.whl", hash = "sha256:37da61e244d1749798c151421602884db5270faf479cf0ef03af0ff68954c9dd", size = 434053, upload-time = "2026-01-03T17:29:40.074Z" },
+    { url = "https://files.pythonhosted.org/packages/80/df/29cd63c7ecfdb65ccc12f7d808cac4fa2a19544660c06c61a4a48462de0c/aiohttp-3.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:7e63f210bc1b57ef699035f2b4b6d9ce096b5914414a49b0997c839b2bd2223c", size = 456687, upload-time = "2026-01-03T17:29:41.819Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" },
+    { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" },
+    { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" },
+    { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 1804590, upload-time = "2026-01-03T17:30:07.135Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" },
+    { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" },
+    { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" },
+    { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" },
+    { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" },
+    { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = "2026-01-03T17:30:41.081Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" },
+    { url = "https://files.pythonhosted.org/packages/97/8a/12ca489246ca1faaf5432844adbfce7ff2cc4997733e0af120869345643a/aiohttp-3.13.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5dff64413671b0d3e7d5918ea490bdccb97a4ad29b3f311ed423200b2203e01c", size = 734190, upload-time = "2026-01-03T17:30:45.832Z" },
+    { url = "https://files.pythonhosted.org/packages/32/08/de43984c74ed1fca5c014808963cc83cb00d7bb06af228f132d33862ca76/aiohttp-3.13.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:87b9aab6d6ed88235aa2970294f496ff1a1f9adcd724d800e9b952395a80ffd9", size = 491783, upload-time = "2026-01-03T17:30:47.466Z" },
+    { url = "https://files.pythonhosted.org/packages/17/f8/8dd2cf6112a5a76f81f81a5130c57ca829d101ad583ce57f889179accdda/aiohttp-3.13.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:425c126c0dc43861e22cb1c14ba4c8e45d09516d0a3ae0a3f7494b79f5f233a3", size = 490704, upload-time = "2026-01-03T17:30:49.373Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/40/a46b03ca03936f832bc7eaa47cfbb1ad012ba1be4790122ee4f4f8cba074/aiohttp-3.13.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f9120f7093c2a32d9647abcaf21e6ad275b4fbec5b55969f978b1a97c7c86bf", size = 1720652, upload-time = "2026-01-03T17:30:50.974Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/7e/917fe18e3607af92657e4285498f500dca797ff8c918bd7d90b05abf6c2a/aiohttp-3.13.3-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:697753042d57f4bf7122cab985bf15d0cef23c770864580f5af4f52023a56bd6", size = 1692014, upload-time = "2026-01-03T17:30:52.729Z" },
+    { url = "https://files.pythonhosted.org/packages/71/b6/cefa4cbc00d315d68973b671cf105b21a609c12b82d52e5d0c9ae61d2a09/aiohttp-3.13.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6de499a1a44e7de70735d0b39f67c8f25eb3d91eb3103be99ca0fa882cdd987d", size = 1759777, upload-time = "2026-01-03T17:30:54.537Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/e3/e06ee07b45e59e6d81498b591fc589629be1553abb2a82ce33efe2a7b068/aiohttp-3.13.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:37239e9f9a7ea9ac5bf6b92b0260b01f8a22281996da609206a84df860bc1261", size = 1861276, upload-time = "2026-01-03T17:30:56.512Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/24/75d274228acf35ceeb2850b8ce04de9dd7355ff7a0b49d607ee60c29c518/aiohttp-3.13.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f76c1e3fe7d7c8afad7ed193f89a292e1999608170dcc9751a7462a87dfd5bc0", size = 1743131, upload-time = "2026-01-03T17:30:58.256Z" },
+    { url = "https://files.pythonhosted.org/packages/04/98/3d21dde21889b17ca2eea54fdcff21b27b93f45b7bb94ca029c31ab59dc3/aiohttp-3.13.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fc290605db2a917f6e81b0e1e0796469871f5af381ce15c604a3c5c7e51cb730", size = 1556863, upload-time = "2026-01-03T17:31:00.445Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/84/da0c3ab1192eaf64782b03971ab4055b475d0db07b17eff925e8c93b3aa5/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4021b51936308aeea0367b8f006dc999ca02bc118a0cc78c303f50a2ff6afb91", size = 1682793, upload-time = "2026-01-03T17:31:03.024Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0f/5802ada182f575afa02cbd0ec5180d7e13a402afb7c2c03a9aa5e5d49060/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:49a03727c1bba9a97d3e93c9f93ca03a57300f484b6e935463099841261195d3", size = 1716676, upload-time = "2026-01-03T17:31:04.842Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/8c/714d53bd8b5a4560667f7bbbb06b20c2382f9c7847d198370ec6526af39c/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3d9908a48eb7416dc1f4524e69f1d32e5d90e3981e4e37eb0aa1cd18f9cfa2a4", size = 1733217, upload-time = "2026-01-03T17:31:06.868Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/79/e2176f46d2e963facea939f5be2d26368ce543622be6f00a12844d3c991f/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2712039939ec963c237286113c68dbad80a82a4281543f3abf766d9d73228998", size = 1552303, upload-time = "2026-01-03T17:31:08.958Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/6a/28ed4dea1759916090587d1fe57087b03e6c784a642b85ef48217b0277ae/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7bfdc049127717581866fa4708791220970ce291c23e28ccf3922c700740fdc0", size = 1763673, upload-time = "2026-01-03T17:31:10.676Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/35/4a3daeb8b9fab49240d21c04d50732313295e4bd813a465d840236dd0ce1/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8057c98e0c8472d8846b9c79f56766bcc57e3e8ac7bfd510482332366c56c591", size = 1721120, upload-time = "2026-01-03T17:31:12.575Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/9f/d643bb3c5fb99547323e635e251c609fbbc660d983144cfebec529e09264/aiohttp-3.13.3-cp313-cp313-win32.whl", hash = "sha256:1449ceddcdbcf2e0446957863af03ebaaa03f94c090f945411b61269e2cb5daf", size = 427383, upload-time = "2026-01-03T17:31:14.382Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/f1/ab0395f8a79933577cdd996dd2f9aa6014af9535f65dddcf88204682fe62/aiohttp-3.13.3-cp313-cp313-win_amd64.whl", hash = "sha256:693781c45a4033d31d4187d2436f5ac701e7bbfe5df40d917736108c1cc7436e", size = 453899, upload-time = "2026-01-03T17:31:15.958Z" },
+    { url = "https://files.pythonhosted.org/packages/99/36/5b6514a9f5d66f4e2597e40dea2e3db271e023eb7a5d22defe96ba560996/aiohttp-3.13.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:ea37047c6b367fd4bd632bff8077449b8fa034b69e812a18e0132a00fae6e808", size = 737238, upload-time = "2026-01-03T17:31:17.909Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/49/459327f0d5bcd8c6c9ca69e60fdeebc3622861e696490d8674a6d0cb90a6/aiohttp-3.13.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6fc0e2337d1a4c3e6acafda6a78a39d4c14caea625124817420abceed36e2415", size = 492292, upload-time = "2026-01-03T17:31:19.919Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/0b/b97660c5fd05d3495b4eb27f2d0ef18dc1dc4eff7511a9bf371397ff0264/aiohttp-3.13.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c685f2d80bb67ca8c3837823ad76196b3694b0159d232206d1e461d3d434666f", size = 493021, upload-time = "2026-01-03T17:31:21.636Z" },
+    { url = "https://files.pythonhosted.org/packages/54/d4/438efabdf74e30aeceb890c3290bbaa449780583b1270b00661126b8aae4/aiohttp-3.13.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e377758516d262bde50c2584fc6c578af272559c409eecbdd2bae1601184d6", size = 1717263, upload-time = "2026-01-03T17:31:23.296Z" },
+    { url = "https://files.pythonhosted.org/packages/71/f2/7bddc7fd612367d1459c5bcf598a9e8f7092d6580d98de0e057eb42697ad/aiohttp-3.13.3-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:34749271508078b261c4abb1767d42b8d0c0cc9449c73a4df494777dc55f0687", size = 1669107, upload-time = "2026-01-03T17:31:25.334Z" },
+    { url = "https://files.pythonhosted.org/packages/00/5a/1aeaecca40e22560f97610a329e0e5efef5e0b5afdf9f857f0d93839ab2e/aiohttp-3.13.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:82611aeec80eb144416956ec85b6ca45a64d76429c1ed46ae1b5f86c6e0c9a26", size = 1760196, upload-time = "2026-01-03T17:31:27.394Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/f8/0ff6992bea7bd560fc510ea1c815f87eedd745fe035589c71ce05612a19a/aiohttp-3.13.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2fff83cfc93f18f215896e3a190e8e5cb413ce01553901aca925176e7568963a", size = 1843591, upload-time = "2026-01-03T17:31:29.238Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/d1/e30e537a15f53485b61f5be525f2157da719819e8377298502aebac45536/aiohttp-3.13.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bbe7d4cecacb439e2e2a8a1a7b935c25b812af7a5fd26503a66dadf428e79ec1", size = 1720277, upload-time = "2026-01-03T17:31:31.053Z" },
+    { url = "https://files.pythonhosted.org/packages/84/45/23f4c451d8192f553d38d838831ebbc156907ea6e05557f39563101b7717/aiohttp-3.13.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b928f30fe49574253644b1ca44b1b8adbd903aa0da4b9054a6c20fc7f4092a25", size = 1548575, upload-time = "2026-01-03T17:31:32.87Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/ed/0a42b127a43712eda7807e7892c083eadfaf8429ca8fb619662a530a3aab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7b5e8fe4de30df199155baaf64f2fcd604f4c678ed20910db8e2c66dc4b11603", size = 1679455, upload-time = "2026-01-03T17:31:34.76Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/b5/c05f0c2b4b4fe2c9d55e73b6d3ed4fd6c9dc2684b1d81cbdf77e7fad9adb/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:8542f41a62bcc58fc7f11cf7c90e0ec324ce44950003feb70640fc2a9092c32a", size = 1687417, upload-time = "2026-01-03T17:31:36.699Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/6b/915bc5dad66aef602b9e459b5a973529304d4e89ca86999d9d75d80cbd0b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5e1d8c8b8f1d91cd08d8f4a3c2b067bfca6ec043d3ff36de0f3a715feeedf926", size = 1729968, upload-time = "2026-01-03T17:31:38.622Z" },
+    { url = "https://files.pythonhosted.org/packages/11/3b/e84581290a9520024a08640b63d07673057aec5ca548177a82026187ba73/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:90455115e5da1c3c51ab619ac57f877da8fd6d73c05aacd125c5ae9819582aba", size = 1545690, upload-time = "2026-01-03T17:31:40.57Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/04/0c3655a566c43fd647c81b895dfe361b9f9ad6d58c19309d45cff52d6c3b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:042e9e0bcb5fba81886c8b4fbb9a09d6b8a00245fd8d88e4d989c1f96c74164c", size = 1746390, upload-time = "2026-01-03T17:31:42.857Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/53/71165b26978f719c3419381514c9690bd5980e764a09440a10bb816ea4ab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2eb752b102b12a76ca02dff751a801f028b4ffbbc478840b473597fc91a9ed43", size = 1702188, upload-time = "2026-01-03T17:31:44.984Z" },
+    { url = "https://files.pythonhosted.org/packages/29/a7/cbe6c9e8e136314fa1980da388a59d2f35f35395948a08b6747baebb6aa6/aiohttp-3.13.3-cp314-cp314-win32.whl", hash = "sha256:b556c85915d8efaed322bf1bdae9486aa0f3f764195a0fb6ee962e5c71ef5ce1", size = 433126, upload-time = "2026-01-03T17:31:47.463Z" },
+    { url = "https://files.pythonhosted.org/packages/de/56/982704adea7d3b16614fc5936014e9af85c0e34b58f9046655817f04306e/aiohttp-3.13.3-cp314-cp314-win_amd64.whl", hash = "sha256:9bf9f7a65e7aa20dd764151fb3d616c81088f91f8df39c3893a536e279b4b984", size = 459128, upload-time = "2026-01-03T17:31:49.2Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/2a/3c79b638a9c3d4658d345339d22070241ea341ed4e07b5ac60fb0f418003/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:05861afbbec40650d8a07ea324367cb93e9e8cc7762e04dd4405df99fa65159c", size = 769512, upload-time = "2026-01-03T17:31:51.134Z" },
+    { url = "https://files.pythonhosted.org/packages/29/b9/3e5014d46c0ab0db8707e0ac2711ed28c4da0218c358a4e7c17bae0d8722/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2fc82186fadc4a8316768d61f3722c230e2c1dcab4200d52d2ebdf2482e47592", size = 506444, upload-time = "2026-01-03T17:31:52.85Z" },
+    { url = "https://files.pythonhosted.org/packages/90/03/c1d4ef9a054e151cd7839cdc497f2638f00b93cbe8043983986630d7a80c/aiohttp-3.13.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0add0900ff220d1d5c5ebbf99ed88b0c1bbf87aa7e4262300ed1376a6b13414f", size = 510798, upload-time = "2026-01-03T17:31:54.91Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/76/8c1e5abbfe8e127c893fe7ead569148a4d5a799f7cf958d8c09f3eedf097/aiohttp-3.13.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:568f416a4072fbfae453dcf9a99194bbb8bdeab718e08ee13dfa2ba0e4bebf29", size = 1868835, upload-time = "2026-01-03T17:31:56.733Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/ac/984c5a6f74c363b01ff97adc96a3976d9c98940b8969a1881575b279ac5d/aiohttp-3.13.3-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:add1da70de90a2569c5e15249ff76a631ccacfe198375eead4aadf3b8dc849dc", size = 1720486, upload-time = "2026-01-03T17:31:58.65Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/9a/b7039c5f099c4eb632138728828b33428585031a1e658d693d41d07d89d1/aiohttp-3.13.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b47b7ba335d2e9b1239fa571131a87e2d8ec96b333e68b2a305e7a98b0bae2", size = 1847951, upload-time = "2026-01-03T17:32:00.989Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/02/3bec2b9a1ba3c19ff89a43a19324202b8eb187ca1e928d8bdac9bbdddebd/aiohttp-3.13.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3dd4dce1c718e38081c8f35f323209d4c1df7d4db4bab1b5c88a6b4d12b74587", size = 1941001, upload-time = "2026-01-03T17:32:03.122Z" },
+    { url = "https://files.pythonhosted.org/packages/37/df/d879401cedeef27ac4717f6426c8c36c3091c6e9f08a9178cc87549c537f/aiohttp-3.13.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34bac00a67a812570d4a460447e1e9e06fae622946955f939051e7cc895cfab8", size = 1797246, upload-time = "2026-01-03T17:32:05.255Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/15/be122de1f67e6953add23335c8ece6d314ab67c8bebb3f181063010795a7/aiohttp-3.13.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a19884d2ee70b06d9204b2727a7b9f983d0c684c650254679e716b0b77920632", size = 1627131, upload-time = "2026-01-03T17:32:07.607Z" },
+    { url = "https://files.pythonhosted.org/packages/12/12/70eedcac9134cfa3219ab7af31ea56bc877395b1ac30d65b1bc4b27d0438/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ca7f2bb6ba8348a3614c7918cc4bb73268c5ac2a207576b7afea19d3d9f64", size = 1795196, upload-time = "2026-01-03T17:32:09.59Z" },
+    { url = "https://files.pythonhosted.org/packages/32/11/b30e1b1cd1f3054af86ebe60df96989c6a414dd87e27ad16950eee420bea/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b0d95340658b9d2f11d9697f59b3814a9d3bb4b7a7c20b131df4bcef464037c0", size = 1782841, upload-time = "2026-01-03T17:32:11.445Z" },
+    { url = "https://files.pythonhosted.org/packages/88/0d/d98a9367b38912384a17e287850f5695c528cff0f14f791ce8ee2e4f7796/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1e53262fd202e4b40b70c3aff944a8155059beedc8a89bba9dc1f9ef06a1b56", size = 1795193, upload-time = "2026-01-03T17:32:13.705Z" },
+    { url = "https://files.pythonhosted.org/packages/43/a5/a2dfd1f5ff5581632c7f6a30e1744deda03808974f94f6534241ef60c751/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d60ac9663f44168038586cab2157e122e46bdef09e9368b37f2d82d354c23f72", size = 1621979, upload-time = "2026-01-03T17:32:15.965Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/f0/12973c382ae7c1cccbc4417e129c5bf54c374dfb85af70893646e1f0e749/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:90751b8eed69435bac9ff4e3d2f6b3af1f57e37ecb0fbeee59c0174c9e2d41df", size = 1822193, upload-time = "2026-01-03T17:32:18.219Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/5f/24155e30ba7f8c96918af1350eb0663e2430aad9e001c0489d89cd708ab1/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fc353029f176fd2b3ec6cfc71be166aba1936fe5d73dd1992ce289ca6647a9aa", size = 1769801, upload-time = "2026-01-03T17:32:20.25Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/f8/7314031ff5c10e6ece114da79b338ec17eeff3a079e53151f7e9f43c4723/aiohttp-3.13.3-cp314-cp314t-win32.whl", hash = "sha256:2e41b18a58da1e474a057b3d35248d8320029f61d70a37629535b16a0c8f3767", size = 466523, upload-time = "2026-01-03T17:32:22.215Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/63/278a98c715ae467624eafe375542d8ba9b4383a016df8fdefe0ae28382a7/aiohttp-3.13.3-cp314-cp314t-win_amd64.whl", hash = "sha256:44531a36aa2264a1860089ffd4dce7baf875ee5a6079d5fb42e261c704ef7344", size = 499694, upload-time = "2026-01-03T17:32:24.546Z" },
+]
+
+[[package]]
+name = "aiohttp-cors"
+version = "0.8.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/d89e846a5444b3d5eb8985a6ddb0daef3774928e1bfbce8e84ec97b0ffa7/aiohttp_cors-0.8.1.tar.gz", hash = "sha256:ccacf9cb84b64939ea15f859a146af1f662a6b1d68175754a07315e305fb1403", size = 38626, upload-time = "2025-03-31T14:16:20.048Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl", hash = "sha256:3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d", size = 25231, upload-time = "2025-03-31T14:16:18.478Z" },
 ]
 
 [[package]]
@@ -262,10 +323,10 @@ name = "anyio"
 version = "4.9.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
     { name = "idna" },
     { name = "sniffio" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }
 wheels = [
@@ -274,37 +335,37 @@ wheels = [
 
 [[package]]
 name = "apache-tvm-ffi"
-version = "0.1.3"
+version = "0.1.8.post2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/f0/af641a18833f35b37f01ecbdbf9baa0095805475adf8cd52ebeb7698fa8c/apache_tvm_ffi-0.1.3.tar.gz", hash = "sha256:d33f0bc0d028cddf321d69724c916504272a7f03dfc1d8e507d9d0f88b6f7cbf", size = 1276869, upload-time = "2025-11-21T05:11:00.562Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/68/13/ad0af6fb5203df6c92e404c5465d44a60bae7de0741a93fb1a3b4829692e/apache_tvm_ffi-0.1.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8999f431b3acd04a2d79f38e2ebfbb089d0f43ed87528674d7bda6d3f796ddc", size = 1743043, upload-time = "2025-11-21T05:10:05.255Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/64/f362d0010daacea93a928de0c31df6b7d40ef8cd57e9117535ee0adc2704/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:81f187d08d9040ec98b22fb6906c68b1df60b41567f2b507293f53f630b0136f", size = 1895551, upload-time = "2025-11-21T05:10:07.223Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/98/daa0f491312ebe4dccc7d84799c0b5b1bc5eee6b1093208a4fbb98175579/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dacfd2974a60a6b531a5fe8a3985f60368fc88a8ab3872c381fc1a80315d3d24", size = 1969790, upload-time = "2025-11-21T05:10:09.032Z" },
-    { url = "https://files.pythonhosted.org/packages/87/9c/68e30812874e60b141b99202dd3c4e4de964a7cb62cf6455de170b3a5111/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff65bf8a96dbbd2725937ff1502e52571e7a90d81d355a21a303328dd06449cc", size = 1844888, upload-time = "2025-11-21T05:10:10.871Z" },
-    { url = "https://files.pythonhosted.org/packages/49/97/ffe70c4679aebef0c1e32eec3970dc7e35113995d318aeb8c2ef0e4a3eb9/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:48ad3df2224f1b0943344895c6cba2f3f0a53bc67ddafdd3e9d7a34f56100aa9", size = 1953886, upload-time = "2025-11-21T05:10:12.55Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/f3/e03e5716a4e025d060585a9ca3123ce76e13dff8f464cda4d5e48ef9a26a/apache_tvm_ffi-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:6d56b2026aa614bd56d20375e5062ddb8d4baebd7a6b93476bbe3f0339cfa095", size = 1725820, upload-time = "2025-11-21T05:10:14.043Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/f0/d19a0b8e97e102f8376e18cd8234cc0a5f37d5c935ce74bf587e15f8450e/apache_tvm_ffi-0.1.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fae211bb8693c118109e106b73393164e3ca878823185cfd6e03765e04056f37", size = 1742398, upload-time = "2025-11-21T05:10:15.384Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/0c/699e26a3b7db2c1627ac87335deccf8a8b6cb2e218766fe9acd5aadb5f78/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:79ff39b5d6a2ed8665f4b91282391a052e8c7c76ac0f12f776ad0747f212f201", size = 1895272, upload-time = "2025-11-21T05:10:17.164Z" },
-    { url = "https://files.pythonhosted.org/packages/22/39/f64a1f1a23dc3298d3f50ceb275eb9b98b6898ea3df52e6d95fed756610c/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e2cc20f00d98e263ca35fef9a139fe65992988deddd570498ff77c11780ce22e", size = 1969033, upload-time = "2025-11-21T05:10:18.855Z" },
-    { url = "https://files.pythonhosted.org/packages/51/dc/fb9e25b83a57ae7b4df7308d839febf13d2e77b481ea79800e89f1eee470/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b2d1c8c421aaa0685fcc77347566da68e45d8d2dc150c2ee957906b1186d62", size = 1844972, upload-time = "2025-11-21T05:10:20.201Z" },
-    { url = "https://files.pythonhosted.org/packages/63/f2/ef1521e617254c2fe38b2f60440694de426b2402b225e1cc4ae04e9a22c2/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:adbc2f3b496d67199adaa999baecb9a3c9137cf1fc32163a4834950062bd0dd7", size = 1954220, upload-time = "2025-11-21T05:10:21.571Z" },
-    { url = "https://files.pythonhosted.org/packages/96/7c/1cadf17119f75b4d22761f8c003a767e63d456aac3f738ae42403ef7d990/apache_tvm_ffi-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:d797b29f70ea8c1843f4141a6b12b9770579a2b770f76898a96b721d2f987a23", size = 1725528, upload-time = "2025-11-21T05:10:23.043Z" },
-    { url = "https://files.pythonhosted.org/packages/21/b4/9983c1df90d239cc15055469c795a894bab85ffd75f9325d2f5e392dbf09/apache_tvm_ffi-0.1.3-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:71d1de0c139cae3824c1e8b511acf6b2bfd37deccfc640cb83b80ba17b33d6e3", size = 1719369, upload-time = "2025-11-21T05:10:24.768Z" },
-    { url = "https://files.pythonhosted.org/packages/01/e3/1b47af4391863351d9db42ab1ed116e3eba2c4ef49c1e161e4cd0ba379d9/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b0bc38da581c54c862840960c5bf0da5bb78aa007630d6f026675d1d4b1df898", size = 1867353, upload-time = "2025-11-21T05:10:26.481Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/6e/0d12246b90534be733accdfbfe6e2d5bde8d7c722293c21821fe10b09412/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:48160e8fa0235e8f3fad45102c4e856edb798c8b2954603f80f6721e3c0fd7ef", size = 1945829, upload-time = "2025-11-21T05:10:27.831Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/89/c4ad96b76a6e2d38795871bfb048c74aa60d1a7c01fab48cbe4e8c10f1a2/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1c215d4608e17d7f2382f3c6b2903a4696255727ac905041f3a005c50a98afc", size = 1817481, upload-time = "2025-11-21T05:10:29.543Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/c7/2f6bc83fcc987c2eb00037c3f27f1d182c2f0d8976a16807ef1395a8ece1/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b75cc773bc29db64bb69f11d260ec66e88ad0a4a951d25650f69d3b2c9f9a186", size = 1927595, upload-time = "2025-11-21T05:10:30.882Z" },
-    { url = "https://files.pythonhosted.org/packages/12/a0/597c522588abef7fcf3fe38492cf832eed8ba9123f01d3c33dfaec174dcc/apache_tvm_ffi-0.1.3-cp312-abi3-win_amd64.whl", hash = "sha256:86fd1e1012ec2ec25213f714f5f28e6f6b897360776872d5f71c4be8cae8aeb8", size = 1706236, upload-time = "2025-11-21T05:10:32.25Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/76/8404875ee3fb61a3c97026e2eaab8d97e7f974601e444d5abb37a765c686/apache_tvm_ffi-0.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0ef290a792d6e3734e2fe1ff19b2b82e6bd3af6714216c7fe32d0a39c0d0e8df", size = 1750006, upload-time = "2025-11-21T05:10:33.594Z" },
-    { url = "https://files.pythonhosted.org/packages/98/98/7989ccb343044f97491cb1e46e675da75defc82a56495c320dcb1e31583b/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7b137ab0c7ec6507f61e88885ddbd3541d7d14d8ca25938f5fa106ca06996d3", size = 1880792, upload-time = "2025-11-21T05:10:35.239Z" },
-    { url = "https://files.pythonhosted.org/packages/64/2e/f772e75f947ebfa2faa305980ba2c172ae26a53f66c8f0c1f8915c4fa690/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5187a90cf1c0663b8071f34f621f49ba83866412298deed9c4a94d1d991711b", size = 1953343, upload-time = "2025-11-21T05:10:36.879Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/a8/7d1d75f70d5a2cd283ded60784d9657c59fa7516f4b3c32437f70901d117/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54001ceab111e708a1638fd9e40713d9d55f6a073037a2d4a9f1982f8dda3c69", size = 1829560, upload-time = "2025-11-21T05:10:38.421Z" },
-    { url = "https://files.pythonhosted.org/packages/21/3a/6bee12cf517ace0bb8fd83bb72f6ca227743a49bab0c30918f523b5428df/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:996d87d6f180250e734ce7b7cce39f234e3ad3369fffb3882c8f29c79d280db4", size = 1937457, upload-time = "2025-11-21T05:10:40.505Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/99/107f082536447dba2a628e1571dd423b577df6bd8e441896e3f8b0929001/apache_tvm_ffi-0.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:6010c918c62fb19995e70c4f149dfc5c248783da0d22d5c40e84649bd89a9357", size = 1766053, upload-time = "2025-11-21T05:10:41.859Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/e3/e9/a13952726228fa6282154ecf927092396bc759739e5e045019f6ab92f3ca/apache_tvm_ffi-0.1.8.post2.tar.gz", hash = "sha256:4513e38852894f290172ecfefcbc18d34e817fd29c16a0f1770e130c82b4067e", size = 2441111, upload-time = "2026-01-13T18:11:27.864Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cd/65/0c67653e6431716f2706e29f2e2e1ce9a6f9d9f7615c0c637a4881c3f5a5/apache_tvm_ffi-0.1.8.post2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e11e03c865297c65c2f206c90b8014890bc52a3059d8148b47cd2c2759bcea90", size = 1838436, upload-time = "2026-01-13T18:10:22.334Z" },
+    { url = "https://files.pythonhosted.org/packages/46/8f/13fe7acbd7497312fda5faf51545fcb50c0ed5398cfe525d006ba29f1b9b/apache_tvm_ffi-0.1.8.post2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e855f2b3f60ec16939b00e1b594ce7f488f96e387b12547e98643177f70ab2b1", size = 1996102, upload-time = "2026-01-13T18:10:23.97Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/f8/b469a4d91ea74f627cb220835049fb60a566f7427f27c9f66c6c54a287b6/apache_tvm_ffi-0.1.8.post2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:854ecd90a1039d542c531fa6a4928f5633452aedf1ed7f646f3bbbeca8217156", size = 2069067, upload-time = "2026-01-13T18:10:25.425Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/88/663e532e7ba625a3998724ae0207ce620c32a057c339b4e4ae0be6810d85/apache_tvm_ffi-0.1.8.post2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1894b6f9c2b45bc9df8e407d041e575128591b998ced09f974675d2bb6b8bc9", size = 1939413, upload-time = "2026-01-13T18:10:28.61Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/16/6ec659fd5b3b163de9adc75bf29fc90460d212b489947b77b8ed89c01472/apache_tvm_ffi-0.1.8.post2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef922ef3ed971a4e161a0385ef9f67af379d52b0d83d62c08b79f6707b6660b5", size = 2053058, upload-time = "2026-01-13T18:10:30.721Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/a8/d01f81987db9bbfc4b242575d3fe79f72aeba3582ca449fec28d19938400/apache_tvm_ffi-0.1.8.post2-cp310-cp310-win_amd64.whl", hash = "sha256:146f98dcd21052eeed96ad07472bdffd8189fb2106edc6e3de91e28e3b000bf8", size = 1809231, upload-time = "2026-01-13T18:10:32.293Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/86/7db24692281d80204d07d77346ad4cb87f6183f1364ed94311993a47ed1a/apache_tvm_ffi-0.1.8.post2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:40f5fba3e06617f16888a0fdaf7ab4049841ff6e741644be822400438b771fe7", size = 1840013, upload-time = "2026-01-13T18:10:33.724Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/cc/fbaef883c6ba8e2c56ffcca997f2c076d1c14787799a62f39bd52c7126d5/apache_tvm_ffi-0.1.8.post2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9eb6d228fa22b6a5da140d761962f022a154746c91fe7608c49062deaf671f9f", size = 1995159, upload-time = "2026-01-13T18:10:35.727Z" },
+    { url = "https://files.pythonhosted.org/packages/49/08/f1e984e3573d0cbd6d53f3f73a12691fba153afc529fbd506d78e739b330/apache_tvm_ffi-0.1.8.post2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:581c0acf845859be0cc26ac79f3663a83393b662c97c7125ebb78f0228b69d96", size = 2068543, upload-time = "2026-01-13T18:10:39.12Z" },
+    { url = "https://files.pythonhosted.org/packages/35/1f/5336d430a133cf66ca9dac8ae9b6e25d8b99275a6687656421a1deee9f1b/apache_tvm_ffi-0.1.8.post2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:beadc7bb480ae02d02e2108543f6f4b4170d77e361ab3ccb43697d174ec185b0", size = 1939018, upload-time = "2026-01-13T18:10:40.621Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/67/969c66a27a128cf738d0c068e0d4451d691d8197929c797cbe8e59c6cfc9/apache_tvm_ffi-0.1.8.post2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e593d191c7ca0726ebcd3b024a4bc8140694fdfce2e7b02493f38ad5c4c9ecf7", size = 2053068, upload-time = "2026-01-13T18:10:43.241Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/f1/84881a799d227fdc4a61fbf0cb8d5ceb6a72ad788fa9070e5853ed9759b6/apache_tvm_ffi-0.1.8.post2-cp311-cp311-win_amd64.whl", hash = "sha256:1c685f19d0f26d9356c7c77a1cb652a3632ec9ee6cd21aa1d8cfb968743ec1fd", size = 1809557, upload-time = "2026-01-13T18:10:44.743Z" },
+    { url = "https://files.pythonhosted.org/packages/12/8b/a39d6c6eb1a87f6003e2717695cc6d44cc65ccd57dae5a0af944c0d25751/apache_tvm_ffi-0.1.8.post2-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:c13ec7fc8f255767998b301ace0cd1e7d17ba76b48ffeb97ca9eb22a3314e250", size = 1811882, upload-time = "2026-01-13T18:10:46.317Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/3a/7b1c9edcaeaebb945038144896cf17eb828a40b6ace0371823e133132664/apache_tvm_ffi-0.1.8.post2-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c78b4caf17304a1f47881bccdb2f9ac24d98b3b7fbe761a6dd4fd0585934d96", size = 1967259, upload-time = "2026-01-13T18:10:47.851Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/b6/463602f57dda2e1c69165c044c07061cd59404593f313a427a3ad9c02cf3/apache_tvm_ffi-0.1.8.post2-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4a48da3fa8f47130f3502134f01e97044388c5217e7b91be4b0acec4feab81a0", size = 2044821, upload-time = "2026-01-13T18:10:49.396Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/e6/9cdc7f4814b2fbdfceba5dc640c3704d07d8db18e3d1aef5aa49bbf1ba7e/apache_tvm_ffi-0.1.8.post2-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61cc98e489ebc03bc96d1a966dc863eb1c0a607383f6bf4a416ff0a96170ca85", size = 1910964, upload-time = "2026-01-13T18:10:51.345Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/f5/a2e5487cdad575fe6cf34f8a23f8c49e08ce5808fa75dc19d98bcebc20ec/apache_tvm_ffi-0.1.8.post2-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:caa48509f0c7d9b896823b492a9ee42afac2548065c1ec7ef07f9a0dc30d2796", size = 2025814, upload-time = "2026-01-13T18:10:52.804Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/0d/8922c142281187ae6b989579876d00d20b84ccd3878aad487b91d951d254/apache_tvm_ffi-0.1.8.post2-cp312-abi3-win_amd64.whl", hash = "sha256:985831722d1dd562d13e8e34102fd99f42f964c53fc7cf9d80fc4f7602f89196", size = 1790204, upload-time = "2026-01-13T18:10:54.558Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/6e/2c21e754adf5c08fff154ee0a75b01568a4ed5da2d8f4a4a95d8451736e0/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:4a3f6cb1173cfe19a1b66fd8577a6f3ce644bdc22691961c07c64304a7c3f17a", size = 1842240, upload-time = "2026-01-13T18:10:56.652Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/0a/342dd451d714b683143bd0d7dbd26279772dedf1d827a7efd357f05ff0aa/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ac6c2d4e117ca63974bcd20fdf5715d01f3b4d0ed78921f493461050daf7c1a3", size = 1980660, upload-time = "2026-01-13T18:10:58.892Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/63/59f00116530cf7513866467de9044dbdd1954a536009e56c44f167743b35/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0bc5456f971097dcd973daba32cb6f321893873c53235159ab6426b0c7bef7e2", size = 2052810, upload-time = "2026-01-13T18:11:01.698Z" },
+    { url = "https://files.pythonhosted.org/packages/46/dc/e22c784937fdc907785a764d773ef57a925c443d8ec01ad8bff43dd8d8d6/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f2016b4b31e7f75d71c638bbd1ae43d6e239cf8e20b539fb9de6917b3fb25bc", size = 1923716, upload-time = "2026-01-13T18:11:03.225Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/39/695f5642979d1d2d4cd3fca92e7b3b324ebba734b8aab9bdbacc26d4a05c/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c0ca7be630d0888eae163a4298ddfb3f7bd837112c7e6ffcd7157e34e78215b", size = 2035440, upload-time = "2026-01-13T18:11:04.841Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/e0/ed152425e51b7c8a4ce81d33683b43d87e770a76a65922dc7524a0106ae8/apache_tvm_ffi-0.1.8.post2-cp314-cp314t-win_amd64.whl", hash = "sha256:ecb0d9f7f410ba3b4d92547c2477f73f8406455448f4ea8c146515671fd20210", size = 1849938, upload-time = "2026-01-13T18:11:06.312Z" },
 ]
 
 [[package]]
@@ -339,68 +400,68 @@ wheels = [
 
 [[package]]
 name = "av"
-version = "16.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/15/c3/fd72a0315bc6c943ced1105aaac6e0ec1be57c70d8a616bd05acaa21ffee/av-16.0.1.tar.gz", hash = "sha256:dd2ce779fa0b5f5889a6d9e00fbbbc39f58e247e52d31044272648fe16ff1dbf", size = 3904030, upload-time = "2025-10-13T12:28:51.082Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e8/3c/eefa29b7d0f5afdf7af9197bbecad8ec2ad06bcb5ac7e909c05a624b00a6/av-16.0.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:8b141aaa29a3afc96a1d467d106790782c1914628b57309eaadb8c10c299c9c0", size = 27206679, upload-time = "2025-10-13T12:24:41.145Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/89/a474feb07d5b94aa5af3771b0fe328056e2e0a840039b329f4fa2a1fd13a/av-16.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b8a08a59a5be0082af063d3f4b216e3950340121c6ea95b505a3f5f5cc8f21d", size = 21774556, upload-time = "2025-10-13T12:24:44.332Z" },
-    { url = "https://files.pythonhosted.org/packages/be/e5/4361010dcac398bc224823e4b2a47803845e159af9f95164662c523770dc/av-16.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:792e7fc3c08eae005ff36486983966476e553cbb55aaeb0ec99adc4909377320", size = 38176763, upload-time = "2025-10-13T12:24:46.98Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/db/b27bdd20c9dc80de5b8792dae16dd6f4edf16408c0c7b28070c6228a8057/av-16.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:4e8ef5df76d8d0ee56139789f80bb90ad1a82a7e6df6e080e2e95c06fa22aea7", size = 39696277, upload-time = "2025-10-13T12:24:50.951Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/c8/dd48e6a3ac1e922c141475a0dc30e2b6dfdef9751b3274829889a9281cce/av-16.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4f7a6985784a7464f078e419c71f5528c3e550ee5d605e7149b4a37a111eb136", size = 39576660, upload-time = "2025-10-13T12:24:55.773Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/f0/223d047e2e60672a2fb5e51e28913de8d52195199f3e949cbfda1e6cd64b/av-16.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3f45c8d7b803b6faa2a25a26de5964a0a897de68298d9c9672c7af9d65d8b48a", size = 40752775, upload-time = "2025-10-13T12:25:00.827Z" },
-    { url = "https://files.pythonhosted.org/packages/18/73/73acad21c9203bc63d806e8baf42fe705eb5d36dafd1996b71ab5861a933/av-16.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:58e6faf1d9328d8cc6be14c5aadacb7d2965ed6d6ae1af32696993096543ff00", size = 32302328, upload-time = "2025-10-13T12:25:06.042Z" },
-    { url = "https://files.pythonhosted.org/packages/49/d3/f2a483c5273fccd556dfa1fce14fab3b5d6d213b46e28e54e254465a2255/av-16.0.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:e310d1fb42879df9bad2152a8db6d2ff8bf332c8c36349a09d62cc122f5070fb", size = 27191982, upload-time = "2025-10-13T12:25:10.622Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/39/dff28bd252131b3befd09d8587992fe18c09d5125eaefc83a6434d5f56ff/av-16.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:2f4b357e5615457a84e6b6290916b22864b76b43d5079e1a73bc27581a5b9bac", size = 21760305, upload-time = "2025-10-13T12:25:14.882Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/4d/2312d50a09c84a9b4269f7fea5de84f05dd2b7c7113dd961d31fad6c64c4/av-16.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:286665c77034c3a98080169b8b5586d5568a15da81fbcdaf8099252f2d232d7c", size = 38691616, upload-time = "2025-10-13T12:25:20.063Z" },
-    { url = "https://files.pythonhosted.org/packages/15/9a/3d2d30b56252f998e53fced13720e2ce809c4db477110f944034e0fa4c9f/av-16.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f88de8e5b8ea29e41af4d8d61df108323d050ccfbc90f15b13ec1f99ce0e841e", size = 40216464, upload-time = "2025-10-13T12:25:24.848Z" },
-    { url = "https://files.pythonhosted.org/packages/98/cb/3860054794a47715b4be0006105158c7119a57be58d9e8882b72e4d4e1dd/av-16.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0cdb71ebe4d1b241cf700f8f0c44a7d2a6602b921e16547dd68c0842113736e1", size = 40094077, upload-time = "2025-10-13T12:25:30.238Z" },
-    { url = "https://files.pythonhosted.org/packages/41/58/79830fb8af0a89c015250f7864bbd427dff09c70575c97847055f8a302f7/av-16.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:28c27a65d40e8cf82b6db2543f8feeb8b56d36c1938f50773494cd3b073c7223", size = 41279948, upload-time = "2025-10-13T12:25:35.24Z" },
-    { url = "https://files.pythonhosted.org/packages/83/79/6e1463b04382f379f857113b851cf5f9d580a2f7bd794211cd75352f4e04/av-16.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ffea39ac7574f234f5168f9b9602e8d4ecdd81853238ec4d661001f03a6d3f64", size = 32297586, upload-time = "2025-10-13T12:25:39.826Z" },
-    { url = "https://files.pythonhosted.org/packages/44/78/12a11d7a44fdd8b26a65e2efa1d8a5826733c8887a989a78306ec4785956/av-16.0.1-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:e41a8fef85dfb2c717349f9ff74f92f9560122a9f1a94b1c6c9a8a9c9462ba71", size = 27206375, upload-time = "2025-10-13T12:25:44.423Z" },
-    { url = "https://files.pythonhosted.org/packages/27/19/3a4d3882852a0ee136121979ce46f6d2867b974eb217a2c9a070939f55ad/av-16.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:6352a64b25c9f985d4f279c2902db9a92424e6f2c972161e67119616f0796cb9", size = 21752603, upload-time = "2025-10-13T12:25:49.122Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/6e/f7abefba6e008e2f69bebb9a17ba38ce1df240c79b36a5b5fcacf8c8fcfd/av-16.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5201f7b4b5ed2128118cb90c2a6d64feedb0586ca7c783176896c78ffb4bbd5c", size = 38931978, upload-time = "2025-10-13T12:25:55.021Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/7a/1305243ab47f724fdd99ddef7309a594e669af7f0e655e11bdd2c325dfae/av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:daecc2072b82b6a942acbdaa9a2e00c05234c61fef976b22713983c020b07992", size = 40549383, upload-time = "2025-10-13T12:26:00.897Z" },
-    { url = "https://files.pythonhosted.org/packages/32/b2/357cc063185043eb757b4a48782bff780826103bcad1eb40c3ddfc050b7e/av-16.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6573da96e8bebc3536860a7def108d7dbe1875c86517072431ced702447e6aea", size = 40241993, upload-time = "2025-10-13T12:26:06.993Z" },
-    { url = "https://files.pythonhosted.org/packages/20/bb/ced42a4588ba168bf0ef1e9d016982e3ba09fde6992f1dda586fd20dcf71/av-16.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4bc064e48a8de6c087b97dd27cf4ef8c13073f0793108fbce3ecd721201b2502", size = 41532235, upload-time = "2025-10-13T12:26:12.488Z" },
-    { url = "https://files.pythonhosted.org/packages/15/37/c7811eca0f318d5fd3212f7e8c3d8335f75a54907c97a89213dc580b8056/av-16.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0c669b6b6668c8ae74451c15ec6d6d8a36e4c3803dc5d9910f607a174dd18f17", size = 32296912, upload-time = "2025-10-13T12:26:19.187Z" },
-    { url = "https://files.pythonhosted.org/packages/86/59/972f199ccc4f8c9e51f59e0f8962a09407396b3f6d11355e2c697ba555f9/av-16.0.1-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:4c61c6c120f5c5d95c711caf54e2c4a9fb2f1e613ac0a9c273d895f6b2602e44", size = 27170433, upload-time = "2025-10-13T12:26:24.673Z" },
-    { url = "https://files.pythonhosted.org/packages/53/9d/0514cbc185fb20353ab25da54197fbd169a233e39efcbb26533c36a9dbb9/av-16.0.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ecc2e41320c69095f44aff93470a0d32c30892b2dbad0a08040441c81efa379", size = 21717654, upload-time = "2025-10-13T12:26:29.12Z" },
-    { url = "https://files.pythonhosted.org/packages/32/8c/881409dd124b4e07d909d2b70568acb21126fc747656390840a2238651c9/av-16.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:036f0554d6faef3f4a94acaeb0cedd388e3ab96eb0eb5a14ec27c17369c466c9", size = 38651601, upload-time = "2025-10-13T12:26:33.919Z" },
-    { url = "https://files.pythonhosted.org/packages/35/fd/867ba4cc3ab504442dc89b0c117e6a994fc62782eb634c8f31304586f93e/av-16.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:876415470a62e4a3550cc38db2fc0094c25e64eea34d7293b7454125d5958190", size = 40278604, upload-time = "2025-10-13T12:26:39.2Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/87/63cde866c0af09a1fa9727b4f40b34d71b0535785f5665c27894306f1fbc/av-16.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:56902a06bd0828d13f13352874c370670882048267191ff5829534b611ba3956", size = 39984854, upload-time = "2025-10-13T12:26:44.581Z" },
-    { url = "https://files.pythonhosted.org/packages/71/3b/8f40a708bff0e6b0f957836e2ef1f4d4429041cf8d99a415a77ead8ac8a3/av-16.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe988c2bf0fc2d952858f791f18377ea4ae4e19ba3504793799cd6c2a2562edf", size = 41270352, upload-time = "2025-10-13T12:26:50.817Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/b5/c114292cb58a7269405ae13b7ba48c7d7bfeebbb2e4e66c8073c065a4430/av-16.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:708a66c248848029bf518f0482b81c5803846f1b597ef8013b19c014470b620f", size = 32273242, upload-time = "2025-10-13T12:26:55.788Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/e9/a5b714bc078fdcca8b46c8a0b38484ae5c24cd81d9c1703d3e8ae2b57259/av-16.0.1-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:79a77ee452537030c21a0b41139bedaf16629636bf764b634e93b99c9d5f4558", size = 27248984, upload-time = "2025-10-13T12:27:00.564Z" },
-    { url = "https://files.pythonhosted.org/packages/06/ef/ff777aaf1f88e3f6ce94aca4c5806a0c360e68d48f9d9f0214e42650f740/av-16.0.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:080823a6ff712f81e7089ae9756fb1512ca1742a138556a852ce50f58e457213", size = 21828098, upload-time = "2025-10-13T12:27:05.433Z" },
-    { url = "https://files.pythonhosted.org/packages/34/d7/a484358d24a42bedde97f61f5d6ee568a7dd866d9df6e33731378db92d9e/av-16.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:04e00124afa8b46a850ed48951ddda61de874407fb8307d6a875bba659d5727e", size = 40051697, upload-time = "2025-10-13T12:27:10.525Z" },
-    { url = "https://files.pythonhosted.org/packages/73/87/6772d6080837da5d5c810a98a95bde6977e1f5a6e2e759e8c9292af9ec69/av-16.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:bc098c1c6dc4e7080629a7e9560e67bd4b5654951e17e5ddfd2b1515cfcd37db", size = 41352596, upload-time = "2025-10-13T12:27:16.217Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/58/fe448c60cf7f85640a0ed8936f16bac874846aa35e1baa521028949c1ea3/av-16.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ffd3559a72c46a76aa622630751a821499ba5a780b0047ecc75105d43a6b61", size = 41183156, upload-time = "2025-10-13T12:27:21.574Z" },
-    { url = "https://files.pythonhosted.org/packages/85/c6/a039a0979d0c278e1bed6758d5a6186416c3ccb8081970df893fdf9a0d99/av-16.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7a3f1a36b550adadd7513f4f5ee956f9e06b01a88e59f3150ef5fec6879d6f79", size = 42302331, upload-time = "2025-10-13T12:27:26.953Z" },
-    { url = "https://files.pythonhosted.org/packages/18/7b/2ca4a9e3609ff155436dac384e360f530919cb1e328491f7df294be0f0dc/av-16.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c6de794abe52b8c0be55d8bb09ade05905efa74b1a5ab4860b4b9c2bfb6578bf", size = 32462194, upload-time = "2025-10-13T12:27:32.942Z" },
-    { url = "https://files.pythonhosted.org/packages/14/9a/6d17e379906cf53a7a44dfac9cf7e4b2e7df2082ba2dbf07126055effcc1/av-16.0.1-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:4b55ba69a943ae592ad7900da67129422954789de9dc384685d6b529925f542e", size = 27167101, upload-time = "2025-10-13T12:27:38.886Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/34/891816cd82d5646cb5a51d201d20be0a578232536d083b7d939734258067/av-16.0.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:d4a0c47b6c9bbadad8909b82847f5fe64a608ad392f0b01704e427349bcd9a47", size = 21722708, upload-time = "2025-10-13T12:27:43.29Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/20/c24ad34038423ab8c9728cef3301e0861727c188442dcfd70a4a10834c63/av-16.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:8bba52f3035708456f6b1994d10b0371b45cfd8f917b5e84ff81aef4ec2f08bf", size = 38638842, upload-time = "2025-10-13T12:27:49.776Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/32/034412309572ba3ad713079d07a3ffc13739263321aece54a3055d7a4f1f/av-16.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:08e34c7e7b5e55e29931180bbe21095e1874ac120992bf6b8615d39574487617", size = 40197789, upload-time = "2025-10-13T12:27:55.688Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/9c/40496298c32f9094e7df28641c5c58aa6fb07554dc232a9ac98a9894376f/av-16.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0d6250ab9db80c641b299987027c987f14935ea837ea4c02c5f5182f6b69d9e5", size = 39980829, upload-time = "2025-10-13T12:28:01.507Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/7e/5c38268ac1d424f309b13b2de4597ad28daea6039ee5af061e62918b12a8/av-16.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7b621f28d8bcbb07cdcd7b18943ddc040739ad304545715ae733873b6e1b739d", size = 41205928, upload-time = "2025-10-13T12:28:08.431Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/07/3176e02692d8753a6c4606021c60e4031341afb56292178eee633b6760a4/av-16.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:92101f49082392580c9dba4ba2fe5b931b3bb0fb75a1a848bfb9a11ded68be91", size = 32272836, upload-time = "2025-10-13T12:28:13.405Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/47/10e03b88de097385d1550cbb6d8de96159131705c13adb92bd9b7e677425/av-16.0.1-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:07c464bf2bc362a154eccc82e235ef64fd3aaf8d76fc8ed63d0ae520943c6d3f", size = 27248864, upload-time = "2025-10-13T12:28:17.467Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/60/7447f206bec3e55e81371f1989098baa2fe9adb7b46c149e6937b7e7c1ca/av-16.0.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:750da0673864b669c95882c7b25768cd93ece0e47010d74ebcc29dbb14d611f8", size = 21828185, upload-time = "2025-10-13T12:28:21.461Z" },
-    { url = "https://files.pythonhosted.org/packages/68/48/ee2680e7a01bc4911bbe902b814346911fa2528697a44f3043ee68e0f07e/av-16.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0b7c0d060863b2e341d07cd26851cb9057b7979814148b028fb7ee5d5eb8772d", size = 40040572, upload-time = "2025-10-13T12:28:26.585Z" },
-    { url = "https://files.pythonhosted.org/packages/da/68/2c43d28871721ae07cde432d6e36ae2f7035197cbadb43764cc5bf3d4b33/av-16.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e67c2eca6023ca7d76b0709c5f392b23a5defba499f4c262411f8155b1482cbd", size = 41344288, upload-time = "2025-10-13T12:28:32.512Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/7f/1d801bff43ae1af4758c45eee2eaae64f303bbb460e79f352f08587fd179/av-16.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e3243d54d84986e8fbdc1946db634b0c41fe69b6de35a99fa8b763e18503d040", size = 41175142, upload-time = "2025-10-13T12:28:38.356Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/06/bb363138687066bbf8997c1433dbd9c81762bae120955ea431fb72d69d26/av-16.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bcf73efab5379601e6510abd7afe5f397d0f6defe69b1610c2f37a4a17996b", size = 42293932, upload-time = "2025-10-13T12:28:43.442Z" },
-    { url = "https://files.pythonhosted.org/packages/92/15/5e713098a085f970ccf88550194d277d244464d7b3a7365ad92acb4b6dc1/av-16.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6368d4ff153d75469d2a3217bc403630dc870a72fe0a014d9135de550d731a86", size = 32460624, upload-time = "2025-10-13T12:28:48.767Z" },
+version = "16.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/78/cd/3a83ffbc3cc25b39721d174487fb0d51a76582f4a1703f98e46170ce83d4/av-16.1.0.tar.gz", hash = "sha256:a094b4fd87a3721dacf02794d3d2c82b8d712c85b9534437e82a8a978c175ffd", size = 4285203, upload-time = "2026-01-11T07:31:33.772Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/97/51/2217a9249409d2e88e16e3f16f7c0def9fd3e7ffc4238b2ec211f9935bdb/av-16.1.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:2395748b0c34fe3a150a1721e4f3d4487b939520991b13e7b36f8926b3b12295", size = 26942590, upload-time = "2026-01-09T20:17:58.588Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/cd/a7070f4febc76a327c38808e01e2ff6b94531fe0b321af54ea3915165338/av-16.1.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:72d7ac832710a158eeb7a93242370aa024a7646516291c562ee7f14a7ea881fd", size = 21507910, upload-time = "2026-01-09T20:18:02.309Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/30/ec812418cd9b297f0238fe20eb0747d8a8b68d82c5f73c56fe519a274143/av-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6cbac833092e66b6b0ac4d81ab077970b8ca874951e9c3974d41d922aaa653ed", size = 38738309, upload-time = "2026-01-09T20:18:04.701Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/b8/6c5795bf1f05f45c5261f8bce6154e0e5e86b158a6676650ddd77c28805e/av-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:eb990672d97c18f99c02f31c8d5750236f770ffe354b5a52c5f4d16c5e65f619", size = 40293006, upload-time = "2026-01-09T20:18:07.238Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/44/5e183bcb9333fc3372ee6e683be8b0c9b515a506894b2d32ff465430c074/av-16.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05ad70933ac3b8ef896a820ea64b33b6cca91a5fac5259cb9ba7fa010435be15", size = 40123516, upload-time = "2026-01-09T20:18:09.955Z" },
+    { url = "https://files.pythonhosted.org/packages/12/1d/b5346d582a3c3d958b4d26a2cc63ce607233582d956121eb20d2bbe55c2e/av-16.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d831a1062a3c47520bf99de6ec682bd1d64a40dfa958e5457bb613c5270e7ce3", size = 41463289, upload-time = "2026-01-09T20:18:12.459Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/31/acc946c0545f72b8d0d74584cb2a0ade9b7dfe2190af3ef9aa52a2e3c0b1/av-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:358ab910fef3c5a806c55176f2b27e5663b33c4d0a692dafeb049c6ed71f8aff", size = 31754959, upload-time = "2026-01-09T20:18:14.718Z" },
+    { url = "https://files.pythonhosted.org/packages/48/d0/b71b65d1b36520dcb8291a2307d98b7fc12329a45614a303ff92ada4d723/av-16.1.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:e88ad64ee9d2b9c4c5d891f16c22ae78e725188b8926eb88187538d9dd0b232f", size = 26927747, upload-time = "2026-01-09T20:18:16.976Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/79/720a5a6ccdee06eafa211b945b0a450e3a0b8fc3d12922f0f3c454d870d2/av-16.1.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cb296073fa6935724de72593800ba86ae49ed48af03960a4aee34f8a611f442b", size = 21492232, upload-time = "2026-01-09T20:18:19.266Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/4f/a1ba8d922f2f6d1a3d52419463ef26dd6c4d43ee364164a71b424b5ae204/av-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:720edd4d25aa73723c1532bb0597806d7b9af5ee34fc02358782c358cfe2f879", size = 39291737, upload-time = "2026-01-09T20:18:21.513Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/31/fc62b9fe8738d2693e18d99f040b219e26e8df894c10d065f27c6b4f07e3/av-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c7f2bc703d0df260a1fdf4de4253c7f5500ca9fc57772ea241b0cb241bcf972e", size = 40846822, upload-time = "2026-01-09T20:18:24.275Z" },
+    { url = "https://files.pythonhosted.org/packages/53/10/ab446583dbce730000e8e6beec6ec3c2753e628c7f78f334a35cad0317f4/av-16.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d69c393809babada7d54964d56099e4b30a3e1f8b5736ca5e27bd7be0e0f3c83", size = 40675604, upload-time = "2026-01-09T20:18:26.866Z" },
+    { url = "https://files.pythonhosted.org/packages/31/d7/1003be685277005f6d63fd9e64904ee222fe1f7a0ea70af313468bb597db/av-16.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:441892be28582356d53f282873c5a951592daaf71642c7f20165e3ddcb0b4c63", size = 42015955, upload-time = "2026-01-09T20:18:29.461Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/4a/fa2a38ee9306bf4579f556f94ecbc757520652eb91294d2a99c7cf7623b9/av-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:273a3e32de64819e4a1cd96341824299fe06f70c46f2288b5dc4173944f0fd62", size = 31750339, upload-time = "2026-01-09T20:18:32.249Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/84/2535f55edcd426cebec02eb37b811b1b0c163f26b8d3f53b059e2ec32665/av-16.1.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:640f57b93f927fba8689f6966c956737ee95388a91bd0b8c8b5e0481f73513d6", size = 26945785, upload-time = "2026-01-09T20:18:34.486Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/17/ffb940c9e490bf42e86db4db1ff426ee1559cd355a69609ec1efe4d3a9eb/av-16.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ae3fb658eec00852ebd7412fdc141f17f3ddce8afee2d2e1cf366263ad2a3b35", size = 21481147, upload-time = "2026-01-09T20:18:36.716Z" },
+    { url = "https://files.pythonhosted.org/packages/15/c1/e0d58003d2d83c3921887d5c8c9b8f5f7de9b58dc2194356a2656a45cfdc/av-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ee558d9c02a142eebcbe55578a6d817fedfde42ff5676275504e16d07a7f86", size = 39517197, upload-time = "2026-01-11T09:57:31.937Z" },
+    { url = "https://files.pythonhosted.org/packages/32/77/787797b43475d1b90626af76f80bfb0c12cfec5e11eafcfc4151b8c80218/av-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7ae547f6d5fa31763f73900d43901e8c5fa6367bb9a9840978d57b5a7ae14ed2", size = 41174337, upload-time = "2026-01-11T09:57:35.792Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/ac/d90df7f1e3b97fc5554cf45076df5045f1e0a6adf13899e10121229b826c/av-16.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8cf065f9d438e1921dc31fc7aa045790b58aee71736897866420d80b5450f62a", size = 40817720, upload-time = "2026-01-11T09:57:39.039Z" },
+    { url = "https://files.pythonhosted.org/packages/80/6f/13c3a35f9dbcebafd03fe0c4cbd075d71ac8968ec849a3cfce406c35a9d2/av-16.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a345877a9d3cc0f08e2bc4ec163ee83176864b92587afb9d08dff50f37a9a829", size = 42267396, upload-time = "2026-01-11T09:57:42.115Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/b9/275df9607f7fb44317ccb1d4be74827185c0d410f52b6e2cd770fe209118/av-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f49243b1d27c91cd8c66fdba90a674e344eb8eb917264f36117bf2b6879118fd", size = 31752045, upload-time = "2026-01-11T09:57:45.106Z" },
+    { url = "https://files.pythonhosted.org/packages/75/2a/63797a4dde34283dd8054219fcb29294ba1c25d68ba8c8c8a6ae53c62c45/av-16.1.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:ce2a1b3d8bf619f6c47a9f28cfa7518ff75ddd516c234a4ee351037b05e6a587", size = 26916715, upload-time = "2026-01-11T09:57:47.682Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/c4/0b49cf730d0ae8cda925402f18ae814aef351f5772d14da72dd87ff66448/av-16.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:408dbe6a2573ca58a855eb8cd854112b33ea598651902c36709f5f84c991ed8e", size = 21452167, upload-time = "2026-01-11T09:57:50.606Z" },
+    { url = "https://files.pythonhosted.org/packages/51/23/408806503e8d5d840975aad5699b153aaa21eb6de41ade75248a79b7a37f/av-16.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:57f657f86652a160a8a01887aaab82282f9e629abf94c780bbdbb01595d6f0f7", size = 39215659, upload-time = "2026-01-11T09:57:53.757Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/19/a8528d5bba592b3903f44c28dab9cc653c95fcf7393f382d2751a1d1523e/av-16.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:adbad2b355c2ee4552cac59762809d791bda90586d134a33c6f13727fb86cb3a", size = 40874970, upload-time = "2026-01-11T09:57:56.802Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/24/2dbcdf0e929ad56b7df078e514e7bd4ca0d45cba798aff3c8caac097d2f7/av-16.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f42e1a68ec2aebd21f7eb6895be69efa6aa27eec1670536876399725bbda4b99", size = 40530345, upload-time = "2026-01-11T09:58:00.421Z" },
+    { url = "https://files.pythonhosted.org/packages/54/27/ae91b41207f34e99602d1c72ab6ffd9c51d7c67e3fbcd4e3a6c0e54f882c/av-16.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58fe47aeaef0f100c40ec8a5de9abbd37f118d3ca03829a1009cf288e9aef67c", size = 41972163, upload-time = "2026-01-11T09:58:03.756Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/7a/22158fb923b2a9a00dfab0e96ef2e8a1763a94dd89e666a5858412383d46/av-16.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:565093ebc93b2f4b76782589564869dadfa83af5b852edebedd8fee746457d06", size = 31729230, upload-time = "2026-01-11T09:58:07.254Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/f1/878f8687d801d6c4565d57ebec08449c46f75126ebca8e0fed6986599627/av-16.1.0-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:574081a24edb98343fd9f473e21ae155bf61443d4ec9d7708987fa597d6b04b2", size = 27008769, upload-time = "2026-01-11T09:58:10.266Z" },
+    { url = "https://files.pythonhosted.org/packages/30/f1/bd4ce8c8b5cbf1d43e27048e436cbc9de628d48ede088a1d0a993768eb86/av-16.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:9ab00ea29c25ebf2ea1d1e928d7babb3532d562481c5d96c0829212b70756ad0", size = 21590588, upload-time = "2026-01-11T09:58:12.629Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/dd/c81f6f9209201ff0b5d5bed6da6c6e641eef52d8fbc930d738c3f4f6f75d/av-16.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a84a91188c1071f238a9523fd42dbe567fb2e2607b22b779851b2ce0eac1b560", size = 40638029, upload-time = "2026-01-11T09:58:15.399Z" },
+    { url = "https://files.pythonhosted.org/packages/15/4d/07edff82b78d0459a6e807e01cd280d3180ce832efc1543de80d77676722/av-16.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c2cd0de4dd022a7225ff224fde8e7971496d700be41c50adaaa26c07bb50bf97", size = 41970776, upload-time = "2026-01-11T09:58:19.075Z" },
+    { url = "https://files.pythonhosted.org/packages/da/9d/1f48b354b82fa135d388477cd1b11b81bdd4384bd6a42a60808e2ec2d66b/av-16.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0816143530624a5a93bc5494f8c6eeaf77549b9366709c2ac8566c1e9bff6df5", size = 41764751, upload-time = "2026-01-11T09:58:22.788Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/c7/a509801e98db35ec552dd79da7bdbcff7104044bfeb4c7d196c1ce121593/av-16.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e3a28053af29644696d0c007e897d19b1197585834660a54773e12a40b16974c", size = 43034355, upload-time = "2026-01-11T09:58:26.125Z" },
+    { url = "https://files.pythonhosted.org/packages/36/8b/e5f530d9e8f640da5f5c5f681a424c65f9dd171c871cd255d8a861785a6e/av-16.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e3e67144a202b95ed299d165232533989390a9ea3119d37eccec697dc6dbb0c", size = 31947047, upload-time = "2026-01-11T09:58:31.867Z" },
+    { url = "https://files.pythonhosted.org/packages/df/18/8812221108c27d19f7e5f486a82c827923061edf55f906824ee0fcaadf50/av-16.1.0-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:39a634d8e5a87e78ea80772774bfd20c0721f0d633837ff185f36c9d14ffede4", size = 26916179, upload-time = "2026-01-11T09:58:36.506Z" },
+    { url = "https://files.pythonhosted.org/packages/38/ef/49d128a9ddce42a2766fe2b6595bd9c49e067ad8937a560f7838a541464e/av-16.1.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0ba32fb9e9300948a7fa9f8a3fc686e6f7f77599a665c71eb2118fdfd2c743f9", size = 21460168, upload-time = "2026-01-11T09:58:39.231Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/a9/b310d390844656fa74eeb8c2750e98030877c75b97551a23a77d3f982741/av-16.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:ca04d17815182d34ce3edc53cbda78a4f36e956c0fd73e3bab249872a831c4d7", size = 39210194, upload-time = "2026-01-11T09:58:42.138Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/7b/e65aae179929d0f173af6e474ad1489b5b5ad4c968a62c42758d619e54cf/av-16.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ee0e8de2e124a9ef53c955fe2add6ee7c56cc8fd83318265549e44057db77142", size = 40811675, upload-time = "2026-01-11T09:58:45.871Z" },
+    { url = "https://files.pythonhosted.org/packages/54/3f/5d7edefd26b6a5187d6fac0f5065ee286109934f3dea607ef05e53f05b31/av-16.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22bf77a2f658827043a1e184b479c3bf25c4c43ab32353677df2d119f080e28f", size = 40543942, upload-time = "2026-01-11T09:58:49.759Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/24/f8b17897b67be0900a211142f5646a99d896168f54d57c81f3e018853796/av-16.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2dd419d262e6a71cab206d80bbf28e0a10d0f227b671cdf5e854c028faa2d043", size = 41924336, upload-time = "2026-01-11T09:58:53.344Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/cf/d32bc6bbbcf60b65f6510c54690ed3ae1c4ca5d9fafbce835b6056858686/av-16.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:53585986fd431cd436f290fba662cfb44d9494fbc2949a183de00acc5b33fa88", size = 31735077, upload-time = "2026-01-11T09:58:56.684Z" },
+    { url = "https://files.pythonhosted.org/packages/53/f4/9b63dc70af8636399bd933e9df4f3025a0294609510239782c1b746fc796/av-16.1.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:76f5ed8495cf41e1209a5775d3699dc63fdc1740b94a095e2485f13586593205", size = 27014423, upload-time = "2026-01-11T09:58:59.703Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/da/787a07a0d6ed35a0888d7e5cfb8c2ffa202f38b7ad2c657299fac08eb046/av-16.1.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8d55397190f12a1a3ae7538be58c356cceb2bf50df1b33523817587748ce89e5", size = 21595536, upload-time = "2026-01-11T09:59:02.508Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/f4/9a7d8651a611be6e7e3ab7b30bb43779899c8cac5f7293b9fb634c44a3f3/av-16.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:9d51d9037437218261b4bbf9df78a95e216f83d7774fbfe8d289230b5b2e28e2", size = 40642490, upload-time = "2026-01-11T09:59:05.842Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/e4/eb79bc538a94b4ff93cd4237d00939cba797579f3272490dd0144c165a21/av-16.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0ce07a89c15644407f49d942111ca046e323bbab0a9078ff43ee57c9b4a50dad", size = 41976905, upload-time = "2026-01-11T09:59:09.169Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/f5/f6db0dd86b70167a4d55ee0d9d9640983c570d25504f2bde42599f38241e/av-16.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:cac0c074892ea97113b53556ff41c99562db7b9f09f098adac1f08318c2acad5", size = 41770481, upload-time = "2026-01-11T09:59:12.74Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/8b/33651d658e45e16ab7671ea5fcf3d20980ea7983234f4d8d0c63c65581a5/av-16.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7dec3dcbc35a187ce450f65a2e0dda820d5a9e6553eea8344a1459af11c98649", size = 43036824, upload-time = "2026-01-11T09:59:16.507Z" },
+    { url = "https://files.pythonhosted.org/packages/83/41/7f13361db54d7e02f11552575c0384dadaf0918138f4eaa82ea03a9f9580/av-16.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6f90dc082ff2068ddbe77618400b44d698d25d9c4edac57459e250c16b33d700", size = 31948164, upload-time = "2026-01-11T09:59:19.501Z" },
 ]
 
 [[package]]
 name = "babel"
-version = "2.17.0"
+version = "2.18.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
+    { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" },
 ]
 
 [[package]]
@@ -484,15 +545,15 @@ wheels = [
 
 [[package]]
 name = "beautifulsoup4"
-version = "4.14.2"
+version = "4.14.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "soupsieve" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822, upload-time = "2025-09-29T10:05:42.613Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
 ]
 
 [[package]]
@@ -636,16 +697,53 @@ wheels = [
 name = "botocore"
 version = "1.41.5"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "jmespath" },
-    { name = "python-dateutil" },
-    { name = "urllib3" },
+    { name = "jmespath", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "python-dateutil", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "urllib3", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/90/22/7fe08c726a2e3b11a0aef8bf177e83891c9cb2dc1809d35c9ed91a9e60e6/botocore-1.41.5.tar.gz", hash = "sha256:0367622b811597d183bfcaab4a350f0d3ede712031ce792ef183cabdee80d3bf", size = 14668152, upload-time = "2025-11-26T20:27:38.026Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/4e/4e/21cd0b8f365449f1576f93de1ec8718ed18a7a3bc086dfbdeb79437bba7a/botocore-1.41.5-py3-none-any.whl", hash = "sha256:3fef7fcda30c82c27202d232cfdbd6782cb27f20f8e7e21b20606483e66ee73a", size = 14337008, upload-time = "2025-11-26T20:27:35.208Z" },
 ]
 
+[[package]]
+name = "botocore"
+version = "1.42.49"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "jmespath", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "python-dateutil", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "urllib3", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c5/95/c3a3765ab65073695161e7180d631428cb6e67c18d97e8897871dfe51fcc/botocore-1.42.49.tar.gz", hash = "sha256:333115a64a507697b0c450ade7e2d82bc8b4e21c0051542514532b455712bdcc", size = 14958380, upload-time = "2026-02-13T20:29:47.218Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d6/cd/7e7ceeff26889d1fd923f069381e3b2b85ff6d46c6fd1409ed8f486cc06f/botocore-1.42.49-py3-none-any.whl", hash = "sha256:1c33544f72101eed4ccf903ebb667a803e14e25b2af4e0836e4b871da1c0af37", size = 14630510, upload-time = "2026-02-13T20:29:43.086Z" },
+]
+
 [[package]]
 name = "braceexpand"
 version = "0.1.7"
@@ -675,22 +773,22 @@ wheels = [
 
 [[package]]
 name = "causal-conv1d"
-version = "1.5.3.post1"
+version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ninja" },
     { name = "packaging" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/64/cb/104778c728dc3d5ea3bf65a484e3a4cdbe894bdaa2586320e2f61d007b8c/causal_conv1d-1.5.3.post1.tar.gz", hash = "sha256:aba1b717484472d0b2f2e40520a1c03f35fe5155555bd753d1c324afc56ba468", size = 24198, upload-time = "2025-10-10T10:16:23.921Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/db/df/63a384c49743b9fc8fec4c05dbd0b515e1c1c2b07e4559acc4fc37c69223/causal_conv1d-1.6.0.tar.gz", hash = "sha256:4eae3220d08e1e88238f3a0a88783147cbdf47f612cc610add75127c7a37ca3e", size = 29356, upload-time = "2026-01-12T17:33:32.794Z" }
 
 [[package]]
 name = "certifi"
-version = "2025.11.12"
+version = "2026.1.4"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" },
 ]
 
 [[package]]
@@ -698,7 +796,7 @@ name = "cffi"
 version = "2.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pycparser", marker = "implementation_name != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pycparser", marker = "implementation_name != 'PyPy'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
 wheels = [
@@ -869,7 +967,7 @@ name = "click"
 version = "8.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" }
 wheels = [
@@ -894,6 +992,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
 
+[[package]]
+name = "colorful"
+version = "0.5.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/31/109ef4bedeb32b4202e02ddb133162457adc4eb890a9ed9c05c9dd126ed0/colorful-0.5.8.tar.gz", hash = "sha256:bb16502b198be2f1c42ba3c52c703d5f651d826076817185f0294c1a549a7445", size = 209361, upload-time = "2025-10-29T11:53:21.663Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/11/25cdf9d5fc21efd30134fc74c43702c6f7ef09ebae8ed927f1283403ad8d/colorful-0.5.8-py2.py3-none-any.whl", hash = "sha256:a9381fdda3337fbaba5771991020abc69676afa102646650b759927892875992", size = 201334, upload-time = "2025-10-29T11:53:20.251Z" },
+]
+
 [[package]]
 name = "contextlib2"
 version = "21.6.0"
@@ -905,101 +1015,115 @@ wheels = [
 
 [[package]]
 name = "coverage"
-version = "7.12.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/89/26/4a96807b193b011588099c3b5c89fbb05294e5b90e71018e065465f34eb6/coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c", size = 819341, upload-time = "2025-11-18T13:34:20.766Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/26/4a/0dc3de1c172d35abe512332cfdcc43211b6ebce629e4cc42e6cd25ed8f4d/coverage-7.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:32b75c2ba3f324ee37af3ccee5b30458038c50b349ad9b88cee85096132a575b", size = 217409, upload-time = "2025-11-18T13:31:53.122Z" },
-    { url = "https://files.pythonhosted.org/packages/01/c3/086198b98db0109ad4f84241e8e9ea7e5fb2db8c8ffb787162d40c26cc76/coverage-7.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb2a1b6ab9fe833714a483a915de350abc624a37149649297624c8d57add089c", size = 217927, upload-time = "2025-11-18T13:31:54.458Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/5f/34614dbf5ce0420828fc6c6f915126a0fcb01e25d16cf141bf5361e6aea6/coverage-7.12.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5734b5d913c3755e72f70bf6cc37a0518d4f4745cde760c5d8e12005e62f9832", size = 244678, upload-time = "2025-11-18T13:31:55.805Z" },
-    { url = "https://files.pythonhosted.org/packages/55/7b/6b26fb32e8e4a6989ac1d40c4e132b14556131493b1d06bc0f2be169c357/coverage-7.12.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b527a08cdf15753279b7afb2339a12073620b761d79b81cbe2cdebdb43d90daa", size = 246507, upload-time = "2025-11-18T13:31:57.05Z" },
-    { url = "https://files.pythonhosted.org/packages/06/42/7d70e6603d3260199b90fb48b537ca29ac183d524a65cc31366b2e905fad/coverage-7.12.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9bb44c889fb68004e94cab71f6a021ec83eac9aeabdbb5a5a88821ec46e1da73", size = 248366, upload-time = "2025-11-18T13:31:58.362Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/4a/d86b837923878424c72458c5b25e899a3c5ca73e663082a915f5b3c4d749/coverage-7.12.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4b59b501455535e2e5dde5881739897967b272ba25988c89145c12d772810ccb", size = 245366, upload-time = "2025-11-18T13:31:59.572Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/c2/2adec557e0aa9721875f06ced19730fdb7fc58e31b02b5aa56f2ebe4944d/coverage-7.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d8842f17095b9868a05837b7b1b73495293091bed870e099521ada176aa3e00e", size = 246408, upload-time = "2025-11-18T13:32:00.784Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/4b/8bd1f1148260df11c618e535fdccd1e5aaf646e55b50759006a4f41d8a26/coverage-7.12.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c5a6f20bf48b8866095c6820641e7ffbe23f2ac84a2efc218d91235e404c7777", size = 244416, upload-time = "2025-11-18T13:32:01.963Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/13/3a248dd6a83df90414c54a4e121fd081fb20602ca43955fbe1d60e2312a9/coverage-7.12.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5f3738279524e988d9da2893f307c2093815c623f8d05a8f79e3eff3a7a9e553", size = 244681, upload-time = "2025-11-18T13:32:03.408Z" },
-    { url = "https://files.pythonhosted.org/packages/76/30/aa833827465a5e8c938935f5d91ba055f70516941078a703740aaf1aa41f/coverage-7.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0d68c1f7eabbc8abe582d11fa393ea483caf4f44b0af86881174769f185c94d", size = 245300, upload-time = "2025-11-18T13:32:04.686Z" },
-    { url = "https://files.pythonhosted.org/packages/38/24/f85b3843af1370fb3739fa7571819b71243daa311289b31214fe3e8c9d68/coverage-7.12.0-cp310-cp310-win32.whl", hash = "sha256:7670d860e18b1e3ee5930b17a7d55ae6287ec6e55d9799982aa103a2cc1fa2ef", size = 220008, upload-time = "2025-11-18T13:32:05.806Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/a2/c7da5b9566f7164db9eefa133d17761ecb2c2fde9385d754e5b5c80f710d/coverage-7.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:f999813dddeb2a56aab5841e687b68169da0d3f6fc78ccf50952fa2463746022", size = 220943, upload-time = "2025-11-18T13:32:07.166Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/0c/0dfe7f0487477d96432e4815537263363fb6dd7289743a796e8e51eabdf2/coverage-7.12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa124a3683d2af98bd9d9c2bfa7a5076ca7e5ab09fdb96b81fa7d89376ae928f", size = 217535, upload-time = "2025-11-18T13:32:08.812Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/f5/f9a4a053a5bbff023d3bec259faac8f11a1e5a6479c2ccf586f910d8dac7/coverage-7.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d93fbf446c31c0140208dcd07c5d882029832e8ed7891a39d6d44bd65f2316c3", size = 218044, upload-time = "2025-11-18T13:32:10.329Z" },
-    { url = "https://files.pythonhosted.org/packages/95/c5/84fc3697c1fa10cd8571919bf9693f693b7373278daaf3b73e328d502bc8/coverage-7.12.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:52ca620260bd8cd6027317bdd8b8ba929be1d741764ee765b42c4d79a408601e", size = 248440, upload-time = "2025-11-18T13:32:12.536Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/36/2d93fbf6a04670f3874aed397d5a5371948a076e3249244a9e84fb0e02d6/coverage-7.12.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f3433ffd541380f3a0e423cff0f4926d55b0cc8c1d160fdc3be24a4c03aa65f7", size = 250361, upload-time = "2025-11-18T13:32:13.852Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/49/66dc65cc456a6bfc41ea3d0758c4afeaa4068a2b2931bf83be6894cf1058/coverage-7.12.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f7bbb321d4adc9f65e402c677cd1c8e4c2d0105d3ce285b51b4d87f1d5db5245", size = 252472, upload-time = "2025-11-18T13:32:15.068Z" },
-    { url = "https://files.pythonhosted.org/packages/35/1f/ebb8a18dffd406db9fcd4b3ae42254aedcaf612470e8712f12041325930f/coverage-7.12.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:22a7aade354a72dff3b59c577bfd18d6945c61f97393bc5fb7bd293a4237024b", size = 248592, upload-time = "2025-11-18T13:32:16.328Z" },
-    { url = "https://files.pythonhosted.org/packages/da/a8/67f213c06e5ea3b3d4980df7dc344d7fea88240b5fe878a5dcbdfe0e2315/coverage-7.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3ff651dcd36d2fea66877cd4a82de478004c59b849945446acb5baf9379a1b64", size = 250167, upload-time = "2025-11-18T13:32:17.687Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/00/e52aef68154164ea40cc8389c120c314c747fe63a04b013a5782e989b77f/coverage-7.12.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:31b8b2e38391a56e3cea39d22a23faaa7c3fc911751756ef6d2621d2a9daf742", size = 248238, upload-time = "2025-11-18T13:32:19.2Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/a4/4d88750bcf9d6d66f77865e5a05a20e14db44074c25fd22519777cb69025/coverage-7.12.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:297bc2da28440f5ae51c845a47c8175a4db0553a53827886e4fb25c66633000c", size = 247964, upload-time = "2025-11-18T13:32:21.027Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/6b/b74693158899d5b47b0bf6238d2c6722e20ba749f86b74454fac0696bb00/coverage-7.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ff7651cc01a246908eac162a6a86fc0dbab6de1ad165dfb9a1e2ec660b44984", size = 248862, upload-time = "2025-11-18T13:32:22.304Z" },
-    { url = "https://files.pythonhosted.org/packages/18/de/6af6730227ce0e8ade307b1cc4a08e7f51b419a78d02083a86c04ccceb29/coverage-7.12.0-cp311-cp311-win32.whl", hash = "sha256:313672140638b6ddb2c6455ddeda41c6a0b208298034544cfca138978c6baed6", size = 220033, upload-time = "2025-11-18T13:32:23.714Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/a1/e7f63021a7c4fe20994359fcdeae43cbef4a4d0ca36a5a1639feeea5d9e1/coverage-7.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:a1783ed5bd0d5938d4435014626568dc7f93e3cb99bc59188cc18857c47aa3c4", size = 220966, upload-time = "2025-11-18T13:32:25.599Z" },
-    { url = "https://files.pythonhosted.org/packages/77/e8/deae26453f37c20c3aa0c4433a1e32cdc169bf415cce223a693117aa3ddd/coverage-7.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:4648158fd8dd9381b5847622df1c90ff314efbfc1df4550092ab6013c238a5fc", size = 219637, upload-time = "2025-11-18T13:32:27.265Z" },
-    { url = "https://files.pythonhosted.org/packages/02/bf/638c0427c0f0d47638242e2438127f3c8ee3cfc06c7fdeb16778ed47f836/coverage-7.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:29644c928772c78512b48e14156b81255000dcfd4817574ff69def189bcb3647", size = 217704, upload-time = "2025-11-18T13:32:28.906Z" },
-    { url = "https://files.pythonhosted.org/packages/08/e1/706fae6692a66c2d6b871a608bbde0da6281903fa0e9f53a39ed441da36a/coverage-7.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8638cbb002eaa5d7c8d04da667813ce1067080b9a91099801a0053086e52b736", size = 218064, upload-time = "2025-11-18T13:32:30.161Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/8b/eb0231d0540f8af3ffda39720ff43cb91926489d01524e68f60e961366e4/coverage-7.12.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:083631eeff5eb9992c923e14b810a179798bb598e6a0dd60586819fc23be6e60", size = 249560, upload-time = "2025-11-18T13:32:31.835Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/a1/67fb52af642e974d159b5b379e4d4c59d0ebe1288677fbd04bbffe665a82/coverage-7.12.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:99d5415c73ca12d558e07776bd957c4222c687b9f1d26fa0e1b57e3598bdcde8", size = 252318, upload-time = "2025-11-18T13:32:33.178Z" },
-    { url = "https://files.pythonhosted.org/packages/41/e5/38228f31b2c7665ebf9bdfdddd7a184d56450755c7e43ac721c11a4b8dab/coverage-7.12.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e949ebf60c717c3df63adb4a1a366c096c8d7fd8472608cd09359e1bd48ef59f", size = 253403, upload-time = "2025-11-18T13:32:34.45Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/4b/df78e4c8188f9960684267c5a4897836f3f0f20a20c51606ee778a1d9749/coverage-7.12.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d907ddccbca819afa2cd014bc69983b146cca2735a0b1e6259b2a6c10be1e70", size = 249984, upload-time = "2025-11-18T13:32:35.747Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/51/bb163933d195a345c6f63eab9e55743413d064c291b6220df754075c2769/coverage-7.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b1518ecbad4e6173f4c6e6c4a46e49555ea5679bf3feda5edb1b935c7c44e8a0", size = 251339, upload-time = "2025-11-18T13:32:37.352Z" },
-    { url = "https://files.pythonhosted.org/packages/15/40/c9b29cdb8412c837cdcbc2cfa054547dd83affe6cbbd4ce4fdb92b6ba7d1/coverage-7.12.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:51777647a749abdf6f6fd8c7cffab12de68ab93aab15efc72fbbb83036c2a068", size = 249489, upload-time = "2025-11-18T13:32:39.212Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/da/b3131e20ba07a0de4437a50ef3b47840dfabf9293675b0cd5c2c7f66dd61/coverage-7.12.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:42435d46d6461a3b305cdfcad7cdd3248787771f53fe18305548cba474e6523b", size = 249070, upload-time = "2025-11-18T13:32:40.598Z" },
-    { url = "https://files.pythonhosted.org/packages/70/81/b653329b5f6302c08d683ceff6785bc60a34be9ae92a5c7b63ee7ee7acec/coverage-7.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5bcead88c8423e1855e64b8057d0544e33e4080b95b240c2a355334bb7ced937", size = 250929, upload-time = "2025-11-18T13:32:42.915Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/00/250ac3bca9f252a5fb1338b5ad01331ebb7b40223f72bef5b1b2cb03aa64/coverage-7.12.0-cp312-cp312-win32.whl", hash = "sha256:dcbb630ab034e86d2a0f79aefd2be07e583202f41e037602d438c80044957baa", size = 220241, upload-time = "2025-11-18T13:32:44.665Z" },
-    { url = "https://files.pythonhosted.org/packages/64/1c/77e79e76d37ce83302f6c21980b45e09f8aa4551965213a10e62d71ce0ab/coverage-7.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:2fd8354ed5d69775ac42986a691fbf68b4084278710cee9d7c3eaa0c28fa982a", size = 221051, upload-time = "2025-11-18T13:32:46.008Z" },
-    { url = "https://files.pythonhosted.org/packages/31/f5/641b8a25baae564f9e52cac0e2667b123de961985709a004e287ee7663cc/coverage-7.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:737c3814903be30695b2de20d22bcc5428fdae305c61ba44cdc8b3252984c49c", size = 219692, upload-time = "2025-11-18T13:32:47.372Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/14/771700b4048774e48d2c54ed0c674273702713c9ee7acdfede40c2666747/coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941", size = 217725, upload-time = "2025-11-18T13:32:49.22Z" },
-    { url = "https://files.pythonhosted.org/packages/17/a7/3aa4144d3bcb719bf67b22d2d51c2d577bf801498c13cb08f64173e80497/coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a", size = 218098, upload-time = "2025-11-18T13:32:50.78Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/9c/b846bbc774ff81091a12a10203e70562c91ae71badda00c5ae5b613527b1/coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d", size = 249093, upload-time = "2025-11-18T13:32:52.554Z" },
-    { url = "https://files.pythonhosted.org/packages/76/b6/67d7c0e1f400b32c883e9342de4a8c2ae7c1a0b57c5de87622b7262e2309/coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211", size = 251686, upload-time = "2025-11-18T13:32:54.862Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/75/b095bd4b39d49c3be4bffbb3135fea18a99a431c52dd7513637c0762fecb/coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d", size = 252930, upload-time = "2025-11-18T13:32:56.417Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/f3/466f63015c7c80550bead3093aacabf5380c1220a2a93c35d374cae8f762/coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c", size = 249296, upload-time = "2025-11-18T13:32:58.074Z" },
-    { url = "https://files.pythonhosted.org/packages/27/86/eba2209bf2b7e28c68698fc13437519a295b2d228ba9e0ec91673e09fa92/coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9", size = 251068, upload-time = "2025-11-18T13:32:59.646Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/55/ca8ae7dbba962a3351f18940b359b94c6bafdd7757945fdc79ec9e452dc7/coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0", size = 249034, upload-time = "2025-11-18T13:33:01.481Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/d7/39136149325cad92d420b023b5fd900dabdd1c3a0d1d5f148ef4a8cedef5/coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508", size = 248853, upload-time = "2025-11-18T13:33:02.935Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/b6/76e1add8b87ef60e00643b0b7f8f7bb73d4bf5249a3be19ebefc5793dd25/coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc", size = 250619, upload-time = "2025-11-18T13:33:04.336Z" },
-    { url = "https://files.pythonhosted.org/packages/95/87/924c6dc64f9203f7a3c1832a6a0eee5a8335dbe5f1bdadcc278d6f1b4d74/coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8", size = 220261, upload-time = "2025-11-18T13:33:06.493Z" },
-    { url = "https://files.pythonhosted.org/packages/91/77/dd4aff9af16ff776bf355a24d87eeb48fc6acde54c907cc1ea89b14a8804/coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07", size = 221072, upload-time = "2025-11-18T13:33:07.926Z" },
-    { url = "https://files.pythonhosted.org/packages/70/49/5c9dc46205fef31b1b226a6e16513193715290584317fd4df91cdaf28b22/coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc", size = 219702, upload-time = "2025-11-18T13:33:09.631Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/62/f87922641c7198667994dd472a91e1d9b829c95d6c29529ceb52132436ad/coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87", size = 218420, upload-time = "2025-11-18T13:33:11.153Z" },
-    { url = "https://files.pythonhosted.org/packages/85/dd/1cc13b2395ef15dbb27d7370a2509b4aee77890a464fb35d72d428f84871/coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6", size = 218773, upload-time = "2025-11-18T13:33:12.569Z" },
-    { url = "https://files.pythonhosted.org/packages/74/40/35773cc4bb1e9d4658d4fb669eb4195b3151bef3bbd6f866aba5cd5dac82/coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7", size = 260078, upload-time = "2025-11-18T13:33:14.037Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/ee/231bb1a6ffc2905e396557585ebc6bdc559e7c66708376d245a1f1d330fc/coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560", size = 262144, upload-time = "2025-11-18T13:33:15.601Z" },
-    { url = "https://files.pythonhosted.org/packages/28/be/32f4aa9f3bf0b56f3971001b56508352c7753915345d45fab4296a986f01/coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12", size = 264574, upload-time = "2025-11-18T13:33:17.354Z" },
-    { url = "https://files.pythonhosted.org/packages/68/7c/00489fcbc2245d13ab12189b977e0cf06ff3351cb98bc6beba8bd68c5902/coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296", size = 259298, upload-time = "2025-11-18T13:33:18.958Z" },
-    { url = "https://files.pythonhosted.org/packages/96/b4/f0760d65d56c3bea95b449e02570d4abd2549dc784bf39a2d4721a2d8ceb/coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507", size = 262150, upload-time = "2025-11-18T13:33:20.644Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/71/9a9314df00f9326d78c1e5a910f520d599205907432d90d1c1b7a97aa4b1/coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d", size = 259763, upload-time = "2025-11-18T13:33:22.189Z" },
-    { url = "https://files.pythonhosted.org/packages/10/34/01a0aceed13fbdf925876b9a15d50862eb8845454301fe3cdd1df08b2182/coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2", size = 258653, upload-time = "2025-11-18T13:33:24.239Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/04/81d8fd64928acf1574bbb0181f66901c6c1c6279c8ccf5f84259d2c68ae9/coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455", size = 260856, upload-time = "2025-11-18T13:33:26.365Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/76/fa2a37bfaeaf1f766a2d2360a25a5297d4fb567098112f6517475eee120b/coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d", size = 220936, upload-time = "2025-11-18T13:33:28.165Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/52/60f64d932d555102611c366afb0eb434b34266b1d9266fc2fe18ab641c47/coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c", size = 222001, upload-time = "2025-11-18T13:33:29.656Z" },
-    { url = "https://files.pythonhosted.org/packages/77/df/c303164154a5a3aea7472bf323b7c857fed93b26618ed9fc5c2955566bb0/coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d", size = 220273, upload-time = "2025-11-18T13:33:31.415Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/2e/fc12db0883478d6e12bbd62d481210f0c8daf036102aa11434a0c5755825/coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92", size = 217777, upload-time = "2025-11-18T13:33:32.86Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/c1/ce3e525d223350c6ec16b9be8a057623f54226ef7f4c2fee361ebb6a02b8/coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360", size = 218100, upload-time = "2025-11-18T13:33:34.532Z" },
-    { url = "https://files.pythonhosted.org/packages/15/87/113757441504aee3808cb422990ed7c8bcc2d53a6779c66c5adef0942939/coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac", size = 249151, upload-time = "2025-11-18T13:33:36.135Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/1d/9529d9bd44049b6b05bb319c03a3a7e4b0a8a802d28fa348ad407e10706d/coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d", size = 251667, upload-time = "2025-11-18T13:33:37.996Z" },
-    { url = "https://files.pythonhosted.org/packages/11/bb/567e751c41e9c03dc29d3ce74b8c89a1e3396313e34f255a2a2e8b9ebb56/coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c", size = 253003, upload-time = "2025-11-18T13:33:39.553Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/b3/c2cce2d8526a02fb9e9ca14a263ca6fc074449b33a6afa4892838c903528/coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434", size = 249185, upload-time = "2025-11-18T13:33:42.086Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/a7/967f93bb66e82c9113c66a8d0b65ecf72fc865adfba5a145f50c7af7e58d/coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc", size = 251025, upload-time = "2025-11-18T13:33:43.634Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/b2/f2f6f56337bc1af465d5b2dc1ee7ee2141b8b9272f3bf6213fcbc309a836/coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc", size = 248979, upload-time = "2025-11-18T13:33:46.04Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/7a/bf4209f45a4aec09d10a01a57313a46c0e0e8f4c55ff2965467d41a92036/coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e", size = 248800, upload-time = "2025-11-18T13:33:47.546Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/b7/1e01b8696fb0521810f60c5bbebf699100d6754183e6cc0679bf2ed76531/coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17", size = 250460, upload-time = "2025-11-18T13:33:49.537Z" },
-    { url = "https://files.pythonhosted.org/packages/71/ae/84324fb9cb46c024760e706353d9b771a81b398d117d8c1fe010391c186f/coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933", size = 220533, upload-time = "2025-11-18T13:33:51.16Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/71/1033629deb8460a8f97f83e6ac4ca3b93952e2b6f826056684df8275e015/coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe", size = 221348, upload-time = "2025-11-18T13:33:52.776Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/5f/ac8107a902f623b0c251abdb749be282dc2ab61854a8a4fcf49e276fce2f/coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d", size = 219922, upload-time = "2025-11-18T13:33:54.316Z" },
-    { url = "https://files.pythonhosted.org/packages/79/6e/f27af2d4da367f16077d21ef6fe796c874408219fa6dd3f3efe7751bd910/coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d", size = 218511, upload-time = "2025-11-18T13:33:56.343Z" },
-    { url = "https://files.pythonhosted.org/packages/67/dd/65fd874aa460c30da78f9d259400d8e6a4ef457d61ab052fd248f0050558/coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03", size = 218771, upload-time = "2025-11-18T13:33:57.966Z" },
-    { url = "https://files.pythonhosted.org/packages/55/e0/7c6b71d327d8068cb79c05f8f45bf1b6145f7a0de23bbebe63578fe5240a/coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9", size = 260151, upload-time = "2025-11-18T13:33:59.597Z" },
-    { url = "https://files.pythonhosted.org/packages/49/ce/4697457d58285b7200de6b46d606ea71066c6e674571a946a6ea908fb588/coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6", size = 262257, upload-time = "2025-11-18T13:34:01.166Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/33/acbc6e447aee4ceba88c15528dbe04a35fb4d67b59d393d2e0d6f1e242c1/coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339", size = 264671, upload-time = "2025-11-18T13:34:02.795Z" },
-    { url = "https://files.pythonhosted.org/packages/87/ec/e2822a795c1ed44d569980097be839c5e734d4c0c1119ef8e0a073496a30/coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e", size = 259231, upload-time = "2025-11-18T13:34:04.397Z" },
-    { url = "https://files.pythonhosted.org/packages/72/c5/a7ec5395bb4a49c9b7ad97e63f0c92f6bf4a9e006b1393555a02dae75f16/coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13", size = 262137, upload-time = "2025-11-18T13:34:06.068Z" },
-    { url = "https://files.pythonhosted.org/packages/67/0c/02c08858b764129f4ecb8e316684272972e60777ae986f3865b10940bdd6/coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f", size = 259745, upload-time = "2025-11-18T13:34:08.04Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/04/4fd32b7084505f3829a8fe45c1a74a7a728cb251aaadbe3bec04abcef06d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1", size = 258570, upload-time = "2025-11-18T13:34:09.676Z" },
-    { url = "https://files.pythonhosted.org/packages/48/35/2365e37c90df4f5342c4fa202223744119fe31264ee2924f09f074ea9b6d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b", size = 260899, upload-time = "2025-11-18T13:34:11.259Z" },
-    { url = "https://files.pythonhosted.org/packages/05/56/26ab0464ca733fa325e8e71455c58c1c374ce30f7c04cebb88eabb037b18/coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a", size = 221313, upload-time = "2025-11-18T13:34:12.863Z" },
-    { url = "https://files.pythonhosted.org/packages/da/1c/017a3e1113ed34d998b27d2c6dba08a9e7cb97d362f0ec988fcd873dcf81/coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291", size = 222423, upload-time = "2025-11-18T13:34:15.14Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/36/bcc504fdd5169301b52568802bb1b9cdde2e27a01d39fbb3b4b508ab7c2c/coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384", size = 220459, upload-time = "2025-11-18T13:34:17.222Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/a3/43b749004e3c09452e39bb56347a008f0a0668aad37324a99b5c8ca91d9e/coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a", size = 209503, upload-time = "2025-11-18T13:34:18.892Z" },
+version = "7.13.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/24/56/95b7e30fa389756cb56630faa728da46a27b8c6eb46f9d557c68fff12b65/coverage-7.13.4.tar.gz", hash = "sha256:e5c8f6ed1e61a8b2dcdf31eb0b9bbf0130750ca79c1c49eb898e2ad86f5ccc91", size = 827239, upload-time = "2026-02-09T12:59:03.86Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/d4/7827d9ffa34d5d4d752eec907022aa417120936282fc488306f5da08c292/coverage-7.13.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0fc31c787a84f8cd6027eba44010517020e0d18487064cd3d8968941856d1415", size = 219152, upload-time = "2026-02-09T12:56:11.974Z" },
+    { url = "https://files.pythonhosted.org/packages/35/b0/d69df26607c64043292644dbb9dc54b0856fabaa2cbb1eeee3331cc9e280/coverage-7.13.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a32ebc02a1805adf637fc8dec324b5cdacd2e493515424f70ee33799573d661b", size = 219667, upload-time = "2026-02-09T12:56:13.33Z" },
+    { url = "https://files.pythonhosted.org/packages/82/a4/c1523f7c9e47b2271dbf8c2a097e7a1f89ef0d66f5840bb59b7e8814157b/coverage-7.13.4-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e24f9156097ff9dc286f2f913df3a7f63c0e333dcafa3c196f2c18b4175ca09a", size = 246425, upload-time = "2026-02-09T12:56:14.552Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/02/aa7ec01d1a5023c4b680ab7257f9bfde9defe8fdddfe40be096ac19e8177/coverage-7.13.4-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8041b6c5bfdc03257666e9881d33b1abc88daccaf73f7b6340fb7946655cd10f", size = 248229, upload-time = "2026-02-09T12:56:16.31Z" },
+    { url = "https://files.pythonhosted.org/packages/35/98/85aba0aed5126d896162087ef3f0e789a225697245256fc6181b95f47207/coverage-7.13.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2a09cfa6a5862bc2fc6ca7c3def5b2926194a56b8ab78ffcf617d28911123012", size = 250106, upload-time = "2026-02-09T12:56:18.024Z" },
+    { url = "https://files.pythonhosted.org/packages/96/72/1db59bd67494bc162e3e4cd5fbc7edba2c7026b22f7c8ef1496d58c2b94c/coverage-7.13.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:296f8b0af861d3970c2a4d8c91d48eb4dd4771bcef9baedec6a9b515d7de3def", size = 252021, upload-time = "2026-02-09T12:56:19.272Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/97/72899c59c7066961de6e3daa142d459d47d104956db43e057e034f015c8a/coverage-7.13.4-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e101609bcbbfb04605ea1027b10dc3735c094d12d40826a60f897b98b1c30256", size = 247114, upload-time = "2026-02-09T12:56:21.051Z" },
+    { url = "https://files.pythonhosted.org/packages/39/1f/f1885573b5970235e908da4389176936c8933e86cb316b9620aab1585fa2/coverage-7.13.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:aa3feb8db2e87ff5e6d00d7e1480ae241876286691265657b500886c98f38bda", size = 248143, upload-time = "2026-02-09T12:56:22.585Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/cf/e80390c5b7480b722fa3e994f8202807799b85bc562aa4f1dde209fbb7be/coverage-7.13.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4fc7fa81bbaf5a02801b65346c8b3e657f1d93763e58c0abdf7c992addd81a92", size = 246152, upload-time = "2026-02-09T12:56:23.748Z" },
+    { url = "https://files.pythonhosted.org/packages/44/bf/f89a8350d85572f95412debb0fb9bb4795b1d5b5232bd652923c759e787b/coverage-7.13.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:33901f604424145c6e9c2398684b92e176c0b12df77d52db81c20abd48c3794c", size = 249959, upload-time = "2026-02-09T12:56:25.209Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/6e/612a02aece8178c818df273e8d1642190c4875402ca2ba74514394b27aba/coverage-7.13.4-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:bb28c0f2cf2782508a40cec377935829d5fcc3ad9a3681375af4e84eb34b6b58", size = 246416, upload-time = "2026-02-09T12:56:26.475Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/98/b5afc39af67c2fa6786b03c3a7091fc300947387ce8914b096db8a73d67a/coverage-7.13.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d107aff57a83222ddbd8d9ee705ede2af2cc926608b57abed8ef96b50b7e8f9", size = 247025, upload-time = "2026-02-09T12:56:27.727Z" },
+    { url = "https://files.pythonhosted.org/packages/51/30/2bba8ef0682d5bd210c38fe497e12a06c9f8d663f7025e9f5c2c31ce847d/coverage-7.13.4-cp310-cp310-win32.whl", hash = "sha256:a6f94a7d00eb18f1b6d403c91a88fd58cfc92d4b16080dfdb774afc8294469bf", size = 221758, upload-time = "2026-02-09T12:56:29.051Z" },
+    { url = "https://files.pythonhosted.org/packages/78/13/331f94934cf6c092b8ea59ff868eb587bc8fe0893f02c55bc6c0183a192e/coverage-7.13.4-cp310-cp310-win_amd64.whl", hash = "sha256:2cb0f1e000ebc419632bbe04366a8990b6e32c4e0b51543a6484ffe15eaeda95", size = 222693, upload-time = "2026-02-09T12:56:30.366Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/ad/b59e5b451cf7172b8d1043dc0fa718f23aab379bc1521ee13d4bd9bfa960/coverage-7.13.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d490ba50c3f35dd7c17953c68f3270e7ccd1c6642e2d2afe2d8e720b98f5a053", size = 219278, upload-time = "2026-02-09T12:56:31.673Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/17/0cb7ca3de72e5f4ef2ec2fa0089beafbcaaaead1844e8b8a63d35173d77d/coverage-7.13.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:19bc3c88078789f8ef36acb014d7241961dbf883fd2533d18cb1e7a5b4e28b11", size = 219783, upload-time = "2026-02-09T12:56:33.104Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/63/325d8e5b11e0eaf6d0f6a44fad444ae58820929a9b0de943fa377fe73e85/coverage-7.13.4-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3998e5a32e62fdf410c0dbd3115df86297995d6e3429af80b8798aad894ca7aa", size = 250200, upload-time = "2026-02-09T12:56:34.474Z" },
+    { url = "https://files.pythonhosted.org/packages/76/53/c16972708cbb79f2942922571a687c52bd109a7bd51175aeb7558dff2236/coverage-7.13.4-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8e264226ec98e01a8e1054314af91ee6cde0eacac4f465cc93b03dbe0bce2fd7", size = 252114, upload-time = "2026-02-09T12:56:35.749Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/c2/7ab36d8b8cc412bec9ea2d07c83c48930eb4ba649634ba00cb7e4e0f9017/coverage-7.13.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a3aa4e7b9e416774b21797365b358a6e827ffadaaca81b69ee02946852449f00", size = 254220, upload-time = "2026-02-09T12:56:37.796Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/4d/cf52c9a3322c89a0e6febdfbc83bb45c0ed3c64ad14081b9503adee702e7/coverage-7.13.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:71ca20079dd8f27fcf808817e281e90220475cd75115162218d0e27549f95fef", size = 256164, upload-time = "2026-02-09T12:56:39.016Z" },
+    { url = "https://files.pythonhosted.org/packages/78/e9/eb1dd17bd6de8289df3580e967e78294f352a5df8a57ff4671ee5fc3dcd0/coverage-7.13.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e2f25215f1a359ab17320b47bcdaca3e6e6356652e8256f2441e4ef972052903", size = 250325, upload-time = "2026-02-09T12:56:40.668Z" },
+    { url = "https://files.pythonhosted.org/packages/71/07/8c1542aa873728f72267c07278c5cc0ec91356daf974df21335ccdb46368/coverage-7.13.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d65b2d373032411e86960604dc4edac91fdfb5dca539461cf2cbe78327d1e64f", size = 251913, upload-time = "2026-02-09T12:56:41.97Z" },
+    { url = "https://files.pythonhosted.org/packages/74/d7/c62e2c5e4483a748e27868e4c32ad3daa9bdddbba58e1bc7a15e252baa74/coverage-7.13.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94eb63f9b363180aff17de3e7c8760c3ba94664ea2695c52f10111244d16a299", size = 249974, upload-time = "2026-02-09T12:56:43.323Z" },
+    { url = "https://files.pythonhosted.org/packages/98/9f/4c5c015a6e98ced54efd0f5cf8d31b88e5504ecb6857585fc0161bb1e600/coverage-7.13.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e856bf6616714c3a9fbc270ab54103f4e685ba236fa98c054e8f87f266c93505", size = 253741, upload-time = "2026-02-09T12:56:45.155Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/59/0f4eef89b9f0fcd9633b5d350016f54126ab49426a70ff4c4e87446cabdc/coverage-7.13.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:65dfcbe305c3dfe658492df2d85259e0d79ead4177f9ae724b6fb245198f55d6", size = 249695, upload-time = "2026-02-09T12:56:46.636Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/2c/b7476f938deb07166f3eb281a385c262675d688ff4659ad56c6c6b8e2e70/coverage-7.13.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b507778ae8a4c915436ed5c2e05b4a6cecfa70f734e19c22a005152a11c7b6a9", size = 250599, upload-time = "2026-02-09T12:56:48.13Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/34/c3420709d9846ee3785b9f2831b4d94f276f38884032dca1457fa83f7476/coverage-7.13.4-cp311-cp311-win32.whl", hash = "sha256:784fc3cf8be001197b652d51d3fd259b1e2262888693a4636e18879f613a62a9", size = 221780, upload-time = "2026-02-09T12:56:50.479Z" },
+    { url = "https://files.pythonhosted.org/packages/61/08/3d9c8613079d2b11c185b865de9a4c1a68850cfda2b357fae365cf609f29/coverage-7.13.4-cp311-cp311-win_amd64.whl", hash = "sha256:2421d591f8ca05b308cf0092807308b2facbefe54af7c02ac22548b88b95c98f", size = 222715, upload-time = "2026-02-09T12:56:51.815Z" },
+    { url = "https://files.pythonhosted.org/packages/18/1a/54c3c80b2f056164cc0a6cdcb040733760c7c4be9d780fe655f356f433e4/coverage-7.13.4-cp311-cp311-win_arm64.whl", hash = "sha256:79e73a76b854d9c6088fe5d8b2ebe745f8681c55f7397c3c0a016192d681045f", size = 221385, upload-time = "2026-02-09T12:56:53.194Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/81/4ce2fdd909c5a0ed1f6dedb88aa57ab79b6d1fbd9b588c1ac7ef45659566/coverage-7.13.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02231499b08dabbe2b96612993e5fc34217cdae907a51b906ac7fca8027a4459", size = 219449, upload-time = "2026-02-09T12:56:54.889Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/96/5238b1efc5922ddbdc9b0db9243152c09777804fb7c02ad1741eb18a11c0/coverage-7.13.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40aa8808140e55dc022b15d8aa7f651b6b3d68b365ea0398f1441e0b04d859c3", size = 219810, upload-time = "2026-02-09T12:56:56.33Z" },
+    { url = "https://files.pythonhosted.org/packages/78/72/2f372b726d433c9c35e56377cf1d513b4c16fe51841060d826b95caacec1/coverage-7.13.4-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5b856a8ccf749480024ff3bd7310adaef57bf31fd17e1bfc404b7940b6986634", size = 251308, upload-time = "2026-02-09T12:56:57.858Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/a0/2ea570925524ef4e00bb6c82649f5682a77fac5ab910a65c9284de422600/coverage-7.13.4-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2c048ea43875fbf8b45d476ad79f179809c590ec7b79e2035c662e7afa3192e3", size = 254052, upload-time = "2026-02-09T12:56:59.754Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/ac/45dc2e19a1939098d783c846e130b8f862fbb50d09e0af663988f2f21973/coverage-7.13.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b7b38448866e83176e28086674fe7368ab8590e4610fb662b44e345b86d63ffa", size = 255165, upload-time = "2026-02-09T12:57:01.287Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/4d/26d236ff35abc3b5e63540d3386e4c3b192168c1d96da5cb2f43c640970f/coverage-7.13.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:de6defc1c9badbf8b9e67ae90fd00519186d6ab64e5cc5f3d21359c2a9b2c1d3", size = 257432, upload-time = "2026-02-09T12:57:02.637Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/55/14a966c757d1348b2e19caf699415a2a4c4f7feaa4bbc6326a51f5c7dd1b/coverage-7.13.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7eda778067ad7ffccd23ecffce537dface96212576a07924cbf0d8799d2ded5a", size = 251716, upload-time = "2026-02-09T12:57:04.056Z" },
+    { url = "https://files.pythonhosted.org/packages/77/33/50116647905837c66d28b2af1321b845d5f5d19be9655cb84d4a0ea806b4/coverage-7.13.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e87f6c587c3f34356c3759f0420693e35e7eb0e2e41e4c011cb6ec6ecbbf1db7", size = 253089, upload-time = "2026-02-09T12:57:05.503Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/b4/8efb11a46e3665d92635a56e4f2d4529de6d33f2cb38afd47d779d15fc99/coverage-7.13.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8248977c2e33aecb2ced42fef99f2d319e9904a36e55a8a68b69207fb7e43edc", size = 251232, upload-time = "2026-02-09T12:57:06.879Z" },
+    { url = "https://files.pythonhosted.org/packages/51/24/8cd73dd399b812cc76bb0ac260e671c4163093441847ffe058ac9fda1e32/coverage-7.13.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:25381386e80ae727608e662474db537d4df1ecd42379b5ba33c84633a2b36d47", size = 255299, upload-time = "2026-02-09T12:57:08.245Z" },
+    { url = "https://files.pythonhosted.org/packages/03/94/0a4b12f1d0e029ce1ccc1c800944a9984cbe7d678e470bb6d3c6bc38a0da/coverage-7.13.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:ee756f00726693e5ba94d6df2bdfd64d4852d23b09bb0bc700e3b30e6f333985", size = 250796, upload-time = "2026-02-09T12:57:10.142Z" },
+    { url = "https://files.pythonhosted.org/packages/73/44/6002fbf88f6698ca034360ce474c406be6d5a985b3fdb3401128031eef6b/coverage-7.13.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fdfc1e28e7c7cdce44985b3043bc13bbd9c747520f94a4d7164af8260b3d91f0", size = 252673, upload-time = "2026-02-09T12:57:12.197Z" },
+    { url = "https://files.pythonhosted.org/packages/de/c6/a0279f7c00e786be75a749a5674e6fa267bcbd8209cd10c9a450c655dfa7/coverage-7.13.4-cp312-cp312-win32.whl", hash = "sha256:01d4cbc3c283a17fc1e42d614a119f7f438eabb593391283adca8dc86eff1246", size = 221990, upload-time = "2026-02-09T12:57:14.085Z" },
+    { url = "https://files.pythonhosted.org/packages/77/4e/c0a25a425fcf5557d9abd18419c95b63922e897bc86c1f327f155ef234a9/coverage-7.13.4-cp312-cp312-win_amd64.whl", hash = "sha256:9401ebc7ef522f01d01d45532c68c5ac40fb27113019b6b7d8b208f6e9baa126", size = 222800, upload-time = "2026-02-09T12:57:15.944Z" },
+    { url = "https://files.pythonhosted.org/packages/47/ac/92da44ad9a6f4e3a7debd178949d6f3769bedca33830ce9b1dcdab589a37/coverage-7.13.4-cp312-cp312-win_arm64.whl", hash = "sha256:b1ec7b6b6e93255f952e27ab58fbc68dcc468844b16ecbee881aeb29b6ab4d8d", size = 221415, upload-time = "2026-02-09T12:57:17.497Z" },
+    { url = "https://files.pythonhosted.org/packages/db/23/aad45061a31677d68e47499197a131eea55da4875d16c1f42021ab963503/coverage-7.13.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b66a2da594b6068b48b2692f043f35d4d3693fb639d5ea8b39533c2ad9ac3ab9", size = 219474, upload-time = "2026-02-09T12:57:19.332Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/70/9b8b67a0945f3dfec1fd896c5cefb7c19d5a3a6d74630b99a895170999ae/coverage-7.13.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3599eb3992d814d23b35c536c28df1a882caa950f8f507cef23d1cbf334995ac", size = 219844, upload-time = "2026-02-09T12:57:20.66Z" },
+    { url = "https://files.pythonhosted.org/packages/97/fd/7e859f8fab324cef6c4ad7cff156ca7c489fef9179d5749b0c8d321281c2/coverage-7.13.4-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:93550784d9281e374fb5a12bf1324cc8a963fd63b2d2f223503ef0fd4aa339ea", size = 250832, upload-time = "2026-02-09T12:57:22.007Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/dc/b2442d10020c2f52617828862d8b6ee337859cd8f3a1f13d607dddda9cf7/coverage-7.13.4-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b720ce6a88a2755f7c697c23268ddc47a571b88052e6b155224347389fdf6a3b", size = 253434, upload-time = "2026-02-09T12:57:23.339Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/88/6728a7ad17428b18d836540630487231f5470fb82454871149502f5e5aa2/coverage-7.13.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b322db1284a2ed3aa28ffd8ebe3db91c929b7a333c0820abec3d838ef5b3525", size = 254676, upload-time = "2026-02-09T12:57:24.774Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/bc/21244b1b8cedf0dff0a2b53b208015fe798d5f2a8d5348dbfece04224fff/coverage-7.13.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f4594c67d8a7c89cf922d9df0438c7c7bb022ad506eddb0fdb2863359ff78242", size = 256807, upload-time = "2026-02-09T12:57:26.125Z" },
+    { url = "https://files.pythonhosted.org/packages/97/a0/ddba7ed3251cff51006737a727d84e05b61517d1784a9988a846ba508877/coverage-7.13.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:53d133df809c743eb8bce33b24bcababb371f4441340578cd406e084d94a6148", size = 251058, upload-time = "2026-02-09T12:57:27.614Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/55/e289addf7ff54d3a540526f33751951bf0878f3809b47f6dfb3def69c6f7/coverage-7.13.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:76451d1978b95ba6507a039090ba076105c87cc76fc3efd5d35d72093964d49a", size = 252805, upload-time = "2026-02-09T12:57:29.066Z" },
+    { url = "https://files.pythonhosted.org/packages/13/4e/cc276b1fa4a59be56d96f1dabddbdc30f4ba22e3b1cd42504c37b3313255/coverage-7.13.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7f57b33491e281e962021de110b451ab8a24182589be17e12a22c79047935e23", size = 250766, upload-time = "2026-02-09T12:57:30.522Z" },
+    { url = "https://files.pythonhosted.org/packages/94/44/1093b8f93018f8b41a8cf29636c9292502f05e4a113d4d107d14a3acd044/coverage-7.13.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:1731dc33dc276dafc410a885cbf5992f1ff171393e48a21453b78727d090de80", size = 254923, upload-time = "2026-02-09T12:57:31.946Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/55/ea2796da2d42257f37dbea1aab239ba9263b31bd91d5527cdd6db5efe174/coverage-7.13.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:bd60d4fe2f6fa7dff9223ca1bbc9f05d2b6697bc5961072e5d3b952d46e1b1ea", size = 250591, upload-time = "2026-02-09T12:57:33.842Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/fa/7c4bb72aacf8af5020675aa633e59c1fbe296d22aed191b6a5b711eb2bc7/coverage-7.13.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9181a3ccead280b828fae232df12b16652702b49d41e99d657f46cc7b1f6ec7a", size = 252364, upload-time = "2026-02-09T12:57:35.743Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/38/a8d2ec0146479c20bbaa7181b5b455a0c41101eed57f10dd19a78ab44c80/coverage-7.13.4-cp313-cp313-win32.whl", hash = "sha256:f53d492307962561ac7de4cd1de3e363589b000ab69617c6156a16ba7237998d", size = 222010, upload-time = "2026-02-09T12:57:37.25Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/0c/dbfafbe90a185943dcfbc766fe0e1909f658811492d79b741523a414a6cc/coverage-7.13.4-cp313-cp313-win_amd64.whl", hash = "sha256:e6f70dec1cc557e52df5306d051ef56003f74d56e9c4dd7ddb07e07ef32a84dd", size = 222818, upload-time = "2026-02-09T12:57:38.734Z" },
+    { url = "https://files.pythonhosted.org/packages/04/d1/934918a138c932c90d78301f45f677fb05c39a3112b96fd2c8e60503cdc7/coverage-7.13.4-cp313-cp313-win_arm64.whl", hash = "sha256:fb07dc5da7e849e2ad31a5d74e9bece81f30ecf5a42909d0a695f8bd1874d6af", size = 221438, upload-time = "2026-02-09T12:57:40.223Z" },
+    { url = "https://files.pythonhosted.org/packages/52/57/ee93ced533bcb3e6df961c0c6e42da2fc6addae53fb95b94a89b1e33ebd7/coverage-7.13.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:40d74da8e6c4b9ac18b15331c4b5ebc35a17069410cad462ad4f40dcd2d50c0d", size = 220165, upload-time = "2026-02-09T12:57:41.639Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/e0/969fc285a6fbdda49d91af278488d904dcd7651b2693872f0ff94e40e84a/coverage-7.13.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4223b4230a376138939a9173f1bdd6521994f2aff8047fae100d6d94d50c5a12", size = 220516, upload-time = "2026-02-09T12:57:44.215Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/b8/9531944e16267e2735a30a9641ff49671f07e8138ecf1ca13db9fd2560c7/coverage-7.13.4-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1d4be36a5114c499f9f1f9195e95ebf979460dbe2d88e6816ea202010ba1c34b", size = 261804, upload-time = "2026-02-09T12:57:45.989Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/f3/e63df6d500314a2a60390d1989240d5f27318a7a68fa30ad3806e2a9323e/coverage-7.13.4-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:200dea7d1e8095cc6e98cdabe3fd1d21ab17d3cee6dab00cadbb2fe35d9c15b9", size = 263885, upload-time = "2026-02-09T12:57:47.42Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/67/7654810de580e14b37670b60a09c599fa348e48312db5b216d730857ffe6/coverage-7.13.4-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8eb931ee8e6d8243e253e5ed7336deea6904369d2fd8ae6e43f68abbf167092", size = 266308, upload-time = "2026-02-09T12:57:49.345Z" },
+    { url = "https://files.pythonhosted.org/packages/37/6f/39d41eca0eab3cc82115953ad41c4e77935286c930e8fad15eaed1389d83/coverage-7.13.4-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:75eab1ebe4f2f64d9509b984f9314d4aa788540368218b858dad56dc8f3e5eb9", size = 267452, upload-time = "2026-02-09T12:57:50.811Z" },
+    { url = "https://files.pythonhosted.org/packages/50/6d/39c0fbb8fc5cd4d2090811e553c2108cf5112e882f82505ee7495349a6bf/coverage-7.13.4-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c35eb28c1d085eb7d8c9b3296567a1bebe03ce72962e932431b9a61f28facf26", size = 261057, upload-time = "2026-02-09T12:57:52.447Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/a2/60010c669df5fa603bb5a97fb75407e191a846510da70ac657eb696b7fce/coverage-7.13.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb88b316ec33760714a4720feb2816a3a59180fd58c1985012054fa7aebee4c2", size = 263875, upload-time = "2026-02-09T12:57:53.938Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/d9/63b22a6bdbd17f1f96e9ed58604c2a6b0e72a9133e37d663bef185877cf6/coverage-7.13.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7d41eead3cc673cbd38a4417deb7fd0b4ca26954ff7dc6078e33f6ff97bed940", size = 261500, upload-time = "2026-02-09T12:57:56.012Z" },
+    { url = "https://files.pythonhosted.org/packages/70/bf/69f86ba1ad85bc3ad240e4c0e57a2e620fbc0e1645a47b5c62f0e941ad7f/coverage-7.13.4-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:fb26a934946a6afe0e326aebe0730cdff393a8bc0bbb65a2f41e30feddca399c", size = 265212, upload-time = "2026-02-09T12:57:57.5Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/f2/5f65a278a8c2148731831574c73e42f57204243d33bedaaf18fa79c5958f/coverage-7.13.4-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:dae88bc0fc77edaa65c14be099bd57ee140cf507e6bfdeea7938457ab387efb0", size = 260398, upload-time = "2026-02-09T12:57:59.027Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/80/6e8280a350ee9fea92f14b8357448a242dcaa243cb2c72ab0ca591f66c8c/coverage-7.13.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:845f352911777a8e722bfce168958214951e07e47e5d5d9744109fa5fe77f79b", size = 262584, upload-time = "2026-02-09T12:58:01.129Z" },
+    { url = "https://files.pythonhosted.org/packages/22/63/01ff182fc95f260b539590fb12c11ad3e21332c15f9799cb5e2386f71d9f/coverage-7.13.4-cp313-cp313t-win32.whl", hash = "sha256:2fa8d5f8de70688a28240de9e139fa16b153cc3cbb01c5f16d88d6505ebdadf9", size = 222688, upload-time = "2026-02-09T12:58:02.736Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/43/89de4ef5d3cd53b886afa114065f7e9d3707bdb3e5efae13535b46ae483d/coverage-7.13.4-cp313-cp313t-win_amd64.whl", hash = "sha256:9351229c8c8407645840edcc277f4a2d44814d1bc34a2128c11c2a031d45a5dd", size = 223746, upload-time = "2026-02-09T12:58:05.362Z" },
+    { url = "https://files.pythonhosted.org/packages/35/39/7cf0aa9a10d470a5309b38b289b9bb07ddeac5d61af9b664fe9775a4cb3e/coverage-7.13.4-cp313-cp313t-win_arm64.whl", hash = "sha256:30b8d0512f2dc8c8747557e8fb459d6176a2c9e5731e2b74d311c03b78451997", size = 222003, upload-time = "2026-02-09T12:58:06.952Z" },
+    { url = "https://files.pythonhosted.org/packages/92/11/a9cf762bb83386467737d32187756a42094927150c3e107df4cb078e8590/coverage-7.13.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:300deaee342f90696ed186e3a00c71b5b3d27bffe9e827677954f4ee56969601", size = 219522, upload-time = "2026-02-09T12:58:08.623Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/28/56e6d892b7b052236d67c95f1936b6a7cf7c3e2634bf27610b8cbd7f9c60/coverage-7.13.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:29e3220258d682b6226a9b0925bc563ed9a1ebcff3cad30f043eceea7eaf2689", size = 219855, upload-time = "2026-02-09T12:58:10.176Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/69/233459ee9eb0c0d10fcc2fe425a029b3fa5ce0f040c966ebce851d030c70/coverage-7.13.4-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:391ee8f19bef69210978363ca930f7328081c6a0152f1166c91f0b5fdd2a773c", size = 250887, upload-time = "2026-02-09T12:58:12.503Z" },
+    { url = "https://files.pythonhosted.org/packages/06/90/2cdab0974b9b5bbc1623f7876b73603aecac11b8d95b85b5b86b32de5eab/coverage-7.13.4-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0dd7ab8278f0d58a0128ba2fca25824321f05d059c1441800e934ff2efa52129", size = 253396, upload-time = "2026-02-09T12:58:14.615Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/15/ea4da0f85bf7d7b27635039e649e99deb8173fe551096ea15017f7053537/coverage-7.13.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78cdf0d578b15148b009ccf18c686aa4f719d887e76e6b40c38ffb61d264a552", size = 254745, upload-time = "2026-02-09T12:58:16.162Z" },
+    { url = "https://files.pythonhosted.org/packages/99/11/bb356e86920c655ca4d61daee4e2bbc7258f0a37de0be32d233b561134ff/coverage-7.13.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:48685fee12c2eb3b27c62f2658e7ea21e9c3239cba5a8a242801a0a3f6a8c62a", size = 257055, upload-time = "2026-02-09T12:58:17.892Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/0f/9ae1f8cb17029e09da06ca4e28c9e1d5c1c0a511c7074592e37e0836c915/coverage-7.13.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4e83efc079eb39480e6346a15a1bcb3e9b04759c5202d157e1dd4303cd619356", size = 250911, upload-time = "2026-02-09T12:58:19.495Z" },
+    { url = "https://files.pythonhosted.org/packages/89/3a/adfb68558fa815cbc29747b553bc833d2150228f251b127f1ce97e48547c/coverage-7.13.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ecae9737b72408d6a950f7e525f30aca12d4bd8dd95e37342e5beb3a2a8c4f71", size = 252754, upload-time = "2026-02-09T12:58:21.064Z" },
+    { url = "https://files.pythonhosted.org/packages/32/b1/540d0c27c4e748bd3cd0bd001076ee416eda993c2bae47a73b7cc9357931/coverage-7.13.4-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ae4578f8528569d3cf303fef2ea569c7f4c4059a38c8667ccef15c6e1f118aa5", size = 250720, upload-time = "2026-02-09T12:58:22.622Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/95/383609462b3ffb1fe133014a7c84fc0dd01ed55ac6140fa1093b5af7ebb1/coverage-7.13.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:6fdef321fdfbb30a197efa02d48fcd9981f0d8ad2ae8903ac318adc653f5df98", size = 254994, upload-time = "2026-02-09T12:58:24.548Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/ba/1761138e86c81680bfc3c49579d66312865457f9fe405b033184e5793cb3/coverage-7.13.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b0f6ccf3dbe577170bebfce1318707d0e8c3650003cb4b3a9dd744575daa8b5", size = 250531, upload-time = "2026-02-09T12:58:26.271Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/8e/05900df797a9c11837ab59c4d6fe94094e029582aab75c3309a93e6fb4e3/coverage-7.13.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75fcd519f2a5765db3f0e391eb3b7d150cce1a771bf4c9f861aeab86c767a3c0", size = 252189, upload-time = "2026-02-09T12:58:27.807Z" },
+    { url = "https://files.pythonhosted.org/packages/00/bd/29c9f2db9ea4ed2738b8a9508c35626eb205d51af4ab7bf56a21a2e49926/coverage-7.13.4-cp314-cp314-win32.whl", hash = "sha256:8e798c266c378da2bd819b0677df41ab46d78065fb2a399558f3f6cae78b2fbb", size = 222258, upload-time = "2026-02-09T12:58:29.441Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/4d/1f8e723f6829977410efeb88f73673d794075091c8c7c18848d273dc9d73/coverage-7.13.4-cp314-cp314-win_amd64.whl", hash = "sha256:245e37f664d89861cf2329c9afa2c1fe9e6d4e1a09d872c947e70718aeeac505", size = 223073, upload-time = "2026-02-09T12:58:31.026Z" },
+    { url = "https://files.pythonhosted.org/packages/51/5b/84100025be913b44e082ea32abcf1afbf4e872f5120b7a1cab1d331b1e13/coverage-7.13.4-cp314-cp314-win_arm64.whl", hash = "sha256:ad27098a189e5838900ce4c2a99f2fe42a0bf0c2093c17c69b45a71579e8d4a2", size = 221638, upload-time = "2026-02-09T12:58:32.599Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/e4/c884a405d6ead1370433dad1e3720216b4f9fd8ef5b64bfd984a2a60a11a/coverage-7.13.4-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:85480adfb35ffc32d40918aad81b89c69c9cc5661a9b8a81476d3e645321a056", size = 220246, upload-time = "2026-02-09T12:58:34.181Z" },
+    { url = "https://files.pythonhosted.org/packages/81/5c/4d7ed8b23b233b0fffbc9dfec53c232be2e695468523242ea9fd30f97ad2/coverage-7.13.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:79be69cf7f3bf9b0deeeb062eab7ac7f36cd4cc4c4dd694bd28921ba4d8596cc", size = 220514, upload-time = "2026-02-09T12:58:35.704Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/6f/3284d4203fd2f28edd73034968398cd2d4cb04ab192abc8cff007ea35679/coverage-7.13.4-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:caa421e2684e382c5d8973ac55e4f36bed6821a9bad5c953494de960c74595c9", size = 261877, upload-time = "2026-02-09T12:58:37.864Z" },
+    { url = "https://files.pythonhosted.org/packages/09/aa/b672a647bbe1556a85337dc95bfd40d146e9965ead9cc2fe81bde1e5cbce/coverage-7.13.4-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14375934243ee05f56c45393fe2ce81fe5cc503c07cee2bdf1725fb8bef3ffaf", size = 264004, upload-time = "2026-02-09T12:58:39.492Z" },
+    { url = "https://files.pythonhosted.org/packages/79/a1/aa384dbe9181f98bba87dd23dda436f0c6cf2e148aecbb4e50fc51c1a656/coverage-7.13.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:25a41c3104d08edb094d9db0d905ca54d0cd41c928bb6be3c4c799a54753af55", size = 266408, upload-time = "2026-02-09T12:58:41.852Z" },
+    { url = "https://files.pythonhosted.org/packages/53/5e/5150bf17b4019bc600799f376bb9606941e55bd5a775dc1e096b6ffea952/coverage-7.13.4-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f01afcff62bf9a08fb32b2c1d6e924236c0383c02c790732b6537269e466a72", size = 267544, upload-time = "2026-02-09T12:58:44.093Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/ed/f1de5c675987a4a7a672250d2c5c9d73d289dbf13410f00ed7181d8017dd/coverage-7.13.4-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eb9078108fbf0bcdde37c3f4779303673c2fa1fe8f7956e68d447d0dd426d38a", size = 260980, upload-time = "2026-02-09T12:58:45.721Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/e3/fe758d01850aa172419a6743fe76ba8b92c29d181d4f676ffe2dae2ba631/coverage-7.13.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0e086334e8537ddd17e5f16a344777c1ab8194986ec533711cbe6c41cde841b6", size = 263871, upload-time = "2026-02-09T12:58:47.334Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/76/b829869d464115e22499541def9796b25312b8cf235d3bb00b39f1675395/coverage-7.13.4-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:725d985c5ab621268b2edb8e50dfe57633dc69bda071abc470fed55a14935fd3", size = 261472, upload-time = "2026-02-09T12:58:48.995Z" },
+    { url = "https://files.pythonhosted.org/packages/14/9e/caedb1679e73e2f6ad240173f55218488bfe043e38da577c4ec977489915/coverage-7.13.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:3c06f0f1337c667b971ca2f975523347e63ec5e500b9aa5882d91931cd3ef750", size = 265210, upload-time = "2026-02-09T12:58:51.178Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/10/0dd02cb009b16ede425b49ec344aba13a6ae1dc39600840ea6abcb085ac4/coverage-7.13.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:590c0ed4bf8e85f745e6b805b2e1c457b2e33d5255dd9729743165253bc9ad39", size = 260319, upload-time = "2026-02-09T12:58:53.081Z" },
+    { url = "https://files.pythonhosted.org/packages/92/8e/234d2c927af27c6d7a5ffad5bd2cf31634c46a477b4c7adfbfa66baf7ebb/coverage-7.13.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:eb30bf180de3f632cd043322dad5751390e5385108b2807368997d1a92a509d0", size = 262638, upload-time = "2026-02-09T12:58:55.258Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/64/e5547c8ff6964e5965c35a480855911b61509cce544f4d442caa759a0702/coverage-7.13.4-cp314-cp314t-win32.whl", hash = "sha256:c4240e7eded42d131a2d2c4dec70374b781b043ddc79a9de4d55ca71f8e98aea", size = 223040, upload-time = "2026-02-09T12:58:56.936Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/96/38086d58a181aac86d503dfa9c47eb20715a79c3e3acbdf786e92e5c09a8/coverage-7.13.4-cp314-cp314t-win_amd64.whl", hash = "sha256:4c7d3cc01e7350f2f0f6f7036caaf5673fb56b6998889ccfe9e1c1fe75a9c932", size = 224148, upload-time = "2026-02-09T12:58:58.645Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/72/8d10abd3740a0beb98c305e0c3faf454366221c0f37a8bcf8f60020bb65a/coverage-7.13.4-cp314-cp314t-win_arm64.whl", hash = "sha256:23e3f687cf945070d1c90f85db66d11e3025665d8dafa831301a0e0038f3db9b", size = 222172, upload-time = "2026-02-09T12:59:00.396Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/4a/331fe2caf6799d591109bb9c08083080f6de90a823695d412a935622abb2/coverage-7.13.4-py3-none-any.whl", hash = "sha256:1af1641e57cf7ba1bd67d677c9abdbcd6cc2ab7da3bca7fa1e2b7e50e65f2ad0", size = 211242, upload-time = "2026-02-09T12:59:02.032Z" },
 ]
 
 [package.optional-dependencies]
@@ -1047,117 +1171,164 @@ wheels = [
 
 [[package]]
 name = "cuda-bindings"
-version = "13.0.3"
+version = "12.9.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-pathfinder" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/98/0666ee759cd2e5306f911cbc95d2c6c814326906ed6b9c09e817a4b4a7c8/cuda_bindings-13.0.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56e46a9e984bb754e56b9d060cf027fe99f08a97651ce6d8aa1c2032476d01e", size = 11762523, upload-time = "2025-10-21T15:08:45.913Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/36/2b2a43c8a6f8d8ff7a5ec7de1357ba3f1438ea69281c9deb90df29d55d56/cuda_bindings-13.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f797ce534a303525259be0ae7ee9cfcf4f7874b22f1f9b8e85555509dccb83", size = 12136098, upload-time = "2025-10-21T15:08:48.233Z" },
-    { url = "https://files.pythonhosted.org/packages/47/67/5de1d48189511114859a1a131193896f88271c067a64b1159787e2d9f89b/cuda_bindings-13.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:74307cea1feee6c32a6e27b42e77beb22cd21cff4b7764fd214fa6ff89f8bd69", size = 11106982, upload-time = "2025-10-21T15:08:50.433Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/67/9e171ee6359d4aabf2d8202802c85487cae6c2eb52b9352bb7754583802f/cuda_bindings-13.0.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfd66c25a133365c4f93e3396c38c64b04400ccafd18c3a889ae251a1bfabaa1", size = 11807212, upload-time = "2025-10-21T15:08:52.988Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/66/d7036d9e402e6b5b57877a7496aba3bf2a0090f4fa3f072743fce3373eba/cuda_bindings-13.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9afede5937474864aa794eb57399dbdf5d2b05427aadcac275209b0857528a61", size = 12198791, upload-time = "2025-10-21T15:08:55.687Z" },
-    { url = "https://files.pythonhosted.org/packages/83/25/620ce2afb6ea6d5da89d98375c85641f691924eef574247f7f0dd99f8bee/cuda_bindings-13.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:fce6d6b162457475b35e1a259ab643e683d1d20a84459fea898782e2f1e10a3b", size = 11138783, upload-time = "2025-10-21T15:08:57.741Z" },
-    { url = "https://files.pythonhosted.org/packages/61/3c/c33fd3aa5fcc89aa1c135e477a0561f29142ab5fe028ca425fc87f7f0a74/cuda_bindings-13.0.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b899e5a513c11eaa18648f9bf5265d8de2a93f76ef66a6bfca0a2887303965cd", size = 11709086, upload-time = "2025-10-21T15:09:00.005Z" },
-    { url = "https://files.pythonhosted.org/packages/21/ac/6b34452a3836c9fbabcd360689a353409d15f500dd9d9ced7f837549e383/cuda_bindings-13.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf41d9e69019939aa15296fa66ea7d3fdb8d2c6383f729f4b1156c8b37808a06", size = 12128303, upload-time = "2025-10-21T15:09:02.889Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/76/ad9cc2f0496886c37aefbc00256197a6043a3f04bbe959481e6908310afe/cuda_bindings-13.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:6b12ccd98f447aea9589d32caf9efda0c193994080752a60f790b646d519fe8c", size = 11237397, upload-time = "2025-10-21T15:09:05.421Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/36/41ccc303eb6be8ae82c5edd2ccae938876e8a794660e8bb96a193174a978/cuda_bindings-13.0.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb16a7f769c9c67469add7a1d9f6c14dd44637f6921cb6b9eb82cb5015b35c3d", size = 11537064, upload-time = "2025-10-21T15:09:07.84Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/ac/699889100536f1b63779646291e74eefa818087a0974eb271314d850f5dc/cuda_bindings-13.0.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:512d0d803a5e47a8a42d5a34ce0932802bf72fe952fdb11ac798715a35c6e5cb", size = 11910447, upload-time = "2025-10-21T15:09:09.942Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/f9/a2f5910aaf21f4cd43f456ea80f47f1424eece5b8f063dac1980304b8ef0/cuda_bindings-13.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:dd83e8d79587e265b82d3e589ba6b061770537443dfb1bb4a74f755c8b13f62b", size = 11211659, upload-time = "2025-10-21T15:09:12.639Z" },
-    { url = "https://files.pythonhosted.org/packages/11/67/9656e003f18c5b32e1a2496998b24f4355ec978c5f3639b0eb9f6d0ff83f/cuda_bindings-13.0.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c859e326c776a47e66c50386a10c84fe34291eb6e711610c9fd7cc27d446334f", size = 11522409, upload-time = "2025-10-21T15:09:14.674Z" },
-    { url = "https://files.pythonhosted.org/packages/18/d8/a83379caa7c1bed4195e704c24467a6c07fe8e29c7055ccd4f00c5702363/cuda_bindings-13.0.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e675dbd009fb5e66d63fd13a8ff35f849120f01bcc4dafadbced3004605c3588", size = 11903148, upload-time = "2025-10-21T15:09:16.918Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/e0/ff1eeda06364df8c750843432ac6efb33a06df38261f0a1ceee59bb7dac2/cuda_bindings-13.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:193762306b6032c00a141fc38bcef92c6fb4d332fd2d6a550c7f950e7fd8acd8", size = 11543153, upload-time = "2025-10-21T15:09:19.252Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/99/0042dc5e98e3364480b1aaabc0f5c150d037825b264bba35ac7a883e46ee/cuda_bindings-13.0.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c7e6e89cdfc9b34f16a065cc6ad6c4bab19ce5dcef8da3ace8ad10bda899fa0", size = 11594384, upload-time = "2025-10-21T15:09:21.938Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/c4/a931a90ce763bd7d587e18e73e4ce246b8547c78247c4f50ee24efc0e984/cuda_bindings-13.0.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e93866465e7ff4b7ebdf711cf9cd680499cd875f992058c68be08d4775ac233d", size = 11920899, upload-time = "2025-10-21T15:09:26.306Z" },
-    { url = "https://files.pythonhosted.org/packages/14/3e/5725b2e5b9ac22bf19a50ec5f7611301ab6111c98ccf1b6b125fdaa71550/cuda_bindings-13.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bd2364bc49925837ce18dda259c3a36e539977ca0297799a54891cae1d5213f5", size = 11160621, upload-time = "2025-10-21T15:09:28.7Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/2c/ec611e27ba48a9056f3b0610c5e27727e539f3905356cfe07acea18e772c/cuda_bindings-13.0.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed06ef3507bd0aefb0da367e3d15676a8c7443bd68a88f298562d60b41078c20", size = 11521928, upload-time = "2025-10-21T15:09:30.714Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/2e/02cebf281ef5201b6bb9ea193b1a4d26e6233c46571cfb04c4a7dede12b9/cuda_bindings-13.0.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ab845487ca2c14accdcb393a559a3070469ea4b591d05e6ef439471f47f3e24", size = 11902749, upload-time = "2025-10-21T15:09:32.688Z" },
-    { url = "https://files.pythonhosted.org/packages/36/d2/088c28751f54df7a251259ef3f99d34c428e12653f15db02fd62a96247af/cuda_bindings-13.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:aaa0934e16aa20ec10fbb1ecc53a6961b8d1c06a970fe05cc6ee7d2a805a090f", size = 11697137, upload-time = "2025-10-21T15:09:35.232Z" },
+    { url = "https://files.pythonhosted.org/packages/37/31/bfcc870f69c6a017c4ad5c42316207fc7551940db6f3639aa4466ec5faf3/cuda_bindings-12.9.4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a022c96b8bd847e8dc0675523431149a4c3e872f440e3002213dbb9e08f0331a", size = 11800959, upload-time = "2025-10-21T14:51:26.458Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/d8/b546104b8da3f562c1ff8ab36d130c8fe1dd6a045ced80b4f6ad74f7d4e1/cuda_bindings-12.9.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d3c842c2a4303b2a580fe955018e31aea30278be19795ae05226235268032e5", size = 12148218, upload-time = "2025-10-21T14:51:28.855Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/1e/9c8ed3f3dbed7b7d038805fdc65cbc65fda9983e84437778a9571e7092bc/cuda_bindings-12.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:f69107389e6b9948969bfd0a20c4f571fd1aefcfb1d2e1b72cc8ba5ecb7918ab", size = 11464568, upload-time = "2025-10-21T14:51:31.454Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/2b/ebcbb60aa6dba830474cd360c42e10282f7a343c0a1f58d24fbd3b7c2d77/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6a429dc6c13148ff1e27c44f40a3dd23203823e637b87fd0854205195988306", size = 11840604, upload-time = "2025-10-21T14:51:34.565Z" },
+    { url = "https://files.pythonhosted.org/packages/45/e7/b47792cc2d01c7e1d37c32402182524774dadd2d26339bd224e0e913832e/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c912a3d9e6b6651853eed8eed96d6800d69c08e94052c292fec3f282c5a817c9", size = 12210593, upload-time = "2025-10-21T14:51:36.574Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/be/90d32049e06abcfba4b2e7df1dbcb5e16215c8852eef0cd8b25f38a66bd4/cuda_bindings-12.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:443b0875916879c2e4c3722941e25e42d5ab9bcbf34c9e83404fb100fa1f6913", size = 11490933, upload-time = "2025-10-21T14:51:38.792Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/c2/65bfd79292b8ff18be4dd7f7442cea37bcbc1a228c1886f1dea515c45b67/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:694ba35023846625ef471257e6b5a4bc8af690f961d197d77d34b1d1db393f56", size = 11760260, upload-time = "2025-10-21T14:51:40.79Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
+    { url = "https://files.pythonhosted.org/packages/df/6b/9c1b1a6c01392bfdd758e9486f52a1a72bc8f49e98f9355774ef98b5fb4e/cuda_bindings-12.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:696ca75d249ddf287d01b9a698b8e2d8a05046495a9c051ca15659dc52d17615", size = 11586961, upload-time = "2025-10-21T14:51:45.394Z" },
+    { url = "https://files.pythonhosted.org/packages/05/8b/b4b2d1c7775fa403b64333e720cfcfccef8dcb9cdeb99947061ca5a77628/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cf8bfaedc238f3b115d957d1fd6562b7e8435ba57f6d0e2f87d0e7149ccb2da5", size = 11570071, upload-time = "2025-10-21T14:51:47.472Z" },
+    { url = "https://files.pythonhosted.org/packages/63/56/e465c31dc9111be3441a9ba7df1941fe98f4aa6e71e8788a3fb4534ce24d/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:32bdc5a76906be4c61eb98f546a6786c5773a881f3b166486449b5d141e4a39f", size = 11906628, upload-time = "2025-10-21T14:51:49.905Z" },
+    { url = "https://files.pythonhosted.org/packages/05/d0/d0e4e2e047d8e899f023fa15ad5e9894ce951253f4c894f1cd68490fdb14/cuda_bindings-12.9.4-cp313-cp313-win_amd64.whl", hash = "sha256:a2e82c8985948f953c2be51df45c3fe11c812a928fca525154fb9503190b3e64", size = 11556719, upload-time = "2025-10-21T14:51:52.248Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/07/6aff13bc1e977e35aaa6b22f52b172e2890c608c6db22438cf7ed2bf43a6/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3adf4958dcf68ae7801a59b73fb00a8b37f8d0595060d66ceae111b1002de38d", size = 11566797, upload-time = "2025-10-21T14:51:54.581Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/84/1e6be415e37478070aeeee5884c2022713c1ecc735e6d82d744de0252eee/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56e0043c457a99ac473ddc926fe0dc4046694d99caef633e92601ab52cbe17eb", size = 11925991, upload-time = "2025-10-21T14:51:56.535Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/3c/972edfddb4ae8a9fccd3c3766ed47453b6f805b6026b32f10209dd4b8ad4/cuda_bindings-12.9.4-cp313-cp313t-win_amd64.whl", hash = "sha256:b32d8b685f0e66f5658bcf4601ef034e89fc2843582886f0a58784a4302da06c", size = 11894363, upload-time = "2025-10-21T14:51:58.633Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/b5/96a6696e20c4ffd2b327f54c7d0fde2259bdb998d045c25d5dedbbe30290/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f53a7f453d4b2643d8663d036bafe29b5ba89eb904c133180f295df6dc151e5", size = 11624530, upload-time = "2025-10-21T14:52:01.539Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/af/6dfd8f2ed90b1d4719bc053ff8940e494640fe4212dc3dd72f383e4992da/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b72ee72a9cc1b531db31eebaaee5c69a8ec3500e32c6933f2d3b15297b53686", size = 11922703, upload-time = "2025-10-21T14:52:03.585Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/87/652796522cc1a7af559460e1ce59b642e05c1468b9c08522a9a096b4cf04/cuda_bindings-12.9.4-cp314-cp314-win_amd64.whl", hash = "sha256:53a10c71fdbdb743e0268d07964e5a996dd00b4e43831cbfce9804515d97d575", size = 11517716, upload-time = "2025-10-21T14:52:06.013Z" },
+    { url = "https://files.pythonhosted.org/packages/39/73/d2fc40c043bac699c3880bf88d3cebe9d88410cd043795382826c93a89f0/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20f2699d61d724de3eb3f3369d57e2b245f93085cab44fd37c3bea036cea1a6f", size = 11565056, upload-time = "2025-10-21T14:52:08.338Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/19/90ac264acc00f6df8a49378eedec9fd2db3061bf9263bf9f39fd3d8377c3/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80bffc357df9988dca279734bc9674c3934a654cab10cadeed27ce17d8635ee", size = 11924658, upload-time = "2025-10-21T14:52:10.411Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/52/a30f46e822bfa6b4a659d1e8de8c4a4adf908ea075dac568b55362541bd8/cuda_bindings-12.9.4-cp314-cp314t-win_amd64.whl", hash = "sha256:53e11991a92ff6f26a0c8a98554cd5d6721c308a6b7bfb08bebac9201e039e43", size = 12055608, upload-time = "2025-10-21T14:52:12.335Z" },
 ]
 
 [[package]]
 name = "cuda-pathfinder"
-version = "1.3.2"
+version = "1.3.5"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b4/b2/a4982b5c7315d2dd211092d1ab226cb0c69b902480d0a58fde89b5991d71/cuda_pathfinder-1.3.2-py3-none-any.whl", hash = "sha256:7bd2774bc6be93aea226d579f415a63803b2b2c062207ed06c1d6dfc9cfacc3c", size = 27375, upload-time = "2025-10-29T21:51:45.342Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/b5/e4056e4058fb56519fcddf1face6fe3ff2398953b41615fafe9fb1540bf2/cuda_pathfinder-1.3.5-py3-none-any.whl", hash = "sha256:6c88220f8637cb35d2a75c620d72efebf683b248b923713d8fbe235844c1a4b9", size = 33711, upload-time = "2026-02-23T18:34:27.253Z" },
 ]
 
 [[package]]
 name = "cuda-python"
-version = "13.0.3"
+version = "12.9.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-bindings" },
-    { name = "cuda-pathfinder" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/31/5f/beaa12a11b051027eec0b041df01c6690db4f02e3b2e8fadd5a0eeb4df52/cuda_python-13.0.3-py3-none-any.whl", hash = "sha256:914cd7e2dd075bd06a2d5121c1d9ccdd3d0c94b03ea5a44dbd98d24d8ed93bab", size = 7605, upload-time = "2025-10-21T15:48:59.222Z" },
+    { url = "https://files.pythonhosted.org/packages/af/f3/6b032a554019cfb3447e671798c1bd3e79b5f1af20d10253f56cea269ef2/cuda_python-12.9.4-py3-none-any.whl", hash = "sha256:d2cacea882a69863f1e7d27ee71d75f0684f4c76910aff839067e4f89c902279", size = 7594, upload-time = "2025-10-21T14:55:12.846Z" },
 ]
 
 [[package]]
 name = "cython"
-version = "3.2.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/83/36/cce2972e13e83ffe58bc73bfd9d37340b5e5113e8243841a57511c7ae1c2/cython-3.2.1.tar.gz", hash = "sha256:2be1e4d0cbdf7f4cd4d9b8284a034e1989b59fd060f6bd4d24bf3729394d2ed8", size = 3270455, upload-time = "2025-11-12T19:02:59.847Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/87/74/f9fe9e7034f24aef407e7816880c012d8e863bedaa6b42b9ff33e79ea139/cython-3.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f1d10b3731171a33563ba81fdcba39c229e45087269dfbe07a1c00e7dcb2537f", size = 2957374, upload-time = "2025-11-12T19:03:10.132Z" },
-    { url = "https://files.pythonhosted.org/packages/65/47/f9dd519117f520aaf4d723c88fd9e9139262a0379edc01e71a1e9825e082/cython-3.2.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92b814b6066d178a5057b557d372e2a03854e947e41cb9dec21db732fbd14c3c", size = 3366838, upload-time = "2025-11-12T19:03:11.742Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/3e/d967acfafef00056c3ba832692b9bb358ede2919f641e4a2d24828adacc6/cython-3.2.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9fc6abd0532007827d8c6143b2bfedf80c7cb89a3c1c12f058336663489ed2e", size = 3535901, upload-time = "2025-11-12T19:03:13.545Z" },
-    { url = "https://files.pythonhosted.org/packages/68/79/bc46e714ecb010f80a8aa7f7eaf412c53cbabbe7489590d6aba5f4478ba5/cython-3.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:14f1ed135347587cfddcd3c3219667cac4f0ea0b66aa1c4c0187d50a1b92c222", size = 2764043, upload-time = "2025-11-12T19:03:15.584Z" },
-    { url = "https://files.pythonhosted.org/packages/48/d4/ba7b9f341ec168de78bd659600e04bb7de3b2d069bf98b2178a135e88ea4/cython-3.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cb32c650e7f4476941d1f735cae75a2067d5e3279576273bb8802e8ea907222", size = 2949720, upload-time = "2025-11-12T19:03:17.492Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/47/c42417f424c0b928361f48d7dd0ae72716ee21f647b73ceb16f66b98663e/cython-3.2.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a2b306813d7f28aa0a2c3e4e63ada1427a8109917532df942cd5429db228252", size = 3242127, upload-time = "2025-11-12T19:03:19.227Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/fc/1040460889129551649ec35be45e05169871fbcf71bd8e13c533e86f9468/cython-3.2.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0959d9a36d4f004ce63acc1474b3c606745af98b65e8ae709efd0c10988e9d6b", size = 3377094, upload-time = "2025-11-12T19:03:21.25Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/f2/8c754298eefa40e21af0ae3592837c6e71254900d5aea1c8859e96b11de5/cython-3.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:60c62e734421365135cc2842013d883136054a26c617c001be494235edfc447a", size = 2767824, upload-time = "2025-11-12T19:03:23.317Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/0e/19d5041b87f98ed19c94c388607cd27c1f7458078c3bad5de2dead55b2e1/cython-3.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ea5097d97afd2ab14e98637b7033eba5146de29a5dedf89f5e946076396ab891", size = 2966736, upload-time = "2025-11-12T19:03:25.064Z" },
-    { url = "https://files.pythonhosted.org/packages/84/b8/bcc36d9d2464348106984956608a52a42a01ab44ea64031207dffdebc078/cython-3.2.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4bf12de0475bb6a21e2336a4a04dc4a2b4dd0507a2a3c703e045f3484266605", size = 3221633, upload-time = "2025-11-12T19:03:26.754Z" },
-    { url = "https://files.pythonhosted.org/packages/79/20/7d4807fe4ebcef9f20f2e5f93312d0f5d02f9f76524fd4e37706d04e83f7/cython-3.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18c64a0f69a1b8164de70ec7efc72250c589fec21519170de21582300f6aaed9", size = 3389542, upload-time = "2025-11-12T19:03:28.656Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/92/b06ba6721299293bc41e89732070132c453bdbaaeabb8f8cc76851b75345/cython-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:5ba14907d5826d8010e82306ce279a0d3650f5b50a4813c80836a17b2213c520", size = 2755307, upload-time = "2025-11-12T19:03:30.684Z" },
-    { url = "https://files.pythonhosted.org/packages/40/28/c6e36c214baeb27ae45b518552e74457536c7c964b1a55b5900b047fa467/cython-3.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b4e850fc7a2f72d19679dd083fe4d20bf66860fceabb4f3207112f240249d708", size = 2957307, upload-time = "2025-11-12T19:03:32.471Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/c8/b0b9ba64f81f2875c42aab5c0979d6454cd1ac6b3c1e2373ad552701565d/cython-3.2.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d20ca4afe993f7dccad3aeddbf4c3536cb0fd3ad6dc7a225935a666a5655af2", size = 3210919, upload-time = "2025-11-12T19:03:34.274Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/33/5d9ca6abba0e77e1851b843dd1b3c4095fbc6373166935e83c4414f80e88/cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5a54a757d01ca6a260b02ce5baf17d9db1c2253566ab5844ee4966ff2a69c19", size = 3373350, upload-time = "2025-11-12T19:03:35.927Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/29/4408c3486ff380a2d6ae0d4b71da5195efcef3c4360017113ee7d1cb7335/cython-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:1b81e56584727a328e00d91c164f8f0f2c59b02bf6857c3f000cd830fa571453", size = 2753425, upload-time = "2025-11-12T19:03:38.157Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/32/c1aa03ccadda89487ff31b90d8651c3706ce2744bf4f2c2ae213147e89bd/cython-3.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7af6ad01c0fe1965d1d3badaeb6df53c1f37383ebae1ccb405b73f628f87713", size = 2967833, upload-time = "2025-11-12T19:03:40.233Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/dc/3488d3ade0635408a2ebb05561a3009e2f54616bfefd1f107088dfeb2c4c/cython-3.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3ea7cd085b62acb67c0fbde5cd17a7d9e47992c965e81ec977cf9ea7c59cd65", size = 3256237, upload-time = "2025-11-12T19:03:42.005Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/ba/f3d35d3803c9a424fa8812893847114deb9e2440c1bc67a31ab9ec4b9355/cython-3.2.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:986aea38fdf231e78d73745f83271c5654852c822dc5141a1d3fba64429a6aa6", size = 3383100, upload-time = "2025-11-12T19:03:43.675Z" },
-    { url = "https://files.pythonhosted.org/packages/86/dc/d72dbb2f8e7ca95d2d18fd86f32b2e385996576230e7ecddd7d250786825/cython-3.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:4960e26cd34c1385f21646339f2e0361fcdd2ed3c01cdb50fe734add577ec56a", size = 2790322, upload-time = "2025-11-12T19:03:45.373Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/7e/1194f4ba98b981bbdca945a292e4f49e87ea09d69516b24445409e7cf611/cython-3.2.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:4e9167316bf6ecfea33dcca62f074605648fb93cc053ef46b5deb3e5d12fc0d3", size = 2872858, upload-time = "2025-11-12T19:03:55.074Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/1a/393ca8ffec7ad3f02b8e4bffaba3dba4fb62c4a1c4c0b6dbf3b80e709fe3/cython-3.2.1-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3095df6cd470064742f428c937bed7200c5123b9e19ee04aa09ec61281e565a3", size = 3209664, upload-time = "2025-11-12T19:03:56.771Z" },
-    { url = "https://files.pythonhosted.org/packages/37/57/f209f64c609d3d8fac60a572e56da2f621dc1789e399c58db61d5645a31f/cython-3.2.1-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db3f53b2d9afb206075a2605f1150aa019f0733c7795a38eccc6119c2e9c3f7b", size = 2854607, upload-time = "2025-11-12T19:03:59.413Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/af/1e5c73fe52423f40776130b0be914fd9f9f8dc26c4f6ea4c2ed04772d558/cython-3.2.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0fc5e7687ac8f8e2b2fb95648f43e9e074ebaa72fd5cb3d8e20e5f1e8b8e02d9", size = 2991567, upload-time = "2025-11-12T19:04:02.209Z" },
-    { url = "https://files.pythonhosted.org/packages/39/2c/3ea175b6b1fdfb429f9e9c395240d894155b3c0615caced05fef43264cba/cython-3.2.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:bbb3bc152bc0de82b031c8d355418fa4890a92424209d59366c2c0bc9e6cf53c", size = 2889178, upload-time = "2025-11-12T19:04:05.272Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/88/b2ab22a3a3feac78c62354a823c5c0c33659909e9918f53aa05904532b4b/cython-3.2.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:a2022bc48ad0c2c0e0485bf0b54902913a3d81086b7d435f4437620c667799f6", size = 3223755, upload-time = "2025-11-12T19:04:07.262Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/56/9ba58629a03cbffb5965a3c65ccd91fa683d95d588c21a875da72fdc249b/cython-3.2.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99fdd4ffc2dcb513f4be9ce71c6fedd895b96b1f814655b6bbab196df497b090", size = 3113456, upload-time = "2025-11-12T19:04:09.175Z" },
-    { url = "https://files.pythonhosted.org/packages/56/5b/148c1a7ea5aebe460a70cad716a77e5fd0205be2de9fc5250491eb13ad8c/cython-3.2.1-cp39-abi3-win32.whl", hash = "sha256:06071f85bd5ce040464d43b2f9f287742a79f905e81b709fe904567230f1ed51", size = 2434223, upload-time = "2025-11-12T19:04:11.294Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/54/bb9b0c9db2a92a5e93747ca3027cfc645741411f8f1c6af2fb2a7b82df5d/cython-3.2.1-cp39-abi3-win_arm64.whl", hash = "sha256:e87c131d59480aee1ebac622b64f287c0e1d665ad1a1b7d498ac48accdb36c6b", size = 2439268, upload-time = "2025-11-12T19:04:12.931Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/30/373775b8d933d781d055c1dd0f110f275a101f320dab724c8c63a7c1b945/cython-3.2.1-py3-none-any.whl", hash = "sha256:cd72c46e7bffe8250c52d400e72c8d5d3086437b6aeec5b0eca99ccd337f5834", size = 1254219, upload-time = "2025-11-12T19:02:56.14Z" },
+version = "3.2.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/85/7574c9cd44b69a27210444b6650f6477f56c75fee1b70d7672d3e4166167/cython-3.2.4.tar.gz", hash = "sha256:84226ecd313b233da27dc2eb3601b4f222b8209c3a7216d8733b031da1dc64e6", size = 3280291, upload-time = "2026-01-04T14:14:14.473Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/10/720e0fb84eab4c927c4dd6b61eb7993f7732dd83d29ba6d73083874eade9/cython-3.2.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02cb0cc0f23b9874ad262d7d2b9560aed9c7e2df07b49b920bda6f2cc9cb505e", size = 2960836, upload-time = "2026-01-04T14:14:51.103Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/3d/b26f29092c71c36e0462752885bdfb18c23c176af4de953fdae2772a8941/cython-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f136f379a4a54246facd0eb6f1ee15c3837cb314ce87b677582ec014db4c6845", size = 3370134, upload-time = "2026-01-04T14:14:53.627Z" },
+    { url = "https://files.pythonhosted.org/packages/56/9e/539fb0d09e4f5251b5b14f8daf77e71fee021527f1013791038234618b6b/cython-3.2.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:35ab0632186057406ec729374c737c37051d2eacad9d515d94e5a3b3e58a9b02", size = 3537552, upload-time = "2026-01-04T14:14:56.852Z" },
+    { url = "https://files.pythonhosted.org/packages/10/c6/82d19a451c050d1be0f05b1a3302267463d391db548f013ee88b5348a8e9/cython-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:ca2399dc75796b785f74fb85c938254fa10c80272004d573c455f9123eceed86", size = 2766191, upload-time = "2026-01-04T14:14:58.709Z" },
+    { url = "https://files.pythonhosted.org/packages/85/cc/8f06145ec3efa121c8b1b67f06a640386ddacd77ee3e574da582a21b14ee/cython-3.2.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff9af2134c05e3734064808db95b4dd7341a39af06e8945d05ea358e1741aaed", size = 2953769, upload-time = "2026-01-04T14:15:00.361Z" },
+    { url = "https://files.pythonhosted.org/packages/55/b0/706cf830eddd831666208af1b3058c2e0758ae157590909c1f634b53bed9/cython-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67922c9de058a0bfb72d2e75222c52d09395614108c68a76d9800f150296ddb3", size = 3243841, upload-time = "2026-01-04T14:15:02.066Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/25/58893afd4ef45f79e3d4db82742fa4ff874b936d67a83c92939053920ccd/cython-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b362819d155fff1482575e804e43e3a8825332d32baa15245f4642022664a3f4", size = 3378083, upload-time = "2026-01-04T14:15:04.248Z" },
+    { url = "https://files.pythonhosted.org/packages/32/e4/424a004d7c0d8a4050c81846ebbd22272ececfa9a498cb340aa44fccbec2/cython-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:1a64a112a34ec719b47c01395647e54fb4cf088a511613f9a3a5196694e8e382", size = 2769990, upload-time = "2026-01-04T14:15:06.53Z" },
+    { url = "https://files.pythonhosted.org/packages/91/4d/1eb0c7c196a136b1926f4d7f0492a96c6fabd604d77e6cd43b56a3a16d83/cython-3.2.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:64d7f71be3dd6d6d4a4c575bb3a4674ea06d1e1e5e4cd1b9882a2bc40ed3c4c9", size = 2970064, upload-time = "2026-01-04T14:15:08.567Z" },
+    { url = "https://files.pythonhosted.org/packages/03/1c/46e34b08bea19a1cdd1e938a4c123e6299241074642db9d81983cef95e9f/cython-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:869487ea41d004f8b92171f42271fbfadb1ec03bede3158705d16cd570d6b891", size = 3226757, upload-time = "2026-01-04T14:15:10.812Z" },
+    { url = "https://files.pythonhosted.org/packages/12/33/3298a44d201c45bcf0d769659725ae70e9c6c42adf8032f6d89c8241098d/cython-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:55b6c44cd30821f0b25220ceba6fe636ede48981d2a41b9bbfe3c7902ce44ea7", size = 3388969, upload-time = "2026-01-04T14:15:12.45Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/f3/4275cd3ea0a4cf4606f9b92e7f8766478192010b95a7f516d1b7cf22cb10/cython-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:767b143704bdd08a563153448955935844e53b852e54afdc552b43902ed1e235", size = 2756457, upload-time = "2026-01-04T14:15:14.67Z" },
+    { url = "https://files.pythonhosted.org/packages/18/b5/1cfca43b7d20a0fdb1eac67313d6bb6b18d18897f82dd0f17436bdd2ba7f/cython-3.2.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:28e8075087a59756f2d059273184b8b639fe0f16cf17470bd91c39921bc154e0", size = 2960506, upload-time = "2026-01-04T14:15:16.733Z" },
+    { url = "https://files.pythonhosted.org/packages/71/bb/8f28c39c342621047fea349a82fac712a5e2b37546d2f737bbde48d5143d/cython-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03893c88299a2c868bb741ba6513357acd104e7c42265809fd58dce1456a36fc", size = 3213148, upload-time = "2026-01-04T14:15:18.804Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/d2/16fa02f129ed2b627e88d9d9ebd5ade3eeb66392ae5ba85b259d2d52b047/cython-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f81eda419b5ada7b197bbc3c5f4494090e3884521ffd75a3876c93fbf66c9ca8", size = 3375764, upload-time = "2026-01-04T14:15:20.817Z" },
+    { url = "https://files.pythonhosted.org/packages/91/3f/deb8f023a5c10c0649eb81332a58c180fad27c7533bb4aae138b5bc34d92/cython-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:83266c356c13c68ffe658b4905279c993d8a5337bb0160fa90c8a3e297ea9a2e", size = 2754238, upload-time = "2026-01-04T14:15:23.001Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/d7/3bda3efce0c5c6ce79cc21285dbe6f60369c20364e112f5a506ee8a1b067/cython-3.2.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d4b4fd5332ab093131fa6172e8362f16adef3eac3179fd24bbdc392531cb82fa", size = 2971496, upload-time = "2026-01-04T14:15:25.038Z" },
+    { url = "https://files.pythonhosted.org/packages/89/ed/1021ffc80b9c4720b7ba869aea8422c82c84245ef117ebe47a556bdc00c3/cython-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3b5ac54e95f034bc7fb07313996d27cbf71abc17b229b186c1540942d2dc28e", size = 3256146, upload-time = "2026-01-04T14:15:26.741Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/51/ca221ec7e94b3c5dc4138dcdcbd41178df1729c1e88c5dfb25f9d30ba3da/cython-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90f43be4eaa6afd58ce20d970bb1657a3627c44e1760630b82aa256ba74b4acb", size = 3383458, upload-time = "2026-01-04T14:15:28.425Z" },
+    { url = "https://files.pythonhosted.org/packages/79/2e/1388fc0243240cd54994bb74f26aaaf3b2e22f89d3a2cf8da06d75d46ca2/cython-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:983f9d2bb8a896e16fa68f2b37866ded35fa980195eefe62f764ddc5f9f5ef8e", size = 2791241, upload-time = "2026-01-04T14:15:30.448Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/8b/fd393f0923c82be4ec0db712fffb2ff0a7a131707b842c99bf24b549274d/cython-3.2.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:36bf3f5eb56d5281aafabecbaa6ed288bc11db87547bba4e1e52943ae6961ccf", size = 2875622, upload-time = "2026-01-04T14:15:39.749Z" },
+    { url = "https://files.pythonhosted.org/packages/73/48/48530d9b9d64ec11dbe0dd3178a5fe1e0b27977c1054ecffb82be81e9b6a/cython-3.2.4-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6d5267f22b6451eb1e2e1b88f6f78a2c9c8733a6ddefd4520d3968d26b824581", size = 3210669, upload-time = "2026-01-04T14:15:41.911Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/91/4865fbfef1f6bb4f21d79c46104a53d1a3fa4348286237e15eafb26e0828/cython-3.2.4-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3b6e58f73a69230218d5381817850ce6d0da5bb7e87eb7d528c7027cbba40b06", size = 2856835, upload-time = "2026-01-04T14:15:43.815Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/39/60317957dbef179572398253f29d28f75f94ab82d6d39ea3237fb6c89268/cython-3.2.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e71efb20048358a6b8ec604a0532961c50c067b5e63e345e2e359fff72feaee8", size = 2994408, upload-time = "2026-01-04T14:15:45.422Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/30/7c24d9292650db4abebce98abc9b49c820d40fa7c87921c0a84c32f4efe7/cython-3.2.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:28b1e363b024c4b8dcf52ff68125e635cb9cb4b0ba997d628f25e32543a71103", size = 2891478, upload-time = "2026-01-04T14:15:47.394Z" },
+    { url = "https://files.pythonhosted.org/packages/86/70/03dc3c962cde9da37a93cca8360e576f904d5f9beecfc9d70b1f820d2e5f/cython-3.2.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:31a90b4a2c47bb6d56baeb926948348ec968e932c1ae2c53239164e3e8880ccf", size = 3225663, upload-time = "2026-01-04T14:15:49.446Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/97/10b50c38313c37b1300325e2e53f48ea9a2c078a85c0c9572057135e31d5/cython-3.2.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e65e4773021f8dc8532010b4fbebe782c77f9a0817e93886e518c93bd6a44e9d", size = 3115628, upload-time = "2026-01-04T14:15:51.323Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/b1/d6a353c9b147848122a0db370863601fdf56de2d983b5c4a6a11e6ee3cd7/cython-3.2.4-cp39-abi3-win32.whl", hash = "sha256:2b1f12c0e4798293d2754e73cd6f35fa5bbdf072bdc14bc6fc442c059ef2d290", size = 2437463, upload-time = "2026-01-04T14:15:53.787Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/d8/319a1263b9c33b71343adfd407e5daffd453daef47ebc7b642820a8b68ed/cython-3.2.4-cp39-abi3-win_arm64.whl", hash = "sha256:3b8e62049afef9da931d55de82d8f46c9a147313b69d5ff6af6e9121d545ce7a", size = 2442754, upload-time = "2026-01-04T14:15:55.382Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/fa/d3c15189f7c52aaefbaea76fb012119b04b9013f4bf446cb4eb4c26c4e6b/cython-3.2.4-py3-none-any.whl", hash = "sha256:732fc93bc33ae4b14f6afaca663b916c2fdd5dcbfad7114e17fb2434eeaea45c", size = 1257078, upload-time = "2026-01-04T14:14:12.373Z" },
 ]
 
 [[package]]
 name = "datasets"
-version = "4.4.1"
+version = "2.2.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "dill" },
-    { name = "filelock" },
-    { name = "fsspec", extra = ["http"], marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
-    { name = "httpx" },
-    { name = "huggingface-hub" },
-    { name = "multiprocess" },
+    { name = "aiohttp", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "dill", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "huggingface-hub", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "multiprocess", version = "0.70.19", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "packaging", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "pyarrow", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "requests", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "responses", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "tqdm", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "xxhash", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/31/64/1e6fb2a0eb6b0d55117233cf33279ba6d680c0f031ebae81281a47c92760/datasets-2.2.1.tar.gz", hash = "sha256:d362717c4394589b516c8f397ff20a6fe720454aed877ab61d06f3bc05df9544", size = 302132, upload-time = "2022-05-11T17:02:29.543Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d7/2d/41e8aec8d4bad6f07adfcbc89cf743e0d31c876371d453b2936bcfa7fe34/datasets-2.2.1-py3-none-any.whl", hash = "sha256:1938f3e99599422de50b9b54fe802aca854ed130382dab0b3820c821f7ae6d5e", size = 342193, upload-time = "2022-05-11T17:02:27.047Z" },
+]
+
+[[package]]
+name = "datasets"
+version = "4.5.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "dill", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "filelock", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "httpx", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "huggingface-hub", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "multiprocess", version = "0.70.18", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging" },
-    { name = "pandas" },
-    { name = "pyarrow" },
-    { name = "pyyaml" },
-    { name = "requests" },
-    { name = "tqdm" },
-    { name = "xxhash" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyarrow", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "pyyaml", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "requests", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "tqdm", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "xxhash", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/93/bf/0dae295d6d1ba0b1a200a9dd216838464b5bbd05da01407cb1330b377445/datasets-4.4.1.tar.gz", hash = "sha256:80322699aa8c0bbbdb7caa87906da689c3c2e29523cff698775c67f28fdab1fc", size = 585341, upload-time = "2025-11-05T16:00:38.162Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/55/bf/bb927bde63d649296c83e883171ae77074717c1b80fe2868b328bd0dbcbb/datasets-4.5.0.tar.gz", hash = "sha256:00c698ce1c2452e646cc5fad47fef39d3fe78dd650a8a6eb205bb45eb63cd500", size = 588384, upload-time = "2026-01-14T18:27:54.297Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/d5/0d563ea3c205eee226dc8053cf7682a8ac588db8acecd0eda2b587987a0b/datasets-4.5.0-py3-none-any.whl", hash = "sha256:b5d7e08096ffa407dd69e58b1c0271c9b2506140839b8d99af07375ad31b6726", size = 515196, upload-time = "2026-01-14T18:27:52.419Z" },
 ]
 
 [[package]]
@@ -1183,7 +1354,8 @@ name = "deprecated"
 version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "wrapt" },
+    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "wrapt", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" }
 wheels = [
@@ -1194,11 +1366,52 @@ wheels = [
 name = "dill"
 version = "0.4.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
 ]
 
+[[package]]
+name = "dill"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" },
+]
+
+[[package]]
+name = "distlib"
+version = "0.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" },
+]
+
 [[package]]
 name = "docker"
 version = "7.1.0"
@@ -1226,11 +1439,42 @@ wheels = [
 name = "docutils"
 version = "0.21.2"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" },
 ]
 
+[[package]]
+name = "docutils"
+version = "0.22.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
+]
+
 [[package]]
 name = "ebmlite"
 version = "3.4.1"
@@ -1242,20 +1486,20 @@ wheels = [
 
 [[package]]
 name = "einops"
-version = "0.8.1"
+version = "0.8.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e5/81/df4fbe24dff8ba3934af99044188e20a98ed441ad17a274539b74e82e126/einops-0.8.1.tar.gz", hash = "sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84", size = 54805, upload-time = "2025-02-09T03:17:00.434Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359, upload-time = "2025-02-09T03:17:01.998Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" },
 ]
 
 [[package]]
 name = "emerging-optimizers"
 version = "0.1.0"
-source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189#fb1add873e7851ec34b48581ea1b15761b73d189" }
+source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0#d5363b4a418128cd8111983b191c4b8869a9766b" }
 dependencies = [
     { name = "absl-py" },
-    { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "torch", marker = "sys_platform == 'never'" },
     { name = "typing-extensions" },
 ]
 
@@ -1264,7 +1508,7 @@ name = "exceptiongroup"
 version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
 wheels = [
@@ -1288,17 +1532,18 @@ wheels = [
 
 [[package]]
 name = "fastapi"
-version = "0.122.0"
+version = "0.133.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-doc" },
     { name = "pydantic" },
     { name = "starlette" },
     { name = "typing-extensions" },
+    { name = "typing-inspection" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b2/de/3ee97a4f6ffef1fb70bf20561e4f88531633bb5045dc6cebc0f8471f764d/fastapi-0.122.0.tar.gz", hash = "sha256:cd9b5352031f93773228af8b4c443eedc2ac2aa74b27780387b853c3726fb94b", size = 346436, upload-time = "2025-11-24T19:17:47.95Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c2/04/ab382c7c03dd545f2c964d06e87ad0d5faa944a2434186ad9c285f5d87e0/fastapi-0.133.0.tar.gz", hash = "sha256:b900a2bf5685cdb0647a41d5900bdeafc3a9e8a28ac08c6246b76699e164d60d", size = 373265, upload-time = "2026-02-24T09:53:40.143Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7a/93/aa8072af4ff37b795f6bbf43dcaf61115f40f49935c7dbb180c9afc3f421/fastapi-0.122.0-py3-none-any.whl", hash = "sha256:a456e8915dfc6c8914a50d9651133bd47ec96d331c5b44600baa635538a30d67", size = 110671, upload-time = "2025-11-24T19:17:45.96Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/b4/023e75a2ec3f5440e380df6caf4d28edc0806d007193e6fb0707237886a4/fastapi-0.133.0-py3-none-any.whl", hash = "sha256:0a78878483d60702a1dde864c24ab349a1a53ef4db6b6f74f8cd4a2b2bc67d2f", size = 104787, upload-time = "2026-02-24T09:53:41.404Z" },
 ]
 
 [[package]]
@@ -1318,11 +1563,11 @@ wheels = [
 
 [[package]]
 name = "filelock"
-version = "3.20.0"
+version = "3.24.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/58/46/0028a82567109b5ef6e4d2a1f04a583fb513e6cf9527fcdd09afd817deeb/filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4", size = 18922, upload-time = "2025-10-08T18:03:50.056Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/73/92/a8e2479937ff39185d20dd6a851c1a63e55849e447a55e798cc2e1f49c65/filelock-3.24.3.tar.gz", hash = "sha256:011a5644dc937c22699943ebbfc46e969cdde3e171470a6e40b9533e5a72affa", size = 37935, upload-time = "2026-02-19T00:48:20.543Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
 ]
 
 [[package]]
@@ -1334,6 +1579,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
 ]
 
+[[package]]
+name = "fla-core"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "einops" },
+    { name = "torch", marker = "sys_platform == 'never'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f1/de/0d6bd5664ba2e711cabdde11ccb41ddcdd866c531e40900af3601bd7b8c6/fla_core-0.4.1.tar.gz", hash = "sha256:38ab28966eeadc2141b29e87c2bf72a8a4851e00af9d25bbbc3596b1fb53450d", size = 319608, upload-time = "2025-12-24T18:07:37.669Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f6/43/945ef69eb48a14c30fd7323d3e0b560c821ae71e6d3ef979e06a901bc3b9/fla_core-0.4.1-py3-none-any.whl", hash = "sha256:93c6afe4c80fc7bc705fa8aeea6a46d2cf2d77383f9619a41863c7114c801bab", size = 437282, upload-time = "2025-12-24T18:07:34.41Z" },
+]
+
 [[package]]
 name = "flake8"
 version = "7.1.0"
@@ -1348,6 +1606,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/43/d5147aadaa52558e94e024811f2f9543b4bd7203b3a9659eeb5dff9c61b3/flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a", size = 57569, upload-time = "2024-06-15T21:37:05.342Z" },
 ]
 
+[[package]]
+name = "flash-linear-attention"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "fla-core" },
+    { name = "transformers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/46/83/7d8ec7ffb5229080b1c9b772338ff588cbd63282ac355ede2a12a6e174a8/flash_linear_attention-0.4.1.tar.gz", hash = "sha256:127ee7273ed15ac17f72bcf4c75e1051719d8fbe0a2d1d047e59406f36d81ee2", size = 158280, upload-time = "2025-12-24T18:07:38.812Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/63/d5/6327559a9d5b9243b10c3984f1bcef256ed2ad06d105a3bb8f7b2979659c/flash_linear_attention-0.4.1-py3-none-any.whl", hash = "sha256:d18bdfe9d1f4b424676444eac9d50fb8433b70e5d4e0e0878b20bcbcdbea57ce", size = 287415, upload-time = "2025-12-24T18:07:35.815Z" },
+]
+
 [[package]]
 name = "flash-mla"
 version = "1.0.0+9edee0c"
@@ -1363,7 +1634,7 @@ dependencies = [
     { name = "einops" },
     { name = "ninja" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-cudnn-frontend" },
     { name = "nvidia-cutlass-dsl" },
     { name = "nvidia-ml-py" },
@@ -1380,7 +1651,7 @@ wheels = [
 
 [[package]]
 name = "flask"
-version = "3.1.2"
+version = "3.1.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "blinker" },
@@ -1390,9 +1661,9 @@ dependencies = [
     { name = "markupsafe" },
     { name = "werkzeug" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/dc/6d/cfe3c0fcc5e477df242b98bfe186a4c34357b4847e87ecaef04507332dab/flask-3.1.2.tar.gz", hash = "sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87", size = 720160, upload-time = "2025-08-19T21:03:21.205Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ec/f9/7f9263c5695f4bd0023734af91bedb2ff8209e8de6ead162f35d8dc762fd/flask-3.1.2-py3-none-any.whl", hash = "sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c", size = 103308, upload-time = "2025-08-19T21:03:19.499Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
 ]
 
 [[package]]
@@ -1535,6 +1806,26 @@ wheels = [
 name = "fsspec"
 version = "2025.10.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/24/7f/2747c0d332b9acfa75dc84447a066fdf812b5a6b8d30472b74d309bfe8cb/fsspec-2025.10.0.tar.gz", hash = "sha256:b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59", size = 309285, upload-time = "2025-10-30T14:58:44.036Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d", size = 200966, upload-time = "2025-10-30T14:58:42.53Z" },
@@ -1542,7 +1833,24 @@ wheels = [
 
 [package.optional-dependencies]
 http = [
-    { name = "aiohttp" },
+    { name = "aiohttp", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+]
+
+[[package]]
+name = "fsspec"
+version = "2026.2.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" },
+]
+
+[package.optional-dependencies]
+http = [
+    { name = "aiohttp", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
 ]
 
 [[package]]
@@ -1559,14 +1867,56 @@ wheels = [
 
 [[package]]
 name = "gitpython"
-version = "3.1.45"
+version = "3.1.46"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "gitdb" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/df/b5/59d16470a1f0dfe8c793f9ef56fd3826093fc52b3bd96d6b9d6c26c7e27b/gitpython-3.1.46.tar.gz", hash = "sha256:400124c7d0ef4ea03f7310ac2fbf7151e09ff97f2a3288d64a440c584a29c37f", size = 215371, upload-time = "2026-01-01T15:37:32.073Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6a/09/e21df6aef1e1ffc0c816f0522ddc3f6dcded766c3261813131c78a704470/gitpython-3.1.46-py3-none-any.whl", hash = "sha256:79812ed143d9d25b6d176a10bb511de0f9c67b1fa641d82097b0ab90398a2058", size = 208620, upload-time = "2026-01-01T15:37:30.574Z" },
+]
+
+[[package]]
+name = "google-api-core"
+version = "2.30.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-auth" },
+    { name = "googleapis-common-protos" },
+    { name = "proto-plus" },
+    { name = "protobuf" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/22/98/586ec94553b569080caef635f98a3723db36a38eac0e3d7eb3ea9d2e4b9a/google_api_core-2.30.0.tar.gz", hash = "sha256:02edfa9fab31e17fc0befb5f161b3bf93c9096d99aed584625f38065c511ad9b", size = 176959, upload-time = "2026-02-18T20:28:11.926Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/45/27/09c33d67f7e0dcf06d7ac17d196594e66989299374bfb0d4331d1038e76b/google_api_core-2.30.0-py3-none-any.whl", hash = "sha256:80be49ee937ff9aba0fd79a6eddfde35fe658b9953ab9b79c57dd7061afa8df5", size = 173288, upload-time = "2026-02-18T20:28:10.367Z" },
+]
+
+[[package]]
+name = "google-auth"
+version = "2.48.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cryptography" },
+    { name = "pyasn1-modules" },
+    { name = "rsa" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0c/41/242044323fbd746615884b1c16639749e73665b718209946ebad7ba8a813/google_auth-2.48.0.tar.gz", hash = "sha256:4f7e706b0cd3208a3d940a19a822c37a476ddba5450156c3e6624a71f7c841ce", size = 326522, upload-time = "2026-01-26T19:22:47.157Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/1d/d6466de3a5249d35e832a52834115ca9d1d0de6abc22065f049707516d47/google_auth-2.48.0-py3-none-any.whl", hash = "sha256:2e2a537873d449434252a9632c28bfc268b0adb1e53f9fb62afc5333a975903f", size = 236499, upload-time = "2026-01-26T19:22:45.099Z" },
+]
+
+[[package]]
+name = "googleapis-common-protos"
+version = "1.72.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e5/7b/adfd75544c415c487b33061fe7ae526165241c1ea133f9a9125a56b39fd8/googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5", size = 147433, upload-time = "2025-11-06T18:29:24.087Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" },
 ]
 
 [[package]]
@@ -1580,63 +1930,63 @@ wheels = [
 
 [[package]]
 name = "grpcio"
-version = "1.76.0"
+version = "1.78.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/88/17/ff4795dc9a34b6aee6ec379f1b66438a3789cd1315aac0cbab60d92f74b3/grpcio-1.76.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc", size = 5840037, upload-time = "2025-10-21T16:20:25.069Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/ff/35f9b96e3fa2f12e1dcd58a4513a2e2294a001d64dec81677361b7040c9a/grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde", size = 11836482, upload-time = "2025-10-21T16:20:30.113Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/1c/8374990f9545e99462caacea5413ed783014b3b66ace49e35c533f07507b/grpcio-1.76.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:035d90bc79eaa4bed83f524331d55e35820725c9fbb00ffa1904d5550ed7ede3", size = 6407178, upload-time = "2025-10-21T16:20:32.733Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/77/36fd7d7c75a6c12542c90a6d647a27935a1ecaad03e0ffdb7c42db6b04d2/grpcio-1.76.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4215d3a102bd95e2e11b5395c78562967959824156af11fa93d18fdd18050990", size = 7075684, upload-time = "2025-10-21T16:20:35.435Z" },
-    { url = "https://files.pythonhosted.org/packages/38/f7/e3cdb252492278e004722306c5a8935eae91e64ea11f0af3437a7de2e2b7/grpcio-1.76.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49ce47231818806067aea3324d4bf13825b658ad662d3b25fada0bdad9b8a6af", size = 6611133, upload-time = "2025-10-21T16:20:37.541Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/20/340db7af162ccd20a0893b5f3c4a5d676af7b71105517e62279b5b61d95a/grpcio-1.76.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8cc3309d8e08fd79089e13ed4819d0af72aa935dd8f435a195fd152796752ff2", size = 7195507, upload-time = "2025-10-21T16:20:39.643Z" },
-    { url = "https://files.pythonhosted.org/packages/10/f0/b2160addc1487bd8fa4810857a27132fb4ce35c1b330c2f3ac45d697b106/grpcio-1.76.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:971fd5a1d6e62e00d945423a567e42eb1fa678ba89072832185ca836a94daaa6", size = 8160651, upload-time = "2025-10-21T16:20:42.492Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/2c/ac6f98aa113c6ef111b3f347854e99ebb7fb9d8f7bb3af1491d438f62af4/grpcio-1.76.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d9adda641db7207e800a7f089068f6f645959f2df27e870ee81d44701dd9db3", size = 7620568, upload-time = "2025-10-21T16:20:45.995Z" },
-    { url = "https://files.pythonhosted.org/packages/90/84/7852f7e087285e3ac17a2703bc4129fafee52d77c6c82af97d905566857e/grpcio-1.76.0-cp310-cp310-win32.whl", hash = "sha256:063065249d9e7e0782d03d2bca50787f53bd0fb89a67de9a7b521c4a01f1989b", size = 3998879, upload-time = "2025-10-21T16:20:48.592Z" },
-    { url = "https://files.pythonhosted.org/packages/10/30/d3d2adcbb6dd3ff59d6ac3df6ef830e02b437fb5c90990429fd180e52f30/grpcio-1.76.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6ae758eb08088d36812dd5d9af7a9859c05b1e0f714470ea243694b49278e7b", size = 4706892, upload-time = "2025-10-21T16:20:50.697Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/00/8163a1beeb6971f66b4bbe6ac9457b97948beba8dd2fc8e1281dce7f79ec/grpcio-1.76.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a", size = 5843567, upload-time = "2025-10-21T16:20:52.829Z" },
-    { url = "https://files.pythonhosted.org/packages/10/c1/934202f5cf335e6d852530ce14ddb0fef21be612ba9ecbbcbd4d748ca32d/grpcio-1.76.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c", size = 11848017, upload-time = "2025-10-21T16:20:56.705Z" },
-    { url = "https://files.pythonhosted.org/packages/11/0b/8dec16b1863d74af6eb3543928600ec2195af49ca58b16334972f6775663/grpcio-1.76.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465", size = 6412027, upload-time = "2025-10-21T16:20:59.3Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/64/7b9e6e7ab910bea9d46f2c090380bab274a0b91fb0a2fe9b0cd399fffa12/grpcio-1.76.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48", size = 7075913, upload-time = "2025-10-21T16:21:01.645Z" },
-    { url = "https://files.pythonhosted.org/packages/68/86/093c46e9546073cefa789bd76d44c5cb2abc824ca62af0c18be590ff13ba/grpcio-1.76.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da", size = 6615417, upload-time = "2025-10-21T16:21:03.844Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/b6/5709a3a68500a9c03da6fb71740dcdd5ef245e39266461a03f31a57036d8/grpcio-1.76.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397", size = 7199683, upload-time = "2025-10-21T16:21:06.195Z" },
-    { url = "https://files.pythonhosted.org/packages/91/d3/4b1f2bf16ed52ce0b508161df3a2d186e4935379a159a834cb4a7d687429/grpcio-1.76.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749", size = 8163109, upload-time = "2025-10-21T16:21:08.498Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/61/d9043f95f5f4cf085ac5dd6137b469d41befb04bd80280952ffa2a4c3f12/grpcio-1.76.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00", size = 7626676, upload-time = "2025-10-21T16:21:10.693Z" },
-    { url = "https://files.pythonhosted.org/packages/36/95/fd9a5152ca02d8881e4dd419cdd790e11805979f499a2e5b96488b85cf27/grpcio-1.76.0-cp311-cp311-win32.whl", hash = "sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054", size = 3997688, upload-time = "2025-10-21T16:21:12.746Z" },
-    { url = "https://files.pythonhosted.org/packages/60/9c/5c359c8d4c9176cfa3c61ecd4efe5affe1f38d9bae81e81ac7186b4c9cc8/grpcio-1.76.0-cp311-cp311-win_amd64.whl", hash = "sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d", size = 4709315, upload-time = "2025-10-21T16:21:15.26Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/05/8e29121994b8d959ffa0afd28996d452f291b48cfc0875619de0bde2c50c/grpcio-1.76.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8", size = 5799718, upload-time = "2025-10-21T16:21:17.939Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/75/11d0e66b3cdf998c996489581bdad8900db79ebd83513e45c19548f1cba4/grpcio-1.76.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280", size = 11825627, upload-time = "2025-10-21T16:21:20.466Z" },
-    { url = "https://files.pythonhosted.org/packages/28/50/2f0aa0498bc188048f5d9504dcc5c2c24f2eb1a9337cd0fa09a61a2e75f0/grpcio-1.76.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4", size = 6359167, upload-time = "2025-10-21T16:21:23.122Z" },
-    { url = "https://files.pythonhosted.org/packages/66/e5/bbf0bb97d29ede1d59d6588af40018cfc345b17ce979b7b45424628dc8bb/grpcio-1.76.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11", size = 7044267, upload-time = "2025-10-21T16:21:25.995Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/86/f6ec2164f743d9609691115ae8ece098c76b894ebe4f7c94a655c6b03e98/grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6", size = 6573963, upload-time = "2025-10-21T16:21:28.631Z" },
-    { url = "https://files.pythonhosted.org/packages/60/bc/8d9d0d8505feccfdf38a766d262c71e73639c165b311c9457208b56d92ae/grpcio-1.76.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8", size = 7164484, upload-time = "2025-10-21T16:21:30.837Z" },
-    { url = "https://files.pythonhosted.org/packages/67/e6/5d6c2fc10b95edf6df9b8f19cf10a34263b7fd48493936fffd5085521292/grpcio-1.76.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980", size = 8127777, upload-time = "2025-10-21T16:21:33.577Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/c8/dce8ff21c86abe025efe304d9e31fdb0deaaa3b502b6a78141080f206da0/grpcio-1.76.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882", size = 7594014, upload-time = "2025-10-21T16:21:41.882Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/42/ad28191ebf983a5d0ecef90bab66baa5a6b18f2bfdef9d0a63b1973d9f75/grpcio-1.76.0-cp312-cp312-win32.whl", hash = "sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958", size = 3984750, upload-time = "2025-10-21T16:21:44.006Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/00/7bd478cbb851c04a48baccaa49b75abaa8e4122f7d86da797500cccdd771/grpcio-1.76.0-cp312-cp312-win_amd64.whl", hash = "sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347", size = 4704003, upload-time = "2025-10-21T16:21:46.244Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/ed/71467ab770effc9e8cef5f2e7388beb2be26ed642d567697bb103a790c72/grpcio-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2", size = 5807716, upload-time = "2025-10-21T16:21:48.475Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/85/c6ed56f9817fab03fa8a111ca91469941fb514e3e3ce6d793cb8f1e1347b/grpcio-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468", size = 11821522, upload-time = "2025-10-21T16:21:51.142Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/31/2b8a235ab40c39cbc141ef647f8a6eb7b0028f023015a4842933bc0d6831/grpcio-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3", size = 6362558, upload-time = "2025-10-21T16:21:54.213Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/64/9784eab483358e08847498ee56faf8ff6ea8e0a4592568d9f68edc97e9e9/grpcio-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb", size = 7049990, upload-time = "2025-10-21T16:21:56.476Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/94/8c12319a6369434e7a184b987e8e9f3b49a114c489b8315f029e24de4837/grpcio-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae", size = 6575387, upload-time = "2025-10-21T16:21:59.051Z" },
-    { url = "https://files.pythonhosted.org/packages/15/0f/f12c32b03f731f4a6242f771f63039df182c8b8e2cf8075b245b409259d4/grpcio-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77", size = 7166668, upload-time = "2025-10-21T16:22:02.049Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/2d/3ec9ce0c2b1d92dd59d1c3264aaec9f0f7c817d6e8ac683b97198a36ed5a/grpcio-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03", size = 8124928, upload-time = "2025-10-21T16:22:04.984Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/74/fd3317be5672f4856bcdd1a9e7b5e17554692d3db9a3b273879dc02d657d/grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42", size = 7589983, upload-time = "2025-10-21T16:22:07.881Z" },
-    { url = "https://files.pythonhosted.org/packages/45/bb/ca038cf420f405971f19821c8c15bcbc875505f6ffadafe9ffd77871dc4c/grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f", size = 3984727, upload-time = "2025-10-21T16:22:10.032Z" },
-    { url = "https://files.pythonhosted.org/packages/41/80/84087dc56437ced7cdd4b13d7875e7439a52a261e3ab4e06488ba6173b0a/grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8", size = 4702799, upload-time = "2025-10-21T16:22:12.709Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/46/39adac80de49d678e6e073b70204091e76631e03e94928b9ea4ecf0f6e0e/grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62", size = 5808417, upload-time = "2025-10-21T16:22:15.02Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/f5/a4531f7fb8b4e2a60b94e39d5d924469b7a6988176b3422487be61fe2998/grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd", size = 11828219, upload-time = "2025-10-21T16:22:17.954Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/1c/de55d868ed7a8bd6acc6b1d6ddc4aa36d07a9f31d33c912c804adb1b971b/grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc", size = 6367826, upload-time = "2025-10-21T16:22:20.721Z" },
-    { url = "https://files.pythonhosted.org/packages/59/64/99e44c02b5adb0ad13ab3adc89cb33cb54bfa90c74770f2607eea629b86f/grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a", size = 7049550, upload-time = "2025-10-21T16:22:23.637Z" },
-    { url = "https://files.pythonhosted.org/packages/43/28/40a5be3f9a86949b83e7d6a2ad6011d993cbe9b6bd27bea881f61c7788b6/grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba", size = 6575564, upload-time = "2025-10-21T16:22:26.016Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/a9/1be18e6055b64467440208a8559afac243c66a8b904213af6f392dc2212f/grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09", size = 7176236, upload-time = "2025-10-21T16:22:28.362Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/55/dba05d3fcc151ce6e81327541d2cc8394f442f6b350fead67401661bf041/grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc", size = 8125795, upload-time = "2025-10-21T16:22:31.075Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/45/122df922d05655f63930cf42c9e3f72ba20aadb26c100ee105cad4ce4257/grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc", size = 7592214, upload-time = "2025-10-21T16:22:33.831Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/6e/0b899b7f6b66e5af39e377055fb4a6675c9ee28431df5708139df2e93233/grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e", size = 4062961, upload-time = "2025-10-21T16:22:36.468Z" },
-    { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/1f/de/de568532d9907552700f80dcec38219d8d298ad9e71f5e0a095abaf2761e/grpcio-1.78.1.tar.gz", hash = "sha256:27c625532d33ace45d57e775edf1982e183ff8641c72e4e91ef7ba667a149d72", size = 12835760, upload-time = "2026-02-20T01:16:10.869Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/30/0534b643dafd54824769d6260b89c71d518e4ef8b5ad16b84d1ae9272978/grpcio-1.78.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:4393bef64cf26dc07cd6f18eaa5170ae4eebaafd4418e7e3a59ca9526a6fa30b", size = 5947661, upload-time = "2026-02-20T01:12:34.922Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/f8/f678566655ab822da0f713789555e7eddca7ef93da99f480c63de3aa94b4/grpcio-1.78.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:917047c19cd120b40aab9a4b8a22e9ce3562f4a1343c0d62b3cd2d5199da3d67", size = 11819948, upload-time = "2026-02-20T01:12:39.709Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0b/a4b4210d946055f4e5a8430f2802202ae8f831b4b00d36d55055c5cf4b6a/grpcio-1.78.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ff7de398bb3528d44d17e6913a7cfe639e3b15c65595a71155322df16978c5e1", size = 6519850, upload-time = "2026-02-20T01:12:42.715Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/d9/a1e657a73000a71fa75ec7140ff3a8dc32eb3427560620e477c6a2735527/grpcio-1.78.1-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:15f6e636d1152667ddb4022b37534c161c8477274edb26a0b65b215dd0a81e97", size = 7198654, upload-time = "2026-02-20T01:12:46.164Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/28/a61c5bdf53c1638e657bb5eebb93c789837820e1fdb965145f05eccc2994/grpcio-1.78.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:27b5cb669603efb7883a882275db88b6b5d6b6c9f0267d5846ba8699b7ace338", size = 6727238, upload-time = "2026-02-20T01:12:48.472Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/3e/aa143d0687801986a29d85788c96089449f36651cd4e2a493737ae0c5be9/grpcio-1.78.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:86edb3966778fa05bfdb333688fde5dc9079f9e2a9aa6a5c42e9564b7656ba04", size = 7300960, upload-time = "2026-02-20T01:12:51.139Z" },
+    { url = "https://files.pythonhosted.org/packages/30/d3/53e0f26b46417f28d14b5951fc6a1eff79c08c8a339e967c0a19ec7cf9e9/grpcio-1.78.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:849cc62eb989bc3be5629d4f3acef79be0d0ff15622201ed251a86d17fef6494", size = 8285274, upload-time = "2026-02-20T01:12:53.315Z" },
+    { url = "https://files.pythonhosted.org/packages/29/d0/e0e9fd477ce86c07ed1ed1d5c34790f050b6d58bfde77b02b36e23f8b235/grpcio-1.78.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9a00992d6fafe19d648b9ccb4952200c50d8e36d0cce8cf026c56ed3fdc28465", size = 7726620, upload-time = "2026-02-20T01:12:56.498Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/b5/e138a9f7810d196081b2e047c378ca12358c5906d79c42ddec41bb43d528/grpcio-1.78.1-cp310-cp310-win32.whl", hash = "sha256:f8759a1347f3b4f03d9a9d4ce8f9f31ad5e5d0144ba06ccfb1ffaeb0ba4c1e20", size = 4076778, upload-time = "2026-02-20T01:12:59.098Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/95/9b02316b85731df0943a635ca6d02f155f673c4f17e60be0c4892a6eb051/grpcio-1.78.1-cp310-cp310-win_amd64.whl", hash = "sha256:e840405a3f1249509892be2399f668c59b9d492068a2cf326d661a8c79e5e747", size = 4798925, upload-time = "2026-02-20T01:13:03.186Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/1e/ad774af3b2c84f49c6d8c4a7bea4c40f02268ea8380630c28777edda463b/grpcio-1.78.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:3a8aa79bc6e004394c0abefd4b034c14affda7b66480085d87f5fbadf43b593b", size = 5951132, upload-time = "2026-02-20T01:13:05.942Z" },
+    { url = "https://files.pythonhosted.org/packages/48/9d/ad3c284bedd88c545e20675d98ae904114d8517a71b0efc0901e9166628f/grpcio-1.78.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:8e1fcb419da5811deb47b7749b8049f7c62b993ba17822e3c7231e3e0ba65b79", size = 11831052, upload-time = "2026-02-20T01:13:09.604Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/08/20d12865e47242d03c3ade9bb2127f5b4aded964f373284cfb357d47c5ac/grpcio-1.78.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b071dccac245c32cd6b1dd96b722283b855881ca0bf1c685cf843185f5d5d51e", size = 6524749, upload-time = "2026-02-20T01:13:21.692Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/53/a8b72f52b253ec0cfdf88a13e9236a9d717c332b8aa5f0ba9e4699e94b55/grpcio-1.78.1-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:d6fb962947e4fe321eeef3be1ba5ba49d32dea9233c825fcbade8e858c14aaf4", size = 7198995, upload-time = "2026-02-20T01:13:24.275Z" },
+    { url = "https://files.pythonhosted.org/packages/13/3c/ac769c8ded1bcb26bb119fb472d3374b481b3cf059a0875db9fc77139c17/grpcio-1.78.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6afd191551fd72e632367dfb083e33cd185bf9ead565f2476bba8ab864ae496", size = 6730770, upload-time = "2026-02-20T01:13:26.522Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/c3/2275ef4cc5b942314321f77d66179be4097ff484e82ca34bf7baa5b1ddbc/grpcio-1.78.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b2acd83186305c0802dbc4d81ed0ec2f3e8658d7fde97cfba2f78d7372f05b89", size = 7305036, upload-time = "2026-02-20T01:13:30.923Z" },
+    { url = "https://files.pythonhosted.org/packages/91/cb/3c2aa99e12cbbfc72c2ed8aa328e6041709d607d668860380e6cd00ba17d/grpcio-1.78.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5380268ab8513445740f1f77bd966d13043d07e2793487e61fd5b5d0935071eb", size = 8288641, upload-time = "2026-02-20T01:13:39.42Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/b2/21b89f492260ac645775d9973752ca873acfd0609d6998e9d3065a21ea2f/grpcio-1.78.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:389b77484959bdaad6a2b7dda44d7d1228381dd669a03f5660392aa0e9385b22", size = 7730967, upload-time = "2026-02-20T01:13:41.697Z" },
+    { url = "https://files.pythonhosted.org/packages/24/03/6b89eddf87fdffb8fa9d37375d44d3a798f4b8116ac363a5f7ca84caa327/grpcio-1.78.1-cp311-cp311-win32.whl", hash = "sha256:9dee66d142f4a8cca36b5b98a38f006419138c3c89e72071747f8fca415a6d8f", size = 4076680, upload-time = "2026-02-20T01:13:43.781Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/a8/204460b1bc1dff9862e98f56a2d14be3c4171f929f8eaf8c4517174b4270/grpcio-1.78.1-cp311-cp311-win_amd64.whl", hash = "sha256:43b930cf4f9c4a2262bb3e5d5bc40df426a72538b4f98e46f158b7eb112d2d70", size = 4801074, upload-time = "2026-02-20T01:13:46.315Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/ed/d2eb9d27fded1a76b2a80eb9aa8b12101da7e41ce2bac0ad3651e88a14ae/grpcio-1.78.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:41e4605c923e0e9a84a2718e4948a53a530172bfaf1a6d1ded16ef9c5849fca2", size = 5913389, upload-time = "2026-02-20T01:13:49.005Z" },
+    { url = "https://files.pythonhosted.org/packages/69/1b/40034e9ab010eeb3fa41ec61d8398c6dbf7062f3872c866b8f72700e2522/grpcio-1.78.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:39da1680d260c0c619c3b5fa2dc47480ca24d5704c7a548098bca7de7f5dd17f", size = 11811839, upload-time = "2026-02-20T01:13:51.839Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/69/fe16ef2979ea62b8aceb3a3f1e7a8bbb8b717ae2a44b5899d5d426073273/grpcio-1.78.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b5d5881d72a09b8336a8f874784a8eeffacde44a7bc1a148bce5a0243a265ef0", size = 6475805, upload-time = "2026-02-20T01:13:55.423Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/1e/069e0a9062167db18446917d7c00ae2e91029f96078a072bedc30aaaa8c3/grpcio-1.78.1-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:888ceb7821acd925b1c90f0cdceaed1386e69cfe25e496e0771f6c35a156132f", size = 7169955, upload-time = "2026-02-20T01:13:59.553Z" },
+    { url = "https://files.pythonhosted.org/packages/38/fc/44a57e2bb4a755e309ee4e9ed2b85c9af93450b6d3118de7e69410ee05fa/grpcio-1.78.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8942bdfc143b467c264b048862090c4ba9a0223c52ae28c9ae97754361372e42", size = 6690767, upload-time = "2026-02-20T01:14:02.31Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/87/21e16345d4c75046d453916166bc72a3309a382c8e97381ec4b8c1a54729/grpcio-1.78.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:716a544969660ed609164aff27b2effd3ff84e54ac81aa4ce77b1607ca917d22", size = 7266846, upload-time = "2026-02-20T01:14:12.974Z" },
+    { url = "https://files.pythonhosted.org/packages/11/df/d6261983f9ca9ef4d69893765007a9a3211b91d9faf85a2591063df381c7/grpcio-1.78.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4d50329b081c223d444751076bb5b389d4f06c2b32d51b31a1e98172e6cecfb9", size = 8253522, upload-time = "2026-02-20T01:14:17.407Z" },
+    { url = "https://files.pythonhosted.org/packages/de/7c/4f96a0ff113c5d853a27084d7590cd53fdb05169b596ea9f5f27f17e021e/grpcio-1.78.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7e836778c13ff70edada16567e8da0c431e8818eaae85b80d11c1ba5782eccbb", size = 7698070, upload-time = "2026-02-20T01:14:20.032Z" },
+    { url = "https://files.pythonhosted.org/packages/17/3c/7b55c0b5af88fbeb3d0c13e25492d3ace41ac9dbd0f5f8f6c0fb613b6706/grpcio-1.78.1-cp312-cp312-win32.whl", hash = "sha256:07eb016ea7444a22bef465cce045512756956433f54450aeaa0b443b8563b9ca", size = 4066474, upload-time = "2026-02-20T01:14:22.602Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/17/388c12d298901b0acf10b612b650692bfed60e541672b1d8965acbf2d722/grpcio-1.78.1-cp312-cp312-win_amd64.whl", hash = "sha256:02b82dcd2fa580f5e82b4cf62ecde1b3c7cc9ba27b946421200706a6e5acaf85", size = 4797537, upload-time = "2026-02-20T01:14:25.444Z" },
+    { url = "https://files.pythonhosted.org/packages/df/72/754754639cfd16ad04619e1435a518124b2d858e5752225376f9285d4c51/grpcio-1.78.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:2b7ad2981550ce999e25ce3f10c8863f718a352a2fd655068d29ea3fd37b4907", size = 5919437, upload-time = "2026-02-20T01:14:29.403Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/84/6267d1266f8bc335d3a8b7ccf981be7de41e3ed8bd3a49e57e588212b437/grpcio-1.78.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:409bfe22220889b9906739910a0ee4c197a967c21b8dd14b4b06dd477f8819ce", size = 11803701, upload-time = "2026-02-20T01:14:32.624Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/56/c9098e8b920a54261cd605bbb040de0cde1ca4406102db0aa2c0b11d1fb4/grpcio-1.78.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:34b6cb16f4b67eeb5206250dc5b4d5e8e3db939535e58efc330e4c61341554bd", size = 6479416, upload-time = "2026-02-20T01:14:35.926Z" },
+    { url = "https://files.pythonhosted.org/packages/86/cf/5d52024371ee62658b7ed72480200524087528844ec1b65265bbcd31c974/grpcio-1.78.1-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:39d21fd30d38a5afb93f0e2e71e2ec2bd894605fb75d41d5a40060c2f98f8d11", size = 7174087, upload-time = "2026-02-20T01:14:39.98Z" },
+    { url = "https://files.pythonhosted.org/packages/31/e6/5e59551afad4279e27335a6d60813b8aa3ae7b14fb62cea1d329a459c118/grpcio-1.78.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:09fbd4bcaadb6d8604ed1504b0bdf7ac18e48467e83a9d930a70a7fefa27e862", size = 6692881, upload-time = "2026-02-20T01:14:42.466Z" },
+    { url = "https://files.pythonhosted.org/packages/db/8f/940062de2d14013c02f51b079eb717964d67d46f5d44f22038975c9d9576/grpcio-1.78.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:db681513a1bdd879c0b24a5a6a70398da5eaaba0e077a306410dc6008426847a", size = 7269092, upload-time = "2026-02-20T01:14:45.826Z" },
+    { url = "https://files.pythonhosted.org/packages/09/87/9db657a4b5f3b15560ec591db950bc75a1a2f9e07832578d7e2b23d1a7bd/grpcio-1.78.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f81816faa426da461e9a597a178832a351d6f1078102590a4b32c77d251b71eb", size = 8252037, upload-time = "2026-02-20T01:14:48.57Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/37/b980e0265479ec65e26b6e300a39ceac33ecb3f762c2861d4bac990317cf/grpcio-1.78.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffbb760df1cd49e0989f9826b2fd48930700db6846ac171eaff404f3cfbe5c28", size = 7695243, upload-time = "2026-02-20T01:14:51.376Z" },
+    { url = "https://files.pythonhosted.org/packages/98/46/5fc42c100ab702fa1ea41a75c890c563c3f96432b4a287d5a6369654f323/grpcio-1.78.1-cp313-cp313-win32.whl", hash = "sha256:1a56bf3ee99af5cf32d469de91bf5de79bdac2e18082b495fc1063ea33f4f2d0", size = 4065329, upload-time = "2026-02-20T01:14:53.952Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/da/806d60bb6611dfc16cf463d982bd92bd8b6bd5f87dfac66b0a44dfe20995/grpcio-1.78.1-cp313-cp313-win_amd64.whl", hash = "sha256:8991c2add0d8505178ff6c3ae54bd9386279e712be82fa3733c54067aae9eda1", size = 4797637, upload-time = "2026-02-20T01:14:57.276Z" },
+    { url = "https://files.pythonhosted.org/packages/96/3a/2d2ec4d2ce2eb9d6a2b862630a0d9d4ff4239ecf1474ecff21442a78612a/grpcio-1.78.1-cp314-cp314-linux_armv7l.whl", hash = "sha256:d101fe49b1e0fb4a7aa36ed0c3821a0f67a5956ef572745452d2cd790d723a3f", size = 5920256, upload-time = "2026-02-20T01:15:00.23Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/92/dccb7d087a1220ed358753945230c1ddeeed13684b954cb09db6758f1271/grpcio-1.78.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:5ce1855e8cfc217cdf6bcfe0cf046d7cf81ddcc3e6894d6cfd075f87a2d8f460", size = 11813749, upload-time = "2026-02-20T01:15:03.312Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/47/c20e87f87986da9998f30f14776ce27e61f02482a3a030ffe265089342c6/grpcio-1.78.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd26048d066b51f39fe9206e2bcc2cea869a5e5b2d13c8d523f4179193047ebd", size = 6488739, upload-time = "2026-02-20T01:15:14.349Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/c2/088bd96e255133d7d87c3eed0d598350d16cde1041bdbe2bb065967aaf91/grpcio-1.78.1-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4b8d7fda614cf2af0f73bbb042f3b7fee2ecd4aea69ec98dbd903590a1083529", size = 7173096, upload-time = "2026-02-20T01:15:17.687Z" },
+    { url = "https://files.pythonhosted.org/packages/60/ce/168db121073a03355ce3552b3b1f790b5ded62deffd7d98c5f642b9d3d81/grpcio-1.78.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:656a5bd142caeb8b1efe1fe0b4434ecc7781f44c97cfc7927f6608627cf178c0", size = 6693861, upload-time = "2026-02-20T01:15:20.911Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/d0/90b30ec2d9425215dd56922d85a90babbe6ee7e8256ba77d866b9c0d3aba/grpcio-1.78.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:99550e344482e3c21950c034f74668fccf8a546d50c1ecb4f717543bbdc071ba", size = 7278083, upload-time = "2026-02-20T01:15:23.698Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/fb/73f9ba0b082bcd385d46205095fd9c917754685885b28fce3741e9f54529/grpcio-1.78.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8f27683ca68359bd3f0eb4925824d71e538f84338b3ae337ead2ae43977d7541", size = 8252546, upload-time = "2026-02-20T01:15:26.517Z" },
+    { url = "https://files.pythonhosted.org/packages/85/c5/6a89ea3cb5db6c3d9ed029b0396c49f64328c0cf5d2630ffeed25711920a/grpcio-1.78.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a40515b69ac50792f9b8ead260f194ba2bb3285375b6c40c7ff938f14c3df17d", size = 7696289, upload-time = "2026-02-20T01:15:29.718Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/05/63a7495048499ef437b4933d32e59b7f737bd5368ad6fb2479e2bd83bf2c/grpcio-1.78.1-cp314-cp314-win32.whl", hash = "sha256:2c473b54ef1618f4fb85e82ff4994de18143b74efc088b91b5a935a3a45042ba", size = 4142186, upload-time = "2026-02-20T01:15:32.786Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/ce/adfe7e5f701d503be7778291757452e3fab6b19acf51917c79f5d1cf7f8a/grpcio-1.78.1-cp314-cp314-win_amd64.whl", hash = "sha256:e2a6b33d1050dce2c6f563c5caf7f7cbeebf7fba8cde37ffe3803d50526900d1", size = 4932000, upload-time = "2026-02-20T01:15:36.127Z" },
 ]
 
 [[package]]
@@ -1663,7 +2013,7 @@ wheels = [
 
 [[package]]
 name = "hatchling"
-version = "1.28.0"
+version = "1.29.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "packaging" },
@@ -1672,38 +2022,38 @@ dependencies = [
     { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "trove-classifiers" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0b/8e/e480359492affde4119a131da729dd26da742c2c9b604dff74836e47eef9/hatchling-1.28.0.tar.gz", hash = "sha256:4d50b02aece6892b8cd0b3ce6c82cb218594d3ec5836dbde75bf41a21ab004c8", size = 55365, upload-time = "2025-11-27T00:31:13.766Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/cf/9c/b4cfe330cd4f49cff17fd771154730555fa4123beb7f292cf0098b4e6c20/hatchling-1.29.0.tar.gz", hash = "sha256:793c31816d952cee405b83488ce001c719f325d9cda69f1fc4cd750527640ea6", size = 55656, upload-time = "2026-02-23T19:42:06.539Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/0d/a5/48cb7efb8b4718b1a4c0c331e3364a3a33f614ff0d6afd2b93ee883d3c47/hatchling-1.28.0-py3-none-any.whl", hash = "sha256:dc48722b68b3f4bbfa3ff618ca07cdea6750e7d03481289ffa8be1521d18a961", size = 76075, upload-time = "2025-11-27T00:31:12.544Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/8a/44032265776062a89171285ede55a0bdaadc8ac00f27f0512a71a9e3e1c8/hatchling-1.29.0-py3-none-any.whl", hash = "sha256:50af9343281f34785fab12da82e445ed987a6efb34fd8c2fc0f6e6630dbcc1b0", size = 76356, upload-time = "2026-02-23T19:42:05.197Z" },
 ]
 
 [[package]]
 name = "hf-xet"
-version = "1.2.0"
+version = "1.3.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload-time = "2025-10-24T19:04:09.586Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload-time = "2025-10-24T19:04:00.314Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload-time = "2025-10-24T19:03:58.111Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" },
-    { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" },
-    { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" },
-    { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" },
-    { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" },
-    { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" },
-    { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/4f/3a/9aa61729228fb03e946409c51963f0cd2fd7c109f4ab93edc5f04a10be86/hf_xet-1.3.0.tar.gz", hash = "sha256:9c154ad63e17aca970987b2cf17dbd8a0c09bb18aeb246f637647a8058e4522b", size = 641390, upload-time = "2026-02-24T00:16:19.935Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/18/16954a87cfdfdc04792f1ffc9a29c0a48253ab10ec0f4856f39c7f7bf7cd/hf_xet-1.3.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:95bdeab4747cb45f855601e39b9e86ae92b4a114978ada6e0401961fcc5d2958", size = 3759481, upload-time = "2026-02-24T00:16:03.387Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/6f/a55752047e9b0e69517775531c14680331f00c9cd4dc07f5e9b7f7f68a12/hf_xet-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f99992583f27b139392601fe99e88df155dc4de7feba98ed27ce2d3e6b4a65bb", size = 3517927, upload-time = "2026-02-24T00:16:02.108Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/71/a909dbf9c8b166aa3f15db2bcf5d8afbe9d53170922edde2b919cf0bc455/hf_xet-1.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:687a71fc6d2eaa79d864da3aa13e5d887e124d357f5f306bfff6c385eea9d990", size = 4174328, upload-time = "2026-02-24T00:15:55.056Z" },
+    { url = "https://files.pythonhosted.org/packages/21/cc/dec0d971bb5872345b8d64363a0b78ed6a147eea5b4281575ce5a8150f42/hf_xet-1.3.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:75d19813ed0e24525409bc22566282ae9bc93e5d764b185565e863dc28280a45", size = 3953184, upload-time = "2026-02-24T00:15:53.43Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/d8/d4259146e7c7089dd3f22cd62676d665bcfbc27428a070abee8985e0ab33/hf_xet-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:078af43569c2e05233137a93a33d2293f95c272745eaf030a9bb5f27bb0c9e9c", size = 4152800, upload-time = "2026-02-24T00:16:10.391Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/0d/39d9d32e4cde689da618739197e264bba5a55d870377d5d32cdd5c03fad8/hf_xet-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be8731e1620cc8549025c39ed3917c8fd125efaeae54ae679214a3d573e6c109", size = 4390499, upload-time = "2026-02-24T00:16:11.671Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/27/5b9c323bf5513e8971702eeac43ba5cb554921e0f292ad52f20ed6028131/hf_xet-1.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:1552616c0e0fa728a4ffdffa106e91faa0fd4edb44868e79b464fad00b2758ee", size = 3634124, upload-time = "2026-02-24T00:16:20.964Z" },
+    { url = "https://files.pythonhosted.org/packages/85/32/76949adb65b7ca54c1e2b0519a98f7c88221b9091ae8780fc76d7d1bae70/hf_xet-1.3.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:a61496eccf412d7c51a5613c31a2051d357ddea6be53a0672c7644cf39bfefe9", size = 3759780, upload-time = "2026-02-24T00:16:09.037Z" },
+    { url = "https://files.pythonhosted.org/packages/63/c4/ad6fa712611711c129fa49eb17baaf0665647eb0abce32d94ccd44b69c6d/hf_xet-1.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:aba35218871cc438826076778958f7ab2a1f4f8d654e91c307073a815360558f", size = 3517640, upload-time = "2026-02-24T00:16:07.536Z" },
+    { url = "https://files.pythonhosted.org/packages/15/6b/b44659c5261cde6320a579d0acc949f19283a13d32fc9389fc49639f435e/hf_xet-1.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c444d8f657dedd7a72aa0ef0178fe01fe92b04b58014ee49e2b3b4985aea1529", size = 4174285, upload-time = "2026-02-24T00:16:00.848Z" },
+    { url = "https://files.pythonhosted.org/packages/61/cf/16ef1b366482fa4e71d1642b019158d7ac891bcb961477102ceadfe69436/hf_xet-1.3.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:6d1bbda7900d72bc591cd39a64e35ad07f89a24f90e3d7b7c692cb93a1926cde", size = 3952705, upload-time = "2026-02-24T00:15:59.355Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/5a/d03453902ab9373715f50f3969979782a355df94329ea958ae78304ca06b/hf_xet-1.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:588f5df302e7dba5c3b60d4e5c683f95678526c29b9f64cbeb23e9f1889c6b83", size = 4152353, upload-time = "2026-02-24T00:16:15.857Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/98/d3cd8cdd8d771bee9a03bd52faed6fa114a68a107a0e337aaf0b4c52bf0c/hf_xet-1.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:944ae454b296c42b18219c37f245c78d0e64a734057423e9309f4938faa85d7f", size = 4390010, upload-time = "2026-02-24T00:16:18.713Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/10/3c58501d44d7a148d749ffa6046cbd14aa75a7ab07c9e7a984f86294cc53/hf_xet-1.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:34cdd5f10e61b7a1a7542672d20887c85debcfeb70a471ff1506f5a4c9441e42", size = 3634277, upload-time = "2026-02-24T00:16:23.718Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/00/22d3d896466ded4c46ef6465b85fa434fa97d79f8f61cea322afde1d6157/hf_xet-1.3.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:df4447f69086dcc6418583315eda6ed09033ac1fbbc784fedcbbbdf67bea1680", size = 3761293, upload-time = "2026-02-24T00:16:06.012Z" },
+    { url = "https://files.pythonhosted.org/packages/97/fd/ebb0ea49e9bd9eb9f52844e417e0e6e9c8a59a1e84790691873fa910adc5/hf_xet-1.3.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:39f4fe714628adc2214ab4a67391182ee751bc4db581868cb3204900817758a8", size = 3523345, upload-time = "2026-02-24T00:16:04.615Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/bb/72ceaaf619cad23d151a281d52e15456bae72f52c3795e820c0b64a5f637/hf_xet-1.3.0-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9b16e53ed6b5c8197cefb3fd12047a430b7034428effed463c03cec68de7e9a3", size = 4178623, upload-time = "2026-02-24T00:15:57.857Z" },
+    { url = "https://files.pythonhosted.org/packages/19/30/3280f4b5e407b442923a80ac0b2d96a65be7494457c55695e63f9a2b33dd/hf_xet-1.3.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:92051a1f73019489be77f6837671024ec785a3d1b888466b09d3a9ea15c4a1b5", size = 3958884, upload-time = "2026-02-24T00:15:56.326Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/13/5174c6d52583e54a761c88570ca657d621ac684747613f47846debfd6d4d/hf_xet-1.3.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:943046b160e7804a85e68a659d2eee1a83ce3661f72d1294d3cc5ece0f45a355", size = 4158146, upload-time = "2026-02-24T00:16:13.158Z" },
+    { url = "https://files.pythonhosted.org/packages/12/13/ea8619021b119e19efdcaeec72f762b5be923cf79b5d4434f2cbbff39829/hf_xet-1.3.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9b798a95d41b4f33b0b455c8aa76ff1fd26a587a4dd3bdec29f0a37c60b78a2f", size = 4395565, upload-time = "2026-02-24T00:16:14.574Z" },
+    { url = "https://files.pythonhosted.org/packages/64/cd/b81d922118a171bfbbecffd60a477e79188ab876260412fac47226a685bf/hf_xet-1.3.0-cp37-abi3-win_amd64.whl", hash = "sha256:227eee5b99d19b9f20c31d901a0c2373af610a24a34e6c2701072c9de48d6d95", size = 3637830, upload-time = "2026-02-24T00:16:22.474Z" },
 ]
 
 [[package]]
@@ -1751,21 +2101,22 @@ http2 = [
 
 [[package]]
 name = "huggingface-hub"
-version = "0.36.0"
+version = "0.36.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
-    { name = "fsspec" },
-    { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "requests" },
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" },
 ]
 
 [[package]]
@@ -1870,25 +2221,25 @@ wheels = [
 
 [[package]]
 name = "jmespath"
-version = "1.0.1"
+version = "1.1.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
+    { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
 ]
 
 [[package]]
 name = "joblib"
-version = "1.5.2"
+version = "1.5.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e8/5d/447af5ea094b9e4c4054f82e223ada074c552335b9b4b2d14bd9b35a67c4/joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55", size = 331077, upload-time = "2025-08-27T12:15:46.575Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
 ]
 
 [[package]]
 name = "jsonschema"
-version = "4.25.1"
+version = "4.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "attrs" },
@@ -1896,9 +2247,9 @@ dependencies = [
     { name = "referencing" },
     { name = "rpds-py" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/74/69/f7185de793a29082a9f3c7728268ffb31cb5095131a9c139a74078e27336/jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85", size = 357342, upload-time = "2025-08-18T17:03:50.038Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bf/9c/8c95d856233c1f82500c2450b8c68576b4cf1c871db3afac5c34ff84e6fd/jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63", size = 90040, upload-time = "2025-08-18T17:03:48.373Z" },
+    { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" },
 ]
 
 [[package]]
@@ -1933,7 +2284,7 @@ wheels = [
 
 [[package]]
 name = "leptonai"
-version = "0.26.7"
+version = "0.27.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -1951,14 +2302,14 @@ dependencies = [
     { name = "pydantic" },
     { name = "python-multipart" },
     { name = "pyyaml" },
-    { name = "ray" },
+    { name = "ray", extra = ["default"] },
     { name = "requests" },
     { name = "rich" },
     { name = "typing-extensions" },
     { name = "uvicorn" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c2/4d/2b5ab13294b23326ba1d8ef6ad703b1d9535bf72a0617030ddd6238eb925/leptonai-0.26.7-py3-none-any.whl", hash = "sha256:74996da36bf177d2b148887dd349627ab8cd78b94623d543bc91ed9ad65ba0e2", size = 2452890, upload-time = "2025-11-07T20:07:14.99Z" },
+    { url = "https://files.pythonhosted.org/packages/88/80/281af82242d9e20e9c0b19fb35c2a7a6df728b14f25483271d8169ef0a9a/leptonai-0.27.0-py3-none-any.whl", hash = "sha256:2a83d77a3bfcd86b877483ab503b4cde970b0a4d4143535510dac67d565fc1a4", size = 2476455, upload-time = "2026-01-17T03:31:56.977Z" },
 ]
 
 [[package]]
@@ -2044,7 +2395,7 @@ wheels = [
 
 [[package]]
 name = "mamba-ssm"
-version = "2.2.6.post3"
+version = "2.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "einops" },
@@ -2055,29 +2406,63 @@ dependencies = [
     { name = "transformers" },
     { name = "triton", marker = "sys_platform == 'never'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b6/0c/9373a469ff7a33bdd0644e55fa45165ba3900274dcf7fe9f10ccc232aef9/mamba_ssm-2.2.6.post3.tar.gz", hash = "sha256:826a3cdb651959f191dac64502f8a29627d9116fe6bb7c57e4f562da1aea7bf3", size = 113913, upload-time = "2025-10-10T06:00:44.939Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/69/a87f06d9dba78c041adb81f2228e978aab179477c64f1a210c0fe0d63e8d/mamba_ssm-2.3.0.tar.gz", hash = "sha256:8294e12125f76021e4e190f4137e84a84935920eeda5d0037a6917524456b303", size = 121116, upload-time = "2026-01-12T17:07:22.152Z" }
 
 [[package]]
 name = "markdown"
-version = "3.10"
+version = "3.10.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/7dd27d9d863b3376fcf23a5a13cb5d024aed1db46f963f1b5735ae43b3be/markdown-3.10.tar.gz", hash = "sha256:37062d4f2aa4b2b6b32aefb80faa300f82cc790cb949a35b8caede34f2b68c0e", size = 364931, upload-time = "2025-11-03T19:51:15.007Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/70/81/54e3ce63502cd085a0c556652a4e1b919c45a446bd1e5300e10c44c8c521/markdown-3.10-py3-none-any.whl", hash = "sha256:b5b99d6951e2e4948d939255596523444c0e677c669700b1d17aa4a8a464cb7c", size = 107678, upload-time = "2025-11-03T19:51:13.887Z" },
+    { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
 ]
 
 [[package]]
 name = "markdown-it-py"
 version = "3.0.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
 dependencies = [
-    { name = "mdurl" },
+    { name = "mdurl", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload-time = "2023-06-03T06:41:14.443Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" },
 ]
 
+[[package]]
+name = "markdown-it-py"
+version = "4.0.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "mdurl", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
+]
+
 [[package]]
 name = "markupsafe"
 version = "3.0.3"
@@ -2177,7 +2562,8 @@ name = "mdit-py-plugins"
 version = "0.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b2/fd/a756d36c0bfba5f6e39a1cdbdbfdd448dc02692467d83816dff4592a1ebc/mdit_py_plugins-0.5.0.tar.gz", hash = "sha256:f4918cb50119f50446560513a8e311d574ff6aaed72606ddae6d35716fe809c6", size = 44655, upload-time = "2025-08-11T07:25:49.083Z" }
 wheels = [
@@ -2198,7 +2584,7 @@ name = "megatron-core"
 source = { editable = "." }
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
@@ -2207,9 +2593,11 @@ dependencies = [
 dev = [
     { name = "av" },
     { name = "causal-conv1d" },
-    { name = "datasets" },
+    { name = "datasets", version = "4.5.0", source = { registry = "https://pypi.org/simple" } },
     { name = "einops" },
+    { name = "emerging-optimizers" },
     { name = "fastapi" },
+    { name = "flash-linear-attention" },
     { name = "flashinfer-python" },
     { name = "mamba-ssm" },
     { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-dev'" },
@@ -2219,18 +2607,20 @@ dev = [
     { name = "nvidia-resiliency-ext" },
     { name = "nvtx" },
     { name = "onnxscript" },
-    { name = "opentelemetry-api" },
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" } },
     { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.81", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
-    { name = "transformer-engine", extra = ["core-cu13", "pytorch"], marker = "extra == 'extra-13-megatron-core-dev'" },
+    { name = "transformer-engine", marker = "extra == 'extra-13-megatron-core-dev'" },
     { name = "wget" },
 ]
 lts = [
     { name = "av" },
     { name = "causal-conv1d" },
-    { name = "datasets" },
+    { name = "datasets", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "datasets", version = "4.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "einops" },
+    { name = "emerging-optimizers" },
     { name = "fastapi" },
     { name = "flashinfer-python" },
     { name = "mamba-ssm" },
@@ -2239,9 +2629,9 @@ lts = [
     { name = "nv-grouped-gemm" },
     { name = "nvtx" },
     { name = "onnxscript" },
-    { name = "opentelemetry-api" },
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" } },
     { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.81", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
     { name = "wget" },
 ]
@@ -2264,15 +2654,18 @@ build = [
     { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 ci = [
-    { name = "pandas" },
+    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "python-gitlab" },
     { name = "slack-sdk" },
 ]
 docs = [
-    { name = "myst-parser" },
+    { name = "myst-parser", version = "4.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "myst-parser", version = "5.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-sphinx-theme" },
     { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "sphinx-autobuild", version = "2024.10.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "sphinx-autobuild", version = "2025.8.25", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "sphinx-autodoc2" },
@@ -2302,7 +2695,8 @@ test = [
     { name = "pytest-random-order" },
     { name = "pyyaml" },
     { name = "tensorboard" },
-    { name = "wrapt" },
+    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "wrapt", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 
 [package.metadata]
@@ -2315,10 +2709,13 @@ requires-dist = [
     { name = "datasets", marker = "extra == 'lts'" },
     { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" },
     { name = "einops", marker = "extra == 'lts'", specifier = "~=0.8" },
+    { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" },
+    { name = "emerging-optimizers", marker = "extra == 'lts'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" },
     { name = "fastapi", marker = "extra == 'dev'", specifier = "~=0.50" },
     { name = "fastapi", marker = "extra == 'lts'", specifier = "~=0.50" },
-    { name = "flashinfer-python", marker = "extra == 'dev'" },
-    { name = "flashinfer-python", marker = "extra == 'lts'" },
+    { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.4.0" },
+    { name = "flashinfer-python", marker = "extra == 'dev'", specifier = "~=0.5.0" },
+    { name = "flashinfer-python", marker = "extra == 'lts'", specifier = "~=0.5.0" },
     { name = "flask-restful", marker = "extra == 'mlm'" },
     { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" },
     { name = "mamba-ssm", marker = "extra == 'lts'", specifier = "~=2.2" },
@@ -2342,10 +2739,10 @@ requires-dist = [
     { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
     { name = "tensorstore", marker = "extra == 'lts'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
     { name = "tiktoken", marker = "extra == 'mlm'" },
-    { name = "torch" },
+    { name = "torch", specifier = ">=2.6.0" },
     { name = "tqdm", marker = "extra == 'dev'" },
     { name = "tqdm", marker = "extra == 'lts'" },
-    { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", specifier = ">=2.9.0a0,<2.10.0" },
+    { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=5671fd3675906cda1ade26c24a65d3dedd88eb89" },
     { name = "transformers", marker = "extra == 'mlm'" },
     { name = "wandb", marker = "extra == 'mlm'" },
     { name = "wget", marker = "extra == 'dev'" },
@@ -2360,7 +2757,7 @@ build = [
     { name = "nvidia-mathdx" },
     { name = "packaging", specifier = ">=24.2" },
     { name = "pybind11" },
-    { name = "setuptools", specifier = "<80.0.0" },
+    { name = "setuptools", specifier = ">=77.0.0,<80.0.0" },
     { name = "torch" },
 ]
 ci = [
@@ -2384,7 +2781,7 @@ linting = [
     { name = "ruff", specifier = "~=0.9.0" },
 ]
 no-pypi-wheels = [
-    { name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189" },
+    { name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" },
     { name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" },
 ]
 test = [
@@ -2412,10 +2809,11 @@ dependencies = [
     { name = "click" },
     { name = "multi-storage-client" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pillow" },
     { name = "pyyaml" },
-    { name = "s3fs" },
+    { name = "s3fs", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "s3fs", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "tqdm" },
     { name = "webdataset" },
@@ -2441,7 +2839,7 @@ version = "0.5.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
 wheels = [
@@ -2553,14 +2951,14 @@ wheels = [
 
 [[package]]
 name = "multi-storage-client"
-version = "0.36.0"
+version = "0.42.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "jmespath" },
     { name = "jsonschema" },
     { name = "lark" },
-    { name = "opentelemetry-api" },
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" } },
     { name = "prettytable" },
     { name = "psutil" },
     { name = "python-dateutil" },
@@ -2571,168 +2969,184 @@ dependencies = [
     { name = "xattr" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/be/5f/8011fd041f695670b339c25f059b68207c315250ccc25a08f190bff78318/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:763cdb5e24b78adf33882b1d1c0d15021cc2c0088ffc6e7b0269259f0cd45fd2", size = 5299321, upload-time = "2025-11-26T20:03:58.147Z" },
-    { url = "https://files.pythonhosted.org/packages/51/06/cfd17d307fe29fbbce9f196ec1d8dda3f93fd44711c0adb282d9c393a2b2/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:eb84ea0bdffcfddf9beb7239c6d0b1950a67a0afe36ef970da70ba4ab373c0c9", size = 5420867, upload-time = "2025-11-26T20:05:32.445Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/7f/bf22f9c67c70d5ec2f6a7a4798cb106f3023bf25ba6c21b0ade1a53fa5b3/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff03a0213ce1377abee61e8deb87607f0ccd35c245fbaab2fee51d2e591e833e", size = 3188237, upload-time = "2025-11-26T20:01:51.354Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/20/c0c019b3dc7719f79c1826364fc9c3e1bbe9b00246b1d7414ce2b4defd0b/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16e577ef4ee6f8ac481b3f2290e7b0525676efd82c71fb694ba4e6c65a8facd", size = 3363259, upload-time = "2025-11-26T20:00:10.679Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/f8/eea6be7f4258c811373dc989e8eaa23a404499c2574059f6fd876d6904e4/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6c913b132573fbd7a5ada63086d3ce2669b913b79206f86867cc674d57b9164d", size = 5299844, upload-time = "2025-11-26T20:00:32.46Z" },
-    { url = "https://files.pythonhosted.org/packages/df/aa/b73441dc17097ee92e7efac5080e2cfb8fe4515dd4dc91ca351829e6b7a9/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:4dd2ccf67deae403098a5e867ce33d35ce348d2acd1a743c9ef485b3b1eea65c", size = 5424007, upload-time = "2025-11-26T19:55:30.305Z" },
-    { url = "https://files.pythonhosted.org/packages/54/d6/850550de6b0dc740ced2f8fbf83f13f757860b5fdaa652e477c567c01f34/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04b31b6a5d6a3c90a592b23a4b90368fa1dcca8cb03f76a862d307f8b072c1d3", size = 3188451, upload-time = "2025-11-26T19:56:32.191Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/c5/93e038c0cce46cb9b1b8e19f7215ce3e7fa1af5e0a9662f36dfe47062f7e/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:252f84116f674962eabd066e16040f0304f6191c06ab09ef2ec02dbfd2c4d2ea", size = 3366554, upload-time = "2025-11-26T19:58:37.742Z" },
-    { url = "https://files.pythonhosted.org/packages/28/a2/46320db394150a2f0547930b902e8ad045a084fb519f408e2c9b4ca673a0/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2058e8e8f8fd9eef033171b0bf1966596e9862c7f20c2886101ad979996c453b", size = 5293778, upload-time = "2025-11-26T20:07:11.731Z" },
-    { url = "https://files.pythonhosted.org/packages/00/2d/658af3b4104c4f2aa2621469482dca8270490601e98d8f7997361499adaa/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:22b69c7f3c9ffa166f38bafa7e08f6b664a5dbee8c88d5d740bed719e6f410a1", size = 5418642, upload-time = "2025-11-26T19:58:15.717Z" },
-    { url = "https://files.pythonhosted.org/packages/09/2f/6441794bf8dc195d614d63ad2b7068ad7703972fd6f960d43202d29748b1/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b384fb326637e79706ff706e60f384b24fdbcc824420bb66ef615a9ef5ffb4ec", size = 3194133, upload-time = "2025-11-26T20:05:54.618Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/ba/b07361ff84e5bd263e299b03776382f59bd92862573c915dd705a09f3c1d/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7111567b971a68719c0eb68245d49a0a3c3bf5af2f609351446f20ac3e83c0d5", size = 3364563, upload-time = "2025-11-26T20:04:20.3Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/4a/cbd61589a457e2f4fbacd08b7e7dd11cdb74690857f4b40042844b1ff894/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a8137558d5f05e4722c54540e2d6067ea61e9ce3d736fa9cb5c541c7f94d1b48", size = 5293550, upload-time = "2025-11-26T20:03:36.459Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/3d/7499a9d537fa950a9acf11604b1f9372ed2cadd582b55f1c7cb885ce6f40/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:5394c5e040c32433b42e902d9fcf03f8a475c5c9ff1cca80743b2cb944c8af9e", size = 5417538, upload-time = "2025-11-26T20:06:16.782Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/c3/1b1adc3b3b8569d258a34dbedb6a8c51fc94b947b2df276e251f0f1e23a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:195e8c8d57d812b73efd41b96cd60825c484d317ec86379fad3e435e9365a4a6", size = 3193426, upload-time = "2025-11-26T20:00:56.034Z" },
-    { url = "https://files.pythonhosted.org/packages/60/f5/f8b97a87d928057b493733760f37de70ae5ffff84b86f6efae101cdd57a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8402d0e1cefedf38ad9eefe8b3c56d3a44cfec7775ef711da18e7dbf72669444", size = 3363531, upload-time = "2025-11-26T20:02:35.296Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/bb/6d7d9c53ce7b834cd7539ac579816c1482095127fc69a698750db21b1059/multi_storage_client-0.42.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:aa58acbea25b78dd902ce07b080d3feb6a80e51154c711449a0751f8cd37742e", size = 8805264, upload-time = "2026-02-06T20:58:49.246Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/57/f9bec92d9a76467898a4ebdf501182151c6b5dc6d00a0a89a374b7f70ad3/multi_storage_client-0.42.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa278cc2bb7cdf80bc3407ced7d8a8b258801093b90e720059f6c4cdc5d68085", size = 5154902, upload-time = "2026-02-06T21:04:05.221Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/2c/a417437abcc5c8ab0396fddfe9a158ea60e770e8d461ed0b2146a8efbf62/multi_storage_client-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32baa11cd3ce853f2072620134fe92e2ed3d682355fae2f86226c366717814ce", size = 5422721, upload-time = "2026-02-06T20:59:29.687Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/90/1e69cb6d71418b38a9409b0e2564efe1e7c12e18e63e478591ae0317dbcc/multi_storage_client-0.42.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3db30610d6bb15a5c211af9d7b11c8a1a13265893c1a625d5aaadacdb61a9a8e", size = 8805275, upload-time = "2026-02-06T20:58:10.943Z" },
+    { url = "https://files.pythonhosted.org/packages/de/dd/a55dc9e60113f98af10075c3e33b97007bfbfd2e6f8bc6a1b2b1b43857c8/multi_storage_client-0.42.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8c2d491475eec5e80ad706eca7005d9bd17d30b29166e891c18695b42336493", size = 5155309, upload-time = "2026-02-06T20:56:22.528Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/b6/648a1d6b4482634fbb0d5bc0ba156b42fafd4f364227f9203bc4ac70dbac/multi_storage_client-0.42.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91a87e05e0e09b8fbd6804bb1ac85a28213d4371e91d06d9c35ad12b247f28ec", size = 5422770, upload-time = "2026-02-06T21:01:41.97Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/5a/6af92f30d09c97a314594029c115da0c44d5fa14e772983d88ad8023d355/multi_storage_client-0.42.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5c71c128b9f81cfbd59f1e2c2acfb2559658dfecde904496b7845901f0161430", size = 8798046, upload-time = "2026-02-06T21:02:32.674Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/b2/e686bcbe754bfede1773153d928422b2c4b25453faf0e228cf9cadfa73e0/multi_storage_client-0.42.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afe72fcb3f44ddc23488ab65bbab8575181fe15f63d297074703a36f4d8f7cc9", size = 5155767, upload-time = "2026-02-06T21:01:02.151Z" },
+    { url = "https://files.pythonhosted.org/packages/05/44/2b7e0ec6fa68f208cb919c38df346cca37c910906f564a43f74731eb6cdb/multi_storage_client-0.42.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30410d59d1f93758640a15779af6379a961bfa0f9607809a2b869e8b750efac7", size = 5421800, upload-time = "2026-02-06T21:03:04.852Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/ba/c342143f3820a1debd223149bb362246c983b6b6ef70ad245b0d9cfc8509/multi_storage_client-0.42.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dde8cbbd066f2756f5fc7efe7f2be713a4b9212f28ddd49dc9d8008148e86e97", size = 8797502, upload-time = "2026-02-06T21:03:27.859Z" },
+    { url = "https://files.pythonhosted.org/packages/52/9c/43cfac582592df71723add55a40b7007c6c6412e4188e44a752ff5376a85/multi_storage_client-0.42.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2321ab65f464aeee17c91d3e999ab9df42ba7fb8e8e67ee764f3f76c9b11a2f", size = 5155687, upload-time = "2026-02-06T20:58:30.288Z" },
+    { url = "https://files.pythonhosted.org/packages/54/b6/c745f2bc357ba83373ad655fa7296a21e9022402cbab811dbd22aed2f87f/multi_storage_client-0.42.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c305df6d59e81f909c6a23d35fc3d0ceaac723238457f236a3f55261db0b5bae", size = 5422017, upload-time = "2026-02-06T20:56:45.672Z" },
 ]
 
 [[package]]
 name = "multidict"
-version = "6.7.0"
+version = "6.7.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/80/1e/5492c365f222f907de1039b91f922b93fa4f764c713ee858d235495d8f50/multidict-6.7.0.tar.gz", hash = "sha256:c6e99d9a65ca282e578dfea819cfa9c0a62b2499d8677392e09feaf305e9e6f5", size = 101834, upload-time = "2025-10-06T14:52:30.657Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a9/63/7bdd4adc330abcca54c85728db2327130e49e52e8c3ce685cec44e0f2e9f/multidict-6.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9f474ad5acda359c8758c8accc22032c6abe6dc87a8be2440d097785e27a9349", size = 77153, upload-time = "2025-10-06T14:48:26.409Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/bb/b6c35ff175ed1a3142222b78455ee31be71a8396ed3ab5280fbe3ebe4e85/multidict-6.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a9db5a870f780220e931d0002bbfd88fb53aceb6293251e2c839415c1b20e", size = 44993, upload-time = "2025-10-06T14:48:28.4Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/1f/064c77877c5fa6df6d346e68075c0f6998547afe952d6471b4c5f6a7345d/multidict-6.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03ca744319864e92721195fa28c7a3b2bc7b686246b35e4078c1e4d0eb5466d3", size = 44607, upload-time = "2025-10-06T14:48:29.581Z" },
-    { url = "https://files.pythonhosted.org/packages/04/7a/bf6aa92065dd47f287690000b3d7d332edfccb2277634cadf6a810463c6a/multidict-6.7.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f0e77e3c0008bc9316e662624535b88d360c3a5d3f81e15cf12c139a75250046", size = 241847, upload-time = "2025-10-06T14:48:32.107Z" },
-    { url = "https://files.pythonhosted.org/packages/94/39/297a8de920f76eda343e4ce05f3b489f0ab3f9504f2576dfb37b7c08ca08/multidict-6.7.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08325c9e5367aa379a3496aa9a022fe8837ff22e00b94db256d3a1378c76ab32", size = 242616, upload-time = "2025-10-06T14:48:34.054Z" },
-    { url = "https://files.pythonhosted.org/packages/39/3a/d0eee2898cfd9d654aea6cb8c4addc2f9756e9a7e09391cfe55541f917f7/multidict-6.7.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e2862408c99f84aa571ab462d25236ef9cb12a602ea959ba9c9009a54902fc73", size = 222333, upload-time = "2025-10-06T14:48:35.9Z" },
-    { url = "https://files.pythonhosted.org/packages/05/48/3b328851193c7a4240815b71eea165b49248867bbb6153a0aee227a0bb47/multidict-6.7.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4d72a9a2d885f5c208b0cb91ff2ed43636bb7e345ec839ff64708e04f69a13cc", size = 253239, upload-time = "2025-10-06T14:48:37.302Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/ca/0706a98c8d126a89245413225ca4a3fefc8435014de309cf8b30acb68841/multidict-6.7.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:478cc36476687bac1514d651cbbaa94b86b0732fb6855c60c673794c7dd2da62", size = 251618, upload-time = "2025-10-06T14:48:38.963Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/4f/9c7992f245554d8b173f6f0a048ad24b3e645d883f096857ec2c0822b8bd/multidict-6.7.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6843b28b0364dc605f21481c90fadb5f60d9123b442eb8a726bb74feef588a84", size = 241655, upload-time = "2025-10-06T14:48:40.312Z" },
-    { url = "https://files.pythonhosted.org/packages/31/79/26a85991ae67efd1c0b1fc2e0c275b8a6aceeb155a68861f63f87a798f16/multidict-6.7.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:23bfeee5316266e5ee2d625df2d2c602b829435fc3a235c2ba2131495706e4a0", size = 239245, upload-time = "2025-10-06T14:48:41.848Z" },
-    { url = "https://files.pythonhosted.org/packages/14/1e/75fa96394478930b79d0302eaf9a6c69f34005a1a5251ac8b9c336486ec9/multidict-6.7.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:680878b9f3d45c31e1f730eef731f9b0bc1da456155688c6745ee84eb818e90e", size = 233523, upload-time = "2025-10-06T14:48:43.749Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/5e/085544cb9f9c4ad2b5d97467c15f856df8d9bac410cffd5c43991a5d878b/multidict-6.7.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:eb866162ef2f45063acc7a53a88ef6fe8bf121d45c30ea3c9cd87ce7e191a8d4", size = 243129, upload-time = "2025-10-06T14:48:45.225Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/c3/e9d9e2f20c9474e7a8fcef28f863c5cbd29bb5adce6b70cebe8bdad0039d/multidict-6.7.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:df0e3bf7993bdbeca5ac25aa859cf40d39019e015c9c91809ba7093967f7a648", size = 248999, upload-time = "2025-10-06T14:48:46.703Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/3f/df171b6efa3239ae33b97b887e42671cd1d94d460614bfb2c30ffdab3b95/multidict-6.7.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:661709cdcd919a2ece2234f9bae7174e5220c80b034585d7d8a755632d3e2111", size = 243711, upload-time = "2025-10-06T14:48:48.146Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/2f/9b5564888c4e14b9af64c54acf149263721a283aaf4aa0ae89b091d5d8c1/multidict-6.7.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:096f52730c3fb8ed419db2d44391932b63891b2c5ed14850a7e215c0ba9ade36", size = 237504, upload-time = "2025-10-06T14:48:49.447Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/3a/0bd6ca0f7d96d790542d591c8c3354c1e1b6bfd2024d4d92dc3d87485ec7/multidict-6.7.0-cp310-cp310-win32.whl", hash = "sha256:afa8a2978ec65d2336305550535c9c4ff50ee527914328c8677b3973ade52b85", size = 41422, upload-time = "2025-10-06T14:48:50.789Z" },
-    { url = "https://files.pythonhosted.org/packages/00/35/f6a637ea2c75f0d3b7c7d41b1189189acff0d9deeb8b8f35536bb30f5e33/multidict-6.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:b15b3afff74f707b9275d5ba6a91ae8f6429c3ffb29bbfd216b0b375a56f13d7", size = 46050, upload-time = "2025-10-06T14:48:51.938Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/b8/f7bf8329b39893d02d9d95cf610c75885d12fc0f402b1c894e1c8e01c916/multidict-6.7.0-cp310-cp310-win_arm64.whl", hash = "sha256:4b73189894398d59131a66ff157837b1fafea9974be486d036bb3d32331fdbf0", size = 43153, upload-time = "2025-10-06T14:48:53.146Z" },
-    { url = "https://files.pythonhosted.org/packages/34/9e/5c727587644d67b2ed479041e4b1c58e30afc011e3d45d25bbe35781217c/multidict-6.7.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4d409aa42a94c0b3fa617708ef5276dfe81012ba6753a0370fcc9d0195d0a1fc", size = 76604, upload-time = "2025-10-06T14:48:54.277Z" },
-    { url = "https://files.pythonhosted.org/packages/17/e4/67b5c27bd17c085a5ea8f1ec05b8a3e5cba0ca734bfcad5560fb129e70ca/multidict-6.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14c9e076eede3b54c636f8ce1c9c252b5f057c62131211f0ceeec273810c9721", size = 44715, upload-time = "2025-10-06T14:48:55.445Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/e1/866a5d77be6ea435711bef2a4291eed11032679b6b28b56b4776ab06ba3e/multidict-6.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c09703000a9d0fa3c3404b27041e574cc7f4df4c6563873246d0e11812a94b6", size = 44332, upload-time = "2025-10-06T14:48:56.706Z" },
-    { url = "https://files.pythonhosted.org/packages/31/61/0c2d50241ada71ff61a79518db85ada85fdabfcf395d5968dae1cbda04e5/multidict-6.7.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a265acbb7bb33a3a2d626afbe756371dce0279e7b17f4f4eda406459c2b5ff1c", size = 245212, upload-time = "2025-10-06T14:48:58.042Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/e0/919666a4e4b57fff1b57f279be1c9316e6cdc5de8a8b525d76f6598fefc7/multidict-6.7.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51cb455de290ae462593e5b1cb1118c5c22ea7f0d3620d9940bf695cea5a4bd7", size = 246671, upload-time = "2025-10-06T14:49:00.004Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/cc/d027d9c5a520f3321b65adea289b965e7bcbd2c34402663f482648c716ce/multidict-6.7.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db99677b4457c7a5c5a949353e125ba72d62b35f74e26da141530fbb012218a7", size = 225491, upload-time = "2025-10-06T14:49:01.393Z" },
-    { url = "https://files.pythonhosted.org/packages/75/c4/bbd633980ce6155a28ff04e6a6492dd3335858394d7bb752d8b108708558/multidict-6.7.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f470f68adc395e0183b92a2f4689264d1ea4b40504a24d9882c27375e6662bb9", size = 257322, upload-time = "2025-10-06T14:49:02.745Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/6d/d622322d344f1f053eae47e033b0b3f965af01212de21b10bcf91be991fb/multidict-6.7.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0db4956f82723cc1c270de9c6e799b4c341d327762ec78ef82bb962f79cc07d8", size = 254694, upload-time = "2025-10-06T14:49:04.15Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/9f/78f8761c2705d4c6d7516faed63c0ebdac569f6db1bef95e0d5218fdc146/multidict-6.7.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e56d780c238f9e1ae66a22d2adf8d16f485381878250db8d496623cd38b22bd", size = 246715, upload-time = "2025-10-06T14:49:05.967Z" },
-    { url = "https://files.pythonhosted.org/packages/78/59/950818e04f91b9c2b95aab3d923d9eabd01689d0dcd889563988e9ea0fd8/multidict-6.7.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9d14baca2ee12c1a64740d4531356ba50b82543017f3ad6de0deb943c5979abb", size = 243189, upload-time = "2025-10-06T14:49:07.37Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/3d/77c79e1934cad2ee74991840f8a0110966d9599b3af95964c0cd79bb905b/multidict-6.7.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:295a92a76188917c7f99cda95858c822f9e4aae5824246bba9b6b44004ddd0a6", size = 237845, upload-time = "2025-10-06T14:49:08.759Z" },
-    { url = "https://files.pythonhosted.org/packages/63/1b/834ce32a0a97a3b70f86437f685f880136677ac00d8bce0027e9fd9c2db7/multidict-6.7.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39f1719f57adbb767ef592a50ae5ebb794220d1188f9ca93de471336401c34d2", size = 246374, upload-time = "2025-10-06T14:49:10.574Z" },
-    { url = "https://files.pythonhosted.org/packages/23/ef/43d1c3ba205b5dec93dc97f3fba179dfa47910fc73aaaea4f7ceb41cec2a/multidict-6.7.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0a13fb8e748dfc94749f622de065dd5c1def7e0d2216dba72b1d8069a389c6ff", size = 253345, upload-time = "2025-10-06T14:49:12.331Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/03/eaf95bcc2d19ead522001f6a650ef32811aa9e3624ff0ad37c445c7a588c/multidict-6.7.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e3aa16de190d29a0ea1b48253c57d99a68492c8dd8948638073ab9e74dc9410b", size = 246940, upload-time = "2025-10-06T14:49:13.821Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/df/ec8a5fd66ea6cd6f525b1fcbb23511b033c3e9bc42b81384834ffa484a62/multidict-6.7.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a048ce45dcdaaf1defb76b2e684f997fb5abf74437b6cb7b22ddad934a964e34", size = 242229, upload-time = "2025-10-06T14:49:15.603Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/a2/59b405d59fd39ec86d1142630e9049243015a5f5291ba49cadf3c090c541/multidict-6.7.0-cp311-cp311-win32.whl", hash = "sha256:a90af66facec4cebe4181b9e62a68be65e45ac9b52b67de9eec118701856e7ff", size = 41308, upload-time = "2025-10-06T14:49:16.871Z" },
-    { url = "https://files.pythonhosted.org/packages/32/0f/13228f26f8b882c34da36efa776c3b7348455ec383bab4a66390e42963ae/multidict-6.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:95b5ffa4349df2887518bb839409bcf22caa72d82beec453216802f475b23c81", size = 46037, upload-time = "2025-10-06T14:49:18.457Z" },
-    { url = "https://files.pythonhosted.org/packages/84/1f/68588e31b000535a3207fd3c909ebeec4fb36b52c442107499c18a896a2a/multidict-6.7.0-cp311-cp311-win_arm64.whl", hash = "sha256:329aa225b085b6f004a4955271a7ba9f1087e39dcb7e65f6284a988264a63912", size = 43023, upload-time = "2025-10-06T14:49:19.648Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/9e/9f61ac18d9c8b475889f32ccfa91c9f59363480613fc807b6e3023d6f60b/multidict-6.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8a3862568a36d26e650a19bb5cbbba14b71789032aebc0423f8cc5f150730184", size = 76877, upload-time = "2025-10-06T14:49:20.884Z" },
-    { url = "https://files.pythonhosted.org/packages/38/6f/614f09a04e6184f8824268fce4bc925e9849edfa654ddd59f0b64508c595/multidict-6.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:960c60b5849b9b4f9dcc9bea6e3626143c252c74113df2c1540aebce70209b45", size = 45467, upload-time = "2025-10-06T14:49:22.054Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/93/c4f67a436dd026f2e780c433277fff72be79152894d9fc36f44569cab1a6/multidict-6.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2049be98fb57a31b4ccf870bf377af2504d4ae35646a19037ec271e4c07998aa", size = 43834, upload-time = "2025-10-06T14:49:23.566Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/f5/013798161ca665e4a422afbc5e2d9e4070142a9ff8905e482139cd09e4d0/multidict-6.7.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0934f3843a1860dd465d38895c17fce1f1cb37295149ab05cd1b9a03afacb2a7", size = 250545, upload-time = "2025-10-06T14:49:24.882Z" },
-    { url = "https://files.pythonhosted.org/packages/71/2f/91dbac13e0ba94669ea5119ba267c9a832f0cb65419aca75549fcf09a3dc/multidict-6.7.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b3e34f3a1b8131ba06f1a73adab24f30934d148afcd5f5de9a73565a4404384e", size = 258305, upload-time = "2025-10-06T14:49:26.778Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/b0/754038b26f6e04488b48ac621f779c341338d78503fb45403755af2df477/multidict-6.7.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:efbb54e98446892590dc2458c19c10344ee9a883a79b5cec4bc34d6656e8d546", size = 242363, upload-time = "2025-10-06T14:49:28.562Z" },
-    { url = "https://files.pythonhosted.org/packages/87/15/9da40b9336a7c9fa606c4cf2ed80a649dffeb42b905d4f63a1d7eb17d746/multidict-6.7.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a35c5fc61d4f51eb045061e7967cfe3123d622cd500e8868e7c0c592a09fedc4", size = 268375, upload-time = "2025-10-06T14:49:29.96Z" },
-    { url = "https://files.pythonhosted.org/packages/82/72/c53fcade0cc94dfaad583105fd92b3a783af2091eddcb41a6d5a52474000/multidict-6.7.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29fe6740ebccba4175af1b9b87bf553e9c15cd5868ee967e010efcf94e4fd0f1", size = 269346, upload-time = "2025-10-06T14:49:31.404Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/e2/9baffdae21a76f77ef8447f1a05a96ec4bc0a24dae08767abc0a2fe680b8/multidict-6.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:123e2a72e20537add2f33a79e605f6191fba2afda4cbb876e35c1a7074298a7d", size = 256107, upload-time = "2025-10-06T14:49:32.974Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/06/3f06f611087dc60d65ef775f1fb5aca7c6d61c6db4990e7cda0cef9b1651/multidict-6.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b284e319754366c1aee2267a2036248b24eeb17ecd5dc16022095e747f2f4304", size = 253592, upload-time = "2025-10-06T14:49:34.52Z" },
-    { url = "https://files.pythonhosted.org/packages/20/24/54e804ec7945b6023b340c412ce9c3f81e91b3bf5fa5ce65558740141bee/multidict-6.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:803d685de7be4303b5a657b76e2f6d1240e7e0a8aa2968ad5811fa2285553a12", size = 251024, upload-time = "2025-10-06T14:49:35.956Z" },
-    { url = "https://files.pythonhosted.org/packages/14/48/011cba467ea0b17ceb938315d219391d3e421dfd35928e5dbdc3f4ae76ef/multidict-6.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c04a328260dfd5db8c39538f999f02779012268f54614902d0afc775d44e0a62", size = 251484, upload-time = "2025-10-06T14:49:37.631Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/2f/919258b43bb35b99fa127435cfb2d91798eb3a943396631ef43e3720dcf4/multidict-6.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8a19cdb57cd3df4cd865849d93ee14920fb97224300c88501f16ecfa2604b4e0", size = 263579, upload-time = "2025-10-06T14:49:39.502Z" },
-    { url = "https://files.pythonhosted.org/packages/31/22/a0e884d86b5242b5a74cf08e876bdf299e413016b66e55511f7a804a366e/multidict-6.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b2fd74c52accced7e75de26023b7dccee62511a600e62311b918ec5c168fc2a", size = 259654, upload-time = "2025-10-06T14:49:41.32Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/e5/17e10e1b5c5f5a40f2fcbb45953c9b215f8a4098003915e46a93f5fcaa8f/multidict-6.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3e8bfdd0e487acf992407a140d2589fe598238eaeffa3da8448d63a63cd363f8", size = 251511, upload-time = "2025-10-06T14:49:46.021Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/9a/201bb1e17e7af53139597069c375e7b0dcbd47594604f65c2d5359508566/multidict-6.7.0-cp312-cp312-win32.whl", hash = "sha256:dd32a49400a2c3d52088e120ee00c1e3576cbff7e10b98467962c74fdb762ed4", size = 41895, upload-time = "2025-10-06T14:49:48.718Z" },
-    { url = "https://files.pythonhosted.org/packages/46/e2/348cd32faad84eaf1d20cce80e2bb0ef8d312c55bca1f7fa9865e7770aaf/multidict-6.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:92abb658ef2d7ef22ac9f8bb88e8b6c3e571671534e029359b6d9e845923eb1b", size = 46073, upload-time = "2025-10-06T14:49:50.28Z" },
-    { url = "https://files.pythonhosted.org/packages/25/ec/aad2613c1910dce907480e0c3aa306905830f25df2e54ccc9dea450cb5aa/multidict-6.7.0-cp312-cp312-win_arm64.whl", hash = "sha256:490dab541a6a642ce1a9d61a4781656b346a55c13038f0b1244653828e3a83ec", size = 43226, upload-time = "2025-10-06T14:49:52.304Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/86/33272a544eeb36d66e4d9a920602d1a2f57d4ebea4ef3cdfe5a912574c95/multidict-6.7.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bee7c0588aa0076ce77c0ea5d19a68d76ad81fcd9fe8501003b9a24f9d4000f6", size = 76135, upload-time = "2025-10-06T14:49:54.26Z" },
-    { url = "https://files.pythonhosted.org/packages/91/1c/eb97db117a1ebe46d457a3d235a7b9d2e6dcab174f42d1b67663dd9e5371/multidict-6.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7ef6b61cad77091056ce0e7ce69814ef72afacb150b7ac6a3e9470def2198159", size = 45117, upload-time = "2025-10-06T14:49:55.82Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/d8/6c3442322e41fb1dd4de8bd67bfd11cd72352ac131f6368315617de752f1/multidict-6.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c0359b1ec12b1d6849c59f9d319610b7f20ef990a6d454ab151aa0e3b9f78ca", size = 43472, upload-time = "2025-10-06T14:49:57.048Z" },
-    { url = "https://files.pythonhosted.org/packages/75/3f/e2639e80325af0b6c6febdf8e57cc07043ff15f57fa1ef808f4ccb5ac4cd/multidict-6.7.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cd240939f71c64bd658f186330603aac1a9a81bf6273f523fca63673cb7378a8", size = 249342, upload-time = "2025-10-06T14:49:58.368Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/cc/84e0585f805cbeaa9cbdaa95f9a3d6aed745b9d25700623ac89a6ecff400/multidict-6.7.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60a4d75718a5efa473ebd5ab685786ba0c67b8381f781d1be14da49f1a2dc60", size = 257082, upload-time = "2025-10-06T14:49:59.89Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/9c/ac851c107c92289acbbf5cfb485694084690c1b17e555f44952c26ddc5bd/multidict-6.7.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53a42d364f323275126aff81fb67c5ca1b7a04fda0546245730a55c8c5f24bc4", size = 240704, upload-time = "2025-10-06T14:50:01.485Z" },
-    { url = "https://files.pythonhosted.org/packages/50/cc/5f93e99427248c09da95b62d64b25748a5f5c98c7c2ab09825a1d6af0e15/multidict-6.7.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3b29b980d0ddbecb736735ee5bef69bb2ddca56eff603c86f3f29a1128299b4f", size = 266355, upload-time = "2025-10-06T14:50:02.955Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/0c/2ec1d883ceb79c6f7f6d7ad90c919c898f5d1c6ea96d322751420211e072/multidict-6.7.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8a93b1c0ed2d04b97a5e9336fd2d33371b9a6e29ab7dd6503d63407c20ffbaf", size = 267259, upload-time = "2025-10-06T14:50:04.446Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/2d/f0b184fa88d6630aa267680bdb8623fb69cb0d024b8c6f0d23f9a0f406d3/multidict-6.7.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ff96e8815eecacc6645da76c413eb3b3d34cfca256c70b16b286a687d013c32", size = 254903, upload-time = "2025-10-06T14:50:05.98Z" },
-    { url = "https://files.pythonhosted.org/packages/06/c9/11ea263ad0df7dfabcad404feb3c0dd40b131bc7f232d5537f2fb1356951/multidict-6.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7516c579652f6a6be0e266aec0acd0db80829ca305c3d771ed898538804c2036", size = 252365, upload-time = "2025-10-06T14:50:07.511Z" },
-    { url = "https://files.pythonhosted.org/packages/41/88/d714b86ee2c17d6e09850c70c9d310abac3d808ab49dfa16b43aba9d53fd/multidict-6.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:040f393368e63fb0f3330e70c26bfd336656bed925e5cbe17c9da839a6ab13ec", size = 250062, upload-time = "2025-10-06T14:50:09.074Z" },
-    { url = "https://files.pythonhosted.org/packages/15/fe/ad407bb9e818c2b31383f6131ca19ea7e35ce93cf1310fce69f12e89de75/multidict-6.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b3bc26a951007b1057a1c543af845f1c7e3e71cc240ed1ace7bf4484aa99196e", size = 249683, upload-time = "2025-10-06T14:50:10.714Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/a4/a89abdb0229e533fb925e7c6e5c40201c2873efebc9abaf14046a4536ee6/multidict-6.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7b022717c748dd1992a83e219587aabe45980d88969f01b316e78683e6285f64", size = 261254, upload-time = "2025-10-06T14:50:12.28Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/aa/0e2b27bd88b40a4fb8dc53dd74eecac70edaa4c1dd0707eb2164da3675b3/multidict-6.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:9600082733859f00d79dee64effc7aef1beb26adb297416a4ad2116fd61374bd", size = 257967, upload-time = "2025-10-06T14:50:14.16Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/8e/0c67b7120d5d5f6d874ed85a085f9dc770a7f9d8813e80f44a9fec820bb7/multidict-6.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:94218fcec4d72bc61df51c198d098ce2b378e0ccbac41ddbed5ef44092913288", size = 250085, upload-time = "2025-10-06T14:50:15.639Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/55/b73e1d624ea4b8fd4dd07a3bb70f6e4c7c6c5d9d640a41c6ffe5cdbd2a55/multidict-6.7.0-cp313-cp313-win32.whl", hash = "sha256:a37bd74c3fa9d00be2d7b8eca074dc56bd8077ddd2917a839bd989612671ed17", size = 41713, upload-time = "2025-10-06T14:50:17.066Z" },
-    { url = "https://files.pythonhosted.org/packages/32/31/75c59e7d3b4205075b4c183fa4ca398a2daf2303ddf616b04ae6ef55cffe/multidict-6.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:30d193c6cc6d559db42b6bcec8a5d395d34d60c9877a0b71ecd7c204fcf15390", size = 45915, upload-time = "2025-10-06T14:50:18.264Z" },
-    { url = "https://files.pythonhosted.org/packages/31/2a/8987831e811f1184c22bc2e45844934385363ee61c0a2dcfa8f71b87e608/multidict-6.7.0-cp313-cp313-win_arm64.whl", hash = "sha256:ea3334cabe4d41b7ccd01e4d349828678794edbc2d3ae97fc162a3312095092e", size = 43077, upload-time = "2025-10-06T14:50:19.853Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/68/7b3a5170a382a340147337b300b9eb25a9ddb573bcdfff19c0fa3f31ffba/multidict-6.7.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:ad9ce259f50abd98a1ca0aa6e490b58c316a0fce0617f609723e40804add2c00", size = 83114, upload-time = "2025-10-06T14:50:21.223Z" },
-    { url = "https://files.pythonhosted.org/packages/55/5c/3fa2d07c84df4e302060f555bbf539310980362236ad49f50eeb0a1c1eb9/multidict-6.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07f5594ac6d084cbb5de2df218d78baf55ef150b91f0ff8a21cc7a2e3a5a58eb", size = 48442, upload-time = "2025-10-06T14:50:22.871Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/56/67212d33239797f9bd91962bb899d72bb0f4c35a8652dcdb8ed049bef878/multidict-6.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:0591b48acf279821a579282444814a2d8d0af624ae0bc600aa4d1b920b6e924b", size = 46885, upload-time = "2025-10-06T14:50:24.258Z" },
-    { url = "https://files.pythonhosted.org/packages/46/d1/908f896224290350721597a61a69cd19b89ad8ee0ae1f38b3f5cd12ea2ac/multidict-6.7.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:749a72584761531d2b9467cfbdfd29487ee21124c304c4b6cb760d8777b27f9c", size = 242588, upload-time = "2025-10-06T14:50:25.716Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/67/8604288bbd68680eee0ab568fdcb56171d8b23a01bcd5cb0c8fedf6e5d99/multidict-6.7.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b4c3d199f953acd5b446bf7c0de1fe25d94e09e79086f8dc2f48a11a129cdf1", size = 249966, upload-time = "2025-10-06T14:50:28.192Z" },
-    { url = "https://files.pythonhosted.org/packages/20/33/9228d76339f1ba51e3efef7da3ebd91964d3006217aae13211653193c3ff/multidict-6.7.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9fb0211dfc3b51efea2f349ec92c114d7754dd62c01f81c3e32b765b70c45c9b", size = 228618, upload-time = "2025-10-06T14:50:29.82Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/2d/25d9b566d10cab1c42b3b9e5b11ef79c9111eaf4463b8c257a3bd89e0ead/multidict-6.7.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a027ec240fe73a8d6281872690b988eed307cd7d91b23998ff35ff577ca688b5", size = 257539, upload-time = "2025-10-06T14:50:31.731Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/b1/8d1a965e6637fc33de3c0d8f414485c2b7e4af00f42cab3d84e7b955c222/multidict-6.7.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1d964afecdf3a8288789df2f5751dc0a8261138c3768d9af117ed384e538fad", size = 256345, upload-time = "2025-10-06T14:50:33.26Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/0c/06b5a8adbdeedada6f4fb8d8f193d44a347223b11939b42953eeb6530b6b/multidict-6.7.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:caf53b15b1b7df9fbd0709aa01409000a2b4dd03a5f6f5cc548183c7c8f8b63c", size = 247934, upload-time = "2025-10-06T14:50:34.808Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/31/b2491b5fe167ca044c6eb4b8f2c9f3b8a00b24c432c365358eadac5d7625/multidict-6.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:654030da3197d927f05a536a66186070e98765aa5142794c9904555d3a9d8fb5", size = 245243, upload-time = "2025-10-06T14:50:36.436Z" },
-    { url = "https://files.pythonhosted.org/packages/61/1a/982913957cb90406c8c94f53001abd9eafc271cb3e70ff6371590bec478e/multidict-6.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:2090d3718829d1e484706a2f525e50c892237b2bf9b17a79b059cb98cddc2f10", size = 235878, upload-time = "2025-10-06T14:50:37.953Z" },
-    { url = "https://files.pythonhosted.org/packages/be/c0/21435d804c1a1cf7a2608593f4d19bca5bcbd7a81a70b253fdd1c12af9c0/multidict-6.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2d2cfeec3f6f45651b3d408c4acec0ebf3daa9bc8a112a084206f5db5d05b754", size = 243452, upload-time = "2025-10-06T14:50:39.574Z" },
-    { url = "https://files.pythonhosted.org/packages/54/0a/4349d540d4a883863191be6eb9a928846d4ec0ea007d3dcd36323bb058ac/multidict-6.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:4ef089f985b8c194d341eb2c24ae6e7408c9a0e2e5658699c92f497437d88c3c", size = 252312, upload-time = "2025-10-06T14:50:41.612Z" },
-    { url = "https://files.pythonhosted.org/packages/26/64/d5416038dbda1488daf16b676e4dbfd9674dde10a0cc8f4fc2b502d8125d/multidict-6.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e93a0617cd16998784bf4414c7e40f17a35d2350e5c6f0bd900d3a8e02bd3762", size = 246935, upload-time = "2025-10-06T14:50:43.972Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/8c/8290c50d14e49f35e0bd4abc25e1bc7711149ca9588ab7d04f886cdf03d9/multidict-6.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f0feece2ef8ebc42ed9e2e8c78fc4aa3cf455733b507c09ef7406364c94376c6", size = 243385, upload-time = "2025-10-06T14:50:45.648Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/a0/f83ae75e42d694b3fbad3e047670e511c138be747bc713cf1b10d5096416/multidict-6.7.0-cp313-cp313t-win32.whl", hash = "sha256:19a1d55338ec1be74ef62440ca9e04a2f001a04d0cc49a4983dc320ff0f3212d", size = 47777, upload-time = "2025-10-06T14:50:47.154Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/80/9b174a92814a3830b7357307a792300f42c9e94664b01dee8e457551fa66/multidict-6.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3da4fb467498df97e986af166b12d01f05d2e04f978a9c1c680ea1988e0bc4b6", size = 53104, upload-time = "2025-10-06T14:50:48.851Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/28/04baeaf0428d95bb7a7bea0e691ba2f31394338ba424fb0679a9ed0f4c09/multidict-6.7.0-cp313-cp313t-win_arm64.whl", hash = "sha256:b4121773c49a0776461f4a904cdf6264c88e42218aaa8407e803ca8025872792", size = 45503, upload-time = "2025-10-06T14:50:50.16Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/b1/3da6934455dd4b261d4c72f897e3a5728eba81db59959f3a639245891baa/multidict-6.7.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3bab1e4aff7adaa34410f93b1f8e57c4b36b9af0426a76003f441ee1d3c7e842", size = 75128, upload-time = "2025-10-06T14:50:51.92Z" },
-    { url = "https://files.pythonhosted.org/packages/14/2c/f069cab5b51d175a1a2cb4ccdf7a2c2dabd58aa5bd933fa036a8d15e2404/multidict-6.7.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b8512bac933afc3e45fb2b18da8e59b78d4f408399a960339598374d4ae3b56b", size = 44410, upload-time = "2025-10-06T14:50:53.275Z" },
-    { url = "https://files.pythonhosted.org/packages/42/e2/64bb41266427af6642b6b128e8774ed84c11b80a90702c13ac0a86bb10cc/multidict-6.7.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:79dcf9e477bc65414ebfea98ffd013cb39552b5ecd62908752e0e413d6d06e38", size = 43205, upload-time = "2025-10-06T14:50:54.911Z" },
-    { url = "https://files.pythonhosted.org/packages/02/68/6b086fef8a3f1a8541b9236c594f0c9245617c29841f2e0395d979485cde/multidict-6.7.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:31bae522710064b5cbeddaf2e9f32b1abab70ac6ac91d42572502299e9953128", size = 245084, upload-time = "2025-10-06T14:50:56.369Z" },
-    { url = "https://files.pythonhosted.org/packages/15/ee/f524093232007cd7a75c1d132df70f235cfd590a7c9eaccd7ff422ef4ae8/multidict-6.7.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a0df7ff02397bb63e2fd22af2c87dfa39e8c7f12947bc524dbdc528282c7e34", size = 252667, upload-time = "2025-10-06T14:50:57.991Z" },
-    { url = "https://files.pythonhosted.org/packages/02/a5/eeb3f43ab45878f1895118c3ef157a480db58ede3f248e29b5354139c2c9/multidict-6.7.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7a0222514e8e4c514660e182d5156a415c13ef0aabbd71682fc714e327b95e99", size = 233590, upload-time = "2025-10-06T14:50:59.589Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/1e/76d02f8270b97269d7e3dbd45644b1785bda457b474315f8cf999525a193/multidict-6.7.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2397ab4daaf2698eb51a76721e98db21ce4f52339e535725de03ea962b5a3202", size = 264112, upload-time = "2025-10-06T14:51:01.183Z" },
-    { url = "https://files.pythonhosted.org/packages/76/0b/c28a70ecb58963847c2a8efe334904cd254812b10e535aefb3bcce513918/multidict-6.7.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8891681594162635948a636c9fe0ff21746aeb3dd5463f6e25d9bea3a8a39ca1", size = 261194, upload-time = "2025-10-06T14:51:02.794Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/63/2ab26e4209773223159b83aa32721b4021ffb08102f8ac7d689c943fded1/multidict-6.7.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18706cc31dbf402a7945916dd5cddf160251b6dab8a2c5f3d6d5a55949f676b3", size = 248510, upload-time = "2025-10-06T14:51:04.724Z" },
-    { url = "https://files.pythonhosted.org/packages/93/cd/06c1fa8282af1d1c46fd55c10a7930af652afdce43999501d4d68664170c/multidict-6.7.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f844a1bbf1d207dd311a56f383f7eda2d0e134921d45751842d8235e7778965d", size = 248395, upload-time = "2025-10-06T14:51:06.306Z" },
-    { url = "https://files.pythonhosted.org/packages/99/ac/82cb419dd6b04ccf9e7e61befc00c77614fc8134362488b553402ecd55ce/multidict-6.7.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:d4393e3581e84e5645506923816b9cc81f5609a778c7e7534054091acc64d1c6", size = 239520, upload-time = "2025-10-06T14:51:08.091Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/f3/a0f9bf09493421bd8716a362e0cd1d244f5a6550f5beffdd6b47e885b331/multidict-6.7.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:fbd18dc82d7bf274b37aa48d664534330af744e03bccf696d6f4c6042e7d19e7", size = 245479, upload-time = "2025-10-06T14:51:10.365Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/01/476d38fc73a212843f43c852b0eee266b6971f0e28329c2184a8df90c376/multidict-6.7.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b6234e14f9314731ec45c42fc4554b88133ad53a09092cc48a88e771c125dadb", size = 258903, upload-time = "2025-10-06T14:51:12.466Z" },
-    { url = "https://files.pythonhosted.org/packages/49/6d/23faeb0868adba613b817d0e69c5f15531b24d462af8012c4f6de4fa8dc3/multidict-6.7.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:08d4379f9744d8f78d98c8673c06e202ffa88296f009c71bbafe8a6bf847d01f", size = 252333, upload-time = "2025-10-06T14:51:14.48Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/cc/48d02ac22b30fa247f7dad82866e4b1015431092f4ba6ebc7e77596e0b18/multidict-6.7.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9fe04da3f79387f450fd0061d4dd2e45a72749d31bf634aecc9e27f24fdc4b3f", size = 243411, upload-time = "2025-10-06T14:51:16.072Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/03/29a8bf5a18abf1fe34535c88adbdfa88c9fb869b5a3b120692c64abe8284/multidict-6.7.0-cp314-cp314-win32.whl", hash = "sha256:fbafe31d191dfa7c4c51f7a6149c9fb7e914dcf9ffead27dcfd9f1ae382b3885", size = 40940, upload-time = "2025-10-06T14:51:17.544Z" },
-    { url = "https://files.pythonhosted.org/packages/82/16/7ed27b680791b939de138f906d5cf2b4657b0d45ca6f5dd6236fdddafb1a/multidict-6.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:2f67396ec0310764b9222a1728ced1ab638f61aadc6226f17a71dd9324f9a99c", size = 45087, upload-time = "2025-10-06T14:51:18.875Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/3c/e3e62eb35a1950292fe39315d3c89941e30a9d07d5d2df42965ab041da43/multidict-6.7.0-cp314-cp314-win_arm64.whl", hash = "sha256:ba672b26069957ee369cfa7fc180dde1fc6f176eaf1e6beaf61fbebbd3d9c000", size = 42368, upload-time = "2025-10-06T14:51:20.225Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/40/cd499bd0dbc5f1136726db3153042a735fffd0d77268e2ee20d5f33c010f/multidict-6.7.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:c1dcc7524066fa918c6a27d61444d4ee7900ec635779058571f70d042d86ed63", size = 82326, upload-time = "2025-10-06T14:51:21.588Z" },
-    { url = "https://files.pythonhosted.org/packages/13/8a/18e031eca251c8df76daf0288e6790561806e439f5ce99a170b4af30676b/multidict-6.7.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:27e0b36c2d388dc7b6ced3406671b401e84ad7eb0656b8f3a2f46ed0ce483718", size = 48065, upload-time = "2025-10-06T14:51:22.93Z" },
-    { url = "https://files.pythonhosted.org/packages/40/71/5e6701277470a87d234e433fb0a3a7deaf3bcd92566e421e7ae9776319de/multidict-6.7.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a7baa46a22e77f0988e3b23d4ede5513ebec1929e34ee9495be535662c0dfe2", size = 46475, upload-time = "2025-10-06T14:51:24.352Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/6a/bab00cbab6d9cfb57afe1663318f72ec28289ea03fd4e8236bb78429893a/multidict-6.7.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7bf77f54997a9166a2f5675d1201520586439424c2511723a7312bdb4bcc034e", size = 239324, upload-time = "2025-10-06T14:51:25.822Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/5f/8de95f629fc22a7769ade8b41028e3e5a822c1f8904f618d175945a81ad3/multidict-6.7.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e011555abada53f1578d63389610ac8a5400fc70ce71156b0aa30d326f1a5064", size = 246877, upload-time = "2025-10-06T14:51:27.604Z" },
-    { url = "https://files.pythonhosted.org/packages/23/b4/38881a960458f25b89e9f4a4fdcb02ac101cfa710190db6e5528841e67de/multidict-6.7.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:28b37063541b897fd6a318007373930a75ca6d6ac7c940dbe14731ffdd8d498e", size = 225824, upload-time = "2025-10-06T14:51:29.664Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/39/6566210c83f8a261575f18e7144736059f0c460b362e96e9cf797a24b8e7/multidict-6.7.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05047ada7a2fde2631a0ed706f1fd68b169a681dfe5e4cf0f8e4cb6618bbc2cd", size = 253558, upload-time = "2025-10-06T14:51:31.684Z" },
-    { url = "https://files.pythonhosted.org/packages/00/a3/67f18315100f64c269f46e6c0319fa87ba68f0f64f2b8e7fd7c72b913a0b/multidict-6.7.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:716133f7d1d946a4e1b91b1756b23c088881e70ff180c24e864c26192ad7534a", size = 252339, upload-time = "2025-10-06T14:51:33.699Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/2a/1cb77266afee2458d82f50da41beba02159b1d6b1f7973afc9a1cad1499b/multidict-6.7.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d1bed1b467ef657f2a0ae62844a607909ef1c6889562de5e1d505f74457d0b96", size = 244895, upload-time = "2025-10-06T14:51:36.189Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/72/09fa7dd487f119b2eb9524946ddd36e2067c08510576d43ff68469563b3b/multidict-6.7.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ca43bdfa5d37bd6aee89d85e1d0831fb86e25541be7e9d376ead1b28974f8e5e", size = 241862, upload-time = "2025-10-06T14:51:41.291Z" },
-    { url = "https://files.pythonhosted.org/packages/65/92/bc1f8bd0853d8669300f732c801974dfc3702c3eeadae2f60cef54dc69d7/multidict-6.7.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:44b546bd3eb645fd26fb949e43c02a25a2e632e2ca21a35e2e132c8105dc8599", size = 232376, upload-time = "2025-10-06T14:51:43.55Z" },
-    { url = "https://files.pythonhosted.org/packages/09/86/ac39399e5cb9d0c2ac8ef6e10a768e4d3bc933ac808d49c41f9dc23337eb/multidict-6.7.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a6ef16328011d3f468e7ebc326f24c1445f001ca1dec335b2f8e66bed3006394", size = 240272, upload-time = "2025-10-06T14:51:45.265Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/b6/fed5ac6b8563ec72df6cb1ea8dac6d17f0a4a1f65045f66b6d3bf1497c02/multidict-6.7.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:5aa873cbc8e593d361ae65c68f85faadd755c3295ea2c12040ee146802f23b38", size = 248774, upload-time = "2025-10-06T14:51:46.836Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/8d/b954d8c0dc132b68f760aefd45870978deec6818897389dace00fcde32ff/multidict-6.7.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:3d7b6ccce016e29df4b7ca819659f516f0bc7a4b3efa3bb2012ba06431b044f9", size = 242731, upload-time = "2025-10-06T14:51:48.541Z" },
-    { url = "https://files.pythonhosted.org/packages/16/9d/a2dac7009125d3540c2f54e194829ea18ac53716c61b655d8ed300120b0f/multidict-6.7.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:171b73bd4ee683d307599b66793ac80981b06f069b62eea1c9e29c9241aa66b0", size = 240193, upload-time = "2025-10-06T14:51:50.355Z" },
-    { url = "https://files.pythonhosted.org/packages/39/ca/c05f144128ea232ae2178b008d5011d4e2cea86e4ee8c85c2631b1b94802/multidict-6.7.0-cp314-cp314t-win32.whl", hash = "sha256:b2d7f80c4e1fd010b07cb26820aae86b7e73b681ee4889684fb8d2d4537aab13", size = 48023, upload-time = "2025-10-06T14:51:51.883Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/8f/0a60e501584145588be1af5cc829265701ba3c35a64aec8e07cbb71d39bb/multidict-6.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:09929cab6fcb68122776d575e03c6cc64ee0b8fca48d17e135474b042ce515cd", size = 53507, upload-time = "2025-10-06T14:51:53.672Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/ae/3148b988a9c6239903e786eac19c889fab607c31d6efa7fb2147e5680f23/multidict-6.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:cc41db090ed742f32bd2d2c721861725e6109681eddf835d0a82bd3a5c382827", size = 44804, upload-time = "2025-10-06T14:51:55.415Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/da/7d22601b625e241d4f23ef1ebff8acfc60da633c9e7e7922e24d10f592b3/multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3", size = 12317, upload-time = "2025-10-06T14:52:29.272Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/0b/19348d4c98980c4851d2f943f8ebafdece2ae7ef737adcfa5994ce8e5f10/multidict-6.7.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c93c3db7ea657dd4637d57e74ab73de31bccefe144d3d4ce370052035bc85fb5", size = 77176, upload-time = "2026-01-26T02:42:59.784Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/04/9de3f8077852e3d438215c81e9b691244532d2e05b4270e89ce67b7d103c/multidict-6.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:974e72a2474600827abaeda71af0c53d9ebbc3c2eb7da37b37d7829ae31232d8", size = 44996, upload-time = "2026-01-26T02:43:01.674Z" },
+    { url = "https://files.pythonhosted.org/packages/31/5c/08c7f7fe311f32e83f7621cd3f99d805f45519cd06fafb247628b861da7d/multidict-6.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdea2e7b2456cfb6694fb113066fd0ec7ea4d67e3a35e1f4cbeea0b448bf5872", size = 44631, upload-time = "2026-01-26T02:43:03.169Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/7f/0e3b1390ae772f27501199996b94b52ceeb64fe6f9120a32c6c3f6b781be/multidict-6.7.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17207077e29342fdc2c9a82e4b306f1127bf1ea91f8b71e02d4798a70bb99991", size = 242561, upload-time = "2026-01-26T02:43:04.733Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/f4/8719f4f167586af317b69dd3e90f913416c91ca610cac79a45c53f590312/multidict-6.7.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4f49cb5661344764e4c7c7973e92a47a59b8fc19b6523649ec9dc4960e58a03", size = 242223, upload-time = "2026-01-26T02:43:06.695Z" },
+    { url = "https://files.pythonhosted.org/packages/47/ab/7c36164cce64a6ad19c6d9a85377b7178ecf3b89f8fd589c73381a5eedfd/multidict-6.7.1-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a9fc4caa29e2e6ae408d1c450ac8bf19892c5fca83ee634ecd88a53332c59981", size = 222322, upload-time = "2026-01-26T02:43:08.472Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/79/a25add6fb38035b5337bc5734f296d9afc99163403bbcf56d4170f97eb62/multidict-6.7.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c5f0c21549ab432b57dcc82130f388d84ad8179824cc3f223d5e7cfbfd4143f6", size = 254005, upload-time = "2026-01-26T02:43:10.127Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/7b/64a87cf98e12f756fc8bd444b001232ffff2be37288f018ad0d3f0aae931/multidict-6.7.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7dfb78d966b2c906ae1d28ccf6e6712a3cd04407ee5088cd276fe8cb42186190", size = 251173, upload-time = "2026-01-26T02:43:11.731Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/ac/b605473de2bb404e742f2cc3583d12aedb2352a70e49ae8fce455b50c5aa/multidict-6.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b0d9b91d1aa44db9c1f1ecd0d9d2ae610b2f4f856448664e01a3b35899f3f92", size = 243273, upload-time = "2026-01-26T02:43:13.063Z" },
+    { url = "https://files.pythonhosted.org/packages/03/65/11492d6a0e259783720f3bc1d9ea55579a76f1407e31ed44045c99542004/multidict-6.7.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dd96c01a9dcd4889dcfcf9eb5544ca0c77603f239e3ffab0524ec17aea9a93ee", size = 238956, upload-time = "2026-01-26T02:43:14.843Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/a7/7ee591302af64e7c196fb63fe856c788993c1372df765102bd0448e7e165/multidict-6.7.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:067343c68cd6612d375710f895337b3a98a033c94f14b9a99eff902f205424e2", size = 233477, upload-time = "2026-01-26T02:43:16.025Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/99/c109962d58756c35fd9992fed7f2355303846ea2ff054bb5f5e9d6b888de/multidict-6.7.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5884a04f4ff56c6120f6ccf703bdeb8b5079d808ba604d4d53aec0d55dc33568", size = 243615, upload-time = "2026-01-26T02:43:17.84Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/5f/1973e7c771c86e93dcfe1c9cc55a5481b610f6614acfc28c0d326fe6bfad/multidict-6.7.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8affcf1c98b82bc901702eb73b6947a1bfa170823c153fe8a47b5f5f02e48e40", size = 249930, upload-time = "2026-01-26T02:43:19.06Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/a5/f170fc2268c3243853580203378cd522446b2df632061e0a5409817854c7/multidict-6.7.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0d17522c37d03e85c8098ec8431636309b2682cf12e58f4dbc76121fb50e4962", size = 243807, upload-time = "2026-01-26T02:43:20.286Z" },
+    { url = "https://files.pythonhosted.org/packages/de/01/73856fab6d125e5bc652c3986b90e8699a95e84b48d72f39ade6c0e74a8c/multidict-6.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24c0cf81544ca5e17cfcb6e482e7a82cd475925242b308b890c9452a074d4505", size = 239103, upload-time = "2026-01-26T02:43:21.508Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/46/f1220bd9944d8aa40d8ccff100eeeee19b505b857b6f603d6078cb5315b0/multidict-6.7.1-cp310-cp310-win32.whl", hash = "sha256:d82dd730a95e6643802f4454b8fdecdf08667881a9c5670db85bc5a56693f122", size = 41416, upload-time = "2026-01-26T02:43:22.703Z" },
+    { url = "https://files.pythonhosted.org/packages/68/00/9b38e272a770303692fc406c36e1a4c740f401522d5787691eb38a8925a8/multidict-6.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:cf37cbe5ced48d417ba045aca1b21bafca67489452debcde94778a576666a1df", size = 46022, upload-time = "2026-01-26T02:43:23.77Z" },
+    { url = "https://files.pythonhosted.org/packages/64/65/d8d42490c02ee07b6bbe00f7190d70bb4738b3cce7629aaf9f213ef730dd/multidict-6.7.1-cp310-cp310-win_arm64.whl", hash = "sha256:59bc83d3f66b41dac1e7460aac1d196edc70c9ba3094965c467715a70ecb46db", size = 43238, upload-time = "2026-01-26T02:43:24.882Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" },
+    { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" },
+    { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" },
+    { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" },
+    { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" },
+    { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" },
+    { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" },
+    { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174, upload-time = "2026-01-26T02:44:18.509Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116, upload-time = "2026-01-26T02:44:19.745Z" },
+    { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524, upload-time = "2026-01-26T02:44:21.571Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368, upload-time = "2026-01-26T02:44:22.803Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952, upload-time = "2026-01-26T02:44:24.306Z" },
+    { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317, upload-time = "2026-01-26T02:44:25.772Z" },
+    { url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132, upload-time = "2026-01-26T02:44:27.648Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140, upload-time = "2026-01-26T02:44:29.588Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277, upload-time = "2026-01-26T02:44:30.902Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291, upload-time = "2026-01-26T02:44:32.31Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156, upload-time = "2026-01-26T02:44:33.734Z" },
+    { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742, upload-time = "2026-01-26T02:44:35.222Z" },
+    { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221, upload-time = "2026-01-26T02:44:36.604Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664, upload-time = "2026-01-26T02:44:38.008Z" },
+    { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490, upload-time = "2026-01-26T02:44:39.386Z" },
+    { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695, upload-time = "2026-01-26T02:44:41.318Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884, upload-time = "2026-01-26T02:44:42.488Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122, upload-time = "2026-01-26T02:44:43.664Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175, upload-time = "2026-01-26T02:44:44.894Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460, upload-time = "2026-01-26T02:44:46.106Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930, upload-time = "2026-01-26T02:44:47.278Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582, upload-time = "2026-01-26T02:44:48.604Z" },
+    { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031, upload-time = "2026-01-26T02:44:50.544Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596, upload-time = "2026-01-26T02:44:51.951Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 257492, upload-time = "2026-01-26T02:44:53.902Z" },
+    { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899, upload-time = "2026-01-26T02:44:55.316Z" },
+    { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970, upload-time = "2026-01-26T02:44:56.783Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060, upload-time = "2026-01-26T02:44:58.195Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888, upload-time = "2026-01-26T02:44:59.57Z" },
+    { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554, upload-time = "2026-01-26T02:45:01.054Z" },
+    { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341, upload-time = "2026-01-26T02:45:02.484Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391, upload-time = "2026-01-26T02:45:03.862Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422, upload-time = "2026-01-26T02:45:05.296Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770, upload-time = "2026-01-26T02:45:06.754Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109, upload-time = "2026-01-26T02:45:08.044Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573, upload-time = "2026-01-26T02:45:09.349Z" },
+    { url = "https://files.pythonhosted.org/packages/91/cc/db74228a8be41884a567e88a62fd589a913708fcf180d029898c17a9a371/multidict-6.7.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8f333ec9c5eb1b7105e3b84b53141e66ca05a19a605368c55450b6ba208cb9ee", size = 75190, upload-time = "2026-01-26T02:45:10.651Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a407f13c188f804c759fc6a9f88286a565c242a76b27626594c133b82883b5c2", size = 44486, upload-time = "2026-01-26T02:45:11.938Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e161ddf326db5577c3a4cc2d8648f81456e8a20d40415541587a71620d7a7d1", size = 43219, upload-time = "2026-01-26T02:45:14.346Z" },
+    { url = "https://files.pythonhosted.org/packages/24/bb/2c0c2287963f4259c85e8bcbba9182ced8d7fca65c780c38e99e61629d11/multidict-6.7.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1e3a8bb24342a8201d178c3b4984c26ba81a577c80d4d525727427460a50c22d", size = 245132, upload-time = "2026-01-26T02:45:15.712Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97231140a50f5d447d3164f994b86a0bed7cd016e2682f8650d6a9158e14fd31", size = 252420, upload-time = "2026-01-26T02:45:17.293Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/13/78f7275e73fa17b24c9a51b0bd9d73ba64bb32d0ed51b02a746eb876abe7/multidict-6.7.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6b10359683bd8806a200fd2909e7c8ca3a7b24ec1d8132e483d58e791d881048", size = 233510, upload-time = "2026-01-26T02:45:19.356Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/25/8167187f62ae3cbd52da7893f58cb036b47ea3fb67138787c76800158982/multidict-6.7.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:283ddac99f7ac25a4acadbf004cb5ae34480bbeb063520f70ce397b281859362", size = 264094, upload-time = "2026-01-26T02:45:20.834Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/e7/69a3a83b7b030cf283fb06ce074a05a02322359783424d7edf0f15fe5022/multidict-6.7.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:538cec1e18c067d0e6103aa9a74f9e832904c957adc260e61cd9d8cf0c3b3d37", size = 260786, upload-time = "2026-01-26T02:45:22.818Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eee46ccb30ff48a1e35bb818cc90846c6be2b68240e42a78599166722cea709", size = 248483, upload-time = "2026-01-26T02:45:24.368Z" },
+    { url = "https://files.pythonhosted.org/packages/48/5a/d5a99e3acbca0e29c5d9cba8f92ceb15dce78bab963b308ae692981e3a5d/multidict-6.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa263a02f4f2dd2d11a7b1bb4362aa7cb1049f84a9235d31adf63f30143469a0", size = 248403, upload-time = "2026-01-26T02:45:25.982Z" },
+    { url = "https://files.pythonhosted.org/packages/35/48/e58cd31f6c7d5102f2a4bf89f96b9cf7e00b6c6f3d04ecc44417c00a5a3c/multidict-6.7.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:2e1425e2f99ec5bd36c15a01b690a1a2456209c5deed58f95469ffb46039ccbb", size = 240315, upload-time = "2026-01-26T02:45:27.487Z" },
+    { url = "https://files.pythonhosted.org/packages/94/33/1cd210229559cb90b6786c30676bb0c58249ff42f942765f88793b41fdce/multidict-6.7.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:497394b3239fc6f0e13a78a3e1b61296e72bf1c5f94b4c4eb80b265c37a131cd", size = 245528, upload-time = "2026-01-26T02:45:28.991Z" },
+    { url = "https://files.pythonhosted.org/packages/64/f2/6e1107d226278c876c783056b7db43d800bb64c6131cec9c8dfb6903698e/multidict-6.7.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:233b398c29d3f1b9676b4b6f75c518a06fcb2ea0b925119fb2c1bc35c05e1601", size = 258784, upload-time = "2026-01-26T02:45:30.503Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/c1/11f664f14d525e4a1b5327a82d4de61a1db604ab34c6603bb3c2cc63ad34/multidict-6.7.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:93b1818e4a6e0930454f0f2af7dfce69307ca03cdcfb3739bf4d91241967b6c1", size = 251980, upload-time = "2026-01-26T02:45:32.603Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/9f/75a9ac888121d0c5bbd4ecf4eead45668b1766f6baabfb3b7f66a410e231/multidict-6.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f33dc2a3abe9249ea5d8360f969ec7f4142e7ac45ee7014d8f8d5acddf178b7b", size = 243602, upload-time = "2026-01-26T02:45:34.043Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/e7/50bf7b004cc8525d80dbbbedfdc7aed3e4c323810890be4413e589074032/multidict-6.7.1-cp314-cp314-win32.whl", hash = "sha256:3ab8b9d8b75aef9df299595d5388b14530839f6422333357af1339443cff777d", size = 40930, upload-time = "2026-01-26T02:45:36.278Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:5e01429a929600e7dab7b166062d9bb54a5eed752384c7384c968c2afab8f50f", size = 45074, upload-time = "2026-01-26T02:45:37.546Z" },
+    { url = "https://files.pythonhosted.org/packages/97/ab/22803b03285fa3a525f48217963da3a65ae40f6a1b6f6cf2768879e208f9/multidict-6.7.1-cp314-cp314-win_arm64.whl", hash = "sha256:4885cb0e817aef5d00a2e8451d4665c1808378dc27c2705f1bf4ef8505c0d2e5", size = 42471, upload-time = "2026-01-26T02:45:38.889Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/6d/f9293baa6146ba9507e360ea0292b6422b016907c393e2f63fc40ab7b7b5/multidict-6.7.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0458c978acd8e6ea53c81eefaddbbee9c6c5e591f41b3f5e8e194780fe026581", size = 82401, upload-time = "2026-01-26T02:45:40.254Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/68/53b5494738d83558d87c3c71a486504d8373421c3e0dbb6d0db48ad42ee0/multidict-6.7.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:c0abd12629b0af3cf590982c0b413b1e7395cd4ec026f30986818ab95bfaa94a", size = 48143, upload-time = "2026-01-26T02:45:41.635Z" },
+    { url = "https://files.pythonhosted.org/packages/37/e8/5284c53310dcdc99ce5d66563f6e5773531a9b9fe9ec7a615e9bc306b05f/multidict-6.7.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:14525a5f61d7d0c94b368a42cff4c9a4e7ba2d52e2672a7b23d84dc86fb02b0c", size = 46507, upload-time = "2026-01-26T02:45:42.99Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/fc/6800d0e5b3875568b4083ecf5f310dcf91d86d52573160834fb4bfcf5e4f/multidict-6.7.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17307b22c217b4cf05033dabefe68255a534d637c6c9b0cc8382718f87be4262", size = 239358, upload-time = "2026-01-26T02:45:44.376Z" },
+    { url = "https://files.pythonhosted.org/packages/41/75/4ad0973179361cdf3a113905e6e088173198349131be2b390f9fa4da5fc6/multidict-6.7.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a7e590ff876a3eaf1c02a4dfe0724b6e69a9e9de6d8f556816f29c496046e59", size = 246884, upload-time = "2026-01-26T02:45:47.167Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/9c/095bb28b5da139bd41fb9a5d5caff412584f377914bd8787c2aa98717130/multidict-6.7.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5fa6a95dfee63893d80a34758cd0e0c118a30b8dcb46372bf75106c591b77889", size = 225878, upload-time = "2026-01-26T02:45:48.698Z" },
+    { url = "https://files.pythonhosted.org/packages/07/d0/c0a72000243756e8f5a277b6b514fa005f2c73d481b7d9e47cd4568aa2e4/multidict-6.7.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0543217a6a017692aa6ae5cc39adb75e587af0f3a82288b1492eb73dd6cc2a4", size = 253542, upload-time = "2026-01-26T02:45:50.164Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/6b/f69da15289e384ecf2a68837ec8b5ad8c33e973aa18b266f50fe55f24b8c/multidict-6.7.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f99fe611c312b3c1c0ace793f92464d8cd263cc3b26b5721950d977b006b6c4d", size = 252403, upload-time = "2026-01-26T02:45:51.779Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/76/b9669547afa5a1a25cd93eaca91c0da1c095b06b6d2d8ec25b713588d3a1/multidict-6.7.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9004d8386d133b7e6135679424c91b0b854d2d164af6ea3f289f8f2761064609", size = 244889, upload-time = "2026-01-26T02:45:53.27Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/a9/a50d2669e506dad33cfc45b5d574a205587b7b8a5f426f2fbb2e90882588/multidict-6.7.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e628ef0e6859ffd8273c69412a2465c4be4a9517d07261b33334b5ec6f3c7489", size = 241982, upload-time = "2026-01-26T02:45:54.919Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/bb/1609558ad8b456b4827d3c5a5b775c93b87878fd3117ed3db3423dfbce1b/multidict-6.7.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:841189848ba629c3552035a6a7f5bf3b02eb304e9fea7492ca220a8eda6b0e5c", size = 232415, upload-time = "2026-01-26T02:45:56.981Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/59/6f61039d2aa9261871e03ab9dc058a550d240f25859b05b67fd70f80d4b3/multidict-6.7.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce1bbd7d780bb5a0da032e095c951f7014d6b0a205f8318308140f1a6aba159e", size = 240337, upload-time = "2026-01-26T02:45:58.698Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/29/fdc6a43c203890dc2ae9249971ecd0c41deaedfe00d25cb6564b2edd99eb/multidict-6.7.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b26684587228afed0d50cf804cc71062cc9c1cdf55051c4c6345d372947b268c", size = 248788, upload-time = "2026-01-26T02:46:00.862Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/14/a153a06101323e4cf086ecee3faadba52ff71633d471f9685c42e3736163/multidict-6.7.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9f9af11306994335398293f9958071019e3ab95e9a707dc1383a35613f6abcb9", size = 242842, upload-time = "2026-01-26T02:46:02.824Z" },
+    { url = "https://files.pythonhosted.org/packages/41/5f/604ae839e64a4a6efc80db94465348d3b328ee955e37acb24badbcd24d83/multidict-6.7.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b4938326284c4f1224178a560987b6cf8b4d38458b113d9b8c1db1a836e640a2", size = 240237, upload-time = "2026-01-26T02:46:05.898Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/60/c3a5187bf66f6fb546ff4ab8fb5a077cbdd832d7b1908d4365c7f74a1917/multidict-6.7.1-cp314-cp314t-win32.whl", hash = "sha256:98655c737850c064a65e006a3df7c997cd3b220be4ec8fe26215760b9697d4d7", size = 48008, upload-time = "2026-01-26T02:46:07.468Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/f7/addf1087b860ac60e6f382240f64fb99f8bfb532bb06f7c542b83c29ca61/multidict-6.7.1-cp314-cp314t-win_amd64.whl", hash = "sha256:497bde6223c212ba11d462853cfa4f0ae6ef97465033e7dc9940cdb3ab5b48e5", size = 53542, upload-time = "2026-01-26T02:46:08.809Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/81/4629d0aa32302ef7b2ec65c75a728cc5ff4fa410c50096174c1632e70b3e/multidict-6.7.1-cp314-cp314t-win_arm64.whl", hash = "sha256:2bbd113e0d4af5db41d5ebfe9ccaff89de2120578164f86a5d17d5a576d1e5b2", size = 44719, upload-time = "2026-01-26T02:46:11.146Z" },
+    { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
 ]
 
 [[package]]
 name = "multiprocess"
 version = "0.70.18"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "dill" },
+    { name = "dill", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503, upload-time = "2025-04-17T03:11:27.742Z" }
 wheels = [
@@ -2751,10 +3165,36 @@ wheels = [
 ]
 
 [[package]]
-name = "mypy-extensions"
-version = "1.1.0"
+name = "multiprocess"
+version = "0.70.19"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "dill", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8b/b6/10832f96b499690854e574360be342a282f5f7dba58eff791299ff6c0637/multiprocess-0.70.19-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:02e5c35d7d6cd2bdc89c1858867f7bde4012837411023a4696c148c1bdd7c80e", size = 135131, upload-time = "2026-01-19T06:47:20.479Z" },
+    { url = "https://files.pythonhosted.org/packages/99/50/faef2d8106534b0dc4a0b772668a1a99682696ebf17d3c0f13f2ed6a656a/multiprocess-0.70.19-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:79576c02d1207ec405b00cabf2c643c36070800cca433860e14539df7818b2aa", size = 135131, upload-time = "2026-01-19T06:47:21.879Z" },
+    { url = "https://files.pythonhosted.org/packages/94/b1/0b71d18b76bf423c2e8ee00b31db37d17297ab3b4db44e188692afdca628/multiprocess-0.70.19-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c6b6d78d43a03b68014ca1f0b7937d965393a670c5de7c29026beb2258f2f896", size = 135134, upload-time = "2026-01-19T06:47:23.262Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = "2026-01-19T06:47:26.636Z" },
+    { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" },
+    { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" },
+    { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/74/d2c27e03cb84251dfe7249b8e82923643c6d48fa4883b9476b025e7dc7eb/multiprocess-0.70.19-py313-none-any.whl", hash = "sha256:8d5eb4ec5017ba2fab4e34a747c6d2c2b6fecfe9e7236e77988db91580ada952", size = 156414, upload-time = "2026-01-19T06:47:35.915Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/61/af9115673a5870fd885247e2f1b68c4f1197737da315b520a91c757a861a/multiprocess-0.70.19-py314-none-any.whl", hash = "sha256:e8cc7fbdff15c0613f0a1f1f8744bef961b0a164c0ca29bdff53e9d2d93c5e5f", size = 160318, upload-time = "2026-01-19T06:47:37.497Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" },
+]
+
+[[package]]
+name = "mypy-extensions"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
@@ -2763,20 +3203,59 @@ wheels = [
 name = "myst-parser"
 version = "4.0.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
 dependencies = [
-    { name = "docutils" },
-    { name = "jinja2" },
-    { name = "markdown-it-py" },
-    { name = "mdit-py-plugins" },
-    { name = "pyyaml" },
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "mdit-py-plugins", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyyaml", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/66/a5/9626ba4f73555b3735ad86247a8077d4603aa8628537687c839ab08bfe44/myst_parser-4.0.1.tar.gz", hash = "sha256:5cfea715e4f3574138aecbf7d54132296bfd72bb614d31168f48c477a830a7c4", size = 93985, upload-time = "2025-02-12T10:53:03.833Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5f/df/76d0321c3797b54b60fef9ec3bd6f4cfd124b9e422182156a1dd418722cf/myst_parser-4.0.1-py3-none-any.whl", hash = "sha256:9134e88959ec3b5780aedf8a99680ea242869d012e8821db3126d427edc9c95d", size = 84579, upload-time = "2025-02-12T10:53:02.078Z" },
 ]
 
+[[package]]
+name = "myst-parser"
+version = "5.0.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "mdit-py-plugins", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/fa/7b45eef11b7971f0beb29d27b7bfe0d747d063aa29e170d9edd004733c8a/myst_parser-5.0.0.tar.gz", hash = "sha256:f6f231452c56e8baa662cc352c548158f6a16fcbd6e3800fc594978002b94f3a", size = 98535, upload-time = "2026-01-15T09:08:18.036Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/ac/686789b9145413f1a61878c407210e41bfdb097976864e0913078b24098c/myst_parser-5.0.0-py3-none-any.whl", hash = "sha256:ab31e516024918296e169139072b81592336f2fef55b8986aa31c9f04b5f7211", size = 84533, upload-time = "2026-01-15T09:08:16.788Z" },
+]
+
 [[package]]
 name = "nemo-run"
 version = "0.7.0rc0.dev0"
@@ -2790,7 +3269,7 @@ dependencies = [
     { name = "jinja2" },
     { name = "leptonai" },
     { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "omegaconf" },
     { name = "packaging" },
     { name = "rich" },
@@ -2814,21 +3293,29 @@ wheels = [
 
 [[package]]
 name = "networkx"
-version = "3.6"
+version = "3.6.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e8/fc/7b6fd4d22c8c4dc5704430140d8b3f520531d4fe7328b8f8d03f5a7950e8/networkx-3.6.tar.gz", hash = "sha256:285276002ad1f7f7da0f7b42f004bcba70d381e936559166363707fdad3d72ad", size = 2511464, upload-time = "2025-11-24T03:03:47.158Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/07/c7/d64168da60332c17d24c0d2f08bdf3987e8d1ae9d84b5bbd0eec2eb26a55/networkx-3.6-py3-none-any.whl", hash = "sha256:cdb395b105806062473d3be36458d8f1459a4e4b98e236a66c3a48996e07684f", size = 2063713, upload-time = "2025-11-24T03:03:45.21Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
 ]
 
 [[package]]
@@ -2859,7 +3346,7 @@ wheels = [
 
 [[package]]
 name = "nltk"
-version = "3.9.2"
+version = "3.9.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -2867,9 +3354,9 @@ dependencies = [
     { name = "regex" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f9/76/3a5e4312c19a028770f86fd7c058cf9f4ec4321c6cf7526bab998a5b683c/nltk-3.9.2.tar.gz", hash = "sha256:0f409e9b069ca4177c1903c3e843eef90c7e92992fa4931ae607da6de49e1419", size = 2887629, upload-time = "2025-10-01T07:19:23.764Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e1/8f/915e1c12df07c70ed779d18ab83d065718a926e70d3ea33eb0cd66ffb7c0/nltk-3.9.3.tar.gz", hash = "sha256:cb5945d6424a98d694c2b9a0264519fab4363711065a46aa0ae7a2195b92e71f", size = 2923673, upload-time = "2026-02-24T12:05:53.833Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a", size = 1513404, upload-time = "2025-10-01T07:19:21.648Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/7e/9af5a710a1236e4772de8dfcc6af942a561327bb9f42b5b4a24d0cf100fd/nltk-3.9.3-py3-none-any.whl", hash = "sha256:60b3db6e9995b3dd976b1f0fa7dec22069b2677e759c28eb69b62ddd44870522", size = 1525385, upload-time = "2026-02-24T12:05:46.54Z" },
 ]
 
 [[package]]
@@ -2940,106 +3427,112 @@ wheels = [
 
 [[package]]
 name = "numpy"
-version = "2.3.5"
+version = "2.4.2"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/43/77/84dd1d2e34d7e2792a236ba180b5e8fcc1e3e414e761ce0253f63d7f572e/numpy-2.3.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de5672f4a7b200c15a4127042170a694d4df43c992948f5e1af57f0174beed10", size = 17034641, upload-time = "2025-11-16T22:49:19.336Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/ea/25e26fa5837106cde46ae7d0b667e20f69cbbc0efd64cba8221411ab26ae/numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218", size = 12528324, upload-time = "2025-11-16T22:49:22.582Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/1a/e85f0eea4cf03d6a0228f5c0256b53f2df4bc794706e7df019fc622e47f1/numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d", size = 5356872, upload-time = "2025-11-16T22:49:25.408Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/bb/35ef04afd567f4c989c2060cde39211e4ac5357155c1833bcd1166055c61/numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5", size = 6893148, upload-time = "2025-11-16T22:49:27.549Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/2b/05bbeb06e2dff5eab512dfc678b1cc5ee94d8ac5956a0885c64b6b26252b/numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7", size = 14557282, upload-time = "2025-11-16T22:49:30.964Z" },
-    { url = "https://files.pythonhosted.org/packages/65/fb/2b23769462b34398d9326081fad5655198fcf18966fcb1f1e49db44fbf31/numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4", size = 16897903, upload-time = "2025-11-16T22:49:34.191Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/14/085f4cf05fc3f1e8aa95e85404e984ffca9b2275a5dc2b1aae18a67538b8/numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e", size = 16341672, upload-time = "2025-11-16T22:49:37.2Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/3b/1f73994904142b2aa290449b3bb99772477b5fd94d787093e4f24f5af763/numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748", size = 18838896, upload-time = "2025-11-16T22:49:39.727Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/b9/cf6649b2124f288309ffc353070792caf42ad69047dcc60da85ee85fea58/numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c", size = 6563608, upload-time = "2025-11-16T22:49:42.079Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/44/9fe81ae1dcc29c531843852e2874080dc441338574ccc4306b39e2ff6e59/numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c", size = 13078442, upload-time = "2025-11-16T22:49:43.99Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/a7/f99a41553d2da82a20a2f22e93c94f928e4490bb447c9ff3c4ff230581d3/numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa", size = 10458555, upload-time = "2025-11-16T22:49:47.092Z" },
-    { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" },
-    { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" },
-    { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" },
-    { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" },
-    { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" },
-    { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" },
-    { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" },
-    { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" },
-    { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" },
-    { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" },
-    { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" },
-    { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" },
-    { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" },
-    { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" },
-    { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" },
-    { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" },
-    { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" },
-    { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" },
-    { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" },
-    { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" },
-    { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" },
-    { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" },
-    { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" },
-    { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" },
-    { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" },
-    { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" },
-    { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" },
-    { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/65/f9dea8e109371ade9c782b4e4756a82edf9d3366bca495d84d79859a0b79/numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310", size = 16910689, upload-time = "2025-11-16T22:52:23.247Z" },
-    { url = "https://files.pythonhosted.org/packages/00/4f/edb00032a8fb92ec0a679d3830368355da91a69cab6f3e9c21b64d0bb986/numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c", size = 12457053, upload-time = "2025-11-16T22:52:26.367Z" },
-    { url = "https://files.pythonhosted.org/packages/16/a4/e8a53b5abd500a63836a29ebe145fc1ab1f2eefe1cfe59276020373ae0aa/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18", size = 5285635, upload-time = "2025-11-16T22:52:29.266Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/2f/37eeb9014d9c8b3e9c55bc599c68263ca44fdbc12a93e45a21d1d56df737/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff", size = 6801770, upload-time = "2025-11-16T22:52:31.421Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/e4/68d2f474df2cb671b2b6c2986a02e520671295647dad82484cde80ca427b/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb", size = 14391768, upload-time = "2025-11-16T22:52:33.593Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/50/94ccd8a2b141cb50651fddd4f6a48874acb3c91c8f0842b08a6afc4b0b21/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7", size = 16729263, upload-time = "2025-11-16T22:52:36.369Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" },
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/57/fd/0005efbd0af48e55eb3c7208af93f2862d4b1a56cd78e84309a2d959208d/numpy-2.4.2.tar.gz", hash = "sha256:659a6107e31a83c4e33f763942275fd278b21d095094044eb35569e86a21ddae", size = 20723651, upload-time = "2026-01-31T23:13:10.135Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/44/71852273146957899753e69986246d6a176061ea183407e95418c2aa4d9a/numpy-2.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7e88598032542bd49af7c4747541422884219056c268823ef6e5e89851c8825", size = 16955478, upload-time = "2026-01-31T23:10:25.623Z" },
+    { url = "https://files.pythonhosted.org/packages/74/41/5d17d4058bd0cd96bcbd4d9ff0fb2e21f52702aab9a72e4a594efa18692f/numpy-2.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7edc794af8b36ca37ef5fcb5e0d128c7e0595c7b96a2318d1badb6fcd8ee86b1", size = 14965467, upload-time = "2026-01-31T23:10:28.186Z" },
+    { url = "https://files.pythonhosted.org/packages/49/48/fb1ce8136c19452ed15f033f8aee91d5defe515094e330ce368a0647846f/numpy-2.4.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:6e9f61981ace1360e42737e2bae58b27bf28a1b27e781721047d84bd754d32e7", size = 5475172, upload-time = "2026-01-31T23:10:30.848Z" },
+    { url = "https://files.pythonhosted.org/packages/40/a9/3feb49f17bbd1300dd2570432961f5c8a4ffeff1db6f02c7273bd020a4c9/numpy-2.4.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:cb7bbb88aa74908950d979eeaa24dbdf1a865e3c7e45ff0121d8f70387b55f73", size = 6805145, upload-time = "2026-01-31T23:10:32.352Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/39/fdf35cbd6d6e2fcad42fcf85ac04a85a0d0fbfbf34b30721c98d602fd70a/numpy-2.4.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f069069931240b3fc703f1e23df63443dbd6390614c8c44a87d96cd0ec81eb1", size = 15966084, upload-time = "2026-01-31T23:10:34.502Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/46/6fa4ea94f1ddf969b2ee941290cca6f1bfac92b53c76ae5f44afe17ceb69/numpy-2.4.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c02ef4401a506fb60b411467ad501e1429a3487abca4664871d9ae0b46c8ba32", size = 16899477, upload-time = "2026-01-31T23:10:37.075Z" },
+    { url = "https://files.pythonhosted.org/packages/09/a1/2a424e162b1a14a5bd860a464ab4e07513916a64ab1683fae262f735ccd2/numpy-2.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2653de5c24910e49c2b106499803124dde62a5a1fe0eedeaecf4309a5f639390", size = 17323429, upload-time = "2026-01-31T23:10:39.704Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/a2/73014149ff250628df72c58204822ac01d768697913881aacf839ff78680/numpy-2.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1ae241bbfc6ae276f94a170b14785e561cb5e7f626b6688cf076af4110887413", size = 18635109, upload-time = "2026-01-31T23:10:41.924Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/0c/73e8be2f1accd56df74abc1c5e18527822067dced5ec0861b5bb882c2ce0/numpy-2.4.2-cp311-cp311-win32.whl", hash = "sha256:df1b10187212b198dd45fa943d8985a3c8cf854aed4923796e0e019e113a1bda", size = 6237915, upload-time = "2026-01-31T23:10:45.26Z" },
+    { url = "https://files.pythonhosted.org/packages/76/ae/e0265e0163cf127c24c3969d29f1c4c64551a1e375d95a13d32eab25d364/numpy-2.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:b9c618d56a29c9cb1c4da979e9899be7578d2e0b3c24d52079c166324c9e8695", size = 12607972, upload-time = "2026-01-31T23:10:47.021Z" },
+    { url = "https://files.pythonhosted.org/packages/29/a5/c43029af9b8014d6ea157f192652c50042e8911f4300f8f6ed3336bf437f/numpy-2.4.2-cp311-cp311-win_arm64.whl", hash = "sha256:47c5a6ed21d9452b10227e5e8a0e1c22979811cad7dcc19d8e3e2fb8fa03f1a3", size = 10485763, upload-time = "2026-01-31T23:10:50.087Z" },
+    { url = "https://files.pythonhosted.org/packages/51/6e/6f394c9c77668153e14d4da83bcc247beb5952f6ead7699a1a2992613bea/numpy-2.4.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:21982668592194c609de53ba4933a7471880ccbaadcc52352694a59ecc860b3a", size = 16667963, upload-time = "2026-01-31T23:10:52.147Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/f8/55483431f2b2fd015ae6ed4fe62288823ce908437ed49db5a03d15151678/numpy-2.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40397bda92382fcec844066efb11f13e1c9a3e2a8e8f318fb72ed8b6db9f60f1", size = 14693571, upload-time = "2026-01-31T23:10:54.789Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/20/18026832b1845cdc82248208dd929ca14c9d8f2bac391f67440707fff27c/numpy-2.4.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:b3a24467af63c67829bfaa61eecf18d5432d4f11992688537be59ecd6ad32f5e", size = 5203469, upload-time = "2026-01-31T23:10:57.343Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/33/2eb97c8a77daaba34eaa3fa7241a14ac5f51c46a6bd5911361b644c4a1e2/numpy-2.4.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:805cc8de9fd6e7a22da5aed858e0ab16be5a4db6c873dde1d7451c541553aa27", size = 6550820, upload-time = "2026-01-31T23:10:59.429Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/91/b97fdfd12dc75b02c44e26c6638241cc004d4079a0321a69c62f51470c4c/numpy-2.4.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d82351358ffbcdcd7b686b90742a9b86632d6c1c051016484fa0b326a0a1548", size = 15663067, upload-time = "2026-01-31T23:11:01.291Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/c6/a18e59f3f0b8071cc85cbc8d80cd02d68aa9710170b2553a117203d46936/numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e35d3e0144137d9fdae62912e869136164534d64a169f86438bc9561b6ad49f", size = 16619782, upload-time = "2026-01-31T23:11:03.669Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/83/9751502164601a79e18847309f5ceec0b1446d7b6aa12305759b72cf98b2/numpy-2.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adb6ed2ad29b9e15321d167d152ee909ec73395901b70936f029c3bc6d7f4460", size = 17013128, upload-time = "2026-01-31T23:11:05.913Z" },
+    { url = "https://files.pythonhosted.org/packages/61/c4/c4066322256ec740acc1c8923a10047818691d2f8aec254798f3dd90f5f2/numpy-2.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8906e71fd8afcb76580404e2a950caef2685df3d2a57fe82a86ac8d33cc007ba", size = 18345324, upload-time = "2026-01-31T23:11:08.248Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/af/6157aa6da728fa4525a755bfad486ae7e3f76d4c1864138003eb84328497/numpy-2.4.2-cp312-cp312-win32.whl", hash = "sha256:ec055f6dae239a6299cace477b479cca2fc125c5675482daf1dd886933a1076f", size = 5960282, upload-time = "2026-01-31T23:11:10.497Z" },
+    { url = "https://files.pythonhosted.org/packages/92/0f/7ceaaeaacb40567071e94dbf2c9480c0ae453d5bb4f52bea3892c39dc83c/numpy-2.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:209fae046e62d0ce6435fcfe3b1a10537e858249b3d9b05829e2a05218296a85", size = 12314210, upload-time = "2026-01-31T23:11:12.176Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/a3/56c5c604fae6dd40fa2ed3040d005fca97e91bd320d232ac9931d77ba13c/numpy-2.4.2-cp312-cp312-win_arm64.whl", hash = "sha256:fbde1b0c6e81d56f5dccd95dd4a711d9b95df1ae4009a60887e56b27e8d903fa", size = 10220171, upload-time = "2026-01-31T23:11:14.684Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/22/815b9fe25d1d7ae7d492152adbc7226d3eff731dffc38fe970589fcaaa38/numpy-2.4.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25f2059807faea4b077a2b6837391b5d830864b3543627f381821c646f31a63c", size = 16663696, upload-time = "2026-01-31T23:11:17.516Z" },
+    { url = "https://files.pythonhosted.org/packages/09/f0/817d03a03f93ba9c6c8993de509277d84e69f9453601915e4a69554102a1/numpy-2.4.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bd3a7a9f5847d2fb8c2c6d1c862fa109c31a9abeca1a3c2bd5a64572955b2979", size = 14688322, upload-time = "2026-01-31T23:11:19.883Z" },
+    { url = "https://files.pythonhosted.org/packages/da/b4/f805ab79293c728b9a99438775ce51885fd4f31b76178767cfc718701a39/numpy-2.4.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8e4549f8a3c6d13d55041925e912bfd834285ef1dd64d6bc7d542583355e2e98", size = 5198157, upload-time = "2026-01-31T23:11:22.375Z" },
+    { url = "https://files.pythonhosted.org/packages/74/09/826e4289844eccdcd64aac27d13b0fd3f32039915dd5b9ba01baae1f436c/numpy-2.4.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:aea4f66ff44dfddf8c2cffd66ba6538c5ec67d389285292fe428cb2c738c8aef", size = 6546330, upload-time = "2026-01-31T23:11:23.958Z" },
+    { url = "https://files.pythonhosted.org/packages/19/fb/cbfdbfa3057a10aea5422c558ac57538e6acc87ec1669e666d32ac198da7/numpy-2.4.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3cd545784805de05aafe1dde61752ea49a359ccba9760c1e5d1c88a93bbf2b7", size = 15660968, upload-time = "2026-01-31T23:11:25.713Z" },
+    { url = "https://files.pythonhosted.org/packages/04/dc/46066ce18d01645541f0186877377b9371b8fa8017fa8262002b4ef22612/numpy-2.4.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0d9b7c93578baafcbc5f0b83eaf17b79d345c6f36917ba0c67f45226911d499", size = 16607311, upload-time = "2026-01-31T23:11:28.117Z" },
+    { url = "https://files.pythonhosted.org/packages/14/d9/4b5adfc39a43fa6bf918c6d544bc60c05236cc2f6339847fc5b35e6cb5b0/numpy-2.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f74f0f7779cc7ae07d1810aab8ac6b1464c3eafb9e283a40da7309d5e6e48fbb", size = 17012850, upload-time = "2026-01-31T23:11:30.888Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/20/adb6e6adde6d0130046e6fdfb7675cc62bc2f6b7b02239a09eb58435753d/numpy-2.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c7ac672d699bf36275c035e16b65539931347d68b70667d28984c9fb34e07fa7", size = 18334210, upload-time = "2026-01-31T23:11:33.214Z" },
+    { url = "https://files.pythonhosted.org/packages/78/0e/0a73b3dff26803a8c02baa76398015ea2a5434d9b8265a7898a6028c1591/numpy-2.4.2-cp313-cp313-win32.whl", hash = "sha256:8e9afaeb0beff068b4d9cd20d322ba0ee1cecfb0b08db145e4ab4dd44a6b5110", size = 5958199, upload-time = "2026-01-31T23:11:35.385Z" },
+    { url = "https://files.pythonhosted.org/packages/43/bc/6352f343522fcb2c04dbaf94cb30cca6fd32c1a750c06ad6231b4293708c/numpy-2.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:7df2de1e4fba69a51c06c28f5a3de36731eb9639feb8e1cf7e4a7b0daf4cf622", size = 12310848, upload-time = "2026-01-31T23:11:38.001Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/8d/6da186483e308da5da1cc6918ce913dcfe14ffde98e710bfeff2a6158d4e/numpy-2.4.2-cp313-cp313-win_arm64.whl", hash = "sha256:0fece1d1f0a89c16b03442eae5c56dc0be0c7883b5d388e0c03f53019a4bfd71", size = 10221082, upload-time = "2026-01-31T23:11:40.392Z" },
+    { url = "https://files.pythonhosted.org/packages/25/a1/9510aa43555b44781968935c7548a8926274f815de42ad3997e9e83680dd/numpy-2.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5633c0da313330fd20c484c78cdd3f9b175b55e1a766c4a174230c6b70ad8262", size = 14815866, upload-time = "2026-01-31T23:11:42.495Z" },
+    { url = "https://files.pythonhosted.org/packages/36/30/6bbb5e76631a5ae46e7923dd16ca9d3f1c93cfa8d4ed79a129814a9d8db3/numpy-2.4.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d9f64d786b3b1dd742c946c42d15b07497ed14af1a1f3ce840cce27daa0ce913", size = 5325631, upload-time = "2026-01-31T23:11:44.7Z" },
+    { url = "https://files.pythonhosted.org/packages/46/00/3a490938800c1923b567b3a15cd17896e68052e2145d8662aaf3e1ffc58f/numpy-2.4.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:b21041e8cb6a1eb5312dd1d2f80a94d91efffb7a06b70597d44f1bd2dfc315ab", size = 6646254, upload-time = "2026-01-31T23:11:46.341Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/e9/fac0890149898a9b609caa5af7455a948b544746e4b8fe7c212c8edd71f8/numpy-2.4.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:00ab83c56211a1d7c07c25e3217ea6695e50a3e2f255053686b081dc0b091a82", size = 15720138, upload-time = "2026-01-31T23:11:48.082Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/5c/08887c54e68e1e28df53709f1893ce92932cc6f01f7c3d4dc952f61ffd4e/numpy-2.4.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fb882da679409066b4603579619341c6d6898fc83a8995199d5249f986e8e8f", size = 16655398, upload-time = "2026-01-31T23:11:50.293Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/89/253db0fa0e66e9129c745e4ef25631dc37d5f1314dad2b53e907b8538e6d/numpy-2.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:66cb9422236317f9d44b67b4d18f44efe6e9c7f8794ac0462978513359461554", size = 17079064, upload-time = "2026-01-31T23:11:52.927Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/d5/cbade46ce97c59c6c3da525e8d95b7abe8a42974a1dc5c1d489c10433e88/numpy-2.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0f01dcf33e73d80bd8dc0f20a71303abbafa26a19e23f6b68d1aa9990af90257", size = 18379680, upload-time = "2026-01-31T23:11:55.22Z" },
+    { url = "https://files.pythonhosted.org/packages/40/62/48f99ae172a4b63d981babe683685030e8a3df4f246c893ea5c6ef99f018/numpy-2.4.2-cp313-cp313t-win32.whl", hash = "sha256:52b913ec40ff7ae845687b0b34d8d93b60cb66dcee06996dd5c99f2fc9328657", size = 6082433, upload-time = "2026-01-31T23:11:58.096Z" },
+    { url = "https://files.pythonhosted.org/packages/07/38/e054a61cfe48ad9f1ed0d188e78b7e26859d0b60ef21cd9de4897cdb5326/numpy-2.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:5eea80d908b2c1f91486eb95b3fb6fab187e569ec9752ab7d9333d2e66bf2d6b", size = 12451181, upload-time = "2026-01-31T23:11:59.782Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/a4/a05c3a6418575e185dd84d0b9680b6bb2e2dc3e4202f036b7b4e22d6e9dc/numpy-2.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:fd49860271d52127d61197bb50b64f58454e9f578cb4b2c001a6de8b1f50b0b1", size = 10290756, upload-time = "2026-01-31T23:12:02.438Z" },
+    { url = "https://files.pythonhosted.org/packages/18/88/b7df6050bf18fdcfb7046286c6535cabbdd2064a3440fca3f069d319c16e/numpy-2.4.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:444be170853f1f9d528428eceb55f12918e4fda5d8805480f36a002f1415e09b", size = 16663092, upload-time = "2026-01-31T23:12:04.521Z" },
+    { url = "https://files.pythonhosted.org/packages/25/7a/1fee4329abc705a469a4afe6e69b1ef7e915117747886327104a8493a955/numpy-2.4.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d1240d50adff70c2a88217698ca844723068533f3f5c5fa6ee2e3220e3bdb000", size = 14698770, upload-time = "2026-01-31T23:12:06.96Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/0b/f9e49ba6c923678ad5bc38181c08ac5e53b7a5754dbca8e581aa1a56b1ff/numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:7cdde6de52fb6664b00b056341265441192d1291c130e99183ec0d4b110ff8b1", size = 5208562, upload-time = "2026-01-31T23:12:09.632Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/12/d7de8f6f53f9bb76997e5e4c069eda2051e3fe134e9181671c4391677bb2/numpy-2.4.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:cda077c2e5b780200b6b3e09d0b42205a3d1c68f30c6dceb90401c13bff8fe74", size = 6543710, upload-time = "2026-01-31T23:12:11.969Z" },
+    { url = "https://files.pythonhosted.org/packages/09/63/c66418c2e0268a31a4cf8a8b512685748200f8e8e8ec6c507ce14e773529/numpy-2.4.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d30291931c915b2ab5717c2974bb95ee891a1cf22ebc16a8006bd59cd210d40a", size = 15677205, upload-time = "2026-01-31T23:12:14.33Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/6c/7f237821c9642fb2a04d2f1e88b4295677144ca93285fd76eff3bcba858d/numpy-2.4.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bba37bc29d4d85761deed3954a1bc62be7cf462b9510b51d367b769a8c8df325", size = 16611738, upload-time = "2026-01-31T23:12:16.525Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/a7/39c4cdda9f019b609b5c473899d87abff092fc908cfe4d1ecb2fcff453b0/numpy-2.4.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b2f0073ed0868db1dcd86e052d37279eef185b9c8db5bf61f30f46adac63c909", size = 17028888, upload-time = "2026-01-31T23:12:19.306Z" },
+    { url = "https://files.pythonhosted.org/packages/da/b3/e84bb64bdfea967cc10950d71090ec2d84b49bc691df0025dddb7c26e8e3/numpy-2.4.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7f54844851cdb630ceb623dcec4db3240d1ac13d4990532446761baede94996a", size = 18339556, upload-time = "2026-01-31T23:12:21.816Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f5/954a291bc1192a27081706862ac62bb5920fbecfbaa302f64682aa90beed/numpy-2.4.2-cp314-cp314-win32.whl", hash = "sha256:12e26134a0331d8dbd9351620f037ec470b7c75929cb8a1537f6bfe411152a1a", size = 6006899, upload-time = "2026-01-31T23:12:24.14Z" },
+    { url = "https://files.pythonhosted.org/packages/05/cb/eff72a91b2efdd1bc98b3b8759f6a1654aa87612fc86e3d87d6fe4f948c4/numpy-2.4.2-cp314-cp314-win_amd64.whl", hash = "sha256:068cdb2d0d644cdb45670810894f6a0600797a69c05f1ac478e8d31670b8ee75", size = 12443072, upload-time = "2026-01-31T23:12:26.33Z" },
+    { url = "https://files.pythonhosted.org/packages/37/75/62726948db36a56428fce4ba80a115716dc4fad6a3a4352487f8bb950966/numpy-2.4.2-cp314-cp314-win_arm64.whl", hash = "sha256:6ed0be1ee58eef41231a5c943d7d1375f093142702d5723ca2eb07db9b934b05", size = 10494886, upload-time = "2026-01-31T23:12:28.488Z" },
+    { url = "https://files.pythonhosted.org/packages/36/2f/ee93744f1e0661dc267e4b21940870cabfae187c092e1433b77b09b50ac4/numpy-2.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:98f16a80e917003a12c0580f97b5f875853ebc33e2eaa4bccfc8201ac6869308", size = 14818567, upload-time = "2026-01-31T23:12:30.709Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/24/6535212add7d76ff938d8bdc654f53f88d35cddedf807a599e180dcb8e66/numpy-2.4.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:20abd069b9cda45874498b245c8015b18ace6de8546bf50dfa8cea1696ed06ef", size = 5328372, upload-time = "2026-01-31T23:12:32.962Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/9d/c48f0a035725f925634bf6b8994253b43f2047f6778a54147d7e213bc5a7/numpy-2.4.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:e98c97502435b53741540a5717a6749ac2ada901056c7db951d33e11c885cc7d", size = 6649306, upload-time = "2026-01-31T23:12:34.797Z" },
+    { url = "https://files.pythonhosted.org/packages/81/05/7c73a9574cd4a53a25907bad38b59ac83919c0ddc8234ec157f344d57d9a/numpy-2.4.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da6cad4e82cb893db4b69105c604d805e0c3ce11501a55b5e9f9083b47d2ffe8", size = 15722394, upload-time = "2026-01-31T23:12:36.565Z" },
+    { url = "https://files.pythonhosted.org/packages/35/fa/4de10089f21fc7d18442c4a767ab156b25c2a6eaf187c0db6d9ecdaeb43f/numpy-2.4.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e4424677ce4b47fe73c8b5556d876571f7c6945d264201180db2dc34f676ab5", size = 16653343, upload-time = "2026-01-31T23:12:39.188Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/f9/d33e4ffc857f3763a57aa85650f2e82486832d7492280ac21ba9efda80da/numpy-2.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2b8f157c8a6f20eb657e240f8985cc135598b2b46985c5bccbde7616dc9c6b1e", size = 17078045, upload-time = "2026-01-31T23:12:42.041Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/b8/54bdb43b6225badbea6389fa038c4ef868c44f5890f95dd530a218706da3/numpy-2.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5daf6f3914a733336dab21a05cdec343144600e964d2fcdabaac0c0269874b2a", size = 18380024, upload-time = "2026-01-31T23:12:44.331Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/55/6e1a61ded7af8df04016d81b5b02daa59f2ea9252ee0397cb9f631efe9e5/numpy-2.4.2-cp314-cp314t-win32.whl", hash = "sha256:8c50dd1fc8826f5b26a5ee4d77ca55d88a895f4e4819c7ecc2a9f5905047a443", size = 6153937, upload-time = "2026-01-31T23:12:47.229Z" },
+    { url = "https://files.pythonhosted.org/packages/45/aa/fa6118d1ed6d776b0983f3ceac9b1a5558e80df9365b1c3aa6d42bf9eee4/numpy-2.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:fcf92bee92742edd401ba41135185866f7026c502617f422eb432cfeca4fe236", size = 12631844, upload-time = "2026-01-31T23:12:48.997Z" },
+    { url = "https://files.pythonhosted.org/packages/32/0a/2ec5deea6dcd158f254a7b372fb09cfba5719419c8d66343bab35237b3fb/numpy-2.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:1f92f53998a17265194018d1cc321b2e96e900ca52d54c7c77837b71b9465181", size = 10565379, upload-time = "2026-01-31T23:12:51.345Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f8/50e14d36d915ef64d8f8bc4a087fc8264d82c785eda6711f80ab7e620335/numpy-2.4.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:89f7268c009bc492f506abd6f5265defa7cb3f7487dc21d357c3d290add45082", size = 16833179, upload-time = "2026-01-31T23:12:53.5Z" },
+    { url = "https://files.pythonhosted.org/packages/17/17/809b5cad63812058a8189e91a1e2d55a5a18fd04611dbad244e8aeae465c/numpy-2.4.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6dee3bb76aa4009d5a912180bf5b2de012532998d094acee25d9cb8dee3e44a", size = 14889755, upload-time = "2026-01-31T23:12:55.933Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/ea/181b9bcf7627fc8371720316c24db888dcb9829b1c0270abf3d288b2e29b/numpy-2.4.2-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:cd2bd2bbed13e213d6b55dc1d035a4f91748a7d3edc9480c13898b0353708920", size = 5399500, upload-time = "2026-01-31T23:12:58.671Z" },
+    { url = "https://files.pythonhosted.org/packages/33/9f/413adf3fc955541ff5536b78fcf0754680b3c6d95103230252a2c9408d23/numpy-2.4.2-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:cf28c0c1d4c4bf00f509fa7eb02c58d7caf221b50b467bcb0d9bbf1584d5c821", size = 6714252, upload-time = "2026-01-31T23:13:00.518Z" },
+    { url = "https://files.pythonhosted.org/packages/91/da/643aad274e29ccbdf42ecd94dafe524b81c87bcb56b83872d54827f10543/numpy-2.4.2-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e04ae107ac591763a47398bb45b568fc38f02dbc4aa44c063f67a131f99346cb", size = 15797142, upload-time = "2026-01-31T23:13:02.219Z" },
+    { url = "https://files.pythonhosted.org/packages/66/27/965b8525e9cb5dc16481b30a1b3c21e50c7ebf6e9dbd48d0c4d0d5089c7e/numpy-2.4.2-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:602f65afdef699cda27ec0b9224ae5dc43e328f4c24c689deaf77133dbee74d0", size = 16727979, upload-time = "2026-01-31T23:13:04.62Z" },
+    { url = "https://files.pythonhosted.org/packages/de/e5/b7d20451657664b07986c2f6e3be564433f5dcaf3482d68eaecd79afaf03/numpy-2.4.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:be71bf1edb48ebbbf7f6337b5bfd2f895d1902f6335a5830b20141fc126ffba0", size = 12502577, upload-time = "2026-01-31T23:13:07.08Z" },
 ]
 
 [[package]]
 name = "nv-grouped-gemm"
-version = "1.1.4.post6"
+version = "1.1.4.post8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "absl-py" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/05/79/87c45f32e661b25e0aaa1e325ba166511f57be5dff8f0fcabc12d3e73b64/nv_grouped_gemm-1.1.4.post6.tar.gz", hash = "sha256:dad6115f4b4ff7ceb0bc40ad44e923c13a24fc88cfe1e20b1a6b4c9cf24c445c", size = 26508, upload-time = "2025-10-10T18:52:29.508Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/02/ad/046a097b63a96c1ba1d85f0031dbe7fcbdb33e6c445dfbaba2ffaefdd497/nv_grouped_gemm-1.1.4.post8.tar.gz", hash = "sha256:ab321693f0292cfd8a26dc7b6f14decd9eb00e209494de7218e4fad36191275d", size = 20821209, upload-time = "2025-12-17T02:22:38.432Z" }
 
 [[package]]
 name = "nv-one-logger-core"
@@ -3071,6 +3564,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e5/15/97e6e4ddfe5fc35bcee74a45b7c33fb73abb83713c7dfa26420b971a86c3/nv_one_logger_training_telemetry-2.3.1-py3-none-any.whl", hash = "sha256:5319443829b59378a498c3c62ac98973e14f31be675c229ff2b14e2fe109aa0b", size = 44140, upload-time = "2025-10-29T21:21:40.72Z" },
 ]
 
+[[package]]
+name = "nvdlfw-inspect"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pyyaml" },
+    { name = "torch", marker = "sys_platform == 'never'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8a/86/94188e03e5d4dd7b73c390b0cddcde5618b3799c18e327b2bf15763f6137/nvdlfw_inspect-0.2.2-py3-none-any.whl", hash = "sha256:8a4dc2814c5a4cd19ae304170b9bfa514538ef3c3eb243a45a82404ec3cb279d", size = 30964, upload-time = "2025-12-03T10:52:01.933Z" },
+]
+
 [[package]]
 name = "nvidia-cublas-cu12"
 version = "12.8.4.1"
@@ -3126,21 +3631,24 @@ wheels = [
 
 [[package]]
 name = "nvidia-cudnn-frontend"
-version = "1.16.0"
+version = "1.18.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/cf/3cd3cc682df5488288c6043fc0977090497ff015a082ab160076fecb080a/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83ecbe6d1145dc208a9ae82aa0b45b2c8f74ed8a43d3a102a13eef2117e2fedd", size = 1835542, upload-time = "2025-11-07T01:28:20.133Z" },
-    { url = "https://files.pythonhosted.org/packages/92/45/87f3f2d94a928be21459949b03b0b8bcea13531d30094ad84a8ae4fca761/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77cb06b91877c8489363867434ba1d9936f3e10bf7ed98d82e98f5f578611920", size = 1950339, upload-time = "2025-11-07T01:31:41.69Z" },
-    { url = "https://files.pythonhosted.org/packages/be/f5/1662f18084ef4441bfb3a01383cbf77194905b53474dcb51c0d0f373c74b/nvidia_cudnn_frontend-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:ee3f3886f107919dad48cbc905fa6ae9207c8d7d5a24165e55625ea96f0fe40f", size = 1367883, upload-time = "2025-11-07T01:25:17.791Z" },
-    { url = "https://files.pythonhosted.org/packages/10/b7/d0a3a337f5e83f26ff79a7fd63a859181ff2911f1d905d6fbab5fc80170d/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c360d5840d6eb597aade9e9c8780e24aec283b8e6bc97d52881c821a35c92aa9", size = 1837573, upload-time = "2025-11-07T01:29:05.507Z" },
-    { url = "https://files.pythonhosted.org/packages/95/dc/465a14f2d235778405f2e84fce336d07ab045bf1c7df6404bdf8033e06a8/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c4a8fc573d85a86e08b15d9bf37f729e2487298781867a492a59cde6ac295e2", size = 1952630, upload-time = "2025-11-07T01:32:00.242Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/89/f14435f616603a999975930c4456d6140127f6acb19a877c752beccad837/nvidia_cudnn_frontend-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:a257f10a932ffde9741f644efd3611acf77e2fd89d493d81bc6a8353c48f1ec2", size = 1368775, upload-time = "2025-11-07T01:25:42.252Z" },
-    { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" },
-    { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" },
-    { url = "https://files.pythonhosted.org/packages/32/2c/b4376afef0a6342c56e82e3465c1f8f5c719f588293a50dd04019a22ae6e/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6bcb3a2fbff80538958e21e2227520f082a961164865aaeedaac527f61084f9", size = 1839805, upload-time = "2025-11-07T01:30:31.056Z" },
-    { url = "https://files.pythonhosted.org/packages/71/13/836b90354036154ab82db3861210e5736983fe1fc44bb39c146ad93b333b/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cbdad88b2bec5dde837f8fa7632022334cddb4756f923b5421c06a712cb59d31", size = 1953953, upload-time = "2025-11-07T01:33:03.781Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/30/3025f34f2c86ceef85134dc1f323f8cf2a26d3ffddc5ada48528c80bfae1/nvidia_cudnn_frontend-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:138de2bc4697fabb2eb2f0f601a7e31f8fe97874908e26e33d737276f335473c", size = 1368359, upload-time = "2025-11-07T01:26:51.561Z" },
+    { url = "https://files.pythonhosted.org/packages/86/be/f5a1e633c524c13c0182213ab27dab42dca29a3c785be5ff74d2d185aed1/nvidia_cudnn_frontend-1.18.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:baa6fbc8e7c55f1c78c0374ed9a890e1cf81acaca0c92d6135d18a8e3c985244", size = 2023500, upload-time = "2026-01-27T23:31:34.747Z" },
+    { url = "https://files.pythonhosted.org/packages/82/a7/765a17c6a9496196c34f269d17dfb902b6c618c0261c0962511e95302e81/nvidia_cudnn_frontend-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e4bcca42259e358002c8867e3624a558f66cd5dff2cc6c3aafd860ef2f41730", size = 2154278, upload-time = "2026-01-27T23:06:55.784Z" },
+    { url = "https://files.pythonhosted.org/packages/19/a1/7caae2243540bc60e47eae95f0fd913c9baa05cf94df0471914f70d45158/nvidia_cudnn_frontend-1.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:06252021ef1e5a7256f1e70429a426b01792636c05cc547fe8e64c6885a9652e", size = 1590158, upload-time = "2026-01-27T23:08:26.703Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/9a/83d3d080118de4a7810fa019349edec634b8b37b9cafaacd05719de62dd6/nvidia_cudnn_frontend-1.18.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6d4d0b88d617b233a503c84980b54d840b60b2734497d1a7a071ec5293daec2", size = 2023709, upload-time = "2026-01-27T23:32:10.912Z" },
+    { url = "https://files.pythonhosted.org/packages/13/c7/c3624b3ed77b102618f26295e816b27f1c3ebb1143730237a9f51d403c3f/nvidia_cudnn_frontend-1.18.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:382ea063b92cbfd5b442cb75ff8422932d78276aecf139e46713ed1ad3d07af4", size = 2155568, upload-time = "2026-01-27T23:07:13.277Z" },
+    { url = "https://files.pythonhosted.org/packages/52/dd/8613dfd029d076b86a8a87efe3f4bb4ab73cec15fa8fc27e665098f4d167/nvidia_cudnn_frontend-1.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:baa509effc4d299d3f04e549d4188f88bca8a8b527f483cbd2f66bc18f13a8b1", size = 1591244, upload-time = "2026-01-27T23:08:44.691Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/b4/604e230378680ee117849a4e1045baca092f93161a829291a84d5acce70c/nvidia_cudnn_frontend-1.18.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:310b417f2848a83d1437203fcaeea320a74fb7f28af20bf42bf5afc9c01f1c12", size = 2027408, upload-time = "2026-01-27T23:32:46.576Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/52/08f98262e77b1cbcc834cc1a5db494d0661ea1dbdea58c2e2d51a57fdaca/nvidia_cudnn_frontend-1.18.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c023539ca6de99234cf5102c3ec0d6af817f5396fc93028a22ba5b834a35b8a", size = 2159245, upload-time = "2026-01-27T23:07:32.664Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/1f/751a5a8cfdc95fb4dc556192d37369ae488c30c473fe9a3ec720b23d07ea/nvidia_cudnn_frontend-1.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:e13f7dd46cdb4762dde87f181f06d1c5e15e9478bbdd547bfa74d9b11f415aae", size = 1591041, upload-time = "2026-01-27T23:09:04.118Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/bd/db791a26ebb6a6e1268f518e18c82d8ad18546f7008f4b0d5bde15f927de/nvidia_cudnn_frontend-1.18.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a6e2b7bd43705ffa4af3b187374fdd5e7d09fc228a4d65fc8b4b0a537a8e605", size = 2027249, upload-time = "2026-01-27T23:33:22.46Z" },
+    { url = "https://files.pythonhosted.org/packages/19/74/3038cf496d5de7cfdff730f5202e438c17d9123de507059340e02ddff9d7/nvidia_cudnn_frontend-1.18.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0544206b02cae9da4f044ca3fe7416b99e0c8a8052285dd3e5a8fc445d34f9c", size = 2160001, upload-time = "2026-01-27T23:07:50.248Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/5e/148cc6609dba326e620e4d949246020dfba05ca07d0387442e62b71d19b6/nvidia_cudnn_frontend-1.18.0-cp313-cp313-win_amd64.whl", hash = "sha256:7eefa5f10cc003df5f3593f82f1ee6c001fc3412bdc78430c751914dfceefd7f", size = 1591270, upload-time = "2026-01-27T23:09:21.435Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/0a/515209dd2afc6027bf1112bf415f575bfe9628d18877abe7424cb597dd7b/nvidia_cudnn_frontend-1.18.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b489da1b30f1d7da822b37b89cc4f68afd80e020eb57e4ab24921f8b57f6e946", size = 2028689, upload-time = "2026-02-11T21:32:04.235Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/57/52d18e1f50979eeabfafb408ec73068afc5a1e1ccd21636240317cd456d4/nvidia_cudnn_frontend-1.18.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:37688c81a34ac590aff9de4c34d2968bab949411af707baa327616ebd4b34ae1", size = 2160182, upload-time = "2026-02-11T21:25:18.437Z" },
+    { url = "https://files.pythonhosted.org/packages/67/53/df2810b56d259ef96fa6beaa1381bd14c29fbe82836b409516e864c5e177/nvidia_cudnn_frontend-1.18.0-cp314-cp314-win_amd64.whl", hash = "sha256:5053b473fa74168b5fbf35934cd6187f88aa03b8447b9f2cd417332d5e5c9569", size = 1592759, upload-time = "2026-02-11T21:32:33.87Z" },
 ]
 
 [[package]]
@@ -3215,23 +3723,34 @@ wheels = [
 
 [[package]]
 name = "nvidia-cutlass-dsl"
-version = "4.3.1"
+version = "4.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nvidia-cutlass-dsl-libs-base" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/55/09/42fca58af350265131b6f8665ad5b62526c95e6692788460bd5306d3efe2/nvidia_cutlass_dsl-4.4.0-py3-none-any.whl", hash = "sha256:2d1f34333e4d774002d44b53262d71aaf738700fcf3858290629f9a7b374c61c", size = 10168, upload-time = "2026-02-14T03:38:54.267Z" },
+]
+
+[[package]]
+name = "nvidia-cutlass-dsl-libs-base"
+version = "4.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-python" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/75/c3/3cd4c440f386a24c348c7c67adff5e38bb2405d08579ae3ac9312fa14ee4/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:29d6ccb56955e6528c818591fe752a820305951a73fbb69f9a816b3e228d57f8", size = 58726035, upload-time = "2025-11-28T00:59:03.749Z" },
-    { url = "https://files.pythonhosted.org/packages/35/b5/854b713e2355e6211624dfc9df65aca5ebc2a8aaae97a696def34a4b9c9a/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f54d98339d4fca37d39390933186c4a7987291b57129da9bf45c7746d47786af", size = 58591793, upload-time = "2025-11-28T01:03:01.473Z" },
-    { url = "https://files.pythonhosted.org/packages/45/24/432ab11c9da47742518e008f61c58166b3cced5d39df987155d103d5e18e/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c7b27b3faf2d3cb4e9504ad55129ac58c09aa59f3af6eaabb88f4bda010a2792", size = 58725123, upload-time = "2025-11-28T00:58:11.337Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/07/59509304cac496275a0a7bdae436c267829611b38e4500b2622424c9f737/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:24cfbf55aad55b3dd06ddaa340d13028b4e49b15e0e557105187a9d0bbc260db", size = 58592193, upload-time = "2025-11-28T00:59:54.448Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/c5/f1586c64fcf569b890da776d08a32836a3ef2450cbe9e3ac2971dbecbcce/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:025a8c7a0fb80626e2a893954ea19b2e1ece8d131078c7da12b7fabc2634d04d", size = 58726236, upload-time = "2025-11-28T00:59:29.376Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/5b/fe6a2db1688a690a94f8ad03706fa6db2055d82fab0c4fab764e8c89640f/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b95ce5633e09f12c8d1fcd30c5db06b8325d41b3da0875d3e8a4c110ed5b5cdf", size = 58591826, upload-time = "2025-11-28T01:00:19.559Z" },
-    { url = "https://files.pythonhosted.org/packages/40/fe/5e48c63ff5a510c0edbac5167921a819c70f71daf3b6ead0e0e5346b2a42/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c8e816cc061b34e016906fa87948f2b0fa836a95f27732c14097f3ddda8286e2", size = 58725695, upload-time = "2025-11-28T01:01:32.1Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/ef/34b1bdd375226b818cd810145e207cceb50fd12eaa87e88a6e67820574d4/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f71adcfb56607fc86ea621edcf9503eaa31f66f70efd7ab719c33683db082183", size = 58592065, upload-time = "2025-11-28T01:02:35.83Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/af/cf64251bae66077769adbcd9a2e96b86aeb3c41490c5ee0a939a1a3b511e/nvidia_cutlass_dsl_libs_base-4.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:703169d0843ad7e310b397aa95128e3fa983571a9a488f826c2968f3e71df2b8", size = 75460001, upload-time = "2026-02-14T03:44:18.705Z" },
+    { url = "https://files.pythonhosted.org/packages/87/94/42af69f7de79658d45116a32f5b6c9d5cfc37a37d989f057445c20db9b1e/nvidia_cutlass_dsl_libs_base-4.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:264fc34a096bd144ebb8ff0f1fcd5eeeaa9d30528cfd801141a9f7856a58b95a", size = 74345534, upload-time = "2026-02-14T03:47:04.545Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/08/1b1481b382f0bfddb91fe19c425dae7ffcb0dacb19a60d4fa490f19cabdf/nvidia_cutlass_dsl_libs_base-4.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:18249a0c13a7b7fe08fbf600ce38a871538067cfe7b20ef2bc131a5902a67377", size = 75457259, upload-time = "2026-02-14T03:44:48.408Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/2f/4d525af7805a7cf04f25efd9900d9acca1d6a8973f436b6058dfec5b545f/nvidia_cutlass_dsl_libs_base-4.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c09ee076f2b61ba26523686f550a2c642a35ec178861a5e0a38f2979ad515604", size = 74345003, upload-time = "2026-02-14T03:46:37.751Z" },
+    { url = "https://files.pythonhosted.org/packages/33/34/63a1dce4d65cd6fd29b9d50286abbfcdd965c3ca2156cf423eda2ab1fc5d/nvidia_cutlass_dsl_libs_base-4.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9cde72efb065d9bea29a92ca85835eaedec20bf89af22798d2d2a551ccd51731", size = 75458501, upload-time = "2026-02-14T03:45:15.866Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/ae/5bbd3c9d7909d64a7f139b480c70ff3220554f64775e941c95438265ef1f/nvidia_cutlass_dsl_libs_base-4.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e31a2fcc9854417242ee072c9b8fd1257d5ee422166dfd85eb3f8784fee34dd8", size = 74345995, upload-time = "2026-02-14T03:45:42.9Z" },
+    { url = "https://files.pythonhosted.org/packages/48/5c/c76ec134e0fbd4ee2f31b32e1fbcb727e7f6323d136a3fc7a8ea3aa3e75d/nvidia_cutlass_dsl_libs_base-4.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ad63fe382b36f69f2a9b51d35e95cbcb240565d06a990e5a19a8eacae49c8b94", size = 75456473, upload-time = "2026-02-14T03:43:51.005Z" },
+    { url = "https://files.pythonhosted.org/packages/32/22/65c0abbc8518d3f80b5d8adbd8cec640f16f8c0620b01cfbecbfd14d6899/nvidia_cutlass_dsl_libs_base-4.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:b0eb94678159f750db6bf214d79e0b815e9b5a53fad3925fda53e1591cbdeb0d", size = 74345762, upload-time = "2026-02-14T03:46:09.745Z" },
 ]
 
 [[package]]
@@ -3244,21 +3763,21 @@ wheels = [
 
 [[package]]
 name = "nvidia-ml-py"
-version = "13.580.82"
+version = "13.590.48"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/dd/6c/4a533f2c0185027c465adb6063086bc3728301e95f483665bfa9ebafb2d3/nvidia_ml_py-13.580.82.tar.gz", hash = "sha256:0c028805dc53a0e2a6985ea801888197765ac2ef8f1c9e29a7bf0d3616a5efc7", size = 47999, upload-time = "2025-09-11T16:44:56.267Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/af/a0/f4fc18cf72f06821a9a665085435b901449986855519d5b3843532db35e9/nvidia_ml_py-13.590.48.tar.gz", hash = "sha256:8184d1be52914ac7f0991cd1c0d946c65dc88a840c754cd12c274b77b88760dd", size = 49732, upload-time = "2026-01-22T01:14:56.456Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7f/96/d6d25a4c307d6645f4a9b91d620c0151c544ad38b5e371313a87d2761004/nvidia_ml_py-13.580.82-py3-none-any.whl", hash = "sha256:4361db337b0c551e2d101936dae2e9a60f957af26818e8c0c3a1f32b8db8d0a7", size = 49008, upload-time = "2025-09-11T16:44:54.915Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/72/fb2af0d259a651affdce65fd6a495f0e07a685a0136baf585c5065204ee7/nvidia_ml_py-13.590.48-py3-none-any.whl", hash = "sha256:fd43d30ee9cd0b7940f5f9f9220b68d42722975e3992b6c21d14144c48760e43", size = 50680, upload-time = "2026-01-22T01:14:55.281Z" },
 ]
 
 [[package]]
 name = "nvidia-modelopt"
-version = "0.39.0"
+version = "0.41.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ninja" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-ml-py" },
     { name = "packaging" },
     { name = "pulp" },
@@ -3267,13 +3786,12 @@ dependencies = [
     { name = "rich" },
     { name = "safetensors" },
     { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
-    { name = "torchprofile" },
     { name = "tqdm" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b0/d5/b03ad3ffa28984b629a72da678fa98f912fc45bac3b514c4a70cf2a82fe3/nvidia_modelopt-0.39.0-py3-none-any.whl", hash = "sha256:32f05317c81be1ff2ffeab749e5258b7bea8e4c6e60a09c760584f25ad03f648", size = 864981, upload-time = "2025-11-13T07:35:42.761Z" },
+    { url = "https://files.pythonhosted.org/packages/16/09/30147ab0d0409d3492f1d37469fe0586c82aeec6eec9a907f59d24094516/nvidia_modelopt-0.41.0-py3-none-any.whl", hash = "sha256:ffa5f903d22653649318831a470550ae55ee04716c068d5ade61c3176fdc1d7d", size = 934582, upload-time = "2026-01-20T17:21:28.494Z" },
 ]
 
 [[package]]
@@ -3297,11 +3815,11 @@ wheels = [
 
 [[package]]
 name = "nvidia-nvshmem-cu12"
-version = "3.3.20"
+version = "3.4.5"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/92/9d/3dd98852568fb845ec1f7902c90a22b240fe1cbabda411ccedf2fd737b7b/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0", size = 124484616, upload-time = "2025-08-04T20:24:59.172Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" },
 ]
 
 [[package]]
@@ -3344,7 +3862,8 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydata-sphinx-theme" },
     { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8c/79/017fab2f7167a9a9795665f894d04f77aafceca80821b51589bb4b23ff5c/nvidia_sphinx_theme-0.0.9.post1-py3-none-any.whl", hash = "sha256:21ca60206dff2f380d7783d64bbaf71a5b9cacae53c7d0686f089c16b5a3d45a", size = 143816, upload-time = "2025-11-09T23:16:55.719Z" },
@@ -3354,6 +3873,7 @@ wheels = [
 name = "nvtx"
 version = "0.2.14"
 source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0e/03/b8a4391523a92163167fd0fee6769c223e8612043cb07aebc1173ca83fc9/nvtx-0.2.14.tar.gz", hash = "sha256:12945242a31bde70b1f15cae867f8706bdff290e2f808a11738e03ebefdf847f", size = 119864, upload-time = "2025-12-01T18:06:16.674Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ed/ca/fa76ea4985fd8f3d8c437bffec2580b1cac7f2401671089ac842610ae466/nvtx-0.2.14-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b70b2415ab97edf19514be226d5058898922c6b6bb1d7fdd5ef92d1e086f3e0f", size = 695204, upload-time = "2025-11-27T17:28:52.688Z" },
     { url = "https://files.pythonhosted.org/packages/b9/1f/0aa62d52062d700dbed36dd2ebfddf5133c72180d448cce66545e5ccbe5d/nvtx-0.2.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23ab874f9c70e5433f39e40ca318ffcfc14fb43ed6798e6be5a30f74e4ca831f", size = 686698, upload-time = "2025-11-27T17:23:19.335Z" },
@@ -3367,6 +3887,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/04/a3/603ecdfd5cd97feee59c7e51da4929e22eac8dbe68ac78df53e74152813f/nvtx-0.2.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8cd1f2b464675b4d3c2036b7bbaf975baa9307f0795107dc69c556c0c8d191d", size = 710057, upload-time = "2025-11-27T17:28:08.127Z" },
     { url = "https://files.pythonhosted.org/packages/97/29/945dd440e6bd459e6064f321ed425dbae7d03d39ffa97a38e5434fbcda27/nvtx-0.2.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6532556d81f782e24eb12c5e0c75e297493d6ab0431177c93c12bb29c523ea9e", size = 717825, upload-time = "2025-11-27T17:22:57.556Z" },
     { url = "https://files.pythonhosted.org/packages/16/3e/5d7872f2a0809237e3d524f81a7a3c7fbeb98bdc9dcec4723b75a45cd552/nvtx-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:cd86f78ed56aede301b03e5ab8cb1aaeb8ba0b5ed683f98f87fbe474996d73f2", size = 118546, upload-time = "2025-11-27T17:30:32.549Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/04/1c8b1ce8b729a96218c1d9c0d399ea556765ab2199311ca9e1693507834d/nvtx-0.2.14-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51d48a98db0c3f4b701d3422ef34bf34c0c9256036d036dd115d48c6286b7b82", size = 791447, upload-time = "2025-11-28T22:52:07.744Z" },
+    { url = "https://files.pythonhosted.org/packages/72/a8/608bfa862de1673e63386b0e32520a05ed968524c22babe273565a1c9027/nvtx-0.2.14-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:638f66b6119fb3adfe3f5e2ba2d0cca9580bc4a898cd702b639d727a4a405c59", size = 742277, upload-time = "2025-11-28T22:56:48.341Z" },
+    { url = "https://files.pythonhosted.org/packages/32/bb/579545bb24e4d1d643e42c9e323d32fcf327522027346686c12595f15ed9/nvtx-0.2.14-cp313-cp313t-win_amd64.whl", hash = "sha256:d5dfaf02a91fd2a123e104d59681dc768c07b66b05e4afc4c05ee125e45f6261", size = 131705, upload-time = "2025-11-28T22:57:30.24Z" },
+    { url = "https://files.pythonhosted.org/packages/07/60/9b4ed6dd0153b17817d3344f444bed731d284907c99a4fcc0910a594b114/nvtx-0.2.14-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12c21b01b426e85054606d5d8e78d08ab804f1231d4f24be6ded595f901b1125", size = 740863, upload-time = "2025-11-28T22:53:34.642Z" },
+    { url = "https://files.pythonhosted.org/packages/93/e5/c4095778d690c8eac535048c44f4aff61e77ad0573b324655e3c8d4b7b86/nvtx-0.2.14-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:296aa978e572d2854a980506cb9de6fe641d496b46879b56c6e6df7467056303", size = 747776, upload-time = "2025-11-28T22:55:54.441Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/3f/05150e9953b6e818b2c103ff881a43c99063cf06f7e9b474f94f79674fcc/nvtx-0.2.14-cp314-cp314-win_amd64.whl", hash = "sha256:e265cce4d7ecfb56b9e974be736bba308be47402edfc09dd6a5f91a8eafa90c3", size = 120583, upload-time = "2025-11-28T22:58:37.289Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/4c/b607cb591d4600ff1771e64563cf6b395024ffad0b13fe09aa10f7b8d786/nvtx-0.2.14-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fc5522766fff59cf62e42c31324b1c405d308d7755e847e25d286f29e217f54a", size = 794398, upload-time = "2025-11-28T22:51:16.927Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/af/9d67e2995673e25711ee79bcc52a552926c074943fc59b42fa56996ad50f/nvtx-0.2.14-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:671b592464038054cc31a5d8c53a460d22fc38b066bbd055e086be8dd49fa43b", size = 746054, upload-time = "2025-11-28T22:55:33.65Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/10/143ce5b3e07921176fc2b6f808afde7335f06e93af1a29ac6f4cfa02cf4b/nvtx-0.2.14-cp314-cp314t-win_amd64.whl", hash = "sha256:2567ce29e905062c239a33ba91a46ca7307561c40fd7b37ec64c00cd78f9bdab", size = 138050, upload-time = "2025-11-28T22:57:09.773Z" },
 ]
 
 [[package]]
@@ -3384,94 +3913,379 @@ wheels = [
 
 [[package]]
 name = "onnx"
-version = "1.19.1"
+version = "1.20.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ml-dtypes" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "protobuf" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/27/2f/c619eb65769357e9b6de9212c9a821ab39cd484448e5d6b3fb5fb0a64c6d/onnx-1.19.1.tar.gz", hash = "sha256:737524d6eb3907d3499ea459c6f01c5a96278bb3a0f2ff8ae04786fb5d7f1ed5", size = 12033525, upload-time = "2025-10-10T04:01:34.342Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/5b/f3/892eea0206ed13a986239bd508c82b974387ef1b0ffd83ece0ce0725aaf6/onnx-1.19.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:7343250cc5276cf439fe623b8f92e11cf0d1eebc733ae4a8b2e86903bb72ae68", size = 18319433, upload-time = "2025-10-10T03:59:47.236Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/f3/c7ea4a1dfda9b9ddeff914a601ffaf5ed151b3352529f223eae74c03c8d1/onnx-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1fb8f79de7f3920bb82b537f3c6ac70c0ce59f600471d9c3eed2b5f8b079b748", size = 18043327, upload-time = "2025-10-10T03:59:50.854Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/eb/30159bb6a108b03f2b7521410369a5bd8d296be3fbf0b30ab7acd9ef42ad/onnx-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92b9d2dece41cc84213dbbfd1acbc2a28c27108c53bd28ddb6d1043fbfcbd2d5", size = 18216877, upload-time = "2025-10-10T03:59:54.512Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/86/dc034e5a723a20ca45aa8dd76dda53c358a5f955908e1436f42c21bdfb3a/onnx-1.19.1-cp310-cp310-win32.whl", hash = "sha256:c0b1a2b6bb19a0fc9f5de7661a547136d082c03c169a5215e18ff3ececd2a82f", size = 16344116, upload-time = "2025-10-10T03:59:57.991Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/60/537f2c19050f71445ee00ed91e78a396b6189dd1fce61b29ac6a0d651c7e/onnx-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:1c0498c00db05fcdb3426697d330dcecc3f60020015065e2c76fa795f2c9a605", size = 16462819, upload-time = "2025-10-10T04:00:01.157Z" },
-    { url = "https://files.pythonhosted.org/packages/36/07/0019c72924909e4f64b9199770630ab7b8d7914b912b03230e68f5eda7ae/onnx-1.19.1-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:17aaf5832126de0a5197a5864e4f09a764dd7681d3035135547959b4b6b77a09", size = 18320936, upload-time = "2025-10-10T04:00:04.235Z" },
-    { url = "https://files.pythonhosted.org/packages/af/2f/5c47acf740dc35f0decc640844260fbbdc0efa0565657c93fd7ff30f13f3/onnx-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01b292a4d0b197c45d8184545bbc8ae1df83466341b604187c1b05902cb9c920", size = 18044269, upload-time = "2025-10-10T04:00:07.449Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/61/6c457ee8c3a62a3cad0a4bfa4c5436bb3ac4df90c3551d40bee1224b5b51/onnx-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1839af08ab4a909e4af936b8149c27f8c64b96138981024e251906e0539d8bf9", size = 18218092, upload-time = "2025-10-10T04:00:11.135Z" },
-    { url = "https://files.pythonhosted.org/packages/54/d5/ab832e1369505e67926a70e9a102061f89ad01f91aa296c4b1277cb81b25/onnx-1.19.1-cp311-cp311-win32.whl", hash = "sha256:0bdbb676e3722bd32f9227c465d552689f49086f986a696419d865cb4e70b989", size = 16344809, upload-time = "2025-10-10T04:00:14.634Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/b5/6eb4611d24b85002f878ba8476b4cecbe6f9784c0236a3c5eff85236cc0a/onnx-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:1346853df5c1e3ebedb2e794cf2a51e0f33759affd655524864ccbcddad7035b", size = 16464319, upload-time = "2025-10-10T04:00:18.235Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/ff/f0e1f06420c70e20d497fec7c94a864d069943b6312bedd4224c0ab946f8/onnx-1.19.1-cp311-cp311-win_arm64.whl", hash = "sha256:2d69c280c0e665b7f923f499243b9bb84fe97970b7a4668afa0032045de602c8", size = 16437503, upload-time = "2025-10-10T04:00:21.247Z" },
-    { url = "https://files.pythonhosted.org/packages/50/07/f6c5b2cffef8c29e739616d1415aea22f7b7ef1f19c17f02b7cff71f5498/onnx-1.19.1-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:3612193a89ddbce5c4e86150869b9258780a82fb8c4ca197723a4460178a6ce9", size = 18327840, upload-time = "2025-10-10T04:00:24.259Z" },
-    { url = "https://files.pythonhosted.org/packages/93/20/0568ebd52730287ae80cac8ac893a7301c793ea1630984e2519ee92b02a9/onnx-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6c2fd2f744e7a3880ad0c262efa2edf6d965d0bd02b8f327ec516ad4cb0f2f15", size = 18042539, upload-time = "2025-10-10T04:00:27.693Z" },
-    { url = "https://files.pythonhosted.org/packages/14/fd/cd7a0fd10a04f8cc5ae436b63e0022e236fe51b9dbb8ee6317fd48568c72/onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:485d3674d50d789e0ee72fa6f6e174ab81cb14c772d594f992141bd744729d8a", size = 18218271, upload-time = "2025-10-10T04:00:30.495Z" },
-    { url = "https://files.pythonhosted.org/packages/65/68/cc8b8c05469fe08384b446304ad7e6256131ca0463bf6962366eebec98c0/onnx-1.19.1-cp312-cp312-win32.whl", hash = "sha256:638bc56ff1a5718f7441e887aeb4e450f37a81c6eac482040381b140bd9ba601", size = 16345111, upload-time = "2025-10-10T04:00:34.982Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/5e/d1cb16693598a512c2cf9ffe0841d8d8fd2c83ae8e889efd554f5aa427cf/onnx-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:bc7e2e4e163e679721e547958b5a7db875bf822cad371b7c1304aa4401a7c7a4", size = 16465621, upload-time = "2025-10-10T04:00:39.107Z" },
-    { url = "https://files.pythonhosted.org/packages/90/32/da116cc61fdef334782aa7f87a1738431dd1af1a5d1a44bd95d6d51ad260/onnx-1.19.1-cp312-cp312-win_arm64.whl", hash = "sha256:17c215b1c0f20fe93b4cbe62668247c1d2294b9bc7f6be0ca9ced28e980c07b7", size = 16437505, upload-time = "2025-10-10T04:00:42.255Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/b8/ab1fdfe2e8502f4dc4289fc893db35816bd20d080d8370f86e74dda5f598/onnx-1.19.1-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:4e5f938c68c4dffd3e19e4fd76eb98d298174eb5ebc09319cdd0ec5fe50050dc", size = 18327815, upload-time = "2025-10-10T04:00:45.682Z" },
-    { url = "https://files.pythonhosted.org/packages/04/40/eb875745a4b92aea10e5e32aa2830f409c4d7b6f7b48ca1c4eaad96636c5/onnx-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:86e20a5984b017feeef2dbf4ceff1c7c161ab9423254968dd77d3696c38691d0", size = 18041464, upload-time = "2025-10-10T04:00:48.557Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/8e/8586135f40dbe4989cec4d413164bc8fc5c73d37c566f33f5ea3a7f2b6f6/onnx-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d9c467f0f29993c12f330736af87972f30adb8329b515f39d63a0db929cb2c", size = 18218244, upload-time = "2025-10-10T04:00:51.891Z" },
-    { url = "https://files.pythonhosted.org/packages/51/b5/4201254b8683129db5da3fb55aa1f7e56d0a8d45c66ce875dec21ca1ff25/onnx-1.19.1-cp313-cp313-win32.whl", hash = "sha256:65eee353a51b4e4ca3e797784661e5376e2b209f17557e04921eac9166a8752e", size = 16345330, upload-time = "2025-10-10T04:00:54.858Z" },
-    { url = "https://files.pythonhosted.org/packages/69/67/c6d239afbcdbeb6805432969b908b5c9f700c96d332b34e3f99518d76caf/onnx-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:c3bc87e38b53554b1fc9ef7b275c81c6f5c93c90a91935bb0aa8d4d498a6d48e", size = 16465567, upload-time = "2025-10-10T04:00:57.893Z" },
-    { url = "https://files.pythonhosted.org/packages/99/fe/89f1e40f5bc54595ff0dcf5391ce19e578b528973ccc74dd99800196d30d/onnx-1.19.1-cp313-cp313-win_arm64.whl", hash = "sha256:e41496f400afb980ec643d80d5164753a88a85234fa5c06afdeebc8b7d1ec252", size = 16437562, upload-time = "2025-10-10T04:01:00.703Z" },
-    { url = "https://files.pythonhosted.org/packages/86/43/b186ccbc8fe7e93643a6a6d40bbf2bb6ce4fb9469bbd3453c77e270c50ad/onnx-1.19.1-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:5f6274abf0fd74e80e78ecbb44bd44509409634525c89a9b38276c8af47dc0a2", size = 18355703, upload-time = "2025-10-10T04:01:03.735Z" },
-    { url = "https://files.pythonhosted.org/packages/60/f1/22ee4d8b8f9fa4cb1d1b9579da3b4b5187ddab33846ec5ac744af02c0e2b/onnx-1.19.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:07dcd4d83584eb4bf8f21ac04c82643712e5e93ac2a0ed10121ec123cb127e1e", size = 18047830, upload-time = "2025-10-10T04:01:06.552Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/a4/8f3d51e3a095d42cdf2039a590cff06d024f2a10efbd0b1a2a6b3825f019/onnx-1.19.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1975860c3e720db25d37f1619976582828264bdcc64fa7511c321ac4fc01add3", size = 18221126, upload-time = "2025-10-10T04:01:09.77Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/0d/f9d6c2237083f1aac14b37f0b03b0d81f1147a8e2af0c3828165e0a6a67b/onnx-1.19.1-cp313-cp313t-win_amd64.whl", hash = "sha256:9807d0e181f6070ee3a6276166acdc571575d1bd522fc7e89dba16fd6e7ffed9", size = 16465560, upload-time = "2025-10-10T04:01:13.212Z" },
-    { url = "https://files.pythonhosted.org/packages/36/70/8418a58faa7d606d6a92cab69ae8d361b3b3969bf7e7e9a65a86d5d1b674/onnx-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6ee83e6929d75005482d9f304c502ac7c9b8d6db153aa6b484dae74d0f28570", size = 18042812, upload-time = "2025-10-10T04:01:15.919Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/3b/8a/335c03a8683a88a32f9a6bb98899ea6df241a41df64b37b9696772414794/onnx-1.20.1.tar.gz", hash = "sha256:ded16de1df563d51fbc1ad885f2a426f814039d8b5f4feb77febe09c0295ad67", size = 12048980, upload-time = "2026-01-10T01:40:03.043Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/79/cc/4ba3c80cfaffdb541dc5a23eaccb045a627361e94ecaeba30496270f15b3/onnx-1.20.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:3fe243e83ad737637af6512708454e720d4b0864def2b28e6b0ee587b80a50be", size = 17904206, upload-time = "2026-01-10T01:38:58.574Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/fc/3a1c4ae2cd5cfab2d0ebc1842769b04b417fe13946144a7c8ce470dd9c85/onnx-1.20.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e24e96b48f27e4d6b44cb0b195b367a2665da2d819621eec51903d575fc49d38", size = 17414849, upload-time = "2026-01-10T01:39:01.494Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/ab/5017945291b981f2681fb620f2d5b6070e02170c648770711ef1eac79d56/onnx-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0903e6088ed5e8f59ebd381ab2a6e9b2a60b4c898f79aa2fe76bb79cf38a5031", size = 17513600, upload-time = "2026-01-10T01:39:04.348Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/b0/063e79dc365972af876d786bacc6acd8909691af2b9296615ff74ad182f3/onnx-1.20.1-cp310-cp310-win32.whl", hash = "sha256:17483e59082b2ca6cadd2b48fd8dce937e5b2c985ed5583fefc38af928be1826", size = 16239159, upload-time = "2026-01-10T01:39:07.254Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/73/a992271eb3683e676239d71b5a78ad3cf4d06d2223c387e701bf305da199/onnx-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:e2b0cf797faedfd3b83491dc168ab5f1542511448c65ceb482f20f04420cbf3a", size = 16391718, upload-time = "2026-01-10T01:39:09.96Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/38/1a0e74d586c08833404100f5c052f92732fb5be417c0b2d7cb0838443bfe/onnx-1.20.1-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:53426e1b458641e7a537e9f176330012ff59d90206cac1c1a9d03cdd73ed3095", size = 17904965, upload-time = "2026-01-10T01:39:13.532Z" },
+    { url = "https://files.pythonhosted.org/packages/96/25/64b076e9684d17335f80b15b3bf502f7a8e1a89f08a6b208d4f2861b3011/onnx-1.20.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca7281f8c576adf396c338cf43fff26faee8d4d2e2577b8e73738f37ceccf945", size = 17415179, upload-time = "2026-01-10T01:39:16.516Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/d5/6743b409421ced20ad5af1b3a7b4c4e568689ffaca86db431692fca409a6/onnx-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2297f428c51c7fc6d8fad0cf34384284dfeff3f86799f8e83ef905451348ade0", size = 17513672, upload-time = "2026-01-10T01:39:19.35Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/6b/dae82e6fdb2043302f29adca37522312ea2be55b75907b59be06fbdffe87/onnx-1.20.1-cp311-cp311-win32.whl", hash = "sha256:63d9cbcab8c96841eadeb7c930e07bfab4dde8081eb76fb68e0dfb222706b81e", size = 16239336, upload-time = "2026-01-10T01:39:22.506Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/17/a0d7863390c1f2067d7c02dcc1477034965c32aaa1407bfcf775305ffee4/onnx-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:d78cde72d7ca8356a2d99c5dc0dbf67264254828cae2c5780184486c0cd7b3bf", size = 16392120, upload-time = "2026-01-10T01:39:25.106Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/72/9b879a46eb7a3322223791f36bf9c25d95da9ed93779eabb75a560f22e5b/onnx-1.20.1-cp311-cp311-win_arm64.whl", hash = "sha256:0104bb2d4394c179bcea3df7599a45a2932b80f4633840896fcf0d7d8daecea2", size = 16346923, upload-time = "2026-01-10T01:39:27.782Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/4c/4b17e82f91ab9aa07ff595771e935ca73547b035030dc5f5a76e63fbfea9/onnx-1.20.1-cp312-abi3-macosx_12_0_universal2.whl", hash = "sha256:1d923bb4f0ce1b24c6859222a7e6b2f123e7bfe7623683662805f2e7b9e95af2", size = 17903547, upload-time = "2026-01-10T01:39:31.015Z" },
+    { url = "https://files.pythonhosted.org/packages/64/5e/1bfa100a9cb3f2d3d5f2f05f52f7e60323b0e20bb0abace1ae64dbc88f25/onnx-1.20.1-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ddc0b7d8b5a94627dc86c533d5e415af94cbfd103019a582669dad1f56d30281", size = 17412021, upload-time = "2026-01-10T01:39:33.885Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/71/d3fec0dcf9a7a99e7368112d9c765154e81da70fcba1e3121131a45c245b/onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9336b6b8e6efcf5c490a845f6afd7e041c89a56199aeda384ed7d58fb953b080", size = 17510450, upload-time = "2026-01-10T01:39:36.589Z" },
+    { url = "https://files.pythonhosted.org/packages/74/a7/edce1403e05a46e59b502fae8e3350ceeac5841f8e8f1561e98562ed9b09/onnx-1.20.1-cp312-abi3-win32.whl", hash = "sha256:564c35a94811979808ab5800d9eb4f3f32c12daedba7e33ed0845f7c61ef2431", size = 16238216, upload-time = "2026-01-10T01:39:39.46Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/c7/8690c81200ae652ac550c1df52f89d7795e6cc941f3cb38c9ef821419e80/onnx-1.20.1-cp312-abi3-win_amd64.whl", hash = "sha256:9fe7f9a633979d50984b94bda8ceb7807403f59a341d09d19342dc544d0ca1d5", size = 16389207, upload-time = "2026-01-10T01:39:41.955Z" },
+    { url = "https://files.pythonhosted.org/packages/01/a0/4fb0e6d36eaf079af366b2c1f68bafe92df6db963e2295da84388af64abc/onnx-1.20.1-cp312-abi3-win_arm64.whl", hash = "sha256:21d747348b1c8207406fa2f3e12b82f53e0d5bb3958bcd0288bd27d3cb6ebb00", size = 16344155, upload-time = "2026-01-10T01:39:45.536Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/bb/715fad292b255664f0e603f1b2ef7bf2b386281775f37406beb99fa05957/onnx-1.20.1-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:29197b768f5acdd1568ddeb0a376407a2817844f6ac1ef8c8dd2d974c9ab27c3", size = 17912296, upload-time = "2026-01-10T01:39:48.21Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/c3/541af12c3d45e159a94ee701100ba9e94b7bd8b7a8ac5ca6838569f894f8/onnx-1.20.1-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f0371aa67f51917a09cc829ada0f9a79a58f833449e03d748f7f7f53787c43c", size = 17416925, upload-time = "2026-01-10T01:39:50.82Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/3b/d5660a7d2ddf14f531ca66d409239f543bb290277c3f14f4b4b78e32efa3/onnx-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be1e5522200b203b34327b2cf132ddec20ab063469476e1f5b02bb7bd259a489", size = 17515602, upload-time = "2026-01-10T01:39:54.132Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/b4/47225ab2a92562eff87ba9a1a028e3535d659a7157d7cde659003998b8e3/onnx-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:15c815313bbc4b2fdc7e4daeb6e26b6012012adc4d850f4e3b09ed327a7ea92a", size = 16395729, upload-time = "2026-01-10T01:39:57.577Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/7d/1bbe626ff6b192c844d3ad34356840cc60fca02e2dea0db95e01645758b1/onnx-1.20.1-cp313-cp313t-win_arm64.whl", hash = "sha256:eb335d7bcf9abac82a0d6a0fda0363531ae0b22cfd0fc6304bff32ee29905def", size = 16348968, upload-time = "2026-01-10T01:40:00.491Z" },
 ]
 
 [[package]]
 name = "onnx-ir"
-version = "0.1.12"
+version = "0.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ml-dtypes" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnx" },
+    { name = "sympy" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/6c/1a/2a94112a39d01a9d1490f5ef3c205d8a17fe1ca27f307b026c40d62d8e9f/onnx_ir-0.1.12.tar.gz", hash = "sha256:742e0bff875d0547724187560b3f441833191c8aa939c05f14176f4892784deb", size = 112699, upload-time = "2025-10-28T23:43:54.129Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/a5/acc43c8fa6edbc584d127fb6bbd13ae9ebfc01b9675c74e0da2de15fa4a6/onnx_ir-0.2.0.tar.gz", hash = "sha256:8bad3906691987290789b26d05e0dbff467029a0b1e411e12e4cae02e43503e4", size = 141693, upload-time = "2026-02-24T02:31:10.998Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c8/36/c4df116f5dcaa82ec7944e5d25624a3811f6603fd190660b0b079ea759fb/onnx_ir-0.1.12-py3-none-any.whl", hash = "sha256:17f86faf8a53b979430bde1bc6022c7a162b0d1534550ddb17a1d37eb993e765", size = 129277, upload-time = "2025-10-28T23:43:52.493Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/df/a99736bcca6b16e36c687ce4996abcf4ce73c514fddd9e730cfcb6a334f2/onnx_ir-0.2.0-py3-none-any.whl", hash = "sha256:eb14d1399c2442bd1ff702719e70074e9cedfa3af5729416a32752c9e0f82591", size = 164100, upload-time = "2026-02-24T02:31:09.454Z" },
 ]
 
 [[package]]
 name = "onnxscript"
-version = "0.5.6"
+version = "0.6.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ml-dtypes" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnx" },
     { name = "onnx-ir" },
     { name = "packaging" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fb/4b/eed2199327bbf12c3443d7835893e3c4c23b1c1a4aa13efe0f7fbe0a6bf9/onnxscript-0.5.6.tar.gz", hash = "sha256:cc3338b2976daffd2af0bb6ac4866a4dca76aefface1666a0d7bc65ad9850822", size = 587017, upload-time = "2025-10-31T03:50:38.656Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/2b/538fdeb0e25bed5d7e0f954af5710543e2629499fb74381afc3333f8a8ae/onnxscript-0.6.2.tar.gz", hash = "sha256:abb2e6f464db40c9b8c7fbb3e64cca04cf3f4495e67c4eda5eac17b784191ce3", size = 590865, upload-time = "2026-02-10T22:53:39.638Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/66/56/e6b179397497ab93266b6eb00743403a6a699a29063a423c4a14595d3db9/onnxscript-0.6.2-py3-none-any.whl", hash = "sha256:20e3c3fd1da19b3655549d5455a2df719db47374fe430e01e865ae69127c37b9", size = 689064, upload-time = "2026-02-10T22:53:41.663Z" },
+]
+
+[[package]]
+name = "opencensus"
+version = "0.11.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-api-core" },
+    { name = "opencensus-context" },
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/15/a7/a46dcffa1b63084f9f17fe3c8cb20724c4c8f91009fd0b2cfdb27d5d2b35/opencensus-0.11.4.tar.gz", hash = "sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2", size = 64966, upload-time = "2024-01-03T18:04:07.085Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl", hash = "sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864", size = 128225, upload-time = "2024-01-03T18:04:05.127Z" },
+]
+
+[[package]]
+name = "opencensus-context"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4c/96/3b6f638f6275a8abbd45e582448723bffa29c1fb426721dedb5c72f7d056/opencensus-context-0.1.3.tar.gz", hash = "sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c", size = 4066, upload-time = "2022-08-03T22:20:22.359Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fd/1e/a5462bfe28a2add00dc0abec7dd9b742ac3207b73e5c97bde9747b503971/onnxscript-0.5.6-py3-none-any.whl", hash = "sha256:b0c3355fea3eecab8ca291da8b77afddcaacd3ada5ee59294390a049ea123938", size = 683045, upload-time = "2025-10-31T03:50:41.15Z" },
+    { url = "https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl", hash = "sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039", size = 5060, upload-time = "2022-08-03T22:20:20.352Z" },
 ]
 
 [[package]]
 name = "opentelemetry-api"
 version = "1.33.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
 dependencies = [
-    { name = "deprecated" },
-    { name = "importlib-metadata" },
+    { name = "deprecated", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "importlib-metadata", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9a/8d/1f5a45fbcb9a7d87809d460f09dc3399e3fbd31d7f3e14888345e9d29951/opentelemetry_api-1.33.1.tar.gz", hash = "sha256:1c6055fc0a2d3f23a50c7e17e16ef75ad489345fd3df1f8b8af7c0bbf8a109e8", size = 65002, upload-time = "2025-05-16T18:52:41.146Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/05/44/4c45a34def3506122ae61ad684139f0bbc4e00c39555d4f7e20e0e001c8a/opentelemetry_api-1.33.1-py3-none-any.whl", hash = "sha256:4db83ebcf7ea93e64637ec6ee6fabee45c5cbe4abd9cf3da95c43828ddb50b83", size = 65771, upload-time = "2025-05-16T18:52:17.419Z" },
 ]
 
+[[package]]
+name = "opentelemetry-api"
+version = "1.39.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "importlib-metadata", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/97/b9/3161be15bb8e3ad01be8be5a968a9237c3027c5be504362ff800fca3e442/opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c", size = 65767, upload-time = "2025-12-11T13:32:39.182Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cf/df/d3f1ddf4bb4cb50ed9b1139cc7b1c54c34a1e7ce8fd1b9a37c0d1551a6bd/opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950", size = 66356, upload-time = "2025-12-11T13:32:17.304Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-prometheus"
+version = "0.54b1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "opentelemetry-sdk", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "prometheus-client", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/da/ef/563c6413dbf042f1b738c4e23f7ff5f80fd0ae5ba64037433a5eeb0a1f79/opentelemetry_exporter_prometheus-0.54b1.tar.gz", hash = "sha256:6a28fde40ac8693bd653b84ba9deff75721fd05edf4e4313939327ea336ad3a9", size = 14948, upload-time = "2025-05-16T18:52:46.152Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/4e/f351f6fe640c270158cf2521978978eb8a8fb3113fdaa5e98eedc22e8126/opentelemetry_exporter_prometheus-0.54b1-py3-none-any.whl", hash = "sha256:78052c9818140021b8b3738f653f8bf4088a34bf970144c6816b5f561c0178dc", size = 12950, upload-time = "2025-05-16T18:52:26.51Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-prometheus"
+version = "0.60b1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "opentelemetry-api", version = "1.39.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "opentelemetry-sdk", version = "1.39.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "prometheus-client", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/14/39/7dafa6fff210737267bed35a8855b6ac7399b9e582b8cf1f25f842517012/opentelemetry_exporter_prometheus-0.60b1.tar.gz", hash = "sha256:a4011b46906323f71724649d301b4dc188aaa068852e814f4df38cc76eac616b", size = 14976, upload-time = "2025-12-11T13:32:42.944Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9b/0d/4be6bf5477a3eb3d917d2f17d3c0b6720cd6cb97898444a61d43cc983f5c/opentelemetry_exporter_prometheus-0.60b1-py3-none-any.whl", hash = "sha256:49f59178de4f4590e3cef0b8b95cf6e071aae70e1f060566df5546fad773b8fd", size = 13019, upload-time = "2025-12-11T13:32:23.974Z" },
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "1.39.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
+]
+
+[[package]]
+name = "opentelemetry-sdk"
+version = "1.33.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "opentelemetry-semantic-conventions", version = "0.54b1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "typing-extensions", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/67/12/909b98a7d9b110cce4b28d49b2e311797cffdce180371f35eba13a72dd00/opentelemetry_sdk-1.33.1.tar.gz", hash = "sha256:85b9fcf7c3d23506fbc9692fd210b8b025a1920535feec50bd54ce203d57a531", size = 161885, upload-time = "2025-05-16T18:52:52.832Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/8e/ae2d0742041e0bd7fe0d2dcc5e7cce51dcf7d3961a26072d5b43cc8fa2a7/opentelemetry_sdk-1.33.1-py3-none-any.whl", hash = "sha256:19ea73d9a01be29cacaa5d6c8ce0adc0b7f7b4d58cc52f923e4413609f670112", size = 118950, upload-time = "2025-05-16T18:52:37.297Z" },
+]
+
+[[package]]
+name = "opentelemetry-sdk"
+version = "1.39.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "opentelemetry-api", version = "1.39.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "opentelemetry-semantic-conventions", version = "0.60b1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/eb/fb/c76080c9ba07e1e8235d24cdcc4d125ef7aa3edf23eb4e497c2e50889adc/opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6", size = 171460, upload-time = "2025-12-11T13:32:49.369Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7c/98/e91cf858f203d86f4eccdf763dcf01cf03f1dae80c3750f7e635bfa206b6/opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = "sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c", size = 132565, upload-time = "2025-12-11T13:32:35.069Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.54b1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "deprecated", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/2c/d7990fc1ffc82889d466e7cd680788ace44a26789809924813b164344393/opentelemetry_semantic_conventions-0.54b1.tar.gz", hash = "sha256:d1cecedae15d19bdaafca1e56b29a66aa286f50b5d08f036a145c7f3e9ef9cee", size = 118642, upload-time = "2025-05-16T18:52:53.962Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0a/80/08b1698c52ff76d96ba440bf15edc2f4bc0a279868778928e947c1004bdd/opentelemetry_semantic_conventions-0.54b1-py3-none-any.whl", hash = "sha256:29dab644a7e435b58d3a3918b58c333c92686236b30f7891d5e51f02933ca60d", size = 194938, upload-time = "2025-05-16T18:52:38.796Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.60b1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "opentelemetry-api", version = "1.39.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/91/df/553f93ed38bf22f4b999d9be9c185adb558982214f33eae539d3b5cd0858/opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953", size = 137935, upload-time = "2025-12-11T13:32:50.487Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/5e/5958555e09635d09b75de3c4f8b9cae7335ca545d77392ffe7331534c402/opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb", size = 219982, upload-time = "2025-12-11T13:32:36.955Z" },
+]
+
 [[package]]
 name = "overrides"
 version = "7.7.0"
@@ -3483,23 +4297,26 @@ wheels = [
 
 [[package]]
 name = "packaging"
-version = "25.0"
+version = "26.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
 ]
 
 [[package]]
 name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+]
 dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "python-dateutil" },
-    { name = "pytz" },
-    { name = "tzdata" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "python-dateutil", marker = "python_full_version < '3.11'" },
+    { name = "pytz", marker = "python_full_version < '3.11'" },
+    { name = "tzdata", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
 wheels = [
@@ -3552,6 +4369,84 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
 ]
 
+[[package]]
+name = "pandas"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "python-dateutil", marker = "python_full_version >= '3.11'" },
+    { name = "tzdata", marker = "(python_full_version >= '3.11' and sys_platform == 'emscripten') or (python_full_version >= '3.11' and sys_platform == 'win32')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2e/0c/b28ed414f080ee0ad153f848586d61d1878f91689950f037f976ce15f6c8/pandas-3.0.1.tar.gz", hash = "sha256:4186a699674af418f655dbd420ed87f50d56b4cd6603784279d9eef6627823c8", size = 4641901, upload-time = "2026-02-17T22:20:16.434Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ff/07/c7087e003ceee9b9a82539b40414ec557aa795b584a1a346e89180853d79/pandas-3.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de09668c1bf3b925c07e5762291602f0d789eca1b3a781f99c1c78f6cac0e7ea", size = 10323380, upload-time = "2026-02-17T22:18:16.133Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/27/90683c7122febeefe84a56f2cde86a9f05f68d53885cebcc473298dfc33e/pandas-3.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:24ba315ba3d6e5806063ac6eb717504e499ce30bd8c236d8693a5fd3f084c796", size = 9923455, upload-time = "2026-02-17T22:18:19.13Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/f1/ed17d927f9950643bc7631aa4c99ff0cc83a37864470bc419345b656a41f/pandas-3.0.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:406ce835c55bac912f2a0dcfaf27c06d73c6b04a5dde45f1fd3169ce31337389", size = 10753464, upload-time = "2026-02-17T22:18:21.134Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/7c/870c7e7daec2a6c7ff2ac9e33b23317230d4e4e954b35112759ea4a924a7/pandas-3.0.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:830994d7e1f31dd7e790045235605ab61cff6c94defc774547e8b7fdfbff3dc7", size = 11255234, upload-time = "2026-02-17T22:18:24.175Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/39/3653fe59af68606282b989c23d1a543ceba6e8099cbcc5f1d506a7bae2aa/pandas-3.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a64ce8b0f2de1d2efd2ae40b0abe7f8ae6b29fbfb3812098ed5a6f8e235ad9bf", size = 11767299, upload-time = "2026-02-17T22:18:26.824Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/31/1daf3c0c94a849c7a8dab8a69697b36d313b229918002ba3e409265c7888/pandas-3.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9832c2c69da24b602c32e0c7b1b508a03949c18ba08d4d9f1c1033426685b447", size = 12333292, upload-time = "2026-02-17T22:18:28.996Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/67/af63f83cd6ca603a00fe8530c10a60f0879265b8be00b5930e8e78c5b30b/pandas-3.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:84f0904a69e7365f79a0c77d3cdfccbfb05bf87847e3a51a41e1426b0edb9c79", size = 9892176, upload-time = "2026-02-17T22:18:31.79Z" },
+    { url = "https://files.pythonhosted.org/packages/79/ab/9c776b14ac4b7b4140788eca18468ea39894bc7340a408f1d1e379856a6b/pandas-3.0.1-cp311-cp311-win_arm64.whl", hash = "sha256:4a68773d5a778afb31d12e34f7dd4612ab90de8c6fb1d8ffe5d4a03b955082a1", size = 9151328, upload-time = "2026-02-17T22:18:35.721Z" },
+    { url = "https://files.pythonhosted.org/packages/37/51/b467209c08dae2c624873d7491ea47d2b47336e5403309d433ea79c38571/pandas-3.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:476f84f8c20c9f5bc47252b66b4bb25e1a9fc2fa98cead96744d8116cb85771d", size = 10344357, upload-time = "2026-02-17T22:18:38.262Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f1/e2567ffc8951ab371db2e40b2fe068e36b81d8cf3260f06ae508700e5504/pandas-3.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0ab749dfba921edf641d4036c4c21c0b3ea70fea478165cb98a998fb2a261955", size = 9884543, upload-time = "2026-02-17T22:18:41.476Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/39/327802e0b6d693182403c144edacbc27eb82907b57062f23ef5a4c4a5ea7/pandas-3.0.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8e36891080b87823aff3640c78649b91b8ff6eea3c0d70aeabd72ea43ab069b", size = 10396030, upload-time = "2026-02-17T22:18:43.822Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/fe/89d77e424365280b79d99b3e1e7d606f5165af2f2ecfaf0c6d24c799d607/pandas-3.0.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:532527a701281b9dd371e2f582ed9094f4c12dd9ffb82c0c54ee28d8ac9520c4", size = 10876435, upload-time = "2026-02-17T22:18:45.954Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/a6/2a75320849dd154a793f69c951db759aedb8d1dd3939eeacda9bdcfa1629/pandas-3.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:356e5c055ed9b0da1580d465657bc7d00635af4fd47f30afb23025352ba764d1", size = 11405133, upload-time = "2026-02-17T22:18:48.533Z" },
+    { url = "https://files.pythonhosted.org/packages/58/53/1d68fafb2e02d7881df66aa53be4cd748d25cbe311f3b3c85c93ea5d30ca/pandas-3.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9d810036895f9ad6345b8f2a338dd6998a74e8483847403582cab67745bff821", size = 11932065, upload-time = "2026-02-17T22:18:50.837Z" },
+    { url = "https://files.pythonhosted.org/packages/75/08/67cc404b3a966b6df27b38370ddd96b3b023030b572283d035181854aac5/pandas-3.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:536232a5fe26dd989bd633e7a0c450705fdc86a207fec7254a55e9a22950fe43", size = 9741627, upload-time = "2026-02-17T22:18:53.905Z" },
+    { url = "https://files.pythonhosted.org/packages/86/4f/caf9952948fb00d23795f09b893d11f1cacb384e666854d87249530f7cbe/pandas-3.0.1-cp312-cp312-win_arm64.whl", hash = "sha256:0f463ebfd8de7f326d38037c7363c6dacb857c5881ab8961fb387804d6daf2f7", size = 9052483, upload-time = "2026-02-17T22:18:57.31Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/48/aad6ec4f8d007534c091e9a7172b3ec1b1ee6d99a9cbb936b5eab6c6cf58/pandas-3.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5272627187b5d9c20e55d27caf5f2cd23e286aba25cadf73c8590e432e2b7262", size = 10317509, upload-time = "2026-02-17T22:18:59.498Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/14/5990826f779f79148ae9d3a2c39593dc04d61d5d90541e71b5749f35af95/pandas-3.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:661e0f665932af88c7877f31da0dc743fe9c8f2524bdffe23d24fdcb67ef9d56", size = 9860561, upload-time = "2026-02-17T22:19:02.265Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/80/f01ff54664b6d70fed71475543d108a9b7c888e923ad210795bef04ffb7d/pandas-3.0.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:75e6e292ff898679e47a2199172593d9f6107fd2dd3617c22c2946e97d5df46e", size = 10365506, upload-time = "2026-02-17T22:19:05.017Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/85/ab6d04733a7d6ff32bfc8382bf1b07078228f5d6ebec5266b91bfc5c4ff7/pandas-3.0.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1ff8cf1d2896e34343197685f432450ec99a85ba8d90cce2030c5eee2ef98791", size = 10873196, upload-time = "2026-02-17T22:19:07.204Z" },
+    { url = "https://files.pythonhosted.org/packages/48/a9/9301c83d0b47c23ac5deab91c6b39fd98d5b5db4d93b25df8d381451828f/pandas-3.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eca8b4510f6763f3d37359c2105df03a7a221a508f30e396a51d0713d462e68a", size = 11370859, upload-time = "2026-02-17T22:19:09.436Z" },
+    { url = "https://files.pythonhosted.org/packages/59/fe/0c1fc5bd2d29c7db2ab372330063ad555fb83e08422829c785f5ec2176ca/pandas-3.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:06aff2ad6f0b94a17822cf8b83bbb563b090ed82ff4fe7712db2ce57cd50d9b8", size = 11924584, upload-time = "2026-02-17T22:19:11.562Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/7d/216a1588b65a7aa5f4535570418a599d943c85afb1d95b0876fc00aa1468/pandas-3.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:9fea306c783e28884c29057a1d9baa11a349bbf99538ec1da44c8476563d1b25", size = 9742769, upload-time = "2026-02-17T22:19:13.926Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/cb/810a22a6af9a4e97c8ab1c946b47f3489c5bca5adc483ce0ffc84c9cc768/pandas-3.0.1-cp313-cp313-win_arm64.whl", hash = "sha256:a8d37a43c52917427e897cb2e429f67a449327394396a81034a4449b99afda59", size = 9043855, upload-time = "2026-02-17T22:19:16.09Z" },
+    { url = "https://files.pythonhosted.org/packages/92/fa/423c89086cca1f039cf1253c3ff5b90f157b5b3757314aa635f6bf3e30aa/pandas-3.0.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d54855f04f8246ed7b6fc96b05d4871591143c46c0b6f4af874764ed0d2d6f06", size = 10752673, upload-time = "2026-02-17T22:19:18.304Z" },
+    { url = "https://files.pythonhosted.org/packages/22/23/b5a08ec1f40020397f0faba72f1e2c11f7596a6169c7b3e800abff0e433f/pandas-3.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e1b677accee34a09e0dc2ce5624e4a58a1870ffe56fc021e9caf7f23cd7668f", size = 10404967, upload-time = "2026-02-17T22:19:20.726Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/81/94841f1bb4afdc2b52a99daa895ac2c61600bb72e26525ecc9543d453ebc/pandas-3.0.1-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9cabbdcd03f1b6cd254d6dda8ae09b0252524be1592594c00b7895916cb1324", size = 10320575, upload-time = "2026-02-17T22:19:24.919Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/8b/2ae37d66a5342a83adadfd0cb0b4bf9c3c7925424dd5f40d15d6cfaa35ee/pandas-3.0.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ae2ab1f166668b41e770650101e7090824fd34d17915dd9cd479f5c5e0065e9", size = 10710921, upload-time = "2026-02-17T22:19:27.181Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/61/772b2e2757855e232b7ccf7cb8079a5711becb3a97f291c953def15a833f/pandas-3.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6bf0603c2e30e2cafac32807b06435f28741135cb8697eae8b28c7d492fc7d76", size = 11334191, upload-time = "2026-02-17T22:19:29.411Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/08/b16c6df3ef555d8495d1d265a7963b65be166785d28f06a350913a4fac78/pandas-3.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6c426422973973cae1f4a23e51d4ae85974f44871b24844e4f7de752dd877098", size = 11782256, upload-time = "2026-02-17T22:19:32.34Z" },
+    { url = "https://files.pythonhosted.org/packages/55/80/178af0594890dee17e239fca96d3d8670ba0f5ff59b7d0439850924a9c09/pandas-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b03f91ae8c10a85c1613102c7bef5229b5379f343030a3ccefeca8a33414cf35", size = 10485047, upload-time = "2026-02-17T22:19:34.605Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/8b/4bb774a998b97e6c2fd62a9e6cfdaae133b636fd1c468f92afb4ae9a447a/pandas-3.0.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:99d0f92ed92d3083d140bf6b97774f9f13863924cf3f52a70711f4e7588f9d0a", size = 10322465, upload-time = "2026-02-17T22:19:36.803Z" },
+    { url = "https://files.pythonhosted.org/packages/72/3a/5b39b51c64159f470f1ca3b1c2a87da290657ca022f7cd11442606f607d1/pandas-3.0.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3b66857e983208654294bb6477b8a63dee26b37bdd0eb34d010556e91261784f", size = 9910632, upload-time = "2026-02-17T22:19:39.001Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/f7/b449ffb3f68c11da12fc06fbf6d2fa3a41c41e17d0284d23a79e1c13a7e4/pandas-3.0.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56cf59638bf24dc9bdf2154c81e248b3289f9a09a6d04e63608c159022352749", size = 10440535, upload-time = "2026-02-17T22:19:41.157Z" },
+    { url = "https://files.pythonhosted.org/packages/55/77/6ea82043db22cb0f2bbfe7198da3544000ddaadb12d26be36e19b03a2dc5/pandas-3.0.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1a9f55e0f46951874b863d1f3906dcb57df2d9be5c5847ba4dfb55b2c815249", size = 10893940, upload-time = "2026-02-17T22:19:43.493Z" },
+    { url = "https://files.pythonhosted.org/packages/03/30/f1b502a72468c89412c1b882a08f6eed8a4ee9dc033f35f65d0663df6081/pandas-3.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1849f0bba9c8a2fb0f691d492b834cc8dadf617e29015c66e989448d58d011ee", size = 11442711, upload-time = "2026-02-17T22:19:46.074Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/f0/ebb6ddd8fc049e98cabac5c2924d14d1dda26a20adb70d41ea2e428d3ec4/pandas-3.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3d288439e11b5325b02ae6e9cc83e6805a62c40c5a6220bea9beb899c073b1c", size = 11963918, upload-time = "2026-02-17T22:19:48.838Z" },
+    { url = "https://files.pythonhosted.org/packages/09/f8/8ce132104074f977f907442790eaae24e27bce3b3b454e82faa3237ff098/pandas-3.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:93325b0fe372d192965f4cca88d97667f49557398bbf94abdda3bf1b591dbe66", size = 9862099, upload-time = "2026-02-17T22:19:51.081Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/b7/6af9aac41ef2456b768ef0ae60acf8abcebb450a52043d030a65b4b7c9bd/pandas-3.0.1-cp314-cp314-win_arm64.whl", hash = "sha256:97ca08674e3287c7148f4858b01136f8bdfe7202ad25ad04fec602dd1d29d132", size = 9185333, upload-time = "2026-02-17T22:19:53.266Z" },
+    { url = "https://files.pythonhosted.org/packages/66/fc/848bb6710bc6061cb0c5badd65b92ff75c81302e0e31e496d00029fe4953/pandas-3.0.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:58eeb1b2e0fb322befcf2bbc9ba0af41e616abadb3d3414a6bc7167f6cbfce32", size = 10772664, upload-time = "2026-02-17T22:19:55.806Z" },
+    { url = "https://files.pythonhosted.org/packages/69/5c/866a9bbd0f79263b4b0db6ec1a341be13a1473323f05c122388e0f15b21d/pandas-3.0.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cd9af1276b5ca9e298bd79a26bda32fa9cc87ed095b2a9a60978d2ca058eaf87", size = 10421286, upload-time = "2026-02-17T22:19:58.091Z" },
+    { url = "https://files.pythonhosted.org/packages/51/a4/2058fb84fb1cfbfb2d4a6d485e1940bb4ad5716e539d779852494479c580/pandas-3.0.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94f87a04984d6b63788327cd9f79dda62b7f9043909d2440ceccf709249ca988", size = 10342050, upload-time = "2026-02-17T22:20:01.376Z" },
+    { url = "https://files.pythonhosted.org/packages/22/1b/674e89996cc4be74db3c4eb09240c4bb549865c9c3f5d9b086ff8fcfbf00/pandas-3.0.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85fe4c4df62e1e20f9db6ebfb88c844b092c22cd5324bdcf94bfa2fc1b391221", size = 10740055, upload-time = "2026-02-17T22:20:04.328Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/f8/e954b750764298c22fa4614376531fe63c521ef517e7059a51f062b87dca/pandas-3.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:331ca75a2f8672c365ae25c0b29e46f5ac0c6551fdace8eec4cd65e4fac271ff", size = 11357632, upload-time = "2026-02-17T22:20:06.647Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/02/c6e04b694ffd68568297abd03588b6d30295265176a5c01b7459d3bc35a3/pandas-3.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:15860b1fdb1973fffade772fdb931ccf9b2f400a3f5665aef94a00445d7d8dd5", size = 11810974, upload-time = "2026-02-17T22:20:08.946Z" },
+    { url = "https://files.pythonhosted.org/packages/89/41/d7dfb63d2407f12055215070c42fc6ac41b66e90a2946cdc5e759058398b/pandas-3.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:44f1364411d5670efa692b146c748f4ed013df91ee91e9bec5677fb1fd58b937", size = 10884622, upload-time = "2026-02-17T22:20:11.711Z" },
+    { url = "https://files.pythonhosted.org/packages/68/b0/34937815889fa982613775e4b97fddd13250f11012d769949c5465af2150/pandas-3.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:108dd1790337a494aa80e38def654ca3f0968cf4f362c85f44c15e471667102d", size = 9452085, upload-time = "2026-02-17T22:20:14.331Z" },
+]
+
 [[package]]
 name = "paramiko"
 version = "4.0.0"
@@ -3569,11 +4464,11 @@ wheels = [
 
 [[package]]
 name = "pathspec"
-version = "0.12.1"
+version = "1.0.4"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" },
 ]
 
 [[package]]
@@ -3587,109 +4482,109 @@ wheels = [
 
 [[package]]
 name = "pillow"
-version = "12.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/5d/08/26e68b6b5da219c2a2cb7b563af008b53bb8e6b6fcb3fa40715fcdb2523a/pillow-12.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:3adfb466bbc544b926d50fe8f4a4e6abd8c6bffd28a26177594e6e9b2b76572b", size = 5289809, upload-time = "2025-10-15T18:21:27.791Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/e9/4e58fb097fb74c7b4758a680aacd558810a417d1edaa7000142976ef9d2f/pillow-12.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1ac11e8ea4f611c3c0147424eae514028b5e9077dd99ab91e1bd7bc33ff145e1", size = 4650606, upload-time = "2025-10-15T18:21:29.823Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/e0/1fa492aa9f77b3bc6d471c468e62bfea1823056bf7e5e4f1914d7ab2565e/pillow-12.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d49e2314c373f4c2b39446fb1a45ed333c850e09d0c59ac79b72eb3b95397363", size = 6221023, upload-time = "2025-10-15T18:21:31.415Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/09/4de7cd03e33734ccd0c876f0251401f1314e819cbfd89a0fcb6e77927cc6/pillow-12.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c7b2a63fd6d5246349f3d3f37b14430d73ee7e8173154461785e43036ffa96ca", size = 8024937, upload-time = "2025-10-15T18:21:33.453Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/69/0688e7c1390666592876d9d474f5e135abb4acb39dcb583c4dc5490f1aff/pillow-12.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d64317d2587c70324b79861babb9c09f71fbb780bad212018874b2c013d8600e", size = 6334139, upload-time = "2025-10-15T18:21:35.395Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/1c/880921e98f525b9b44ce747ad1ea8f73fd7e992bafe3ca5e5644bf433dea/pillow-12.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d77153e14b709fd8b8af6f66a3afbb9ed6e9fc5ccf0b6b7e1ced7b036a228782", size = 7026074, upload-time = "2025-10-15T18:21:37.219Z" },
-    { url = "https://files.pythonhosted.org/packages/28/03/96f718331b19b355610ef4ebdbbde3557c726513030665071fd025745671/pillow-12.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32ed80ea8a90ee3e6fa08c21e2e091bba6eda8eccc83dbc34c95169507a91f10", size = 6448852, upload-time = "2025-10-15T18:21:39.168Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/a0/6a193b3f0cc9437b122978d2c5cbce59510ccf9a5b48825096ed7472da2f/pillow-12.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c828a1ae702fc712978bda0320ba1b9893d99be0badf2647f693cc01cf0f04fa", size = 7117058, upload-time = "2025-10-15T18:21:40.997Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/c4/043192375eaa4463254e8e61f0e2ec9a846b983929a8d0a7122e0a6d6fff/pillow-12.0.0-cp310-cp310-win32.whl", hash = "sha256:bd87e140e45399c818fac4247880b9ce719e4783d767e030a883a970be632275", size = 6295431, upload-time = "2025-10-15T18:21:42.518Z" },
-    { url = "https://files.pythonhosted.org/packages/92/c6/c2f2fc7e56301c21827e689bb8b0b465f1b52878b57471a070678c0c33cd/pillow-12.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:455247ac8a4cfb7b9bc45b7e432d10421aea9fc2e74d285ba4072688a74c2e9d", size = 7000412, upload-time = "2025-10-15T18:21:44.404Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/d2/5f675067ba82da7a1c238a73b32e3fd78d67f9d9f80fbadd33a40b9c0481/pillow-12.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:6ace95230bfb7cd79ef66caa064bbe2f2a1e63d93471c3a2e1f1348d9f22d6b7", size = 2435903, upload-time = "2025-10-15T18:21:46.29Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/5a/a2f6773b64edb921a756eb0729068acad9fc5208a53f4a349396e9436721/pillow-12.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0fd00cac9c03256c8b2ff58f162ebcd2587ad3e1f2e397eab718c47e24d231cc", size = 5289798, upload-time = "2025-10-15T18:21:47.763Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/05/069b1f8a2e4b5a37493da6c5868531c3f77b85e716ad7a590ef87d58730d/pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3475b96f5908b3b16c47533daaa87380c491357d197564e0ba34ae75c0f3257", size = 4650589, upload-time = "2025-10-15T18:21:49.515Z" },
-    { url = "https://files.pythonhosted.org/packages/61/e3/2c820d6e9a36432503ead175ae294f96861b07600a7156154a086ba7111a/pillow-12.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:110486b79f2d112cf6add83b28b627e369219388f64ef2f960fef9ebaf54c642", size = 6230472, upload-time = "2025-10-15T18:21:51.052Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/89/63427f51c64209c5e23d4d52071c8d0f21024d3a8a487737caaf614a5795/pillow-12.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5269cc1caeedb67e6f7269a42014f381f45e2e7cd42d834ede3c703a1d915fe3", size = 8033887, upload-time = "2025-10-15T18:21:52.604Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/1b/c9711318d4901093c15840f268ad649459cd81984c9ec9887756cca049a5/pillow-12.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa5129de4e174daccbc59d0a3b6d20eaf24417d59851c07ebb37aeb02947987c", size = 6343964, upload-time = "2025-10-15T18:21:54.619Z" },
-    { url = "https://files.pythonhosted.org/packages/41/1e/db9470f2d030b4995083044cd8738cdd1bf773106819f6d8ba12597d5352/pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bee2a6db3a7242ea309aa7ee8e2780726fed67ff4e5b40169f2c940e7eb09227", size = 7034756, upload-time = "2025-10-15T18:21:56.151Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/b0/6177a8bdd5ee4ed87cba2de5a3cc1db55ffbbec6176784ce5bb75aa96798/pillow-12.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:90387104ee8400a7b4598253b4c406f8958f59fcf983a6cea2b50d59f7d63d0b", size = 6458075, upload-time = "2025-10-15T18:21:57.759Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/5e/61537aa6fa977922c6a03253a0e727e6e4a72381a80d63ad8eec350684f2/pillow-12.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc91a56697869546d1b8f0a3ff35224557ae7f881050e99f615e0119bf934b4e", size = 7125955, upload-time = "2025-10-15T18:21:59.372Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/3d/d5033539344ee3cbd9a4d69e12e63ca3a44a739eb2d4c8da350a3d38edd7/pillow-12.0.0-cp311-cp311-win32.whl", hash = "sha256:27f95b12453d165099c84f8a8bfdfd46b9e4bda9e0e4b65f0635430027f55739", size = 6298440, upload-time = "2025-10-15T18:22:00.982Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/42/aaca386de5cc8bd8a0254516957c1f265e3521c91515b16e286c662854c4/pillow-12.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b583dc9070312190192631373c6c8ed277254aa6e6084b74bdd0a6d3b221608e", size = 6999256, upload-time = "2025-10-15T18:22:02.617Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/f1/9197c9c2d5708b785f631a6dfbfa8eb3fb9672837cb92ae9af812c13b4ed/pillow-12.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:759de84a33be3b178a64c8ba28ad5c135900359e85fb662bc6e403ad4407791d", size = 2436025, upload-time = "2025-10-15T18:22:04.598Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/90/4fcce2c22caf044e660a198d740e7fbc14395619e3cb1abad12192c0826c/pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371", size = 5249377, upload-time = "2025-10-15T18:22:05.993Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082", size = 4650343, upload-time = "2025-10-15T18:22:07.718Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/a1/f81fdeddcb99c044bf7d6faa47e12850f13cee0849537a7d27eeab5534d4/pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f", size = 6232981, upload-time = "2025-10-15T18:22:09.287Z" },
-    { url = "https://files.pythonhosted.org/packages/88/e1/9098d3ce341a8750b55b0e00c03f1630d6178f38ac191c81c97a3b047b44/pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d", size = 8041399, upload-time = "2025-10-15T18:22:10.872Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/62/a22e8d3b602ae8cc01446d0c57a54e982737f44b6f2e1e019a925143771d/pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953", size = 6347740, upload-time = "2025-10-15T18:22:12.769Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/87/424511bdcd02c8d7acf9f65caa09f291a519b16bd83c3fb3374b3d4ae951/pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8", size = 7040201, upload-time = "2025-10-15T18:22:14.813Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/4d/435c8ac688c54d11755aedfdd9f29c9eeddf68d150fe42d1d3dbd2365149/pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79", size = 6462334, upload-time = "2025-10-15T18:22:16.375Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/f2/ad34167a8059a59b8ad10bc5c72d4d9b35acc6b7c0877af8ac885b5f2044/pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba", size = 7134162, upload-time = "2025-10-15T18:22:17.996Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/b1/a7391df6adacf0a5c2cf6ac1cf1fcc1369e7d439d28f637a847f8803beb3/pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0", size = 6298769, upload-time = "2025-10-15T18:22:19.923Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/0b/d87733741526541c909bbf159e338dcace4f982daac6e5a8d6be225ca32d/pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a", size = 7001107, upload-time = "2025-10-15T18:22:21.644Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" },
-    { url = "https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/a4/a0a31467e3f83b94d37568294b01d22b43ae3c5d85f2811769b9c66389dd/pillow-12.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c50f36a62a22d350c96e49ad02d0da41dbd17ddc2e29750dbdba4323f85eb4a5", size = 5249132, upload-time = "2025-10-15T18:22:30.641Z" },
-    { url = "https://files.pythonhosted.org/packages/83/06/48eab21dd561de2914242711434c0c0eb992ed08ff3f6107a5f44527f5e9/pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5193fde9a5f23c331ea26d0cf171fbf67e3f247585f50c08b3e205c7aeb4589b", size = 4650099, upload-time = "2025-10-15T18:22:32.73Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/bd/69ed99fd46a8dba7c1887156d3572fe4484e3f031405fcc5a92e31c04035/pillow-12.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bde737cff1a975b70652b62d626f7785e0480918dece11e8fef3c0cf057351c3", size = 6230808, upload-time = "2025-10-15T18:22:34.337Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/94/8fad659bcdbf86ed70099cb60ae40be6acca434bbc8c4c0d4ef356d7e0de/pillow-12.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6597ff2b61d121172f5844b53f21467f7082f5fb385a9a29c01414463f93b07", size = 8037804, upload-time = "2025-10-15T18:22:36.402Z" },
-    { url = "https://files.pythonhosted.org/packages/20/39/c685d05c06deecfd4e2d1950e9a908aa2ca8bc4e6c3b12d93b9cafbd7837/pillow-12.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b817e7035ea7f6b942c13aa03bb554fc44fea70838ea21f8eb31c638326584e", size = 6345553, upload-time = "2025-10-15T18:22:38.066Z" },
-    { url = "https://files.pythonhosted.org/packages/38/57/755dbd06530a27a5ed74f8cb0a7a44a21722ebf318edbe67ddbd7fb28f88/pillow-12.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4f1231b7dec408e8670264ce63e9c71409d9583dd21d32c163e25213ee2a344", size = 7037729, upload-time = "2025-10-15T18:22:39.769Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/b6/7e94f4c41d238615674d06ed677c14883103dce1c52e4af16f000338cfd7/pillow-12.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e51b71417049ad6ab14c49608b4a24d8fb3fe605e5dfabfe523b58064dc3d27", size = 6459789, upload-time = "2025-10-15T18:22:41.437Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/14/4448bb0b5e0f22dd865290536d20ec8a23b64e2d04280b89139f09a36bb6/pillow-12.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d120c38a42c234dc9a8c5de7ceaaf899cf33561956acb4941653f8bdc657aa79", size = 7130917, upload-time = "2025-10-15T18:22:43.152Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" },
-    { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/5e/9046b423735c21f0487ea6cb5b10f89ea8f8dfbe32576fe052b5ba9d4e5b/pillow-12.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7fa22993bac7b77b78cae22bad1e2a987ddf0d9015c63358032f84a53f23cdc3", size = 5251406, upload-time = "2025-10-15T18:22:49.905Z" },
-    { url = "https://files.pythonhosted.org/packages/12/66/982ceebcdb13c97270ef7a56c3969635b4ee7cd45227fa707c94719229c5/pillow-12.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f135c702ac42262573fe9714dfe99c944b4ba307af5eb507abef1667e2cbbced", size = 4653218, upload-time = "2025-10-15T18:22:51.587Z" },
-    { url = "https://files.pythonhosted.org/packages/16/b3/81e625524688c31859450119bf12674619429cab3119eec0e30a7a1029cb/pillow-12.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c85de1136429c524e55cfa4e033b4a7940ac5c8ee4d9401cc2d1bf48154bbc7b", size = 6266564, upload-time = "2025-10-15T18:22:53.215Z" },
-    { url = "https://files.pythonhosted.org/packages/98/59/dfb38f2a41240d2408096e1a76c671d0a105a4a8471b1871c6902719450c/pillow-12.0.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38df9b4bfd3db902c9c2bd369bcacaf9d935b2fff73709429d95cc41554f7b3d", size = 8069260, upload-time = "2025-10-15T18:22:54.933Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/3d/378dbea5cd1874b94c312425ca77b0f47776c78e0df2df751b820c8c1d6c/pillow-12.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d87ef5795da03d742bf49439f9ca4d027cde49c82c5371ba52464aee266699a", size = 6379248, upload-time = "2025-10-15T18:22:56.605Z" },
-    { url = "https://files.pythonhosted.org/packages/84/b0/d525ef47d71590f1621510327acec75ae58c721dc071b17d8d652ca494d8/pillow-12.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aff9e4d82d082ff9513bdd6acd4f5bd359f5b2c870907d2b0a9c5e10d40c88fe", size = 7066043, upload-time = "2025-10-15T18:22:58.53Z" },
-    { url = "https://files.pythonhosted.org/packages/61/2c/aced60e9cf9d0cde341d54bf7932c9ffc33ddb4a1595798b3a5150c7ec4e/pillow-12.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8d8ca2b210ada074d57fcee40c30446c9562e542fc46aedc19baf758a93532ee", size = 6490915, upload-time = "2025-10-15T18:23:00.582Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/26/69dcb9b91f4e59f8f34b2332a4a0a951b44f547c4ed39d3e4dcfcff48f89/pillow-12.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:99a7f72fb6249302aa62245680754862a44179b545ded638cf1fef59befb57ef", size = 7157998, upload-time = "2025-10-15T18:23:02.627Z" },
-    { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" },
-    { url = "https://files.pythonhosted.org/packages/54/2a/9a8c6ba2c2c07b71bec92cf63e03370ca5e5f5c5b119b742bcc0cde3f9c5/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9", size = 4045531, upload-time = "2025-10-15T18:23:10.121Z" },
-    { url = "https://files.pythonhosted.org/packages/84/54/836fdbf1bfb3d66a59f0189ff0b9f5f666cee09c6188309300df04ad71fa/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2", size = 4120554, upload-time = "2025-10-15T18:23:12.14Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/cd/16aec9f0da4793e98e6b54778a5fbce4f375c6646fe662e80600b8797379/pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a", size = 3576812, upload-time = "2025-10-15T18:23:13.962Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/b7/13957fda356dc46339298b351cae0d327704986337c3c69bb54628c88155/pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b", size = 5252689, upload-time = "2025-10-15T18:23:15.562Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/f5/eae31a306341d8f331f43edb2e9122c7661b975433de5e447939ae61c5da/pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad", size = 4650186, upload-time = "2025-10-15T18:23:17.379Z" },
-    { url = "https://files.pythonhosted.org/packages/86/62/2a88339aa40c4c77e79108facbd307d6091e2c0eb5b8d3cf4977cfca2fe6/pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01", size = 6230308, upload-time = "2025-10-15T18:23:18.971Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/33/5425a8992bcb32d1cb9fa3dd39a89e613d09a22f2c8083b7bf43c455f760/pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c", size = 8039222, upload-time = "2025-10-15T18:23:20.909Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/61/3f5d3b35c5728f37953d3eec5b5f3e77111949523bd2dd7f31a851e50690/pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e", size = 6346657, upload-time = "2025-10-15T18:23:23.077Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/be/ee90a3d79271227e0f0a33c453531efd6ed14b2e708596ba5dd9be948da3/pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e", size = 7038482, upload-time = "2025-10-15T18:23:25.005Z" },
-    { url = "https://files.pythonhosted.org/packages/44/34/a16b6a4d1ad727de390e9bd9f19f5f669e079e5826ec0f329010ddea492f/pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9", size = 6461416, upload-time = "2025-10-15T18:23:27.009Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/39/1aa5850d2ade7d7ba9f54e4e4c17077244ff7a2d9e25998c38a29749eb3f/pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab", size = 7131584, upload-time = "2025-10-15T18:23:29.752Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/db/4fae862f8fad0167073a7733973bfa955f47e2cac3dc3e3e6257d10fab4a/pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b", size = 6400621, upload-time = "2025-10-15T18:23:32.06Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/24/b350c31543fb0107ab2599464d7e28e6f856027aadda995022e695313d94/pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b", size = 7142916, upload-time = "2025-10-15T18:23:34.71Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/9b/0ba5a6fd9351793996ef7487c4fdbde8d3f5f75dbedc093bb598648fddf0/pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0", size = 2523836, upload-time = "2025-10-15T18:23:36.967Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/7a/ceee0840aebc579af529b523d530840338ecf63992395842e54edc805987/pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6", size = 5255092, upload-time = "2025-10-15T18:23:38.573Z" },
-    { url = "https://files.pythonhosted.org/packages/44/76/20776057b4bfd1aef4eeca992ebde0f53a4dce874f3ae693d0ec90a4f79b/pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6", size = 4653158, upload-time = "2025-10-15T18:23:40.238Z" },
-    { url = "https://files.pythonhosted.org/packages/82/3f/d9ff92ace07be8836b4e7e87e6a4c7a8318d47c2f1463ffcf121fc57d9cb/pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1", size = 6267882, upload-time = "2025-10-15T18:23:42.434Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/7a/4f7ff87f00d3ad33ba21af78bfcd2f032107710baf8280e3722ceec28cda/pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e", size = 8071001, upload-time = "2025-10-15T18:23:44.29Z" },
-    { url = "https://files.pythonhosted.org/packages/75/87/fcea108944a52dad8cca0715ae6247e271eb80459364a98518f1e4f480c1/pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca", size = 6380146, upload-time = "2025-10-15T18:23:46.065Z" },
-    { url = "https://files.pythonhosted.org/packages/91/52/0d31b5e571ef5fd111d2978b84603fce26aba1b6092f28e941cb46570745/pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925", size = 7067344, upload-time = "2025-10-15T18:23:47.898Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/f4/2dd3d721f875f928d48e83bb30a434dee75a2531bca839bb996bb0aa5a91/pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8", size = 6491864, upload-time = "2025-10-15T18:23:49.607Z" },
-    { url = "https://files.pythonhosted.org/packages/30/4b/667dfcf3d61fc309ba5a15b141845cece5915e39b99c1ceab0f34bf1d124/pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4", size = 7158911, upload-time = "2025-10-15T18:23:51.351Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/2f/16cabcc6426c32218ace36bf0d55955e813f2958afddbf1d391849fee9d1/pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52", size = 6408045, upload-time = "2025-10-15T18:23:53.177Z" },
-    { url = "https://files.pythonhosted.org/packages/35/73/e29aa0c9c666cf787628d3f0dcf379f4791fba79f4936d02f8b37165bdf8/pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a", size = 7148282, upload-time = "2025-10-15T18:23:55.316Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/70/6b41bdcddf541b437bbb9f47f94d2db5d9ddef6c37ccab8c9107743748a4/pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7", size = 2525630, upload-time = "2025-10-15T18:23:57.149Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/b3/582327e6c9f86d037b63beebe981425d6811104cb443e8193824ef1a2f27/pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8", size = 5215068, upload-time = "2025-10-15T18:23:59.594Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/d6/67748211d119f3b6540baf90f92fae73ae51d5217b171b0e8b5f7e5d558f/pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a", size = 4614994, upload-time = "2025-10-15T18:24:01.669Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/e1/f8281e5d844c41872b273b9f2c34a4bf64ca08905668c8ae730eedc7c9fa/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197", size = 5246639, upload-time = "2025-10-15T18:24:03.403Z" },
-    { url = "https://files.pythonhosted.org/packages/94/5a/0d8ab8ffe8a102ff5df60d0de5af309015163bf710c7bb3e8311dd3b3ad0/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aeaefa96c768fc66818730b952a862235d68825c178f1b3ffd4efd7ad2edcb7c", size = 6986839, upload-time = "2025-10-15T18:24:05.344Z" },
-    { url = "https://files.pythonhosted.org/packages/20/2e/3434380e8110b76cd9eb00a363c484b050f949b4bbe84ba770bb8508a02c/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f2d0abef9e4e2f349305a4f8cc784a8a6c2f58a8c4892eea13b10a943bd26e", size = 5313505, upload-time = "2025-10-15T18:24:07.137Z" },
-    { url = "https://files.pythonhosted.org/packages/57/ca/5a9d38900d9d74785141d6580950fe705de68af735ff6e727cb911b64740/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdee52571a343d721fb2eb3b090a82d959ff37fc631e3f70422e0c2e029f3e76", size = 5963654, upload-time = "2025-10-15T18:24:09.579Z" },
-    { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" },
+version = "12.1.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1f/42/5c74462b4fd957fcd7b13b04fb3205ff8349236ea74c7c375766d6c82288/pillow-12.1.1.tar.gz", hash = "sha256:9ad8fa5937ab05218e2b6a4cff30295ad35afd2f83ac592e68c0d871bb0fdbc4", size = 46980264, upload-time = "2026-02-11T04:23:07.146Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/30/5bd3d794762481f8c8ae9c80e7b76ecea73b916959eb587521358ef0b2f9/pillow-12.1.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f1625b72740fdda5d77b4def688eb8fd6490975d06b909fd19f13f391e077e0", size = 5304099, upload-time = "2026-02-11T04:20:06.13Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/c1/aab9e8f3eeb4490180e357955e15c2ef74b31f64790ff356c06fb6cf6d84/pillow-12.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:178aa072084bd88ec759052feca8e56cbb14a60b39322b99a049e58090479713", size = 4657880, upload-time = "2026-02-11T04:20:09.291Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/0a/9879e30d56815ad529d3985aeff5af4964202425c27261a6ada10f7cbf53/pillow-12.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b66e95d05ba806247aaa1561f080abc7975daf715c30780ff92a20e4ec546e1b", size = 6222587, upload-time = "2026-02-11T04:20:10.82Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/5f/a1b72ff7139e4f89014e8d451442c74a774d5c43cd938fb0a9f878576b37/pillow-12.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:89c7e895002bbe49cdc5426150377cbbc04767d7547ed145473f496dfa40408b", size = 8027678, upload-time = "2026-02-11T04:20:12.455Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/c2/c7cb187dac79a3d22c3ebeae727abee01e077c8c7d930791dc592f335153/pillow-12.1.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a5cbdcddad0af3da87cb16b60d23648bc3b51967eb07223e9fed77a82b457c4", size = 6335777, upload-time = "2026-02-11T04:20:14.441Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/7b/f9b09a7804ec7336effb96c26d37c29d27225783dc1501b7d62dcef6ae25/pillow-12.1.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9f51079765661884a486727f0729d29054242f74b46186026582b4e4769918e4", size = 7027140, upload-time = "2026-02-11T04:20:16.387Z" },
+    { url = "https://files.pythonhosted.org/packages/98/b2/2fa3c391550bd421b10849d1a2144c44abcd966daadd2f7c12e19ea988c4/pillow-12.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:99c1506ea77c11531d75e3a412832a13a71c7ebc8192ab9e4b2e355555920e3e", size = 6449855, upload-time = "2026-02-11T04:20:18.554Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ff/9caf4b5b950c669263c39e96c78c0d74a342c71c4f43fd031bb5cb7ceac9/pillow-12.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:36341d06738a9f66c8287cf8b876d24b18db9bd8740fa0672c74e259ad408cff", size = 7151329, upload-time = "2026-02-11T04:20:20.646Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/f8/4b24841f582704da675ca535935bccb32b00a6da1226820845fac4a71136/pillow-12.1.1-cp310-cp310-win32.whl", hash = "sha256:6c52f062424c523d6c4db85518774cc3d50f5539dd6eed32b8f6229b26f24d40", size = 6325574, upload-time = "2026-02-11T04:20:22.43Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/f9/9f6b01c0881d7036063aa6612ef04c0e2cad96be21325a1e92d0203f8e91/pillow-12.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6008de247150668a705a6338156efb92334113421ceecf7438a12c9a12dab23", size = 7032347, upload-time = "2026-02-11T04:20:23.932Z" },
+    { url = "https://files.pythonhosted.org/packages/79/13/c7922edded3dcdaf10c59297540b72785620abc0538872c819915746757d/pillow-12.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:1a9b0ee305220b392e1124a764ee4265bd063e54a751a6b62eff69992f457fa9", size = 2453457, upload-time = "2026-02-11T04:20:25.392Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/46/5da1ec4a5171ee7bf1a0efa064aba70ba3d6e0788ce3f5acd1375d23c8c0/pillow-12.1.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e879bb6cd5c73848ef3b2b48b8af9ff08c5b71ecda8048b7dd22d8a33f60be32", size = 5304084, upload-time = "2026-02-11T04:20:27.501Z" },
+    { url = "https://files.pythonhosted.org/packages/78/93/a29e9bc02d1cf557a834da780ceccd54e02421627200696fcf805ebdc3fb/pillow-12.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:365b10bb9417dd4498c0e3b128018c4a624dc11c7b97d8cc54effe3b096f4c38", size = 4657866, upload-time = "2026-02-11T04:20:29.827Z" },
+    { url = "https://files.pythonhosted.org/packages/13/84/583a4558d492a179d31e4aae32eadce94b9acf49c0337c4ce0b70e0a01f2/pillow-12.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d4ce8e329c93845720cd2014659ca67eac35f6433fd3050393d85f3ecef0dad5", size = 6232148, upload-time = "2026-02-11T04:20:31.329Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/e2/53c43334bbbb2d3b938978532fbda8e62bb6e0b23a26ce8592f36bcc4987/pillow-12.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc354a04072b765eccf2204f588a7a532c9511e8b9c7f900e1b64e3e33487090", size = 8038007, upload-time = "2026-02-11T04:20:34.225Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/a6/3d0e79c8a9d58150dd98e199d7c1c56861027f3829a3a60b3c2784190180/pillow-12.1.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7e7976bf1910a8116b523b9f9f58bf410f3e8aa330cd9a2bb2953f9266ab49af", size = 6345418, upload-time = "2026-02-11T04:20:35.858Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/c8/46dfeac5825e600579157eea177be43e2f7ff4a99da9d0d0a49533509ac5/pillow-12.1.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:597bd9c8419bc7c6af5604e55847789b69123bbe25d65cc6ad3012b4f3c98d8b", size = 7034590, upload-time = "2026-02-11T04:20:37.91Z" },
+    { url = "https://files.pythonhosted.org/packages/af/bf/e6f65d3db8a8bbfeaf9e13cc0417813f6319863a73de934f14b2229ada18/pillow-12.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2c1fc0f2ca5f96a3c8407e41cca26a16e46b21060fe6d5b099d2cb01412222f5", size = 6458655, upload-time = "2026-02-11T04:20:39.496Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/c2/66091f3f34a25894ca129362e510b956ef26f8fb67a0e6417bc5744e56f1/pillow-12.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:578510d88c6229d735855e1f278aa305270438d36a05031dfaae5067cc8eb04d", size = 7159286, upload-time = "2026-02-11T04:20:41.139Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/5a/24bc8eb526a22f957d0cec6243146744966d40857e3d8deb68f7902ca6c1/pillow-12.1.1-cp311-cp311-win32.whl", hash = "sha256:7311c0a0dcadb89b36b7025dfd8326ecfa36964e29913074d47382706e516a7c", size = 6328663, upload-time = "2026-02-11T04:20:43.184Z" },
+    { url = "https://files.pythonhosted.org/packages/31/03/bef822e4f2d8f9d7448c133d0a18185d3cce3e70472774fffefe8b0ed562/pillow-12.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:fbfa2a7c10cc2623f412753cddf391c7f971c52ca40a3f65dc5039b2939e8563", size = 7031448, upload-time = "2026-02-11T04:20:44.696Z" },
+    { url = "https://files.pythonhosted.org/packages/49/70/f76296f53610bd17b2e7d31728b8b7825e3ac3b5b3688b51f52eab7c0818/pillow-12.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:b81b5e3511211631b3f672a595e3221252c90af017e399056d0faabb9538aa80", size = 2453651, upload-time = "2026-02-11T04:20:46.243Z" },
+    { url = "https://files.pythonhosted.org/packages/07/d3/8df65da0d4df36b094351dce696f2989bec731d4f10e743b1c5f4da4d3bf/pillow-12.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab323b787d6e18b3d91a72fc99b1a2c28651e4358749842b8f8dfacd28ef2052", size = 5262803, upload-time = "2026-02-11T04:20:47.653Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/71/5026395b290ff404b836e636f51d7297e6c83beceaa87c592718747e670f/pillow-12.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:adebb5bee0f0af4909c30db0d890c773d1a92ffe83da908e2e9e720f8edf3984", size = 4657601, upload-time = "2026-02-11T04:20:49.328Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/2e/1001613d941c67442f745aff0f7cc66dd8df9a9c084eb497e6a543ee6f7e/pillow-12.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb66b7cc26f50977108790e2456b7921e773f23db5630261102233eb355a3b79", size = 6234995, upload-time = "2026-02-11T04:20:51.032Z" },
+    { url = "https://files.pythonhosted.org/packages/07/26/246ab11455b2549b9233dbd44d358d033a2f780fa9007b61a913c5b2d24e/pillow-12.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aee2810642b2898bb187ced9b349e95d2a7272930796e022efaf12e99dccd293", size = 8045012, upload-time = "2026-02-11T04:20:52.882Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/8b/07587069c27be7535ac1fe33874e32de118fbd34e2a73b7f83436a88368c/pillow-12.1.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0b1cd6232e2b618adcc54d9882e4e662a089d5768cd188f7c245b4c8c44a397", size = 6349638, upload-time = "2026-02-11T04:20:54.444Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/79/6df7b2ee763d619cda2fb4fea498e5f79d984dae304d45a8999b80d6cf5c/pillow-12.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7aac39bcf8d4770d089588a2e1dd111cbaa42df5a94be3114222057d68336bd0", size = 7041540, upload-time = "2026-02-11T04:20:55.97Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/5e/2ba19e7e7236d7529f4d873bdaf317a318896bac289abebd4bb00ef247f0/pillow-12.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ab174cd7d29a62dd139c44bf74b698039328f45cb03b4596c43473a46656b2f3", size = 6462613, upload-time = "2026-02-11T04:20:57.542Z" },
+    { url = "https://files.pythonhosted.org/packages/03/03/31216ec124bb5c3dacd74ce8efff4cc7f52643653bad4825f8f08c697743/pillow-12.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:339ffdcb7cbeaa08221cd401d517d4b1fe7a9ed5d400e4a8039719238620ca35", size = 7166745, upload-time = "2026-02-11T04:20:59.196Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/e7/7c4552d80052337eb28653b617eafdef39adfb137c49dd7e831b8dc13bc5/pillow-12.1.1-cp312-cp312-win32.whl", hash = "sha256:5d1f9575a12bed9e9eedd9a4972834b08c97a352bd17955ccdebfeca5913fa0a", size = 6328823, upload-time = "2026-02-11T04:21:01.385Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/17/688626d192d7261bbbf98846fc98995726bddc2c945344b65bec3a29d731/pillow-12.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:21329ec8c96c6e979cd0dfd29406c40c1d52521a90544463057d2aaa937d66a6", size = 7033367, upload-time = "2026-02-11T04:21:03.536Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/fe/a0ef1f73f939b0eca03ee2c108d0043a87468664770612602c63266a43c4/pillow-12.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:af9a332e572978f0218686636610555ae3defd1633597be015ed50289a03c523", size = 2453811, upload-time = "2026-02-11T04:21:05.116Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/11/6db24d4bd7685583caeae54b7009584e38da3c3d4488ed4cd25b439de486/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:d242e8ac078781f1de88bf823d70c1a9b3c7950a44cdf4b7c012e22ccbcd8e4e", size = 4062689, upload-time = "2026-02-11T04:21:06.804Z" },
+    { url = "https://files.pythonhosted.org/packages/33/c0/ce6d3b1fe190f0021203e0d9b5b99e57843e345f15f9ef22fcd43842fd21/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:02f84dfad02693676692746df05b89cf25597560db2857363a208e393429f5e9", size = 4138535, upload-time = "2026-02-11T04:21:08.452Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c6/d5eb6a4fb32a3f9c21a8c7613ec706534ea1cf9f4b3663e99f0d83f6fca8/pillow-12.1.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:e65498daf4b583091ccbb2556c7000abf0f3349fcd57ef7adc9a84a394ed29f6", size = 3601364, upload-time = "2026-02-11T04:21:10.194Z" },
+    { url = "https://files.pythonhosted.org/packages/14/a1/16c4b823838ba4c9c52c0e6bbda903a3fe5a1bdbf1b8eb4fff7156f3e318/pillow-12.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c6db3b84c87d48d0088943bf33440e0c42370b99b1c2a7989216f7b42eede60", size = 5262561, upload-time = "2026-02-11T04:21:11.742Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/ad/ad9dc98ff24f485008aa5cdedaf1a219876f6f6c42a4626c08bc4e80b120/pillow-12.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8b7e5304e34942bf62e15184219a7b5ad4ff7f3bb5cca4d984f37df1a0e1aee2", size = 4657460, upload-time = "2026-02-11T04:21:13.786Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/1b/f1a4ea9a895b5732152789326202a82464d5254759fbacae4deea3069334/pillow-12.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:18e5bddd742a44b7e6b1e773ab5db102bd7a94c32555ba656e76d319d19c3850", size = 6232698, upload-time = "2026-02-11T04:21:15.949Z" },
+    { url = "https://files.pythonhosted.org/packages/95/f4/86f51b8745070daf21fd2e5b1fe0eb35d4db9ca26e6d58366562fb56a743/pillow-12.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc44ef1f3de4f45b50ccf9136999d71abb99dca7706bc75d222ed350b9fd2289", size = 8041706, upload-time = "2026-02-11T04:21:17.723Z" },
+    { url = "https://files.pythonhosted.org/packages/29/9b/d6ecd956bb1266dd1045e995cce9b8d77759e740953a1c9aad9502a0461e/pillow-12.1.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a8eb7ed8d4198bccbd07058416eeec51686b498e784eda166395a23eb99138e", size = 6346621, upload-time = "2026-02-11T04:21:19.547Z" },
+    { url = "https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47b94983da0c642de92ced1702c5b6c292a84bd3a8e1d1702ff923f183594717", size = 7038069, upload-time = "2026-02-11T04:21:21.378Z" },
+    { url = "https://files.pythonhosted.org/packages/94/0e/58cb1a6bc48f746bc4cb3adb8cabff73e2742c92b3bf7a220b7cf69b9177/pillow-12.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:518a48c2aab7ce596d3bf79d0e275661b846e86e4d0e7dec34712c30fe07f02a", size = 6460040, upload-time = "2026-02-11T04:21:23.148Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/57/9045cb3ff11eeb6c1adce3b2d60d7d299d7b273a2e6c8381a524abfdc474/pillow-12.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a550ae29b95c6dc13cf69e2c9dc5747f814c54eeb2e32d683e5e93af56caa029", size = 7164523, upload-time = "2026-02-11T04:21:25.01Z" },
+    { url = "https://files.pythonhosted.org/packages/73/f2/9be9cb99f2175f0d4dbadd6616ce1bf068ee54a28277ea1bf1fbf729c250/pillow-12.1.1-cp313-cp313-win32.whl", hash = "sha256:a003d7422449f6d1e3a34e3dd4110c22148336918ddbfc6a32581cd54b2e0b2b", size = 6332552, upload-time = "2026-02-11T04:21:27.238Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/eb/b0834ad8b583d7d9d42b80becff092082a1c3c156bb582590fcc973f1c7c/pillow-12.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:344cf1e3dab3be4b1fa08e449323d98a2a3f819ad20f4b22e77a0ede31f0faa1", size = 7040108, upload-time = "2026-02-11T04:21:29.462Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/7d/fc09634e2aabdd0feabaff4a32f4a7d97789223e7c2042fd805ea4b4d2c2/pillow-12.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:5c0dd1636633e7e6a0afe7bf6a51a14992b7f8e60de5789018ebbdfae55b040a", size = 2453712, upload-time = "2026-02-11T04:21:31.072Z" },
+    { url = "https://files.pythonhosted.org/packages/19/2a/b9d62794fc8a0dd14c1943df68347badbd5511103e0d04c035ffe5cf2255/pillow-12.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0330d233c1a0ead844fc097a7d16c0abff4c12e856c0b325f231820fee1f39da", size = 5264880, upload-time = "2026-02-11T04:21:32.865Z" },
+    { url = "https://files.pythonhosted.org/packages/26/9d/e03d857d1347fa5ed9247e123fcd2a97b6220e15e9cb73ca0a8d91702c6e/pillow-12.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5dae5f21afb91322f2ff791895ddd8889e5e947ff59f71b46041c8ce6db790bc", size = 4660616, upload-time = "2026-02-11T04:21:34.97Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/ec/8a6d22afd02570d30954e043f09c32772bfe143ba9285e2fdb11284952cd/pillow-12.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e0c664be47252947d870ac0d327fea7e63985a08794758aa8af5b6cb6ec0c9c", size = 6269008, upload-time = "2026-02-11T04:21:36.623Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/1d/6d875422c9f28a4a361f495a5f68d9de4a66941dc2c619103ca335fa6446/pillow-12.1.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:691ab2ac363b8217f7d31b3497108fb1f50faab2f75dfb03284ec2f217e87bf8", size = 8073226, upload-time = "2026-02-11T04:21:38.585Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/cd/134b0b6ee5eda6dc09e25e24b40fdafe11a520bc725c1d0bbaa5e00bf95b/pillow-12.1.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9e8064fb1cc019296958595f6db671fba95209e3ceb0c4734c9baf97de04b20", size = 6380136, upload-time = "2026-02-11T04:21:40.562Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/a9/7628f013f18f001c1b98d8fffe3452f306a70dc6aba7d931019e0492f45e/pillow-12.1.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:472a8d7ded663e6162dafdf20015c486a7009483ca671cece7a9279b512fcb13", size = 7067129, upload-time = "2026-02-11T04:21:42.521Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/f8/66ab30a2193b277785601e82ee2d49f68ea575d9637e5e234faaa98efa4c/pillow-12.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:89b54027a766529136a06cfebeecb3a04900397a3590fd252160b888479517bf", size = 6491807, upload-time = "2026-02-11T04:21:44.22Z" },
+    { url = "https://files.pythonhosted.org/packages/da/0b/a877a6627dc8318fdb84e357c5e1a758c0941ab1ddffdafd231983788579/pillow-12.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:86172b0831b82ce4f7877f280055892b31179e1576aa00d0df3bb1bbf8c3e524", size = 7190954, upload-time = "2026-02-11T04:21:46.114Z" },
+    { url = "https://files.pythonhosted.org/packages/83/43/6f732ff85743cf746b1361b91665d9f5155e1483817f693f8d57ea93147f/pillow-12.1.1-cp313-cp313t-win32.whl", hash = "sha256:44ce27545b6efcf0fdbdceb31c9a5bdea9333e664cda58a7e674bb74608b3986", size = 6336441, upload-time = "2026-02-11T04:21:48.22Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/44/e865ef3986611bb75bfabdf94a590016ea327833f434558801122979cd0e/pillow-12.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a285e3eb7a5a45a2ff504e31f4a8d1b12ef62e84e5411c6804a42197c1cf586c", size = 7045383, upload-time = "2026-02-11T04:21:50.015Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/c6/f4fb24268d0c6908b9f04143697ea18b0379490cb74ba9e8d41b898bd005/pillow-12.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cc7d296b5ea4d29e6570dabeaed58d31c3fea35a633a69679fb03d7664f43fb3", size = 2456104, upload-time = "2026-02-11T04:21:51.633Z" },
+    { url = "https://files.pythonhosted.org/packages/03/d0/bebb3ffbf31c5a8e97241476c4cf8b9828954693ce6744b4a2326af3e16b/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:417423db963cb4be8bac3fc1204fe61610f6abeed1580a7a2cbb2fbda20f12af", size = 4062652, upload-time = "2026-02-11T04:21:53.19Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/c0/0e16fb0addda4851445c28f8350d8c512f09de27bbb0d6d0bbf8b6709605/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:b957b71c6b2387610f556a7eb0828afbe40b4a98036fc0d2acfa5a44a0c2036f", size = 4138823, upload-time = "2026-02-11T04:22:03.088Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/fb/6170ec655d6f6bb6630a013dd7cf7bc218423d7b5fa9071bf63dc32175ae/pillow-12.1.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:097690ba1f2efdeb165a20469d59d8bb03c55fb6621eb2041a060ae8ea3e9642", size = 3601143, upload-time = "2026-02-11T04:22:04.909Z" },
+    { url = "https://files.pythonhosted.org/packages/59/04/dc5c3f297510ba9a6837cbb318b87dd2b8f73eb41a43cc63767f65cb599c/pillow-12.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2815a87ab27848db0321fb78c7f0b2c8649dee134b7f2b80c6a45c6831d75ccd", size = 5266254, upload-time = "2026-02-11T04:22:07.656Z" },
+    { url = "https://files.pythonhosted.org/packages/05/30/5db1236b0d6313f03ebf97f5e17cda9ca060f524b2fcc875149a8360b21c/pillow-12.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f7ed2c6543bad5a7d5530eb9e78c53132f93dfa44a28492db88b41cdab885202", size = 4657499, upload-time = "2026-02-11T04:22:09.613Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/18/008d2ca0eb612e81968e8be0bbae5051efba24d52debf930126d7eaacbba/pillow-12.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:652a2c9ccfb556235b2b501a3a7cf3742148cd22e04b5625c5fe057ea3e3191f", size = 6232137, upload-time = "2026-02-11T04:22:11.434Z" },
+    { url = "https://files.pythonhosted.org/packages/70/f1/f14d5b8eeb4b2cd62b9f9f847eb6605f103df89ef619ac68f92f748614ea/pillow-12.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d6e4571eedf43af33d0fc233a382a76e849badbccdf1ac438841308652a08e1f", size = 8042721, upload-time = "2026-02-11T04:22:13.321Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/d6/17824509146e4babbdabf04d8171491fa9d776f7061ff6e727522df9bd03/pillow-12.1.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b574c51cf7d5d62e9be37ba446224b59a2da26dc4c1bb2ecbe936a4fb1a7cb7f", size = 6347798, upload-time = "2026-02-11T04:22:15.449Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/ee/c85a38a9ab92037a75615aba572c85ea51e605265036e00c5b67dfafbfe2/pillow-12.1.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a37691702ed687799de29a518d63d4682d9016932db66d4e90c345831b02fb4e", size = 7039315, upload-time = "2026-02-11T04:22:17.24Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/f3/bc8ccc6e08a148290d7523bde4d9a0d6c981db34631390dc6e6ec34cacf6/pillow-12.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f95c00d5d6700b2b890479664a06e754974848afaae5e21beb4d83c106923fd0", size = 6462360, upload-time = "2026-02-11T04:22:19.111Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/ab/69a42656adb1d0665ab051eec58a41f169ad295cf81ad45406963105408f/pillow-12.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:559b38da23606e68681337ad74622c4dbba02254fc9cb4488a305dd5975c7eeb", size = 7165438, upload-time = "2026-02-11T04:22:21.041Z" },
+    { url = "https://files.pythonhosted.org/packages/02/46/81f7aa8941873f0f01d4b55cc543b0a3d03ec2ee30d617a0448bf6bd6dec/pillow-12.1.1-cp314-cp314-win32.whl", hash = "sha256:03edcc34d688572014ff223c125a3f77fb08091e4607e7745002fc214070b35f", size = 6431503, upload-time = "2026-02-11T04:22:22.833Z" },
+    { url = "https://files.pythonhosted.org/packages/40/72/4c245f7d1044b67affc7f134a09ea619d4895333d35322b775b928180044/pillow-12.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:50480dcd74fa63b8e78235957d302d98d98d82ccbfac4c7e12108ba9ecbdba15", size = 7176748, upload-time = "2026-02-11T04:22:24.64Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/ad/8a87bdbe038c5c698736e3348af5c2194ffb872ea52f11894c95f9305435/pillow-12.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:5cb1785d97b0c3d1d1a16bc1d710c4a0049daefc4935f3a8f31f827f4d3d2e7f", size = 2544314, upload-time = "2026-02-11T04:22:26.685Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/9d/efd18493f9de13b87ede7c47e69184b9e859e4427225ea962e32e56a49bc/pillow-12.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1f90cff8aa76835cba5769f0b3121a22bd4eb9e6884cfe338216e557a9a548b8", size = 5268612, upload-time = "2026-02-11T04:22:29.884Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/f1/4f42eb2b388eb2ffc660dcb7f7b556c1015c53ebd5f7f754965ef997585b/pillow-12.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1f1be78ce9466a7ee64bfda57bdba0f7cc499d9794d518b854816c41bf0aa4e9", size = 4660567, upload-time = "2026-02-11T04:22:31.799Z" },
+    { url = "https://files.pythonhosted.org/packages/01/54/df6ef130fa43e4b82e32624a7b821a2be1c5653a5fdad8469687a7db4e00/pillow-12.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:42fc1f4677106188ad9a55562bbade416f8b55456f522430fadab3cef7cd4e60", size = 6269951, upload-time = "2026-02-11T04:22:33.921Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/48/618752d06cc44bb4aae8ce0cd4e6426871929ed7b46215638088270d9b34/pillow-12.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98edb152429ab62a1818039744d8fbb3ccab98a7c29fc3d5fcef158f3f1f68b7", size = 8074769, upload-time = "2026-02-11T04:22:35.877Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/bd/f1d71eb39a72fa088d938655afba3e00b38018d052752f435838961127d8/pillow-12.1.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d470ab1178551dd17fdba0fef463359c41aaa613cdcd7ff8373f54be629f9f8f", size = 6381358, upload-time = "2026-02-11T04:22:37.698Z" },
+    { url = "https://files.pythonhosted.org/packages/64/ef/c784e20b96674ed36a5af839305f55616f8b4f8aa8eeccf8531a6e312243/pillow-12.1.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6408a7b064595afcab0a49393a413732a35788f2a5092fdc6266952ed67de586", size = 7068558, upload-time = "2026-02-11T04:22:39.597Z" },
+    { url = "https://files.pythonhosted.org/packages/73/cb/8059688b74422ae61278202c4e1ad992e8a2e7375227be0a21c6b87ca8d5/pillow-12.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5d8c41325b382c07799a3682c1c258469ea2ff97103c53717b7893862d0c98ce", size = 6493028, upload-time = "2026-02-11T04:22:42.73Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/da/e3c008ed7d2dd1f905b15949325934510b9d1931e5df999bb15972756818/pillow-12.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c7697918b5be27424e9ce568193efd13d925c4481dd364e43f5dff72d33e10f8", size = 7191940, upload-time = "2026-02-11T04:22:44.543Z" },
+    { url = "https://files.pythonhosted.org/packages/01/4a/9202e8d11714c1fc5951f2e1ef362f2d7fbc595e1f6717971d5dd750e969/pillow-12.1.1-cp314-cp314t-win32.whl", hash = "sha256:d2912fd8114fc5545aa3a4b5576512f64c55a03f3ebcca4c10194d593d43ea36", size = 6438736, upload-time = "2026-02-11T04:22:46.347Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/ca/cbce2327eb9885476b3957b2e82eb12c866a8b16ad77392864ad601022ce/pillow-12.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:4ceb838d4bd9dab43e06c363cab2eebf63846d6a4aeaea283bbdfd8f1a8ed58b", size = 7182894, upload-time = "2026-02-11T04:22:48.114Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/d2/de599c95ba0a973b94410477f8bf0b6f0b5e67360eb89bcb1ad365258beb/pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334", size = 2546446, upload-time = "2026-02-11T04:22:50.342Z" },
+    { url = "https://files.pythonhosted.org/packages/56/11/5d43209aa4cb58e0cc80127956ff1796a68b928e6324bbf06ef4db34367b/pillow-12.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:600fd103672b925fe62ed08e0d874ea34d692474df6f4bf7ebe148b30f89f39f", size = 5228606, upload-time = "2026-02-11T04:22:52.106Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/d5/3b005b4e4fda6698b371fa6c21b097d4707585d7db99e98d9b0b87ac612a/pillow-12.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:665e1b916b043cef294bc54d47bf02d87e13f769bc4bc5fa225a24b3a6c5aca9", size = 4622321, upload-time = "2026-02-11T04:22:53.827Z" },
+    { url = "https://files.pythonhosted.org/packages/df/36/ed3ea2d594356fd8037e5a01f6156c74bc8d92dbb0fa60746cc96cabb6e8/pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:495c302af3aad1ca67420ddd5c7bd480c8867ad173528767d906428057a11f0e", size = 5247579, upload-time = "2026-02-11T04:22:56.094Z" },
+    { url = "https://files.pythonhosted.org/packages/54/9a/9cc3e029683cf6d20ae5085da0dafc63148e3252c2f13328e553aaa13cfb/pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8fd420ef0c52c88b5a035a0886f367748c72147b2b8f384c9d12656678dfdfa9", size = 6989094, upload-time = "2026-02-11T04:22:58.288Z" },
+    { url = "https://files.pythonhosted.org/packages/00/98/fc53ab36da80b88df0967896b6c4b4cd948a0dc5aa40a754266aa3ae48b3/pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f975aa7ef9684ce7e2c18a3aa8f8e2106ce1e46b94ab713d156b2898811651d3", size = 5313850, upload-time = "2026-02-11T04:23:00.554Z" },
+    { url = "https://files.pythonhosted.org/packages/30/02/00fa585abfd9fe9d73e5f6e554dc36cc2b842898cbfc46d70353dae227f8/pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8089c852a56c2966cf18835db62d9b34fef7ba74c726ad943928d494fa7f4735", size = 5963343, upload-time = "2026-02-11T04:23:02.934Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/26/c56ce33ca856e358d27fda9676c055395abddb82c35ac0f593877ed4562e/pillow-12.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:cb9bb857b2d057c6dfc72ac5f3b44836924ba15721882ef103cecb40d002d80e", size = 7029880, upload-time = "2026-02-11T04:23:04.783Z" },
 ]
 
 [[package]]
 name = "platformdirs"
-version = "4.5.0"
+version = "4.9.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/04/fea538adf7dbbd6d186f551d595961e564a3b6715bdf276b477460858672/platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291", size = 28394, upload-time = "2026-02-16T03:56:10.574Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" },
+    { url = "https://files.pythonhosted.org/packages/48/31/05e764397056194206169869b50cf2fee4dbbbc71b344705b9c0d878d4d8/platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd", size = 21168, upload-time = "2026-02-16T03:56:08.891Z" },
 ]
 
 [[package]]
@@ -3715,11 +4610,11 @@ wheels = [
 
 [[package]]
 name = "prometheus-client"
-version = "0.23.1"
+version = "0.24.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/23/53/3edb5d68ecf6b38fcbcc1ad28391117d2a322d9a1a3eff04bfdb184d8c3b/prometheus_client-0.23.1.tar.gz", hash = "sha256:6ae8f9081eaaaf153a2e959d2e6c4f4fb57b12ef76c8c7980202f1e57b48b2ce", size = 80481, upload-time = "2025-09-18T20:47:25.043Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f0/58/a794d23feb6b00fc0c72787d7e87d872a6730dd9ed7c7b3e954637d8f280/prometheus_client-0.24.1.tar.gz", hash = "sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9", size = 85616, upload-time = "2026-01-14T15:26:26.965Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b8/db/14bafcb4af2139e046d03fd00dea7873e48eafe18b7d2797e73d6681f210/prometheus_client-0.23.1-py3-none-any.whl", hash = "sha256:dd1913e6e76b59cfe44e7a4b83e01afc9873c1bdfd2ed8739f1e76aeca115f99", size = 61145, upload-time = "2025-09-18T20:47:23.875Z" },
+    { url = "https://files.pythonhosted.org/packages/74/c3/24a2f845e3917201628ecaba4f18bab4d18a337834c1df2a159ee9d22a42/prometheus_client-0.24.1-py3-none-any.whl", hash = "sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055", size = 64057, upload-time = "2026-01-14T15:26:24.42Z" },
 ]
 
 [[package]]
@@ -3861,45 +4756,59 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
 ]
 
+[[package]]
+name = "proto-plus"
+version = "1.27.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/02/8832cde80e7380c600fbf55090b6ab7b62bd6825dbedde6d6657c15a1f8e/proto_plus-1.27.1.tar.gz", hash = "sha256:912a7460446625b792f6448bade9e55cd4e41e6ac10e27009ef71a7f317fa147", size = 56929, upload-time = "2026-02-02T17:34:49.035Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5d/79/ac273cbbf744691821a9cca88957257f41afe271637794975ca090b9588b/proto_plus-1.27.1-py3-none-any.whl", hash = "sha256:e4643061f3a4d0de092d62aa4ad09fa4756b2cbb89d4627f3985018216f9fefc", size = 50480, upload-time = "2026-02-02T17:34:47.339Z" },
+]
+
 [[package]]
 name = "protobuf"
-version = "6.33.1"
+version = "6.33.5"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/25/7c72c307aafc96fa87062aa6291d9f7c94836e43214d43722e86037aac02/protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c", size = 444465, upload-time = "2026-01-29T21:51:33.494Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/93/26213ff72b103ae55bb0d73e7fb91ea570ef407c3ab4fd2f1f27cac16044/protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:fe34575f2bdde76ac429ec7b570235bf0c788883e70aee90068e9981806f2490", size = 427522, upload-time = "2025-11-13T16:44:10.475Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/32/df4a35247923393aa6b887c3b3244a8c941c32a25681775f96e2b418f90e/protobuf-6.33.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:f8adba2e44cde2d7618996b3fc02341f03f5bc3f2748be72dc7b063319276178", size = 324445, upload-time = "2025-11-13T16:44:11.869Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/d0/d796e419e2ec93d2f3fa44888861c3f88f722cde02b7c3488fcc6a166820/protobuf-6.33.1-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:0f4cf01222c0d959c2b399142deb526de420be8236f22c71356e2a544e153c53", size = 339161, upload-time = "2025-11-13T16:44:12.778Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/2a/3c5f05a4af06649547027d288747f68525755de692a26a7720dced3652c0/protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:8fd7d5e0eb08cd5b87fd3df49bc193f5cfd778701f47e11d127d0afc6c39f1d1", size = 323171, upload-time = "2025-11-13T16:44:14.035Z" },
-    { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/79/af92d0a8369732b027e6d6084251dd8e782c685c72da161bd4a2e00fbabb/protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b", size = 425769, upload-time = "2026-01-29T21:51:21.751Z" },
+    { url = "https://files.pythonhosted.org/packages/55/75/bb9bc917d10e9ee13dee8607eb9ab963b7cf8be607c46e7862c748aa2af7/protobuf-6.33.5-cp310-abi3-win_amd64.whl", hash = "sha256:3093804752167bcab3998bec9f1048baae6e29505adaf1afd14a37bddede533c", size = 437118, upload-time = "2026-01-29T21:51:24.022Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/6b/e48dfc1191bc5b52950246275bf4089773e91cb5ba3592621723cdddca62/protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5", size = 427766, upload-time = "2026-01-29T21:51:25.413Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/b1/c79468184310de09d75095ed1314b839eb2f72df71097db9d1404a1b2717/protobuf-6.33.5-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:9b71e0281f36f179d00cbcb119cb19dec4d14a81393e5ea220f64b286173e190", size = 324638, upload-time = "2026-01-29T21:51:26.423Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/f5/65d838092fd01c44d16037953fd4c2cc851e783de9b8f02b27ec4ffd906f/protobuf-6.33.5-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8afa18e1d6d20af15b417e728e9f60f3aa108ee76f23c3b2c07a2c3b546d3afd", size = 339411, upload-time = "2026-01-29T21:51:27.446Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/53/a9443aa3ca9ba8724fdfa02dd1887c1bcd8e89556b715cfbacca6b63dbec/protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0", size = 323465, upload-time = "2026-01-29T21:51:28.925Z" },
+    { url = "https://files.pythonhosted.org/packages/57/bf/2086963c69bdac3d7cff1cc7ff79b8ce5ea0bec6797a017e1be338a46248/protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02", size = 170687, upload-time = "2026-01-29T21:51:32.557Z" },
 ]
 
 [[package]]
 name = "psutil"
-version = "7.1.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/bd/93/0c49e776b8734fef56ec9c5c57f923922f2cf0497d62e0f419465f28f3d0/psutil-7.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0005da714eee687b4b8decd3d6cc7c6db36215c9e74e5ad2264b90c3df7d92dc", size = 239751, upload-time = "2025-11-02T12:25:58.161Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/8d/b31e39c769e70780f007969815195a55c81a63efebdd4dbe9e7a113adb2f/psutil-7.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:19644c85dcb987e35eeeaefdc3915d059dac7bd1167cdcdbf27e0ce2df0c08c0", size = 240368, upload-time = "2025-11-02T12:26:00.491Z" },
-    { url = "https://files.pythonhosted.org/packages/62/61/23fd4acc3c9eebbf6b6c78bcd89e5d020cfde4acf0a9233e9d4e3fa698b4/psutil-7.1.3-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95ef04cf2e5ba0ab9eaafc4a11eaae91b44f4ef5541acd2ee91d9108d00d59a7", size = 287134, upload-time = "2025-11-02T12:26:02.613Z" },
-    { url = "https://files.pythonhosted.org/packages/30/1c/f921a009ea9ceb51aa355cb0cc118f68d354db36eae18174bab63affb3e6/psutil-7.1.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1068c303be3a72f8e18e412c5b2a8f6d31750fb152f9cb106b54090296c9d251", size = 289904, upload-time = "2025-11-02T12:26:05.207Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/82/62d68066e13e46a5116df187d319d1724b3f437ddd0f958756fc052677f4/psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa", size = 249642, upload-time = "2025-11-02T12:26:07.447Z" },
-    { url = "https://files.pythonhosted.org/packages/df/ad/c1cd5fe965c14a0392112f68362cfceb5230819dbb5b1888950d18a11d9f/psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee", size = 245518, upload-time = "2025-11-02T12:26:09.719Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/bb/6670bded3e3236eb4287c7bcdc167e9fae6e1e9286e437f7111caed2f909/psutil-7.1.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b403da1df4d6d43973dc004d19cee3b848e998ae3154cc8097d139b77156c353", size = 239843, upload-time = "2025-11-02T12:26:11.968Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/66/853d50e75a38c9a7370ddbeefabdd3d3116b9c31ef94dc92c6729bc36bec/psutil-7.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad81425efc5e75da3f39b3e636293360ad8d0b49bed7df824c79764fb4ba9b8b", size = 240369, upload-time = "2025-11-02T12:26:14.358Z" },
-    { url = "https://files.pythonhosted.org/packages/41/bd/313aba97cb5bfb26916dc29cf0646cbe4dd6a89ca69e8c6edce654876d39/psutil-7.1.3-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f33a3702e167783a9213db10ad29650ebf383946e91bc77f28a5eb083496bc9", size = 288210, upload-time = "2025-11-02T12:26:16.699Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/fa/76e3c06e760927a0cfb5705eb38164254de34e9bd86db656d4dbaa228b04/psutil-7.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fac9cd332c67f4422504297889da5ab7e05fd11e3c4392140f7370f4208ded1f", size = 291182, upload-time = "2025-11-02T12:26:18.848Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/1d/5774a91607035ee5078b8fd747686ebec28a962f178712de100d00b78a32/psutil-7.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:3792983e23b69843aea49c8f5b8f115572c5ab64c153bada5270086a2123c7e7", size = 250466, upload-time = "2025-11-02T12:26:21.183Z" },
-    { url = "https://files.pythonhosted.org/packages/00/ca/e426584bacb43a5cb1ac91fae1937f478cd8fbe5e4ff96574e698a2c77cd/psutil-7.1.3-cp314-cp314t-win_arm64.whl", hash = "sha256:31d77fcedb7529f27bb3a0472bea9334349f9a04160e8e6e5020f22c59893264", size = 245756, upload-time = "2025-11-02T12:26:23.148Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/94/46b9154a800253e7ecff5aaacdf8ebf43db99de4a2dfa18575b02548654e/psutil-7.1.3-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bdbcd0e58ca14996a42adf3621a6244f1bb2e2e528886959c72cf1e326677ab", size = 238359, upload-time = "2025-11-02T12:26:25.284Z" },
-    { url = "https://files.pythonhosted.org/packages/68/3a/9f93cff5c025029a36d9a92fef47220ab4692ee7f2be0fba9f92813d0cb8/psutil-7.1.3-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:bc31fa00f1fbc3c3802141eede66f3a2d51d89716a194bf2cd6fc68310a19880", size = 239171, upload-time = "2025-11-02T12:26:27.23Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/b1/5f49af514f76431ba4eea935b8ad3725cdeb397e9245ab919dbc1d1dc20f/psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb428f9f05c1225a558f53e30ccbad9930b11c3fc206836242de1091d3e7dd3", size = 263261, upload-time = "2025-11-02T12:26:29.48Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/95/992c8816a74016eb095e73585d747e0a8ea21a061ed3689474fabb29a395/psutil-7.1.3-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d974e02ca2c8eb4812c3f76c30e28836fffc311d55d979f1465c1feeb2b68b", size = 264635, upload-time = "2025-11-02T12:26:31.74Z" },
-    { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" },
+version = "7.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/51/08/510cbdb69c25a96f4ae523f733cdc963ae654904e8db864c07585ef99875/psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b", size = 130595, upload-time = "2026-01-28T18:14:57.293Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/f5/97baea3fe7a5a9af7436301f85490905379b1c6f2dd51fe3ecf24b4c5fbf/psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea", size = 131082, upload-time = "2026-01-28T18:14:59.732Z" },
+    { url = "https://files.pythonhosted.org/packages/37/d6/246513fbf9fa174af531f28412297dd05241d97a75911ac8febefa1a53c6/psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63", size = 181476, upload-time = "2026-01-28T18:15:01.884Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/b5/9182c9af3836cca61696dabe4fd1304e17bc56cb62f17439e1154f225dd3/psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312", size = 184062, upload-time = "2026-01-28T18:15:04.436Z" },
+    { url = "https://files.pythonhosted.org/packages/16/ba/0756dca669f5a9300d0cbcbfae9a4c30e446dfc7440ffe43ded5724bfd93/psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b", size = 139893, upload-time = "2026-01-28T18:15:06.378Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/61/8fa0e26f33623b49949346de05ec1ddaad02ed8ba64af45f40a147dbfa97/psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9", size = 135589, upload-time = "2026-01-28T18:15:08.03Z" },
+    { url = "https://files.pythonhosted.org/packages/81/69/ef179ab5ca24f32acc1dac0c247fd6a13b501fd5534dbae0e05a1c48b66d/psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00", size = 130664, upload-time = "2026-01-28T18:15:09.469Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/64/665248b557a236d3fa9efc378d60d95ef56dd0a490c2cd37dafc7660d4a9/psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9", size = 131087, upload-time = "2026-01-28T18:15:11.724Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/2e/e6782744700d6759ebce3043dcfa661fb61e2fb752b91cdeae9af12c2178/psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a", size = 182383, upload-time = "2026-01-28T18:15:13.445Z" },
+    { url = "https://files.pythonhosted.org/packages/57/49/0a41cefd10cb7505cdc04dab3eacf24c0c2cb158a998b8c7b1d27ee2c1f5/psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf", size = 185210, upload-time = "2026-01-28T18:15:16.002Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/2c/ff9bfb544f283ba5f83ba725a3c5fec6d6b10b8f27ac1dc641c473dc390d/psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1", size = 141228, upload-time = "2026-01-28T18:15:18.385Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/fc/f8d9c31db14fcec13748d373e668bc3bed94d9077dbc17fb0eebc073233c/psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841", size = 136284, upload-time = "2026-01-28T18:15:19.912Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" },
+    { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" },
+    { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" },
+    { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" },
 ]
 
 [[package]]
@@ -3911,70 +4820,106 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/6c/64cafaceea3f99927e84b38a362ec6a8f24f33061c90bda77dfe1cd4c3c6/pulp-3.3.0-py3-none-any.whl", hash = "sha256:dd6ad2d63f196d1254eddf9dcff5cd224912c1f046120cb7c143c5b0eda63fae", size = 16387700, upload-time = "2025-09-18T08:14:53.368Z" },
 ]
 
+[[package]]
+name = "py-spy"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/e2/ff811a367028b87e86714945bb9ecb5c1cc69114a8039a67b3a862cef921/py_spy-0.4.1.tar.gz", hash = "sha256:e53aa53daa2e47c2eef97dd2455b47bb3a7e7f962796a86cc3e7dbde8e6f4db4", size = 244726, upload-time = "2025-07-31T19:33:25.172Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/e3/3a32500d845bdd94f6a2b4ed6244982f42ec2bc64602ea8fcfe900678ae7/py_spy-0.4.1-py2.py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:809094208c6256c8f4ccadd31e9a513fe2429253f48e20066879239ba12cd8cc", size = 3682508, upload-time = "2025-07-31T19:33:13.753Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/bf/e4d280e9e0bec71d39fc646654097027d4bbe8e04af18fb68e49afcff404/py_spy-0.4.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:1fb8bf71ab8df95a95cc387deed6552934c50feef2cf6456bc06692a5508fd0c", size = 1796395, upload-time = "2025-07-31T19:33:15.325Z" },
+    { url = "https://files.pythonhosted.org/packages/df/79/9ed50bb0a9de63ed023aa2db8b6265b04a7760d98c61eb54def6a5fddb68/py_spy-0.4.1-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee776b9d512a011d1ad3907ed53ae32ce2f3d9ff3e1782236554e22103b5c084", size = 2034938, upload-time = "2025-07-31T19:33:17.194Z" },
+    { url = "https://files.pythonhosted.org/packages/53/a5/36862e3eea59f729dfb70ee6f9e14b051d8ddce1aa7e70e0b81d9fe18536/py_spy-0.4.1-py2.py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:532d3525538254d1859b49de1fbe9744df6b8865657c9f0e444bf36ce3f19226", size = 2658968, upload-time = "2025-07-31T19:33:18.916Z" },
+    { url = "https://files.pythonhosted.org/packages/08/f8/9ea0b586b065a623f591e5e7961282ec944b5fbbdca33186c7c0296645b3/py_spy-0.4.1-py2.py3-none-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4972c21890b6814017e39ac233c22572c4a61fd874524ebc5ccab0f2237aee0a", size = 2147541, upload-time = "2025-07-31T19:33:20.565Z" },
+    { url = "https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29", size = 2763338, upload-time = "2025-07-31T19:33:22.202Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/da/fcc9a9fcd4ca946ff402cff20348e838b051d69f50f5d1f5dca4cd3c5eb8/py_spy-0.4.1-py2.py3-none-win_amd64.whl", hash = "sha256:d92e522bd40e9bf7d87c204033ce5bb5c828fca45fa28d970f58d71128069fdc", size = 1818784, upload-time = "2025-07-31T19:33:23.802Z" },
+]
+
 [[package]]
 name = "pyarrow"
-version = "22.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d9/9b/cb3f7e0a345353def531ca879053e9ef6b9f38ed91aebcf68b09ba54dec0/pyarrow-22.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:77718810bd3066158db1e95a63c160ad7ce08c6b0710bc656055033e39cdad88", size = 34223968, upload-time = "2025-10-24T10:03:31.21Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/41/3184b8192a120306270c5307f105b70320fdaa592c99843c5ef78aaefdcf/pyarrow-22.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:44d2d26cda26d18f7af7db71453b7b783788322d756e81730acb98f24eb90ace", size = 35942085, upload-time = "2025-10-24T10:03:38.146Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/3d/a1eab2f6f08001f9fb714b8ed5cfb045e2fe3e3e3c0c221f2c9ed1e6d67d/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b9d71701ce97c95480fecb0039ec5bb889e75f110da72005743451339262f4ce", size = 44964613, upload-time = "2025-10-24T10:03:46.516Z" },
-    { url = "https://files.pythonhosted.org/packages/46/46/a1d9c24baf21cfd9ce994ac820a24608decf2710521b29223d4334985127/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:710624ab925dc2b05a6229d47f6f0dac1c1155e6ed559be7109f684eba048a48", size = 47627059, upload-time = "2025-10-24T10:03:55.353Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/4c/f711acb13075c1391fd54bc17e078587672c575f8de2a6e62509af026dcf/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f963ba8c3b0199f9d6b794c90ec77545e05eadc83973897a4523c9e8d84e9340", size = 47947043, upload-time = "2025-10-24T10:04:05.408Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/70/1f3180dd7c2eab35c2aca2b29ace6c519f827dcd4cfeb8e0dca41612cf7a/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bd0d42297ace400d8febe55f13fdf46e86754842b860c978dfec16f081e5c653", size = 50206505, upload-time = "2025-10-24T10:04:15.786Z" },
-    { url = "https://files.pythonhosted.org/packages/80/07/fea6578112c8c60ffde55883a571e4c4c6bc7049f119d6b09333b5cc6f73/pyarrow-22.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:00626d9dc0f5ef3a75fe63fd68b9c7c8302d2b5bbc7f74ecaedba83447a24f84", size = 28101641, upload-time = "2025-10-24T10:04:22.57Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" },
-    { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" },
-    { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" },
-    { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" },
-    { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" },
-    { url = "https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, upload-time = "2025-10-24T10:05:47.314Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" },
-    { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = "2025-10-24T10:06:27.301Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" },
-    { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" },
-    { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" },
-    { url = "https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" },
-    { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" },
-    { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" },
-    { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" },
-    { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" },
-    { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" },
-    { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" },
-    { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" },
+version = "23.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/a8/24e5dc6855f50a62936ceb004e6e9645e4219a8065f304145d7fb8a79d5d/pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56", size = 34307390, upload-time = "2026-02-16T10:08:08.654Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/8e/4be5617b4aaae0287f621ad31c6036e5f63118cfca0dc57d42121ff49b51/pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c", size = 35853761, upload-time = "2026-02-16T10:08:17.811Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/08/3e56a18819462210432ae37d10f5c8eed3828be1d6c751b6e6a2e93c286a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d0744403adabef53c985a7f8a082b502a368510c40d184df349a0a8754533258", size = 44493116, upload-time = "2026-02-16T10:08:25.792Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/82/c40b68001dbec8a3faa4c08cd8c200798ac732d2854537c5449dc859f55a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c33b5bf406284fd0bba436ed6f6c3ebe8e311722b441d89397c54f871c6863a2", size = 47564532, upload-time = "2026-02-16T10:08:34.27Z" },
+    { url = "https://files.pythonhosted.org/packages/20/bc/73f611989116b6f53347581b02177f9f620efdf3cd3f405d0e83cdf53a83/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ddf743e82f69dcd6dbbcb63628895d7161e04e56794ef80550ac6f3315eeb1d5", size = 48183685, upload-time = "2026-02-16T10:08:42.889Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/cc/6c6b3ecdae2a8c3aced99956187e8302fc954cc2cca2a37cf2111dad16ce/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e052a211c5ac9848ae15d5ec875ed0943c0221e2fcfe69eee80b604b4e703222", size = 50605582, upload-time = "2026-02-16T10:08:51.641Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/94/d359e708672878d7638a04a0448edf7c707f9e5606cee11e15aaa5c7535a/pyarrow-23.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:5abde149bb3ce524782d838eb67ac095cd3fd6090eba051130589793f1a7f76d", size = 27521148, upload-time = "2026-02-16T10:08:58.077Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" },
+    { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" },
+    { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" },
+    { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" },
+    { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" },
+    { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" },
+    { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" },
+    { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" },
+    { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" },
+    { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" },
+    { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" },
+    { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" },
+    { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" },
+    { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" },
+    { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" },
+]
+
+[[package]]
+name = "pyasn1"
+version = "0.6.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fe/b6/6e630dff89739fcd427e3f72b3d905ce0acb85a45d4ec3e2678718a3487f/pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b", size = 146586, upload-time = "2026-01-16T18:04:18.534Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/b5/a96872e5184f354da9c84ae119971a0a4c221fe9b27a4d94bd43f2596727/pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf", size = 83371, upload-time = "2026-01-16T18:04:17.174Z" },
+]
+
+[[package]]
+name = "pyasn1-modules"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pyasn1" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
 ]
 
 [[package]]
 name = "pybind11"
-version = "3.0.1"
+version = "3.0.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/2f/7b/a6d8dcb83c457e24a9df1e4d8fd5fb8034d4bbc62f3c324681e8a9ba57c2/pybind11-3.0.1.tar.gz", hash = "sha256:9c0f40056a016da59bab516efb523089139fcc6f2ba7e4930854c61efb932051", size = 546914, upload-time = "2025-08-22T20:09:27.265Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a5/98/9118a0659646f1628c592ef9bb48e0056efa6bf27c951fd12a178e0136fb/pybind11-3.0.2.tar.gz", hash = "sha256:432f01aeb68e361a3a7fc7575c2c7f497595bf640f747acd909ff238dd766e06", size = 577131, upload-time = "2026-02-17T04:46:52.556Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cd/8a/37362fc2b949d5f733a8b0f2ff51ba423914cabefe69f1d1b6aab710f5fe/pybind11-3.0.1-py3-none-any.whl", hash = "sha256:aa8f0aa6e0a94d3b64adfc38f560f33f15e589be2175e103c0a33c6bce55ee89", size = 293611, upload-time = "2025-08-22T20:09:25.235Z" },
+    { url = "https://files.pythonhosted.org/packages/88/c5/e98d9c51f3d5300d5e40ad9037dd6b3b60736fd02ab68dcc98c96be7592d/pybind11-3.0.2-py3-none-any.whl", hash = "sha256:f8a6500548919cc33bcd220d5f984688326f574fa97f1107f2f4fdb4c6fb019f", size = 310158, upload-time = "2026-02-17T04:46:49.91Z" },
 ]
 
 [[package]]
@@ -3988,11 +4933,11 @@ wheels = [
 
 [[package]]
 name = "pycparser"
-version = "2.23"
+version = "3.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/fe/cf/d2d3b9f5699fb1e4615c8e32ff220203e43b248e1dfcc6736ad9057731ca/pycparser-2.23.tar.gz", hash = "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2", size = 173734, upload-time = "2025-09-09T13:23:47.91Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a0/e3/59cd50310fc9b59512193629e1984c1f95e5c8ae6e5d8c69532ccc65a7fe/pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934", size = 118140, upload-time = "2025-09-09T13:23:46.651Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
 ]
 
 [[package]]
@@ -4136,10 +5081,12 @@ dependencies = [
     { name = "accessible-pygments" },
     { name = "babel" },
     { name = "beautifulsoup4" },
-    { name = "docutils" },
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pygments" },
     { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/00/20/bb50f9de3a6de69e6abd6b087b52fa2418a0418b19597601605f855ad044/pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7", size = 2412693, upload-time = "2024-12-17T10:53:39.537Z" }
@@ -4183,11 +5130,11 @@ wheels = [
 
 [[package]]
 name = "pyjwt"
-version = "2.10.1"
+version = "2.11.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785, upload-time = "2024-11-28T03:43:29.933Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/5c/5a/b46fa56bf322901eee5b0454a34343cdbdae202cd421775a8ee4e42fd519/pyjwt-2.11.0.tar.gz", hash = "sha256:35f95c1f0fbe5d5ba6e43f00271c275f7a1a4db1dab27bf708073b75318ea623", size = 98019, upload-time = "2026-01-30T19:59:55.694Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997, upload-time = "2024-11-28T03:43:27.893Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/01/c26ce75ba460d5cd503da9e13b21a33804d38c2165dec7b716d06b13010c/pyjwt-2.11.0-py3-none-any.whl", hash = "sha256:94a6bde30eb5c8e04fee991062b534071fd1439ef58d2adc9ccb823e7bcd0469", size = 28224, upload-time = "2026-01-30T19:59:54.539Z" },
 ]
 
 [package.optional-dependencies]
@@ -4202,7 +5149,8 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "astroid" },
     { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "dill" },
+    { name = "dill", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "dill", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "isort" },
     { name = "mccabe" },
     { name = "platformdirs" },
@@ -4216,39 +5164,37 @@ wheels = [
 
 [[package]]
 name = "pynacl"
-version = "1.6.1"
+version = "1.6.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi", marker = "platform_python_implementation != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" },
-    { url = "https://files.pythonhosted.org/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" },
-    { url = "https://files.pythonhosted.org/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" },
-    { url = "https://files.pythonhosted.org/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" },
-    { url = "https://files.pythonhosted.org/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" },
-    { url = "https://files.pythonhosted.org/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" },
-    { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" },
-    { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" },
-    { url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" },
-    { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" },
-    { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/d9/9a/4019b524b03a13438637b11538c82781a5eda427394380381af8f04f467a/pynacl-1.6.2.tar.gz", hash = "sha256:018494d6d696ae03c7e656e5e74cdfd8ea1326962cc401bcf018f1ed8436811c", size = 3511692, upload-time = "2026-01-01T17:48:10.851Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4b/79/0e3c34dc3c4671f67d251c07aa8eb100916f250ee470df230b0ab89551b4/pynacl-1.6.2-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:622d7b07cc5c02c666795792931b50c91f3ce3c2649762efb1ef0d5684c81594", size = 390064, upload-time = "2026-01-01T17:31:57.264Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/1c/23a26e931736e13b16483795c8a6b2f641bf6a3d5238c22b070a5112722c/pynacl-1.6.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d071c6a9a4c94d79eb665db4ce5cedc537faf74f2355e4d502591d850d3913c0", size = 809370, upload-time = "2026-01-01T17:31:59.198Z" },
+    { url = "https://files.pythonhosted.org/packages/87/74/8d4b718f8a22aea9e8dcc8b95deb76d4aae380e2f5b570cc70b5fd0a852d/pynacl-1.6.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe9847ca47d287af41e82be1dd5e23023d3c31a951da134121ab02e42ac218c9", size = 1408304, upload-time = "2026-01-01T17:32:01.162Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/73/be4fdd3a6a87fe8a4553380c2b47fbd1f7f58292eb820902f5c8ac7de7b0/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:04316d1fc625d860b6c162fff704eb8426b1a8bcd3abacea11142cbd99a6b574", size = 844871, upload-time = "2026-01-01T17:32:02.824Z" },
+    { url = "https://files.pythonhosted.org/packages/55/ad/6efc57ab75ee4422e96b5f2697d51bbcf6cdcc091e66310df91fbdc144a8/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44081faff368d6c5553ccf55322ef2819abb40e25afaec7e740f159f74813634", size = 1446356, upload-time = "2026-01-01T17:32:04.452Z" },
+    { url = "https://files.pythonhosted.org/packages/78/b7/928ee9c4779caa0a915844311ab9fb5f99585621c5d6e4574538a17dca07/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:a9f9932d8d2811ce1a8ffa79dcbdf3970e7355b5c8eb0c1a881a57e7f7d96e88", size = 826814, upload-time = "2026-01-01T17:32:06.078Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/a9/1bdba746a2be20f8809fee75c10e3159d75864ef69c6b0dd168fc60e485d/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:bc4a36b28dd72fb4845e5d8f9760610588a96d5a51f01d84d8c6ff9849968c14", size = 1411742, upload-time = "2026-01-01T17:32:07.651Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/2f/5e7ea8d85f9f3ea5b6b87db1d8388daa3587eed181bdeb0306816fdbbe79/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bffb6d0f6becacb6526f8f42adfb5efb26337056ee0831fb9a7044d1a964444", size = 801714, upload-time = "2026-01-01T17:32:09.558Z" },
+    { url = "https://files.pythonhosted.org/packages/06/ea/43fe2f7eab5f200e40fb10d305bf6f87ea31b3bbc83443eac37cd34a9e1e/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2fef529ef3ee487ad8113d287a593fa26f48ee3620d92ecc6f1d09ea38e0709b", size = 1372257, upload-time = "2026-01-01T17:32:11.026Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/54/c9ea116412788629b1347e415f72195c25eb2f3809b2d3e7b25f5c79f13a/pynacl-1.6.2-cp314-cp314t-win32.whl", hash = "sha256:a84bf1c20339d06dc0c85d9aea9637a24f718f375d861b2668b2f9f96fa51145", size = 231319, upload-time = "2026-01-01T17:32:12.46Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/04/64e9d76646abac2dccf904fccba352a86e7d172647557f35b9fe2a5ee4a1/pynacl-1.6.2-cp314-cp314t-win_amd64.whl", hash = "sha256:320ef68a41c87547c91a8b58903c9caa641ab01e8512ce291085b5fe2fcb7590", size = 244044, upload-time = "2026-01-01T17:32:13.781Z" },
+    { url = "https://files.pythonhosted.org/packages/33/33/7873dc161c6a06f43cda13dec67b6fe152cb2f982581151956fa5e5cdb47/pynacl-1.6.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d29bfe37e20e015a7d8b23cfc8bd6aa7909c92a1b8f41ee416bbb3e79ef182b2", size = 188740, upload-time = "2026-01-01T17:32:15.083Z" },
+    { url = "https://files.pythonhosted.org/packages/be/7b/4845bbf88e94586ec47a432da4e9107e3fc3ce37eb412b1398630a37f7dd/pynacl-1.6.2-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:c949ea47e4206af7c8f604b8278093b674f7c79ed0d4719cc836902bf4517465", size = 388458, upload-time = "2026-01-01T17:32:16.829Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/b4/e927e0653ba63b02a4ca5b4d852a8d1d678afbf69b3dbf9c4d0785ac905c/pynacl-1.6.2-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8845c0631c0be43abdd865511c41eab235e0be69c81dc66a50911594198679b0", size = 800020, upload-time = "2026-01-01T17:32:18.34Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/81/d60984052df5c97b1d24365bc1e30024379b42c4edcd79d2436b1b9806f2/pynacl-1.6.2-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:22de65bb9010a725b0dac248f353bb072969c94fa8d6b1f34b87d7953cf7bbe4", size = 1399174, upload-time = "2026-01-01T17:32:20.239Z" },
+    { url = "https://files.pythonhosted.org/packages/68/f7/322f2f9915c4ef27d140101dd0ed26b479f7e6f5f183590fd32dfc48c4d3/pynacl-1.6.2-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46065496ab748469cdd999246d17e301b2c24ae2fdf739132e580a0e94c94a87", size = 835085, upload-time = "2026-01-01T17:32:22.24Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/d0/f301f83ac8dbe53442c5a43f6a39016f94f754d7a9815a875b65e218a307/pynacl-1.6.2-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a66d6fb6ae7661c58995f9c6435bda2b1e68b54b598a6a10247bfcdadac996c", size = 1437614, upload-time = "2026-01-01T17:32:23.766Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/58/fc6e649762b029315325ace1a8c6be66125e42f67416d3dbd47b69563d61/pynacl-1.6.2-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:26bfcd00dcf2cf160f122186af731ae30ab120c18e8375684ec2670dccd28130", size = 818251, upload-time = "2026-01-01T17:32:25.69Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/a8/b917096b1accc9acd878819a49d3d84875731a41eb665f6ebc826b1af99e/pynacl-1.6.2-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:c8a231e36ec2cab018c4ad4358c386e36eede0319a0c41fed24f840b1dac59f6", size = 1402859, upload-time = "2026-01-01T17:32:27.215Z" },
+    { url = "https://files.pythonhosted.org/packages/85/42/fe60b5f4473e12c72f977548e4028156f4d340b884c635ec6b063fe7e9a5/pynacl-1.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:68be3a09455743ff9505491220b64440ced8973fe930f270c8e07ccfa25b1f9e", size = 791926, upload-time = "2026-01-01T17:32:29.314Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/f9/e40e318c604259301cc091a2a63f237d9e7b424c4851cafaea4ea7c4834e/pynacl-1.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8b097553b380236d51ed11356c953bf8ce36a29a3e596e934ecabe76c985a577", size = 1363101, upload-time = "2026-01-01T17:32:31.263Z" },
+    { url = "https://files.pythonhosted.org/packages/48/47/e761c254f410c023a469284a9bc210933e18588ca87706ae93002c05114c/pynacl-1.6.2-cp38-abi3-win32.whl", hash = "sha256:5811c72b473b2f38f7e2a3dc4f8642e3a3e9b5e7317266e4ced1fba85cae41aa", size = 227421, upload-time = "2026-01-01T17:32:33.076Z" },
+    { url = "https://files.pythonhosted.org/packages/41/ad/334600e8cacc7d86587fe5f565480fde569dfb487389c8e1be56ac21d8ac/pynacl-1.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:62985f233210dee6548c223301b6c25440852e13d59a8b81490203c3227c5ba0", size = 239754, upload-time = "2026-01-01T17:32:34.557Z" },
+    { url = "https://files.pythonhosted.org/packages/29/7d/5945b5af29534641820d3bd7b00962abbbdfee84ec7e19f0d5b3175f9a31/pynacl-1.6.2-cp38-abi3-win_arm64.whl", hash = "sha256:834a43af110f743a754448463e8fd61259cd4ab5bbedcf70f9dabad1d28a394c", size = 184801, upload-time = "2026-01-01T17:32:36.309Z" },
 ]
 
 [[package]]
@@ -4347,24 +5293,24 @@ wheels = [
 
 [[package]]
 name = "python-gitlab"
-version = "7.0.0"
+version = "8.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "requests" },
     { name = "requests-toolbelt" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5e/c4/0b613303b4f0fcda69b3d2e03d0a1fb1b6b079a7c7832e03a8d92461e9fe/python_gitlab-7.0.0.tar.gz", hash = "sha256:e4d934430f64efc09e6208b782c61cc0a3389527765e03ffbef17f4323dce441", size = 400568, upload-time = "2025-10-29T15:06:02.069Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c4/68/02645bc9d71554e7a263b118e4e55dafe4c4735c1ba74f9712232ed84054/python_gitlab-8.0.0.tar.gz", hash = "sha256:03eae5a9d105448796e6c0e192d402c266057e75790cf4f42c143dddf91313ce", size = 401334, upload-time = "2026-01-28T01:22:27.005Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4f/9e/811edc46a15f8deb828cba7ef8aab3451dc11ca72d033f3df72a5af865d9/python_gitlab-7.0.0-py3-none-any.whl", hash = "sha256:712a6c8c5e79e7e66f6dabb25d8fe7831a6b238d4a5132f8231df6b3b890ceff", size = 144415, upload-time = "2025-10-29T15:06:00.232Z" },
+    { url = "https://files.pythonhosted.org/packages/52/60/ba68e51e90a99b14af639463e5d617239029ec25927a0990ff28bd851916/python_gitlab-8.0.0-py3-none-any.whl", hash = "sha256:c635e6722c5710d35ddadfcf95c362b0aa8de11ab3972bc4f230ebd58a6c49ee", size = 144483, upload-time = "2026-01-28T01:22:25.772Z" },
 ]
 
 [[package]]
 name = "python-multipart"
-version = "0.0.20"
+version = "0.0.22"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload-time = "2024-12-16T19:45:46.972Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" },
 ]
 
 [[package]]
@@ -4488,7 +5434,7 @@ wheels = [
 
 [[package]]
 name = "ray"
-version = "2.51.1"
+version = "2.54.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -4501,21 +5447,41 @@ dependencies = [
     { name = "requests" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/72/4b/8ded0ecb0ed08b75af47340fac4b14b15196a76a6d733f3945cc5cb77354/ray-2.51.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e8ce218c85e9f4043c37136fc90b41343bdb844fcdc9520f21c000d1d8d49f89", size = 68039113, upload-time = "2025-11-01T03:23:30.619Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/a7/aba274bd1e1014cb232ee04548cc3d7aab9b84eb13c44d71b72d189421f9/ray-2.51.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:36feb519f31c52d3b4dbcd68ffb2baf93195ceec06ea711e21559096bab95fed", size = 70340511, upload-time = "2025-11-01T03:23:38.217Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/42/a5712f4f8c911ea5b8b3cb406ceef18a1c1bc98490c66fa902cb72391af3/ray-2.51.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:8a21f5914baa3deefcb4fa5f3878e03b589c190b864fe1b80e6dc0cbfba26004", size = 71166513, upload-time = "2025-11-01T03:23:44.123Z" },
-    { url = "https://files.pythonhosted.org/packages/91/1e/eeae1da4ffac6eeeeafce2d11c0b6133fd4df1b3e53bc44d61c30c05b6d9/ray-2.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:a82417b89260ed751a76e9cfaef6d11392ab0da464cde1a9d07a0bb7dc272a7b", size = 26695587, upload-time = "2025-11-01T03:23:49.739Z" },
-    { url = "https://files.pythonhosted.org/packages/43/66/f1e11291d9fdf0634ea763cfb167cf449773d13918bb04390e6263b7129b/ray-2.51.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bd8211fc033be1bce9c039e474e97a9077be593020978fdcfba1d770bdc40ba5", size = 68043927, upload-time = "2025-11-01T03:23:59.655Z" },
-    { url = "https://files.pythonhosted.org/packages/be/89/9a11d0addbba6143f5a34929ed1fdef51159328b9b76a877c0c7f98b2848/ray-2.51.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d2d7c8af45441ff50bc002352d31e0afec5c85dd5075bf527027178931497bce", size = 70460551, upload-time = "2025-11-01T03:24:05.77Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/67/40a8d63e4cb3ff1a1a5a12db77ca655e21cb13f10e024a9513f24ed11d98/ray-2.51.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:dd353010d2548bc345e46c45795f70291bb460c236aa6a3393b51a9cd861b56f", size = 71280610, upload-time = "2025-11-01T03:24:11.981Z" },
-    { url = "https://files.pythonhosted.org/packages/62/97/90bcfed6b8c986f9ea24def19bbb81480575dd5fa87630eeaa4c92652507/ray-2.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:606c6e0733eb18fc307c9645ea84ccbd1aad8a5ba8bad764bed54b94e926d33c", size = 26691238, upload-time = "2025-11-01T03:24:16.978Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/95/51e44ce79e42f02ca1c4d4c5501e6dd49f3a384c5f6324aceb4e0015988a/ray-2.51.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ef847b025ca758baee4571a1ca001d973897cad772f8e95d7f303d24c38b649e", size = 68029226, upload-time = "2025-11-01T03:24:21.928Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/b5/a93e39e131067edb7cba3385a609f61aaaf7aa54728cd3a7474bfbf3b0fc/ray-2.51.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0bed9408712bad1511e65683a455302f88d94e5e5cb6a58cc4a154b61d8a0b4a", size = 70502423, upload-time = "2025-11-01T03:24:27.398Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/59/69b7a653ed8176fc7fd894d462ed34bb1477e7fa71700324de99179b5b7e/ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4e786da7862cf73664977d0212a505d6d5a585beadf63e7dc1e1c129259bee20", size = 71353730, upload-time = "2025-11-01T03:24:33.495Z" },
-    { url = "https://files.pythonhosted.org/packages/38/91/0c4fe7aed34baa14d9c050c88f39ff16083d555bd6dcd6c4ffb4332a6f8a/ray-2.51.1-cp312-cp312-win_amd64.whl", hash = "sha256:198fda93074a6863555f4003e9013bb2ba0cd50b59b18c02affdc294b28a2eef", size = 26674921, upload-time = "2025-11-01T03:24:38.394Z" },
-    { url = "https://files.pythonhosted.org/packages/65/1c/3ebf7277d8ae5f99150a5890bff4bdc627021e3a1be7caacd075d2996c7a/ray-2.51.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d81547886435142dbd79bff1d4e4edf578a5f20e3b11bbd4ced49cfafbd37d27", size = 67974221, upload-time = "2025-11-01T03:24:44.118Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/47/13ba6c4d0e97aff94dcf8537f2832d1101c2080a0aea5c973a4de1d4d8bd/ray-2.51.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:3f2bd2acf9b7f4738c17d08592caaad26eafb7a4fc380ad9ab42d5f0a78f73ad", size = 70410610, upload-time = "2025-11-01T03:24:50.075Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/87/3cdf6d0504659d8192baa6576dd7a17ea395a4d969010274f7cc0e894281/ray-2.51.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:265ecd6fd6d4a695b09c686e17d58fca0c09e7198c073628ae7bf4974b03e9ca", size = 71269225, upload-time = "2025-11-01T03:24:55.929Z" },
+    { url = "https://files.pythonhosted.org/packages/64/13/b86d791b41f33220335eba18fc4841f1ebddae41e562c6a216846404c88d/ray-2.54.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:a22937f09ee74a43171df338d84b45ef882c1c05748947ca9d5343a44d4b9379", size = 70097079, upload-time = "2026-02-18T04:04:35.409Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/bb/f54980d45ecfd0ceb39b6a966bd64fc0597746af1917d7fe3cbdb9f72752/ray-2.54.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:1e63e491155695d527513ffe9d33a6aeb3f3cdccb6309adadfd6f8dd7c0300f7", size = 71951024, upload-time = "2026-02-18T04:04:42.817Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/b1/8cc4e45a3ce87aabcb70696b448b20840bcbaa5c98bdb4807a2749541fda/ray-2.54.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:2d140409e4ca06d8d6a06f71d441b53f6edcd930ebe67a6988f652915db81070", size = 72783364, upload-time = "2026-02-18T04:04:48.311Z" },
+    { url = "https://files.pythonhosted.org/packages/12/79/7fb2f5698319cd28f0599fc9848a77dd7a64e0d82486c78dd94c6dce5095/ray-2.54.0-cp310-cp310-win_amd64.whl", hash = "sha256:86da6ff60b57394aa47158b2f3fc2616a87492e828983451f04e676b192b49ce", size = 27452281, upload-time = "2026-02-18T04:04:53.252Z" },
+    { url = "https://files.pythonhosted.org/packages/08/58/6209b2231947f3c8df09ce1436f1c76c4a11fcafd57c8def852dcbb6d8ef/ray-2.54.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8e39dd56b47a0a1820d5a5a54385bbe54d1d67e1093736d12d8ed4e99d0fa455", size = 70098998, upload-time = "2026-02-18T04:04:58.801Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/29/7871f4206e6b00a9bb784c16dad32ccd01e9df5a93545db92de220eb2871/ray-2.54.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:491ae56ab80d8822c4eaf4d5bb96dcf32a6231d8d7b76eb8034400eb9be1bb18", size = 72066630, upload-time = "2026-02-18T04:05:04.957Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/e8/d2c8ebd9cd945abc817b01ad02a29df78cdb86cd07d764587e16977389d0/ray-2.54.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:928bb09245a3c6f7c3c113ba8eafc69f948da9602d7f33e8251ecdf97c157615", size = 72895723, upload-time = "2026-02-18T04:05:10.686Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/96/a5ea3a149a943475cda1d68fdcdb14c86251826c652c232ae853600ad7e7/ray-2.54.0-cp311-cp311-win_amd64.whl", hash = "sha256:1e786330de55b3ba2228e36ec305381a9b86f0b01a8b6072c5811c3bc4dd9a3d", size = 27448371, upload-time = "2026-02-18T04:05:16.34Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/16/45eefb51eb1767342a6dbf41af0b432279e422e56160705fcd1098a7ec53/ray-2.54.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cf5c33b4b13850ec24a5bd5f9d9e0a8161f8e586bfd297e52913d170dec447fe", size = 70084880, upload-time = "2026-02-18T04:05:22.007Z" },
+    { url = "https://files.pythonhosted.org/packages/60/ad/e07aca3637e9c3ec4857ec4366208099cf8488ece8061a9925ba29b66382/ray-2.54.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:795ae21d6b764245d3f521bc5833446d58569e7dfde9c5777417eb285d87450f", size = 72107346, upload-time = "2026-02-18T04:05:27.999Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/b9/cc5ea8460c3dc602e6b7198277a7c59ba2b8929374ab22efa8df9f3deac8/ray-2.54.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:a972afd5aa3dda99d0b2f369b5f62e5dd95865ab7d37bf2e0a0e0d2cfbd9b325", size = 72967230, upload-time = "2026-02-18T04:05:33.771Z" },
+    { url = "https://files.pythonhosted.org/packages/de/d7/744de3b1bb881701330ddcbb2f6efaccd65915d564ece899a3838f9fb105/ray-2.54.0-cp312-cp312-win_amd64.whl", hash = "sha256:2ee074ede491d0aacfa339c003f5d7a15826e1e2a72ce873234ccbc0446e19b3", size = 27427353, upload-time = "2026-02-18T04:05:38.853Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/f2/5c0161d10445e703b7d01413ab54ec1cc5e27032555279d296df89b9c4ee/ray-2.54.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5ad77961fea16c697a0fb0e51216dd39c0bec28868cde54ac668edd58d12b8ae", size = 70030991, upload-time = "2026-02-18T04:05:43.966Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/8c/4a4a38eaec6e9614076a96967f58540f4f8d4aa0c793f43150c5df23cb9a/ray-2.54.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:8952c23a8aa94f10728c2d16e0dc3732d09aa0e6254801757ff494984a214f45", size = 72013826, upload-time = "2026-02-18T04:05:49.866Z" },
+    { url = "https://files.pythonhosted.org/packages/42/ac/e7ec2a406bd755f61c7090460fa5ab3f09b00c3c2d8db6d0b559f78a30eb/ray-2.54.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:ab89e6089abb6e46fb98fdd96d399b31a852d79127cd8ac00746c61d93defa2c", size = 72880209, upload-time = "2026-02-18T04:05:55.498Z" },
+]
+
+[package.optional-dependencies]
+default = [
+    { name = "aiohttp" },
+    { name = "aiohttp-cors" },
+    { name = "colorful" },
+    { name = "grpcio" },
+    { name = "opencensus" },
+    { name = "opentelemetry-exporter-prometheus", version = "0.54b1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "opentelemetry-exporter-prometheus", version = "0.60b1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "opentelemetry-sdk", version = "1.39.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
+    { name = "prometheus-client" },
+    { name = "py-spy" },
+    { name = "pydantic" },
+    { name = "requests" },
+    { name = "smart-open" },
+    { name = "virtualenv" },
 ]
 
 [[package]]
@@ -4525,7 +5491,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "attrs" },
     { name = "rpds-py" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" }
 wheels = [
@@ -4534,109 +5500,123 @@ wheels = [
 
 [[package]]
 name = "regex"
-version = "2025.11.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cc/a9/546676f25e573a4cf00fe8e119b78a37b6a8fe2dc95cda877b30889c9c45/regex-2025.11.3.tar.gz", hash = "sha256:1fedc720f9bb2494ce31a58a1631f9c82df6a09b49c19517ea5cc280b4541e01", size = 414669, upload-time = "2025-11-03T21:34:22.089Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8a/d6/d788d52da01280a30a3f6268aef2aa71043bff359c618fea4c5b536654d5/regex-2025.11.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2b441a4ae2c8049106e8b39973bfbddfb25a179dda2bdb99b0eeb60c40a6a3af", size = 488087, upload-time = "2025-11-03T21:30:47.317Z" },
-    { url = "https://files.pythonhosted.org/packages/69/39/abec3bd688ec9bbea3562de0fd764ff802976185f5ff22807bf0a2697992/regex-2025.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fa2eed3f76677777345d2f81ee89f5de2f5745910e805f7af7386a920fa7313", size = 290544, upload-time = "2025-11-03T21:30:49.912Z" },
-    { url = "https://files.pythonhosted.org/packages/39/b3/9a231475d5653e60002508f41205c61684bb2ffbf2401351ae2186897fc4/regex-2025.11.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8b4a27eebd684319bdf473d39f1d79eed36bf2cd34bd4465cdb4618d82b3d56", size = 288408, upload-time = "2025-11-03T21:30:51.344Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/c5/1929a0491bd5ac2d1539a866768b88965fa8c405f3e16a8cef84313098d6/regex-2025.11.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cf77eac15bd264986c4a2c63353212c095b40f3affb2bc6b4ef80c4776c1a28", size = 781584, upload-time = "2025-11-03T21:30:52.596Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/fd/16aa16cf5d497ef727ec966f74164fbe75d6516d3d58ac9aa989bc9cdaad/regex-2025.11.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b7f9ee819f94c6abfa56ec7b1dbab586f41ebbdc0a57e6524bd5e7f487a878c7", size = 850733, upload-time = "2025-11-03T21:30:53.825Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/49/3294b988855a221cb6565189edf5dc43239957427df2d81d4a6b15244f64/regex-2025.11.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:838441333bc90b829406d4a03cb4b8bf7656231b84358628b0406d803931ef32", size = 898691, upload-time = "2025-11-03T21:30:55.575Z" },
-    { url = "https://files.pythonhosted.org/packages/14/62/b56d29e70b03666193369bdbdedfdc23946dbe9f81dd78ce262c74d988ab/regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cfe6d3f0c9e3b7e8c0c694b24d25e677776f5ca26dce46fd6b0489f9c8339391", size = 791662, upload-time = "2025-11-03T21:30:57.262Z" },
-    { url = "https://files.pythonhosted.org/packages/15/fc/e4c31d061eced63fbf1ce9d853975f912c61a7d406ea14eda2dd355f48e7/regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2ab815eb8a96379a27c3b6157fcb127c8f59c36f043c1678110cea492868f1d5", size = 782587, upload-time = "2025-11-03T21:30:58.788Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/bb/5e30c7394bcf63f0537121c23e796be67b55a8847c3956ae6068f4c70702/regex-2025.11.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:728a9d2d173a65b62bdc380b7932dd8e74ed4295279a8fe1021204ce210803e7", size = 774709, upload-time = "2025-11-03T21:31:00.081Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/c4/fce773710af81b0cb37cb4ff0947e75d5d17dee304b93d940b87a67fc2f4/regex-2025.11.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:509dc827f89c15c66a0c216331260d777dd6c81e9a4e4f830e662b0bb296c313", size = 845773, upload-time = "2025-11-03T21:31:01.583Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/5e/9466a7ec4b8ec282077095c6eb50a12a389d2e036581134d4919e8ca518c/regex-2025.11.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:849202cd789e5f3cf5dcc7822c34b502181b4824a65ff20ce82da5524e45e8e9", size = 836164, upload-time = "2025-11-03T21:31:03.244Z" },
-    { url = "https://files.pythonhosted.org/packages/95/18/82980a60e8ed1594eb3c89eb814fb276ef51b9af7caeab1340bfd8564af6/regex-2025.11.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b6f78f98741dcc89607c16b1e9426ee46ce4bf31ac5e6b0d40e81c89f3481ea5", size = 779832, upload-time = "2025-11-03T21:31:04.876Z" },
-    { url = "https://files.pythonhosted.org/packages/03/cc/90ab0fdbe6dce064a42015433f9152710139fb04a8b81b4fb57a1cb63ffa/regex-2025.11.3-cp310-cp310-win32.whl", hash = "sha256:149eb0bba95231fb4f6d37c8f760ec9fa6fabf65bab555e128dde5f2475193ec", size = 265802, upload-time = "2025-11-03T21:31:06.581Z" },
-    { url = "https://files.pythonhosted.org/packages/34/9d/e9e8493a85f3b1ddc4a5014465f5c2b78c3ea1cbf238dcfde78956378041/regex-2025.11.3-cp310-cp310-win_amd64.whl", hash = "sha256:ee3a83ce492074c35a74cc76cf8235d49e77b757193a5365ff86e3f2f93db9fd", size = 277722, upload-time = "2025-11-03T21:31:08.144Z" },
-    { url = "https://files.pythonhosted.org/packages/15/c4/b54b24f553966564506dbf873a3e080aef47b356a3b39b5d5aba992b50db/regex-2025.11.3-cp310-cp310-win_arm64.whl", hash = "sha256:38af559ad934a7b35147716655d4a2f79fcef2d695ddfe06a06ba40ae631fa7e", size = 270289, upload-time = "2025-11-03T21:31:10.267Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/90/4fb5056e5f03a7048abd2b11f598d464f0c167de4f2a51aa868c376b8c70/regex-2025.11.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eadade04221641516fa25139273505a1c19f9bf97589a05bc4cfcd8b4a618031", size = 488081, upload-time = "2025-11-03T21:31:11.946Z" },
-    { url = "https://files.pythonhosted.org/packages/85/23/63e481293fac8b069d84fba0299b6666df720d875110efd0338406b5d360/regex-2025.11.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:feff9e54ec0dd3833d659257f5c3f5322a12eee58ffa360984b716f8b92983f4", size = 290554, upload-time = "2025-11-03T21:31:13.387Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/9d/b101d0262ea293a0066b4522dfb722eb6a8785a8c3e084396a5f2c431a46/regex-2025.11.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3b30bc921d50365775c09a7ed446359e5c0179e9e2512beec4a60cbcef6ddd50", size = 288407, upload-time = "2025-11-03T21:31:14.809Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/64/79241c8209d5b7e00577ec9dca35cd493cc6be35b7d147eda367d6179f6d/regex-2025.11.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f99be08cfead2020c7ca6e396c13543baea32343b7a9a5780c462e323bd8872f", size = 793418, upload-time = "2025-11-03T21:31:16.556Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/e2/23cd5d3573901ce8f9757c92ca4db4d09600b865919b6d3e7f69f03b1afd/regex-2025.11.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6dd329a1b61c0ee95ba95385fb0c07ea0d3fe1a21e1349fa2bec272636217118", size = 860448, upload-time = "2025-11-03T21:31:18.12Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/4c/aecf31beeaa416d0ae4ecb852148d38db35391aac19c687b5d56aedf3a8b/regex-2025.11.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c5238d32f3c5269d9e87be0cf096437b7622b6920f5eac4fd202468aaeb34d2", size = 907139, upload-time = "2025-11-03T21:31:20.753Z" },
-    { url = "https://files.pythonhosted.org/packages/61/22/b8cb00df7d2b5e0875f60628594d44dba283e951b1ae17c12f99e332cc0a/regex-2025.11.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10483eefbfb0adb18ee9474498c9a32fcf4e594fbca0543bb94c48bac6183e2e", size = 800439, upload-time = "2025-11-03T21:31:22.069Z" },
-    { url = "https://files.pythonhosted.org/packages/02/a8/c4b20330a5cdc7a8eb265f9ce593f389a6a88a0c5f280cf4d978f33966bc/regex-2025.11.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78c2d02bb6e1da0720eedc0bad578049cad3f71050ef8cd065ecc87691bed2b0", size = 782965, upload-time = "2025-11-03T21:31:23.598Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/4c/ae3e52988ae74af4b04d2af32fee4e8077f26e51b62ec2d12d246876bea2/regex-2025.11.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b49cd2aad93a1790ce9cffb18964f6d3a4b0b3dbdbd5de094b65296fce6e58", size = 854398, upload-time = "2025-11-03T21:31:25.008Z" },
-    { url = "https://files.pythonhosted.org/packages/06/d1/a8b9cf45874eda14b2e275157ce3b304c87e10fb38d9fc26a6e14eb18227/regex-2025.11.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:885b26aa3ee56433b630502dc3d36ba78d186a00cc535d3806e6bfd9ed3c70ab", size = 845897, upload-time = "2025-11-03T21:31:26.427Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/fe/1830eb0236be93d9b145e0bd8ab499f31602fe0999b1f19e99955aa8fe20/regex-2025.11.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ddd76a9f58e6a00f8772e72cff8ebcff78e022be95edf018766707c730593e1e", size = 788906, upload-time = "2025-11-03T21:31:28.078Z" },
-    { url = "https://files.pythonhosted.org/packages/66/47/dc2577c1f95f188c1e13e2e69d8825a5ac582ac709942f8a03af42ed6e93/regex-2025.11.3-cp311-cp311-win32.whl", hash = "sha256:3e816cc9aac1cd3cc9a4ec4d860f06d40f994b5c7b4d03b93345f44e08cc68bf", size = 265812, upload-time = "2025-11-03T21:31:29.72Z" },
-    { url = "https://files.pythonhosted.org/packages/50/1e/15f08b2f82a9bbb510621ec9042547b54d11e83cb620643ebb54e4eb7d71/regex-2025.11.3-cp311-cp311-win_amd64.whl", hash = "sha256:087511f5c8b7dfbe3a03f5d5ad0c2a33861b1fc387f21f6f60825a44865a385a", size = 277737, upload-time = "2025-11-03T21:31:31.422Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/fc/6500eb39f5f76c5e47a398df82e6b535a5e345f839581012a418b16f9cc3/regex-2025.11.3-cp311-cp311-win_arm64.whl", hash = "sha256:1ff0d190c7f68ae7769cd0313fe45820ba07ffebfddfaa89cc1eb70827ba0ddc", size = 270290, upload-time = "2025-11-03T21:31:33.041Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/74/18f04cb53e58e3fb107439699bd8375cf5a835eec81084e0bddbd122e4c2/regex-2025.11.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bc8ab71e2e31b16e40868a40a69007bc305e1109bd4658eb6cad007e0bf67c41", size = 489312, upload-time = "2025-11-03T21:31:34.343Z" },
-    { url = "https://files.pythonhosted.org/packages/78/3f/37fcdd0d2b1e78909108a876580485ea37c91e1acf66d3bb8e736348f441/regex-2025.11.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:22b29dda7e1f7062a52359fca6e58e548e28c6686f205e780b02ad8ef710de36", size = 291256, upload-time = "2025-11-03T21:31:35.675Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/26/0a575f58eb23b7ebd67a45fccbc02ac030b737b896b7e7a909ffe43ffd6a/regex-2025.11.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3a91e4a29938bc1a082cc28fdea44be420bf2bebe2665343029723892eb073e1", size = 288921, upload-time = "2025-11-03T21:31:37.07Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/98/6a8dff667d1af907150432cf5abc05a17ccd32c72a3615410d5365ac167a/regex-2025.11.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08b884f4226602ad40c5d55f52bf91a9df30f513864e0054bad40c0e9cf1afb7", size = 798568, upload-time = "2025-11-03T21:31:38.784Z" },
-    { url = "https://files.pythonhosted.org/packages/64/15/92c1db4fa4e12733dd5a526c2dd2b6edcbfe13257e135fc0f6c57f34c173/regex-2025.11.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3e0b11b2b2433d1c39c7c7a30e3f3d0aeeea44c2a8d0bae28f6b95f639927a69", size = 864165, upload-time = "2025-11-03T21:31:40.559Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/e7/3ad7da8cdee1ce66c7cd37ab5ab05c463a86ffeb52b1a25fe7bd9293b36c/regex-2025.11.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:87eb52a81ef58c7ba4d45c3ca74e12aa4b4e77816f72ca25258a85b3ea96cb48", size = 912182, upload-time = "2025-11-03T21:31:42.002Z" },
-    { url = "https://files.pythonhosted.org/packages/84/bd/9ce9f629fcb714ffc2c3faf62b6766ecb7a585e1e885eb699bcf130a5209/regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a12ab1f5c29b4e93db518f5e3872116b7e9b1646c9f9f426f777b50d44a09e8c", size = 803501, upload-time = "2025-11-03T21:31:43.815Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/0f/8dc2e4349d8e877283e6edd6c12bdcebc20f03744e86f197ab6e4492bf08/regex-2025.11.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7521684c8c7c4f6e88e35ec89680ee1aa8358d3f09d27dfbdf62c446f5d4c695", size = 787842, upload-time = "2025-11-03T21:31:45.353Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/73/cff02702960bc185164d5619c0c62a2f598a6abff6695d391b096237d4ab/regex-2025.11.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7fe6e5440584e94cc4b3f5f4d98a25e29ca12dccf8873679a635638349831b98", size = 858519, upload-time = "2025-11-03T21:31:46.814Z" },
-    { url = "https://files.pythonhosted.org/packages/61/83/0e8d1ae71e15bc1dc36231c90b46ee35f9d52fab2e226b0e039e7ea9c10a/regex-2025.11.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:8e026094aa12b43f4fd74576714e987803a315c76edb6b098b9809db5de58f74", size = 850611, upload-time = "2025-11-03T21:31:48.289Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/f5/70a5cdd781dcfaa12556f2955bf170cd603cb1c96a1827479f8faea2df97/regex-2025.11.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:435bbad13e57eb5606a68443af62bed3556de2f46deb9f7d4237bc2f1c9fb3a0", size = 789759, upload-time = "2025-11-03T21:31:49.759Z" },
-    { url = "https://files.pythonhosted.org/packages/59/9b/7c29be7903c318488983e7d97abcf8ebd3830e4c956c4c540005fcfb0462/regex-2025.11.3-cp312-cp312-win32.whl", hash = "sha256:3839967cf4dc4b985e1570fd8d91078f0c519f30491c60f9ac42a8db039be204", size = 266194, upload-time = "2025-11-03T21:31:51.53Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/67/3b92df89f179d7c367be654ab5626ae311cb28f7d5c237b6bb976cd5fbbb/regex-2025.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:e721d1b46e25c481dc5ded6f4b3f66c897c58d2e8cfdf77bbced84339108b0b9", size = 277069, upload-time = "2025-11-03T21:31:53.151Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/55/85ba4c066fe5094d35b249c3ce8df0ba623cfd35afb22d6764f23a52a1c5/regex-2025.11.3-cp312-cp312-win_arm64.whl", hash = "sha256:64350685ff08b1d3a6fff33f45a9ca183dc1d58bbfe4981604e70ec9801bbc26", size = 270330, upload-time = "2025-11-03T21:31:54.514Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/a7/dda24ebd49da46a197436ad96378f17df30ceb40e52e859fc42cac45b850/regex-2025.11.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c1e448051717a334891f2b9a620fe36776ebf3dd8ec46a0b877c8ae69575feb4", size = 489081, upload-time = "2025-11-03T21:31:55.9Z" },
-    { url = "https://files.pythonhosted.org/packages/19/22/af2dc751aacf88089836aa088a1a11c4f21a04707eb1b0478e8e8fb32847/regex-2025.11.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9b5aca4d5dfd7fbfbfbdaf44850fcc7709a01146a797536a8f84952e940cca76", size = 291123, upload-time = "2025-11-03T21:31:57.758Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/88/1a3ea5672f4b0a84802ee9891b86743438e7c04eb0b8f8c4e16a42375327/regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:04d2765516395cf7dda331a244a3282c0f5ae96075f728629287dfa6f76ba70a", size = 288814, upload-time = "2025-11-03T21:32:01.12Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/8c/f5987895bf42b8ddeea1b315c9fedcfe07cadee28b9c98cf50d00adcb14d/regex-2025.11.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d9903ca42bfeec4cebedba8022a7c97ad2aab22e09573ce9976ba01b65e4361", size = 798592, upload-time = "2025-11-03T21:32:03.006Z" },
-    { url = "https://files.pythonhosted.org/packages/99/2a/6591ebeede78203fa77ee46a1c36649e02df9eaa77a033d1ccdf2fcd5d4e/regex-2025.11.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:639431bdc89d6429f6721625e8129413980ccd62e9d3f496be618a41d205f160", size = 864122, upload-time = "2025-11-03T21:32:04.553Z" },
-    { url = "https://files.pythonhosted.org/packages/94/d6/be32a87cf28cf8ed064ff281cfbd49aefd90242a83e4b08b5a86b38e8eb4/regex-2025.11.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f117efad42068f9715677c8523ed2be1518116d1c49b1dd17987716695181efe", size = 912272, upload-time = "2025-11-03T21:32:06.148Z" },
-    { url = "https://files.pythonhosted.org/packages/62/11/9bcef2d1445665b180ac7f230406ad80671f0fc2a6ffb93493b5dd8cd64c/regex-2025.11.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4aecb6f461316adf9f1f0f6a4a1a3d79e045f9b71ec76055a791affa3b285850", size = 803497, upload-time = "2025-11-03T21:32:08.162Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/a7/da0dc273d57f560399aa16d8a68ae7f9b57679476fc7ace46501d455fe84/regex-2025.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3b3a5f320136873cc5561098dfab677eea139521cb9a9e8db98b7e64aef44cbc", size = 787892, upload-time = "2025-11-03T21:32:09.769Z" },
-    { url = "https://files.pythonhosted.org/packages/da/4b/732a0c5a9736a0b8d6d720d4945a2f1e6f38f87f48f3173559f53e8d5d82/regex-2025.11.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75fa6f0056e7efb1f42a1c34e58be24072cb9e61a601340cc1196ae92326a4f9", size = 858462, upload-time = "2025-11-03T21:32:11.769Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/f5/a2a03df27dc4c2d0c769220f5110ba8c4084b0bfa9ab0f9b4fcfa3d2b0fc/regex-2025.11.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:dbe6095001465294f13f1adcd3311e50dd84e5a71525f20a10bd16689c61ce0b", size = 850528, upload-time = "2025-11-03T21:32:13.906Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/09/e1cd5bee3841c7f6eb37d95ca91cdee7100b8f88b81e41c2ef426910891a/regex-2025.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:454d9b4ae7881afbc25015b8627c16d88a597479b9dea82b8c6e7e2e07240dc7", size = 789866, upload-time = "2025-11-03T21:32:15.748Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/51/702f5ea74e2a9c13d855a6a85b7f80c30f9e72a95493260193c07f3f8d74/regex-2025.11.3-cp313-cp313-win32.whl", hash = "sha256:28ba4d69171fc6e9896337d4fc63a43660002b7da53fc15ac992abcf3410917c", size = 266189, upload-time = "2025-11-03T21:32:17.493Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/00/6e29bb314e271a743170e53649db0fdb8e8ff0b64b4f425f5602f4eb9014/regex-2025.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:bac4200befe50c670c405dc33af26dad5a3b6b255dd6c000d92fe4629f9ed6a5", size = 277054, upload-time = "2025-11-03T21:32:19.042Z" },
-    { url = "https://files.pythonhosted.org/packages/25/f1/b156ff9f2ec9ac441710764dda95e4edaf5f36aca48246d1eea3f1fd96ec/regex-2025.11.3-cp313-cp313-win_arm64.whl", hash = "sha256:2292cd5a90dab247f9abe892ac584cb24f0f54680c73fcb4a7493c66c2bf2467", size = 270325, upload-time = "2025-11-03T21:32:21.338Z" },
-    { url = "https://files.pythonhosted.org/packages/20/28/fd0c63357caefe5680b8ea052131acbd7f456893b69cc2a90cc3e0dc90d4/regex-2025.11.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:1eb1ebf6822b756c723e09f5186473d93236c06c579d2cc0671a722d2ab14281", size = 491984, upload-time = "2025-11-03T21:32:23.466Z" },
-    { url = "https://files.pythonhosted.org/packages/df/ec/7014c15626ab46b902b3bcc4b28a7bae46d8f281fc7ea9c95e22fcaaa917/regex-2025.11.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1e00ec2970aab10dc5db34af535f21fcf32b4a31d99e34963419636e2f85ae39", size = 292673, upload-time = "2025-11-03T21:32:25.034Z" },
-    { url = "https://files.pythonhosted.org/packages/23/ab/3b952ff7239f20d05f1f99e9e20188513905f218c81d52fb5e78d2bf7634/regex-2025.11.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a4cb042b615245d5ff9b3794f56be4138b5adc35a4166014d31d1814744148c7", size = 291029, upload-time = "2025-11-03T21:32:26.528Z" },
-    { url = "https://files.pythonhosted.org/packages/21/7e/3dc2749fc684f455f162dcafb8a187b559e2614f3826877d3844a131f37b/regex-2025.11.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44f264d4bf02f3176467d90b294d59bf1db9fe53c141ff772f27a8b456b2a9ed", size = 807437, upload-time = "2025-11-03T21:32:28.363Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/0b/d529a85ab349c6a25d1ca783235b6e3eedf187247eab536797021f7126c6/regex-2025.11.3-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7be0277469bf3bd7a34a9c57c1b6a724532a0d235cd0dc4e7f4316f982c28b19", size = 873368, upload-time = "2025-11-03T21:32:30.4Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/18/2d868155f8c9e3e9d8f9e10c64e9a9f496bb8f7e037a88a8bed26b435af6/regex-2025.11.3-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0d31e08426ff4b5b650f68839f5af51a92a5b51abd8554a60c2fbc7c71f25d0b", size = 914921, upload-time = "2025-11-03T21:32:32.123Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/71/9d72ff0f354fa783fe2ba913c8734c3b433b86406117a8db4ea2bf1c7a2f/regex-2025.11.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e43586ce5bd28f9f285a6e729466841368c4a0353f6fd08d4ce4630843d3648a", size = 812708, upload-time = "2025-11-03T21:32:34.305Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/19/ce4bf7f5575c97f82b6e804ffb5c4e940c62609ab2a0d9538d47a7fdf7d4/regex-2025.11.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0f9397d561a4c16829d4e6ff75202c1c08b68a3bdbfe29dbfcdb31c9830907c6", size = 795472, upload-time = "2025-11-03T21:32:36.364Z" },
-    { url = "https://files.pythonhosted.org/packages/03/86/fd1063a176ffb7b2315f9a1b08d17b18118b28d9df163132615b835a26ee/regex-2025.11.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:dd16e78eb18ffdb25ee33a0682d17912e8cc8a770e885aeee95020046128f1ce", size = 868341, upload-time = "2025-11-03T21:32:38.042Z" },
-    { url = "https://files.pythonhosted.org/packages/12/43/103fb2e9811205e7386366501bc866a164a0430c79dd59eac886a2822950/regex-2025.11.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:ffcca5b9efe948ba0661e9df0fa50d2bc4b097c70b9810212d6b62f05d83b2dd", size = 854666, upload-time = "2025-11-03T21:32:40.079Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/22/e392e53f3869b75804762c7c848bd2dd2abf2b70fb0e526f58724638bd35/regex-2025.11.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c56b4d162ca2b43318ac671c65bd4d563e841a694ac70e1a976ac38fcf4ca1d2", size = 799473, upload-time = "2025-11-03T21:32:42.148Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/f9/8bd6b656592f925b6845fcbb4d57603a3ac2fb2373344ffa1ed70aa6820a/regex-2025.11.3-cp313-cp313t-win32.whl", hash = "sha256:9ddc42e68114e161e51e272f667d640f97e84a2b9ef14b7477c53aac20c2d59a", size = 268792, upload-time = "2025-11-03T21:32:44.13Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/87/0e7d603467775ff65cd2aeabf1b5b50cc1c3708556a8b849a2fa4dd1542b/regex-2025.11.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7a7c7fdf755032ffdd72c77e3d8096bdcb0eb92e89e17571a196f03d88b11b3c", size = 280214, upload-time = "2025-11-03T21:32:45.853Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/d0/2afc6f8e94e2b64bfb738a7c2b6387ac1699f09f032d363ed9447fd2bb57/regex-2025.11.3-cp313-cp313t-win_arm64.whl", hash = "sha256:df9eb838c44f570283712e7cff14c16329a9f0fb19ca492d21d4b7528ee6821e", size = 271469, upload-time = "2025-11-03T21:32:48.026Z" },
-    { url = "https://files.pythonhosted.org/packages/31/e9/f6e13de7e0983837f7b6d238ad9458800a874bf37c264f7923e63409944c/regex-2025.11.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9697a52e57576c83139d7c6f213d64485d3df5bf84807c35fa409e6c970801c6", size = 489089, upload-time = "2025-11-03T21:32:50.027Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/5c/261f4a262f1fa65141c1b74b255988bd2fa020cc599e53b080667d591cfc/regex-2025.11.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e18bc3f73bd41243c9b38a6d9f2366cd0e0137a9aebe2d8ff76c5b67d4c0a3f4", size = 291059, upload-time = "2025-11-03T21:32:51.682Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/57/f14eeb7f072b0e9a5a090d1712741fd8f214ec193dba773cf5410108bb7d/regex-2025.11.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:61a08bcb0ec14ff4e0ed2044aad948d0659604f824cbd50b55e30b0ec6f09c73", size = 288900, upload-time = "2025-11-03T21:32:53.569Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/6b/1d650c45e99a9b327586739d926a1cd4e94666b1bd4af90428b36af66dc7/regex-2025.11.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9c30003b9347c24bcc210958c5d167b9e4f9be786cb380a7d32f14f9b84674f", size = 799010, upload-time = "2025-11-03T21:32:55.222Z" },
-    { url = "https://files.pythonhosted.org/packages/99/ee/d66dcbc6b628ce4e3f7f0cbbb84603aa2fc0ffc878babc857726b8aab2e9/regex-2025.11.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4e1e592789704459900728d88d41a46fe3969b82ab62945560a31732ffc19a6d", size = 864893, upload-time = "2025-11-03T21:32:57.239Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/2d/f238229f1caba7ac87a6c4153d79947fb0261415827ae0f77c304260c7d3/regex-2025.11.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6538241f45eb5a25aa575dbba1069ad786f68a4f2773a29a2bd3dd1f9de787be", size = 911522, upload-time = "2025-11-03T21:32:59.274Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/3d/22a4eaba214a917c80e04f6025d26143690f0419511e0116508e24b11c9b/regex-2025.11.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce22519c989bb72a7e6b36a199384c53db7722fe669ba891da75907fe3587db", size = 803272, upload-time = "2025-11-03T21:33:01.393Z" },
-    { url = "https://files.pythonhosted.org/packages/84/b1/03188f634a409353a84b5ef49754b97dbcc0c0f6fd6c8ede505a8960a0a4/regex-2025.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:66d559b21d3640203ab9075797a55165d79017520685fb407b9234d72ab63c62", size = 787958, upload-time = "2025-11-03T21:33:03.379Z" },
-    { url = "https://files.pythonhosted.org/packages/99/6a/27d072f7fbf6fadd59c64d210305e1ff865cc3b78b526fd147db768c553b/regex-2025.11.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:669dcfb2e38f9e8c69507bace46f4889e3abbfd9b0c29719202883c0a603598f", size = 859289, upload-time = "2025-11-03T21:33:05.374Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/70/1b3878f648e0b6abe023172dacb02157e685564853cc363d9961bcccde4e/regex-2025.11.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:32f74f35ff0f25a5021373ac61442edcb150731fbaa28286bbc8bb1582c89d02", size = 850026, upload-time = "2025-11-03T21:33:07.131Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/d5/68e25559b526b8baab8e66839304ede68ff6727237a47727d240006bd0ff/regex-2025.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e6c7a21dffba883234baefe91bc3388e629779582038f75d2a5be918e250f0ed", size = 789499, upload-time = "2025-11-03T21:33:09.141Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/df/43971264857140a350910d4e33df725e8c94dd9dee8d2e4729fa0d63d49e/regex-2025.11.3-cp314-cp314-win32.whl", hash = "sha256:795ea137b1d809eb6836b43748b12634291c0ed55ad50a7d72d21edf1cd565c4", size = 271604, upload-time = "2025-11-03T21:33:10.9Z" },
-    { url = "https://files.pythonhosted.org/packages/01/6f/9711b57dc6894a55faf80a4c1b5aa4f8649805cb9c7aef46f7d27e2b9206/regex-2025.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:9f95fbaa0ee1610ec0fc6b26668e9917a582ba80c52cc6d9ada15e30aa9ab9ad", size = 280320, upload-time = "2025-11-03T21:33:12.572Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/7e/f6eaa207d4377481f5e1775cdeb5a443b5a59b392d0065f3417d31d80f87/regex-2025.11.3-cp314-cp314-win_arm64.whl", hash = "sha256:dfec44d532be4c07088c3de2876130ff0fbeeacaa89a137decbbb5f665855a0f", size = 273372, upload-time = "2025-11-03T21:33:14.219Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/06/49b198550ee0f5e4184271cee87ba4dfd9692c91ec55289e6282f0f86ccf/regex-2025.11.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ba0d8a5d7f04f73ee7d01d974d47c5834f8a1b0224390e4fe7c12a3a92a78ecc", size = 491985, upload-time = "2025-11-03T21:33:16.555Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/bf/abdafade008f0b1c9da10d934034cb670432d6cf6cbe38bbb53a1cfd6cf8/regex-2025.11.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:442d86cf1cfe4faabf97db7d901ef58347efd004934da045c745e7b5bd57ac49", size = 292669, upload-time = "2025-11-03T21:33:18.32Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/ef/0c357bb8edbd2ad8e273fcb9e1761bc37b8acbc6e1be050bebd6475f19c1/regex-2025.11.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fd0a5e563c756de210bb964789b5abe4f114dacae9104a47e1a649b910361536", size = 291030, upload-time = "2025-11-03T21:33:20.048Z" },
-    { url = "https://files.pythonhosted.org/packages/79/06/edbb67257596649b8fb088d6aeacbcb248ac195714b18a65e018bf4c0b50/regex-2025.11.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf3490bcbb985a1ae97b2ce9ad1c0f06a852d5b19dde9b07bdf25bf224248c95", size = 807674, upload-time = "2025-11-03T21:33:21.797Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/d9/ad4deccfce0ea336296bd087f1a191543bb99ee1c53093dcd4c64d951d00/regex-2025.11.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3809988f0a8b8c9dcc0f92478d6501fac7200b9ec56aecf0ec21f4a2ec4b6009", size = 873451, upload-time = "2025-11-03T21:33:23.741Z" },
-    { url = "https://files.pythonhosted.org/packages/13/75/a55a4724c56ef13e3e04acaab29df26582f6978c000ac9cd6810ad1f341f/regex-2025.11.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f4ff94e58e84aedb9c9fce66d4ef9f27a190285b451420f297c9a09f2b9abee9", size = 914980, upload-time = "2025-11-03T21:33:25.999Z" },
-    { url = "https://files.pythonhosted.org/packages/67/1e/a1657ee15bd9116f70d4a530c736983eed997b361e20ecd8f5ca3759d5c5/regex-2025.11.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eb542fd347ce61e1321b0a6b945d5701528dca0cd9759c2e3bb8bd57e47964d", size = 812852, upload-time = "2025-11-03T21:33:27.852Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/6f/f7516dde5506a588a561d296b2d0044839de06035bb486b326065b4c101e/regex-2025.11.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2d5919075a1f2e413c00b056ea0c2f065b3f5fe83c3d07d325ab92dce51d6", size = 795566, upload-time = "2025-11-03T21:33:32.364Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/dd/3d10b9e170cc16fb34cb2cef91513cf3df65f440b3366030631b2984a264/regex-2025.11.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:3f8bf11a4827cc7ce5a53d4ef6cddd5ad25595d3c1435ef08f76825851343154", size = 868463, upload-time = "2025-11-03T21:33:34.459Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/8e/935e6beff1695aa9085ff83195daccd72acc82c81793df480f34569330de/regex-2025.11.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:22c12d837298651e5550ac1d964e4ff57c3f56965fc1812c90c9fb2028eaf267", size = 854694, upload-time = "2025-11-03T21:33:36.793Z" },
-    { url = "https://files.pythonhosted.org/packages/92/12/10650181a040978b2f5720a6a74d44f841371a3d984c2083fc1752e4acf6/regex-2025.11.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ba394a3dda9ad41c7c780f60f6e4a70988741415ae96f6d1bf6c239cf01379", size = 799691, upload-time = "2025-11-03T21:33:39.079Z" },
-    { url = "https://files.pythonhosted.org/packages/67/90/8f37138181c9a7690e7e4cb388debbd389342db3c7381d636d2875940752/regex-2025.11.3-cp314-cp314t-win32.whl", hash = "sha256:4bf146dca15cdd53224a1bf46d628bd7590e4a07fbb69e720d561aea43a32b38", size = 274583, upload-time = "2025-11-03T21:33:41.302Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/cd/867f5ec442d56beb56f5f854f40abcfc75e11d10b11fdb1869dd39c63aaf/regex-2025.11.3-cp314-cp314t-win_amd64.whl", hash = "sha256:adad1a1bcf1c9e76346e091d22d23ac54ef28e1365117d99521631078dfec9de", size = 284286, upload-time = "2025-11-03T21:33:43.324Z" },
-    { url = "https://files.pythonhosted.org/packages/20/31/32c0c4610cbc070362bf1d2e4ea86d1ea29014d400a6d6c2486fcfd57766/regex-2025.11.3-cp314-cp314t-win_arm64.whl", hash = "sha256:c54f768482cef41e219720013cd05933b6f971d9562544d691c68699bf2b6801", size = 274741, upload-time = "2025-11-03T21:33:45.557Z" },
+version = "2026.2.19"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/c0/d8079d4f6342e4cec5c3e7d7415b5cd3e633d5f4124f7a4626908dbe84c7/regex-2026.2.19.tar.gz", hash = "sha256:6fb8cb09b10e38f3ae17cc6dc04a1df77762bd0351b6ba9041438e7cc85ec310", size = 414973, upload-time = "2026-02-19T19:03:47.899Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/af/de/f10b4506acfd684de4e42b0aa56ccea1a778a18864da8f6d319a40591062/regex-2026.2.19-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f5a37a17d110f9d5357a43aa7e3507cb077bf3143d1c549a45c4649e90e40a70", size = 488369, upload-time = "2026-02-19T18:59:45.01Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/2f/b4eaef1f0b4d0bf2a73eaf07c08f6c13422918a4180c9211ce0521746d0c/regex-2026.2.19-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:676c4e6847a83a1d5732b4ed553881ad36f0a8133627bb695a89ecf3571499d3", size = 290743, upload-time = "2026-02-19T18:59:48.527Z" },
+    { url = "https://files.pythonhosted.org/packages/76/7c/805413bd0a88d04688c0725c222cfb811bd54a2f571004c24199a1ae55d6/regex-2026.2.19-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:82336faeecac33297cd42857c3b36f12b91810e3fdd276befdd128f73a2b43fa", size = 288652, upload-time = "2026-02-19T18:59:50.2Z" },
+    { url = "https://files.pythonhosted.org/packages/08/ff/2c4cd530a878b1975398e76faef4285f11e7c9ccf1aaedfd528bfcc1f580/regex-2026.2.19-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:52136f5b71f095cb74b736cc3a1b578030dada2e361ef2f07ca582240b703946", size = 781759, upload-time = "2026-02-19T18:59:51.836Z" },
+    { url = "https://files.pythonhosted.org/packages/37/45/9608ab1b41f6740ff4076eabadde8e8b3f3400942b348ac41e8599ccc131/regex-2026.2.19-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4192464fe3e6cb0ef6751f7d3b16f886d8270d359ed1590dd555539d364f0ff7", size = 850947, upload-time = "2026-02-19T18:59:53.739Z" },
+    { url = "https://files.pythonhosted.org/packages/90/3a/66471b6c4f7cac17e14bf5300e46661bba2b17ffb0871bd2759e837a6f82/regex-2026.2.19-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e561dd47a85d2660d3d3af4e6cb2da825cf20f121e577147963f875b83d32786", size = 898794, upload-time = "2026-02-19T18:59:55.993Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/d2/38c53929a5931f7398e5e49f5a5a3079cb2aba30119b4350608364cfad8c/regex-2026.2.19-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00ec994d7824bf01cd6c7d14c7a6a04d9aeaf7c42a2bc22d2359d715634d539b", size = 791922, upload-time = "2026-02-19T18:59:58.216Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/bd/b046e065630fa25059d9c195b7b5308ea94da45eee65d40879772500f74c/regex-2026.2.19-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2cb00aabd96b345d56a8c2bc328c8d6c4d29935061e05078bf1f02302e12abf5", size = 783345, upload-time = "2026-02-19T18:59:59.948Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/8f/045c643d2fa255a985e8f87d848e4be230b711a8935e4bdc58e60b8f7b84/regex-2026.2.19-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f374366ed35673ea81b86a8859c457d4fae6ba092b71024857e9e237410c7404", size = 768055, upload-time = "2026-02-19T19:00:01.65Z" },
+    { url = "https://files.pythonhosted.org/packages/72/9f/ab7ae9f5447559562f1a788bbc85c0e526528c5e6c20542d18e4afc86aad/regex-2026.2.19-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f9417fd853fcd00b7d55167e692966dd12d95ba1a88bf08a62002ccd85030790", size = 774955, upload-time = "2026-02-19T19:00:03.368Z" },
+    { url = "https://files.pythonhosted.org/packages/37/5c/f16fc23c56f60b6f4ff194604a6e53bb8aec7b6e8e4a23a482dee8d77235/regex-2026.2.19-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:12e86a01594031abf892686fcb309b041bf3de3d13d99eb7e2b02a8f3c687df1", size = 846010, upload-time = "2026-02-19T19:00:05.079Z" },
+    { url = "https://files.pythonhosted.org/packages/51/c8/6be4c854135d7c9f35d4deeafdaf124b039ecb4ffcaeb7ed0495ad2c97ca/regex-2026.2.19-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:79014115e6fdf18fd9b32e291d58181bf42d4298642beaa13fd73e69810e4cb6", size = 755938, upload-time = "2026-02-19T19:00:07.148Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/8d/f683d49b9663a5324b95a328e69d397f6dade7cb84154eec116bf79fe150/regex-2026.2.19-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:31aefac2506967b7dd69af2c58eca3cc8b086d4110b66d6ac6e9026f0ee5b697", size = 835773, upload-time = "2026-02-19T19:00:08.939Z" },
+    { url = "https://files.pythonhosted.org/packages/16/cd/619224b90da09f167fe4497c350a0d0b30edc539ee9244bf93e604c073c3/regex-2026.2.19-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:49cef7bb2a491f91a8869c7cdd90babf0a417047ab0bf923cd038ed2eab2ccb8", size = 780075, upload-time = "2026-02-19T19:00:10.838Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/88/19cfb0c262d6f9d722edef29157125418bf90eb3508186bf79335afeedae/regex-2026.2.19-cp310-cp310-win32.whl", hash = "sha256:3a039474986e7a314ace6efb9ce52f5da2bdb80ac4955358723d350ec85c32ad", size = 266004, upload-time = "2026-02-19T19:00:12.371Z" },
+    { url = "https://files.pythonhosted.org/packages/82/af/5b487e0287ef72545d7ae92edecdacbe3d44e531cac24fda7de5598ba8dd/regex-2026.2.19-cp310-cp310-win_amd64.whl", hash = "sha256:5b81ff4f9cad99f90c807a00c5882fbcda86d8b3edd94e709fb531fc52cb3d25", size = 277895, upload-time = "2026-02-19T19:00:13.75Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/19/b6715a187ffca4d2979af92a46ce922445ba41f910bf187ccd666a2d52ef/regex-2026.2.19-cp310-cp310-win_arm64.whl", hash = "sha256:a032bc01a4bc73fc3cadba793fce28eb420da39338f47910c59ffcc11a5ba5ef", size = 270465, upload-time = "2026-02-19T19:00:15.127Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/93/43f405a98f54cc59c786efb4fc0b644615ed2392fc89d57d30da11f35b5b/regex-2026.2.19-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:93b16a18cadb938f0f2306267161d57eb33081a861cee9ffcd71e60941eb5dfc", size = 488365, upload-time = "2026-02-19T19:00:17.857Z" },
+    { url = "https://files.pythonhosted.org/packages/66/46/da0efce22cd8f5ae28eeb25ac69703f49edcad3331ac22440776f4ea0867/regex-2026.2.19-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:78af1e499cab704131f6f4e2f155b7f54ce396ca2acb6ef21a49507e4752e0be", size = 290737, upload-time = "2026-02-19T19:00:19.869Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/19/f735078448132c1c974974d30d5306337bc297fe6b6f126164bff72c1019/regex-2026.2.19-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:eb20c11aa4c3793c9ad04c19a972078cdadb261b8429380364be28e867a843f2", size = 288654, upload-time = "2026-02-19T19:00:21.307Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/3e/6d7c24a2f423c03ad03e3fbddefa431057186ac1c4cb4fa98b03c7f39808/regex-2026.2.19-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:db5fd91eec71e7b08de10011a2223d0faa20448d4e1380b9daa179fa7bf58906", size = 793785, upload-time = "2026-02-19T19:00:22.926Z" },
+    { url = "https://files.pythonhosted.org/packages/67/32/fdb8107504b3122a79bde6705ac1f9d495ed1fe35b87d7cfc1864471999a/regex-2026.2.19-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fdbade8acba71bb45057c2b72f477f0b527c4895f9c83e6cfc30d4a006c21726", size = 860731, upload-time = "2026-02-19T19:00:25.196Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/fd/cc8c6f05868defd840be6e75919b1c3f462357969ac2c2a0958363b4dc23/regex-2026.2.19-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:31a5f561eb111d6aae14202e7043fb0b406d3c8dddbbb9e60851725c9b38ab1d", size = 907350, upload-time = "2026-02-19T19:00:27.093Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/1b/4590db9caa8db3d5a3fe31197c4e42c15aab3643b549ef6a454525fa3a61/regex-2026.2.19-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4584a3ee5f257b71e4b693cc9be3a5104249399f4116fe518c3f79b0c6fc7083", size = 800628, upload-time = "2026-02-19T19:00:29.392Z" },
+    { url = "https://files.pythonhosted.org/packages/76/05/513eaa5b96fa579fd0b813e19ec047baaaf573d7374ff010fa139b384bf7/regex-2026.2.19-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:196553ba2a2f47904e5dc272d948a746352e2644005627467e055be19d73b39e", size = 773711, upload-time = "2026-02-19T19:00:30.996Z" },
+    { url = "https://files.pythonhosted.org/packages/95/65/5aed06d8c54563d37fea496cf888be504879a3981a7c8e12c24b2c92c209/regex-2026.2.19-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0c10869d18abb759a3317c757746cc913d6324ce128b8bcec99350df10419f18", size = 783186, upload-time = "2026-02-19T19:00:34.598Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/57/79a633ad90f2371b4ef9cd72ba3a69a1a67d0cfaab4fe6fa8586d46044ef/regex-2026.2.19-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e689fed279cbe797a6b570bd18ff535b284d057202692c73420cb93cca41aa32", size = 854854, upload-time = "2026-02-19T19:00:37.306Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/2d/0f113d477d9e91ec4545ec36c82e58be25038d06788229c91ad52da2b7f5/regex-2026.2.19-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0782bd983f19ac7594039c9277cd6f75c89598c1d72f417e4d30d874105eb0c7", size = 762279, upload-time = "2026-02-19T19:00:39.793Z" },
+    { url = "https://files.pythonhosted.org/packages/39/cb/237e9fa4f61469fd4f037164dbe8e675a376c88cf73aaaa0aedfd305601c/regex-2026.2.19-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:dbb240c81cfed5d4a67cb86d7676d9f7ec9c3f186310bec37d8a1415210e111e", size = 846172, upload-time = "2026-02-19T19:00:42.134Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/7c/104779c5915cc4eb557a33590f8a3f68089269c64287dd769afd76c7ce61/regex-2026.2.19-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80d31c3f1fe7e4c6cd1831cd4478a0609903044dfcdc4660abfe6fb307add7f0", size = 789078, upload-time = "2026-02-19T19:00:43.908Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/4a/eae4e88b1317fb2ff57794915e0099198f51e760f6280b320adfa0ad396d/regex-2026.2.19-cp311-cp311-win32.whl", hash = "sha256:66e6a43225ff1064f8926adbafe0922b370d381c3330edaf9891cade52daa790", size = 266013, upload-time = "2026-02-19T19:00:47.274Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/29/ba89eb8fae79705e07ad1bd69e568f776159d2a8093c9dbc5303ee618298/regex-2026.2.19-cp311-cp311-win_amd64.whl", hash = "sha256:59a7a5216485a1896c5800e9feb8ff9213e11967b482633b6195d7da11450013", size = 277906, upload-time = "2026-02-19T19:00:49.011Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/1a/042d8f04b28e318df92df69d8becb0f42221eb3dd4fe5e976522f4337c76/regex-2026.2.19-cp311-cp311-win_arm64.whl", hash = "sha256:ec661807ffc14c8d14bb0b8c1bb3d5906e476bc96f98b565b709d03962ee4dd4", size = 270463, upload-time = "2026-02-19T19:00:50.988Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/73/13b39c7c9356f333e564ab4790b6cb0df125b8e64e8d6474e73da49b1955/regex-2026.2.19-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c1665138776e4ac1aa75146669236f7a8a696433ec4e525abf092ca9189247cc", size = 489541, upload-time = "2026-02-19T19:00:52.728Z" },
+    { url = "https://files.pythonhosted.org/packages/15/77/fcc7bd9a67000d07fbcc11ed226077287a40d5c84544e62171d29d3ef59c/regex-2026.2.19-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d792b84709021945597e05656aac059526df4e0c9ef60a0eaebb306f8fafcaa8", size = 291414, upload-time = "2026-02-19T19:00:54.51Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/87/3997fc72dc59233426ef2e18dfdd105bb123812fff740ee9cc348f1a3243/regex-2026.2.19-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:db970bcce4d63b37b3f9eb8c893f0db980bbf1d404a1d8d2b17aa8189de92c53", size = 289140, upload-time = "2026-02-19T19:00:56.841Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/d0/b7dd3883ed1cff8ee0c0c9462d828aaf12be63bf5dc55453cbf423523b13/regex-2026.2.19-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03d706fbe7dfec503c8c3cb76f9352b3e3b53b623672aa49f18a251a6c71b8e6", size = 798767, upload-time = "2026-02-19T19:00:59.014Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/7e/8e2d09103832891b2b735a2515abf377db21144c6dd5ede1fb03c619bf09/regex-2026.2.19-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8dbff048c042beef60aa1848961384572c5afb9e8b290b0f1203a5c42cf5af65", size = 864436, upload-time = "2026-02-19T19:01:00.772Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/2e/afea8d23a6db1f67f45e3a0da3057104ce32e154f57dd0c8997274d45fcd/regex-2026.2.19-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ccaaf9b907ea6b4223d5cbf5fa5dff5f33dc66f4907a25b967b8a81339a6e332", size = 912391, upload-time = "2026-02-19T19:01:02.865Z" },
+    { url = "https://files.pythonhosted.org/packages/59/3c/ea5a4687adaba5e125b9bd6190153d0037325a0ba3757cc1537cc2c8dd90/regex-2026.2.19-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:75472631eee7898e16a8a20998d15106cb31cfde21cdf96ab40b432a7082af06", size = 803702, upload-time = "2026-02-19T19:01:05.298Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/c5/624a0705e8473a26488ec1a3a4e0b8763ecfc682a185c302dfec71daea35/regex-2026.2.19-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d89f85a5ccc0cec125c24be75610d433d65295827ebaf0d884cbe56df82d4774", size = 775980, upload-time = "2026-02-19T19:01:07.047Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/4b/ed776642533232b5599b7c1f9d817fe11faf597e8a92b7a44b841daaae76/regex-2026.2.19-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0d9f81806abdca3234c3dd582b8a97492e93de3602c8772013cb4affa12d1668", size = 788122, upload-time = "2026-02-19T19:01:08.744Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/58/e93e093921d13b9784b4f69896b6e2a9e09580a265c59d9eb95e87d288f2/regex-2026.2.19-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9dadc10d1c2bbb1326e572a226d2ec56474ab8aab26fdb8cf19419b372c349a9", size = 858910, upload-time = "2026-02-19T19:01:10.488Z" },
+    { url = "https://files.pythonhosted.org/packages/85/77/ff1d25a0c56cd546e0455cbc93235beb33474899690e6a361fa6b52d265b/regex-2026.2.19-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6bc25d7e15f80c9dc7853cbb490b91c1ec7310808b09d56bd278fe03d776f4f6", size = 764153, upload-time = "2026-02-19T19:01:12.156Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/ef/8ec58df26d52d04443b1dc56f9be4b409f43ed5ae6c0248a287f52311fc4/regex-2026.2.19-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:965d59792f5037d9138da6fed50ba943162160443b43d4895b182551805aff9c", size = 850348, upload-time = "2026-02-19T19:01:14.147Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/b3/c42fd5ed91639ce5a4225b9df909180fc95586db071f2bf7c68d2ccbfbe6/regex-2026.2.19-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:38d88c6ed4a09ed61403dbdf515d969ccba34669af3961ceb7311ecd0cef504a", size = 789977, upload-time = "2026-02-19T19:01:15.838Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/22/bc3b58ebddbfd6ca5633e71fd41829ee931963aad1ebeec55aad0c23044e/regex-2026.2.19-cp312-cp312-win32.whl", hash = "sha256:5df947cabab4b643d4791af5e28aecf6bf62e6160e525651a12eba3d03755e6b", size = 266381, upload-time = "2026-02-19T19:01:17.952Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/4a/6ff550b63e67603ee60e69dc6bd2d5694e85046a558f663b2434bdaeb285/regex-2026.2.19-cp312-cp312-win_amd64.whl", hash = "sha256:4146dc576ea99634ae9c15587d0c43273b4023a10702998edf0fa68ccb60237a", size = 277274, upload-time = "2026-02-19T19:01:19.826Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/29/9ec48b679b1e87e7bc8517dff45351eab38f74fbbda1fbcf0e9e6d4e8174/regex-2026.2.19-cp312-cp312-win_arm64.whl", hash = "sha256:cdc0a80f679353bd68450d2a42996090c30b2e15ca90ded6156c31f1a3b63f3b", size = 270509, upload-time = "2026-02-19T19:01:22.075Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/2d/a849835e76ac88fcf9e8784e642d3ea635d183c4112150ca91499d6703af/regex-2026.2.19-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8df08decd339e8b3f6a2eb5c05c687fe9d963ae91f352bc57beb05f5b2ac6879", size = 489329, upload-time = "2026-02-19T19:01:23.841Z" },
+    { url = "https://files.pythonhosted.org/packages/da/aa/78ff4666d3855490bae87845a5983485e765e1f970da20adffa2937b241d/regex-2026.2.19-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3aa0944f1dc6e92f91f3b306ba7f851e1009398c84bfd370633182ee4fc26a64", size = 291308, upload-time = "2026-02-19T19:01:25.605Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/58/714384efcc07ae6beba528a541f6e99188c5cc1bc0295337f4e8a868296d/regex-2026.2.19-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c13228fbecb03eadbfd8f521732c5fda09ef761af02e920a3148e18ad0e09968", size = 289033, upload-time = "2026-02-19T19:01:27.243Z" },
+    { url = "https://files.pythonhosted.org/packages/75/ec/6438a9344d2869cf5265236a06af1ca6d885e5848b6561e10629bc8e5a11/regex-2026.2.19-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0d0e72703c60d68b18b27cde7cdb65ed2570ae29fb37231aa3076bfb6b1d1c13", size = 798798, upload-time = "2026-02-19T19:01:28.877Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/be/b1ce2d395e3fd2ce5f2fde2522f76cade4297cfe84cd61990ff48308749c/regex-2026.2.19-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:46e69a4bf552e30e74a8aa73f473c87efcb7f6e8c8ece60d9fd7bf13d5c86f02", size = 864444, upload-time = "2026-02-19T19:01:30.933Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/97/a3406460c504f7136f140d9461960c25f058b0240e4424d6fb73c7a067ab/regex-2026.2.19-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8edda06079bd770f7f0cf7f3bba1a0b447b96b4a543c91fe0c142d034c166161", size = 912633, upload-time = "2026-02-19T19:01:32.744Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/d9/e5dbef95008d84e9af1dc0faabbc34a7fbc8daa05bc5807c5cf86c2bec49/regex-2026.2.19-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9cbc69eae834afbf634f7c902fc72ff3e993f1c699156dd1af1adab5d06b7fe7", size = 803718, upload-time = "2026-02-19T19:01:34.61Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/e5/61d80132690a1ef8dc48e0f44248036877aebf94235d43f63a20d1598888/regex-2026.2.19-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bcf57d30659996ee5c7937999874504c11b5a068edc9515e6a59221cc2744dd1", size = 775975, upload-time = "2026-02-19T19:01:36.525Z" },
+    { url = "https://files.pythonhosted.org/packages/05/32/ae828b3b312c972cf228b634447de27237d593d61505e6ad84723f8eabba/regex-2026.2.19-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8e6e77cd92216eb489e21e5652a11b186afe9bdefca8a2db739fd6b205a9e0a4", size = 788129, upload-time = "2026-02-19T19:01:38.498Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/25/d74f34676f22bec401eddf0e5e457296941e10cbb2a49a571ca7a2c16e5a/regex-2026.2.19-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b9ab8dec42afefa6314ea9b31b188259ffdd93f433d77cad454cd0b8d235ce1c", size = 858818, upload-time = "2026-02-19T19:01:40.409Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/eb/0bc2b01a6b0b264e1406e5ef11cae3f634c3bd1a6e61206fd3227ce8e89c/regex-2026.2.19-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:294c0fb2e87c6bcc5f577c8f609210f5700b993151913352ed6c6af42f30f95f", size = 764186, upload-time = "2026-02-19T19:01:43.009Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/37/5fe5a630d0d99ecf0c3570f8905dafbc160443a2d80181607770086c9812/regex-2026.2.19-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:c0924c64b082d4512b923ac016d6e1dcf647a3560b8a4c7e55cbbd13656cb4ed", size = 850363, upload-time = "2026-02-19T19:01:45.015Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/45/ef68d805294b01ec030cfd388724ba76a5a21a67f32af05b17924520cb0b/regex-2026.2.19-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:790dbf87b0361606cb0d79b393c3e8f4436a14ee56568a7463014565d97da02a", size = 790026, upload-time = "2026-02-19T19:01:47.51Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/3a/40d3b66923dfc5aeba182f194f0ca35d09afe8c031a193e6ae46971a0a0e/regex-2026.2.19-cp313-cp313-win32.whl", hash = "sha256:43cdde87006271be6963896ed816733b10967baaf0e271d529c82e93da66675b", size = 266372, upload-time = "2026-02-19T19:01:49.469Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/f2/39082e8739bfd553497689e74f9d5e5bb531d6f8936d0b94f43e18f219c0/regex-2026.2.19-cp313-cp313-win_amd64.whl", hash = "sha256:127ea69273485348a126ebbf3d6052604d3c7da284f797bba781f364c0947d47", size = 277253, upload-time = "2026-02-19T19:01:51.208Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/c2/852b9600d53fb47e47080c203e2cdc0ac7e84e37032a57e0eaa37446033a/regex-2026.2.19-cp313-cp313-win_arm64.whl", hash = "sha256:5e56c669535ac59cbf96ca1ece0ef26cb66809990cda4fa45e1e32c3b146599e", size = 270505, upload-time = "2026-02-19T19:01:52.865Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/a2/e0b4575b93bc84db3b1fab24183e008691cd2db5c0ef14ed52681fbd94dd/regex-2026.2.19-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:93d881cab5afdc41a005dba1524a40947d6f7a525057aa64aaf16065cf62faa9", size = 492202, upload-time = "2026-02-19T19:01:54.816Z" },
+    { url = "https://files.pythonhosted.org/packages/24/b5/b84fec8cbb5f92a7eed2b6b5353a6a9eed9670fee31817c2da9eb85dc797/regex-2026.2.19-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:80caaa1ddcc942ec7be18427354f9d58a79cee82dea2a6b3d4fd83302e1240d7", size = 292884, upload-time = "2026-02-19T19:01:58.254Z" },
+    { url = "https://files.pythonhosted.org/packages/70/0c/fe89966dfae43da46f475362401f03e4d7dc3a3c955b54f632abc52669e0/regex-2026.2.19-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d793c5b4d2b4c668524cd1651404cfc798d40694c759aec997e196fe9729ec60", size = 291236, upload-time = "2026-02-19T19:01:59.966Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/f7/bda2695134f3e63eb5cccbbf608c2a12aab93d261ff4e2fe49b47fabc948/regex-2026.2.19-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5100acb20648d9efd3f4e7e91f51187f95f22a741dcd719548a6cf4e1b34b3f", size = 807660, upload-time = "2026-02-19T19:02:01.632Z" },
+    { url = "https://files.pythonhosted.org/packages/11/56/6e3a4bf5e60d17326b7003d91bbde8938e439256dec211d835597a44972d/regex-2026.2.19-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5e3a31e94d10e52a896adaa3adf3621bd526ad2b45b8c2d23d1bbe74c7423007", size = 873585, upload-time = "2026-02-19T19:02:03.522Z" },
+    { url = "https://files.pythonhosted.org/packages/35/5e/c90c6aa4d1317cc11839359479cfdd2662608f339e84e81ba751c8a4e461/regex-2026.2.19-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8497421099b981f67c99eba4154cf0dfd8e47159431427a11cfb6487f7791d9e", size = 915243, upload-time = "2026-02-19T19:02:05.608Z" },
+    { url = "https://files.pythonhosted.org/packages/90/7c/981ea0694116793001496aaf9524e5c99e122ec3952d9e7f1878af3a6bf1/regex-2026.2.19-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e7a08622f7d51d7a068f7e4052a38739c412a3e74f55817073d2e2418149619", size = 812922, upload-time = "2026-02-19T19:02:08.115Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/be/9eda82afa425370ffdb3fa9f3ea42450b9ae4da3ff0a4ec20466f69e371b/regex-2026.2.19-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8abe671cf0f15c26b1ad389bf4043b068ce7d3b1c5d9313e12895f57d6738555", size = 781318, upload-time = "2026-02-19T19:02:10.072Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/d5/50f0bbe56a8199f60a7b6c714e06e54b76b33d31806a69d0703b23ce2a9e/regex-2026.2.19-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5a8f28dd32a4ce9c41758d43b5b9115c1c497b4b1f50c457602c1d571fa98ce1", size = 795649, upload-time = "2026-02-19T19:02:11.96Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/09/d039f081e44a8b0134d0bb2dd805b0ddf390b69d0b58297ae098847c572f/regex-2026.2.19-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:654dc41a5ba9b8cc8432b3f1aa8906d8b45f3e9502442a07c2f27f6c63f85db5", size = 868844, upload-time = "2026-02-19T19:02:14.043Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/53/e2903b79a19ec8557fe7cd21cd093956ff2dbc2e0e33969e3adbe5b184dd/regex-2026.2.19-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:4a02faea614e7fdd6ba8b3bec6c8e79529d356b100381cec76e638f45d12ca04", size = 770113, upload-time = "2026-02-19T19:02:16.161Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/e2/784667767b55714ebb4e59bf106362327476b882c0b2f93c25e84cc99b1a/regex-2026.2.19-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d96162140bb819814428800934c7b71b7bffe81fb6da2d6abc1dcca31741eca3", size = 854922, upload-time = "2026-02-19T19:02:18.155Z" },
+    { url = "https://files.pythonhosted.org/packages/59/78/9ef4356bd4aed752775bd18071034979b85f035fec51f3a4f9dea497a254/regex-2026.2.19-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c227f2922153ee42bbeb355fd6d009f8c81d9d7bdd666e2276ce41f53ed9a743", size = 799636, upload-time = "2026-02-19T19:02:20.04Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/54/fcfc9287f20c5c9bd8db755aafe3e8cf4d99a6a3f1c7162ee182e0ca9374/regex-2026.2.19-cp313-cp313t-win32.whl", hash = "sha256:a178df8ec03011153fbcd2c70cb961bc98cbbd9694b28f706c318bee8927c3db", size = 268968, upload-time = "2026-02-19T19:02:22.816Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/a0/ff24c6cb1273e42472706d277147fc38e1f9074a280fb6034b0fc9b69415/regex-2026.2.19-cp313-cp313t-win_amd64.whl", hash = "sha256:2c1693ca6f444d554aa246b592355b5cec030ace5a2729eae1b04ab6e853e768", size = 280390, upload-time = "2026-02-19T19:02:25.231Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/b6/a3f6ad89d780ffdeebb4d5e2e3e30bd2ef1f70f6a94d1760e03dd1e12c60/regex-2026.2.19-cp313-cp313t-win_arm64.whl", hash = "sha256:c0761d7ae8d65773e01515ebb0b304df1bf37a0a79546caad9cbe79a42c12af7", size = 271643, upload-time = "2026-02-19T19:02:27.175Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/e2/7ad4e76a6dddefc0d64dbe12a4d3ca3947a19ddc501f864a5df2a8222ddd/regex-2026.2.19-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:03d191a9bcf94d31af56d2575210cb0d0c6a054dbcad2ea9e00aa4c42903b919", size = 489306, upload-time = "2026-02-19T19:02:29.058Z" },
+    { url = "https://files.pythonhosted.org/packages/14/95/ee1736135733afbcf1846c58671046f99c4d5170102a150ebb3dd8d701d9/regex-2026.2.19-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:516ee067c6c721d0d0bfb80a2004edbd060fffd07e456d4e1669e38fe82f922e", size = 291218, upload-time = "2026-02-19T19:02:31.083Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/08/180d1826c3d7065200a5168c6b993a44947395c7bb6e04b2c2a219c34225/regex-2026.2.19-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:997862c619994c4a356cb7c3592502cbd50c2ab98da5f61c5c871f10f22de7e5", size = 289097, upload-time = "2026-02-19T19:02:33.485Z" },
+    { url = "https://files.pythonhosted.org/packages/28/93/0651924c390c5740f5f896723f8ddd946a6c63083a7d8647231c343912ff/regex-2026.2.19-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02b9e1b8a7ebe2807cd7bbdf662510c8e43053a23262b9f46ad4fc2dfc9d204e", size = 799147, upload-time = "2026-02-19T19:02:35.669Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/00/2078bd8bcd37d58a756989adbfd9f1d0151b7ca4085a9c2a07e917fbac61/regex-2026.2.19-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6c8fb3b19652e425ff24169dad3ee07f99afa7996caa9dfbb3a9106cd726f49a", size = 865239, upload-time = "2026-02-19T19:02:38.012Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/13/75195161ec16936b35a365fa8c1dd2ab29fd910dd2587765062b174d8cfc/regex-2026.2.19-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50f1ee9488dd7a9fda850ec7c68cad7a32fa49fd19733f5403a3f92b451dcf73", size = 911904, upload-time = "2026-02-19T19:02:40.737Z" },
+    { url = "https://files.pythonhosted.org/packages/96/72/ac42f6012179343d1c4bd0ffee8c948d841cb32ea188d37e96d80527fcc9/regex-2026.2.19-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ab780092b1424d13200aa5a62996e95f65ee3db8509be366437439cdc0af1a9f", size = 803518, upload-time = "2026-02-19T19:02:42.923Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/d1/75a08e2269b007b9783f0f86aa64488e023141219cb5f14dc1e69cda56c6/regex-2026.2.19-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:17648e1a88e72d88641b12635e70e6c71c5136ba14edba29bf8fc6834005a265", size = 775866, upload-time = "2026-02-19T19:02:45.189Z" },
+    { url = "https://files.pythonhosted.org/packages/92/41/70e7d05faf6994c2ca7a9fcaa536da8f8e4031d45b0ec04b57040ede201f/regex-2026.2.19-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f914ae8c804c8a8a562fe216100bc156bfb51338c1f8d55fe32cf407774359a", size = 788224, upload-time = "2026-02-19T19:02:47.804Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/83/34a2dd601f9deb13c20545c674a55f4a05c90869ab73d985b74d639bac43/regex-2026.2.19-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c7e121a918bbee3f12ac300ce0a0d2f2c979cf208fb071ed8df5a6323281915c", size = 859682, upload-time = "2026-02-19T19:02:50.583Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/30/136db9a09a7f222d6e48b806f3730e7af6499a8cad9c72ac0d49d52c746e/regex-2026.2.19-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2fedd459c791da24914ecc474feecd94cf7845efb262ac3134fe27cbd7eda799", size = 764223, upload-time = "2026-02-19T19:02:52.777Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/ea/bb947743c78a16df481fa0635c50aa1a439bb80b0e6dc24cd4e49c716679/regex-2026.2.19-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:ea8dfc99689240e61fb21b5fc2828f68b90abf7777d057b62d3166b7c1543c4c", size = 850101, upload-time = "2026-02-19T19:02:55.87Z" },
+    { url = "https://files.pythonhosted.org/packages/25/27/e3bfe6e97a99f7393665926be02fef772da7f8aa59e50bc3134e4262a032/regex-2026.2.19-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9fff45852160960f29e184ec8a5be5ab4063cfd0b168d439d1fc4ac3744bf29e", size = 789904, upload-time = "2026-02-19T19:02:58.523Z" },
+    { url = "https://files.pythonhosted.org/packages/84/7b/7e2be6f00cea59d08761b027ad237002e90cac74b1607200ebaa2ba3d586/regex-2026.2.19-cp314-cp314-win32.whl", hash = "sha256:5390b130cce14a7d1db226a3896273b7b35be10af35e69f1cca843b6e5d2bb2d", size = 271784, upload-time = "2026-02-19T19:03:00.418Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/f6/639911530335773e7ec60bcaa519557b719586024c1d7eaad1daf87b646b/regex-2026.2.19-cp314-cp314-win_amd64.whl", hash = "sha256:e581f75d5c0b15669139ca1c2d3e23a65bb90e3c06ba9d9ea194c377c726a904", size = 280506, upload-time = "2026-02-19T19:03:02.302Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/ec/2582b56b4e036d46bb9b5d74a18548439ffa16c11cf59076419174d80f48/regex-2026.2.19-cp314-cp314-win_arm64.whl", hash = "sha256:7187fdee1be0896c1499a991e9bf7c78e4b56b7863e7405d7bb687888ac10c4b", size = 273557, upload-time = "2026-02-19T19:03:04.836Z" },
+    { url = "https://files.pythonhosted.org/packages/49/0b/f901cfeb4efd83e4f5c3e9f91a6de77e8e5ceb18555698aca3a27e215ed3/regex-2026.2.19-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:5ec1d7c080832fdd4e150c6f5621fe674c70c63b3ae5a4454cebd7796263b175", size = 492196, upload-time = "2026-02-19T19:03:08.188Z" },
+    { url = "https://files.pythonhosted.org/packages/94/0a/349b959e3da874e15eda853755567b4cde7e5309dbb1e07bfe910cfde452/regex-2026.2.19-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8457c1bc10ee9b29cdfd897ccda41dce6bde0e9abd514bcfef7bcd05e254d411", size = 292878, upload-time = "2026-02-19T19:03:10.272Z" },
+    { url = "https://files.pythonhosted.org/packages/98/b0/9d81b3c2c5ddff428f8c506713737278979a2c476f6e3675a9c51da0c389/regex-2026.2.19-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cce8027010d1ffa3eb89a0b19621cdc78ae548ea2b49fea1f7bfb3ea77064c2b", size = 291235, upload-time = "2026-02-19T19:03:12.5Z" },
+    { url = "https://files.pythonhosted.org/packages/04/e7/be7818df8691dbe9508c381ea2cc4c1153e4fdb1c4b06388abeaa93bd712/regex-2026.2.19-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11c138febb40546ff9e026dbbc41dc9fb8b29e61013fa5848ccfe045f5b23b83", size = 807893, upload-time = "2026-02-19T19:03:15.064Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/b6/b898a8b983190cfa0276031c17beb73cfd1db07c03c8c37f606d80b655e2/regex-2026.2.19-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:74ff212aa61532246bb3036b3dfea62233414b0154b8bc3676975da78383cac3", size = 873696, upload-time = "2026-02-19T19:03:17.848Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/98/126ba671d54f19080ec87cad228fb4f3cc387fff8c4a01cb4e93f4ff9d94/regex-2026.2.19-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d00c95a2b6bfeb3ea1cb68d1751b1dfce2b05adc2a72c488d77a780db06ab867", size = 915493, upload-time = "2026-02-19T19:03:20.343Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/10/550c84a1a1a7371867fe8be2bea7df55e797cbca4709974811410e195c5d/regex-2026.2.19-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:311fcccb76af31be4c588d5a17f8f1a059ae8f4b097192896ebffc95612f223a", size = 813094, upload-time = "2026-02-19T19:03:23.287Z" },
+    { url = "https://files.pythonhosted.org/packages/29/fb/ba221d2fc76a27b6b7d7a60f73a7a6a7bac21c6ba95616a08be2bcb434b0/regex-2026.2.19-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:77cfd6b5e7c4e8bf7a39d243ea05882acf5e3c7002b0ef4756de6606893b0ecd", size = 781583, upload-time = "2026-02-19T19:03:26.872Z" },
+    { url = "https://files.pythonhosted.org/packages/26/f1/af79231301297c9e962679efc04a31361b58dc62dec1fc0cb4b8dd95956a/regex-2026.2.19-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6380f29ff212ec922b6efb56100c089251940e0526a0d05aa7c2d9b571ddf2fe", size = 795875, upload-time = "2026-02-19T19:03:29.223Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/90/1e1d76cb0a2d0a4f38a039993e1c5cd971ae50435d751c5bae4f10e1c302/regex-2026.2.19-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:655f553a1fa3ab8a7fd570eca793408b8d26a80bfd89ed24d116baaf13a38969", size = 868916, upload-time = "2026-02-19T19:03:31.415Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/67/a1c01da76dbcfed690855a284c665cc0a370e7d02d1bd635cf9ff7dd74b8/regex-2026.2.19-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:015088b8558502f1f0bccd58754835aa154a7a5b0bd9d4c9b7b96ff4ae9ba876", size = 770386, upload-time = "2026-02-19T19:03:33.972Z" },
+    { url = "https://files.pythonhosted.org/packages/49/6f/94842bf294f432ff3836bfd91032e2ecabea6d284227f12d1f935318c9c4/regex-2026.2.19-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9e6693b8567a59459b5dda19104c4a4dbbd4a1c78833eacc758796f2cfef1854", size = 855007, upload-time = "2026-02-19T19:03:36.238Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/93/393cd203ca0d1d368f05ce12d2c7e91a324bc93c240db2e6d5ada05835f4/regex-2026.2.19-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4071209fd4376ab5ceec72ad3507e9d3517c59e38a889079b98916477a871868", size = 799863, upload-time = "2026-02-19T19:03:38.497Z" },
+    { url = "https://files.pythonhosted.org/packages/43/d9/35afda99bd92bf1a5831e55a4936d37ea4bed6e34c176a3c2238317faf4f/regex-2026.2.19-cp314-cp314t-win32.whl", hash = "sha256:2905ff4a97fad42f2d0834d8b1ea3c2f856ec209837e458d71a061a7d05f9f01", size = 274742, upload-time = "2026-02-19T19:03:40.804Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/42/7edc3344dcc87b698e9755f7f685d463852d481302539dae07135202d3ca/regex-2026.2.19-cp314-cp314t-win_amd64.whl", hash = "sha256:64128549b600987e0f335c2365879895f860a9161f283b14207c800a6ed623d3", size = 284443, upload-time = "2026-02-19T19:03:42.954Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/45/affdf2d851b42adf3d13fc5b3b059372e9bd299371fd84cf5723c45871fa/regex-2026.2.19-cp314-cp314t-win_arm64.whl", hash = "sha256:a09ae430e94c049dc6957f6baa35ee3418a3a77f3c12b6e02883bd80a2b679b0", size = 274932, upload-time = "2026-02-19T19:03:45.488Z" },
 ]
 
 [[package]]
@@ -4666,148 +5646,174 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" },
 ]
 
+[[package]]
+name = "responses"
+version = "0.18.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "requests", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+    { name = "urllib3", marker = "python_full_version >= '3.14' and sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/03/a5/186653e51cb20fe3ac793403334d4d077fbb7bb18a9c5c2fce8304d5a2e2/responses-0.18.0.tar.gz", hash = "sha256:380cad4c1c1dc942e5e8a8eaae0b4d4edf708f4f010db8b7bcfafad1fcd254ff", size = 45885, upload-time = "2022-02-02T19:59:52.834Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl", hash = "sha256:15c63ad16de13ee8e7182d99c9334f64fd81f1ee79f90748d527c28f7ca9dd51", size = 38735, upload-time = "2022-02-02T19:59:52.833Z" },
+]
+
 [[package]]
 name = "rich"
-version = "14.2.0"
+version = "14.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pygments" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fb/d2/8920e102050a0de7bfabeb4c4614a49248cf8d5d7a8d01885fbb24dc767a/rich-14.2.0.tar.gz", hash = "sha256:73ff50c7c0c1c77c8243079283f4edb376f0f6442433aecb8ce7e6d0b92d1fe4", size = 219990, upload-time = "2025-10-09T14:16:53.064Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" },
+    { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" },
 ]
 
 [[package]]
-name = "roman-numerals-py"
-version = "3.1.0"
+name = "roman-numerals"
+version = "4.1.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/30/76/48fd56d17c5bdbdf65609abbc67288728a98ed4c02919428d4f52d23b24b/roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d", size = 9017, upload-time = "2025-02-22T07:34:54.333Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/41dc953bbeb056c17d5f7a519f50fdf010bd0553be2d630bc69d1e022703/roman_numerals-4.1.0.tar.gz", hash = "sha256:1af8b147eb1405d5839e78aeb93131690495fe9da5c91856cb33ad55a7f1e5b2", size = 9077, upload-time = "2025-12-17T18:25:34.381Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742, upload-time = "2025-02-22T07:34:52.422Z" },
+    { url = "https://files.pythonhosted.org/packages/04/54/6f679c435d28e0a568d8e8a7c0a93a09010818634c3c3907fc98d8983770/roman_numerals-4.1.0-py3-none-any.whl", hash = "sha256:647ba99caddc2cc1e55a51e4360689115551bf4476d90e8162cf8c345fe233c7", size = 7676, upload-time = "2025-12-17T18:25:33.098Z" },
 ]
 
 [[package]]
 name = "rpds-py"
-version = "0.29.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/98/33/23b3b3419b6a3e0f559c7c0d2ca8fc1b9448382b25245033788785921332/rpds_py-0.29.0.tar.gz", hash = "sha256:fe55fe686908f50154d1dc599232016e50c243b438c3b7432f24e2895b0e5359", size = 69359, upload-time = "2025-11-16T14:50:39.532Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9e/7a/c5b2ff381b74bc742768e8d870f26babac4ef256ba160bdbf8d57af56461/rpds_py-0.29.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4ae4b88c6617e1b9e5038ab3fccd7bac0842fdda2b703117b2aa99bc85379113", size = 372385, upload-time = "2025-11-16T14:47:36.287Z" },
-    { url = "https://files.pythonhosted.org/packages/28/36/531f1eb4d5bed4a9c150f363a7ec4a98d2dc746151bba5473bc38ee85dec/rpds_py-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7d9128ec9d8cecda6f044001fde4fb71ea7c24325336612ef8179091eb9596b9", size = 362869, upload-time = "2025-11-16T14:47:38.196Z" },
-    { url = "https://files.pythonhosted.org/packages/54/df/7e9c0493a2015d9c82807a2d5f023ea9774e27a4c15b33ef1cdb7456138d/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d37812c3da8e06f2bb35b3cf10e4a7b68e776a706c13058997238762b4e07f4f", size = 391582, upload-time = "2025-11-16T14:47:39.746Z" },
-    { url = "https://files.pythonhosted.org/packages/15/38/42a981c3592ef46fbd7e17adbf8730cc5ec87e6aa1770c658c44bbb52960/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:66786c3fb1d8de416a7fa8e1cb1ec6ba0a745b2b0eee42f9b7daa26f1a495545", size = 405685, upload-time = "2025-11-16T14:47:41.472Z" },
-    { url = "https://files.pythonhosted.org/packages/12/45/628b8c15856c3849c3f52ec6dac93c046ed5faeed4a435af03b70525fd29/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58f5c77f1af888b5fd1876c9a0d9858f6f88a39c9dd7c073a88e57e577da66d", size = 527067, upload-time = "2025-11-16T14:47:43.036Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/ba/6b56d09badeabd95098016d72a437d4a0fd82d4672ce92a7607df5d70a42/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:799156ef1f3529ed82c36eb012b5d7a4cf4b6ef556dd7cc192148991d07206ae", size = 412532, upload-time = "2025-11-16T14:47:44.484Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/39/2f1f3db92888314b50b8f9641f679188bd24b3665a8cb9923b7201ae8011/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453783477aa4f2d9104c4b59b08c871431647cb7af51b549bbf2d9eb9c827756", size = 392736, upload-time = "2025-11-16T14:47:46.053Z" },
-    { url = "https://files.pythonhosted.org/packages/60/43/3c3b1dcd827e50f2ae28786d846b8a351080d8a69a3b49bc10ae44cc39b1/rpds_py-0.29.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:24a7231493e3c4a4b30138b50cca089a598e52c34cf60b2f35cebf62f274fdea", size = 406300, upload-time = "2025-11-16T14:47:47.268Z" },
-    { url = "https://files.pythonhosted.org/packages/da/02/bc96021b67f8525e6bcdd68935c4543ada61e1f3dcb067ed037d68b8c6d2/rpds_py-0.29.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7033c1010b1f57bb44d8067e8c25aa6fa2e944dbf46ccc8c92b25043839c3fd2", size = 423641, upload-time = "2025-11-16T14:47:48.878Z" },
-    { url = "https://files.pythonhosted.org/packages/38/e9/c435ddb602ced19a80b8277a41371734f33ad3f91cc4ceb4d82596800a3c/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0248b19405422573621172ab8e3a1f29141362d13d9f72bafa2e28ea0cdca5a2", size = 574153, upload-time = "2025-11-16T14:47:50.435Z" },
-    { url = "https://files.pythonhosted.org/packages/84/82/dc3c32e1f89ecba8a59600d4cd65fe0ad81b6c636ccdbf6cd177fd6a7bac/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f9f436aee28d13b9ad2c764fc273e0457e37c2e61529a07b928346b219fcde3b", size = 600304, upload-time = "2025-11-16T14:47:51.599Z" },
-    { url = "https://files.pythonhosted.org/packages/35/98/785290e0b7142470735dc1b1f68fb33aae29e5296f062c88396eedf796c8/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24a16cb7163933906c62c272de20ea3c228e4542c8c45c1d7dc2b9913e17369a", size = 562211, upload-time = "2025-11-16T14:47:53.094Z" },
-    { url = "https://files.pythonhosted.org/packages/30/58/4eeddcb0737c6875f3e30c65dc9d7e7a10dfd5779646a990fa602c6d56c5/rpds_py-0.29.0-cp310-cp310-win32.whl", hash = "sha256:1a409b0310a566bfd1be82119891fefbdce615ccc8aa558aff7835c27988cbef", size = 221803, upload-time = "2025-11-16T14:47:54.404Z" },
-    { url = "https://files.pythonhosted.org/packages/54/77/b35a8dbdcbeb32505500547cdafaa9f8863e85f8faac50ef34464ec5a256/rpds_py-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5523b0009e7c3c1263471b69d8da1c7d41b3ecb4cb62ef72be206b92040a950", size = 235530, upload-time = "2025-11-16T14:47:56.061Z" },
-    { url = "https://files.pythonhosted.org/packages/36/ab/7fb95163a53ab122c74a7c42d2d2f012819af2cf3deb43fb0d5acf45cc1a/rpds_py-0.29.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b9c764a11fd637e0322a488560533112837f5334ffeb48b1be20f6d98a7b437", size = 372344, upload-time = "2025-11-16T14:47:57.279Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/45/f3c30084c03b0d0f918cb4c5ae2c20b0a148b51ba2b3f6456765b629bedd/rpds_py-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fd2164d73812026ce970d44c3ebd51e019d2a26a4425a5dcbdfa93a34abc383", size = 363041, upload-time = "2025-11-16T14:47:58.908Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/e9/4d044a1662608c47a87cbb37b999d4d5af54c6d6ebdda93a4d8bbf8b2a10/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a097b7f7f7274164566ae90a221fd725363c0e9d243e2e9ed43d195ccc5495c", size = 391775, upload-time = "2025-11-16T14:48:00.197Z" },
-    { url = "https://files.pythonhosted.org/packages/50/c9/7616d3ace4e6731aeb6e3cd85123e03aec58e439044e214b9c5c60fd8eb1/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7cdc0490374e31cedefefaa1520d5fe38e82fde8748cbc926e7284574c714d6b", size = 405624, upload-time = "2025-11-16T14:48:01.496Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/e2/6d7d6941ca0843609fd2d72c966a438d6f22617baf22d46c3d2156c31350/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89ca2e673ddd5bde9b386da9a0aac0cab0e76f40c8f0aaf0d6311b6bbf2aa311", size = 527894, upload-time = "2025-11-16T14:48:03.167Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/f7/aee14dc2db61bb2ae1e3068f134ca9da5f28c586120889a70ff504bb026f/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5d9da3ff5af1ca1249b1adb8ef0573b94c76e6ae880ba1852f033bf429d4588", size = 412720, upload-time = "2025-11-16T14:48:04.413Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/e2/2293f236e887c0360c2723d90c00d48dee296406994d6271faf1712e94ec/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8238d1d310283e87376c12f658b61e1ee23a14c0e54c7c0ce953efdbdc72deed", size = 392945, upload-time = "2025-11-16T14:48:06.252Z" },
-    { url = "https://files.pythonhosted.org/packages/14/cd/ceea6147acd3bd1fd028d1975228f08ff19d62098078d5ec3eed49703797/rpds_py-0.29.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2d6fb2ad1c36f91c4646989811e84b1ea5e0c3cf9690b826b6e32b7965853a63", size = 406385, upload-time = "2025-11-16T14:48:07.575Z" },
-    { url = "https://files.pythonhosted.org/packages/52/36/fe4dead19e45eb77a0524acfdbf51e6cda597b26fc5b6dddbff55fbbb1a5/rpds_py-0.29.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:534dc9df211387547267ccdb42253aa30527482acb38dd9b21c5c115d66a96d2", size = 423943, upload-time = "2025-11-16T14:48:10.175Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/7b/4551510803b582fa4abbc8645441a2d15aa0c962c3b21ebb380b7e74f6a1/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d456e64724a075441e4ed648d7f154dc62e9aabff29bcdf723d0c00e9e1d352f", size = 574204, upload-time = "2025-11-16T14:48:11.499Z" },
-    { url = "https://files.pythonhosted.org/packages/64/ba/071ccdd7b171e727a6ae079f02c26f75790b41555f12ca8f1151336d2124/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a738f2da2f565989401bd6fd0b15990a4d1523c6d7fe83f300b7e7d17212feca", size = 600587, upload-time = "2025-11-16T14:48:12.822Z" },
-    { url = "https://files.pythonhosted.org/packages/03/09/96983d48c8cf5a1e03c7d9cc1f4b48266adfb858ae48c7c2ce978dbba349/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a110e14508fd26fd2e472bb541f37c209409876ba601cf57e739e87d8a53cf95", size = 562287, upload-time = "2025-11-16T14:48:14.108Z" },
-    { url = "https://files.pythonhosted.org/packages/40/f0/8c01aaedc0fa92156f0391f39ea93b5952bc0ec56b897763858f95da8168/rpds_py-0.29.0-cp311-cp311-win32.whl", hash = "sha256:923248a56dd8d158389a28934f6f69ebf89f218ef96a6b216a9be6861804d3f4", size = 221394, upload-time = "2025-11-16T14:48:15.374Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/a5/a8b21c54c7d234efdc83dc034a4d7cd9668e3613b6316876a29b49dece71/rpds_py-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:539eb77eb043afcc45314d1be09ea6d6cafb3addc73e0547c171c6d636957f60", size = 235713, upload-time = "2025-11-16T14:48:16.636Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/1f/df3c56219523947b1be402fa12e6323fe6d61d883cf35d6cb5d5bb6db9d9/rpds_py-0.29.0-cp311-cp311-win_arm64.whl", hash = "sha256:bdb67151ea81fcf02d8f494703fb728d4d34d24556cbff5f417d74f6f5792e7c", size = 229157, upload-time = "2025-11-16T14:48:17.891Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/50/bc0e6e736d94e420df79be4deb5c9476b63165c87bb8f19ef75d100d21b3/rpds_py-0.29.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a0891cfd8db43e085c0ab93ab7e9b0c8fee84780d436d3b266b113e51e79f954", size = 376000, upload-time = "2025-11-16T14:48:19.141Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/3a/46676277160f014ae95f24de53bed0e3b7ea66c235e7de0b9df7bd5d68ba/rpds_py-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3897924d3f9a0361472d884051f9a2460358f9a45b1d85a39a158d2f8f1ad71c", size = 360575, upload-time = "2025-11-16T14:48:20.443Z" },
-    { url = "https://files.pythonhosted.org/packages/75/ba/411d414ed99ea1afdd185bbabeeaac00624bd1e4b22840b5e9967ade6337/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a21deb8e0d1571508c6491ce5ea5e25669b1dd4adf1c9d64b6314842f708b5d", size = 392159, upload-time = "2025-11-16T14:48:22.12Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/b1/e18aa3a331f705467a48d0296778dc1fea9d7f6cf675bd261f9a846c7e90/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9efe71687d6427737a0a2de9ca1c0a216510e6cd08925c44162be23ed7bed2d5", size = 410602, upload-time = "2025-11-16T14:48:23.563Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/6c/04f27f0c9f2299274c76612ac9d2c36c5048bb2c6c2e52c38c60bf3868d9/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:40f65470919dc189c833e86b2c4bd21bd355f98436a2cef9e0a9a92aebc8e57e", size = 515808, upload-time = "2025-11-16T14:48:24.949Z" },
-    { url = "https://files.pythonhosted.org/packages/83/56/a8412aa464fb151f8bc0d91fb0bb888adc9039bd41c1c6ba8d94990d8cf8/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:def48ff59f181130f1a2cb7c517d16328efac3ec03951cca40c1dc2049747e83", size = 416015, upload-time = "2025-11-16T14:48:26.782Z" },
-    { url = "https://files.pythonhosted.org/packages/04/4c/f9b8a05faca3d9e0a6397c90d13acb9307c9792b2bff621430c58b1d6e76/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7bd570be92695d89285a4b373006930715b78d96449f686af422debb4d3949", size = 395325, upload-time = "2025-11-16T14:48:28.055Z" },
-    { url = "https://files.pythonhosted.org/packages/34/60/869f3bfbf8ed7b54f1ad9a5543e0fdffdd40b5a8f587fe300ee7b4f19340/rpds_py-0.29.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:5a572911cd053137bbff8e3a52d31c5d2dba51d3a67ad902629c70185f3f2181", size = 410160, upload-time = "2025-11-16T14:48:29.338Z" },
-    { url = "https://files.pythonhosted.org/packages/91/aa/e5b496334e3aba4fe4c8a80187b89f3c1294c5c36f2a926da74338fa5a73/rpds_py-0.29.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d583d4403bcbf10cffc3ab5cee23d7643fcc960dff85973fd3c2d6c86e8dbb0c", size = 425309, upload-time = "2025-11-16T14:48:30.691Z" },
-    { url = "https://files.pythonhosted.org/packages/85/68/4e24a34189751ceb6d66b28f18159922828dd84155876551f7ca5b25f14f/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:070befbb868f257d24c3bb350dbd6e2f645e83731f31264b19d7231dd5c396c7", size = 574644, upload-time = "2025-11-16T14:48:31.964Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/cf/474a005ea4ea9c3b4f17b6108b6b13cebfc98ebaff11d6e1b193204b3a93/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fc935f6b20b0c9f919a8ff024739174522abd331978f750a74bb68abd117bd19", size = 601605, upload-time = "2025-11-16T14:48:33.252Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/b1/c56f6a9ab8c5f6bb5c65c4b5f8229167a3a525245b0773f2c0896686b64e/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8c5a8ecaa44ce2d8d9d20a68a2483a74c07f05d72e94a4dff88906c8807e77b0", size = 564593, upload-time = "2025-11-16T14:48:34.643Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/13/0494cecce4848f68501e0a229432620b4b57022388b071eeff95f3e1e75b/rpds_py-0.29.0-cp312-cp312-win32.whl", hash = "sha256:ba5e1aeaf8dd6d8f6caba1f5539cddda87d511331714b7b5fc908b6cfc3636b7", size = 223853, upload-time = "2025-11-16T14:48:36.419Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/6a/51e9aeb444a00cdc520b032a28b07e5f8dc7bc328b57760c53e7f96997b4/rpds_py-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:b5f6134faf54b3cb83375db0f113506f8b7770785be1f95a631e7e2892101977", size = 239895, upload-time = "2025-11-16T14:48:37.956Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/d4/8bce56cdad1ab873e3f27cb31c6a51d8f384d66b022b820525b879f8bed1/rpds_py-0.29.0-cp312-cp312-win_arm64.whl", hash = "sha256:b016eddf00dca7944721bf0cd85b6af7f6c4efaf83ee0b37c4133bd39757a8c7", size = 230321, upload-time = "2025-11-16T14:48:39.71Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/d9/c5de60d9d371bbb186c3e9bf75f4fc5665e11117a25a06a6b2e0afb7380e/rpds_py-0.29.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1585648d0760b88292eecab5181f5651111a69d90eff35d6b78aa32998886a61", size = 375710, upload-time = "2025-11-16T14:48:41.063Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/b3/0860cdd012291dc21272895ce107f1e98e335509ba986dd83d72658b82b9/rpds_py-0.29.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:521807963971a23996ddaf764c682b3e46459b3c58ccd79fefbe16718db43154", size = 360582, upload-time = "2025-11-16T14:48:42.423Z" },
-    { url = "https://files.pythonhosted.org/packages/92/8a/a18c2f4a61b3407e56175f6aab6deacdf9d360191a3d6f38566e1eaf7266/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a8896986efaa243ab713c69e6491a4138410f0fe36f2f4c71e18bd5501e8014", size = 391172, upload-time = "2025-11-16T14:48:43.75Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/49/e93354258508c50abc15cdcd5fcf7ac4117f67bb6233ad7859f75e7372a0/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d24564a700ef41480a984c5ebed62b74e6ce5860429b98b1fede76049e953e6", size = 409586, upload-time = "2025-11-16T14:48:45.498Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/8d/a27860dae1c19a6bdc901f90c81f0d581df1943355802961a57cdb5b6cd1/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6596b93c010d386ae46c9fba9bfc9fc5965fa8228edeac51576299182c2e31c", size = 516339, upload-time = "2025-11-16T14:48:47.308Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/ad/a75e603161e79b7110c647163d130872b271c6b28712c803c65d492100f7/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5cc58aac218826d054c7da7f95821eba94125d88be673ff44267bb89d12a5866", size = 416201, upload-time = "2025-11-16T14:48:48.615Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/42/555b4ee17508beafac135c8b450816ace5a96194ce97fefc49d58e5652ea/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de73e40ebc04dd5d9556f50180395322193a78ec247e637e741c1b954810f295", size = 395095, upload-time = "2025-11-16T14:48:50.027Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/f0/c90b671b9031e800ec45112be42ea9f027f94f9ac25faaac8770596a16a1/rpds_py-0.29.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:295ce5ac7f0cf69a651ea75c8f76d02a31f98e5698e82a50a5f4d4982fbbae3b", size = 410077, upload-time = "2025-11-16T14:48:51.515Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/80/9af8b640b81fe21e6f718e9dec36c0b5f670332747243130a5490f292245/rpds_py-0.29.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1ea59b23ea931d494459c8338056fe7d93458c0bf3ecc061cd03916505369d55", size = 424548, upload-time = "2025-11-16T14:48:53.237Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/0b/b5647446e991736e6a495ef510e6710df91e880575a586e763baeb0aa770/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f49d41559cebd608042fdcf54ba597a4a7555b49ad5c1c0c03e0af82692661cd", size = 573661, upload-time = "2025-11-16T14:48:54.769Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/b3/1b1c9576839ff583d1428efbf59f9ee70498d8ce6c0b328ac02f1e470879/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:05a2bd42768ea988294ca328206efbcc66e220d2d9b7836ee5712c07ad6340ea", size = 600937, upload-time = "2025-11-16T14:48:56.247Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/7b/b6cfca2f9fee4c4494ce54f7fb1b9f578867495a9aa9fc0d44f5f735c8e0/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:33ca7bdfedd83339ca55da3a5e1527ee5870d4b8369456b5777b197756f3ca22", size = 564496, upload-time = "2025-11-16T14:48:57.691Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/fb/ba29ec7f0f06eb801bac5a23057a9ff7670623b5e8013bd59bec4aa09de8/rpds_py-0.29.0-cp313-cp313-win32.whl", hash = "sha256:20c51ae86a0bb9accc9ad4e6cdeec58d5ebb7f1b09dd4466331fc65e1766aae7", size = 223126, upload-time = "2025-11-16T14:48:59.058Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/6b/0229d3bed4ddaa409e6d90b0ae967ed4380e4bdd0dad6e59b92c17d42457/rpds_py-0.29.0-cp313-cp313-win_amd64.whl", hash = "sha256:6410e66f02803600edb0b1889541f4b5cc298a5ccda0ad789cc50ef23b54813e", size = 239771, upload-time = "2025-11-16T14:49:00.872Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/38/d2868f058b164f8efd89754d85d7b1c08b454f5c07ac2e6cc2e9bd4bd05b/rpds_py-0.29.0-cp313-cp313-win_arm64.whl", hash = "sha256:56838e1cd9174dc23c5691ee29f1d1be9eab357f27efef6bded1328b23e1ced2", size = 229994, upload-time = "2025-11-16T14:49:02.673Z" },
-    { url = "https://files.pythonhosted.org/packages/52/91/5de91c5ec7d41759beec9b251630824dbb8e32d20c3756da1a9a9d309709/rpds_py-0.29.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:37d94eadf764d16b9a04307f2ab1d7af6dc28774bbe0535c9323101e14877b4c", size = 365886, upload-time = "2025-11-16T14:49:04.133Z" },
-    { url = "https://files.pythonhosted.org/packages/85/7c/415d8c1b016d5f47ecec5145d9d6d21002d39dce8761b30f6c88810b455a/rpds_py-0.29.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d472cf73efe5726a067dce63eebe8215b14beabea7c12606fd9994267b3cfe2b", size = 355262, upload-time = "2025-11-16T14:49:05.543Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/14/bf83e2daa4f980e4dc848aed9299792a8b84af95e12541d9e7562f84a6ef/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72fdfd5ff8992e4636621826371e3ac5f3e3b8323e9d0e48378e9c13c3dac9d0", size = 384826, upload-time = "2025-11-16T14:49:07.301Z" },
-    { url = "https://files.pythonhosted.org/packages/33/b8/53330c50a810ae22b4fbba5e6cf961b68b9d72d9bd6780a7c0a79b070857/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2549d833abdf8275c901313b9e8ff8fba57e50f6a495035a2a4e30621a2f7cc4", size = 394234, upload-time = "2025-11-16T14:49:08.782Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/32/01e2e9645cef0e584f518cfde4567563e57db2257244632b603f61b40e50/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4448dad428f28a6a767c3e3b80cde3446a22a0efbddaa2360f4bb4dc836d0688", size = 520008, upload-time = "2025-11-16T14:49:10.253Z" },
-    { url = "https://files.pythonhosted.org/packages/98/c3/0d1b95a81affae2b10f950782e33a1fd2edd6ce2a479966cac98c9a66f57/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:115f48170fd4296a33938d8c11f697f5f26e0472e43d28f35624764173a60e4d", size = 409569, upload-time = "2025-11-16T14:49:12.478Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/60/aa3b8678f3f009f675b99174fa2754302a7fbfe749162e8043d111de2d88/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e5bb73ffc029820f4348e9b66b3027493ae00bca6629129cd433fd7a76308ee", size = 385188, upload-time = "2025-11-16T14:49:13.88Z" },
-    { url = "https://files.pythonhosted.org/packages/92/02/5546c1c8aa89c18d40c1fcffdcc957ba730dee53fb7c3ca3a46f114761d2/rpds_py-0.29.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b1581fcde18fcdf42ea2403a16a6b646f8eb1e58d7f90a0ce693da441f76942e", size = 398587, upload-time = "2025-11-16T14:49:15.339Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/e0/ad6eeaf47e236eba052fa34c4073078b9e092bd44da6bbb35aaae9580669/rpds_py-0.29.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16e9da2bda9eb17ea318b4c335ec9ac1818e88922cbe03a5743ea0da9ecf74fb", size = 416641, upload-time = "2025-11-16T14:49:16.832Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/93/0acedfd50ad9cdd3879c615a6dc8c5f1ce78d2fdf8b87727468bb5bb4077/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:28fd300326dd21198f311534bdb6d7e989dd09b3418b3a91d54a0f384c700967", size = 566683, upload-time = "2025-11-16T14:49:18.342Z" },
-    { url = "https://files.pythonhosted.org/packages/62/53/8c64e0f340a9e801459fc6456821abc15b3582cb5dc3932d48705a9d9ac7/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2aba991e041d031c7939e1358f583ae405a7bf04804ca806b97a5c0e0af1ea5e", size = 592730, upload-time = "2025-11-16T14:49:19.767Z" },
-    { url = "https://files.pythonhosted.org/packages/85/ef/3109b6584f8c4b0d2490747c916df833c127ecfa82be04d9a40a376f2090/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f437026dbbc3f08c99cc41a5b2570c6e1a1ddbe48ab19a9b814254128d4ea7a", size = 557361, upload-time = "2025-11-16T14:49:21.574Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/3b/61586475e82d57f01da2c16edb9115a618afe00ce86fe1b58936880b15af/rpds_py-0.29.0-cp313-cp313t-win32.whl", hash = "sha256:6e97846e9800a5d0fe7be4d008f0c93d0feeb2700da7b1f7528dabafb31dfadb", size = 211227, upload-time = "2025-11-16T14:49:23.03Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/3a/12dc43f13594a54ea0c9d7e9d43002116557330e3ad45bc56097ddf266e2/rpds_py-0.29.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f49196aec7c4b406495f60e6f947ad71f317a765f956d74bbd83996b9edc0352", size = 225248, upload-time = "2025-11-16T14:49:24.841Z" },
-    { url = "https://files.pythonhosted.org/packages/89/b1/0b1474e7899371d9540d3bbb2a499a3427ae1fc39c998563fe9035a1073b/rpds_py-0.29.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:394d27e4453d3b4d82bb85665dc1fcf4b0badc30fc84282defed71643b50e1a1", size = 363731, upload-time = "2025-11-16T14:49:26.683Z" },
-    { url = "https://files.pythonhosted.org/packages/28/12/3b7cf2068d0a334ed1d7b385a9c3c8509f4c2bcba3d4648ea71369de0881/rpds_py-0.29.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:55d827b2ae95425d3be9bc9a5838b6c29d664924f98146557f7715e331d06df8", size = 354343, upload-time = "2025-11-16T14:49:28.24Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/73/5afcf8924bc02a749416eda64e17ac9c9b28f825f4737385295a0e99b0c1/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc31a07ed352e5462d3ee1b22e89285f4ce97d5266f6d1169da1142e78045626", size = 385406, upload-time = "2025-11-16T14:49:29.943Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/37/5db736730662508535221737a21563591b6f43c77f2e388951c42f143242/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c4695dd224212f6105db7ea62197144230b808d6b2bba52238906a2762f1d1e7", size = 396162, upload-time = "2025-11-16T14:49:31.833Z" },
-    { url = "https://files.pythonhosted.org/packages/70/0d/491c1017d14f62ce7bac07c32768d209a50ec567d76d9f383b4cfad19b80/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcae1770b401167f8b9e1e3f566562e6966ffa9ce63639916248a9e25fa8a244", size = 517719, upload-time = "2025-11-16T14:49:33.804Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/25/b11132afcb17cd5d82db173f0c8dab270ffdfaba43e5ce7a591837ae9649/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:90f30d15f45048448b8da21c41703b31c61119c06c216a1bf8c245812a0f0c17", size = 409498, upload-time = "2025-11-16T14:49:35.222Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/7d/e6543cedfb2e6403a1845710a5ab0e0ccf8fc288e0b5af9a70bfe2c12053/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a91e0ab77bdc0004b43261a4b8cd6d6b451e8d443754cfda830002b5745b32", size = 382743, upload-time = "2025-11-16T14:49:36.704Z" },
-    { url = "https://files.pythonhosted.org/packages/75/11/a4ebc9f654293ae9fefb83b2b6be7f3253e85ea42a5db2f77d50ad19aaeb/rpds_py-0.29.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:4aa195e5804d32c682e453b34474f411ca108e4291c6a0f824ebdc30a91c973c", size = 400317, upload-time = "2025-11-16T14:49:39.132Z" },
-    { url = "https://files.pythonhosted.org/packages/52/18/97677a60a81c7f0e5f64e51fb3f8271c5c8fcabf3a2df18e97af53d7c2bf/rpds_py-0.29.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7971bdb7bf4ee0f7e6f67fa4c7fbc6019d9850cc977d126904392d363f6f8318", size = 416979, upload-time = "2025-11-16T14:49:40.575Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/69/28ab391a9968f6c746b2a2db181eaa4d16afaa859fedc9c2f682d19f7e18/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8ae33ad9ce580c7a47452c3b3f7d8a9095ef6208e0a0c7e4e2384f9fc5bf8212", size = 567288, upload-time = "2025-11-16T14:49:42.24Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/d3/0c7afdcdb830eee94f5611b64e71354ffe6ac8df82d00c2faf2bfffd1d4e/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c661132ab2fb4eeede2ef69670fd60da5235209874d001a98f1542f31f2a8a94", size = 593157, upload-time = "2025-11-16T14:49:43.782Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/ac/a0fcbc2feed4241cf26d32268c195eb88ddd4bd862adfc9d4b25edfba535/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bb78b3a0d31ac1bde132c67015a809948db751cb4e92cdb3f0b242e430b6ed0d", size = 554741, upload-time = "2025-11-16T14:49:45.557Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/f1/fcc24137c470df8588674a677f33719d5800ec053aaacd1de8a5d5d84d9e/rpds_py-0.29.0-cp314-cp314-win32.whl", hash = "sha256:f475f103488312e9bd4000bc890a95955a07b2d0b6e8884aef4be56132adbbf1", size = 215508, upload-time = "2025-11-16T14:49:47.562Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/c7/1d169b2045512eac019918fc1021ea07c30e84a4343f9f344e3e0aa8c788/rpds_py-0.29.0-cp314-cp314-win_amd64.whl", hash = "sha256:b9cf2359a4fca87cfb6801fae83a76aedf66ee1254a7a151f1341632acf67f1b", size = 228125, upload-time = "2025-11-16T14:49:49.064Z" },
-    { url = "https://files.pythonhosted.org/packages/be/36/0cec88aaba70ec4a6e381c444b0d916738497d27f0c30406e3d9fcbd3bc2/rpds_py-0.29.0-cp314-cp314-win_arm64.whl", hash = "sha256:9ba8028597e824854f0f1733d8b964e914ae3003b22a10c2c664cb6927e0feb9", size = 221992, upload-time = "2025-11-16T14:49:50.777Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/fa/a2e524631717c9c0eb5d90d30f648cfba6b731047821c994acacb618406c/rpds_py-0.29.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:e71136fd0612556b35c575dc2726ae04a1669e6a6c378f2240312cf5d1a2ab10", size = 366425, upload-time = "2025-11-16T14:49:52.691Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/a4/6d43ebe0746ff694a30233f63f454aed1677bd50ab7a59ff6b2bb5ac61f2/rpds_py-0.29.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:76fe96632d53f3bf0ea31ede2f53bbe3540cc2736d4aec3b3801b0458499ef3a", size = 355282, upload-time = "2025-11-16T14:49:54.292Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/a7/52fd8270e0320b09eaf295766ae81dd175f65394687906709b3e75c71d06/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9459a33f077130dbb2c7c3cea72ee9932271fb3126404ba2a2661e4fe9eb7b79", size = 384968, upload-time = "2025-11-16T14:49:55.857Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/7d/e6bc526b7a14e1ef80579a52c1d4ad39260a058a51d66c6039035d14db9d/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c9546cfdd5d45e562cc0444b6dddc191e625c62e866bf567a2c69487c7ad28a", size = 394714, upload-time = "2025-11-16T14:49:57.343Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/3f/f0ade3954e7db95c791e7eaf978aa7e08a756d2046e8bdd04d08146ed188/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12597d11d97b8f7e376c88929a6e17acb980e234547c92992f9f7c058f1a7310", size = 520136, upload-time = "2025-11-16T14:49:59.162Z" },
-    { url = "https://files.pythonhosted.org/packages/87/b3/07122ead1b97009715ab9d4082be6d9bd9546099b2b03fae37c3116f72be/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28de03cf48b8a9e6ec10318f2197b83946ed91e2891f651a109611be4106ac4b", size = 409250, upload-time = "2025-11-16T14:50:00.698Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/c6/dcbee61fd1dc892aedcb1b489ba661313101aa82ec84b1a015d4c63ebfda/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7951c964069039acc9d67a8ff1f0a7f34845ae180ca542b17dc1456b1f1808", size = 384940, upload-time = "2025-11-16T14:50:02.312Z" },
-    { url = "https://files.pythonhosted.org/packages/47/11/914ecb6f3574cf9bf8b38aced4063e0f787d6e1eb30b181a7efbc6c1da9a/rpds_py-0.29.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:c07d107b7316088f1ac0177a7661ca0c6670d443f6fe72e836069025e6266761", size = 399392, upload-time = "2025-11-16T14:50:03.829Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/fd/2f4bd9433f58f816434bb934313584caa47dbc6f03ce5484df8ac8980561/rpds_py-0.29.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1de2345af363d25696969befc0c1688a6cb5e8b1d32b515ef84fc245c6cddba3", size = 416796, upload-time = "2025-11-16T14:50:05.558Z" },
-    { url = "https://files.pythonhosted.org/packages/79/a5/449f0281af33efa29d5c71014399d74842342ae908d8cd38260320167692/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:00e56b12d2199ca96068057e1ae7f9998ab6e99cda82431afafd32f3ec98cca9", size = 566843, upload-time = "2025-11-16T14:50:07.243Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/32/0a6a1ccee2e37fcb1b7ba9afde762b77182dbb57937352a729c6cd3cf2bb/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3919a3bbecee589300ed25000b6944174e07cd20db70552159207b3f4bbb45b8", size = 593956, upload-time = "2025-11-16T14:50:09.029Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/3d/eb820f95dce4306f07a495ede02fb61bef36ea201d9137d4fcd5ab94ec1e/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e7fa2ccc312bbd91e43aa5e0869e46bc03278a3dddb8d58833150a18b0f0283a", size = 557288, upload-time = "2025-11-16T14:50:10.73Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/f8/b8ff786f40470462a252918e0836e0db903c28e88e3eec66bc4a7856ee5d/rpds_py-0.29.0-cp314-cp314t-win32.whl", hash = "sha256:97c817863ffc397f1e6a6e9d2d89fe5408c0a9922dac0329672fb0f35c867ea5", size = 211382, upload-time = "2025-11-16T14:50:12.827Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/7f/1a65ae870bc9d0576aebb0c501ea5dccf1ae2178fe2821042150ebd2e707/rpds_py-0.29.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2023473f444752f0f82a58dfcbee040d0a1b3d1b3c2ec40e884bd25db6d117d2", size = 225919, upload-time = "2025-11-16T14:50:14.734Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/ac/b97e80bf107159e5b9ba9c91df1ab95f69e5e41b435f27bdd737f0d583ac/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:acd82a9e39082dc5f4492d15a6b6c8599aa21db5c35aaf7d6889aea16502c07d", size = 373963, upload-time = "2025-11-16T14:50:16.205Z" },
-    { url = "https://files.pythonhosted.org/packages/40/5a/55e72962d5d29bd912f40c594e68880d3c7a52774b0f75542775f9250712/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:715b67eac317bf1c7657508170a3e011a1ea6ccb1c9d5f296e20ba14196be6b3", size = 364644, upload-time = "2025-11-16T14:50:18.22Z" },
-    { url = "https://files.pythonhosted.org/packages/99/2a/6b6524d0191b7fc1351c3c0840baac42250515afb48ae40c7ed15499a6a2/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b1b87a237cb2dba4db18bcfaaa44ba4cd5936b91121b62292ff21df577fc43", size = 393847, upload-time = "2025-11-16T14:50:20.012Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/b8/c5692a7df577b3c0c7faed7ac01ee3c608b81750fc5d89f84529229b6873/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c3c3e8101bb06e337c88eb0c0ede3187131f19d97d43ea0e1c5407ea74c0cbf", size = 407281, upload-time = "2025-11-16T14:50:21.64Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/57/0546c6f84031b7ea08b76646a8e33e45607cc6bd879ff1917dc077bb881e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8e54d6e61f3ecd3abe032065ce83ea63417a24f437e4a3d73d2f85ce7b7cfe", size = 529213, upload-time = "2025-11-16T14:50:23.219Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/c1/01dd5f444233605555bc11fe5fed6a5c18f379f02013870c176c8e630a23/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3fbd4e9aebf110473a420dea85a238b254cf8a15acb04b22a5a6b5ce8925b760", size = 413808, upload-time = "2025-11-16T14:50:25.262Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/0a/60f98b06156ea2a7af849fb148e00fbcfdb540909a5174a5ed10c93745c7/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fdf53d36e6c72819993e35d1ebeeb8e8fc688d0c6c2b391b55e335b3afba5a", size = 394600, upload-time = "2025-11-16T14:50:26.956Z" },
-    { url = "https://files.pythonhosted.org/packages/37/f1/dc9312fc9bec040ece08396429f2bd9e0977924ba7a11c5ad7056428465e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:ea7173df5d86f625f8dde6d5929629ad811ed8decda3b60ae603903839ac9ac0", size = 408634, upload-time = "2025-11-16T14:50:28.989Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/41/65024c9fd40c89bb7d604cf73beda4cbdbcebe92d8765345dd65855b6449/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:76054d540061eda273274f3d13a21a4abdde90e13eaefdc205db37c05230efce", size = 426064, upload-time = "2025-11-16T14:50:30.674Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/e0/cf95478881fc88ca2fdbf56381d7df36567cccc39a05394beac72182cd62/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:9f84c549746a5be3bc7415830747a3a0312573afc9f95785eb35228bb17742ec", size = 575871, upload-time = "2025-11-16T14:50:33.428Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/c0/df88097e64339a0218b57bd5f9ca49898e4c394db756c67fccc64add850a/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:0ea962671af5cb9a260489e311fa22b2e97103e3f9f0caaea6f81390af96a9ed", size = 601702, upload-time = "2025-11-16T14:50:36.051Z" },
-    { url = "https://files.pythonhosted.org/packages/87/f4/09ffb3ebd0cbb9e2c7c9b84d252557ecf434cd71584ee1e32f66013824df/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:f7728653900035fb7b8d06e1e5900545d8088efc9d5d4545782da7df03ec803f", size = 564054, upload-time = "2025-11-16T14:50:37.733Z" },
+version = "0.30.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" },
+    { url = "https://files.pythonhosted.org/packages/19/6a/4ba3d0fb7297ebae71171822554abe48d7cab29c28b8f9f2c04b79988c05/rpds_py-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4cc2206b76b4f576934f0ed374b10d7ca5f457858b157ca52064bdfc26b9fc00", size = 359751, upload-time = "2025-11-30T20:21:34.591Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/7c/e4933565ef7f7a0818985d87c15d9d273f1a649afa6a52ea35ad011195ea/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:389a2d49eded1896c3d48b0136ead37c48e221b391c052fba3f4055c367f60a6", size = 389696, upload-time = "2025-11-30T20:21:36.122Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/01/6271a2511ad0815f00f7ed4390cf2567bec1d4b1da39e2c27a41e6e3b4de/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:32c8528634e1bf7121f3de08fa85b138f4e0dc47657866630611b03967f041d7", size = 403136, upload-time = "2025-11-30T20:21:37.728Z" },
+    { url = "https://files.pythonhosted.org/packages/55/64/c857eb7cd7541e9b4eee9d49c196e833128a55b89a9850a9c9ac33ccf897/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f207f69853edd6f6700b86efb84999651baf3789e78a466431df1331608e5324", size = 524699, upload-time = "2025-11-30T20:21:38.92Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/ed/94816543404078af9ab26159c44f9e98e20fe47e2126d5d32c9d9948d10a/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:67b02ec25ba7a9e8fa74c63b6ca44cf5707f2fbfadae3ee8e7494297d56aa9df", size = 412022, upload-time = "2025-11-30T20:21:40.407Z" },
+    { url = "https://files.pythonhosted.org/packages/61/b5/707f6cf0066a6412aacc11d17920ea2e19e5b2f04081c64526eb35b5c6e7/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0e95f6819a19965ff420f65578bacb0b00f251fefe2c8b23347c37174271f3", size = 390522, upload-time = "2025-11-30T20:21:42.17Z" },
+    { url = "https://files.pythonhosted.org/packages/13/4e/57a85fda37a229ff4226f8cbcf09f2a455d1ed20e802ce5b2b4a7f5ed053/rpds_py-0.30.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a452763cc5198f2f98898eb98f7569649fe5da666c2dc6b5ddb10fde5a574221", size = 404579, upload-time = "2025-11-30T20:21:43.769Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/da/c9339293513ec680a721e0e16bf2bac3db6e5d7e922488de471308349bba/rpds_py-0.30.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e0b65193a413ccc930671c55153a03ee57cecb49e6227204b04fae512eb657a7", size = 421305, upload-time = "2025-11-30T20:21:44.994Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/be/522cb84751114f4ad9d822ff5a1aa3c98006341895d5f084779b99596e5c/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:858738e9c32147f78b3ac24dc0edb6610000e56dc0f700fd5f651d0a0f0eb9ff", size = 572503, upload-time = "2025-11-30T20:21:46.91Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/9b/de879f7e7ceddc973ea6e4629e9b380213a6938a249e94b0cdbcc325bb66/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:da279aa314f00acbb803da1e76fa18666778e8a8f83484fba94526da5de2cba7", size = 598322, upload-time = "2025-11-30T20:21:48.709Z" },
+    { url = "https://files.pythonhosted.org/packages/48/ac/f01fc22efec3f37d8a914fc1b2fb9bcafd56a299edbe96406f3053edea5a/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7c64d38fb49b6cdeda16ab49e35fe0da2e1e9b34bc38bd78386530f218b37139", size = 560792, upload-time = "2025-11-30T20:21:50.024Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/da/4e2b19d0f131f35b6146425f846563d0ce036763e38913d917187307a671/rpds_py-0.30.0-cp310-cp310-win32.whl", hash = "sha256:6de2a32a1665b93233cde140ff8b3467bdb9e2af2b91079f0333a0974d12d464", size = 221901, upload-time = "2025-11-30T20:21:51.32Z" },
+    { url = "https://files.pythonhosted.org/packages/96/cb/156d7a5cf4f78a7cc571465d8aec7a3c447c94f6749c5123f08438bcf7bc/rpds_py-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:1726859cd0de969f88dc8673bdd954185b9104e05806be64bcd87badbe313169", size = 235823, upload-time = "2025-11-30T20:21:52.505Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" },
+    { url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" },
+    { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" },
+    { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" },
+    { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" },
+    { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" },
+    { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" },
+    { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" },
+    { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" },
+    { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" },
+    { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" },
+    { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" },
+    { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" },
+    { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" },
+    { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" },
+    { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" },
+    { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" },
+    { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" },
+    { url = "https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" },
+    { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" },
+    { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" },
+    { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" },
+    { url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" },
+    { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = "2025-11-30T20:23:07.825Z" },
+    { url = "https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" },
+    { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" },
+    { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = "2025-11-30T20:23:25.908Z" },
+    { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" },
+    { url = "https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" },
+    { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" },
+    { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" },
+    { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, upload-time = "2025-11-30T20:23:44.086Z" },
+    { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" },
+    { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" },
+    { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" },
+    { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = "2025-11-30T20:24:01.759Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" },
+    { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" },
+    { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" },
+    { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" },
+    { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" },
+    { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" },
+    { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" },
+    { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" },
+    { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" },
+    { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" },
+]
+
+[[package]]
+name = "rsa"
+version = "4.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pyasn1" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
 ]
 
 [[package]]
@@ -4839,16 +5845,53 @@ wheels = [
 name = "s3fs"
 version = "2025.10.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "aiobotocore" },
-    { name = "aiohttp" },
-    { name = "fsspec" },
+    { name = "aiobotocore", version = "2.26.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "aiohttp", marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/bb/ee/7cf7de3b17ef6db10b027cc9f8a1108ceb6333e267943e666a35882b1474/s3fs-2025.10.0.tar.gz", hash = "sha256:e8be6cddc77aceea1681ece0f472c3a7f8ef71a0d2acddb1cc92bb6afa3e9e4f", size = 80383, upload-time = "2025-10-30T15:06:04.647Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/2d/fc/56cba14af8ad8fd020c85b6e44328520ac55939bb1f9d01444ad470504cb/s3fs-2025.10.0-py3-none-any.whl", hash = "sha256:da7ef25efc1541f5fca8e1116361e49ea1081f83f4e8001fbd77347c625da28a", size = 30357, upload-time = "2025-10-30T15:06:03.48Z" },
 ]
 
+[[package]]
+name = "s3fs"
+version = "2026.2.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "aiobotocore", version = "3.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "aiohttp", marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fa/be/392c8c5e0da9bfa139e41084690dd49a5e3e931099f78f52d3f6070105c6/s3fs-2026.2.0.tar.gz", hash = "sha256:91cb2a9f76e35643b76eeac3f47a6165172bb3def671f76b9111c8dd5779a2ac", size = 84152, upload-time = "2026-02-05T21:57:57.968Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/57/e1/64c264db50b68de8a438b60ceeb921b2f22da3ebb7ad6255150225d0beac/s3fs-2026.2.0-py3-none-any.whl", hash = "sha256:65198835b86b1d5771112b0085d1da52a6ede36508b1aaa6cae2aedc765dfe10", size = 31328, upload-time = "2026-02-05T21:57:56.532Z" },
+]
+
 [[package]]
 name = "safetensors"
 version = "0.7.0"
@@ -4937,83 +5980,91 @@ wheels = [
 
 [[package]]
 name = "scipy"
-version = "1.16.3"
+version = "1.17.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9b/5f/6f37d7439de1455ce9c5a556b8d1db0979f03a796c030bafdf08d35b7bf9/scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97", size = 36630881, upload-time = "2025-10-28T17:31:47.104Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/89/d70e9f628749b7e4db2aa4cd89735502ff3f08f7b9b27d2e799485987cd9/scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511", size = 28941012, upload-time = "2025-10-28T17:31:53.411Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/a8/0e7a9a6872a923505dbdf6bb93451edcac120363131c19013044a1e7cb0c/scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005", size = 20931935, upload-time = "2025-10-28T17:31:57.361Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/c7/020fb72bd79ad798e4dbe53938543ecb96b3a9ac3fe274b7189e23e27353/scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb", size = 23534466, upload-time = "2025-10-28T17:32:01.875Z" },
-    { url = "https://files.pythonhosted.org/packages/be/a0/668c4609ce6dbf2f948e167836ccaf897f95fb63fa231c87da7558a374cd/scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876", size = 33593618, upload-time = "2025-10-28T17:32:06.902Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/6e/8942461cf2636cdae083e3eb72622a7fbbfa5cf559c7d13ab250a5dbdc01/scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2", size = 35899798, upload-time = "2025-10-28T17:32:12.665Z" },
-    { url = "https://files.pythonhosted.org/packages/79/e8/d0f33590364cdbd67f28ce79368b373889faa4ee959588beddf6daef9abe/scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e", size = 36226154, upload-time = "2025-10-28T17:32:17.961Z" },
-    { url = "https://files.pythonhosted.org/packages/39/c1/1903de608c0c924a1749c590064e65810f8046e437aba6be365abc4f7557/scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733", size = 38878540, upload-time = "2025-10-28T17:32:23.907Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/d0/22ec7036ba0b0a35bccb7f25ab407382ed34af0b111475eb301c16f8a2e5/scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78", size = 38722107, upload-time = "2025-10-28T17:32:29.921Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/60/8a00e5a524bb3bf8898db1650d350f50e6cffb9d7a491c561dc9826c7515/scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184", size = 25506272, upload-time = "2025-10-28T17:32:34.577Z" },
-    { url = "https://files.pythonhosted.org/packages/40/41/5bf55c3f386b1643812f3a5674edf74b26184378ef0f3e7c7a09a7e2ca7f/scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6", size = 36659043, upload-time = "2025-10-28T17:32:40.285Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/0f/65582071948cfc45d43e9870bf7ca5f0e0684e165d7c9ef4e50d783073eb/scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07", size = 28898986, upload-time = "2025-10-28T17:32:45.325Z" },
-    { url = "https://files.pythonhosted.org/packages/96/5e/36bf3f0ac298187d1ceadde9051177d6a4fe4d507e8f59067dc9dd39e650/scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9", size = 20889814, upload-time = "2025-10-28T17:32:49.277Z" },
-    { url = "https://files.pythonhosted.org/packages/80/35/178d9d0c35394d5d5211bbff7ac4f2986c5488b59506fef9e1de13ea28d3/scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686", size = 23565795, upload-time = "2025-10-28T17:32:53.337Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/46/d1146ff536d034d02f83c8afc3c4bab2eddb634624d6529a8512f3afc9da/scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203", size = 33349476, upload-time = "2025-10-28T17:32:58.353Z" },
-    { url = "https://files.pythonhosted.org/packages/79/2e/415119c9ab3e62249e18c2b082c07aff907a273741b3f8160414b0e9193c/scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1", size = 35676692, upload-time = "2025-10-28T17:33:03.88Z" },
-    { url = "https://files.pythonhosted.org/packages/27/82/df26e44da78bf8d2aeaf7566082260cfa15955a5a6e96e6a29935b64132f/scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe", size = 36019345, upload-time = "2025-10-28T17:33:09.773Z" },
-    { url = "https://files.pythonhosted.org/packages/82/31/006cbb4b648ba379a95c87262c2855cd0d09453e500937f78b30f02fa1cd/scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70", size = 38678975, upload-time = "2025-10-28T17:33:15.809Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/7f/acbd28c97e990b421af7d6d6cd416358c9c293fc958b8529e0bd5d2a2a19/scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc", size = 38555926, upload-time = "2025-10-28T17:33:21.388Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/69/c5c7807fd007dad4f48e0a5f2153038dc96e8725d3345b9ee31b2b7bed46/scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2", size = 25463014, upload-time = "2025-10-28T17:33:25.975Z" },
-    { url = "https://files.pythonhosted.org/packages/72/f1/57e8327ab1508272029e27eeef34f2302ffc156b69e7e233e906c2a5c379/scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c", size = 36617856, upload-time = "2025-10-28T17:33:31.375Z" },
-    { url = "https://files.pythonhosted.org/packages/44/13/7e63cfba8a7452eb756306aa2fd9b37a29a323b672b964b4fdeded9a3f21/scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d", size = 28874306, upload-time = "2025-10-28T17:33:36.516Z" },
-    { url = "https://files.pythonhosted.org/packages/15/65/3a9400efd0228a176e6ec3454b1fa998fbbb5a8defa1672c3f65706987db/scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9", size = 20865371, upload-time = "2025-10-28T17:33:42.094Z" },
-    { url = "https://files.pythonhosted.org/packages/33/d7/eda09adf009a9fb81827194d4dd02d2e4bc752cef16737cc4ef065234031/scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4", size = 23524877, upload-time = "2025-10-28T17:33:48.483Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/6b/3f911e1ebc364cb81320223a3422aab7d26c9c7973109a9cd0f27c64c6c0/scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959", size = 33342103, upload-time = "2025-10-28T17:33:56.495Z" },
-    { url = "https://files.pythonhosted.org/packages/21/f6/4bfb5695d8941e5c570a04d9fcd0d36bce7511b7d78e6e75c8f9791f82d0/scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88", size = 35697297, upload-time = "2025-10-28T17:34:04.722Z" },
-    { url = "https://files.pythonhosted.org/packages/04/e1/6496dadbc80d8d896ff72511ecfe2316b50313bfc3ebf07a3f580f08bd8c/scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234", size = 36021756, upload-time = "2025-10-28T17:34:13.482Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/bd/a8c7799e0136b987bda3e1b23d155bcb31aec68a4a472554df5f0937eef7/scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d", size = 38696566, upload-time = "2025-10-28T17:34:22.384Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/01/1204382461fcbfeb05b6161b594f4007e78b6eba9b375382f79153172b4d/scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304", size = 38529877, upload-time = "2025-10-28T17:35:51.076Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/14/9d9fbcaa1260a94f4bb5b64ba9213ceb5d03cd88841fe9fd1ffd47a45b73/scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2", size = 25455366, upload-time = "2025-10-28T17:35:59.014Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/a3/9ec205bd49f42d45d77f1730dbad9ccf146244c1647605cf834b3a8c4f36/scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b", size = 37027931, upload-time = "2025-10-28T17:34:31.451Z" },
-    { url = "https://files.pythonhosted.org/packages/25/06/ca9fd1f3a4589cbd825b1447e5db3a8ebb969c1eaf22c8579bd286f51b6d/scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079", size = 29400081, upload-time = "2025-10-28T17:34:39.087Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/56/933e68210d92657d93fb0e381683bc0e53a965048d7358ff5fbf9e6a1b17/scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a", size = 21391244, upload-time = "2025-10-28T17:34:45.234Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/7e/779845db03dc1418e215726329674b40576879b91814568757ff0014ad65/scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119", size = 23929753, upload-time = "2025-10-28T17:34:51.793Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/4b/f756cf8161d5365dcdef9e5f460ab226c068211030a175d2fc7f3f41ca64/scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c", size = 33496912, upload-time = "2025-10-28T17:34:59.8Z" },
-    { url = "https://files.pythonhosted.org/packages/09/b5/222b1e49a58668f23839ca1542a6322bb095ab8d6590d4f71723869a6c2c/scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e", size = 35802371, upload-time = "2025-10-28T17:35:08.173Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/8d/5964ef68bb31829bde27611f8c9deeac13764589fe74a75390242b64ca44/scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135", size = 36190477, upload-time = "2025-10-28T17:35:16.7Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/f2/b31d75cb9b5fa4dd39a0a931ee9b33e7f6f36f23be5ef560bf72e0f92f32/scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6", size = 38796678, upload-time = "2025-10-28T17:35:26.354Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/1e/b3723d8ff64ab548c38d87055483714fefe6ee20e0189b62352b5e015bb1/scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc", size = 38640178, upload-time = "2025-10-28T17:35:35.304Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/f3/d854ff38789aca9b0cc23008d607ced9de4f7ab14fa1ca4329f86b3758ca/scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a", size = 25803246, upload-time = "2025-10-28T17:35:42.155Z" },
-    { url = "https://files.pythonhosted.org/packages/99/f6/99b10fd70f2d864c1e29a28bbcaa0c6340f9d8518396542d9ea3b4aaae15/scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6", size = 36606469, upload-time = "2025-10-28T17:36:08.741Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/74/043b54f2319f48ea940dd025779fa28ee360e6b95acb7cd188fad4391c6b/scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657", size = 28872043, upload-time = "2025-10-28T17:36:16.599Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/e1/24b7e50cc1c4ee6ffbcb1f27fe9f4c8b40e7911675f6d2d20955f41c6348/scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26", size = 20862952, upload-time = "2025-10-28T17:36:22.966Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/3a/3e8c01a4d742b730df368e063787c6808597ccb38636ed821d10b39ca51b/scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc", size = 23508512, upload-time = "2025-10-28T17:36:29.731Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/60/c45a12b98ad591536bfe5330cb3cfe1850d7570259303563b1721564d458/scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22", size = 33413639, upload-time = "2025-10-28T17:36:37.982Z" },
-    { url = "https://files.pythonhosted.org/packages/71/bc/35957d88645476307e4839712642896689df442f3e53b0fa016ecf8a3357/scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc", size = 35704729, upload-time = "2025-10-28T17:36:46.547Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/15/89105e659041b1ca11c386e9995aefacd513a78493656e57789f9d9eab61/scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0", size = 36086251, upload-time = "2025-10-28T17:36:55.161Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/87/c0ea673ac9c6cc50b3da2196d860273bc7389aa69b64efa8493bdd25b093/scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800", size = 38716681, upload-time = "2025-10-28T17:37:04.1Z" },
-    { url = "https://files.pythonhosted.org/packages/91/06/837893227b043fb9b0d13e4bd7586982d8136cb249ffb3492930dab905b8/scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d", size = 39358423, upload-time = "2025-10-28T17:38:20.005Z" },
-    { url = "https://files.pythonhosted.org/packages/95/03/28bce0355e4d34a7c034727505a02d19548549e190bedd13a721e35380b7/scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f", size = 26135027, upload-time = "2025-10-28T17:38:24.966Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/6f/69f1e2b682efe9de8fe9f91040f0cd32f13cfccba690512ba4c582b0bc29/scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c", size = 37028379, upload-time = "2025-10-28T17:37:14.061Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/2d/e826f31624a5ebbab1cd93d30fd74349914753076ed0593e1d56a98c4fb4/scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40", size = 29400052, upload-time = "2025-10-28T17:37:21.709Z" },
-    { url = "https://files.pythonhosted.org/packages/69/27/d24feb80155f41fd1f156bf144e7e049b4e2b9dd06261a242905e3bc7a03/scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d", size = 21391183, upload-time = "2025-10-28T17:37:29.559Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/d3/1b229e433074c5738a24277eca520a2319aac7465eea7310ea6ae0e98ae2/scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa", size = 23930174, upload-time = "2025-10-28T17:37:36.306Z" },
-    { url = "https://files.pythonhosted.org/packages/16/9d/d9e148b0ec680c0f042581a2be79a28a7ab66c0c4946697f9e7553ead337/scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8", size = 33497852, upload-time = "2025-10-28T17:37:42.228Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/22/4e5f7561e4f98b7bea63cf3fd7934bff1e3182e9f1626b089a679914d5c8/scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353", size = 35798595, upload-time = "2025-10-28T17:37:48.102Z" },
-    { url = "https://files.pythonhosted.org/packages/83/42/6644d714c179429fc7196857866f219fef25238319b650bb32dde7bf7a48/scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146", size = 36186269, upload-time = "2025-10-28T17:37:53.72Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/70/64b4d7ca92f9cf2e6fc6aaa2eecf80bb9b6b985043a9583f32f8177ea122/scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d", size = 38802779, upload-time = "2025-10-28T17:37:59.393Z" },
-    { url = "https://files.pythonhosted.org/packages/61/82/8d0e39f62764cce5ffd5284131e109f07cf8955aef9ab8ed4e3aa5e30539/scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7", size = 39471128, upload-time = "2025-10-28T17:38:05.259Z" },
-    { url = "https://files.pythonhosted.org/packages/64/47/a494741db7280eae6dc033510c319e34d42dd41b7ac0c7ead39354d1a2b5/scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562", size = 26464127, upload-time = "2025-10-28T17:38:11.34Z" },
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" },
+    { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" },
+    { url = "https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" },
+    { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" },
+    { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" },
+    { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017, upload-time = "2026-02-23T00:18:21.502Z" },
+    { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" },
+    { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" },
+    { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" },
+    { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" },
+    { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" },
+    { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" },
+    { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" },
+    { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" },
+    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" },
+    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" },
+    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" },
+    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" },
+    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" },
+    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" },
+    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
 ]
 
 [[package]]
@@ -5082,15 +6133,15 @@ wheels = [
 
 [[package]]
 name = "sentry-sdk"
-version = "2.46.0"
+version = "2.53.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "certifi" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/7c/d7/c140a5837649e2bf2ec758494fde1d9a016c76777eab64e75ef38d685bbb/sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91", size = 374761, upload-time = "2025-11-24T09:34:13.932Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/06/66c8b705179bc54087845f28fd1b72f83751b6e9a195628e2e9af9926505/sentry_sdk-2.53.0.tar.gz", hash = "sha256:6520ef2c4acd823f28efc55e43eb6ce2e6d9f954a95a3aa96b6fd14871e92b77", size = 412369, upload-time = "2026-02-16T11:11:14.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/b6/ce7c502a366f4835b1f9c057753f6989a92d3c70cbadb168193f5fb7499b/sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1", size = 406266, upload-time = "2025-11-24T09:34:12.114Z" },
+    { url = "https://files.pythonhosted.org/packages/47/d4/2fdf854bc3b9c7f55219678f812600a20a138af2dd847d99004994eada8f/sentry_sdk-2.53.0-py2.py3-none-any.whl", hash = "sha256:46e1ed8d84355ae54406c924f6b290c3d61f4048625989a723fd622aab838899", size = 437908, upload-time = "2026-02-16T11:11:13.227Z" },
 ]
 
 [[package]]
@@ -5122,11 +6173,24 @@ wheels = [
 
 [[package]]
 name = "slack-sdk"
-version = "3.39.0"
+version = "3.40.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3a/18/784859b33a3f9c8cdaa1eda4115eb9fe72a0a37304718887d12991eeb2fd/slack_sdk-3.40.1.tar.gz", hash = "sha256:a215333bc251bc90abf5f5110899497bf61a3b5184b6d9ee35d73ebf09ec3fd0", size = 250379, upload-time = "2026-02-18T22:11:01.819Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6e/e1/bb81f93c9f403e3b573c429dd4838ec9b44e4ef35f3b0759eb49557ab6e3/slack_sdk-3.40.1-py2.py3-none-any.whl", hash = "sha256:cd8902252979aa248092b0d77f3a9ea3cc605bc5d53663ad728e892e26e14a65", size = 313687, upload-time = "2026-02-18T22:11:00.027Z" },
+]
+
+[[package]]
+name = "smart-open"
+version = "7.5.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/b6/dd/645f3eb93fce38eadbb649e85684730b1fc3906c2674ca59bddc2ca2bd2e/slack_sdk-3.39.0.tar.gz", hash = "sha256:6a56be10dc155c436ff658c6b776e1c082e29eae6a771fccf8b0a235822bbcb1", size = 247207, upload-time = "2025-11-20T15:27:57.556Z" }
+dependencies = [
+    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "wrapt", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e8/be/a66598b305763861a9ab15ff0f2fbc44e47b1ce7a776797337a4eef37c66/smart_open-7.5.1.tar.gz", hash = "sha256:3f08e16827c4733699e6b2cc40328a3568f900cb12ad9a3ad233ba6c872d9fe7", size = 54034, upload-time = "2026-02-23T11:01:28.979Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl", hash = "sha256:b1556b2f5b8b12b94e5ea3f56c4f2c7f04462e4e1013d325c5764ff118044fa8", size = 309850, upload-time = "2025-11-20T15:27:55.729Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/ea/dcdecd68acebb49d3fd560473a43499b1635076f7f1ae8641c060fe7ce74/smart_open-7.5.1-py3-none-any.whl", hash = "sha256:3e07cbbd9c8a908bcb8e25d48becf1a5cbb4886fa975e9f34c672ed171df2318", size = 64108, upload-time = "2026-02-23T11:01:27.429Z" },
 ]
 
 [[package]]
@@ -5172,7 +6236,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
 wheels = [
@@ -5187,11 +6251,11 @@ wheels = [
 
 [[package]]
 name = "soupsieve"
-version = "2.8"
+version = "2.8.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
+    { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" },
 ]
 
 [[package]]
@@ -5206,7 +6270,7 @@ dependencies = [
     { name = "alabaster", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "babel", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "colorama", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "docutils", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "imagesize", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "jinja2", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
@@ -5228,40 +6292,78 @@ wheels = [
 
 [[package]]
 name = "sphinx"
-version = "8.2.3"
+version = "9.0.4"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
 ]
 dependencies = [
-    { name = "alabaster", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "babel", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "colorama", marker = "(python_full_version >= '3.11' and sys_platform == 'win32') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "docutils", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "imagesize", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "jinja2", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pygments", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "requests", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "roman-numerals-py", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "snowballstemmer", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "alabaster", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "babel", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "(python_full_version == '3.11.*' and sys_platform == 'win32') or (python_full_version != '3.11.*' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "imagesize", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pygments", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "requests", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "roman-numerals", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "snowballstemmer", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/38/ad/4360e50ed56cb483667b8e6dadf2d3fda62359593faabbe749a27c4eaca6/sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348", size = 8321876, upload-time = "2025-03-02T22:31:59.658Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3", size = 3589741, upload-time = "2025-03-02T22:31:56.836Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/3f/4bbd76424c393caead2e1eb89777f575dee5c8653e2d4b6afd7a564f5974/sphinx-9.0.4-py3-none-any.whl", hash = "sha256:5bebc595a5e943ea248b99c13814c1c5e10b3ece718976824ffa7959ff95fffb", size = 3917713, upload-time = "2025-12-04T07:45:24.944Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.1.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "alabaster", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "babel", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "(python_full_version >= '3.12' and sys_platform == 'win32') or (python_full_version < '3.12' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "imagesize", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pygments", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "requests", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "roman-numerals", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "snowballstemmer", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl", hash = "sha256:c84fdd4e782504495fe4f2c0b3413d6c2bf388589bb352d439b2a3bb99991978", size = 3921742, upload-time = "2025-12-31T15:09:25.561Z" },
 ]
 
 [[package]]
@@ -5293,15 +6395,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "starlette", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "uvicorn", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "watchfiles", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
@@ -5332,7 +6443,8 @@ version = "0.5.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fc/2b/a964715e7f5295f77509e59309959f4125122d648f86b4fe7d70ca1d882c/sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd", size = 23039, upload-time = "2023-04-14T08:10:22.998Z" }
 wheels = [
@@ -5395,15 +6507,15 @@ wheels = [
 
 [[package]]
 name = "starlette"
-version = "0.50.0"
+version = "0.52.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c4/68/79977123bb7be889ad680d79a40f339082c1978b5cfcf62c2d8d196873ac/starlette-0.52.1.tar.gz", hash = "sha256:834edd1b0a23167694292e94f597773bc3f89f362be6effee198165a35d62933", size = 2653702, upload-time = "2026-01-18T13:34:11.062Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" },
+    { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" },
 ]
 
 [[package]]
@@ -5445,7 +6557,7 @@ dependencies = [
     { name = "grpcio" },
     { name = "markdown" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "pillow" },
     { name = "protobuf" },
@@ -5505,44 +6617,56 @@ wheels = [
 
 [[package]]
 name = "tensorstore"
-version = "0.1.79"
+version = "0.1.81"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
 ]
 dependencies = [
     { name = "ml-dtypes", marker = "python_full_version >= '3.11'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/26/2c/50ab489a0862ca88d2d766130a6fec45ccd5174f0e04081d8b7b07a8aedd/tensorstore-0.1.79.tar.gz", hash = "sha256:8dad44a8a7f2952a5d0030a8bd868b3cfdff048bd40ab53e7226f3d8b0881c5e", size = 7075782, upload-time = "2025-11-11T22:05:23.824Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/68/a9/1695d7ea197c4568c2f02f34b203eef702ec8080422331f00a65c6fb2a37/tensorstore-0.1.79-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:11a2c62694ea9c21770bc5a09938d3d15c4b9662b738ae6e1e513c26ed96251a", size = 16466511, upload-time = "2025-11-11T22:04:18.614Z" },
-    { url = "https://files.pythonhosted.org/packages/db/0e/5ce8a615c7f9ad7cf8ed4ac6e182fe0ef46fd06fef89757e49ba84a6ba9e/tensorstore-0.1.79-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e152d334bf34fbabdfe8e5bc35b87d1f9947065924ff83c29e659308b36e948", size = 14499810, upload-time = "2025-11-11T22:04:21.725Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/29/2cb9552138fe84ab29421489121350e4af0502eafff31ccd9017490be0d8/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4230b8fd29795e88e441f749d881973eca8dadf33c5262b367839fb8891f79b", size = 18937510, upload-time = "2025-11-11T22:04:24.221Z" },
-    { url = "https://files.pythonhosted.org/packages/42/70/d2a672a93faebdd176cd8541405cd5614b14d3d8dc812fbeaf2cf46d390a/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83072ee0e551d6dca582e154b64c8b8066d276ec0759784e3149c28212a61f18", size = 20910324, upload-time = "2025-11-11T22:04:26.769Z" },
-    { url = "https://files.pythonhosted.org/packages/91/d5/7958cbfb614c4ffa5070ae9575874d46937067c0d81a7739e67fb1d62de5/tensorstore-0.1.79-cp311-cp311-win_amd64.whl", hash = "sha256:6c98c6b74c00e00eba7969292144e471d5c45d67088f0dc08e3a4c60a15ee191", size = 13206191, upload-time = "2025-11-11T22:04:29.254Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/a2/a77be16b4a882ace36da0748305795f35306bdad568472f208bd89b96b9d/tensorstore-0.1.79-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:71aa9b45436d888c37b965f7b71195916d15438119b7dccb66a3b0776bfba367", size = 16485740, upload-time = "2025-11-11T22:04:33.478Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/e4/7fe268ec41aa70b71a1c56b1ec83346fbcbf12f4bfbefc79d14fb9c03408/tensorstore-0.1.79-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:108c0e867aa2c87d4982cc6325a2de0c4f5bd63c2bea18adb193a370c40594ce", size = 14508736, upload-time = "2025-11-11T22:04:38.613Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/f1/b1248dae02598ce534834413e841f915a32ab185c36ecd05e4c67bdc8d19/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:debd435042c00be68ba1fb3cf59325a7babb3f4a3cf4744c87dde346802cbbb4", size = 18947817, upload-time = "2025-11-11T22:04:40.768Z" },
-    { url = "https://files.pythonhosted.org/packages/87/4a/60e234147570e21bbab4ac70ab79dd794a5ef9a4945d36c34c1914a73205/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:608f7178ec6e4e4a3c26545b0a44f44bf83438d04bf2d960cd0e7699eaa99ef6", size = 20929832, upload-time = "2025-11-11T22:04:43.613Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/48/0531868bce12a2f520002e810d4200ec6f01ba33a2f27b6bd7289fbc197b/tensorstore-0.1.79-cp312-cp312-win_amd64.whl", hash = "sha256:a071c6c255b7e412957a6aa563bc4250242c7894edad06ae6358e3d30b7d88ce", size = 13211970, upload-time = "2025-11-11T22:04:46.179Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/0b/54a44e55836d8e8f576343134c0e3db71c6c837d39a0ac44699aba5b01df/tensorstore-0.1.79-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:1e8e2d098829919caac6a62cf568902e34789069ceddb28497d6e36ebcb95c0b", size = 16485855, upload-time = "2025-11-11T22:04:48.734Z" },
-    { url = "https://files.pythonhosted.org/packages/04/59/cadb9a45896d480882476df4759cda1659c70669aff87a4d5a4a07ded084/tensorstore-0.1.79-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:29cf4336153af136ac8ac528e2ed46df19367edae7e14e37bca1a8b7c4848ef2", size = 14508277, upload-time = "2025-11-11T22:04:50.775Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/cb/3647bdd03c7692882ebc10c19df9ede49f290c216b2906f785edbdb53ef1/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94d8fc9df1721b0287046aca7209fd5040889cad4202e7b73a1fdb77cd9b71c6", size = 18949307, upload-time = "2025-11-11T22:04:53.145Z" },
-    { url = "https://files.pythonhosted.org/packages/20/a0/f91ac492cf2ee9f7541aefaaed4ad1258e73e33f3cd3e06cdce5859431db/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9f2dc3342e4686af98f6e259dc9fb377f1bf657b649c247bf6647bbe4f98090", size = 20930427, upload-time = "2025-11-11T22:04:55.353Z" },
-    { url = "https://files.pythonhosted.org/packages/69/a6/752fd11747eb9fead715b02d389da7fb180a56172b885de0b48b20237d1e/tensorstore-0.1.79-cp313-cp313-win_amd64.whl", hash = "sha256:0fd6165f3df49abc7c9de029b2b72d74bebd2ff2481a5ced003607eb61c56d3e", size = 13212196, upload-time = "2025-11-11T22:05:00.451Z" },
-    { url = "https://files.pythonhosted.org/packages/46/57/1649019893accb3f195780fec55b8bf6793343faf140040bc73f1c28d6a5/tensorstore-0.1.79-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6f8f5a940eab434a951c2dadcc7c0516c7bef6d8b7a7144054f7a0c56152b5f5", size = 16488849, upload-time = "2025-11-11T22:05:03.014Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/23/2668cb120e855a6a7a8a5eb0eba30e2e7020da932a4d3fa13c6ee3c41f9f/tensorstore-0.1.79-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:97756d2cba3c5ce21e15602c2af5a02521cc0ecda7f9fb6d18da2f3bd51827f4", size = 14511448, upload-time = "2025-11-11T22:05:05.58Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/0e/c38f079f3933cc284aab53d52976f6cb4f1ad43bb6a704ac27e0b710f176/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:847982652273fb7b2d694b789205747aaf3e50ae64738c5cb7b5eb03d86a9947", size = 18949282, upload-time = "2025-11-11T22:05:07.562Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/99/03479deea5bfd27a0d8a8c75d5f1d85417a7bbc9c6c7a90fb85b4a4e347a/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7af9422269c2bfcdecf9dd55309060665ab9c2d7f6c892377ed32c032400feea", size = 20931601, upload-time = "2025-11-11T22:05:10.098Z" },
-    { url = "https://files.pythonhosted.org/packages/26/36/2617edf6c6d6fc73b3ff96d9d0b97332adf0d0c56fa2014a226bf4f7dfa6/tensorstore-0.1.79-cp314-cp314-win_amd64.whl", hash = "sha256:bbd8c1ab7d2e3c03ded3d40bb373ee9a67668e33a564484927865ce43b210386", size = 13599766, upload-time = "2025-11-11T22:05:12.265Z" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/f6/e2403fc05b97ba74ad408a98a42c288e6e1b8eacc23780c153b0e5166179/tensorstore-0.1.81.tar.gz", hash = "sha256:687546192ea6f6c8ae28d18f13103336f68017d928b9f5a00325e9b0548d9c25", size = 7120819, upload-time = "2026-02-06T18:56:12.535Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cd/df/f472bd0dee801d7e33c53335ad0fcde9c71e5f9324241faa0a6b4be4270a/tensorstore-0.1.81-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:f64fb510f293079f9e5c63cb227e8a76904655a32912fc107c1e63bd8dc3e187", size = 16501390, upload-time = "2026-02-06T18:55:13.678Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/93/5f40c51d7b15d3574b1788a251dd4e3abd0415dab71811e126d2da5e826b/tensorstore-0.1.81-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4282587598885ff447f08369ac9bb681a65e224888cfa8ef8f3dd63544759e6c", size = 14535592, upload-time = "2026-02-06T18:55:16.44Z" },
+    { url = "https://files.pythonhosted.org/packages/76/48/b7adcc8eca502ce8050c18cea066ca0c0122df7a686e10da6470e55456b4/tensorstore-0.1.81-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b4ea06038f6912bb6ed8a89db0c31e4e3d1b2404f3365dc756e4bc42bd6a89c", size = 19038732, upload-time = "2026-02-06T18:55:18.924Z" },
+    { url = "https://files.pythonhosted.org/packages/40/b0/99294895b030bd7d9ebc06e7ed523d0c09ab65667e031f8a67923f398f86/tensorstore-0.1.81-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51d59f7db9cdae02fce9d347300c0ccfb8265052945757e95592a265eb620b15", size = 21038447, upload-time = "2026-02-06T18:55:21.085Z" },
+    { url = "https://files.pythonhosted.org/packages/32/e6/1ce977baf09aa3889f10f04460b588a6c8876ea441e51090c671f0400a6f/tensorstore-0.1.81-cp311-cp311-win_amd64.whl", hash = "sha256:fdb9579a729cccc02127cab5abf26f57a0e27968ba65c9c548ad058f5a45417f", size = 13221673, upload-time = "2026-02-06T18:55:23.195Z" },
+    { url = "https://files.pythonhosted.org/packages/85/82/00037db699f74d792efe2696305ddd6932e04306899e3701824a7f7de961/tensorstore-0.1.81-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:7aefa1e3eadca804bce05215184c9cde29205ac2f3b443ca15a4e1846d31af4e", size = 16521245, upload-time = "2026-02-06T18:55:25.559Z" },
+    { url = "https://files.pythonhosted.org/packages/86/2e/1deca1b955cb959eec13fd342ffaa2fd84e4770b4e2bcb95a2f541875a52/tensorstore-0.1.81-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7e001d3edc6758eb5dc80556da9e945c1381f0529102fcc0301358ba6b9b70ed", size = 14543561, upload-time = "2026-02-06T18:55:27.624Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/e4/b4343eae773f72a8777f82c5328191a06d8a5195e62105c14b7dcc49823f/tensorstore-0.1.81-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c27e07f4e91e6dc6a0878e13e2c5931d1716196b67b0df927f2f571de2576e9", size = 19043982, upload-time = "2026-02-06T18:55:30.076Z" },
+    { url = "https://files.pythonhosted.org/packages/31/6c/d8c8508a9f4a83dc910d2365c484ba0debf5e531782065e3657fc8fc9b54/tensorstore-0.1.81-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fcb4786c4955e2d88d518b5b5a367427e3ad21d059cba366ad7aebf5fcc2302e", size = 21049171, upload-time = "2026-02-06T18:55:34.383Z" },
+    { url = "https://files.pythonhosted.org/packages/44/a9/c1a751e35a0fcff7f795398c4f98b6c8ea0f00fe7d7704f66a1e08d4352f/tensorstore-0.1.81-cp312-cp312-win_amd64.whl", hash = "sha256:b96cbf1ee74d9038762b2d81305ee1589ec89913a440df6cbd514bc5879655d2", size = 13226573, upload-time = "2026-02-06T18:55:36.463Z" },
+    { url = "https://files.pythonhosted.org/packages/06/c0/32f7d52bfcf1728f557cccb17ac85f57bcc3fa92f4034368d6e7d7d06406/tensorstore-0.1.81-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:7bb563ad4d4d6c4748d9fe4f01f639ddf4ffef83ac180fc3b6d73f46ad854e62", size = 16521316, upload-time = "2026-02-06T18:55:39.557Z" },
+    { url = "https://files.pythonhosted.org/packages/38/b9/06ffc44e38ca18aeb3973f6b709d4d2102e17a8d700c7c3e2af3f2830722/tensorstore-0.1.81-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2ff7e6c457596cf21f31c690e451fe634ac804fc98ff8131188e99d5ef7d29bc", size = 14543212, upload-time = "2026-02-06T18:55:42.246Z" },
+    { url = "https://files.pythonhosted.org/packages/00/01/3c27962f7258ad0bb552c3cd324fa2e01f746c8b6e81bd25d468f72204e8/tensorstore-0.1.81-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b218a6fe09c72c002f2c6480fc58b78cdbba8bb9c6f3a0d7dd1f70625cb37995", size = 19044489, upload-time = "2026-02-06T18:55:44.957Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/ea/fe0f14a1da96d6e0aa6c24d6c31f3ce4b203f8e8a1a2e359489e52b33400/tensorstore-0.1.81-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f33e7c11035c14dad01aeba012051643110cbb95c239e512106fe1be692c98b6", size = 21052658, upload-time = "2026-02-06T18:55:47.138Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/e2/cc189d799982f02c200b22405c4d3f28845df6321de2ac3a35ae087758ed/tensorstore-0.1.81-cp313-cp313-win_amd64.whl", hash = "sha256:b55126bcf084cc5fe0151bf465f3a5dedb5b5da0133d01227f75d0e71f9cfae5", size = 13226848, upload-time = "2026-02-06T18:55:49.631Z" },
+    { url = "https://files.pythonhosted.org/packages/89/b0/0ca436391f832fad365977623f3c08c4fbbf553fd9a112604aa106646654/tensorstore-0.1.81-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a48c23e4df50681d8f4f365b08a0beb114ab210accbde9f34d37fd7b45c31005", size = 16525537, upload-time = "2026-02-06T18:55:51.708Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/02/c10052b86cf8d47b4cf41e5f139b4003c69bb69e506759b0eb87b873d213/tensorstore-0.1.81-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0be0ce646263820f3d4c9ba738d8e9be7da241cbe093ca2fd02e25023344347c", size = 14547490, upload-time = "2026-02-06T18:55:53.899Z" },
+    { url = "https://files.pythonhosted.org/packages/01/d1/bd86c46367624522967e896ca45d77ba9085de3f15081fdad6576ba70aa9/tensorstore-0.1.81-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93996e756dce82589f5a19e27b4e7c0b5b40221a7e41ddce46dc13d378dbd157", size = 19050938, upload-time = "2026-02-06T18:55:56.123Z" },
+    { url = "https://files.pythonhosted.org/packages/11/a2/59a8e9a33cd9e17461f918bda4a20712ed3c51c52e0e42b2f673441bc90d/tensorstore-0.1.81-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:444c088919a739c20ca1f87935d72de4fd87605eb2c0f093b8d49251b7884aef", size = 21055275, upload-time = "2026-02-06T18:55:58.259Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/ec/2988f210729b523975b1bee030cabd64b256943c08463331598f1e03bd4f/tensorstore-0.1.81-cp314-cp314-win_amd64.whl", hash = "sha256:f7aa0a3a470c4d832faff7d77dd688b1d352b718d110c95ceba54ec637ca3ffa", size = 13614713, upload-time = "2026-02-06T18:56:00.291Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/5d/60e990df3f1dc57c33644375a0eccb906a79fd8a5e2d81238f856c65ad7f/tensorstore-0.1.81-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:6c36d8a827120aa15e50ec5c36dd7e73978d86ba4f46d073fb648d8dda3948e9", size = 16605091, upload-time = "2026-02-06T18:56:02.807Z" },
+    { url = "https://files.pythonhosted.org/packages/85/22/f599576815227735d3e34f86f05a8b39d8b15fd979d0029383ebae23978d/tensorstore-0.1.81-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3c31d831707c4ff3c6ecdcba129f7c39e982572837b2f93e02ccb83fc8581bca", size = 14631573, upload-time = "2026-02-06T18:56:04.892Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/76/b5d0b424b7af057a3d4de3f312eba9ddf8a3c750a766b42e0b7f6c2ebef0/tensorstore-0.1.81-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9fba383f108d7450bf9a03487ac7fa3bb2c3080c91cee9d2da3bb217b560846b", size = 19065251, upload-time = "2026-02-06T18:56:06.972Z" },
+    { url = "https://files.pythonhosted.org/packages/54/6c/0f113eae73b1e8eb2f712cf5f1efd269452f0f0045158fae43ce7b4701b4/tensorstore-0.1.81-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f88c52f592e2982682045199cabf360462146749d48b7be2969cd640e877c6c3", size = 21066488, upload-time = "2026-02-06T18:56:10.236Z" },
 ]
 
 [[package]]
@@ -5608,27 +6732,32 @@ wheels = [
 
 [[package]]
 name = "tokenizers"
-version = "0.22.1"
+version = "0.22.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" },
-    { url = "https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" },
-    { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" },
-    { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" },
-    { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" },
-    { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" },
-    { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" },
+    { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" },
+    { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" },
+    { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" },
+    { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" },
+    { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" },
+    { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" },
+    { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
+    { url = "https://files.pythonhosted.org/packages/84/04/655b79dbcc9b3ac5f1479f18e931a344af67e5b7d3b251d2dcdcd7558592/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:753d47ebd4542742ef9261d9da92cd545b2cacbb48349a1225466745bb866ec4", size = 3282301, upload-time = "2026-01-05T10:40:34.858Z" },
+    { url = "https://files.pythonhosted.org/packages/46/cd/e4851401f3d8f6f45d8480262ab6a5c8cb9c4302a790a35aa14eeed6d2fd/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e10bf9113d209be7cd046d40fbabbaf3278ff6d18eb4da4c500443185dc1896c", size = 3161308, upload-time = "2026-01-05T10:40:40.737Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/6e/55553992a89982cd12d4a66dddb5e02126c58677ea3931efcbe601d419db/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64d94e84f6660764e64e7e0b22baa72f6cd942279fdbb21d46abd70d179f0195", size = 3718964, upload-time = "2026-01-05T10:40:46.56Z" },
+    { url = "https://files.pythonhosted.org/packages/59/8c/b1c87148aa15e099243ec9f0cf9d0e970cc2234c3257d558c25a2c5304e6/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f01a9c019878532f98927d2bacb79bbb404b43d3437455522a00a30718cdedb5", size = 3373542, upload-time = "2026-01-05T10:40:52.803Z" },
 ]
 
 [[package]]
@@ -5642,177 +6771,132 @@ wheels = [
 
 [[package]]
 name = "tomli"
-version = "2.3.0"
+version = "2.4.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/52/ed/3f73f72945444548f33eba9a87fc7a6e969915e7b1acc8260b30e1f76a2f/tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549", size = 17392, upload-time = "2025-10-08T22:01:47.119Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b3/2e/299f62b401438d5fe1624119c723f5d877acc86a4c2492da405626665f12/tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45", size = 153236, upload-time = "2025-10-08T22:01:00.137Z" },
-    { url = "https://files.pythonhosted.org/packages/86/7f/d8fffe6a7aefdb61bced88fcb5e280cfd71e08939da5894161bd71bea022/tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba", size = 148084, upload-time = "2025-10-08T22:01:01.63Z" },
-    { url = "https://files.pythonhosted.org/packages/47/5c/24935fb6a2ee63e86d80e4d3b58b222dafaf438c416752c8b58537c8b89a/tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf", size = 234832, upload-time = "2025-10-08T22:01:02.543Z" },
-    { url = "https://files.pythonhosted.org/packages/89/da/75dfd804fc11e6612846758a23f13271b76d577e299592b4371a4ca4cd09/tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441", size = 242052, upload-time = "2025-10-08T22:01:03.836Z" },
-    { url = "https://files.pythonhosted.org/packages/70/8c/f48ac899f7b3ca7eb13af73bacbc93aec37f9c954df3c08ad96991c8c373/tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845", size = 239555, upload-time = "2025-10-08T22:01:04.834Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/28/72f8afd73f1d0e7829bfc093f4cb98ce0a40ffc0cc997009ee1ed94ba705/tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c", size = 245128, upload-time = "2025-10-08T22:01:05.84Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/eb/a7679c8ac85208706d27436e8d421dfa39d4c914dcf5fa8083a9305f58d9/tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456", size = 96445, upload-time = "2025-10-08T22:01:06.896Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/fe/3d3420c4cb1ad9cb462fb52967080575f15898da97e21cb6f1361d505383/tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be", size = 107165, upload-time = "2025-10-08T22:01:08.107Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/b7/40f36368fcabc518bb11c8f06379a0fd631985046c038aca08c6d6a43c6e/tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac", size = 154891, upload-time = "2025-10-08T22:01:09.082Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/3f/d9dd692199e3b3aab2e4e4dd948abd0f790d9ded8cd10cbaae276a898434/tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22", size = 148796, upload-time = "2025-10-08T22:01:10.266Z" },
-    { url = "https://files.pythonhosted.org/packages/60/83/59bff4996c2cf9f9387a0f5a3394629c7efa5ef16142076a23a90f1955fa/tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f", size = 242121, upload-time = "2025-10-08T22:01:11.332Z" },
-    { url = "https://files.pythonhosted.org/packages/45/e5/7c5119ff39de8693d6baab6c0b6dcb556d192c165596e9fc231ea1052041/tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52", size = 250070, upload-time = "2025-10-08T22:01:12.498Z" },
-    { url = "https://files.pythonhosted.org/packages/45/12/ad5126d3a278f27e6701abde51d342aa78d06e27ce2bb596a01f7709a5a2/tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8", size = 245859, upload-time = "2025-10-08T22:01:13.551Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/a1/4d6865da6a71c603cfe6ad0e6556c73c76548557a8d658f9e3b142df245f/tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6", size = 250296, upload-time = "2025-10-08T22:01:14.614Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/b7/a7a7042715d55c9ba6e8b196d65d2cb662578b4d8cd17d882d45322b0d78/tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876", size = 97124, upload-time = "2025-10-08T22:01:15.629Z" },
-    { url = "https://files.pythonhosted.org/packages/06/1e/f22f100db15a68b520664eb3328fb0ae4e90530887928558112c8d1f4515/tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878", size = 107698, upload-time = "2025-10-08T22:01:16.51Z" },
-    { url = "https://files.pythonhosted.org/packages/89/48/06ee6eabe4fdd9ecd48bf488f4ac783844fd777f547b8d1b61c11939974e/tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b", size = 154819, upload-time = "2025-10-08T22:01:17.964Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/01/88793757d54d8937015c75dcdfb673c65471945f6be98e6a0410fba167ed/tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae", size = 148766, upload-time = "2025-10-08T22:01:18.959Z" },
-    { url = "https://files.pythonhosted.org/packages/42/17/5e2c956f0144b812e7e107f94f1cc54af734eb17b5191c0bbfb72de5e93e/tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b", size = 240771, upload-time = "2025-10-08T22:01:20.106Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/f4/0fbd014909748706c01d16824eadb0307115f9562a15cbb012cd9b3512c5/tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf", size = 248586, upload-time = "2025-10-08T22:01:21.164Z" },
-    { url = "https://files.pythonhosted.org/packages/30/77/fed85e114bde5e81ecf9bc5da0cc69f2914b38f4708c80ae67d0c10180c5/tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f", size = 244792, upload-time = "2025-10-08T22:01:22.417Z" },
-    { url = "https://files.pythonhosted.org/packages/55/92/afed3d497f7c186dc71e6ee6d4fcb0acfa5f7d0a1a2878f8beae379ae0cc/tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05", size = 248909, upload-time = "2025-10-08T22:01:23.859Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/84/ef50c51b5a9472e7265ce1ffc7f24cd4023d289e109f669bdb1553f6a7c2/tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606", size = 96946, upload-time = "2025-10-08T22:01:24.893Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/b7/718cd1da0884f281f95ccfa3a6cc572d30053cba64603f79d431d3c9b61b/tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999", size = 107705, upload-time = "2025-10-08T22:01:26.153Z" },
-    { url = "https://files.pythonhosted.org/packages/19/94/aeafa14a52e16163008060506fcb6aa1949d13548d13752171a755c65611/tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e", size = 154244, upload-time = "2025-10-08T22:01:27.06Z" },
-    { url = "https://files.pythonhosted.org/packages/db/e4/1e58409aa78eefa47ccd19779fc6f36787edbe7d4cd330eeeedb33a4515b/tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3", size = 148637, upload-time = "2025-10-08T22:01:28.059Z" },
-    { url = "https://files.pythonhosted.org/packages/26/b6/d1eccb62f665e44359226811064596dd6a366ea1f985839c566cd61525ae/tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc", size = 241925, upload-time = "2025-10-08T22:01:29.066Z" },
-    { url = "https://files.pythonhosted.org/packages/70/91/7cdab9a03e6d3d2bb11beae108da5bdc1c34bdeb06e21163482544ddcc90/tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0", size = 249045, upload-time = "2025-10-08T22:01:31.98Z" },
-    { url = "https://files.pythonhosted.org/packages/15/1b/8c26874ed1f6e4f1fcfeb868db8a794cbe9f227299402db58cfcc858766c/tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879", size = 245835, upload-time = "2025-10-08T22:01:32.989Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/42/8e3c6a9a4b1a1360c1a2a39f0b972cef2cc9ebd56025168c4137192a9321/tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005", size = 253109, upload-time = "2025-10-08T22:01:34.052Z" },
-    { url = "https://files.pythonhosted.org/packages/22/0c/b4da635000a71b5f80130937eeac12e686eefb376b8dee113b4a582bba42/tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463", size = 97930, upload-time = "2025-10-08T22:01:35.082Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/74/cb1abc870a418ae99cd5c9547d6bce30701a954e0e721821df483ef7223c/tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8", size = 107964, upload-time = "2025-10-08T22:01:36.057Z" },
-    { url = "https://files.pythonhosted.org/packages/54/78/5c46fff6432a712af9f792944f4fcd7067d8823157949f4e40c56b8b3c83/tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77", size = 163065, upload-time = "2025-10-08T22:01:37.27Z" },
-    { url = "https://files.pythonhosted.org/packages/39/67/f85d9bd23182f45eca8939cd2bc7050e1f90c41f4a2ecbbd5963a1d1c486/tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf", size = 159088, upload-time = "2025-10-08T22:01:38.235Z" },
-    { url = "https://files.pythonhosted.org/packages/26/5a/4b546a0405b9cc0659b399f12b6adb750757baf04250b148d3c5059fc4eb/tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530", size = 268193, upload-time = "2025-10-08T22:01:39.712Z" },
-    { url = "https://files.pythonhosted.org/packages/42/4f/2c12a72ae22cf7b59a7fe75b3465b7aba40ea9145d026ba41cb382075b0e/tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b", size = 275488, upload-time = "2025-10-08T22:01:40.773Z" },
-    { url = "https://files.pythonhosted.org/packages/92/04/a038d65dbe160c3aa5a624e93ad98111090f6804027d474ba9c37c8ae186/tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67", size = 272669, upload-time = "2025-10-08T22:01:41.824Z" },
-    { url = "https://files.pythonhosted.org/packages/be/2f/8b7c60a9d1612a7cbc39ffcca4f21a73bf368a80fc25bccf8253e2563267/tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f", size = 279709, upload-time = "2025-10-08T22:01:43.177Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/46/cc36c679f09f27ded940281c38607716c86cf8ba4a518d524e349c8b4874/tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0", size = 107563, upload-time = "2025-10-08T22:01:44.233Z" },
-    { url = "https://files.pythonhosted.org/packages/84/ff/426ca8683cf7b753614480484f6437f568fd2fda2edbdf57a2d3d8b27a0b/tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba", size = 119756, upload-time = "2025-10-08T22:01:45.234Z" },
-    { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/82/30/31573e9457673ab10aa432461bee537ce6cef177667deca369efb79df071/tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c", size = 17477, upload-time = "2026-01-11T11:22:38.165Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3c/d9/3dc2289e1f3b32eb19b9785b6a006b28ee99acb37d1d47f78d4c10e28bf8/tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867", size = 153663, upload-time = "2026-01-11T11:21:45.27Z" },
+    { url = "https://files.pythonhosted.org/packages/51/32/ef9f6845e6b9ca392cd3f64f9ec185cc6f09f0a2df3db08cbe8809d1d435/tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9", size = 148469, upload-time = "2026-01-11T11:21:46.873Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/c2/506e44cce89a8b1b1e047d64bd495c22c9f71f21e05f380f1a950dd9c217/tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95", size = 236039, upload-time = "2026-01-11T11:21:48.503Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/40/e1b65986dbc861b7e986e8ec394598187fa8aee85b1650b01dd925ca0be8/tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76", size = 243007, upload-time = "2026-01-11T11:21:49.456Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/6f/6e39ce66b58a5b7ae572a0f4352ff40c71e8573633deda43f6a379d56b3e/tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d", size = 240875, upload-time = "2026-01-11T11:21:50.755Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/ad/cb089cb190487caa80204d503c7fd0f4d443f90b95cf4ef5cf5aa0f439b0/tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576", size = 246271, upload-time = "2026-01-11T11:21:51.81Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/63/69125220e47fd7a3a27fd0de0c6398c89432fec41bc739823bcc66506af6/tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a", size = 96770, upload-time = "2026-01-11T11:21:52.647Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/0d/a22bb6c83f83386b0008425a6cd1fa1c14b5f3dd4bad05e98cf3dbbf4a64/tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa", size = 107626, upload-time = "2026-01-11T11:21:53.459Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/6d/77be674a3485e75cacbf2ddba2b146911477bd887dda9d8c9dfb2f15e871/tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614", size = 94842, upload-time = "2026-01-11T11:21:54.831Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/43/7389a1869f2f26dba52404e1ef13b4784b6b37dac93bac53457e3ff24ca3/tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1", size = 154894, upload-time = "2026-01-11T11:21:56.07Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/05/2f9bf110b5294132b2edf13fe6ca6ae456204f3d749f623307cbb7a946f2/tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8", size = 149053, upload-time = "2026-01-11T11:21:57.467Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/41/1eda3ca1abc6f6154a8db4d714a4d35c4ad90adc0bcf700657291593fbf3/tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a", size = 243481, upload-time = "2026-01-11T11:21:58.661Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/6d/02ff5ab6c8868b41e7d4b987ce2b5f6a51d3335a70aa144edd999e055a01/tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1", size = 251720, upload-time = "2026-01-11T11:22:00.178Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/57/0405c59a909c45d5b6f146107c6d997825aa87568b042042f7a9c0afed34/tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b", size = 247014, upload-time = "2026-01-11T11:22:01.238Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/0e/2e37568edd944b4165735687cbaf2fe3648129e440c26d02223672ee0630/tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51", size = 251820, upload-time = "2026-01-11T11:22:02.727Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/1c/ee3b707fdac82aeeb92d1a113f803cf6d0f37bdca0849cb489553e1f417a/tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729", size = 97712, upload-time = "2026-01-11T11:22:03.777Z" },
+    { url = "https://files.pythonhosted.org/packages/69/13/c07a9177d0b3bab7913299b9278845fc6eaaca14a02667c6be0b0a2270c8/tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da", size = 108296, upload-time = "2026-01-11T11:22:04.86Z" },
+    { url = "https://files.pythonhosted.org/packages/18/27/e267a60bbeeee343bcc279bb9e8fbed0cbe224bc7b2a3dc2975f22809a09/tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3", size = 94553, upload-time = "2026-01-11T11:22:05.854Z" },
+    { url = "https://files.pythonhosted.org/packages/34/91/7f65f9809f2936e1f4ce6268ae1903074563603b2a2bd969ebbda802744f/tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0", size = 154915, upload-time = "2026-01-11T11:22:06.703Z" },
+    { url = "https://files.pythonhosted.org/packages/20/aa/64dd73a5a849c2e8f216b755599c511badde80e91e9bc2271baa7b2cdbb1/tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e", size = 149038, upload-time = "2026-01-11T11:22:07.56Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/8a/6d38870bd3d52c8d1505ce054469a73f73a0fe62c0eaf5dddf61447e32fa/tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4", size = 242245, upload-time = "2026-01-11T11:22:08.344Z" },
+    { url = "https://files.pythonhosted.org/packages/59/bb/8002fadefb64ab2669e5b977df3f5e444febea60e717e755b38bb7c41029/tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e", size = 250335, upload-time = "2026-01-11T11:22:09.951Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/3d/4cdb6f791682b2ea916af2de96121b3cb1284d7c203d97d92d6003e91c8d/tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c", size = 245962, upload-time = "2026-01-11T11:22:11.27Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/4a/5f25789f9a460bd858ba9756ff52d0830d825b458e13f754952dd15fb7bb/tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f", size = 250396, upload-time = "2026-01-11T11:22:12.325Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/2f/b73a36fea58dfa08e8b3a268750e6853a6aac2a349241a905ebd86f3047a/tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86", size = 97530, upload-time = "2026-01-11T11:22:13.865Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/af/ca18c134b5d75de7e8dc551c5234eaba2e8e951f6b30139599b53de9c187/tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87", size = 108227, upload-time = "2026-01-11T11:22:15.224Z" },
+    { url = "https://files.pythonhosted.org/packages/22/c3/b386b832f209fee8073c8138ec50f27b4460db2fdae9ffe022df89a57f9b/tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132", size = 94748, upload-time = "2026-01-11T11:22:16.009Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/c4/84047a97eb1004418bc10bdbcfebda209fca6338002eba2dc27cc6d13563/tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6", size = 154725, upload-time = "2026-01-11T11:22:17.269Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/5d/d39038e646060b9d76274078cddf146ced86dc2b9e8bbf737ad5983609a0/tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc", size = 148901, upload-time = "2026-01-11T11:22:18.287Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e5/383be1724cb30f4ce44983d249645684a48c435e1cd4f8b5cded8a816d3c/tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66", size = 243375, upload-time = "2026-01-11T11:22:19.154Z" },
+    { url = "https://files.pythonhosted.org/packages/31/f0/bea80c17971c8d16d3cc109dc3585b0f2ce1036b5f4a8a183789023574f2/tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d", size = 250639, upload-time = "2026-01-11T11:22:20.168Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/8f/2853c36abbb7608e3f945d8a74e32ed3a74ee3a1f468f1ffc7d1cb3abba6/tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702", size = 246897, upload-time = "2026-01-11T11:22:21.544Z" },
+    { url = "https://files.pythonhosted.org/packages/49/f0/6c05e3196ed5337b9fe7ea003e95fd3819a840b7a0f2bf5a408ef1dad8ed/tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8", size = 254697, upload-time = "2026-01-11T11:22:23.058Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/f5/2922ef29c9f2951883525def7429967fc4d8208494e5ab524234f06b688b/tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776", size = 98567, upload-time = "2026-01-11T11:22:24.033Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/31/22b52e2e06dd2a5fdbc3ee73226d763b184ff21fc24e20316a44ccc4d96b/tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475", size = 108556, upload-time = "2026-01-11T11:22:25.378Z" },
+    { url = "https://files.pythonhosted.org/packages/48/3d/5058dff3255a3d01b705413f64f4306a141a8fd7a251e5a495e3f192a998/tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2", size = 96014, upload-time = "2026-01-11T11:22:26.138Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/4e/75dab8586e268424202d3a1997ef6014919c941b50642a1682df43204c22/tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9", size = 163339, upload-time = "2026-01-11T11:22:27.143Z" },
+    { url = "https://files.pythonhosted.org/packages/06/e3/b904d9ab1016829a776d97f163f183a48be6a4deb87304d1e0116a349519/tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0", size = 159490, upload-time = "2026-01-11T11:22:28.399Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/5a/fc3622c8b1ad823e8ea98a35e3c632ee316d48f66f80f9708ceb4f2a0322/tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df", size = 269398, upload-time = "2026-01-11T11:22:29.345Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/33/62bd6152c8bdd4c305ad9faca48f51d3acb2df1f8791b1477d46ff86e7f8/tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d", size = 276515, upload-time = "2026-01-11T11:22:30.327Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/ff/ae53619499f5235ee4211e62a8d7982ba9e439a0fb4f2f351a93d67c1dd2/tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f", size = 273806, upload-time = "2026-01-11T11:22:32.56Z" },
+    { url = "https://files.pythonhosted.org/packages/47/71/cbca7787fa68d4d0a9f7072821980b39fbb1b6faeb5f5cf02f4a5559fa28/tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b", size = 281340, upload-time = "2026-01-11T11:22:33.505Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/00/d595c120963ad42474cf6ee7771ad0d0e8a49d0f01e29576ee9195d9ecdf/tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087", size = 108106, upload-time = "2026-01-11T11:22:34.451Z" },
+    { url = "https://files.pythonhosted.org/packages/de/69/9aa0c6a505c2f80e519b43764f8b4ba93b5a0bbd2d9a9de6e2b24271b9a5/tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd", size = 120504, upload-time = "2026-01-11T11:22:35.764Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/9f/f1668c281c58cfae01482f7114a4b88d345e4c140386241a1a24dcc9e7bc/tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4", size = 99561, upload-time = "2026-01-11T11:22:36.624Z" },
+    { url = "https://files.pythonhosted.org/packages/23/d1/136eb2cb77520a31e1f64cbae9d33ec6df0d78bdf4160398e86eec8a8754/tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a", size = 14477, upload-time = "2026-01-11T11:22:37.446Z" },
 ]
 
 [[package]]
 name = "tomlkit"
-version = "0.13.3"
+version = "0.14.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cc/18/0bbf3884e9eaa38819ebe46a7bd25dcd56b67434402b66a58c4b8e552575/tomlkit-0.13.3.tar.gz", hash = "sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1", size = 185207, upload-time = "2025-06-05T07:13:44.947Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bd/75/8539d011f6be8e29f339c42e633aae3cb73bffa95dd0f9adec09b9c58e85/tomlkit-0.13.3-py3-none-any.whl", hash = "sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0", size = 38901, upload-time = "2025-06-05T07:13:43.546Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
 ]
 
 [[package]]
 name = "torch"
-version = "2.9.1"
+version = "2.10.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "cuda-bindings", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "filelock" },
-    { name = "fsspec" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "jinja2" },
     { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-curand-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "setuptools", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", marker = "python_full_version >= '3.12'" },
     { name = "sympy" },
-    { name = "triton", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "triton", marker = "sys_platform == 'never'" },
     { name = "typing-extensions" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5f/56/9577683b23072075ed2e40d725c52c2019d71a972fab8e083763da8e707e/torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e", size = 104207681, upload-time = "2025-11-12T15:19:56.48Z" },
-    { url = "https://files.pythonhosted.org/packages/38/45/be5a74f221df8f4b609b78ff79dc789b0cc9017624544ac4dd1c03973150/torch-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:9fd35c68b3679378c11f5eb73220fdcb4e6f4592295277fbb657d31fd053237c", size = 899794036, upload-time = "2025-11-12T15:21:01.886Z" },
-    { url = "https://files.pythonhosted.org/packages/67/95/a581e8a382596b69385a44bab2733f1273d45c842f5d4a504c0edc3133b6/torch-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:2af70e3be4a13becba4655d6cc07dcfec7ae844db6ac38d6c1dafeb245d17d65", size = 110969861, upload-time = "2025-11-12T15:21:30.145Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/51/1756dc128d2bf6ea4e0a915cb89ea5e730315ff33d60c1ff56fd626ba3eb/torch-2.9.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a83b0e84cc375e3318a808d032510dde99d696a85fe9473fc8575612b63ae951", size = 74452222, upload-time = "2025-11-12T15:20:46.223Z" },
-    { url = "https://files.pythonhosted.org/packages/15/db/c064112ac0089af3d2f7a2b5bfbabf4aa407a78b74f87889e524b91c5402/torch-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:62b3fd888277946918cba4478cf849303da5359f0fb4e3bfb86b0533ba2eaf8d", size = 104220430, upload-time = "2025-11-12T15:20:31.705Z" },
-    { url = "https://files.pythonhosted.org/packages/56/be/76eaa36c9cd032d3b01b001e2c5a05943df75f26211f68fae79e62f87734/torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d033ff0ac3f5400df862a51bdde9bad83561f3739ea0046e68f5401ebfa67c1b", size = 899821446, upload-time = "2025-11-12T15:20:15.544Z" },
-    { url = "https://files.pythonhosted.org/packages/47/cc/7a2949e38dfe3244c4df21f0e1c27bce8aedd6c604a587dd44fc21017cb4/torch-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:0d06b30a9207b7c3516a9e0102114024755a07045f0c1d2f2a56b1819ac06bcb", size = 110973074, upload-time = "2025-11-12T15:21:39.958Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/ce/7d251155a783fb2c1bb6837b2b7023c622a2070a0a72726ca1df47e7ea34/torch-2.9.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:52347912d868653e1528b47cafaf79b285b98be3f4f35d5955389b1b95224475", size = 74463887, upload-time = "2025-11-12T15:20:36.611Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/27/07c645c7673e73e53ded71705045d6cb5bae94c4b021b03aa8d03eee90ab/torch-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:da5f6f4d7f4940a173e5572791af238cb0b9e21b1aab592bd8b26da4c99f1cd6", size = 104126592, upload-time = "2025-11-12T15:20:41.62Z" },
-    { url = "https://files.pythonhosted.org/packages/19/17/e377a460603132b00760511299fceba4102bd95db1a0ee788da21298ccff/torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:27331cd902fb4322252657f3902adf1c4f6acad9dcad81d8df3ae14c7c4f07c4", size = 899742281, upload-time = "2025-11-12T15:22:17.602Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/1a/64f5769025db846a82567fa5b7d21dba4558a7234ee631712ee4771c436c/torch-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:81a285002d7b8cfd3fdf1b98aa8df138d41f1a8334fd9ea37511517cedf43083", size = 110940568, upload-time = "2025-11-12T15:21:18.689Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/ab/07739fd776618e5882661d04c43f5b5586323e2f6a2d7d84aac20d8f20bd/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:c0d25d1d8e531b8343bea0ed811d5d528958f1dcbd37e7245bc686273177ad7e", size = 74479191, upload-time = "2025-11-12T15:21:25.816Z" },
-    { url = "https://files.pythonhosted.org/packages/20/60/8fc5e828d050bddfab469b3fe78e5ab9a7e53dda9c3bdc6a43d17ce99e63/torch-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c29455d2b910b98738131990394da3e50eea8291dfeb4b12de71ecf1fdeb21cb", size = 104135743, upload-time = "2025-11-12T15:21:34.936Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/b7/6d3f80e6918213babddb2a37b46dbb14c15b14c5f473e347869a51f40e1f/torch-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:524de44cd13931208ba2c4bde9ec7741fd4ae6bfd06409a604fc32f6520c2bc9", size = 899749493, upload-time = "2025-11-12T15:24:36.356Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/47/c7843d69d6de8938c1cbb1eba426b1d48ddf375f101473d3e31a5fc52b74/torch-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:545844cc16b3f91e08ce3b40e9c2d77012dd33a48d505aed34b7740ed627a1b2", size = 110944162, upload-time = "2025-11-12T15:21:53.151Z" },
-    { url = "https://files.pythonhosted.org/packages/28/0e/2a37247957e72c12151b33a01e4df651d9d155dd74d8cfcbfad15a79b44a/torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5be4bf7496f1e3ffb1dd44b672adb1ac3f081f204c5ca81eba6442f5f634df8e", size = 74830751, upload-time = "2025-11-12T15:21:43.792Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/f7/7a18745edcd7b9ca2381aa03353647bca8aace91683c4975f19ac233809d/torch-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:30a3e170a84894f3652434b56d59a64a2c11366b0ed5776fab33c2439396bf9a", size = 104142929, upload-time = "2025-11-12T15:21:48.319Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/dd/f1c0d879f2863ef209e18823a988dc7a1bf40470750e3ebe927efdb9407f/torch-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8301a7b431e51764629208d0edaa4f9e4c33e6df0f2f90b90e261d623df6a4e2", size = 899748978, upload-time = "2025-11-12T15:23:04.568Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/9f/6986b83a53b4d043e36f3f898b798ab51f7f20fdf1a9b01a2720f445043d/torch-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2e1c42c0ae92bf803a4b2409fdfed85e30f9027a66887f5e7dcdbc014c7531db", size = 111176995, upload-time = "2025-11-12T15:22:01.618Z" },
-    { url = "https://files.pythonhosted.org/packages/40/60/71c698b466dd01e65d0e9514b5405faae200c52a76901baf6906856f17e4/torch-2.9.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:2c14b3da5df416cf9cb5efab83aa3056f5b8cd8620b8fde81b4987ecab730587", size = 74480347, upload-time = "2025-11-12T15:21:57.648Z" },
-    { url = "https://files.pythonhosted.org/packages/48/50/c4b5112546d0d13cc9eaa1c732b823d676a9f49ae8b6f97772f795874a03/torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1edee27a7c9897f4e0b7c14cfc2f3008c571921134522d5b9b5ec4ebbc69041a", size = 74433245, upload-time = "2025-11-12T15:22:39.027Z" },
-    { url = "https://files.pythonhosted.org/packages/81/c9/2628f408f0518b3bae49c95f5af3728b6ab498c8624ab1e03a43dd53d650/torch-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:19d144d6b3e29921f1fc70503e9f2fc572cde6a5115c0c0de2f7ca8b1483e8b6", size = 104134804, upload-time = "2025-11-12T15:22:35.222Z" },
-    { url = "https://files.pythonhosted.org/packages/28/fc/5bc91d6d831ae41bf6e9e6da6468f25330522e92347c9156eb3f1cb95956/torch-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:c432d04376f6d9767a9852ea0def7b47a7bbc8e7af3b16ac9cf9ce02b12851c9", size = 899747132, upload-time = "2025-11-12T15:23:36.068Z" },
-    { url = "https://files.pythonhosted.org/packages/63/5d/e8d4e009e52b6b2cf1684bde2a6be157b96fb873732542fb2a9a99e85a83/torch-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:d187566a2cdc726fc80138c3cdb260970fab1c27e99f85452721f7759bbd554d", size = 110934845, upload-time = "2025-11-12T15:22:48.367Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/b2/2d15a52516b2ea3f414643b8de68fa4cb220d3877ac8b1028c83dc8ca1c4/torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cb10896a1f7fedaddbccc2017ce6ca9ecaaf990f0973bdfcf405439750118d2c", size = 74823558, upload-time = "2025-11-12T15:22:43.392Z" },
-    { url = "https://files.pythonhosted.org/packages/86/5c/5b2e5d84f5b9850cd1e71af07524d8cbb74cba19379800f1f9f7c997fc70/torch-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0a2bd769944991c74acf0c4ef23603b9c777fdf7637f115605a4b2d8023110c7", size = 104145788, upload-time = "2025-11-12T15:23:52.109Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/8c/3da60787bcf70add986c4ad485993026ac0ca74f2fc21410bc4eb1bb7695/torch-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:07c8a9660bc9414c39cac530ac83b1fb1b679d7155824144a40a54f4a47bfa73", size = 899735500, upload-time = "2025-11-12T15:24:08.788Z" },
-    { url = "https://files.pythonhosted.org/packages/db/2b/f7818f6ec88758dfd21da46b6cd46af9d1b3433e53ddbb19ad1e0da17f9b/torch-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c88d3299ddeb2b35dcc31753305612db485ab6f1823e37fb29451c8b2732b87e", size = 111163659, upload-time = "2025-11-12T15:23:20.009Z" },
-]
-
-[[package]]
-name = "torchprofile"
-version = "0.0.4"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "torch", marker = "sys_platform == 'never'" },
-    { name = "torchvision", marker = "sys_platform == 'never'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/6f/36/574c0c46e818533b78b3c09505211162918188325ab4165ef11a3f295755/torchprofile-0.0.4.tar.gz", hash = "sha256:96b6da17d752a06b02977e078aea95614893b31d4117dd5dcd081f30ce65611b", size = 4557, upload-time = "2021-06-22T04:58:03.592Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/62/15/71ad4ed163b03cba1315f1d96e0bc8e39d5a97f92974ffa610a729b273ab/torchprofile-0.0.4-py3-none-any.whl", hash = "sha256:7151fe88dc770f0eeec241244a4c7feaec2c5e8c7852386bc2d6a8d7dde7384d", size = 7694, upload-time = "2021-06-22T04:58:02.485Z" },
-]
-
-[[package]]
-name = "torchvision"
-version = "0.24.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pillow", marker = "sys_platform != 'linux'" },
-    { name = "torch", marker = "sys_platform == 'never'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f7/09/d51aadf8591138e08b74c64a6eb783630c7a31ca2634416277115a9c3a2b/torchvision-0.24.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ded5e625788572e4e1c4d155d1bbc48805c113794100d70e19c76e39e4d53465", size = 1891441, upload-time = "2025-11-12T15:25:01.687Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/49/a35df863e7c153aad82af7505abd8264a5b510306689712ef86bea862822/torchvision-0.24.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:54ed17c3d30e718e08d8da3fd5b30ea44b0311317e55647cb97077a29ecbc25b", size = 2386226, upload-time = "2025-11-12T15:25:05.449Z" },
-    { url = "https://files.pythonhosted.org/packages/49/20/f2d7cd1eea052887c1083afff0b8df5228ec93b53e03759f20b1a3c6d22a/torchvision-0.24.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f476da4e085b7307aaab6f540219617d46d5926aeda24be33e1359771c83778f", size = 8046093, upload-time = "2025-11-12T15:25:09.425Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/cf/0ff4007c09903199307da5f53a192ff5d62b45447069e9ef3a19bdc5ff12/torchvision-0.24.1-cp310-cp310-win_amd64.whl", hash = "sha256:fbdbdae5e540b868a681240b7dbd6473986c862445ee8a138680a6a97d6c34ff", size = 3696202, upload-time = "2025-11-12T15:25:10.657Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/69/30f5f03752aa1a7c23931d2519b31e557f3f10af5089d787cddf3b903ecf/torchvision-0.24.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:056c525dc875f18fe8e9c27079ada166a7b2755cea5a2199b0bc7f1f8364e600", size = 1891436, upload-time = "2025-11-12T15:25:04.3Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/69/49aae86edb75fe16460b59a191fcc0f568c2378f780bb063850db0fe007a/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1e39619de698e2821d71976c92c8a9e50cdfd1e993507dfb340f2688bfdd8283", size = 2387757, upload-time = "2025-11-12T15:25:06.795Z" },
-    { url = "https://files.pythonhosted.org/packages/11/c9/1dfc3db98797b326f1d0c3f3bb61c83b167a813fc7eab6fcd2edb8c7eb9d/torchvision-0.24.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a0f106663e60332aa4fcb1ca2159ef8c3f2ed266b0e6df88de261048a840e0df", size = 8047682, upload-time = "2025-11-12T15:25:21.125Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/bb/cfc6a6f6ccc84a534ed1fdf029ae5716dd6ff04e57ed9dc2dab38bf652d5/torchvision-0.24.1-cp311-cp311-win_amd64.whl", hash = "sha256:a9308cdd37d8a42e14a3e7fd9d271830c7fecb150dd929b642f3c1460514599a", size = 4037588, upload-time = "2025-11-12T15:25:14.402Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/af/18e2c6b9538a045f60718a0c5a058908ccb24f88fde8e6f0fc12d5ff7bd3/torchvision-0.24.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e48bf6a8ec95872eb45763f06499f87bd2fb246b9b96cb00aae260fda2f96193", size = 1891433, upload-time = "2025-11-12T15:25:03.232Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/43/600e5cfb0643d10d633124f5982d7abc2170dfd7ce985584ff16edab3e76/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7fb7590c737ebe3e1c077ad60c0e5e2e56bb26e7bccc3b9d04dbfc34fd09f050", size = 2386737, upload-time = "2025-11-12T15:25:08.288Z" },
-    { url = "https://files.pythonhosted.org/packages/93/b1/db2941526ecddd84884132e2742a55c9311296a6a38627f9e2627f5ac889/torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:66a98471fc18cad9064123106d810a75f57f0838eee20edc56233fd8484b0cc7", size = 8049868, upload-time = "2025-11-12T15:25:13.058Z" },
-    { url = "https://files.pythonhosted.org/packages/69/98/16e583f59f86cd59949f59d52bfa8fc286f86341a229a9d15cbe7a694f0c/torchvision-0.24.1-cp312-cp312-win_amd64.whl", hash = "sha256:4aa6cb806eb8541e92c9b313e96192c6b826e9eb0042720e2fa250d021079952", size = 4302006, upload-time = "2025-11-12T15:25:16.184Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/97/ab40550f482577f2788304c27220e8ba02c63313bd74cf2f8920526aac20/torchvision-0.24.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:8a6696db7fb71eadb2c6a48602106e136c785642e598eb1533e0b27744f2cce6", size = 1891435, upload-time = "2025-11-12T15:25:28.642Z" },
-    { url = "https://files.pythonhosted.org/packages/30/65/ac0a3f9be6abdbe4e1d82c915d7e20de97e7fd0e9a277970508b015309f3/torchvision-0.24.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:db2125c46f9cb25dc740be831ce3ce99303cfe60439249a41b04fd9f373be671", size = 2338718, upload-time = "2025-11-12T15:25:26.19Z" },
-    { url = "https://files.pythonhosted.org/packages/10/b5/5bba24ff9d325181508501ed7f0c3de8ed3dd2edca0784d48b144b6c5252/torchvision-0.24.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f035f0cacd1f44a8ff6cb7ca3627d84c54d685055961d73a1a9fb9827a5414c8", size = 8049661, upload-time = "2025-11-12T15:25:22.558Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/ec/54a96ae9ab6a0dd66d4bba27771f892e36478a9c3489fa56e51c70abcc4d/torchvision-0.24.1-cp313-cp313-win_amd64.whl", hash = "sha256:16274823b93048e0a29d83415166a2e9e0bf4e1b432668357b657612a4802864", size = 4319808, upload-time = "2025-11-12T15:25:17.318Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/f3/a90a389a7e547f3eb8821b13f96ea7c0563cdefbbbb60a10e08dda9720ff/torchvision-0.24.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e3f96208b4bef54cd60e415545f5200346a65024e04f29a26cd0006dbf9e8e66", size = 2005342, upload-time = "2025-11-12T15:25:11.871Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/fe/ff27d2ed1b524078164bea1062f23d2618a5fc3208e247d6153c18c91a76/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f231f6a4f2aa6522713326d0d2563538fa72d613741ae364f9913027fa52ea35", size = 2341708, upload-time = "2025-11-12T15:25:25.08Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/b9/d6c903495cbdfd2533b3ef6f7b5643ff589ea062f8feb5c206ee79b9d9e5/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:1540a9e7f8cf55fe17554482f5a125a7e426347b71de07327d5de6bfd8d17caa", size = 8177239, upload-time = "2025-11-12T15:25:18.554Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/2b/ba02e4261369c3798310483028495cf507e6cb3f394f42e4796981ecf3a7/torchvision-0.24.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d83e16d70ea85d2f196d678bfb702c36be7a655b003abed84e465988b6128938", size = 4251604, upload-time = "2025-11-12T15:25:34.069Z" },
-    { url = "https://files.pythonhosted.org/packages/42/84/577b2cef8f32094add5f52887867da4c2a3e6b4261538447e9b48eb25812/torchvision-0.24.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cccf4b4fec7fdfcd3431b9ea75d1588c0a8596d0333245dafebee0462abe3388", size = 2005319, upload-time = "2025-11-12T15:25:23.827Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/34/ecb786bffe0159a3b49941a61caaae089853132f3cd1e8f555e3621f7e6f/torchvision-0.24.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:1b495edd3a8f9911292424117544f0b4ab780452e998649425d1f4b2bed6695f", size = 2338844, upload-time = "2025-11-12T15:25:32.625Z" },
-    { url = "https://files.pythonhosted.org/packages/51/99/a84623786a6969504c87f2dc3892200f586ee13503f519d282faab0bb4f0/torchvision-0.24.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ab211e1807dc3e53acf8f6638df9a7444c80c0ad050466e8d652b3e83776987b", size = 8175144, upload-time = "2025-11-12T15:25:31.355Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/ba/8fae3525b233e109317ce6a9c1de922ab2881737b029a7e88021f81e068f/torchvision-0.24.1-cp314-cp314-win_amd64.whl", hash = "sha256:18f9cb60e64b37b551cd605a3d62c15730c086362b40682d23e24b616a697d41", size = 4234459, upload-time = "2025-11-12T15:25:19.859Z" },
-    { url = "https://files.pythonhosted.org/packages/50/33/481602c1c72d0485d4b3a6b48c9534b71c2957c9d83bf860eb837bf5a620/torchvision-0.24.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ec9d7379c519428395e4ffda4dbb99ec56be64b0a75b95989e00f9ec7ae0b2d7", size = 2005336, upload-time = "2025-11-12T15:25:27.225Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/7f/372de60bf3dd8f5593bd0d03f4aecf0d1fd58f5bc6943618d9d913f5e6d5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:af9201184c2712d808bd4eb656899011afdfce1e83721c7cb08000034df353fe", size = 2341704, upload-time = "2025-11-12T15:25:29.857Z" },
-    { url = "https://files.pythonhosted.org/packages/36/9b/0f3b9ff3d0225ee2324ec663de0e7fb3eb855615ca958ac1875f22f1f8e5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9ef95d819fd6df81bc7cc97b8f21a15d2c0d3ac5dbfaab5cbc2d2ce57114b19e", size = 8177422, upload-time = "2025-11-12T15:25:37.357Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/ab/e2bcc7c2f13d882a58f8b30ff86f794210b075736587ea50f8c545834f8a/torchvision-0.24.1-cp314-cp314t-win_amd64.whl", hash = "sha256:480b271d6edff83ac2e8d69bbb4cf2073f93366516a50d48f140ccfceedb002e", size = 4335190, upload-time = "2025-11-12T15:25:35.745Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/30/bfebdd8ec77db9a79775121789992d6b3b75ee5494971294d7b4b7c999bc/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2b980edd8d7c0a68c4e951ee1856334a43193f98730d97408fbd148c1a933313", size = 79411457, upload-time = "2026-02-10T21:44:59.189Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" },
+    { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" },
+    { url = "https://files.pythonhosted.org/packages/76/bb/d820f90e69cda6c8169b32a0c6a3ab7b17bf7990b8f2c680077c24a3c14c/torch-2.10.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:35e407430795c8d3edb07a1d711c41cc1f9eaddc8b2f1cc0a165a6767a8fb73d", size = 79411450, upload-time = "2026-01-21T16:25:30.692Z" },
+    { url = "https://files.pythonhosted.org/packages/78/89/f5554b13ebd71e05c0b002f95148033e730d3f7067f67423026cc9c69410/torch-2.10.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3282d9febd1e4e476630a099692b44fdc214ee9bf8ee5377732d9d9dfe5712e4", size = 145992610, upload-time = "2026-01-21T16:25:26.327Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/30/a3a2120621bf9c17779b169fc17e3dc29b230c29d0f8222f499f5e159aa8/torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a2f9edd8dbc99f62bc4dfb78af7bf89499bca3d753423ac1b4e06592e467b763", size = 915607863, upload-time = "2026-01-21T16:25:06.696Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:29b7009dba4b7a1c960260fc8ac85022c784250af43af9fb0ebafc9883782ebd", size = 113723116, upload-time = "2026-01-21T16:25:21.916Z" },
+    { url = "https://files.pythonhosted.org/packages/61/d8/15b9d9d3a6b0c01b883787bd056acbe5cc321090d4b216d3ea89a8fcfdf3/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b7bd80f3477b830dd166c707c5b0b82a898e7b16f59a7d9d42778dd058272e8b", size = 79423461, upload-time = "2026-01-21T16:24:50.266Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/af/758e242e9102e9988969b5e621d41f36b8f258bb4a099109b7a4b4b50ea4/torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5fd4117d89ffd47e3dcc71e71a22efac24828ad781c7e46aaaf56bf7f2796acf", size = 145996088, upload-time = "2026-01-21T16:24:44.171Z" },
+    { url = "https://files.pythonhosted.org/packages/23/8e/3c74db5e53bff7ed9e34c8123e6a8bfef718b2450c35eefab85bb4a7e270/torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:787124e7db3b379d4f1ed54dd12ae7c741c16a4d29b49c0226a89bea50923ffb", size = 915711952, upload-time = "2026-01-21T16:23:53.503Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/01/624c4324ca01f66ae4c7cd1b74eb16fb52596dce66dbe51eff95ef9e7a4c/torch-2.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c66c61f44c5f903046cc696d088e21062644cbe541c7f1c4eaae88b2ad23547", size = 113757972, upload-time = "2026-01-21T16:24:39.516Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/5c/dee910b87c4d5c0fcb41b50839ae04df87c1cfc663cf1b5fca7ea565eeaa/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6d3707a61863d1c4d6ebba7be4ca320f42b869ee657e9b2c21c736bf17000294", size = 79498198, upload-time = "2026-01-21T16:24:34.704Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/6f/f2e91e34e3fcba2e3fc8d8f74e7d6c22e74e480bbd1db7bc8900fdf3e95c/torch-2.10.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5c4d217b14741e40776dd7074d9006fd28b8a97ef5654db959d8635b2fe5f29b", size = 146004247, upload-time = "2026-01-21T16:24:29.335Z" },
+    { url = "https://files.pythonhosted.org/packages/98/fb/5160261aeb5e1ee12ee95fe599d0541f7c976c3701d607d8fc29e623229f/torch-2.10.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6b71486353fce0f9714ca0c9ef1c850a2ae766b409808acd58e9678a3edb7738", size = 915716445, upload-time = "2026-01-21T16:22:45.353Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/16/502fb1b41e6d868e8deb5b0e3ae926bbb36dab8ceb0d1b769b266ad7b0c3/torch-2.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:c2ee399c644dc92ef7bc0d4f7e74b5360c37cdbe7c5ba11318dda49ffac2bc57", size = 113757050, upload-time = "2026-01-21T16:24:19.204Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/0b/39929b148f4824bc3ad6f9f72a29d4ad865bcf7ebfc2fa67584773e083d2/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:3202429f58309b9fa96a614885eace4b7995729f44beb54d3e4a47773649d382", size = 79851305, upload-time = "2026-01-21T16:24:09.209Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/14/21fbce63bc452381ba5f74a2c0a959fdf5ad5803ccc0c654e752e0dbe91a/torch-2.10.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:aae1b29cd68e50a9397f5ee897b9c24742e9e306f88a807a27d617f07adb3bd8", size = 146005472, upload-time = "2026-01-21T16:22:29.022Z" },
+    { url = "https://files.pythonhosted.org/packages/54/fd/b207d1c525cb570ef47f3e9f836b154685011fce11a2f444ba8a4084d042/torch-2.10.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6021db85958db2f07ec94e1bc77212721ba4920c12a18dc552d2ae36a3eb163f", size = 915612644, upload-time = "2026-01-21T16:21:47.019Z" },
+    { url = "https://files.pythonhosted.org/packages/36/53/0197f868c75f1050b199fe58f9bf3bf3aecac9b4e85cc9c964383d745403/torch-2.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff43db38af76fda183156153983c9a096fc4c78d0cd1e07b14a2314c7f01c2c8", size = 113997015, upload-time = "2026-01-21T16:23:00.767Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/13/e76b4d9c160e89fff48bf16b449ea324bda84745d2ab30294c37c2434c0d/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:cdf2a523d699b70d613243211ecaac14fe9c5df8a0b0a9c02add60fb2a413e0f", size = 79498248, upload-time = "2026-01-21T16:23:09.315Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/93/716b5ac0155f1be70ed81bacc21269c3ece8dba0c249b9994094110bfc51/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:bf0d9ff448b0218e0433aeb198805192346c4fd659c852370d5cc245f602a06a", size = 79464992, upload-time = "2026-01-21T16:23:05.162Z" },
+    { url = "https://files.pythonhosted.org/packages/69/2b/51e663ff190c9d16d4a8271203b71bc73a16aa7619b9f271a69b9d4a936b/torch-2.10.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:233aed0659a2503b831d8a67e9da66a62c996204c0bba4f4c442ccc0c68a3f60", size = 146018567, upload-time = "2026-01-21T16:22:23.393Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/cd/4b95ef7f293b927c283db0b136c42be91c8ec6845c44de0238c8c23bdc80/torch-2.10.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:682497e16bdfa6efeec8cde66531bc8d1fbbbb4d8788ec6173c089ed3cc2bfe5", size = 915721646, upload-time = "2026-01-21T16:21:16.983Z" },
+    { url = "https://files.pythonhosted.org/packages/56/97/078a007208f8056d88ae43198833469e61a0a355abc0b070edd2c085eb9a/torch-2.10.0-cp314-cp314-win_amd64.whl", hash = "sha256:6528f13d2a8593a1a412ea07a99812495bec07e9224c28b2a25c0a30c7da025c", size = 113752373, upload-time = "2026-01-21T16:22:13.471Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/94/71994e7d0d5238393df9732fdab607e37e2b56d26a746cb59fdb415f8966/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f5ab4ba32383061be0fb74bda772d470140a12c1c3b58a0cfbf3dae94d164c28", size = 79850324, upload-time = "2026-01-21T16:22:09.494Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/65/1a05346b418ea8ccd10360eef4b3e0ce688fba544e76edec26913a8d0ee0/torch-2.10.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:716b01a176c2a5659c98f6b01bf868244abdd896526f1c692712ab36dbaf9b63", size = 146006482, upload-time = "2026-01-21T16:22:18.42Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/b9/5f6f9d9e859fc3235f60578fa64f52c9c6e9b4327f0fe0defb6de5c0de31/torch-2.10.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:d8f5912ba938233f86361e891789595ff35ca4b4e2ac8fe3670895e5976731d6", size = 915613050, upload-time = "2026-01-21T16:20:49.035Z" },
+    { url = "https://files.pythonhosted.org/packages/66/4d/35352043ee0eaffdeff154fad67cd4a31dbed7ff8e3be1cc4549717d6d51/torch-2.10.0-cp314-cp314t-win_amd64.whl", hash = "sha256:71283a373f0ee2c89e0f0d5f446039bdabe8dbc3c9ccf35f0f784908b0acd185", size = 113995816, upload-time = "2026-01-21T16:22:05.312Z" },
 ]
 
 [[package]]
@@ -5823,7 +6907,8 @@ dependencies = [
     { name = "docker" },
     { name = "docstring-parser" },
     { name = "filelock" },
-    { name = "fsspec" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'linux' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "importlib-metadata" },
     { name = "pyre-extensions" },
     { name = "pyyaml" },
@@ -5837,82 +6922,40 @@ wheels = [
 
 [[package]]
 name = "tqdm"
-version = "4.67.1"
+version = "4.67.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
+    { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
 ]
 
 [[package]]
 name = "transformer-engine"
-version = "2.9.0"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6d/5c/21152e73aa46ac7c969d694ce86cdeb199024c7810b2d700e900ea4efb1a/transformer_engine-2.9.0-py3-none-any.whl", hash = "sha256:953147ed4c490e54c9884bb0d876a1341f05c5c5b7d304bf61f4740f6faee5af", size = 662107, upload-time = "2025-11-11T15:50:49.167Z" },
-]
-
-[package.optional-dependencies]
-core-cu13 = [
-    { name = "transformer-engine-cu13" },
-]
-pytorch = [
-    { name = "transformer-engine-torch" },
-]
-
-[[package]]
-name = "transformer-engine-cu12"
-version = "2.9.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "importlib-metadata" },
-    { name = "packaging" },
-    { name = "pydantic" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a6/af/1c449ad0c43d3d6b5c529c812a4e8338b20965ae5361a9b612c7dce21e4d/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:81162874c0618f3e62eb5ffba0bb1b608b4e56d70238205b1dced7ee965d82b3", size = 303669451, upload-time = "2025-11-11T15:54:12.008Z" },
-    { url = "https://files.pythonhosted.org/packages/82/21/aa351994d8ade95681763df2b10770c768900ecc7f1cedbfa4e89fe1935a/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ad14981cbbd964f8e4446c35199d1bc5349ea30244e76bc57c1cceb5d469dd24", size = 304164366, upload-time = "2025-11-11T15:50:22.169Z" },
-]
-
-[[package]]
-name = "transformer-engine-cu13"
-version = "2.9.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "importlib-metadata" },
-    { name = "packaging" },
-    { name = "pydantic" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ab/b9/c1c788875848bf50faa22749107d91e92e9c0c78bb1878b99939209e40f9/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:590aaeb3a4d552fe9ebc7019d43315f3e61153fcd1c5a07dc0c90bd8b278316e", size = 185010342, upload-time = "2025-11-13T22:35:04.742Z" },
-    { url = "https://files.pythonhosted.org/packages/95/7f/3019c21565f63eeb79d24fa7d3bae39b5b73f21c72d7d5123d21d7ce945a/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:4e869f5a0fd74aaa05a5d801a96688ed21827d23efe9774bd3038d5f2802ef46", size = 185669069, upload-time = "2025-11-13T22:35:13.709Z" },
-]
-
-[[package]]
-name = "transformer-engine-torch"
-version = "2.9.0"
-source = { registry = "https://pypi.org/simple" }
+version = "2.12.0+5671fd36"
+source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=5671fd3675906cda1ade26c24a65d3dedd88eb89#5671fd3675906cda1ade26c24a65d3dedd88eb89" }
 dependencies = [
     { name = "einops" },
+    { name = "importlib-metadata" },
+    { name = "nvdlfw-inspect" },
     { name = "onnx" },
     { name = "onnxscript" },
+    { name = "packaging" },
+    { name = "pydantic" },
     { name = "torch", marker = "sys_platform == 'never'" },
-    { name = "transformer-engine-cu12" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/a2/a3/401d741eceb8f402595e63ee0b1828d60cae988b22f2f23c9cfcc24185bd/transformer_engine_torch-2.9.0.tar.gz", hash = "sha256:abbc59f6acf635abf865085ecdf90e7d4ca9a3782bc91a9845e38adb2655a547", size = 215138, upload-time = "2025-11-11T15:49:04.258Z" }
 
 [[package]]
 name = "transformers"
-version = "4.57.3"
+version = "4.57.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "huggingface-hub" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
@@ -5921,54 +6964,54 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" },
+    { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" },
 ]
 
 [[package]]
 name = "triton"
-version = "3.5.1"
+version = "3.6.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d9/2e/f95e673222afa2c7f0c687d8913e98fcf2589ef0b1405de76894e37fe18f/triton-3.5.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f63e34dcb32d7bd3a1d0195f60f30d2aee8b08a69a0424189b71017e23dfc3d2", size = 159821655, upload-time = "2025-11-11T17:51:44.09Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/6e/676ab5019b4dde8b9b7bab71245102fc02778ef3df48218b298686b9ffd6/triton-3.5.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fc53d849f879911ea13f4a877243afc513187bc7ee92d1f2c0f1ba3169e3c94", size = 170320692, upload-time = "2025-11-11T17:40:46.074Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" },
-    { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/ba/805684a992ee32d486b7948d36aed2f5e3c643fc63883bf8bdca1c3f3980/triton-3.5.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56765ffe12c554cd560698398b8a268db1f616c120007bfd8829d27139abd24a", size = 159955460, upload-time = "2025-11-11T17:52:01.861Z" },
-    { url = "https://files.pythonhosted.org/packages/27/46/8c3bbb5b0a19313f50edcaa363b599e5a1a5ac9683ead82b9b80fe497c8d/triton-3.5.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3f4346b6ebbd4fad18773f5ba839114f4826037c9f2f34e0148894cd5dd3dba", size = 170470410, upload-time = "2025-11-11T17:41:06.319Z" },
-    { url = "https://files.pythonhosted.org/packages/84/1e/7df59baef41931e21159371c481c31a517ff4c2517343b62503d0cd2be99/triton-3.5.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02c770856f5e407d24d28ddc66e33cf026e6f4d360dcb8b2fabe6ea1fc758621", size = 160072799, upload-time = "2025-11-11T17:52:07.293Z" },
-    { url = "https://files.pythonhosted.org/packages/37/92/e97fcc6b2c27cdb87ce5ee063d77f8f26f19f06916aa680464c8104ef0f6/triton-3.5.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b4d2c70127fca6a23e247f9348b8adde979d2e7a20391bfbabaac6aebc7e6a8", size = 170579924, upload-time = "2025-11-11T17:41:12.455Z" },
-    { url = "https://files.pythonhosted.org/packages/14/f9/0430e879c1e63a1016cb843261528fd3187c872c3a9539132efc39514753/triton-3.5.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f617aa7925f9ea9968ec2e1adaf93e87864ff51549c8f04ce658f29bbdb71e2d", size = 159956163, upload-time = "2025-11-11T17:52:12.999Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/e6/c595c35e5c50c4bc56a7bac96493dad321e9e29b953b526bbbe20f9911d0/triton-3.5.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0637b1efb1db599a8e9dc960d53ab6e4637db7d4ab6630a0974705d77b14b60", size = 170480488, upload-time = "2025-11-11T17:41:18.222Z" },
-    { url = "https://files.pythonhosted.org/packages/41/1e/63d367c576c75919e268e4fbc33c1cb33b6dc12bb85e8bfe531c2a8bd5d3/triton-3.5.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8932391d7f93698dfe5bc9bead77c47a24f97329e9f20c10786bb230a9083f56", size = 160073620, upload-time = "2025-11-11T17:52:18.403Z" },
-    { url = "https://files.pythonhosted.org/packages/16/b5/b0d3d8b901b6a04ca38df5e24c27e53afb15b93624d7fd7d658c7cd9352a/triton-3.5.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bac7f7d959ad0f48c0e97d6643a1cc0fd5786fe61cb1f83b537c6b2d54776478", size = 170582192, upload-time = "2025-11-11T17:41:23.963Z" },
+    { url = "https://files.pythonhosted.org/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781", size = 175972180, upload-time = "2026-01-20T16:15:53.664Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = "2026-01-20T16:16:00.523Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" },
+    { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" },
+    { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" },
+    { url = "https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" },
+    { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" },
+    { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" },
 ]
 
 [[package]]
 name = "trove-classifiers"
-version = "2025.11.14.15"
+version = "2026.1.14.14"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/bf/a9/880cccf76af9e7b322112f52e4e2dbb3534cbe671197b8f443a42189dfc7/trove_classifiers-2025.11.14.15.tar.gz", hash = "sha256:6b60f49d40bbd895bc61d8dc414fc2f2286d70eb72ed23548db8cf94f62804ca", size = 16995, upload-time = "2025-11-14T15:23:13.78Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/43/7935f8ea93fcb6680bc10a6fdbf534075c198eeead59150dd5ed68449642/trove_classifiers-2026.1.14.14.tar.gz", hash = "sha256:00492545a1402b09d4858605ba190ea33243d361e2b01c9c296ce06b5c3325f3", size = 16997, upload-time = "2026-01-14T14:54:50.526Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/49/f6/73c4aa003d1237ee9bea8a46f49dc38c45dfe95af4f0da7e60678d388011/trove_classifiers-2025.11.14.15-py3-none-any.whl", hash = "sha256:d1dac259c1e908939862e3331177931c6df0a37af2c1a8debcc603d9115fcdd9", size = 14191, upload-time = "2025-11-14T15:23:12.467Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/4a/2e5583e544bc437d5e8e54b47db87430df9031b29b48d17f26d129fa60c0/trove_classifiers-2026.1.14.14-py3-none-any.whl", hash = "sha256:1f9553927f18d0513d8e5ff80ab8980b8202ce37ecae0e3274ed2ef11880e74d", size = 14197, upload-time = "2026-01-14T14:54:49.067Z" },
 ]
 
 [[package]]
 name = "typer"
-version = "0.20.0"
+version = "0.24.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "annotated-doc" },
     { name = "click" },
     { name = "rich" },
     { name = "shellingham" },
-    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" },
 ]
 
 [[package]]
@@ -6007,11 +7050,11 @@ wheels = [
 
 [[package]]
 name = "tzdata"
-version = "2025.2"
+version = "2025.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" },
 ]
 
 [[package]]
@@ -6025,21 +7068,36 @@ wheels = [
 
 [[package]]
 name = "uvicorn"
-version = "0.38.0"
+version = "0.41.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
     { name = "h11" },
     { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/cb/ce/f06b84e2697fef4688ca63bdb2fdf113ca0a3be33f94488f2cadb690b0cf/uvicorn-0.38.0.tar.gz", hash = "sha256:fd97093bdd120a2609fc0d3afe931d4d4ad688b6e75f0f929fde1bc36fe0e91d", size = 80605, upload-time = "2025-10-18T13:46:44.63Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/32/ce/eeb58ae4ac36fe09e3842eb02e0eb676bf2c53ae062b98f1b2531673efdd/uvicorn-0.41.0.tar.gz", hash = "sha256:09d11cf7008da33113824ee5a1c6422d89fbc2ff476540d69a34c87fab8b571a", size = 82633, upload-time = "2026-02-16T23:07:24.1Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/e4/d04a086285c20886c0daad0e026f250869201013d18f81d9ff5eada73a88/uvicorn-0.41.0-py3-none-any.whl", hash = "sha256:29e35b1d2c36a04b9e180d4007ede3bcb32a85fbdfd6c6aeb3f26839de088187", size = 68783, upload-time = "2026-02-16T23:07:22.357Z" },
+]
+
+[[package]]
+name = "virtualenv"
+version = "20.39.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "distlib" },
+    { name = "filelock" },
+    { name = "platformdirs" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ed/54/809199edc537dbace273495ac0884d13df26436e910a5ed4d0ec0a69806b/virtualenv-20.39.0.tar.gz", hash = "sha256:a15f0cebd00d50074fd336a169d53422436a12dfe15149efec7072cfe817df8b", size = 5869141, upload-time = "2026-02-23T18:09:13.349Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ee/d9/d88e73ca598f4f6ff671fb5fde8a32925c2e08a637303a1d12883c7305fa/uvicorn-0.38.0-py3-none-any.whl", hash = "sha256:48c0afd214ceb59340075b4a052ea1ee91c16fbc2a9b1469cca0e54566977b02", size = 68109, upload-time = "2025-10-18T13:46:42.958Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/b4/8268da45f26f4fe84f6eae80a6ca1485ffb490a926afecff75fc48f61979/virtualenv-20.39.0-py3-none-any.whl", hash = "sha256:44888bba3775990a152ea1f73f8e5f566d49f11bbd1de61d426fd7732770043e", size = 5839121, upload-time = "2026-02-23T18:09:11.173Z" },
 ]
 
 [[package]]
 name = "wandb"
-version = "0.23.0"
+version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -6053,17 +7111,17 @@ dependencies = [
     { name = "sentry-sdk" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ef/8b/db2d44395c967cd452517311fd6ede5d1e07310769f448358d4874248512/wandb-0.23.0.tar.gz", hash = "sha256:e5f98c61a8acc3ee84583ca78057f64344162ce026b9f71cb06eea44aec27c93", size = 44413921, upload-time = "2025-11-11T21:06:30.737Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/fd/60/d94952549920469524b689479c864c692ca47eca4b8c2fe3389b64a58778/wandb-0.25.0.tar.gz", hash = "sha256:45840495a288e34245d69d07b5a0b449220fbc5b032e6b51c4f92ec9026d2ad1", size = 43951335, upload-time = "2026-02-13T00:17:45.515Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/41/61/a3220c7fa4cadfb2b2a5c09e3fa401787326584ade86d7c1f58bf1cd43bd/wandb-0.23.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:b682ec5e38fc97bd2e868ac7615a0ab4fc6a15220ee1159e87270a5ebb7a816d", size = 18992250, upload-time = "2025-11-11T21:06:03.412Z" },
-    { url = "https://files.pythonhosted.org/packages/90/16/e69333cf3d11e7847f424afc6c8ae325e1f6061b2e5118d7a17f41b6525d/wandb-0.23.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:ec094eb71b778e77db8c188da19e52c4f96cb9d5b4421d7dc05028afc66fd7e7", size = 20045616, upload-time = "2025-11-11T21:06:07.109Z" },
-    { url = "https://files.pythonhosted.org/packages/62/79/42dc6c7bb0b425775fe77f1a3f1a22d75d392841a06b43e150a3a7f2553a/wandb-0.23.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e43f1f04b98c34f407dcd2744cec0a590abce39bed14a61358287f817514a7b", size = 18758848, upload-time = "2025-11-11T21:06:09.832Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/94/d6ddb78334996ccfc1179444bfcfc0f37ffd07ee79bb98940466da6f68f8/wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5847f98cbb3175caf5291932374410141f5bb3b7c25f9c5e562c1988ce0bf5", size = 20231493, upload-time = "2025-11-11T21:06:12.323Z" },
-    { url = "https://files.pythonhosted.org/packages/52/4d/0ad6df0e750c19dabd24d2cecad0938964f69a072f05fbdab7281bec2b64/wandb-0.23.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6151355fd922539926e870be811474238c9614b96541773b990f1ce53368aef6", size = 18793473, upload-time = "2025-11-11T21:06:14.967Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/da/c2ba49c5573dff93dafc0acce691bb1c3d57361bf834b2f2c58e6193439b/wandb-0.23.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df62e426e448ebc44269140deb7240df474e743b12d4b1f53b753afde4aa06d4", size = 20332882, upload-time = "2025-11-11T21:06:17.865Z" },
-    { url = "https://files.pythonhosted.org/packages/40/65/21bfb10ee5cd93fbcaf794958863c7e05bac4bbeb1cc1b652094aa3743a5/wandb-0.23.0-py3-none-win32.whl", hash = "sha256:6c21d3eadda17aef7df6febdffdddfb0b4835c7754435fc4fe27631724269f5c", size = 19433198, upload-time = "2025-11-11T21:06:21.913Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/33/cbe79e66c171204e32cf940c7fdfb8b5f7d2af7a00f301c632f3a38aa84b/wandb-0.23.0-py3-none-win_amd64.whl", hash = "sha256:b50635fa0e16e528bde25715bf446e9153368428634ca7a5dbd7a22c8ae4e915", size = 19433201, upload-time = "2025-11-11T21:06:24.607Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/a0/5ecfae12d78ea036a746c071e4c13b54b28d641efbba61d2947c73b3e6f9/wandb-0.23.0-py3-none-win_arm64.whl", hash = "sha256:fa0181b02ce4d1993588f4a728d8b73ae487eb3cb341e6ce01c156be7a98ec72", size = 17678649, upload-time = "2025-11-11T21:06:27.289Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/7d/0c131db3ec9deaabbd32263d90863cbfbe07659527e11c35a5c738cecdc5/wandb-0.25.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:5eecb3c7b5e60d1acfa4b056bfbaa0b79a482566a9db58c9f99724b3862bc8e5", size = 23287536, upload-time = "2026-02-13T00:17:20.265Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/95/31bb7f76a966ec87495e5a72ac7570685be162494c41757ac871768dbc4f/wandb-0.25.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:daeedaadb183dc466e634fba90ab2bab1d4e93000912be0dee95065a0624a3fd", size = 25196062, upload-time = "2026-02-13T00:17:23.356Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/a1/258cdedbf30cebc692198a774cf0ef945b7ed98ee64bdaf62621281c95d8/wandb-0.25.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:5e0127dbcef13eea48f4b84268da7004d34d3120ebc7b2fa9cefb72b49dbb825", size = 22799744, upload-time = "2026-02-13T00:17:26.437Z" },
+    { url = "https://files.pythonhosted.org/packages/de/91/ec9465d014cfd199c5b2083d271d31b3c2aedeae66f3d8a0712f7f54bdf3/wandb-0.25.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:6c4c38077836f9b7569a35b0e1dcf1f0c43616fcd936d182f475edbfea063665", size = 25262839, upload-time = "2026-02-13T00:17:28.8Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/95/cb2d1c7143f534544147fb53fe87944508b8cb9a058bc5b6f8a94adbee15/wandb-0.25.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6edd8948d305cb73745bf564b807bd73da2ccbd47c548196b8a362f7df40aed8", size = 22853714, upload-time = "2026-02-13T00:17:31.68Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/94/68163f70c1669edcf130822aaaea782d8198b5df74443eca0085ec596774/wandb-0.25.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ada6f08629bb014ad6e0a19d5dec478cdaa116431baa3f0a4bf4ab8d9893611f", size = 25358037, upload-time = "2026-02-13T00:17:34.676Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/fb/9578eed2c01b2fc6c8b693da110aa9c73a33d7bb556480f5cfc42e48c94e/wandb-0.25.0-py3-none-win32.whl", hash = "sha256:020b42ca4d76e347709d65f59b30d4623a115edc28f462af1c92681cb17eae7c", size = 24604118, upload-time = "2026-02-13T00:17:37.641Z" },
+    { url = "https://files.pythonhosted.org/packages/25/97/460f6cb738aaa39b4eb2e6b4c630b2ae4321cdd70a79d5955ea75a878981/wandb-0.25.0-py3-none-win_amd64.whl", hash = "sha256:78307ac0b328f2dc334c8607bec772851215584b62c439eb320c4af4fb077a00", size = 24604122, upload-time = "2026-02-13T00:17:39.991Z" },
+    { url = "https://files.pythonhosted.org/packages/27/6c/5847b4dda1dfd52630dac08711d4348c69ed657f0698fc2d949c7f7a6622/wandb-0.25.0-py3-none-win_arm64.whl", hash = "sha256:c6174401fd6fb726295e98d57b4231c100eca96bd17de51bfc64038a57230aaf", size = 21785298, upload-time = "2026-02-13T00:17:42.475Z" },
 ]
 
 [[package]]
@@ -6183,11 +7241,11 @@ wheels = [
 
 [[package]]
 name = "wcwidth"
-version = "0.2.14"
+version = "0.6.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" },
+    { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" },
 ]
 
 [[package]]
@@ -6197,7 +7255,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "braceexpand" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pyyaml" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" }
@@ -6207,73 +7265,82 @@ wheels = [
 
 [[package]]
 name = "websockets"
-version = "15.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/1e/da/6462a9f510c0c49837bbc9345aca92d767a56c1fb2939e1579df1e1cdcf7/websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b", size = 175423, upload-time = "2025-03-05T20:01:35.363Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/9f/9d11c1a4eb046a9e106483b9ff69bce7ac880443f00e5ce64261b47b07e7/websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205", size = 173080, upload-time = "2025-03-05T20:01:37.304Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/4f/b462242432d93ea45f297b6179c7333dd0402b855a912a04e7fc61c0d71f/websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a", size = 173329, upload-time = "2025-03-05T20:01:39.668Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/0c/6afa1f4644d7ed50284ac59cc70ef8abd44ccf7d45850d989ea7310538d0/websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e", size = 182312, upload-time = "2025-03-05T20:01:41.815Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/d4/ffc8bd1350b229ca7a4db2a3e1c482cf87cea1baccd0ef3e72bc720caeec/websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf", size = 181319, upload-time = "2025-03-05T20:01:43.967Z" },
-    { url = "https://files.pythonhosted.org/packages/97/3a/5323a6bb94917af13bbb34009fac01e55c51dfde354f63692bf2533ffbc2/websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb", size = 181631, upload-time = "2025-03-05T20:01:46.104Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/cc/1aeb0f7cee59ef065724041bb7ed667b6ab1eeffe5141696cccec2687b66/websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d", size = 182016, upload-time = "2025-03-05T20:01:47.603Z" },
-    { url = "https://files.pythonhosted.org/packages/79/f9/c86f8f7af208e4161a7f7e02774e9d0a81c632ae76db2ff22549e1718a51/websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9", size = 181426, upload-time = "2025-03-05T20:01:48.949Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/b9/828b0bc6753db905b91df6ae477c0b14a141090df64fb17f8a9d7e3516cf/websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c", size = 181360, upload-time = "2025-03-05T20:01:50.938Z" },
-    { url = "https://files.pythonhosted.org/packages/89/fb/250f5533ec468ba6327055b7d98b9df056fb1ce623b8b6aaafb30b55d02e/websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256", size = 176388, upload-time = "2025-03-05T20:01:52.213Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/46/aca7082012768bb98e5608f01658ff3ac8437e563eca41cf068bd5849a5e/websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41", size = 176830, upload-time = "2025-03-05T20:01:53.922Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload-time = "2025-03-05T20:01:56.276Z" },
-    { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload-time = "2025-03-05T20:01:57.563Z" },
-    { url = "https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload-time = "2025-03-05T20:01:59.063Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload-time = "2025-03-05T20:02:00.305Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload-time = "2025-03-05T20:02:03.148Z" },
-    { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload-time = "2025-03-05T20:02:05.29Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload-time = "2025-03-05T20:02:07.458Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload-time = "2025-03-05T20:02:09.842Z" },
-    { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload-time = "2025-03-05T20:02:11.968Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload-time = "2025-03-05T20:02:13.32Z" },
-    { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload-time = "2025-03-05T20:02:14.585Z" },
-    { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" },
-    { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" },
-    { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" },
-    { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" },
-    { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload-time = "2025-03-05T20:02:31.634Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" },
-    { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" },
-    { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = "2025-03-05T20:02:43.304Z" },
-    { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = "2025-03-05T20:02:48.812Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" },
-    { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" },
-    { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" },
-    { url = "https://files.pythonhosted.org/packages/02/9e/d40f779fa16f74d3468357197af8d6ad07e7c5a27ea1ca74ceb38986f77a/websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3", size = 173109, upload-time = "2025-03-05T20:03:17.769Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/cd/5b887b8585a593073fd92f7c23ecd3985cd2c3175025a91b0d69b0551372/websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1", size = 173343, upload-time = "2025-03-05T20:03:19.094Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/ae/d34f7556890341e900a95acf4886833646306269f899d58ad62f588bf410/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475", size = 174599, upload-time = "2025-03-05T20:03:21.1Z" },
-    { url = "https://files.pythonhosted.org/packages/71/e6/5fd43993a87db364ec60fc1d608273a1a465c0caba69176dd160e197ce42/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9", size = 174207, upload-time = "2025-03-05T20:03:23.221Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/fb/c492d6daa5ec067c2988ac80c61359ace5c4c674c532985ac5a123436cec/websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04", size = 174155, upload-time = "2025-03-05T20:03:25.321Z" },
-    { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" },
+version = "16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/04/24/4b2031d72e840ce4c1ccb255f693b15c334757fc50023e4db9537080b8c4/websockets-16.0.tar.gz", hash = "sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5", size = 179346, upload-time = "2026-01-10T09:23:47.181Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/74/221f58decd852f4b59cc3354cccaf87e8ef695fede361d03dc9a7396573b/websockets-16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04cdd5d2d1dacbad0a7bf36ccbcd3ccd5a30ee188f2560b7a62a30d14107b31a", size = 177343, upload-time = "2026-01-10T09:22:21.28Z" },
+    { url = "https://files.pythonhosted.org/packages/19/0f/22ef6107ee52ab7f0b710d55d36f5a5d3ef19e8a205541a6d7ffa7994e5a/websockets-16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8ff32bb86522a9e5e31439a58addbb0166f0204d64066fb955265c4e214160f0", size = 175021, upload-time = "2026-01-10T09:22:22.696Z" },
+    { url = "https://files.pythonhosted.org/packages/10/40/904a4cb30d9b61c0e278899bf36342e9b0208eb3c470324a9ecbaac2a30f/websockets-16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:583b7c42688636f930688d712885cf1531326ee05effd982028212ccc13e5957", size = 175320, upload-time = "2026-01-10T09:22:23.94Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/2f/4b3ca7e106bc608744b1cdae041e005e446124bebb037b18799c2d356864/websockets-16.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7d837379b647c0c4c2355c2499723f82f1635fd2c26510e1f587d89bc2199e72", size = 183815, upload-time = "2026-01-10T09:22:25.469Z" },
+    { url = "https://files.pythonhosted.org/packages/86/26/d40eaa2a46d4302becec8d15b0fc5e45bdde05191e7628405a19cf491ccd/websockets-16.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df57afc692e517a85e65b72e165356ed1df12386ecb879ad5693be08fac65dde", size = 185054, upload-time = "2026-01-10T09:22:27.101Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ba/6500a0efc94f7373ee8fefa8c271acdfd4dca8bd49a90d4be7ccabfc397e/websockets-16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2b9f1e0d69bc60a4a87349d50c09a037a2607918746f07de04df9e43252c77a3", size = 184565, upload-time = "2026-01-10T09:22:28.293Z" },
+    { url = "https://files.pythonhosted.org/packages/04/b4/96bf2cee7c8d8102389374a2616200574f5f01128d1082f44102140344cc/websockets-16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:335c23addf3d5e6a8633f9f8eda77efad001671e80b95c491dd0924587ece0b3", size = 183848, upload-time = "2026-01-10T09:22:30.394Z" },
+    { url = "https://files.pythonhosted.org/packages/02/8e/81f40fb00fd125357814e8c3025738fc4ffc3da4b6b4a4472a82ba304b41/websockets-16.0-cp310-cp310-win32.whl", hash = "sha256:37b31c1623c6605e4c00d466c9d633f9b812ea430c11c8a278774a1fde1acfa9", size = 178249, upload-time = "2026-01-10T09:22:32.083Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/5f/7e40efe8df57db9b91c88a43690ac66f7b7aa73a11aa6a66b927e44f26fa/websockets-16.0-cp310-cp310-win_amd64.whl", hash = "sha256:8e1dab317b6e77424356e11e99a432b7cb2f3ec8c5ab4dabbcee6add48f72b35", size = 178685, upload-time = "2026-01-10T09:22:33.345Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/db/de907251b4ff46ae804ad0409809504153b3f30984daf82a1d84a9875830/websockets-16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:31a52addea25187bde0797a97d6fc3d2f92b6f72a9370792d65a6e84615ac8a8", size = 177340, upload-time = "2026-01-10T09:22:34.539Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/fa/abe89019d8d8815c8781e90d697dec52523fb8ebe308bf11664e8de1877e/websockets-16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:417b28978cdccab24f46400586d128366313e8a96312e4b9362a4af504f3bbad", size = 175022, upload-time = "2026-01-10T09:22:36.332Z" },
+    { url = "https://files.pythonhosted.org/packages/58/5d/88ea17ed1ded2079358b40d31d48abe90a73c9e5819dbcde1606e991e2ad/websockets-16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af80d74d4edfa3cb9ed973a0a5ba2b2a549371f8a741e0800cb07becdd20f23d", size = 175319, upload-time = "2026-01-10T09:22:37.602Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/ae/0ee92b33087a33632f37a635e11e1d99d429d3d323329675a6022312aac2/websockets-16.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:08d7af67b64d29823fed316505a89b86705f2b7981c07848fb5e3ea3020c1abe", size = 184631, upload-time = "2026-01-10T09:22:38.789Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/c5/27178df583b6c5b31b29f526ba2da5e2f864ecc79c99dae630a85d68c304/websockets-16.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be95cfb0a4dae143eaed2bcba8ac23f4892d8971311f1b06f3c6b78952ee70b", size = 185870, upload-time = "2026-01-10T09:22:39.893Z" },
+    { url = "https://files.pythonhosted.org/packages/87/05/536652aa84ddc1c018dbb7e2c4cbcd0db884580bf8e95aece7593fde526f/websockets-16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d6297ce39ce5c2e6feb13c1a996a2ded3b6832155fcfc920265c76f24c7cceb5", size = 185361, upload-time = "2026-01-10T09:22:41.016Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/e2/d5332c90da12b1e01f06fb1b85c50cfc489783076547415bf9f0a659ec19/websockets-16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c1b30e4f497b0b354057f3467f56244c603a79c0d1dafce1d16c283c25f6e64", size = 184615, upload-time = "2026-01-10T09:22:42.442Z" },
+    { url = "https://files.pythonhosted.org/packages/77/fb/d3f9576691cae9253b51555f841bc6600bf0a983a461c79500ace5a5b364/websockets-16.0-cp311-cp311-win32.whl", hash = "sha256:5f451484aeb5cafee1ccf789b1b66f535409d038c56966d6101740c1614b86c6", size = 178246, upload-time = "2026-01-10T09:22:43.654Z" },
+    { url = "https://files.pythonhosted.org/packages/54/67/eaff76b3dbaf18dcddabc3b8c1dba50b483761cccff67793897945b37408/websockets-16.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7f0659570eefb578dacde98e24fb60af35350193e4f56e11190787bee77dac", size = 178684, upload-time = "2026-01-10T09:22:44.941Z" },
+    { url = "https://files.pythonhosted.org/packages/84/7b/bac442e6b96c9d25092695578dda82403c77936104b5682307bd4deb1ad4/websockets-16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:71c989cbf3254fbd5e84d3bff31e4da39c43f884e64f2551d14bb3c186230f00", size = 177365, upload-time = "2026-01-10T09:22:46.787Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/fe/136ccece61bd690d9c1f715baaeefd953bb2360134de73519d5df19d29ca/websockets-16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8b6e209ffee39ff1b6d0fa7bfef6de950c60dfb91b8fcead17da4ee539121a79", size = 175038, upload-time = "2026-01-10T09:22:47.999Z" },
+    { url = "https://files.pythonhosted.org/packages/40/1e/9771421ac2286eaab95b8575b0cb701ae3663abf8b5e1f64f1fd90d0a673/websockets-16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:86890e837d61574c92a97496d590968b23c2ef0aeb8a9bc9421d174cd378ae39", size = 175328, upload-time = "2026-01-10T09:22:49.809Z" },
+    { url = "https://files.pythonhosted.org/packages/18/29/71729b4671f21e1eaa5d6573031ab810ad2936c8175f03f97f3ff164c802/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9b5aca38b67492ef518a8ab76851862488a478602229112c4b0d58d63a7a4d5c", size = 184915, upload-time = "2026-01-10T09:22:51.071Z" },
+    { url = "https://files.pythonhosted.org/packages/97/bb/21c36b7dbbafc85d2d480cd65df02a1dc93bf76d97147605a8e27ff9409d/websockets-16.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e0334872c0a37b606418ac52f6ab9cfd17317ac26365f7f65e203e2d0d0d359f", size = 186152, upload-time = "2026-01-10T09:22:52.224Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/34/9bf8df0c0cf88fa7bfe36678dc7b02970c9a7d5e065a3099292db87b1be2/websockets-16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a0b31e0b424cc6b5a04b8838bbaec1688834b2383256688cf47eb97412531da1", size = 185583, upload-time = "2026-01-10T09:22:53.443Z" },
+    { url = "https://files.pythonhosted.org/packages/47/88/4dd516068e1a3d6ab3c7c183288404cd424a9a02d585efbac226cb61ff2d/websockets-16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:485c49116d0af10ac698623c513c1cc01c9446c058a4e61e3bf6c19dff7335a2", size = 184880, upload-time = "2026-01-10T09:22:55.033Z" },
+    { url = "https://files.pythonhosted.org/packages/91/d6/7d4553ad4bf1c0421e1ebd4b18de5d9098383b5caa1d937b63df8d04b565/websockets-16.0-cp312-cp312-win32.whl", hash = "sha256:eaded469f5e5b7294e2bdca0ab06becb6756ea86894a47806456089298813c89", size = 178261, upload-time = "2026-01-10T09:22:56.251Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/f0/f3a17365441ed1c27f850a80b2bc680a0fa9505d733fe152fdf5e98c1c0b/websockets-16.0-cp312-cp312-win_amd64.whl", hash = "sha256:5569417dc80977fc8c2d43a86f78e0a5a22fee17565d78621b6bb264a115d4ea", size = 178693, upload-time = "2026-01-10T09:22:57.478Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/9c/baa8456050d1c1b08dd0ec7346026668cbc6f145ab4e314d707bb845bf0d/websockets-16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:878b336ac47938b474c8f982ac2f7266a540adc3fa4ad74ae96fea9823a02cc9", size = 177364, upload-time = "2026-01-10T09:22:59.333Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/0c/8811fc53e9bcff68fe7de2bcbe75116a8d959ac699a3200f4847a8925210/websockets-16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:52a0fec0e6c8d9a784c2c78276a48a2bdf099e4ccc2a4cad53b27718dbfd0230", size = 175039, upload-time = "2026-01-10T09:23:01.171Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/82/39a5f910cb99ec0b59e482971238c845af9220d3ab9fa76dd9162cda9d62/websockets-16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e6578ed5b6981005df1860a56e3617f14a6c307e6a71b4fff8c48fdc50f3ed2c", size = 175323, upload-time = "2026-01-10T09:23:02.341Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/28/0a25ee5342eb5d5f297d992a77e56892ecb65e7854c7898fb7d35e9b33bd/websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:95724e638f0f9c350bb1c2b0a7ad0e83d9cc0c9259f3ea94e40d7b02a2179ae5", size = 184975, upload-time = "2026-01-10T09:23:03.756Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/66/27ea52741752f5107c2e41fda05e8395a682a1e11c4e592a809a90c6a506/websockets-16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0204dc62a89dc9d50d682412c10b3542d748260d743500a85c13cd1ee4bde82", size = 186203, upload-time = "2026-01-10T09:23:05.01Z" },
+    { url = "https://files.pythonhosted.org/packages/37/e5/8e32857371406a757816a2b471939d51c463509be73fa538216ea52b792a/websockets-16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:52ac480f44d32970d66763115edea932f1c5b1312de36df06d6b219f6741eed8", size = 185653, upload-time = "2026-01-10T09:23:06.301Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/67/f926bac29882894669368dc73f4da900fcdf47955d0a0185d60103df5737/websockets-16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6e5a82b677f8f6f59e8dfc34ec06ca6b5b48bc4fcda346acd093694cc2c24d8f", size = 184920, upload-time = "2026-01-10T09:23:07.492Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/a1/3d6ccdcd125b0a42a311bcd15a7f705d688f73b2a22d8cf1c0875d35d34a/websockets-16.0-cp313-cp313-win32.whl", hash = "sha256:abf050a199613f64c886ea10f38b47770a65154dc37181bfaff70c160f45315a", size = 178255, upload-time = "2026-01-10T09:23:09.245Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/1d/e88022630271f5bd349ed82417136281931e558d628dd52c4d8621b4a0b2/websockets-16.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8cc451a50f2aee53042ac52d2d053d08bf89bcb31ae799cb4487587661c038a0", size = 177406, upload-time = "2026-01-10T09:23:12.178Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/78/e63be1bf0724eeb4616efb1ae1c9044f7c3953b7957799abb5915bffd38e/websockets-16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:daa3b6ff70a9241cf6c7fc9e949d41232d9d7d26fd3522b1ad2b4d62487e9904", size = 175085, upload-time = "2026-01-10T09:23:13.511Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/f4/d3c9220d818ee955ae390cf319a7c7a467beceb24f05ee7aaaa2414345ba/websockets-16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:fd3cb4adb94a2a6e2b7c0d8d05cb94e6f1c81a0cf9dc2694fb65c7e8d94c42e4", size = 175328, upload-time = "2026-01-10T09:23:14.727Z" },
+    { url = "https://files.pythonhosted.org/packages/63/bc/d3e208028de777087e6fb2b122051a6ff7bbcca0d6df9d9c2bf1dd869ae9/websockets-16.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:781caf5e8eee67f663126490c2f96f40906594cb86b408a703630f95550a8c3e", size = 185044, upload-time = "2026-01-10T09:23:15.939Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/6e/9a0927ac24bd33a0a9af834d89e0abc7cfd8e13bed17a86407a66773cc0e/websockets-16.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:caab51a72c51973ca21fa8a18bd8165e1a0183f1ac7066a182ff27107b71e1a4", size = 186279, upload-time = "2026-01-10T09:23:17.148Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/ca/bf1c68440d7a868180e11be653c85959502efd3a709323230314fda6e0b3/websockets-16.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19c4dc84098e523fd63711e563077d39e90ec6702aff4b5d9e344a60cb3c0cb1", size = 185711, upload-time = "2026-01-10T09:23:18.372Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/f8/fdc34643a989561f217bb477cbc47a3a07212cbda91c0e4389c43c296ebf/websockets-16.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a5e18a238a2b2249c9a9235466b90e96ae4795672598a58772dd806edc7ac6d3", size = 184982, upload-time = "2026-01-10T09:23:19.652Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/d1/574fa27e233764dbac9c52730d63fcf2823b16f0856b3329fc6268d6ae4f/websockets-16.0-cp314-cp314-win32.whl", hash = "sha256:a069d734c4a043182729edd3e9f247c3b2a4035415a9172fd0f1b71658a320a8", size = 177915, upload-time = "2026-01-10T09:23:21.458Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/f1/ae6b937bf3126b5134ce1f482365fde31a357c784ac51852978768b5eff4/websockets-16.0-cp314-cp314-win_amd64.whl", hash = "sha256:c0ee0e63f23914732c6d7e0cce24915c48f3f1512ec1d079ed01fc629dab269d", size = 178381, upload-time = "2026-01-10T09:23:22.715Z" },
+    { url = "https://files.pythonhosted.org/packages/06/9b/f791d1db48403e1f0a27577a6beb37afae94254a8c6f08be4a23e4930bc0/websockets-16.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:a35539cacc3febb22b8f4d4a99cc79b104226a756aa7400adc722e83b0d03244", size = 177737, upload-time = "2026-01-10T09:23:24.523Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/40/53ad02341fa33b3ce489023f635367a4ac98b73570102ad2cdd770dacc9a/websockets-16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b784ca5de850f4ce93ec85d3269d24d4c82f22b7212023c974c401d4980ebc5e", size = 175268, upload-time = "2026-01-10T09:23:25.781Z" },
+    { url = "https://files.pythonhosted.org/packages/74/9b/6158d4e459b984f949dcbbb0c5d270154c7618e11c01029b9bbd1bb4c4f9/websockets-16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:569d01a4e7fba956c5ae4fc988f0d4e187900f5497ce46339c996dbf24f17641", size = 175486, upload-time = "2026-01-10T09:23:27.033Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/2d/7583b30208b639c8090206f95073646c2c9ffd66f44df967981a64f849ad/websockets-16.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50f23cdd8343b984957e4077839841146f67a3d31ab0d00e6b824e74c5b2f6e8", size = 185331, upload-time = "2026-01-10T09:23:28.259Z" },
+    { url = "https://files.pythonhosted.org/packages/45/b0/cce3784eb519b7b5ad680d14b9673a31ab8dcb7aad8b64d81709d2430aa8/websockets-16.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:152284a83a00c59b759697b7f9e9cddf4e3c7861dd0d964b472b70f78f89e80e", size = 186501, upload-time = "2026-01-10T09:23:29.449Z" },
+    { url = "https://files.pythonhosted.org/packages/19/60/b8ebe4c7e89fb5f6cdf080623c9d92789a53636950f7abacfc33fe2b3135/websockets-16.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bc59589ab64b0022385f429b94697348a6a234e8ce22544e3681b2e9331b5944", size = 186062, upload-time = "2026-01-10T09:23:31.368Z" },
+    { url = "https://files.pythonhosted.org/packages/88/a8/a080593f89b0138b6cba1b28f8df5673b5506f72879322288b031337c0b8/websockets-16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206", size = 185356, upload-time = "2026-01-10T09:23:32.627Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/b6/b9afed2afadddaf5ebb2afa801abf4b0868f42f8539bfe4b071b5266c9fe/websockets-16.0-cp314-cp314t-win32.whl", hash = "sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6", size = 178085, upload-time = "2026-01-10T09:23:33.816Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/3e/28135a24e384493fa804216b79a6a6759a38cc4ff59118787b9fb693df93/websockets-16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd", size = 178531, upload-time = "2026-01-10T09:23:35.016Z" },
+    { url = "https://files.pythonhosted.org/packages/72/07/c98a68571dcf256e74f1f816b8cc5eae6eb2d3d5cfa44d37f801619d9166/websockets-16.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:349f83cd6c9a415428ee1005cadb5c2c56f4389bc06a9af16103c3bc3dcc8b7d", size = 174947, upload-time = "2026-01-10T09:23:36.166Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/52/93e166a81e0305b33fe416338be92ae863563fe7bce446b0f687b9df5aea/websockets-16.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:4a1aba3340a8dca8db6eb5a7986157f52eb9e436b74813764241981ca4888f03", size = 175260, upload-time = "2026-01-10T09:23:37.409Z" },
+    { url = "https://files.pythonhosted.org/packages/56/0c/2dbf513bafd24889d33de2ff0368190a0e69f37bcfa19009ef819fe4d507/websockets-16.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f4a32d1bd841d4bcbffdcb3d2ce50c09c3909fbead375ab28d0181af89fd04da", size = 176071, upload-time = "2026-01-10T09:23:39.158Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8f/aea9c71cc92bf9b6cc0f7f70df8f0b420636b6c96ef4feee1e16f80f75dd/websockets-16.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0298d07ee155e2e9fda5be8a9042200dd2e3bb0b8a38482156576f863a9d457c", size = 176968, upload-time = "2026-01-10T09:23:41.031Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/3f/f70e03f40ffc9a30d817eef7da1be72ee4956ba8d7255c399a01b135902a/websockets-16.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a653aea902e0324b52f1613332ddf50b00c06fdaf7e92624fbf8c77c78fa5767", size = 178735, upload-time = "2026-01-10T09:23:42.259Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
 [[package]]
 name = "werkzeug"
-version = "3.1.3"
+version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "markupsafe" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9f/69/83029f1f6300c5fb2471d621ab06f6ec6b3324685a2ce0f9777fd4a8b71e/werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746", size = 806925, upload-time = "2024-11-08T15:52:18.093Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/61/f1/ee81806690a87dab5f5653c1f146c92bc066d7f4cebc603ef88eb9e13957/werkzeug-3.1.6.tar.gz", hash = "sha256:210c6bede5a420a913956b4791a7f4d6843a43b6fcee4dfa08a65e93007d0d25", size = 864736, upload-time = "2026-02-19T15:17:18.884Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/ec/d58832f89ede95652fd01f4f24236af7d32b70cab2196dfcc2d2fd13c5c2/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131", size = 225166, upload-time = "2026-02-19T15:17:17.475Z" },
 ]
 
 [[package]]
@@ -6295,6 +7362,26 @@ wheels = [
 name = "wrapt"
 version = "1.17.3"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/23/bb82321b86411eb51e5a5db3fb8f8032fd30bd7c2d74bfe936136b2fa1d6/wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04", size = 53482, upload-time = "2025-08-12T05:51:44.467Z" },
@@ -6360,6 +7447,81 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
+[[package]]
+name = "wrapt"
+version = "2.1.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f7/37/ae31f40bec90de2f88d9597d0b5281e23ffe85b893a47ca5d9c05c63a4f6/wrapt-2.1.1.tar.gz", hash = "sha256:5fdcb09bf6db023d88f312bd0767594b414655d58090fc1c46b3414415f67fac", size = 81329, upload-time = "2026-02-03T02:12:13.786Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ca/21/293b657a27accfbbbb6007ebd78af0efa2083dac83e8f523272ea09b4638/wrapt-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e927375e43fd5a985b27a8992327c22541b6dede1362fc79df337d26e23604f", size = 60554, upload-time = "2026-02-03T02:11:17.362Z" },
+    { url = "https://files.pythonhosted.org/packages/25/e9/96dd77728b54a899d4ce2798d7b1296989ce687ed3c0cb917d6b3154bf5d/wrapt-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c99544b6a7d40ca22195563b6d8bc3986ee8bb82f272f31f0670fe9440c869", size = 61496, upload-time = "2026-02-03T02:12:54.732Z" },
+    { url = "https://files.pythonhosted.org/packages/44/79/4c755b45df6ef30c0dd628ecfaa0c808854be147ca438429da70a162833c/wrapt-2.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2be3fa5f4efaf16ee7c77d0556abca35f5a18ad4ac06f0ef3904c3399010ce9", size = 113528, upload-time = "2026-02-03T02:12:26.405Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/63/23ce28f7b841217d9a6337a340fbb8d4a7fbd67a89d47f377c8550fa34aa/wrapt-2.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67c90c1ae6489a6cb1a82058902caa8006706f7b4e8ff766f943e9d2c8e608d0", size = 115536, upload-time = "2026-02-03T02:11:54.397Z" },
+    { url = "https://files.pythonhosted.org/packages/23/7b/5ca8d3b12768670d16c8329e29960eedd56212770365a02a8de8bf73dc01/wrapt-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05c0db35ccffd7480143e62df1e829d101c7b86944ae3be7e4869a7efa621f53", size = 114716, upload-time = "2026-02-03T02:12:20.771Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/3a/9789ccb14a096d30bb847bf3ee137bf682cc9750c2ce155f4c5ae1962abf/wrapt-2.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0c2ec9f616755b2e1e0bf4d0961f59bb5c2e7a77407e7e2c38ef4f7d2fdde12c", size = 113200, upload-time = "2026-02-03T02:12:07.688Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/e5/4ec3526ce6ce920b267c8d35d2c2f0874d3fad2744c8b7259353f1132baa/wrapt-2.1.1-cp310-cp310-win32.whl", hash = "sha256:203ba6b3f89e410e27dbd30ff7dccaf54dcf30fda0b22aa1b82d560c7f9fe9a1", size = 57876, upload-time = "2026-02-03T02:11:42.61Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/4e/661c7c76ecd85375b2bc03488941a3a1078642af481db24949e2b9de01f4/wrapt-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f9426d9cfc2f8732922fc96198052e55c09bb9db3ddaa4323a18e055807410e", size = 60224, upload-time = "2026-02-03T02:11:19.096Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/b7/53c7252d371efada4cb119e72e774fa2c6b3011fc33e3e552cdf48fb9488/wrapt-2.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:69c26f51b67076b40714cff81bdd5826c0b10c077fb6b0678393a6a2f952a5fc", size = 58645, upload-time = "2026-02-03T02:12:10.396Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/a8/9254e4da74b30a105935197015b18b31b7a298bf046e67d8952ef74967bd/wrapt-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c366434a7fb914c7a5de508ed735ef9c133367114e1a7cb91dfb5cd806a1549", size = 60554, upload-time = "2026-02-03T02:11:13.038Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/a1/378579880cc7af226354054a2c255f69615b379d8adad482bfe2f22a0dc2/wrapt-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6a2068bd2e1e19e5a317c8c0b288267eec4e7347c36bc68a6e378a39f19ee7", size = 61491, upload-time = "2026-02-03T02:12:56.077Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/72/957b51c56acca35701665878ad31626182199fc4afecfe67dea072210f95/wrapt-2.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:891ab4713419217b2aed7dd106c9200f64e6a82226775a0d2ebd6bef2ebd1747", size = 113949, upload-time = "2026-02-03T02:11:04.516Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/74/36bbebb4a3d2ae9c3e6929639721f8606cd0710a82a777c371aa69e36504/wrapt-2.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8ef36a0df38d2dc9d907f6617f89e113c5892e0a35f58f45f75901af0ce7d81", size = 115989, upload-time = "2026-02-03T02:12:19.398Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/0d/f1177245a083c7be284bc90bddfe5aece32cdd5b858049cb69ce001a0e8d/wrapt-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76e9af3ebd86f19973143d4d592cbf3e970cf3f66ddee30b16278c26ae34b8ab", size = 115242, upload-time = "2026-02-03T02:11:08.111Z" },
+    { url = "https://files.pythonhosted.org/packages/62/3e/3b7cf5da27e59df61b1eae2d07dd03ff5d6f75b5408d694873cca7a8e33c/wrapt-2.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ff562067485ebdeaef2fa3fe9b1876bc4e7b73762e0a01406ad81e2076edcebf", size = 113676, upload-time = "2026-02-03T02:12:41.026Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/65/8248d3912c705f2c66f81cb97c77436f37abcbedb16d633b5ab0d795d8cd/wrapt-2.1.1-cp311-cp311-win32.whl", hash = "sha256:9e60a30aa0909435ec4ea2a3c53e8e1b50ac9f640c0e9fe3f21fd248a22f06c5", size = 57863, upload-time = "2026-02-03T02:12:18.112Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/31/d29310ab335f71f00c50466153b3dc985aaf4a9fc03263e543e136859541/wrapt-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:7d79954f51fcf84e5ec4878ab4aea32610d70145c5bbc84b3370eabfb1e096c2", size = 60224, upload-time = "2026-02-03T02:12:29.289Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/90/a6ec319affa6e2894962a0cb9d73c67f88af1a726d15314bfb5c88b8a08d/wrapt-2.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:d3ffc6b0efe79e08fd947605fd598515aebefe45e50432dc3b5cd437df8b1ada", size = 58643, upload-time = "2026-02-03T02:12:43.022Z" },
+    { url = "https://files.pythonhosted.org/packages/df/cb/4d5255d19bbd12be7f8ee2c1fb4269dddec9cef777ef17174d357468efaa/wrapt-2.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab8e3793b239db021a18782a5823fcdea63b9fe75d0e340957f5828ef55fcc02", size = 61143, upload-time = "2026-02-03T02:11:46.313Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/07/7ed02daa35542023464e3c8b7cb937fa61f6c61c0361ecf8f5fecf8ad8da/wrapt-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c0300007836373d1c2df105b40777986accb738053a92fe09b615a7a4547e9f", size = 61740, upload-time = "2026-02-03T02:12:51.966Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/60/a237a4e4a36f6d966061ccc9b017627d448161b19e0a3ab80a7c7c97f859/wrapt-2.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2b27c070fd1132ab23957bcd4ee3ba707a91e653a9268dc1afbd39b77b2799f7", size = 121327, upload-time = "2026-02-03T02:11:06.796Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/fe/9139058a3daa8818fc67e6460a2340e8bbcf3aef8b15d0301338bbe181ca/wrapt-2.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b0e36d845e8b6f50949b6b65fc6cd279f47a1944582ed4ec8258cd136d89a64", size = 122903, upload-time = "2026-02-03T02:12:48.657Z" },
+    { url = "https://files.pythonhosted.org/packages/91/10/b8479202b4164649675846a531763531f0a6608339558b5a0a718fc49a8d/wrapt-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aeea04a9889370fcfb1ef828c4cc583f36a875061505cd6cd9ba24d8b43cc36", size = 121333, upload-time = "2026-02-03T02:11:32.148Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/75/75fc793b791d79444aca2c03ccde64e8b99eda321b003f267d570b7b0985/wrapt-2.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d88b46bb0dce9f74b6817bc1758ff2125e1ca9e1377d62ea35b6896142ab6825", size = 120458, upload-time = "2026-02-03T02:11:16.039Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/8f/c3f30d511082ca6d947c405f9d8f6c8eaf83cfde527c439ec2c9a30eb5ea/wrapt-2.1.1-cp312-cp312-win32.whl", hash = "sha256:63decff76ca685b5c557082dfbea865f3f5f6d45766a89bff8dc61d336348833", size = 58086, upload-time = "2026-02-03T02:12:35.041Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/c8/37625b643eea2849f10c3b90f69c7462faa4134448d4443234adaf122ae5/wrapt-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:b828235d26c1e35aca4107039802ae4b1411be0fe0367dd5b7e4d90e562fcbcd", size = 60328, upload-time = "2026-02-03T02:12:45.808Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/79/56242f07572d5682ba8065a9d4d9c2218313f576e3c3471873c2a5355ffd/wrapt-2.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:75128507413a9f1bcbe2db88fd18fbdbf80f264b82fa33a6996cdeaf01c52352", size = 58722, upload-time = "2026-02-03T02:12:27.949Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/ca/3cf290212855b19af9fcc41b725b5620b32f470d6aad970c2593500817eb/wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9646e17fa7c3e2e7a87e696c7de66512c2b4f789a8db95c613588985a2e139", size = 61150, upload-time = "2026-02-03T02:12:50.575Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/33/5b8f89a82a9859ce82da4870c799ad11ce15648b6e1c820fec3e23f4a19f/wrapt-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:428cfc801925454395aa468ba7ddb3ed63dc0d881df7b81626cdd433b4e2b11b", size = 61743, upload-time = "2026-02-03T02:11:55.733Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/2f/60c51304fbdf47ce992d9eefa61fbd2c0e64feee60aaa439baf42ea6f40b/wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5797f65e4d58065a49088c3b32af5410751cd485e83ba89e5a45e2aa8905af98", size = 121341, upload-time = "2026-02-03T02:11:20.461Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/03/ce5256e66dd94e521ad5e753c78185c01b6eddbed3147be541f4d38c0cb7/wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a2db44a71202c5ae4bb5f27c6d3afbc5b23053f2e7e78aa29704541b5dad789", size = 122947, upload-time = "2026-02-03T02:11:33.596Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/ae/50ca8854b81b946a11a36fcd6ead32336e6db2c14b6e4a8b092b80741178/wrapt-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d5350c3590af09c1703dd60ec78a7370c0186e11eaafb9dda025a30eee6492d", size = 121370, upload-time = "2026-02-03T02:11:09.886Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/d9/d6a7c654e0043319b4cc137a4caaf7aa16b46b51ee8df98d1060254705b7/wrapt-2.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d9b076411bed964e752c01b49fd224cc385f3a96f520c797d38412d70d08359", size = 120465, upload-time = "2026-02-03T02:11:37.592Z" },
+    { url = "https://files.pythonhosted.org/packages/55/90/65be41e40845d951f714b5a77e84f377a3787b1e8eee6555a680da6d0db5/wrapt-2.1.1-cp313-cp313-win32.whl", hash = "sha256:0bb7207130ce6486727baa85373503bf3334cc28016f6928a0fa7e19d7ecdc06", size = 58090, upload-time = "2026-02-03T02:12:53.342Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/66/6a09e0294c4fc8c26028a03a15191721c9271672467cc33e6617ee0d91d2/wrapt-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:cbfee35c711046b15147b0ae7db9b976f01c9520e6636d992cd9e69e5e2b03b1", size = 60341, upload-time = "2026-02-03T02:12:36.384Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/f0/20ceb8b701e9a71555c87a5ddecbed76ec16742cf1e4b87bbaf26735f998/wrapt-2.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:7d2756061022aebbf57ba14af9c16e8044e055c22d38de7bf40d92b565ecd2b0", size = 58731, upload-time = "2026-02-03T02:12:01.328Z" },
+    { url = "https://files.pythonhosted.org/packages/80/b4/fe95beb8946700b3db371f6ce25115217e7075ca063663b8cca2888ba55c/wrapt-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4814a3e58bc6971e46baa910ecee69699110a2bf06c201e24277c65115a20c20", size = 62969, upload-time = "2026-02-03T02:11:51.245Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/89/477b0bdc784e3299edf69c279697372b8bd4c31d9c6966eae405442899df/wrapt-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:106c5123232ab9b9f4903692e1fa0bdc231510098f04c13c3081f8ad71c3d612", size = 63606, upload-time = "2026-02-03T02:12:02.64Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/55/9d0c1269ab76de87715b3b905df54dd25d55bbffd0b98696893eb613469f/wrapt-2.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1a40b83ff2535e6e56f190aff123821eea89a24c589f7af33413b9c19eb2c738", size = 152536, upload-time = "2026-02-03T02:11:24.492Z" },
+    { url = "https://files.pythonhosted.org/packages/44/18/2004766030462f79ad86efaa62000b5e39b1ff001dcce86650e1625f40ae/wrapt-2.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:789cea26e740d71cf1882e3a42bb29052bc4ada15770c90072cb47bf73fb3dbf", size = 158697, upload-time = "2026-02-03T02:12:32.214Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/bb/0a880fa0f35e94ee843df4ee4dd52a699c9263f36881311cfb412c09c3e5/wrapt-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ba49c14222d5e5c0ee394495a8655e991dc06cbca5398153aefa5ac08cd6ccd7", size = 155563, upload-time = "2026-02-03T02:11:49.737Z" },
+    { url = "https://files.pythonhosted.org/packages/42/ff/cd1b7c4846c8678fac359a6eb975dc7ab5bd606030adb22acc8b4a9f53f1/wrapt-2.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ac8cda531fe55be838a17c62c806824472bb962b3afa47ecbd59b27b78496f4e", size = 150161, upload-time = "2026-02-03T02:12:33.613Z" },
+    { url = "https://files.pythonhosted.org/packages/38/ec/67c90a7082f452964b4621e4890e9a490f1add23cdeb7483cc1706743291/wrapt-2.1.1-cp313-cp313t-win32.whl", hash = "sha256:b8af75fe20d381dd5bcc9db2e86a86d7fcfbf615383a7147b85da97c1182225b", size = 59783, upload-time = "2026-02-03T02:11:39.863Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/08/466afe4855847d8febdfa2c57c87e991fc5820afbdef01a273683dfd15a0/wrapt-2.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:45c5631c9b6c792b78be2d7352129f776dd72c605be2c3a4e9be346be8376d83", size = 63082, upload-time = "2026-02-03T02:12:09.075Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/62/60b629463c28b15b1eeadb3a0691e17568622b12aa5bfa7ebe9b514bfbeb/wrapt-2.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:da815b9263947ac98d088b6414ac83507809a1d385e4632d9489867228d6d81c", size = 60251, upload-time = "2026-02-03T02:11:21.794Z" },
+    { url = "https://files.pythonhosted.org/packages/95/a0/1c2396e272f91efe6b16a6a8bce7ad53856c8f9ae4f34ceaa711d63ec9e1/wrapt-2.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aa1765054245bb01a37f615503290d4e207e3fd59226e78341afb587e9c1236", size = 61311, upload-time = "2026-02-03T02:12:44.41Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/9a/d2faba7e61072a7507b5722db63562fdb22f5a24e237d460d18755627f15/wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:feff14b63a6d86c1eee33a57f77573649f2550935981625be7ff3cb7342efe05", size = 61805, upload-time = "2026-02-03T02:11:59.905Z" },
+    { url = "https://files.pythonhosted.org/packages/db/56/073989deb4b5d7d6e7ea424476a4ae4bda02140f2dbeaafb14ba4864dd60/wrapt-2.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81fc5f22d5fcfdbabde96bb3f5379b9f4476d05c6d524d7259dc5dfb501d3281", size = 120308, upload-time = "2026-02-03T02:12:04.46Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/b6/84f37261295e38167a29eb82affaf1dc15948dc416925fe2091beee8e4ac/wrapt-2.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:951b228ecf66def855d22e006ab9a1fc12535111ae7db2ec576c728f8ddb39e8", size = 122688, upload-time = "2026-02-03T02:11:23.148Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/80/32db2eec6671f80c65b7ff175be61bc73d7f5223f6910b0c921bbc4bd11c/wrapt-2.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ddf582a95641b9a8c8bd643e83f34ecbbfe1b68bc3850093605e469ab680ae3", size = 121115, upload-time = "2026-02-03T02:12:39.068Z" },
+    { url = "https://files.pythonhosted.org/packages/49/ef/dcd00383df0cd696614127902153bf067971a5aabcd3c9dcb2d8ef354b2a/wrapt-2.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fc5c500966bf48913f795f1984704e6d452ba2414207b15e1f8c339a059d5b16", size = 119484, upload-time = "2026-02-03T02:11:48.419Z" },
+    { url = "https://files.pythonhosted.org/packages/76/29/0630280cdd2bd8f86f35cb6854abee1c9d6d1a28a0c6b6417cd15d378325/wrapt-2.1.1-cp314-cp314-win32.whl", hash = "sha256:4aa4baadb1f94b71151b8e44a0c044f6af37396c3b8bcd474b78b49e2130a23b", size = 58514, upload-time = "2026-02-03T02:11:58.616Z" },
+    { url = "https://files.pythonhosted.org/packages/db/19/5bed84f9089ed2065f6aeda5dfc4f043743f642bc871454b261c3d7d322b/wrapt-2.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:860e9d3fd81816a9f4e40812f28be4439ab01f260603c749d14be3c0a1170d19", size = 60763, upload-time = "2026-02-03T02:12:24.553Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/cb/b967f2f9669e4249b4fe82e630d2a01bc6b9e362b9b12ed91bbe23ae8df4/wrapt-2.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3c59e103017a2c1ea0ddf589cbefd63f91081d7ce9d491d69ff2512bb1157e23", size = 59051, upload-time = "2026-02-03T02:11:29.602Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/19/6fed62be29f97eb8a56aff236c3f960a4b4a86e8379dc7046a8005901a97/wrapt-2.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9fa7c7e1bee9278fc4f5dd8275bc8d25493281a8ec6c61959e37cc46acf02007", size = 63059, upload-time = "2026-02-03T02:12:06.368Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/1c/b757fd0adb53d91547ed8fad76ba14a5932d83dde4c994846a2804596378/wrapt-2.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39c35e12e8215628984248bd9c8897ce0a474be2a773db207eb93414219d8469", size = 63618, upload-time = "2026-02-03T02:12:23.197Z" },
+    { url = "https://files.pythonhosted.org/packages/10/fe/e5ae17b1480957c7988d991b93df9f2425fc51f128cf88144d6a18d0eb12/wrapt-2.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:94ded4540cac9125eaa8ddf5f651a7ec0da6f5b9f248fe0347b597098f8ec14c", size = 152544, upload-time = "2026-02-03T02:11:43.915Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/cc/99aed210c6b547b8a6e4cb9d1425e4466727158a6aeb833aa7997e9e08dd/wrapt-2.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0af328373f97ed9bdfea24549ac1b944096a5a71b30e41c9b8b53ab3eec04a", size = 158700, upload-time = "2026-02-03T02:12:30.684Z" },
+    { url = "https://files.pythonhosted.org/packages/81/0e/d442f745f4957944d5f8ad38bc3a96620bfff3562533b87e486e979f3d99/wrapt-2.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4ad839b55f0bf235f8e337ce060572d7a06592592f600f3a3029168e838469d3", size = 155561, upload-time = "2026-02-03T02:11:28.164Z" },
+    { url = "https://files.pythonhosted.org/packages/51/ac/9891816280e0018c48f8dfd61b136af7b0dcb4a088895db2531acde5631b/wrapt-2.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d89c49356e5e2a50fa86b40e0510082abcd0530f926cbd71cf25bee6b9d82d7", size = 150188, upload-time = "2026-02-03T02:11:57.053Z" },
+    { url = "https://files.pythonhosted.org/packages/24/98/e2f273b6d70d41f98d0739aa9a269d0b633684a5fb17b9229709375748d4/wrapt-2.1.1-cp314-cp314t-win32.whl", hash = "sha256:f4c7dd22cf7f36aafe772f3d88656559205c3af1b7900adfccb70edeb0d2abc4", size = 60425, upload-time = "2026-02-03T02:11:35.007Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/06/b500bfc38a4f82d89f34a13069e748c82c5430d365d9e6b75afb3ab74457/wrapt-2.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f76bc12c583ab01e73ba0ea585465a41e48d968f6d1311b4daec4f8654e356e3", size = 63855, upload-time = "2026-02-03T02:12:15.47Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/cc/5f6193c32166faee1d2a613f278608e6f3b95b96589d020f0088459c46c9/wrapt-2.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7ea74fc0bec172f1ae5f3505b6655c541786a5cabe4bbc0d9723a56ac32eb9b9", size = 60443, upload-time = "2026-02-03T02:11:30.869Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/da/5a086bf4c22a41995312db104ec2ffeee2cf6accca9faaee5315c790377d/wrapt-2.1.1-py3-none-any.whl", hash = "sha256:3b0f4629eb954394a3d7c7a1c8cca25f0b07cefe6aa8545e862e9778152de5b7", size = 43886, upload-time = "2026-02-03T02:11:45.048Z" },
+]
+
 [[package]]
 name = "xattr"
 version = "1.3.0"